dmake: do not set MAKEFLAGS=k
[unleashed/tickless.git] / kernel / fs / nfs / nfs4_vfsops.c
blobb8a39c2035c6f9c674d4d866d1d76e9ae6fdb095
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
23 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
24 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
29 * All Rights Reserved
32 #include <sys/param.h>
33 #include <sys/types.h>
34 #include <sys/systm.h>
35 #include <sys/cred.h>
36 #include <sys/vfs.h>
37 #include <sys/vnode.h>
38 #include <sys/pathname.h>
39 #include <sys/sysmacros.h>
40 #include <sys/kmem.h>
41 #include <sys/mkdev.h>
42 #include <sys/mount.h>
43 #include <sys/statvfs.h>
44 #include <sys/errno.h>
45 #include <sys/debug.h>
46 #include <sys/cmn_err.h>
47 #include <sys/utsname.h>
48 #include <sys/bootconf.h>
49 #include <sys/modctl.h>
50 #include <sys/acl.h>
51 #include <sys/flock.h>
52 #include <sys/time.h>
53 #include <sys/disp.h>
54 #include <sys/policy.h>
55 #include <sys/socket.h>
56 #include <sys/netconfig.h>
57 #include <sys/dnlc.h>
58 #include <sys/list.h>
59 #include <sys/mntent.h>
61 #include <rpc/types.h>
62 #include <rpc/auth.h>
63 #include <rpc/rpcsec_gss.h>
64 #include <rpc/clnt.h>
66 #include <nfs/nfs.h>
67 #include <nfs/nfs_clnt.h>
68 #include <nfs/mount.h>
69 #include <nfs/nfs_acl.h>
71 #include <sys/fs_subr.h>
73 #include <nfs/nfs4.h>
74 #include <nfs/rnode4.h>
75 #include <nfs/nfs4_clnt.h>
76 #include <sys/fs/autofs.h>
78 #include <sys/sdt.h>
82 * Arguments passed to thread to free data structures from forced unmount.
85 typedef struct {
86 vfs_t *fm_vfsp;
87 int fm_flag;
88 cred_t *fm_cr;
89 } freemountargs_t;
91 static void async_free_mount(vfs_t *, int, cred_t *);
92 static void nfs4_free_mount(vfs_t *, int, cred_t *);
93 static void nfs4_free_mount_thread(freemountargs_t *);
94 static int nfs4_chkdup_servinfo4(servinfo4_t *, servinfo4_t *);
97 * From rpcsec module (common/rpcsec).
99 extern int sec_clnt_loadinfo(struct sec_data *, struct sec_data **, model_t);
100 extern void sec_clnt_freeinfo(struct sec_data *);
/*
 * Names of the NFSv4 request counters.
 *
 * The order and contents of this structure must be kept in sync with that of
 * rfsreqcnt_v4_tmpl in nfs_stats.c
 */
static char *rfsnames_v4[] = {
	"null", "compound", "reserved", "access",
	"close", "commit", "create", "delegpurge",
	"delegreturn", "getattr", "getfh", "link",
	"lock", "lockt", "locku", "lookup",
	"lookupp", "nverify", "open", "openattr",
	"open_confirm", "open_downgrade", "putfh", "putpubfh",
	"putrootfh", "read", "readdir", "readlink",
	"remove", "rename", "renew", "restorefh",
	"savefh", "secinfo", "setattr", "setclientid",
	"setclientid_confirm", "verify", "write"
};
117 * nfs4_max_mount_retry is the number of times the client will redrive
118 * a mount compound before giving up and returning failure. The intent
119 * is to redrive mount compounds which fail NFS4ERR_STALE so that
120 * if a component of the server path being mounted goes stale, it can
121 * "recover" by redriving the mount compound (LOOKUP ops). This recovery
122 * code is needed outside of the recovery framework because mount is a
123 * special case. The client doesn't create vnodes/rnodes for components
124 * of the server path being mounted. The recovery code recovers real
125 * client objects, not STALE FHs which map to components of the server
126 * path being mounted.
128 * We could just fail the mount on the first time, but that would
129 * instantly trigger failover (from nfs4_mount), and the client should
130 * try to re-lookup the STALE FH before doing failover. The easiest
131 * way to "re-lookup" is to simply redrive the mount compound.
133 static int nfs4_max_mount_retry = 2;
136 * nfs4 vfs operations.
138 int nfs4_mount(vfs_t *, vnode_t *, struct mounta *, cred_t *);
139 static int nfs4_unmount(vfs_t *, int, cred_t *);
140 static int nfs4_root(vfs_t *, vnode_t **);
141 static int nfs4_statvfs(vfs_t *, struct statvfs64 *);
142 static int nfs4_sync(vfs_t *, short, cred_t *);
143 static int nfs4_vget(vfs_t *, vnode_t **, fid_t *);
144 static int nfs4_mountroot(vfs_t *, whymountroot_t);
145 static void nfs4_freevfs(vfs_t *);
147 static int nfs4rootvp(vnode_t **, vfs_t *, struct servinfo4 *,
148 int, cred_t *, zone_t *);
151 int nfs4_vfsinit(void);
152 void nfs4_vfsfini(void);
153 static void nfs4setclientid_init(void);
154 static void nfs4setclientid_fini(void);
155 static void nfs4setclientid_otw(mntinfo4_t *, servinfo4_t *, cred_t *,
156 struct nfs4_server *, nfs4_error_t *, int *);
157 static void destroy_nfs4_server(nfs4_server_t *);
158 static void remove_mi(nfs4_server_t *, mntinfo4_t *);
160 extern void nfs4_ephemeral_init(void);
161 extern void nfs4_ephemeral_fini(void);
163 /* referral related routines */
164 static servinfo4_t *copy_svp(servinfo4_t *);
165 static void free_knconf_contents(struct knetconfig *k);
166 static char *extract_referral_point(const char *, int);
167 static void setup_newsvpath(servinfo4_t *, int);
168 static void update_servinfo4(servinfo4_t *, fs_location4 *,
169 struct nfs_fsl_info *, char *, int);
172 * Initialize the vfs structure
175 static int nfs4fstyp;
179 * Debug variable to check for rdma based
180 * transport startup and cleanup. Controlled
181 * through /etc/system. Off by default.
183 extern int rdma_debug;
185 const struct vfsops nfs4_vfsops = {
186 .vfs_mount = nfs4_mount,
187 .vfs_unmount = nfs4_unmount,
188 .vfs_root = nfs4_root,
189 .vfs_statvfs = nfs4_statvfs,
190 .vfs_sync = nfs4_sync,
191 .vfs_vget = nfs4_vget,
192 .vfs_mountroot = nfs4_mountroot,
193 .vfs_freevfs = nfs4_freevfs,
197 nfs4init(int fstyp, char *name)
199 int error;
201 error = vfs_setfsops(fstyp, &nfs4_vfsops);
202 if (error != 0) {
203 zcmn_err(GLOBAL_ZONEID, CE_WARN,
204 "nfs4init: bad fstyp");
205 goto out;
208 nfs4fstyp = fstyp;
209 (void) nfs4_vfsinit();
210 (void) nfs4_init_dot_entries();
212 out:
213 if (error)
214 (void) vfs_freevfsops_by_type(fstyp);
216 return (error);
/*
 * Module teardown: destroy the dot entries created by nfs4init() and then
 * run nfs4_vfsfini().
 */
void
nfs4fini(void)
{
	(void) nfs4_destroy_dot_entries();

	nfs4_vfsfini();
}
227 * Create a new sec_data structure to store AUTH_DH related data:
228 * netname, syncaddr, knetconfig. There is no AUTH_F_RPCTIMESYNC
229 * flag set for NFS V4 since we are avoiding to contact the rpcbind
230 * daemon and is using the IP time service (IPPORT_TIMESERVER).
232 * sec_data can be freed by sec_clnt_freeinfo().
234 static struct sec_data *
235 create_authdh_data(char *netname, int nlen, struct netbuf *syncaddr,
236 struct knetconfig *knconf)
238 struct sec_data *secdata;
239 dh_k4_clntdata_t *data;
240 char *pf, *p;
242 if (syncaddr == NULL || syncaddr->buf == NULL || nlen == 0)
243 return (NULL);
245 secdata = kmem_alloc(sizeof (*secdata), KM_SLEEP);
246 secdata->flags = 0;
248 data = kmem_alloc(sizeof (*data), KM_SLEEP);
250 data->syncaddr.maxlen = syncaddr->maxlen;
251 data->syncaddr.len = syncaddr->len;
252 data->syncaddr.buf = kmem_alloc(syncaddr->len, KM_SLEEP);
253 bcopy(syncaddr->buf, data->syncaddr.buf, syncaddr->len);
256 * duplicate the knconf information for the
257 * new opaque data.
259 data->knconf = kmem_alloc(sizeof (*knconf), KM_SLEEP);
260 *data->knconf = *knconf;
261 pf = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
262 p = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
263 bcopy(knconf->knc_protofmly, pf, KNC_STRSIZE);
264 bcopy(knconf->knc_proto, p, KNC_STRSIZE);
265 data->knconf->knc_protofmly = pf;
266 data->knconf->knc_proto = p;
268 /* move server netname to the sec_data structure */
269 data->netname = kmem_alloc(nlen, KM_SLEEP);
270 bcopy(netname, data->netname, nlen);
271 data->netnamelen = (int)nlen;
273 secdata->secmod = AUTH_DH;
274 secdata->rpcflavor = AUTH_DH;
275 secdata->data = (caddr_t)data;
277 return (secdata);
281 * Returns (deep) copy of sec_data_t. Allocates all memory required; caller
282 * is responsible for freeing.
284 sec_data_t *
285 copy_sec_data(sec_data_t *fsecdata)
287 sec_data_t *tsecdata;
289 if (fsecdata == NULL)
290 return (NULL);
292 if (fsecdata->rpcflavor == AUTH_DH) {
293 dh_k4_clntdata_t *fdata = (dh_k4_clntdata_t *)fsecdata->data;
295 if (fdata == NULL)
296 return (NULL);
298 tsecdata = (sec_data_t *)create_authdh_data(fdata->netname,
299 fdata->netnamelen, &fdata->syncaddr, fdata->knconf);
301 return (tsecdata);
304 tsecdata = kmem_zalloc(sizeof (sec_data_t), KM_SLEEP);
306 tsecdata->secmod = fsecdata->secmod;
307 tsecdata->rpcflavor = fsecdata->rpcflavor;
308 tsecdata->flags = fsecdata->flags;
309 tsecdata->uid = fsecdata->uid;
311 if (fsecdata->rpcflavor == RPCSEC_GSS) {
312 gss_clntdata_t *gcd = (gss_clntdata_t *)fsecdata->data;
314 tsecdata->data = (caddr_t)copy_sec_data_gss(gcd);
315 } else {
316 tsecdata->data = NULL;
319 return (tsecdata);
322 gss_clntdata_t *
323 copy_sec_data_gss(gss_clntdata_t *fdata)
325 gss_clntdata_t *tdata;
327 if (fdata == NULL)
328 return (NULL);
330 tdata = kmem_zalloc(sizeof (gss_clntdata_t), KM_SLEEP);
332 tdata->mechanism.length = fdata->mechanism.length;
333 tdata->mechanism.elements = kmem_zalloc(fdata->mechanism.length,
334 KM_SLEEP);
335 bcopy(fdata->mechanism.elements, tdata->mechanism.elements,
336 fdata->mechanism.length);
338 tdata->service = fdata->service;
340 (void) strcpy(tdata->uname, fdata->uname);
341 (void) strcpy(tdata->inst, fdata->inst);
342 (void) strcpy(tdata->realm, fdata->realm);
344 tdata->qop = fdata->qop;
346 return (tdata);
349 static int
350 nfs4_chkdup_servinfo4(servinfo4_t *svp_head, servinfo4_t *svp)
352 servinfo4_t *si;
355 * Iterate over the servinfo4 list to make sure
356 * we do not have a duplicate. Skip any servinfo4
357 * that has been marked "NOT IN USE"
359 for (si = svp_head; si; si = si->sv_next) {
360 (void) nfs_rw_enter_sig(&si->sv_lock, RW_READER, 0);
361 if (si->sv_flags & SV4_NOTINUSE) {
362 nfs_rw_exit(&si->sv_lock);
363 continue;
365 nfs_rw_exit(&si->sv_lock);
366 if (si == svp)
367 continue;
368 if (si->sv_addr.len == svp->sv_addr.len &&
369 strcmp(si->sv_knconf->knc_protofmly,
370 svp->sv_knconf->knc_protofmly) == 0 &&
371 bcmp(si->sv_addr.buf, svp->sv_addr.buf,
372 si->sv_addr.len) == 0) {
373 /* it's a duplicate */
374 return (1);
377 /* it's not a duplicate */
378 return (0);
381 void
382 nfs4_free_args(struct nfs_args *nargs)
384 if (nargs->knconf) {
385 if (nargs->knconf->knc_protofmly)
386 kmem_free(nargs->knconf->knc_protofmly,
387 KNC_STRSIZE);
388 if (nargs->knconf->knc_proto)
389 kmem_free(nargs->knconf->knc_proto, KNC_STRSIZE);
390 kmem_free(nargs->knconf, sizeof (*nargs->knconf));
391 nargs->knconf = NULL;
394 if (nargs->fh) {
395 kmem_free(nargs->fh, strlen(nargs->fh) + 1);
396 nargs->fh = NULL;
399 if (nargs->hostname) {
400 kmem_free(nargs->hostname, strlen(nargs->hostname) + 1);
401 nargs->hostname = NULL;
404 if (nargs->addr) {
405 if (nargs->addr->buf) {
406 ASSERT(nargs->addr->len);
407 kmem_free(nargs->addr->buf, nargs->addr->len);
409 kmem_free(nargs->addr, sizeof (struct netbuf));
410 nargs->addr = NULL;
413 if (nargs->syncaddr) {
414 ASSERT(nargs->syncaddr->len);
415 if (nargs->syncaddr->buf) {
416 ASSERT(nargs->syncaddr->len);
417 kmem_free(nargs->syncaddr->buf, nargs->syncaddr->len);
419 kmem_free(nargs->syncaddr, sizeof (struct netbuf));
420 nargs->syncaddr = NULL;
423 if (nargs->netname) {
424 kmem_free(nargs->netname, strlen(nargs->netname) + 1);
425 nargs->netname = NULL;
428 if (nargs->nfs_ext_u.nfs_extA.secdata) {
429 sec_clnt_freeinfo(
430 nargs->nfs_ext_u.nfs_extA.secdata);
431 nargs->nfs_ext_u.nfs_extA.secdata = NULL;
437 nfs4_copyin(char *data, int datalen, struct nfs_args *nargs)
440 int error;
441 size_t hlen; /* length of hostname */
442 size_t nlen; /* length of netname */
443 char netname[MAXNETNAMELEN+1]; /* server's netname */
444 struct netbuf addr; /* server's address */
445 struct netbuf syncaddr; /* AUTH_DES time sync addr */
446 struct knetconfig *knconf; /* transport structure */
447 struct sec_data *secdata = NULL; /* security data */
448 STRUCT_DECL(nfs_args, args); /* nfs mount arguments */
449 STRUCT_DECL(knetconfig, knconf_tmp);
450 STRUCT_DECL(netbuf, addr_tmp);
451 int flags;
452 char *p, *pf;
453 struct pathname pn;
454 char *userbufptr;
457 bzero(nargs, sizeof (*nargs));
459 STRUCT_INIT(args, get_udatamodel());
460 bzero(STRUCT_BUF(args), SIZEOF_STRUCT(nfs_args, DATAMODEL_NATIVE));
461 if (copyin(data, STRUCT_BUF(args), MIN(datalen,
462 STRUCT_SIZE(args))))
463 return (EFAULT);
465 nargs->wsize = STRUCT_FGET(args, wsize);
466 nargs->rsize = STRUCT_FGET(args, rsize);
467 nargs->timeo = STRUCT_FGET(args, timeo);
468 nargs->retrans = STRUCT_FGET(args, retrans);
469 nargs->acregmin = STRUCT_FGET(args, acregmin);
470 nargs->acregmax = STRUCT_FGET(args, acregmax);
471 nargs->acdirmin = STRUCT_FGET(args, acdirmin);
472 nargs->acdirmax = STRUCT_FGET(args, acdirmax);
474 flags = STRUCT_FGET(args, flags);
475 nargs->flags = flags;
477 addr.buf = NULL;
478 syncaddr.buf = NULL;
482 * Allocate space for a knetconfig structure and
483 * its strings and copy in from user-land.
485 knconf = kmem_zalloc(sizeof (*knconf), KM_SLEEP);
486 STRUCT_INIT(knconf_tmp, get_udatamodel());
487 if (copyin(STRUCT_FGETP(args, knconf), STRUCT_BUF(knconf_tmp),
488 STRUCT_SIZE(knconf_tmp))) {
489 kmem_free(knconf, sizeof (*knconf));
490 return (EFAULT);
493 knconf->knc_semantics = STRUCT_FGET(knconf_tmp, knc_semantics);
494 knconf->knc_protofmly = STRUCT_FGETP(knconf_tmp, knc_protofmly);
495 knconf->knc_proto = STRUCT_FGETP(knconf_tmp, knc_proto);
496 if (get_udatamodel() != DATAMODEL_LP64) {
497 knconf->knc_rdev = expldev(STRUCT_FGET(knconf_tmp, knc_rdev));
498 } else {
499 knconf->knc_rdev = STRUCT_FGET(knconf_tmp, knc_rdev);
502 pf = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
503 p = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
504 error = copyinstr(knconf->knc_protofmly, pf, KNC_STRSIZE, NULL);
505 if (error) {
506 kmem_free(pf, KNC_STRSIZE);
507 kmem_free(p, KNC_STRSIZE);
508 kmem_free(knconf, sizeof (*knconf));
509 return (error);
512 error = copyinstr(knconf->knc_proto, p, KNC_STRSIZE, NULL);
513 if (error) {
514 kmem_free(pf, KNC_STRSIZE);
515 kmem_free(p, KNC_STRSIZE);
516 kmem_free(knconf, sizeof (*knconf));
517 return (error);
521 knconf->knc_protofmly = pf;
522 knconf->knc_proto = p;
524 nargs->knconf = knconf;
527 * Get server address
529 STRUCT_INIT(addr_tmp, get_udatamodel());
530 if (copyin(STRUCT_FGETP(args, addr), STRUCT_BUF(addr_tmp),
531 STRUCT_SIZE(addr_tmp))) {
532 error = EFAULT;
533 goto errout;
536 nargs->addr = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP);
537 userbufptr = STRUCT_FGETP(addr_tmp, buf);
538 addr.len = STRUCT_FGET(addr_tmp, len);
539 addr.buf = kmem_alloc(addr.len, KM_SLEEP);
540 addr.maxlen = addr.len;
541 if (copyin(userbufptr, addr.buf, addr.len)) {
542 kmem_free(addr.buf, addr.len);
543 error = EFAULT;
544 goto errout;
546 bcopy(&addr, nargs->addr, sizeof (struct netbuf));
549 * Get the root fhandle
551 error = pn_get(STRUCT_FGETP(args, fh), UIO_USERSPACE, &pn);
552 if (error)
553 goto errout;
555 /* Volatile fh: keep server paths, so use actual-size strings */
556 nargs->fh = kmem_alloc(pn.pn_pathlen + 1, KM_SLEEP);
557 bcopy(pn.pn_path, nargs->fh, pn.pn_pathlen);
558 nargs->fh[pn.pn_pathlen] = '\0';
559 pn_free(&pn);
563 * Get server's hostname
565 if (flags & NFSMNT_HOSTNAME) {
566 error = copyinstr(STRUCT_FGETP(args, hostname),
567 netname, sizeof (netname), &hlen);
568 if (error)
569 goto errout;
570 nargs->hostname = kmem_zalloc(hlen, KM_SLEEP);
571 (void) strcpy(nargs->hostname, netname);
573 } else {
574 nargs->hostname = NULL;
579 * If there are syncaddr and netname data, load them in. This is
580 * to support data needed for NFSV4 when AUTH_DH is the negotiated
581 * flavor via SECINFO. (instead of using MOUNT protocol in V3).
583 netname[0] = '\0';
584 if (flags & NFSMNT_SECURE) {
586 /* get syncaddr */
587 STRUCT_INIT(addr_tmp, get_udatamodel());
588 if (copyin(STRUCT_FGETP(args, syncaddr), STRUCT_BUF(addr_tmp),
589 STRUCT_SIZE(addr_tmp))) {
590 error = EINVAL;
591 goto errout;
593 userbufptr = STRUCT_FGETP(addr_tmp, buf);
594 syncaddr.len = STRUCT_FGET(addr_tmp, len);
595 syncaddr.buf = kmem_alloc(syncaddr.len, KM_SLEEP);
596 syncaddr.maxlen = syncaddr.len;
597 if (copyin(userbufptr, syncaddr.buf, syncaddr.len)) {
598 kmem_free(syncaddr.buf, syncaddr.len);
599 error = EFAULT;
600 goto errout;
603 nargs->syncaddr = kmem_alloc(sizeof (struct netbuf), KM_SLEEP);
604 bcopy(&syncaddr, nargs->syncaddr, sizeof (struct netbuf));
606 /* get server's netname */
607 if (copyinstr(STRUCT_FGETP(args, netname), netname,
608 sizeof (netname), &nlen)) {
609 error = EFAULT;
610 goto errout;
613 netname[nlen] = '\0';
614 nargs->netname = kmem_zalloc(nlen, KM_SLEEP);
615 (void) strcpy(nargs->netname, netname);
619 * Get the extention data which has the security data structure.
620 * This includes data for AUTH_SYS as well.
622 if (flags & NFSMNT_NEWARGS) {
623 nargs->nfs_args_ext = STRUCT_FGET(args, nfs_args_ext);
624 if (nargs->nfs_args_ext == NFS_ARGS_EXTA ||
625 nargs->nfs_args_ext == NFS_ARGS_EXTB) {
627 * Indicating the application is using the new
628 * sec_data structure to pass in the security
629 * data.
631 if (STRUCT_FGETP(args,
632 nfs_ext_u.nfs_extA.secdata) != NULL) {
633 error = sec_clnt_loadinfo(
634 (struct sec_data *)STRUCT_FGETP(args,
635 nfs_ext_u.nfs_extA.secdata),
636 &secdata, get_udatamodel());
638 nargs->nfs_ext_u.nfs_extA.secdata = secdata;
642 if (error)
643 goto errout;
646 * Failover support:
648 * We may have a linked list of nfs_args structures,
649 * which means the user is looking for failover. If
650 * the mount is either not "read-only" or "soft",
651 * we want to bail out with EINVAL.
653 if (nargs->nfs_args_ext == NFS_ARGS_EXTB)
654 nargs->nfs_ext_u.nfs_extB.next =
655 STRUCT_FGETP(args, nfs_ext_u.nfs_extB.next);
657 errout:
658 if (error)
659 nfs4_free_args(nargs);
661 return (error);
666 * nfs mount vfsop
667 * Set up mount info record and attach it to vfs struct.
670 nfs4_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
672 char *data = uap->dataptr;
673 int error;
674 vnode_t *rtvp; /* the server's root */
675 mntinfo4_t *mi; /* mount info, pointed at by vfs */
676 struct knetconfig *rdma_knconf; /* rdma transport structure */
677 rnode4_t *rp;
678 struct servinfo4 *svp; /* nfs server info */
679 struct servinfo4 *svp_tail = NULL; /* previous nfs server info */
680 struct servinfo4 *svp_head; /* first nfs server info */
681 struct servinfo4 *svp_2ndlast; /* 2nd last in server info list */
682 struct sec_data *secdata; /* security data */
683 struct nfs_args *args = NULL;
684 int flags, addr_type, removed;
685 zone_t *zone = nfs_zone();
686 nfs4_error_t n4e;
687 zone_t *mntzone = NULL;
689 if (secpolicy_fs_mount(cr, mvp, vfsp) != 0)
690 return (EPERM);
691 if (mvp->v_type != VDIR)
692 return (ENOTDIR);
695 * get arguments
697 * nfs_args is now versioned and is extensible, so
698 * uap->datalen might be different from sizeof (args)
699 * in a compatible situation.
701 more:
702 if (!(uap->flags & MS_SYSSPACE)) {
703 if (args == NULL)
704 args = kmem_zalloc(sizeof (struct nfs_args), KM_SLEEP);
705 else
706 nfs4_free_args(args);
707 error = nfs4_copyin(data, uap->datalen, args);
708 if (error) {
709 if (args) {
710 kmem_free(args, sizeof (*args));
712 return (error);
714 } else {
715 args = (struct nfs_args *)data;
718 flags = args->flags;
721 * If the request changes the locking type, disallow the remount,
722 * because it's questionable whether we can transfer the
723 * locking state correctly.
725 if (uap->flags & MS_REMOUNT) {
726 if (!(uap->flags & MS_SYSSPACE)) {
727 nfs4_free_args(args);
728 kmem_free(args, sizeof (*args));
730 if ((mi = VFTOMI4(vfsp)) != NULL) {
731 uint_t new_mi_llock;
732 uint_t old_mi_llock;
733 new_mi_llock = (flags & NFSMNT_LLOCK) ? 1 : 0;
734 old_mi_llock = (mi->mi_flags & MI4_LLOCK) ? 1 : 0;
735 if (old_mi_llock != new_mi_llock)
736 return (EBUSY);
738 return (0);
742 * For ephemeral mount trigger stub vnodes, we have two problems
743 * to solve: racing threads will likely fail the v_count check, and
744 * we want only one to proceed with the mount.
746 * For stubs, if the mount has already occurred (via a racing thread),
747 * just return success. If not, skip the v_count check and proceed.
748 * Note that we are already serialised at this point.
750 mutex_enter(&mvp->v_lock);
751 if (vn_matchops(mvp, &nfs4_trigger_vnodeops)) {
752 /* mntpt is a v4 stub vnode */
753 ASSERT(RP_ISSTUB(VTOR4(mvp)));
754 ASSERT(!(uap->flags & MS_OVERLAY));
755 ASSERT(!(mvp->v_flag & VROOT));
756 if (vn_mountedvfs(mvp) != NULL) {
757 /* ephemeral mount has already occurred */
758 ASSERT(uap->flags & MS_SYSSPACE);
759 mutex_exit(&mvp->v_lock);
760 return (0);
762 } else {
763 /* mntpt is a non-v4 or v4 non-stub vnode */
764 if (!(uap->flags & MS_OVERLAY) &&
765 (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
766 mutex_exit(&mvp->v_lock);
767 if (!(uap->flags & MS_SYSSPACE)) {
768 nfs4_free_args(args);
769 kmem_free(args, sizeof (*args));
771 return (EBUSY);
774 mutex_exit(&mvp->v_lock);
776 /* make sure things are zeroed for errout: */
777 rtvp = NULL;
778 mi = NULL;
779 secdata = NULL;
782 * A valid knetconfig structure is required.
784 if (!(flags & NFSMNT_KNCONF) ||
785 args->knconf == NULL || args->knconf->knc_protofmly == NULL ||
786 args->knconf->knc_proto == NULL ||
787 (strcmp(args->knconf->knc_proto, NC_UDP) == 0)) {
788 if (!(uap->flags & MS_SYSSPACE)) {
789 nfs4_free_args(args);
790 kmem_free(args, sizeof (*args));
792 return (EINVAL);
795 if ((strlen(args->knconf->knc_protofmly) >= KNC_STRSIZE) ||
796 (strlen(args->knconf->knc_proto) >= KNC_STRSIZE)) {
797 if (!(uap->flags & MS_SYSSPACE)) {
798 nfs4_free_args(args);
799 kmem_free(args, sizeof (*args));
801 return (EINVAL);
805 * Allocate a servinfo4 struct.
807 svp = kmem_zalloc(sizeof (*svp), KM_SLEEP);
808 nfs_rw_init(&svp->sv_lock, NULL, RW_DEFAULT, NULL);
809 if (svp_tail) {
810 svp_2ndlast = svp_tail;
811 svp_tail->sv_next = svp;
812 } else {
813 svp_head = svp;
814 svp_2ndlast = svp;
817 svp_tail = svp;
818 svp->sv_knconf = args->knconf;
819 args->knconf = NULL;
822 * Get server address
824 if (args->addr == NULL || args->addr->buf == NULL) {
825 error = EINVAL;
826 goto errout;
829 svp->sv_addr.maxlen = args->addr->maxlen;
830 svp->sv_addr.len = args->addr->len;
831 svp->sv_addr.buf = args->addr->buf;
832 args->addr->buf = NULL;
835 * Get the root fhandle
837 if (args->fh == NULL || (strlen(args->fh) >= MAXPATHLEN)) {
838 error = EINVAL;
839 goto errout;
842 svp->sv_path = args->fh;
843 svp->sv_pathlen = strlen(args->fh) + 1;
844 args->fh = NULL;
847 * Get server's hostname
849 if (flags & NFSMNT_HOSTNAME) {
850 if (args->hostname == NULL || (strlen(args->hostname) >
851 MAXNETNAMELEN)) {
852 error = EINVAL;
853 goto errout;
855 svp->sv_hostnamelen = strlen(args->hostname) + 1;
856 svp->sv_hostname = args->hostname;
857 args->hostname = NULL;
858 } else {
859 char *p = "unknown-host";
860 svp->sv_hostnamelen = strlen(p) + 1;
861 svp->sv_hostname = kmem_zalloc(svp->sv_hostnamelen, KM_SLEEP);
862 (void) strcpy(svp->sv_hostname, p);
866 * RDMA MOUNT SUPPORT FOR NFS v4.
867 * Establish, is it possible to use RDMA, if so overload the
868 * knconf with rdma specific knconf and free the orignal knconf.
870 if ((flags & NFSMNT_TRYRDMA) || (flags & NFSMNT_DORDMA)) {
872 * Determine the addr type for RDMA, IPv4 or v6.
874 if (strcmp(svp->sv_knconf->knc_protofmly, NC_INET) == 0)
875 addr_type = AF_INET;
876 else if (strcmp(svp->sv_knconf->knc_protofmly, NC_INET6) == 0)
877 addr_type = AF_INET6;
879 if (rdma_reachable(addr_type, &svp->sv_addr,
880 &rdma_knconf) == 0) {
882 * If successful, hijack the orignal knconf and
883 * replace with the new one, depending on the flags.
885 svp->sv_origknconf = svp->sv_knconf;
886 svp->sv_knconf = rdma_knconf;
887 } else {
888 if (flags & NFSMNT_TRYRDMA) {
889 #ifdef DEBUG
890 if (rdma_debug)
891 zcmn_err(getzoneid(), CE_WARN,
892 "no RDMA onboard, revert\n");
893 #endif
896 if (flags & NFSMNT_DORDMA) {
898 * If proto=rdma is specified and no RDMA
899 * path to this server is avialable then
900 * ditch this server.
901 * This is not included in the mountable
902 * server list or the replica list.
903 * Check if more servers are specified;
904 * Failover case, otherwise bail out of mount.
906 if (args->nfs_args_ext == NFS_ARGS_EXTB &&
907 args->nfs_ext_u.nfs_extB.next != NULL) {
908 data = (char *)
909 args->nfs_ext_u.nfs_extB.next;
910 if (uap->flags & MS_RDONLY &&
911 !(flags & NFSMNT_SOFT)) {
912 if (svp_head->sv_next == NULL) {
913 svp_tail = NULL;
914 svp_2ndlast = NULL;
915 sv4_free(svp_head);
916 goto more;
917 } else {
918 svp_tail = svp_2ndlast;
919 svp_2ndlast->sv_next =
920 NULL;
921 sv4_free(svp);
922 goto more;
925 } else {
927 * This is the last server specified
928 * in the nfs_args list passed down
929 * and its not rdma capable.
931 if (svp_head->sv_next == NULL) {
933 * Is this the only one
935 error = EINVAL;
936 #ifdef DEBUG
937 if (rdma_debug)
938 zcmn_err(getzoneid(),
939 CE_WARN,
940 "No RDMA srv");
941 #endif
942 goto errout;
943 } else {
945 * There is list, since some
946 * servers specified before
947 * this passed all requirements
949 svp_tail = svp_2ndlast;
950 svp_2ndlast->sv_next = NULL;
951 sv4_free(svp);
952 goto proceed;
960 * If there are syncaddr and netname data, load them in. This is
961 * to support data needed for NFSV4 when AUTH_DH is the negotiated
962 * flavor via SECINFO. (instead of using MOUNT protocol in V3).
964 if (args->flags & NFSMNT_SECURE) {
965 svp->sv_dhsec = create_authdh_data(args->netname,
966 strlen(args->netname),
967 args->syncaddr, svp->sv_knconf);
971 * Get the extention data which has the security data structure.
972 * This includes data for AUTH_SYS as well.
974 if (flags & NFSMNT_NEWARGS) {
975 switch (args->nfs_args_ext) {
976 case NFS_ARGS_EXTA:
977 case NFS_ARGS_EXTB:
979 * Indicating the application is using the new
980 * sec_data structure to pass in the security
981 * data.
983 secdata = args->nfs_ext_u.nfs_extA.secdata;
984 if (secdata == NULL) {
985 error = EINVAL;
986 } else if (uap->flags & MS_SYSSPACE) {
988 * Need to validate the flavor here if
989 * sysspace, userspace was already
990 * validate from the nfs_copyin function.
992 switch (secdata->rpcflavor) {
993 case AUTH_NONE:
994 case AUTH_UNIX:
995 case AUTH_LOOPBACK:
996 case AUTH_DES:
997 case RPCSEC_GSS:
998 break;
999 default:
1000 error = EINVAL;
1001 goto errout;
1004 args->nfs_ext_u.nfs_extA.secdata = NULL;
1005 break;
1007 default:
1008 error = EINVAL;
1009 break;
1012 } else if (flags & NFSMNT_SECURE) {
1014 * NFSMNT_SECURE is deprecated but we keep it
1015 * to support the rogue user-generated application
1016 * that may use this undocumented interface to do
1017 * AUTH_DH security, e.g. our own rexd.
1019 * Also note that NFSMNT_SECURE is used for passing
1020 * AUTH_DH info to be used in negotiation.
1022 secdata = create_authdh_data(args->netname,
1023 strlen(args->netname), args->syncaddr, svp->sv_knconf);
1025 } else {
1026 secdata = kmem_alloc(sizeof (*secdata), KM_SLEEP);
1027 secdata->secmod = secdata->rpcflavor = AUTH_SYS;
1028 secdata->data = NULL;
1031 svp->sv_secdata = secdata;
1034 * User does not explictly specify a flavor, and a user
1035 * defined default flavor is passed down.
1037 if (flags & NFSMNT_SECDEFAULT) {
1038 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
1039 svp->sv_flags |= SV4_TRYSECDEFAULT;
1040 nfs_rw_exit(&svp->sv_lock);
1044 * Failover support:
1046 * We may have a linked list of nfs_args structures,
1047 * which means the user is looking for failover. If
1048 * the mount is either not "read-only" or "soft",
1049 * we want to bail out with EINVAL.
1051 if (args->nfs_args_ext == NFS_ARGS_EXTB &&
1052 args->nfs_ext_u.nfs_extB.next != NULL) {
1053 if (uap->flags & MS_RDONLY && !(flags & NFSMNT_SOFT)) {
1054 data = (char *)args->nfs_ext_u.nfs_extB.next;
1055 goto more;
1057 error = EINVAL;
1058 goto errout;
1062 * Determine the zone we're being mounted into.
1064 zone_hold(mntzone = zone); /* start with this assumption */
1065 if (getzoneid() == GLOBAL_ZONEID) {
1066 zone_rele(mntzone);
1067 mntzone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
1068 ASSERT(mntzone != NULL);
1069 if (mntzone != zone) {
1070 error = EBUSY;
1071 goto errout;
1076 * Stop the mount from going any further if the zone is going away.
1078 if (zone_status_get(mntzone) >= ZONE_IS_SHUTTING_DOWN) {
1079 error = EBUSY;
1080 goto errout;
1084 * Get root vnode.
1086 proceed:
1087 error = nfs4rootvp(&rtvp, vfsp, svp_head, flags, cr, mntzone);
1088 if (error) {
1089 /* if nfs4rootvp failed, it will free svp_head */
1090 svp_head = NULL;
1091 goto errout;
1094 mi = VTOMI4(rtvp);
1097 * Send client id to the server, if necessary
1099 nfs4_error_zinit(&n4e);
1100 nfs4setclientid(mi, cr, FALSE, &n4e);
1102 error = n4e.error;
1104 if (error)
1105 goto errout;
1108 * Set option fields in the mount info record
1111 if (svp_head->sv_next) {
1112 mutex_enter(&mi->mi_lock);
1113 mi->mi_flags |= MI4_LLOCK;
1114 mutex_exit(&mi->mi_lock);
1116 error = nfs4_setopts(rtvp, DATAMODEL_NATIVE, args);
1117 if (error)
1118 goto errout;
1121 * Time to tie in the mirror mount info at last!
1123 if (flags & NFSMNT_EPHEMERAL)
1124 error = nfs4_record_ephemeral_mount(mi, mvp);
1126 errout:
1127 if (error) {
1128 if (rtvp != NULL) {
1129 rp = VTOR4(rtvp);
1130 if (rp->r_flags & R4HASHED)
1131 rp4_rmhash(rp);
1133 if (mi != NULL) {
1134 nfs4_async_stop(vfsp);
1135 nfs4_async_manager_stop(vfsp);
1136 nfs4_remove_mi_from_server(mi, NULL);
1137 if (rtvp != NULL)
1138 VN_RELE(rtvp);
1139 if (mntzone != NULL)
1140 zone_rele(mntzone);
1141 /* need to remove it from the zone */
1142 removed = nfs4_mi_zonelist_remove(mi);
1143 if (removed)
1144 zone_rele_ref(&mi->mi_zone_ref,
1145 ZONE_REF_NFSV4);
1146 MI4_RELE(mi);
1147 if (!(uap->flags & MS_SYSSPACE) && args) {
1148 nfs4_free_args(args);
1149 kmem_free(args, sizeof (*args));
1151 return (error);
1153 if (svp_head)
1154 sv4_free(svp_head);
1157 if (!(uap->flags & MS_SYSSPACE) && args) {
1158 nfs4_free_args(args);
1159 kmem_free(args, sizeof (*args));
1161 if (rtvp != NULL)
1162 VN_RELE(rtvp);
1164 if (mntzone != NULL)
1165 zone_rele(mntzone);
1167 return (error);
/*
 * Warning message formats.  Each takes one %s argument and starts with
 * VERS_MSG, which names the protocol version only in DEBUG builds.
 */
#ifdef	DEBUG
#define	VERS_MSG	"NFS4 server "
#else
#define	VERS_MSG	"NFS server "
#endif

#define	READ_MSG	VERS_MSG "%s returned 0 for read transfer size"
#define	WRITE_MSG	VERS_MSG "%s returned 0 for write transfer size"
#define	SIZE_MSG	VERS_MSG "%s returned 0 for maximum file size"
1184 * Get the symbolic link text from the server for a given filehandle
1185 * of that symlink.
1187 * (get symlink text) PUTFH READLINK
1189 static int
1190 getlinktext_otw(mntinfo4_t *mi, nfs_fh4 *fh, char **linktextp, cred_t *cr,
1191 int flags)
1193 COMPOUND4args_clnt args;
1194 COMPOUND4res_clnt res;
1195 int doqueue;
1196 nfs_argop4 argop[2];
1197 nfs_resop4 *resop;
1198 READLINK4res *lr_res;
1199 uint_t len;
1200 bool_t needrecov = FALSE;
1201 nfs4_recov_state_t recov_state;
1202 nfs4_sharedfh_t *sfh;
1203 nfs4_error_t e;
1204 int num_retry = nfs4_max_mount_retry;
1205 int recovery = !(flags & NFS4_GETFH_NEEDSOP);
1207 sfh = sfh4_get(fh, mi);
1208 recov_state.rs_flags = 0;
1209 recov_state.rs_num_retry_despite_err = 0;
1211 recov_retry:
1212 nfs4_error_zinit(&e);
1214 args.array_len = 2;
1215 args.array = argop;
1216 args.ctag = TAG_GET_SYMLINK;
1218 if (! recovery) {
1219 e.error = nfs4_start_op(mi, NULL, NULL, &recov_state);
1220 if (e.error) {
1221 sfh4_rele(&sfh);
1222 return (e.error);
1226 /* 0. putfh symlink fh */
1227 argop[0].argop = OP_CPUTFH;
1228 argop[0].nfs_argop4_u.opcputfh.sfh = sfh;
1230 /* 1. readlink */
1231 argop[1].argop = OP_READLINK;
1233 doqueue = 1;
1235 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
1237 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
1239 if (needrecov && !recovery && num_retry-- > 0) {
1241 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1242 "getlinktext_otw: initiating recovery\n"));
1244 if (nfs4_start_recovery(&e, mi, NULL, NULL, NULL, NULL,
1245 OP_READLINK, NULL, NULL, NULL) == FALSE) {
1246 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
1247 if (!e.error)
1248 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1249 goto recov_retry;
1254 * If non-NFS4 pcol error and/or we weren't able to recover.
1256 if (e.error != 0) {
1257 if (! recovery)
1258 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
1259 sfh4_rele(&sfh);
1260 return (e.error);
1263 if (res.status) {
1264 e.error = geterrno4(res.status);
1265 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1266 if (! recovery)
1267 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
1268 sfh4_rele(&sfh);
1269 return (e.error);
1272 /* res.status == NFS4_OK */
1273 ASSERT(res.status == NFS4_OK);
1275 resop = &res.array[1]; /* readlink res */
1276 lr_res = &resop->nfs_resop4_u.opreadlink;
1278 /* treat symlink name as data */
1279 *linktextp = utf8_to_str((utf8string *)&lr_res->link, &len, NULL);
1281 if (! recovery)
1282 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
1283 sfh4_rele(&sfh);
1284 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1285 return (0);
1289 * Skip over consecutive slashes and "/./" in a pathname.
1291 void
1292 pathname_skipslashdot(struct pathname *pnp)
1294 char *c1, *c2;
1296 while (pnp->pn_pathlen > 0 && *pnp->pn_path == '/') {
1298 c1 = pnp->pn_path + 1;
1299 c2 = pnp->pn_path + 2;
1301 if (*c1 == '.' && (*c2 == '/' || *c2 == '\0')) {
1302 pnp->pn_path = pnp->pn_path + 2; /* skip "/." */
1303 pnp->pn_pathlen = pnp->pn_pathlen - 2;
1304 } else {
1305 pnp->pn_path++;
1306 pnp->pn_pathlen--;
1312 * Resolve a symbolic link path. The symlink is in the nth component of
1313 * svp->sv_path and has an nfs4 file handle "fh".
1314 * Upon return, the sv_path will point to the new path that has the nth
1315 * component resolved to its symlink text.
1318 resolve_sympath(mntinfo4_t *mi, servinfo4_t *svp, int nth, nfs_fh4 *fh,
1319 cred_t *cr, int flags)
1321 char *oldpath;
1322 char *symlink, *newpath;
1323 struct pathname oldpn, newpn;
1324 char component[MAXNAMELEN];
1325 int i, addlen, error = 0;
1326 int oldpathlen;
1328 /* Get the symbolic link text over the wire. */
1329 error = getlinktext_otw(mi, fh, &symlink, cr, flags);
1331 if (error || symlink == NULL || strlen(symlink) == 0)
1332 return (error);
1335 * Compose the new pathname.
1336 * Note:
1337 * - only the nth component is resolved for the pathname.
1338 * - pathname.pn_pathlen does not count the ending null byte.
1340 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1341 oldpath = svp->sv_path;
1342 oldpathlen = svp->sv_pathlen;
1343 if (error = pn_get(oldpath, UIO_SYSSPACE, &oldpn)) {
1344 nfs_rw_exit(&svp->sv_lock);
1345 kmem_free(symlink, strlen(symlink) + 1);
1346 return (error);
1348 nfs_rw_exit(&svp->sv_lock);
1349 pn_alloc(&newpn);
1352 * Skip over previous components from the oldpath so that the
1353 * oldpn.pn_path will point to the symlink component. Skip
1354 * leading slashes and "/./" (no OP_LOOKUP on ".") so that
1355 * pn_getcompnent can get the component.
1357 for (i = 1; i < nth; i++) {
1358 pathname_skipslashdot(&oldpn);
1359 error = pn_getcomponent(&oldpn, component);
1360 if (error)
1361 goto out;
1365 * Copy the old path upto the component right before the symlink
1366 * if the symlink is not an absolute path.
1368 if (symlink[0] != '/') {
1369 addlen = oldpn.pn_path - oldpn.pn_buf;
1370 bcopy(oldpn.pn_buf, newpn.pn_path, addlen);
1371 newpn.pn_pathlen += addlen;
1372 newpn.pn_path += addlen;
1373 newpn.pn_buf[newpn.pn_pathlen] = '/';
1374 newpn.pn_pathlen++;
1375 newpn.pn_path++;
1378 /* copy the resolved symbolic link text */
1379 addlen = strlen(symlink);
1380 if (newpn.pn_pathlen + addlen >= newpn.pn_bufsize) {
1381 error = ENAMETOOLONG;
1382 goto out;
1384 bcopy(symlink, newpn.pn_path, addlen);
1385 newpn.pn_pathlen += addlen;
1386 newpn.pn_path += addlen;
1389 * Check if there is any remaining path after the symlink component.
1390 * First, skip the symlink component.
1392 pathname_skipslashdot(&oldpn);
1393 if (error = pn_getcomponent(&oldpn, component))
1394 goto out;
1396 addlen = pn_pathleft(&oldpn); /* includes counting the slash */
1399 * Copy the remaining path to the new pathname if there is any.
1401 if (addlen > 0) {
1402 if (newpn.pn_pathlen + addlen >= newpn.pn_bufsize) {
1403 error = ENAMETOOLONG;
1404 goto out;
1406 bcopy(oldpn.pn_path, newpn.pn_path, addlen);
1407 newpn.pn_pathlen += addlen;
1409 newpn.pn_buf[newpn.pn_pathlen] = '\0';
1411 /* get the newpath and store it in the servinfo4_t */
1412 newpath = kmem_alloc(newpn.pn_pathlen + 1, KM_SLEEP);
1413 bcopy(newpn.pn_buf, newpath, newpn.pn_pathlen);
1414 newpath[newpn.pn_pathlen] = '\0';
1416 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
1417 svp->sv_path = newpath;
1418 svp->sv_pathlen = strlen(newpath) + 1;
1419 nfs_rw_exit(&svp->sv_lock);
1421 kmem_free(oldpath, oldpathlen);
1422 out:
1423 kmem_free(symlink, strlen(symlink) + 1);
1424 pn_free(&newpn);
1425 pn_free(&oldpn);
1427 return (error);
1431 * This routine updates servinfo4 structure with the new referred server
1432 * info.
1433 * nfsfsloc has the location related information
1434 * fsp has the hostname and pathname info.
1435 * new path = pathname from referral + part of orig pathname(based on nth).
1437 static void
1438 update_servinfo4(servinfo4_t *svp, fs_location4 *fsp,
1439 struct nfs_fsl_info *nfsfsloc, char *orig_path, int nth)
1441 struct knetconfig *knconf, *svknconf;
1442 struct netbuf *saddr;
1443 sec_data_t *secdata;
1444 utf8string *host;
1445 int i = 0, num_slashes = 0;
1446 char *p, *spath, *op, *new_path;
1448 /* Update knconf */
1449 knconf = svp->sv_knconf;
1450 free_knconf_contents(knconf);
1451 bzero(knconf, sizeof (struct knetconfig));
1452 svknconf = nfsfsloc->knconf;
1453 knconf->knc_semantics = svknconf->knc_semantics;
1454 knconf->knc_protofmly = kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
1455 knconf->knc_proto = kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
1456 knconf->knc_rdev = svknconf->knc_rdev;
1457 bcopy(svknconf->knc_protofmly, knconf->knc_protofmly, KNC_STRSIZE);
1458 bcopy(svknconf->knc_proto, knconf->knc_proto, KNC_STRSIZE);
1460 /* Update server address */
1461 saddr = &svp->sv_addr;
1462 if (saddr->buf != NULL)
1463 kmem_free(saddr->buf, saddr->maxlen);
1464 saddr->buf = kmem_alloc(nfsfsloc->addr->maxlen, KM_SLEEP);
1465 saddr->len = nfsfsloc->addr->len;
1466 saddr->maxlen = nfsfsloc->addr->maxlen;
1467 bcopy(nfsfsloc->addr->buf, saddr->buf, nfsfsloc->addr->len);
1469 /* Update server name */
1470 host = fsp->server_val;
1471 kmem_free(svp->sv_hostname, svp->sv_hostnamelen);
1472 svp->sv_hostname = kmem_zalloc(host->utf8string_len + 1, KM_SLEEP);
1473 bcopy(host->utf8string_val, svp->sv_hostname, host->utf8string_len);
1474 svp->sv_hostname[host->utf8string_len] = '\0';
1475 svp->sv_hostnamelen = host->utf8string_len + 1;
1478 * Update server path.
1479 * We need to setup proper path here.
1480 * For ex., If we got a path name serv1:/rp/aaa/bbb
1481 * where aaa is a referral and points to serv2:/rpool/aa
1482 * we need to set the path to serv2:/rpool/aa/bbb
1483 * The first part of this below code generates /rpool/aa
1484 * and the second part appends /bbb to the server path.
1486 spath = p = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
1487 *p++ = '/';
1488 for (i = 0; i < fsp->rootpath.pathname4_len; i++) {
1489 component4 *comp;
1491 comp = &fsp->rootpath.pathname4_val[i];
1492 /* If no space, null the string and bail */
1493 if ((p - spath) + comp->utf8string_len + 1 > MAXPATHLEN) {
1494 p = spath + MAXPATHLEN - 1;
1495 spath[0] = '\0';
1496 break;
1498 bcopy(comp->utf8string_val, p, comp->utf8string_len);
1499 p += comp->utf8string_len;
1500 *p++ = '/';
1502 if (fsp->rootpath.pathname4_len != 0)
1503 *(p - 1) = '\0';
1504 else
1505 *p = '\0';
1506 p = spath;
1508 new_path = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
1509 (void) strlcpy(new_path, p, MAXPATHLEN);
1510 kmem_free(p, MAXPATHLEN);
1511 i = strlen(new_path);
1513 for (op = orig_path; *op; op++) {
1514 if (*op == '/')
1515 num_slashes++;
1516 if (num_slashes == nth + 2) {
1517 while (*op != '\0') {
1518 new_path[i] = *op;
1519 i++;
1520 op++;
1522 break;
1525 new_path[i] = '\0';
1527 kmem_free(svp->sv_path, svp->sv_pathlen);
1528 svp->sv_pathlen = strlen(new_path) + 1;
1529 svp->sv_path = kmem_alloc(svp->sv_pathlen, KM_SLEEP);
1530 bcopy(new_path, svp->sv_path, svp->sv_pathlen);
1531 kmem_free(new_path, MAXPATHLEN);
1534 * All the security data is specific to old server.
1535 * Clean it up except secdata which deals with mount options.
1536 * We need to inherit that data. Copy secdata into our new servinfo4.
1538 if (svp->sv_dhsec) {
1539 sec_clnt_freeinfo(svp->sv_dhsec);
1540 svp->sv_dhsec = NULL;
1542 if (svp->sv_save_secinfo &&
1543 svp->sv_save_secinfo != svp->sv_secinfo) {
1544 secinfo_free(svp->sv_save_secinfo);
1545 svp->sv_save_secinfo = NULL;
1547 if (svp->sv_secinfo) {
1548 secinfo_free(svp->sv_secinfo);
1549 svp->sv_secinfo = NULL;
1551 svp->sv_currsec = NULL;
1553 secdata = kmem_alloc(sizeof (*secdata), KM_SLEEP);
1554 *secdata = *svp->sv_secdata;
1555 secdata->data = NULL;
1556 if (svp->sv_secdata) {
1557 sec_clnt_freeinfo(svp->sv_secdata);
1558 svp->sv_secdata = NULL;
1560 svp->sv_secdata = secdata;
1564 * Resolve a referral. The referral is in the n+1th component of
1565 * svp->sv_path and has a parent nfs4 file handle "fh".
1566 * Upon return, the sv_path will point to the new path that has referral
1567 * component resolved to its referred path and part of original path.
1568 * Hostname and other address information is also updated.
1571 resolve_referral(mntinfo4_t *mi, servinfo4_t *svp, cred_t *cr, int nth,
1572 nfs_fh4 *fh)
1574 nfs4_sharedfh_t *sfh;
1575 struct nfs_fsl_info nfsfsloc;
1576 nfs4_ga_res_t garp;
1577 COMPOUND4res_clnt callres;
1578 fs_location4 *fsp;
1579 char *nm, *orig_path;
1580 int orig_pathlen = 0, ret = -1, index;
1582 if (svp->sv_pathlen <= 0)
1583 return (ret);
1585 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
1586 orig_pathlen = svp->sv_pathlen;
1587 orig_path = kmem_alloc(orig_pathlen, KM_SLEEP);
1588 bcopy(svp->sv_path, orig_path, orig_pathlen);
1589 nm = extract_referral_point(svp->sv_path, nth);
1590 setup_newsvpath(svp, nth);
1591 nfs_rw_exit(&svp->sv_lock);
1593 sfh = sfh4_get(fh, mi);
1594 index = nfs4_process_referral(mi, sfh, nm, cr,
1595 &garp, &callres, &nfsfsloc);
1596 sfh4_rele(&sfh);
1597 kmem_free(nm, MAXPATHLEN);
1598 if (index < 0) {
1599 kmem_free(orig_path, orig_pathlen);
1600 return (index);
1603 fsp = &garp.n4g_ext_res->n4g_fslocations.locations_val[index];
1604 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
1605 update_servinfo4(svp, fsp, &nfsfsloc, orig_path, nth);
1606 nfs_rw_exit(&svp->sv_lock);
1608 mutex_enter(&mi->mi_lock);
1609 mi->mi_vfs_referral_loop_cnt++;
1610 mutex_exit(&mi->mi_lock);
1612 ret = 0;
1613 bad:
1614 /* Free up XDR memory allocated in nfs4_process_referral() */
1615 xdr_free(xdr_nfs_fsl_info, (char *)&nfsfsloc);
1616 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
1617 kmem_free(orig_path, orig_pathlen);
1619 return (ret);
1623 * Get the root filehandle for the given filesystem and server, and update
1624 * svp.
1626 * If NFS4_GETFH_NEEDSOP is set, then use nfs4_start_fop and nfs4_end_fop
1627 * to coordinate with recovery. Otherwise, the caller is assumed to be
1628 * the recovery thread or have already done a start_fop.
1630 * Errors are returned by the nfs4_error_t parameter.
/*
 * nfs4getfh_otw: send the mount compound for svp->sv_path and record the
 * results.  The compound starts with PUTROOTFH (or PUTPUBFH when
 * NFS4_GETFH_PUBLIC is set) and GETFH; the remaining operations are built
 * by nfs4lookup_setup().  On success the parent and object filehandles,
 * fsid and supported attributes are stored in svp, transfer sizes and
 * feature flags are updated in mi, and the object's vnode type is written
 * to *vtp.  A symlink or referral met along the way is resolved into
 * sv_path and the compound is sent again.  Failures are reported via ep.
 */
1632 static void
1633 nfs4getfh_otw(struct mntinfo4 *mi, servinfo4_t *svp, vtype_t *vtp,
1634 int flags, cred_t *cr, nfs4_error_t *ep)
1636 COMPOUND4args_clnt args;
1637 COMPOUND4res_clnt res;
1638 int doqueue = 1;
1639 nfs_argop4 *argop;
1640 nfs_resop4 *resop;
1641 nfs4_ga_res_t *garp;
1642 int num_argops;
1643 lookup4_param_t lookuparg;
1644 nfs_fh4 *tmpfhp;
1645 nfs_fh4 *resfhp;
1646 bool_t needrecov = FALSE;
1647 nfs4_recov_state_t recov_state;
1648 int llndx;
1649 int nthcomp;
1650 int recovery = !(flags & NFS4_GETFH_NEEDSOP);
/* An empty server path cannot be looked up: fail with EINVAL. */
1652 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1653 ASSERT(svp->sv_path != NULL);
1654 if (svp->sv_path[0] == '\0') {
1655 nfs_rw_exit(&svp->sv_lock);
1656 nfs4_error_init(ep, EINVAL);
1657 return;
1659 nfs_rw_exit(&svp->sv_lock);
1661 recov_state.rs_flags = 0;
1662 recov_state.rs_num_retry_despite_err = 0;
/*
 * Every pass (first try, retry after recovery, retry after a symlink or
 * referral was resolved) re-enters here.  Give up with EINVAL once
 * mi_vfs_referral_loop_cnt has reached NFS4_REFERRAL_LOOP_MAX.
 */
1664 recov_retry:
1665 if (mi->mi_vfs_referral_loop_cnt >= NFS4_REFERRAL_LOOP_MAX) {
1666 DTRACE_PROBE3(nfs4clnt__debug__referral__loop, mntinfo4 *,
1667 mi, servinfo4_t *, svp, char *, "nfs4getfh_otw");
1668 nfs4_error_init(ep, EINVAL);
1669 return;
1671 nfs4_error_zinit(ep);
1673 if (!recovery) {
1674 ep->error = nfs4_start_fop(mi, NULL, NULL, OH_MOUNT,
1675 &recov_state, NULL);
1678 * If recovery has been started and this request as
1679 * initiated by a mount, then we must wait for recovery
1680 * to finish before proceeding, otherwise, the error
1681 * cleanup would remove data structures needed by the
1682 * recovery thread.
1684 if (ep->error) {
1685 mutex_enter(&mi->mi_lock);
1686 if (mi->mi_flags & MI4_MOUNTING) {
1687 mi->mi_flags |= MI4_RECOV_FAIL;
1688 mi->mi_error = EIO;
1690 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1691 "nfs4getfh_otw: waiting 4 recovery\n"));
1693 while (mi->mi_flags & MI4_RECOV_ACTIV)
1694 cv_wait(&mi->mi_failover_cv,
1695 &mi->mi_lock);
1697 mutex_exit(&mi->mi_lock);
1698 return;
1702 * If the client does not specify a specific flavor to use
1703 * and has not gotten a secinfo list from the server yet,
1704 * retrieve the secinfo list from the server and use a
1705 * flavor from the list to mount.
1707 * If fail to get the secinfo list from the server, then
1708 * try the default flavor.
1710 if ((svp->sv_flags & SV4_TRYSECDEFAULT) &&
1711 svp->sv_secinfo == NULL) {
1712 (void) nfs4_secinfo_path(mi, cr, FALSE);
/* Tag the compound according to whether this function did the start_fop. */
1716 if (recovery)
1717 args.ctag = TAG_REMAP_MOUNT;
1718 else
1719 args.ctag = TAG_MOUNT;
/*
 * Describe the lookup to nfs4lookup_setup(): two header slots are
 * reserved and filled in below, no trailer, and all attributes plus the
 * fsinfo attribute bits are requested.
 */
1721 lookuparg.l4_getattrs = LKP4_ALL_ATTRIBUTES;
1722 lookuparg.argsp = &args;
1723 lookuparg.resp = &res;
1724 lookuparg.header_len = 2; /* Putrootfh, getfh */
1725 lookuparg.trailer_len = 0;
1726 lookuparg.ga_bits = FATTR4_FSINFO_MASK;
1727 lookuparg.mi = mi;
1729 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1730 ASSERT(svp->sv_path != NULL);
1731 llndx = nfs4lookup_setup(svp->sv_path, &lookuparg, 0);
1732 nfs_rw_exit(&svp->sv_lock);
/* The argument array now belongs to this function and is freed on every exit. */
1734 argop = args.array;
1735 num_argops = args.array_len;
1737 /* choose public or root filehandle */
1738 if (flags & NFS4_GETFH_PUBLIC)
1739 argop[0].argop = OP_PUTPUBFH;
1740 else
1741 argop[0].argop = OP_PUTROOTFH;
1743 /* get fh */
1744 argop[1].argop = OP_GETFH;
1746 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
1747 "nfs4getfh_otw: %s call, mi 0x%p",
1748 needrecov ? "recov" : "first", (void *)mi));
1750 rfs4call(mi, &args, &res, cr, &doqueue, RFSCALL_SOFT, ep);
1752 needrecov = nfs4_needs_recovery(ep, FALSE, mi->mi_vfsp);
1754 if (needrecov) {
1755 bool_t abort;
/* No start_fop was done here: free the call state and return what ep holds. */
1757 if (recovery) {
1758 nfs4args_lookup_free(argop, num_argops);
1759 kmem_free(argop,
1760 lookuparg.arglen * sizeof (nfs_argop4));
1761 if (!ep->error)
1762 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1763 return;
1766 NFS4_DEBUG(nfs4_client_recov_debug,
1767 (CE_NOTE, "nfs4getfh_otw: initiating recovery\n"));
1769 abort = nfs4_start_recovery(ep, mi, NULL,
1770 NULL, NULL, NULL, OP_GETFH, NULL, NULL, NULL);
/* With no RPC-level error, report the NFS status as an errno before freeing res. */
1771 if (!ep->error) {
1772 ep->error = geterrno4(res.status);
1773 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1775 nfs4args_lookup_free(argop, num_argops);
1776 kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));
1777 nfs4_end_fop(mi, NULL, NULL, OH_MOUNT, &recov_state, needrecov);
1778 /* have another go? */
1779 if (abort == FALSE)
1780 goto recov_retry;
1781 return;
1785 * No recovery, but check if error is set.
1787 if (ep->error) {
1788 nfs4args_lookup_free(argop, num_argops);
1789 kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));
1790 if (!recovery)
1791 nfs4_end_fop(mi, NULL, NULL, OH_MOUNT, &recov_state,
1792 needrecov);
1793 return;
/* NOTE(review): no goto to is_link_err is visible in this function. */
1796 is_link_err:
1798 /* for non-recovery errors */
1799 if (res.status && res.status != NFS4ERR_SYMLINK &&
1800 res.status != NFS4ERR_MOVED) {
1801 if (!recovery) {
1802 nfs4_end_fop(mi, NULL, NULL, OH_MOUNT, &recov_state,
1803 needrecov);
1805 nfs4args_lookup_free(argop, num_argops);
1806 kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));
1807 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1808 return;
1812 * If any intermediate component in the path is a symbolic link,
1813 * resolve the symlink, then try mount again using the new path.
1815 if (res.status == NFS4ERR_SYMLINK || res.status == NFS4ERR_MOVED) {
1816 int where;
1819 * Need to call nfs4_end_op before resolve_sympath to avoid
1820 * potential nfs4_start_op deadlock.
1822 if (!recovery)
1823 nfs4_end_fop(mi, NULL, NULL, OH_MOUNT, &recov_state,
1824 needrecov);
1827 * This must be from OP_LOOKUP failure. The (cfh) for this
1828 * OP_LOOKUP is a symlink node. Found out where the
1829 * OP_GETFH is for the (cfh) that is a symlink node.
1831 * Example:
1832 * (mount) PUTROOTFH, GETFH, LOOKUP comp1, GETFH, GETATTR,
1833 * LOOKUP comp2, GETFH, GETATTR, LOOKUP comp3, GETFH, GETATTR
1835 * LOOKUP comp3 fails with SYMLINK because comp2 is a symlink.
1836 * In this case, where = 7, nthcomp = 2.
1838 where = res.array_len - 2;
1839 ASSERT(where > 0);
1841 if (res.status == NFS4ERR_SYMLINK) {
1843 resop = &res.array[where - 1];
1844 ASSERT(resop->resop == OP_GETFH);
1845 tmpfhp = &resop->nfs_resop4_u.opgetfh.object;
1846 nthcomp = res.array_len/3 - 1;
1847 ep->error = resolve_sympath(mi, svp, nthcomp,
1848 tmpfhp, cr, flags);
1850 } else if (res.status == NFS4ERR_MOVED) {
1852 resop = &res.array[where - 2];
1853 ASSERT(resop->resop == OP_GETFH);
1854 tmpfhp = &resop->nfs_resop4_u.opgetfh.object;
1855 nthcomp = res.array_len/3 - 1;
1856 ep->error = resolve_referral(mi, svp, cr, nthcomp,
1857 tmpfhp);
/* tmpfhp points into res, so res is only freed after the resolve call. */
1860 nfs4args_lookup_free(argop, num_argops);
1861 kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));
1862 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1864 if (ep->error)
1865 return;
1867 goto recov_retry;
1870 /* getfh */
1871 resop = &res.array[res.array_len - 2];
1872 ASSERT(resop->resop == OP_GETFH);
1873 resfhp = &resop->nfs_resop4_u.opgetfh.object;
1875 /* getattr fsinfo res */
1876 resop++;
1877 garp = &resop->nfs_resop4_u.opgetattr.ga_res;
1879 *vtp = garp->n4g_va.va_type;
1881 mi->mi_fh_expire_type = garp->n4g_ext_res->n4g_fet;
/* Record what the server supports (hard links, symlinks, ACLs) in mi_flags. */
1883 mutex_enter(&mi->mi_lock);
1884 if (garp->n4g_ext_res->n4g_pc4.pc4_link_support)
1885 mi->mi_flags |= MI4_LINK;
1886 if (garp->n4g_ext_res->n4g_pc4.pc4_symlink_support)
1887 mi->mi_flags |= MI4_SYMLINK;
1888 if (garp->n4g_ext_res->n4g_suppattrs & FATTR4_ACL_MASK)
1889 mi->mi_flags |= MI4_ACL;
1890 mutex_exit(&mi->mi_lock);
/*
 * Never raise the transfer sizes: take the smaller of the current value
 * and the server's maximum, treating a server value of 0 as MAXBSIZE.
 */
1892 if (garp->n4g_ext_res->n4g_maxread == 0)
1893 mi->mi_tsize =
1894 MIN(MAXBSIZE, mi->mi_tsize);
1895 else
1896 mi->mi_tsize =
1897 MIN(garp->n4g_ext_res->n4g_maxread,
1898 mi->mi_tsize);
1900 if (garp->n4g_ext_res->n4g_maxwrite == 0)
1901 mi->mi_stsize =
1902 MIN(MAXBSIZE, mi->mi_stsize);
1903 else
1904 mi->mi_stsize =
1905 MIN(garp->n4g_ext_res->n4g_maxwrite,
1906 mi->mi_stsize);
/* A maximum file size of 0 from the server leaves mi_maxfilesize unchanged. */
1908 if (garp->n4g_ext_res->n4g_maxfilesize != 0)
1909 mi->mi_maxfilesize =
1910 MIN(garp->n4g_ext_res->n4g_maxfilesize,
1911 mi->mi_maxfilesize);
1914 * If the final component is a a symbolic link, resolve the symlink,
1915 * then try mount again using the new path.
1917 * Assume no symbolic link for root filesysm "/".
1919 if (*vtp == VLNK) {
1921 * nthcomp is the total result length minus
1922 * the 1st 2 OPs (PUTROOTFH, GETFH),
1923 * then divided by 3 (LOOKUP,GETFH,GETATTR)
1925 * e.g. PUTROOTFH GETFH LOOKUP 1st-comp GETFH GETATTR
1926 * LOOKUP 2nd-comp GETFH GETATTR
1928 * (8 - 2)/3 = 2
1930 nthcomp = (res.array_len - 2)/3;
1933 * Need to call nfs4_end_op before resolve_sympath to avoid
1934 * potential nfs4_start_op deadlock. See RFE 4777612.
1936 if (!recovery)
1937 nfs4_end_fop(mi, NULL, NULL, OH_MOUNT, &recov_state,
1938 needrecov);
1940 ep->error = resolve_sympath(mi, svp, nthcomp, resfhp, cr,
1941 flags);
1943 nfs4args_lookup_free(argop, num_argops);
1944 kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));
1945 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1947 if (ep->error)
1948 return;
1950 goto recov_retry;
1954 * We need to figure out where in the compound the getfh
1955 * for the parent directory is. If the object to be mounted is
1956 * the root, then there is no lookup at all:
1957 * PUTROOTFH, GETFH.
1958 * If the object to be mounted is in the root, then the compound is:
1959 * PUTROOTFH, GETFH, LOOKUP, GETFH, GETATTR.
1960 * In either of these cases, the index of the GETFH is 1.
1961 * If it is not at the root, then it's something like:
1962 * PUTROOTFH, GETFH, LOOKUP, GETFH, GETATTR,
1963 * LOOKUP, GETFH, GETATTR
1964 * In this case, the index is llndx (last lookup index) - 2.
1966 if (llndx == -1 || llndx == 2)
1967 resop = &res.array[1];
1968 else {
1969 ASSERT(llndx > 2);
1970 resop = &res.array[llndx-2];
1973 ASSERT(resop->resop == OP_GETFH);
1974 tmpfhp = &resop->nfs_resop4_u.opgetfh.object;
1976 /* save the filehandles for the replica */
1977 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
1978 ASSERT(tmpfhp->nfs_fh4_len <= NFS4_FHSIZE);
1979 svp->sv_pfhandle.fh_len = tmpfhp->nfs_fh4_len;
1980 bcopy(tmpfhp->nfs_fh4_val, svp->sv_pfhandle.fh_buf,
1981 tmpfhp->nfs_fh4_len);
1982 ASSERT(resfhp->nfs_fh4_len <= NFS4_FHSIZE);
1983 svp->sv_fhandle.fh_len = resfhp->nfs_fh4_len;
1984 bcopy(resfhp->nfs_fh4_val, svp->sv_fhandle.fh_buf, resfhp->nfs_fh4_len);
1986 /* initialize fsid and supp_attrs for server fs */
1987 svp->sv_fsid = garp->n4g_fsid;
1988 svp->sv_supp_attrs =
1989 garp->n4g_ext_res->n4g_suppattrs | FATTR4_MANDATTR_MASK;
1991 nfs_rw_exit(&svp->sv_lock);
/* Success: release the argument array and results, then end the fop if one was started here. */
1992 nfs4args_lookup_free(argop, num_argops);
1993 kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));
1994 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1995 if (!recovery)
1996 nfs4_end_fop(mi, NULL, NULL, OH_MOUNT, &recov_state, needrecov);
2000 * Save a copy of Servinfo4_t structure.
2001 * We might need when there is a failure in getting file handle
2002 * in case of a referral to replace servinfo4 struct and try again.
2004 static struct servinfo4 *
2005 copy_svp(servinfo4_t *nsvp)
2007 servinfo4_t *svp = NULL;
2008 struct knetconfig *sknconf, *tknconf;
2009 struct netbuf *saddr, *taddr;
2011 svp = kmem_zalloc(sizeof (*svp), KM_SLEEP);
2012 nfs_rw_init(&svp->sv_lock, NULL, RW_DEFAULT, NULL);
2013 svp->sv_flags = nsvp->sv_flags;
2014 svp->sv_fsid = nsvp->sv_fsid;
2015 svp->sv_hostnamelen = nsvp->sv_hostnamelen;
2016 svp->sv_pathlen = nsvp->sv_pathlen;
2017 svp->sv_supp_attrs = nsvp->sv_supp_attrs;
2019 svp->sv_path = kmem_alloc(svp->sv_pathlen, KM_SLEEP);
2020 svp->sv_hostname = kmem_alloc(svp->sv_hostnamelen, KM_SLEEP);
2021 bcopy(nsvp->sv_hostname, svp->sv_hostname, svp->sv_hostnamelen);
2022 bcopy(nsvp->sv_path, svp->sv_path, svp->sv_pathlen);
2024 saddr = &nsvp->sv_addr;
2025 taddr = &svp->sv_addr;
2026 taddr->maxlen = saddr->maxlen;
2027 taddr->len = saddr->len;
2028 if (saddr->len > 0) {
2029 taddr->buf = kmem_zalloc(saddr->maxlen, KM_SLEEP);
2030 bcopy(saddr->buf, taddr->buf, saddr->len);
2033 svp->sv_knconf = kmem_zalloc(sizeof (struct knetconfig), KM_SLEEP);
2034 sknconf = nsvp->sv_knconf;
2035 tknconf = svp->sv_knconf;
2036 tknconf->knc_semantics = sknconf->knc_semantics;
2037 tknconf->knc_rdev = sknconf->knc_rdev;
2038 if (sknconf->knc_proto != NULL) {
2039 tknconf->knc_proto = kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
2040 bcopy(sknconf->knc_proto, (char *)tknconf->knc_proto,
2041 KNC_STRSIZE);
2043 if (sknconf->knc_protofmly != NULL) {
2044 tknconf->knc_protofmly = kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
2045 bcopy(sknconf->knc_protofmly, (char *)tknconf->knc_protofmly,
2046 KNC_STRSIZE);
2049 if (nsvp->sv_origknconf != NULL) {
2050 svp->sv_origknconf = kmem_zalloc(sizeof (struct knetconfig),
2051 KM_SLEEP);
2052 sknconf = nsvp->sv_origknconf;
2053 tknconf = svp->sv_origknconf;
2054 tknconf->knc_semantics = sknconf->knc_semantics;
2055 tknconf->knc_rdev = sknconf->knc_rdev;
2056 if (sknconf->knc_proto != NULL) {
2057 tknconf->knc_proto = kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
2058 bcopy(sknconf->knc_proto, (char *)tknconf->knc_proto,
2059 KNC_STRSIZE);
2061 if (sknconf->knc_protofmly != NULL) {
2062 tknconf->knc_protofmly = kmem_zalloc(KNC_STRSIZE,
2063 KM_SLEEP);
2064 bcopy(sknconf->knc_protofmly,
2065 (char *)tknconf->knc_protofmly, KNC_STRSIZE);
2069 svp->sv_secdata = copy_sec_data(nsvp->sv_secdata);
2070 svp->sv_dhsec = copy_sec_data(svp->sv_dhsec);
2072 * Rest of the security information is not copied as they are built
2073 * with the information available from secdata and dhsec.
2075 svp->sv_next = NULL;
2077 return (svp);
2080 servinfo4_t *
2081 restore_svp(mntinfo4_t *mi, servinfo4_t *svp, servinfo4_t *origsvp)
2083 servinfo4_t *srvnext, *tmpsrv;
2085 if (strcmp(svp->sv_hostname, origsvp->sv_hostname) != 0) {
2087 * Since the hostname changed, we must be dealing
2088 * with a referral, and the lookup failed. We will
2089 * restore the whole servinfo4_t to what it was before.
2091 srvnext = svp->sv_next;
2092 svp->sv_next = NULL;
2093 tmpsrv = copy_svp(origsvp);
2094 sv4_free(svp);
2095 svp = tmpsrv;
2096 svp->sv_next = srvnext;
2097 mutex_enter(&mi->mi_lock);
2098 mi->mi_servers = svp;
2099 mi->mi_curr_serv = svp;
2100 mutex_exit(&mi->mi_lock);
2102 } else if (origsvp->sv_pathlen != svp->sv_pathlen) {
2105 * For symlink case: restore original path because
2106 * it might have contained symlinks that were
2107 * expanded by nfsgetfh_otw before the failure occurred.
2109 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
2110 kmem_free(svp->sv_path, svp->sv_pathlen);
2111 svp->sv_path =
2112 kmem_alloc(origsvp->sv_pathlen, KM_SLEEP);
2113 svp->sv_pathlen = origsvp->sv_pathlen;
2114 bcopy(origsvp->sv_path, svp->sv_path,
2115 origsvp->sv_pathlen);
2116 nfs_rw_exit(&svp->sv_lock);
2118 return (svp);
2121 static ushort_t nfs4_max_threads = 8; /* max number of active async threads */
2122 uint_t nfs4_bsize = 32 * 1024; /* client `block' size */
2123 static uint_t nfs4_async_clusters = 1; /* # of reqs from each async queue */
2124 static uint_t nfs4_cots_timeo = NFS_COTS_TIMEO;
2127 * Remap the root filehandle for the given filesystem.
2129 * results returned via the nfs4_error_t parameter.
2131 void
2132 nfs4_remap_root(mntinfo4_t *mi, nfs4_error_t *ep, int flags)
2134 struct servinfo4 *svp, *origsvp;
2135 vtype_t vtype;
2136 nfs_fh4 rootfh;
2137 int getfh_flags;
2138 int num_retry;
2140 mutex_enter(&mi->mi_lock);
2142 remap_retry:
2143 svp = mi->mi_curr_serv;
2144 getfh_flags =
2145 (flags & NFS4_REMAP_NEEDSOP) ? NFS4_GETFH_NEEDSOP : 0;
2146 getfh_flags |=
2147 (mi->mi_flags & MI4_PUBLIC) ? NFS4_GETFH_PUBLIC : 0;
2148 mutex_exit(&mi->mi_lock);
2151 * Just in case server path being mounted contains
2152 * symlinks and fails w/STALE, save the initial sv_path
2153 * so we can redrive the initial mount compound with the
2154 * initial sv_path -- not a symlink-expanded version.
2156 * This could only happen if a symlink was expanded
2157 * and the expanded mount compound failed stale. Because
2158 * it could be the case that the symlink was removed at
2159 * the server (and replaced with another symlink/dir,
2160 * we need to use the initial sv_path when attempting
2161 * to re-lookup everything and recover.
2163 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
2164 origsvp = copy_svp(svp);
2165 nfs_rw_exit(&svp->sv_lock);
2167 num_retry = nfs4_max_mount_retry;
2169 do {
2171 * Get the root fh from the server. Retry nfs4_max_mount_retry
2172 * (2) times if it fails with STALE since the recovery
2173 * infrastructure doesn't do STALE recovery for components
2174 * of the server path to the object being mounted.
2176 nfs4getfh_otw(mi, svp, &vtype, getfh_flags, CRED(), ep);
2178 if (ep->error == 0 && ep->stat == NFS4_OK)
2179 break;
2182 * For some reason, the mount compound failed. Before
2183 * retrying, we need to restore original conditions.
2185 svp = restore_svp(mi, svp, origsvp);
2187 } while (num_retry-- > 0);
2189 sv4_free(origsvp);
2191 if (ep->error != 0 || ep->stat != 0) {
2192 return;
2195 if (vtype != VNON && vtype != mi->mi_type) {
2196 /* shouldn't happen */
2197 zcmn_err(mi->mi_zone->zone_id, CE_WARN,
2198 "nfs4_remap_root: server root vnode type (%d) doesn't "
2199 "match mount info (%d)", vtype, mi->mi_type);
2202 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
2203 rootfh.nfs_fh4_val = svp->sv_fhandle.fh_buf;
2204 rootfh.nfs_fh4_len = svp->sv_fhandle.fh_len;
2205 nfs_rw_exit(&svp->sv_lock);
2206 sfh4_update(mi->mi_rootfh, &rootfh);
2209 * It's possible that recovery took place on the filesystem
2210 * and the server has been updated between the time we did
2211 * the nfs4getfh_otw and now. Re-drive the otw operation
2212 * to make sure we have a good fh.
2214 mutex_enter(&mi->mi_lock);
2215 if (mi->mi_curr_serv != svp)
2216 goto remap_retry;
2218 mutex_exit(&mi->mi_lock);
2221 static int
2222 nfs4rootvp(vnode_t **rtvpp, vfs_t *vfsp, struct servinfo4 *svp_head,
2223 int flags, cred_t *cr, zone_t *zone)
2225 vnode_t *rtvp = NULL;
2226 mntinfo4_t *mi;
2227 dev_t nfs_dev;
2228 int error = 0;
2229 rnode4_t *rp;
2230 int i, len;
2231 struct vattr va;
2232 vtype_t vtype = VNON;
2233 vtype_t tmp_vtype = VNON;
2234 struct servinfo4 *firstsvp = NULL, *svp = svp_head;
2235 nfs4_oo_hash_bucket_t *bucketp;
2236 nfs_fh4 fh;
2237 char *droptext = "";
2238 struct nfs_stats *nfsstatsp;
2239 nfs4_fname_t *mfname;
2240 nfs4_error_t e;
2241 int num_retry, removed;
2242 cred_t *lcr = NULL, *tcr = cr;
2243 struct servinfo4 *origsvp;
2244 char *resource;
2246 nfsstatsp = zone_getspecific(nfsstat_zone_key, nfs_zone());
2247 ASSERT(nfsstatsp != NULL);
2249 ASSERT(nfs_zone() == zone);
2250 ASSERT(crgetref(cr));
2253 * Create a mount record and link it to the vfs struct.
2255 mi = kmem_zalloc(sizeof (*mi), KM_SLEEP);
2256 mutex_init(&mi->mi_lock, NULL, MUTEX_DEFAULT, NULL);
2257 nfs_rw_init(&mi->mi_recovlock, NULL, RW_DEFAULT, NULL);
2258 nfs_rw_init(&mi->mi_rename_lock, NULL, RW_DEFAULT, NULL);
2259 nfs_rw_init(&mi->mi_fh_lock, NULL, RW_DEFAULT, NULL);
2261 if (!(flags & NFSMNT_SOFT))
2262 mi->mi_flags |= MI4_HARD;
2263 if ((flags & NFSMNT_NOPRINT))
2264 mi->mi_flags |= MI4_NOPRINT;
2265 if (flags & NFSMNT_INT)
2266 mi->mi_flags |= MI4_INT;
2267 if (flags & NFSMNT_PUBLIC)
2268 mi->mi_flags |= MI4_PUBLIC;
2269 if (flags & NFSMNT_MIRRORMOUNT)
2270 mi->mi_flags |= MI4_MIRRORMOUNT;
2271 if (flags & NFSMNT_REFERRAL)
2272 mi->mi_flags |= MI4_REFERRAL;
2273 mi->mi_retrans = NFS_RETRIES;
2274 if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
2275 svp->sv_knconf->knc_semantics == NC_TPI_COTS)
2276 mi->mi_timeo = nfs4_cots_timeo;
2277 else
2278 mi->mi_timeo = NFS_TIMEO;
2279 mi->mi_prog = NFS_PROGRAM;
2280 mi->mi_vers = NFS_V4;
2281 mi->mi_rfsnames = rfsnames_v4;
2282 mi->mi_reqs = nfsstatsp->nfs_stats_v4.rfsreqcnt_ptr;
2283 cv_init(&mi->mi_failover_cv, NULL, CV_DEFAULT, NULL);
2284 mi->mi_servers = svp;
2285 mi->mi_curr_serv = svp;
2286 mi->mi_acregmin = SEC2HR(ACREGMIN);
2287 mi->mi_acregmax = SEC2HR(ACREGMAX);
2288 mi->mi_acdirmin = SEC2HR(ACDIRMIN);
2289 mi->mi_acdirmax = SEC2HR(ACDIRMAX);
2290 mi->mi_fh_expire_type = FH4_PERSISTENT;
2291 mi->mi_clientid_next = NULL;
2292 mi->mi_clientid_prev = NULL;
2293 mi->mi_srv = NULL;
2294 mi->mi_grace_wait = 0;
2295 mi->mi_error = 0;
2296 mi->mi_srvsettime = 0;
2297 mi->mi_srvset_cnt = 0;
2299 mi->mi_count = 1;
2301 mi->mi_tsize = nfs4_tsize(svp->sv_knconf);
2302 mi->mi_stsize = mi->mi_tsize;
2304 if (flags & NFSMNT_DIRECTIO)
2305 mi->mi_flags |= MI4_DIRECTIO;
2307 mi->mi_flags |= MI4_MOUNTING;
2310 * Make a vfs struct for nfs. We do this here instead of below
2311 * because rtvp needs a vfs before we can do a getattr on it.
2313 * Assign a unique device id to the mount
2315 mutex_enter(&nfs_minor_lock);
2316 do {
2317 nfs_minor = (nfs_minor + 1) & MAXMIN32;
2318 nfs_dev = makedevice(nfs_major, nfs_minor);
2319 } while (vfs_devismounted(nfs_dev));
2320 mutex_exit(&nfs_minor_lock);
2322 vfsp->vfs_dev = nfs_dev;
2323 vfs_make_fsid(&vfsp->vfs_fsid, nfs_dev, nfs4fstyp);
2324 vfsp->vfs_data = (caddr_t)mi;
2325 vfsp->vfs_fstype = nfsfstyp;
2326 vfsp->vfs_bsize = nfs4_bsize;
2329 * Initialize fields used to support async putpage operations.
2331 for (i = 0; i < NFS4_ASYNC_TYPES; i++)
2332 mi->mi_async_clusters[i] = nfs4_async_clusters;
2333 mi->mi_async_init_clusters = nfs4_async_clusters;
2334 mi->mi_async_curr[NFS4_ASYNC_QUEUE] =
2335 mi->mi_async_curr[NFS4_ASYNC_PGOPS_QUEUE] = &mi->mi_async_reqs[0];
2336 mi->mi_max_threads = nfs4_max_threads;
2337 mutex_init(&mi->mi_async_lock, NULL, MUTEX_DEFAULT, NULL);
2338 cv_init(&mi->mi_async_reqs_cv, NULL, CV_DEFAULT, NULL);
2339 cv_init(&mi->mi_async_work_cv[NFS4_ASYNC_QUEUE], NULL, CV_DEFAULT,
2340 NULL);
2341 cv_init(&mi->mi_async_work_cv[NFS4_ASYNC_PGOPS_QUEUE], NULL,
2342 CV_DEFAULT, NULL);
2343 cv_init(&mi->mi_async_cv, NULL, CV_DEFAULT, NULL);
2344 cv_init(&mi->mi_inact_req_cv, NULL, CV_DEFAULT, NULL);
2346 mi->mi_vfsp = vfsp;
2347 mi->mi_zone = zone;
2348 zone_init_ref(&mi->mi_zone_ref);
2349 zone_hold_ref(zone, &mi->mi_zone_ref, ZONE_REF_NFSV4);
2350 nfs4_mi_zonelist_add(mi);
2353 * Initialize the <open owner/cred> hash table.
2355 for (i = 0; i < NFS4_NUM_OO_BUCKETS; i++) {
2356 bucketp = &(mi->mi_oo_list[i]);
2357 mutex_init(&bucketp->b_lock, NULL, MUTEX_DEFAULT, NULL);
2358 list_create(&bucketp->b_oo_hash_list,
2359 sizeof (nfs4_open_owner_t),
2360 offsetof(nfs4_open_owner_t, oo_hash_node));
2364 * Initialize the freed open owner list.
2366 mi->mi_foo_num = 0;
2367 mi->mi_foo_max = NFS4_NUM_FREED_OPEN_OWNERS;
2368 list_create(&mi->mi_foo_list, sizeof (nfs4_open_owner_t),
2369 offsetof(nfs4_open_owner_t, oo_foo_node));
2371 list_create(&mi->mi_lost_state, sizeof (nfs4_lost_rqst_t),
2372 offsetof(nfs4_lost_rqst_t, lr_node));
2374 list_create(&mi->mi_bseqid_list, sizeof (nfs4_bseqid_entry_t),
2375 offsetof(nfs4_bseqid_entry_t, bs_node));
2378 * Initialize the msg buffer.
2380 list_create(&mi->mi_msg_list, sizeof (nfs4_debug_msg_t),
2381 offsetof(nfs4_debug_msg_t, msg_node));
2382 mi->mi_msg_count = 0;
2383 mutex_init(&mi->mi_msg_list_lock, NULL, MUTEX_DEFAULT, NULL);
2386 * Initialize kstats
2388 nfs4_mnt_kstat_init(vfsp);
2391 * Initialize the shared filehandle pool.
2393 sfh4_createtab(&mi->mi_filehandles);
2396 * Save server path we're attempting to mount.
2398 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
2399 origsvp = copy_svp(svp);
2400 nfs_rw_exit(&svp->sv_lock);
2403 * Make the GETFH call to get root fh for each replica.
2405 if (svp_head->sv_next)
2406 droptext = ", dropping replica";
2409 * If the uid is set then set the creds for secure mounts
2410 * by proxy processes such as automountd.
2412 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
2413 if (svp->sv_secdata->uid != 0 &&
2414 svp->sv_secdata->rpcflavor == RPCSEC_GSS) {
2415 lcr = crdup(cr);
2416 (void) crsetugid(lcr, svp->sv_secdata->uid, crgetgid(cr));
2417 tcr = lcr;
2419 nfs_rw_exit(&svp->sv_lock);
2420 for (svp = svp_head; svp; svp = svp->sv_next) {
2421 if (nfs4_chkdup_servinfo4(svp_head, svp)) {
2422 nfs_cmn_err(error, CE_WARN,
2423 VERS_MSG "Host %s is a duplicate%s",
2424 svp->sv_hostname, droptext);
2425 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
2426 svp->sv_flags |= SV4_NOTINUSE;
2427 nfs_rw_exit(&svp->sv_lock);
2428 continue;
2430 mi->mi_curr_serv = svp;
2433 * Just in case server path being mounted contains
2434 * symlinks and fails w/STALE, save the initial sv_path
2435 * so we can redrive the initial mount compound with the
2436 * initial sv_path -- not a symlink-expanded version.
2438 * This could only happen if a symlink was expanded
2439 * and the expanded mount compound failed stale. Because
2440 * it could be the case that the symlink was removed at
2441 * the server (and replaced with another symlink/dir,
2442 * we need to use the initial sv_path when attempting
2443 * to re-lookup everything and recover.
2445 * Other mount errors should eventually be handled here also
2446 * (NFS4ERR_DELAY, NFS4ERR_RESOURCE). For now, all mount
2447 * failures will result in mount being redriven a few times.
2449 num_retry = nfs4_max_mount_retry;
2450 do {
2451 nfs4getfh_otw(mi, svp, &tmp_vtype,
2452 ((flags & NFSMNT_PUBLIC) ? NFS4_GETFH_PUBLIC : 0) |
2453 NFS4_GETFH_NEEDSOP, tcr, &e);
2455 if (e.error == 0 && e.stat == NFS4_OK)
2456 break;
2459 * For some reason, the mount compound failed. Before
2460 * retrying, we need to restore original conditions.
2462 svp = restore_svp(mi, svp, origsvp);
2463 svp_head = svp;
2465 } while (num_retry-- > 0);
2466 error = e.error ? e.error : geterrno4(e.stat);
2467 if (error) {
2468 nfs_cmn_err(error, CE_WARN,
2469 VERS_MSG "initial call to %s failed%s: %m",
2470 svp->sv_hostname, droptext);
2471 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
2472 svp->sv_flags |= SV4_NOTINUSE;
2473 nfs_rw_exit(&svp->sv_lock);
2474 mi->mi_flags &= ~MI4_RECOV_FAIL;
2475 mi->mi_error = 0;
2476 continue;
2479 if (tmp_vtype == VBAD) {
2480 zcmn_err(mi->mi_zone->zone_id, CE_WARN,
2481 VERS_MSG "%s returned a bad file type for "
2482 "root%s", svp->sv_hostname, droptext);
2483 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
2484 svp->sv_flags |= SV4_NOTINUSE;
2485 nfs_rw_exit(&svp->sv_lock);
2486 continue;
2489 if (vtype == VNON) {
2490 vtype = tmp_vtype;
2491 } else if (vtype != tmp_vtype) {
2492 zcmn_err(mi->mi_zone->zone_id, CE_WARN,
2493 VERS_MSG "%s returned a different file type "
2494 "for root%s", svp->sv_hostname, droptext);
2495 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
2496 svp->sv_flags |= SV4_NOTINUSE;
2497 nfs_rw_exit(&svp->sv_lock);
2498 continue;
2500 if (firstsvp == NULL)
2501 firstsvp = svp;
2504 if (firstsvp == NULL) {
2505 if (error == 0)
2506 error = ENOENT;
2507 goto bad;
2510 mi->mi_curr_serv = svp = firstsvp;
2511 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
2512 ASSERT((mi->mi_curr_serv->sv_flags & SV4_NOTINUSE) == 0);
2513 fh.nfs_fh4_len = svp->sv_fhandle.fh_len;
2514 fh.nfs_fh4_val = svp->sv_fhandle.fh_buf;
2515 mi->mi_rootfh = sfh4_get(&fh, mi);
2516 fh.nfs_fh4_len = svp->sv_pfhandle.fh_len;
2517 fh.nfs_fh4_val = svp->sv_pfhandle.fh_buf;
2518 mi->mi_srvparentfh = sfh4_get(&fh, mi);
2519 nfs_rw_exit(&svp->sv_lock);
2522 * Get the fname for filesystem root.
2524 mi->mi_fname = fn_get(NULL, ".", mi->mi_rootfh);
2525 mfname = mi->mi_fname;
2526 fn_hold(mfname);
2529 * Make the root vnode without attributes.
2531 rtvp = makenfs4node_by_fh(mi->mi_rootfh, NULL,
2532 &mfname, NULL, mi, cr, gethrtime());
2533 rtvp->v_type = vtype;
2535 mi->mi_curread = mi->mi_tsize;
2536 mi->mi_curwrite = mi->mi_stsize;
2539 * Start the manager thread responsible for handling async worker
2540 * threads.
2542 MI4_HOLD(mi);
2543 VFS_HOLD(vfsp); /* add reference for thread */
2544 mi->mi_manager_thread = zthread_create(NULL, 0, nfs4_async_manager,
2545 vfsp, 0, minclsyspri);
2546 ASSERT(mi->mi_manager_thread != NULL);
2549 * Create the thread that handles over-the-wire calls for
2550 * fop_inactive.
2551 * This needs to happen after the manager thread is created.
2553 MI4_HOLD(mi);
2554 mi->mi_inactive_thread = zthread_create(NULL, 0, nfs4_inactive_thread,
2555 mi, 0, minclsyspri);
2556 ASSERT(mi->mi_inactive_thread != NULL);
2558 /* If we didn't get a type, get one now */
2559 if (rtvp->v_type == VNON) {
2560 va.va_mask = AT_TYPE;
2561 error = nfs4getattr(rtvp, &va, tcr);
2562 if (error)
2563 goto bad;
2564 rtvp->v_type = va.va_type;
2567 mi->mi_type = rtvp->v_type;
2569 mutex_enter(&mi->mi_lock);
2570 mi->mi_flags &= ~MI4_MOUNTING;
2571 mutex_exit(&mi->mi_lock);
2573 /* Update VFS with new server and path info */
2574 if ((strcmp(svp->sv_hostname, origsvp->sv_hostname) != 0) ||
2575 (strcmp(svp->sv_path, origsvp->sv_path) != 0)) {
2576 len = svp->sv_hostnamelen + svp->sv_pathlen;
2577 resource = kmem_zalloc(len, KM_SLEEP);
2578 (void) strcat(resource, svp->sv_hostname);
2579 (void) strcat(resource, ":");
2580 (void) strcat(resource, svp->sv_path);
2581 vfs_setresource(vfsp, resource, 0);
2582 kmem_free(resource, len);
2585 sv4_free(origsvp);
2586 *rtvpp = rtvp;
2587 if (lcr != NULL)
2588 crfree(lcr);
2590 return (0);
2591 bad:
2593 * An error occurred somewhere, need to clean up...
2595 if (lcr != NULL)
2596 crfree(lcr);
2598 if (rtvp != NULL) {
2600 * We need to release our reference to the root vnode and
2601 * destroy the mntinfo4 struct that we just created.
2603 rp = VTOR4(rtvp);
2604 if (rp->r_flags & R4HASHED)
2605 rp4_rmhash(rp);
2606 VN_RELE(rtvp);
2608 nfs4_async_stop(vfsp);
2609 nfs4_async_manager_stop(vfsp);
2610 removed = nfs4_mi_zonelist_remove(mi);
2611 if (removed)
2612 zone_rele_ref(&mi->mi_zone_ref, ZONE_REF_NFSV4);
2615 * This releases the initial "hold" of the mi since it will never
2616 * be referenced by the vfsp. Also, when mount returns to vfs.c
2617 * with an error, the vfsp will be destroyed, not rele'd.
2619 MI4_RELE(mi);
2621 if (origsvp != NULL)
2622 sv4_free(origsvp);
2624 *rtvpp = NULL;
2625 return (error);
2629 * vfs operations
2631 static int
2632 nfs4_unmount(vfs_t *vfsp, int flag, cred_t *cr)
2634 mntinfo4_t *mi;
2635 ushort_t omax;
2636 int removed;
2638 bool_t must_unlock;
2640 nfs4_ephemeral_tree_t *eph_tree;
2642 if (secpolicy_fs_unmount(cr, vfsp) != 0)
2643 return (EPERM);
2645 mi = VFTOMI4(vfsp);
2647 if (flag & MS_FORCE) {
2648 vfsp->vfs_flag |= VFS_UNMOUNTED;
2649 if (nfs_zone() != mi->mi_zone) {
2651 * If the request is coming from the wrong zone,
2652 * we don't want to create any new threads, and
2653 * performance is not a concern. Do everything
2654 * inline.
2656 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2657 "nfs4_unmount x-zone forced unmount of vfs %p\n",
2658 (void *)vfsp));
2659 nfs4_free_mount(vfsp, flag, cr);
2660 } else {
2662 * Free data structures asynchronously, to avoid
2663 * blocking the current thread (for performance
2664 * reasons only).
2666 async_free_mount(vfsp, flag, cr);
2669 return (0);
2673 * Wait until all asynchronous putpage operations on
2674 * this file system are complete before flushing rnodes
2675 * from the cache.
2677 omax = mi->mi_max_threads;
2678 if (nfs4_async_stop_sig(vfsp))
2679 return (EINTR);
2681 r4flush(vfsp, cr);
2684 * About the only reason that this would fail would be
2685 * that the harvester is already busy tearing down this
2686 * node. So we fail back to the caller and let them try
2687 * again when needed.
2689 if (nfs4_ephemeral_umount(mi, flag, cr,
2690 &must_unlock, &eph_tree)) {
2691 ASSERT(must_unlock == FALSE);
2692 mutex_enter(&mi->mi_async_lock);
2693 mi->mi_max_threads = omax;
2694 mutex_exit(&mi->mi_async_lock);
2696 return (EBUSY);
2700 * If there are any active vnodes on this file system,
2701 * then the file system is busy and can't be unmounted.
2703 if (check_rtable4(vfsp)) {
2704 nfs4_ephemeral_umount_unlock(&must_unlock, &eph_tree);
2706 mutex_enter(&mi->mi_async_lock);
2707 mi->mi_max_threads = omax;
2708 mutex_exit(&mi->mi_async_lock);
2710 return (EBUSY);
2714 * The unmount can't fail from now on, so record any
2715 * ephemeral changes.
2717 nfs4_ephemeral_umount_activate(mi, &must_unlock, &eph_tree);
2720 * There are no active files that could require over-the-wire
2721 * calls to the server, so stop the async manager and the
2722 * inactive thread.
2724 nfs4_async_manager_stop(vfsp);
2727 * Destroy all rnodes belonging to this file system from the
2728 * rnode hash queues and purge any resources allocated to
2729 * them.
2731 destroy_rtable4(vfsp, cr);
2732 vfsp->vfs_flag |= VFS_UNMOUNTED;
2734 nfs4_remove_mi_from_server(mi, NULL);
2735 removed = nfs4_mi_zonelist_remove(mi);
2736 if (removed)
2737 zone_rele_ref(&mi->mi_zone_ref, ZONE_REF_NFSV4);
2739 return (0);
2743 * find root of nfs
2745 static int
2746 nfs4_root(vfs_t *vfsp, vnode_t **vpp)
2748 mntinfo4_t *mi;
2749 vnode_t *vp;
2750 nfs4_fname_t *mfname;
2751 servinfo4_t *svp;
2753 mi = VFTOMI4(vfsp);
2755 if (nfs_zone() != mi->mi_zone)
2756 return (EPERM);
2758 svp = mi->mi_curr_serv;
2759 if (svp) {
2760 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
2761 if (svp->sv_flags & SV4_ROOT_STALE) {
2762 nfs_rw_exit(&svp->sv_lock);
2764 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
2765 if (svp->sv_flags & SV4_ROOT_STALE) {
2766 svp->sv_flags &= ~SV4_ROOT_STALE;
2767 nfs_rw_exit(&svp->sv_lock);
2768 return (ENOENT);
2770 nfs_rw_exit(&svp->sv_lock);
2771 } else
2772 nfs_rw_exit(&svp->sv_lock);
2775 mfname = mi->mi_fname;
2776 fn_hold(mfname);
2777 vp = makenfs4node_by_fh(mi->mi_rootfh, NULL, &mfname, NULL,
2778 VFTOMI4(vfsp), CRED(), gethrtime());
2780 if (VTOR4(vp)->r_flags & R4STALE) {
2781 VN_RELE(vp);
2782 return (ENOENT);
2785 ASSERT(vp->v_type == VNON || vp->v_type == mi->mi_type);
2787 vp->v_type = mi->mi_type;
2789 *vpp = vp;
2791 return (0);
2794 static int
2795 nfs4_statfs_otw(vnode_t *vp, struct statvfs64 *sbp, cred_t *cr)
2797 int error;
2798 nfs4_ga_res_t gar;
2799 nfs4_ga_ext_res_t ger;
2801 gar.n4g_ext_res = &ger;
2803 if (error = nfs4_attr_otw(vp, TAG_FSINFO, &gar,
2804 NFS4_STATFS_ATTR_MASK, cr))
2805 return (error);
2807 *sbp = gar.n4g_ext_res->n4g_sb;
2809 return (0);
2813 * Get file system statistics.
2815 static int
2816 nfs4_statvfs(vfs_t *vfsp, struct statvfs64 *sbp)
2818 int error;
2819 vnode_t *vp;
2820 cred_t *cr;
2822 error = nfs4_root(vfsp, &vp);
2823 if (error)
2824 return (error);
2826 cr = CRED();
2828 error = nfs4_statfs_otw(vp, sbp, cr);
2829 if (!error) {
2830 (void) strncpy(sbp->f_basetype,
2831 vfssw[vfsp->vfs_fstype].vsw_name, FSTYPSZ);
2832 sbp->f_flag = vf_to_stf(vfsp->vfs_flag);
2833 } else {
2834 nfs4_purge_stale_fh(error, vp, cr);
2837 VN_RELE(vp);
2839 return (error);
2842 static kmutex_t nfs4_syncbusy;
2845 * Flush dirty nfs files for file system vfsp.
2846 * If vfsp == NULL, all nfs files are flushed.
2848 * SYNC_CLOSE in flag is passed to us to
2849 * indicate that we are shutting down and or
2850 * rebooting.
2852 static int
2853 nfs4_sync(vfs_t *vfsp, short flag, cred_t *cr)
2856 * Cross-zone calls are OK here, since this translates to a
2857 * fop_putpage(B_ASYNC), which gets picked up by the right zone.
2859 if (!(flag & SYNC_ATTR) && mutex_tryenter(&nfs4_syncbusy) != 0) {
2860 r4flush(vfsp, cr);
2861 mutex_exit(&nfs4_syncbusy);
2865 * if SYNC_CLOSE is set then we know that
2866 * the system is rebooting, mark the mntinfo
2867 * for later examination.
2869 if (vfsp && (flag & SYNC_CLOSE)) {
2870 mntinfo4_t *mi;
2872 mi = VFTOMI4(vfsp);
2873 if (!(mi->mi_flags & MI4_SHUTDOWN)) {
2874 mutex_enter(&mi->mi_lock);
2875 mi->mi_flags |= MI4_SHUTDOWN;
2876 mutex_exit(&mi->mi_lock);
2879 return (0);
2883 * vget is difficult, if not impossible, to support in v4 because we don't
2884 * know the parent directory or name, which makes it impossible to create a
2885 * useful shadow vnode. And we need the shadow vnode for things like
2886 * OPEN.
2889 /* ARGSUSED */
2891 * XXX Check nfs4_vget_pseudo() for dependency.
2893 static int
2894 nfs4_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
2896 return (EREMOTE);
2900 * nfs4_mountroot get called in the case where we are diskless booting. All
2901 * we need from here is the ability to get the server info and from there we
2902 * can simply call nfs4_rootvp.
2904 /* ARGSUSED */
2905 static int
2906 nfs4_mountroot(vfs_t *vfsp, whymountroot_t why)
2908 vnode_t *rtvp;
2909 char root_hostname[SYS_NMLN+1];
2910 struct servinfo4 *svp;
2911 int error;
2912 int vfsflags;
2913 size_t size;
2914 char *root_path;
2915 struct pathname pn;
2916 char *name;
2917 cred_t *cr;
2918 mntinfo4_t *mi;
2919 struct nfs_args args; /* nfs mount arguments */
2920 static char token[10];
2921 nfs4_error_t n4e;
2923 bzero(&args, sizeof (args));
2925 /* do this BEFORE getfile which causes xid stamps to be initialized */
2926 clkset(-1L); /* hack for now - until we get time svc? */
2928 if (why == ROOT_REMOUNT) {
2930 * Shouldn't happen.
2932 panic("nfs4_mountroot: why == ROOT_REMOUNT");
2935 if (why == ROOT_UNMOUNT) {
2937 * Nothing to do for NFS.
2939 return (0);
2943 * why == ROOT_INIT
2946 name = token;
2947 *name = 0;
2948 (void) getfsname("root", name, sizeof (token));
2950 pn_alloc(&pn);
2951 root_path = pn.pn_path;
2953 svp = kmem_zalloc(sizeof (*svp), KM_SLEEP);
2954 nfs_rw_init(&svp->sv_lock, NULL, RW_DEFAULT, NULL);
2955 svp->sv_knconf = kmem_zalloc(sizeof (*svp->sv_knconf), KM_SLEEP);
2956 svp->sv_knconf->knc_protofmly = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
2957 svp->sv_knconf->knc_proto = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
2960 * Get server address
2961 * Get the root path
2962 * Get server's transport
2963 * Get server's hostname
2964 * Get options
2966 args.addr = &svp->sv_addr;
2967 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
2968 args.fh = (char *)&svp->sv_fhandle;
2969 args.knconf = svp->sv_knconf;
2970 args.hostname = root_hostname;
2971 vfsflags = 0;
2972 if (error = mount_root(*name ? name : "root", root_path, NFS_V4,
2973 &args, &vfsflags)) {
2974 if (error == EPROTONOSUPPORT)
2975 nfs_cmn_err(error, CE_WARN, "nfs4_mountroot: "
2976 "mount_root failed: server doesn't support NFS V4");
2977 else
2978 nfs_cmn_err(error, CE_WARN,
2979 "nfs4_mountroot: mount_root failed: %m");
2980 nfs_rw_exit(&svp->sv_lock);
2981 sv4_free(svp);
2982 pn_free(&pn);
2983 return (error);
2985 nfs_rw_exit(&svp->sv_lock);
2986 svp->sv_hostnamelen = (int)(strlen(root_hostname) + 1);
2987 svp->sv_hostname = kmem_alloc(svp->sv_hostnamelen, KM_SLEEP);
2988 (void) strcpy(svp->sv_hostname, root_hostname);
2990 svp->sv_pathlen = (int)(strlen(root_path) + 1);
2991 svp->sv_path = kmem_alloc(svp->sv_pathlen, KM_SLEEP);
2992 (void) strcpy(svp->sv_path, root_path);
2995 * Force root partition to always be mounted with AUTH_UNIX for now
2997 svp->sv_secdata = kmem_alloc(sizeof (*svp->sv_secdata), KM_SLEEP);
2998 svp->sv_secdata->secmod = AUTH_UNIX;
2999 svp->sv_secdata->rpcflavor = AUTH_UNIX;
3000 svp->sv_secdata->data = NULL;
3002 cr = crgetcred();
3003 rtvp = NULL;
3005 error = nfs4rootvp(&rtvp, vfsp, svp, args.flags, cr, global_zone);
3007 if (error) {
3008 crfree(cr);
3009 pn_free(&pn);
3010 sv4_free(svp);
3011 return (error);
3014 mi = VTOMI4(rtvp);
3017 * Send client id to the server, if necessary
3019 nfs4_error_zinit(&n4e);
3020 nfs4setclientid(mi, cr, FALSE, &n4e);
3021 error = n4e.error;
3023 crfree(cr);
3025 if (error) {
3026 pn_free(&pn);
3027 goto errout;
3030 error = nfs4_setopts(rtvp, DATAMODEL_NATIVE, &args);
3031 if (error) {
3032 nfs_cmn_err(error, CE_WARN,
3033 "nfs4_mountroot: invalid root mount options");
3034 pn_free(&pn);
3035 goto errout;
3038 (void) vfs_lock_wait(vfsp);
3039 vfs_add(NULL, vfsp, vfsflags);
3040 vfs_unlock(vfsp);
3042 size = strlen(svp->sv_hostname);
3043 (void) strcpy(rootfs.bo_name, svp->sv_hostname);
3044 rootfs.bo_name[size] = ':';
3045 (void) strcpy(&rootfs.bo_name[size + 1], root_path);
3047 pn_free(&pn);
3049 errout:
3050 if (error) {
3051 sv4_free(svp);
3052 nfs4_async_stop(vfsp);
3053 nfs4_async_manager_stop(vfsp);
3056 if (rtvp != NULL)
3057 VN_RELE(rtvp);
3059 return (error);
3063 * Initialization routine for VFS routines. Should only be called once
3066 nfs4_vfsinit(void)
3068 mutex_init(&nfs4_syncbusy, NULL, MUTEX_DEFAULT, NULL);
3069 nfs4setclientid_init();
3070 nfs4_ephemeral_init();
3071 return (0);
3074 void
3075 nfs4_vfsfini(void)
3077 nfs4_ephemeral_fini();
3078 nfs4setclientid_fini();
3079 mutex_destroy(&nfs4_syncbusy);
3082 void
3083 nfs4_freevfs(vfs_t *vfsp)
3085 mntinfo4_t *mi;
3087 /* need to release the initial hold */
3088 mi = VFTOMI4(vfsp);
3091 * At this point, we can no longer reference the vfs
3092 * and need to inform other holders of the reference
3093 * to the mntinfo4_t.
3095 mi->mi_vfsp = NULL;
3097 MI4_RELE(mi);
3101 * Client side SETCLIENTID and SETCLIENTID_CONFIRM
3103 struct nfs4_server nfs4_server_lst =
3104 { &nfs4_server_lst, &nfs4_server_lst };
3106 kmutex_t nfs4_server_lst_lock;
3108 static void
3109 nfs4setclientid_init(void)
3111 mutex_init(&nfs4_server_lst_lock, NULL, MUTEX_DEFAULT, NULL);
3114 static void
3115 nfs4setclientid_fini(void)
3117 mutex_destroy(&nfs4_server_lst_lock);
3120 int nfs4_retry_sclid_delay = NFS4_RETRY_SCLID_DELAY;
3121 int nfs4_num_sclid_retries = NFS4_NUM_SCLID_RETRIES;
3124 * Set the clientid for the server for "mi". No-op if the clientid is
3125 * already set.
3127 * The recovery boolean should be set to TRUE if this function was called
3128 * by the recovery code, and FALSE otherwise. This is used to determine
3129 * if we need to call nfs4_start/end_op as well as grab the mi_recovlock
3130 * for adding a mntinfo4_t to a nfs4_server_t.
3132 * Error is returned via 'n4ep'. If there was a 'n4ep->stat' error, then
3133 * 'n4ep->error' is set to geterrno4(n4ep->stat).
3135 void
3136 nfs4setclientid(mntinfo4_t *mi, cred_t *cr, bool_t recovery, nfs4_error_t *n4ep)
3138 struct nfs4_server *np;
3139 struct servinfo4 *svp = mi->mi_curr_serv;
3140 nfs4_recov_state_t recov_state;
3141 int num_retries = 0;
3142 bool_t retry;
3143 cred_t *lcr = NULL;
3144 int retry_inuse = 1; /* only retry once on NFS4ERR_CLID_INUSE */
3145 time_t lease_time = 0;
3147 recov_state.rs_flags = 0;
3148 recov_state.rs_num_retry_despite_err = 0;
3149 ASSERT(n4ep != NULL);
3151 recov_retry:
3152 retry = FALSE;
3153 nfs4_error_zinit(n4ep);
3154 if (!recovery)
3155 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
3157 mutex_enter(&nfs4_server_lst_lock);
3158 np = servinfo4_to_nfs4_server(svp); /* This locks np if it is found */
3159 mutex_exit(&nfs4_server_lst_lock);
3160 if (!np) {
3161 struct nfs4_server *tnp;
3162 np = new_nfs4_server(svp, cr);
3163 mutex_enter(&np->s_lock);
3165 mutex_enter(&nfs4_server_lst_lock);
3166 tnp = servinfo4_to_nfs4_server(svp);
3167 if (tnp) {
3169 * another thread snuck in and put server on list.
3170 * since we aren't adding it to the nfs4_server_list
3171 * we need to set the ref count to 0 and destroy it.
3173 np->s_refcnt = 0;
3174 destroy_nfs4_server(np);
3175 np = tnp;
3176 } else {
3178 * do not give list a reference until everything
3179 * succeeds
3181 insque(np, &nfs4_server_lst);
3183 mutex_exit(&nfs4_server_lst_lock);
3185 ASSERT(MUTEX_HELD(&np->s_lock));
3187 * If we find the server already has N4S_CLIENTID_SET, then
3188 * just return, we've already done SETCLIENTID to that server
3190 if (np->s_flags & N4S_CLIENTID_SET) {
3191 /* add mi to np's mntinfo4_list */
3192 nfs4_add_mi_to_server(np, mi);
3193 if (!recovery)
3194 nfs_rw_exit(&mi->mi_recovlock);
3195 mutex_exit(&np->s_lock);
3196 nfs4_server_rele(np);
3197 return;
3199 mutex_exit(&np->s_lock);
3203 * Drop the mi_recovlock since nfs4_start_op will
3204 * acquire it again for us.
3206 if (!recovery) {
3207 nfs_rw_exit(&mi->mi_recovlock);
3209 n4ep->error = nfs4_start_op(mi, NULL, NULL, &recov_state);
3210 if (n4ep->error) {
3211 nfs4_server_rele(np);
3212 return;
3216 mutex_enter(&np->s_lock);
3217 while (np->s_flags & N4S_CLIENTID_PEND) {
3218 if (!cv_wait_sig(&np->s_clientid_pend, &np->s_lock)) {
3219 mutex_exit(&np->s_lock);
3220 nfs4_server_rele(np);
3221 if (!recovery)
3222 nfs4_end_op(mi, NULL, NULL, &recov_state,
3223 recovery);
3224 n4ep->error = EINTR;
3225 return;
3229 if (np->s_flags & N4S_CLIENTID_SET) {
3230 /* XXX copied/pasted from above */
3231 /* add mi to np's mntinfo4_list */
3232 nfs4_add_mi_to_server(np, mi);
3233 mutex_exit(&np->s_lock);
3234 nfs4_server_rele(np);
3235 if (!recovery)
3236 nfs4_end_op(mi, NULL, NULL, &recov_state, recovery);
3237 return;
3241 * Reset the N4S_CB_PINGED flag. This is used to
3242 * indicate if we have received a CB_NULL from the
3243 * server. Also we reset the waiter flag.
3245 np->s_flags &= ~(N4S_CB_PINGED | N4S_CB_WAITER);
3246 /* any failure must now clear this flag */
3247 np->s_flags |= N4S_CLIENTID_PEND;
3248 mutex_exit(&np->s_lock);
3249 nfs4setclientid_otw(mi, svp, cr, np, n4ep, &retry_inuse);
3251 if (n4ep->error == EACCES) {
3253 * If the uid is set then set the creds for secure mounts
3254 * by proxy processes such as automountd.
3256 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
3257 if (svp->sv_secdata->uid != 0) {
3258 lcr = crdup(cr);
3259 (void) crsetugid(lcr, svp->sv_secdata->uid,
3260 crgetgid(cr));
3262 nfs_rw_exit(&svp->sv_lock);
3264 if (lcr != NULL) {
3265 mutex_enter(&np->s_lock);
3266 crfree(np->s_cred);
3267 np->s_cred = lcr;
3268 mutex_exit(&np->s_lock);
3269 nfs4setclientid_otw(mi, svp, lcr, np, n4ep,
3270 &retry_inuse);
3273 mutex_enter(&np->s_lock);
3274 lease_time = np->s_lease_time;
3275 np->s_flags &= ~N4S_CLIENTID_PEND;
3276 mutex_exit(&np->s_lock);
3278 if (n4ep->error != 0 || n4ep->stat != NFS4_OK) {
3280 * Start recovery if failover is a possibility. If
3281 * invoked by the recovery thread itself, then just
3282 * return and let it handle the failover first. NB:
3283 * recovery is not allowed if the mount is in progress
3284 * since the infrastructure is not sufficiently setup
3285 * to allow it. Just return the error (after suitable
3286 * retries).
3288 if (FAILOVER_MOUNT4(mi) && nfs4_try_failover(n4ep)) {
3289 (void) nfs4_start_recovery(n4ep, mi, NULL,
3290 NULL, NULL, NULL, OP_SETCLIENTID, NULL, NULL, NULL);
3292 * Don't retry here, just return and let
3293 * recovery take over.
3295 if (recovery)
3296 retry = FALSE;
3297 } else if (nfs4_rpc_retry_error(n4ep->error) ||
3298 n4ep->stat == NFS4ERR_RESOURCE ||
3299 n4ep->stat == NFS4ERR_STALE_CLIENTID) {
3301 retry = TRUE;
3303 * Always retry if in recovery or once had
3304 * contact with the server (but now it's
3305 * overloaded).
3307 if (recovery == TRUE ||
3308 n4ep->error == ETIMEDOUT ||
3309 n4ep->error == ECONNRESET)
3310 num_retries = 0;
3311 } else if (retry_inuse && n4ep->error == 0 &&
3312 n4ep->stat == NFS4ERR_CLID_INUSE) {
3313 retry = TRUE;
3314 num_retries = 0;
3316 } else {
3318 * Since everything succeeded give the list a reference count if
3319 * it hasn't been given one by add_new_nfs4_server() or if this
3320 * is not a recovery situation in which case it is already on
3321 * the list.
3323 mutex_enter(&np->s_lock);
3324 if ((np->s_flags & N4S_INSERTED) == 0) {
3325 np->s_refcnt++;
3326 np->s_flags |= N4S_INSERTED;
3328 mutex_exit(&np->s_lock);
3331 if (!recovery)
3332 nfs4_end_op(mi, NULL, NULL, &recov_state, recovery);
3335 if (retry && num_retries++ < nfs4_num_sclid_retries) {
3336 if (retry_inuse) {
3337 ddi_sleep(lease_time + nfs4_retry_sclid_delay);
3338 retry_inuse = 0;
3339 } else
3340 ddi_sleep(nfs4_retry_sclid_delay);
3342 nfs4_server_rele(np);
3343 goto recov_retry;
3347 if (n4ep->error == 0)
3348 n4ep->error = geterrno4(n4ep->stat);
3350 /* broadcast before release in case no other threads are waiting */
3351 cv_broadcast(&np->s_clientid_pend);
3352 nfs4_server_rele(np);
/* When non-zero, DEBUG builds log the clientid and confirm verifier. */
int nfs4setclientid_otw_debug = 0;
3358 * This function handles the recovery of STALE_CLIENTID for SETCLIENTID_CONFIRM,
3359 * but nothing else; the calling function must be designed to handle those
3360 * other errors.
3362 static void
3363 nfs4setclientid_otw(mntinfo4_t *mi, struct servinfo4 *svp, cred_t *cr,
3364 struct nfs4_server *np, nfs4_error_t *ep, int *retry_inusep)
3366 COMPOUND4args_clnt args;
3367 COMPOUND4res_clnt res;
3368 nfs_argop4 argop[3];
3369 SETCLIENTID4args *s_args;
3370 SETCLIENTID4resok *s_resok;
3371 int doqueue = 1;
3372 nfs4_ga_res_t *garp = NULL;
3373 timespec_t prop_time, after_time;
3374 verifier4 verf;
3375 clientid4 tmp_clientid;
3377 ASSERT(!MUTEX_HELD(&np->s_lock));
3379 args.ctag = TAG_SETCLIENTID;
3381 args.array = argop;
3382 args.array_len = 3;
3384 /* PUTROOTFH */
3385 argop[0].argop = OP_PUTROOTFH;
3387 /* GETATTR */
3388 argop[1].argop = OP_GETATTR;
3389 argop[1].nfs_argop4_u.opgetattr.attr_request = FATTR4_LEASE_TIME_MASK;
3390 argop[1].nfs_argop4_u.opgetattr.mi = mi;
3392 /* SETCLIENTID */
3393 argop[2].argop = OP_SETCLIENTID;
3395 s_args = &argop[2].nfs_argop4_u.opsetclientid;
3397 mutex_enter(&np->s_lock);
3399 s_args->client.verifier = np->clidtosend.verifier;
3400 s_args->client.id_len = np->clidtosend.id_len;
3401 ASSERT(s_args->client.id_len <= NFS4_OPAQUE_LIMIT);
3402 s_args->client.id_val = np->clidtosend.id_val;
3405 * Callback needs to happen on non-RDMA transport
3406 * Check if we have saved the original knetconfig
3407 * if so, use that instead.
3409 if (svp->sv_origknconf != NULL)
3410 nfs4_cb_args(np, svp->sv_origknconf, s_args);
3411 else
3412 nfs4_cb_args(np, svp->sv_knconf, s_args);
3414 mutex_exit(&np->s_lock);
3416 rfs4call(mi, &args, &res, cr, &doqueue, 0, ep);
3418 if (ep->error)
3419 return;
3421 /* getattr lease_time res */
3422 if ((res.array_len >= 2) &&
3423 (res.array[1].nfs_resop4_u.opgetattr.status == NFS4_OK)) {
3424 garp = &res.array[1].nfs_resop4_u.opgetattr.ga_res;
3426 #ifndef _LP64
3428 * The 32 bit client cannot handle a lease time greater than
3429 * (INT32_MAX/1000000). This is due to the use of the
3430 * lease_time in calls to drv_usectohz() in
3431 * nfs4_renew_lease_thread(). The problem is that
3432 * drv_usectohz() takes a time_t (which is just a long = 4
3433 * bytes) as its parameter. The lease_time is multiplied by
3434 * 1000000 to convert seconds to usecs for the parameter. If
3435 * a number bigger than (INT32_MAX/1000000) is used then we
3436 * overflow on the 32bit client.
3438 if (garp->n4g_ext_res->n4g_leasetime > (INT32_MAX/1000000)) {
3439 garp->n4g_ext_res->n4g_leasetime = INT32_MAX/1000000;
3441 #endif
3443 mutex_enter(&np->s_lock);
3444 np->s_lease_time = garp->n4g_ext_res->n4g_leasetime;
3447 * Keep track of the lease period for the mi's
3448 * mi_msg_list. We need an appropriate time
3449 * bound to associate past facts with a current
3450 * event. The lease period is perfect for this.
3452 mutex_enter(&mi->mi_msg_list_lock);
3453 mi->mi_lease_period = np->s_lease_time;
3454 mutex_exit(&mi->mi_msg_list_lock);
3455 mutex_exit(&np->s_lock);
3459 if (res.status == NFS4ERR_CLID_INUSE) {
3460 clientaddr4 *clid_inuse;
3462 if (!(*retry_inusep)) {
3463 clid_inuse = &res.array->nfs_resop4_u.
3464 opsetclientid.SETCLIENTID4res_u.client_using;
3466 zcmn_err(mi->mi_zone->zone_id, CE_NOTE,
3467 "NFS4 mount (SETCLIENTID failed)."
3468 " nfs4_client_id.id is in"
3469 "use already by: r_netid<%s> r_addr<%s>",
3470 clid_inuse->r_netid, clid_inuse->r_addr);
3474 * XXX - The client should be more robust in its
3475 * handling of clientid in use errors (regen another
3476 * clientid and try again?)
3478 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3479 return;
3482 if (res.status) {
3483 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3484 return;
3487 s_resok = &res.array[2].nfs_resop4_u.
3488 opsetclientid.SETCLIENTID4res_u.resok4;
3490 tmp_clientid = s_resok->clientid;
3492 verf = s_resok->setclientid_confirm;
3494 #ifdef DEBUG
3495 if (nfs4setclientid_otw_debug) {
3496 union {
3497 clientid4 clientid;
3498 int foo[2];
3499 } cid;
3501 cid.clientid = s_resok->clientid;
3503 zcmn_err(mi->mi_zone->zone_id, CE_NOTE,
3504 "nfs4setclientid_otw: OK, clientid = %x,%x, "
3505 "verifier = %" PRIx64 "\n", cid.foo[0], cid.foo[1], verf);
3507 #endif
3509 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3511 /* Confirm the client id and get the lease_time attribute */
3513 args.ctag = TAG_SETCLIENTID_CF;
3515 args.array = argop;
3516 args.array_len = 1;
3518 argop[0].argop = OP_SETCLIENTID_CONFIRM;
3520 argop[0].nfs_argop4_u.opsetclientid_confirm.clientid = tmp_clientid;
3521 argop[0].nfs_argop4_u.opsetclientid_confirm.setclientid_confirm = verf;
3523 /* used to figure out RTT for np */
3524 gethrestime(&prop_time);
3526 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4setlientid_otw: "
3527 "start time: %ld sec %ld nsec", prop_time.tv_sec,
3528 prop_time.tv_nsec));
3530 rfs4call(mi, &args, &res, cr, &doqueue, 0, ep);
3532 gethrestime(&after_time);
3533 mutex_enter(&np->s_lock);
3534 np->propagation_delay.tv_sec =
3535 MAX(1, after_time.tv_sec - prop_time.tv_sec);
3536 mutex_exit(&np->s_lock);
3538 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4setlcientid_otw: "
3539 "finish time: %ld sec ", after_time.tv_sec));
3541 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4setclientid_otw: "
3542 "propagation delay set to %ld sec",
3543 np->propagation_delay.tv_sec));
3545 if (ep->error)
3546 return;
3548 if (res.status == NFS4ERR_CLID_INUSE) {
3549 clientaddr4 *clid_inuse;
3551 if (!(*retry_inusep)) {
3552 clid_inuse = &res.array->nfs_resop4_u.
3553 opsetclientid.SETCLIENTID4res_u.client_using;
3555 zcmn_err(mi->mi_zone->zone_id, CE_NOTE,
3556 "SETCLIENTID_CONFIRM failed. "
3557 "nfs4_client_id.id is in use already by: "
3558 "r_netid<%s> r_addr<%s>",
3559 clid_inuse->r_netid, clid_inuse->r_addr);
3562 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3563 return;
3566 if (res.status) {
3567 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3568 return;
3571 mutex_enter(&np->s_lock);
3572 np->clientid = tmp_clientid;
3573 np->s_flags |= N4S_CLIENTID_SET;
3575 /* Add mi to np's mntinfo4 list */
3576 nfs4_add_mi_to_server(np, mi);
3578 if (np->lease_valid == NFS4_LEASE_NOT_STARTED) {
3580 * Start lease management thread.
3581 * Keep trying until we succeed.
3584 np->s_refcnt++; /* pass reference to thread */
3585 (void) zthread_create(NULL, 0, nfs4_renew_lease_thread, np, 0,
3586 minclsyspri);
3588 mutex_exit(&np->s_lock);
3590 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3594 * Add mi to sp's mntinfo4_list if it isn't already in the list. Makes
3595 * mi's clientid the same as sp's.
3596 * Assumes sp is locked down.
3598 void
3599 nfs4_add_mi_to_server(nfs4_server_t *sp, mntinfo4_t *mi)
3601 mntinfo4_t *tmi;
3602 int in_list = 0;
3604 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
3605 nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
3606 ASSERT(sp != &nfs4_server_lst);
3607 ASSERT(MUTEX_HELD(&sp->s_lock));
3609 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3610 "nfs4_add_mi_to_server: add mi %p to sp %p",
3611 (void*)mi, (void*)sp));
3613 for (tmi = sp->mntinfo4_list;
3614 tmi != NULL;
3615 tmi = tmi->mi_clientid_next) {
3616 if (tmi == mi) {
3617 NFS4_DEBUG(nfs4_client_lease_debug,
3618 (CE_NOTE,
3619 "nfs4_add_mi_to_server: mi in list"));
3620 in_list = 1;
3625 * First put a hold on the mntinfo4's vfsp so that references via
3626 * mntinfo4_list will be valid.
3628 if (!in_list)
3629 VFS_HOLD(mi->mi_vfsp);
3631 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4_add_mi_to_server: "
3632 "hold vfs %p for mi: %p", (void*)mi->mi_vfsp, (void*)mi));
3634 if (!in_list) {
3635 if (sp->mntinfo4_list)
3636 sp->mntinfo4_list->mi_clientid_prev = mi;
3637 mi->mi_clientid_next = sp->mntinfo4_list;
3638 mi->mi_srv = sp;
3639 sp->mntinfo4_list = mi;
3640 mi->mi_srvsettime = gethrestime_sec();
3641 mi->mi_srvset_cnt++;
3644 /* set mi's clientid to that of sp's for later matching */
3645 mi->mi_clientid = sp->clientid;
3648 * Update the clientid for any other mi's belonging to sp. This
3649 * must be done here while we hold sp->s_lock, so that
3650 * find_nfs4_server() continues to work.
3653 for (tmi = sp->mntinfo4_list;
3654 tmi != NULL;
3655 tmi = tmi->mi_clientid_next) {
3656 if (tmi != mi) {
3657 tmi->mi_clientid = sp->clientid;
3663 * Remove the mi from sp's mntinfo4_list and release its reference.
3664 * Exception: if mi still has open files, flag it for later removal (when
3665 * all the files are closed).
3667 * If this is the last mntinfo4 in sp's list then tell the lease renewal
3668 * thread to exit.
3670 static void
3671 nfs4_remove_mi_from_server_nolock(mntinfo4_t *mi, nfs4_server_t *sp)
3673 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3674 "nfs4_remove_mi_from_server_nolock: remove mi %p from sp %p",
3675 (void*)mi, (void*)sp));
3677 ASSERT(sp != NULL);
3678 ASSERT(MUTEX_HELD(&sp->s_lock));
3679 ASSERT(mi->mi_open_files >= 0);
3682 * First make sure this mntinfo4 can be taken off of the list,
3683 * ie: it doesn't have any open files remaining.
3685 if (mi->mi_open_files > 0) {
3686 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3687 "nfs4_remove_mi_from_server_nolock: don't "
3688 "remove mi since it still has files open"));
3690 mutex_enter(&mi->mi_lock);
3691 mi->mi_flags |= MI4_REMOVE_ON_LAST_CLOSE;
3692 mutex_exit(&mi->mi_lock);
3693 return;
3696 VFS_HOLD(mi->mi_vfsp);
3697 remove_mi(sp, mi);
3698 VFS_RELE(mi->mi_vfsp);
3700 if (sp->mntinfo4_list == NULL) {
3701 /* last fs unmounted, kill the thread */
3702 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3703 "remove_mi_from_nfs4_server_nolock: kill the thread"));
3704 nfs4_mark_srv_dead(sp);
3709 * Remove mi from sp's mntinfo4_list and release the vfs reference.
3711 static void
3712 remove_mi(nfs4_server_t *sp, mntinfo4_t *mi)
3714 ASSERT(MUTEX_HELD(&sp->s_lock));
3717 * We release a reference, and the caller must still have a
3718 * reference.
3720 ASSERT(mi->mi_vfsp->vfs_count >= 2);
3722 if (mi->mi_clientid_prev) {
3723 mi->mi_clientid_prev->mi_clientid_next = mi->mi_clientid_next;
3724 } else {
3725 /* This is the first mi in sp's mntinfo4_list */
3727 * Make sure the first mntinfo4 in the list is the actual
3728 * mntinfo4 passed in.
3730 ASSERT(sp->mntinfo4_list == mi);
3732 sp->mntinfo4_list = mi->mi_clientid_next;
3734 if (mi->mi_clientid_next)
3735 mi->mi_clientid_next->mi_clientid_prev = mi->mi_clientid_prev;
3737 /* Now mark the mntinfo4's links as being removed */
3738 mi->mi_clientid_prev = mi->mi_clientid_next = NULL;
3739 mi->mi_srv = NULL;
3740 mi->mi_srvset_cnt++;
3742 VFS_RELE(mi->mi_vfsp);
3746 * Free all the entries in sp's mntinfo4_list.
3748 static void
3749 remove_all_mi(nfs4_server_t *sp)
3751 mntinfo4_t *mi;
3753 ASSERT(MUTEX_HELD(&sp->s_lock));
3755 while (sp->mntinfo4_list != NULL) {
3756 mi = sp->mntinfo4_list;
3758 * Grab a reference in case there is only one left (which
3759 * remove_mi() frees).
3761 VFS_HOLD(mi->mi_vfsp);
3762 remove_mi(sp, mi);
3763 VFS_RELE(mi->mi_vfsp);
3768 * Remove the mi from sp's mntinfo4_list as above, and rele the vfs.
3770 * This version can be called with a null nfs4_server_t arg,
3771 * and will either find the right one and handle locking, or
3772 * do nothing because the mi wasn't added to an sp's mntinfo4_list.
3774 void
3775 nfs4_remove_mi_from_server(mntinfo4_t *mi, nfs4_server_t *esp)
3777 nfs4_server_t *sp;
3779 if (esp) {
3780 nfs4_remove_mi_from_server_nolock(mi, esp);
3781 return;
3784 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
3785 if (sp = find_nfs4_server_all(mi, 1)) {
3786 nfs4_remove_mi_from_server_nolock(mi, sp);
3787 mutex_exit(&sp->s_lock);
3788 nfs4_server_rele(sp);
3790 nfs_rw_exit(&mi->mi_recovlock);
3794 * Return TRUE if the given server has any non-unmounted filesystems.
3797 bool_t
3798 nfs4_fs_active(nfs4_server_t *sp)
3800 mntinfo4_t *mi;
3802 ASSERT(MUTEX_HELD(&sp->s_lock));
3804 for (mi = sp->mntinfo4_list; mi != NULL; mi = mi->mi_clientid_next) {
3805 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
3806 return (TRUE);
3809 return (FALSE);
3813 * Mark sp as finished and notify any waiters.
3816 void
3817 nfs4_mark_srv_dead(nfs4_server_t *sp)
3819 ASSERT(MUTEX_HELD(&sp->s_lock));
3821 sp->s_thread_exit = NFS4_THREAD_EXIT;
3822 cv_broadcast(&sp->cv_thread_exit);
3826 * Create a new nfs4_server_t structure.
3827 * Returns new node unlocked and not in list, but with a reference count of
3828 * 1.
3830 struct nfs4_server *
3831 new_nfs4_server(struct servinfo4 *svp, cred_t *cr)
3833 struct nfs4_server *np;
3834 timespec_t tt;
3835 union {
3836 struct {
3837 uint32_t sec;
3838 uint32_t subsec;
3839 } un_curtime;
3840 verifier4 un_verifier;
3841 } nfs4clientid_verifier;
3843 * We change this ID string carefully and with the Solaris
3844 * NFS server behaviour in mind. "+referrals" indicates
3845 * a client that can handle an NFSv4 referral.
3847 char id_val[] = "Solaris: %s, NFSv4 kernel client +referrals";
3848 int len;
3850 np = kmem_zalloc(sizeof (struct nfs4_server), KM_SLEEP);
3851 np->saddr.len = svp->sv_addr.len;
3852 np->saddr.maxlen = svp->sv_addr.maxlen;
3853 np->saddr.buf = kmem_alloc(svp->sv_addr.maxlen, KM_SLEEP);
3854 bcopy(svp->sv_addr.buf, np->saddr.buf, svp->sv_addr.len);
3855 np->s_refcnt = 1;
3858 * Build the nfs_client_id4 for this server mount. Ensure
3859 * the verifier is useful and that the identification is
3860 * somehow based on the server's address for the case of
3861 * multi-homed servers.
3863 nfs4clientid_verifier.un_verifier = 0;
3864 gethrestime(&tt);
3865 nfs4clientid_verifier.un_curtime.sec = (uint32_t)tt.tv_sec;
3866 nfs4clientid_verifier.un_curtime.subsec = (uint32_t)tt.tv_nsec;
3867 np->clidtosend.verifier = nfs4clientid_verifier.un_verifier;
3870 * calculate the length of the opaque identifier. Subtract 2
3871 * for the "%s" and add the traditional +1 for null
3872 * termination.
3874 len = strlen(id_val) - 2 + strlen(uts_nodename()) + 1;
3875 np->clidtosend.id_len = len + np->saddr.maxlen;
3877 np->clidtosend.id_val = kmem_alloc(np->clidtosend.id_len, KM_SLEEP);
3878 (void) sprintf(np->clidtosend.id_val, id_val, uts_nodename());
3879 bcopy(np->saddr.buf, &np->clidtosend.id_val[len], np->saddr.len);
3881 np->s_flags = 0;
3882 np->mntinfo4_list = NULL;
3883 /* save cred for issuing rfs4calls inside the renew thread */
3884 crhold(cr);
3885 np->s_cred = cr;
3886 cv_init(&np->cv_thread_exit, NULL, CV_DEFAULT, NULL);
3887 mutex_init(&np->s_lock, NULL, MUTEX_DEFAULT, NULL);
3888 nfs_rw_init(&np->s_recovlock, NULL, RW_DEFAULT, NULL);
3889 list_create(&np->s_deleg_list, sizeof (rnode4_t),
3890 offsetof(rnode4_t, r_deleg_link));
3891 np->s_thread_exit = 0;
3892 np->state_ref_count = 0;
3893 np->lease_valid = NFS4_LEASE_NOT_STARTED;
3894 cv_init(&np->s_cv_otw_count, NULL, CV_DEFAULT, NULL);
3895 cv_init(&np->s_clientid_pend, NULL, CV_DEFAULT, NULL);
3896 np->s_otw_call_count = 0;
3897 cv_init(&np->wait_cb_null, NULL, CV_DEFAULT, NULL);
3898 np->zoneid = getzoneid();
3899 np->zone_globals = nfs4_get_callback_globals();
3900 ASSERT(np->zone_globals != NULL);
3901 return (np);
3905 * Create a new nfs4_server_t structure and add it to the list.
3906 * Returns new node locked; reference must eventually be freed.
3908 static struct nfs4_server *
3909 add_new_nfs4_server(struct servinfo4 *svp, cred_t *cr)
3911 nfs4_server_t *sp;
3913 ASSERT(MUTEX_HELD(&nfs4_server_lst_lock));
3914 sp = new_nfs4_server(svp, cr);
3915 mutex_enter(&sp->s_lock);
3916 insque(sp, &nfs4_server_lst);
3917 sp->s_refcnt++; /* list gets a reference */
3918 sp->s_flags |= N4S_INSERTED;
3919 sp->clientid = 0;
3920 return (sp);
/* Non-zero enables the nfs4_server_t list dump below (DEBUG kernels). */
int nfs4_server_t_debug = 0;

#ifdef DEBUG
/*
 * Print every node on the global nfs4_server_t list together with a
 * note on whether its address matches srv_p, whether its clientid
 * matches the one wanted (or is still 0), and whether its thread is
 * not flagged as exiting.  txt names the caller in the output.
 */
void
dumpnfs4slist(char *txt, mntinfo4_t *mi, clientid4 clientid, servinfo4_t *srv_p)
{
	int hash16(void *p, int len);
	nfs4_server_t *node;

	NFS4_DEBUG(nfs4_server_t_debug, (CE_NOTE,
	    "dumping nfs4_server_t list in %s", txt));
	NFS4_DEBUG(nfs4_server_t_debug, (CE_CONT,
	    "mi 0x%p, want clientid %llx, addr %d/%04X",
	    (void *)mi, (longlong_t)clientid, srv_p->sv_addr.len,
	    hash16((void *)srv_p->sv_addr.buf, srv_p->sv_addr.len)));

	node = nfs4_server_lst.forw;
	while (node != &nfs4_server_lst) {
		NFS4_DEBUG(nfs4_server_t_debug, (CE_CONT,
		    "node 0x%p, clientid %llx, addr %d/%04X, cnt %d",
		    (void *)node, (longlong_t)node->clientid, node->saddr.len,
		    hash16((void *)node->saddr.buf, node->saddr.len),
		    node->state_ref_count));
		if (node->saddr.len == srv_p->sv_addr.len &&
		    bcmp(node->saddr.buf, srv_p->sv_addr.buf,
		    node->saddr.len) == 0) {
			NFS4_DEBUG(nfs4_server_t_debug, (CE_CONT,
			    " - address matches"));
		}
		if (node->clientid == clientid || node->clientid == 0) {
			NFS4_DEBUG(nfs4_server_t_debug, (CE_CONT,
			    " - clientid matches"));
		}
		if (node->s_thread_exit != NFS4_THREAD_EXIT) {
			NFS4_DEBUG(nfs4_server_t_debug, (CE_CONT,
			    " - thread not exiting"));
		}
		node = node->forw;
	}
	ddi_sleep(1);
}
#endif
3964 * Move a mntinfo4_t from one server list to another.
3965 * Locking of the two nfs4_server_t nodes will be done in list order.
3967 * Returns NULL if the current nfs4_server_t for the filesystem could not
3968 * be found (e.g., due to forced unmount). Otherwise returns a reference
3969 * to the new nfs4_server_t, which must eventually be freed.
3971 nfs4_server_t *
3972 nfs4_move_mi(mntinfo4_t *mi, servinfo4_t *old, servinfo4_t *new)
3974 nfs4_server_t *p, *op = NULL, *np = NULL;
3975 int num_open;
3976 zoneid_t zoneid = nfs_zoneid();
3978 ASSERT(nfs_zone() == mi->mi_zone);
3980 mutex_enter(&nfs4_server_lst_lock);
3981 #ifdef DEBUG
3982 if (nfs4_server_t_debug)
3983 dumpnfs4slist("nfs4_move_mi", mi, (clientid4)0, new);
3984 #endif
3985 for (p = nfs4_server_lst.forw; p != &nfs4_server_lst; p = p->forw) {
3986 if (p->zoneid != zoneid)
3987 continue;
3988 if (p->saddr.len == old->sv_addr.len &&
3989 bcmp(p->saddr.buf, old->sv_addr.buf, p->saddr.len) == 0 &&
3990 p->s_thread_exit != NFS4_THREAD_EXIT) {
3991 op = p;
3992 mutex_enter(&op->s_lock);
3993 op->s_refcnt++;
3995 if (p->saddr.len == new->sv_addr.len &&
3996 bcmp(p->saddr.buf, new->sv_addr.buf, p->saddr.len) == 0 &&
3997 p->s_thread_exit != NFS4_THREAD_EXIT) {
3998 np = p;
3999 mutex_enter(&np->s_lock);
4001 if (op != NULL && np != NULL)
4002 break;
4004 if (op == NULL) {
4006 * Filesystem has been forcibly unmounted. Bail out.
4008 if (np != NULL)
4009 mutex_exit(&np->s_lock);
4010 mutex_exit(&nfs4_server_lst_lock);
4011 return (NULL);
4013 if (np != NULL) {
4014 np->s_refcnt++;
4015 } else {
4016 #ifdef DEBUG
4017 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
4018 "nfs4_move_mi: no target nfs4_server, will create."));
4019 #endif
4020 np = add_new_nfs4_server(new, kcred);
4022 mutex_exit(&nfs4_server_lst_lock);
4024 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
4025 "nfs4_move_mi: for mi 0x%p, "
4026 "old servinfo4 0x%p, new servinfo4 0x%p, "
4027 "old nfs4_server 0x%p, new nfs4_server 0x%p, ",
4028 (void*)mi, (void*)old, (void*)new,
4029 (void*)op, (void*)np));
4030 ASSERT(op != NULL && np != NULL);
4032 /* discard any delegations */
4033 nfs4_deleg_discard(mi, op);
4035 num_open = mi->mi_open_files;
4036 mi->mi_open_files = 0;
4037 op->state_ref_count -= num_open;
4038 ASSERT(op->state_ref_count >= 0);
4039 np->state_ref_count += num_open;
4040 nfs4_remove_mi_from_server_nolock(mi, op);
4041 mi->mi_open_files = num_open;
4042 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
4043 "nfs4_move_mi: mi_open_files %d, op->cnt %d, np->cnt %d",
4044 mi->mi_open_files, op->state_ref_count, np->state_ref_count));
4046 nfs4_add_mi_to_server(np, mi);
4048 mutex_exit(&op->s_lock);
4049 mutex_exit(&np->s_lock);
4050 nfs4_server_rele(op);
4052 return (np);
4056 * Need to have the nfs4_server_lst_lock.
4057 * Search the nfs4_server list to find a match on this servinfo4
4058 * based on its address.
4060 * Returns NULL if no match is found. Otherwise returns a reference (which
4061 * must eventually be freed) to a locked nfs4_server.
4063 nfs4_server_t *
4064 servinfo4_to_nfs4_server(servinfo4_t *srv_p)
4066 nfs4_server_t *np;
4067 zoneid_t zoneid = nfs_zoneid();
4069 ASSERT(MUTEX_HELD(&nfs4_server_lst_lock));
4070 for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) {
4071 if (np->zoneid == zoneid &&
4072 np->saddr.len == srv_p->sv_addr.len &&
4073 bcmp(np->saddr.buf, srv_p->sv_addr.buf,
4074 np->saddr.len) == 0 &&
4075 np->s_thread_exit != NFS4_THREAD_EXIT) {
4076 mutex_enter(&np->s_lock);
4077 np->s_refcnt++;
4078 return (np);
4081 return (NULL);
4085 * Locks the nfs4_server down if it is found and returns a reference that
4086 * must eventually be freed.
4088 static nfs4_server_t *
4089 lookup_nfs4_server(nfs4_server_t *sp, int any_state)
4091 nfs4_server_t *np;
4093 mutex_enter(&nfs4_server_lst_lock);
4094 for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) {
4095 mutex_enter(&np->s_lock);
4096 if (np == sp && np->s_refcnt > 0 &&
4097 (np->s_thread_exit != NFS4_THREAD_EXIT || any_state)) {
4098 mutex_exit(&nfs4_server_lst_lock);
4099 np->s_refcnt++;
4100 return (np);
4102 mutex_exit(&np->s_lock);
4104 mutex_exit(&nfs4_server_lst_lock);
4106 return (NULL);
4110 * The caller should be holding mi->mi_recovlock, and it should continue to
4111 * hold the lock until done with the returned nfs4_server_t. Once
4112 * mi->mi_recovlock is released, there is no guarantee that the returned
4113 * mi->nfs4_server_t will continue to correspond to mi.
4115 nfs4_server_t *
4116 find_nfs4_server(mntinfo4_t *mi)
4118 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
4119 nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
4121 return (lookup_nfs4_server(mi->mi_srv, 0));
4125 * Same as above, but takes an "any_state" parameter which can be
4126 * set to 1 if the caller wishes to find nfs4_server_t's which
4127 * have been marked for termination by the exit of the renew
4128 * thread. This should only be used by operations which are
4129 * cleaning up and will not cause an OTW op.
4131 nfs4_server_t *
4132 find_nfs4_server_all(mntinfo4_t *mi, int any_state)
4134 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
4135 nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
4137 return (lookup_nfs4_server(mi->mi_srv, any_state));
4141 * Lock sp, but only if it's still active (in the list and hasn't been
4142 * flagged as exiting) or 'any_state' is non-zero.
4143 * Returns TRUE if sp got locked and adds a reference to sp.
4145 bool_t
4146 nfs4_server_vlock(nfs4_server_t *sp, int any_state)
4148 return (lookup_nfs4_server(sp, any_state) != NULL);
4152 * Release the reference to sp and destroy it if that's the last one.
4155 void
4156 nfs4_server_rele(nfs4_server_t *sp)
4158 mutex_enter(&sp->s_lock);
4159 ASSERT(sp->s_refcnt > 0);
4160 sp->s_refcnt--;
4161 if (sp->s_refcnt > 0) {
4162 mutex_exit(&sp->s_lock);
4163 return;
4165 mutex_exit(&sp->s_lock);
4167 mutex_enter(&nfs4_server_lst_lock);
4168 mutex_enter(&sp->s_lock);
4169 if (sp->s_refcnt > 0) {
4170 mutex_exit(&sp->s_lock);
4171 mutex_exit(&nfs4_server_lst_lock);
4172 return;
4174 remque(sp);
4175 sp->forw = sp->back = NULL;
4176 mutex_exit(&nfs4_server_lst_lock);
4177 destroy_nfs4_server(sp);
4180 static void
4181 destroy_nfs4_server(nfs4_server_t *sp)
4183 ASSERT(MUTEX_HELD(&sp->s_lock));
4184 ASSERT(sp->s_refcnt == 0);
4185 ASSERT(sp->s_otw_call_count == 0);
4187 remove_all_mi(sp);
4189 crfree(sp->s_cred);
4190 kmem_free(sp->saddr.buf, sp->saddr.maxlen);
4191 kmem_free(sp->clidtosend.id_val, sp->clidtosend.id_len);
4192 mutex_exit(&sp->s_lock);
4194 /* destroy the nfs4_server */
4195 nfs4callback_destroy(sp);
4196 list_destroy(&sp->s_deleg_list);
4197 mutex_destroy(&sp->s_lock);
4198 cv_destroy(&sp->cv_thread_exit);
4199 cv_destroy(&sp->s_cv_otw_count);
4200 cv_destroy(&sp->s_clientid_pend);
4201 cv_destroy(&sp->wait_cb_null);
4202 nfs_rw_destroy(&sp->s_recovlock);
4203 kmem_free(sp, sizeof (*sp));
4207 * Fork off a thread to free the data structures for a mount.
4210 static void
4211 async_free_mount(vfs_t *vfsp, int flag, cred_t *cr)
4213 freemountargs_t *args;
4214 args = kmem_alloc(sizeof (freemountargs_t), KM_SLEEP);
4215 args->fm_vfsp = vfsp;
4216 VFS_HOLD(vfsp);
4217 MI4_HOLD(VFTOMI4(vfsp));
4218 args->fm_flag = flag;
4219 args->fm_cr = cr;
4220 crhold(cr);
4221 (void) zthread_create(NULL, 0, nfs4_free_mount_thread, args, 0,
4222 minclsyspri);
4225 static void
4226 nfs4_free_mount_thread(freemountargs_t *args)
4228 mntinfo4_t *mi;
4229 nfs4_free_mount(args->fm_vfsp, args->fm_flag, args->fm_cr);
4230 mi = VFTOMI4(args->fm_vfsp);
4231 crfree(args->fm_cr);
4232 VFS_RELE(args->fm_vfsp);
4233 MI4_RELE(mi);
4234 kmem_free(args, sizeof (freemountargs_t));
4235 zthread_exit();
4236 /* NOTREACHED */
4240 * Thread to free the data structures for a given filesystem.
4242 static void
4243 nfs4_free_mount(vfs_t *vfsp, int flag, cred_t *cr)
4245 mntinfo4_t *mi = VFTOMI4(vfsp);
4246 nfs4_server_t *sp;
4247 callb_cpr_t cpr_info;
4248 kmutex_t cpr_lock;
4249 boolean_t async_thread;
4250 int removed;
4252 bool_t must_unlock;
4253 nfs4_ephemeral_tree_t *eph_tree;
4256 * We need to participate in the CPR framework if this is a kernel
4257 * thread.
4259 async_thread = (curproc == nfs_zone()->zone_zsched);
4260 if (async_thread) {
4261 mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
4262 CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr,
4263 "nfsv4AsyncUnmount");
4267 * We need to wait for all outstanding OTW calls
4268 * and recovery to finish before we remove the mi
4269 * from the nfs4_server_t, as current pending
4270 * calls might still need this linkage (in order
4271 * to find a nfs4_server_t from a mntinfo4_t).
4273 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, FALSE);
4274 sp = find_nfs4_server(mi);
4275 nfs_rw_exit(&mi->mi_recovlock);
4277 if (sp) {
4278 while (sp->s_otw_call_count != 0) {
4279 if (async_thread) {
4280 mutex_enter(&cpr_lock);
4281 CALLB_CPR_SAFE_BEGIN(&cpr_info);
4282 mutex_exit(&cpr_lock);
4284 cv_wait(&sp->s_cv_otw_count, &sp->s_lock);
4285 if (async_thread) {
4286 mutex_enter(&cpr_lock);
4287 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
4288 mutex_exit(&cpr_lock);
4291 mutex_exit(&sp->s_lock);
4292 nfs4_server_rele(sp);
4293 sp = NULL;
4296 mutex_enter(&mi->mi_lock);
4297 while (mi->mi_in_recovery != 0) {
4298 if (async_thread) {
4299 mutex_enter(&cpr_lock);
4300 CALLB_CPR_SAFE_BEGIN(&cpr_info);
4301 mutex_exit(&cpr_lock);
4303 cv_wait(&mi->mi_cv_in_recov, &mi->mi_lock);
4304 if (async_thread) {
4305 mutex_enter(&cpr_lock);
4306 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
4307 mutex_exit(&cpr_lock);
4310 mutex_exit(&mi->mi_lock);
4313 * If we got an error, then do not nuke the
4314 * tree. Either the harvester is busy reclaiming
4315 * this node or we ran into some busy condition.
4317 * The harvester will eventually come along and cleanup.
4318 * The only problem would be the root mount point.
4320 * Since the busy node can occur for a variety
4321 * of reasons and can result in an entry staying
4322 * in df output but no longer accessible from the
4323 * directory tree, we are okay.
4325 if (!nfs4_ephemeral_umount(mi, flag, cr,
4326 &must_unlock, &eph_tree))
4327 nfs4_ephemeral_umount_activate(mi, &must_unlock,
4328 &eph_tree);
4331 * The original purge of the dnlc via 'dounmount'
4332 * doesn't guarantee that another dnlc entry was not
4333 * added while we waitied for all outstanding OTW
4334 * and recovery calls to finish. So re-purge the
4335 * dnlc now.
4337 (void) dnlc_purge_vfsp(vfsp, 0);
4340 * We need to explicitly stop the manager thread; the asyc worker
4341 * threads can timeout and exit on their own.
4343 mutex_enter(&mi->mi_async_lock);
4344 mi->mi_max_threads = 0;
4345 NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
4346 mutex_exit(&mi->mi_async_lock);
4347 if (mi->mi_manager_thread)
4348 nfs4_async_manager_stop(vfsp);
4350 destroy_rtable4(vfsp, cr);
4352 nfs4_remove_mi_from_server(mi, NULL);
4354 if (async_thread) {
4355 mutex_enter(&cpr_lock);
4356 CALLB_CPR_EXIT(&cpr_info); /* drops cpr_lock */
4357 mutex_destroy(&cpr_lock);
4360 removed = nfs4_mi_zonelist_remove(mi);
4361 if (removed)
4362 zone_rele_ref(&mi->mi_zone_ref, ZONE_REF_NFSV4);
4365 /* Referral related sub-routines */
4367 /* Freeup knetconfig */
4368 static void
4369 free_knconf_contents(struct knetconfig *k)
4371 if (k == NULL)
4372 return;
4373 if (k->knc_protofmly)
4374 kmem_free(k->knc_protofmly, KNC_STRSIZE);
4375 if (k->knc_proto)
4376 kmem_free(k->knc_proto, KNC_STRSIZE);
4380 * This updates newpath variable with exact name component from the
4381 * path which gave us a NFS4ERR_MOVED error.
4382 * If the path is /rp/aaa/bbb and nth value is 1, aaa is returned.
4384 static char *
4385 extract_referral_point(const char *svp, int nth)
4387 int num_slashes = 0;
4388 const char *p;
4389 char *newpath = NULL;
4390 int i = 0;
4392 newpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
4393 for (p = svp; *p; p++) {
4394 if (*p == '/')
4395 num_slashes++;
4396 if (num_slashes == nth + 1) {
4397 p++;
4398 while (*p != '/') {
4399 if (*p == '\0')
4400 break;
4401 newpath[i] = *p;
4402 i++;
4403 p++;
4405 newpath[i++] = '\0';
4406 break;
4409 return (newpath);
4413 * This sets up a new path in sv_path to do a lookup of the referral point.
4414 * If the path is /rp/aaa/bbb and the referral point is aaa,
4415 * this updates /rp/aaa. This path will be used to get referral
4416 * location.
4418 static void
4419 setup_newsvpath(servinfo4_t *svp, int nth)
4421 int num_slashes = 0, pathlen, i = 0;
4422 char *newpath, *p;
4424 newpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
4425 for (p = svp->sv_path; *p; p++) {
4426 newpath[i] = *p;
4427 if (*p == '/')
4428 num_slashes++;
4429 if (num_slashes == nth + 1) {
4430 newpath[i] = '\0';
4431 pathlen = strlen(newpath) + 1;
4432 kmem_free(svp->sv_path, svp->sv_pathlen);
4433 svp->sv_path = kmem_alloc(pathlen, KM_SLEEP);
4434 svp->sv_pathlen = pathlen;
4435 bcopy(newpath, svp->sv_path, pathlen);
4436 break;
4438 i++;
4440 kmem_free(newpath, MAXPATHLEN);