4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
30 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
34 #include <sys/param.h>
35 #include <sys/types.h>
36 #include <sys/systm.h>
37 #include <sys/cmn_err.h>
38 #include <sys/vtrace.h>
39 #include <sys/session.h>
40 #include <sys/thread.h>
46 #include <sys/policy.h>
48 #include <rpc/types.h>
53 #include <nfs/nfs_clnt.h>
56 #include <nfs/rnode4.h>
57 #include <nfs/nfs4_clnt.h>
60 * client side statistics
/*
 * clstat4_tmpl: constant table of named client statistics.  Each visible
 * entry pairs a counter name with the type KSTAT_DATA_UINT64.  Several of
 * these names match fields bumped elsewhere in this file through
 * nfscl->nfscl_stat (calls, badcalls, clgets, cltoomany, clalloc,
 * noresponse).
 * NOTE(review): the embedded line numbers skip (68 -> 70, and nothing after
 * 73), so one entry and the closing of the initializer are not present in
 * this extract -- confirm against the full file.
 */
62 static const struct clstat4 clstat4_tmpl
= {
63 { "calls", KSTAT_DATA_UINT64
},
64 { "badcalls", KSTAT_DATA_UINT64
},
65 { "referrals", KSTAT_DATA_UINT64
},
66 { "referlinks", KSTAT_DATA_UINT64
},
67 { "clgets", KSTAT_DATA_UINT64
},
68 { "cltoomany", KSTAT_DATA_UINT64
},
70 { "clalloc", KSTAT_DATA_UINT64
},
71 { "noresponse", KSTAT_DATA_UINT64
},
72 { "failover", KSTAT_DATA_UINT64
},
73 { "remap", KSTAT_DATA_UINT64
},
/*
 * clstat4_debug: non-static, writable table of named debug counters, all
 * declared as KSTAT_DATA_UINT64.  The clreclaim member is incremented
 * directly by clreclaim4_zone() in this file.
 * NOTE(review): the closing of the initializer is not present in this
 * extract.
 */
78 struct clstat4_debug clstat4_debug
= {
79 { "nrnode", KSTAT_DATA_UINT64
},
80 { "access", KSTAT_DATA_UINT64
},
81 { "dirent", KSTAT_DATA_UINT64
},
82 { "dirents", KSTAT_DATA_UINT64
},
83 { "reclaim", KSTAT_DATA_UINT64
},
84 { "clreclaim", KSTAT_DATA_UINT64
},
85 { "f_reclaim", KSTAT_DATA_UINT64
},
86 { "a_reclaim", KSTAT_DATA_UINT64
},
87 { "r_reclaim", KSTAT_DATA_UINT64
},
88 { "r_path", KSTAT_DATA_UINT64
},
93 * We keep a global list of per-zone client data, so we can clean up all zones
94 * if we get low on memory.
/*
 * File-scope state.
 *
 * nfs4_clnt_list / nfs4_clnt_list_lock: list of struct nfs4_clnt entries and
 * the mutex held by clreclaim4() while it walks that list.
 * nfs4clnt_zone_key: key passed to zone_getspecific() in rfs4call() to find
 * the struct nfs4_clnt of the current zone.
 * chtab4_cache: kmem cache from which clget4() allocates, and to which
 * clget4()/clreclaim4_zone() free, struct chtab entries.
 * nfs4_rfscall_debug: flag handed to NFS4_DEBUG() in nfs4_rfscall().
 * nfs4_try_failover_any: not referenced in the visible part of this file;
 * presumably consulted by try_failover() -- confirm.
 * nfs4_utf8_debug: when non-zero, utf8_to_fn() logs the offending name.
 */
96 static list_t nfs4_clnt_list
;
97 static kmutex_t nfs4_clnt_list_lock
;
98 zone_key_t nfs4clnt_zone_key
;
100 static struct kmem_cache
*chtab4_cache
;
103 static int nfs4_rfscall_debug
;
104 static int nfs4_try_failover_any
;
105 int nfs4_utf8_debug
= 0;
109 * NFSv4 readdir cache implementation
/*
 * rddir4_cache_impl: private wrapper around a public rddir4_cache element.
 * It adds a reference count guarded by its own mutex and the node used to
 * link the entry into an AVL tree.
 * NOTE(review): the closing brace and typedef name line are not present in
 * this extract; the prototypes that follow use the name rddir4_cache_impl.
 */
111 typedef struct rddir4_cache_impl
{
112 rddir4_cache rc
; /* readdir cache element */
113 kmutex_t lock
; /* lock protects count */
114 uint_t count
; /* reference count */
115 avl_node_t tree
; /* AVL tree link */
/*
 * Forward declarations for the file-local readdir cache helpers and for
 * try_failover(), followed by three file-local integer counters for the
 * readdir cache (hits, waits, misses), each starting at zero.
 * try_failover() is called from nfs4_rfscall() with the RPC status of a
 * failed call.
 */
118 static int rddir4_cache_compar(const void *, const void *);
119 static void rddir4_cache_free(rddir4_cache_impl
*);
120 static rddir4_cache
*rddir4_cache_alloc(int);
121 static void rddir4_cache_hold(rddir4_cache
*);
122 static int try_failover(enum clnt_stat
);
124 static int nfs4_readdir_cache_hits
= 0;
125 static int nfs4_readdir_cache_waits
= 0;
126 static int nfs4_readdir_cache_misses
= 0;
129 * Shared nfs4 functions
133 * Copy an nfs_fh4. The destination storage (to->nfs_fh4_val) must already
 * be allocated.
/*
 * nfs_fh4_copy: copy the filehandle *from into *to.
 *
 * from: source filehandle (length plus value buffer).
 * to:   destination; its nfs_fh4_val buffer is written, never allocated or
 *       size-checked here.
 *
 * The length is assigned first and the same field is then used as the byte
 * count for bcopy().
 */
138 nfs_fh4_copy(nfs_fh4
*from
, nfs_fh4
*to
)
140 to
->nfs_fh4_len
= from
->nfs_fh4_len
;
/* copy exactly the number of bytes just recorded in to->nfs_fh4_len */
141 bcopy(from
->nfs_fh4_val
, to
->nfs_fh4_val
, to
->nfs_fh4_len
);
145 * nfs4cmpfh - compare 2 filehandles.
146 * Returns 0 if the two nfsv4 filehandles are the same, -1 if the first is
147 * "less" than the second, +1 if the first is "greater" than the second.
/*
 * nfs4cmpfh: order two filehandles.
 *
 * The lengths are compared first (shorter versus longer); only when neither
 * length test fires does the loop walk both value buffers in step, bounded
 * by the length of the first handle.
 * NOTE(review): the return statements and the loop body are not present in
 * this extract, so the exact results of each branch cannot be confirmed
 * from here.
 */
151 nfs4cmpfh(const nfs_fh4
*fh4p1
, const nfs_fh4
*fh4p2
)
155 if (fh4p1
->nfs_fh4_len
< fh4p2
->nfs_fh4_len
)
157 if (fh4p1
->nfs_fh4_len
> fh4p2
->nfs_fh4_len
)
/* c1 and c2 start at the first byte of each handle */
159 for (c1
= fh4p1
->nfs_fh4_val
, c2
= fh4p2
->nfs_fh4_val
;
160 c1
< fh4p1
->nfs_fh4_val
+ fh4p1
->nfs_fh4_len
;
172 * Compare two v4 filehandles. Return zero if they're the same, non-zero
173 * if they're not. Like nfs4cmpfh(), but different filehandle
174 * representation, and doesn't provide information about greater than or
 * less than.
/*
 * nfs4cmpfhandle: equality test on two nfs4_fhandle_t values.
 *
 * When the lengths match, the result is whatever bcmp() reports for the
 * two fh_buf buffers over fh_len bytes (zero when identical).
 * NOTE(review): the path taken when the lengths differ is not present in
 * this extract.
 */
179 nfs4cmpfhandle(nfs4_fhandle_t
*fh1
, nfs4_fhandle_t
*fh2
)
181 if (fh1
->fh_len
== fh2
->fh_len
)
182 return (bcmp(fh1
->fh_buf
, fh2
->fh_buf
, fh1
->fh_len
));
/*
 * stateid4_cmp: compare two stateid4 structures as raw memory.
 *
 * The test is a bcmp() over sizeof (stateid4) bytes, so every byte of the
 * structure takes part in the comparison.
 * NOTE(review): the values returned for the equal and unequal cases are not
 * present in this extract.
 */
188 stateid4_cmp(stateid4
*s1
, stateid4
*s2
)
190 if (bcmp(s1
, s2
, sizeof (stateid4
)) == 0)
/*
 * Run of return statements that each yield one NFS4ERR_* status.
 *
 * NOTE(review): the function header and every case label for this run are
 * missing from this extract.  Presumably this is the errno to nfsstat4
 * mapping that mirrors geterrno4() -- confirm against the full file.
 *
 * What the visible lines do show:
 *  - one branch tests T_WOULDBLOCK in curthread->t_flag; when set it clears
 *    the flag and returns NFS4ERR_DELAY, otherwise the following return
 *    yields NFS4ERR_LOCKED;
 *  - the last return casts the variable named error to enum nfsstat4
 *    unchanged.
 */
203 return (NFS4ERR_PERM
);
205 return (NFS4ERR_NOENT
);
211 return (NFS4ERR_NXIO
);
213 return (NFS4ERR_RESOURCE
);
215 return (NFS4ERR_ACCESS
);
219 return (NFS4ERR_EXIST
);
221 return (NFS4ERR_XDEV
);
225 return (NFS4ERR_NOTDIR
);
227 return (NFS4ERR_ISDIR
);
229 return (NFS4ERR_INVAL
);
231 return (NFS4ERR_RESOURCE
);
233 return (NFS4ERR_FBIG
);
235 return (NFS4ERR_NOSPC
);
237 return (NFS4ERR_ROFS
);
239 return (NFS4ERR_MLINK
);
241 return (NFS4ERR_DEADLOCK
);
243 return (NFS4ERR_DENIED
);
245 return (NFS4ERR_SERVERFAULT
);
247 return (NFS4ERR_NOTSUPP
);
249 return (NFS4ERR_DQUOT
);
251 return (NFS4ERR_NAMETOOLONG
);
253 return (NFS4ERR_INVAL
);
255 return (NFS4ERR_NOTSUPP
);
257 return (NFS4ERR_NOTEMPTY
);
259 return (NFS4ERR_NOTSUPP
);
261 return (NFS4ERR_STALE
);
263 if (curthread
->t_flag
& T_WOULDBLOCK
) {
264 curthread
->t_flag
&= ~T_WOULDBLOCK
;
265 return (NFS4ERR_DELAY
);
267 return (NFS4ERR_LOCKED
);
269 return ((enum nfsstat4
)error
);
/*
 * geterrno4: translate an NFSv4 status into a local errno value.
 *
 * status: the nfsstat4 value to translate.
 *
 * The body is a list of case labels, one per NFS4ERR_* status.  Only one
 * mapping is fully visible here: NFS4ERR_NAMETOOLONG yields ENAMETOOLONG.
 * A status that matches no label is logged with zcmn_err() at CE_WARN for
 * the current zone, and the final return hands back the status cast to
 * int.
 * NOTE(review): the switch statement itself, the first group of labels and
 * nearly all return statements are missing from this extract, so the other
 * mappings cannot be confirmed from here.  The errno constant mentioned in
 * the NFS4ERR_BADOWNER note below is conventionally spelled EACCES.
 */
274 geterrno4(enum nfsstat4 status
)
307 case NFS4ERR_NAMETOOLONG
:
308 return (ENAMETOOLONG
);
309 case NFS4ERR_NOTEMPTY
:
315 case NFS4ERR_BADHANDLE
:
317 case NFS4ERR_BAD_COOKIE
:
319 case NFS4ERR_NOTSUPP
:
321 case NFS4ERR_TOOSMALL
:
323 case NFS4ERR_SERVERFAULT
:
325 case NFS4ERR_BADTYPE
:
333 case NFS4ERR_EXPIRED
:
339 case NFS4ERR_FHEXPIRED
: /* if got here, failed to get a new fh */
341 case NFS4ERR_SHARE_DENIED
:
343 case NFS4ERR_WRONGSEC
:
345 case NFS4ERR_CLID_INUSE
:
347 case NFS4ERR_RESOURCE
:
351 case NFS4ERR_NOFILEHANDLE
:
353 case NFS4ERR_MINOR_VERS_MISMATCH
:
355 case NFS4ERR_STALE_CLIENTID
:
357 case NFS4ERR_STALE_STATEID
:
359 case NFS4ERR_OLD_STATEID
:
361 case NFS4ERR_BAD_STATEID
:
363 case NFS4ERR_BAD_SEQID
:
365 case NFS4ERR_NOT_SAME
:
367 case NFS4ERR_LOCK_RANGE
:
369 case NFS4ERR_SYMLINK
:
371 case NFS4ERR_RESTOREFH
:
373 case NFS4ERR_LEASE_MOVED
:
375 case NFS4ERR_ATTRNOTSUPP
:
377 case NFS4ERR_NO_GRACE
:
379 case NFS4ERR_RECLAIM_BAD
:
381 case NFS4ERR_RECLAIM_CONFLICT
:
385 case NFS4ERR_LOCKS_HELD
:
387 case NFS4ERR_OPENMODE
:
389 case NFS4ERR_BADOWNER
:
391 * Client and server are in different DNS domains
392 * and the NFSMAPID_DOMAIN in /etc/default/nfs
393 * doesn't match. No good answer here. Return
394 * EACCESS, which translates to "permission denied".
397 case NFS4ERR_BADCHAR
:
399 case NFS4ERR_BADNAME
:
401 case NFS4ERR_BAD_RANGE
:
403 case NFS4ERR_LOCK_NOTSUPP
:
405 case NFS4ERR_OP_ILLEGAL
:
407 case NFS4ERR_DEADLOCK
:
409 case NFS4ERR_FILE_OPEN
:
411 case NFS4ERR_ADMIN_REVOKED
:
413 case NFS4ERR_CB_PATH_DOWN
:
417 zcmn_err(getzoneid(), CE_WARN
, "geterrno4: got status %d",
420 return ((int)status
);
/*
 * nfs4_log_badowner: report an NFSMAPID_DOMAIN mismatch at most once per
 * server and once per mount.
 *
 * mi: mount whose flags, recovery lock and current server are consulted.
 * op: NFSv4 operation number recorded with the queued fact.
 *
 * Steps visible here:
 *  1. Test MI4_BADOWNER_DEBUG in mi->mi_flags without holding mi_lock.
 *  2. Take mi_recovlock as reader; the wait is made interruptible according
 *     to MI4_INT.
 *  3. Look up the nfs4_server_t with find_nfs4_server(); if there is none,
 *     drop mi_recovlock.
 *  4. If N4S_BADOWNER_DEBUG is clear on the server, emit the CE_WARN
 *     message for the zone of the mount and set the flag.
 *  5. Drop server->s_lock, release the server reference, drop mi_recovlock.
 *  6. Under mi_lock, queue an RF_BADOWNER fact and set MI4_BADOWNER_DEBUG
 *     if it is still clear.
 * NOTE(review): s_lock is released here but never taken in the visible
 * lines; presumably find_nfs4_server() returns with it held -- confirm.
 * The early-return statements are not present in this extract.
 */
425 nfs4_log_badowner(mntinfo4_t
*mi
, nfs_opnum4 op
)
427 nfs4_server_t
*server
;
430 * Return if already printed/queued a msg
431 * for this mount point.
433 if (mi
->mi_flags
& MI4_BADOWNER_DEBUG
)
436 * Happens once per client <-> server pair.
438 if (nfs_rw_enter_sig(&mi
->mi_recovlock
, RW_READER
,
439 mi
->mi_flags
& MI4_INT
))
442 server
= find_nfs4_server(mi
);
443 if (server
== NULL
) {
444 nfs_rw_exit(&mi
->mi_recovlock
);
448 if (!(server
->s_flags
& N4S_BADOWNER_DEBUG
)) {
449 zcmn_err(mi
->mi_zone
->zone_id
, CE_WARN
,
450 "!NFSMAPID_DOMAIN does not match"
451 " the server: %s domain.\n"
452 "Please check configuration",
453 mi
->mi_curr_serv
->sv_hostname
);
454 server
->s_flags
|= N4S_BADOWNER_DEBUG
;
456 mutex_exit(&server
->s_lock
);
457 nfs4_server_rele(server
);
458 nfs_rw_exit(&mi
->mi_recovlock
);
461 * Happens once per mntinfo4_t.
462 * This error is deemed as one of the recovery facts "RF_BADOWNER",
463 * queue this in the mesg queue for this mount_info. This message
464 * is not printed, meaning its absent from id_to_dump_solo_fact()
465 * but its there for inspection if the queue is ever dumped/inspected.
467 mutex_enter(&mi
->mi_lock
);
/* re-test the flag now that mi_lock is held */
468 if (!(mi
->mi_flags
& MI4_BADOWNER_DEBUG
)) {
469 nfs4_queue_fact(RF_BADOWNER
, mi
, NFS4ERR_BADOWNER
, 0, op
,
470 FALSE
, NULL
, 0, NULL
);
471 mi
->mi_flags
|= MI4_BADOWNER_DEBUG
;
473 mutex_exit(&mi
->mi_lock
);
/*
 * nfs4_time_ntov: convert an NFSv4 time (nfstime4) into a timestruc_t.
 *
 * ntime:  input, with fields seconds and nseconds.
 * vatime: output, with fields tv_sec and tv_nsec.
 *
 * The seconds value is first checked with NFS4_TIME_OK(), and an nseconds
 * value of 1000000000 or more is treated as invalid.  For negative seconds
 * the pair is rewritten as (seconds + 1, nseconds - 1000000000), so the
 * stored tv_nsec is then negative or zero; otherwise both fields are
 * copied unchanged.
 * NOTE(review): what happens on the two rejection paths is not present in
 * this extract.
 */
477 nfs4_time_ntov(nfstime4
*ntime
, timestruc_t
*vatime
)
483 * Here check that the nfsv4 time is valid for the system.
484 * nfsv4 time value is a signed 64-bit, and the system time
485 * may be either int64_t or int32_t (depends on the kernel),
486 * so if the kernel is 32-bit, the nfsv4 time value may not fit.
489 if (! NFS4_TIME_OK(ntime
->seconds
)) {
494 /* Invalid to specify 1 billion (or more) nsecs */
495 if (ntime
->nseconds
>= 1000000000)
498 if (ntime
->seconds
< 0) {
/* shift one whole second from the nanosecond field into seconds */
499 sec
= ntime
->seconds
+ 1;
500 nsec
= -1000000000 + ntime
->nseconds
;
502 sec
= ntime
->seconds
;
503 nsec
= ntime
->nseconds
;
506 vatime
->tv_sec
= sec
;
507 vatime
->tv_nsec
= nsec
;
/*
 * nfs4_time_vton: convert a timestruc_t into an NFSv4 time (nfstime4).
 *
 * vatime: input, with fields tv_sec and tv_nsec.
 * ntime:  output, with fields seconds and nseconds.
 *
 * A non-negative tv_nsec is copied through unchanged.  A negative tv_nsec
 * is normalised by borrowing one second: seconds becomes tv_sec - 1 and
 * nseconds becomes 1000000000 + tv_nsec.  This is the inverse of the
 * negative-seconds adjustment made in nfs4_time_ntov().
 */
513 nfs4_time_vton(timestruc_t
*vatime
, nfstime4
*ntime
)
519 * nfsv4 time value is a signed 64-bit, and the system time
520 * may be either int64_t or int32_t (depends on the kernel),
521 * so all system time values will fit.
523 if (vatime
->tv_nsec
>= 0) {
524 sec
= vatime
->tv_sec
;
525 nsec
= vatime
->tv_nsec
;
/* negative nanoseconds: borrow one second */
527 sec
= vatime
->tv_sec
- 1;
528 nsec
= 1000000000 + vatime
->tv_nsec
;
530 ntime
->seconds
= sec
;
531 ntime
->nseconds
= nsec
;
537 * Converts a utf8 string to a valid null terminated filename string.
539 * XXX - Not actually translating the UTF-8 string as per RFC 2279.
540 * For now, just validate that the UTF-8 string off the wire
541 * does not have characters that will freak out UFS, and leave
 * it at that.
/*
 * utf8_to_fn: validate a utf8string as a file name and convert it with
 * utf8_to_str().
 *
 * u8s:  counted string to convert.
 * lenp: must be non-NULL (asserted); passed through to utf8_to_str().
 * s:    passed through to utf8_to_str().
 *
 * A NULL string, a length that is not positive, or a NULL value pointer is
 * rejected up front.  A name containing a slash is rejected as well; when
 * nfs4_utf8_debug is set, a temporary copy of the name of len + 1 bytes is
 * allocated, logged with zcmn_err() at CE_WARN and freed again.
 * NOTE(review): the return statements of the rejection paths and the line
 * that terminates the temporary copy are not present in this extract.
 */
545 utf8_to_fn(utf8string
*u8s
, uint_t
*lenp
, char *s
)
547 ASSERT(lenp
!= NULL
);
549 if (u8s
== NULL
|| u8s
->utf8string_len
<= 0 ||
550 u8s
->utf8string_val
== NULL
)
554 * Check for obvious illegal filename chars
556 if (utf8_strchr(u8s
, '/') != NULL
) {
558 if (nfs4_utf8_debug
) {
560 int len
= u8s
->utf8string_len
;
/* one extra byte beyond the counted data */
562 path
= kmem_alloc(len
+ 1, KM_SLEEP
);
563 bcopy(u8s
->utf8string_val
, path
, len
);
566 zcmn_err(getzoneid(), CE_WARN
,
567 "Invalid UTF-8 filename: %s", path
);
569 kmem_free(path
, len
+ 1);
575 return (utf8_to_str(u8s
, lenp
, s
));
579 * Converts a utf8 string to a C string.
580 * kmem_allocs a new string if not supplied
/*
 * utf8_to_str: convert a counted utf8string into a C string.
 *
 * str:  source string; its value pointer and length are read.
 * lenp: must be non-NULL (asserted).
 * s:    caller buffer; the comment preceding this function says a new
 *       string is allocated when none is supplied.
 *
 * A length that is not positive or a NULL value pointer takes an early
 * path.  The visible allocation is len + 1 bytes with KM_SLEEP.  Each byte
 * is then checked for an embedded NUL; finding one logs a CE_WARN message
 * and frees the len + 1 byte buffer.
 * NOTE(review): the early-path body, the copy itself, the condition that
 * selects between s and the new buffer, and all return statements are not
 * present in this extract.
 */
583 utf8_to_str(utf8string
*str
, uint_t
*lenp
, char *s
)
590 ASSERT(lenp
!= NULL
);
595 u8p
= str
->utf8string_val
;
596 len
= str
->utf8string_len
;
597 if (len
<= 0 || u8p
== NULL
) {
605 sp
= kmem_alloc(len
+ 1, KM_SLEEP
);
608 * At least check for embedded nulls
610 for (i
= 0; i
< len
; i
++) {
612 if (u8p
[i
] == '\0') {
614 zcmn_err(getzoneid(), CE_WARN
,
615 "Embedded NULL in UTF-8 string");
/* size matches the len + 1 allocation above */
618 kmem_free(sp
, len
+ 1);
629 * str_to_utf8 - converts a null-terminated C string to a utf8 string
/*
 * str_to_utf8: build a utf8string from a NUL-terminated C string.
 *
 * nm:  source C string; NULL or empty produces an empty result.
 * str: destination; utf8string_len and utf8string_val are overwritten.
 *
 * For NULL or empty input the destination is set to length 0 with a NULL
 * value pointer.  Otherwise a buffer of len bytes is allocated with
 * KM_SLEEP and len bytes are copied from nm into it.
 * NOTE(review): the line that computes len is not present in this extract;
 * presumably it is the string length without the terminator -- confirm.
 * The return statements are likewise not visible.
 */
632 str_to_utf8(char *nm
, utf8string
*str
)
639 if (nm
== NULL
|| *nm
== '\0') {
640 str
->utf8string_len
= 0;
641 str
->utf8string_val
= NULL
;
646 str
->utf8string_val
= kmem_alloc(len
, KM_SLEEP
);
647 str
->utf8string_len
= len
;
648 bcopy(nm
, str
->utf8string_val
, len
);
/*
 * utf8_copy: duplicate the utf8string *src into *dest.
 *
 * When the source length is greater than zero, a new buffer of that many
 * bytes is allocated for dest, the bytes are copied and the length is
 * recorded.  Otherwise dest is set to a NULL value pointer with length 0.
 * Any buffer dest pointed at before the call is not freed in the visible
 * lines.
 * NOTE(review): the allocation flag argument, any checks ahead of the
 * length test and the return statement are not present in this extract.
 */
654 utf8_copy(utf8string
*src
, utf8string
*dest
)
661 if (src
->utf8string_len
> 0) {
662 dest
->utf8string_val
= kmem_alloc(src
->utf8string_len
,
664 bcopy(src
->utf8string_val
, dest
->utf8string_val
,
665 src
->utf8string_len
);
666 dest
->utf8string_len
= src
->utf8string_len
;
668 dest
->utf8string_val
= NULL
;
669 dest
->utf8string_len
= 0;
/*
 * utf8_compare: order two utf8string values.
 *
 * Both pointers being NULL is handled first.  A string whose length is 0
 * or whose value pointer is NULL is treated as empty: two empty strings
 * take one branch, and an empty string against a non-empty one takes a
 * branch of its own in each direction.  Otherwise the first MIN(alen, blen)
 * bytes are compared with strncmp(), and the lengths decide the outcome
 * when that prefix compares equal.
 * NOTE(review): strncmp() stops at a NUL byte, so bytes after an embedded
 * NUL do not take part in the comparison.  The values returned by each
 * branch, and the handling of exactly one NULL argument, are not present
 * in this extract.
 */
676 utf8_compare(const utf8string
*a
, const utf8string
*b
)
682 if ((a
== NULL
) && (b
== NULL
))
689 alen
= a
->utf8string_len
;
690 blen
= b
->utf8string_len
;
691 aval
= a
->utf8string_val
;
692 bval
= b
->utf8string_val
;
694 if (((alen
== 0) || (aval
== NULL
)) &&
695 ((blen
== 0) || (bval
== NULL
)))
697 else if ((alen
== 0) || (aval
== NULL
))
699 else if ((blen
== 0) || (bval
== NULL
))
/* compare only the common prefix; lengths break the tie below */
702 mlen
= MIN(alen
, blen
);
703 cmp
= strncmp(aval
, bval
, mlen
);
705 if ((cmp
== 0) && (alen
== blen
))
707 else if ((cmp
== 0) && (alen
< blen
))
717 * utf8_dir_verify - checks that the utf8 string is valid
/*
 * utf8_dir_verify: reject utf8string values that cannot name a directory
 * entry.
 *
 * Results visible here:
 *  - NFS4ERR_INVAL when the value pointer is NULL or the length is 0 (and
 *    on one earlier test whose condition is not present in this extract);
 *  - NFS4ERR_BADNAME for the names dot and dot-dot, for any name containing
 *    a slash, and for any name containing a NUL byte.
 * NOTE(review): the return for a name that passes every test is not present
 * in this extract.
 */
720 utf8_dir_verify(utf8string
*str
)
726 return (NFS4ERR_INVAL
);
728 nm
= str
->utf8string_val
;
729 len
= str
->utf8string_len
;
730 if (nm
== NULL
|| len
== 0) {
731 return (NFS4ERR_INVAL
);
734 if (len
== 1 && nm
[0] == '.')
735 return (NFS4ERR_BADNAME
);
736 if (len
== 2 && nm
[0] == '.' && nm
[1] == '.')
737 return (NFS4ERR_BADNAME
);
739 if (utf8_strchr(str
, '/') != NULL
)
740 return (NFS4ERR_BADNAME
);
742 if (utf8_strchr(str
, '\0') != NULL
)
743 return (NFS4ERR_BADNAME
);
749 * from rpcsec module (common/rpcsec)
/*
 * External entry points used for auth handles in this file:
 * sec_clnt_geth() is called by authget() to obtain an AUTH for a client
 * handle, sec_clnt_freeh() is called by clfree4() to release one, and
 * sec_clnt_freeinfo() takes a struct sec_data (no caller is visible in
 * this extract).
 */
751 extern int sec_clnt_geth(CLIENT
*, struct sec_data
*, cred_t
*, AUTH
**);
752 extern void sec_clnt_freeh(AUTH
*);
753 extern void sec_clnt_freeinfo(struct sec_data
*);
756 * authget() gets an auth handle based on the security
757 * information from the servinfo in mountinfo.
758 * The auth handle is stored in ch_client->cl_auth.
760 * First security flavor of choice is to use sv_secdata
761 * which is initiated by the client. If that fails, get
762 * secinfo from the server and then select one from the
763 * server secinfo list.
765 * For RPCSEC_GSS flavor, upon success, a secure context is
766 * established between client and server.
/*
 * authget: obtain an auth handle for ch_client and store it in
 * ch_client->cl_auth.
 *
 * svp:       server info supplying the security data; sv_lock is taken as
 *            writer for the whole selection and released at the end.
 * ch_client: RPC client handle that receives the auth handle.
 * cr:        credential passed to sec_clnt_geth().
 *
 * Two selection modes are visible:
 *  - SV4_TRYSECINFO set and sv_secinfo non-NULL: walk sdata[] starting at
 *    sv_secinfo->index.  On the first entry for which sec_clnt_geth()
 *    returns 0, sv_currsec is pointed at that entry, the index is saved and
 *    SV4_TRYSECINFO is cleared.  On ETIMEDOUT or ECONNRESET the index is
 *    saved so that a retry resumes from the same entry.
 *  - otherwise: use sv_currsec when it is non-NULL, else sv_secdata.
 * The return value of nfs_rw_enter_sig() is deliberately discarded.
 * NOTE(review): the loop increment, the statements that leave the loop and
 * the final return are not present in this extract.
 */
769 authget(servinfo4_t
*svp
, CLIENT
*ch_client
, cred_t
*cr
)
774 * SV4_TRYSECINFO indicates to try the secinfo list from
775 * sv_secinfo until a successful one is reached. Point
776 * sv_currsec to the selected security mechanism for
779 (void) nfs_rw_enter_sig(&svp
->sv_lock
, RW_WRITER
, 0);
780 if ((svp
->sv_flags
& SV4_TRYSECINFO
) && svp
->sv_secinfo
) {
781 for (i
= svp
->sv_secinfo
->index
; i
< svp
->sv_secinfo
->count
;
783 if (!(error
= sec_clnt_geth(ch_client
,
784 &svp
->sv_secinfo
->sdata
[i
],
785 cr
, &ch_client
->cl_auth
))) {
/* remember the flavor that worked and stop probing next time */
787 svp
->sv_currsec
= &svp
->sv_secinfo
->sdata
[i
];
788 svp
->sv_secinfo
->index
= i
;
790 svp
->sv_flags
&= ~SV4_TRYSECINFO
;
795 * Allow the caller retry with the security flavor
796 * pointed by svp->sv_secinfo->index when
797 * ETIMEDOUT/ECONNRESET occurs.
799 if (error
== ETIMEDOUT
|| error
== ECONNRESET
) {
800 svp
->sv_secinfo
->index
= i
;
805 /* sv_currsec points to one of the entries in sv_secinfo */
806 if (svp
->sv_currsec
) {
807 error
= sec_clnt_geth(ch_client
, svp
->sv_currsec
, cr
,
808 &ch_client
->cl_auth
);
810 /* If it's null, use sv_secdata. */
811 error
= sec_clnt_geth(ch_client
, svp
->sv_secdata
, cr
,
812 &ch_client
->cl_auth
);
815 nfs_rw_exit(&svp
->sv_lock
);
821 * Common handle get program for NFS, NFS ACL, and NFS AUTH client.
/*
 * clget4: hand back an RPC client handle for (ci, svp), reusing a cached
 * one when possible.
 *
 * ci:    call parameters (program, version, read size, retransmits, flags).
 * svp:   server whose knetconfig and address select the cache bucket.
 * cr:    credential for handle (re)initialisation and authget().
 * newcl: out, receives cp->ch_client on success.
 * chp:   out; checked for NULL here (the store into it is not present in
 *        this extract).
 * nfscl: per-zone client state holding the handle cache and statistics.
 *
 * Flow visible in this extract:
 *  1. Check newcl, chp and ci for NULL; bump the clgets counter.
 *  2. Under nfscl_chtable4_lock, search nfscl_chtable4 for a chhead that
 *     matches program, version, device and protocol family, tracking the
 *     link that points at it in plistp.
 *  3. No match: drop the lock and allocate and fill a new chhead (newch)
 *     with KM_SLEEP allocations, then link a chhead at the list front.
 *     Match that is not first: unlink through plistp and move to front.
 *  4. Cached handle on ch->ch_list: unlink it, drop the lock, free any
 *     unused newch, reinitialise the handle with clnt_tli_kinit() and call
 *     authget().  On failure the handle is destroyed, the chtab entry goes
 *     back to chtab4_cache and the error is returned (EINTR when authget()
 *     reported 0 but left cl_auth NULL).
 *  5. No cached handle: count the allocation in clalloc, drop the lock,
 *     bump cltoomany, free any unused newch, allocate a chtab from
 *     chtab4_cache and create the handle with clnt_tli_kcreate() after
 *     sigintr().  A creation failure frees the chtab, undoes the clalloc
 *     count and warns unless the error is EINTR.  The auth that came with
 *     the new handle is destroyed before authget() installs the real one;
 *     an authget() failure is unwound as in step 4, additionally undoing
 *     the clalloc count.
 *  6. Success stores the handle through newcl and asserts that cl_nosignal
 *     is FALSE.
 * NOTE(review): cltoomany is incremented with a plain ++ after the lock is
 * dropped, while clalloc uses atomic operations.  Many conditions, braces
 * and returns, including the restart after allocating newch, are not
 * present in this extract.
 */
824 clget4(clinfo_t
*ci
, servinfo4_t
*svp
, cred_t
*cr
, CLIENT
**newcl
,
825 struct chtab
**chp
, struct nfs4_clnt
*nfscl
)
827 struct chhead
*ch
, *newch
;
828 struct chhead
**plistp
;
833 if (newcl
== NULL
|| chp
== NULL
|| ci
== NULL
)
840 * Find an unused handle or create one
843 nfscl
->nfscl_stat
.clgets
.value
.ui64
++;
846 * Find the correct entry in the cache to check for free
847 * client handles. The search is based on the RPC program
848 * number, program version number, dev_t for the transport
849 * device, and the protocol family.
851 mutex_enter(&nfscl
->nfscl_chtable4_lock
);
852 plistp
= &nfscl
->nfscl_chtable4
;
853 for (ch
= nfscl
->nfscl_chtable4
; ch
!= NULL
; ch
= ch
->ch_next
) {
854 if (ch
->ch_prog
== ci
->cl_prog
&&
855 ch
->ch_vers
== ci
->cl_vers
&&
856 ch
->ch_dev
== svp
->sv_knconf
->knc_rdev
&&
857 (strcmp(ch
->ch_protofmly
,
858 svp
->sv_knconf
->knc_protofmly
) == 0))
860 plistp
= &ch
->ch_next
;
864 * If we didn't find a cache entry for this quadruple, then
865 * create one. If we don't have one already preallocated,
866 * then drop the cache lock, create one, and then start over.
867 * If we did have a preallocated entry, then just add it to
868 * the front of the list.
872 mutex_exit(&nfscl
->nfscl_chtable4_lock
);
873 newch
= kmem_alloc(sizeof (*newch
), KM_SLEEP
);
874 newch
->ch_timesused
= 0;
875 newch
->ch_prog
= ci
->cl_prog
;
876 newch
->ch_vers
= ci
->cl_vers
;
877 newch
->ch_dev
= svp
->sv_knconf
->knc_rdev
;
878 newch
->ch_protofmly
= kmem_alloc(
879 strlen(svp
->sv_knconf
->knc_protofmly
) + 1,
881 (void) strcpy(newch
->ch_protofmly
,
882 svp
->sv_knconf
->knc_protofmly
);
883 newch
->ch_list
= NULL
;
888 ch
->ch_next
= nfscl
->nfscl_chtable4
;
889 nfscl
->nfscl_chtable4
= ch
;
891 * We found a cache entry, but if it isn't on the front of the
892 * list, then move it to the front of the list to try to take
893 * advantage of locality of operations.
895 } else if (ch
!= nfscl
->nfscl_chtable4
) {
896 *plistp
= ch
->ch_next
;
897 ch
->ch_next
= nfscl
->nfscl_chtable4
;
898 nfscl
->nfscl_chtable4
= ch
;
902 * If there was a free client handle cached, then remove it
903 * from the list, init it, and use it.
905 if (ch
->ch_list
!= NULL
) {
907 ch
->ch_list
= cp
->ch_list
;
908 mutex_exit(&nfscl
->nfscl_chtable4_lock
);
910 kmem_free(newch
->ch_protofmly
,
911 strlen(newch
->ch_protofmly
) + 1);
912 kmem_free(newch
, sizeof (*newch
));
914 (void) clnt_tli_kinit(cp
->ch_client
, svp
->sv_knconf
,
915 &svp
->sv_addr
, ci
->cl_readsize
, ci
->cl_retrans
, cr
);
918 * Get an auth handle.
920 error
= authget(svp
, cp
->ch_client
, cr
);
921 if (error
|| cp
->ch_client
->cl_auth
== NULL
) {
922 CLNT_DESTROY(cp
->ch_client
);
923 kmem_cache_free(chtab4_cache
, cp
);
924 return ((error
!= 0) ? error
: EINTR
);
927 *newcl
= cp
->ch_client
;
933 * There weren't any free client handles which fit, so allocate
934 * a new one and use that.
937 atomic_inc_64(&nfscl
->nfscl_stat
.clalloc
.value
.ui64
);
939 mutex_exit(&nfscl
->nfscl_chtable4_lock
);
941 nfscl
->nfscl_stat
.cltoomany
.value
.ui64
++;
943 kmem_free(newch
->ch_protofmly
, strlen(newch
->ch_protofmly
) + 1);
944 kmem_free(newch
, sizeof (*newch
));
947 cp
= kmem_cache_alloc(chtab4_cache
, KM_SLEEP
);
950 sigintr(&smask
, (int)ci
->cl_flags
& MI4_INT
);
951 error
= clnt_tli_kcreate(svp
->sv_knconf
, &svp
->sv_addr
, ci
->cl_prog
,
952 ci
->cl_vers
, ci
->cl_readsize
, ci
->cl_retrans
, cr
, &cp
->ch_client
);
956 kmem_cache_free(chtab4_cache
, cp
);
958 atomic_dec_64(&nfscl
->nfscl_stat
.clalloc
.value
.ui64
);
961 * Warning is unnecessary if error is EINTR.
963 if (error
!= EINTR
) {
964 nfs_cmn_err(error
, CE_WARN
,
965 "clget: couldn't create handle: %m\n");
969 (void) CLNT_CONTROL(cp
->ch_client
, CLSET_PROGRESS
, NULL
);
970 auth_destroy(cp
->ch_client
->cl_auth
);
973 * Get an auth handle.
975 error
= authget(svp
, cp
->ch_client
, cr
);
976 if (error
|| cp
->ch_client
->cl_auth
== NULL
) {
977 CLNT_DESTROY(cp
->ch_client
);
978 kmem_cache_free(chtab4_cache
, cp
);
980 atomic_dec_64(&nfscl
->nfscl_stat
.clalloc
.value
.ui64
);
982 return ((error
!= 0) ? error
: EINTR
);
985 *newcl
= cp
->ch_client
;
986 ASSERT(cp
->ch_client
->cl_nosignal
== FALSE
);
/*
 * nfs_clget4: build a clinfo_t from the mount and call clget4(), retrying
 * while the error is ETIMEDOUT or ECONNRESET.
 *
 * mi:    mount supplying transfer size, retransmit count, program, version
 *        and flags.
 * svp, cr, newcl, chp, nfscl: passed straight through to clget4().
 *
 * Setup: cl_readsize starts as mi_tsize and, when non-zero, grows by
 * RPC_MAXDATASIZE - NFS_MAXDATA for the RPC headers.  A soft mount
 * (MI4_HARD clear) that is marked MI4_DOWN takes the do-not-retransmit
 * branch; otherwise cl_retrans is mi_retrans.
 *
 * Inside the retry loop these conditions are tested: forced unmount or
 * zone shutdown when the caller is not the recovery thread or this is not
 * the first call; a soft mount; and a failover mount.
 * NOTE(review): the statements executed by those tests, the assignment to
 * cl_retrans on the soft and down branch, the updates of firstcall and the
 * final return are not present in this extract.
 */
992 nfs_clget4(mntinfo4_t
*mi
, servinfo4_t
*svp
, cred_t
*cr
, CLIENT
**newcl
,
993 struct chtab
**chp
, struct nfs4_clnt
*nfscl
)
997 int firstcall
, error
= 0;
1000 * Set read buffer size to rsize
1001 * and add room for RPC headers.
1003 ci
.cl_readsize
= mi
->mi_tsize
;
1004 if (ci
.cl_readsize
!= 0)
1005 ci
.cl_readsize
+= (RPC_MAXDATASIZE
- NFS_MAXDATA
);
1008 * If soft mount and server is down just try once.
1009 * meaning: do not retransmit.
1011 if (!(mi
->mi_flags
& MI4_HARD
) && (mi
->mi_flags
& MI4_DOWN
))
1014 ci
.cl_retrans
= mi
->mi_retrans
;
1016 ci
.cl_prog
= mi
->mi_prog
;
1017 ci
.cl_vers
= mi
->mi_vers
;
1018 ci
.cl_flags
= mi
->mi_flags
;
1021 * clget4 calls authget() to get an auth handle. For RPCSEC_GSS
1022 * security flavor, the client tries to establish a security context
1023 * by contacting the server. If the connection is timed out or reset,
1024 * e.g. server reboot, we will try again.
/* the recovery thread is identified by comparing against curthread */
1026 is_recov
= (curthread
== mi
->mi_recovthread
);
1030 error
= clget4(&ci
, svp
, cr
, newcl
, chp
, nfscl
);
1036 * For forced unmount and zone shutdown, bail out but
1037 * let the recovery thread do one more transmission.
1039 if ((FS_OR_ZONE_GONE4(mi
->mi_vfsp
)) &&
1040 (!is_recov
|| !firstcall
)) {
1045 /* do not retry for soft mount */
1046 if (!(mi
->mi_flags
& MI4_HARD
))
1049 /* let the caller deal with the failover case */
1050 if (FAILOVER_MOUNT4(mi
))
1055 } while (error
== ETIMEDOUT
|| error
== ECONNRESET
);
/*
 * clfree4: return a client handle to the per-zone cache.
 *
 * cl:    client handle; a non-NULL cl_auth is released with
 *        sec_clnt_freeh().
 * cp:    chtab entry that owns the handle; stamped and put back on the
 *        free list of its chhead.
 * nfscl: per-zone client state whose nfscl_chtable4_lock guards the lists.
 *
 * ch_freed is set from gethrestime_sec() before the lock is taken.  The
 * entry is pushed onto the front of cp->ch_head->ch_list, which keeps that
 * list ordered from most recently freed to least recently freed, the order
 * clreclaim4_zone() relies on.
 * NOTE(review): whether cl_auth is reset after the release is not visible
 * in this extract.
 */
1061 clfree4(CLIENT
*cl
, struct chtab
*cp
, struct nfs4_clnt
*nfscl
)
1063 if (cl
->cl_auth
!= NULL
) {
1064 sec_clnt_freeh(cl
->cl_auth
);
1069 * Timestamp this cache entry so that we know when it was last
1072 cp
->ch_freed
= gethrestime_sec();
1075 * Add the free client handle to the front of the list.
1076 * This way, the list will be sorted in youngest to oldest
1079 mutex_enter(&nfscl
->nfscl_chtable4_lock
);
1080 cp
->ch_list
= cp
->ch_head
->ch_list
;
1081 cp
->ch_head
->ch_list
= cp
;
1082 mutex_exit(&nfscl
->nfscl_chtable4_lock
);
/*
 * Hold time that clreclaim4() passes to clreclaim4_zone(), where it is
 * added to the ch_freed timestamp (taken from gethrestime_sec()) to decide
 * whether a cached handle is old enough to reclaim.
 */
1085 #define CL_HOLDTIME 60 /* time to hold client handles */
/*
 * clreclaim4_zone: destroy cached client handles of one zone that have been
 * idle for at least cl_holdtime seconds.
 *
 * nfscl:       per-zone client state whose handle cache is scanned.
 * cl_holdtime: minimum age in seconds; an entry is kept while
 *              ch_freed + cl_holdtime is greater than gethrestime_sec().
 *
 * Phase 1 runs under nfscl_chtable4_lock: for every chhead with a non-empty
 * ch_list, skip the entries that are still too young, then move the
 * remaining tail onto the private list cp.
 * Phase 2 runs after the lock is dropped: each collected entry has its
 * client handle destroyed with CLNT_DESTROY() and is returned to
 * chtab4_cache.  Finally the clalloc statistic is reduced by n using
 * atomic_add_64().
 * The clreclaim debug counter is bumped on entry with a plain ++.
 * NOTE(review): the statements that splice the tail onto cp, advance the
 * cursors, release the auth handle and count n are not present in this
 * extract.
 */
1088 clreclaim4_zone(struct nfs4_clnt
*nfscl
, uint_t cl_holdtime
)
1091 struct chtab
*cp
; /* list of objects that can be reclaimed */
1097 clstat4_debug
.clreclaim
.value
.ui64
++;
1101 * Need to reclaim some memory, so step through the cache
1102 * looking through the lists for entries which can be freed.
1106 mutex_enter(&nfscl
->nfscl_chtable4_lock
);
1109 * Here we step through each non-NULL quadruple and start to
1110 * construct the reclaim list pointed to by cp. Note that
1111 * cp will contain all eligible chtab entries. When this traversal
1112 * completes, chtab entries from the last quadruple will be at the
1113 * front of cp and entries from previously inspected quadruples have
1114 * been appended to the rear of cp.
1116 for (ch
= nfscl
->nfscl_chtable4
; ch
!= NULL
; ch
= ch
->ch_next
) {
1117 if (ch
->ch_list
== NULL
)
1120 * Search each list for entries older then
1121 * cl_holdtime seconds. The lists are maintained
1122 * in youngest to oldest order so that when the
1123 * first entry is found which is old enough, then
1124 * all of the rest of the entries on the list will
1125 * be old enough as well.
1129 while (cpl
!= NULL
&&
1130 cpl
->ch_freed
+ cl_holdtime
> gethrestime_sec()) {
1131 cpp
= &cpl
->ch_list
;
1138 while (cpe
->ch_list
!= NULL
)
1146 mutex_exit(&nfscl
->nfscl_chtable4_lock
);
1149 * If cp is empty, then there is nothing to reclaim here.
1155 * Step through the list of entries to free, destroying each client
1156 * handle and kmem_free'ing the memory for each entry.
1158 while (cp
!= NULL
) {
1162 CLNT_DESTROY(cp
->ch_client
);
1164 kmem_cache_free(chtab4_cache
, cp
);
1170 * Update clalloc so that nfsstat shows the current number
1171 * of allocated client handles.
1173 atomic_add_64(&nfscl
->nfscl_stat
.clalloc
.value
.ui64
, -n
);
/*
 * clreclaim4: run clreclaim4_zone() with CL_HOLDTIME for every entry on
 * nfs4_clnt_list.
 *
 * all: not used in the visible lines.
 *
 * nfs4_clnt_list_lock is held for the entire walk, so each per-zone
 * nfscl_chtable4_lock is taken while the list lock is held.
 * NOTE(review): the signature looks like a reclaim callback, but the place
 * where it is registered is not present in this extract.
 */
1179 clreclaim4(void *all
)
1181 struct nfs4_clnt
*nfscl
;
1184 * The system is low on memory; go through and try to reclaim some from
1185 * every zone on the system.
1187 mutex_enter(&nfs4_clnt_list_lock
);
1188 nfscl
= list_head(&nfs4_clnt_list
);
1189 for (; nfscl
!= NULL
; nfscl
= list_next(&nfs4_clnt_list
, nfscl
))
1190 clreclaim4_zone(nfscl
, CL_HOLDTIME
);
1191 mutex_exit(&nfs4_clnt_list_lock
);
1195 * Minimum time-out values indexed by call type
1196 * These units are in "eighths" of a second to avoid multiplies
/*
 * Timeout tuning used by nfs4_rfscall().
 *
 * minimum_timeo: table of minimum time-outs (its values are not present in
 *                this extract).
 * SHORTWAIT:     one tenth of NFS_COTS_TIMEO; nfs4_rfscall() caps mi_timeo
 *                with it while the zone is shutting down.
 * MAXTIMO:       upper bound for the retransmit timeout, 20 * hz.
 * backoff(tim):  leaves tim alone once it has reached MAXTIMO, otherwise
 *                applies dobackoff().
 * dobackoff(tim): doubles tim with a left shift and clamps the result to
 *                MAXTIMO.
 * Both macros evaluate their argument more than once, so the argument must
 * be free of side effects.
 */
1198 static unsigned int minimum_timeo
[] = {
1202 #define SHORTWAIT (NFS_COTS_TIMEO / 10)
1205 * Back off for retransmission timeout, MAXTIMO is in hz of a sec
1207 #define MAXTIMO (20*hz)
1208 #define backoff(tim) (((tim) < MAXTIMO) ? dobackoff(tim) : (tim))
1209 #define dobackoff(tim) ((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1))
/*
 * nfs4_rfscall: issue one RPC to the current server of a mount, retrying
 * with backoff, and translate the outcome into an errno.
 *
 * mi:          mount; flags, timeouts, counters and the current server are
 *              read from it, and several flags are updated under mi_lock.
 * which:       RPC procedure number; also indexes mi_rfsnames for messages.
 * xdrargs, argsp: argument encoder and argument block for CLNT_CALL().
 * xdrres, resp:   result decoder and result block for CLNT_CALL().
 * icr:         credential supplied by the caller (the local cr that is
 *              actually used is set up on lines not present here).
 * doqueue:     in and out flag consulted before queueing and printing the
 *              not-responding and ok messages.
 * rpc_statusp: out, receives the final enum clnt_stat.
 * flags:       RFSCALL_SOFT is the only bit tested in the visible lines.
 * nfscl:       per-zone client state (statistics and handle cache).
 *
 * Returns rpcerr.re_errno; the closing ASSERT requires it to be non-zero
 * unless re_status is RPC_SUCCESS.
 *
 * Flow visible in this extract:
 *  1. Early exit when MI4_SHUTDOWN is set (checked under mi_lock).
 *  2. Get a client handle for mi->mi_curr_serv with nfs_clget4(); a
 *     non-zero result is returned directly.
 *  3. Initial timeout in ticks is mi_timeo * hz / 10.
 *  4. Before each attempt, give up and clfree4() the handle when the mount
 *     is shutting down, when the filesystem is unmounted and the caller is
 *     not the recovery thread on its first call, or when the zone is
 *     shutting down and either MI4_TIMEDOUT is set or the same
 *     recovery-thread condition fails.  During zone shutdown the timeout is
 *     capped using SHORTWAIT.
 *  5. Signals are masked with sigintr(); cl_nosignal is set around the call
 *     when MI4_INT is clear.  A pending signal skips the call entirely.
 *  6. Status handling: an interrupt yields RPC_INTR with EINTR; an error
 *     whose re_errno is ECONNRESET is reported as RPC_PROGUNAVAIL; in the
 *     default case the noresponse counters are bumped, zone shutdown sets
 *     MI4_TIMEDOUT and frees the handle, failover mounts return when
 *     try_failover() says so, RFSCALL_SOFT and RPC_INPROGRESS are tested,
 *     the timeout is backed off, and RF_SENDQ_FULL or RF_SRV_NOT_RESPOND
 *     facts are queued (once per MI4_PRINTED episode).
 *  7. After the loop, a failure bumps badcalls, sets MI4_DOWN unless the
 *     status is RPC_INTR, and prints failure messages; version mismatches
 *     force re_errno to EIO.  A success clears MI4_DOWN and MI4_PRINTED and
 *     queues RF_SRV_OK.
 *  8. The handle is returned with clfree4() and the status is stored
 *     through rpc_statusp.
 * NOTE(review): mi_noresponse is updated under mi_lock while the per-zone
 * noresponse and badcalls statistics use a plain ++.  The loop construct,
 * the switch and case labels, the cred handling and many returns are not
 * present in this extract, so the control flow above is only a partial
 * outline.
 */
1212 nfs4_rfscall(mntinfo4_t
*mi
, rpcproc_t which
, xdrproc_t xdrargs
, caddr_t argsp
,
1213 xdrproc_t xdrres
, caddr_t resp
, cred_t
*icr
, int *doqueue
,
1214 enum clnt_stat
*rpc_statusp
, int flags
, struct nfs4_clnt
*nfscl
)
1219 struct rpc_err rpcerr
, rpcerr_tmp
;
1220 enum clnt_stat status
;
1222 struct timeval wait
;
1223 int timeo
; /* in units of hz */
1224 bool_t tryagain
, is_recov
;
1225 bool_t cred_cloned
= FALSE
;
1233 rpcerr
.re_status
= RPC_SUCCESS
;
1236 * If we know that we are rebooting then let's
1237 * not bother with doing any over the wireness.
1239 mutex_enter(&mi
->mi_lock
);
1240 if (mi
->mi_flags
& MI4_SHUTDOWN
) {
1241 mutex_exit(&mi
->mi_lock
);
1244 mutex_exit(&mi
->mi_lock
);
1247 * clget() calls clnt_tli_kinit() which clears the xid, so we
1248 * are guaranteed to reprocess the retry as a new request.
1250 svp
= mi
->mi_curr_serv
;
1251 rpcerr
.re_errno
= nfs_clget4(mi
, svp
, cr
, &client
, &ch
, nfscl
);
1252 if (rpcerr
.re_errno
!= 0)
1253 return (rpcerr
.re_errno
);
1255 timeo
= (mi
->mi_timeo
* hz
) / 10;
1258 * If hard mounted fs, retry call forever unless hard error
1261 * For forced unmount, let the recovery thread through but return
1262 * an error for all others. This is so that user processes can
1263 * exit quickly. The recovery thread bails out after one
1264 * transmission so that it can tell if it needs to continue.
1266 * For zone shutdown, behave as above to encourage quick
1267 * process exit, but also fail quickly when servers have
1268 * timed out before and reduce the timeouts.
1270 is_recov
= (curthread
== mi
->mi_recovthread
);
1275 NFS4_DEBUG(nfs4_rfscall_debug
, (CE_NOTE
,
1276 "nfs4_rfscall: vfs_flag=0x%x, %s",
1277 mi
->mi_vfsp
->vfs_flag
,
1278 is_recov
? "recov thread" : "not recov thread"));
1281 * It's possible while we're retrying the admin
1282 * decided to reboot.
1284 mutex_enter(&mi
->mi_lock
);
1285 if (mi
->mi_flags
& MI4_SHUTDOWN
) {
1286 mutex_exit(&mi
->mi_lock
);
1287 clfree4(client
, ch
, nfscl
);
1292 mutex_exit(&mi
->mi_lock
);
1294 if ((mi
->mi_vfsp
->vfs_flag
& VFS_UNMOUNTED
) &&
1295 (!is_recov
|| !firstcall
)) {
1296 clfree4(client
, ch
, nfscl
);
1302 if (zone_status_get(curproc
->p_zone
) >= ZONE_IS_SHUTTING_DOWN
) {
1303 mutex_enter(&mi
->mi_lock
);
1304 if ((mi
->mi_flags
& MI4_TIMEDOUT
) ||
1305 !is_recov
|| !firstcall
) {
1306 mutex_exit(&mi
->mi_lock
);
1307 clfree4(client
, ch
, nfscl
);
1312 mutex_exit(&mi
->mi_lock
);
1313 timeo
= (MIN(mi
->mi_timeo
, SHORTWAIT
) * hz
) / 10;
1317 TICK_TO_TIMEVAL(timeo
, &wait
);
1320 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1321 * and SIGTERM. (Preserving the existing masks).
1322 * Mask out SIGINT if mount option nointr is specified.
1324 sigintr(&smask
, (int)mi
->mi_flags
& MI4_INT
);
1325 if (!(mi
->mi_flags
& MI4_INT
))
1326 client
->cl_nosignal
= TRUE
;
1329 * If there is a current signal, then don't bother
1330 * even trying to send out the request because we
1331 * won't be able to block waiting for the response.
1332 * Simply assume RPC_INTR and get on with it.
1334 if (ttolwp(curthread
) != NULL
&& ISSIG(curthread
, JUSTLOOKING
))
1337 status
= CLNT_CALL(client
, which
, xdrargs
, argsp
,
1338 xdrres
, resp
, wait
);
1341 if (!(mi
->mi_flags
& MI4_INT
))
1342 client
->cl_nosignal
= FALSE
;
1344 * restore original signal mask
1354 * There is no way to recover from this error,
1355 * even if mount option nointr is specified.
1356 * SIGKILL, for example, cannot be blocked.
1358 rpcerr
.re_status
= RPC_INTR
;
1359 rpcerr
.re_errno
= EINTR
;
1364 * If the NFS server is local (vold) and
1365 * it goes away then we get RPC_UDERROR.
1366 * This is a retryable error, so we would
1367 * loop, so check to see if the specific
1368 * error was ECONNRESET, indicating that
1369 * target did not exist at all. If so,
1370 * return with RPC_PROGUNAVAIL and
1371 * ECONNRESET to indicate why.
1373 CLNT_GETERR(client
, &rpcerr
);
1374 if (rpcerr
.re_errno
== ECONNRESET
) {
1375 rpcerr
.re_status
= RPC_PROGUNAVAIL
;
1376 rpcerr
.re_errno
= ECONNRESET
;
1381 default: /* probably RPC_TIMEDOUT */
1383 if (IS_UNRECOVERABLE_RPC(status
))
1387 * increment server not responding count
1389 mutex_enter(&mi
->mi_lock
);
1390 mi
->mi_noresponse
++;
1391 mutex_exit(&mi
->mi_lock
);
1393 nfscl
->nfscl_stat
.noresponse
.value
.ui64
++;
1396 * On zone shutdown, mark server dead and move on.
1398 if (zone_status_get(curproc
->p_zone
) >=
1399 ZONE_IS_SHUTTING_DOWN
) {
1400 mutex_enter(&mi
->mi_lock
);
1401 mi
->mi_flags
|= MI4_TIMEDOUT
;
1402 mutex_exit(&mi
->mi_lock
);
1403 clfree4(client
, ch
, nfscl
);
1410 * NFS client failover support:
1411 * return and let the caller take care of
1412 * failover. We only return for failover mounts
1413 * because otherwise we want the "not responding"
1414 * message, the timer updates, etc.
1416 if (mi
->mi_vers
== 4 && FAILOVER_MOUNT4(mi
) &&
1417 (error
= try_failover(status
)) != 0) {
1418 clfree4(client
, ch
, nfscl
);
1421 *rpc_statusp
= status
;
1425 if (flags
& RFSCALL_SOFT
)
1431 * The call is in progress (over COTS).
1432 * Try the CLNT_CALL again, but don't
1433 * print a noisy error message.
1435 if (status
== RPC_INPROGRESS
)
1438 timeo
= backoff(timeo
);
1439 CLNT_GETERR(client
, &rpcerr_tmp
);
1441 mutex_enter(&mi
->mi_lock
);
1442 if (!(mi
->mi_flags
& MI4_PRINTED
)) {
1443 mi
->mi_flags
|= MI4_PRINTED
;
1444 mutex_exit(&mi
->mi_lock
);
1445 if ((status
== RPC_CANTSEND
) &&
1446 (rpcerr_tmp
.re_errno
== ENOBUFS
))
1447 nfs4_queue_fact(RF_SENDQ_FULL
, mi
, 0,
1448 0, 0, FALSE
, NULL
, 0, NULL
);
1450 nfs4_queue_fact(RF_SRV_NOT_RESPOND
, mi
,
1451 0, 0, 0, FALSE
, NULL
, 0, NULL
);
1453 mutex_exit(&mi
->mi_lock
);
1455 if (*doqueue
&& nfs_has_ctty()) {
1457 if (!(mi
->mi_flags
& MI4_NOPRINT
)) {
1458 if ((status
== RPC_CANTSEND
) &&
1459 (rpcerr_tmp
.re_errno
== ENOBUFS
))
1460 nfs4_queue_fact(RF_SENDQ_FULL
,
1461 mi
, 0, 0, 0, FALSE
, NULL
,
1465 RF_SRV_NOT_RESPOND
, mi
, 0,
1466 0, 0, FALSE
, NULL
, 0, NULL
);
1472 DTRACE_PROBE2(nfs4__rfscall_debug
, enum clnt_stat
, status
,
1473 int, rpcerr
.re_errno
);
1475 if (status
!= RPC_SUCCESS
) {
1476 zoneid_t zoneid
= mi
->mi_zone
->zone_id
;
1479 * Let soft mounts use the timed out message.
1481 if (status
== RPC_INPROGRESS
)
1482 status
= RPC_TIMEDOUT
;
1483 nfscl
->nfscl_stat
.badcalls
.value
.ui64
++;
1484 if (status
!= RPC_INTR
) {
1485 mutex_enter(&mi
->mi_lock
);
1486 mi
->mi_flags
|= MI4_DOWN
;
1487 mutex_exit(&mi
->mi_lock
);
1488 CLNT_GETERR(client
, &rpcerr
);
1490 bufp
= clnt_sperror(client
, svp
->sv_hostname
);
1491 zprintf(zoneid
, "NFS%d %s failed for %s\n",
1492 mi
->mi_vers
, mi
->mi_rfsnames
[which
], bufp
);
1493 if (nfs_has_ctty()) {
1494 if (!(mi
->mi_flags
& MI4_NOPRINT
)) {
1495 uprintf("NFS%d %s failed for %s\n",
1496 mi
->mi_vers
, mi
->mi_rfsnames
[which
],
1500 kmem_free(bufp
, MAXPATHLEN
);
1503 "NFS %s failed for server %s: error %d (%s)\n",
1504 mi
->mi_rfsnames
[which
], svp
->sv_hostname
,
1505 status
, clnt_sperrno(status
));
1506 if (nfs_has_ctty()) {
1507 if (!(mi
->mi_flags
& MI4_NOPRINT
)) {
1509 "NFS %s failed for server %s: error %d (%s)\n",
1510 mi
->mi_rfsnames
[which
],
1511 svp
->sv_hostname
, status
,
1512 clnt_sperrno(status
));
1517 * when CLNT_CALL() fails with RPC_AUTHERROR,
1518 * re_errno is set appropriately depending on
1519 * the authentication error
1521 if (status
== RPC_VERSMISMATCH
||
1522 status
== RPC_PROGVERSMISMATCH
)
1523 rpcerr
.re_errno
= EIO
;
1527 * Test the value of mi_down and mi_printed without
1528 * holding the mi_lock mutex. If they are both zero,
1529 * then it is okay to skip the down and printed
1530 * processing. This saves on a mutex_enter and
1531 * mutex_exit pair for a normal, successful RPC.
1532 * This was just complete overhead.
1534 if (mi
->mi_flags
& (MI4_DOWN
| MI4_PRINTED
)) {
1535 mutex_enter(&mi
->mi_lock
);
1536 mi
->mi_flags
&= ~MI4_DOWN
;
1537 if (mi
->mi_flags
& MI4_PRINTED
) {
1538 mi
->mi_flags
&= ~MI4_PRINTED
;
1539 mutex_exit(&mi
->mi_lock
);
1540 if (!(mi
->mi_vfsp
->vfs_flag
& VFS_UNMOUNTED
))
1541 nfs4_queue_fact(RF_SRV_OK
, mi
, 0, 0,
1542 0, FALSE
, NULL
, 0, NULL
);
1544 mutex_exit(&mi
->mi_lock
);
1547 if (*doqueue
== 0) {
1548 if (!(mi
->mi_flags
& MI4_NOPRINT
) &&
1549 !(mi
->mi_vfsp
->vfs_flag
& VFS_UNMOUNTED
))
1550 nfs4_queue_fact(RF_SRV_OK
, mi
, 0, 0, 0,
1551 FALSE
, NULL
, 0, NULL
);
1557 clfree4(client
, ch
, nfscl
);
1561 ASSERT(rpcerr
.re_status
== RPC_SUCCESS
|| rpcerr
.re_errno
!= 0);
1563 TRACE_1(TR_FAC_NFS
, TR_RFSCALL_END
, "nfs4_rfscall_end:errno %d",
1566 *rpc_statusp
= status
;
1567 return (rpcerr
.re_errno
);
1571 * rfs4call - general wrapper for RPC calls initiated by the client
/*
 * rfs4call: send one NFSv4 COMPOUND request through nfs4_rfscall() and
 * report the outcome in *ep.
 *
 * Visible steps:
 *  - asserts that the caller runs in the zone owning mi, then fetches the
 *    per-zone nfs4_clnt with zone_getspecific();
 *  - increments the per-zone "calls" counter and the mount-level
 *    mi_reqs[NFSPROC4_COMPOUND] counter;
 *  - points resp->argsp at argsp and zeroes resp->decode_len before the
 *    call;
 *  - copies resp->status and the RPC status into ep->stat / ep->rpc_status;
 *  - for each of the resp->decode_len decoded results, increments
 *    mi_reqs[resop] when resop lies in [NFSPROC4_NULL, OP_WRITE].
 *
 * NOTE(review): rpc_status is an enum clnt_stat but is initialised with
 * NFS4_OK; RPC_SUCCESS looks like the intended constant -- confirm.
 * NOTE(review): the return type, braces, local declarations and the
 * early-return branch are missing from this extract, so the control flow
 * between the two ep->stat assignments cannot be confirmed from here.
 */
1574 rfs4call(mntinfo4_t
*mi
, COMPOUND4args_clnt
*argsp
, COMPOUND4res_clnt
*resp
,
1575 cred_t
*cr
, int *doqueue
, int flags
, nfs4_error_t
*ep
)
1578 enum clnt_stat rpc_status
= NFS4_OK
;
1580 struct nfs4_clnt
*nfscl
;
1582 ASSERT(nfs_zone() == mi
->mi_zone
);
1583 nfscl
= zone_getspecific(nfs4clnt_zone_key
, nfs_zone());
1584 ASSERT(nfscl
!= NULL
);
1586 nfscl
->nfscl_stat
.calls
.value
.ui64
++;
1587 mi
->mi_reqs
[NFSPROC4_COMPOUND
].value
.ui64
++;
1589 /* Set up the results struct for XDR usage */
1590 resp
->argsp
= argsp
;
1593 resp
->decode_len
= 0;
1595 error
= nfs4_rfscall(mi
, NFSPROC4_COMPOUND
,
1596 xdr_COMPOUND4args_clnt
, (caddr_t
)argsp
,
1597 xdr_COMPOUND4res_clnt
, (caddr_t
)resp
, cr
,
1598 doqueue
, &rpc_status
, flags
, nfscl
);
1600 /* Return now if it was an RPC error */
1603 ep
->stat
= resp
->status
;
1604 ep
->rpc_status
= rpc_status
;
1608 /* else we'll count the processed operations */
1609 num_resops
= resp
->decode_len
;
1610 for (i
= 0; i
< num_resops
; i
++) {
1612 * Count the individual operations
1613 * processed by the server.
1615 if (resp
->array
[i
].resop
>= NFSPROC4_NULL
&&
1616 resp
->array
[i
].resop
<= OP_WRITE
)
1617 mi
->mi_reqs
[resp
->array
[i
].resop
].value
.ui64
++;
1621 ep
->stat
= resp
->status
;
1622 ep
->rpc_status
= rpc_status
;
1626 * nfs4rename_update - updates stored state after a rename. Currently this
1627 * is the path of the object and anything under it, and the filehandle of
1628 * the renamed object.
/*
 * nfs4rename_update: refresh the client-side state of a renamed object.
 *
 * renvp - vnode of the object that was renamed
 * ndvp  - vnode of the new parent directory
 * nfh4p - filehandle to store for the renamed object
 * nnm   - new component name
 *
 * Two updates are made: the shared filehandle behind renvp's rnode is
 * replaced with *nfh4p (sfh4_update), and renvp's stored name is moved
 * under ndvp's name with component nnm (fn_move).
 */
1631 nfs4rename_update(vnode_t
*renvp
, vnode_t
*ndvp
, nfs_fh4
*nfh4p
, char *nnm
)
1633 sfh4_update(VTOR4(renvp
)->r_fh
, nfh4p
);
1634 fn_move(VTOSV(renvp
)->sv_name
, VTOSV(ndvp
)->sv_name
, nnm
);
1638 * Routine to look up the filehandle for the given path and rootvp.
1641 * - success: returns zero and *statp is set to NFS4_OK, and *fhp is
1643 * - error: return value (errno value) and/or *statp is set appropriately.
/*
 * Values for the filetype argument of remap_lookup(). They select which
 * l4_getattrs mode and compound tag the lookup uses.
 */
1645 #define RML_ORDINARY 1
1646 #define RML_NAMED_ATTR 2
1647 #define RML_ATTRDIR 3
/*
 * remap_lookup: look up fname relative to rootvp over the wire and return
 * the filehandle and attributes of the object and, when the reply is long
 * enough, of its parent. Errors are reported through *ep.
 *
 * Visible steps:
 *  - path = fn_path(fname); it is released at the end with
 *    kmem_free(path, strlen(path)+1);
 *  - filetype selects lookuparg.l4_getattrs and args.ctag;
 *  - nfs4lookup_setup() builds the op array, argop[0] is set to OP_CPUTFH
 *    with the filehandle of rootvp, and the compound is sent with
 *    rfs4call(..., RFSCALL_SOFT, ep);
 *  - object filehandle is taken from res.array[array_len - 2], object
 *    attributes from res.array[array_len - 1];
 *  - parent filehandle is taken from res.array[array_len - 5], parent
 *    attributes from res.array[array_len - 4]; this section is preceded
 *    by the test "(int)res.array_len - 5 <= 0" (its consequent is missing
 *    from this extract);
 *  - a filehandle slot that is not OP_GETFH, or a filehandle longer than
 *    NFS4_FHSIZE, queues a recovery event and sets
 *    ep->stat = NFS4ERR_SERVERFAULT;
 *  - attributes are copied out only when garp / pgarp is non-NULL and the
 *    slot holds OP_GETATTR.
 *
 * Ownership: fhp->nfs_fh4_val and pfhp->nfs_fh4_val are allocated here
 * with kmem_alloc(len, KM_SLEEP); the callers visible in this file free
 * them with kmem_free(val, len).
 *
 * NOTE(review): pgarp is NULL-checked but pfhp is dereferenced without a
 * check, and nfs4_make_dotdot() passes NULL for both on its second call.
 * Confirm the parent section cannot be reached in that case.
 */
1650 remap_lookup(nfs4_fname_t
*fname
, vnode_t
*rootvp
,
1651 int filetype
, cred_t
*cr
,
1652 nfs_fh4
*fhp
, nfs4_ga_res_t
*garp
, /* fh, attrs for object */
1653 nfs_fh4
*pfhp
, nfs4_ga_res_t
*pgarp
, /* fh, attrs for parent */
1656 COMPOUND4args_clnt args
;
1657 COMPOUND4res_clnt res
;
1661 lookup4_param_t lookuparg
;
1667 ASSERT(fname
!= NULL
);
1668 ASSERT(rootvp
->v_type
== VDIR
);
1670 mi
= VTOMI4(rootvp
);
1671 path
= fn_path(fname
);
1673 case RML_NAMED_ATTR
:
1674 lookuparg
.l4_getattrs
= LKP4_LAST_NAMED_ATTR
;
1675 args
.ctag
= TAG_REMAP_LOOKUP_NA
;
1678 lookuparg
.l4_getattrs
= LKP4_LAST_ATTRDIR
;
1679 args
.ctag
= TAG_REMAP_LOOKUP_AD
;
1682 lookuparg
.l4_getattrs
= LKP4_ALL_ATTRIBUTES
;
1683 args
.ctag
= TAG_REMAP_LOOKUP
;
1689 lookuparg
.argsp
= &args
;
1690 lookuparg
.resp
= &res
;
1691 lookuparg
.header_len
= 1; /* Putfh */
1692 lookuparg
.trailer_len
= 0;
1693 lookuparg
.ga_bits
= NFS4_VATTR_MASK
;
1694 lookuparg
.mi
= VTOMI4(rootvp
);
1696 (void) nfs4lookup_setup(path
, &lookuparg
, 1);
1698 /* 0: putfh directory */
1700 argop
[0].argop
= OP_CPUTFH
;
1701 argop
[0].nfs_argop4_u
.opcputfh
.sfh
= VTOR4(rootvp
)->r_fh
;
1703 num_argops
= args
.array_len
;
1705 rfs4call(mi
, &args
, &res
, cr
, &doqueue
, RFSCALL_SOFT
, ep
);
1707 if (ep
->error
|| res
.status
!= NFS4_OK
)
1710 /* get the object filehandle */
1711 resop
= &res
.array
[res
.array_len
- 2];
1712 if (resop
->resop
!= OP_GETFH
) {
1713 nfs4_queue_event(RE_FAIL_REMAP_OP
, mi
, NULL
,
1714 0, NULL
, NULL
, 0, NULL
, 0, TAG_NONE
, TAG_NONE
, 0, 0);
1715 ep
->stat
= NFS4ERR_SERVERFAULT
;
1718 tmpfhp
= &resop
->nfs_resop4_u
.opgetfh
.object
;
1719 if (tmpfhp
->nfs_fh4_len
> NFS4_FHSIZE
) {
1720 nfs4_queue_event(RE_FAIL_REMAP_LEN
, mi
, NULL
,
1721 tmpfhp
->nfs_fh4_len
, NULL
, NULL
, 0, NULL
, 0, TAG_NONE
,
1723 ep
->stat
= NFS4ERR_SERVERFAULT
;
1726 fhp
->nfs_fh4_val
= kmem_alloc(tmpfhp
->nfs_fh4_len
, KM_SLEEP
);
1727 nfs_fh4_copy(tmpfhp
, fhp
);
1729 /* get the object attributes */
1730 resop
= &res
.array
[res
.array_len
- 1];
1731 if (garp
&& resop
->resop
== OP_GETATTR
)
1732 *garp
= resop
->nfs_resop4_u
.opgetattr
.ga_res
;
1734 /* See if there are enough fields in the response for parent info */
1735 if ((int)res
.array_len
- 5 <= 0)
1738 /* get the parent filehandle */
1739 resop
= &res
.array
[res
.array_len
- 5];
1740 if (resop
->resop
!= OP_GETFH
) {
1741 nfs4_queue_event(RE_FAIL_REMAP_OP
, mi
, NULL
,
1742 0, NULL
, NULL
, 0, NULL
, 0, TAG_NONE
, TAG_NONE
, 0, 0);
1743 ep
->stat
= NFS4ERR_SERVERFAULT
;
1746 tmpfhp
= &resop
->nfs_resop4_u
.opgetfh
.object
;
1747 if (tmpfhp
->nfs_fh4_len
> NFS4_FHSIZE
) {
1748 nfs4_queue_event(RE_FAIL_REMAP_LEN
, mi
, NULL
,
1749 tmpfhp
->nfs_fh4_len
, NULL
, NULL
, 0, NULL
, 0, TAG_NONE
,
1751 ep
->stat
= NFS4ERR_SERVERFAULT
;
1754 pfhp
->nfs_fh4_val
= kmem_alloc(tmpfhp
->nfs_fh4_len
, KM_SLEEP
);
1755 nfs_fh4_copy(tmpfhp
, pfhp
);
1757 /* get the parent attributes */
1758 resop
= &res
.array
[res
.array_len
- 4];
1759 if (pgarp
&& resop
->resop
== OP_GETATTR
)
1760 *pgarp
= resop
->nfs_resop4_u
.opgetattr
.ga_res
;
1764 * It is too hard to remember where all the OP_LOOKUPs are
1766 nfs4args_lookup_free(argop
, num_argops
);
1767 kmem_free(argop
, lookuparg
.arglen
* sizeof (nfs_argop4
));
1770 xdr_free(xdr_COMPOUND4res_clnt
, (caddr_t
)&res
);
1771 kmem_free(path
, strlen(path
)+1);
1775 * NFS client failover / volatile filehandle support
1777 * Recover the filehandle for the given rnode.
1779 * Errors are returned via the nfs4_error_t parameter.
/*
 * nfs4_remap_file: re-derive the filehandle of vp on the current server
 * by looking up the path stored in its rnode. The result is reported
 * through *ep.
 *
 * Visible flow:
 *  - a VROOT vnode is handed to nfs4_remap_root();
 *  - otherwise the mount root is obtained with VFS_ROOT() and an
 *    over-the-wire credential is chosen with nfs4_get_otw_cred_by_osp();
 *  - filetype becomes RML_NAMED_ATTR for R4ISXATTR rnodes (the parent is
 *    fetched with vtodv()) and RML_ATTRDIR for V_XATTRDIR vnodes;
 *  - remap_lookup() is called on rp->r_svnode.sv_name; EACCES or
 *    NFS4ERR_ACCESS with last_time == FALSE retries from get_remap_cred;
 *  - NFS4ERR_FHEXPIRED / NFS4ERR_BADHANDLE free any filehandles obtained
 *    so far, remap the root and fetch rootvp again; badfhcount guards
 *    against looping;
 *  - one path calls nfs4_set_delay_wait() and nfs4_wait_for_delay(); its
 *    case label is missing from this extract;
 *  - with NFS4_REMAP_CKATTRS set, a change of vnode type or (for non
 *    directories) size queues RE_FILE_DIFF and calls nfs4_fail_recov();
 *  - on success r_server and r_srv_fsid are updated under r_statelock,
 *    the attribute cache is refreshed, and the rnode filehandle (and the
 *    parent filehandle, when one was returned) is replaced through
 *    sfh4_update();
 *  - newfh / newpfh buffers are released with kmem_free() and the open
 *    stream with open_stream_rele().
 *
 * NOTE(review): most branch, label and cleanup lines are missing from
 * this extract; the ordering above follows the visible line numbers only.
 */
1783 nfs4_remap_file(mntinfo4_t
*mi
, vnode_t
*vp
, int flags
, nfs4_error_t
*ep
)
1786 rnode4_t
*rp
= VTOR4(vp
);
1787 vnode_t
*rootvp
= NULL
;
1788 vnode_t
*dvp
= NULL
;
1789 cred_t
*cr
, *cred_otw
;
1790 nfs4_ga_res_t gar
, pgar
;
1791 nfs_fh4 newfh
= {0, NULL
}, newpfh
= {0, NULL
};
1792 int filetype
= RML_ORDINARY
;
1793 nfs4_recov_state_t recov
= {NULL
, 0, 0};
1795 nfs4_open_stream_t
*osp
= NULL
;
1796 bool_t first_time
= TRUE
; /* first time getting OTW cred */
1797 bool_t last_time
= FALSE
; /* last time getting OTW cred */
1799 NFS4_DEBUG(nfs4_client_failover_debug
, (CE_NOTE
,
1800 "nfs4_remap_file: remapping %s", rnode4info(rp
)));
1801 ASSERT(nfs4_consistent_type(vp
));
1803 if (vp
->v_flag
& VROOT
) {
1804 nfs4_remap_root(mi
, ep
, flags
);
1809 * Given the root fh, use the path stored in
1810 * the rnode to find the fh for the new server.
1812 ep
->error
= VFS_ROOT(mi
->mi_vfsp
, &rootvp
);
1816 cr
= curthread
->t_cred
;
1820 * Releases the osp, if it is provided.
1821 * Puts a hold on the cred_otw and the new osp (if found).
1823 cred_otw
= nfs4_get_otw_cred_by_osp(rp
, cr
, &osp
,
1824 &first_time
, &last_time
);
1825 ASSERT(cred_otw
!= NULL
);
1827 if (rp
->r_flags
& R4ISXATTR
) {
1828 filetype
= RML_NAMED_ATTR
;
1829 (void) vtodv(vp
, &dvp
, cred_otw
, FALSE
);
1832 if (vp
->v_flag
& V_XATTRDIR
) {
1833 filetype
= RML_ATTRDIR
;
1836 if (filetype
== RML_ORDINARY
&& rootvp
->v_type
== VREG
) {
1837 /* file mount, doesn't need a remap */
1842 remap_lookup(rp
->r_svnode
.sv_name
, rootvp
, filetype
, cred_otw
,
1843 &newfh
, &gar
, &newpfh
, &pgar
, ep
);
1845 NFS4_DEBUG(nfs4_client_failover_debug
, (CE_NOTE
,
1846 "nfs4_remap_file: remap_lookup returned %d/%d",
1847 ep
->error
, ep
->stat
));
1849 if (last_time
== FALSE
&& ep
->error
== EACCES
) {
1853 goto get_remap_cred
;
1861 if (recov
.rs_flags
& NFS4_RS_DELAY_MSG
) {
1862 mutex_enter(&rp
->r_statelock
);
1863 rp
->r_delay_interval
= 0;
1864 mutex_exit(&rp
->r_statelock
);
1865 uprintf("NFS File Available..\n");
1868 case NFS4ERR_FHEXPIRED
:
1869 case NFS4ERR_BADHANDLE
:
1872 * If we ran into filehandle problems, we should try to
1873 * remap the root vnode first and hope life gets better.
1874 * But we need to avoid loops.
1876 if (badfhcount
++ > 0)
1878 if (newfh
.nfs_fh4_len
!= 0) {
1879 kmem_free(newfh
.nfs_fh4_val
, newfh
.nfs_fh4_len
);
1880 newfh
.nfs_fh4_len
= 0;
1882 if (newpfh
.nfs_fh4_len
!= 0) {
1883 kmem_free(newpfh
.nfs_fh4_val
, newpfh
.nfs_fh4_len
);
1884 newpfh
.nfs_fh4_len
= 0;
1886 /* relative path - remap rootvp then retry */
1889 nfs4_remap_root(mi
, ep
, flags
);
1890 if (ep
->error
!= 0 || ep
->stat
!= NFS4_OK
)
1892 ep
->error
= VFS_ROOT(mi
->mi_vfsp
, &rootvp
);
1898 nfs4_set_delay_wait(vp
);
1899 ep
->error
= nfs4_wait_for_delay(vp
, &recov
);
1903 case NFS4ERR_ACCESS
:
1904 /* get new cred, try again */
1905 if (last_time
== TRUE
)
1910 goto get_remap_cred
;
1916 * Check on the new and old rnodes before updating;
1917 * if the vnode type or size changes, issue a warning
1918 * and mark the file dead.
1920 mutex_enter(&rp
->r_statelock
);
1921 if (flags
& NFS4_REMAP_CKATTRS
) {
1922 if (vp
->v_type
!= gar
.n4g_va
.va_type
||
1923 (vp
->v_type
!= VDIR
&&
1924 rp
->r_size
!= gar
.n4g_va
.va_size
)) {
1925 NFS4_DEBUG(nfs4_client_failover_debug
, (CE_NOTE
,
1926 "nfs4_remap_file: size %d vs. %d, type %d vs. %d",
1927 (int)rp
->r_size
, (int)gar
.n4g_va
.va_size
,
1928 vp
->v_type
, gar
.n4g_va
.va_type
));
1929 mutex_exit(&rp
->r_statelock
);
1930 nfs4_queue_event(RE_FILE_DIFF
, mi
,
1931 rp
->r_server
->sv_hostname
, 0, vp
, NULL
, 0, NULL
, 0,
1932 TAG_NONE
, TAG_NONE
, 0, 0);
1933 nfs4_fail_recov(vp
, NULL
, 0, NFS4_OK
);
1937 ASSERT(gar
.n4g_va
.va_type
!= VNON
);
1938 rp
->r_server
= mi
->mi_curr_serv
;
1941 * Turn this object into a "stub" object if we
1942 * crossed an underlying server fs boundary.
1944 * This stub will be for a mirror-mount.
1945 * A referral would look like a boundary crossing
1946 * as well, but would not be the same type of object,
1947 * so we would expect to mark the object dead.
1949 * See comment in r4_do_attrcache() for more details.
1952 if (gar
.n4g_fsid_valid
) {
1953 (void) nfs_rw_enter_sig(&rp
->r_server
->sv_lock
, RW_READER
, 0);
1954 rp
->r_srv_fsid
= gar
.n4g_fsid
;
1955 if (!FATTR4_FSID_EQ(&gar
.n4g_fsid
, &rp
->r_server
->sv_fsid
))
1957 nfs_rw_exit(&rp
->r_server
->sv_lock
);
1960 NFS4_DEBUG(nfs4_client_failover_debug
, (CE_NOTE
,
1961 "remap_file: fsid attr not provided by server. rp=%p",
1966 r4_stub_mirrormount(rp
);
1969 mutex_exit(&rp
->r_statelock
);
1970 nfs4_attrcache_noinval(vp
, &gar
, gethrtime()); /* force update */
1971 sfh4_update(rp
->r_fh
, &newfh
);
1972 ASSERT(nfs4_consistent_type(vp
));
1975 * If we got parent info, use it to update the parent
1977 if (newpfh
.nfs_fh4_len
!= 0) {
1978 if (rp
->r_svnode
.sv_dfh
!= NULL
)
1979 sfh4_update(rp
->r_svnode
.sv_dfh
, &newpfh
);
1981 /* force update of attrs */
1982 nfs4_attrcache_noinval(dvp
, &pgar
, gethrtime());
1986 if (newfh
.nfs_fh4_len
!= 0)
1987 kmem_free(newfh
.nfs_fh4_val
, newfh
.nfs_fh4_len
);
1988 if (newpfh
.nfs_fh4_len
!= 0)
1989 kmem_free(newpfh
.nfs_fh4_val
, newpfh
.nfs_fh4_len
);
1990 if (cred_otw
!= NULL
)
1997 open_stream_rele(osp
, rp
);
2001 * Client-side failover support: remap the filehandle for vp if it appears
2002 * necessary. Errors are returned via the nfs4_error_t parameter; though,
2003 * if there is a problem, we will just try again later.
/*
 * nfs4_check_remap: gatekeeper in front of nfs4_remap_file().
 *
 * Two conditions are tested before the remap call: whether the mount
 * lacks VFS_RDONLY, and whether the rnode of vp already points at
 * mi->mi_curr_serv. The statements guarded by those tests are missing
 * from this extract (presumably early returns -- confirm). When the
 * remap call is reached, its outcome is reported through *ep.
 */
2007 nfs4_check_remap(mntinfo4_t
*mi
, vnode_t
*vp
, int flags
, nfs4_error_t
*ep
)
2012 if (!(vp
->v_vfsp
->vfs_flag
& VFS_RDONLY
))
2015 if (VTOR4(vp
)->r_server
== mi
->mi_curr_serv
)
2018 nfs4_remap_file(mi
, vp
, flags
, ep
);
2022 * nfs4_make_dotdot() - find or create a parent vnode of a non-root node.
2024 * Our caller has a filehandle for ".." relative to a particular
2025 * directory object. We want to find or create a parent vnode
2026 * with that filehandle and return it. We can of course create
2027 * a vnode from this filehandle, but we need to also make sure
2028 * that if ".." is a regular file (i.e. dvp is a V_XATTRDIR)
2029 * that we have a parent FH for future reopens as well. If
2030 * we have a remap failure, we won't be able to reopen this
2031 * file, but we won't treat that as fatal because a reopen
2032 * is at least unlikely. Someday nfs4_reopen() should look
2033 * for a missing parent FH and try a remap to recover from it.
2035 * need_start_op argument indicates whether this function should
2036 * do a start_op before calling remap_lookup(). This should
2037 * be FALSE, if you are the recovery thread or in an op; otherwise,
/*
 * nfs4_make_dotdot: find or build the vnode for the parent of dvp.
 *
 * fhp           - shared filehandle of ".." as seen from dvp
 * t             - timestamp handed to makenfs4node_by_fh()
 * dvp           - directory whose parent is wanted
 * cr            - credential used for the over-the-wire lookups
 * vpp           - result vnode (the store into *vpp is missing from this
 *                 extract)
 * need_start_op - when set, the lookups are bracketed by
 *                 nfs4_start_fop() / nfs4_end_fop()
 *
 * Visible flow:
 *  - a debug-only check asserts that need_start_op is set exactly when
 *    the thread has no nfs4_tsd_key value and is not the recovery thread;
 *  - rootvp is held via VFS_ROOT() and the rnode is first searched with
 *    r4find_unlocked();
 *  - otherwise np = fn_parent(dvp name) is looked up with remap_lookup(),
 *    which may also return the parent filehandle and attributes;
 *  - if nfs4_needs_recovery() reports true and need_start_op is set,
 *    nfs4_start_recovery() is called and the fop is ended;
 *  - when the first lookup gave no parent information (pva.va_type is not
 *    VDIR), remap_lookup() is called again on pnp = fn_parent(np), with
 *    newpfh / pgar in the object position and NULL for the parent
 *    outputs;
 *  - finally sfh4_get() wraps newpfh and newfh, makenfs4node_by_fh()
 *    builds the vnode, and the raw filehandle buffers are released with
 *    kmem_free().
 *
 * NOTE(review): gar and pgar have va_type preset to VNON before the first
 * lookup so that later va_type tests can tell whether they were filled
 * in. The declarations of va / pva and most branch and label lines are
 * missing from this extract.
 */
2041 nfs4_make_dotdot(nfs4_sharedfh_t
*fhp
, hrtime_t t
, vnode_t
*dvp
,
2042 cred_t
*cr
, vnode_t
**vpp
, int need_start_op
)
2044 mntinfo4_t
*mi
= VTOMI4(dvp
);
2045 nfs4_fname_t
*np
= NULL
, *pnp
= NULL
;
2046 vnode_t
*vp
= NULL
, *rootvp
= NULL
;
2048 nfs_fh4 newfh
= {0, NULL
}, newpfh
= {0, NULL
};
2049 nfs4_ga_res_t gar
, pgar
;
2051 nfs4_error_t e
= { 0, NFS4_OK
, RPC_SUCCESS
};
2052 nfs4_sharedfh_t
*sfh
= NULL
, *psfh
= NULL
;
2053 nfs4_recov_state_t recov_state
;
2057 * ensure need_start_op is correct
2060 int no_need_start_op
= (tsd_get(nfs4_tsd_key
) ||
2061 (curthread
== mi
->mi_recovthread
));
2062 /* C needs a ^^ operator! */
2063 ASSERT(((need_start_op
) && (!no_need_start_op
)) ||
2064 ((! need_start_op
) && (no_need_start_op
)));
2067 ASSERT(VTOMI4(dvp
)->mi_zone
== nfs_zone());
2069 NFS4_DEBUG(nfs4_client_shadow_debug
, (CE_NOTE
,
2070 "nfs4_make_dotdot: called with fhp %p, dvp %s", (void *)fhp
,
2071 rnode4info(VTOR4(dvp
))));
2074 * rootvp might be needed eventually. Holding it now will
2075 * ensure that r4find_unlocked() will find it, if ".." is the root.
2077 e
.error
= VFS_ROOT(mi
->mi_vfsp
, &rootvp
);
2080 rp
= r4find_unlocked(fhp
, mi
->mi_vfsp
);
2088 * Since we don't have the rnode, we have to go over the wire.
2089 * remap_lookup() can get all of the filehandles and attributes
2090 * we need in one operation.
2092 np
= fn_parent(VTOSV(dvp
)->sv_name
);
2093 /* if a parent was not found return an error */
2099 recov_state
.rs_flags
= 0;
2100 recov_state
.rs_num_retry_despite_err
= 0;
2102 if (need_start_op
) {
2103 e
.error
= nfs4_start_fop(mi
, rootvp
, NULL
, OH_LOOKUP
,
2104 &recov_state
, NULL
);
2110 pgar
.n4g_va
.va_type
= VNON
;
2111 gar
.n4g_va
.va_type
= VNON
;
2113 remap_lookup(np
, rootvp
, RML_ORDINARY
, cr
,
2114 &newfh
, &gar
, &newpfh
, &pgar
, &e
);
2115 if (nfs4_needs_recovery(&e
, FALSE
, mi
->mi_vfsp
)) {
2116 if (need_start_op
) {
2119 abort
= nfs4_start_recovery(&e
, mi
,
2120 rootvp
, NULL
, NULL
, NULL
, OP_LOOKUP
, NULL
, NULL
,
2123 nfs4_end_fop(mi
, rootvp
, NULL
, OH_LOOKUP
,
2124 &recov_state
, FALSE
);
2129 nfs4_end_fop(mi
, rootvp
, NULL
, OH_LOOKUP
,
2130 &recov_state
, TRUE
);
2141 if ((e
.error
!= 0) ||
2142 (va
.va_type
!= VDIR
)) {
2144 nfs4_end_fop(mi
, rootvp
, NULL
, OH_LOOKUP
,
2145 &recov_state
, FALSE
);
2151 if (e
.stat
!= NFS4_OK
) {
2153 nfs4_end_fop(mi
, rootvp
, NULL
, OH_LOOKUP
,
2154 &recov_state
, FALSE
);
2160 * It is possible for remap_lookup() to return with no error,
2161 * but without providing the parent filehandle and attrs.
2163 if (pva
.va_type
!= VDIR
) {
2165 * Call remap_lookup() again, this time with the
2166 * newpfh and pgar args in the first position.
2168 pnp
= fn_parent(np
);
2170 remap_lookup(pnp
, rootvp
, RML_ORDINARY
, cr
,
2171 &newpfh
, &pgar
, NULL
, NULL
, &e
);
2173 * This remap_lookup call modifies pgar. The following
2174 * line prevents trouble when checking the va_type of
2175 * pva later in this code.
2179 if (nfs4_needs_recovery(&e
, FALSE
,
2181 if (need_start_op
) {
2184 abort
= nfs4_start_recovery(&e
, mi
,
2185 rootvp
, NULL
, NULL
, NULL
,
2186 OP_LOOKUP
, NULL
, NULL
, NULL
);
2188 nfs4_end_fop(mi
, rootvp
, NULL
,
2189 OH_LOOKUP
, &recov_state
,
2195 nfs4_end_fop(mi
, rootvp
, NULL
,
2196 OH_LOOKUP
, &recov_state
, TRUE
);
2204 if (e
.stat
!= NFS4_OK
) {
2206 nfs4_end_fop(mi
, rootvp
, NULL
,
2207 OH_LOOKUP
, &recov_state
, FALSE
);
2212 if ((pnp
== NULL
) ||
2214 (pva
.va_type
== VNON
)) {
2216 nfs4_end_fop(mi
, rootvp
, NULL
, OH_LOOKUP
,
2217 &recov_state
, FALSE
);
2223 ASSERT(newpfh
.nfs_fh4_len
!= 0);
2225 nfs4_end_fop(mi
, rootvp
, NULL
, OH_LOOKUP
, &recov_state
, FALSE
);
2226 psfh
= sfh4_get(&newpfh
, mi
);
2228 sfh
= sfh4_get(&newfh
, mi
);
2229 vp
= makenfs4node_by_fh(sfh
, psfh
, &np
, &gar
, mi
, cr
, t
);
2236 if (newfh
.nfs_fh4_len
!= 0)
2237 kmem_free(newfh
.nfs_fh4_val
, newfh
.nfs_fh4_len
);
2238 if (newpfh
.nfs_fh4_len
!= 0)
2239 kmem_free(newpfh
.nfs_fh4_val
, newpfh
.nfs_fh4_len
);
/*
 * File-scope size counter, initialised to zero.
 * NOTE(review): nothing in the visible section reads or writes it; going
 * by the name it presumably tracks memory used by rnode path strings --
 * confirm against its users.
 */
2251 size_t r_path_memuse
= 0;
2255 * NFS client failover support
2257 * sv4_free() frees the malloc'd portion of a "servinfo_t".
/*
 * sv4_free: release a chain of servinfo4_t structures.
 *
 * The list is walked through sv_next (saved before the node is freed).
 * For each node the following are released, when present:
 *  - sv_dhsec and sv_secdata via sec_clnt_freeinfo();
 *  - sv_save_secinfo via secinfo_free(), only when it is a different
 *    pointer from sv_secinfo, so a shared pointer is not freed twice;
 *  - sv_secinfo via secinfo_free();
 *  - sv_hostname (sv_hostnamelen bytes);
 *  - sv_knconf and sv_origknconf, each with its knc_protofmly and
 *    knc_proto strings of KNC_STRSIZE bytes;
 *  - sv_addr.buf (sv_addr.maxlen bytes) and sv_path (sv_pathlen bytes);
 *  - the sv_lock rwlock and finally the node itself.
 *
 * NOTE(review): the guard in front of the sv_dhsec release and the loop
 * advance are missing from this extract.
 */
2260 sv4_free(servinfo4_t
*svp
)
2263 struct knetconfig
*knconf
;
2265 while (svp
!= NULL
) {
2266 next
= svp
->sv_next
;
2268 sec_clnt_freeinfo(svp
->sv_dhsec
);
2269 if (svp
->sv_secdata
)
2270 sec_clnt_freeinfo(svp
->sv_secdata
);
2271 if (svp
->sv_save_secinfo
&&
2272 svp
->sv_save_secinfo
!= svp
->sv_secinfo
)
2273 secinfo_free(svp
->sv_save_secinfo
);
2274 if (svp
->sv_secinfo
)
2275 secinfo_free(svp
->sv_secinfo
);
2276 if (svp
->sv_hostname
&& svp
->sv_hostnamelen
> 0)
2277 kmem_free(svp
->sv_hostname
, svp
->sv_hostnamelen
);
2278 knconf
= svp
->sv_knconf
;
2279 if (knconf
!= NULL
) {
2280 if (knconf
->knc_protofmly
!= NULL
)
2281 kmem_free(knconf
->knc_protofmly
, KNC_STRSIZE
);
2282 if (knconf
->knc_proto
!= NULL
)
2283 kmem_free(knconf
->knc_proto
, KNC_STRSIZE
);
2284 kmem_free(knconf
, sizeof (*knconf
));
2286 knconf
= svp
->sv_origknconf
;
2287 if (knconf
!= NULL
) {
2288 if (knconf
->knc_protofmly
!= NULL
)
2289 kmem_free(knconf
->knc_protofmly
, KNC_STRSIZE
);
2290 if (knconf
->knc_proto
!= NULL
)
2291 kmem_free(knconf
->knc_proto
, KNC_STRSIZE
);
2292 kmem_free(knconf
, sizeof (*knconf
));
2294 if (svp
->sv_addr
.buf
!= NULL
&& svp
->sv_addr
.maxlen
!= 0)
2295 kmem_free(svp
->sv_addr
.buf
, svp
->sv_addr
.maxlen
);
2296 if (svp
->sv_path
!= NULL
) {
2297 kmem_free(svp
->sv_path
, svp
->sv_pathlen
);
2299 nfs_rw_destroy(&svp
->sv_lock
);
2300 kmem_free(svp
, sizeof (*svp
));
/*
 * nfs4_printfhandle: print a filehandle as a line of hexadecimal words.
 *
 * A scratch buffer is sized for the "(file handle:" prefix, one " %x"
 * field per int-sized word of a maximum-length handle, and 3 trailing
 * bytes. It is allocated with KM_NOSLEEP, so the allocation can fail;
 * the check for that is missing from this extract. The words of
 * fhp->fh_buf up to fh_len bytes are formatted with sprintf(), ")\n" is
 * appended, the text is emitted with zcmn_err(getzoneid(), CE_CONT, ...)
 * and the buffer is freed.
 *
 * NOTE(review): the loop reads fh_buf in int-sized steps, so a fh_len
 * that is not a multiple of sizeof (int) would read past the last whole
 * word -- confirm callers only pass word-aligned lengths.
 */
2306 nfs4_printfhandle(nfs4_fhandle_t
*fhp
)
2314 * 13 == "(file handle:"
2315 * maximum of NFS_FHANDLE / sizeof (*ip) elements in fh_buf times
2317 * 8 == maximum strlen of "%x"
2320 bufsize
= 13 + ((NFS_FHANDLE_LEN
/ sizeof (*ip
)) * (1 + 8)) + 3;
2321 buf
= kmem_alloc(bufsize
, KM_NOSLEEP
);
2326 (void) strcpy(cp
, "(file handle:");
2329 for (ip
= (int *)fhp
->fh_buf
;
2330 ip
< (int *)&fhp
->fh_buf
[fhp
->fh_len
];
2332 (void) sprintf(cp
, " %x", *ip
);
2336 (void) strcpy(cp
, ")\n");
2338 zcmn_err(getzoneid(), CE_CONT
, "%s", buf
);
2340 kmem_free(buf
, bufsize
);
2344 * The NFSv4 readdir cache subsystem.
2346 * We provide a set of interfaces to allow the rest of the system to utilize
2347 * a caching mechanism while encapsulating the details of the actual
2348 * implementation. This should allow for better maintainability and
2349 * extensibility by consolidating the implementation details in one location.
2353 * Comparator used by AVL routines.
/*
 * rddir4_cache_compar: ordering function handed to avl_create() for the
 * per-directory readdir cache.
 *
 * x, y - pointers to rddir4_cache_impl nodes
 *
 * Entries are compared by nfs4_cookie first; entries with equal cookies
 * are compared by buflen.
 * NOTE(review): the return statements are missing from this extract; an
 * AVL comparator is expected to return -1, 0 or 1.
 */
2356 rddir4_cache_compar(const void *x
, const void *y
)
2358 rddir4_cache_impl
*ai
= (rddir4_cache_impl
*)x
;
2359 rddir4_cache_impl
*bi
= (rddir4_cache_impl
*)y
;
2360 rddir4_cache
*a
= &ai
->rc
;
2361 rddir4_cache
*b
= &bi
->rc
;
2363 if (a
->nfs4_cookie
== b
->nfs4_cookie
) {
2364 if (a
->buflen
== b
->buflen
)
2366 if (a
->buflen
< b
->buflen
)
2371 if (a
->nfs4_cookie
< b
->nfs4_cookie
)
2378 * Allocate an opaque handle for the readdir cache.
/*
 * rddir4_cache_create: attach an empty readdir cache to rp.
 *
 * The rnode must not already have one (asserted: rp->r_dir == NULL).
 * An avl_tree_t is allocated with KM_SLEEP, stored in rp->r_dir and
 * initialised with rddir4_cache_compar as its comparator; the tree links
 * live in the "tree" member of rddir4_cache_impl.
 */
2381 rddir4_cache_create(rnode4_t
*rp
)
2383 ASSERT(rp
->r_dir
== NULL
);
2385 rp
->r_dir
= kmem_alloc(sizeof (avl_tree_t
), KM_SLEEP
);
2387 avl_create(rp
->r_dir
, rddir4_cache_compar
, sizeof (rddir4_cache_impl
),
2388 offsetof(rddir4_cache_impl
, tree
));
2392 * Purge the cache of all cached readdir responses.
/*
 * rddir4_cache_purge: drop every entry from the readdir cache of rp.
 *
 * The caller must hold rp->r_statelock (asserted). A NULL rp->r_dir is
 * tested first. Each entry is unlinked from the AVL tree, has
 * RDDIRCACHED cleared and is released with rddir4_cache_rele(). The
 * successor is fetched before the removal. The tree is asserted empty
 * afterwards.
 *
 * NOTE(review): the consequent of the r_dir == NULL test and the loop
 * advance are missing from this extract.
 */
2395 rddir4_cache_purge(rnode4_t
*rp
)
2397 rddir4_cache_impl
*rdip
;
2398 rddir4_cache_impl
*nrdip
;
2400 ASSERT(MUTEX_HELD(&rp
->r_statelock
));
2402 if (rp
->r_dir
== NULL
)
2405 rdip
= avl_first(rp
->r_dir
);
2407 while (rdip
!= NULL
) {
2408 nrdip
= AVL_NEXT(rp
->r_dir
, rdip
);
2409 avl_remove(rp
->r_dir
, rdip
);
2410 rdip
->rc
.flags
&= ~RDDIRCACHED
;
2411 rddir4_cache_rele(rp
, &rdip
->rc
);
2414 ASSERT(avl_numnodes(rp
->r_dir
) == 0);
2418 * Destroy the readdir cache.
/*
 * rddir4_cache_destroy: tear down the readdir cache of rp.
 *
 * The caller must hold rp->r_statelock (asserted). A NULL rp->r_dir is
 * tested first. The cache is purged, the AVL tree is destroyed and the
 * avl_tree_t allocation is freed.
 *
 * NOTE(review): the consequent of the r_dir == NULL test and whether
 * rp->r_dir is reset to NULL afterwards are not visible in this extract.
 */
2421 rddir4_cache_destroy(rnode4_t
*rp
)
2423 ASSERT(MUTEX_HELD(&rp
->r_statelock
));
2424 if (rp
->r_dir
== NULL
)
2427 rddir4_cache_purge(rp
);
2428 avl_destroy(rp
->r_dir
);
2429 kmem_free(rp
->r_dir
, sizeof (avl_tree_t
));
2434 * Locate a readdir response from the readdir cache.
2438 * NULL - If there is an unrecoverable situation like the operation may have
2441 * rddir4_cache * - A pointer to a rddir4_cache is returned to the caller.
2442 * The flags are set appropriately, such that the caller knows
2443 * what state the entry is in.
/*
 * rddir4_cache_lookup: find, or create and insert, the readdir cache
 * entry keyed by (cookie, count) for directory rp.
 *
 * Locking: the caller holds rp->r_rwlock as reader and rp->r_statelock
 * (both asserted). Both locks are dropped and re-acquired inside this
 * function, so state read before the call may be stale afterwards.
 *
 * Visible flow:
 *  - with caching disabled (rp->r_dir == NULL) a fresh entry is allocated
 *    with KM_SLEEP while r_statelock is dropped, and is not inserted;
 *  - otherwise the tree is searched with avl_find();
 *  - on a miss an entry is allocated with KM_NOSLEEP and inserted; if
 *    that allocation fails, r_statelock is dropped for a KM_SLEEP
 *    allocation and the search is repeated, because another thread may
 *    have inserted the entry in the meantime;
 *  - an entry left over after losing that race is released with
 *    rddir4_cache_free();
 *  - on a hit a hold is taken and, while the entry has RDDIR set, the
 *    thread marks RDDIRWAIT and sleeps in cv_wait_sig() with r_rwlock
 *    dropped; an interrupted wait releases the hold;
 *  - an entry that lost RDDIRCACHED while the thread slept is released.
 *
 * NOTE(review): labels, gotos, return statements and the declarations of
 * srdc and where are missing from this extract.
 */
2446 rddir4_cache_lookup(rnode4_t
*rp
, offset_t cookie
, int count
)
2448 rddir4_cache_impl
*rdip
= NULL
;
2449 rddir4_cache_impl srdip
;
2451 rddir4_cache
*rdc
= NULL
;
2452 rddir4_cache
*nrdc
= NULL
;
2456 ASSERT(nfs_rw_lock_held(&rp
->r_rwlock
, RW_READER
));
2457 ASSERT(MUTEX_HELD(&rp
->r_statelock
));
2459 * Check to see if the readdir cache has been disabled. If so, then
2460 * simply allocate an rddir4_cache entry and return it, since caching
2461 * operations do not apply.
2463 if (rp
->r_dir
== NULL
) {
2466 * Drop the lock because we are doing a sleeping
2469 mutex_exit(&rp
->r_statelock
);
2470 rdc
= rddir4_cache_alloc(KM_SLEEP
);
2471 rdc
->nfs4_cookie
= cookie
;
2472 rdc
->buflen
= count
;
2473 mutex_enter(&rp
->r_statelock
);
2480 srdc
->nfs4_cookie
= cookie
;
2481 srdc
->buflen
= count
;
2483 rdip
= avl_find(rp
->r_dir
, &srdip
, &where
);
2486 * If we didn't find an entry then create one and insert it
2491 * Check for the case where we have made a second pass through
2492 * the cache due to a lockless allocation. If we find that no
2493 * thread has already inserted this entry, do the insert now
2497 avl_insert(rp
->r_dir
, nrdc
->data
, where
);
2498 nrdc
->flags
|= RDDIRCACHED
;
2499 rddir4_cache_hold(nrdc
);
2504 nfs4_readdir_cache_misses
++;
2507 * First, try to allocate an entry without sleeping. If that
2508 * fails then drop the lock and do a sleeping allocation.
2510 nrdc
= rddir4_cache_alloc(KM_NOSLEEP
);
2512 nrdc
->nfs4_cookie
= cookie
;
2513 nrdc
->buflen
= count
;
2514 avl_insert(rp
->r_dir
, nrdc
->data
, where
);
2515 nrdc
->flags
|= RDDIRCACHED
;
2516 rddir4_cache_hold(nrdc
);
2521 * Drop the lock and do a sleeping allocation. We incur
2522 * additional overhead by having to search the cache again,
2523 * but this case should be rare.
2525 mutex_exit(&rp
->r_statelock
);
2526 nrdc
= rddir4_cache_alloc(KM_SLEEP
);
2527 nrdc
->nfs4_cookie
= cookie
;
2528 nrdc
->buflen
= count
;
2529 mutex_enter(&rp
->r_statelock
);
2531 * We need to take another pass through the cache
2532 * since we dropped our lock to perform the alloc.
2533 * Another thread may have come by and inserted the
2534 * entry we are interested in.
2540 * Check to see if we need to free our entry. This can happen if
2541 * another thread came along beat us to the insert. We can
2542 * safely call rddir4_cache_free directly because no other thread
2543 * would have a reference to this entry.
2546 rddir4_cache_free((rddir4_cache_impl
*)nrdc
->data
);
2549 nfs4_readdir_cache_hits
++;
2552 * Found something. Make sure it's ready to return.
2555 rddir4_cache_hold(rdc
);
2557 * If the cache entry is in the process of being filled in, wait
2558 * until this completes. The RDDIRWAIT bit is set to indicate that
2559 * someone is waiting and when the thread currently filling the entry
2560 * is done, it should do a cv_broadcast to wakeup all of the threads
2561 * waiting for it to finish. If the thread wakes up to find that
2562 * someone new is now trying to complete the the entry, go back
2565 while (rdc
->flags
& RDDIR
) {
2567 * The entry is not complete.
2569 nfs_rw_exit(&rp
->r_rwlock
);
2570 rdc
->flags
|= RDDIRWAIT
;
2572 nfs4_readdir_cache_waits
++;
2574 while (rdc
->flags
& RDDIRWAIT
) {
2575 if (!cv_wait_sig(&rdc
->cv
, &rp
->r_statelock
)) {
2577 * We got interrupted, probably the user
2578 * typed ^C or an alarm fired. We free the
2579 * new entry if we allocated one.
2581 rddir4_cache_rele(rp
, rdc
);
2582 mutex_exit(&rp
->r_statelock
);
2583 (void) nfs_rw_enter_sig(&rp
->r_rwlock
,
2585 mutex_enter(&rp
->r_statelock
);
2589 mutex_exit(&rp
->r_statelock
);
2590 (void) nfs_rw_enter_sig(&rp
->r_rwlock
,
2592 mutex_enter(&rp
->r_statelock
);
2596 * The entry we were waiting on may have been purged from
2597 * the cache and should no longer be used, release it and
2600 if (!(rdc
->flags
& RDDIRCACHED
)) {
2601 rddir4_cache_rele(rp
, rdc
);
2606 * The entry is completed. Return it.
2612 * Allocate a cache element and return it. Can return NULL if memory is
/*
 * rddir4_cache_alloc: allocate and initialise one readdir cache element.
 *
 * flags - kmem allocation flag; callers in this file pass KM_SLEEP or
 *         KM_NOSLEEP, and with KM_NOSLEEP the allocation can fail
 *
 * The visible initialisation links rc->data back to the enclosing
 * rddir4_cache_impl, zeroes both cookies, sets flags to RDDIRREQ (a
 * readdir is still required), initialises the condition variable and the
 * per-element mutex, and increments the clstat4_debug "dirent" counter.
 *
 * NOTE(review): the NULL check after kmem_alloc(), the assignment of rc,
 * the remaining field initialisers and the return statement are missing
 * from this extract.
 */
2615 static rddir4_cache
*
2616 rddir4_cache_alloc(int flags
)
2618 rddir4_cache_impl
*rdip
= NULL
;
2619 rddir4_cache
*rc
= NULL
;
2621 rdip
= kmem_alloc(sizeof (rddir4_cache_impl
), flags
);
2625 rc
->data
= (void *)rdip
;
2626 rc
->nfs4_cookie
= 0;
2627 rc
->nfs4_ncookie
= 0;
2634 * A readdir is required so set the flag.
2636 rc
->flags
= RDDIRREQ
;
2637 cv_init(&rc
->cv
, NULL
, CV_DEFAULT
, NULL
);
2639 mutex_init(&rdip
->lock
, NULL
, MUTEX_DEFAULT
, NULL
);
2642 atomic_inc_64(&clstat4_debug
.dirent
.value
.ui64
);
2649 * Increment the reference count to this cache element.
/*
 * rddir4_cache_hold: take a reference on cache element rc.
 *
 * The enclosing rddir4_cache_impl is recovered from rc->data and its
 * per-element lock is held around the update.
 * NOTE(review): the statement between mutex_enter() and mutex_exit() is
 * missing from this extract; rddir4_cache_rele() decrements rdip->count,
 * so this is presumably the matching increment -- confirm.
 */
2652 rddir4_cache_hold(rddir4_cache
*rc
)
2654 rddir4_cache_impl
*rdip
= (rddir4_cache_impl
*)rc
->data
;
2656 mutex_enter(&rdip
->lock
);
2658 mutex_exit(&rdip
->lock
);
2662 * Release a reference to this cache element. If the count is zero then
/*
 * rddir4_cache_rele: drop one reference on cache element rdc.
 *
 * The caller must hold rp->r_statelock (asserted). If RDDIRWAIT is set,
 * the flag is cleared and every thread sleeping on rdc->cv is woken.
 * The reference count is then decremented under the per-element lock.
 * When it reaches zero, the lock is dropped first and the element is
 * freed with rddir4_cache_free(), which destroys that lock.
 */
2666 rddir4_cache_rele(rnode4_t
*rp
, rddir4_cache
*rdc
)
2668 rddir4_cache_impl
*rdip
= (rddir4_cache_impl
*)rdc
->data
;
2670 ASSERT(MUTEX_HELD(&rp
->r_statelock
));
2673 * Check to see if we have any waiters. If so, we can wake them
2674 * so that they can proceed.
2676 if (rdc
->flags
& RDDIRWAIT
) {
2677 rdc
->flags
&= ~RDDIRWAIT
;
2678 cv_broadcast(&rdc
->cv
);
2681 mutex_enter(&rdip
->lock
);
2682 ASSERT(rdip
->count
> 0);
2683 if (--rdip
->count
== 0) {
2684 mutex_exit(&rdip
->lock
);
2685 rddir4_cache_free(rdip
);
2687 mutex_exit(&rdip
->lock
);
2691 * Free a cache element.
/*
 * rddir4_cache_free: destroy one readdir cache element.
 *
 * The clstat4_debug "dirent" counter is decremented, mirroring the
 * increment in rddir4_cache_alloc(). The entries buffer, when present,
 * is freed using rc->buflen as its size. The condition variable and the
 * per-element mutex are then destroyed and the rddir4_cache_impl itself
 * is freed.
 */
2694 rddir4_cache_free(rddir4_cache_impl
*rdip
)
2696 rddir4_cache
*rc
= &rdip
->rc
;
2699 atomic_dec_64(&clstat4_debug
.dirent
.value
.ui64
);
2701 if (rc
->entries
!= NULL
)
2702 kmem_free(rc
->entries
, rc
->buflen
);
2703 cv_destroy(&rc
->cv
);
2704 mutex_destroy(&rdip
->lock
);
2705 kmem_free(rdip
, sizeof (*rdip
));
2709 * Snapshot callback for nfs:0:nfs4_client as registered with the kstat
/*
 * cl4_snapshot: ks_snapshot callback for the nfs4_client kstat; it is
 * installed by clinit4_zone().
 *
 * ksp - kstat whose ks_private points at the per-zone statistics
 * buf - snapshot buffer: sizeof (clstat4_tmpl) bytes of per-zone
 *       counters followed by sizeof (clstat4_debug) bytes of global
 *       debug counters
 * rw  - KSTAT_WRITE copies buf into the kernel counters; any other value
 *       copies the counters out to buf
 *
 * ks_snaptime is stamped with gethrtime() in both directions. On a write
 * the global debug counters are overwritten only when the calling
 * process is in the global zone. On a read they are copied out to every
 * zone unchanged.
 *
 * NOTE(review): the else keyword, any DEBUG conditionals and the return
 * statement are missing from this extract.
 */
2713 cl4_snapshot(kstat_t
*ksp
, void *buf
, int rw
)
2715 ksp
->ks_snaptime
= gethrtime();
2716 if (rw
== KSTAT_WRITE
) {
2717 bcopy(buf
, ksp
->ks_private
, sizeof (clstat4_tmpl
));
2720 * Currently only the global zone can write to kstats, but we
2721 * add the check just for paranoia.
2723 if (INGLOBALZONE(curproc
))
2724 bcopy((char *)buf
+ sizeof (clstat4_tmpl
),
2725 &clstat4_debug
, sizeof (clstat4_debug
));
2728 bcopy(ksp
->ks_private
, buf
, sizeof (clstat4_tmpl
));
2731 * If we're displaying the "global" debug kstat values, we
2732 * display them as-is to all zones since in fact they apply to
2733 * the system as a whole.
2735 bcopy(&clstat4_debug
, (char *)buf
+ sizeof (clstat4_tmpl
),
2736 sizeof (clstat4_debug
));
/*
 * clinit4_zone: per-zone constructor for NFSv4 client state; it is
 * registered as the create callback in nfs4_subr_init() via
 * zone_key_create().
 *
 * A struct nfs4_clnt is allocated with KM_SLEEP. Its client-handle table
 * lock is initialised, the table starts empty, and the zone id is
 * recorded. The per-zone statistics are seeded from clstat4_tmpl. A
 * virtual, writable named kstat nfs:0:nfs4_client is created for the
 * zone, sized for the per-zone counters plus the clstat4_debug counters.
 * When creation succeeds, ks_private points at the per-zone counters and
 * cl4_snapshot is installed as the snapshot routine. The structure is
 * finally inserted at the head of nfs4_clnt_list under
 * nfs4_clnt_list_lock.
 *
 * NOTE(review): the return statement and any DEBUG conditionals around
 * the clstat4_debug sizing are missing from this extract.
 */
2748 clinit4_zone(zoneid_t zoneid
)
2750 kstat_t
*nfs4_client_kstat
;
2751 struct nfs4_clnt
*nfscl
;
2754 nfscl
= kmem_alloc(sizeof (*nfscl
), KM_SLEEP
);
2755 mutex_init(&nfscl
->nfscl_chtable4_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
2756 nfscl
->nfscl_chtable4
= NULL
;
2757 nfscl
->nfscl_zoneid
= zoneid
;
2759 bcopy(&clstat4_tmpl
, &nfscl
->nfscl_stat
, sizeof (clstat4_tmpl
));
2760 ndata
= sizeof (clstat4_tmpl
) / sizeof (kstat_named_t
);
2762 ndata
+= sizeof (clstat4_debug
) / sizeof (kstat_named_t
);
2764 if ((nfs4_client_kstat
= kstat_create_zone("nfs", 0, "nfs4_client",
2765 "misc", KSTAT_TYPE_NAMED
, ndata
,
2766 KSTAT_FLAG_VIRTUAL
| KSTAT_FLAG_WRITABLE
, zoneid
)) != NULL
) {
2767 nfs4_client_kstat
->ks_private
= &nfscl
->nfscl_stat
;
2768 nfs4_client_kstat
->ks_snapshot
= cl4_snapshot
;
2769 kstat_install(nfs4_client_kstat
);
2771 mutex_enter(&nfs4_clnt_list_lock
);
2772 list_insert_head(&nfs4_clnt_list
, nfscl
);
2773 mutex_exit(&nfs4_clnt_list_lock
);
/*
 * clfini4_zone: per-zone destructor for NFSv4 client state; it is
 * registered as the destroy callback in nfs4_subr_init() via
 * zone_key_create().
 *
 * zoneid - zone being torn down
 * arg    - the struct nfs4_clnt for that zone
 *
 * The structure is unlinked from nfs4_clnt_list under its lock and
 * cached client handles are reclaimed with clreclaim4_zone(nfscl, 0).
 * Every remaining chhead_t is asserted to have an empty handle list and
 * is freed together with its protocol family string; ch_next is read
 * before the node is freed. The zone kstat is deleted, the table lock is
 * destroyed and the structure is freed.
 */
2780 clfini4_zone(zoneid_t zoneid
, void *arg
)
2782 struct nfs4_clnt
*nfscl
= arg
;
2783 chhead_t
*chp
, *next
;
2787 mutex_enter(&nfs4_clnt_list_lock
);
2788 list_remove(&nfs4_clnt_list
, nfscl
);
2789 mutex_exit(&nfs4_clnt_list_lock
);
2790 clreclaim4_zone(nfscl
, 0);
2791 for (chp
= nfscl
->nfscl_chtable4
; chp
!= NULL
; chp
= next
) {
2792 ASSERT(chp
->ch_list
== NULL
);
2793 kmem_free(chp
->ch_protofmly
, strlen(chp
->ch_protofmly
) + 1);
2794 next
= chp
->ch_next
;
2795 kmem_free(chp
, sizeof (*chp
));
2797 kstat_delete_byname_zone("nfs", 0, "nfs4_client", zoneid
);
2798 mutex_destroy(&nfscl
->nfscl_chtable4_lock
);
2799 kmem_free(nfscl
, sizeof (*nfscl
));
2803 * Called by endpnt_destructor to make sure the client handles are
2804 * cleaned up before the RPC endpoints. This becomes a no-op if
2805 * clfini4_zone (above) is called first. This function is needed
2806 * (rather than relying on clfini4_zone to clean up) because the ZSD
2807 * callbacks have no ordering mechanism, so we have no way to ensure
2808 * that clfini4_zone is called before endpnt_destructor.
/*
 * clcleanup4_zone: reclaim the cached client handles of one zone.
 *
 * nfs4_clnt_list is walked under nfs4_clnt_list_lock and
 * clreclaim4_zone(nfscl, 0) is called for the entry whose nfscl_zoneid
 * matches zoneid. If clfini4_zone() has already removed that entry,
 * nothing matches and the call does nothing.
 *
 * NOTE(review): whether the loop stops after the first match is not
 * visible in this extract.
 */
2811 clcleanup4_zone(zoneid_t zoneid
)
2813 struct nfs4_clnt
*nfscl
;
2815 mutex_enter(&nfs4_clnt_list_lock
);
2816 nfscl
= list_head(&nfs4_clnt_list
);
2817 for (; nfscl
!= NULL
; nfscl
= list_next(&nfs4_clnt_list
, nfscl
)) {
2818 if (nfscl
->nfscl_zoneid
== zoneid
) {
2819 clreclaim4_zone(nfscl
, 0);
2823 mutex_exit(&nfs4_clnt_list_lock
);
/*
 * nfs4_subr_init: one-time set-up for this file.
 *
 * Steps, in order:
 *  1. create the kmem cache "client_handle4_cache" for struct chtab,
 *     with clreclaim4 as its reclaim callback;
 *  2. create nfs4_clnt_list, the global list of per-zone nfs4_clnt
 *     structures; this must come before step 3 because clinit4_zone()
 *     inserts into the list;
 *  3. register the zone key with clinit4_zone / clfini4_zone as the
 *     create / destroy callbacks;
 *  4. default nfs4err_delay_time to NFS4ERR_DELAY_TIME when it is 0.
 *
 * NOTE(review): the trailing arguments of kmem_cache_create() and the
 * return statement are missing from this extract.
 */
2827 nfs4_subr_init(void)
2830 * Allocate and initialize the client handle cache
2832 chtab4_cache
= kmem_cache_create("client_handle4_cache",
2833 sizeof (struct chtab
), 0, NULL
, NULL
, clreclaim4
, NULL
,
2837 * Initialize the list of per-zone client handles (and associated data).
2838 * This needs to be done before we call zone_key_create().
2840 list_create(&nfs4_clnt_list
, sizeof (struct nfs4_clnt
),
2841 offsetof(struct nfs4_clnt
, nfscl_node
));
2844 * Initialize the zone_key for per-zone client handle lists.
2846 zone_key_create(&nfs4clnt_zone_key
, clinit4_zone
, NULL
, clfini4_zone
);
2848 if (nfs4err_delay_time
== 0)
2849 nfs4err_delay_time
= NFS4ERR_DELAY_TIME
;
/*
 * nfs4_subr_fini: undo nfs4_subr_init(). The client handle kmem cache is
 * destroyed and the per-zone key is deleted; the result of
 * zone_key_delete() is deliberately ignored.
 * NOTE(review): nfs4_clnt_list is not destroyed in the visible lines --
 * confirm whether a list_destroy() call was dropped from this extract.
 */
2855 nfs4_subr_fini(void)
2858 * Deallocate the client handle cache
2860 kmem_cache_destroy(chtab4_cache
);
2863 * Destroy the zone_key
2865 (void) zone_key_delete(nfs4clnt_zone_key
);
2870 * Set or Clear direct I/O flag
2871 * fop_rwlock() is held for write access to prevent a race condition
2872 * which would occur if a process is in the middle of a write when
2873 * directio flag gets set. It is possible that all pages may not get flushed.
2875 * This is a copy of nfs_directio, changes here may need to be made
2876 * there and vice versa.
/*
 * nfs4_directio: switch direct I/O on or off for vp.
 *
 * cmd - DIRECTIO_ON or DIRECTIO_OFF
 *
 * DIRECTIO_ON:
 *  - R4DIRECTIO is tested once without the vnode rwlock and again after
 *    fop_rwlock(vp, V_WRITELOCK_TRUE, ...) has been taken, since another
 *    thread may have set it in between;
 *  - when the file has cached pages and is dirty (R4DIRTY) or has
 *    r_awcount > 0, the pages are pushed with fop_putpage();
 *  - an ENOSPC or EDQUOT result is recorded in rp->r_error under
 *    r_statelock;
 *  - R4DIRECTIO is set under r_statelock and the rwlock is released.
 * DIRECTIO_OFF:
 *  - R4DIRECTIO is cleared under r_statelock.
 *
 * NOTE(review): the return statements, the fop_putpage() flags and the
 * handling of other cmd values are missing from this extract.
 */
2880 nfs4_directio(vnode_t
*vp
, int cmd
, cred_t
*cr
)
2887 if (cmd
== DIRECTIO_ON
) {
2889 if (rp
->r_flags
& R4DIRECTIO
)
2893 * Flush the page cache.
2896 (void) fop_rwlock(vp
, V_WRITELOCK_TRUE
, NULL
);
2898 if (rp
->r_flags
& R4DIRECTIO
) {
2899 fop_rwunlock(vp
, V_WRITELOCK_TRUE
, NULL
);
2903 if (nfs4_has_pages(vp
) &&
2904 ((rp
->r_flags
& R4DIRTY
) || rp
->r_awcount
> 0)) {
2905 error
= fop_putpage(vp
, 0, 0,
2908 if (error
== ENOSPC
|| error
== EDQUOT
) {
2909 mutex_enter(&rp
->r_statelock
);
2911 rp
->r_error
= error
;
2912 mutex_exit(&rp
->r_statelock
);
2914 fop_rwunlock(vp
, V_WRITELOCK_TRUE
, NULL
);
2919 mutex_enter(&rp
->r_statelock
);
2920 rp
->r_flags
|= R4DIRECTIO
;
2921 mutex_exit(&rp
->r_statelock
);
2922 fop_rwunlock(vp
, V_WRITELOCK_TRUE
, NULL
);
2926 if (cmd
== DIRECTIO_OFF
) {
2927 mutex_enter(&rp
->r_statelock
);
2928 rp
->r_flags
&= ~R4DIRECTIO
; /* disable direct mode */
2929 mutex_exit(&rp
->r_statelock
);
2937 * Return TRUE if the file has any pages. Always go back to
2938 * the master vnode to check it since none of the shadows
2943 nfs4_has_pages(vnode_t
*vp
)
2948 if (IS_SHADOW(vp
, rp
))
2949 vp
= RTOV4(rp
); /* RTOV4 always gives the master */
2951 return (vn_has_cached_data(vp
));
/*
 * Failover decision table.
 *
 * The clnt_stat value returned by CLNT_CALL is used directly as an
 * index into this table.  When the error stored for that clnt_stat is
 * non-zero, that is the error to be returned AND the signal that
 * failover should be attempted.
 *
 * Special note: if the RPC_ values ever change, direct indexing is no
 * longer valid.  Each row therefore repeats its own RPC_ value so that
 * the lookup code can detect the change and issue a warning; in that
 * case the code always attempts failover as a defensive measure.
 *
 * NOTE(review): the rows for RPC_SUCCESS, RPC_AUTHERROR, RPC_PROGUNAVAIL,
 * RPC_PROCUNAVAIL, RPC_INTR, RPC_TLIERROR, RPC_UDERROR and RPC_INPROGRESS
 * (and the "error" member declaration) are not visible in the reviewed
 * listing.  They are restored here in enum clnt_stat order with error 0
 * -- confirm against <rpc/clnt_stat.h> and the original table.
 */
static struct try_failover_tab {
	enum clnt_stat	cstat;	/* must equal this row's index */
	int		error;	/* non-zero: return it and try failover */
} try_failover_table[] = {
	{ RPC_SUCCESS,			0 },
	{ RPC_CANTENCODEARGS,		0 },
	{ RPC_CANTDECODERES,		0 },
	{ RPC_CANTSEND,			ECOMM },
	{ RPC_CANTRECV,			ECOMM },
	{ RPC_TIMEDOUT,			ETIMEDOUT },
	{ RPC_VERSMISMATCH,		0 },
	{ RPC_AUTHERROR,		0 },
	{ RPC_PROGUNAVAIL,		0 },
	{ RPC_PROGVERSMISMATCH,		0 },
	{ RPC_PROCUNAVAIL,		0 },
	{ RPC_CANTDECODEARGS,		0 },
	{ RPC_SYSTEMERROR,		ENOSR },
	{ RPC_UNKNOWNHOST,		EHOSTUNREACH },
	{ RPC_RPCBFAILURE,		ENETUNREACH },
	{ RPC_PROGNOTREGISTERED,	ECONNREFUSED },
	{ RPC_FAILED,			ETIMEDOUT },
	{ RPC_UNKNOWNPROTO,		EHOSTUNREACH },
	{ RPC_INTR,			0 },
	{ RPC_UNKNOWNADDR,		EHOSTUNREACH },
	{ RPC_TLIERROR,			0 },
	{ RPC_NOBROADCAST,		EHOSTUNREACH },
	{ RPC_N2AXLATEFAILURE,		ECONNREFUSED },
	{ RPC_UDERROR,			0 },
	{ RPC_INPROGRESS,		0 },
	{ RPC_STALERACHANDLE,		EINVAL },
	{ RPC_CANTCONNECT,		ECONNREFUSED },
	{ RPC_XPRTFAILED,		ECONNABORTED },
	{ RPC_CANTCREATESTREAM,		ECONNREFUSED },
	{ RPC_CANTSTORE,		ENOBUFS }
};
3007 * nfs4_try_failover - determine whether the client should
3008 * attempt failover based on the values stored in the nfs4_error_t.
3011 nfs4_try_failover(nfs4_error_t
*ep
)
3013 if (ep
->error
== ETIMEDOUT
|| ep
->stat
== NFS4ERR_RESOURCE
)
3016 if (ep
->error
&& ep
->rpc_status
!= RPC_SUCCESS
)
3017 return (try_failover(ep
->rpc_status
) != 0 ? TRUE
: FALSE
);
3023 * try_failover - internal version of nfs4_try_failover, called
3024 * only by rfscall and aclcall. Determine if failover is warranted
3025 * based on the clnt_stat and return the error number if it is.
3028 try_failover(enum clnt_stat rpc_status
)
3032 if (rpc_status
== RPC_SUCCESS
)
3036 if (rpc_status
!= 0 && nfs4_try_failover_any
) {
3042 * The rpc status is used as an index into the table.
3043 * If the rpc status is outside of the range of the
3044 * table or if the rpc error numbers have been changed
3045 * since the table was constructed, then print a warning
3046 * (DEBUG only) and try failover anyway. Otherwise, just
3047 * grab the resulting error number out of the table.
3049 if (rpc_status
< RPC_SUCCESS
|| rpc_status
>=
3050 sizeof (try_failover_table
)/sizeof (try_failover_table
[0]) ||
3051 try_failover_table
[rpc_status
].cstat
!= rpc_status
) {
3055 cmn_err(CE_NOTE
, "try_failover: unexpected rpc error %d",
3059 err
= try_failover_table
[rpc_status
].error
;
3063 NFS4_DEBUG(nfs4_client_failover_debug
, (CE_NOTE
,
3064 "nfs4_try_failover: %strying failover on error %d",
3065 err
? "" : "NOT ", rpc_status
));
3071 nfs4_error_zinit(nfs4_error_t
*ep
)
3075 ep
->rpc_status
= RPC_SUCCESS
;
3079 nfs4_error_init(nfs4_error_t
*ep
, int error
)
3083 ep
->rpc_status
= RPC_SUCCESS
;
/*
 * Return a 16-bit hash for filehandle, stateid, clientid, owner.
 *
 * The buffer is consumed as 32-bit words in host byte order; the upper
 * and lower halves of each word are XOR-folded into the key.  Any 1-3
 * bytes left over after the last whole word are XORed into the key as
 * well, and the result is masked to 16 bits.
 *
 *	p   - start of the data to hash (any alignment)
 *	len - number of bytes at p
 *
 * Returns 0 for a NULL buffer or a non-positive length.
 *
 * Two defects of the previous version are fixed here:
 *  - the left-over loop indexed from the start of the buffer, so the
 *    tail bytes never contributed to the hash; it now reads the bytes
 *    that follow the last whole word;
 *  - words were read through a cast pointer, which is a misaligned
 *    access for arbitrary buffers; they are now copied out bytewise.
 *
 * NOTE(review): the return type line is not visible in the reviewed
 * listing; "int" assumed -- confirm against the prototype.
 */
int
hash16(void *p, int len)
{
	const uint8_t *bytes = p;
	uint32_t key = 0;
	int whole, rem, i;

	if (p == NULL || len <= 0)
		return (0);

	/* protect against non word aligned lengths */
	rem = len & 3;
	whole = len - rem;

	for (i = 0; i < whole; i += 4) {
		uint32_t w;

		(void) memcpy(&w, bytes + i, sizeof (w));
		key ^= (w >> 16) ^ w;
	}

	/* hash left-over bytes */
	for (i = 0; i < rem; i++)
		key ^= bytes[whole + i];

	return ((int)(key & 0xffff));
}
3117 * rnode4info - return filehandle and path information for an rnode.
3118 * XXX MT issues: uses a single static buffer, no locking of path.
3121 rnode4info(rnode4_t
*rp
)
3123 static char buf
[80];
3124 nfs4_fhandle_t fhandle
;
3130 if (rp
->r_flags
& R4ISXATTR
)
3132 else if (RTOV4(rp
)->v_flag
& V_XATTRDIR
)
3134 else if (RTOV4(rp
)->v_flag
& VROOT
)
3136 else if (RTOV4(rp
)->v_type
== VDIR
)
3138 else if (RTOV4(rp
)->v_type
== VREG
)
3142 sfh4_copyval(rp
->r_fh
, &fhandle
);
3143 path
= fn_path(rp
->r_svnode
.sv_name
);
3144 (void) snprintf(buf
, 80, "$%p[%s], type=%s, flags=%04X, FH=%04X\n",
3145 (void *)rp
, path
, type
, rp
->r_flags
,
3146 hash16((void *)&fhandle
.fh_buf
, fhandle
.fh_len
));
3147 kmem_free(path
, strlen(path
)+1);