4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
27 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
28 * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
31 #include <sys/param.h>
32 #include <sys/types.h>
33 #include <sys/systm.h>
40 #include <sys/vnode.h>
41 #include <sys/socket.h>
43 #include <sys/tiuser.h>
45 #include <sys/errno.h>
46 #include <sys/debug.h>
48 #include <sys/kstat.h>
49 #include <sys/cmn_err.h>
50 #include <sys/vtrace.h>
51 #include <sys/session.h>
53 #include <sys/bitmap.h>
56 #include <sys/pathname.h>
57 #include <sys/flock.h>
58 #include <sys/dirent.h>
59 #include <sys/flock.h>
60 #include <sys/callb.h>
61 #include <sys/atomic.h>
69 #include <rpc/types.h>
76 #include <nfs/nfs_clnt.h>
77 #include <nfs/rnode.h>
78 #include <nfs/nfs_acl.h>
81 * The hash queues for the access to active and cached rnodes
82 * are organized as doubly linked lists. A reader/writer lock
83 * for each hash bucket is used to control access and to synchronize
84 * lookups, additions, and deletions from the hash queue.
86 * The rnode freelist is organized as a doubly linked list with
87 * a head pointer. Additions and deletions are synchronized via
90 * In order to add an rnode to the free list, it must be hashed into
91 * a hash queue and the exclusive lock to the hash queue be held.
92 * If an rnode is not hashed into a hash queue, then it is destroyed
93 * because it represents no valuable information that can be reused
94 * about the file. The exclusive lock to the hash queue must be
95 * held in order to prevent a lookup in the hash queue from finding
96 * the rnode and using it and assuming that the rnode is not on the
97 * freelist. The lookup in the hash queue will have the hash queue
98 * locked, either exclusive or shared.
100 * The vnode reference count for each rnode is not allowed to drop
101 * below 1. This prevents external entities, such as the VM
102 * subsystem, from acquiring references to vnodes already on the
103 * freelist and then trying to place them back on the freelist
104 * when their reference is released. This means that when an
105 * rnode is looked up in the hash queues, then either the rnode
106 * is removed from the freelist and that reference is transferred to
107 * the new reference or the vnode reference count must be incremented
108 * accordingly. The mutex for the freelist must be held in order to
109 * accurately test to see if the rnode is on the freelist or not.
110 * The hash queue lock might be held shared and it is possible that
111 * two different threads may race to remove the rnode from the
112 * freelist. This race can be resolved by holding the mutex for the
113 * freelist. Please note that the mutex for the freelist does not
114 * need to be held if the rnode is not on the freelist. It can not be
115 * placed on the freelist due to the requirement that the thread
116 * putting the rnode on the freelist must hold the exclusive lock
117 * to the hash queue and the thread doing the lookup in the hash
118 * queue is holding either a shared or exclusive lock to the hash
121 * The lock ordering is:
123 * hash bucket lock -> vnode lock
124 * hash bucket lock -> freelist lock
/*
 * Global state for the rnode cache described in the comment block above:
 * the hash table of active/cached rnodes and the freelist of rnodes that
 * are hashed but currently unreferenced.
 * NOTE(review): extraction left each declaration split across lines; only
 * comments have been added here.
 */
/* hash buckets ("hash queues") for active and cached rnodes */
126 static rhashq_t
*rtable
;
/* presumably protects rpfreelist and rnew -- TODO confirm */
128 static kmutex_t rpfreelist_lock
;
/* head of the doubly linked rnode freelist (see block comment above) */
129 static rnode_t
*rpfreelist
= NULL
;
/* counter of rnodes -- exact semantics not visible here; TODO confirm */
130 static long rnew
= 0;
/* number of buckets in rtable; rtablemask is presumably rtablesize - 1 */
133 static int rtablesize
;
134 static int rtablemask
;
/* target average hash-chain length, presumably used to size rtable */
136 static int hashlen
= 4;
/* kmem cache from which rnode_t structures are allocated */
138 static struct kmem_cache
*rnode_cache
;
141 * Mutex to protect the following variables:
145 kmutex_t nfs_minor_lock
;
149 /* Do we allow preepoch (negative) time values otw? */
150 bool_t nfs_allow_preepoch_time
= FALSE
; /* default: do not allow preepoch */
/*
 * Access-cache global state -- presumably caches NFS ACCESS results
 * (cf. the "access ... size of access cache" kstat below); the sizing
 * variables parallel the rnode table above.
 */
155 static acache_hash_t
*acache
;
156 static long nacache
; /* used strictly to size the number of hash queues */
158 static int acachesize
;
159 static int acachemask
;
/* kmem cache for access-cache entries */
160 static struct kmem_cache
*acache_cache
;
163 * Client side utilities
167 * client side statistics
169 static const struct clstat clstat_tmpl
= {
170 { "calls", KSTAT_DATA_UINT64
},
171 { "badcalls", KSTAT_DATA_UINT64
},
172 { "clgets", KSTAT_DATA_UINT64
},
173 { "cltoomany", KSTAT_DATA_UINT64
},
175 { "clalloc", KSTAT_DATA_UINT64
},
176 { "noresponse", KSTAT_DATA_UINT64
},
177 { "failover", KSTAT_DATA_UINT64
},
178 { "remap", KSTAT_DATA_UINT64
},
183 * The following are statistics that describe behavior of the system as a whole
184 * and don't correspond to any one particular zone.
187 static struct clstat_debug
{
188 kstat_named_t nrnode
; /* number of allocated rnodes */
189 kstat_named_t access
; /* size of access cache */
190 kstat_named_t dirent
; /* size of readdir cache */
191 kstat_named_t dirents
; /* size of readdir buf cache */
192 kstat_named_t reclaim
; /* number of reclaims */
193 kstat_named_t clreclaim
; /* number of cl reclaims */
194 kstat_named_t f_reclaim
; /* number of free reclaims */
195 kstat_named_t a_reclaim
; /* number of active reclaims */
196 kstat_named_t r_reclaim
; /* number of rnode reclaims */
197 kstat_named_t rpath
; /* bytes used to store rpaths */
199 { "nrnode", KSTAT_DATA_UINT64
},
200 { "access", KSTAT_DATA_UINT64
},
201 { "dirent", KSTAT_DATA_UINT64
},
202 { "dirents", KSTAT_DATA_UINT64
},
203 { "reclaim", KSTAT_DATA_UINT64
},
204 { "clreclaim", KSTAT_DATA_UINT64
},
205 { "f_reclaim", KSTAT_DATA_UINT64
},
206 { "a_reclaim", KSTAT_DATA_UINT64
},
207 { "r_reclaim", KSTAT_DATA_UINT64
},
208 { "r_path", KSTAT_DATA_UINT64
},
213 * We keep a global list of per-zone client data, so we can clean up all zones
214 * if we get low on memory.
/* list of per-zone nfs_clnt structures; protected by nfs_clnt_list_lock */
216 static list_t nfs_clnt_list
;
217 static kmutex_t nfs_clnt_list_lock
;
/* zone key used with zone_getspecific() to find the per-zone nfs_clnt */
218 static zone_key_t nfsclnt_zone_key
;
/* kmem cache for client-handle table (chtab) entries */
220 static struct kmem_cache
*chtab_cache
;
223 * Some servers do not properly update the attributes of the
224 * directory when changes are made. To allow interoperability
225 * with these broken servers, the nfs_disable_rddir_cache
226 * parameter must be set in /etc/system
/* 0 (default) = readdir caching enabled; non-zero disables it */
228 int nfs_disable_rddir_cache
= 0;
230 int clget(clinfo_t
*, servinfo_t
*, cred_t
*, CLIENT
**,
232 void clfree(CLIENT
*, struct chtab
*);
233 static int acl_clget(mntinfo_t
*, servinfo_t
*, cred_t
*, CLIENT
**,
234 struct chtab
**, struct nfs_clnt
*);
235 static int nfs_clget(mntinfo_t
*, servinfo_t
*, cred_t
*, CLIENT
**,
236 struct chtab
**, struct nfs_clnt
*);
237 static void clreclaim(void *);
238 static int nfs_feedback(int, int, mntinfo_t
*);
239 static int rfscall(mntinfo_t
*, rpcproc_t
, xdrproc_t
, caddr_t
, xdrproc_t
,
240 caddr_t
, cred_t
*, int *, enum clnt_stat
*, int,
242 static int aclcall(mntinfo_t
*, rpcproc_t
, xdrproc_t
, caddr_t
, xdrproc_t
,
243 caddr_t
, cred_t
*, int *, int, failinfo_t
*);
244 static void rinactive(rnode_t
*, cred_t
*);
245 static int rtablehash(nfs_fhandle
*);
246 static vnode_t
*make_rnode(nfs_fhandle
*, rhashq_t
*, struct vfs
*,
247 const struct vnodeops
*,
248 int (*)(vnode_t
*, page_t
*, uoff_t
*, size_t *, int,
250 int (*)(const void *, const void *), int *, cred_t
*,
252 static void rp_rmfree(rnode_t
*);
253 static void rp_addhash(rnode_t
*);
254 static void rp_rmhash_locked(rnode_t
*);
255 static rnode_t
*rfind(rhashq_t
*, nfs_fhandle
*, struct vfs
*);
256 static void destroy_rnode(rnode_t
*);
257 static void rddir_cache_free(rddir_cache
*);
258 static int nfs_free_data_reclaim(rnode_t
*);
259 static int nfs_active_data_reclaim(rnode_t
*);
260 static int nfs_free_reclaim(void);
261 static int nfs_active_reclaim(void);
262 static int nfs_rnode_reclaim(void);
263 static void nfs_reclaim(void *);
264 static int failover_safe(failinfo_t
*);
265 static void failover_newserver(mntinfo_t
*mi
);
266 static void failover_thread(mntinfo_t
*mi
);
267 static int failover_wait(mntinfo_t
*);
268 static int failover_remap(failinfo_t
*);
269 static int failover_lookup(char *, vnode_t
*,
270 int (*)(vnode_t
*, char *, vnode_t
**,
271 struct pathname
*, int, vnode_t
*, cred_t
*, int),
272 int (*)(vnode_t
*, vnode_t
**, bool_t
, cred_t
*, int),
274 static void nfs_free_r_path(rnode_t
*);
275 static void nfs_set_vroot(vnode_t
*);
276 static char *nfs_getsrvnames(mntinfo_t
*, size_t *);
279 * from rpcsec module (common/rpcsec)
281 extern int sec_clnt_geth(CLIENT
*, struct sec_data
*, cred_t
*, AUTH
**);
282 extern void sec_clnt_freeh(AUTH
*);
283 extern void sec_clnt_freeinfo(struct sec_data
*);
286 * EIO or EINTR are not recoverable errors.
/*
 * NOTE(review): 'error' is not parenthesized in the expansion below;
 * callers are expected to pass a simple lvalue.
 */
288 #define IS_RECOVERABLE_ERROR(error) !((error == EINTR) || (error == EIO))
/*
 * Console messages used when a server stops responding.  Two variants
 * exist; the second pair (without the NFS version number) presumably
 * sits behind a preprocessor conditional that was lost in extraction --
 * the duplicate #defines would not compile as shown.
 */
291 #define SRV_QFULL_MSG "send queue to NFS%d server %s is full; still trying\n"
292 #define SRV_NOTRESP_MSG "NFS%d server %s not responding still trying\n"
294 #define SRV_QFULL_MSG "send queue to NFS server %s is full still trying\n"
295 #define SRV_NOTRESP_MSG "NFS server %s not responding still trying\n"
298 * Common handle get program for NFS, NFS ACL, and NFS AUTH client.
301 clget_impl(clinfo_t
*ci
, servinfo_t
*svp
, cred_t
*cr
, CLIENT
**newcl
,
302 struct chtab
**chp
, struct nfs_clnt
*nfscl
)
304 struct chhead
*ch
, *newch
;
305 struct chhead
**plistp
;
310 if (newcl
== NULL
|| chp
== NULL
|| ci
== NULL
)
317 * Find an unused handle or create one
320 nfscl
->nfscl_stat
.clgets
.value
.ui64
++;
323 * Find the correct entry in the cache to check for free
324 * client handles. The search is based on the RPC program
325 * number, program version number, dev_t for the transport
326 * device, and the protocol family.
328 mutex_enter(&nfscl
->nfscl_chtable_lock
);
329 plistp
= &nfscl
->nfscl_chtable
;
330 for (ch
= nfscl
->nfscl_chtable
; ch
!= NULL
; ch
= ch
->ch_next
) {
331 if (ch
->ch_prog
== ci
->cl_prog
&&
332 ch
->ch_vers
== ci
->cl_vers
&&
333 ch
->ch_dev
== svp
->sv_knconf
->knc_rdev
&&
334 (strcmp(ch
->ch_protofmly
,
335 svp
->sv_knconf
->knc_protofmly
) == 0))
337 plistp
= &ch
->ch_next
;
341 * If we didn't find a cache entry for this quadruple, then
342 * create one. If we don't have one already preallocated,
343 * then drop the cache lock, create one, and then start over.
344 * If we did have a preallocated entry, then just add it to
345 * the front of the list.
349 mutex_exit(&nfscl
->nfscl_chtable_lock
);
350 newch
= kmem_alloc(sizeof (*newch
), KM_SLEEP
);
351 newch
->ch_timesused
= 0;
352 newch
->ch_prog
= ci
->cl_prog
;
353 newch
->ch_vers
= ci
->cl_vers
;
354 newch
->ch_dev
= svp
->sv_knconf
->knc_rdev
;
355 newch
->ch_protofmly
= kmem_alloc(
356 strlen(svp
->sv_knconf
->knc_protofmly
) + 1,
358 (void) strcpy(newch
->ch_protofmly
,
359 svp
->sv_knconf
->knc_protofmly
);
360 newch
->ch_list
= NULL
;
365 ch
->ch_next
= nfscl
->nfscl_chtable
;
366 nfscl
->nfscl_chtable
= ch
;
368 * We found a cache entry, but if it isn't on the front of the
369 * list, then move it to the front of the list to try to take
370 * advantage of locality of operations.
372 } else if (ch
!= nfscl
->nfscl_chtable
) {
373 *plistp
= ch
->ch_next
;
374 ch
->ch_next
= nfscl
->nfscl_chtable
;
375 nfscl
->nfscl_chtable
= ch
;
379 * If there was a free client handle cached, then remove it
380 * from the list, init it, and use it.
382 if (ch
->ch_list
!= NULL
) {
384 ch
->ch_list
= cp
->ch_list
;
385 mutex_exit(&nfscl
->nfscl_chtable_lock
);
387 kmem_free(newch
->ch_protofmly
,
388 strlen(newch
->ch_protofmly
) + 1);
389 kmem_free(newch
, sizeof (*newch
));
391 (void) clnt_tli_kinit(cp
->ch_client
, svp
->sv_knconf
,
392 &svp
->sv_addr
, ci
->cl_readsize
, ci
->cl_retrans
, cr
);
393 error
= sec_clnt_geth(cp
->ch_client
, svp
->sv_secdata
, cr
,
394 &cp
->ch_client
->cl_auth
);
395 if (error
|| cp
->ch_client
->cl_auth
== NULL
) {
396 CLNT_DESTROY(cp
->ch_client
);
397 kmem_cache_free(chtab_cache
, cp
);
398 return ((error
!= 0) ? error
: EINTR
);
401 *newcl
= cp
->ch_client
;
407 * There weren't any free client handles which fit, so allocate
408 * a new one and use that.
411 atomic_inc_64(&nfscl
->nfscl_stat
.clalloc
.value
.ui64
);
413 mutex_exit(&nfscl
->nfscl_chtable_lock
);
415 nfscl
->nfscl_stat
.cltoomany
.value
.ui64
++;
417 kmem_free(newch
->ch_protofmly
, strlen(newch
->ch_protofmly
) + 1);
418 kmem_free(newch
, sizeof (*newch
));
421 cp
= kmem_cache_alloc(chtab_cache
, KM_SLEEP
);
424 sigintr(&smask
, (int)ci
->cl_flags
& MI_INT
);
425 error
= clnt_tli_kcreate(svp
->sv_knconf
, &svp
->sv_addr
, ci
->cl_prog
,
426 ci
->cl_vers
, ci
->cl_readsize
, ci
->cl_retrans
, cr
, &cp
->ch_client
);
430 kmem_cache_free(chtab_cache
, cp
);
432 atomic_dec_64(&nfscl
->nfscl_stat
.clalloc
.value
.ui64
);
435 * Warning is unnecessary if error is EINTR.
437 if (error
!= EINTR
) {
438 nfs_cmn_err(error
, CE_WARN
,
439 "clget: couldn't create handle: %m\n");
443 (void) CLNT_CONTROL(cp
->ch_client
, CLSET_PROGRESS
, NULL
);
444 auth_destroy(cp
->ch_client
->cl_auth
);
445 error
= sec_clnt_geth(cp
->ch_client
, svp
->sv_secdata
, cr
,
446 &cp
->ch_client
->cl_auth
);
447 if (error
|| cp
->ch_client
->cl_auth
== NULL
) {
448 CLNT_DESTROY(cp
->ch_client
);
449 kmem_cache_free(chtab_cache
, cp
);
451 atomic_dec_64(&nfscl
->nfscl_stat
.clalloc
.value
.ui64
);
453 return ((error
!= 0) ? error
: EINTR
);
456 *newcl
= cp
->ch_client
;
457 ASSERT(cp
->ch_client
->cl_nosignal
== FALSE
);
463 clget(clinfo_t
*ci
, servinfo_t
*svp
, cred_t
*cr
, CLIENT
**newcl
,
466 struct nfs_clnt
*nfscl
;
468 nfscl
= zone_getspecific(nfsclnt_zone_key
, nfs_zone());
469 ASSERT(nfscl
!= NULL
);
471 return (clget_impl(ci
, svp
, cr
, newcl
, chp
, nfscl
));
475 acl_clget(mntinfo_t
*mi
, servinfo_t
*svp
, cred_t
*cr
, CLIENT
**newcl
,
476 struct chtab
**chp
, struct nfs_clnt
*nfscl
)
482 * Set read buffer size to rsize
483 * and add room for RPC headers.
485 ci
.cl_readsize
= mi
->mi_tsize
;
486 if (ci
.cl_readsize
!= 0)
487 ci
.cl_readsize
+= (RPC_MAXDATASIZE
- NFS_MAXDATA
);
490 * If soft mount and server is down just try once.
491 * meaning: do not retransmit.
493 if (!(mi
->mi_flags
& MI_HARD
) && (mi
->mi_flags
& MI_DOWN
))
496 ci
.cl_retrans
= mi
->mi_retrans
;
498 ci
.cl_prog
= NFS_ACL_PROGRAM
;
499 ci
.cl_vers
= mi
->mi_vers
;
500 ci
.cl_flags
= mi
->mi_flags
;
503 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
504 * security flavor, the client tries to establish a security context
505 * by contacting the server. If the connection is timed out or reset,
506 * e.g. server reboot, we will try again.
509 error
= clget_impl(&ci
, svp
, cr
, newcl
, chp
, nfscl
);
515 * For forced unmount or zone shutdown, bail out, no retry.
517 if (FS_OR_ZONE_GONE(mi
->mi_vfsp
)) {
522 /* do not retry for softmount */
523 if (!(mi
->mi_flags
& MI_HARD
))
526 /* let the caller deal with the failover case */
527 if (FAILOVER_MOUNT(mi
))
530 } while (error
== ETIMEDOUT
|| error
== ECONNRESET
);
536 nfs_clget(mntinfo_t
*mi
, servinfo_t
*svp
, cred_t
*cr
, CLIENT
**newcl
,
537 struct chtab
**chp
, struct nfs_clnt
*nfscl
)
543 * Set read buffer size to rsize
544 * and add room for RPC headers.
546 ci
.cl_readsize
= mi
->mi_tsize
;
547 if (ci
.cl_readsize
!= 0)
548 ci
.cl_readsize
+= (RPC_MAXDATASIZE
- NFS_MAXDATA
);
551 * If soft mount and server is down just try once.
552 * meaning: do not retransmit.
554 if (!(mi
->mi_flags
& MI_HARD
) && (mi
->mi_flags
& MI_DOWN
))
557 ci
.cl_retrans
= mi
->mi_retrans
;
559 ci
.cl_prog
= mi
->mi_prog
;
560 ci
.cl_vers
= mi
->mi_vers
;
561 ci
.cl_flags
= mi
->mi_flags
;
564 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
565 * security flavor, the client tries to establish a security context
566 * by contacting the server. If the connection is timed out or reset,
567 * e.g. server reboot, we will try again.
570 error
= clget_impl(&ci
, svp
, cr
, newcl
, chp
, nfscl
);
576 * For forced unmount or zone shutdown, bail out, no retry.
578 if (FS_OR_ZONE_GONE(mi
->mi_vfsp
)) {
583 /* do not retry for softmount */
584 if (!(mi
->mi_flags
& MI_HARD
))
587 /* let the caller deal with the failover case */
588 if (FAILOVER_MOUNT(mi
))
591 } while (error
== ETIMEDOUT
|| error
== ECONNRESET
);
597 clfree_impl(CLIENT
*cl
, struct chtab
*cp
, struct nfs_clnt
*nfscl
)
599 if (cl
->cl_auth
!= NULL
) {
600 sec_clnt_freeh(cl
->cl_auth
);
605 * Timestamp this cache entry so that we know when it was last
608 cp
->ch_freed
= gethrestime_sec();
611 * Add the free client handle to the front of the list.
612 * This way, the list will be sorted in youngest to oldest
615 mutex_enter(&nfscl
->nfscl_chtable_lock
);
616 cp
->ch_list
= cp
->ch_head
->ch_list
;
617 cp
->ch_head
->ch_list
= cp
;
618 mutex_exit(&nfscl
->nfscl_chtable_lock
);
622 clfree(CLIENT
*cl
, struct chtab
*cp
)
624 struct nfs_clnt
*nfscl
;
626 nfscl
= zone_getspecific(nfsclnt_zone_key
, nfs_zone());
627 ASSERT(nfscl
!= NULL
);
629 clfree_impl(cl
, cp
, nfscl
);
632 #define CL_HOLDTIME 60 /* time to hold client handles */
635 clreclaim_zone(struct nfs_clnt
*nfscl
, uint_t cl_holdtime
)
638 struct chtab
*cp
; /* list of objects that can be reclaimed */
647 * Need to reclaim some memory, so step through the cache
648 * looking through the lists for entries which can be freed.
652 mutex_enter(&nfscl
->nfscl_chtable_lock
);
655 * Here we step through each non-NULL quadruple and start to
656 * construct the reclaim list pointed to by cp. Note that
657 * cp will contain all eligible chtab entries. When this traversal
658 * completes, chtab entries from the last quadruple will be at the
659 * front of cp and entries from previously inspected quadruples have
660 * been appended to the rear of cp.
662 for (ch
= nfscl
->nfscl_chtable
; ch
!= NULL
; ch
= ch
->ch_next
) {
663 if (ch
->ch_list
== NULL
)
666 * Search each list for entries older then
667 * cl_holdtime seconds. The lists are maintained
668 * in youngest to oldest order so that when the
669 * first entry is found which is old enough, then
670 * all of the rest of the entries on the list will
671 * be old enough as well.
675 while (cpl
!= NULL
&&
676 cpl
->ch_freed
+ cl_holdtime
> gethrestime_sec()) {
684 while (cpe
->ch_list
!= NULL
)
692 mutex_exit(&nfscl
->nfscl_chtable_lock
);
695 * If cp is empty, then there is nothing to reclaim here.
701 * Step through the list of entries to free, destroying each client
702 * handle and kmem_free'ing the memory for each entry.
708 CLNT_DESTROY(cp
->ch_client
);
710 kmem_cache_free(chtab_cache
, cp
);
716 * Update clalloc so that nfsstat shows the current number
717 * of allocated client handles.
719 atomic_add_64(&nfscl
->nfscl_stat
.clalloc
.value
.ui64
, -n
);
727 struct nfs_clnt
*nfscl
;
730 clstat_debug
.clreclaim
.value
.ui64
++;
733 * The system is low on memory; go through and try to reclaim some from
734 * every zone on the system.
736 mutex_enter(&nfs_clnt_list_lock
);
737 nfscl
= list_head(&nfs_clnt_list
);
738 for (; nfscl
!= NULL
; nfscl
= list_next(&nfs_clnt_list
, nfscl
))
739 clreclaim_zone(nfscl
, CL_HOLDTIME
);
740 mutex_exit(&nfs_clnt_list_lock
);
744 * Minimum time-out values indexed by call type
745 * These units are in "eights" of a second to avoid multiplies
747 static unsigned int minimum_timeo
[] = {
752 * Back off for retransmission timeout, MAXTIMO is in hz of a sec
/* cap for the retransmit timer: 20 seconds, expressed in clock ticks */
754 #define MAXTIMO (20*hz)
/* double 'tim' on each retransmission until it reaches MAXTIMO, then hold */
755 #define backoff(tim) (((tim) < MAXTIMO) ? dobackoff(tim) : (tim))
756 #define dobackoff(tim) ((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1))
758 #define MIN_NFS_TSIZE 512 /* minimum "chunk" of NFS IO */
759 #define REDUCE_NFS_TIME (hz/2) /* rtxcur we try to keep under */
760 #define INCREASE_NFS_TIME (hz/3*8) /* srtt we try to keep under (scaled*8) */
763 * Function called when rfscall notices that we have been
764 * re-transmitting, or when we get a response without retransmissions.
765 * Return 1 if the transfer size was adjusted down - 0 if no change.
768 nfs_feedback(int flag
, int which
, mntinfo_t
*mi
)
773 mutex_enter(&mi
->mi_lock
);
774 if (flag
== FEEDBACK_REXMIT1
) {
775 if (mi
->mi_timers
[NFS_CALLTYPES
].rt_rtxcur
!= 0 &&
776 mi
->mi_timers
[NFS_CALLTYPES
].rt_rtxcur
< REDUCE_NFS_TIME
)
778 if (mi
->mi_curread
> MIN_NFS_TSIZE
) {
780 if (mi
->mi_curread
< MIN_NFS_TSIZE
)
781 mi
->mi_curread
= MIN_NFS_TSIZE
;
785 if (mi
->mi_curwrite
> MIN_NFS_TSIZE
) {
786 mi
->mi_curwrite
/= 2;
787 if (mi
->mi_curwrite
< MIN_NFS_TSIZE
)
788 mi
->mi_curwrite
= MIN_NFS_TSIZE
;
791 } else if (flag
== FEEDBACK_OK
) {
792 kind
= mi
->mi_timer_type
[which
];
794 mi
->mi_timers
[kind
].rt_srtt
>= INCREASE_NFS_TIME
)
797 if (mi
->mi_curread
>= mi
->mi_tsize
)
799 mi
->mi_curread
+= MIN_NFS_TSIZE
;
800 if (mi
->mi_curread
> mi
->mi_tsize
/2)
801 mi
->mi_curread
= mi
->mi_tsize
;
802 } else if (kind
== 2) {
803 if (mi
->mi_curwrite
>= mi
->mi_stsize
)
805 mi
->mi_curwrite
+= MIN_NFS_TSIZE
;
806 if (mi
->mi_curwrite
> mi
->mi_stsize
/2)
807 mi
->mi_curwrite
= mi
->mi_stsize
;
811 mutex_exit(&mi
->mi_lock
);
/*
 * Debug counters -- presumably track how often the crnetadjust()
 * retry path in rfs2call succeeds (hits) vs. fails (misses);
 * TODO confirm against the (not visible) DEBUG code that updates them.
 */
816 static int rfs2call_hits
= 0;
817 static int rfs2call_misses
= 0;
821 rfs2call(mntinfo_t
*mi
, rpcproc_t which
, xdrproc_t xdrargs
, caddr_t argsp
,
822 xdrproc_t xdrres
, caddr_t resp
, cred_t
*cr
, int *douprintf
,
823 enum nfsstat
*statusp
, int flags
, failinfo_t
*fi
)
826 enum clnt_stat rpc_status
;
828 ASSERT(statusp
!= NULL
);
830 rpcerror
= rfscall(mi
, which
, xdrargs
, argsp
, xdrres
, resp
,
831 cr
, douprintf
, &rpc_status
, flags
, fi
);
834 * See crnetadjust() for comments.
836 if (*statusp
== NFSERR_ACCES
&&
837 (cr
= crnetadjust(cr
)) != NULL
) {
841 rpcerror
= rfscall(mi
, which
, xdrargs
, argsp
, xdrres
,
842 resp
, cr
, douprintf
, NULL
, flags
, fi
);
845 if (*statusp
== NFSERR_ACCES
)
849 } else if (rpc_status
== RPC_PROCUNAVAIL
) {
850 *statusp
= NFSERR_OPNOTSUPP
;
/* delay before retrying after NFS3ERR_JUKEBOX, in clock ticks (10 s) */
857 #define NFS3_JUKEBOX_DELAY 10 * hz
/*
 * Value actually passed to delay() in rfs3call; it is 0 here, so it is
 * presumably initialized to NFS3_JUKEBOX_DELAY elsewhere -- TODO confirm.
 * NOTE(review): the macro expansion '10 * hz' is unparenthesized.
 */
859 static clock_t nfs3_jukebox_delay
= 0;
/* debug counters for the crnetadjust() retry path in rfs3call -- TODO confirm */
862 static int rfs3call_hits
= 0;
863 static int rfs3call_misses
= 0;
867 rfs3call(mntinfo_t
*mi
, rpcproc_t which
, xdrproc_t xdrargs
, caddr_t argsp
,
868 xdrproc_t xdrres
, caddr_t resp
, cred_t
*cr
, int *douprintf
,
869 nfsstat3
*statusp
, int flags
, failinfo_t
*fi
)
876 rpcerror
= rfscall(mi
, which
, xdrargs
, argsp
, xdrres
, resp
,
877 cr
, douprintf
, NULL
, flags
, fi
);
880 if (*statusp
== NFS3ERR_JUKEBOX
) {
881 if (ttoproc(curthread
) == &p0
) {
885 if (!user_informed
) {
888 "file temporarily unavailable on the server, retrying...\n");
890 delay(nfs3_jukebox_delay
);
893 * See crnetadjust() for comments.
895 else if (*statusp
== NFS3ERR_ACCES
&&
896 (crr
= crnetadjust(cr
)) != NULL
) {
900 rpcerror
= rfscall(mi
, which
, xdrargs
, argsp
,
901 xdrres
, resp
, crr
, douprintf
,
906 if (*statusp
== NFS3ERR_ACCES
)
911 } while (!rpcerror
&& *statusp
== NFS3ERR_JUKEBOX
);
916 #define VALID_FH(fi) (VTOR(fi->vp)->r_server == VTOMI(fi->vp)->mi_curr_serv)
917 #define INC_READERS(mi) { \
920 #define DEC_READERS(mi) { \
922 if (mi->mi_readers == 0) \
923 cv_broadcast(&mi->mi_failover_cv); \
927 rfscall(mntinfo_t
*mi
, rpcproc_t which
, xdrproc_t xdrargs
, caddr_t argsp
,
928 xdrproc_t xdrres
, caddr_t resp
, cred_t
*icr
, int *douprintf
,
929 enum clnt_stat
*rpc_status
, int flags
, failinfo_t
*fi
)
934 enum clnt_stat status
;
935 struct rpc_err rpcerr
, rpcerr_tmp
;
937 int timeo
; /* in units of hz */
938 int my_rsize
, my_wsize
;
940 bool_t cred_cloned
= FALSE
;
943 struct nfs_clnt
*nfscl
;
944 zoneid_t zoneid
= getzoneid();
951 TRACE_2(TR_FAC_NFS
, TR_RFSCALL_START
,
952 "rfscall_start:which %d mi %p", which
, mi
);
954 nfscl
= zone_getspecific(nfsclnt_zone_key
, nfs_zone());
955 ASSERT(nfscl
!= NULL
);
957 nfscl
->nfscl_stat
.calls
.value
.ui64
++;
958 mi
->mi_reqs
[which
].value
.ui64
++;
960 rpcerr
.re_status
= RPC_SUCCESS
;
963 * In case of forced unmount or zone shutdown, return EIO.
966 if (FS_OR_ZONE_GONE(mi
->mi_vfsp
)) {
967 rpcerr
.re_status
= RPC_FAILED
;
968 rpcerr
.re_errno
= EIO
;
969 return (rpcerr
.re_errno
);
973 * Remember the transfer sizes in case
974 * nfs_feedback changes them underneath us.
976 my_rsize
= mi
->mi_curread
;
977 my_wsize
= mi
->mi_curwrite
;
980 * NFS client failover support
982 * If this rnode is not in sync with the current server (VALID_FH),
983 * we'd like to do a remap to get in sync. We can be interrupted
984 * in failover_remap(), and if so we'll bail. Otherwise, we'll
985 * use the best info we have to try the RPC. Part of that is
986 * unconditionally updating the filehandle copy kept for V3.
988 * Locking: INC_READERS/DEC_READERS is a poor man's interruptible
989 * rw_enter(); we're trying to keep the current server from being
990 * changed on us until we're done with the remapping and have a
991 * matching client handle. We don't want to sending a filehandle
995 if (FAILOVER_MOUNT(mi
)) {
996 mutex_enter(&mi
->mi_lock
);
997 if (!(flags
& RFSCALL_SOFT
) && failover_safe(fi
)) {
998 if (failover_wait(mi
)) {
999 mutex_exit(&mi
->mi_lock
);
1004 mutex_exit(&mi
->mi_lock
);
1006 if (!VALID_FH(fi
) &&
1007 !(flags
& RFSCALL_SOFT
) && failover_safe(fi
)) {
1010 svp
= mi
->mi_curr_serv
;
1011 remaperr
= failover_remap(fi
);
1012 if (remaperr
!= 0) {
1014 if (remaperr
!= EINTR
)
1015 nfs_cmn_err(remaperr
, CE_WARN
,
1016 "rfscall couldn't failover: %m");
1018 mutex_enter(&mi
->mi_lock
);
1020 mutex_exit(&mi
->mi_lock
);
1022 * If failover_remap returns ETIMEDOUT
1023 * and the filesystem is hard mounted
1024 * we have to retry the call with a new
1027 if ((mi
->mi_flags
& MI_HARD
) &&
1028 IS_RECOVERABLE_ERROR(remaperr
)) {
1029 if (svp
== mi
->mi_curr_serv
)
1030 failover_newserver(mi
);
1031 rpcerr
.re_status
= RPC_SUCCESS
;
1034 rpcerr
.re_errno
= remaperr
;
1038 if (fi
->fhp
&& fi
->copyproc
)
1039 (*fi
->copyproc
)(fi
->fhp
, fi
->vp
);
1044 * clget() calls clnt_tli_kinit() which clears the xid, so we
1045 * are guaranteed to reprocess the retry as a new request.
1047 svp
= mi
->mi_curr_serv
;
1048 rpcerr
.re_errno
= nfs_clget(mi
, svp
, cr
, &client
, &ch
, nfscl
);
1050 if (FAILOVER_MOUNT(mi
)) {
1051 mutex_enter(&mi
->mi_lock
);
1053 mutex_exit(&mi
->mi_lock
);
1055 if ((rpcerr
.re_errno
== ETIMEDOUT
||
1056 rpcerr
.re_errno
== ECONNRESET
) &&
1057 failover_safe(fi
)) {
1058 if (svp
== mi
->mi_curr_serv
)
1059 failover_newserver(mi
);
1063 if (rpcerr
.re_errno
!= 0)
1064 return (rpcerr
.re_errno
);
1066 if (svp
->sv_knconf
->knc_semantics
== NC_TPI_COTS_ORD
||
1067 svp
->sv_knconf
->knc_semantics
== NC_TPI_COTS
) {
1068 timeo
= (mi
->mi_timeo
* hz
) / 10;
1070 mutex_enter(&mi
->mi_lock
);
1071 timeo
= CLNT_SETTIMERS(client
,
1072 &(mi
->mi_timers
[mi
->mi_timer_type
[which
]]),
1073 &(mi
->mi_timers
[NFS_CALLTYPES
]),
1074 (minimum_timeo
[mi
->mi_call_type
[which
]]*hz
)>>3,
1075 (void (*)())NULL
, (caddr_t
)mi
, 0);
1076 mutex_exit(&mi
->mi_lock
);
1080 * If hard mounted fs, retry call forever unless hard error occurs.
1085 if (FS_OR_ZONE_GONE(mi
->mi_vfsp
)) {
1086 status
= RPC_FAILED
;
1087 rpcerr
.re_status
= RPC_FAILED
;
1088 rpcerr
.re_errno
= EIO
;
1092 TICK_TO_TIMEVAL(timeo
, &wait
);
1095 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1096 * and SIGTERM. (Preserving the existing masks).
1097 * Mask out SIGINT if mount option nointr is specified.
1099 sigintr(&smask
, (int)mi
->mi_flags
& MI_INT
);
1100 if (!(mi
->mi_flags
& MI_INT
))
1101 client
->cl_nosignal
= TRUE
;
1104 * If there is a current signal, then don't bother
1105 * even trying to send out the request because we
1106 * won't be able to block waiting for the response.
1107 * Simply assume RPC_INTR and get on with it.
1109 if (ttolwp(curthread
) != NULL
&& ISSIG(curthread
, JUSTLOOKING
))
1112 status
= CLNT_CALL(client
, which
, xdrargs
, argsp
,
1113 xdrres
, resp
, wait
);
1116 if (!(mi
->mi_flags
& MI_INT
))
1117 client
->cl_nosignal
= FALSE
;
1119 * restore original signal mask
1125 if ((mi
->mi_flags
& MI_DYNAMIC
) &&
1126 mi
->mi_timer_type
[which
] != 0 &&
1127 (mi
->mi_curread
!= my_rsize
||
1128 mi
->mi_curwrite
!= my_wsize
))
1129 (void) nfs_feedback(FEEDBACK_OK
, which
, mi
);
1134 * There is no way to recover from this error,
1135 * even if mount option nointr is specified.
1136 * SIGKILL, for example, cannot be blocked.
1138 rpcerr
.re_status
= RPC_INTR
;
1139 rpcerr
.re_errno
= EINTR
;
1144 * If the NFS server is local (vold) and
1145 * it goes away then we get RPC_UDERROR.
1146 * This is a retryable error, so we would
1147 * loop, so check to see if the specific
1148 * error was ECONNRESET, indicating that
1149 * target did not exist at all. If so,
1150 * return with RPC_PROGUNAVAIL and
1151 * ECONNRESET to indicate why.
1153 CLNT_GETERR(client
, &rpcerr
);
1154 if (rpcerr
.re_errno
== ECONNRESET
) {
1155 rpcerr
.re_status
= RPC_PROGUNAVAIL
;
1156 rpcerr
.re_errno
= ECONNRESET
;
1161 default: /* probably RPC_TIMEDOUT */
1162 if (IS_UNRECOVERABLE_RPC(status
))
1166 * increment server not responding count
1168 mutex_enter(&mi
->mi_lock
);
1169 mi
->mi_noresponse
++;
1170 mutex_exit(&mi
->mi_lock
);
1172 nfscl
->nfscl_stat
.noresponse
.value
.ui64
++;
1175 if (!(mi
->mi_flags
& MI_HARD
)) {
1176 if (!(mi
->mi_flags
& MI_SEMISOFT
) ||
1177 (mi
->mi_ss_call_type
[which
] == 0))
1182 * The call is in progress (over COTS).
1183 * Try the CLNT_CALL again, but don't
1184 * print a noisy error message.
1186 if (status
== RPC_INPROGRESS
) {
1191 if (flags
& RFSCALL_SOFT
)
1195 * On zone shutdown, just move on.
1197 if (zone_status_get(curproc
->p_zone
) >=
1198 ZONE_IS_SHUTTING_DOWN
) {
1199 rpcerr
.re_status
= RPC_FAILED
;
1200 rpcerr
.re_errno
= EIO
;
1205 * NFS client failover support
1207 * If the current server just failed us, we'll
1208 * start the process of finding a new server.
1209 * After that, we can just retry.
1211 if (FAILOVER_MOUNT(mi
) && failover_safe(fi
)) {
1212 if (svp
== mi
->mi_curr_serv
)
1213 failover_newserver(mi
);
1214 clfree_impl(client
, ch
, nfscl
);
1219 timeo
= backoff(timeo
);
1221 CLNT_GETERR(client
, &rpcerr_tmp
);
1222 if ((status
== RPC_CANTSEND
) &&
1223 (rpcerr_tmp
.re_errno
== ENOBUFS
))
1224 msg
= SRV_QFULL_MSG
;
1226 msg
= SRV_NOTRESP_MSG
;
1228 mutex_enter(&mi
->mi_lock
);
1229 if (!(mi
->mi_flags
& MI_PRINTED
)) {
1230 mi
->mi_flags
|= MI_PRINTED
;
1231 mutex_exit(&mi
->mi_lock
);
1233 zprintf(zoneid
, msg
, mi
->mi_vers
,
1236 zprintf(zoneid
, msg
, svp
->sv_hostname
);
1239 mutex_exit(&mi
->mi_lock
);
1240 if (*douprintf
&& nfs_has_ctty()) {
1242 if (!(mi
->mi_flags
& MI_NOPRINT
))
1244 uprintf(msg
, mi
->mi_vers
,
1247 uprintf(msg
, svp
->sv_hostname
);
1252 * If doing dynamic adjustment of transfer
1253 * size and if it's a read or write call
1254 * and if the transfer size changed while
1255 * retransmitting or if the feedback routine
1256 * changed the transfer size,
1257 * then exit rfscall so that the transfer
1258 * size can be adjusted at the vnops level.
1260 if ((mi
->mi_flags
& MI_DYNAMIC
) &&
1261 mi
->mi_timer_type
[which
] != 0 &&
1262 (mi
->mi_curread
!= my_rsize
||
1263 mi
->mi_curwrite
!= my_wsize
||
1264 nfs_feedback(FEEDBACK_REXMIT1
, which
, mi
))) {
1266 * On read or write calls, return
1267 * back to the vnode ops level if
1268 * the transfer size changed.
1270 clfree_impl(client
, ch
, nfscl
);
1273 return (ENFS_TRYAGAIN
);
1278 if (status
!= RPC_SUCCESS
) {
1280 * Let soft mounts use the timed out message.
1282 if (status
== RPC_INPROGRESS
)
1283 status
= RPC_TIMEDOUT
;
1284 nfscl
->nfscl_stat
.badcalls
.value
.ui64
++;
1285 if (status
!= RPC_INTR
) {
1286 mutex_enter(&mi
->mi_lock
);
1287 mi
->mi_flags
|= MI_DOWN
;
1288 mutex_exit(&mi
->mi_lock
);
1289 CLNT_GETERR(client
, &rpcerr
);
1291 bufp
= clnt_sperror(client
, svp
->sv_hostname
);
1292 zprintf(zoneid
, "NFS%d %s failed for %s\n",
1293 mi
->mi_vers
, mi
->mi_rfsnames
[which
], bufp
);
1294 if (nfs_has_ctty()) {
1295 if (!(mi
->mi_flags
& MI_NOPRINT
)) {
1296 uprintf("NFS%d %s failed for %s\n",
1297 mi
->mi_vers
, mi
->mi_rfsnames
[which
],
1301 kmem_free(bufp
, MAXPATHLEN
);
1304 "NFS %s failed for server %s: error %d (%s)\n",
1305 mi
->mi_rfsnames
[which
], svp
->sv_hostname
,
1306 status
, clnt_sperrno(status
));
1307 if (nfs_has_ctty()) {
1308 if (!(mi
->mi_flags
& MI_NOPRINT
)) {
1310 "NFS %s failed for server %s: error %d (%s)\n",
1311 mi
->mi_rfsnames
[which
],
1312 svp
->sv_hostname
, status
,
1313 clnt_sperrno(status
));
1318 * when CLNT_CALL() fails with RPC_AUTHERROR,
1319 * re_errno is set appropriately depending on
1320 * the authentication error
1322 if (status
== RPC_VERSMISMATCH
||
1323 status
== RPC_PROGVERSMISMATCH
)
1324 rpcerr
.re_errno
= EIO
;
1328 * Test the value of mi_down and mi_printed without
1329 * holding the mi_lock mutex. If they are both zero,
1330 * then it is okay to skip the down and printed
1331 * processing. This saves on a mutex_enter and
1332 * mutex_exit pair for a normal, successful RPC.
1333 * This was just complete overhead.
1335 if (mi
->mi_flags
& (MI_DOWN
| MI_PRINTED
)) {
1336 mutex_enter(&mi
->mi_lock
);
1337 mi
->mi_flags
&= ~MI_DOWN
;
1338 if (mi
->mi_flags
& MI_PRINTED
) {
1339 mi
->mi_flags
&= ~MI_PRINTED
;
1340 mutex_exit(&mi
->mi_lock
);
1342 if (!(mi
->mi_vfsp
->vfs_flag
& VFS_UNMOUNTED
))
1343 zprintf(zoneid
, "NFS%d server %s ok\n",
1344 mi
->mi_vers
, svp
->sv_hostname
);
1346 if (!(mi
->mi_vfsp
->vfs_flag
& VFS_UNMOUNTED
))
1347 zprintf(zoneid
, "NFS server %s ok\n",
1351 mutex_exit(&mi
->mi_lock
);
1354 if (*douprintf
== 0) {
1355 if (!(mi
->mi_flags
& MI_NOPRINT
))
1357 if (!(mi
->mi_vfsp
->vfs_flag
& VFS_UNMOUNTED
))
1358 uprintf("NFS%d server %s ok\n",
1359 mi
->mi_vers
, svp
->sv_hostname
);
1361 if (!(mi
->mi_vfsp
->vfs_flag
& VFS_UNMOUNTED
))
1362 uprintf("NFS server %s ok\n", svp
->sv_hostname
);
1368 clfree_impl(client
, ch
, nfscl
);
1372 ASSERT(rpcerr
.re_status
== RPC_SUCCESS
|| rpcerr
.re_errno
!= 0);
1374 if (rpc_status
!= NULL
)
1375 *rpc_status
= rpcerr
.re_status
;
1377 TRACE_1(TR_FAC_NFS
, TR_RFSCALL_END
, "rfscall_end:errno %d",
1380 return (rpcerr
.re_errno
);
1384 static int acl2call_hits
= 0;
1385 static int acl2call_misses
= 0;
1389 acl2call(mntinfo_t
*mi
, rpcproc_t which
, xdrproc_t xdrargs
, caddr_t argsp
,
1390 xdrproc_t xdrres
, caddr_t resp
, cred_t
*cr
, int *douprintf
,
1391 enum nfsstat
*statusp
, int flags
, failinfo_t
*fi
)
1395 rpcerror
= aclcall(mi
, which
, xdrargs
, argsp
, xdrres
, resp
,
1396 cr
, douprintf
, flags
, fi
);
1399 * See comments with crnetadjust().
1401 if (*statusp
== NFSERR_ACCES
&&
1402 (cr
= crnetadjust(cr
)) != NULL
) {
1406 rpcerror
= aclcall(mi
, which
, xdrargs
, argsp
, xdrres
,
1407 resp
, cr
, douprintf
, flags
, fi
);
1410 if (*statusp
== NFSERR_ACCES
)
1420 static int acl3call_hits
= 0;
1421 static int acl3call_misses
= 0;
1425 acl3call(mntinfo_t
*mi
, rpcproc_t which
, xdrproc_t xdrargs
, caddr_t argsp
,
1426 xdrproc_t xdrres
, caddr_t resp
, cred_t
*cr
, int *douprintf
,
1427 nfsstat3
*statusp
, int flags
, failinfo_t
*fi
)
1435 rpcerror
= aclcall(mi
, which
, xdrargs
, argsp
, xdrres
, resp
,
1436 cr
, douprintf
, flags
, fi
);
1439 if (*statusp
== NFS3ERR_JUKEBOX
) {
1440 if (!user_informed
) {
1443 "file temporarily unavailable on the server, retrying...\n");
1445 delay(nfs3_jukebox_delay
);
1448 * See crnetadjust() for comments.
1450 else if (*statusp
== NFS3ERR_ACCES
&&
1451 (crr
= crnetadjust(cr
)) != NULL
) {
1455 rpcerror
= aclcall(mi
, which
, xdrargs
, argsp
,
1456 xdrres
, resp
, crr
, douprintf
, flags
, fi
);
1460 if (*statusp
== NFS3ERR_ACCES
)
1465 } while (!rpcerror
&& *statusp
== NFS3ERR_JUKEBOX
);
1471 aclcall(mntinfo_t
*mi
, rpcproc_t which
, xdrproc_t xdrargs
, caddr_t argsp
,
1472 xdrproc_t xdrres
, caddr_t resp
, cred_t
*icr
, int *douprintf
,
1473 int flags
, failinfo_t
*fi
)
1478 bool_t cred_cloned
= FALSE
;
1479 enum clnt_stat status
;
1480 struct rpc_err rpcerr
;
1481 struct timeval wait
;
1482 int timeo
; /* in units of hz */
1484 int my_rsize
, my_wsize
;
1489 struct nfs_clnt
*nfscl
;
1490 zoneid_t zoneid
= getzoneid();
1496 TRACE_2(TR_FAC_NFS
, TR_RFSCALL_START
,
1497 "rfscall_start:which %d mi %p", which
, mi
);
1500 nfscl
= zone_getspecific(nfsclnt_zone_key
, nfs_zone());
1501 ASSERT(nfscl
!= NULL
);
1503 nfscl
->nfscl_stat
.calls
.value
.ui64
++;
1504 mi
->mi_aclreqs
[which
].value
.ui64
++;
1506 rpcerr
.re_status
= RPC_SUCCESS
;
1508 if (FS_OR_ZONE_GONE(mi
->mi_vfsp
)) {
1509 rpcerr
.re_status
= RPC_FAILED
;
1510 rpcerr
.re_errno
= EIO
;
1511 return (rpcerr
.re_errno
);
1516 * Remember the transfer sizes in case
1517 * nfs_feedback changes them underneath us.
1519 my_rsize
= mi
->mi_curread
;
1520 my_wsize
= mi
->mi_curwrite
;
1524 * NFS client failover support
1526 * If this rnode is not in sync with the current server (VALID_FH),
1527 * we'd like to do a remap to get in sync. We can be interrupted
1528 * in failover_remap(), and if so we'll bail. Otherwise, we'll
1529 * use the best info we have to try the RPC. Part of that is
1530 * unconditionally updating the filehandle copy kept for V3.
1532 * Locking: INC_READERS/DEC_READERS is a poor man's interrruptible
1533 * rw_enter(); we're trying to keep the current server from being
1534 * changed on us until we're done with the remapping and have a
1535 * matching client handle. We don't want to sending a filehandle
1536 * to the wrong host.
1539 if (FAILOVER_MOUNT(mi
)) {
1540 mutex_enter(&mi
->mi_lock
);
1541 if (!(flags
& RFSCALL_SOFT
) && failover_safe(fi
)) {
1542 if (failover_wait(mi
)) {
1543 mutex_exit(&mi
->mi_lock
);
1548 mutex_exit(&mi
->mi_lock
);
1550 if (!VALID_FH(fi
) &&
1551 !(flags
& RFSCALL_SOFT
) && failover_safe(fi
)) {
1554 svp
= mi
->mi_curr_serv
;
1555 remaperr
= failover_remap(fi
);
1556 if (remaperr
!= 0) {
1558 if (remaperr
!= EINTR
)
1559 nfs_cmn_err(remaperr
, CE_WARN
,
1560 "aclcall couldn't failover: %m");
1562 mutex_enter(&mi
->mi_lock
);
1564 mutex_exit(&mi
->mi_lock
);
1567 * If failover_remap returns ETIMEDOUT
1568 * and the filesystem is hard mounted
1569 * we have to retry the call with a new
1572 if ((mi
->mi_flags
& MI_HARD
) &&
1573 IS_RECOVERABLE_ERROR(remaperr
)) {
1574 if (svp
== mi
->mi_curr_serv
)
1575 failover_newserver(mi
);
1576 rpcerr
.re_status
= RPC_SUCCESS
;
1582 if (fi
->fhp
&& fi
->copyproc
)
1583 (*fi
->copyproc
)(fi
->fhp
, fi
->vp
);
1588 * acl_clget() calls clnt_tli_kinit() which clears the xid, so we
1589 * are guaranteed to reprocess the retry as a new request.
1591 svp
= mi
->mi_curr_serv
;
1592 rpcerr
.re_errno
= acl_clget(mi
, svp
, cr
, &client
, &ch
, nfscl
);
1593 if (FAILOVER_MOUNT(mi
)) {
1594 mutex_enter(&mi
->mi_lock
);
1596 mutex_exit(&mi
->mi_lock
);
1598 if ((rpcerr
.re_errno
== ETIMEDOUT
||
1599 rpcerr
.re_errno
== ECONNRESET
) &&
1600 failover_safe(fi
)) {
1601 if (svp
== mi
->mi_curr_serv
)
1602 failover_newserver(mi
);
1606 if (rpcerr
.re_errno
!= 0) {
1609 return (rpcerr
.re_errno
);
1612 if (svp
->sv_knconf
->knc_semantics
== NC_TPI_COTS_ORD
||
1613 svp
->sv_knconf
->knc_semantics
== NC_TPI_COTS
) {
1614 timeo
= (mi
->mi_timeo
* hz
) / 10;
1616 mutex_enter(&mi
->mi_lock
);
1617 timeo
= CLNT_SETTIMERS(client
,
1618 &(mi
->mi_timers
[mi
->mi_acl_timer_type
[which
]]),
1619 &(mi
->mi_timers
[NFS_CALLTYPES
]),
1620 (minimum_timeo
[mi
->mi_acl_call_type
[which
]]*hz
)>>3,
1621 (void (*)()) 0, (caddr_t
)mi
, 0);
1622 mutex_exit(&mi
->mi_lock
);
1626 * If hard mounted fs, retry call forever unless hard error occurs.
1631 if (FS_OR_ZONE_GONE(mi
->mi_vfsp
)) {
1632 status
= RPC_FAILED
;
1633 rpcerr
.re_status
= RPC_FAILED
;
1634 rpcerr
.re_errno
= EIO
;
1638 TICK_TO_TIMEVAL(timeo
, &wait
);
1641 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1642 * and SIGTERM. (Preserving the existing masks).
1643 * Mask out SIGINT if mount option nointr is specified.
1645 sigintr(&smask
, (int)mi
->mi_flags
& MI_INT
);
1646 if (!(mi
->mi_flags
& MI_INT
))
1647 client
->cl_nosignal
= TRUE
;
1650 * If there is a current signal, then don't bother
1651 * even trying to send out the request because we
1652 * won't be able to block waiting for the response.
1653 * Simply assume RPC_INTR and get on with it.
1655 if (ttolwp(curthread
) != NULL
&& ISSIG(curthread
, JUSTLOOKING
))
1658 status
= CLNT_CALL(client
, which
, xdrargs
, argsp
,
1659 xdrres
, resp
, wait
);
1662 if (!(mi
->mi_flags
& MI_INT
))
1663 client
->cl_nosignal
= FALSE
;
1665 * restore original signal mask
1672 if ((mi
->mi_flags
& MI_DYNAMIC
) &&
1673 mi
->mi_timer_type
[which
] != 0 &&
1674 (mi
->mi_curread
!= my_rsize
||
1675 mi
->mi_curwrite
!= my_wsize
))
1676 (void) nfs_feedback(FEEDBACK_OK
, which
, mi
);
1681 * Unfortunately, there are servers in the world which
1682 * are not coded correctly. They are not prepared to
1683 * handle RPC requests to the NFS port which are not
1684 * NFS requests. Thus, they may try to process the
1685 * NFS_ACL request as if it were an NFS request. This
1686 * does not work. Generally, an error will be generated
1687 * on the client because it will not be able to decode
1688 * the response from the server. However, it seems
1689 * possible that the server may not be able to decode
1690 * the arguments. Thus, the criteria for deciding
1691 * whether the server supports NFS_ACL or not is whether
1692 * the following RPC errors are returned from CLNT_CALL.
1694 case RPC_CANTDECODERES
:
1695 case RPC_PROGUNAVAIL
:
1696 case RPC_CANTDECODEARGS
:
1697 case RPC_PROGVERSMISMATCH
:
1698 mutex_enter(&mi
->mi_lock
);
1699 mi
->mi_flags
&= ~(MI_ACL
| MI_EXTATTR
);
1700 mutex_exit(&mi
->mi_lock
);
1704 * If the server supports NFS_ACL but not the new ops
1705 * for extended attributes, make sure we don't retry.
1707 case RPC_PROCUNAVAIL
:
1708 mutex_enter(&mi
->mi_lock
);
1709 mi
->mi_flags
&= ~MI_EXTATTR
;
1710 mutex_exit(&mi
->mi_lock
);
1715 * There is no way to recover from this error,
1716 * even if mount option nointr is specified.
1717 * SIGKILL, for example, cannot be blocked.
1719 rpcerr
.re_status
= RPC_INTR
;
1720 rpcerr
.re_errno
= EINTR
;
1725 * If the NFS server is local (vold) and
1726 * it goes away then we get RPC_UDERROR.
1727 * This is a retryable error, so we would
1728 * loop, so check to see if the specific
1729 * error was ECONNRESET, indicating that
1730 * target did not exist at all. If so,
1731 * return with RPC_PROGUNAVAIL and
1732 * ECONNRESET to indicate why.
1734 CLNT_GETERR(client
, &rpcerr
);
1735 if (rpcerr
.re_errno
== ECONNRESET
) {
1736 rpcerr
.re_status
= RPC_PROGUNAVAIL
;
1737 rpcerr
.re_errno
= ECONNRESET
;
1742 default: /* probably RPC_TIMEDOUT */
1743 if (IS_UNRECOVERABLE_RPC(status
))
1747 * increment server not responding count
1749 mutex_enter(&mi
->mi_lock
);
1750 mi
->mi_noresponse
++;
1751 mutex_exit(&mi
->mi_lock
);
1753 nfscl
->nfscl_stat
.noresponse
.value
.ui64
++;
1756 if (!(mi
->mi_flags
& MI_HARD
)) {
1757 if (!(mi
->mi_flags
& MI_SEMISOFT
) ||
1758 (mi
->mi_acl_ss_call_type
[which
] == 0))
1763 * The call is in progress (over COTS).
1764 * Try the CLNT_CALL again, but don't
1765 * print a noisy error message.
1767 if (status
== RPC_INPROGRESS
) {
1772 if (flags
& RFSCALL_SOFT
)
1776 * On zone shutdown, just move on.
1778 if (zone_status_get(curproc
->p_zone
) >=
1779 ZONE_IS_SHUTTING_DOWN
) {
1780 rpcerr
.re_status
= RPC_FAILED
;
1781 rpcerr
.re_errno
= EIO
;
1786 * NFS client failover support
1788 * If the current server just failed us, we'll
1789 * start the process of finding a new server.
1790 * After that, we can just retry.
1792 if (FAILOVER_MOUNT(mi
) && failover_safe(fi
)) {
1793 if (svp
== mi
->mi_curr_serv
)
1794 failover_newserver(mi
);
1795 clfree_impl(client
, ch
, nfscl
);
1800 timeo
= backoff(timeo
);
1801 mutex_enter(&mi
->mi_lock
);
1802 if (!(mi
->mi_flags
& MI_PRINTED
)) {
1803 mi
->mi_flags
|= MI_PRINTED
;
1804 mutex_exit(&mi
->mi_lock
);
1807 "NFS_ACL%d server %s not responding still trying\n",
1808 mi
->mi_vers
, svp
->sv_hostname
);
1811 "NFS server %s not responding still trying\n",
1815 mutex_exit(&mi
->mi_lock
);
1816 if (*douprintf
&& nfs_has_ctty()) {
1818 if (!(mi
->mi_flags
& MI_NOPRINT
))
1821 "NFS_ACL%d server %s not responding still trying\n",
1822 mi
->mi_vers
, svp
->sv_hostname
);
1825 "NFS server %s not responding still trying\n",
1832 * If doing dynamic adjustment of transfer
1833 * size and if it's a read or write call
1834 * and if the transfer size changed while
1835 * retransmitting or if the feedback routine
1836 * changed the transfer size,
1837 * then exit rfscall so that the transfer
1838 * size can be adjusted at the vnops level.
1840 if ((mi
->mi_flags
& MI_DYNAMIC
) &&
1841 mi
->mi_acl_timer_type
[which
] != 0 &&
1842 (mi
->mi_curread
!= my_rsize
||
1843 mi
->mi_curwrite
!= my_wsize
||
1844 nfs_feedback(FEEDBACK_REXMIT1
, which
, mi
))) {
1846 * On read or write calls, return
1847 * back to the vnode ops level if
1848 * the transfer size changed.
1850 clfree_impl(client
, ch
, nfscl
);
1853 return (ENFS_TRYAGAIN
);
1859 if (status
!= RPC_SUCCESS
) {
1861 * Let soft mounts use the timed out message.
1863 if (status
== RPC_INPROGRESS
)
1864 status
= RPC_TIMEDOUT
;
1865 nfscl
->nfscl_stat
.badcalls
.value
.ui64
++;
1866 if (status
== RPC_CANTDECODERES
||
1867 status
== RPC_PROGUNAVAIL
||
1868 status
== RPC_PROCUNAVAIL
||
1869 status
== RPC_CANTDECODEARGS
||
1870 status
== RPC_PROGVERSMISMATCH
)
1871 CLNT_GETERR(client
, &rpcerr
);
1872 else if (status
!= RPC_INTR
) {
1873 mutex_enter(&mi
->mi_lock
);
1874 mi
->mi_flags
|= MI_DOWN
;
1875 mutex_exit(&mi
->mi_lock
);
1876 CLNT_GETERR(client
, &rpcerr
);
1878 bufp
= clnt_sperror(client
, svp
->sv_hostname
);
1879 zprintf(zoneid
, "NFS_ACL%d %s failed for %s\n",
1880 mi
->mi_vers
, mi
->mi_aclnames
[which
], bufp
);
1881 if (nfs_has_ctty()) {
1882 if (!(mi
->mi_flags
& MI_NOPRINT
)) {
1883 uprintf("NFS_ACL%d %s failed for %s\n",
1884 mi
->mi_vers
, mi
->mi_aclnames
[which
],
1888 kmem_free(bufp
, MAXPATHLEN
);
1891 "NFS %s failed for server %s: error %d (%s)\n",
1892 mi
->mi_aclnames
[which
], svp
->sv_hostname
,
1893 status
, clnt_sperrno(status
));
1894 if (nfs_has_ctty()) {
1895 if (!(mi
->mi_flags
& MI_NOPRINT
))
1897 "NFS %s failed for server %s: error %d (%s)\n",
1898 mi
->mi_aclnames
[which
],
1899 svp
->sv_hostname
, status
,
1900 clnt_sperrno(status
));
1904 * when CLNT_CALL() fails with RPC_AUTHERROR,
1905 * re_errno is set appropriately depending on
1906 * the authentication error
1908 if (status
== RPC_VERSMISMATCH
||
1909 status
== RPC_PROGVERSMISMATCH
)
1910 rpcerr
.re_errno
= EIO
;
1914 * Test the value of mi_down and mi_printed without
1915 * holding the mi_lock mutex. If they are both zero,
1916 * then it is okay to skip the down and printed
1917 * processing. This saves on a mutex_enter and
1918 * mutex_exit pair for a normal, successful RPC.
1919 * This was just complete overhead.
1921 if (mi
->mi_flags
& (MI_DOWN
| MI_PRINTED
)) {
1922 mutex_enter(&mi
->mi_lock
);
1923 mi
->mi_flags
&= ~MI_DOWN
;
1924 if (mi
->mi_flags
& MI_PRINTED
) {
1925 mi
->mi_flags
&= ~MI_PRINTED
;
1926 mutex_exit(&mi
->mi_lock
);
1928 zprintf(zoneid
, "NFS_ACL%d server %s ok\n",
1929 mi
->mi_vers
, svp
->sv_hostname
);
1931 zprintf(zoneid
, "NFS server %s ok\n",
1935 mutex_exit(&mi
->mi_lock
);
1938 if (*douprintf
== 0) {
1939 if (!(mi
->mi_flags
& MI_NOPRINT
))
1941 uprintf("NFS_ACL%d server %s ok\n",
1942 mi
->mi_vers
, svp
->sv_hostname
);
1944 uprintf("NFS server %s ok\n", svp
->sv_hostname
);
1950 clfree_impl(client
, ch
, nfscl
);
1954 ASSERT(rpcerr
.re_status
== RPC_SUCCESS
|| rpcerr
.re_errno
!= 0);
1957 TRACE_1(TR_FAC_NFS
, TR_RFSCALL_END
, "rfscall_end:errno %d",
1961 return (rpcerr
.re_errno
);
1965 vattr_to_sattr(struct vattr
*vap
, struct nfssattr
*sa
)
1967 uint_t mask
= vap
->va_mask
;
1969 if (!(mask
& VATTR_MODE
))
1970 sa
->sa_mode
= (uint32_t)-1;
1972 sa
->sa_mode
= vap
->va_mode
;
1973 if (!(mask
& VATTR_UID
))
1974 sa
->sa_uid
= (uint32_t)-1;
1976 sa
->sa_uid
= (uint32_t)vap
->va_uid
;
1977 if (!(mask
& VATTR_GID
))
1978 sa
->sa_gid
= (uint32_t)-1;
1980 sa
->sa_gid
= (uint32_t)vap
->va_gid
;
1981 if (!(mask
& VATTR_SIZE
))
1982 sa
->sa_size
= (uint32_t)-1;
1984 sa
->sa_size
= (uint32_t)vap
->va_size
;
1985 if (!(mask
& VATTR_ATIME
))
1986 sa
->sa_atime
.tv_sec
= sa
->sa_atime
.tv_usec
= (int32_t)-1;
1988 /* check time validity */
1989 if (! NFS_TIME_T_OK(vap
->va_atime
.tv_sec
)) {
1992 sa
->sa_atime
.tv_sec
= vap
->va_atime
.tv_sec
;
1993 sa
->sa_atime
.tv_usec
= vap
->va_atime
.tv_nsec
/ 1000;
1995 if (!(mask
& VATTR_MTIME
))
1996 sa
->sa_mtime
.tv_sec
= sa
->sa_mtime
.tv_usec
= (int32_t)-1;
1998 /* check time validity */
1999 if (! NFS_TIME_T_OK(vap
->va_mtime
.tv_sec
)) {
2002 sa
->sa_mtime
.tv_sec
= vap
->va_mtime
.tv_sec
;
2003 sa
->sa_mtime
.tv_usec
= vap
->va_mtime
.tv_nsec
/ 1000;
2009 vattr_to_sattr3(struct vattr
*vap
, sattr3
*sa
)
2011 uint_t mask
= vap
->va_mask
;
2013 if (!(mask
& VATTR_MODE
))
2014 sa
->mode
.set_it
= FALSE
;
2016 sa
->mode
.set_it
= TRUE
;
2017 sa
->mode
.mode
= (mode3
)vap
->va_mode
;
2019 if (!(mask
& VATTR_UID
))
2020 sa
->uid
.set_it
= FALSE
;
2022 sa
->uid
.set_it
= TRUE
;
2023 sa
->uid
.uid
= (uid3
)vap
->va_uid
;
2025 if (!(mask
& VATTR_GID
))
2026 sa
->gid
.set_it
= FALSE
;
2028 sa
->gid
.set_it
= TRUE
;
2029 sa
->gid
.gid
= (gid3
)vap
->va_gid
;
2031 if (!(mask
& VATTR_SIZE
))
2032 sa
->size
.set_it
= FALSE
;
2034 sa
->size
.set_it
= TRUE
;
2035 sa
->size
.size
= (size3
)vap
->va_size
;
2037 if (!(mask
& VATTR_ATIME
))
2038 sa
->atime
.set_it
= DONT_CHANGE
;
2040 /* check time validity */
2041 if (! NFS_TIME_T_OK(vap
->va_atime
.tv_sec
)) {
2044 sa
->atime
.set_it
= SET_TO_CLIENT_TIME
;
2045 sa
->atime
.atime
.seconds
= (uint32
)vap
->va_atime
.tv_sec
;
2046 sa
->atime
.atime
.nseconds
= (uint32
)vap
->va_atime
.tv_nsec
;
2048 if (!(mask
& VATTR_MTIME
))
2049 sa
->mtime
.set_it
= DONT_CHANGE
;
2051 /* check time validity */
2052 if (! NFS_TIME_T_OK(vap
->va_mtime
.tv_sec
)) {
2055 sa
->mtime
.set_it
= SET_TO_CLIENT_TIME
;
2056 sa
->mtime
.mtime
.seconds
= (uint32
)vap
->va_mtime
.tv_sec
;
2057 sa
->mtime
.mtime
.nseconds
= (uint32
)vap
->va_mtime
.tv_nsec
;
2063 setdiropargs(struct nfsdiropargs
*da
, char *nm
, vnode_t
*dvp
)
2066 da
->da_fhandle
= VTOFH(dvp
);
2072 setdiropargs3(diropargs3
*da
, char *nm
, vnode_t
*dvp
)
2075 da
->dirp
= VTOFH3(dvp
);
2080 setdirgid(vnode_t
*dvp
, gid_t
*gidp
, cred_t
*cr
)
2086 va
.va_mask
= VATTR_MODE
| VATTR_GID
;
2087 error
= fop_getattr(dvp
, &va
, 0, cr
, NULL
);
2092 * To determine the expected group-id of the created file:
2093 * 1) If the filesystem was not mounted with the Old-BSD-compatible
2094 * GRPID option, and the directory's set-gid bit is clear,
2095 * then use the process's gid.
2096 * 2) Otherwise, set the group-id to the gid of the parent directory.
2099 mutex_enter(&rp
->r_statelock
);
2100 if (!(VTOMI(dvp
)->mi_flags
& MI_GRPID
) && !(va
.va_mode
& VSGID
))
2101 *gidp
= crgetgid(cr
);
2104 mutex_exit(&rp
->r_statelock
);
2109 setdirmode(vnode_t
*dvp
, mode_t
*omp
, cred_t
*cr
)
2114 va
.va_mask
= VATTR_MODE
;
2115 error
= fop_getattr(dvp
, &va
, 0, cr
, NULL
);
2120 * Modify the expected mode (om) so that the set-gid bit matches
2121 * that of the parent directory (dvp).
2123 if (va
.va_mode
& VSGID
)
2131 nfs_setswaplike(vnode_t
*vp
, vattr_t
*vap
)
2134 if (vp
->v_type
== VREG
&& (vap
->va_mode
& (VEXEC
| VSVTX
)) == VSVTX
) {
2135 if (!(vp
->v_flag
& VSWAPLIKE
)) {
2136 mutex_enter(&vp
->v_lock
);
2137 vp
->v_flag
|= VSWAPLIKE
;
2138 mutex_exit(&vp
->v_lock
);
2141 if (vp
->v_flag
& VSWAPLIKE
) {
2142 mutex_enter(&vp
->v_lock
);
2143 vp
->v_flag
&= ~VSWAPLIKE
;
2144 mutex_exit(&vp
->v_lock
);
2150 * Free the resources associated with an rnode.
2153 rinactive(rnode_t
*rp
, cred_t
*cr
)
2161 nfs3_pathconf_info
*info
;
2164 * Before freeing anything, wait until all asynchronous
2165 * activity is done on this rnode. This will allow all
2166 * asynchronous read ahead and write behind i/o's to
2169 mutex_enter(&rp
->r_statelock
);
2170 while (rp
->r_count
> 0)
2171 cv_wait(&rp
->r_cv
, &rp
->r_statelock
);
2172 mutex_exit(&rp
->r_statelock
);
2175 * Flush and invalidate all pages associated with the vnode.
2178 if (vn_has_cached_data(vp
)) {
2179 ASSERT(vp
->v_type
!= VCHR
);
2180 if ((rp
->r_flags
& RDIRTY
) && !rp
->r_error
) {
2181 error
= fop_putpage(vp
, 0, 0, 0, cr
, NULL
);
2182 if (error
&& (error
== ENOSPC
|| error
== EDQUOT
)) {
2183 mutex_enter(&rp
->r_statelock
);
2185 rp
->r_error
= error
;
2186 mutex_exit(&rp
->r_statelock
);
2189 nfs_invalidate_pages(vp
, 0, cr
);
2193 * Free any held credentials and caches which may be associated
2196 mutex_enter(&rp
->r_statelock
);
2199 contents
= rp
->r_symlink
.contents
;
2200 size
= rp
->r_symlink
.size
;
2201 rp
->r_symlink
.contents
= NULL
;
2202 vsp
= rp
->r_secattr
;
2203 rp
->r_secattr
= NULL
;
2204 info
= rp
->r_pathconf
;
2205 rp
->r_pathconf
= NULL
;
2206 mutex_exit(&rp
->r_statelock
);
2209 * Free the held credential.
2215 * Free the access cache entries.
2217 (void) nfs_access_purge_rp(rp
);
2220 * Free the readdir cache entries.
2222 if (HAVE_RDDIR_CACHE(rp
))
2223 nfs_purge_rddir_cache(vp
);
2226 * Free the symbolic link cache.
2228 if (contents
!= NULL
) {
2230 kmem_free((void *)contents
, size
);
2234 * Free any cached ACL.
2240 * Free any cached pathconf information.
2243 kmem_free(info
, sizeof (*info
));
2247 * Return a vnode for the given NFS Version 2 file handle.
2248 * If no rnode exists for this fhandle, create one and put it
2249 * into the hash queues. If the rnode for this fhandle
2250 * already exists, return it.
2252 * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
2255 makenfsnode(fhandle_t
*fh
, struct nfsfattr
*attr
, struct vfs
*vfsp
,
2256 hrtime_t t
, cred_t
*cr
, char *dnm
, char *nm
)
2264 nfh
.fh_len
= NFS_FHSIZE
;
2265 bcopy(fh
, nfh
.fh_buf
, NFS_FHSIZE
);
2267 index
= rtablehash(&nfh
);
2268 rw_enter(&rtable
[index
].r_lock
, RW_READER
);
2270 vp
= make_rnode(&nfh
, &rtable
[index
], vfsp
, &nfs_vnodeops
,
2271 nfs_putapage
, nfs_rddir_compar
, &newnode
, cr
, dnm
, nm
);
2275 rw_exit(&rtable
[index
].r_lock
);
2276 (void) nfs_cache_fattr(vp
, attr
, &va
, t
, cr
);
2278 if (attr
->na_type
< NFNON
|| attr
->na_type
> NFSOC
)
2281 vp
->v_type
= n2v_type(attr
);
2283 * A translation here seems to be necessary
2284 * because this function can be called
2285 * with `attr' that has come from the wire,
2286 * and been operated on by vattr_to_nattr().
2287 * See nfsrootvp()->VOP_GETTATTR()->nfsgetattr()
2288 * ->nfs_getattr_otw()->rfscall()->vattr_to_nattr()
2291 if ((attr
->na_rdev
& 0xffff0000) == 0)
2292 vp
->v_rdev
= nfsv2_expdev(attr
->na_rdev
);
2294 vp
->v_rdev
= expldev(n2v_rdev(attr
));
2295 nfs_attrcache(vp
, attr
, t
);
2296 rw_exit(&rtable
[index
].r_lock
);
2300 PURGE_ATTRCACHE(vp
);
2302 rw_exit(&rtable
[index
].r_lock
);
2309 * Return a vnode for the given NFS Version 3 file handle.
2310 * If no rnode exists for this fhandle, create one and put it
2311 * into the hash queues. If the rnode for this fhandle
2312 * already exists, return it.
2314 * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
2317 makenfs3node_va(nfs_fh3
*fh
, vattr_t
*vap
, struct vfs
*vfsp
, hrtime_t t
,
2318 cred_t
*cr
, char *dnm
, char *nm
)
2324 index
= rtablehash((nfs_fhandle
*)fh
);
2325 rw_enter(&rtable
[index
].r_lock
, RW_READER
);
2327 vp
= make_rnode((nfs_fhandle
*)fh
, &rtable
[index
], vfsp
,
2328 &nfs3_vnodeops
, nfs3_putapage
, nfs3_rddir_compar
, &newnode
, cr
,
2333 PURGE_ATTRCACHE(vp
);
2335 rw_exit(&rtable
[index
].r_lock
);
2340 rw_exit(&rtable
[index
].r_lock
);
2341 nfs_attr_cache(vp
, vap
, t
, cr
);
2343 rnode_t
*rp
= VTOR(vp
);
2345 vp
->v_type
= vap
->va_type
;
2346 vp
->v_rdev
= vap
->va_rdev
;
2348 mutex_enter(&rp
->r_statelock
);
2349 if (rp
->r_mtime
<= t
)
2350 nfs_attrcache_va(vp
, vap
);
2351 mutex_exit(&rp
->r_statelock
);
2352 rw_exit(&rtable
[index
].r_lock
);
2359 makenfs3node(nfs_fh3
*fh
, fattr3
*attr
, struct vfs
*vfsp
, hrtime_t t
,
2360 cred_t
*cr
, char *dnm
, char *nm
)
2367 index
= rtablehash((nfs_fhandle
*)fh
);
2368 rw_enter(&rtable
[index
].r_lock
, RW_READER
);
2370 vp
= make_rnode((nfs_fhandle
*)fh
, &rtable
[index
], vfsp
,
2371 &nfs3_vnodeops
, nfs3_putapage
, nfs3_rddir_compar
, &newnode
, cr
,
2376 PURGE_ATTRCACHE(vp
);
2378 rw_exit(&rtable
[index
].r_lock
);
2383 rw_exit(&rtable
[index
].r_lock
);
2384 (void) nfs3_cache_fattr3(vp
, attr
, &va
, t
, cr
);
2386 if (attr
->type
< NF3REG
|| attr
->type
> NF3FIFO
)
2389 vp
->v_type
= nf3_to_vt
[attr
->type
];
2390 vp
->v_rdev
= makedevice(attr
->rdev
.specdata1
,
2391 attr
->rdev
.specdata2
);
2392 nfs3_attrcache(vp
, attr
, t
);
2393 rw_exit(&rtable
[index
].r_lock
);
2400 * Read this comment before making changes to rtablehash()!
2401 * This is a hash function in which seemingly obvious and harmless
2402 * changes can cause escalations costing million dollars!
2403 * Know what you are doing.
2405 * rtablehash() implements Jenkins' one-at-a-time hash algorithm. The
2406 * algorithm is currently detailed here:
2408 * http://burtleburtle.net/bob/hash/doobs.html
2410 * Of course, the above link may not be valid by the time you are reading
2411 * this, but suffice it to say that the one-at-a-time algorithm works well in
2412 * almost all cases. If you are changing the algorithm be sure to verify that
2413 * the hash algorithm still provides even distribution in all cases and with
2414 * any server returning filehandles in whatever order (sequential or random).
2417 rtablehash(nfs_fhandle
*fh
)
2419 ulong_t hash
, len
, i
;
2423 len
= (ulong_t
)fh
->fh_len
;
2424 for (hash
= 0, i
= 0; i
< len
; i
++) {
2426 hash
+= (hash
<< 10);
2427 hash
^= (hash
>> 6);
2429 hash
+= (hash
<< 3);
2430 hash
^= (hash
>> 11);
2431 hash
+= (hash
<< 15);
2432 return (hash
& rtablemask
);
2436 make_rnode(nfs_fhandle
*fh
, rhashq_t
*rhtp
, struct vfs
*vfsp
,
2437 const struct vnodeops
*vops
,
2438 int (*putapage
)(vnode_t
*, page_t
*, uoff_t
*, size_t *, int, cred_t
*),
2439 int (*compar
)(const void *, const void *),
2440 int *newnode
, cred_t
*cr
, char *dnm
, char *nm
)
2447 ASSERT(RW_READ_HELD(&rhtp
->r_lock
));
2451 if ((rp
= rfind(rhtp
, fh
, vfsp
)) != NULL
) {
2457 rw_exit(&rhtp
->r_lock
);
2459 mutex_enter(&rpfreelist_lock
);
2460 if (rpfreelist
!= NULL
&& rnew
>= nrnode
) {
2463 mutex_exit(&rpfreelist_lock
);
2467 if (rp
->r_flags
& RHASHED
) {
2468 rw_enter(&rp
->r_hashq
->r_lock
, RW_WRITER
);
2469 mutex_enter(&vp
->v_lock
);
2470 if (vp
->v_count
> 1) {
2472 mutex_exit(&vp
->v_lock
);
2473 rw_exit(&rp
->r_hashq
->r_lock
);
2474 rw_enter(&rhtp
->r_lock
, RW_READER
);
2477 mutex_exit(&vp
->v_lock
);
2478 rp_rmhash_locked(rp
);
2479 rw_exit(&rp
->r_hashq
->r_lock
);
2484 mutex_enter(&vp
->v_lock
);
2485 if (vp
->v_count
> 1) {
2487 mutex_exit(&vp
->v_lock
);
2488 rw_enter(&rhtp
->r_lock
, RW_READER
);
2491 mutex_exit(&vp
->v_lock
);
2494 * destroy old locks before bzero'ing and
2495 * recreating the locks below.
2497 nfs_rw_destroy(&rp
->r_rwlock
);
2498 nfs_rw_destroy(&rp
->r_lkserlock
);
2499 mutex_destroy(&rp
->r_statelock
);
2500 cv_destroy(&rp
->r_cv
);
2501 cv_destroy(&rp
->r_commit
.c_cv
);
2502 nfs_free_r_path(rp
);
2503 avl_destroy(&rp
->r_dir
);
2505 * Make sure that if rnode is recycled then
2506 * VFS count is decremented properly before
2509 VFS_RELE(vp
->v_vfsp
);
2514 mutex_exit(&rpfreelist_lock
);
2516 rp
= kmem_cache_alloc(rnode_cache
, KM_SLEEP
);
2517 new_vp
= vn_alloc(KM_SLEEP
);
2519 atomic_inc_ulong((ulong_t
*)&rnew
);
2521 clstat_debug
.nrnode
.value
.ui64
++;
2526 bzero(rp
, sizeof (*rp
));
2528 nfs_rw_init(&rp
->r_rwlock
, NULL
, RW_DEFAULT
, NULL
);
2529 nfs_rw_init(&rp
->r_lkserlock
, NULL
, RW_DEFAULT
, NULL
);
2530 mutex_init(&rp
->r_statelock
, NULL
, MUTEX_DEFAULT
, NULL
);
2531 cv_init(&rp
->r_cv
, NULL
, CV_DEFAULT
, NULL
);
2532 cv_init(&rp
->r_commit
.c_cv
, NULL
, CV_DEFAULT
, NULL
);
2533 rp
->r_fh
.fh_len
= fh
->fh_len
;
2534 bcopy(fh
->fh_buf
, rp
->r_fh
.fh_buf
, fh
->fh_len
);
2535 rp
->r_server
= mi
->mi_curr_serv
;
2536 if (FAILOVER_MOUNT(mi
)) {
2538 * If replicated servers, stash pathnames
2540 if (dnm
!= NULL
&& nm
!= NULL
) {
2544 len
= (uint_t
)(strlen(dnm
) + strlen(nm
) + 2);
2545 rp
->r_path
= kmem_alloc(len
, KM_SLEEP
);
2547 clstat_debug
.rpath
.value
.ui64
+= len
;
2550 for (p
= dnm
; *p
; p
++)
2553 for (p
= nm
; *p
; p
++)
2557 /* special case for root */
2558 rp
->r_path
= kmem_alloc(2, KM_SLEEP
);
2560 clstat_debug
.rpath
.value
.ui64
+= 2;
2563 *(rp
->r_path
+ 1) = '\0';
2567 rp
->r_putapage
= putapage
;
2569 rp
->r_flags
= RREADDIRPLUS
;
2570 avl_create(&rp
->r_dir
, compar
, sizeof (rddir_cache
),
2571 offsetof(rddir_cache
, tree
));
2572 vn_setops(vp
, vops
);
2573 vp
->v_data
= (caddr_t
)rp
;
2576 vp
->v_flag
|= VMODSORT
;
2580 * There is a race condition if someone else
2581 * alloc's the rnode while no locks are held, so we
2582 * check again and recover if found.
2584 rw_enter(&rhtp
->r_lock
, RW_WRITER
);
2585 if ((trp
= rfind(rhtp
, fh
, vfsp
)) != NULL
) {
2589 rw_exit(&rhtp
->r_lock
);
2591 rw_enter(&rhtp
->r_lock
, RW_READER
);
2600 * Callback function to check if the page should be marked as
2601 * modified. In the positive case, p_fsdata is set to C_NOCOMMIT.
2604 nfs_setmod_check(page_t
*pp
)
2606 if (pp
->p_fsdata
!= C_NOCOMMIT
) {
2607 pp
->p_fsdata
= C_NOCOMMIT
;
2614 nfs_set_vroot(vnode_t
*vp
)
2617 nfs_fhandle
*rootfh
;
2620 rootfh
= &rp
->r_server
->sv_fhandle
;
2621 if (rootfh
->fh_len
== rp
->r_fh
.fh_len
&&
2622 bcmp(rootfh
->fh_buf
, rp
->r_fh
.fh_buf
, rp
->r_fh
.fh_len
) == 0) {
2623 if (!(vp
->v_flag
& VROOT
)) {
2624 mutex_enter(&vp
->v_lock
);
2625 vp
->v_flag
|= VROOT
;
2626 mutex_exit(&vp
->v_lock
);
2632 nfs_free_r_path(rnode_t
*rp
)
2640 len
= strlen(path
) + 1;
2641 kmem_free(path
, len
);
2643 clstat_debug
.rpath
.value
.ui64
-= len
;
2649 * Put an rnode on the free list.
2651 * Rnodes which were allocated above and beyond the normal limit
2652 * are immediately freed.
2655 rp_addfree(rnode_t
*rp
, cred_t
*cr
)
2661 ASSERT(vp
->v_count
>= 1);
2662 ASSERT(rp
->r_freef
== NULL
&& rp
->r_freeb
== NULL
);
2665 * If we have too many rnodes allocated and there are no
2666 * references to this rnode, or if the rnode is no longer
2667 * accessible by it does not reside in the hash queues,
2668 * or if an i/o error occurred while writing to the file,
2669 * then just free it instead of putting it on the rnode
2673 if (((rnew
> nrnode
|| !(rp
->r_flags
& RHASHED
) || rp
->r_error
||
2674 (vfsp
->vfs_flag
& VFS_UNMOUNTED
)) && rp
->r_count
== 0)) {
2675 if (rp
->r_flags
& RHASHED
) {
2676 rw_enter(&rp
->r_hashq
->r_lock
, RW_WRITER
);
2677 mutex_enter(&vp
->v_lock
);
2678 if (vp
->v_count
> 1) {
2680 mutex_exit(&vp
->v_lock
);
2681 rw_exit(&rp
->r_hashq
->r_lock
);
2684 mutex_exit(&vp
->v_lock
);
2685 rp_rmhash_locked(rp
);
2686 rw_exit(&rp
->r_hashq
->r_lock
);
2692 * Recheck the vnode reference count. We need to
2693 * make sure that another reference has not been
2694 * acquired while we were not holding v_lock. The
2695 * rnode is not in the rnode hash queues, so the
2696 * only way for a reference to have been acquired
2697 * is for a fop_putpage because the rnode was marked
2698 * with RDIRTY or for a modified page. This
2699 * reference may have been acquired before our call
2700 * to rinactive. The i/o may have been completed,
2701 * thus allowing rinactive to complete, but the
2702 * reference to the vnode may not have been released
2703 * yet. In any case, the rnode can not be destroyed
2704 * until the other references to this vnode have been
2705 * released. The other references will take care of
2706 * either destroying the rnode or placing it on the
2707 * rnode freelist. If there are no other references,
2708 * then the rnode may be safely destroyed.
2710 mutex_enter(&vp
->v_lock
);
2711 if (vp
->v_count
> 1) {
2713 mutex_exit(&vp
->v_lock
);
2716 mutex_exit(&vp
->v_lock
);
2723 * Lock the hash queue and then recheck the reference count
2724 * to ensure that no other threads have acquired a reference
2725 * to indicate that the rnode should not be placed on the
2726 * freelist. If another reference has been acquired, then
2727 * just release this one and let the other thread complete
2728 * the processing of adding this rnode to the freelist.
2730 rw_enter(&rp
->r_hashq
->r_lock
, RW_WRITER
);
2732 mutex_enter(&vp
->v_lock
);
2733 if (vp
->v_count
> 1) {
2735 mutex_exit(&vp
->v_lock
);
2736 rw_exit(&rp
->r_hashq
->r_lock
);
2739 mutex_exit(&vp
->v_lock
);
2742 * If there is no cached data or metadata for this file, then
2743 * put the rnode on the front of the freelist so that it will
2744 * be reused before other rnodes which may have cached data or
2745 * metadata associated with them.
2747 mutex_enter(&rpfreelist_lock
);
2748 if (rpfreelist
== NULL
) {
2753 rp
->r_freef
= rpfreelist
;
2754 rp
->r_freeb
= rpfreelist
->r_freeb
;
2755 rpfreelist
->r_freeb
->r_freef
= rp
;
2756 rpfreelist
->r_freeb
= rp
;
2757 if (!vn_has_cached_data(vp
) &&
2758 !HAVE_RDDIR_CACHE(rp
) &&
2759 rp
->r_symlink
.contents
== NULL
&&
2760 rp
->r_secattr
== NULL
&&
2761 rp
->r_pathconf
== NULL
)
2764 mutex_exit(&rpfreelist_lock
);
2766 rw_exit(&rp
->r_hashq
->r_lock
);
2770 * Remove an rnode from the free list.
2772 * The caller must be holding rpfreelist_lock and the rnode
2773 * must be on the freelist.
2776 rp_rmfree(rnode_t
*rp
)
2779 ASSERT(MUTEX_HELD(&rpfreelist_lock
));
2780 ASSERT(rp
->r_freef
!= NULL
&& rp
->r_freeb
!= NULL
);
2782 if (rp
== rpfreelist
) {
2783 rpfreelist
= rp
->r_freef
;
2784 if (rp
== rpfreelist
)
2788 rp
->r_freeb
->r_freef
= rp
->r_freef
;
2789 rp
->r_freef
->r_freeb
= rp
->r_freeb
;
2791 rp
->r_freef
= rp
->r_freeb
= NULL
;
2795 * Put a rnode in the hash table.
2797 * The caller must be holding the exclusive hash queue lock.
2800 rp_addhash(rnode_t
*rp
)
2804 ASSERT(RW_WRITE_HELD(&rp
->r_hashq
->r_lock
));
2805 ASSERT(!(rp
->r_flags
& RHASHED
));
2807 rp
->r_hashf
= rp
->r_hashq
->r_hashf
;
2808 rp
->r_hashq
->r_hashf
= rp
;
2809 rp
->r_hashb
= (rnode_t
*)rp
->r_hashq
;
2810 rp
->r_hashf
->r_hashb
= rp
;
2812 mutex_enter(&rp
->r_statelock
);
2813 rp
->r_flags
|= RHASHED
;
2814 mutex_exit(&rp
->r_statelock
);
2816 mi
= VTOMI(RTOV(rp
));
2817 mutex_enter(&mi
->mi_rnodes_lock
);
2818 list_insert_tail(&mi
->mi_rnodes
, rp
);
2819 mutex_exit(&mi
->mi_rnodes_lock
);
2823 * Remove a rnode from the hash table.
2825 * The caller must be holding the hash queue lock.
2828 rp_rmhash_locked(rnode_t
*rp
)
2832 ASSERT(RW_WRITE_HELD(&rp
->r_hashq
->r_lock
));
2833 ASSERT(rp
->r_flags
& RHASHED
);
2835 rp
->r_hashb
->r_hashf
= rp
->r_hashf
;
2836 rp
->r_hashf
->r_hashb
= rp
->r_hashb
;
2838 mutex_enter(&rp
->r_statelock
);
2839 rp
->r_flags
&= ~RHASHED
;
2840 mutex_exit(&rp
->r_statelock
);
2842 mi
= VTOMI(RTOV(rp
));
2843 mutex_enter(&mi
->mi_rnodes_lock
);
2844 if (list_link_active(&rp
->r_mi_link
))
2845 list_remove(&mi
->mi_rnodes
, rp
);
2846 mutex_exit(&mi
->mi_rnodes_lock
);
2850 * Remove a rnode from the hash table.
2852 * The caller must not be holding the hash queue lock.
2855 rp_rmhash(rnode_t
*rp
)
2858 rw_enter(&rp
->r_hashq
->r_lock
, RW_WRITER
);
2859 rp_rmhash_locked(rp
);
2860 rw_exit(&rp
->r_hashq
->r_lock
);
2864 * Lookup a rnode by fhandle.
2866 * The caller must be holding the hash queue lock, either shared or exclusive.
2869 rfind(rhashq_t
*rhtp
, nfs_fhandle
*fh
, struct vfs
*vfsp
)
2874 ASSERT(RW_LOCK_HELD(&rhtp
->r_lock
));
2876 for (rp
= rhtp
->r_hashf
; rp
!= (rnode_t
*)rhtp
; rp
= rp
->r_hashf
) {
2878 if (vp
->v_vfsp
== vfsp
&&
2879 rp
->r_fh
.fh_len
== fh
->fh_len
&&
2880 bcmp(rp
->r_fh
.fh_buf
, fh
->fh_buf
, fh
->fh_len
) == 0) {
2882 * remove rnode from free list, if necessary.
2884 if (rp
->r_freef
!= NULL
) {
2885 mutex_enter(&rpfreelist_lock
);
2887 * If the rnode is on the freelist,
2888 * then remove it and use that reference
2889 * as the new reference. Otherwise,
2890 * need to increment the reference count.
2892 if (rp
->r_freef
!= NULL
) {
2894 mutex_exit(&rpfreelist_lock
);
2896 mutex_exit(&rpfreelist_lock
);
2908 * Return 1 if there is an active vnode belonging to this vfs in the
2911 * Several of these checks are done without holding the usual
2912 * locks. This is safe because destroy_rtable(), rp_addfree(),
2913 * etc. will redo the necessary checks before actually destroying
2917 check_rtable(struct vfs
*vfsp
)
2923 ASSERT(vfsp
!= NULL
);
2926 mutex_enter(&mi
->mi_rnodes_lock
);
2927 for (rp
= list_head(&mi
->mi_rnodes
); rp
!= NULL
;
2928 rp
= list_next(&mi
->mi_rnodes
, rp
)) {
2931 if (rp
->r_freef
== NULL
||
2932 (vn_has_cached_data(vp
) && (rp
->r_flags
& RDIRTY
)) ||
2934 mutex_exit(&mi
->mi_rnodes_lock
);
2938 mutex_exit(&mi
->mi_rnodes_lock
);
2944 * Destroy inactive vnodes from the hash queues which belong to this
2945 * vfs. It is essential that we destroy all inactive vnodes during a
2946 * forced unmount as well as during a normal unmount.
2949 destroy_rtable(struct vfs
*vfsp
, cred_t
*cr
)
2954 ASSERT(vfsp
!= NULL
);
2958 mutex_enter(&rpfreelist_lock
);
2959 mutex_enter(&mi
->mi_rnodes_lock
);
2960 while ((rp
= list_remove_head(&mi
->mi_rnodes
)) != NULL
) {
2962 * If the rnode is no longer on the freelist it is not
2963 * ours and it will be handled by some other thread, so
2966 if (rp
->r_freef
== NULL
)
2968 mutex_exit(&mi
->mi_rnodes_lock
);
2971 mutex_exit(&rpfreelist_lock
);
2976 * This call to rp_addfree will end up destroying the
2977 * rnode, but in a safe way with the appropriate set
2982 mutex_enter(&rpfreelist_lock
);
2983 mutex_enter(&mi
->mi_rnodes_lock
);
2985 mutex_exit(&mi
->mi_rnodes_lock
);
2986 mutex_exit(&rpfreelist_lock
);
2990 * This routine destroys all the resources associated with the rnode
2991 * and then the rnode itself.
2994 destroy_rnode(rnode_t
*rp
)
3002 ASSERT(vp
->v_count
== 1);
3003 ASSERT(rp
->r_count
== 0);
3004 ASSERT(rp
->r_lmpl
== NULL
);
3005 ASSERT(rp
->r_mapcnt
== 0);
3006 ASSERT(!(rp
->r_flags
& RHASHED
));
3007 ASSERT(rp
->r_freef
== NULL
&& rp
->r_freeb
== NULL
);
3008 atomic_dec_ulong((ulong_t
*)&rnew
);
3010 clstat_debug
.nrnode
.value
.ui64
--;
3012 nfs_rw_destroy(&rp
->r_rwlock
);
3013 nfs_rw_destroy(&rp
->r_lkserlock
);
3014 mutex_destroy(&rp
->r_statelock
);
3015 cv_destroy(&rp
->r_cv
);
3016 cv_destroy(&rp
->r_commit
.c_cv
);
3017 if (rp
->r_flags
& RDELMAPLIST
)
3018 list_destroy(&rp
->r_indelmap
);
3019 nfs_free_r_path(rp
);
3020 avl_destroy(&rp
->r_dir
);
3023 kmem_cache_free(rnode_cache
, rp
);
3028 * Flush all vnodes in this (or every) vfs.
3029 * Used by nfs_sync and by nfs_unmount.
3032 rflush(struct vfs
*vfsp
, cred_t
*cr
)
3036 vnode_t
*vp
, **vplist
;
3040 * Check to see whether there is anything to do.
3047 * Allocate a slot for all currently active rnodes on the
3048 * supposition that they all may need flushing.
3050 vplist
= kmem_alloc(num
* sizeof (*vplist
), KM_SLEEP
);
3054 * If the vfs is known we can do fast path by iterating all rnodes that
3055 * belongs to this vfs. This is much faster than the traditional way
3056 * of iterating rtable (below) in a case there is a lot of rnodes that
3057 * does not belong to our vfs.
3060 mntinfo_t
*mi
= VFTOMI(vfsp
);
3062 mutex_enter(&mi
->mi_rnodes_lock
);
3063 for (rp
= list_head(&mi
->mi_rnodes
); rp
!= NULL
;
3064 rp
= list_next(&mi
->mi_rnodes
, rp
)) {
3067 * Don't bother sync'ing a vp if it
3068 * is part of virtual swap device or
3069 * if VFS is read-only
3071 if (IS_SWAPVP(vp
) || vn_is_readonly(vp
))
3074 * If the vnode has pages and is marked as either dirty
3075 * or mmap'd, hold and add this vnode to the list of
3078 ASSERT(vp
->v_vfsp
== vfsp
);
3079 if (vn_has_cached_data(vp
) &&
3080 ((rp
->r_flags
& RDIRTY
) || rp
->r_mapcnt
> 0)) {
3085 * The vplist is full because there is
3086 * too many rnodes. We are done for
3093 mutex_exit(&mi
->mi_rnodes_lock
);
3098 ASSERT(vfsp
== NULL
);
3101 * Walk the hash queues looking for rnodes with page
3102 * lists associated with them. Make a list of these
3105 for (index
= 0; index
< rtablesize
; index
++) {
3106 rw_enter(&rtable
[index
].r_lock
, RW_READER
);
3107 for (rp
= rtable
[index
].r_hashf
;
3108 rp
!= (rnode_t
*)(&rtable
[index
]);
3112 * Don't bother sync'ing a vp if it
3113 * is part of virtual swap device or
3114 * if VFS is read-only
3116 if (IS_SWAPVP(vp
) || vn_is_readonly(vp
))
3119 * If the vnode has pages and is marked as either dirty
3120 * or mmap'd, hold and add this vnode to the list of
3123 if (vn_has_cached_data(vp
) &&
3124 ((rp
->r_flags
& RDIRTY
) || rp
->r_mapcnt
> 0)) {
3128 rw_exit(&rtable
[index
].r_lock
);
3130 * The vplist is full because there is
3131 * too many rnodes. We are done for
3138 rw_exit(&rtable
[index
].r_lock
);
3144 * Flush and release all of the files on the list.
3148 (void) fop_putpage(vp
, 0, 0, B_ASYNC
, cr
, NULL
);
3153 * Free the space allocated to hold the list.
3155 kmem_free(vplist
, num
* sizeof (*vplist
));
3159 * This probably needs to be larger than or equal to
3160 * log2(sizeof (struct rnode)) due to the way that rnodes are
3163 #define ACACHE_SHIFT_BITS 9
3166 acachehash(rnode_t
*rp
, cred_t
*cr
)
3169 return ((((intptr_t)rp
>> ACACHE_SHIFT_BITS
) + crgetuid(cr
)) &
3174 static long nfs_access_cache_hits
= 0;
3175 static long nfs_access_cache_misses
= 0;
3179 nfs_access_check(rnode_t
*rp
, uint32_t acc
, cred_t
*cr
)
3184 nfs_access_type_t all
;
3187 if (!ATTRCACHE_VALID(vp
) || nfs_waitfor_purge_complete(vp
))
3188 return (NFS_ACCESS_UNKNOWN
);
3190 if (rp
->r_acache
!= NULL
) {
3191 hp
= &acache
[acachehash(rp
, cr
)];
3192 rw_enter(&hp
->lock
, RW_READER
);
3194 while (ap
!= (acache_t
*)hp
) {
3195 if (crcmp(ap
->cred
, cr
) == 0 && ap
->rnode
== rp
) {
3196 if ((ap
->known
& acc
) == acc
) {
3198 nfs_access_cache_hits
++;
3200 if ((ap
->allowed
& acc
) == acc
)
3201 all
= NFS_ACCESS_ALLOWED
;
3203 all
= NFS_ACCESS_DENIED
;
3206 nfs_access_cache_misses
++;
3208 all
= NFS_ACCESS_UNKNOWN
;
3219 nfs_access_cache_misses
++;
3221 return (NFS_ACCESS_UNKNOWN
);
3225 nfs_access_cache(rnode_t
*rp
, uint32_t acc
, uint32_t resacc
, cred_t
*cr
)
3231 hp
= &acache
[acachehash(rp
, cr
)];
3234 * Allocate now assuming that mostly an allocation will be
3235 * required. This allows the allocation to happen without
3236 * holding the hash bucket locked.
3238 nap
= kmem_cache_alloc(acache_cache
, KM_NOSLEEP
);
3241 nap
->allowed
= resacc
;
3248 rw_enter(&hp
->lock
, RW_WRITER
);
3250 if (rp
->r_acache
!= NULL
) {
3252 while (ap
!= (acache_t
*)hp
) {
3253 if (crcmp(ap
->cred
, cr
) == 0 && ap
->rnode
== rp
) {
3255 ap
->allowed
&= ~acc
;
3256 ap
->allowed
|= resacc
;
3260 kmem_cache_free(acache_cache
, nap
);
3270 clstat_debug
.access
.value
.ui64
++;
3272 nap
->next
= hp
->next
;
3274 nap
->next
->prev
= nap
;
3275 nap
->prev
= (acache_t
*)hp
;
3277 mutex_enter(&rp
->r_statelock
);
3278 nap
->list
= rp
->r_acache
;
3280 mutex_exit(&rp
->r_statelock
);
3287 nfs_access_purge_rp(rnode_t
*rp
)
3294 * If there aren't any cached entries, then there is nothing
3297 if (rp
->r_acache
== NULL
)
3300 mutex_enter(&rp
->r_statelock
);
3301 rplist
= rp
->r_acache
;
3302 rp
->r_acache
= NULL
;
3303 mutex_exit(&rp
->r_statelock
);
3306 * Loop through each entry in the list pointed to in the
3307 * rnode. Remove each of these entries from the hash
3308 * queue that it is on and remove it from the list in
3311 for (ap
= rplist
; ap
!= NULL
; ap
= tmpap
) {
3312 rw_enter(&ap
->hashq
->lock
, RW_WRITER
);
3313 ap
->prev
->next
= ap
->next
;
3314 ap
->next
->prev
= ap
->prev
;
3315 rw_exit(&ap
->hashq
->lock
);
3319 kmem_cache_free(acache_cache
, ap
);
3321 clstat_debug
.access
.value
.ui64
--;
3328 static const char prefix
[] = ".nfs";
3330 static kmutex_t newnum_lock
;
3335 static uint_t newnum
= 0;
3338 mutex_enter(&newnum_lock
);
3340 newnum
= gethrestime_sec() & 0xffff;
3342 mutex_exit(&newnum_lock
);
3355 news
= kmem_alloc(MAXNAMELEN
, KM_SLEEP
);
3361 *s
++ = "0123456789ABCDEF"[id
& 0x0f];
3369 * Snapshot callback for nfs:0:nfs_client as registered with the kstat
3373 cl_snapshot(kstat_t
*ksp
, void *buf
, int rw
)
3375 ksp
->ks_snaptime
= gethrtime();
3376 if (rw
== KSTAT_WRITE
) {
3377 bcopy(buf
, ksp
->ks_private
, sizeof (clstat_tmpl
));
3380 * Currently only the global zone can write to kstats, but we
3381 * add the check just for paranoia.
3383 if (INGLOBALZONE(curproc
))
3384 bcopy((char *)buf
+ sizeof (clstat_tmpl
), &clstat_debug
,
3385 sizeof (clstat_debug
));
3388 bcopy(ksp
->ks_private
, buf
, sizeof (clstat_tmpl
));
3391 * If we're displaying the "global" debug kstat values, we
3392 * display them as-is to all zones since in fact they apply to
3393 * the system as a whole.
3395 bcopy(&clstat_debug
, (char *)buf
+ sizeof (clstat_tmpl
),
3396 sizeof (clstat_debug
));
3403 clinit_zone(zoneid_t zoneid
)
3405 kstat_t
*nfs_client_kstat
;
3406 struct nfs_clnt
*nfscl
;
3409 nfscl
= kmem_alloc(sizeof (*nfscl
), KM_SLEEP
);
3410 mutex_init(&nfscl
->nfscl_chtable_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
3411 nfscl
->nfscl_chtable
= NULL
;
3412 nfscl
->nfscl_zoneid
= zoneid
;
3414 bcopy(&clstat_tmpl
, &nfscl
->nfscl_stat
, sizeof (clstat_tmpl
));
3415 ndata
= sizeof (clstat_tmpl
) / sizeof (kstat_named_t
);
3417 ndata
+= sizeof (clstat_debug
) / sizeof (kstat_named_t
);
3419 if ((nfs_client_kstat
= kstat_create_zone("nfs", 0, "nfs_client",
3420 "misc", KSTAT_TYPE_NAMED
, ndata
,
3421 KSTAT_FLAG_VIRTUAL
| KSTAT_FLAG_WRITABLE
, zoneid
)) != NULL
) {
3422 nfs_client_kstat
->ks_private
= &nfscl
->nfscl_stat
;
3423 nfs_client_kstat
->ks_snapshot
= cl_snapshot
;
3424 kstat_install(nfs_client_kstat
);
3426 mutex_enter(&nfs_clnt_list_lock
);
3427 list_insert_head(&nfs_clnt_list
, nfscl
);
3428 mutex_exit(&nfs_clnt_list_lock
);
3434 clfini_zone(zoneid_t zoneid
, void *arg
)
3436 struct nfs_clnt
*nfscl
= arg
;
3437 chhead_t
*chp
, *next
;
3441 mutex_enter(&nfs_clnt_list_lock
);
3442 list_remove(&nfs_clnt_list
, nfscl
);
3443 mutex_exit(&nfs_clnt_list_lock
);
3444 clreclaim_zone(nfscl
, 0);
3445 for (chp
= nfscl
->nfscl_chtable
; chp
!= NULL
; chp
= next
) {
3446 ASSERT(chp
->ch_list
== NULL
);
3447 kmem_free(chp
->ch_protofmly
, strlen(chp
->ch_protofmly
) + 1);
3448 next
= chp
->ch_next
;
3449 kmem_free(chp
, sizeof (*chp
));
3451 kstat_delete_byname_zone("nfs", 0, "nfs_client", zoneid
);
3452 mutex_destroy(&nfscl
->nfscl_chtable_lock
);
3453 kmem_free(nfscl
, sizeof (*nfscl
));
3457 * Called by endpnt_destructor to make sure the client handles are
3458 * cleaned up before the RPC endpoints. This becomes a no-op if
3459 * clfini_zone (above) is called first. This function is needed
3460 * (rather than relying on clfini_zone to clean up) because the ZSD
3461 * callbacks have no ordering mechanism, so we have no way to ensure
3462 * that clfini_zone is called before endpnt_destructor.
3465 clcleanup_zone(zoneid_t zoneid
)
3467 struct nfs_clnt
*nfscl
;
3469 mutex_enter(&nfs_clnt_list_lock
);
3470 nfscl
= list_head(&nfs_clnt_list
);
3471 for (; nfscl
!= NULL
; nfscl
= list_next(&nfs_clnt_list
, nfscl
)) {
3472 if (nfscl
->nfscl_zoneid
== zoneid
) {
3473 clreclaim_zone(nfscl
, 0);
3477 mutex_exit(&nfs_clnt_list_lock
);
3487 * Allocate and initialize the rnode hash queues
3491 nrnode_max
= (ulong_t
)((kmem_maxavail() >> 2) / sizeof (struct rnode
));
3492 if (nrnode
> nrnode_max
|| (nrnode
== 0 && ncsize
== 0)) {
3493 zcmn_err(GLOBAL_ZONEID
, CE_NOTE
,
3494 "!setting nrnode to max value of %ld", nrnode_max
);
3495 nrnode
= nrnode_max
;
3498 rtablesize
= 1 << highbit(nrnode
/ hashlen
);
3499 rtablemask
= rtablesize
- 1;
3500 rtable
= kmem_alloc(rtablesize
* sizeof (*rtable
), KM_SLEEP
);
3501 for (i
= 0; i
< rtablesize
; i
++) {
3502 rtable
[i
].r_hashf
= (rnode_t
*)(&rtable
[i
]);
3503 rtable
[i
].r_hashb
= (rnode_t
*)(&rtable
[i
]);
3504 rw_init(&rtable
[i
].r_lock
, NULL
, RW_DEFAULT
, NULL
);
3506 rnode_cache
= kmem_cache_create("rnode_cache", sizeof (rnode_t
),
3507 0, NULL
, NULL
, nfs_reclaim
, NULL
, NULL
, 0);
3510 * Allocate and initialize the access cache
3514 * Initial guess is one access cache entry per rnode unless
3515 * nacache is set to a non-zero value and then it is used to
3516 * indicate a guess at the number of access cache entries.
3519 acachesize
= 1 << highbit(nacache
/ hashlen
);
3521 acachesize
= rtablesize
;
3522 acachemask
= acachesize
- 1;
3523 acache
= kmem_alloc(acachesize
* sizeof (*acache
), KM_SLEEP
);
3524 for (i
= 0; i
< acachesize
; i
++) {
3525 acache
[i
].next
= (acache_t
*)&acache
[i
];
3526 acache
[i
].prev
= (acache_t
*)&acache
[i
];
3527 rw_init(&acache
[i
].lock
, NULL
, RW_DEFAULT
, NULL
);
3529 acache_cache
= kmem_cache_create("nfs_access_cache",
3530 sizeof (acache_t
), 0, NULL
, NULL
, NULL
, NULL
, NULL
, 0);
3532 * Allocate and initialize the client handle cache
3534 chtab_cache
= kmem_cache_create("client_handle_cache",
3535 sizeof (struct chtab
), 0, NULL
, NULL
, clreclaim
, NULL
, NULL
, 0);
3537 * Initialize the list of per-zone client handles (and associated data).
3538 * This needs to be done before we call zone_key_create().
3540 list_create(&nfs_clnt_list
, sizeof (struct nfs_clnt
),
3541 offsetof(struct nfs_clnt
, nfscl_node
));
3543 * Initialize the zone_key for per-zone client handle lists.
3545 zone_key_create(&nfsclnt_zone_key
, clinit_zone
, NULL
, clfini_zone
);
3547 * Initialize the various mutexes and reader/writer locks
3549 mutex_init(&rpfreelist_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
3550 mutex_init(&newnum_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
3551 mutex_init(&nfs_minor_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
3554 * Assign unique major number for all nfs mounts
3556 if ((nfs_major
= getudev()) == -1) {
3557 zcmn_err(GLOBAL_ZONEID
, CE_WARN
,
3558 "nfs: init: can't get unique device number");
3563 if (nfs3_jukebox_delay
== 0)
3564 nfs3_jukebox_delay
= NFS3_JUKEBOX_DELAY
;
3575 * Deallocate the rnode hash queues
3577 kmem_cache_destroy(rnode_cache
);
3579 for (i
= 0; i
< rtablesize
; i
++)
3580 rw_destroy(&rtable
[i
].r_lock
);
3581 kmem_free(rtable
, rtablesize
* sizeof (*rtable
));
3584 * Deallocated the access cache
3586 kmem_cache_destroy(acache_cache
);
3588 for (i
= 0; i
< acachesize
; i
++)
3589 rw_destroy(&acache
[i
].lock
);
3590 kmem_free(acache
, acachesize
* sizeof (*acache
));
3593 * Deallocate the client handle cache
3595 kmem_cache_destroy(chtab_cache
);
3598 * Destroy the various mutexes and reader/writer locks
3600 mutex_destroy(&rpfreelist_lock
);
3601 mutex_destroy(&newnum_lock
);
3602 mutex_destroy(&nfs_minor_lock
);
3603 (void) zone_key_delete(nfsclnt_zone_key
);
3612 return (NFSERR_OPNOTSUPP
);
3614 return (NFSERR_NAMETOOLONG
);
3616 return (NFSERR_NOTEMPTY
);
3618 return (NFSERR_DQUOT
);
3620 return (NFSERR_STALE
);
3622 return (NFSERR_REMOTE
);
3624 return (NFSERR_OPNOTSUPP
);
3626 return (NFSERR_INVAL
);
3628 return ((enum nfsstat
)error
);
3634 geterrno(enum nfsstat status
)
3638 case NFSERR_OPNOTSUPP
:
3639 return (EOPNOTSUPP
);
3640 case NFSERR_NAMETOOLONG
:
3641 return (ENAMETOOLONG
);
3642 case NFSERR_NOTEMPTY
:
3653 return ((int)status
);
3659 puterrno3(int error
)
3667 return (NFS3ERR_PERM
);
3669 return (NFS3ERR_NOENT
);
3671 return (NFS3ERR_IO
);
3673 return (NFS3ERR_NXIO
);
3675 return (NFS3ERR_ACCES
);
3677 return (NFS3ERR_EXIST
);
3679 return (NFS3ERR_XDEV
);
3681 return (NFS3ERR_NODEV
);
3683 return (NFS3ERR_NOTDIR
);
3685 return (NFS3ERR_ISDIR
);
3687 return (NFS3ERR_INVAL
);
3689 return (NFS3ERR_FBIG
);
3691 return (NFS3ERR_NOSPC
);
3693 return (NFS3ERR_ROFS
);
3695 return (NFS3ERR_MLINK
);
3697 return (NFS3ERR_NAMETOOLONG
);
3699 return (NFS3ERR_NOTEMPTY
);
3701 return (NFS3ERR_DQUOT
);
3703 return (NFS3ERR_STALE
);
3705 return (NFS3ERR_REMOTE
);
3708 return (NFS3ERR_NOTSUPP
);
3710 return (NFS3ERR_INVAL
);
3712 zcmn_err(getzoneid(), CE_WARN
,
3713 "puterrno3: got error %d", error
);
3714 return ((enum nfsstat3
)error
);
3719 return (NFS3ERR_NAMETOOLONG
);
3721 return (NFS3ERR_NOTEMPTY
);
3723 return (NFS3ERR_DQUOT
);
3725 return (NFS3ERR_STALE
);
3728 return (NFS3ERR_NOTSUPP
);
3730 return (NFS3ERR_REMOTE
);
3732 return (NFS3ERR_INVAL
);
3734 return ((enum nfsstat3
)error
);
3740 geterrno3(enum nfsstat3 status
)
3763 case NFS3ERR_NOTDIR
:
3777 case NFS3ERR_NAMETOOLONG
:
3778 return (ENAMETOOLONG
);
3779 case NFS3ERR_NOTEMPTY
:
3785 case NFS3ERR_REMOTE
:
3787 case NFS3ERR_BADHANDLE
:
3789 case NFS3ERR_NOT_SYNC
:
3791 case NFS3ERR_BAD_COOKIE
:
3793 case NFS3ERR_NOTSUPP
:
3794 return (EOPNOTSUPP
);
3795 case NFS3ERR_TOOSMALL
:
3797 case NFS3ERR_SERVERFAULT
:
3799 case NFS3ERR_BADTYPE
:
3801 case NFS3ERR_JUKEBOX
:
3804 zcmn_err(getzoneid(), CE_WARN
,
3805 "geterrno3: got status %d", status
);
3806 return ((int)status
);
3810 case NFS3ERR_NAMETOOLONG
:
3811 return (ENAMETOOLONG
);
3812 case NFS3ERR_NOTEMPTY
:
3817 case NFS3ERR_BADHANDLE
:
3819 case NFS3ERR_NOTSUPP
:
3820 return (EOPNOTSUPP
);
3821 case NFS3ERR_REMOTE
:
3823 case NFS3ERR_NOT_SYNC
:
3824 case NFS3ERR_TOOSMALL
:
3825 case NFS3ERR_BADTYPE
:
3827 case NFS3ERR_BAD_COOKIE
:
3829 case NFS3ERR_SERVERFAULT
:
3831 case NFS3ERR_JUKEBOX
:
3834 return ((int)status
);
3840 rddir_cache_alloc(int flags
)
3844 rc
= kmem_alloc(sizeof (*rc
), flags
);
3848 cv_init(&rc
->cv
, NULL
, CV_DEFAULT
, NULL
);
3849 mutex_init(&rc
->lock
, NULL
, MUTEX_DEFAULT
, NULL
);
3852 atomic_inc_64(&clstat_debug
.dirent
.value
.ui64
);
3859 rddir_cache_free(rddir_cache
*rc
)
3863 atomic_dec_64(&clstat_debug
.dirent
.value
.ui64
);
3865 if (rc
->entries
!= NULL
) {
3867 rddir_cache_buf_free(rc
->entries
, rc
->buflen
);
3869 kmem_free(rc
->entries
, rc
->buflen
);
3872 cv_destroy(&rc
->cv
);
3873 mutex_destroy(&rc
->lock
);
3874 kmem_free(rc
, sizeof (*rc
));
3878 rddir_cache_hold(rddir_cache
*rc
)
3881 mutex_enter(&rc
->lock
);
3883 mutex_exit(&rc
->lock
);
3887 rddir_cache_rele(rddir_cache
*rc
)
3890 mutex_enter(&rc
->lock
);
3891 ASSERT(rc
->count
> 0);
3892 if (--rc
->count
== 0) {
3893 mutex_exit(&rc
->lock
);
3894 rddir_cache_free(rc
);
3896 mutex_exit(&rc
->lock
);
3901 rddir_cache_buf_alloc(size_t size
, int flags
)
3905 rc
= kmem_alloc(size
, flags
);
3907 atomic_add_64(&clstat_debug
.dirents
.value
.ui64
, size
);
3912 rddir_cache_buf_free(void *addr
, size_t size
)
3915 atomic_add_64(&clstat_debug
.dirents
.value
.ui64
, -(int64_t)size
);
3916 kmem_free(addr
, size
);
3921 nfs_free_data_reclaim(rnode_t
*rp
)
3926 nfs3_pathconf_info
*info
;
3931 * Free any held credentials and caches which
3932 * may be associated with this rnode.
3934 mutex_enter(&rp
->r_statelock
);
3937 contents
= rp
->r_symlink
.contents
;
3938 size
= rp
->r_symlink
.size
;
3939 rp
->r_symlink
.contents
= NULL
;
3940 vsp
= rp
->r_secattr
;
3941 rp
->r_secattr
= NULL
;
3942 info
= rp
->r_pathconf
;
3943 rp
->r_pathconf
= NULL
;
3944 mutex_exit(&rp
->r_statelock
);
3950 * Free the access cache entries.
3952 freed
= nfs_access_purge_rp(rp
);
3954 if (!HAVE_RDDIR_CACHE(rp
) &&
3961 * Free the readdir cache entries
3963 if (HAVE_RDDIR_CACHE(rp
))
3964 nfs_purge_rddir_cache(RTOV(rp
));
3967 * Free the symbolic link cache.
3969 if (contents
!= NULL
) {
3971 kmem_free((void *)contents
, size
);
3975 * Free any cached ACL.
3981 * Free any cached pathconf information.
3984 kmem_free(info
, sizeof (*info
));
3990 nfs_active_data_reclaim(rnode_t
*rp
)
3995 nfs3_pathconf_info
*info
;
3999 * Free any held credentials and caches which
4000 * may be associated with this rnode.
4002 if (!mutex_tryenter(&rp
->r_statelock
))
4004 contents
= rp
->r_symlink
.contents
;
4005 size
= rp
->r_symlink
.size
;
4006 rp
->r_symlink
.contents
= NULL
;
4007 vsp
= rp
->r_secattr
;
4008 rp
->r_secattr
= NULL
;
4009 info
= rp
->r_pathconf
;
4010 rp
->r_pathconf
= NULL
;
4011 mutex_exit(&rp
->r_statelock
);
4014 * Free the access cache entries.
4016 freed
= nfs_access_purge_rp(rp
);
4018 if (!HAVE_RDDIR_CACHE(rp
) &&
4025 * Free the readdir cache entries
4027 if (HAVE_RDDIR_CACHE(rp
))
4028 nfs_purge_rddir_cache(RTOV(rp
));
4031 * Free the symbolic link cache.
4033 if (contents
!= NULL
) {
4035 kmem_free((void *)contents
, size
);
4039 * Free any cached ACL.
4045 * Free any cached pathconf information.
4048 kmem_free(info
, sizeof (*info
));
4054 nfs_free_reclaim(void)
4060 clstat_debug
.f_reclaim
.value
.ui64
++;
4063 mutex_enter(&rpfreelist_lock
);
4067 if (nfs_free_data_reclaim(rp
))
4069 } while ((rp
= rp
->r_freef
) != rpfreelist
);
4071 mutex_exit(&rpfreelist_lock
);
4076 nfs_active_reclaim(void)
4083 clstat_debug
.a_reclaim
.value
.ui64
++;
4086 for (index
= 0; index
< rtablesize
; index
++) {
4087 rw_enter(&rtable
[index
].r_lock
, RW_READER
);
4088 for (rp
= rtable
[index
].r_hashf
;
4089 rp
!= (rnode_t
*)(&rtable
[index
]);
4091 if (nfs_active_data_reclaim(rp
))
4094 rw_exit(&rtable
[index
].r_lock
);
4100 nfs_rnode_reclaim(void)
4107 clstat_debug
.r_reclaim
.value
.ui64
++;
4110 mutex_enter(&rpfreelist_lock
);
4111 while ((rp
= rpfreelist
) != NULL
) {
4113 mutex_exit(&rpfreelist_lock
);
4114 if (rp
->r_flags
& RHASHED
) {
4116 rw_enter(&rp
->r_hashq
->r_lock
, RW_WRITER
);
4117 mutex_enter(&vp
->v_lock
);
4118 if (vp
->v_count
> 1) {
4120 mutex_exit(&vp
->v_lock
);
4121 rw_exit(&rp
->r_hashq
->r_lock
);
4122 mutex_enter(&rpfreelist_lock
);
4125 mutex_exit(&vp
->v_lock
);
4126 rp_rmhash_locked(rp
);
4127 rw_exit(&rp
->r_hashq
->r_lock
);
4130 * This call to rp_addfree will end up destroying the
4131 * rnode, but in a safe way with the appropriate set
4134 rp_addfree(rp
, CRED());
4135 mutex_enter(&rpfreelist_lock
);
4137 mutex_exit(&rpfreelist_lock
);
4143 nfs_reclaim(void *cdrarg
)
4147 clstat_debug
.reclaim
.value
.ui64
++;
4149 if (nfs_free_reclaim())
4152 if (nfs_active_reclaim())
4155 (void) nfs_rnode_reclaim();
4159 * NFS client failover support
4161 * Routines to copy filehandles
4164 nfscopyfh(caddr_t fhp
, vnode_t
*vp
)
4166 fhandle_t
*dest
= (fhandle_t
*)fhp
;
4173 nfs3copyfh(caddr_t fhp
, vnode_t
*vp
)
4175 nfs_fh3
*dest
= (nfs_fh3
*)fhp
;
4178 *dest
= *VTOFH3(vp
);
4182 * NFS client failover support
4184 * failover_safe() will test various conditions to ensure that
4185 * failover is permitted for this vnode. It will be denied
4187 * 1) the operation in progress does not support failover (NULL fi)
4188 * 2) there are no available replicas (NULL mi_servers->sv_next)
4189 * 3) any locks are outstanding on this file
4192 failover_safe(failinfo_t
*fi
)
4196 * Does this op permit failover?
4198 if (fi
== NULL
|| fi
->vp
== NULL
)
4202 * Are there any alternates to failover to?
4204 if (VTOMI(fi
->vp
)->mi_servers
->sv_next
== NULL
)
4208 * Disable check; we've forced local locking
4210 * if (flk_has_remote_locks(fi->vp))
4215 * If we have no partial path, we can't do anything
4217 if (VTOR(fi
->vp
)->r_path
== NULL
)
4223 #include <sys/thread.h>
4226 * NFS client failover support
4228 * failover_newserver() will start a search for a new server,
4229 * preferably by starting an async thread to do the work. If
4230 * someone is already doing this (recognizable by MI_BINDINPROG
4231 * being set), it will simply return and the calling thread
4232 * will queue on the mi_failover_cv condition variable.
4235 failover_newserver(mntinfo_t
*mi
)
4238 * Check if someone else is doing this already
4240 mutex_enter(&mi
->mi_lock
);
4241 if (mi
->mi_flags
& MI_BINDINPROG
) {
4242 mutex_exit(&mi
->mi_lock
);
4245 mi
->mi_flags
|= MI_BINDINPROG
;
4248 * Need to hold the vfs struct so that it can't be released
4249 * while the failover thread is selecting a new server.
4251 VFS_HOLD(mi
->mi_vfsp
);
4254 * Start a thread to do the real searching.
4256 (void) zthread_create(NULL
, 0, failover_thread
, mi
, 0, minclsyspri
);
4258 mutex_exit(&mi
->mi_lock
);
4262 * NFS client failover support
4264 * failover_thread() will find a new server to replace the one
4265 * currently in use, wake up other threads waiting on this mount
4266 * point, and die. It will start at the head of the server list
4267 * and poll servers until it finds one with an NFS server which is
4268 * registered and responds to a NULL procedure ping.
4270 * XXX failover_thread is unsafe within the scope of the
4271 * present model defined for cpr to suspend the system.
4272 * Specifically, over-the-wire calls made by the thread
4273 * are unsafe. The thread needs to be reevaluated in case of
4274 * future updates to the cpr suspend model.
4277 failover_thread(mntinfo_t
*mi
)
4279 servinfo_t
*svp
= NULL
;
4281 enum clnt_stat status
;
4285 callb_cpr_t cprinfo
;
4289 size_t srvnames_len
;
4290 struct nfs_clnt
*nfscl
= NULL
;
4291 zoneid_t zoneid
= getzoneid();
4295 * This is currently only needed to access counters which exist on
4296 * DEBUG kernels, hence we don't want to pay the penalty of the lookup
4297 * on non-DEBUG kernels.
4299 nfscl
= zone_getspecific(nfsclnt_zone_key
, nfs_zone());
4300 ASSERT(nfscl
!= NULL
);
4304 * Its safe to piggyback on the mi_lock since failover_newserver()
4305 * code guarantees that there will be only one failover thread
4306 * per mountinfo at any instance.
4308 CALLB_CPR_INIT(&cprinfo
, &mi
->mi_lock
, callb_generic_cpr
,
4311 mutex_enter(&mi
->mi_lock
);
4312 while (mi
->mi_readers
) {
4313 CALLB_CPR_SAFE_BEGIN(&cprinfo
);
4314 cv_wait(&mi
->mi_failover_cv
, &mi
->mi_lock
);
4315 CALLB_CPR_SAFE_END(&cprinfo
, &mi
->mi_lock
);
4317 mutex_exit(&mi
->mi_lock
);
4323 * Ping the null NFS procedure of every server in
4324 * the list until one responds. We always start
4325 * at the head of the list and always skip the one
4326 * that is current, since it's caused us a problem.
4328 while (svp
== NULL
) {
4329 for (svp
= mi
->mi_servers
; svp
; svp
= svp
->sv_next
) {
4330 if (!oncethru
&& svp
== mi
->mi_curr_serv
)
4334 * If the file system was forcibly umounted
4335 * while trying to do a failover, then just
4336 * give up on the failover. It won't matter
4337 * what the server is.
4339 if (FS_OR_ZONE_GONE(mi
->mi_vfsp
)) {
4344 error
= clnt_tli_kcreate(svp
->sv_knconf
, &svp
->sv_addr
,
4345 NFS_PROGRAM
, NFS_VERSION
, 0, 1, CRED(), &cl
);
4349 if (!(mi
->mi_flags
& MI_INT
))
4350 cl
->cl_nosignal
= TRUE
;
4351 status
= CLNT_CALL(cl
, RFS_NULL
, xdr_void
, NULL
,
4352 xdr_void
, NULL
, tv
);
4353 if (!(mi
->mi_flags
& MI_INT
))
4354 cl
->cl_nosignal
= FALSE
;
4355 AUTH_DESTROY(cl
->cl_auth
);
4357 if (status
== RPC_SUCCESS
) {
4358 if (svp
== mi
->mi_curr_serv
) {
4360 zcmn_err(zoneid
, CE_NOTE
,
4361 "NFS%d: failing over: selecting original server %s",
4362 mi
->mi_vers
, svp
->sv_hostname
);
4364 zcmn_err(zoneid
, CE_NOTE
,
4365 "NFS: failing over: selecting original server %s",
4370 zcmn_err(zoneid
, CE_NOTE
,
4371 "NFS%d: failing over from %s to %s",
4373 mi
->mi_curr_serv
->sv_hostname
,
4376 zcmn_err(zoneid
, CE_NOTE
,
4377 "NFS: failing over from %s to %s",
4378 mi
->mi_curr_serv
->sv_hostname
,
4388 srvnames
= nfs_getsrvnames(mi
, &srvnames_len
);
4391 "NFS%d servers %s not responding "
4392 "still trying\n", mi
->mi_vers
, srvnames
);
4394 zprintf(zoneid
, "NFS servers %s not responding "
4395 "still trying\n", srvnames
);
4399 mutex_enter(&mi
->mi_lock
);
4400 CALLB_CPR_SAFE_BEGIN(&cprinfo
);
4401 mutex_exit(&mi
->mi_lock
);
4403 mutex_enter(&mi
->mi_lock
);
4404 CALLB_CPR_SAFE_END(&cprinfo
, &mi
->mi_lock
);
4405 mutex_exit(&mi
->mi_lock
);
4411 zprintf(zoneid
, "NFS%d servers %s ok\n", mi
->mi_vers
, srvnames
);
4413 zprintf(zoneid
, "NFS servers %s ok\n", srvnames
);
4417 if (svp
!= mi
->mi_curr_serv
) {
4418 (void) dnlc_purge_vfsp(mi
->mi_vfsp
, 0);
4419 index
= rtablehash(&mi
->mi_curr_serv
->sv_fhandle
);
4420 rw_enter(&rtable
[index
].r_lock
, RW_WRITER
);
4421 rp
= rfind(&rtable
[index
], &mi
->mi_curr_serv
->sv_fhandle
,
4424 if (rp
->r_flags
& RHASHED
)
4425 rp_rmhash_locked(rp
);
4426 rw_exit(&rtable
[index
].r_lock
);
4428 rp
->r_fh
= svp
->sv_fhandle
;
4429 (void) nfs_free_data_reclaim(rp
);
4430 index
= rtablehash(&rp
->r_fh
);
4431 rp
->r_hashq
= &rtable
[index
];
4432 rw_enter(&rp
->r_hashq
->r_lock
, RW_WRITER
);
4433 vn_exists(RTOV(rp
));
4435 rw_exit(&rp
->r_hashq
->r_lock
);
4438 rw_exit(&rtable
[index
].r_lock
);
4443 kmem_free(srvnames
, srvnames_len
);
4444 mutex_enter(&mi
->mi_lock
);
4445 mi
->mi_flags
&= ~MI_BINDINPROG
;
4447 mi
->mi_curr_serv
= svp
;
4450 nfscl
->nfscl_stat
.failover
.value
.ui64
++;
4453 cv_broadcast(&mi
->mi_failover_cv
);
4454 CALLB_CPR_EXIT(&cprinfo
);
4455 VFS_RELE(mi
->mi_vfsp
);
4461 * NFS client failover support
4463 * failover_wait() will put the thread to sleep until MI_BINDINPROG
4464 * is cleared, meaning that failover is complete. Called with
4465 * mi_lock mutex held.
4468 failover_wait(mntinfo_t
*mi
)
4473 * If someone else is hunting for a living server,
4474 * sleep until it's done. After our sleep, we may
4475 * be bound to the right server and get off cheaply.
4477 while (mi
->mi_flags
& MI_BINDINPROG
) {
4479 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
4480 * and SIGTERM. (Preserving the existing masks).
4481 * Mask out SIGINT if mount option nointr is specified.
4483 sigintr(&smask
, (int)mi
->mi_flags
& MI_INT
);
4484 if (!cv_wait_sig(&mi
->mi_failover_cv
, &mi
->mi_lock
)) {
4486 * restore original signal mask
4492 * restore original signal mask
4500 * NFS client failover support
4502 * failover_remap() will do a partial pathname lookup and find the
4503 * desired vnode on the current server. The interim vnode will be
4504 * discarded after we pilfer the new filehandle.
4507 * - This routine will also update the filehandle in the args structure
4508 * pointed to by the fi->fhp pointer if it is non-NULL.
4512 failover_remap(failinfo_t
*fi
)
4514 vnode_t
*vp
, *nvp
, *rootvp
;
4519 struct nfs_clnt
*nfscl
;
4521 nfscl
= zone_getspecific(nfsclnt_zone_key
, nfs_zone());
4522 ASSERT(nfscl
!= NULL
);
4527 if (fi
== NULL
|| fi
->vp
== NULL
|| fi
->lookupproc
== NULL
)
4533 if (!(vp
->v_flag
& VROOT
)) {
4535 * Given the root fh, use the path stored in
4536 * the rnode to find the fh for the new server.
4538 error
= VFS_ROOT(mi
->mi_vfsp
, &rootvp
);
4542 error
= failover_lookup(rp
->r_path
, rootvp
,
4543 fi
->lookupproc
, fi
->xattrdirproc
, &nvp
);
4551 * If we found the same rnode, we're done now
4555 * Failed and the new server may physically be same
4556 * OR may share a same disk subsystem. In this case
4557 * file handle for a particular file path is not going
4558 * to change, given the same filehandle lookup will
4559 * always locate the same rnode as the existing one.
4560 * All we might need to do is to update the r_server
4561 * with the current servinfo.
4563 if (!VALID_FH(fi
)) {
4564 rp
->r_server
= mi
->mi_curr_serv
;
4571 * Try to make it so that no one else will find this
4572 * vnode because it is just a temporary to hold the
4573 * new file handle until that file handle can be
4574 * copied to the original vnode/rnode.
4577 mutex_enter(&mi
->mi_remap_lock
);
4579 * Some other thread could have raced in here and could
4580 * have done the remap for this particular rnode before
4581 * this thread here. Check for rp->r_server and
4582 * mi->mi_curr_serv and return if they are same.
4585 mutex_exit(&mi
->mi_remap_lock
);
4590 if (nrp
->r_flags
& RHASHED
)
4594 * As a heuristic check on the validity of the new
4595 * file, check that the size and type match against
4596 * that we remember from the old version.
4598 if (rp
->r_size
!= nrp
->r_size
|| vp
->v_type
!= nvp
->v_type
) {
4599 mutex_exit(&mi
->mi_remap_lock
);
4600 zcmn_err(mi
->mi_zone
->zone_id
, CE_WARN
,
4601 "NFS replicas %s and %s: file %s not same.",
4602 rp
->r_server
->sv_hostname
,
4603 nrp
->r_server
->sv_hostname
, rp
->r_path
);
4609 * snarf the filehandle from the new rnode
4610 * then release it, again while updating the
4611 * hash queues for the rnode.
4613 if (rp
->r_flags
& RHASHED
)
4615 rp
->r_server
= mi
->mi_curr_serv
;
4616 rp
->r_fh
= nrp
->r_fh
;
4617 rp
->r_hashq
= nrp
->r_hashq
;
4619 * Copy the attributes from the new rnode to the old
4620 * rnode. This will help to reduce unnecessary page
4623 rp
->r_attr
= nrp
->r_attr
;
4624 rp
->r_attrtime
= nrp
->r_attrtime
;
4625 rp
->r_mtime
= nrp
->r_mtime
;
4626 (void) nfs_free_data_reclaim(rp
);
4627 nfs_setswaplike(vp
, &rp
->r_attr
);
4628 rw_enter(&rp
->r_hashq
->r_lock
, RW_WRITER
);
4630 rw_exit(&rp
->r_hashq
->r_lock
);
4631 mutex_exit(&mi
->mi_remap_lock
);
4636 * Update successful failover remap count
4638 mutex_enter(&mi
->mi_lock
);
4640 mutex_exit(&mi
->mi_lock
);
4642 nfscl
->nfscl_stat
.remap
.value
.ui64
++;
4646 * If we have a copied filehandle to update, do it now.
4648 if (fi
->fhp
!= NULL
&& fi
->copyproc
!= NULL
)
4649 (*fi
->copyproc
)(fi
->fhp
, vp
);
4655 * NFS client failover support
4657 * We want a simple pathname lookup routine to parse the pieces
4658 * of path in rp->r_path. We know that the path was a created
4659 * as rnodes were made, so we know we have only to deal with
4660 * paths that look like:
4661 * dir1/dir2/dir3/file
4662 * Any evidence of anything like .., symlinks, and ENOTDIR
4663 * are hard errors, because they mean something in this filesystem
4664 * is different from the one we came from, or has changed under
4665 * us in some way. If this is true, we want the failure.
4667 * Extended attributes: if the filesystem is mounted with extended
4668 * attributes enabled (-o xattr), the attribute directory will be
4669 * represented in the r_path as the magic name XATTR_RPATH. So if
4670 * we see that name in the pathname, is must be because this node
4671 * is an extended attribute. Therefore, look it up that way.
4674 failover_lookup(char *path
, vnode_t
*root
,
4675 int (*lookupproc
)(vnode_t
*, char *, vnode_t
**, struct pathname
*, int,
4676 vnode_t
*, cred_t
*, int),
4677 int (*xattrdirproc
)(vnode_t
*, vnode_t
**, bool_t
, cred_t
*, int),
4682 char *s
, *p
, *tmppath
;
4687 /* Make local copy of path */
4688 len
= strlen(path
) + 1;
4689 tmppath
= kmem_alloc(len
, KM_SLEEP
);
4690 (void) strcpy(tmppath
, path
);
4696 xattr
= mi
->mi_flags
& MI_EXTATTR
;
4702 if (xattr
&& strcmp(s
, XATTR_RPATH
) == 0) {
4703 error
= (*xattrdirproc
)(dvp
, &nvp
, FALSE
, CRED(),
4706 error
= (*lookupproc
)(dvp
, s
, &nvp
, NULL
, 0, NULL
,
4707 CRED(), RFSCALL_SOFT
);
4713 kmem_free(tmppath
, len
);
4719 } while (p
!= NULL
);
4721 if (nvp
!= NULL
&& new != NULL
)
4723 kmem_free(tmppath
, len
);
4728 * NFS client failover support
4730 * sv_free() frees the malloc'd portion of a "servinfo_t".
4733 sv_free(servinfo_t
*svp
)
4736 struct knetconfig
*knconf
;
4738 while (svp
!= NULL
) {
4739 next
= svp
->sv_next
;
4740 if (svp
->sv_secdata
)
4741 sec_clnt_freeinfo(svp
->sv_secdata
);
4742 if (svp
->sv_hostname
&& svp
->sv_hostnamelen
> 0)
4743 kmem_free(svp
->sv_hostname
, svp
->sv_hostnamelen
);
4744 knconf
= svp
->sv_knconf
;
4745 if (knconf
!= NULL
) {
4746 if (knconf
->knc_protofmly
!= NULL
)
4747 kmem_free(knconf
->knc_protofmly
, KNC_STRSIZE
);
4748 if (knconf
->knc_proto
!= NULL
)
4749 kmem_free(knconf
->knc_proto
, KNC_STRSIZE
);
4750 kmem_free(knconf
, sizeof (*knconf
));
4752 knconf
= svp
->sv_origknconf
;
4753 if (knconf
!= NULL
) {
4754 if (knconf
->knc_protofmly
!= NULL
)
4755 kmem_free(knconf
->knc_protofmly
, KNC_STRSIZE
);
4756 if (knconf
->knc_proto
!= NULL
)
4757 kmem_free(knconf
->knc_proto
, KNC_STRSIZE
);
4758 kmem_free(knconf
, sizeof (*knconf
));
4760 if (svp
->sv_addr
.buf
!= NULL
&& svp
->sv_addr
.maxlen
!= 0)
4761 kmem_free(svp
->sv_addr
.buf
, svp
->sv_addr
.maxlen
);
4762 mutex_destroy(&svp
->sv_lock
);
4763 kmem_free(svp
, sizeof (*svp
));
4769 * Only can return non-zero if intr != 0.
4772 nfs_rw_enter_sig(nfs_rwlock_t
*l
, krw_t rw
, int intr
)
4775 mutex_enter(&l
->lock
);
4778 * If this is a nested enter, then allow it. There
4779 * must be as many exits as enters through.
4781 if (l
->owner
== curthread
) {
4782 /* lock is held for writing by current thread */
4783 ASSERT(rw
== RW_READER
|| rw
== RW_WRITER
);
4785 } else if (rw
== RW_READER
) {
4787 * While there is a writer active or writers waiting,
4788 * then wait for them to finish up and move on. Then,
4789 * increment the count to indicate that a reader is
4792 while (l
->count
< 0 || l
->waiters
> 0) {
4794 klwp_t
*lwp
= ttolwp(curthread
);
4798 if (cv_wait_sig(&l
->cv_rd
, &l
->lock
) == 0) {
4801 mutex_exit(&l
->lock
);
4807 cv_wait(&l
->cv_rd
, &l
->lock
);
4809 ASSERT(l
->count
< INT_MAX
);
4811 if ((l
->count
% 10000) == 9999)
4812 cmn_err(CE_WARN
, "nfs_rw_enter_sig: count %d on"
4813 "rwlock @ %p\n", l
->count
, (void *)&l
);
4817 ASSERT(rw
== RW_WRITER
);
4819 * While there are readers active or a writer
4820 * active, then wait for all of the readers
4821 * to finish or for the writer to finish.
4822 * Then, set the owner field to curthread and
4823 * decrement count to indicate that a writer
4826 while (l
->count
!= 0) {
4829 klwp_t
*lwp
= ttolwp(curthread
);
4833 if (cv_wait_sig(&l
->cv
, &l
->lock
) == 0) {
4838 * If there are readers active and no
4839 * writers waiting then wake up all of
4840 * the waiting readers (if any).
4842 if (l
->count
> 0 && l
->waiters
== 0)
4843 cv_broadcast(&l
->cv_rd
);
4844 mutex_exit(&l
->lock
);
4850 cv_wait(&l
->cv
, &l
->lock
);
4853 ASSERT(l
->owner
== NULL
);
4854 l
->owner
= curthread
;
4858 mutex_exit(&l
->lock
);
4864 * If the lock is available, obtain it and return non-zero. If there is
4865 * already a conflicting lock, return 0 immediately.
4869 nfs_rw_tryenter(nfs_rwlock_t
*l
, krw_t rw
)
4871 mutex_enter(&l
->lock
);
4874 * If this is a nested enter, then allow it. There
4875 * must be as many exits as enters through.
4877 if (l
->owner
== curthread
) {
4878 /* lock is held for writing by current thread */
4879 ASSERT(rw
== RW_READER
|| rw
== RW_WRITER
);
4881 } else if (rw
== RW_READER
) {
4883 * If there is a writer active or writers waiting, deny the
4884 * lock. Otherwise, bump the count of readers.
4886 if (l
->count
< 0 || l
->waiters
> 0) {
4887 mutex_exit(&l
->lock
);
4892 ASSERT(rw
== RW_WRITER
);
4894 * If there are readers active or a writer active, deny the
4895 * lock. Otherwise, set the owner field to curthread and
4896 * decrement count to indicate that a writer is active.
4898 if (l
->count
!= 0) {
4899 mutex_exit(&l
->lock
);
4902 ASSERT(l
->owner
== NULL
);
4903 l
->owner
= curthread
;
4907 mutex_exit(&l
->lock
);
4913 nfs_rw_exit(nfs_rwlock_t
*l
)
4916 mutex_enter(&l
->lock
);
4918 if (l
->owner
!= NULL
) {
4919 ASSERT(l
->owner
== curthread
);
4922 * To release a writer lock increment count to indicate that
4923 * there is one less writer active. If this was the last of
4924 * possibly nested writer locks, then clear the owner field as
4925 * well to indicate that there is no writer active.
4927 ASSERT(l
->count
< 0);
4929 if (l
->count
== 0) {
4933 * If there are no writers waiting then wakeup all of
4934 * the waiting readers (if any).
4936 if (l
->waiters
== 0)
4937 cv_broadcast(&l
->cv_rd
);
4941 * To release a reader lock just decrement count to indicate
4942 * that there is one less reader active.
4944 ASSERT(l
->count
> 0);
4949 * If there are no readers active nor a writer active and there is a
4950 * writer waiting we need to wake up it.
4952 if (l
->count
== 0 && l
->waiters
> 0)
4954 mutex_exit(&l
->lock
);
4958 nfs_rw_lock_held(nfs_rwlock_t
*l
, krw_t rw
)
4961 if (rw
== RW_READER
)
4962 return (l
->count
> 0);
4963 ASSERT(rw
== RW_WRITER
);
4964 return (l
->count
< 0);
4969 nfs_rw_init(nfs_rwlock_t
*l
, char *name
, krw_type_t type
, void *arg
)
4975 mutex_init(&l
->lock
, NULL
, MUTEX_DEFAULT
, NULL
);
4976 cv_init(&l
->cv
, NULL
, CV_DEFAULT
, NULL
);
4977 cv_init(&l
->cv_rd
, NULL
, CV_DEFAULT
, NULL
);
4981 nfs_rw_destroy(nfs_rwlock_t
*l
)
4984 mutex_destroy(&l
->lock
);
4986 cv_destroy(&l
->cv_rd
);
4990 nfs3_rddir_compar(const void *x
, const void *y
)
4992 rddir_cache
*a
= (rddir_cache
*)x
;
4993 rddir_cache
*b
= (rddir_cache
*)y
;
4995 if (a
->nfs3_cookie
== b
->nfs3_cookie
) {
4996 if (a
->buflen
== b
->buflen
)
4998 if (a
->buflen
< b
->buflen
)
5003 if (a
->nfs3_cookie
< b
->nfs3_cookie
)
5010 nfs_rddir_compar(const void *x
, const void *y
)
5012 rddir_cache
*a
= (rddir_cache
*)x
;
5013 rddir_cache
*b
= (rddir_cache
*)y
;
5015 if (a
->nfs_cookie
== b
->nfs_cookie
) {
5016 if (a
->buflen
== b
->buflen
)
5018 if (a
->buflen
< b
->buflen
)
5023 if (a
->nfs_cookie
< b
->nfs_cookie
)
5030 nfs_getsrvnames(mntinfo_t
*mi
, size_t *len
)
5038 * Calculate the length of the string required to hold all
5039 * of the server names plus either a comma or a null
5040 * character following each individual one.
5043 for (s
= mi
->mi_servers
; s
!= NULL
; s
= s
->sv_next
)
5044 length
+= s
->sv_hostnamelen
;
5046 srvnames
= kmem_alloc(length
, KM_SLEEP
);
5049 for (s
= mi
->mi_servers
; s
!= NULL
; s
= s
->sv_next
) {
5050 (void) strcpy(namep
, s
->sv_hostname
);
5051 namep
+= s
->sv_hostnamelen
- 1;
5062 * These two functions are temporary and designed for the upgrade-workaround
5063 * only. They cannot be used for general zone-crossing NFS client support, and
5064 * will be removed shortly.
5066 * When the workaround is enabled, all NFS traffic is forced into the global
5067 * zone. These functions are called when the code needs to refer to the state
5068 * of the underlying network connection. They're not called when the function
5069 * needs to refer to the state of the process that invoked the system call.
5070 * (E.g., when checking whether the zone is shutting down during the mount()
5077 return (nfs_global_client_only
!= 0 ? global_zone
: curproc
->p_zone
);
5083 return (nfs_global_client_only
!= 0 ? GLOBAL_ZONEID
: getzoneid());
5090 mutex_enter(&curproc
->p_splock
);
5091 rv
= (curproc
->p_sessp
->s_vp
!= NULL
);
5092 mutex_exit(&curproc
->p_splock
);
5097 * See if xattr directory to see if it has any generic user attributes
5100 do_xattr_exists_check(vnode_t
*vp
, ulong_t
*valp
, cred_t
*cr
)
5106 size_t dlen
= 8 * 1024;
5112 dbuf
= kmem_alloc(dlen
, KM_SLEEP
);
5115 uio
.uio_segflg
= UIO_SYSSPACE
;
5117 uio
.uio_extflg
= UIO_COPY_CACHED
;
5118 uio
.uio_loffset
= 0;
5119 uio
.uio_resid
= dlen
;
5120 iov
.iov_base
= dbuf
;
5122 (void) fop_rwlock(vp
, V_WRITELOCK_FALSE
, NULL
);
5123 error
= fop_readdir(vp
, &uio
, cr
, &eof
, NULL
, 0);
5124 fop_rwunlock(vp
, V_WRITELOCK_FALSE
, NULL
);
5126 dbuflen
= dlen
- uio
.uio_resid
;
5128 if (error
|| dbuflen
== 0) {
5129 kmem_free(dbuf
, dlen
);
5133 dp
= (dirent_t
*)dbuf
;
5135 while ((intptr_t)dp
< (intptr_t)dbuf
+ dbuflen
) {
5136 if (strcmp(dp
->d_name
, ".") == 0 ||
5137 strcmp(dp
->d_name
, "..") == 0 || strcmp(dp
->d_name
,
5138 VIEW_READWRITE
) == 0 || strcmp(dp
->d_name
,
5139 VIEW_READONLY
) == 0) {
5140 dp
= (dirent_t
*)((intptr_t)dp
+ dp
->d_reclen
);
5147 kmem_free(dbuf
, dlen
);