/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
 */
#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/socket.h>
#include <sys/tiuser.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/kstat.h>
#include <sys/cmn_err.h>
#include <sys/vtrace.h>
#include <sys/session.h>
#include <sys/bitmap.h>
#include <sys/pathname.h>
#include <sys/flock.h>
#include <sys/dirent.h>
#include <sys/flock.h>
#include <sys/callb.h>
#include <sys/atomic.h>

#include <rpc/types.h>

#include <nfs/nfs_clnt.h>
#include <nfs/rnode.h>
#include <nfs/nfs_acl.h>
/*
 * The hash queues for the access to active and cached rnodes
 * are organized as doubly linked lists.  A reader/writer lock
 * for each hash bucket is used to control access and to synchronize
 * lookups, additions, and deletions from the hash queue.
 *
 * The rnode freelist is organized as a doubly linked list with
 * a head pointer.  Additions and deletions are synchronized via
 * a single mutex.
 *
 * In order to add an rnode to the free list, it must be hashed into
 * a hash queue and the exclusive lock to the hash queue be held.
 * If an rnode is not hashed into a hash queue, then it is destroyed
 * because it represents no valuable information that can be reused
 * about the file.  The exclusive lock to the hash queue must be
 * held in order to prevent a lookup in the hash queue from finding
 * the rnode and using it and assuming that the rnode is not on the
 * freelist.  The lookup in the hash queue will have the hash queue
 * locked, either exclusive or shared.
 *
 * The vnode reference count for each rnode is not allowed to drop
 * below 1.  This prevents external entities, such as the VM
 * subsystem, from acquiring references to vnodes already on the
 * freelist and then trying to place them back on the freelist
 * when their reference is released.  This means that when an
 * rnode is looked up in the hash queues, then either the rnode
 * is removed from the freelist and that reference is transferred to
 * the new reference or the vnode reference count must be incremented
 * accordingly.  The mutex for the freelist must be held in order to
 * accurately test to see if the rnode is on the freelist or not.
 * The hash queue lock might be held shared and it is possible that
 * two different threads may race to remove the rnode from the
 * freelist.  This race can be resolved by holding the mutex for the
 * freelist.  Please note that the mutex for the freelist does not
 * need to be held if the rnode is not on the freelist.  It cannot be
 * placed on the freelist due to the requirement that the thread
 * putting the rnode on the freelist must hold the exclusive lock
 * to the hash queue and the thread doing the lookup in the hash
 * queue is holding either a shared or exclusive lock to the hash
 * queue.
 *
 * The lock ordering is:
 *
 *	hash bucket lock -> vnode lock
 *	hash bucket lock -> freelist lock
 */
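/*
 * Illustrative sketch (not part of the original source): a lookup that
 * finds an rnode in a hash bucket takes the freelist mutex before testing
 * the freelist linkage, so that two racing lookups cannot both try to pull
 * the same rnode off the freelist; if the rnode is on the freelist the
 * freelist's hold is transferred, otherwise a new vnode hold is added:
 *
 *	rw_enter(&bucket->r_lock, RW_READER);
 *	if ((rp = rfind(bucket, fh, vfsp)) != NULL) {
 *		mutex_enter(&rpfreelist_lock);
 *		if (rp->r_freef != NULL)
 *			rp_rmfree(rp);
 *		else
 *			VN_HOLD(RTOV(rp));
 *		mutex_exit(&rpfreelist_lock);
 *	}
 *	rw_exit(&bucket->r_lock);
 */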
static rhashq_t *rtable;

static kmutex_t rpfreelist_lock;
static rnode_t *rpfreelist = NULL;
static long rnew = 0;

static int rtablesize;
static int rtablemask;

static int hashlen = 4;

static struct kmem_cache *rnode_cache;

/*
 * Mutex to protect the following variables:
 *	nfs_major
 *	nfs_minor
 */
kmutex_t nfs_minor_lock;
/* Do we allow preepoch (negative) time values otw? */
bool_t nfs_allow_preepoch_time = FALSE;	/* default: do not allow preepoch */

static acache_hash_t *acache;
static long nacache;	/* used strictly to size the number of hash queues */

static int acachesize;
static int acachemask;
static struct kmem_cache *acache_cache;
/*
 * Client side utilities
 */

/*
 * client side statistics
 */
static const struct clstat clstat_tmpl = {
	{ "calls",	KSTAT_DATA_UINT64 },
	{ "badcalls",	KSTAT_DATA_UINT64 },
	{ "clgets",	KSTAT_DATA_UINT64 },
	{ "cltoomany",	KSTAT_DATA_UINT64 },
#ifdef DEBUG
	{ "clalloc",	KSTAT_DATA_UINT64 },
	{ "noresponse",	KSTAT_DATA_UINT64 },
	{ "failover",	KSTAT_DATA_UINT64 },
	{ "remap",	KSTAT_DATA_UINT64 },
#endif
};
/*
 * The following are statistics that describe behavior of the system as a whole
 * and don't correspond to any one particular zone.
 */
static struct clstat_debug {
	kstat_named_t	nrnode;		/* number of allocated rnodes */
	kstat_named_t	access;		/* size of access cache */
	kstat_named_t	dirent;		/* size of readdir cache */
	kstat_named_t	dirents;	/* size of readdir buf cache */
	kstat_named_t	reclaim;	/* number of reclaims */
	kstat_named_t	clreclaim;	/* number of cl reclaims */
	kstat_named_t	f_reclaim;	/* number of free reclaims */
	kstat_named_t	a_reclaim;	/* number of active reclaims */
	kstat_named_t	r_reclaim;	/* number of rnode reclaims */
	kstat_named_t	rpath;		/* bytes used to store rpaths */
} clstat_debug = {
	{ "nrnode",	KSTAT_DATA_UINT64 },
	{ "access",	KSTAT_DATA_UINT64 },
	{ "dirent",	KSTAT_DATA_UINT64 },
	{ "dirents",	KSTAT_DATA_UINT64 },
	{ "reclaim",	KSTAT_DATA_UINT64 },
	{ "clreclaim",	KSTAT_DATA_UINT64 },
	{ "f_reclaim",	KSTAT_DATA_UINT64 },
	{ "a_reclaim",	KSTAT_DATA_UINT64 },
	{ "r_reclaim",	KSTAT_DATA_UINT64 },
	{ "r_path",	KSTAT_DATA_UINT64 },
};
/*
 * We keep a global list of per-zone client data, so we can clean up all zones
 * if we get low on memory.
 */
static list_t nfs_clnt_list;
static kmutex_t nfs_clnt_list_lock;
static zone_key_t nfsclnt_zone_key;

static struct kmem_cache *chtab_cache;

/*
 * Some servers do not properly update the attributes of the
 * directory when changes are made.  To allow interoperability
 * with these broken servers, the nfs_disable_rddir_cache
 * parameter must be set in /etc/system
 */
int nfs_disable_rddir_cache = 0;
int		clget(clinfo_t *, servinfo_t *, cred_t *, CLIENT **,
		    struct chtab **);
void		clfree(CLIENT *, struct chtab *);
static int	acl_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
		    struct chtab **, struct nfs_clnt *);
static int	nfs_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
		    struct chtab **, struct nfs_clnt *);
static void	clreclaim(void *);
static int	nfs_feedback(int, int, mntinfo_t *);
static int	rfscall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
		    caddr_t, cred_t *, int *, enum clnt_stat *, int,
		    failinfo_t *);
static int	aclcall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
		    caddr_t, cred_t *, int *, int, failinfo_t *);
static void	rinactive(rnode_t *, cred_t *);
static int	rtablehash(nfs_fhandle *);
static vnode_t	*make_rnode(nfs_fhandle *, rhashq_t *, struct vfs *,
		    const struct vnodeops *,
		    int (*)(vnode_t *, page_t *, uoff_t *, size_t *, int,
			cred_t *),
		    int (*)(const void *, const void *), int *, cred_t *,
		    char *, char *);
static void	rp_rmfree(rnode_t *);
static void	rp_addhash(rnode_t *);
static void	rp_rmhash_locked(rnode_t *);
static rnode_t	*rfind(rhashq_t *, nfs_fhandle *, struct vfs *);
static void	destroy_rnode(rnode_t *);
static void	rddir_cache_free(rddir_cache *);
static int	nfs_free_data_reclaim(rnode_t *);
static int	nfs_active_data_reclaim(rnode_t *);
static int	nfs_free_reclaim(void);
static int	nfs_active_reclaim(void);
static int	nfs_rnode_reclaim(void);
static void	nfs_reclaim(void *);
static int	failover_safe(failinfo_t *);
static void	failover_newserver(mntinfo_t *mi);
static void	failover_thread(mntinfo_t *mi);
static int	failover_wait(mntinfo_t *);
static int	failover_remap(failinfo_t *);
static int	failover_lookup(char *, vnode_t *,
		    int (*)(vnode_t *, char *, vnode_t **,
			struct pathname *, int, vnode_t *, cred_t *, int),
		    int (*)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
		    vnode_t **);
static void	nfs_free_r_path(rnode_t *);
static void	nfs_set_vroot(vnode_t *);
static char	*nfs_getsrvnames(mntinfo_t *, size_t *);

/*
 * from rpcsec module (common/rpcsec)
 */
extern int	sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **);
extern void	sec_clnt_freeh(AUTH *);
extern void	sec_clnt_freeinfo(struct sec_data *);
/*
 * EIO or EINTR are not recoverable errors.
 */
#define	IS_RECOVERABLE_ERROR(error)	!((error == EINTR) || (error == EIO))

#ifdef DEBUG
#define	SRV_QFULL_MSG	"send queue to NFS%d server %s is full; still trying\n"
#define	SRV_NOTRESP_MSG	"NFS%d server %s not responding still trying\n"
#else
#define	SRV_QFULL_MSG	"send queue to NFS server %s is full still trying\n"
#define	SRV_NOTRESP_MSG	"NFS server %s not responding still trying\n"
#endif
/*
 * Common handle get program for NFS, NFS ACL, and NFS AUTH client.
 */
static int
clget_impl(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
    struct chtab **chp, struct nfs_clnt *nfscl)
{
	struct chhead *ch, *newch;
	struct chhead **plistp;
	struct chtab *cp;
	int error;
	k_sigset_t smask;

	if (newcl == NULL || chp == NULL || ci == NULL)
		return (EINVAL);

	*newcl = NULL;
	*chp = NULL;

	/*
	 * Find an unused handle or create one
	 */
	newch = NULL;
	nfscl->nfscl_stat.clgets.value.ui64++;
top:
	/*
	 * Find the correct entry in the cache to check for free
	 * client handles.  The search is based on the RPC program
	 * number, program version number, dev_t for the transport
	 * device, and the protocol family.
	 */
	mutex_enter(&nfscl->nfscl_chtable_lock);
	plistp = &nfscl->nfscl_chtable;
	for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
		if (ch->ch_prog == ci->cl_prog &&
		    ch->ch_vers == ci->cl_vers &&
		    ch->ch_dev == svp->sv_knconf->knc_rdev &&
		    (strcmp(ch->ch_protofmly,
		    svp->sv_knconf->knc_protofmly) == 0))
			break;
		plistp = &ch->ch_next;
	}

	/*
	 * If we didn't find a cache entry for this quadruple, then
	 * create one.  If we don't have one already preallocated,
	 * then drop the cache lock, create one, and then start over.
	 * If we did have a preallocated entry, then just add it to
	 * the front of the list.
	 */
	if (ch == NULL) {
		if (newch == NULL) {
			mutex_exit(&nfscl->nfscl_chtable_lock);
			newch = kmem_alloc(sizeof (*newch), KM_SLEEP);
			newch->ch_timesused = 0;
			newch->ch_prog = ci->cl_prog;
			newch->ch_vers = ci->cl_vers;
			newch->ch_dev = svp->sv_knconf->knc_rdev;
			newch->ch_protofmly = kmem_alloc(
			    strlen(svp->sv_knconf->knc_protofmly) + 1,
			    KM_SLEEP);
			(void) strcpy(newch->ch_protofmly,
			    svp->sv_knconf->knc_protofmly);
			newch->ch_list = NULL;
			goto top;
		}
		ch = newch;
		newch = NULL;
		ch->ch_next = nfscl->nfscl_chtable;
		nfscl->nfscl_chtable = ch;
	/*
	 * We found a cache entry, but if it isn't on the front of the
	 * list, then move it to the front of the list to try to take
	 * advantage of locality of operations.
	 */
	} else if (ch != nfscl->nfscl_chtable) {
		*plistp = ch->ch_next;
		ch->ch_next = nfscl->nfscl_chtable;
		nfscl->nfscl_chtable = ch;
	}

	/*
	 * If there was a free client handle cached, then remove it
	 * from the list, init it, and use it.
	 */
	if (ch->ch_list != NULL) {
		cp = ch->ch_list;
		ch->ch_list = cp->ch_list;
		mutex_exit(&nfscl->nfscl_chtable_lock);
		if (newch != NULL) {
			kmem_free(newch->ch_protofmly,
			    strlen(newch->ch_protofmly) + 1);
			kmem_free(newch, sizeof (*newch));
		}
		(void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf,
		    &svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr);
		error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
		    &cp->ch_client->cl_auth);
		if (error || cp->ch_client->cl_auth == NULL) {
			CLNT_DESTROY(cp->ch_client);
			kmem_cache_free(chtab_cache, cp);
			return ((error != 0) ? error : EINTR);
		}
		ch->ch_timesused++;
		*newcl = cp->ch_client;
		*chp = cp;
		return (0);
	}

	/*
	 * There weren't any free client handles which fit, so allocate
	 * a new one and use that.
	 */
#ifdef DEBUG
	atomic_inc_64(&nfscl->nfscl_stat.clalloc.value.ui64);
#endif
	mutex_exit(&nfscl->nfscl_chtable_lock);

	nfscl->nfscl_stat.cltoomany.value.ui64++;
	if (newch != NULL) {
		kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1);
		kmem_free(newch, sizeof (*newch));
	}

	cp = kmem_cache_alloc(chtab_cache, KM_SLEEP);
	cp->ch_head = ch;

	sigintr(&smask, (int)ci->cl_flags & MI_INT);
	error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog,
	    ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client);
	sigunintr(&smask);

	if (error != 0) {
		kmem_cache_free(chtab_cache, cp);
#ifdef DEBUG
		atomic_dec_64(&nfscl->nfscl_stat.clalloc.value.ui64);
#endif
		/*
		 * Warning is unnecessary if error is EINTR.
		 */
		if (error != EINTR) {
			nfs_cmn_err(error, CE_WARN,
			    "clget: couldn't create handle: %m\n");
		}
		return (error);
	}

	(void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL);
	auth_destroy(cp->ch_client->cl_auth);
	error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
	    &cp->ch_client->cl_auth);
	if (error || cp->ch_client->cl_auth == NULL) {
		CLNT_DESTROY(cp->ch_client);
		kmem_cache_free(chtab_cache, cp);
#ifdef DEBUG
		atomic_dec_64(&nfscl->nfscl_stat.clalloc.value.ui64);
#endif
		return ((error != 0) ? error : EINTR);
	}

	ch->ch_timesused++;
	*newcl = cp->ch_client;
	ASSERT(cp->ch_client->cl_nosignal == FALSE);
	*chp = cp;
	return (0);
}
int
clget(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
    struct chtab **chp)
{
	struct nfs_clnt *nfscl;

	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
	ASSERT(nfscl != NULL);

	return (clget_impl(ci, svp, cr, newcl, chp, nfscl));
}
static int
acl_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
    struct chtab **chp, struct nfs_clnt *nfscl)
{
	clinfo_t ci;
	int error;

	/*
	 * Set read buffer size to rsize
	 * and add room for RPC headers.
	 */
	ci.cl_readsize = mi->mi_tsize;
	if (ci.cl_readsize != 0)
		ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);

	/*
	 * If soft mount and server is down just try once.
	 * meaning: do not retransmit.
	 */
	if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
		ci.cl_retrans = 0;
	else
		ci.cl_retrans = mi->mi_retrans;

	ci.cl_prog = NFS_ACL_PROGRAM;
	ci.cl_vers = mi->mi_vers;
	ci.cl_flags = mi->mi_flags;

	/*
	 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
	 * security flavor, the client tries to establish a security context
	 * by contacting the server. If the connection is timed out or reset,
	 * e.g. server reboot, we will try again.
	 */
	do {
		error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);

		if (error == 0)
			break;

		/*
		 * For forced unmount or zone shutdown, bail out, no retry.
		 */
		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
			error = EIO;
			break;
		}

		/* do not retry for softmount */
		if (!(mi->mi_flags & MI_HARD))
			break;

		/* let the caller deal with the failover case */
		if (FAILOVER_MOUNT(mi))
			break;

	} while (error == ETIMEDOUT || error == ECONNRESET);

	return (error);
}
static int
nfs_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
    struct chtab **chp, struct nfs_clnt *nfscl)
{
	clinfo_t ci;
	int error;

	/*
	 * Set read buffer size to rsize
	 * and add room for RPC headers.
	 */
	ci.cl_readsize = mi->mi_tsize;
	if (ci.cl_readsize != 0)
		ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);

	/*
	 * If soft mount and server is down just try once.
	 * meaning: do not retransmit.
	 */
	if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
		ci.cl_retrans = 0;
	else
		ci.cl_retrans = mi->mi_retrans;

	ci.cl_prog = mi->mi_prog;
	ci.cl_vers = mi->mi_vers;
	ci.cl_flags = mi->mi_flags;

	/*
	 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
	 * security flavor, the client tries to establish a security context
	 * by contacting the server. If the connection is timed out or reset,
	 * e.g. server reboot, we will try again.
	 */
	do {
		error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);

		if (error == 0)
			break;

		/*
		 * For forced unmount or zone shutdown, bail out, no retry.
		 */
		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
			error = EIO;
			break;
		}

		/* do not retry for softmount */
		if (!(mi->mi_flags & MI_HARD))
			break;

		/* let the caller deal with the failover case */
		if (FAILOVER_MOUNT(mi))
			break;

	} while (error == ETIMEDOUT || error == ECONNRESET);

	return (error);
}
void
clfree_impl(CLIENT *cl, struct chtab *cp, struct nfs_clnt *nfscl)
{
	if (cl->cl_auth != NULL) {
		sec_clnt_freeh(cl->cl_auth);
		cl->cl_auth = NULL;
	}

	/*
	 * Timestamp this cache entry so that we know when it was last
	 * used.
	 */
	cp->ch_freed = gethrestime_sec();

	/*
	 * Add the free client handle to the front of the list.
	 * This way, the list will be sorted in youngest to oldest
	 * order.
	 */
	mutex_enter(&nfscl->nfscl_chtable_lock);
	cp->ch_list = cp->ch_head->ch_list;
	cp->ch_head->ch_list = cp;
	mutex_exit(&nfscl->nfscl_chtable_lock);
}

void
clfree(CLIENT *cl, struct chtab *cp)
{
	struct nfs_clnt *nfscl;

	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
	ASSERT(nfscl != NULL);

	clfree_impl(cl, cp, nfscl);
}
#define	CL_HOLDTIME	60	/* time to hold client handles */

static void
clreclaim_zone(struct nfs_clnt *nfscl, uint_t cl_holdtime)
{
	struct chhead *ch;
	struct chtab *cp;	/* list of objects that can be reclaimed */
	struct chtab *cpe;
	struct chtab *cpl;
	struct chtab **cpp;
#ifdef DEBUG
	int n = 0;
#endif

	/*
	 * Need to reclaim some memory, so step through the cache
	 * looking through the lists for entries which can be freed.
	 */
	cp = NULL;

	mutex_enter(&nfscl->nfscl_chtable_lock);

	/*
	 * Here we step through each non-NULL quadruple and start to
	 * construct the reclaim list pointed to by cp.  Note that
	 * cp will contain all eligible chtab entries.  When this traversal
	 * completes, chtab entries from the last quadruple will be at the
	 * front of cp and entries from previously inspected quadruples have
	 * been appended to the rear of cp.
	 */
	for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
		if (ch->ch_list == NULL)
			continue;
		/*
		 * Search each list for entries older then
		 * cl_holdtime seconds.  The lists are maintained
		 * in youngest to oldest order so that when the
		 * first entry is found which is old enough, then
		 * all of the rest of the entries on the list will
		 * be old enough as well.
		 */
		cpl = ch->ch_list;
		cpp = &ch->ch_list;
		while (cpl != NULL &&
		    cpl->ch_freed + cl_holdtime > gethrestime_sec()) {
			cpp = &cpl->ch_list;
			cpl = cpl->ch_list;
		}
		if (cpl != NULL) {
			*cpp = NULL;
			if (cp != NULL) {
				cpe = cpl;
				while (cpe->ch_list != NULL)
					cpe = cpe->ch_list;
				cpe->ch_list = cp;
			}
			cp = cpl;
		}
	}

	mutex_exit(&nfscl->nfscl_chtable_lock);

	/*
	 * If cp is empty, then there is nothing to reclaim here.
	 */
	if (cp == NULL)
		return;

	/*
	 * Step through the list of entries to free, destroying each client
	 * handle and kmem_free'ing the memory for each entry.
	 */
	while (cp != NULL) {
#ifdef DEBUG
		n++;
#endif
		CLNT_DESTROY(cp->ch_client);
		cpl = cp->ch_list;
		kmem_cache_free(chtab_cache, cp);
		cp = cpl;
	}

#ifdef DEBUG
	/*
	 * Update clalloc so that nfsstat shows the current number
	 * of allocated client handles.
	 */
	atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n);
#endif
}

/* ARGSUSED */
static void
clreclaim(void *all)
{
	struct nfs_clnt *nfscl;

#ifdef DEBUG
	clstat_debug.clreclaim.value.ui64++;
#endif
	/*
	 * The system is low on memory; go through and try to reclaim some from
	 * every zone on the system.
	 */
	mutex_enter(&nfs_clnt_list_lock);
	nfscl = list_head(&nfs_clnt_list);
	for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl))
		clreclaim_zone(nfscl, CL_HOLDTIME);
	mutex_exit(&nfs_clnt_list_lock);
}
/*
 * Minimum time-out values indexed by call type
 * These units are in "eights" of a second to avoid multiplies
 */
static unsigned int minimum_timeo[] = {
	6, 7, 10
};
/*
 * Back off for retransmission timeout, MAXTIMO is in hz of a sec
 */
#define	MAXTIMO	(20*hz)
#define	backoff(tim)	(((tim) < MAXTIMO) ? dobackoff(tim) : (tim))
#define	dobackoff(tim)	((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1))
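/*
 * Worked example (not part of the original source): with hz == 100,
 * MAXTIMO is 2000 ticks (20 seconds).  A retransmit timeout that starts
 * at 125 ticks is doubled by each backoff() call, 125 -> 250 -> 500 ->
 * 1000 -> 2000, and is then held at MAXTIMO on subsequent calls.
 */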
#define	MIN_NFS_TSIZE	512	/* minimum "chunk" of NFS IO */
#define	REDUCE_NFS_TIME	(hz/2)	/* rtxcur we try to keep under */
#define	INCREASE_NFS_TIME	(hz/3*8) /* srtt we try to keep under (scaled*8) */

/*
 * Function called when rfscall notices that we have been
 * re-transmitting, or when we get a response without retransmissions.
 * Return 1 if the transfer size was adjusted down - 0 if no change.
 */
static int
nfs_feedback(int flag, int which, mntinfo_t *mi)
{
	int kind;
	int r = 0;

	mutex_enter(&mi->mi_lock);
	if (flag == FEEDBACK_REXMIT1) {
		if (mi->mi_timers[NFS_CALLTYPES].rt_rtxcur != 0 &&
		    mi->mi_timers[NFS_CALLTYPES].rt_rtxcur < REDUCE_NFS_TIME)
			goto done;
		if (mi->mi_curread > MIN_NFS_TSIZE) {
			mi->mi_curread /= 2;
			if (mi->mi_curread < MIN_NFS_TSIZE)
				mi->mi_curread = MIN_NFS_TSIZE;
			r = 1;
		}

		if (mi->mi_curwrite > MIN_NFS_TSIZE) {
			mi->mi_curwrite /= 2;
			if (mi->mi_curwrite < MIN_NFS_TSIZE)
				mi->mi_curwrite = MIN_NFS_TSIZE;
			r = 1;
		}
	} else if (flag == FEEDBACK_OK) {
		kind = mi->mi_timer_type[which];
		if (kind == 0 ||
		    mi->mi_timers[kind].rt_srtt >= INCREASE_NFS_TIME)
			goto done;
		if (kind == 1) {
			if (mi->mi_curread >= mi->mi_tsize)
				goto done;
			mi->mi_curread += MIN_NFS_TSIZE;
			if (mi->mi_curread > mi->mi_tsize/2)
				mi->mi_curread = mi->mi_tsize;
		} else if (kind == 2) {
			if (mi->mi_curwrite >= mi->mi_stsize)
				goto done;
			mi->mi_curwrite += MIN_NFS_TSIZE;
			if (mi->mi_curwrite > mi->mi_stsize/2)
				mi->mi_curwrite = mi->mi_stsize;
		}
	}
done:
	mutex_exit(&mi->mi_lock);
	return (r);
}
#ifdef DEBUG
static int rfs2call_hits = 0;
static int rfs2call_misses = 0;
#endif

int
rfs2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
    xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
    enum nfsstat *statusp, int flags, failinfo_t *fi)
{
	int rpcerror;
	enum clnt_stat rpc_status;

	ASSERT(statusp != NULL);

	rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
	    cr, douprintf, &rpc_status, flags, fi);
	if (!rpcerror) {
		/*
		 * See crnetadjust() for comments.
		 */
		if (*statusp == NFSERR_ACCES &&
		    (cr = crnetadjust(cr)) != NULL) {
#ifdef DEBUG
			rfs2call_hits++;
#endif
			rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres,
			    resp, cr, douprintf, NULL, flags, fi);
			crfree(cr);
#ifdef DEBUG
			if (*statusp == NFSERR_ACCES)
				rfs2call_misses++;
#endif
		}
	} else if (rpc_status == RPC_PROCUNAVAIL) {
		*statusp = NFSERR_OPNOTSUPP;
		rpcerror = 0;
	}

	return (rpcerror);
}
#define	NFS3_JUKEBOX_DELAY	10 * hz

static clock_t nfs3_jukebox_delay = 0;

#ifdef DEBUG
static int rfs3call_hits = 0;
static int rfs3call_misses = 0;
#endif

int
rfs3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
    xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
    nfsstat3 *statusp, int flags, failinfo_t *fi)
{
	int rpcerror;
	int user_informed;
	cred_t *crr;

	user_informed = 0;
	do {
		rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
		    cr, douprintf, NULL, flags, fi);
		if (!rpcerror) {
			if (*statusp == NFS3ERR_JUKEBOX) {
				if (ttoproc(curthread) == &p0) {
					rpcerror = EAGAIN;
					break;
				}
				if (!user_informed) {
					user_informed = 1;
					uprintf(
		"file temporarily unavailable on the server, retrying...\n");
				}
				delay(nfs3_jukebox_delay);
			}
			/*
			 * See crnetadjust() for comments.
			 */
			else if (*statusp == NFS3ERR_ACCES &&
			    (crr = crnetadjust(cr)) != NULL) {
#ifdef DEBUG
				rfs3call_hits++;
#endif
				rpcerror = rfscall(mi, which, xdrargs, argsp,
				    xdrres, resp, crr, douprintf,
				    NULL, flags, fi);

				crfree(crr);
#ifdef DEBUG
				if (*statusp == NFS3ERR_ACCES)
					rfs3call_misses++;
#endif
			}
		}
	} while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);

	return (rpcerror);
}
#define	VALID_FH(fi)	(VTOR(fi->vp)->r_server == VTOMI(fi->vp)->mi_curr_serv)
#define	INC_READERS(mi)		{ \
	mi->mi_readers++; \
}
#define	DEC_READERS(mi)		{ \
	mi->mi_readers--; \
	if (mi->mi_readers == 0) \
		cv_broadcast(&mi->mi_failover_cv); \
}
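/*
 * Illustrative sketch (not part of the original source): rfscall() and
 * aclcall() bracket their use of the current server with these macros,
 * which assume mi_lock is already held; the effect is a poor man's
 * interruptible reader lock on mi_curr_serv:
 *
 *	mutex_enter(&mi->mi_lock);
 *	INC_READERS(mi);
 *	mutex_exit(&mi->mi_lock);
 *	... issue the RPC against mi->mi_curr_serv ...
 *	mutex_enter(&mi->mi_lock);
 *	DEC_READERS(mi);
 *	mutex_exit(&mi->mi_lock);
 */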
927 rfscall(mntinfo_t
*mi
, rpcproc_t which
, xdrproc_t xdrargs
, caddr_t argsp
,
928 xdrproc_t xdrres
, caddr_t resp
, cred_t
*icr
, int *douprintf
,
929 enum clnt_stat
*rpc_status
, int flags
, failinfo_t
*fi
)
934 enum clnt_stat status
;
935 struct rpc_err rpcerr
, rpcerr_tmp
;
937 int timeo
; /* in units of hz */
938 int my_rsize
, my_wsize
;
940 bool_t cred_cloned
= FALSE
;
943 struct nfs_clnt
*nfscl
;
944 zoneid_t zoneid
= getzoneid();
951 TRACE_2(TR_FAC_NFS
, TR_RFSCALL_START
,
952 "rfscall_start:which %d mi %p", which
, mi
);
954 nfscl
= zone_getspecific(nfsclnt_zone_key
, nfs_zone());
955 ASSERT(nfscl
!= NULL
);
957 nfscl
->nfscl_stat
.calls
.value
.ui64
++;
958 mi
->mi_reqs
[which
].value
.ui64
++;
960 rpcerr
.re_status
= RPC_SUCCESS
;
963 * In case of forced unmount or zone shutdown, return EIO.
966 if (FS_OR_ZONE_GONE(mi
->mi_vfsp
)) {
967 rpcerr
.re_status
= RPC_FAILED
;
968 rpcerr
.re_errno
= EIO
;
969 return (rpcerr
.re_errno
);
973 * Remember the transfer sizes in case
974 * nfs_feedback changes them underneath us.
976 my_rsize
= mi
->mi_curread
;
977 my_wsize
= mi
->mi_curwrite
;
980 * NFS client failover support
982 * If this rnode is not in sync with the current server (VALID_FH),
983 * we'd like to do a remap to get in sync. We can be interrupted
984 * in failover_remap(), and if so we'll bail. Otherwise, we'll
985 * use the best info we have to try the RPC. Part of that is
986 * unconditionally updating the filehandle copy kept for V3.
988 * Locking: INC_READERS/DEC_READERS is a poor man's interrruptible
989 * rw_enter(); we're trying to keep the current server from being
990 * changed on us until we're done with the remapping and have a
991 * matching client handle. We don't want to sending a filehandle
995 if (FAILOVER_MOUNT(mi
)) {
996 mutex_enter(&mi
->mi_lock
);
997 if (!(flags
& RFSCALL_SOFT
) && failover_safe(fi
)) {
998 if (failover_wait(mi
)) {
999 mutex_exit(&mi
->mi_lock
);
1004 mutex_exit(&mi
->mi_lock
);
1006 if (!VALID_FH(fi
) &&
1007 !(flags
& RFSCALL_SOFT
) && failover_safe(fi
)) {
1010 svp
= mi
->mi_curr_serv
;
1011 remaperr
= failover_remap(fi
);
1012 if (remaperr
!= 0) {
1014 if (remaperr
!= EINTR
)
1015 nfs_cmn_err(remaperr
, CE_WARN
,
1016 "rfscall couldn't failover: %m");
1018 mutex_enter(&mi
->mi_lock
);
1020 mutex_exit(&mi
->mi_lock
);
1022 * If failover_remap returns ETIMEDOUT
1023 * and the filesystem is hard mounted
1024 * we have to retry the call with a new
1027 if ((mi
->mi_flags
& MI_HARD
) &&
1028 IS_RECOVERABLE_ERROR(remaperr
)) {
1029 if (svp
== mi
->mi_curr_serv
)
1030 failover_newserver(mi
);
1031 rpcerr
.re_status
= RPC_SUCCESS
;
1034 rpcerr
.re_errno
= remaperr
;
1038 if (fi
->fhp
&& fi
->copyproc
)
1039 (*fi
->copyproc
)(fi
->fhp
, fi
->vp
);
1044 * clget() calls clnt_tli_kinit() which clears the xid, so we
1045 * are guaranteed to reprocess the retry as a new request.
1047 svp
= mi
->mi_curr_serv
;
1048 rpcerr
.re_errno
= nfs_clget(mi
, svp
, cr
, &client
, &ch
, nfscl
);
1050 if (FAILOVER_MOUNT(mi
)) {
1051 mutex_enter(&mi
->mi_lock
);
1053 mutex_exit(&mi
->mi_lock
);
1055 if ((rpcerr
.re_errno
== ETIMEDOUT
||
1056 rpcerr
.re_errno
== ECONNRESET
) &&
1057 failover_safe(fi
)) {
1058 if (svp
== mi
->mi_curr_serv
)
1059 failover_newserver(mi
);
1063 if (rpcerr
.re_errno
!= 0)
1064 return (rpcerr
.re_errno
);
1066 if (svp
->sv_knconf
->knc_semantics
== NC_TPI_COTS_ORD
||
1067 svp
->sv_knconf
->knc_semantics
== NC_TPI_COTS
) {
1068 timeo
= (mi
->mi_timeo
* hz
) / 10;
1070 mutex_enter(&mi
->mi_lock
);
1071 timeo
= CLNT_SETTIMERS(client
,
1072 &(mi
->mi_timers
[mi
->mi_timer_type
[which
]]),
1073 &(mi
->mi_timers
[NFS_CALLTYPES
]),
1074 (minimum_timeo
[mi
->mi_call_type
[which
]]*hz
)>>3,
1075 (void (*)())NULL
, (caddr_t
)mi
, 0);
1076 mutex_exit(&mi
->mi_lock
);
1080 * If hard mounted fs, retry call forever unless hard error occurs.
1085 if (FS_OR_ZONE_GONE(mi
->mi_vfsp
)) {
1086 status
= RPC_FAILED
;
1087 rpcerr
.re_status
= RPC_FAILED
;
1088 rpcerr
.re_errno
= EIO
;
1092 TICK_TO_TIMEVAL(timeo
, &wait
);
1095 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1096 * and SIGTERM. (Preserving the existing masks).
1097 * Mask out SIGINT if mount option nointr is specified.
1099 sigintr(&smask
, (int)mi
->mi_flags
& MI_INT
);
1100 if (!(mi
->mi_flags
& MI_INT
))
1101 client
->cl_nosignal
= TRUE
;
1104 * If there is a current signal, then don't bother
1105 * even trying to send out the request because we
1106 * won't be able to block waiting for the response.
1107 * Simply assume RPC_INTR and get on with it.
1109 if (ttolwp(curthread
) != NULL
&& ISSIG(curthread
, JUSTLOOKING
))
1112 status
= CLNT_CALL(client
, which
, xdrargs
, argsp
,
1113 xdrres
, resp
, wait
);
1116 if (!(mi
->mi_flags
& MI_INT
))
1117 client
->cl_nosignal
= FALSE
;
1119 * restore original signal mask
1125 if ((mi
->mi_flags
& MI_DYNAMIC
) &&
1126 mi
->mi_timer_type
[which
] != 0 &&
1127 (mi
->mi_curread
!= my_rsize
||
1128 mi
->mi_curwrite
!= my_wsize
))
1129 (void) nfs_feedback(FEEDBACK_OK
, which
, mi
);
1134 * There is no way to recover from this error,
1135 * even if mount option nointr is specified.
1136 * SIGKILL, for example, cannot be blocked.
1138 rpcerr
.re_status
= RPC_INTR
;
1139 rpcerr
.re_errno
= EINTR
;
1144 * If the NFS server is local (vold) and
1145 * it goes away then we get RPC_UDERROR.
1146 * This is a retryable error, so we would
1147 * loop, so check to see if the specific
1148 * error was ECONNRESET, indicating that
1149 * target did not exist at all. If so,
1150 * return with RPC_PROGUNAVAIL and
1151 * ECONNRESET to indicate why.
1153 CLNT_GETERR(client
, &rpcerr
);
1154 if (rpcerr
.re_errno
== ECONNRESET
) {
1155 rpcerr
.re_status
= RPC_PROGUNAVAIL
;
1156 rpcerr
.re_errno
= ECONNRESET
;
1161 default: /* probably RPC_TIMEDOUT */
1162 if (IS_UNRECOVERABLE_RPC(status
))
1166 * increment server not responding count
1168 mutex_enter(&mi
->mi_lock
);
1169 mi
->mi_noresponse
++;
1170 mutex_exit(&mi
->mi_lock
);
1172 nfscl
->nfscl_stat
.noresponse
.value
.ui64
++;
1175 if (!(mi
->mi_flags
& MI_HARD
)) {
1176 if (!(mi
->mi_flags
& MI_SEMISOFT
) ||
1177 (mi
->mi_ss_call_type
[which
] == 0))
1182 * The call is in progress (over COTS).
1183 * Try the CLNT_CALL again, but don't
1184 * print a noisy error message.
1186 if (status
== RPC_INPROGRESS
) {
1191 if (flags
& RFSCALL_SOFT
)
1195 * On zone shutdown, just move on.
1197 if (zone_status_get(curproc
->p_zone
) >=
1198 ZONE_IS_SHUTTING_DOWN
) {
1199 rpcerr
.re_status
= RPC_FAILED
;
1200 rpcerr
.re_errno
= EIO
;
1205 * NFS client failover support
1207 * If the current server just failed us, we'll
1208 * start the process of finding a new server.
1209 * After that, we can just retry.
1211 if (FAILOVER_MOUNT(mi
) && failover_safe(fi
)) {
1212 if (svp
== mi
->mi_curr_serv
)
1213 failover_newserver(mi
);
1214 clfree_impl(client
, ch
, nfscl
);
1219 timeo
= backoff(timeo
);
1221 CLNT_GETERR(client
, &rpcerr_tmp
);
1222 if ((status
== RPC_CANTSEND
) &&
1223 (rpcerr_tmp
.re_errno
== ENOBUFS
))
1224 msg
= SRV_QFULL_MSG
;
1226 msg
= SRV_NOTRESP_MSG
;
1228 mutex_enter(&mi
->mi_lock
);
1229 if (!(mi
->mi_flags
& MI_PRINTED
)) {
1230 mi
->mi_flags
|= MI_PRINTED
;
1231 mutex_exit(&mi
->mi_lock
);
1233 zprintf(zoneid
, msg
, mi
->mi_vers
,
1236 zprintf(zoneid
, msg
, svp
->sv_hostname
);
1239 mutex_exit(&mi
->mi_lock
);
1240 if (*douprintf
&& nfs_has_ctty()) {
1242 if (!(mi
->mi_flags
& MI_NOPRINT
))
1244 uprintf(msg
, mi
->mi_vers
,
1247 uprintf(msg
, svp
->sv_hostname
);
1252 * If doing dynamic adjustment of transfer
1253 * size and if it's a read or write call
1254 * and if the transfer size changed while
1255 * retransmitting or if the feedback routine
1256 * changed the transfer size,
1257 * then exit rfscall so that the transfer
1258 * size can be adjusted at the vnops level.
1260 if ((mi
->mi_flags
& MI_DYNAMIC
) &&
1261 mi
->mi_timer_type
[which
] != 0 &&
1262 (mi
->mi_curread
!= my_rsize
||
1263 mi
->mi_curwrite
!= my_wsize
||
1264 nfs_feedback(FEEDBACK_REXMIT1
, which
, mi
))) {
1266 * On read or write calls, return
1267 * back to the vnode ops level if
1268 * the transfer size changed.
1270 clfree_impl(client
, ch
, nfscl
);
1273 return (ENFS_TRYAGAIN
);
1278 if (status
!= RPC_SUCCESS
) {
1280 * Let soft mounts use the timed out message.
1282 if (status
== RPC_INPROGRESS
)
1283 status
= RPC_TIMEDOUT
;
1284 nfscl
->nfscl_stat
.badcalls
.value
.ui64
++;
1285 if (status
!= RPC_INTR
) {
1286 mutex_enter(&mi
->mi_lock
);
1287 mi
->mi_flags
|= MI_DOWN
;
1288 mutex_exit(&mi
->mi_lock
);
1289 CLNT_GETERR(client
, &rpcerr
);
1291 bufp
= clnt_sperror(client
, svp
->sv_hostname
);
1292 zprintf(zoneid
, "NFS%d %s failed for %s\n",
1293 mi
->mi_vers
, mi
->mi_rfsnames
[which
], bufp
);
1294 if (nfs_has_ctty()) {
1295 if (!(mi
->mi_flags
& MI_NOPRINT
)) {
1296 uprintf("NFS%d %s failed for %s\n",
1297 mi
->mi_vers
, mi
->mi_rfsnames
[which
],
1301 kmem_free(bufp
, MAXPATHLEN
);
1304 "NFS %s failed for server %s: error %d (%s)\n",
1305 mi
->mi_rfsnames
[which
], svp
->sv_hostname
,
1306 status
, clnt_sperrno(status
));
1307 if (nfs_has_ctty()) {
1308 if (!(mi
->mi_flags
& MI_NOPRINT
)) {
1310 "NFS %s failed for server %s: error %d (%s)\n",
1311 mi
->mi_rfsnames
[which
],
1312 svp
->sv_hostname
, status
,
1313 clnt_sperrno(status
));
1318 * when CLNT_CALL() fails with RPC_AUTHERROR,
1319 * re_errno is set appropriately depending on
1320 * the authentication error
1322 if (status
== RPC_VERSMISMATCH
||
1323 status
== RPC_PROGVERSMISMATCH
)
1324 rpcerr
.re_errno
= EIO
;
1328 * Test the value of mi_down and mi_printed without
1329 * holding the mi_lock mutex. If they are both zero,
1330 * then it is okay to skip the down and printed
1331 * processing. This saves on a mutex_enter and
1332 * mutex_exit pair for a normal, successful RPC.
1333 * This was just complete overhead.
1335 if (mi
->mi_flags
& (MI_DOWN
| MI_PRINTED
)) {
1336 mutex_enter(&mi
->mi_lock
);
1337 mi
->mi_flags
&= ~MI_DOWN
;
1338 if (mi
->mi_flags
& MI_PRINTED
) {
1339 mi
->mi_flags
&= ~MI_PRINTED
;
1340 mutex_exit(&mi
->mi_lock
);
1342 if (!(mi
->mi_vfsp
->vfs_flag
& VFS_UNMOUNTED
))
1343 zprintf(zoneid
, "NFS%d server %s ok\n",
1344 mi
->mi_vers
, svp
->sv_hostname
);
1346 if (!(mi
->mi_vfsp
->vfs_flag
& VFS_UNMOUNTED
))
1347 zprintf(zoneid
, "NFS server %s ok\n",
1351 mutex_exit(&mi
->mi_lock
);
1354 if (*douprintf
== 0) {
1355 if (!(mi
->mi_flags
& MI_NOPRINT
))
1357 if (!(mi
->mi_vfsp
->vfs_flag
& VFS_UNMOUNTED
))
1358 uprintf("NFS%d server %s ok\n",
1359 mi
->mi_vers
, svp
->sv_hostname
);
1361 if (!(mi
->mi_vfsp
->vfs_flag
& VFS_UNMOUNTED
))
1362 uprintf("NFS server %s ok\n", svp
->sv_hostname
);
1368 clfree_impl(client
, ch
, nfscl
);
1372 ASSERT(rpcerr
.re_status
== RPC_SUCCESS
|| rpcerr
.re_errno
!= 0);
1374 if (rpc_status
!= NULL
)
1375 *rpc_status
= rpcerr
.re_status
;
1377 TRACE_1(TR_FAC_NFS
, TR_RFSCALL_END
, "rfscall_end:errno %d",
1380 return (rpcerr
.re_errno
);
#ifdef DEBUG
static int acl2call_hits = 0;
static int acl2call_misses = 0;
#endif

int
acl2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
    xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
    enum nfsstat *statusp, int flags, failinfo_t *fi)
{
	int rpcerror;

	rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
	    cr, douprintf, flags, fi);
	if (!rpcerror) {
		/*
		 * See comments with crnetadjust().
		 */
		if (*statusp == NFSERR_ACCES &&
		    (cr = crnetadjust(cr)) != NULL) {
#ifdef DEBUG
			acl2call_hits++;
#endif
			rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres,
			    resp, cr, douprintf, flags, fi);
			crfree(cr);
#ifdef DEBUG
			if (*statusp == NFSERR_ACCES)
				acl2call_misses++;
#endif
		}
	}

	return (rpcerror);
}
#ifdef DEBUG
static int acl3call_hits = 0;
static int acl3call_misses = 0;
#endif

int
acl3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
    xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
    nfsstat3 *statusp, int flags, failinfo_t *fi)
{
	int rpcerror;
	int user_informed;
	cred_t *crr;

	user_informed = 0;

	do {
		rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
		    cr, douprintf, flags, fi);
		if (!rpcerror) {
			if (*statusp == NFS3ERR_JUKEBOX) {
				if (!user_informed) {
					user_informed = 1;
					uprintf(
		"file temporarily unavailable on the server, retrying...\n");
				}
				delay(nfs3_jukebox_delay);
			}
			/*
			 * See crnetadjust() for comments.
			 */
			else if (*statusp == NFS3ERR_ACCES &&
			    (crr = crnetadjust(cr)) != NULL) {
#ifdef DEBUG
				acl3call_hits++;
#endif
				rpcerror = aclcall(mi, which, xdrargs, argsp,
				    xdrres, resp, crr, douprintf, flags, fi);

				crfree(crr);
#ifdef DEBUG
				if (*statusp == NFS3ERR_ACCES)
					acl3call_misses++;
#endif
			}
		}
	} while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);

	return (rpcerror);
}
1471 aclcall(mntinfo_t
*mi
, rpcproc_t which
, xdrproc_t xdrargs
, caddr_t argsp
,
1472 xdrproc_t xdrres
, caddr_t resp
, cred_t
*icr
, int *douprintf
,
1473 int flags
, failinfo_t
*fi
)
1478 bool_t cred_cloned
= FALSE
;
1479 enum clnt_stat status
;
1480 struct rpc_err rpcerr
;
1481 struct timeval wait
;
1482 int timeo
; /* in units of hz */
1484 int my_rsize
, my_wsize
;
1489 struct nfs_clnt
*nfscl
;
1490 zoneid_t zoneid
= getzoneid();
1496 TRACE_2(TR_FAC_NFS
, TR_RFSCALL_START
,
1497 "rfscall_start:which %d mi %p", which
, mi
);
1500 nfscl
= zone_getspecific(nfsclnt_zone_key
, nfs_zone());
1501 ASSERT(nfscl
!= NULL
);
1503 nfscl
->nfscl_stat
.calls
.value
.ui64
++;
1504 mi
->mi_aclreqs
[which
].value
.ui64
++;
1506 rpcerr
.re_status
= RPC_SUCCESS
;
1508 if (FS_OR_ZONE_GONE(mi
->mi_vfsp
)) {
1509 rpcerr
.re_status
= RPC_FAILED
;
1510 rpcerr
.re_errno
= EIO
;
1511 return (rpcerr
.re_errno
);
1516 * Remember the transfer sizes in case
1517 * nfs_feedback changes them underneath us.
1519 my_rsize
= mi
->mi_curread
;
1520 my_wsize
= mi
->mi_curwrite
;
1524 * NFS client failover support
1526 * If this rnode is not in sync with the current server (VALID_FH),
1527 * we'd like to do a remap to get in sync. We can be interrupted
1528 * in failover_remap(), and if so we'll bail. Otherwise, we'll
1529 * use the best info we have to try the RPC. Part of that is
1530 * unconditionally updating the filehandle copy kept for V3.
1532 * Locking: INC_READERS/DEC_READERS is a poor man's interrruptible
1533 * rw_enter(); we're trying to keep the current server from being
1534 * changed on us until we're done with the remapping and have a
1535 * matching client handle. We don't want to sending a filehandle
1536 * to the wrong host.
1539 if (FAILOVER_MOUNT(mi
)) {
1540 mutex_enter(&mi
->mi_lock
);
1541 if (!(flags
& RFSCALL_SOFT
) && failover_safe(fi
)) {
1542 if (failover_wait(mi
)) {
1543 mutex_exit(&mi
->mi_lock
);
1548 mutex_exit(&mi
->mi_lock
);
1550 if (!VALID_FH(fi
) &&
1551 !(flags
& RFSCALL_SOFT
) && failover_safe(fi
)) {
1554 svp
= mi
->mi_curr_serv
;
1555 remaperr
= failover_remap(fi
);
1556 if (remaperr
!= 0) {
1558 if (remaperr
!= EINTR
)
1559 nfs_cmn_err(remaperr
, CE_WARN
,
1560 "aclcall couldn't failover: %m");
1562 mutex_enter(&mi
->mi_lock
);
1564 mutex_exit(&mi
->mi_lock
);
1567 * If failover_remap returns ETIMEDOUT
1568 * and the filesystem is hard mounted
1569 * we have to retry the call with a new
1572 if ((mi
->mi_flags
& MI_HARD
) &&
1573 IS_RECOVERABLE_ERROR(remaperr
)) {
1574 if (svp
== mi
->mi_curr_serv
)
1575 failover_newserver(mi
);
1576 rpcerr
.re_status
= RPC_SUCCESS
;
1582 if (fi
->fhp
&& fi
->copyproc
)
1583 (*fi
->copyproc
)(fi
->fhp
, fi
->vp
);
1588 * acl_clget() calls clnt_tli_kinit() which clears the xid, so we
1589 * are guaranteed to reprocess the retry as a new request.
1591 svp
= mi
->mi_curr_serv
;
1592 rpcerr
.re_errno
= acl_clget(mi
, svp
, cr
, &client
, &ch
, nfscl
);
1593 if (FAILOVER_MOUNT(mi
)) {
1594 mutex_enter(&mi
->mi_lock
);
1596 mutex_exit(&mi
->mi_lock
);
1598 if ((rpcerr
.re_errno
== ETIMEDOUT
||
1599 rpcerr
.re_errno
== ECONNRESET
) &&
1600 failover_safe(fi
)) {
1601 if (svp
== mi
->mi_curr_serv
)
1602 failover_newserver(mi
);
1606 if (rpcerr
.re_errno
!= 0) {
1609 return (rpcerr
.re_errno
);
1612 if (svp
->sv_knconf
->knc_semantics
== NC_TPI_COTS_ORD
||
1613 svp
->sv_knconf
->knc_semantics
== NC_TPI_COTS
) {
1614 timeo
= (mi
->mi_timeo
* hz
) / 10;
1616 mutex_enter(&mi
->mi_lock
);
1617 timeo
= CLNT_SETTIMERS(client
,
1618 &(mi
->mi_timers
[mi
->mi_acl_timer_type
[which
]]),
1619 &(mi
->mi_timers
[NFS_CALLTYPES
]),
1620 (minimum_timeo
[mi
->mi_acl_call_type
[which
]]*hz
)>>3,
1621 (void (*)()) 0, (caddr_t
)mi
, 0);
1622 mutex_exit(&mi
->mi_lock
);
1626 * If hard mounted fs, retry call forever unless hard error occurs.
1631 if (FS_OR_ZONE_GONE(mi
->mi_vfsp
)) {
1632 status
= RPC_FAILED
;
1633 rpcerr
.re_status
= RPC_FAILED
;
1634 rpcerr
.re_errno
= EIO
;
1638 TICK_TO_TIMEVAL(timeo
, &wait
);
1641 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1642 * and SIGTERM. (Preserving the existing masks).
1643 * Mask out SIGINT if mount option nointr is specified.
1645 sigintr(&smask
, (int)mi
->mi_flags
& MI_INT
);
1646 if (!(mi
->mi_flags
& MI_INT
))
1647 client
->cl_nosignal
= TRUE
;
1650 * If there is a current signal, then don't bother
1651 * even trying to send out the request because we
1652 * won't be able to block waiting for the response.
1653 * Simply assume RPC_INTR and get on with it.
1655 if (ttolwp(curthread
) != NULL
&& ISSIG(curthread
, JUSTLOOKING
))
1658 status
= CLNT_CALL(client
, which
, xdrargs
, argsp
,
1659 xdrres
, resp
, wait
);
1662 if (!(mi
->mi_flags
& MI_INT
))
1663 client
->cl_nosignal
= FALSE
;
1665 * restore original signal mask
1672 if ((mi
->mi_flags
& MI_DYNAMIC
) &&
1673 mi
->mi_timer_type
[which
] != 0 &&
1674 (mi
->mi_curread
!= my_rsize
||
1675 mi
->mi_curwrite
!= my_wsize
))
1676 (void) nfs_feedback(FEEDBACK_OK
, which
, mi
);
1681 * Unfortunately, there are servers in the world which
1682 * are not coded correctly. They are not prepared to
1683 * handle RPC requests to the NFS port which are not
1684 * NFS requests. Thus, they may try to process the
1685 * NFS_ACL request as if it were an NFS request. This
1686 * does not work. Generally, an error will be generated
1687 * on the client because it will not be able to decode
1688 * the response from the server. However, it seems
1689 * possible that the server may not be able to decode
1690 * the arguments. Thus, the criteria for deciding
1691 * whether the server supports NFS_ACL or not is whether
1692 * the following RPC errors are returned from CLNT_CALL.
1694 case RPC_CANTDECODERES
:
1695 case RPC_PROGUNAVAIL
:
1696 case RPC_CANTDECODEARGS
:
1697 case RPC_PROGVERSMISMATCH
:
1698 mutex_enter(&mi
->mi_lock
);
1699 mi
->mi_flags
&= ~(MI_ACL
| MI_EXTATTR
);
1700 mutex_exit(&mi
->mi_lock
);
1704 * If the server supports NFS_ACL but not the new ops
1705 * for extended attributes, make sure we don't retry.
1707 case RPC_PROCUNAVAIL
:
1708 mutex_enter(&mi
->mi_lock
);
1709 mi
->mi_flags
&= ~MI_EXTATTR
;
1710 mutex_exit(&mi
->mi_lock
);
1715 * There is no way to recover from this error,
1716 * even if mount option nointr is specified.
1717 * SIGKILL, for example, cannot be blocked.
1719 rpcerr
.re_status
= RPC_INTR
;
1720 rpcerr
.re_errno
= EINTR
;
1725 * If the NFS server is local (vold) and
1726 * it goes away then we get RPC_UDERROR.
1727 * This is a retryable error, so we would
1728 * loop, so check to see if the specific
1729 * error was ECONNRESET, indicating that
1730 * target did not exist at all. If so,
1731 * return with RPC_PROGUNAVAIL and
1732 * ECONNRESET to indicate why.
1734 CLNT_GETERR(client
, &rpcerr
);
1735 if (rpcerr
.re_errno
== ECONNRESET
) {
1736 rpcerr
.re_status
= RPC_PROGUNAVAIL
;
1737 rpcerr
.re_errno
= ECONNRESET
;
1742 default: /* probably RPC_TIMEDOUT */
1743 if (IS_UNRECOVERABLE_RPC(status
))
1747 * increment server not responding count
1749 mutex_enter(&mi
->mi_lock
);
1750 mi
->mi_noresponse
++;
1751 mutex_exit(&mi
->mi_lock
);
1753 nfscl
->nfscl_stat
.noresponse
.value
.ui64
++;
1756 if (!(mi
->mi_flags
& MI_HARD
)) {
1757 if (!(mi
->mi_flags
& MI_SEMISOFT
) ||
1758 (mi
->mi_acl_ss_call_type
[which
] == 0))
1763 * The call is in progress (over COTS).
1764 * Try the CLNT_CALL again, but don't
1765 * print a noisy error message.
1767 if (status
== RPC_INPROGRESS
) {
1772 if (flags
& RFSCALL_SOFT
)
1776 * On zone shutdown, just move on.
1778 if (zone_status_get(curproc
->p_zone
) >=
1779 ZONE_IS_SHUTTING_DOWN
) {
1780 rpcerr
.re_status
= RPC_FAILED
;
1781 rpcerr
.re_errno
= EIO
;
1786 * NFS client failover support
1788 * If the current server just failed us, we'll
1789 * start the process of finding a new server.
1790 * After that, we can just retry.
1792 if (FAILOVER_MOUNT(mi
) && failover_safe(fi
)) {
1793 if (svp
== mi
->mi_curr_serv
)
1794 failover_newserver(mi
);
1795 clfree_impl(client
, ch
, nfscl
);
1800 timeo
= backoff(timeo
);
1801 mutex_enter(&mi
->mi_lock
);
1802 if (!(mi
->mi_flags
& MI_PRINTED
)) {
1803 mi
->mi_flags
|= MI_PRINTED
;
1804 mutex_exit(&mi
->mi_lock
);
1807 "NFS_ACL%d server %s not responding still trying\n",
1808 mi
->mi_vers
, svp
->sv_hostname
);
1811 "NFS server %s not responding still trying\n",
1815 mutex_exit(&mi
->mi_lock
);
1816 if (*douprintf
&& nfs_has_ctty()) {
1818 if (!(mi
->mi_flags
& MI_NOPRINT
))
1821 "NFS_ACL%d server %s not responding still trying\n",
1822 mi
->mi_vers
, svp
->sv_hostname
);
1825 "NFS server %s not responding still trying\n",
1832 * If doing dynamic adjustment of transfer
1833 * size and if it's a read or write call
1834 * and if the transfer size changed while
1835 * retransmitting or if the feedback routine
1836 * changed the transfer size,
1837 * then exit rfscall so that the transfer
1838 * size can be adjusted at the vnops level.
1840 if ((mi
->mi_flags
& MI_DYNAMIC
) &&
1841 mi
->mi_acl_timer_type
[which
] != 0 &&
1842 (mi
->mi_curread
!= my_rsize
||
1843 mi
->mi_curwrite
!= my_wsize
||
1844 nfs_feedback(FEEDBACK_REXMIT1
, which
, mi
))) {
1846 * On read or write calls, return
1847 * back to the vnode ops level if
1848 * the transfer size changed.
1850 clfree_impl(client
, ch
, nfscl
);
1853 return (ENFS_TRYAGAIN
);
1859 if (status
!= RPC_SUCCESS
) {
1861 * Let soft mounts use the timed out message.
1863 if (status
== RPC_INPROGRESS
)
1864 status
= RPC_TIMEDOUT
;
1865 nfscl
->nfscl_stat
.badcalls
.value
.ui64
++;
1866 if (status
== RPC_CANTDECODERES
||
1867 status
== RPC_PROGUNAVAIL
||
1868 status
== RPC_PROCUNAVAIL
||
1869 status
== RPC_CANTDECODEARGS
||
1870 status
== RPC_PROGVERSMISMATCH
)
1871 CLNT_GETERR(client
, &rpcerr
);
1872 else if (status
!= RPC_INTR
) {
1873 mutex_enter(&mi
->mi_lock
);
1874 mi
->mi_flags
|= MI_DOWN
;
1875 mutex_exit(&mi
->mi_lock
);
1876 CLNT_GETERR(client
, &rpcerr
);
1878 bufp
= clnt_sperror(client
, svp
->sv_hostname
);
1879 zprintf(zoneid
, "NFS_ACL%d %s failed for %s\n",
1880 mi
->mi_vers
, mi
->mi_aclnames
[which
], bufp
);
1881 if (nfs_has_ctty()) {
1882 if (!(mi
->mi_flags
& MI_NOPRINT
)) {
1883 uprintf("NFS_ACL%d %s failed for %s\n",
1884 mi
->mi_vers
, mi
->mi_aclnames
[which
],
1888 kmem_free(bufp
, MAXPATHLEN
);
1891 "NFS %s failed for server %s: error %d (%s)\n",
1892 mi
->mi_aclnames
[which
], svp
->sv_hostname
,
1893 status
, clnt_sperrno(status
));
1894 if (nfs_has_ctty()) {
1895 if (!(mi
->mi_flags
& MI_NOPRINT
))
1897 "NFS %s failed for server %s: error %d (%s)\n",
1898 mi
->mi_aclnames
[which
],
1899 svp
->sv_hostname
, status
,
1900 clnt_sperrno(status
));
1904 * when CLNT_CALL() fails with RPC_AUTHERROR,
1905 * re_errno is set appropriately depending on
1906 * the authentication error
1908 if (status
== RPC_VERSMISMATCH
||
1909 status
== RPC_PROGVERSMISMATCH
)
1910 rpcerr
.re_errno
= EIO
;
1914 * Test the value of mi_down and mi_printed without
1915 * holding the mi_lock mutex. If they are both zero,
1916 * then it is okay to skip the down and printed
1917 * processing. This saves on a mutex_enter and
1918 * mutex_exit pair for a normal, successful RPC.
1919 * This was just complete overhead.
1921 if (mi
->mi_flags
& (MI_DOWN
| MI_PRINTED
)) {
1922 mutex_enter(&mi
->mi_lock
);
1923 mi
->mi_flags
&= ~MI_DOWN
;
1924 if (mi
->mi_flags
& MI_PRINTED
) {
1925 mi
->mi_flags
&= ~MI_PRINTED
;
1926 mutex_exit(&mi
->mi_lock
);
1928 zprintf(zoneid
, "NFS_ACL%d server %s ok\n",
1929 mi
->mi_vers
, svp
->sv_hostname
);
1931 zprintf(zoneid
, "NFS server %s ok\n",
1935 mutex_exit(&mi
->mi_lock
);
1938 if (*douprintf
== 0) {
1939 if (!(mi
->mi_flags
& MI_NOPRINT
))
1941 uprintf("NFS_ACL%d server %s ok\n",
1942 mi
->mi_vers
, svp
->sv_hostname
);
1944 uprintf("NFS server %s ok\n", svp
->sv_hostname
);
1950 clfree_impl(client
, ch
, nfscl
);
1954 ASSERT(rpcerr
.re_status
== RPC_SUCCESS
|| rpcerr
.re_errno
!= 0);
1957 TRACE_1(TR_FAC_NFS
, TR_RFSCALL_END
, "rfscall_end:errno %d",
1961 return (rpcerr
.re_errno
);
void
vattr_to_sattr(struct vattr *vap, struct nfssattr *sa)
{
	uint_t mask = vap->va_mask;

	if (!(mask & AT_MODE))
		sa->sa_mode = (uint32_t)-1;
	else
		sa->sa_mode = vap->va_mode;
	if (!(mask & AT_UID))
		sa->sa_uid = (uint32_t)-1;
	else
		sa->sa_uid = (uint32_t)vap->va_uid;
	if (!(mask & AT_GID))
		sa->sa_gid = (uint32_t)-1;
	else
		sa->sa_gid = (uint32_t)vap->va_gid;
	if (!(mask & AT_SIZE))
		sa->sa_size = (uint32_t)-1;
	else
		sa->sa_size = (uint32_t)vap->va_size;
	if (!(mask & AT_ATIME))
		sa->sa_atime.tv_sec = sa->sa_atime.tv_usec = (int32_t)-1;
	else {
		/* check time validity */
		if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
			sa->sa_atime.tv_sec = sa->sa_atime.tv_usec =
			    (int32_t)-1;
		} else {
			sa->sa_atime.tv_sec = vap->va_atime.tv_sec;
			sa->sa_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
		}
	}
	if (!(mask & AT_MTIME))
		sa->sa_mtime.tv_sec = sa->sa_mtime.tv_usec = (int32_t)-1;
	else {
		/* check time validity */
		if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
			sa->sa_mtime.tv_sec = sa->sa_mtime.tv_usec =
			    (int32_t)-1;
		} else {
			sa->sa_mtime.tv_sec = vap->va_mtime.tv_sec;
			sa->sa_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
		}
	}
}
void
vattr_to_sattr3(struct vattr *vap, sattr3 *sa)
{
	uint_t mask = vap->va_mask;

	if (!(mask & AT_MODE))
		sa->mode.set_it = FALSE;
	else {
		sa->mode.set_it = TRUE;
		sa->mode.mode = (mode3)vap->va_mode;
	}
	if (!(mask & AT_UID))
		sa->uid.set_it = FALSE;
	else {
		sa->uid.set_it = TRUE;
		sa->uid.uid = (uid3)vap->va_uid;
	}
	if (!(mask & AT_GID))
		sa->gid.set_it = FALSE;
	else {
		sa->gid.set_it = TRUE;
		sa->gid.gid = (gid3)vap->va_gid;
	}
	if (!(mask & AT_SIZE))
		sa->size.set_it = FALSE;
	else {
		sa->size.set_it = TRUE;
		sa->size.size = (size3)vap->va_size;
	}
	if (!(mask & AT_ATIME))
		sa->atime.set_it = DONT_CHANGE;
	else {
		/* check time validity */
		if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
			sa->atime.set_it = DONT_CHANGE;
		} else {
			sa->atime.set_it = SET_TO_CLIENT_TIME;
			sa->atime.atime.seconds = (uint32)vap->va_atime.tv_sec;
			sa->atime.atime.nseconds = (uint32)vap->va_atime.tv_nsec;
		}
	}
	if (!(mask & AT_MTIME))
		sa->mtime.set_it = DONT_CHANGE;
	else {
		/* check time validity */
		if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
			sa->mtime.set_it = DONT_CHANGE;
		} else {
			sa->mtime.set_it = SET_TO_CLIENT_TIME;
			sa->mtime.mtime.seconds = (uint32)vap->va_mtime.tv_sec;
			sa->mtime.mtime.nseconds = (uint32)vap->va_mtime.tv_nsec;
		}
	}
}
void
setdiropargs(struct nfsdiropargs *da, char *nm, vnode_t *dvp)
{
	da->da_fhandle = VTOFH(dvp);
	da->da_name = nm;
}

void
setdiropargs3(diropargs3 *da, char *nm, vnode_t *dvp)
{
	da->dirp = VTOFH3(dvp);
	da->name = nm;
}
int
setdirgid(vnode_t *dvp, gid_t *gidp, cred_t *cr)
{
	int error;
	rnode_t *rp;
	struct vattr va;

	va.va_mask = AT_MODE | AT_GID;
	error = fop_getattr(dvp, &va, 0, cr, NULL);
	if (error)
		return (error);

	/*
	 * To determine the expected group-id of the created file:
	 *  1)	If the filesystem was not mounted with the Old-BSD-compatible
	 *	GRPID option, and the directory's set-gid bit is clear,
	 *	then use the process's gid.
	 *  2)	Otherwise, set the group-id to the gid of the parent directory.
	 */
	rp = VTOR(dvp);
	mutex_enter(&rp->r_statelock);
	if (!(VTOMI(dvp)->mi_flags & MI_GRPID) && !(va.va_mode & VSGID))
		*gidp = crgetgid(cr);
	else
		*gidp = va.va_gid;
	mutex_exit(&rp->r_statelock);

	return (0);
}
int
setdirmode(vnode_t *dvp, mode_t *omp, cred_t *cr)
{
	int error;
	struct vattr va;

	va.va_mask = AT_MODE;
	error = fop_getattr(dvp, &va, 0, cr, NULL);
	if (error)
		return (error);

	/*
	 * Modify the expected mode (om) so that the set-gid bit matches
	 * that of the parent directory (dvp).
	 */
	if (va.va_mode & VSGID)
		*omp |= VSGID;
	else
		*omp &= ~VSGID;

	return (0);
}
*vp
, vattr_t
*vap
)
2134 if (vp
->v_type
== VREG
&& (vap
->va_mode
& (VEXEC
| VSVTX
)) == VSVTX
) {
2135 if (!(vp
->v_flag
& VSWAPLIKE
)) {
2136 mutex_enter(&vp
->v_lock
);
2137 vp
->v_flag
|= VSWAPLIKE
;
2138 mutex_exit(&vp
->v_lock
);
2141 if (vp
->v_flag
& VSWAPLIKE
) {
2142 mutex_enter(&vp
->v_lock
);
2143 vp
->v_flag
&= ~VSWAPLIKE
;
2144 mutex_exit(&vp
->v_lock
);
/*
 * Free the resources associated with an rnode.
 */
static void
rinactive(rnode_t *rp, cred_t *cr)
{
	vnode_t *vp;
	cred_t *cred;
	char *contents;
	int size;
	vsecattr_t *vsp;
	int error;
	nfs3_pathconf_info *info;

	/*
	 * Before freeing anything, wait until all asynchronous
	 * activity is done on this rnode.  This will allow all
	 * asynchronous read ahead and write behind i/o's to
	 * finish.
	 */
	mutex_enter(&rp->r_statelock);
	while (rp->r_count > 0)
		cv_wait(&rp->r_cv, &rp->r_statelock);
	mutex_exit(&rp->r_statelock);

	/*
	 * Flush and invalidate all pages associated with the vnode.
	 */
	vp = RTOV(rp);
	if (vn_has_cached_data(vp)) {
		ASSERT(vp->v_type != VCHR);
		if ((rp->r_flags & RDIRTY) && !rp->r_error) {
			error = fop_putpage(vp, 0, 0, 0, cr, NULL);
			if (error && (error == ENOSPC || error == EDQUOT)) {
				mutex_enter(&rp->r_statelock);
				if (!rp->r_error)
					rp->r_error = error;
				mutex_exit(&rp->r_statelock);
			}
		}
		nfs_invalidate_pages(vp, 0, cr);
	}

	/*
	 * Free any held credentials and caches which may be associated
	 * with this rnode.
	 */
	mutex_enter(&rp->r_statelock);
	cred = rp->r_cred;
	rp->r_cred = NULL;
	contents = rp->r_symlink.contents;
	size = rp->r_symlink.size;
	rp->r_symlink.contents = NULL;
	vsp = rp->r_secattr;
	rp->r_secattr = NULL;
	info = rp->r_pathconf;
	rp->r_pathconf = NULL;
	mutex_exit(&rp->r_statelock);

	/*
	 * Free the held credential.
	 */
	if (cred != NULL)
		crfree(cred);

	/*
	 * Free the access cache entries.
	 */
	(void) nfs_access_purge_rp(rp);

	/*
	 * Free the readdir cache entries.
	 */
	if (HAVE_RDDIR_CACHE(rp))
		nfs_purge_rddir_cache(vp);

	/*
	 * Free the symbolic link cache.
	 */
	if (contents != NULL) {

		kmem_free((void *)contents, size);
	}

	/*
	 * Free any cached ACL.
	 */
	if (vsp != NULL)
		nfs_acl_free(vsp);

	/*
	 * Free any cached pathconf information.
	 */
	if (info != NULL)
		kmem_free(info, sizeof (*info));
}
/*
 * Return a vnode for the given NFS Version 2 file handle.
 * If no rnode exists for this fhandle, create one and put it
 * into the hash queues.  If the rnode for this fhandle
 * already exists, return it.
 *
 * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
 */
vnode_t *
makenfsnode(fhandle_t *fh, struct nfsfattr *attr, struct vfs *vfsp,
    hrtime_t t, cred_t *cr, char *dnm, char *nm)
{
	int newnode;
	int index;
	vnode_t *vp;
	nfs_fhandle nfh;
	vattr_t va;

	nfh.fh_len = NFS_FHSIZE;
	bcopy(fh, nfh.fh_buf, NFS_FHSIZE);

	index = rtablehash(&nfh);
	rw_enter(&rtable[index].r_lock, RW_READER);

	vp = make_rnode(&nfh, &rtable[index], vfsp, &nfs_vnodeops,
	    nfs_putapage, nfs_rddir_compar, &newnode, cr, dnm, nm);

	if (attr != NULL) {
		if (!newnode) {
			rw_exit(&rtable[index].r_lock);
			(void) nfs_cache_fattr(vp, attr, &va, t, cr);
		} else {
			if (attr->na_type < NFNON || attr->na_type > NFSOC)
				vp->v_type = VBAD;
			else
				vp->v_type = n2v_type(attr);
			/*
			 * A translation here seems to be necessary
			 * because this function can be called
			 * with `attr' that has come from the wire,
			 * and been operated on by vattr_to_nattr().
			 * See nfsrootvp()->VOP_GETTATTR()->nfsgetattr()
			 * ->nfs_getattr_otw()->rfscall()->vattr_to_nattr()
			 */
			if ((attr->na_rdev & 0xffff0000) == 0)
				vp->v_rdev = nfsv2_expdev(attr->na_rdev);
			else
				vp->v_rdev = expldev(n2v_rdev(attr));
			nfs_attrcache(vp, attr, t);
			rw_exit(&rtable[index].r_lock);
		}
	} else {
		if (newnode) {
			PURGE_ATTRCACHE(vp);
		}
		rw_exit(&rtable[index].r_lock);
	}

	return (vp);
}
/*
 * Return a vnode for the given NFS Version 3 file handle.
 * If no rnode exists for this fhandle, create one and put it
 * into the hash queues.  If the rnode for this fhandle
 * already exists, return it.
 *
 * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
 */
vnode_t *
makenfs3node_va(nfs_fh3 *fh, vattr_t *vap, struct vfs *vfsp, hrtime_t t,
    cred_t *cr, char *dnm, char *nm)
{
	int newnode;
	int index;
	vnode_t *vp;

	index = rtablehash((nfs_fhandle *)fh);
	rw_enter(&rtable[index].r_lock, RW_READER);

	vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
	    &nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
	    dnm, nm);

	if (vap == NULL) {
		if (newnode) {
			PURGE_ATTRCACHE(vp);
		}
		rw_exit(&rtable[index].r_lock);
		return (vp);
	}

	if (!newnode) {
		rw_exit(&rtable[index].r_lock);
		nfs_attr_cache(vp, vap, t, cr);
	} else {
		rnode_t *rp = VTOR(vp);

		vp->v_type = vap->va_type;
		vp->v_rdev = vap->va_rdev;

		mutex_enter(&rp->r_statelock);
		if (rp->r_mtime <= t)
			nfs_attrcache_va(vp, vap);
		mutex_exit(&rp->r_statelock);
		rw_exit(&rtable[index].r_lock);
	}

	return (vp);
}
vnode_t *
makenfs3node(nfs_fh3 *fh, fattr3 *attr, struct vfs *vfsp, hrtime_t t,
    cred_t *cr, char *dnm, char *nm)
{
	int newnode;
	int index;
	vnode_t *vp;
	vattr_t va;

	index = rtablehash((nfs_fhandle *)fh);
	rw_enter(&rtable[index].r_lock, RW_READER);

	vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
	    &nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
	    dnm, nm);

	if (attr == NULL) {
		if (newnode) {
			PURGE_ATTRCACHE(vp);
		}
		rw_exit(&rtable[index].r_lock);
		return (vp);
	}

	if (!newnode) {
		rw_exit(&rtable[index].r_lock);
		(void) nfs3_cache_fattr3(vp, attr, &va, t, cr);
	} else {
		if (attr->type < NF3REG || attr->type > NF3FIFO)
			vp->v_type = VBAD;
		else
			vp->v_type = nf3_to_vt[attr->type];
		vp->v_rdev = makedevice(attr->rdev.specdata1,
		    attr->rdev.specdata2);
		nfs3_attrcache(vp, attr, t);
		rw_exit(&rtable[index].r_lock);
	}

	return (vp);
}
/*
 * Read this comment before making changes to rtablehash()!
 * This is a hash function in which seemingly obvious and harmless
 * changes can cause escalations costing millions of dollars!
 * Know what you are doing.
 *
 * rtablehash() implements Jenkins' one-at-a-time hash algorithm.  The
 * algorithm is currently detailed here:
 *
 *	http://burtleburtle.net/bob/hash/doobs.html
 *
 * Of course, the above link may not be valid by the time you are reading
 * this, but suffice it to say that the one-at-a-time algorithm works well
 * in almost all cases.  If you are changing the algorithm be sure to verify
 * that the hash algorithm still provides even distribution in all cases and
 * with any server returning filehandles in whatever order (sequential or
 * random).
 */
static int
rtablehash(nfs_fhandle *fh)
{
	ulong_t hash, len, i;
	char *key;

	key = fh->fh_buf;
	len = (ulong_t)fh->fh_len;
	for (hash = 0, i = 0; i < len; i++) {
		hash += key[i];
		hash += (hash << 10);
		hash ^= (hash >> 6);
	}
	hash += (hash << 3);
	hash ^= (hash >> 11);
	hash += (hash << 15);
	return (hash & rtablemask);
}
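/*
 * Illustrative sketch (not part of this file): a userland harness one might
 * use to check the "even distribution" requirement called out above before
 * changing rtablehash().  Everything here (names, bucket count, synthetic
 * filehandles) is hypothetical; NBUCKETS must be a power of two so the
 * final mask works the same way rtablemask does.  The hash loop mirrors the
 * one-at-a-time steps used by rtablehash().
 *
 *	#include <stdio.h>
 *	#include <string.h>
 *
 *	#define	NBUCKETS	256
 *
 *	static unsigned long
 *	oat_hash(const unsigned char *key, size_t len)
 *	{
 *		unsigned long hash = 0;
 *		size_t i;
 *
 *		for (i = 0; i < len; i++) {
 *			hash += key[i];
 *			hash += (hash << 10);
 *			hash ^= (hash >> 6);
 *		}
 *		hash += (hash << 3);
 *		hash ^= (hash >> 11);
 *		hash += (hash << 15);
 *		return (hash & (NBUCKETS - 1));
 *	}
 *
 *	int
 *	main(void)
 *	{
 *		unsigned long counts[NBUCKETS] = { 0 };
 *		unsigned char fh[32];
 *		unsigned int i;
 *
 *		for (i = 0; i < 1000000; i++) {
 *			(void) memset(fh, 0, sizeof (fh));
 *			(void) memcpy(fh, &i, sizeof (i));
 *			counts[oat_hash(fh, sizeof (fh))]++;
 *		}
 *		for (i = 0; i < NBUCKETS; i++)
 *			(void) printf("%u %lu\n", i, counts[i]);
 *		return (0);
 *	}
 */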
static vnode_t *
make_rnode(nfs_fhandle *fh, rhashq_t *rhtp, struct vfs *vfsp,
    const struct vnodeops *vops,
    int (*putapage)(vnode_t *, page_t *, uoff_t *, size_t *, int, cred_t *),
    int (*compar)(const void *, const void *),
    int *newnode, cred_t *cr, char *dnm, char *nm)
{
	/* ... */
	ASSERT(RW_READ_HELD(&rhtp->r_lock));
	/* ... */
	if ((rp = rfind(rhtp, fh, vfsp)) != NULL) {
		/* ... */
	}
	rw_exit(&rhtp->r_lock);
	/* ... */
	mutex_enter(&rpfreelist_lock);
	if (rpfreelist != NULL && rnew >= nrnode) {
		/* ... */
		mutex_exit(&rpfreelist_lock);
		/* ... */
		if (rp->r_flags & RHASHED) {
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			mutex_enter(&vp->v_lock);
			if (vp->v_count > 1) {
				/* ... */
				mutex_exit(&vp->v_lock);
				rw_exit(&rp->r_hashq->r_lock);
				rw_enter(&rhtp->r_lock, RW_READER);
				/* ... */
			}
			mutex_exit(&vp->v_lock);
			rp_rmhash_locked(rp);
			rw_exit(&rp->r_hashq->r_lock);
		}
		/* ... */
		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			/* ... */
			mutex_exit(&vp->v_lock);
			rw_enter(&rhtp->r_lock, RW_READER);
			/* ... */
		}
		mutex_exit(&vp->v_lock);

		/*
		 * destroy old locks before bzero'ing and
		 * recreating the locks below.
		 */
		nfs_rw_destroy(&rp->r_rwlock);
		nfs_rw_destroy(&rp->r_lkserlock);
		mutex_destroy(&rp->r_statelock);
		cv_destroy(&rp->r_cv);
		cv_destroy(&rp->r_commit.c_cv);
		nfs_free_r_path(rp);
		avl_destroy(&rp->r_dir);

		/*
		 * Make sure that if rnode is recycled then
		 * VFS count is decremented properly before
		 * ...
		 */
		VFS_RELE(vp->v_vfsp);
		/* ... */
	} else {
		mutex_exit(&rpfreelist_lock);
		rp = kmem_cache_alloc(rnode_cache, KM_SLEEP);
		new_vp = vn_alloc(KM_SLEEP);

		atomic_inc_ulong((ulong_t *)&rnew);
		/* ... */
		clstat_debug.nrnode.value.ui64++;
		/* ... */
	}

	bzero(rp, sizeof (*rp));
	/* ... */
	nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL);
	nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL);
	mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL);
	rp->r_fh.fh_len = fh->fh_len;
	bcopy(fh->fh_buf, rp->r_fh.fh_buf, fh->fh_len);
	rp->r_server = mi->mi_curr_serv;
	if (FAILOVER_MOUNT(mi)) {
		/*
		 * If replicated servers, stash pathnames
		 */
		if (dnm != NULL && nm != NULL) {
			/* ... */
			len = (uint_t)(strlen(dnm) + strlen(nm) + 2);
			rp->r_path = kmem_alloc(len, KM_SLEEP);
			/* ... */
			clstat_debug.rpath.value.ui64 += len;
			/* ... */
			for (p = dnm; *p; p++)
				/* ... */
			for (p = nm; *p; p++)
				/* ... */
		} else {
			/* special case for root */
			rp->r_path = kmem_alloc(2, KM_SLEEP);
			/* ... */
			clstat_debug.rpath.value.ui64 += 2;
			/* ... */
			*(rp->r_path + 1) = '\0';
		}
	}
	rp->r_putapage = putapage;
	/* ... */
	rp->r_flags = RREADDIRPLUS;
	avl_create(&rp->r_dir, compar, sizeof (rddir_cache),
	    offsetof(rddir_cache, tree));
	vn_setops(vp, vops);
	vp->v_data = (caddr_t)rp;
	/* ... */
	vp->v_flag |= VMODSORT;
	/* ... */

	/*
	 * There is a race condition if someone else
	 * alloc's the rnode while no locks are held, so we
	 * check again and recover if found.
	 */
	rw_enter(&rhtp->r_lock, RW_WRITER);
	if ((trp = rfind(rhtp, fh, vfsp)) != NULL) {
		/* ... */
		rw_exit(&rhtp->r_lock);
		/* ... */
		rw_enter(&rhtp->r_lock, RW_READER);
		/* ... */
	}
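/*
 * Illustrative sketch (not part of this file): the allocate-then-recheck
 * pattern used at the end of make_rnode() above, reduced to its essentials.
 * Every name below is hypothetical; only the ordering mirrors the real
 * code (build the new object with no locks held, then recheck under the
 * bucket lock held as writer and discard the new object if we lost the
 * race).
 *
 *	static thing_t *
 *	example_find_or_create(bucket_t *b, key_t key)
 *	{
 *		thing_t *newt, *t;
 *
 *		newt = example_alloc_and_init(key);	// no locks held
 *
 *		rw_enter(&b->lock, RW_WRITER);
 *		if ((t = example_find_locked(b, key)) != NULL) {
 *			// Lost the race: keep the existing object and
 *			// throw away the one we just built.
 *			rw_exit(&b->lock);
 *			example_free(newt);
 *			return (t);
 *		}
 *		example_insert_locked(b, newt);
 *		rw_exit(&b->lock);
 *		return (newt);
 *	}
 */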
/*
 * Callback function to check if the page should be marked as
 * modified.  In the positive case, p_fsdata is set to C_NOCOMMIT.
 */
int
nfs_setmod_check(page_t *pp)
{
	if (pp->p_fsdata != C_NOCOMMIT) {
		pp->p_fsdata = C_NOCOMMIT;
		/* ... */
	}
	/* ... */
}
static void
nfs_set_vroot(vnode_t *vp)
{
	/* ... */
	nfs_fhandle *rootfh;
	/* ... */
	rootfh = &rp->r_server->sv_fhandle;
	if (rootfh->fh_len == rp->r_fh.fh_len &&
	    bcmp(rootfh->fh_buf, rp->r_fh.fh_buf, rp->r_fh.fh_len) == 0) {
		if (!(vp->v_flag & VROOT)) {
			mutex_enter(&vp->v_lock);
			vp->v_flag |= VROOT;
			mutex_exit(&vp->v_lock);
		}
	}
}
static void
nfs_free_r_path(rnode_t *rp)
{
	/* ... */
	len = strlen(path) + 1;
	kmem_free(path, len);
	/* ... */
	clstat_debug.rpath.value.ui64 -= len;
	/* ... */
}
/*
 * Put an rnode on the free list.
 *
 * Rnodes which were allocated above and beyond the normal limit
 * are immediately freed.
 */
void
rp_addfree(rnode_t *rp, cred_t *cr)
{
	/* ... */
	ASSERT(vp->v_count >= 1);
	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);

	/*
	 * If we have too many rnodes allocated and there are no
	 * references to this rnode, or if the rnode is no longer
	 * accessible because it does not reside in the hash queues,
	 * or if an i/o error occurred while writing to the file,
	 * then just free it instead of putting it on the rnode
	 * freelist.
	 */
	if (((rnew > nrnode || !(rp->r_flags & RHASHED) || rp->r_error ||
	    (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
		if (rp->r_flags & RHASHED) {
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			mutex_enter(&vp->v_lock);
			if (vp->v_count > 1) {
				/* ... */
				mutex_exit(&vp->v_lock);
				rw_exit(&rp->r_hashq->r_lock);
				/* ... */
			}
			mutex_exit(&vp->v_lock);
			rp_rmhash_locked(rp);
			rw_exit(&rp->r_hashq->r_lock);
		}
		/* ... */

		/*
		 * Recheck the vnode reference count.  We need to
		 * make sure that another reference has not been
		 * acquired while we were not holding v_lock.  The
		 * rnode is not in the rnode hash queues, so the
		 * only way for a reference to have been acquired
		 * is for a fop_putpage because the rnode was marked
		 * with RDIRTY or for a modified page.  This
		 * reference may have been acquired before our call
		 * to rinactive.  The i/o may have been completed,
		 * thus allowing rinactive to complete, but the
		 * reference to the vnode may not have been released
		 * yet.  In any case, the rnode can not be destroyed
		 * until the other references to this vnode have been
		 * released.  The other references will take care of
		 * either destroying the rnode or placing it on the
		 * rnode freelist.  If there are no other references,
		 * then the rnode may be safely destroyed.
		 */
		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			/* ... */
			mutex_exit(&vp->v_lock);
			/* ... */
		}
		mutex_exit(&vp->v_lock);
		/* ... */
	}
	/*
	 * Lock the hash queue and then recheck the reference count
	 * to ensure that no other threads have acquired a reference
	 * to indicate that the rnode should not be placed on the
	 * freelist.  If another reference has been acquired, then
	 * just release this one and let the other thread complete
	 * the processing of adding this rnode to the freelist.
	 */
	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);

	mutex_enter(&vp->v_lock);
	if (vp->v_count > 1) {
		/* ... */
		mutex_exit(&vp->v_lock);
		rw_exit(&rp->r_hashq->r_lock);
		/* ... */
	}
	mutex_exit(&vp->v_lock);

	/*
	 * If there is no cached data or metadata for this file, then
	 * put the rnode on the front of the freelist so that it will
	 * be reused before other rnodes which may have cached data or
	 * metadata associated with them.
	 */
	mutex_enter(&rpfreelist_lock);
	if (rpfreelist == NULL) {
		/* ... */
	} else {
		rp->r_freef = rpfreelist;
		rp->r_freeb = rpfreelist->r_freeb;
		rpfreelist->r_freeb->r_freef = rp;
		rpfreelist->r_freeb = rp;
		if (!vn_has_cached_data(vp) &&
		    !HAVE_RDDIR_CACHE(rp) &&
		    rp->r_symlink.contents == NULL &&
		    rp->r_secattr == NULL &&
		    rp->r_pathconf == NULL)
			/* ... */
	}
	mutex_exit(&rpfreelist_lock);

	rw_exit(&rp->r_hashq->r_lock);
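/*
 * Illustrative sketch (not part of this file): rp_addfree() above links the
 * rnode just in front of the current head of the circular freelist and,
 * per the comment, an rnode carrying no cached data is left at the front so
 * it is reclaimed first, while rnodes with cached pages, readdir entries,
 * symlink text, ACLs or pathconf data stay toward the back and keep their
 * caches longer.  A generic sketch of that idea with hypothetical names:
 *
 *	static void
 *	example_addfree(node_t **headp, node_t *np, int cheap_to_reuse)
 *	{
 *		node_t *head = *headp;
 *
 *		if (head == NULL) {
 *			np->next = np->prev = np;
 *			*headp = np;
 *			return;
 *		}
 *		np->next = head;		// insert just before the head
 *		np->prev = head->prev;
 *		head->prev->next = np;
 *		head->prev = np;
 *		if (cheap_to_reuse)
 *			*headp = np;		// front: first to be reclaimed
 *	}
 */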
/*
 * Remove an rnode from the free list.
 *
 * The caller must be holding rpfreelist_lock and the rnode
 * must be on the freelist.
 */
void
rp_rmfree(rnode_t *rp)
{
	ASSERT(MUTEX_HELD(&rpfreelist_lock));
	ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL);

	if (rp == rpfreelist) {
		rpfreelist = rp->r_freef;
		if (rp == rpfreelist)
			/* ... */
	}

	rp->r_freeb->r_freef = rp->r_freef;
	rp->r_freef->r_freeb = rp->r_freeb;

	rp->r_freef = rp->r_freeb = NULL;
}
/*
 * Put an rnode in the hash table.
 *
 * The caller must be holding the exclusive hash queue lock.
 */
void
rp_addhash(rnode_t *rp)
{
	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
	ASSERT(!(rp->r_flags & RHASHED));

	rp->r_hashf = rp->r_hashq->r_hashf;
	rp->r_hashq->r_hashf = rp;
	rp->r_hashb = (rnode_t *)rp->r_hashq;
	rp->r_hashf->r_hashb = rp;

	mutex_enter(&rp->r_statelock);
	rp->r_flags |= RHASHED;
	mutex_exit(&rp->r_statelock);
}
/*
 * Remove an rnode from the hash table.
 *
 * The caller must be holding the hash queue lock.
 */
void
rp_rmhash_locked(rnode_t *rp)
{
	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
	ASSERT(rp->r_flags & RHASHED);

	rp->r_hashb->r_hashf = rp->r_hashf;
	rp->r_hashf->r_hashb = rp->r_hashb;

	mutex_enter(&rp->r_statelock);
	rp->r_flags &= ~RHASHED;
	mutex_exit(&rp->r_statelock);
}

/*
 * Remove an rnode from the hash table.
 *
 * The caller must not be holding the hash queue lock.
 */
void
rp_rmhash(rnode_t *rp)
{
	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
	rp_rmhash_locked(rp);
	rw_exit(&rp->r_hashq->r_lock);
}
/*
 * Look up an rnode by fhandle.
 *
 * The caller must be holding the hash queue lock, either shared or exclusive.
 */
static rnode_t *
rfind(rhashq_t *rhtp, nfs_fhandle *fh, struct vfs *vfsp)
{
	/* ... */
	ASSERT(RW_LOCK_HELD(&rhtp->r_lock));

	for (rp = rhtp->r_hashf; rp != (rnode_t *)rhtp; rp = rp->r_hashf) {
		/* ... */
		if (vp->v_vfsp == vfsp &&
		    rp->r_fh.fh_len == fh->fh_len &&
		    bcmp(rp->r_fh.fh_buf, fh->fh_buf, fh->fh_len) == 0) {
			/*
			 * remove rnode from free list, if necessary.
			 */
			if (rp->r_freef != NULL) {
				mutex_enter(&rpfreelist_lock);
				/*
				 * If the rnode is on the freelist,
				 * then remove it and use that reference
				 * as the new reference.  Otherwise,
				 * need to increment the reference count.
				 */
				if (rp->r_freef != NULL) {
					/* ... */
					mutex_exit(&rpfreelist_lock);
				} else {
					mutex_exit(&rpfreelist_lock);
					/* ... */
				}
			}
			/* ... */
		}
	}
	/* ... */
}
/*
 * Return 1 if there is an active vnode belonging to this vfs in the
 * rtable cache.
 *
 * Several of these checks are done without holding the usual
 * locks.  This is safe because destroy_rtable(), rp_addfree(),
 * etc. will redo the necessary checks before actually destroying
 * any rnodes.
 */
int
check_rtable(struct vfs *vfsp)
{
	/* ... */
	for (index = 0; index < rtablesize; index++) {
		rw_enter(&rtable[index].r_lock, RW_READER);
		for (rp = rtable[index].r_hashf;
		    rp != (rnode_t *)(&rtable[index]);
		    rp = rp->r_hashf) {
			/* ... */
			if (vp->v_vfsp == vfsp) {
				if (rp->r_freef == NULL ||
				    (vn_has_cached_data(vp) &&
				    (rp->r_flags & RDIRTY)) ||
				    rp->r_count > 0) {
					rw_exit(&rtable[index].r_lock);
					/* ... */
				}
			}
		}
		rw_exit(&rtable[index].r_lock);
	}
	/* ... */
}
/*
 * Destroy inactive vnodes from the hash queues which belong to this
 * vfs.  It is essential that we destroy all inactive vnodes during a
 * forced unmount as well as during a normal unmount.
 */
void
destroy_rtable(struct vfs *vfsp, cred_t *cr)
{
	/* ... */
	for (index = 0; index < rtablesize; index++) {
		rw_enter(&rtable[index].r_lock, RW_WRITER);
		for (rp = rtable[index].r_hashf;
		    rp != (rnode_t *)(&rtable[index]);
		    rp = r_hashf) {
			/* save the hash pointer before destroying */
			r_hashf = rp->r_hashf;
			/* ... */
			if (vp->v_vfsp == vfsp) {
				mutex_enter(&rpfreelist_lock);
				if (rp->r_freef != NULL) {
					/* ... */
					mutex_exit(&rpfreelist_lock);
					rp_rmhash_locked(rp);
					rp->r_hashf = rlist;
					/* ... */
				} else {
					mutex_exit(&rpfreelist_lock);
				}
			}
		}
		rw_exit(&rtable[index].r_lock);
	}

	for (rp = rlist; rp != NULL; rp = rlist) {
		rlist = rp->r_hashf;
		/*
		 * This call to rp_addfree will end up destroying the
		 * rnode, but in a safe way with the appropriate set
		 * of checks done.
		 */
		/* ... */
	}
}
/*
 * This routine destroys all the resources associated with the rnode
 * and then the rnode itself.
 */
static void
destroy_rnode(rnode_t *rp)
{
	/* ... */
	ASSERT(vp->v_count == 1);
	ASSERT(rp->r_count == 0);
	ASSERT(rp->r_lmpl == NULL);
	ASSERT(rp->r_mapcnt == 0);
	ASSERT(!(rp->r_flags & RHASHED));
	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
	atomic_dec_ulong((ulong_t *)&rnew);
	/* ... */
	clstat_debug.nrnode.value.ui64--;
	/* ... */
	nfs_rw_destroy(&rp->r_rwlock);
	nfs_rw_destroy(&rp->r_lkserlock);
	mutex_destroy(&rp->r_statelock);
	cv_destroy(&rp->r_cv);
	cv_destroy(&rp->r_commit.c_cv);
	if (rp->r_flags & RDELMAPLIST)
		list_destroy(&rp->r_indelmap);
	nfs_free_r_path(rp);
	avl_destroy(&rp->r_dir);
	/* ... */
	kmem_cache_free(rnode_cache, rp);
	/* ... */
}
/*
 * Flush all vnodes in this (or every) vfs.
 * Used by nfs_sync and by nfs_unmount.
 */
void
rflush(struct vfs *vfsp, cred_t *cr)
{
	/* ... */
	vnode_t *vp, **vplist;
	/* ... */

	/*
	 * Check to see whether there is anything to do.
	 */
	/* ... */

	/*
	 * Allocate a slot for all currently active rnodes on the
	 * supposition that they all may need flushing.
	 */
	vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP);
	/* ... */

	/*
	 * Walk the hash queues looking for rnodes with page
	 * lists associated with them.  Make a list of these
	 * files.
	 */
	for (index = 0; index < rtablesize; index++) {
		rw_enter(&rtable[index].r_lock, RW_READER);
		for (rp = rtable[index].r_hashf;
		    rp != (rnode_t *)(&rtable[index]);
		    rp = rp->r_hashf) {
			/* ... */
			/*
			 * Don't bother sync'ing a vp if it
			 * is part of virtual swap device or
			 * if VFS is read-only
			 */
			if (IS_SWAPVP(vp) || vn_is_readonly(vp))
				continue;
			/*
			 * If flushing all mounted file systems or
			 * the vnode belongs to this vfs, has pages
			 * and is marked as either dirty or mmap'd,
			 * hold and add this vnode to the list of
			 * vnodes to flush.
			 */
			if ((vfsp == NULL || vp->v_vfsp == vfsp) &&
			    vn_has_cached_data(vp) &&
			    ((rp->r_flags & RDIRTY) || rp->r_mapcnt > 0)) {
				/* ... */
				rw_exit(&rtable[index].r_lock);
				/* ... */
			}
		}
		rw_exit(&rtable[index].r_lock);
	}
	/* ... */

	/*
	 * Flush and release all of the files on the list.
	 */
	/* ... */
	(void) fop_putpage(vp, 0, 0, B_ASYNC, cr, NULL);
	/* ... */

	/*
	 * Free the space allocated to hold the list.
	 */
	kmem_free(vplist, num * sizeof (*vplist));
}
/*
 * This probably needs to be larger than or equal to
 * log2(sizeof (struct rnode)) due to the way that rnodes are
 * allocated.
 */
#define	ACACHE_SHIFT_BITS	9

static int
acachehash(rnode_t *rp, cred_t *cr)
{
	return ((((intptr_t)rp >> ACACHE_SHIFT_BITS) + crgetuid(cr)) &
	    acachemask);
}
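/*
 * Illustrative sketch (not part of this file): why acachehash() shifts the
 * rnode address before mixing in the uid.  rnodes come out of a kmem cache,
 * so neighbouring allocations differ mostly in bits at or above
 * log2(sizeof (struct rnode)); shifting those bits down lets adjacent
 * rnodes land in different buckets.  With hypothetical small addresses
 * 512 bytes apart and an assumed acachemask of 0xff:
 *
 *	0x4600 >> 9 = 0x23	rnode A
 *	0x4800 >> 9 = 0x24	rnode B (next slab slot)
 *
 * Without the shift, 0x4600 & 0xff and 0x4800 & 0xff are both 0x00, so the
 * two rnodes would collide in the same access cache bucket.
 */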
3117 static long nfs_access_cache_hits
= 0;
3118 static long nfs_access_cache_misses
= 0;
3122 nfs_access_check(rnode_t
*rp
, uint32_t acc
, cred_t
*cr
)
3127 nfs_access_type_t all
;
3130 if (!ATTRCACHE_VALID(vp
) || nfs_waitfor_purge_complete(vp
))
3131 return (NFS_ACCESS_UNKNOWN
);
3133 if (rp
->r_acache
!= NULL
) {
3134 hp
= &acache
[acachehash(rp
, cr
)];
3135 rw_enter(&hp
->lock
, RW_READER
);
3137 while (ap
!= (acache_t
*)hp
) {
3138 if (crcmp(ap
->cred
, cr
) == 0 && ap
->rnode
== rp
) {
3139 if ((ap
->known
& acc
) == acc
) {
3141 nfs_access_cache_hits
++;
3143 if ((ap
->allowed
& acc
) == acc
)
3144 all
= NFS_ACCESS_ALLOWED
;
3146 all
= NFS_ACCESS_DENIED
;
3149 nfs_access_cache_misses
++;
3151 all
= NFS_ACCESS_UNKNOWN
;
3162 nfs_access_cache_misses
++;
3164 return (NFS_ACCESS_UNKNOWN
);
3168 nfs_access_cache(rnode_t
*rp
, uint32_t acc
, uint32_t resacc
, cred_t
*cr
)
3174 hp
= &acache
[acachehash(rp
, cr
)];
3177 * Allocate now assuming that mostly an allocation will be
3178 * required. This allows the allocation to happen without
3179 * holding the hash bucket locked.
3181 nap
= kmem_cache_alloc(acache_cache
, KM_NOSLEEP
);
3184 nap
->allowed
= resacc
;
3191 rw_enter(&hp
->lock
, RW_WRITER
);
3193 if (rp
->r_acache
!= NULL
) {
3195 while (ap
!= (acache_t
*)hp
) {
3196 if (crcmp(ap
->cred
, cr
) == 0 && ap
->rnode
== rp
) {
3198 ap
->allowed
&= ~acc
;
3199 ap
->allowed
|= resacc
;
3203 kmem_cache_free(acache_cache
, nap
);
3213 clstat_debug
.access
.value
.ui64
++;
3215 nap
->next
= hp
->next
;
3217 nap
->next
->prev
= nap
;
3218 nap
->prev
= (acache_t
*)hp
;
3220 mutex_enter(&rp
->r_statelock
);
3221 nap
->list
= rp
->r_acache
;
3223 mutex_exit(&rp
->r_statelock
);
3230 nfs_access_purge_rp(rnode_t
*rp
)
3237 * If there aren't any cached entries, then there is nothing
3240 if (rp
->r_acache
== NULL
)
3243 mutex_enter(&rp
->r_statelock
);
3244 rplist
= rp
->r_acache
;
3245 rp
->r_acache
= NULL
;
3246 mutex_exit(&rp
->r_statelock
);
3249 * Loop through each entry in the list pointed to in the
3250 * rnode. Remove each of these entries from the hash
3251 * queue that it is on and remove it from the list in
3254 for (ap
= rplist
; ap
!= NULL
; ap
= tmpap
) {
3255 rw_enter(&ap
->hashq
->lock
, RW_WRITER
);
3256 ap
->prev
->next
= ap
->next
;
3257 ap
->next
->prev
= ap
->prev
;
3258 rw_exit(&ap
->hashq
->lock
);
3262 kmem_cache_free(acache_cache
, ap
);
3264 clstat_debug
.access
.value
.ui64
--;
3271 static const char prefix
[] = ".nfs";
3273 static kmutex_t newnum_lock
;
3278 static uint_t newnum
= 0;
3281 mutex_enter(&newnum_lock
);
3283 newnum
= gethrestime_sec() & 0xffff;
3285 mutex_exit(&newnum_lock
);
3298 news
= kmem_alloc(MAXNAMELEN
, KM_SLEEP
);
3304 *s
++ = "0123456789ABCDEF"[id
& 0x0f];
3312 * Snapshot callback for nfs:0:nfs_client as registered with the kstat
3316 cl_snapshot(kstat_t
*ksp
, void *buf
, int rw
)
3318 ksp
->ks_snaptime
= gethrtime();
3319 if (rw
== KSTAT_WRITE
) {
3320 bcopy(buf
, ksp
->ks_private
, sizeof (clstat_tmpl
));
3323 * Currently only the global zone can write to kstats, but we
3324 * add the check just for paranoia.
3326 if (INGLOBALZONE(curproc
))
3327 bcopy((char *)buf
+ sizeof (clstat_tmpl
), &clstat_debug
,
3328 sizeof (clstat_debug
));
3331 bcopy(ksp
->ks_private
, buf
, sizeof (clstat_tmpl
));
3334 * If we're displaying the "global" debug kstat values, we
3335 * display them as-is to all zones since in fact they apply to
3336 * the system as a whole.
3338 bcopy(&clstat_debug
, (char *)buf
+ sizeof (clstat_tmpl
),
3339 sizeof (clstat_debug
));
3346 clinit_zone(zoneid_t zoneid
)
3348 kstat_t
*nfs_client_kstat
;
3349 struct nfs_clnt
*nfscl
;
3352 nfscl
= kmem_alloc(sizeof (*nfscl
), KM_SLEEP
);
3353 mutex_init(&nfscl
->nfscl_chtable_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
3354 nfscl
->nfscl_chtable
= NULL
;
3355 nfscl
->nfscl_zoneid
= zoneid
;
3357 bcopy(&clstat_tmpl
, &nfscl
->nfscl_stat
, sizeof (clstat_tmpl
));
3358 ndata
= sizeof (clstat_tmpl
) / sizeof (kstat_named_t
);
3360 ndata
+= sizeof (clstat_debug
) / sizeof (kstat_named_t
);
3362 if ((nfs_client_kstat
= kstat_create_zone("nfs", 0, "nfs_client",
3363 "misc", KSTAT_TYPE_NAMED
, ndata
,
3364 KSTAT_FLAG_VIRTUAL
| KSTAT_FLAG_WRITABLE
, zoneid
)) != NULL
) {
3365 nfs_client_kstat
->ks_private
= &nfscl
->nfscl_stat
;
3366 nfs_client_kstat
->ks_snapshot
= cl_snapshot
;
3367 kstat_install(nfs_client_kstat
);
3369 mutex_enter(&nfs_clnt_list_lock
);
3370 list_insert_head(&nfs_clnt_list
, nfscl
);
3371 mutex_exit(&nfs_clnt_list_lock
);
3377 clfini_zone(zoneid_t zoneid
, void *arg
)
3379 struct nfs_clnt
*nfscl
= arg
;
3380 chhead_t
*chp
, *next
;
3384 mutex_enter(&nfs_clnt_list_lock
);
3385 list_remove(&nfs_clnt_list
, nfscl
);
3386 mutex_exit(&nfs_clnt_list_lock
);
3387 clreclaim_zone(nfscl
, 0);
3388 for (chp
= nfscl
->nfscl_chtable
; chp
!= NULL
; chp
= next
) {
3389 ASSERT(chp
->ch_list
== NULL
);
3390 kmem_free(chp
->ch_protofmly
, strlen(chp
->ch_protofmly
) + 1);
3391 next
= chp
->ch_next
;
3392 kmem_free(chp
, sizeof (*chp
));
3394 kstat_delete_byname_zone("nfs", 0, "nfs_client", zoneid
);
3395 mutex_destroy(&nfscl
->nfscl_chtable_lock
);
3396 kmem_free(nfscl
, sizeof (*nfscl
));
3400 * Called by endpnt_destructor to make sure the client handles are
3401 * cleaned up before the RPC endpoints. This becomes a no-op if
3402 * clfini_zone (above) is called first. This function is needed
3403 * (rather than relying on clfini_zone to clean up) because the ZSD
3404 * callbacks have no ordering mechanism, so we have no way to ensure
3405 * that clfini_zone is called before endpnt_destructor.
3408 clcleanup_zone(zoneid_t zoneid
)
3410 struct nfs_clnt
*nfscl
;
3412 mutex_enter(&nfs_clnt_list_lock
);
3413 nfscl
= list_head(&nfs_clnt_list
);
3414 for (; nfscl
!= NULL
; nfscl
= list_next(&nfs_clnt_list
, nfscl
)) {
3415 if (nfscl
->nfscl_zoneid
== zoneid
) {
3416 clreclaim_zone(nfscl
, 0);
3420 mutex_exit(&nfs_clnt_list_lock
);
3430 * Allocate and initialize the rnode hash queues
3434 nrnode_max
= (ulong_t
)((kmem_maxavail() >> 2) / sizeof (struct rnode
));
3435 if (nrnode
> nrnode_max
|| (nrnode
== 0 && ncsize
== 0)) {
3436 zcmn_err(GLOBAL_ZONEID
, CE_NOTE
,
3437 "!setting nrnode to max value of %ld", nrnode_max
);
3438 nrnode
= nrnode_max
;
3441 rtablesize
= 1 << highbit(nrnode
/ hashlen
);
3442 rtablemask
= rtablesize
- 1;
3443 rtable
= kmem_alloc(rtablesize
* sizeof (*rtable
), KM_SLEEP
);
3444 for (i
= 0; i
< rtablesize
; i
++) {
3445 rtable
[i
].r_hashf
= (rnode_t
*)(&rtable
[i
]);
3446 rtable
[i
].r_hashb
= (rnode_t
*)(&rtable
[i
]);
3447 rw_init(&rtable
[i
].r_lock
, NULL
, RW_DEFAULT
, NULL
);
3449 rnode_cache
= kmem_cache_create("rnode_cache", sizeof (rnode_t
),
3450 0, NULL
, NULL
, nfs_reclaim
, NULL
, NULL
, 0);
3453 * Allocate and initialize the access cache
3457 * Initial guess is one access cache entry per rnode unless
3458 * nacache is set to a non-zero value and then it is used to
3459 * indicate a guess at the number of access cache entries.
3462 acachesize
= 1 << highbit(nacache
/ hashlen
);
3464 acachesize
= rtablesize
;
3465 acachemask
= acachesize
- 1;
3466 acache
= kmem_alloc(acachesize
* sizeof (*acache
), KM_SLEEP
);
3467 for (i
= 0; i
< acachesize
; i
++) {
3468 acache
[i
].next
= (acache_t
*)&acache
[i
];
3469 acache
[i
].prev
= (acache_t
*)&acache
[i
];
3470 rw_init(&acache
[i
].lock
, NULL
, RW_DEFAULT
, NULL
);
3472 acache_cache
= kmem_cache_create("nfs_access_cache",
3473 sizeof (acache_t
), 0, NULL
, NULL
, NULL
, NULL
, NULL
, 0);
3475 * Allocate and initialize the client handle cache
3477 chtab_cache
= kmem_cache_create("client_handle_cache",
3478 sizeof (struct chtab
), 0, NULL
, NULL
, clreclaim
, NULL
, NULL
, 0);
3480 * Initialize the list of per-zone client handles (and associated data).
3481 * This needs to be done before we call zone_key_create().
3483 list_create(&nfs_clnt_list
, sizeof (struct nfs_clnt
),
3484 offsetof(struct nfs_clnt
, nfscl_node
));
3486 * Initialize the zone_key for per-zone client handle lists.
3488 zone_key_create(&nfsclnt_zone_key
, clinit_zone
, NULL
, clfini_zone
);
3490 * Initialize the various mutexes and reader/writer locks
3492 mutex_init(&rpfreelist_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
3493 mutex_init(&newnum_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
3494 mutex_init(&nfs_minor_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
3497 * Assign unique major number for all nfs mounts
3499 if ((nfs_major
= getudev()) == -1) {
3500 zcmn_err(GLOBAL_ZONEID
, CE_WARN
,
3501 "nfs: init: can't get unique device number");
3506 if (nfs3_jukebox_delay
== 0)
3507 nfs3_jukebox_delay
= NFS3_JUKEBOX_DELAY
;
3518 * Deallocate the rnode hash queues
3520 kmem_cache_destroy(rnode_cache
);
3522 for (i
= 0; i
< rtablesize
; i
++)
3523 rw_destroy(&rtable
[i
].r_lock
);
3524 kmem_free(rtable
, rtablesize
* sizeof (*rtable
));
3527 * Deallocated the access cache
3529 kmem_cache_destroy(acache_cache
);
3531 for (i
= 0; i
< acachesize
; i
++)
3532 rw_destroy(&acache
[i
].lock
);
3533 kmem_free(acache
, acachesize
* sizeof (*acache
));
3536 * Deallocate the client handle cache
3538 kmem_cache_destroy(chtab_cache
);
3541 * Destroy the various mutexes and reader/writer locks
3543 mutex_destroy(&rpfreelist_lock
);
3544 mutex_destroy(&newnum_lock
);
3545 mutex_destroy(&nfs_minor_lock
);
3546 (void) zone_key_delete(nfsclnt_zone_key
);
3555 return (NFSERR_OPNOTSUPP
);
3557 return (NFSERR_NAMETOOLONG
);
3559 return (NFSERR_NOTEMPTY
);
3561 return (NFSERR_DQUOT
);
3563 return (NFSERR_STALE
);
3565 return (NFSERR_REMOTE
);
3567 return (NFSERR_OPNOTSUPP
);
3569 return (NFSERR_INVAL
);
3571 return ((enum nfsstat
)error
);
3577 geterrno(enum nfsstat status
)
3581 case NFSERR_OPNOTSUPP
:
3582 return (EOPNOTSUPP
);
3583 case NFSERR_NAMETOOLONG
:
3584 return (ENAMETOOLONG
);
3585 case NFSERR_NOTEMPTY
:
3596 return ((int)status
);
3602 puterrno3(int error
)
3610 return (NFS3ERR_PERM
);
3612 return (NFS3ERR_NOENT
);
3614 return (NFS3ERR_IO
);
3616 return (NFS3ERR_NXIO
);
3618 return (NFS3ERR_ACCES
);
3620 return (NFS3ERR_EXIST
);
3622 return (NFS3ERR_XDEV
);
3624 return (NFS3ERR_NODEV
);
3626 return (NFS3ERR_NOTDIR
);
3628 return (NFS3ERR_ISDIR
);
3630 return (NFS3ERR_INVAL
);
3632 return (NFS3ERR_FBIG
);
3634 return (NFS3ERR_NOSPC
);
3636 return (NFS3ERR_ROFS
);
3638 return (NFS3ERR_MLINK
);
3640 return (NFS3ERR_NAMETOOLONG
);
3642 return (NFS3ERR_NOTEMPTY
);
3644 return (NFS3ERR_DQUOT
);
3646 return (NFS3ERR_STALE
);
3648 return (NFS3ERR_REMOTE
);
3651 return (NFS3ERR_NOTSUPP
);
3653 return (NFS3ERR_INVAL
);
3655 zcmn_err(getzoneid(), CE_WARN
,
3656 "puterrno3: got error %d", error
);
3657 return ((enum nfsstat3
)error
);
3662 return (NFS3ERR_NAMETOOLONG
);
3664 return (NFS3ERR_NOTEMPTY
);
3666 return (NFS3ERR_DQUOT
);
3668 return (NFS3ERR_STALE
);
3671 return (NFS3ERR_NOTSUPP
);
3673 return (NFS3ERR_REMOTE
);
3675 return (NFS3ERR_INVAL
);
3677 return ((enum nfsstat3
)error
);
3683 geterrno3(enum nfsstat3 status
)
3706 case NFS3ERR_NOTDIR
:
3720 case NFS3ERR_NAMETOOLONG
:
3721 return (ENAMETOOLONG
);
3722 case NFS3ERR_NOTEMPTY
:
3728 case NFS3ERR_REMOTE
:
3730 case NFS3ERR_BADHANDLE
:
3732 case NFS3ERR_NOT_SYNC
:
3734 case NFS3ERR_BAD_COOKIE
:
3736 case NFS3ERR_NOTSUPP
:
3737 return (EOPNOTSUPP
);
3738 case NFS3ERR_TOOSMALL
:
3740 case NFS3ERR_SERVERFAULT
:
3742 case NFS3ERR_BADTYPE
:
3744 case NFS3ERR_JUKEBOX
:
3747 zcmn_err(getzoneid(), CE_WARN
,
3748 "geterrno3: got status %d", status
);
3749 return ((int)status
);
3753 case NFS3ERR_NAMETOOLONG
:
3754 return (ENAMETOOLONG
);
3755 case NFS3ERR_NOTEMPTY
:
3760 case NFS3ERR_BADHANDLE
:
3762 case NFS3ERR_NOTSUPP
:
3763 return (EOPNOTSUPP
);
3764 case NFS3ERR_REMOTE
:
3766 case NFS3ERR_NOT_SYNC
:
3767 case NFS3ERR_TOOSMALL
:
3768 case NFS3ERR_BADTYPE
:
3770 case NFS3ERR_BAD_COOKIE
:
3772 case NFS3ERR_SERVERFAULT
:
3774 case NFS3ERR_JUKEBOX
:
3777 return ((int)status
);
rddir_cache *
rddir_cache_alloc(int flags)
{
	/* ... */
	rc = kmem_alloc(sizeof (*rc), flags);
	/* ... */
	cv_init(&rc->cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&rc->lock, NULL, MUTEX_DEFAULT, NULL);
	/* ... */
	atomic_inc_64(&clstat_debug.dirent.value.ui64);
	/* ... */
}

static void
rddir_cache_free(rddir_cache *rc)
{
	/* ... */
	atomic_dec_64(&clstat_debug.dirent.value.ui64);
	/* ... */
	if (rc->entries != NULL) {
#ifdef DEBUG
		rddir_cache_buf_free(rc->entries, rc->buflen);
#else
		kmem_free(rc->entries, rc->buflen);
#endif
	}
	cv_destroy(&rc->cv);
	mutex_destroy(&rc->lock);
	kmem_free(rc, sizeof (*rc));
}
void
rddir_cache_hold(rddir_cache *rc)
{
	mutex_enter(&rc->lock);
	rc->count++;
	mutex_exit(&rc->lock);
}

void
rddir_cache_rele(rddir_cache *rc)
{
	mutex_enter(&rc->lock);
	ASSERT(rc->count > 0);
	if (--rc->count == 0) {
		mutex_exit(&rc->lock);
		rddir_cache_free(rc);
	} else
		mutex_exit(&rc->lock);
}
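/*
 * Illustrative sketch (not part of this file): the reference counting
 * protocol for rddir_cache entries.  The producer creates an entry with one
 * reference, every additional user takes a hold, and whoever drops the last
 * reference tears the entry down.  The function below is hypothetical;
 * rddir_cache_alloc()/hold()/rele() are the real interfaces.
 *
 *	static void
 *	example_use_rddir_entry(rddir_cache *rdc)
 *	{
 *		rddir_cache_hold(rdc);		// take our own reference
 *		// ... read rdc->entries / rdc->buflen here ...
 *		rddir_cache_rele(rdc);		// may free rdc if last ref
 *	}
 */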
3844 rddir_cache_buf_alloc(size_t size
, int flags
)
3848 rc
= kmem_alloc(size
, flags
);
3850 atomic_add_64(&clstat_debug
.dirents
.value
.ui64
, size
);
3855 rddir_cache_buf_free(void *addr
, size_t size
)
3858 atomic_add_64(&clstat_debug
.dirents
.value
.ui64
, -(int64_t)size
);
3859 kmem_free(addr
, size
);
3864 nfs_free_data_reclaim(rnode_t
*rp
)
3869 nfs3_pathconf_info
*info
;
3874 * Free any held credentials and caches which
3875 * may be associated with this rnode.
3877 mutex_enter(&rp
->r_statelock
);
3880 contents
= rp
->r_symlink
.contents
;
3881 size
= rp
->r_symlink
.size
;
3882 rp
->r_symlink
.contents
= NULL
;
3883 vsp
= rp
->r_secattr
;
3884 rp
->r_secattr
= NULL
;
3885 info
= rp
->r_pathconf
;
3886 rp
->r_pathconf
= NULL
;
3887 mutex_exit(&rp
->r_statelock
);
3893 * Free the access cache entries.
3895 freed
= nfs_access_purge_rp(rp
);
3897 if (!HAVE_RDDIR_CACHE(rp
) &&
3904 * Free the readdir cache entries
3906 if (HAVE_RDDIR_CACHE(rp
))
3907 nfs_purge_rddir_cache(RTOV(rp
));
3910 * Free the symbolic link cache.
3912 if (contents
!= NULL
) {
3914 kmem_free((void *)contents
, size
);
3918 * Free any cached ACL.
3924 * Free any cached pathconf information.
3927 kmem_free(info
, sizeof (*info
));
3933 nfs_active_data_reclaim(rnode_t
*rp
)
3938 nfs3_pathconf_info
*info
;
3942 * Free any held credentials and caches which
3943 * may be associated with this rnode.
3945 if (!mutex_tryenter(&rp
->r_statelock
))
3947 contents
= rp
->r_symlink
.contents
;
3948 size
= rp
->r_symlink
.size
;
3949 rp
->r_symlink
.contents
= NULL
;
3950 vsp
= rp
->r_secattr
;
3951 rp
->r_secattr
= NULL
;
3952 info
= rp
->r_pathconf
;
3953 rp
->r_pathconf
= NULL
;
3954 mutex_exit(&rp
->r_statelock
);
3957 * Free the access cache entries.
3959 freed
= nfs_access_purge_rp(rp
);
3961 if (!HAVE_RDDIR_CACHE(rp
) &&
3968 * Free the readdir cache entries
3970 if (HAVE_RDDIR_CACHE(rp
))
3971 nfs_purge_rddir_cache(RTOV(rp
));
3974 * Free the symbolic link cache.
3976 if (contents
!= NULL
) {
3978 kmem_free((void *)contents
, size
);
3982 * Free any cached ACL.
3988 * Free any cached pathconf information.
3991 kmem_free(info
, sizeof (*info
));
3997 nfs_free_reclaim(void)
4003 clstat_debug
.f_reclaim
.value
.ui64
++;
4006 mutex_enter(&rpfreelist_lock
);
4010 if (nfs_free_data_reclaim(rp
))
4012 } while ((rp
= rp
->r_freef
) != rpfreelist
);
4014 mutex_exit(&rpfreelist_lock
);
4019 nfs_active_reclaim(void)
4026 clstat_debug
.a_reclaim
.value
.ui64
++;
4029 for (index
= 0; index
< rtablesize
; index
++) {
4030 rw_enter(&rtable
[index
].r_lock
, RW_READER
);
4031 for (rp
= rtable
[index
].r_hashf
;
4032 rp
!= (rnode_t
*)(&rtable
[index
]);
4034 if (nfs_active_data_reclaim(rp
))
4037 rw_exit(&rtable
[index
].r_lock
);
4043 nfs_rnode_reclaim(void)
4050 clstat_debug
.r_reclaim
.value
.ui64
++;
4053 mutex_enter(&rpfreelist_lock
);
4054 while ((rp
= rpfreelist
) != NULL
) {
4056 mutex_exit(&rpfreelist_lock
);
4057 if (rp
->r_flags
& RHASHED
) {
4059 rw_enter(&rp
->r_hashq
->r_lock
, RW_WRITER
);
4060 mutex_enter(&vp
->v_lock
);
4061 if (vp
->v_count
> 1) {
4063 mutex_exit(&vp
->v_lock
);
4064 rw_exit(&rp
->r_hashq
->r_lock
);
4065 mutex_enter(&rpfreelist_lock
);
4068 mutex_exit(&vp
->v_lock
);
4069 rp_rmhash_locked(rp
);
4070 rw_exit(&rp
->r_hashq
->r_lock
);
4073 * This call to rp_addfree will end up destroying the
4074 * rnode, but in a safe way with the appropriate set
4077 rp_addfree(rp
, CRED());
4078 mutex_enter(&rpfreelist_lock
);
4080 mutex_exit(&rpfreelist_lock
);
4086 nfs_reclaim(void *cdrarg
)
4090 clstat_debug
.reclaim
.value
.ui64
++;
4092 if (nfs_free_reclaim())
4095 if (nfs_active_reclaim())
4098 (void) nfs_rnode_reclaim();
4102 * NFS client failover support
4104 * Routines to copy filehandles
4107 nfscopyfh(caddr_t fhp
, vnode_t
*vp
)
4109 fhandle_t
*dest
= (fhandle_t
*)fhp
;
4116 nfs3copyfh(caddr_t fhp
, vnode_t
*vp
)
4118 nfs_fh3
*dest
= (nfs_fh3
*)fhp
;
4121 *dest
= *VTOFH3(vp
);
4125 * NFS client failover support
4127 * failover_safe() will test various conditions to ensure that
4128 * failover is permitted for this vnode. It will be denied
4130 * 1) the operation in progress does not support failover (NULL fi)
4131 * 2) there are no available replicas (NULL mi_servers->sv_next)
4132 * 3) any locks are outstanding on this file
4135 failover_safe(failinfo_t
*fi
)
4139 * Does this op permit failover?
4141 if (fi
== NULL
|| fi
->vp
== NULL
)
4145 * Are there any alternates to failover to?
4147 if (VTOMI(fi
->vp
)->mi_servers
->sv_next
== NULL
)
4151 * Disable check; we've forced local locking
4153 * if (flk_has_remote_locks(fi->vp))
4158 * If we have no partial path, we can't do anything
4160 if (VTOR(fi
->vp
)->r_path
== NULL
)
4166 #include <sys/thread.h>
4169 * NFS client failover support
4171 * failover_newserver() will start a search for a new server,
4172 * preferably by starting an async thread to do the work. If
4173 * someone is already doing this (recognizable by MI_BINDINPROG
4174 * being set), it will simply return and the calling thread
4175 * will queue on the mi_failover_cv condition variable.
4178 failover_newserver(mntinfo_t
*mi
)
4181 * Check if someone else is doing this already
4183 mutex_enter(&mi
->mi_lock
);
4184 if (mi
->mi_flags
& MI_BINDINPROG
) {
4185 mutex_exit(&mi
->mi_lock
);
4188 mi
->mi_flags
|= MI_BINDINPROG
;
4191 * Need to hold the vfs struct so that it can't be released
4192 * while the failover thread is selecting a new server.
4194 VFS_HOLD(mi
->mi_vfsp
);
4197 * Start a thread to do the real searching.
4199 (void) zthread_create(NULL
, 0, failover_thread
, mi
, 0, minclsyspri
);
4201 mutex_exit(&mi
->mi_lock
);
4205 * NFS client failover support
4207 * failover_thread() will find a new server to replace the one
4208 * currently in use, wake up other threads waiting on this mount
4209 * point, and die. It will start at the head of the server list
4210 * and poll servers until it finds one with an NFS server which is
4211 * registered and responds to a NULL procedure ping.
4213 * XXX failover_thread is unsafe within the scope of the
4214 * present model defined for cpr to suspend the system.
4215 * Specifically, over-the-wire calls made by the thread
4216 * are unsafe. The thread needs to be reevaluated in case of
4217 * future updates to the cpr suspend model.
4220 failover_thread(mntinfo_t
*mi
)
4222 servinfo_t
*svp
= NULL
;
4224 enum clnt_stat status
;
4228 callb_cpr_t cprinfo
;
4232 size_t srvnames_len
;
4233 struct nfs_clnt
*nfscl
= NULL
;
4234 zoneid_t zoneid
= getzoneid();
4238 * This is currently only needed to access counters which exist on
4239 * DEBUG kernels, hence we don't want to pay the penalty of the lookup
4240 * on non-DEBUG kernels.
4242 nfscl
= zone_getspecific(nfsclnt_zone_key
, nfs_zone());
4243 ASSERT(nfscl
!= NULL
);
4247 * Its safe to piggyback on the mi_lock since failover_newserver()
4248 * code guarantees that there will be only one failover thread
4249 * per mountinfo at any instance.
4251 CALLB_CPR_INIT(&cprinfo
, &mi
->mi_lock
, callb_generic_cpr
,
4254 mutex_enter(&mi
->mi_lock
);
4255 while (mi
->mi_readers
) {
4256 CALLB_CPR_SAFE_BEGIN(&cprinfo
);
4257 cv_wait(&mi
->mi_failover_cv
, &mi
->mi_lock
);
4258 CALLB_CPR_SAFE_END(&cprinfo
, &mi
->mi_lock
);
4260 mutex_exit(&mi
->mi_lock
);
4266 * Ping the null NFS procedure of every server in
4267 * the list until one responds. We always start
4268 * at the head of the list and always skip the one
4269 * that is current, since it's caused us a problem.
4271 while (svp
== NULL
) {
4272 for (svp
= mi
->mi_servers
; svp
; svp
= svp
->sv_next
) {
4273 if (!oncethru
&& svp
== mi
->mi_curr_serv
)
4277 * If the file system was forcibly umounted
4278 * while trying to do a failover, then just
4279 * give up on the failover. It won't matter
4280 * what the server is.
4282 if (FS_OR_ZONE_GONE(mi
->mi_vfsp
)) {
4287 error
= clnt_tli_kcreate(svp
->sv_knconf
, &svp
->sv_addr
,
4288 NFS_PROGRAM
, NFS_VERSION
, 0, 1, CRED(), &cl
);
4292 if (!(mi
->mi_flags
& MI_INT
))
4293 cl
->cl_nosignal
= TRUE
;
4294 status
= CLNT_CALL(cl
, RFS_NULL
, xdr_void
, NULL
,
4295 xdr_void
, NULL
, tv
);
4296 if (!(mi
->mi_flags
& MI_INT
))
4297 cl
->cl_nosignal
= FALSE
;
4298 AUTH_DESTROY(cl
->cl_auth
);
4300 if (status
== RPC_SUCCESS
) {
4301 if (svp
== mi
->mi_curr_serv
) {
4303 zcmn_err(zoneid
, CE_NOTE
,
4304 "NFS%d: failing over: selecting original server %s",
4305 mi
->mi_vers
, svp
->sv_hostname
);
4307 zcmn_err(zoneid
, CE_NOTE
,
4308 "NFS: failing over: selecting original server %s",
4313 zcmn_err(zoneid
, CE_NOTE
,
4314 "NFS%d: failing over from %s to %s",
4316 mi
->mi_curr_serv
->sv_hostname
,
4319 zcmn_err(zoneid
, CE_NOTE
,
4320 "NFS: failing over from %s to %s",
4321 mi
->mi_curr_serv
->sv_hostname
,
4331 srvnames
= nfs_getsrvnames(mi
, &srvnames_len
);
4334 "NFS%d servers %s not responding "
4335 "still trying\n", mi
->mi_vers
, srvnames
);
4337 zprintf(zoneid
, "NFS servers %s not responding "
4338 "still trying\n", srvnames
);
4342 mutex_enter(&mi
->mi_lock
);
4343 CALLB_CPR_SAFE_BEGIN(&cprinfo
);
4344 mutex_exit(&mi
->mi_lock
);
4346 mutex_enter(&mi
->mi_lock
);
4347 CALLB_CPR_SAFE_END(&cprinfo
, &mi
->mi_lock
);
4348 mutex_exit(&mi
->mi_lock
);
4354 zprintf(zoneid
, "NFS%d servers %s ok\n", mi
->mi_vers
, srvnames
);
4356 zprintf(zoneid
, "NFS servers %s ok\n", srvnames
);
4360 if (svp
!= mi
->mi_curr_serv
) {
4361 (void) dnlc_purge_vfsp(mi
->mi_vfsp
, 0);
4362 index
= rtablehash(&mi
->mi_curr_serv
->sv_fhandle
);
4363 rw_enter(&rtable
[index
].r_lock
, RW_WRITER
);
4364 rp
= rfind(&rtable
[index
], &mi
->mi_curr_serv
->sv_fhandle
,
4367 if (rp
->r_flags
& RHASHED
)
4368 rp_rmhash_locked(rp
);
4369 rw_exit(&rtable
[index
].r_lock
);
4371 rp
->r_fh
= svp
->sv_fhandle
;
4372 (void) nfs_free_data_reclaim(rp
);
4373 index
= rtablehash(&rp
->r_fh
);
4374 rp
->r_hashq
= &rtable
[index
];
4375 rw_enter(&rp
->r_hashq
->r_lock
, RW_WRITER
);
4376 vn_exists(RTOV(rp
));
4378 rw_exit(&rp
->r_hashq
->r_lock
);
4381 rw_exit(&rtable
[index
].r_lock
);
4386 kmem_free(srvnames
, srvnames_len
);
4387 mutex_enter(&mi
->mi_lock
);
4388 mi
->mi_flags
&= ~MI_BINDINPROG
;
4390 mi
->mi_curr_serv
= svp
;
4393 nfscl
->nfscl_stat
.failover
.value
.ui64
++;
4396 cv_broadcast(&mi
->mi_failover_cv
);
4397 CALLB_CPR_EXIT(&cprinfo
);
4398 VFS_RELE(mi
->mi_vfsp
);
4404 * NFS client failover support
4406 * failover_wait() will put the thread to sleep until MI_BINDINPROG
4407 * is cleared, meaning that failover is complete. Called with
4408 * mi_lock mutex held.
4411 failover_wait(mntinfo_t
*mi
)
4416 * If someone else is hunting for a living server,
4417 * sleep until it's done. After our sleep, we may
4418 * be bound to the right server and get off cheaply.
4420 while (mi
->mi_flags
& MI_BINDINPROG
) {
4422 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
4423 * and SIGTERM. (Preserving the existing masks).
4424 * Mask out SIGINT if mount option nointr is specified.
4426 sigintr(&smask
, (int)mi
->mi_flags
& MI_INT
);
4427 if (!cv_wait_sig(&mi
->mi_failover_cv
, &mi
->mi_lock
)) {
4429 * restore original signal mask
4435 * restore original signal mask
4443 * NFS client failover support
4445 * failover_remap() will do a partial pathname lookup and find the
4446 * desired vnode on the current server. The interim vnode will be
4447 * discarded after we pilfer the new filehandle.
4450 * - This routine will also update the filehandle in the args structure
4451 * pointed to by the fi->fhp pointer if it is non-NULL.
4455 failover_remap(failinfo_t
*fi
)
4457 vnode_t
*vp
, *nvp
, *rootvp
;
4462 struct nfs_clnt
*nfscl
;
4464 nfscl
= zone_getspecific(nfsclnt_zone_key
, nfs_zone());
4465 ASSERT(nfscl
!= NULL
);
4470 if (fi
== NULL
|| fi
->vp
== NULL
|| fi
->lookupproc
== NULL
)
4476 if (!(vp
->v_flag
& VROOT
)) {
4478 * Given the root fh, use the path stored in
4479 * the rnode to find the fh for the new server.
4481 error
= VFS_ROOT(mi
->mi_vfsp
, &rootvp
);
4485 error
= failover_lookup(rp
->r_path
, rootvp
,
4486 fi
->lookupproc
, fi
->xattrdirproc
, &nvp
);
4494 * If we found the same rnode, we're done now
4498 * Failed and the new server may physically be same
4499 * OR may share a same disk subsystem. In this case
4500 * file handle for a particular file path is not going
4501 * to change, given the same filehandle lookup will
4502 * always locate the same rnode as the existing one.
4503 * All we might need to do is to update the r_server
4504 * with the current servinfo.
4506 if (!VALID_FH(fi
)) {
4507 rp
->r_server
= mi
->mi_curr_serv
;
4514 * Try to make it so that no one else will find this
4515 * vnode because it is just a temporary to hold the
4516 * new file handle until that file handle can be
4517 * copied to the original vnode/rnode.
4520 mutex_enter(&mi
->mi_remap_lock
);
4522 * Some other thread could have raced in here and could
4523 * have done the remap for this particular rnode before
4524 * this thread here. Check for rp->r_server and
4525 * mi->mi_curr_serv and return if they are same.
4528 mutex_exit(&mi
->mi_remap_lock
);
4533 if (nrp
->r_flags
& RHASHED
)
4537 * As a heuristic check on the validity of the new
4538 * file, check that the size and type match against
4539 * that we remember from the old version.
4541 if (rp
->r_size
!= nrp
->r_size
|| vp
->v_type
!= nvp
->v_type
) {
4542 mutex_exit(&mi
->mi_remap_lock
);
4543 zcmn_err(mi
->mi_zone
->zone_id
, CE_WARN
,
4544 "NFS replicas %s and %s: file %s not same.",
4545 rp
->r_server
->sv_hostname
,
4546 nrp
->r_server
->sv_hostname
, rp
->r_path
);
4552 * snarf the filehandle from the new rnode
4553 * then release it, again while updating the
4554 * hash queues for the rnode.
4556 if (rp
->r_flags
& RHASHED
)
4558 rp
->r_server
= mi
->mi_curr_serv
;
4559 rp
->r_fh
= nrp
->r_fh
;
4560 rp
->r_hashq
= nrp
->r_hashq
;
4562 * Copy the attributes from the new rnode to the old
4563 * rnode. This will help to reduce unnecessary page
4566 rp
->r_attr
= nrp
->r_attr
;
4567 rp
->r_attrtime
= nrp
->r_attrtime
;
4568 rp
->r_mtime
= nrp
->r_mtime
;
4569 (void) nfs_free_data_reclaim(rp
);
4570 nfs_setswaplike(vp
, &rp
->r_attr
);
4571 rw_enter(&rp
->r_hashq
->r_lock
, RW_WRITER
);
4573 rw_exit(&rp
->r_hashq
->r_lock
);
4574 mutex_exit(&mi
->mi_remap_lock
);
4579 * Update successful failover remap count
4581 mutex_enter(&mi
->mi_lock
);
4583 mutex_exit(&mi
->mi_lock
);
4585 nfscl
->nfscl_stat
.remap
.value
.ui64
++;
4589 * If we have a copied filehandle to update, do it now.
4591 if (fi
->fhp
!= NULL
&& fi
->copyproc
!= NULL
)
4592 (*fi
->copyproc
)(fi
->fhp
, vp
);
/*
 * NFS client failover support
 *
 * We want a simple pathname lookup routine to parse the pieces
 * of path in rp->r_path.  We know that the path was created
 * as rnodes were made, so we know we have only to deal with
 * paths that look like:
 *	dir1/dir2/dir3/file
 * Any evidence of anything like .., symlinks, and ENOTDIR
 * are hard errors, because they mean something in this filesystem
 * is different from the one we came from, or has changed under
 * us in some way.  If this is true, we want the failure.
 *
 * Extended attributes: if the filesystem is mounted with extended
 * attributes enabled (-o xattr), the attribute directory will be
 * represented in the r_path as the magic name XATTR_RPATH.  So if
 * we see that name in the pathname, it must be because this node
 * is an extended attribute.  Therefore, look it up that way.
 */
static int
failover_lookup(char *path, vnode_t *root,
    int (*lookupproc)(vnode_t *, char *, vnode_t **, struct pathname *, int,
    vnode_t *, cred_t *, int),
    int (*xattrdirproc)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
    vnode_t **new)
{
	/* ... */
	char *s, *p, *tmppath;
	/* ... */

	/* Make local copy of path */
	len = strlen(path) + 1;
	tmppath = kmem_alloc(len, KM_SLEEP);
	(void) strcpy(tmppath, path);
	/* ... */
	xattr = mi->mi_flags & MI_EXTATTR;
	/* ... */
	do {
		/* ... */
		if (xattr && strcmp(s, XATTR_RPATH) == 0) {
			error = (*xattrdirproc)(dvp, &nvp, FALSE, CRED(),
			    RFSCALL_SOFT);
		} else {
			error = (*lookupproc)(dvp, s, &nvp, NULL, 0, NULL,
			    CRED(), RFSCALL_SOFT);
		}
		/* ... */
		kmem_free(tmppath, len);
		/* ... */
	} while (p != NULL);

	if (nvp != NULL && new != NULL)
		/* ... */
	kmem_free(tmppath, len);
	/* ... */
}
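/*
 * Illustrative sketch (not part of this file): the elided loop body above
 * walks tmppath one component at a time, which for paths of the shape
 * "dir1/dir2/dir3/file" can be done by cutting the copy at each '/'.  The
 * helper below is hypothetical and userland-flavoured; it shows only the
 * tokenising idea, not the per-component over-the-wire lookup.
 *
 *	#include <stdio.h>
 *	#include <string.h>
 *
 *	static void
 *	example_walk_components(char *tmppath)
 *	{
 *		char *s = tmppath;
 *		char *p;
 *
 *		do {
 *			p = strchr(s, '/');
 *			if (p != NULL)
 *				*p = '\0';	// terminate this component
 *			(void) printf("lookup component: %s\n", s);
 *			if (p != NULL)
 *				s = p + 1;	// advance past the slash
 *		} while (p != NULL);
 *	}
 */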
4671 * NFS client failover support
4673 * sv_free() frees the malloc'd portion of a "servinfo_t".
4676 sv_free(servinfo_t
*svp
)
4679 struct knetconfig
*knconf
;
4681 while (svp
!= NULL
) {
4682 next
= svp
->sv_next
;
4683 if (svp
->sv_secdata
)
4684 sec_clnt_freeinfo(svp
->sv_secdata
);
4685 if (svp
->sv_hostname
&& svp
->sv_hostnamelen
> 0)
4686 kmem_free(svp
->sv_hostname
, svp
->sv_hostnamelen
);
4687 knconf
= svp
->sv_knconf
;
4688 if (knconf
!= NULL
) {
4689 if (knconf
->knc_protofmly
!= NULL
)
4690 kmem_free(knconf
->knc_protofmly
, KNC_STRSIZE
);
4691 if (knconf
->knc_proto
!= NULL
)
4692 kmem_free(knconf
->knc_proto
, KNC_STRSIZE
);
4693 kmem_free(knconf
, sizeof (*knconf
));
4695 knconf
= svp
->sv_origknconf
;
4696 if (knconf
!= NULL
) {
4697 if (knconf
->knc_protofmly
!= NULL
)
4698 kmem_free(knconf
->knc_protofmly
, KNC_STRSIZE
);
4699 if (knconf
->knc_proto
!= NULL
)
4700 kmem_free(knconf
->knc_proto
, KNC_STRSIZE
);
4701 kmem_free(knconf
, sizeof (*knconf
));
4703 if (svp
->sv_addr
.buf
!= NULL
&& svp
->sv_addr
.maxlen
!= 0)
4704 kmem_free(svp
->sv_addr
.buf
, svp
->sv_addr
.maxlen
);
4705 mutex_destroy(&svp
->sv_lock
);
4706 kmem_free(svp
, sizeof (*svp
));
4712 * Only can return non-zero if intr != 0.
4715 nfs_rw_enter_sig(nfs_rwlock_t
*l
, krw_t rw
, int intr
)
4718 mutex_enter(&l
->lock
);
4721 * If this is a nested enter, then allow it. There
4722 * must be as many exits as enters through.
4724 if (l
->owner
== curthread
) {
4725 /* lock is held for writing by current thread */
4726 ASSERT(rw
== RW_READER
|| rw
== RW_WRITER
);
4728 } else if (rw
== RW_READER
) {
4730 * While there is a writer active or writers waiting,
4731 * then wait for them to finish up and move on. Then,
4732 * increment the count to indicate that a reader is
4735 while (l
->count
< 0 || l
->waiters
> 0) {
4737 klwp_t
*lwp
= ttolwp(curthread
);
4741 if (cv_wait_sig(&l
->cv_rd
, &l
->lock
) == 0) {
4744 mutex_exit(&l
->lock
);
4750 cv_wait(&l
->cv_rd
, &l
->lock
);
4752 ASSERT(l
->count
< INT_MAX
);
4754 if ((l
->count
% 10000) == 9999)
4755 cmn_err(CE_WARN
, "nfs_rw_enter_sig: count %d on"
4756 "rwlock @ %p\n", l
->count
, (void *)&l
);
4760 ASSERT(rw
== RW_WRITER
);
4762 * While there are readers active or a writer
4763 * active, then wait for all of the readers
4764 * to finish or for the writer to finish.
4765 * Then, set the owner field to curthread and
4766 * decrement count to indicate that a writer
4769 while (l
->count
!= 0) {
4772 klwp_t
*lwp
= ttolwp(curthread
);
4776 if (cv_wait_sig(&l
->cv
, &l
->lock
) == 0) {
4781 * If there are readers active and no
4782 * writers waiting then wake up all of
4783 * the waiting readers (if any).
4785 if (l
->count
> 0 && l
->waiters
== 0)
4786 cv_broadcast(&l
->cv_rd
);
4787 mutex_exit(&l
->lock
);
4793 cv_wait(&l
->cv
, &l
->lock
);
4796 ASSERT(l
->owner
== NULL
);
4797 l
->owner
= curthread
;
4801 mutex_exit(&l
->lock
);
4807 * If the lock is available, obtain it and return non-zero. If there is
4808 * already a conflicting lock, return 0 immediately.
4812 nfs_rw_tryenter(nfs_rwlock_t
*l
, krw_t rw
)
4814 mutex_enter(&l
->lock
);
4817 * If this is a nested enter, then allow it. There
4818 * must be as many exits as enters through.
4820 if (l
->owner
== curthread
) {
4821 /* lock is held for writing by current thread */
4822 ASSERT(rw
== RW_READER
|| rw
== RW_WRITER
);
4824 } else if (rw
== RW_READER
) {
4826 * If there is a writer active or writers waiting, deny the
4827 * lock. Otherwise, bump the count of readers.
4829 if (l
->count
< 0 || l
->waiters
> 0) {
4830 mutex_exit(&l
->lock
);
4835 ASSERT(rw
== RW_WRITER
);
4837 * If there are readers active or a writer active, deny the
4838 * lock. Otherwise, set the owner field to curthread and
4839 * decrement count to indicate that a writer is active.
4841 if (l
->count
!= 0) {
4842 mutex_exit(&l
->lock
);
4845 ASSERT(l
->owner
== NULL
);
4846 l
->owner
= curthread
;
4850 mutex_exit(&l
->lock
);
void
nfs_rw_exit(nfs_rwlock_t *l)
{
	mutex_enter(&l->lock);

	if (l->owner != NULL) {
		ASSERT(l->owner == curthread);

		/*
		 * To release a writer lock increment count to indicate that
		 * there is one less writer active.  If this was the last of
		 * possibly nested writer locks, then clear the owner field as
		 * well to indicate that there is no writer active.
		 */
		ASSERT(l->count < 0);
		l->count++;
		if (l->count == 0) {
			l->owner = NULL;

			/*
			 * If there are no writers waiting then wakeup all of
			 * the waiting readers (if any).
			 */
			if (l->waiters == 0)
				cv_broadcast(&l->cv_rd);
		}
	} else {
		/*
		 * To release a reader lock just decrement count to indicate
		 * that there is one less reader active.
		 */
		ASSERT(l->count > 0);
		l->count--;
	}

	/*
	 * If there are no readers active nor a writer active and there is a
	 * writer waiting, we need to wake it up.
	 */
	if (l->count == 0 && l->waiters > 0)
		cv_signal(&l->cv);
	mutex_exit(&l->lock);
}

int
nfs_rw_lock_held(nfs_rwlock_t *l, krw_t rw)
{
	if (rw == RW_READER)
		return (l->count > 0);
	ASSERT(rw == RW_WRITER);
	return (l->count < 0);
}
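/*
 * Illustrative sketch (not part of this file): the nfs_rwlock_t count
 * convention used above, shown as a typical caller would exercise it.  The
 * r_rwlock field and the nfs_rw_* interfaces are real; the surrounding
 * function is hypothetical.  count > 0 means that many readers are active,
 * count < 0 means a writer holds the lock (nested writer entries by the
 * same thread push the count further negative), and owner records the
 * writing thread.
 *
 *	static void
 *	example_read_then_write(rnode_t *rp)
 *	{
 *		// Reader side: count goes positive.
 *		(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, 0);
 *		nfs_rw_exit(&rp->r_rwlock);
 *
 *		// Writer side: count goes negative, owner == curthread.
 *		(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, 0);
 *		ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER));
 *		nfs_rw_exit(&rp->r_rwlock);
 *	}
 */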
void
nfs_rw_init(nfs_rwlock_t *l, char *name, krw_type_t type, void *arg)
{
	/* ... */
	mutex_init(&l->lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&l->cv, NULL, CV_DEFAULT, NULL);
	cv_init(&l->cv_rd, NULL, CV_DEFAULT, NULL);
}

void
nfs_rw_destroy(nfs_rwlock_t *l)
{
	mutex_destroy(&l->lock);
	cv_destroy(&l->cv);
	cv_destroy(&l->cv_rd);
}
int
nfs3_rddir_compar(const void *x, const void *y)
{
	rddir_cache *a = (rddir_cache *)x;
	rddir_cache *b = (rddir_cache *)y;

	if (a->nfs3_cookie == b->nfs3_cookie) {
		if (a->buflen == b->buflen)
			return (0);
		if (a->buflen < b->buflen)
			return (-1);
		return (1);
	}

	if (a->nfs3_cookie < b->nfs3_cookie)
		return (-1);

	return (1);
}

int
nfs_rddir_compar(const void *x, const void *y)
{
	rddir_cache *a = (rddir_cache *)x;
	rddir_cache *b = (rddir_cache *)y;

	if (a->nfs_cookie == b->nfs_cookie) {
		if (a->buflen == b->buflen)
			return (0);
		if (a->buflen < b->buflen)
			return (-1);
		return (1);
	}

	if (a->nfs_cookie < b->nfs_cookie)
		return (-1);

	return (1);
}
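/*
 * Illustrative sketch (not part of this file): how the comparators above are
 * used.  make_rnode() creates the per-rnode r_dir AVL tree with one of these
 * comparators, so entries are ordered by (cookie, buflen) and a lookup only
 * needs a template entry carrying those two fields.  The function below is
 * hypothetical and its parameter types are assumptions; avl_find() and the
 * r_dir tree are real.
 *
 *	static rddir_cache *
 *	example_find_rddir3(rnode_t *rp, uint64_t cookie, int buflen)
 *	{
 *		rddir_cache key;
 *
 *		key.nfs3_cookie = cookie;
 *		key.buflen = buflen;
 *		// The real callers hold r_statelock while touching r_dir.
 *		return (avl_find(&rp->r_dir, &key, NULL));
 *	}
 */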
static char *
nfs_getsrvnames(mntinfo_t *mi, size_t *len)
{
	/* ... */

	/*
	 * Calculate the length of the string required to hold all
	 * of the server names plus either a comma or a null
	 * character following each individual one.
	 */
	for (s = mi->mi_servers; s != NULL; s = s->sv_next)
		length += s->sv_hostnamelen;

	srvnames = kmem_alloc(length, KM_SLEEP);
	/* ... */
	for (s = mi->mi_servers; s != NULL; s = s->sv_next) {
		(void) strcpy(namep, s->sv_hostname);
		namep += s->sv_hostnamelen - 1;
		/* ... */
	}
	/* ... */
}
5005 * These two functions are temporary and designed for the upgrade-workaround
5006 * only. They cannot be used for general zone-crossing NFS client support, and
5007 * will be removed shortly.
5009 * When the workaround is enabled, all NFS traffic is forced into the global
5010 * zone. These functions are called when the code needs to refer to the state
5011 * of the underlying network connection. They're not called when the function
5012 * needs to refer to the state of the process that invoked the system call.
5013 * (E.g., when checking whether the zone is shutting down during the mount()
5020 return (nfs_global_client_only
!= 0 ? global_zone
: curproc
->p_zone
);
5026 return (nfs_global_client_only
!= 0 ? GLOBAL_ZONEID
: getzoneid());
5033 mutex_enter(&curproc
->p_splock
);
5034 rv
= (curproc
->p_sessp
->s_vp
!= NULL
);
5035 mutex_exit(&curproc
->p_splock
);
/*
 * Check the xattr directory to see if it has any generic user attributes.
 */
int
do_xattr_exists_check(vnode_t *vp, ulong_t *valp, cred_t *cr)
{
	/* ... */
	struct dirent64 *dp;
	size_t dlen = 8 * 1024;
	/* ... */

	dbuf = kmem_alloc(dlen, KM_SLEEP);
	/* ... */
	uio.uio_segflg = UIO_SYSSPACE;
	/* ... */
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = 0;
	uio.uio_resid = dlen;
	iov.iov_base = dbuf;
	/* ... */
	(void) fop_rwlock(vp, V_WRITELOCK_FALSE, NULL);
	error = fop_readdir(vp, &uio, cr, &eof, NULL, 0);
	fop_rwunlock(vp, V_WRITELOCK_FALSE, NULL);

	dbuflen = dlen - uio.uio_resid;

	if (error || dbuflen == 0) {
		kmem_free(dbuf, dlen);
		/* ... */
	}
	/* ... */
	dp = (dirent64_t *)dbuf;

	while ((intptr_t)dp < (intptr_t)dbuf + dbuflen) {
		if (strcmp(dp->d_name, ".") == 0 ||
		    strcmp(dp->d_name, "..") == 0 || strcmp(dp->d_name,
		    VIEW_READWRITE) == 0 || strcmp(dp->d_name,
		    VIEW_READONLY) == 0) {
			dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen);
			/* ... */
		}
		/* ... */
	}
	/* ... */
	kmem_free(dbuf, dlen);
	/* ... */
}