/*
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
 * All rights reserved.
 *
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2017 by Delphix. All rights reserved.
 */
#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/socket.h>
#include <sys/tiuser.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/kstat.h>
#include <sys/cmn_err.h>
#include <sys/vtrace.h>
#include <sys/session.h>
#include <sys/bitmap.h>
#include <sys/pathname.h>
#include <sys/flock.h>
#include <sys/dirent.h>
#include <sys/flock.h>
#include <sys/callb.h>

#include <rpc/types.h>
#include <rpc/rpcsec_gss.h>

#include <nfs/nfs_clnt.h>
#include <nfs/nfs_acl.h>
#include <nfs/rnode4.h>
#include <nfs/nfs4_clnt.h>
/*
 * The hash queues for the access to active and cached rnodes
 * are organized as doubly linked lists.  A reader/writer lock
 * for each hash bucket is used to control access and to synchronize
 * lookups, additions, and deletions from the hash queue.
 *
 * The rnode freelist is organized as a doubly linked list with
 * a head pointer.  Additions and deletions are synchronized via
 * a single mutex.
 *
 * In order to add an rnode to the free list, it must be hashed into
 * a hash queue and the exclusive lock to the hash queue must be held.
 * If an rnode is not hashed into a hash queue, then it is destroyed
 * because it represents no valuable information that can be reused
 * about the file.  The exclusive lock to the hash queue must be
 * held in order to prevent a lookup in the hash queue from finding
 * the rnode and using it and assuming that the rnode is not on the
 * freelist.  The lookup in the hash queue will have the hash queue
 * locked, either exclusive or shared.
 *
 * The vnode reference count for each rnode is not allowed to drop
 * below 1.  This prevents external entities, such as the VM
 * subsystem, from acquiring references to vnodes already on the
 * freelist and then trying to place them back on the freelist
 * when their reference is released.  This means that when an
 * rnode is looked up in the hash queues, either the rnode is
 * removed from the freelist and that reference is transferred to
 * the new reference or the vnode reference count must be incremented
 * accordingly.  The mutex for the freelist must be held in order to
 * accurately test to see if the rnode is on the freelist or not.
 * The hash queue lock might be held shared and it is possible that
 * two different threads may race to remove the rnode from the
 * freelist.  This race can be resolved by holding the mutex for the
 * freelist.  Please note that the mutex for the freelist does not
 * need to be held if the rnode is not on the freelist.  It can not be
 * placed on the freelist due to the requirement that the thread
 * putting the rnode on the freelist must hold the exclusive lock
 * to the hash queue and the thread doing the lookup in the hash
 * queue is holding either a shared or exclusive lock to the hash
 * queue.
 *
 * The lock ordering is:
 *
 *	hash bucket lock -> vnode lock
 *	hash bucket lock -> freelist lock -> r_statelock
 */
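/*
 * Illustrative sketch only (not part of the original source): the
 * reference-transfer protocol described above, as seen by a lookup that
 * finds an rnode which may be sitting on the freelist.  It mirrors the
 * logic of r4find() below; the helper name example_hold_rnode4() is
 * hypothetical and the block is not compiled.
 */
#if 0
static void
example_hold_rnode4(rnode4_t *rp)
{
    vnode_t *vp = RTOV4(rp);

    /* The hash bucket lock (shared or exclusive) is already held. */
    if (rp->r_freef != NULL) {
        mutex_enter(&rp4freelist_lock);
        /*
         * Recheck under the freelist mutex: another thread may have
         * raced us and already taken the freelist reference.
         */
        if (rp->r_freef != NULL) {
            rp4_rmfree(rp);         /* reuse the freelist's hold */
            mutex_exit(&rp4freelist_lock);
        } else {
            mutex_exit(&rp4freelist_lock);
            VN_HOLD(vp);            /* take a new reference */
        }
    } else {
        VN_HOLD(vp);
    }
}
#endif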
static kmutex_t rp4freelist_lock;
static rnode4_t *rp4freelist = NULL;
static long rnode4_new = 0;

static int rtable4mask;
static struct kmem_cache *rnode4_cache;
static int rnode4_hashlen = 4;
static void	r4inactive(rnode4_t *, cred_t *);
static vnode_t	*make_rnode4(nfs4_sharedfh_t *, r4hashq_t *, struct vfs *,
		    const struct vnodeops *,
		    int (*)(vnode_t *, page_t *, uoff_t *, size_t *, int,
		    cred_t *),
		    int *, cred_t *);
static void	rp4_rmfree(rnode4_t *);
int		nfs4_free_data_reclaim(rnode4_t *);
static int	nfs4_active_data_reclaim(rnode4_t *);
static int	nfs4_free_reclaim(void);
static int	nfs4_active_reclaim(void);
static int	nfs4_rnode_reclaim(void);
static void	nfs4_reclaim(void *);
static int	isrootfh(nfs4_sharedfh_t *, rnode4_t *);
static void	uninit_rnode4(rnode4_t *);
static void	destroy_rnode4(rnode4_t *);
static void	r4_stub_set(rnode4_t *, nfs4_stub_type_t);
static int r4_check_for_dups = 0; /* Flag to enable dup rnode detection. */
static int nfs4_rnode_debug = 0;
/* if nonzero, kmem_cache_free() rnodes rather than place on freelist */
static int nfs4_rnode_nofreelist = 0;
/* give messages on colliding shared filehandles */
static void	r4_dup_check(rnode4_t *, vfs_t *);
/*
 * If the vnode has pages, run the list and check for any that are
 * still dangling.  We call this routine before putting an rnode on
 * the free list.
 */
static int
nfs4_dross_pages(vnode_t *vp)
{
    page_t *pp;

    vmobject_lock(&vp->v_object);
    for (pp = vmobject_get_head(&vp->v_object);
        pp != NULL;
        pp = vmobject_get_next(&vp->v_object, pp)) {
        if (PP_ISPVN_TAG(pp) &&
            pp->p_fsdata != C_NOCOMMIT) {
            vmobject_unlock(&vp->v_object);
            return (1);
        }
    }
    vmobject_unlock(&vp->v_object);

    return (0);
}
/*
 * Flush any pages left on this rnode.
 */
static void
r4flushpages(rnode4_t *rp, cred_t *cr)
{
    vnode_t *vp;
    int error;

    /*
     * Before freeing anything, wait until all asynchronous
     * activity is done on this rnode.  This will allow all
     * asynchronous read ahead and write behind i/o's to
     * finish.
     */
    mutex_enter(&rp->r_statelock);
    while (rp->r_count > 0)
        cv_wait(&rp->r_cv, &rp->r_statelock);
    mutex_exit(&rp->r_statelock);

    /*
     * Flush and invalidate all pages associated with the vnode.
     */
    vp = RTOV4(rp);
    if (nfs4_has_pages(vp)) {
        ASSERT(vp->v_type != VCHR);
        if ((rp->r_flags & R4DIRTY) && !rp->r_error) {
            error = fop_putpage(vp, 0, 0, 0, cr, NULL);
            if (error && (error == ENOSPC || error == EDQUOT)) {
                mutex_enter(&rp->r_statelock);
                if (!rp->r_error)
                    rp->r_error = error;
                mutex_exit(&rp->r_statelock);
            }
        }
        nfs4_invalidate_pages(vp, 0, cr);
    }
}
/*
 * Free the resources associated with an rnode.
 */
static void
r4inactive(rnode4_t *rp, cred_t *cr)
{
    vnode_t *vp;
    char *contents;
    int size;
    vsecattr_t *vsp;
    vnode_t *xattr;

    r4flushpages(rp, cr);

    vp = RTOV4(rp);

    /*
     * Free any held caches which may be
     * associated with this rnode.
     */
    mutex_enter(&rp->r_statelock);
    contents = rp->r_symlink.contents;
    size = rp->r_symlink.size;
    rp->r_symlink.contents = NULL;
    vsp = rp->r_secattr;
    rp->r_secattr = NULL;
    xattr = rp->r_xattr_dir;
    rp->r_xattr_dir = NULL;
    mutex_exit(&rp->r_statelock);

    /*
     * Free the access cache entries.
     */
    (void) nfs4_access_purge_rp(rp);

    /*
     * Free the readdir cache entries.
     */
    nfs4_purge_rddir_cache(vp);

    /*
     * Free the symbolic link cache.
     */
    if (contents != NULL) {
        kmem_free((void *)contents, size);
    }

    /*
     * Free any cached ACL.
     */
    if (vsp != NULL)
        nfs4_acl_free_cache(vsp);

    /*
     * Release the cached xattr_dir
     */
    if (xattr != NULL)
        VN_RELE(xattr);
}
/*
 * We have seen a case that the fh passed in is for "." which
 * should be a VROOT node, however, the fh is different from the
 * root fh stored in the mntinfo4_t.  The invalid fh might be
 * from a misbehaved server and will panic the client system at
 * a later time.  To avoid the panic, we drop the bad fh, use
 * the root fh from mntinfo4_t, and print an error message.
 */
static nfs4_sharedfh_t *
badrootfh_check(nfs4_sharedfh_t *fh, nfs4_fname_t *nm, mntinfo4_t *mi,
    int *wasbad)
{
    char *s;

    *wasbad = 0;
    s = fn_name(nm);
    ASSERT(strcmp(s, "..") != 0);

    if ((s[0] == '.' && s[1] == '\0') && fh &&
        !SFH4_SAME(mi->mi_rootfh, fh)) {
        nfs4_fhandle_t fhandle;

        zcmn_err(mi->mi_zone->zone_id, CE_WARN,
            "Server %s returns a different "
            "root filehandle for the path %s:",
            mi->mi_curr_serv->sv_hostname,
            mi->mi_curr_serv->sv_path);

        /* print the bad fh */
        fhandle.fh_len = fh->sfh_fh.nfs_fh4_len;
        bcopy(fh->sfh_fh.nfs_fh4_val, fhandle.fh_buf,
            fhandle.fh_len);
        nfs4_printfhandle(&fhandle);

        /* print mi_rootfh */
        fhandle.fh_len = mi->mi_rootfh->sfh_fh.nfs_fh4_len;
        bcopy(mi->mi_rootfh->sfh_fh.nfs_fh4_val, fhandle.fh_buf,
            fhandle.fh_len);
        nfs4_printfhandle(&fhandle);

        /* use mi_rootfh instead; fh will be rele by the caller */
        fh = mi->mi_rootfh;
        *wasbad = 1;
    }

    kmem_free(s, MAXNAMELEN);
    return (fh);
}
static void
r4_do_attrcache(vnode_t *vp, nfs4_ga_res_t *garp, int newnode,
    hrtime_t t, cred_t *cr, int index)
{
    vattr_t *attr;

    /*
     * Don't add to attrcache if time overflow, but
     * no need to check because either attr is null or the time
     * values in it were processed by nfs4_time_ntov(), which checks
     * for time overflows.
     */
    attr = garp ? &garp->n4g_va : NULL;

    if (attr) {
        if (!newnode) {
            rw_exit(&rtable4[index].r_lock);

            if (vp->v_type != attr->va_type &&
                vp->v_type != VNON && attr->va_type != VNON) {
                zcmn_err(VTOMI4(vp)->mi_zone->zone_id, CE_WARN,
                    "makenfs4node: type (%d) doesn't "
                    "match type of found node at %p (%d)",
                    attr->va_type, (void *)vp, vp->v_type);
            }

            nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL);
        } else {
            rnode4_t *rp = VTOR4(vp);

            vp->v_type = attr->va_type;
            vp->v_rdev = attr->va_rdev;

            /*
             * Turn this object into a "stub" object if we
             * crossed an underlying server fs boundary.
             * To make this check, during mount we save the
             * fsid of the server object being mounted.
             * Here we compare this object's server fsid
             * with the fsid we saved at mount.  If they
             * are different, we crossed server fs boundary.
             *
             * The stub type is set (or not) at rnode
             * creation time and it never changes for life
             * of the rnode.
             *
             * This stub will be for a mirror-mount, rather than
             * a referral (the latter also sets R4SRVSTUB).
             *
             * The stub type is also set during RO failover.
             *
             * We don't bother with taking r_state_lock to
             * set the stub type because this is a new rnode
             * and we're holding the hash bucket r_lock RW_WRITER.
             * No other thread could have obtained access
             * to this rnode.
             */
            if (garp->n4g_fsid_valid) {
                fattr4_fsid ga_fsid = garp->n4g_fsid;
                servinfo4_t *svp = rp->r_server;
                int is_stub = 0;

                rp->r_srv_fsid = ga_fsid;

                (void) nfs_rw_enter_sig(&svp->sv_lock,
                    RW_READER, 0);
                if (!FATTR4_FSID_EQ(&ga_fsid, &svp->sv_fsid))
                    is_stub = 1;
                nfs_rw_exit(&svp->sv_lock);

                if (is_stub)
                    r4_stub_mirrormount(rp);
            }

            /* Can not cache partial attr */
            if (attr->va_mask == AT_ALL)
                nfs4_attrcache_noinval(vp, garp, t);
            else
                PURGE_ATTRCACHE4(vp);

            rw_exit(&rtable4[index].r_lock);
        }
    } else {
        if (newnode) {
            PURGE_ATTRCACHE4(vp);
        }
        rw_exit(&rtable4[index].r_lock);
    }
}
/*
 * Find or create an rnode based primarily on filehandle.  To be
 * used when dvp (vnode for parent directory) is not available;
 * otherwise, makenfs4node() should be used.
 *
 * The nfs4_fname_t argument *npp is consumed and nulled out.
 */
vnode_t *
makenfs4node_by_fh(nfs4_sharedfh_t *sfh, nfs4_sharedfh_t *psfh,
    nfs4_fname_t **npp, nfs4_ga_res_t *garp,
    mntinfo4_t *mi, cred_t *cr, hrtime_t t)
{
    vfs_t *vfsp = mi->mi_vfsp;
    vnode_t *vp;
    rnode4_t *rp;
    svnode_t *svp;
    nfs4_fname_t *name, *svpname;
    int index, newnode;

    index = rtable4hash(sfh);
    rw_enter(&rtable4[index].r_lock, RW_READER);

    vp = make_rnode4(sfh, &rtable4[index], vfsp,
        &nfs4_vnodeops, nfs4_putapage, &newnode, cr);

    if (newnode) {
        svp->sv_forw = svp->sv_back = svp;
    } else {
        /*
         * It is possible that due to a server
         * side rename fnames have changed.
         * Update the fname here.
         */
        mutex_enter(&rp->r_svlock);
        svpname = svp->sv_name;
        if (svp->sv_name != name) {
            mutex_exit(&rp->r_svlock);
        } else {
            mutex_exit(&rp->r_svlock);
        }
    }

    ASSERT(RW_LOCK_HELD(&rtable4[index].r_lock));
    r4_do_attrcache(vp, garp, newnode, t, cr, index);
    ASSERT(rw_owner(&rtable4[index].r_lock) != curthread);

    return (vp);
}
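/*
 * Illustrative sketch only (not part of the original source): because the
 * nfs4_fname_t argument is consumed and nulled out, a caller hands off its
 * reference and must not touch the name afterwards.  The variable names
 * below are hypothetical; the block is not compiled.
 */
#if 0
    /* 'name' holds a reference obtained earlier (e.g. via fn_get()) */
    vp = makenfs4node_by_fh(sfh, NULL, &name, garp, mi, cr, t);
    ASSERT(name == NULL);       /* the reference was consumed */
#endif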
/*
 * Find or create a vnode for the given filehandle, filesystem, parent, and
 * name.  The reference to nm is consumed, so the caller must first do an
 * fn_hold() if it wants to continue using nm after this call.
 */
vnode_t *
makenfs4node(nfs4_sharedfh_t *fh, nfs4_ga_res_t *garp, struct vfs *vfsp,
    hrtime_t t, cred_t *cr, vnode_t *dvp, nfs4_fname_t *nm)
{
    vnode_t *vp;
    rnode4_t *rp;
    int index, newnode;
    int had_badfh = 0;
    mntinfo4_t *mi = VFTOMI4(vfsp);

    fh = badrootfh_check(fh, nm, mi, &had_badfh);

    index = rtable4hash(fh);
    rw_enter(&rtable4[index].r_lock, RW_READER);

    /*
     * Note: make_rnode4() may upgrade the hash bucket lock to exclusive.
     */
    vp = make_rnode4(fh, &rtable4[index], vfsp, &nfs4_vnodeops,
        nfs4_putapage, &newnode, cr);
    rp = VTOR4(vp);

    sv_activate(&vp, dvp, &nm, newnode);
    if (dvp->v_flag & V_XATTRDIR) {
        mutex_enter(&rp->r_statelock);
        rp->r_flags |= R4ISXATTR;
        mutex_exit(&rp->r_statelock);
    }

    /* if getting a bad file handle, do not cache the attributes. */
    if (had_badfh) {
        rw_exit(&rtable4[index].r_lock);
        return (vp);
    }

    ASSERT(RW_LOCK_HELD(&rtable4[index].r_lock));
    r4_do_attrcache(vp, garp, newnode, t, cr, index);
    ASSERT(rw_owner(&rtable4[index].r_lock) != curthread);

    return (vp);
}
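/*
 * Illustrative sketch only (not part of the original source): a caller that
 * wants to keep using 'nm' after makenfs4node() must take its own hold
 * first, because the reference passed in is consumed.  Variable names are
 * hypothetical; the block is not compiled.
 */
#if 0
    fn_hold(nm);                        /* keep our own reference */
    vp = makenfs4node(fh, garp, vfsp, t, cr, dvp, nm);
    /* ... nm is still valid here because of the extra hold ... */
    fn_rele(&nm);
#endif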
/*
 * Hash on address of filehandle object.
 * XXX totally untuned.
 */
int
rtable4hash(nfs4_sharedfh_t *fh)
{
    return (((uintptr_t)fh / sizeof (*fh)) & rtable4mask);
}
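/*
 * Illustrative sketch only (not part of the original source): the usual
 * pattern for consuming rtable4hash() -- pick the bucket, take its
 * reader/writer lock, and search it.  r4find_unlocked(), later in this
 * file, is the real implementation of exactly this sequence.
 */
#if 0
    index = rtable4hash(sfh);
    rw_enter(&rtable4[index].r_lock, RW_READER);
    rp = r4find(&rtable4[index], sfh, vfsp);
    rw_exit(&rtable4[index].r_lock);
#endif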
/*
 * Find or create the vnode for the given filehandle and filesystem.
 * *newnode is set to zero if the vnode already existed; non-zero if it had
 * to be created.
 *
 * Note: make_rnode4() may upgrade the hash bucket lock to exclusive.
 */
static vnode_t *
make_rnode4(nfs4_sharedfh_t *fh, r4hashq_t *rhtp, struct vfs *vfsp,
    const struct vnodeops *vops,
    int (*putapage)(vnode_t *, page_t *, uoff_t *, size_t *, int, cred_t *),
    int *newnode, cred_t *cr)
{
    rnode4_t *rp;
    rnode4_t *trp;
    vnode_t *vp;
    vnode_t *new_vp;
    mntinfo4_t *mi = VFTOMI4(vfsp);

    ASSERT(RW_READ_HELD(&rhtp->r_lock));

    if ((rp = r4find(rhtp, fh, vfsp)) != NULL) {
        vp = RTOV4(rp);
        *newnode = 0;
        return (vp);
    }
    rw_exit(&rhtp->r_lock);

    mutex_enter(&rp4freelist_lock);

    if (rp4freelist != NULL && rnode4_new >= nrnode) {
        rp = rp4freelist;
        rp4_rmfree(rp);
        mutex_exit(&rp4freelist_lock);

        vp = RTOV4(rp);

        if (rp->r_flags & R4HASHED) {
            rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
            mutex_enter(&vp->v_lock);
            if (vp->v_count > 1) {
                mutex_exit(&vp->v_lock);
                rw_exit(&rp->r_hashq->r_lock);
                rw_enter(&rhtp->r_lock, RW_READER);
            }
            mutex_exit(&vp->v_lock);
            rp4_rmhash_locked(rp);
            rw_exit(&rp->r_hashq->r_lock);
        }

        r4inactive(rp, cr);

        mutex_enter(&vp->v_lock);
        if (vp->v_count > 1) {
            mutex_exit(&vp->v_lock);
            rw_enter(&rhtp->r_lock, RW_READER);
        }
        mutex_exit(&vp->v_lock);

        /*
         * destroy old locks before bzero'ing and
         * recreating the locks below.
         */
        uninit_rnode4(rp);

        /*
         * Make sure that if rnode is recycled then
         * VFS count is decremented properly before
         * reuse.
         */
        VFS_RELE(vp->v_vfsp);
    } else {
        mutex_exit(&rp4freelist_lock);

        rp = kmem_cache_alloc(rnode4_cache, KM_SLEEP);
        new_vp = vn_alloc(KM_SLEEP);

        atomic_inc_ulong((ulong_t *)&rnode4_new);
        clstat4_debug.nrnode.value.ui64++;
        vp = new_vp;
    }

    bzero(rp, sizeof (*rp));
    nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL);
    nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL);
    mutex_init(&rp->r_svlock, NULL, MUTEX_DEFAULT, NULL);
    mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL);
    mutex_init(&rp->r_statev4_lock, NULL, MUTEX_DEFAULT, NULL);
    mutex_init(&rp->r_os_lock, NULL, MUTEX_DEFAULT, NULL);

    list_create(&rp->r_open_streams, sizeof (nfs4_open_stream_t),
        offsetof(nfs4_open_stream_t, os_node));
    rp->r_lo_head.lo_prev_rnode = &rp->r_lo_head;
    rp->r_lo_head.lo_next_rnode = &rp->r_lo_head;
    cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL);
    cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL);
    rp->r_flags = R4READDIRWATTR;
    rp->r_server = mi->mi_curr_serv;
    rp->r_deleg_type = OPEN_DELEGATE_NONE;
    rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE;
    nfs_rw_init(&rp->r_deleg_recall_lock, NULL, RW_DEFAULT, NULL);

    rddir4_cache_create(rp);
    rp->r_putapage = putapage;
    vp->v_data = (caddr_t)rp;
    vn_setops(vp, vops);
    vp->v_vfsp = vfsp;
    vp->v_flag |= VMODSORT;
    if (isrootfh(fh, rp))
        vp->v_flag |= VROOT;

    /*
     * There is a race condition if someone else
     * alloc's the rnode while no locks are held, so we
     * check again and recover if found.
     */
    rw_enter(&rhtp->r_lock, RW_WRITER);
    if ((trp = r4find(rhtp, fh, vfsp)) != NULL) {
        vp = RTOV4(trp);
        *newnode = 0;
        rw_exit(&rhtp->r_lock);
        rp4_addfree(rp, cr);
        rw_enter(&rhtp->r_lock, RW_READER);
        return (vp);
    }
    rp4_addhash(rp);
    *newnode = 1;
    return (vp);
}
static void
uninit_rnode4(rnode4_t *rp)
{
    vnode_t *vp = RTOV4(rp);

    ASSERT(vp->v_count == 1);
    ASSERT(rp->r_count == 0);
    ASSERT(rp->r_mapcnt == 0);
    if (rp->r_flags & R4LODANGLERS) {
        nfs4_flush_lock_owners(rp);
    }
    ASSERT(rp->r_lo_head.lo_next_rnode == &rp->r_lo_head);
    ASSERT(rp->r_lo_head.lo_prev_rnode == &rp->r_lo_head);
    ASSERT(!(rp->r_flags & R4HASHED));
    ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
    nfs4_clear_open_streams(rp);
    list_destroy(&rp->r_open_streams);

    /*
     * Destroy the rddir cache first since we need to grab the r_statelock.
     */
    mutex_enter(&rp->r_statelock);
    rddir4_cache_destroy(rp);
    mutex_exit(&rp->r_statelock);
    sv_uninit(&rp->r_svnode);
    sfh4_rele(&rp->r_fh);
    nfs_rw_destroy(&rp->r_rwlock);
    nfs_rw_destroy(&rp->r_lkserlock);
    mutex_destroy(&rp->r_statelock);
    mutex_destroy(&rp->r_statev4_lock);
    mutex_destroy(&rp->r_os_lock);
    cv_destroy(&rp->r_cv);
    cv_destroy(&rp->r_commit.c_cv);
    nfs_rw_destroy(&rp->r_deleg_recall_lock);
    if (rp->r_flags & R4DELMAPLIST)
        list_destroy(&rp->r_indelmap);
}
/*
 * Put an rnode on the free list.
 *
 * Rnodes which were allocated above and beyond the normal limit
 * are immediately freed.
 */
void
rp4_addfree(rnode4_t *rp, cred_t *cr)
{
    vnode_t *vp;
    vnode_t *xattr;
    struct vfs *vfsp;

    vp = RTOV4(rp);
    ASSERT(vp->v_count >= 1);
    ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);

    /*
     * If we have too many rnodes allocated and there are no
     * references to this rnode, or if the rnode is no longer
     * accessible because it does not reside in the hash queues,
     * or if an i/o error occurred while writing to the file,
     * then just free it instead of putting it on the rnode
     * freelist.
     */
    vfsp = vp->v_vfsp;
    if (((rnode4_new > nrnode || !(rp->r_flags & R4HASHED) ||
        (nfs4_rnode_nofreelist != 0) ||
        rp->r_error || (rp->r_flags & R4RECOVERR) ||
        (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
        if (rp->r_flags & R4HASHED) {
            rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
            mutex_enter(&vp->v_lock);
            if (vp->v_count > 1) {
                mutex_exit(&vp->v_lock);
                rw_exit(&rp->r_hashq->r_lock);
                return;
            }
            mutex_exit(&vp->v_lock);
            rp4_rmhash_locked(rp);
            rw_exit(&rp->r_hashq->r_lock);
        }

        /*
         * Make sure we don't have a delegation on this rnode
         * before destroying it.
         */
        if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
            (void) nfs4delegreturn(rp,
                NFS4_DR_FORCE|NFS4_DR_PUSH|NFS4_DR_REOPEN);
        }

        r4inactive(rp, cr);

        /*
         * Recheck the vnode reference count.  We need to
         * make sure that another reference has not been
         * acquired while we were not holding v_lock.  The
         * rnode is not in the rnode hash queues; one
         * way for a reference to have been acquired
         * is for a fop_putpage because the rnode was marked
         * with R4DIRTY or for a modified page.  This
         * reference may have been acquired before our call
         * to r4inactive.  The i/o may have been completed,
         * thus allowing r4inactive to complete, but the
         * reference to the vnode may not have been released
         * yet.  In any case, the rnode can not be destroyed
         * until the other references to this vnode have been
         * released.  The other references will take care of
         * either destroying the rnode or placing it on the
         * rnode freelist.  If there are no other references,
         * then the rnode may be safely destroyed.
         */
        mutex_enter(&vp->v_lock);
        if (vp->v_count > 1) {
            mutex_exit(&vp->v_lock);
            return;
        }
        mutex_exit(&vp->v_lock);

        destroy_rnode4(rp);
        return;
    }

    /*
     * Lock the hash queue and then recheck the reference count
     * to ensure that no other threads have acquired a reference
     * to indicate that the rnode should not be placed on the
     * freelist.  If another reference has been acquired, then
     * just release this one and let the other thread complete
     * the processing of adding this rnode to the freelist.
     */
again:
    rw_enter(&rp->r_hashq->r_lock, RW_WRITER);

    mutex_enter(&vp->v_lock);
    if (vp->v_count > 1) {
        mutex_exit(&vp->v_lock);
        rw_exit(&rp->r_hashq->r_lock);
        return;
    }
    mutex_exit(&vp->v_lock);

    /*
     * Make sure we don't put an rnode with a delegation
     * on the free list.
     */
    if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
        rw_exit(&rp->r_hashq->r_lock);
        (void) nfs4delegreturn(rp,
            NFS4_DR_FORCE|NFS4_DR_PUSH|NFS4_DR_REOPEN);
        goto again;
    }

    /*
     * Now that we have the hash queue lock, and we know there
     * are not anymore references on the vnode, check to make
     * sure there aren't any open streams still on the rnode.
     * If so, drop the hash queue lock, remove the open streams,
     * and recheck the v_count.
     */
    mutex_enter(&rp->r_os_lock);
    if (list_head(&rp->r_open_streams) != NULL) {
        mutex_exit(&rp->r_os_lock);
        rw_exit(&rp->r_hashq->r_lock);
        if (nfs_zone() != VTOMI4(vp)->mi_zone)
            nfs4_clear_open_streams(rp);
        else
            (void) nfs4close_all(vp, cr);
        goto again;
    }
    mutex_exit(&rp->r_os_lock);

    /*
     * Before we put it on the freelist, make sure there are no pages.
     * If there are, flush and commit of all of the dirty and
     * uncommitted pages, assuming the file system isn't read only.
     */
    if (!(vp->v_vfsp->vfs_flag & VFS_RDONLY) && nfs4_dross_pages(vp)) {
        rw_exit(&rp->r_hashq->r_lock);
        r4flushpages(rp, cr);
        goto again;
    }

    /*
     * Before we put it on the freelist, make sure there is no
     * active xattr directory cached, the freelist will not
     * have its entries r4inactive'd if there is still an active
     * rnode, thus nothing in the freelist can hold another
     * reference to an xattr directory.
     */
    xattr = rp->r_xattr_dir;
    rp->r_xattr_dir = NULL;

    /*
     * If there is no cached data or metadata for this file, then
     * put the rnode on the front of the freelist so that it will
     * be reused before other rnodes which may have cached data or
     * metadata associated with them.
     */
    mutex_enter(&rp4freelist_lock);
    if (rp4freelist == NULL) {
        rp->r_freef = rp;
        rp->r_freeb = rp;
        rp4freelist = rp;
    } else {
        rp->r_freef = rp4freelist;
        rp->r_freeb = rp4freelist->r_freeb;
        rp4freelist->r_freeb->r_freef = rp;
        rp4freelist->r_freeb = rp;
        if (!nfs4_has_pages(vp) && rp->r_dir == NULL &&
            rp->r_symlink.contents == NULL && rp->r_secattr == NULL)
            rp4freelist = rp;
    }
    mutex_exit(&rp4freelist_lock);

    rw_exit(&rp->r_hashq->r_lock);

    if (xattr)
        VN_RELE(xattr);
}
/*
 * Remove an rnode from the free list.
 *
 * The caller must be holding rp4freelist_lock and the rnode
 * must be on the freelist.
 */
static void
rp4_rmfree(rnode4_t *rp)
{
    ASSERT(MUTEX_HELD(&rp4freelist_lock));
    ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL);

    if (rp == rp4freelist) {
        rp4freelist = rp->r_freef;
        if (rp == rp4freelist)
            rp4freelist = NULL;
    }
    rp->r_freeb->r_freef = rp->r_freef;
    rp->r_freef->r_freeb = rp->r_freeb;

    rp->r_freef = rp->r_freeb = NULL;
}

/*
 * Put a rnode in the hash table.
 *
 * The caller must be holding the exclusive hash queue lock.
 */
void
rp4_addhash(rnode4_t *rp)
{
    ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
    ASSERT(!(rp->r_flags & R4HASHED));

    r4_dup_check(rp, RTOV4(rp)->v_vfsp);

    rp->r_hashf = rp->r_hashq->r_hashf;
    rp->r_hashq->r_hashf = rp;
    rp->r_hashb = (rnode4_t *)rp->r_hashq;
    rp->r_hashf->r_hashb = rp;

    mutex_enter(&rp->r_statelock);
    rp->r_flags |= R4HASHED;
    mutex_exit(&rp->r_statelock);
}

/*
 * Remove a rnode from the hash table.
 *
 * The caller must be holding the hash queue lock.
 */
void
rp4_rmhash_locked(rnode4_t *rp)
{
    ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
    ASSERT(rp->r_flags & R4HASHED);

    rp->r_hashb->r_hashf = rp->r_hashf;
    rp->r_hashf->r_hashb = rp->r_hashb;

    mutex_enter(&rp->r_statelock);
    rp->r_flags &= ~R4HASHED;
    mutex_exit(&rp->r_statelock);
}

/*
 * Remove a rnode from the hash table.
 *
 * The caller must not be holding the hash queue lock.
 */
void
rp4_rmhash(rnode4_t *rp)
{
    rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
    rp4_rmhash_locked(rp);
    rw_exit(&rp->r_hashq->r_lock);
}
/*
 * Lookup a rnode by fhandle.  Ignores rnodes that had failed recovery.
 * Returns NULL if no match.  If an rnode is returned, the reference count
 * on the master vnode is incremented.
 *
 * The caller must be holding the hash queue lock, either shared or exclusive.
 */
rnode4_t *
r4find(r4hashq_t *rhtp, nfs4_sharedfh_t *fh, struct vfs *vfsp)
{
    rnode4_t *rp;
    vnode_t *vp;

    ASSERT(RW_LOCK_HELD(&rhtp->r_lock));

    for (rp = rhtp->r_hashf; rp != (rnode4_t *)rhtp; rp = rp->r_hashf) {
        vp = RTOV4(rp);
        if (vp->v_vfsp == vfsp && SFH4_SAME(rp->r_fh, fh)) {

            mutex_enter(&rp->r_statelock);
            if (rp->r_flags & R4RECOVERR) {
                mutex_exit(&rp->r_statelock);
                continue;
            }
            mutex_exit(&rp->r_statelock);

            r4_dup_check(rp, vfsp);

            if (rp->r_freef != NULL) {
                mutex_enter(&rp4freelist_lock);
                /*
                 * If the rnode is on the freelist,
                 * then remove it and use that reference
                 * as the new reference.  Otherwise,
                 * need to increment the reference count.
                 */
                if (rp->r_freef != NULL) {
                    rp4_rmfree(rp);
                    mutex_exit(&rp4freelist_lock);
                } else {
                    mutex_exit(&rp4freelist_lock);
                    VN_HOLD(vp);
                }
            } else
                VN_HOLD(vp);

            /*
             * if root vnode, set v_flag to indicate that
             */
            if (isrootfh(fh, rp)) {
                if (!(vp->v_flag & VROOT)) {
                    mutex_enter(&vp->v_lock);
                    vp->v_flag |= VROOT;
                    mutex_exit(&vp->v_lock);
                }
            }
            return (rp);
        }
    }
    return (NULL);
}
/*
 * Lookup an rnode by fhandle.  Just a wrapper for r4find()
 * that assumes the caller hasn't already got the lock
 * on the hash bucket.
 */
rnode4_t *
r4find_unlocked(nfs4_sharedfh_t *fh, struct vfs *vfsp)
{
    rnode4_t *rp;
    int index;

    index = rtable4hash(fh);
    rw_enter(&rtable4[index].r_lock, RW_READER);
    rp = r4find(&rtable4[index], fh, vfsp);
    rw_exit(&rtable4[index].r_lock);

    return (rp);
}
/*
 * Return >0 if there is an active vnode belonging to this vfs in the
 * rtable4 cache.
 *
 * Several of these checks are done without holding the usual
 * locks.  This is safe because destroy_rtable(), rp_addfree(),
 * etc. will redo the necessary checks before actually destroying
 * any rnodes.
 */
int
check_rtable4(struct vfs *vfsp)
{
    rnode4_t *rp;
    vnode_t *vp;
    char *path;
    int index;
    int busy = NFSV4_RTABLE4_OK;

    for (index = 0; index < rtable4size; index++) {
        rw_enter(&rtable4[index].r_lock, RW_READER);

        for (rp = rtable4[index].r_hashf;
            rp != (rnode4_t *)(&rtable4[index]);
            rp = rp->r_hashf) {

            vp = RTOV4(rp);
            if (vp->v_vfsp == vfsp) {
                if (rp->r_freef == NULL) {
                    busy = NFSV4_RTABLE4_NOT_FREE_LIST;
                } else if (nfs4_has_pages(vp) &&
                    (rp->r_flags & R4DIRTY)) {
                    busy = NFSV4_RTABLE4_DIRTY_PAGES;
                } else if (rp->r_count > 0) {
                    busy = NFSV4_RTABLE4_POS_R_COUNT;
                }

                if (busy != NFSV4_RTABLE4_OK) {
                    path = fn_path(rp->r_svnode.sv_name);
                    DTRACE_NFSV4_3(rnode__e__debug,
                        int, busy, char *, path,
                        rnode4_t *, rp);
                    kmem_free(path, strlen(path)+1);
                    rw_exit(&rtable4[index].r_lock);
                    return (busy);
                }
            }
        }
        rw_exit(&rtable4[index].r_lock);
    }
    return (busy);
}
/*
 * Destroy inactive vnodes from the hash queues which
 * belong to this vfs.  All of the vnodes should be inactive.
 * It is essential that we destroy all rnodes in case of
 * forced unmount as well as in normal unmount case.
 */
void
destroy_rtable4(struct vfs *vfsp, cred_t *cr)
{
    int index;
    vnode_t *vp;
    rnode4_t *rp, *r_hashf, *rlist;

    rlist = NULL;

    for (index = 0; index < rtable4size; index++) {
        rw_enter(&rtable4[index].r_lock, RW_WRITER);
        for (rp = rtable4[index].r_hashf;
            rp != (rnode4_t *)(&rtable4[index]);
            rp = r_hashf) {
            /* save the hash pointer before destroying */
            r_hashf = rp->r_hashf;

            vp = RTOV4(rp);
            if (vp->v_vfsp == vfsp) {
                mutex_enter(&rp4freelist_lock);
                if (rp->r_freef != NULL) {
                    rp4_rmfree(rp);
                    mutex_exit(&rp4freelist_lock);
                    rp4_rmhash_locked(rp);
                    rp->r_hashf = rlist;
                    rlist = rp;
                } else
                    mutex_exit(&rp4freelist_lock);
            }
        }
        rw_exit(&rtable4[index].r_lock);
    }

    for (rp = rlist; rp != NULL; rp = r_hashf) {
        r_hashf = rp->r_hashf;
        /*
         * This call to rp4_addfree will end up destroying the
         * rnode, but in a safe way with the appropriate set
         * of checks done.
         */
        rp4_addfree(rp, cr);
    }
}
/*
 * This routine destroys all the resources of an rnode
 * and finally the rnode itself.
 */
static void
destroy_rnode4(rnode4_t *rp)
{
    ASSERT(rp->r_deleg_type == OPEN_DELEGATE_NONE);

    atomic_dec_ulong((ulong_t *)&rnode4_new);
    clstat4_debug.nrnode.value.ui64--;
    kmem_cache_free(rnode4_cache, rp);
}
/*
 * Invalidate the attributes on all rnodes forcing the next getattr
 * to go over the wire.  Used to flush stale uid and gid mappings.
 * Maybe done on a per vfsp, or all rnodes (vfsp == NULL)
 */
void
nfs4_rnode_invalidate(struct vfs *vfsp)
{
    int index;
    rnode4_t *rp;
    vnode_t *vp;

    /*
     * Walk the hash queues looking for rnodes.
     */
    for (index = 0; index < rtable4size; index++) {
        rw_enter(&rtable4[index].r_lock, RW_READER);
        for (rp = rtable4[index].r_hashf;
            rp != (rnode4_t *)(&rtable4[index]);
            rp = rp->r_hashf) {
            vp = RTOV4(rp);
            if (vfsp != NULL && vp->v_vfsp != vfsp)
                continue;

            if (!mutex_tryenter(&rp->r_statelock))
                continue;

            /*
             * Expire the attributes by resetting the change
             * and attr timeouts.
             */
            PURGE_ATTRCACHE4_LOCKED(rp);
            mutex_exit(&rp->r_statelock);
        }
        rw_exit(&rtable4[index].r_lock);
    }
}
/*
 * Flush all vnodes in this (or every) vfs.
 * Used by nfs_sync and by nfs_unmount.
 */
void
r4flush(struct vfs *vfsp, cred_t *cr)
{
    int index;
    rnode4_t *rp;
    vnode_t *vp, **vplist;
    long num, cnt;

    /*
     * Check to see whether there is anything to do.
     */
    num = rnode4_new;
    if (num == 0)
        return;

    /*
     * Allocate a slot for all currently active rnodes on the
     * supposition that they all may need flushing.
     */
    vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP);
    cnt = 0;

    /*
     * Walk the hash queues looking for rnodes with page
     * lists associated with them.  Make a list of these
     * files.
     */
    for (index = 0; index < rtable4size; index++) {
        rw_enter(&rtable4[index].r_lock, RW_READER);
        for (rp = rtable4[index].r_hashf;
            rp != (rnode4_t *)(&rtable4[index]);
            rp = rp->r_hashf) {
            vp = RTOV4(rp);
            /*
             * Don't bother sync'ing a vp if it
             * is part of virtual swap device or
             * if VFS is read-only
             */
            if (IS_SWAPVP(vp) || vn_is_readonly(vp))
                continue;
            /*
             * If flushing all mounted file systems or
             * the vnode belongs to this vfs, has pages
             * and is marked as either dirty or mmap'd,
             * hold and add this vnode to the list of
             * vnodes to flush.
             */
            if ((vfsp == NULL || vp->v_vfsp == vfsp) &&
                nfs4_has_pages(vp) &&
                ((rp->r_flags & R4DIRTY) || rp->r_mapcnt > 0)) {
                VN_HOLD(vp);
                vplist[cnt++] = vp;
                if (cnt == num) {
                    rw_exit(&rtable4[index].r_lock);
                    goto toflush;
                }
            }
        }
        rw_exit(&rtable4[index].r_lock);
    }

toflush:
    /*
     * Flush and release all of the files on the list.
     */
    while (cnt-- > 0) {
        vp = vplist[cnt];
        (void) fop_putpage(vp, 0, 0, B_ASYNC, cr, NULL);
        VN_RELE(vp);
    }

    /*
     * Free the space allocated to hold the list.
     */
    kmem_free(vplist, num * sizeof (*vplist));
}
int
nfs4_free_data_reclaim(rnode4_t *rp)
{
    char *contents;
    vnode_t *xattr;
    int size;
    vsecattr_t *vsp;
    int freed;
    bool_t rdc = FALSE;

    /*
     * Free any held caches which may
     * be associated with this rnode.
     */
    mutex_enter(&rp->r_statelock);
    if (rp->r_dir != NULL)
        rdc = TRUE;
    contents = rp->r_symlink.contents;
    size = rp->r_symlink.size;
    rp->r_symlink.contents = NULL;
    vsp = rp->r_secattr;
    rp->r_secattr = NULL;
    xattr = rp->r_xattr_dir;
    rp->r_xattr_dir = NULL;
    mutex_exit(&rp->r_statelock);

    /*
     * Free the access cache entries.
     */
    freed = nfs4_access_purge_rp(rp);

    if (rdc == FALSE && contents == NULL && vsp == NULL && xattr == NULL)
        return (freed);

    /*
     * Free the readdir cache entries, incompletely if we can't block.
     */
    nfs4_purge_rddir_cache(RTOV4(rp));

    /*
     * Free the symbolic link cache.
     */
    if (contents != NULL) {
        kmem_free((void *)contents, size);
    }

    /*
     * Free any cached ACL.
     */
    if (vsp != NULL)
        nfs4_acl_free_cache(vsp);

    /*
     * Release the xattr directory vnode
     */
    if (xattr != NULL)
        VN_RELE(xattr);

    return (1);
}
static int
nfs4_active_data_reclaim(rnode4_t *rp)
{
    char *contents;
    vnode_t *xattr = NULL;
    int size;
    vsecattr_t *vsp;
    int freed;
    bool_t rdc = FALSE;

    /*
     * Free any held credentials and caches which
     * may be associated with this rnode.
     */
    if (!mutex_tryenter(&rp->r_statelock))
        return (0);
    contents = rp->r_symlink.contents;
    size = rp->r_symlink.size;
    rp->r_symlink.contents = NULL;
    vsp = rp->r_secattr;
    rp->r_secattr = NULL;
    if (rp->r_dir != NULL)
        rdc = TRUE;
    /*
     * To avoid a deadlock, do not free r_xattr_dir cache if it is hashed
     * on the same r_hashq queue. We are not mandated to free all caches.
     * VN_RELE(rp->r_xattr_dir) will be done sometime later - e.g. when the
     * rnode 'rp' is freed or put on the free list.
     *
     * We will retain NFS4_XATTR_DIR_NOTSUPP because:
     * - it has no associated rnode4_t (its v_data is NULL),
     * - it is preallocated statically and will never go away,
     * so we cannot save anything by releasing it.
     */
    if (rp->r_xattr_dir && rp->r_xattr_dir != NFS4_XATTR_DIR_NOTSUPP &&
        VTOR4(rp->r_xattr_dir)->r_hashq != rp->r_hashq) {
        xattr = rp->r_xattr_dir;
        rp->r_xattr_dir = NULL;
    }
    mutex_exit(&rp->r_statelock);

    /*
     * Free the access cache entries.
     */
    freed = nfs4_access_purge_rp(rp);

    if (contents == NULL && vsp == NULL && rdc == FALSE && xattr == NULL)
        return (freed);

    /*
     * Free the symbolic link cache.
     */
    if (contents != NULL) {
        kmem_free((void *)contents, size);
    }

    /*
     * Free any cached ACL.
     */
    if (vsp != NULL)
        nfs4_acl_free_cache(vsp);

    nfs4_purge_rddir_cache(RTOV4(rp));

    /*
     * Release the xattr directory vnode
     */
    if (xattr != NULL)
        VN_RELE(xattr);

    return (1);
}
static int
nfs4_free_reclaim(void)
{
    int freed;
    rnode4_t *rp;

    clstat4_debug.f_reclaim.value.ui64++;

    freed = 0;
    mutex_enter(&rp4freelist_lock);
    rp = rp4freelist;
    if (rp != NULL) {
        do {
            if (nfs4_free_data_reclaim(rp))
                freed = 1;
        } while ((rp = rp->r_freef) != rp4freelist);
    }
    mutex_exit(&rp4freelist_lock);

    return (freed);
}

static int
nfs4_active_reclaim(void)
{
    int freed;
    int index;
    rnode4_t *rp;

    clstat4_debug.a_reclaim.value.ui64++;

    freed = 0;
    for (index = 0; index < rtable4size; index++) {
        rw_enter(&rtable4[index].r_lock, RW_READER);
        for (rp = rtable4[index].r_hashf;
            rp != (rnode4_t *)(&rtable4[index]);
            rp = rp->r_hashf) {
            if (nfs4_active_data_reclaim(rp))
                freed = 1;
        }
        rw_exit(&rtable4[index].r_lock);
    }

    return (freed);
}

static int
nfs4_rnode_reclaim(void)
{
    int freed;
    rnode4_t *rp;
    vnode_t *vp;

    clstat4_debug.r_reclaim.value.ui64++;

    freed = 0;
    mutex_enter(&rp4freelist_lock);
    while ((rp = rp4freelist) != NULL) {
        rp4_rmfree(rp);
        mutex_exit(&rp4freelist_lock);
        if (rp->r_flags & R4HASHED) {
            vp = RTOV4(rp);
            rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
            mutex_enter(&vp->v_lock);
            if (vp->v_count > 1) {
                mutex_exit(&vp->v_lock);
                rw_exit(&rp->r_hashq->r_lock);
                mutex_enter(&rp4freelist_lock);
                continue;
            }
            mutex_exit(&vp->v_lock);
            rp4_rmhash_locked(rp);
            rw_exit(&rp->r_hashq->r_lock);
        }
        /*
         * This call to rp4_addfree will end up destroying the
         * rnode, but in a safe way with the appropriate set
         * of checks done.
         */
        rp4_addfree(rp, CRED());
        mutex_enter(&rp4freelist_lock);
    }
    mutex_exit(&rp4freelist_lock);

    return (freed);
}

static void
nfs4_reclaim(void *cdrarg)
{
    clstat4_debug.reclaim.value.ui64++;

    if (nfs4_free_reclaim())
        return;

    if (nfs4_active_reclaim())
        return;

    (void) nfs4_rnode_reclaim();
}
/*
 * Returns the clientid4 to use for the given mntinfo4.  Note that the
 * clientid can change if the caller drops mi_recovlock.
 */
clientid4
mi2clientid(mntinfo4_t *mi)
{
    nfs4_server_t *sp;
    clientid4 clientid = 0;

    /* this locks down sp if it is found */
    sp = find_nfs4_server(mi);
    if (sp != NULL) {
        clientid = sp->clientid;
        mutex_exit(&sp->s_lock);
        nfs4_server_rele(sp);
    }
    return (clientid);
}
/*
 * Return the current lease time for the server associated with the given
 * file.  Note that the lease time could change immediately after this
 * call.
 */
time_t
r2lease_time(rnode4_t *rp)
{
    nfs4_server_t *sp;
    time_t lease_time;
    mntinfo4_t *mi = VTOMI4(RTOV4(rp));

    (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);

    /* this locks down sp if it is found */
    sp = find_nfs4_server(VTOMI4(RTOV4(rp)));

    if (VTOMI4(RTOV4(rp))->mi_vfsp->vfs_flag & VFS_UNMOUNTED) {
        if (sp != NULL) {
            mutex_exit(&sp->s_lock);
            nfs4_server_rele(sp);
        }
        nfs_rw_exit(&mi->mi_recovlock);
        return (1);		/* 1 second */
    }

    lease_time = sp->s_lease_time;

    mutex_exit(&sp->s_lock);
    nfs4_server_rele(sp);
    nfs_rw_exit(&mi->mi_recovlock);

    return (lease_time);
}
/*
 * Return a list with information about all the known open instances for
 * a filesystem.  The caller must call r4releopenlist() when done with the
 * list.
 *
 * We are safe at looking at os_valid and os_pending_close across dropping
 * the 'os_sync_lock' to count up the number of open streams and then
 * allocate memory for the osp list due to:
 *	-Looking at os_pending_close is safe since this routine is
 *	only called via recovery, and os_pending_close can only be set via
 *	a non-recovery operation (which are all blocked when recovery
 *	is active).
 *
 *	-Examining os_valid is safe since non-recovery operations, which
 *	could potentially switch os_valid to 0, are blocked (via
 *	nfs4_start_fop) and recovery is single-threaded per mntinfo4_t
 *	(which means we are the only recovery thread potentially acting
 *	on this open stream).
 */
nfs4_opinst_t *
r4mkopenlist(mntinfo4_t *mi)
{
    nfs4_opinst_t *reopenlist, *rep;
    rnode4_t *rp;
    vnode_t *vp;
    vfs_t *vfsp = mi->mi_vfsp;
    int numosp;
    nfs4_open_stream_t *osp;
    int index;
    open_delegation_type4 dtype;
    int hold_vnode;

    reopenlist = NULL;

    for (index = 0; index < rtable4size; index++) {
        rw_enter(&rtable4[index].r_lock, RW_READER);
        for (rp = rtable4[index].r_hashf;
            rp != (rnode4_t *)(&rtable4[index]);
            rp = rp->r_hashf) {

            vp = RTOV4(rp);
            if (vp->v_vfsp != vfsp)
                continue;
            hold_vnode = 0;

            mutex_enter(&rp->r_os_lock);

            /* Count the number of valid open_streams of the file */
            numosp = 0;
            for (osp = list_head(&rp->r_open_streams); osp != NULL;
                osp = list_next(&rp->r_open_streams, osp)) {
                mutex_enter(&osp->os_sync_lock);
                if (osp->os_valid && !osp->os_pending_close)
                    numosp++;
                mutex_exit(&osp->os_sync_lock);
            }

            /* Fill in the valid open streams per vp */
            if (numosp > 0) {
                int j;

                hold_vnode = 1;

                /*
                 * Add a new open instance to the list
                 */
                rep = kmem_zalloc(sizeof (*reopenlist),
                    KM_SLEEP);
                rep->re_next = reopenlist;
                reopenlist = rep;

                rep->re_vp = vp;
                rep->re_osp = kmem_zalloc(
                    numosp * sizeof (*(rep->re_osp)),
                    KM_SLEEP);
                rep->re_numosp = numosp;

                j = 0;
                for (osp = list_head(&rp->r_open_streams);
                    osp != NULL;
                    osp = list_next(&rp->r_open_streams, osp)) {
                    mutex_enter(&osp->os_sync_lock);
                    if (osp->os_valid &&
                        !osp->os_pending_close) {
                        osp->os_ref_count++;
                        rep->re_osp[j] = osp;
                        j++;
                    }
                    mutex_exit(&osp->os_sync_lock);
                }
                /*
                 * Assuming valid osp(s) stays valid between
                 * the time obtaining j and numosp.
                 */
                ASSERT(j == numosp);
            }

            mutex_exit(&rp->r_os_lock);
            /* do this here to keep v_lock > r_os_lock */
            if (hold_vnode)
                VN_HOLD(vp);
            mutex_enter(&rp->r_statev4_lock);
            if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
                /*
                 * If this rnode holds a delegation,
                 * but if there are no valid open streams,
                 * then just discard the delegation
                 * without doing delegreturn.
                 */
                if (numosp > 0)
                    rp->r_deleg_needs_recovery =
                        rp->r_deleg_type;
            }
            /* Save the delegation type for use outside the lock */
            dtype = rp->r_deleg_type;
            mutex_exit(&rp->r_statev4_lock);

            /*
             * If we have a delegation then get rid of it.
             * We've set rp->r_deleg_needs_recovery so we have
             * enough information to recover.
             */
            if (dtype != OPEN_DELEGATE_NONE) {
                (void) nfs4delegreturn(rp, NFS4_DR_DISCARD);
            }
        }
        rw_exit(&rtable4[index].r_lock);
    }
    return (reopenlist);
}
/*
 * Given a filesystem id, check to see if any rnodes
 * within this fsid reside in the rnode cache, other
 * than one we know about.
 *
 * Return 1 if an rnode is found, 0 otherwise
 */
int
r4find_by_fsid(mntinfo4_t *mi, fattr4_fsid *moved_fsid)
{
    rnode4_t *rp;
    vnode_t *vp;
    vfs_t *vfsp = mi->mi_vfsp;
    fattr4_fsid *fsid;
    int index, found = 0;

    for (index = 0; index < rtable4size; index++) {
        rw_enter(&rtable4[index].r_lock, RW_READER);
        for (rp = rtable4[index].r_hashf;
            rp != (rnode4_t *)(&rtable4[index]);
            rp = rp->r_hashf) {

            vp = RTOV4(rp);
            if (vp->v_vfsp != vfsp)
                continue;

            /*
             * XXX there might be a case where a
             * replicated fs may have the same fsid
             * across two different servers. This
             * check isn't good enough in that case
             */
            fsid = &rp->r_srv_fsid;
            if (FATTR4_FSID_EQ(moved_fsid, fsid)) {
                found = 1;
                break;
            }
        }
        rw_exit(&rtable4[index].r_lock);

        if (found)
            break;
    }
    return (found);
}
/*
 * Release the list of open instance references.
 */
void
r4releopenlist(nfs4_opinst_t *reopenp)
{
    nfs4_opinst_t *rep, *next;
    int i;

    for (rep = reopenp; rep; rep = next) {
        next = rep->re_next;

        for (i = 0; i < rep->re_numosp; i++)
            open_stream_rele(rep->re_osp[i], VTOR4(rep->re_vp));

        VN_RELE(rep->re_vp);
        kmem_free(rep->re_osp,
            rep->re_numosp * sizeof (*(rep->re_osp)));

        kmem_free(rep, sizeof (*rep));
    }
}
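/*
 * Illustrative sketch only (not part of the original source): the intended
 * pairing of r4mkopenlist() and r4releopenlist() as a recovery caller would
 * use it.  The loop body is hypothetical; the block is not compiled.
 */
#if 0
    nfs4_opinst_t *reopenlist, *rep;

    reopenlist = r4mkopenlist(mi);
    for (rep = reopenlist; rep != NULL; rep = rep->re_next) {
        /* reopen each valid open stream recorded in rep->re_osp[] ... */
    }
    r4releopenlist(reopenlist);     /* drops the holds taken above */
#endif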
void
nfs4_rnode_init(void)
{
    ulong_t nrnode4_max;
    int i;

    /*
     * Compute the size of the rnode4 hash table
     */
    nrnode4_max =
        (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode4));
    if (nrnode > nrnode4_max || (nrnode == 0 && ncsize == 0)) {
        zcmn_err(GLOBAL_ZONEID, CE_NOTE,
            "!setting nrnode to max value of %ld", nrnode4_max);
        nrnode = nrnode4_max;
    }
    rtable4size = 1 << highbit(nrnode / rnode4_hashlen);
    rtable4mask = rtable4size - 1;

    /*
     * Allocate and initialize the hash buckets
     */
    rtable4 = kmem_alloc(rtable4size * sizeof (*rtable4), KM_SLEEP);
    for (i = 0; i < rtable4size; i++) {
        rtable4[i].r_hashf = (rnode4_t *)(&rtable4[i]);
        rtable4[i].r_hashb = (rnode4_t *)(&rtable4[i]);
        rw_init(&rtable4[i].r_lock, NULL, RW_DEFAULT, NULL);
    }

    rnode4_cache = kmem_cache_create("rnode4_cache", sizeof (rnode4_t),
        0, NULL, NULL, nfs4_reclaim, NULL, NULL, 0);
}
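/*
 * Illustrative sketch only (not part of the original source): the sizing
 * arithmetic above, worked with assumed example numbers.  With nrnode of
 * 16384 and rnode4_hashlen of 4, highbit(4096) is 13, so the table gets
 * 8192 buckets and the mask lets rtable4hash() reduce a pointer-derived
 * value to a bucket index with a single AND.  Not compiled.
 */
#if 0
    rtable4size = 1 << highbit(16384 / 4);	/* 1 << 13 == 8192 buckets */
    rtable4mask = rtable4size - 1;		/* 0x1fff */
#endif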
void
nfs4_rnode_fini(void)
{
    int i;

    /*
     * Deallocate the rnode hash queues
     */
    kmem_cache_destroy(rnode4_cache);

    for (i = 0; i < rtable4size; i++)
        rw_destroy(&rtable4[i].r_lock);

    kmem_free(rtable4, rtable4size * sizeof (*rtable4));
}

/*
 * Return non-zero if the given filehandle refers to the root filehandle
 * for the given rnode.
 */
static int
isrootfh(nfs4_sharedfh_t *fh, rnode4_t *rp)
{
    if (SFH4_SAME(VTOMI4(RTOV4(rp))->mi_rootfh, fh))
        return (1);

    return (0);
}
/*
 * The r4_stub_* routines assume that the rnode is newly activated, and
 * that the caller either holds the hash bucket r_lock for this rnode as
 * RW_WRITER, or holds r_statelock.
 */
static void
r4_stub_set(rnode4_t *rp, nfs4_stub_type_t type)
{
    vnode_t *vp = RTOV4(rp);
    krwlock_t *hash_lock = &rp->r_hashq->r_lock;

    ASSERT(RW_WRITE_HELD(hash_lock) || MUTEX_HELD(&rp->r_statelock));

    rp->r_stub_type = type;

    /*
     * Safely switch this vnode to the trigger vnodeops.
     *
     * Currently, we don't ever switch a trigger vnode back to using
     * "regular" v4 vnodeops. NFS4_STUB_NONE is only used to note that
     * a new v4 object is not a trigger, and it will already have the
     * correct v4 vnodeops by default. So, no "else" case required here.
     */
    if (type != NFS4_STUB_NONE)
        vn_setops(vp, &nfs4_trigger_vnodeops);
}

void
r4_stub_mirrormount(rnode4_t *rp)
{
    r4_stub_set(rp, NFS4_STUB_MIRRORMOUNT);
}

void
r4_stub_referral(rnode4_t *rp)
{
    DTRACE_PROBE1(nfs4clnt__func__referral__moved,
        vnode_t *, RTOV4(rp));
    r4_stub_set(rp, NFS4_STUB_REFERRAL);
}

void
r4_stub_none(rnode4_t *rp)
{
    r4_stub_set(rp, NFS4_STUB_NONE);
}
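/*
 * Illustrative sketch only (not part of the original source): a caller that
 * is not holding the hash bucket r_lock as RW_WRITER satisfies the locking
 * assumption documented above by taking r_statelock instead.  Not compiled.
 */
#if 0
    mutex_enter(&rp->r_statelock);
    r4_stub_referral(rp);           /* marks rp as a referral trigger */
    mutex_exit(&rp->r_statelock);
#endif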
/*
 * Look in the rnode table for other rnodes that have the same filehandle.
 * Assume the lock is held for the hash chain of checkrp
 */
static void
r4_dup_check(rnode4_t *checkrp, vfs_t *vfsp)
{
    rnode4_t *rp;
    vnode_t *tvp;
    nfs4_fhandle_t fh, fh2;
    int index;

    if (!r4_check_for_dups)
        return;

    ASSERT(RW_LOCK_HELD(&checkrp->r_hashq->r_lock));

    sfh4_copyval(checkrp->r_fh, &fh);

    for (index = 0; index < rtable4size; index++) {

        if (&rtable4[index] != checkrp->r_hashq)
            rw_enter(&rtable4[index].r_lock, RW_READER);

        for (rp = rtable4[index].r_hashf;
            rp != (rnode4_t *)(&rtable4[index]);
            rp = rp->r_hashf) {

            if (rp == checkrp)
                continue;

            tvp = RTOV4(rp);
            if (tvp->v_vfsp != vfsp)
                continue;

            sfh4_copyval(rp->r_fh, &fh2);
            if (nfs4cmpfhandle(&fh, &fh2) == 0) {
                cmn_err(CE_PANIC, "rnodes with same fs, fh "
                    "(%p, %p)", (void *)checkrp, (void *)rp);
            }
        }

        if (&rtable4[index] != checkrp->r_hashq)
            rw_exit(&rtable4[index].r_lock);
    }
}