 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
 *
 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
 * All rights reserved.
 */
#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/thread.h>
#include <sys/t_lock.h>
#include <sys/vnode.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>
#include <sys/flock.h>
#include <sys/share.h>
#include <sys/cmn_err.h>
#include <sys/tiuser.h>
#include <sys/sysmacros.h>
#include <sys/callb.h>
#include <sys/kstat.h>
#include <sys/signal.h>
#include <rpc/types.h>
#include <nfs/nfs_clnt.h>
#include <nfs/rnode.h>
#include <nfs/nfs_acl.h>
#include <vm/seg_map.h>
#include <vm/seg_vn.h>
static void nfs3_attr_cache(vnode_t *, vattr_t *, vattr_t *, hrtime_t,
	cred_t *);
static int nfs_getattr_cache(vnode_t *, struct vattr *);
static int nfs_remove_locking_id(vnode_t *, int, char *, char *, int *);
kmutex_t mig_lock;			/* lock protecting mig_list */
list_t mig_list;			/* list of NFS v2 or v3 mounts in zone */
boolean_t mig_destructor_called;

static zone_key_t mi_list_key;

/* Debugging flag for PC file shares. */
extern int share_debug;
/*
 * Attributes are cached in the rnode in struct vattr form.
 * There is a time associated with the cached attributes (r_attrtime)
 * which tells whether the attributes are valid. The time is initialized
 * to the difference between current time and the modify time of the vnode
 * when new attributes are cached. This allows the attributes for
 * files that have changed recently to be timed out sooner than for files
 * that have not changed for a long time. There are minimum and maximum
 * timeout values that can be set per mount point.
 */
nfs_waitfor_purge_complete(vnode_t *vp)

	if (rp->r_serial != NULL && rp->r_serial != curthread) {
		mutex_enter(&rp->r_statelock);
		sigintr(&smask, VTOMI(vp)->mi_flags & MI_INT);
		while (rp->r_serial != NULL) {
			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
				mutex_exit(&rp->r_statelock);
		mutex_exit(&rp->r_statelock);
/*
 * Validate caches by checking cached attributes. If the cached
 * attributes have timed out, then get new attributes from the server.
 * As a side effect, this will do cache invalidation if the attributes
 * have changed.
 *
 * If the attributes have not timed out and if there is a cache
 * invalidation being done by some other thread, then wait until that
 * thread has completed the cache invalidation.
 */
nfs_validate_caches(vnode_t *vp, cred_t *cr)

	if (ATTRCACHE_VALID(vp)) {
		error = nfs_waitfor_purge_complete(vp);

	return (nfs_getattr_otw(vp, &va, cr));
/*
 * Validate caches by checking cached attributes. If the cached
 * attributes have timed out, then get new attributes from the server.
 * As a side effect, this will do cache invalidation if the attributes
 * have changed.
 *
 * If the attributes have not timed out and if there is a cache
 * invalidation being done by some other thread, then wait until that
 * thread has completed the cache invalidation.
 */
nfs3_validate_caches(vnode_t *vp, cred_t *cr)

	if (ATTRCACHE_VALID(vp)) {
		error = nfs_waitfor_purge_complete(vp);

	return (nfs3_getattr_otw(vp, &va, cr));
/*
 * Purge all of the various NFS `data' caches.
 */
nfs_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr)
	/*
	 * Purge the DNLC for any entries which refer to this file.
	 * Avoid recursive entry into dnlc_purge_vp() in case of a directory.
	 */
	mutex_enter(&rp->r_statelock);
	if (vp->v_count > 1 &&
	    (vp->v_type == VDIR || purge_dnlc == NFS_PURGE_DNLC) &&
	    !(rp->r_flags & RINDNLCPURGE)) {
		/*
		 * Set the RINDNLCPURGE flag to prevent recursive entry
		 * into dnlc_purge_vp()
		 */
		if (vp->v_type == VDIR)
			rp->r_flags |= RINDNLCPURGE;
		mutex_exit(&rp->r_statelock);
		mutex_enter(&rp->r_statelock);
		if (rp->r_flags & RINDNLCPURGE)
			rp->r_flags &= ~RINDNLCPURGE;
	/*
	 * Clear any readdir state bits and purge the readlink response cache.
	 */
	contents = rp->r_symlink.contents;
	size = rp->r_symlink.size;
	rp->r_symlink.contents = NULL;
	mutex_exit(&rp->r_statelock);

	if (contents != NULL) {
		kmem_free((void *)contents, size);
	/*
	 * Flush the page cache.
	 */
	if (vn_has_cached_data(vp)) {
		error = fop_putpage(vp, 0, 0, B_INVAL, cr, NULL);
		if (error && (error == ENOSPC || error == EDQUOT)) {
			mutex_enter(&rp->r_statelock);
			mutex_exit(&rp->r_statelock);

	/*
	 * Flush the readdir response cache.
	 */
	if (HAVE_RDDIR_CACHE(rp))
		nfs_purge_rddir_cache(vp);
/*
 * Purge the readdir cache of all entries
 */
nfs_purge_rddir_cache(vnode_t *vp)

	mutex_enter(&rp->r_statelock);
	rp->r_flags &= ~RLOOKUP;
	rp->r_flags |= RREADDIRPLUS;
	rdc = avl_first(&rp->r_dir);
	while (rdc != NULL) {
		nrdc = AVL_NEXT(&rp->r_dir, rdc);
		avl_remove(&rp->r_dir, rdc);
		rddir_cache_rele(rdc);
	mutex_exit(&rp->r_statelock);
/*
 * Do a cache check based on the post-operation attributes.
 * Then make them the new cached attributes. If no attributes
 * were returned, then mark the attributes as timed out.
 */
nfs3_cache_post_op_attr(vnode_t *vp, post_op_attr *poap, hrtime_t t, cred_t *cr)

	if (!poap->attributes) {

	(void) nfs3_cache_fattr3(vp, &poap->attr, &attr, t, cr);
/*
 * Same as above, but using a vattr
 */
nfs3_cache_post_op_vattr(vnode_t *vp, post_op_vattr *poap, hrtime_t t,
	cred_t *cr)

	if (!poap->attributes) {

	nfs_attr_cache(vp, poap->fres.vap, t, cr);
/*
 * Do a cache check based on the weak cache consistency attributes.
 * These consist of a small set of pre-operation attributes and the
 * full set of post-operation attributes.
 *
 * If we are given the pre-operation attributes, then use them to
 * check the validity of the various caches. Then, if we got the
 * post-operation attributes, make them the new cached attributes.
 * If we didn't get the post-operation attributes, then mark the
 * attribute cache as timed out so that the next reference will
 * cause a GETATTR to the server to refresh with the current
 * attributes.
 *
 * Otherwise, if we didn't get the pre-operation attributes, but
 * we did get the post-operation attributes, then use these
 * attributes to check the validity of the various caches. This
 * will probably cause a flush of the caches because if the
 * operation succeeded, the attributes of the object were changed
 * in some way from the old post-operation attributes. This
 * should be okay because it is the safe thing to do. After
 * checking the data caches, then we make these the new cached
 * attributes.
 *
 * Otherwise, we didn't get either the pre- or post-operation
 * attributes. Simply mark the attribute cache as timed out so
 * the next reference will cause a GETATTR to the server to
 * refresh with the current attributes.
 *
 * If an error occurred trying to convert the over the wire
 * attributes to a vattr, then simply mark the attribute cache as
 * timed out.
 */
nfs3_cache_wcc_data(vnode_t *vp, wcc_data *wccp, hrtime_t t, cred_t *cr)

	if (wccp->after.attributes) {
		if (fattr3_to_vattr(vp, &wccp->after.attr, &ava)) {
		if (wccp->before.attributes) {
			bva.va_ctime.tv_sec = wccp->before.attr.ctime.seconds;
			bva.va_ctime.tv_nsec = wccp->before.attr.ctime.nseconds;
			bva.va_mtime.tv_sec = wccp->before.attr.mtime.seconds;
			bva.va_mtime.tv_nsec = wccp->before.attr.mtime.nseconds;
			bva.va_size = wccp->before.attr.size;
			nfs3_attr_cache(vp, &bva, &ava, t, cr);
		} else
			nfs_attr_cache(vp, &ava, t, cr);
/*
 * Set attributes cache for given vnode using nfsattr.
 *
 * This routine does not do cache validation with the attributes.
 *
 * If an error occurred trying to convert the over the wire
 * attributes to a vattr, then simply mark the attribute cache as
 * timed out.
 */
nfs_attrcache(vnode_t *vp, struct nfsfattr *na, hrtime_t t)

	if (!nattr_to_vattr(vp, na, &va)) {
		mutex_enter(&rp->r_statelock);
		if (rp->r_mtime <= t)
			nfs_attrcache_va(vp, &va);
		mutex_exit(&rp->r_statelock);
/*
 * Set attributes cache for given vnode using fattr3.
 *
 * This routine does not do cache validation with the attributes.
 *
 * If an error occurred trying to convert the over the wire
 * attributes to a vattr, then simply mark the attribute cache as
 * timed out.
 */
nfs3_attrcache(vnode_t *vp, fattr3 *na, hrtime_t t)

	if (!fattr3_to_vattr(vp, na, &va)) {
		mutex_enter(&rp->r_statelock);
		if (rp->r_mtime <= t)
			nfs_attrcache_va(vp, &va);
		mutex_exit(&rp->r_statelock);
/*
 * Do a cache check based on attributes returned over the wire. The
 * new attributes are cached.
 *
 * If an error occurred trying to convert the over the wire attributes
 * to a vattr, then just return that error.
 *
 * As a side effect, the vattr argument is filled in with the converted
 * attributes.
 */
nfs_cache_fattr(vnode_t *vp, struct nfsfattr *na, vattr_t *vap, hrtime_t t,
	cred_t *cr)

	error = nattr_to_vattr(vp, na, vap);
	nfs_attr_cache(vp, vap, t, cr);
/*
 * Do a cache check based on attributes returned over the wire. The
 * new attributes are cached.
 *
 * If an error occurred trying to convert the over the wire attributes
 * to a vattr, then just return that error.
 *
 * As a side effect, the vattr argument is filled in with the converted
 * attributes.
 */
nfs3_cache_fattr3(vnode_t *vp, fattr3 *na, vattr_t *vap, hrtime_t t, cred_t *cr)

	error = fattr3_to_vattr(vp, na, vap);
	nfs_attr_cache(vp, vap, t, cr);
/*
 * Use the passed in virtual attributes to check to see whether the
 * data and metadata caches are valid, cache the new attributes, and
 * then do the cache invalidation if required.
 *
 * The cache validation and caching of the new attributes is done
 * atomically via the use of the mutex, r_statelock. If required,
 * the cache invalidation is done atomically w.r.t. the cache
 * validation and caching of the attributes via the pseudo lock,
 * r_serial.
 *
 * This routine is used to do cache validation and attributes caching
 * for operations with a single set of post operation attributes.
 */
nfs_attr_cache(vnode_t *vp, vattr_t *vap, hrtime_t t, cred_t *cr)

	int mtime_changed = 0;
	int ctime_changed = 0;
	boolean_t writeattr_set = B_FALSE;
	boolean_t cachepurge_set = B_FALSE;

	mutex_enter(&rp->r_statelock);

	if (rp->r_serial != curthread) {
		klwp_t *lwp = ttolwp(curthread);

		while (rp->r_serial != NULL) {
			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
				mutex_exit(&rp->r_statelock);

	if (rp->r_mtime > t) {
		if (!CACHE_VALID(rp, vap->va_mtime, vap->va_size))
			PURGE_ATTRCACHE_LOCKED(rp);
		mutex_exit(&rp->r_statelock);

	/*
	 * Write thread after writing data to file on remote server,
	 * will always set RWRITEATTR to indicate that file on remote
	 * server was modified with a WRITE operation and would have
	 * marked attribute cache as timed out. If RWRITEATTR
	 * is set, then do not check for mtime and ctime change.
	 */
	if (!(rp->r_flags & RWRITEATTR)) {
		if (!CACHE_VALID(rp, vap->va_mtime, vap->va_size))

		if (rp->r_attr.va_ctime.tv_sec != vap->va_ctime.tv_sec ||
		    rp->r_attr.va_ctime.tv_nsec != vap->va_ctime.tv_nsec)

		writeattr_set = B_TRUE;

	preattr_rsize = rp->r_size;

	nfs_attrcache_va(vp, vap);

	/*
	 * If we have updated filesize in nfs_attrcache_va, as soon as we
	 * drop statelock we will be in transition of purging all
	 * our caches and updating them. It is possible for another
	 * thread to pick this new file size and read in zeroed data.
	 * Stall other threads till cache purge is complete.
	 */
	if ((vp->v_type == VREG) && (rp->r_size != preattr_rsize)) {
		/*
		 * If RWRITEATTR was set and we have updated the file
		 * size, Server's returned file size need not necessarily
		 * be because of this Client's WRITE. We need to purge
		 * all of our caches.
		 */
		if (mtime_changed && !(rp->r_flags & RINCACHEPURGE)) {
			rp->r_flags |= RINCACHEPURGE;
			cachepurge_set = B_TRUE;

	if (!mtime_changed && !ctime_changed) {
		mutex_exit(&rp->r_statelock);

	rp->r_serial = curthread;

	mutex_exit(&rp->r_statelock);

	nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr);

	if ((rp->r_flags & RINCACHEPURGE) && cachepurge_set) {
		mutex_enter(&rp->r_statelock);
		rp->r_flags &= ~RINCACHEPURGE;
		cv_broadcast(&rp->r_cv);
		mutex_exit(&rp->r_statelock);
		cachepurge_set = B_FALSE;

	(void) nfs_access_purge_rp(rp);
	if (rp->r_secattr != NULL) {
		mutex_enter(&rp->r_statelock);
		rp->r_secattr = NULL;
		mutex_exit(&rp->r_statelock);

	mutex_enter(&rp->r_statelock);
	cv_broadcast(&rp->r_cv);
	mutex_exit(&rp->r_statelock);
/*
 * Use the passed in "before" virtual attributes to check to see
 * whether the data and metadata caches are valid, cache the "after"
 * new attributes, and then do the cache invalidation if required.
 *
 * The cache validation and caching of the new attributes is done
 * atomically via the use of the mutex, r_statelock. If required,
 * the cache invalidation is done atomically w.r.t. the cache
 * validation and caching of the attributes via the pseudo lock,
 * r_serial.
 *
 * This routine is used to do cache validation and attributes caching
 * for operations with both pre operation attributes and post operation
 * attributes.
 */
nfs3_attr_cache(vnode_t *vp, vattr_t *bvap, vattr_t *avap, hrtime_t t,
	cred_t *cr)

	int mtime_changed = 0;
	int ctime_changed = 0;
	boolean_t writeattr_set = B_FALSE;
	boolean_t cachepurge_set = B_FALSE;

	mutex_enter(&rp->r_statelock);

	if (rp->r_serial != curthread) {
		klwp_t *lwp = ttolwp(curthread);

		while (rp->r_serial != NULL) {
			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
				mutex_exit(&rp->r_statelock);

	if (rp->r_mtime > t) {
		if (!CACHE_VALID(rp, avap->va_mtime, avap->va_size))
			PURGE_ATTRCACHE_LOCKED(rp);
		mutex_exit(&rp->r_statelock);

	/*
	 * Write thread after writing data to file on remote server,
	 * will always set RWRITEATTR to indicate that file on remote
	 * server was modified with a WRITE operation and would have
	 * marked attribute cache as timed out. If RWRITEATTR
	 * is set, then do not check for mtime and ctime change.
	 */
	if (!(rp->r_flags & RWRITEATTR)) {
		if (!CACHE_VALID(rp, bvap->va_mtime, bvap->va_size))

		if (rp->r_attr.va_ctime.tv_sec != bvap->va_ctime.tv_sec ||
		    rp->r_attr.va_ctime.tv_nsec != bvap->va_ctime.tv_nsec)

		writeattr_set = B_TRUE;

	preattr_rsize = rp->r_size;

	nfs_attrcache_va(vp, avap);

	/*
	 * If we have updated filesize in nfs_attrcache_va, as soon as we
	 * drop statelock we will be in transition of purging all
	 * our caches and updating them. It is possible for another
	 * thread to pick this new file size and read in zeroed data.
	 * Stall other threads till cache purge is complete.
	 */
	if ((vp->v_type == VREG) && (rp->r_size != preattr_rsize)) {
		/*
		 * If RWRITEATTR was set and we have updated the file
		 * size, Server's returned file size need not necessarily
		 * be because of this Client's WRITE. We need to purge
		 * all of our caches.
		 */
		if (mtime_changed && !(rp->r_flags & RINCACHEPURGE)) {
			rp->r_flags |= RINCACHEPURGE;
			cachepurge_set = B_TRUE;

	if (!mtime_changed && !ctime_changed) {
		mutex_exit(&rp->r_statelock);

	rp->r_serial = curthread;

	mutex_exit(&rp->r_statelock);

	nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr);

	if ((rp->r_flags & RINCACHEPURGE) && cachepurge_set) {
		mutex_enter(&rp->r_statelock);
		rp->r_flags &= ~RINCACHEPURGE;
		cv_broadcast(&rp->r_cv);
		mutex_exit(&rp->r_statelock);
		cachepurge_set = B_FALSE;

	(void) nfs_access_purge_rp(rp);
	if (rp->r_secattr != NULL) {
		mutex_enter(&rp->r_statelock);
		rp->r_secattr = NULL;
		mutex_exit(&rp->r_statelock);

	mutex_enter(&rp->r_statelock);
	cv_broadcast(&rp->r_cv);
	mutex_exit(&rp->r_statelock);
/*
 * Set attributes cache for given vnode using virtual attributes.
 *
 * Set the timeout value on the attribute cache and fill it
 * with the passed in attributes.
 *
 * The caller must be holding r_statelock.
 */
nfs_attrcache_va(vnode_t *vp, struct vattr *va)
	ASSERT(MUTEX_HELD(&rp->r_statelock));

	/*
	 * Delta is the number of nanoseconds that we will
	 * cache the attributes of the file. It is based on
	 * the number of nanoseconds since the last time that
	 * we detected a change. The assumption is that files
	 * that changed recently are likely to change again.
	 * There is a minimum and a maximum for regular files
	 * and for directories which is enforced though.
	 *
	 * Using the time since last change was detected
	 * eliminates direct comparison or calculation
	 * using mixed client and server times. NFS does
	 * not make any assumptions regarding the client
	 * and server clocks being synchronized.
	 */
	if (va->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec ||
	    va->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec ||
	    va->va_size != rp->r_attr.va_size)

	if ((mi->mi_flags & MI_NOAC) || (vp->v_flag & VNOCACHE))

	delta = now - rp->r_mtime;
	if (vp->v_type == VDIR) {
		if (delta < mi->mi_acdirmin)
			delta = mi->mi_acdirmin;
		else if (delta > mi->mi_acdirmax)
			delta = mi->mi_acdirmax;
	} else {
		if (delta < mi->mi_acregmin)
			delta = mi->mi_acregmin;
		else if (delta > mi->mi_acregmax)
			delta = mi->mi_acregmax;
	}
	rp->r_attrtime = now + delta;

	/*
	 * Update the size of the file if there is no cached data or if
	 * the cached data is clean and there is no data being written
	 * out.
	 */
	if (rp->r_size != va->va_size &&
	    (!vn_has_cached_data(vp) ||
	    (!(rp->r_flags & RDIRTY) && rp->r_count == 0)))
		rp->r_size = va->va_size;
	nfs_setswaplike(vp, va);
	rp->r_flags &= ~RWRITEATTR;
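
/*
 * Editorial note (illustrative, not part of the original source; the
 * numbers are hypothetical): if a regular file was last observed to
 * change 10 seconds before the new attributes were cached, delta starts
 * at roughly 10s worth of nanoseconds and, with bounds such as
 * acregmin = 3s and acregmax = 60s, stays at 10s. A file unchanged for
 * ten minutes would be clamped down to acregmax, and one that changed
 * 100ms ago would be raised up to acregmin.
 */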
/*
 * Fill in attribute from the cache.
 * If valid, then return 0 to indicate that no error occurred,
 * otherwise return 1 to indicate that an error occurred.
 */
nfs_getattr_cache(vnode_t *vp, struct vattr *vap)

	uint_t mask = vap->va_mask;

	mutex_enter(&rp->r_statelock);
	if (ATTRCACHE_VALID(vp)) {
		/*
		 * Cached attributes are valid
		 */

		/*
		 * Set the caller's va_mask to the set of attributes
		 * that were requested ANDed with the attributes that
		 * are available. If attributes were requested that
		 * are not available, those bits must be turned off
		 * in the callers va_mask.
		 */
		vap->va_mask &= mask;
		mutex_exit(&rp->r_statelock);
	mutex_exit(&rp->r_statelock);
/*
 * Get attributes over-the-wire and update attributes cache
 * if no error occurred in the over-the-wire operation.
 * Return 0 if successful, otherwise error.
 */
nfs_getattr_otw(vnode_t *vp, struct vattr *vap, cred_t *cr)

	struct nfsattrstat ns;

	fi.fhp = NULL;		/* no need to update, filehandle not copied */
	fi.copyproc = nfscopyfh;
	fi.lookupproc = nfslookup;
	fi.xattrdirproc = acl_getxattrdir2;

	if (mi->mi_flags & MI_ACL) {
		error = acl_getattr2_otw(vp, vap, cr);
		if (mi->mi_flags & MI_ACL)

	error = rfs2call(mi, RFS_GETATTR,
	    xdr_fhandle, (caddr_t)VTOFH(vp),
	    xdr_attrstat, (caddr_t)&ns, cr,
	    &douprintf, &ns.ns_status, 0, &fi);

	error = geterrno(ns.ns_status);
	error = nfs_cache_fattr(vp, &ns.ns_attr, vap, t, cr);
	PURGE_STALE_FH(error, vp, cr);
/*
 * Return either cached or remote attributes. If we get remote attributes,
 * use them to check and invalidate caches, then cache the new attributes.
 */
nfsgetattr(vnode_t *vp, struct vattr *vap, cred_t *cr)

	/*
	 * If we've got cached attributes, we're done, otherwise go
	 * to the server to get attributes, which will update the cache
	 * in the process.
	 */
	error = nfs_getattr_cache(vp, vap);
	if (error)
		error = nfs_getattr_otw(vp, vap, cr);

	/* Return the client's view of file size */
	mutex_enter(&rp->r_statelock);
	vap->va_size = rp->r_size;
	mutex_exit(&rp->r_statelock);
/*
 * Get attributes over-the-wire and update attributes cache
 * if no error occurred in the over-the-wire operation.
 * Return 0 if successful, otherwise error.
 */
nfs3_getattr_otw(vnode_t *vp, struct vattr *vap, cred_t *cr)

	args.object = *VTOFH3(vp);
	fi.fhp = (caddr_t)&args.object;
	fi.copyproc = nfs3copyfh;
	fi.lookupproc = nfs3lookup;
	fi.xattrdirproc = acl_getxattrdir3;

	error = rfs3call(VTOMI(vp), NFSPROC3_GETATTR,
	    xdr_nfs_fh3, (caddr_t)&args,
	    xdr_GETATTR3vres, (caddr_t)&res, cr,
	    &douprintf, &res.status, 0, &fi);

	error = geterrno3(res.status);
	PURGE_STALE_FH(error, vp, cr);

	/*
	 * Catch status codes that indicate fattr3 to vattr translation failure
	 */
	return (res.fres.status);

	nfs_attr_cache(vp, vap, t, cr);
/*
 * Return either cached or remote attributes. If we get remote attributes,
 * use them to check and invalidate caches, then cache the new attributes.
 */
nfs3getattr(vnode_t *vp, struct vattr *vap, cred_t *cr)

	/*
	 * If we've got cached attributes, we're done, otherwise go
	 * to the server to get attributes, which will update the cache
	 * in the process.
	 */
	error = nfs_getattr_cache(vp, vap);
	if (error)
		error = nfs3_getattr_otw(vp, vap, cr);

	/* Return the client's view of file size */
	mutex_enter(&rp->r_statelock);
	vap->va_size = rp->r_size;
	mutex_exit(&rp->r_statelock);
vtype_t nf_to_vt[] = {
	VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK
};

/*
 * Convert NFS Version 2 over the network attributes to the local
 * virtual attributes. The mapping between the UID_NOBODY/GID_NOBODY
 * network representation and the local representation is done here.
 * Returns 0 for success, error if failed due to overflow.
 */
nattr_to_vattr(vnode_t *vp, struct nfsfattr *na, struct vattr *vap)

	/* overflow in time attributes? */
	if (!NFS2_FATTR_TIME_OK(na))

	vap->va_mask = AT_ALL;

	if (na->na_type < NFNON || na->na_type > NFSOC)
		vap->va_type = VBAD;
	else
		vap->va_type = nf_to_vt[na->na_type];
	vap->va_mode = na->na_mode;
	vap->va_uid = (na->na_uid == NFS_UID_NOBODY) ? UID_NOBODY : na->na_uid;
	vap->va_gid = (na->na_gid == NFS_GID_NOBODY) ? GID_NOBODY : na->na_gid;
	vap->va_fsid = vp->v_vfsp->vfs_dev;
	vap->va_nodeid = na->na_nodeid;
	vap->va_nlink = na->na_nlink;
	vap->va_size = na->na_size;	/* keep for cache validation */
	/*
	 * nfs protocol defines times as unsigned so don't extend sign,
	 * unless sysadmin set nfs_allow_preepoch_time.
	 */
	NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, na->na_atime.tv_sec);
	vap->va_atime.tv_nsec = (uint32_t)(na->na_atime.tv_usec * 1000);
	NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, na->na_mtime.tv_sec);
	vap->va_mtime.tv_nsec = (uint32_t)(na->na_mtime.tv_usec * 1000);
	NFS_TIME_T_CONVERT(vap->va_ctime.tv_sec, na->na_ctime.tv_sec);
	vap->va_ctime.tv_nsec = (uint32_t)(na->na_ctime.tv_usec * 1000);
	/*
	 * Shannon's law - uncompress the received dev_t
	 * if the top half is zero indicating a response
	 * from an `older style' OS. Except for when it is a
	 * `new style' OS sending the maj device of zero,
	 * in which case the algorithm still works because the
	 * fact that it is a new style server
	 * is hidden by the minor device not being greater
	 * than 255 (a requirement in this case).
	 */
	if ((na->na_rdev & 0xffff0000) == 0)
		vap->va_rdev = nfsv2_expdev(na->na_rdev);
	else
		vap->va_rdev = expldev(na->na_rdev);

	vap->va_nblocks = na->na_blocks;
	switch (na->na_type) {
		vap->va_blksize = DEV_BSIZE;
		vap->va_blksize = MAXBSIZE;
		vap->va_blksize = na->na_blocksize;
		/*
		 * This bit of ugliness is a hack to preserve the
		 * over-the-wire protocols for named-pipe vnodes.
		 * It remaps the special over-the-wire type to the
		 * VFIFO type. (see note in nfs.h)
		 */
		if (NA_ISFIFO(na)) {
			vap->va_type = VFIFO;
			vap->va_mode = (vap->va_mode & ~S_IFMT) | S_IFIFO;
		vap->va_blksize = na->na_blocksize;
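
/*
 * Editorial note (illustrative, not part of the original source): NFSv2
 * carries timestamps at microsecond resolution, which is why the
 * conversions above multiply na_*time.tv_usec by 1000 to obtain
 * nanoseconds; e.g. tv_usec = 250000 becomes tv_nsec = 250000000.
 * Similarly, an "old style" 16-bit rdev is expanded with nfsv2_expdev()
 * while a packed 32-bit rdev goes through expldev().
 */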
/*
 * Convert NFS Version 3 over the network attributes to the local
 * virtual attributes. The mapping between the UID_NOBODY/GID_NOBODY
 * network representation and the local representation is done here.
 */
vtype_t nf3_to_vt[] = {
	VBAD, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO
};
fattr3_to_vattr(vnode_t *vp, fattr3 *na, struct vattr *vap)

	/* overflow in time attributes? */
	if (!NFS3_FATTR_TIME_OK(na))

	if (!NFS3_SIZE_OK(na->size))

	vap->va_mask = AT_ALL;

	if (na->type < NF3REG || na->type > NF3FIFO)
		vap->va_type = VBAD;
	else
		vap->va_type = nf3_to_vt[na->type];
	vap->va_mode = na->mode;
	vap->va_uid = (na->uid == NFS_UID_NOBODY) ? UID_NOBODY : (uid_t)na->uid;
	vap->va_gid = (na->gid == NFS_GID_NOBODY) ? GID_NOBODY : (gid_t)na->gid;
	vap->va_fsid = vp->v_vfsp->vfs_dev;
	vap->va_nodeid = na->fileid;
	vap->va_nlink = na->nlink;
	vap->va_size = na->size;

	/*
	 * nfs protocol defines times as unsigned so don't extend sign,
	 * unless sysadmin set nfs_allow_preepoch_time.
	 */
	NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, na->atime.seconds);
	vap->va_atime.tv_nsec = (uint32_t)na->atime.nseconds;
	NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, na->mtime.seconds);
	vap->va_mtime.tv_nsec = (uint32_t)na->mtime.nseconds;
	NFS_TIME_T_CONVERT(vap->va_ctime.tv_sec, na->ctime.seconds);
	vap->va_ctime.tv_nsec = (uint32_t)na->ctime.nseconds;

	vap->va_rdev = makedevice(na->rdev.specdata1,
	    na->rdev.specdata2);
	vap->va_blksize = DEV_BSIZE;
	vap->va_nblocks = 0;

	vap->va_rdev = makedevice(na->rdev.specdata1,
	    na->rdev.specdata2);
	vap->va_blksize = MAXBSIZE;
	vap->va_nblocks = 0;

	vap->va_blksize = MAXBSIZE;
	vap->va_nblocks = (u_longlong_t)
	    ((na->used + (size3)DEV_BSIZE - (size3)1) /
	    (size3)DEV_BSIZE);

	vap->va_blksize = MAXBSIZE;
	vap->va_nblocks = 0;
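
/*
 * Editorial note (illustrative, not part of the original source): the
 * va_nblocks value computed above is a round-up division of the
 * server-reported byte count into DEV_BSIZE (512-byte) units; for
 * example, used = 1025 gives (1025 + 511) / 512 = 3 blocks.
 */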
/*
 * Asynchronous I/O parameters. nfs_async_threads is the high-water mark
 * for the demand-based allocation of async threads per-mount. The
 * nfs_async_timeout is the amount of time a thread will live after it
 * becomes idle, unless new I/O requests are received before the thread
 * dies. See nfs_async_putpage and nfs_async_start.
 */
int nfs_async_timeout = -1;	/* uninitialized */
static void	nfs_async_start(struct vfs *);
static void	nfs_async_pgops_start(struct vfs *);
static void	nfs_async_common_start(struct vfs *, int);
free_async_args(struct nfs_async_reqs *args)

	if (args->a_io != NFS_INACTIVE) {
		rp = VTOR(args->a_vp);
		mutex_enter(&rp->r_statelock);
		if (args->a_io == NFS_PUTAPAGE ||
		    args->a_io == NFS_PAGEIO)
		cv_broadcast(&rp->r_cv);
		mutex_exit(&rp->r_statelock);
		VN_RELE(args->a_vp);
	crfree(args->a_cred);
	kmem_free(args, sizeof (*args));
/*
 * Cross-zone thread creation and NFS access is disallowed, yet fsflush() and
 * pageout(), running in the global zone, have legitimate reasons to do
 * fop_putpage(B_ASYNC) on other zones' NFS mounts. We avoid the problem by
 * use of a per-mount "asynchronous requests manager thread" which is
 * signaled by the various asynchronous work routines when there is
 * asynchronous work to be done. It is responsible for creating new
 * worker threads if necessary, and notifying existing worker threads
 * that there is work to be done.
 *
 * In other words, it will "take the specifications from the customers and
 * give them to the engineers."
 *
 * Worker threads die off of their own accord if they are no longer
 * needed.
 *
 * This thread is killed when the zone is going away or the filesystem
 * is being unmounted.
 */
nfs_async_manager(vfs_t *vfsp)

	callb_cpr_t cprinfo;

	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
	    "nfs_async_manager");

	mutex_enter(&mi->mi_async_lock);
	/*
	 * We want to stash the max number of threads that this mount was
	 * allowed so we can use it later when the variable is set to zero as
	 * part of the zone/mount going away.
	 *
	 * We want to be able to create at least one thread to handle
	 * asynchronous inactive calls.
	 */
	max_threads = MAX(mi->mi_max_threads, 1);
	/*
	 * We don't want to wait for mi_max_threads to go to zero, since that
	 * happens as part of a failed unmount, but this thread should only
	 * exit when the mount/zone is really going away.
	 *
	 * Once MI_ASYNC_MGR_STOP is set, no more async operations will be
	 * attempted: the various _async_*() functions know to do things
	 * inline if mi_max_threads == 0. Henceforth we just drain out the
	 * outstanding requests.
	 *
	 * Note that we still create zthreads even if we notice the zone is
	 * shutting down (MI_ASYNC_MGR_STOP is set); this may cause the zone
	 * shutdown sequence to take slightly longer in some cases, but
	 * doesn't violate the protocol, as all threads will exit as soon as
	 * they're done processing the remaining requests.
	 */
	while (mi->mi_async_req_count > 0) {
		/*
		 * Paranoia: If the mount started out having
		 * (mi->mi_max_threads == 0), and the value was
		 * later changed (via a debugger or somesuch),
		 * we could be confused since we will think we
		 * can't create any threads, and the calling
		 * code (which looks at the current value of
		 * mi->mi_max_threads, now non-zero) thinks we
		 * can.
		 *
		 * So, because we're paranoid, we create threads
		 * up to the maximum of the original and the
		 * current value. This means that future
		 * (debugger-induced) lowerings of
		 * mi->mi_max_threads are ignored for our
		 * purposes, but who told them they could change
		 * random values on a live kernel anyhow?
		 */
		if (mi->mi_threads[NFS_ASYNC_QUEUE] <
		    MAX(mi->mi_max_threads, max_threads)) {
			mi->mi_threads[NFS_ASYNC_QUEUE]++;
			mutex_exit(&mi->mi_async_lock);
			VFS_HOLD(vfsp);		/* hold for new thread */
			(void) zthread_create(NULL, 0, nfs_async_start,
			    vfsp, 0, minclsyspri);
			mutex_enter(&mi->mi_async_lock);
		} else if (mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] <
		    NUM_ASYNC_PGOPS_THREADS) {
			mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE]++;
			mutex_exit(&mi->mi_async_lock);
			VFS_HOLD(vfsp);		/* hold for new thread */
			(void) zthread_create(NULL, 0,
			    nfs_async_pgops_start, vfsp, 0,
			    minclsyspri);
			mutex_enter(&mi->mi_async_lock);
		}
		NFS_WAKE_ASYNC_WORKER(mi->mi_async_work_cv);
		ASSERT(mi->mi_async_req_count != 0);
		mi->mi_async_req_count--;
	}

	mutex_enter(&mi->mi_lock);
	if (mi->mi_flags & MI_ASYNC_MGR_STOP) {
		mutex_exit(&mi->mi_lock);
	mutex_exit(&mi->mi_lock);

	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock);
	CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);

	/*
	 * Let everyone know we're done.
	 */
	mi->mi_manager_thread = NULL;
	cv_broadcast(&mi->mi_async_cv);

	/*
	 * There is no explicit call to mutex_exit(&mi->mi_async_lock)
	 * since CALLB_CPR_EXIT is actually responsible for releasing
	 * the lock.
	 */
	CALLB_CPR_EXIT(&cprinfo);
	VFS_RELE(vfsp);		/* release thread's hold */
/*
 * Signal (and wait for) the async manager thread to clean up and go away.
 */
nfs_async_manager_stop(vfs_t *vfsp)

	mntinfo_t *mi = VFTOMI(vfsp);

	mutex_enter(&mi->mi_async_lock);
	mutex_enter(&mi->mi_lock);
	mi->mi_flags |= MI_ASYNC_MGR_STOP;
	mutex_exit(&mi->mi_lock);
	cv_broadcast(&mi->mi_async_reqs_cv);
	while (mi->mi_manager_thread != NULL)
		cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
	mutex_exit(&mi->mi_async_lock);
nfs_async_readahead(vnode_t *vp, uoff_t blkoff, caddr_t addr,
	struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *,
	uoff_t, caddr_t, struct seg *, cred_t *))

	struct nfs_async_reqs *args;

	ASSERT(rp->r_freef == NULL);

	/*
	 * If addr falls in a different segment, don't bother doing readahead.
	 */
	if (addr >= seg->s_base + seg->s_size)

	/*
	 * If we can't allocate a request structure, punt on the readahead.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)

	/*
	 * If a lock operation is pending, don't initiate any new
	 * readaheads. Otherwise, bump r_count to indicate the new
	 * readahead.
	 */
	if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) {
		kmem_free(args, sizeof (*args));
	mutex_enter(&rp->r_statelock);
	mutex_exit(&rp->r_statelock);
	nfs_rw_exit(&rp->r_lkserlock);

	args->a_next = NULL;
	args->a_queuer = curthread;
	args->a_io = NFS_READ_AHEAD;
	args->a_nfs_readahead = readahead;
	args->a_nfs_blkoff = blkoff;
	args->a_nfs_seg = seg;
	args->a_nfs_addr = addr;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, don't bother readahead.
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS_READ_AHEAD] == NULL) {
		mi->mi_async_reqs[NFS_READ_AHEAD] = args;
		mi->mi_async_tail[NFS_READ_AHEAD] = args;
	} else {
		mi->mi_async_tail[NFS_READ_AHEAD]->a_next = args;
		mi->mi_async_tail[NFS_READ_AHEAD] = args;
	}

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);

	mutex_enter(&rp->r_statelock);
	cv_broadcast(&rp->r_cv);
	mutex_exit(&rp->r_statelock);

	kmem_free(args, sizeof (*args));
nfs_async_putapage(vnode_t *vp, page_t *pp, uoff_t off, size_t len,
	int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *,
	uoff_t, size_t, int, cred_t *))

	struct nfs_async_reqs *args;

	ASSERT(flags & B_ASYNC);
	ASSERT(vp->v_vfsp != NULL);
	ASSERT(rp->r_count > 0);

	/*
	 * If we can't allocate a request structure, do the putpage
	 * operation synchronously in this thread's context.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)

	args->a_next = NULL;
	args->a_queuer = curthread;
	args->a_io = NFS_PUTAPAGE;
	args->a_nfs_putapage = putapage;
	args->a_nfs_pp = pp;
	args->a_nfs_off = off;
	args->a_nfs_len = (uint_t)len;
	args->a_nfs_flags = flags;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, then make a synchronous request.
	 * This check is done a second time in case async io was disabled
	 * while this thread was blocked waiting for memory pressure to
	 * reduce or for the queue to drain.
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS_PUTAPAGE] == NULL) {
		mi->mi_async_reqs[NFS_PUTAPAGE] = args;
		mi->mi_async_tail[NFS_PUTAPAGE] = args;
	} else {
		mi->mi_async_tail[NFS_PUTAPAGE]->a_next = args;
		mi->mi_async_tail[NFS_PUTAPAGE] = args;
	}

	mutex_enter(&rp->r_statelock);
	mutex_exit(&rp->r_statelock);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);

	kmem_free(args, sizeof (*args));

	if (curproc == proc_pageout || curproc == proc_fsflush) {
		/*
		 * If we get here in the context of the pageout/fsflush,
		 * we refuse to do a sync write, because this may hang
		 * pageout (and the machine). In this case, we just
		 * re-mark the page as dirty and punt on the page.
		 *
		 * Make sure B_FORCE isn't set. We can re-mark the
		 * pages as dirty and unlock the pages in one swoop by
		 * passing in B_ERROR to pvn_write_done(). However,
		 * we should make sure B_FORCE isn't set - we don't
		 * want the page tossed before it gets written out.
		 */
		if (flags & B_FORCE)
			flags &= ~(B_INVAL | B_FORCE);
		pvn_write_done(pp, flags | B_ERROR);

	if (nfs_zone() != mi->mi_zone) {
		/*
		 * So this was a cross-zone sync putpage. We pass in B_ERROR
		 * to pvn_write_done() to re-mark the pages as dirty and unlock
		 * them.
		 *
		 * We don't want to clear B_FORCE here as the caller presumably
		 * knows what they're doing if they set it.
		 */
		pvn_write_done(pp, flags | B_ERROR);

	return ((*putapage)(vp, pp, off, len, flags, cr));
nfs_async_pageio(vnode_t *vp, page_t *pp, uoff_t io_off, size_t io_len,
	int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, uoff_t,
	size_t, int, cred_t *))

	struct nfs_async_reqs *args;

	ASSERT(flags & B_ASYNC);
	ASSERT(vp->v_vfsp != NULL);
	ASSERT(rp->r_count > 0);

	/*
	 * If we can't allocate a request structure, do the pageio
	 * request synchronously in this thread's context.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)

	args->a_next = NULL;
	args->a_queuer = curthread;
	args->a_io = NFS_PAGEIO;
	args->a_nfs_pageio = pageio;
	args->a_nfs_pp = pp;
	args->a_nfs_off = io_off;
	args->a_nfs_len = (uint_t)io_len;
	args->a_nfs_flags = flags;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, then make a synchronous request.
	 * This check is done a second time in case async io was disabled
	 * while this thread was blocked waiting for memory pressure to
	 * reduce or for the queue to drain.
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS_PAGEIO] == NULL) {
		mi->mi_async_reqs[NFS_PAGEIO] = args;
		mi->mi_async_tail[NFS_PAGEIO] = args;
	} else {
		mi->mi_async_tail[NFS_PAGEIO]->a_next = args;
		mi->mi_async_tail[NFS_PAGEIO] = args;
	}

	mutex_enter(&rp->r_statelock);
	mutex_exit(&rp->r_statelock);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);

	kmem_free(args, sizeof (*args));

	/*
	 * If we can't do it ASYNC, for reads we do nothing (but cleanup
	 * the page list), for writes we do it synchronously, except for
	 * proc_pageout/proc_fsflush as described below.
	 */
	if (flags & B_READ) {
		pvn_read_done(pp, flags | B_ERROR);

	if (curproc == proc_pageout || curproc == proc_fsflush) {
		/*
		 * If we get here in the context of the pageout/fsflush,
		 * we refuse to do a sync write, because this may hang
		 * pageout/fsflush (and the machine). In this case, we just
		 * re-mark the page as dirty and punt on the page.
		 *
		 * Make sure B_FORCE isn't set. We can re-mark the
		 * pages as dirty and unlock the pages in one swoop by
		 * passing in B_ERROR to pvn_write_done(). However,
		 * we should make sure B_FORCE isn't set - we don't
		 * want the page tossed before it gets written out.
		 */
		if (flags & B_FORCE)
			flags &= ~(B_INVAL | B_FORCE);
		pvn_write_done(pp, flags | B_ERROR);

	if (nfs_zone() != mi->mi_zone) {
		/*
		 * So this was a cross-zone sync pageio. We pass in B_ERROR
		 * to pvn_write_done() to re-mark the pages as dirty and unlock
		 * them.
		 *
		 * We don't want to clear B_FORCE here as the caller presumably
		 * knows what they're doing if they set it.
		 */
		pvn_write_done(pp, flags | B_ERROR);

	return ((*pageio)(vp, pp, io_off, io_len, flags, cr));
nfs_async_readdir(vnode_t *vp, rddir_cache *rdc, cred_t *cr,
	int (*readdir)(vnode_t *, rddir_cache *, cred_t *))

	struct nfs_async_reqs *args;

	ASSERT(rp->r_freef == NULL);

	/*
	 * If we can't allocate a request structure, do the readdir
	 * operation synchronously in this thread's context.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)

	args->a_next = NULL;
	args->a_queuer = curthread;
	args->a_io = NFS_READDIR;
	args->a_nfs_readdir = readdir;
	args->a_nfs_rdc = rdc;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, then make a synchronous request.
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS_READDIR] == NULL) {
		mi->mi_async_reqs[NFS_READDIR] = args;
		mi->mi_async_tail[NFS_READDIR] = args;
	} else {
		mi->mi_async_tail[NFS_READDIR]->a_next = args;
		mi->mi_async_tail[NFS_READDIR] = args;
	}

	mutex_enter(&rp->r_statelock);
	mutex_exit(&rp->r_statelock);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);

	kmem_free(args, sizeof (*args));

	rdc->entries = NULL;
	mutex_enter(&rp->r_statelock);
	ASSERT(rdc->flags & RDDIR);
	rdc->flags &= ~RDDIR;
	rdc->flags |= RDDIRREQ;
	/*
	 * Check the flag to see if RDDIRWAIT is set. If RDDIRWAIT
	 * is set, wakeup the thread sleeping in cv_wait_sig().
	 * The woken up thread will reset the flag to RDDIR and will
	 * continue with the readdir operation.
	 */
	if (rdc->flags & RDDIRWAIT) {
		rdc->flags &= ~RDDIRWAIT;
		cv_broadcast(&rdc->cv);
	mutex_exit(&rp->r_statelock);
	rddir_cache_rele(rdc);
nfs_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
	cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3,
	cred_t *))

	struct nfs_async_reqs *args;

	/*
	 * If we can't allocate a request structure, do the commit
	 * operation synchronously in this thread's context.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)

	args->a_next = NULL;
	args->a_queuer = curthread;
	args->a_io = NFS_COMMIT;
	args->a_nfs_commit = commit;
	args->a_nfs_plist = plist;
	args->a_nfs_offset = offset;
	args->a_nfs_count = count;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, then make a synchronous request.
	 * This check is done a second time in case async io was disabled
	 * while this thread was blocked waiting for memory pressure to
	 * reduce or for the queue to drain.
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS_COMMIT] == NULL) {
		mi->mi_async_reqs[NFS_COMMIT] = args;
		mi->mi_async_tail[NFS_COMMIT] = args;
	} else {
		mi->mi_async_tail[NFS_COMMIT]->a_next = args;
		mi->mi_async_tail[NFS_COMMIT] = args;
	}

	mutex_enter(&rp->r_statelock);
	mutex_exit(&rp->r_statelock);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);

	kmem_free(args, sizeof (*args));

	if (curproc == proc_pageout || curproc == proc_fsflush ||
	    nfs_zone() != mi->mi_zone) {
		while (plist != NULL) {
			page_sub(&plist, pp);
			pp->p_fsdata = C_COMMIT;

	(*commit)(vp, plist, offset, count, cr);
nfs_async_inactive(vnode_t *vp, cred_t *cr,
	void (*inactive)(vnode_t *, cred_t *, caller_context_t *))

	struct nfs_async_reqs *args;

	args = kmem_alloc(sizeof (*args), KM_SLEEP);
	args->a_next = NULL;
	args->a_queuer = curthread;
	args->a_io = NFS_INACTIVE;
	args->a_nfs_inactive = inactive;

	/*
	 * Note that we don't check mi->mi_max_threads here, since we
	 * *need* to get rid of this vnode regardless of whether someone
	 * set nfs3_max_threads/nfs_max_threads to zero in /etc/system.
	 *
	 * The manager thread knows about this and is willing to create
	 * at least one thread to accommodate us.
	 */
	mutex_enter(&mi->mi_async_lock);
	if (mi->mi_manager_thread == NULL) {
		rnode_t *rp = VTOR(vp);

		mutex_exit(&mi->mi_async_lock);
		crfree(cr);	/* drop our reference */
		kmem_free(args, sizeof (*args));
		/*
		 * We can't do an over-the-wire call since we're in the wrong
		 * zone, so we need to clean up state as best we can and then
		 * throw away the vnode.
		 */
		mutex_enter(&rp->r_statelock);
		if (rp->r_unldvp != NULL) {
			unldvp = rp->r_unldvp;
			rp->r_unldvp = NULL;
			unlname = rp->r_unlname;
			rp->r_unlname = NULL;
			unlcred = rp->r_unlcred;
			rp->r_unlcred = NULL;
			mutex_exit(&rp->r_statelock);
			kmem_free(unlname, MAXNAMELEN);
		mutex_exit(&rp->r_statelock);
		/*
		 * No need to explicitly throw away any cached pages. The
		 * eventual rinactive() will attempt a synchronous
		 * fop_putpage() which will immediately fail since the request
		 * is coming from the wrong zone, and then will proceed to call
		 * nfs_invalidate_pages() which will clean things up for us.
		 */
		rp_addfree(VTOR(vp), cr);

	if (mi->mi_async_reqs[NFS_INACTIVE] == NULL) {
		mi->mi_async_reqs[NFS_INACTIVE] = args;
	} else {
		mi->mi_async_tail[NFS_INACTIVE]->a_next = args;
	}
	mi->mi_async_tail[NFS_INACTIVE] = args;
	/*
	 * Don't increment r_count, since we're trying to get rid of the vnode.
	 */
	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
nfs_async_start(struct vfs *vfsp)

	nfs_async_common_start(vfsp, NFS_ASYNC_QUEUE);

nfs_async_pgops_start(struct vfs *vfsp)

	nfs_async_common_start(vfsp, NFS_ASYNC_PGOPS_QUEUE);
/*
 * The async queues for each mounted file system are arranged as a
 * set of queues, one for each async i/o type. Requests are taken
 * from the queues in a round-robin fashion. A number of consecutive
 * requests are taken from each queue before moving on to the next
 * queue. This functionality may allow the NFS Version 2 server to do
 * write clustering, even if the client is mixing writes and reads
 * because it will take multiple write requests from the queue
 * before processing any of the other async i/o types.
 *
 * XXX The nfs_async_common_start thread is unsafe in the light of the present
 * model defined by cpr to suspend the system. Specifically over the
 * wire calls are cpr-unsafe. The thread should be reevaluated in
 * case of future updates to the cpr model.
 */
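
/*
 * Editorial sketch (added; it only restates the selection logic of the
 * function below and is not part of the original source):
 *
 *	for (i = 0; i < async_types; i++) {
 *		args = *mi->mi_async_curr[async_queue];
 *		if a request was found, stop scanning; otherwise advance
 *		mi->mi_async_curr[async_queue], wrapping back to
 *		&mi->mi_async_reqs[0] once it walks past the last queue;
 *	}
 *
 * mi_async_clusters[] bounds how many consecutive requests are taken from
 * one queue before the current pointer is advanced anyway.
 */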
nfs_async_common_start(struct vfs *vfsp, int async_queue)

	struct nfs_async_reqs *args;
	mntinfo_t *mi = VFTOMI(vfsp);
	clock_t time_left = 1;
	callb_cpr_t cprinfo;
	kcondvar_t *async_work_cv;

	if (async_queue == NFS_ASYNC_QUEUE) {
		async_types = NFS_ASYNC_TYPES;
		async_work_cv = &mi->mi_async_work_cv[NFS_ASYNC_QUEUE];
	} else {
		async_types = NFS_ASYNC_PGOPS_TYPES;
		async_work_cv = &mi->mi_async_work_cv[NFS_ASYNC_PGOPS_QUEUE];
	}

	/*
	 * Dynamic initialization of nfs_async_timeout to allow nfs to be
	 * built in an implementation independent manner.
	 */
	if (nfs_async_timeout == -1)
		nfs_async_timeout = NFS_ASYNC_TIMEOUT;

	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas");

	mutex_enter(&mi->mi_async_lock);
	/*
	 * Find the next queue containing an entry. We start
	 * at the current queue pointer and then round robin
	 * through all of them until we either find a non-empty
	 * queue or have looked through all of them.
	 */
	for (i = 0; i < async_types; i++) {
		args = *mi->mi_async_curr[async_queue];
		mi->mi_async_curr[async_queue]++;
		if (mi->mi_async_curr[async_queue] ==
		    &mi->mi_async_reqs[async_types]) {
			mi->mi_async_curr[async_queue] =
			    &mi->mi_async_reqs[0];

	/*
	 * If we didn't find a entry, then block until woken up
	 * again and then look through the queues again.
	 */

	/*
	 * Exiting is considered to be safe for CPR as well
	 */
	CALLB_CPR_SAFE_BEGIN(&cprinfo);

	/*
	 * Wakeup thread waiting to unmount the file
	 * system only if all async threads are inactive.
	 *
	 * If we've timed-out and there's nothing to do,
	 * then get rid of this thread.
	 */
	if (mi->mi_max_threads == 0 || time_left <= 0) {
		--mi->mi_threads[async_queue];
		if (mi->mi_threads[NFS_ASYNC_QUEUE] == 0 &&
		    mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] == 0)
			cv_signal(&mi->mi_async_cv);
		CALLB_CPR_EXIT(&cprinfo);
		VFS_RELE(vfsp);		/* release thread's hold */

	time_left = cv_reltimedwait(async_work_cv,
	    &mi->mi_async_lock, nfs_async_timeout,
	    TR_CLOCK_TICK);

	CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);

	/*
	 * Remove the request from the async queue and then
	 * update the current async request queue pointer. If
	 * the current queue is empty or we have removed enough
	 * consecutive entries from it, then reset the counter
	 * for this queue and then move the current pointer to
	 * the next queue.
	 */
	*mi->mi_async_curr[async_queue] = args->a_next;
	if (*mi->mi_async_curr[async_queue] == NULL ||
	    --mi->mi_async_clusters[args->a_io] == 0) {
		mi->mi_async_clusters[args->a_io] =
		    mi->mi_async_init_clusters;
		mi->mi_async_curr[async_queue]++;
		if (mi->mi_async_curr[async_queue] ==
		    &mi->mi_async_reqs[async_types]) {
			mi->mi_async_curr[async_queue] =
			    &mi->mi_async_reqs[0];

	if (args->a_io != NFS_INACTIVE && mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mutex_exit(&mi->mi_async_lock);

	/*
	 * Obtain arguments from the async request structure.
	 */
	if (args->a_io == NFS_READ_AHEAD && mi->mi_max_threads > 0) {
		(*args->a_nfs_readahead)(args->a_vp, args->a_nfs_blkoff,
		    args->a_nfs_addr, args->a_nfs_seg,
		    args->a_cred);
	} else if (args->a_io == NFS_PUTAPAGE) {
		(void) (*args->a_nfs_putapage)(args->a_vp,
		    args->a_nfs_pp, args->a_nfs_off,
		    args->a_nfs_len, args->a_nfs_flags,
		    args->a_cred);
	} else if (args->a_io == NFS_PAGEIO) {
		(void) (*args->a_nfs_pageio)(args->a_vp,
		    args->a_nfs_pp, args->a_nfs_off,
		    args->a_nfs_len, args->a_nfs_flags,
		    args->a_cred);
	} else if (args->a_io == NFS_READDIR) {
		(void) ((*args->a_nfs_readdir)(args->a_vp,
		    args->a_nfs_rdc, args->a_cred));
	} else if (args->a_io == NFS_COMMIT) {
		(*args->a_nfs_commit)(args->a_vp, args->a_nfs_plist,
		    args->a_nfs_offset, args->a_nfs_count,
		    args->a_cred);
	} else if (args->a_io == NFS_INACTIVE) {
		(*args->a_nfs_inactive)(args->a_vp, args->a_cred, NULL);
	}

	/*
	 * Now, release the vnode and free the credentials
	 * structure.
	 */
	free_async_args(args);
	/*
	 * Reacquire the mutex because it will be needed above.
	 */
	mutex_enter(&mi->mi_async_lock);
nfs_async_stop(struct vfs *vfsp)

	mntinfo_t *mi = VFTOMI(vfsp);

	/*
	 * Wait for all outstanding async operations to complete and for the
	 * worker threads to exit.
	 */
	mutex_enter(&mi->mi_async_lock);
	mi->mi_max_threads = 0;
	NFS_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
	while (mi->mi_threads[NFS_ASYNC_QUEUE] != 0 ||
	    mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] != 0)
		cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
	mutex_exit(&mi->mi_async_lock);
/*
 * nfs_async_stop_sig:
 * Wait for all outstanding putpage operations to complete. If a signal
 * is delivered we will abort and return non-zero. If we can put all the
 * pages we will return 0. This routine is called from nfs_unmount and
 * nfs3_unmount to make these operations interruptible.
 */
nfs_async_stop_sig(struct vfs *vfsp)

	mntinfo_t *mi = VFTOMI(vfsp);

	/*
	 * Wait for all outstanding async operations to complete and for the
	 * worker threads to exit.
	 */
	mutex_enter(&mi->mi_async_lock);
	omax = mi->mi_max_threads;
	mi->mi_max_threads = 0;
	/*
	 * Tell all the worker threads to exit.
	 */
	NFS_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
	while (mi->mi_threads[NFS_ASYNC_QUEUE] != 0 ||
	    mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] != 0) {
		if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock))
	rval = (mi->mi_threads[NFS_ASYNC_QUEUE] != 0 ||
	    mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] != 0);	/* Interrupted */
	mi->mi_max_threads = omax;
	mutex_exit(&mi->mi_async_lock);
2230 writerp(rnode_t
*rp
, caddr_t base
, int tcount
, struct uio
*uio
, int pgcreated
)
2239 vnode_t
*vp
= RTOV(rp
);
2241 ASSERT(tcount
<= MAXBSIZE
&& tcount
<= uio
->uio_resid
);
2242 ASSERT(nfs_rw_lock_held(&rp
->r_rwlock
, RW_WRITER
));
2244 ASSERT(((uintptr_t)base
& MAXBOFFSET
) + tcount
<= MAXBSIZE
);
2248 * Move bytes in at most PAGESIZE chunks. We must avoid
2249 * spanning pages in uiomove() because page faults may cause
2250 * the cache to be invalidated out from under us. The r_size is not
2251 * updated until after the uiomove. If we push the last page of a
2252 * file before r_size is correct, we will lose the data written past
2253 * the current (and invalid) r_size.
2256 offset
= uio
->uio_loffset
;
2260 * n is the number of bytes required to satisfy the request
2261 * or the number of bytes to fill out the page.
2263 n
= (int)MIN((PAGESIZE
- (offset
& PAGEOFFSET
)), tcount
);
		/*
		 * Check to see if we can skip reading in the page
		 * and just allocate the memory.  We can do this
		 * if we are going to rewrite the entire mapping
		 * or if we are going to write to or beyond the current
		 * end of file from the beginning of the mapping.
		 *
		 * The read of r_size is now protected by r_statelock.
		 */
		mutex_enter(&rp->r_statelock);
		/*
		 * When pgcreated is nonzero the caller has already done
		 * a segmap_getmapflt with forcefault 0 and S_WRITE. With
		 * segkpm this means we already have at least one page
		 * created and mapped at base.
		 */
		pagecreate = pgcreated ||
		    ((offset & PAGEOFFSET) == 0 &&
		    (n == PAGESIZE || ((offset + n) >= rp->r_size)));

		mutex_exit(&rp->r_statelock);
		if (!vpm_enable && pagecreate) {
			/*
			 * The last argument tells segmap_pagecreate() to
			 * always lock the page, as opposed to sometimes
			 * returning with the page locked. This way we avoid a
			 * fault on the ensuing uiomove(), but also
			 * more importantly (to fix bug 1094402) we can
			 * call segmap_fault() to unlock the page in all
			 * cases. An alternative would be to modify
			 * segmap_pagecreate() to tell us when it is
			 * locking a page, but that's a fairly major
			 * interface change.
			 */
			if (pgcreated == 0)
				(void) segmap_pagecreate(segkmap, base,
				    (uint_t)n, 1);
			saved_base = base;
			saved_n = n;
		}

		/*
		 * The number of bytes of data in the last page cannot be
		 * accurately determined while the page is being uiomove'd
		 * to and the size of the file being updated.
		 * Thus, inform threads which need to know accurately
		 * how much data is in the last page of the file.  They
		 * will not do the i/o immediately, but will arrange for
		 * the i/o to happen later when this modify operation
		 * will have finished.
		 */
		ASSERT(!(rp->r_flags & RMODINPROGRESS));
		mutex_enter(&rp->r_statelock);
		rp->r_flags |= RMODINPROGRESS;
		rp->r_modaddr = (offset & MAXBMASK);
		mutex_exit(&rp->r_statelock);

		if (vpm_enable) {
			/*
			 * Copy data.  If new pages are created, part of
			 * the page that is not written will be initialized
			 * with zeros.
			 */
			error = vpm_data_copy(vp, offset, n, uio,
			    !pagecreate, NULL, 0, S_WRITE);
		} else {
			error = uiomove(base, n, UIO_WRITE, uio);
		}

		/*
		 * r_size is the maximum number of
		 * bytes known to be in the file.
		 * Make sure it is at least as high as the
		 * first unwritten byte pointed to by uio_loffset.
		 */
		mutex_enter(&rp->r_statelock);
		if (rp->r_size < uio->uio_loffset)
			rp->r_size = uio->uio_loffset;
		rp->r_flags &= ~RMODINPROGRESS;
		rp->r_flags |= RDIRTY;
		mutex_exit(&rp->r_statelock);

		/* n = # of bytes written */
		n = (int)(uio->uio_loffset - offset);

		if (!vpm_enable) {
			base += n;
		}
		tcount -= n;
		/*
		 * If we created pages w/o initializing them completely,
		 * we need to zero the part that wasn't set up.
		 * This happens on most EOF write cases and if
		 * we had some sort of error during the uiomove.
		 */
		if (!vpm_enable && pagecreate) {
			if ((uio->uio_loffset & PAGEOFFSET) || n == 0)
				(void) kzero(base, PAGESIZE - n);

			if (pgcreated) {
				/*
				 * Caller is responsible for this page,
				 * it was not created in this loop.
				 */
				pgcreated = 0;
			} else {
				/*
				 * For bug 1094402: segmap_pagecreate locks
				 * page.  Unlock it.  This also unlocks the
				 * pages allocated by page_create_va() in
				 * segmap_pagecreate().
				 */
				sm_error = segmap_fault(kas.a_hat, segkmap,
				    saved_base, saved_n,
				    F_SOFTUNLOCK, S_WRITE);
				if (error == 0)
					error = sm_error;
			}
		}
	} while (tcount > 0 && error == 0);

	return (error);
}
int
nfs_putpages(vnode_t *vp, uoff_t off, size_t len, int flags, cred_t *cr)
{
	page_t *pp;
	rnode_t *rp;
	uoff_t eoff;
	uoff_t io_off;
	size_t io_len;
	int error;
	int rdirty;
	int err;

	rp = VTOR(vp);
	ASSERT(rp->r_count > 0);

	if (!vn_has_cached_data(vp))
		return (0);

	ASSERT(vp->v_type != VCHR);

	/*
	 * If ROUTOFSPACE is set, then all writes turn into B_INVAL
	 * writes.  B_FORCE is set to force the VM system to actually
	 * invalidate the pages, even if the i/o failed.  The pages
	 * need to get invalidated because they can't be written out
	 * because there isn't any space left on either the server's
	 * file system or in the user's disk quota.  The B_FREE bit
	 * is cleared to avoid confusion as to whether this is a
	 * request to place the page on the freelist or to destroy
	 * it.
	 */
	if ((rp->r_flags & ROUTOFSPACE) ||
	    (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
		flags = (flags & ~B_FREE) | B_INVAL | B_FORCE;

	if (len == 0) {
		/*
		 * If doing a full file synchronous operation, then clear
		 * the RDIRTY bit.  If a page gets dirtied while the flush
		 * is happening, then RDIRTY will get set again.  The
		 * RDIRTY bit must get cleared before the flush so that
		 * we don't lose this information.
		 *
		 * If there are no full file async write operations
		 * pending and RDIRTY bit is set, clear it.
		 */
		if (off == 0 &&
		    !(flags & B_ASYNC) &&
		    (rp->r_flags & RDIRTY)) {
			mutex_enter(&rp->r_statelock);
			rdirty = (rp->r_flags & RDIRTY);
			rp->r_flags &= ~RDIRTY;
			mutex_exit(&rp->r_statelock);
		} else if (flags & B_ASYNC && off == 0) {
			mutex_enter(&rp->r_statelock);
			if (rp->r_flags & RDIRTY && rp->r_awcount == 0) {
				rdirty = (rp->r_flags & RDIRTY);
				rp->r_flags &= ~RDIRTY;
			}
			mutex_exit(&rp->r_statelock);
		} else
			rdirty = 0;

		/*
		 * Search the entire vp list for pages >= off, and flush
		 * the dirty pages.
		 */
		error = pvn_vplist_dirty(vp, off, rp->r_putapage,
		    flags, cr);

		/*
		 * If an error occurred and the file was marked as dirty
		 * before and we aren't forcibly invalidating pages, then
		 * reset the RDIRTY flag.
		 */
		if (error && rdirty &&
		    (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) {
			mutex_enter(&rp->r_statelock);
			rp->r_flags |= RDIRTY;
			mutex_exit(&rp->r_statelock);
		}
	} else {
		/*
		 * Do a range from [off...off + len) looking for pages
		 * to deal with.
		 */
		error = 0;
		io_len = 0;
		eoff = off + len;
		mutex_enter(&rp->r_statelock);
		for (io_off = off; io_off < eoff && io_off < rp->r_size;
		    io_off += io_len) {
			mutex_exit(&rp->r_statelock);
			/*
			 * If we are not invalidating, synchronously
			 * freeing or writing pages use the routine
			 * page_lookup_nowait() to prevent reclaiming
			 * them from the free list.
			 */
			if ((flags & B_INVAL) || !(flags & B_ASYNC)) {
				pp = page_lookup(&vp->v_object, io_off,
				    (flags & (B_INVAL | B_FREE)) ?
				    SE_EXCL : SE_SHARED);
			} else {
				pp = page_lookup_nowait(&vp->v_object, io_off,
				    (flags & B_FREE) ? SE_EXCL : SE_SHARED);
			}

			if (pp == NULL || !pvn_getdirty(pp, flags))
				io_len = PAGESIZE;
			else {
				err = (*rp->r_putapage)(vp, pp, &io_off,
				    &io_len, flags, cr);
				if (!error)
					error = err;
				/*
				 * "io_off" and "io_len" are returned as
				 * the range of pages we actually wrote.
				 * This allows us to skip ahead more quickly
				 * since several pages may've been dealt
				 * with by this iteration of the loop.
				 */
			}
			mutex_enter(&rp->r_statelock);
		}
		mutex_exit(&rp->r_statelock);
	}

	return (error);
}
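/*
 * To summarize the two paths above: a len of zero means "the whole file",
 * so pvn_vplist_dirty() visits every cached page of the vnode, while a
 * non-zero len walks only [off, off + len) page by page, using the io_off
 * and io_len returned by rp->r_putapage() to skip ahead past each i/o.
 */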
void
nfs_invalidate_pages(vnode_t *vp, uoff_t off, cred_t *cr)
{
	rnode_t *rp;

	rp = VTOR(vp);
	mutex_enter(&rp->r_statelock);
	while (rp->r_flags & RTRUNCATE)
		cv_wait(&rp->r_cv, &rp->r_statelock);
	rp->r_flags |= RTRUNCATE;
	if (off == 0) {
		rp->r_flags &= ~RDIRTY;
		if (!(rp->r_flags & RSTALE))
			rp->r_error = 0;
	}
	rp->r_truncaddr = off;
	mutex_exit(&rp->r_statelock);
	(void) pvn_vplist_dirty(vp, off, rp->r_putapage,
	    B_INVAL | B_TRUNC, cr);
	mutex_enter(&rp->r_statelock);
	rp->r_flags &= ~RTRUNCATE;
	cv_broadcast(&rp->r_cv);
	mutex_exit(&rp->r_statelock);
}
static int nfs_write_error_to_cons_only = 0;
#define	MSG(x)	(nfs_write_error_to_cons_only ? (x) : (x) + 1)
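/*
 * Every format string passed to MSG() below begins with the cmn_err() "^"
 * prefix, which directs the message to the console only.  With
 * nfs_write_error_to_cons_only left at 0, MSG() skips past that first
 * character so the message goes to both the console and the system log.
 */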
/*
 * Print a file handle
 */
void
nfs_printfhandle(nfs_fhandle *fhp)
{
	int *ip;
	char *buf;
	size_t bufsize;
	char *cp;

	/*
	 * 13 == "(file handle:"
	 * maximum of NFS_FHANDLE_LEN / sizeof (*ip) elements in fh_buf times
	 *	1 == ' '
	 *	8 == maximum strlen of "%x"
	 * 3 == ")\n\0"
	 */
	bufsize = 13 + ((NFS_FHANDLE_LEN / sizeof (*ip)) * (1 + 8)) + 3;
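	/*
	 * With the usual 64-byte NFS_FHANDLE_LEN and 4-byte ints this works
	 * out to 13 + 16 * 9 + 3 = 160 bytes.
	 */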
	buf = kmem_alloc(bufsize, KM_NOSLEEP);
	if (buf == NULL)
		return;

	cp = buf;
	(void) strcpy(cp, "(file handle:");
	cp += strlen(cp);
	for (ip = (int *)fhp->fh_buf;
	    ip < (int *)&fhp->fh_buf[fhp->fh_len];
	    ip++) {
		(void) sprintf(cp, " %x", *ip);
		cp += strlen(cp);
	}
	(void) strcpy(cp, ")\n");

	zcmn_err(getzoneid(), CE_CONT, MSG("^%s"), buf);

	kmem_free(buf, bufsize);
}
/*
 * Notify the system administrator that an NFS write error has
 * occurred.
 */

/* seconds between ENOSPC/EDQUOT messages */
clock_t nfs_write_error_interval = 5;
void
nfs_write_error(vnode_t *vp, int error, cred_t *cr)
{
	mntinfo_t *mi;
	clock_t now;

	mi = VTOMI(vp);
	/*
	 * In case of forced unmount or zone shutdown, do not print any
	 * messages since it can flood the console with error messages.
	 */
	if (FS_OR_ZONE_GONE(mi->mi_vfsp))
		return;

	/*
	 * No use in flooding the console with ENOSPC
	 * messages from the same file system.
	 */
	now = ddi_get_lbolt();
	if ((error != ENOSPC && error != EDQUOT) ||
	    now - mi->mi_printftime > 0) {
		zoneid_t zoneid = mi->mi_zone->zone_id;

#ifdef DEBUG
		nfs_perror(error, "NFS%ld write error on host %s: %m.\n",
		    mi->mi_vers, VTOR(vp)->r_server->sv_hostname, NULL);
#else
		nfs_perror(error, "NFS write error on host %s: %m.\n",
		    VTOR(vp)->r_server->sv_hostname, NULL);
#endif
		if (error == ENOSPC || error == EDQUOT) {
			zcmn_err(zoneid, CE_CONT,
			    MSG("^File: userid=%d, groupid=%d\n"),
			    crgetuid(cr), crgetgid(cr));
			if (crgetuid(CRED()) != crgetuid(cr) ||
			    crgetgid(CRED()) != crgetgid(cr)) {
				zcmn_err(zoneid, CE_CONT,
				    MSG("^User: userid=%d, groupid=%d\n"),
				    crgetuid(CRED()), crgetgid(CRED()));
			}
			mi->mi_printftime = now +
			    nfs_write_error_interval * hz;
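			/*
			 * mi_printftime is an lbolt timestamp: until it is
			 * reached, the "now - mi->mi_printftime > 0" check
			 * above suppresses further ENOSPC/EDQUOT reports for
			 * this mount, so at most one such message is printed
			 * every nfs_write_error_interval seconds.
			 */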
		}
		nfs_printfhandle(&VTOR(vp)->r_fh);
#ifdef DEBUG
		if (error == EACCES) {
			zcmn_err(zoneid, CE_CONT,
			    MSG("^nfs_bio: cred is%s kcred\n"),
			    cr == kcred ? "" : " not");
		}
#endif
	}
}
/* ARGSUSED */
static void *
nfs_mi_init(zoneid_t zoneid)
{
	struct mi_globals *mig;

	mig = kmem_alloc(sizeof (*mig), KM_SLEEP);
	mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&mig->mig_list, sizeof (mntinfo_t),
	    offsetof(mntinfo_t, mi_zone_node));
	mig->mig_destructor_called = B_FALSE;
	return (mig);
}
/*
 * Callback routine to tell all NFS mounts in the zone to stop creating new
 * threads.  Existing threads should exit.
 */
/* ARGSUSED */
static void
nfs_mi_shutdown(zoneid_t zoneid, void *data)
{
	struct mi_globals *mig = data;
	mntinfo_t *mi;

	ASSERT(mig != NULL);
again:
	mutex_enter(&mig->mig_lock);
	for (mi = list_head(&mig->mig_list); mi != NULL;
	    mi = list_next(&mig->mig_list, mi)) {

		/*
		 * If we've done the shutdown work for this FS, skip.
		 * Once we go off the end of the list, we're done.
		 */
		if (mi->mi_flags & MI_DEAD)
			continue;

		/*
		 * We will do work, so not done.  Get a hold on the FS.
		 */
		VFS_HOLD(mi->mi_vfsp);

		/*
		 * purge the DNLC for this filesystem
		 */
		(void) dnlc_purge_vfsp(mi->mi_vfsp, 0);

		mutex_enter(&mi->mi_async_lock);
		/*
		 * Tell existing async worker threads to exit.
		 */
		mi->mi_max_threads = 0;
		NFS_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
		/*
		 * Set MI_ASYNC_MGR_STOP so the async manager thread starts
		 * getting ready to exit when it's done with its current work.
		 * Also set MI_DEAD to note we've acted on this FS.
		 */
		mutex_enter(&mi->mi_lock);
		mi->mi_flags |= (MI_ASYNC_MGR_STOP|MI_DEAD);
		mutex_exit(&mi->mi_lock);
		/*
		 * Wake up the async manager thread.
		 */
		cv_broadcast(&mi->mi_async_reqs_cv);
		mutex_exit(&mi->mi_async_lock);

		/*
		 * Drop lock and release FS, which may change list, then
		 * repeat.  We're done when every mi has been done or the
		 * list is empty.
		 */
		mutex_exit(&mig->mig_lock);
		VFS_RELE(mi->mi_vfsp);
		goto again;
	}
	mutex_exit(&mig->mig_lock);
}
static void
nfs_mi_free_globals(struct mi_globals *mig)
{
	list_destroy(&mig->mig_list);	/* makes sure the list is empty */
	mutex_destroy(&mig->mig_lock);
	kmem_free(mig, sizeof (*mig));
}
/* ARGSUSED */
static void
nfs_mi_destroy(zoneid_t zoneid, void *data)
{
	struct mi_globals *mig = data;

	ASSERT(mig != NULL);
	mutex_enter(&mig->mig_lock);
	if (list_head(&mig->mig_list) != NULL) {
		/* Still waiting for VFS_FREEVFS() */
		mig->mig_destructor_called = B_TRUE;
		mutex_exit(&mig->mig_lock);
		return;
	}
	nfs_mi_free_globals(mig);
}
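/*
 * Note the teardown hand-off above: if any mounts are still on mig_list at
 * zone destroy time, the final nfs_mi_zonelist_remove() (driven later by
 * VFS_FREEVFS()) is the call that actually frees the per-zone globals.
 */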
/*
 * Add an NFS mount to the per-zone list of NFS mounts.
 */
void
nfs_mi_zonelist_add(mntinfo_t *mi)
{
	struct mi_globals *mig;

	mig = zone_getspecific(mi_list_key, mi->mi_zone);
	mutex_enter(&mig->mig_lock);
	list_insert_head(&mig->mig_list, mi);
	mutex_exit(&mig->mig_lock);
}
/*
 * Remove an NFS mount from the per-zone list of NFS mounts.
 */
static void
nfs_mi_zonelist_remove(mntinfo_t *mi)
{
	struct mi_globals *mig;

	mig = zone_getspecific(mi_list_key, mi->mi_zone);
	mutex_enter(&mig->mig_lock);
	list_remove(&mig->mig_list, mi);
	/*
	 * We can be called asynchronously by VFS_FREEVFS() after the zone
	 * shutdown/destroy callbacks have executed; if so, clean up the zone's
	 * mi_globals.
	 */
	if (list_head(&mig->mig_list) == NULL &&
	    mig->mig_destructor_called == B_TRUE) {
		nfs_mi_free_globals(mig);
		return;
	}
	mutex_exit(&mig->mig_lock);
}
/*
 * NFS Client initialization routine.  This routine should only be called
 * once.  It performs the following tasks:
 *	- Initialize all global locks
 *	- Call sub-initialization routines (localize access to variables)
 */
int
nfs_clntinit(void)
{
#ifdef DEBUG
	static boolean_t nfs_clntup = B_FALSE;
#endif
	int error;

#ifdef DEBUG
	ASSERT(nfs_clntup == B_FALSE);
#endif

	error = nfs_subrinit();
	if (error)
		return (error);

	error = nfs_vfsinit();
	if (error) {
		/*
		 * Cleanup nfs_subrinit() work
		 */
		nfs_subrfini();
		return (error);
	}

	zone_key_create(&mi_list_key, nfs_mi_init, nfs_mi_shutdown,
	    nfs_mi_destroy);

#ifdef DEBUG
	nfs_clntup = B_TRUE;
#endif

	return (0);
}
/*
 * This routine is only called if the NFS Client has been initialized but
 * the module failed to be installed.  This routine will cleanup the
 * previously allocated/initialized work.
 */
void
nfs_clntfini(void)
{
	(void) zone_key_delete(mi_list_key);
	nfs_subrfini();
	nfs_vfsfini();
}
/*
 * Release any locks on the given vnode that are held by the current
 * process.
 */
void
nfs_lockrelease(vnode_t *vp, int flag, offset_t offset, cred_t *cr)
{
	flock64_t ld;
	struct shrlock shr;
	char *buf;
	int remote_lock_possible;
	int ret;

	ASSERT((uintptr_t)vp > KERNELBASE);

	/*
	 * Generate an explicit unlock operation for the entire file.  As a
	 * partial optimization, only generate the unlock if there is a
	 * lock registered for the file.  We could check whether this
	 * particular process has any locks on the file, but that would
	 * require the local locking code to provide yet another query
	 * routine.  Note that no explicit synchronization is needed here.
	 * At worst, flk_has_remote_locks() will return a false positive,
	 * in which case the unlock call wastes time but doesn't harm
	 * correctness.
	 *
	 * In addition, an unlock request is generated if the process
	 * is listed as possibly having a lock on the file because the
	 * server and client lock managers may have gotten out of sync.
	 * N.B. It is important to make sure nfs_remove_locking_id() is
	 * called here even if flk_has_remote_locks(vp) reports true.
	 * If it is not called and there is an entry on the process id
	 * list, that entry will never get removed.
	 */
	remote_lock_possible = nfs_remove_locking_id(vp, RLMPL_PID,
	    (char *)&(ttoproc(curthread)->p_pid), NULL, NULL);
	if (remote_lock_possible || flk_has_remote_locks(vp)) {
		ld.l_type = F_UNLCK;	/* set to unlock entire file */
		ld.l_whence = 0;	/* unlock from start of file */
		ld.l_start = 0;
		ld.l_len = 0;		/* do entire file */
		ret = fop_frlock(vp, F_SETLK, &ld, flag, offset, NULL, cr,
		    NULL);

		if (ret != 0) {
			/*
			 * If fop_frlock fails, make sure we unregister
			 * local locks before we continue.
			 */
			ld.l_pid = ttoproc(curthread)->p_pid;
			lm_register_lock_locally(vp, NULL, &ld, flag, offset);
#ifdef DEBUG
			nfs_perror(ret,
			    "NFS lock release error on vp %p: %m.\n",
			    (void *)vp, NULL);
#endif
		}

		/*
		 * The call to fop_frlock may put the pid back on the
		 * list.  We need to remove it.
		 */
		(void) nfs_remove_locking_id(vp, RLMPL_PID,
		    (char *)&(ttoproc(curthread)->p_pid), NULL, NULL);
	}

	/*
	 * As long as the vp has a share matching our pid,
	 * pluck it off and unshare it.  There are circumstances in
	 * which the call to nfs_remove_locking_id() may put the
	 * owner back on the list, in which case we simply do a
	 * redundant and harmless unshare.
	 */
	buf = kmem_alloc(MAX_SHR_OWNER_LEN, KM_SLEEP);
	while (nfs_remove_locking_id(vp, RLMPL_OWNER,
	    NULL, buf, &shr.s_own_len)) {
		shr.s_owner = buf;
		shr.s_access = 0;
		shr.s_deny = 0;
		shr.s_sysid = 0;
		shr.s_pid = curproc->p_pid;

		ret = fop_shrlock(vp, F_UNSHARE, &shr, flag, cr, NULL);
#ifdef DEBUG
		if (ret != 0) {
			nfs_perror(ret,
			    "NFS share release error on vp %p: %m.\n",
			    (void *)vp, NULL);
		}
#endif
	}
	kmem_free(buf, MAX_SHR_OWNER_LEN);
}
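/*
 * nfs_lockrelease() is intended to run from the NFS v2/v3 close path (see
 * nfs_close()/nfs3_close()), so a process's locks and shares get cleaned up
 * on its last close of the file even when the client and server lock
 * managers disagree about what is held.
 */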
/*
 * nfs_lockcompletion:
 *
 * If the vnode has a lock that makes it unsafe to cache the file, mark it
 * as non-cacheable (set VNOCACHE bit).
 */
void
nfs_lockcompletion(vnode_t *vp, int cmd)
{
#ifdef DEBUG
	rnode_t *rp = VTOR(vp);

	ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
#endif

	if (cmd == F_SETLK || cmd == F_SETLKW) {
		if (!lm_safemap(vp)) {
			mutex_enter(&vp->v_lock);
			vp->v_flag |= VNOCACHE;
			mutex_exit(&vp->v_lock);
		} else {
			mutex_enter(&vp->v_lock);
			vp->v_flag &= ~VNOCACHE;
			mutex_exit(&vp->v_lock);
		}
	}
	/*
	 * The cached attributes of the file are stale after acquiring
	 * the lock on the file.  They were updated when the file was
	 * opened, but not updated when the lock was acquired.  Therefore the
	 * cached attributes are invalidated after the lock is obtained.
	 */
	PURGE_ATTRCACHE(vp);
}
/*
 * The lock manager holds state making it possible for the client
 * and server to be out of sync.  For example, if the response from
 * the server granting a lock request is lost, the server will think
 * the lock is granted and the client will think the lock is lost.
 * The client can tell when it is not positive if it is in sync with
 * the server.
 *
 * To deal with this, a list of processes for which the client is
 * not sure if the server holds a lock is attached to the rnode.
 * When such a process closes the rnode, an unlock request is sent
 * to the server to unlock the entire file.
 *
 * The list is kept as a singly linked NULL terminated list.
 * Because it is only added to under extreme error conditions, the
 * list shouldn't get very big.  DEBUG kernels print a message if
 * the list gets bigger than nfs_lmpl_high_water.  This is arbitrarily
 * chosen to be 8, but can be tuned at runtime.
 */
/* int nfs_lmpl_high_water = 8; */
int nfs_lmpl_high_water = 128;
int nfs_cnt_add_locking_id = 0;
int nfs_len_add_locking_id = 0;
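/*
 * On kernels where these symbols are compiled in, they can also be set at
 * boot via /etc/system, e.g. "set nfs:nfs_lmpl_high_water = 256"
 * (illustrative value).
 */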
/*
 * Record that the nfs lock manager server may be holding a lock on
 * a vnode for a process.
 *
 * Because the nfs lock manager server holds state, it is possible
 * for the server to get out of sync with the client.  This routine is called
 * from the client when it is no longer sure if the server is in sync
 * with the client.  nfs_lockrelease() will then notice this and send
 * an unlock request when the file is closed.
 */
void
nfs_add_locking_id(vnode_t *vp, pid_t pid, int type, char *id, int len)
{
	rnode_t *rp;
	lmpl_t *new;
	lmpl_t *cur;
	lmpl_t **lmplp;
#ifdef DEBUG
	int list_len = 1;
#endif /* DEBUG */

#ifdef DEBUG
	++nfs_cnt_add_locking_id;
#endif /* DEBUG */
	/*
	 * allocate new lmpl_t now so we don't sleep
	 * later after grabbing mutexes
	 */
	ASSERT(len < MAX_SHR_OWNER_LEN);
	new = kmem_alloc(sizeof (*new), KM_SLEEP);
	new->lmpl_type = type;
	new->lmpl_pid = pid;
	new->lmpl_owner = kmem_alloc(len, KM_SLEEP);
	bcopy(id, new->lmpl_owner, len);
	new->lmpl_own_len = len;
	new->lmpl_next = (lmpl_t *)NULL;
#ifdef DEBUG
	if (type == RLMPL_PID) {
		ASSERT(len == sizeof (pid_t));
		ASSERT(pid == *(pid_t *)new->lmpl_owner);
	} else {
		ASSERT(type == RLMPL_OWNER);
	}
#endif

	rp = VTOR(vp);
	mutex_enter(&rp->r_statelock);

	/*
	 * Add this id to the list for this rnode only if the
	 * rnode is active and the id is not already there.
	 */
	ASSERT(rp->r_flags & RHASHED);
	lmplp = &(rp->r_lmpl);
	for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; cur = cur->lmpl_next) {
		if (cur->lmpl_pid == pid &&
		    cur->lmpl_type == type &&
		    cur->lmpl_own_len == len &&
		    bcmp(cur->lmpl_owner, new->lmpl_owner, len) == 0) {
			kmem_free(new->lmpl_owner, len);
			kmem_free(new, sizeof (*new));
			break;
		}
		lmplp = &cur->lmpl_next;
#ifdef DEBUG
		++list_len;
#endif /* DEBUG */
	}
	if (cur == (lmpl_t *)NULL) {
		*lmplp = new;
#ifdef DEBUG
		if (list_len > nfs_len_add_locking_id) {
			nfs_len_add_locking_id = list_len;
		}
		if (list_len > nfs_lmpl_high_water) {
			cmn_err(CE_WARN, "nfs_add_locking_id: long list "
			    "vp=%p is %d", (void *)vp, list_len);
		}
#endif /* DEBUG */
	}

#ifdef DEBUG
	if (share_debug) {
		int nitems = 0;
		int npids = 0;
		int nowners = 0;

		/*
		 * Count the number of things left on r_lmpl after the add.
		 */
		for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL;
		    cur = cur->lmpl_next) {
			nitems++;
			if (cur->lmpl_type == RLMPL_PID) {
				npids++;
			} else if (cur->lmpl_type == RLMPL_OWNER) {
				nowners++;
			} else {
				cmn_err(CE_PANIC, "nfs_add_locking_id: "
				    "unrecognized lmpl_type %d",
				    cur->lmpl_type);
			}
		}

		cmn_err(CE_CONT, "nfs_add_locking_id(%s): %d PIDs + %d "
		    "OWNs = %d items left on r_lmpl\n",
		    (type == RLMPL_PID) ? "P" : "O", npids, nowners, nitems);
	}
#endif

	mutex_exit(&rp->r_statelock);
}
/*
 * Remove an id from the lock manager id list.
 *
 * If the id is not in the list return 0.  If it was found and
 * removed, return 1.
 */
static int
nfs_remove_locking_id(vnode_t *vp, int type, char *id, char *rid, int *rlen)
{
	lmpl_t *cur;
	lmpl_t **lmplp;
	rnode_t *rp;
	int rv = 0;

	ASSERT(type == RLMPL_PID || type == RLMPL_OWNER);

	rp = VTOR(vp);

	mutex_enter(&rp->r_statelock);
	ASSERT(rp->r_flags & RHASHED);
	lmplp = &(rp->r_lmpl);

	/*
	 * Search through the list and remove the entry for this id
	 * if it is there.  The special case id == NULL allows removal
	 * of the first share on the r_lmpl list belonging to the
	 * current process (if any), without regard to further details
	 * of its identity.
	 */
	for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; cur = cur->lmpl_next) {
		if (cur->lmpl_type == type &&
		    cur->lmpl_pid == curproc->p_pid &&
		    (id == NULL ||
		    bcmp(cur->lmpl_owner, id, cur->lmpl_own_len) == 0)) {
			*lmplp = cur->lmpl_next;
			ASSERT(cur->lmpl_own_len < MAX_SHR_OWNER_LEN);
			if (rid != NULL) {
				bcopy(cur->lmpl_owner, rid, cur->lmpl_own_len);
				*rlen = cur->lmpl_own_len;
			}
			kmem_free(cur->lmpl_owner, cur->lmpl_own_len);
			kmem_free(cur, sizeof (*cur));
			rv = 1;
			break;
		}
		lmplp = &cur->lmpl_next;
	}

#ifdef DEBUG
	if (share_debug) {
		int nitems = 0;
		int npids = 0;
		int nowners = 0;

		/*
		 * Count the number of things left on r_lmpl after the remove.
		 */
		for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL;
		    cur = cur->lmpl_next) {
			nitems++;
			if (cur->lmpl_type == RLMPL_PID) {
				npids++;
			} else if (cur->lmpl_type == RLMPL_OWNER) {
				nowners++;
			} else {
				cmn_err(CE_PANIC,
				    "nrli: unrecognized lmpl_type %d",
				    cur->lmpl_type);
			}
		}

		cmn_err(CE_CONT,
		    "nrli(%s): %d PIDs + %d OWNs = %d items left on r_lmpl\n",
		    (type == RLMPL_PID) ? "P" : "O",
		    npids, nowners, nitems);
	}
#endif

	mutex_exit(&rp->r_statelock);

	return (rv);
}
void
nfs_free_mi(mntinfo_t *mi)
{
	ASSERT(mi->mi_flags & MI_ASYNC_MGR_STOP);
	ASSERT(mi->mi_manager_thread == NULL);
	ASSERT(mi->mi_threads[NFS_ASYNC_QUEUE] == 0 &&
	    mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] == 0);

	/*
	 * Remove the node from the global list before we start tearing
	 * it down.
	 */
	nfs_mi_zonelist_remove(mi);
	if (mi->mi_klmconfig) {
		lm_free_config(mi->mi_klmconfig);
		kmem_free(mi->mi_klmconfig, sizeof (struct knetconfig));
	}
	mutex_destroy(&mi->mi_lock);
	mutex_destroy(&mi->mi_remap_lock);
	mutex_destroy(&mi->mi_async_lock);
	cv_destroy(&mi->mi_failover_cv);
	cv_destroy(&mi->mi_async_work_cv[NFS_ASYNC_QUEUE]);
	cv_destroy(&mi->mi_async_work_cv[NFS_ASYNC_PGOPS_QUEUE]);
	cv_destroy(&mi->mi_async_reqs_cv);
	cv_destroy(&mi->mi_async_cv);
	zone_rele_ref(&mi->mi_zone_ref, ZONE_REF_NFS);
	kmem_free(mi, sizeof (*mi));
}
static int
mnt_kstat_update(kstat_t *ksp, int rw)
{
	struct mntinfo_kstat *mik;
	struct vfs *vfsp;
	mntinfo_t *mi;
	int i;

	/* this is a read-only kstat. Bail out on a write */
	if (rw == KSTAT_WRITE)
		return (EACCES);

	/*
	 * We don't want to wait here as kstat_chain_lock could be held by
	 * dounmount().  dounmount() takes vfs_reflock before the chain lock
	 * and thus could lead to a deadlock.
	 */
	vfsp = (struct vfs *)ksp->ks_private;

	mi = VFTOMI(vfsp);

	mik = (struct mntinfo_kstat *)ksp->ks_data;

	(void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto);
	mik->mik_vers = (uint32_t)mi->mi_vers;
	mik->mik_flags = mi->mi_flags;
	mik->mik_secmod = mi->mi_curr_serv->sv_secdata->secmod;
	mik->mik_curread = (uint32_t)mi->mi_curread;
	mik->mik_curwrite = (uint32_t)mi->mi_curwrite;
	mik->mik_retrans = mi->mi_retrans;
	mik->mik_timeo = mi->mi_timeo;
	mik->mik_acregmin = HR2SEC(mi->mi_acregmin);
	mik->mik_acregmax = HR2SEC(mi->mi_acregmax);
	mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin);
	mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax);
	for (i = 0; i < NFS_CALLTYPES + 1; i++) {
		mik->mik_timers[i].srtt = (uint32_t)mi->mi_timers[i].rt_srtt;
		mik->mik_timers[i].deviate =
		    (uint32_t)mi->mi_timers[i].rt_deviate;
		mik->mik_timers[i].rtxcur =
		    (uint32_t)mi->mi_timers[i].rt_rtxcur;
	}
	mik->mik_noresponse = (uint32_t)mi->mi_noresponse;
	mik->mik_failover = (uint32_t)mi->mi_failover;
	mik->mik_remap = (uint32_t)mi->mi_remap;
	(void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname);

	return (0);
}
void
nfs_mnt_kstat_init(struct vfs *vfsp)
{
	mntinfo_t *mi = VFTOMI(vfsp);

	/*
	 * Create the version specific kstats.
	 *
	 * PSARC 2001/697 Contract Private Interface
	 * All nfs kstats are under SunMC contract
	 * Please refer to the PSARC listed above and contact
	 * SunMC before making any changes!
	 *
	 * Changes must be reviewed by Solaris File Sharing
	 * Changes must be communicated to contract-2001-697@sun.com
	 */

	mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev),
	    NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id);
	if (mi->mi_io_kstats) {
		if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
			kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID);
		mi->mi_io_kstats->ks_lock = &mi->mi_lock;
		kstat_install(mi->mi_io_kstats);
	}

	if ((mi->mi_ro_kstats = kstat_create_zone("nfs",
	    getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW,
	    sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) {
		if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
			kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID);
		mi->mi_ro_kstats->ks_update = mnt_kstat_update;
		mi->mi_ro_kstats->ks_private = (void *)vfsp;
		kstat_install(mi->mi_ro_kstats);
	}
}
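/*
 * The raw "mntinfo" kstat created above is the per-mount snapshot that
 * userland tools such as nfsstat -m read to report mount flags, the current
 * server name and the RPC timing data filled in by mnt_kstat_update().
 */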
nfs_delmapcall_t *
nfs_init_delmapcall()
{
	nfs_delmapcall_t *delmap_call;

	delmap_call = kmem_alloc(sizeof (nfs_delmapcall_t), KM_SLEEP);
	delmap_call->call_id = curthread;
	delmap_call->error = 0;

	return (delmap_call);
}
void
nfs_free_delmapcall(nfs_delmapcall_t *delmap_call)
{
	kmem_free(delmap_call, sizeof (nfs_delmapcall_t));
}
/*
 * Searches for the current delmap caller (based on curthread) in the list of
 * callers.  If it is found, we remove it and free the delmap caller.
 * Returns:
 *	0 if the caller wasn't found
 *	1 if the caller was found, removed and freed.  *errp is set to what
 *	the result of the delmap was.
 */
int
nfs_find_and_delete_delmapcall(rnode_t *rp, int *errp)
{
	nfs_delmapcall_t *delmap_call;

	/*
	 * If the list doesn't exist yet, we create it and return
	 * that the caller wasn't found.  No list = no callers.
	 */
	mutex_enter(&rp->r_statelock);
	if (!(rp->r_flags & RDELMAPLIST)) {
		/* The list does not exist */
		list_create(&rp->r_indelmap, sizeof (nfs_delmapcall_t),
		    offsetof(nfs_delmapcall_t, call_node));
		rp->r_flags |= RDELMAPLIST;
		mutex_exit(&rp->r_statelock);
		return (0);
	} else {
		/* The list exists so search it */
		for (delmap_call = list_head(&rp->r_indelmap);
		    delmap_call != NULL;
		    delmap_call = list_next(&rp->r_indelmap, delmap_call)) {
			if (delmap_call->call_id == curthread) {
				/* current caller is in the list */
				*errp = delmap_call->error;
				list_remove(&rp->r_indelmap, delmap_call);
				mutex_exit(&rp->r_statelock);
				nfs_free_delmapcall(delmap_call);
				return (1);
			}
		}
	}
	mutex_exit(&rp->r_statelock);
	return (0);
}
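/*
 * A rough sketch of how the delmap helpers above are used by the delmap
 * vnode operation (see nfs_delmap() in nfs_vnops.c): the caller first asks
 * nfs_find_and_delete_delmapcall() whether the current thread already has a
 * completed call recorded on this rnode and, if so, simply returns the saved
 * error; otherwise it allocates a tracking entry with nfs_init_delmapcall(),
 * inserts it on rp->r_indelmap, and dispatches the real unmap work
 * asynchronously, which later stores its result in delmap_call->error.
 */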