4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
27 * NFS Version 4 state recovery code.
30 #include <nfs/nfs4_clnt.h>
32 #include <nfs/rnode4.h>
33 #include <sys/cmn_err.h>
35 #include <sys/systm.h>
36 #include <sys/flock.h>
42 #include <sys/mount.h>
44 #include <nfs/nfssys.h>
45 #include <nfs/nfsid_map.h>
46 #include <nfs/nfs4_idmap_impl.h>
48 extern r4hashq_t
*rtable4
;
51 * Information that describes what needs to be done for recovery. It is
52 * passed to a client recovery thread as well as passed to various recovery
53 * routines. rc_mi, rc_vp1, and rc_vp2 refer to the filesystem and
54 * vnode(s) affected by recovery. rc_vp1 and rc_vp2 are references (use
55 * VN_HOLD) or NULL. rc_lost_rqst contains information about the lost
56 * lock or open/close request, and it holds reference counts for the
57 * various objects (vnode, etc.). The recovery thread also uses flags set
58 * in the mntinfo4_t or vnode_t to tell it what to do. rc_error is used
59 * to save the error that originally triggered the recovery event -- will
60 * later be used to set mi_error if recovery doesn't work. rc_bseqid_rqst
61 * contains information about the request that got NFS4ERR_BAD_SEQID, and
62 * it holds reference counts for the various objects (vnode, open owner,
63 * open stream, lock owner).
70 nfs4_recov_t rc_action
;
72 bool_t rc_srv_reboot
; /* server has rebooted */
73 nfs4_lost_rqst_t
*rc_lost_rqst
;
74 nfs4_error_t rc_orig_errors
; /* original errors causing recovery */
76 nfs4_bseqid_entry_t
*rc_bseqid_rqst
;
82 * How long to wait before trying again if there is an error doing
83 * recovery, in seconds.
86 static int recov_err_delay
= 1;
89 * How long to wait when processing NFS4ERR_GRACE or NFS4ERR_DELAY
90 * errors. Expressed in seconds. Default is defined as
91 * NFS4ERR_DELAY_TIME and this variable is initialized in nfs4_subr_init()
93 time_t nfs4err_delay_time
= 0;
96 * Tuneable to limit how many times "exempt" ops go OTW
97 * after a recovery error. Exempt op hints are OH_CLOSE,
98 * OH_LOCKU, OH_DELEGRETURN. These previously always went
99 * OTW even after rnode was "dead" due to recovery errors.
101 * The tuneable below limits the number of times a start_fop
102 * invocation will retry the exempt hints. After the limit
103 * is reached, nfs4_start_fop will return an error just like
104 * it would for non-exempt op hints.
106 int nfs4_max_recov_error_retry
= 3;
109 * Number of seconds the recovery thread should pause before retry when the
110 * filesystem has been forcibly unmounted.
113 int nfs4_unmount_delay
= 1;
118 * How long to wait (in seconds) between recovery operations on a given
119 * file. Normally zero, but could be set longer for testing purposes.
121 static int nfs4_recovdelay
= 0;
124 * Switch that controls whether to go into the debugger when recovery
127 static int nfs4_fail_recov_stop
= 0;
130 * Tuneables to debug client namespace interaction with server
133 * nfs4_srvmnt_fail_cnt:
134 * number of times EACCES returned because client
135 * attempted to cross server mountpoint
138 * trigger console printf whenever client attempts
139 * to cross server mountpoint
141 int nfs4_srvmnt_fail_cnt
= 0;
142 int nfs4_srvmnt_debug
= 0;
145 extern zone_key_t nfs4clnt_zone_key
;
147 /* forward references, in alphabetic order */
148 static void close_after_open_resend(vnode_t
*, cred_t
*, uint32_t,
150 static void errs_to_action(recov_info_t
*,
151 nfs4_server_t
*, mntinfo4_t
*, stateid4
*, nfs4_lost_rqst_t
*, int,
152 nfs_opnum4
, nfs4_bseqid_entry_t
*);
153 static void flush_reinstate(nfs4_lost_rqst_t
*);
154 static void free_milist(mntinfo4_t
**, int);
155 static mntinfo4_t
**make_milist(nfs4_server_t
*, int *);
156 static int nfs4_check_recov_err(vnode_t
*, nfs4_op_hint_t
,
157 nfs4_recov_state_t
*, int, char *);
158 static char *nfs4_getsrvnames(mntinfo4_t
*, size_t *);
159 static void nfs4_recov_fh_fail(vnode_t
*, int, nfsstat4
);
160 static void nfs4_recov_thread(recov_info_t
*);
161 static void nfs4_remove_lost_rqsts(mntinfo4_t
*, nfs4_server_t
*);
162 static void nfs4_resend_lost_rqsts(recov_info_t
*, nfs4_server_t
*);
163 static cred_t
*pid_to_cr(pid_t
);
164 static void reclaim_one_lock(vnode_t
*, flock64_t
*, nfs4_error_t
*, int *);
165 static void recov_bad_seqid(recov_info_t
*);
166 static void recov_badstate(recov_info_t
*, vnode_t
*, nfsstat4
);
167 static void recov_clientid(recov_info_t
*, nfs4_server_t
*);
168 static void recov_done(mntinfo4_t
*, recov_info_t
*);
169 static void recov_filehandle(nfs4_recov_t
, mntinfo4_t
*, vnode_t
*);
170 static void recov_newserver(recov_info_t
*, nfs4_server_t
**, bool_t
*);
171 static void recov_openfiles(recov_info_t
*, nfs4_server_t
*);
172 static void recov_stale(mntinfo4_t
*, vnode_t
*);
173 static void nfs4_free_lost_rqst(nfs4_lost_rqst_t
*, nfs4_server_t
*);
174 static void recov_throttle(recov_info_t
*, vnode_t
*);
175 static void relock_skip_pid(vnode_t
*, locklist_t
*, pid_t
);
176 static void resend_lock(nfs4_lost_rqst_t
*, nfs4_error_t
*);
177 static void resend_one_op(nfs4_lost_rqst_t
*, nfs4_error_t
*, mntinfo4_t
*,
179 static void save_bseqid_rqst(nfs4_bseqid_entry_t
*, recov_info_t
*);
180 static void start_recovery(recov_info_t
*, mntinfo4_t
*, vnode_t
*, vnode_t
*,
181 nfs4_server_t
*, vnode_t
*, char *);
182 static void start_recovery_action(nfs4_recov_t
, bool_t
, mntinfo4_t
*, vnode_t
*,
184 static int wait_for_recovery(mntinfo4_t
*, nfs4_op_hint_t
);
187 * Return non-zero if the given errno, status, and rpc status codes
188 * in the nfs4_error_t indicate that client recovery is needed.
189 * "stateful" indicates whether the call that got the error establishes or
190 * removes state on the server (open, close, lock, unlock, delegreturn).
194 nfs4_needs_recovery(nfs4_error_t
*ep
, bool_t stateful
, vfs_t
*vfsp
)
200 * Try failover if the error values justify it and if
201 * it's a failover mount. Don't try if the mount is in
202 * progress, failures are handled explicitly by nfs4rootvp.
204 if (nfs4_try_failover(ep
)) {
206 mutex_enter(&mi
->mi_lock
);
207 recov
= FAILOVER_MOUNT4(mi
) && !(mi
->mi_flags
& MI4_MOUNTING
);
208 mutex_exit(&mi
->mi_lock
);
213 if (ep
->error
== EINTR
|| NFS4_FRC_UNMT_ERR(ep
->error
, vfsp
)) {
215 * The server may have gotten the request, so for stateful
216 * ops we need to resynchronize and possibly back out the
224 /* stat values are listed alphabetically */
226 * There are two lists here: the errors for which we have code, and
227 * the errors for which we plan to have code before FCS. For the
228 * second list, print a warning message but don't attempt recovery.
231 case NFS4ERR_BADHANDLE
:
232 case NFS4ERR_BAD_SEQID
:
233 case NFS4ERR_BAD_STATEID
:
235 case NFS4ERR_EXPIRED
:
236 case NFS4ERR_FHEXPIRED
:
238 case NFS4ERR_OLD_STATEID
:
239 case NFS4ERR_RESOURCE
:
240 case NFS4ERR_STALE_CLIENTID
:
241 case NFS4ERR_STALE_STATEID
:
242 case NFS4ERR_WRONGSEC
:
247 case NFS4ERR_LEASE_MOVED
:
249 zcmn_err(VFTOMI4(vfsp
)->mi_zone
->zone_id
,
250 CE_WARN
, "!Can't yet recover from NFS status %d",
260 * Some operations such as DELEGRETURN want to avoid invoking
261 * recovery actions that will only mark the file dead. If
262 * better handlers are invoked for any of these errors, this
263 * routine should be modified.
266 nfs4_recov_marks_dead(nfsstat4 status
)
268 if (status
== NFS4ERR_BAD_SEQID
||
269 status
== NFS4ERR_EXPIRED
||
270 status
== NFS4ERR_BAD_STATEID
||
271 status
== NFS4ERR_OLD_STATEID
)
277 * Transfer the state recovery information in recovp to mi's resend queue,
278 * and mark mi as having a lost state request.
281 nfs4_enqueue_lost_rqst(recov_info_t
*recovp
, mntinfo4_t
*mi
)
283 nfs4_lost_rqst_t
*lrp
= recovp
->rc_lost_rqst
;
285 ASSERT(nfs_rw_lock_held(&mi
->mi_recovlock
, RW_READER
) ||
286 nfs_rw_lock_held(&mi
->mi_recovlock
, RW_WRITER
));
288 ASSERT(lrp
!= NULL
&& lrp
->lr_op
!= 0);
290 NFS4_DEBUG(nfs4_lost_rqst_debug
, (CE_NOTE
,
291 "nfs4_enqueue_lost_rqst %p, op %d",
292 (void *)lrp
, lrp
->lr_op
));
294 mutex_enter(&mi
->mi_lock
);
295 mi
->mi_recovflags
|= MI4R_LOST_STATE
;
296 if (lrp
->lr_putfirst
)
297 list_insert_head(&mi
->mi_lost_state
, lrp
);
299 list_insert_tail(&mi
->mi_lost_state
, lrp
);
300 recovp
->rc_lost_rqst
= NULL
;
301 mutex_exit(&mi
->mi_lock
);
303 nfs4_queue_event(RE_LOST_STATE
, mi
, NULL
, lrp
->lr_op
, lrp
->lr_vp
,
304 lrp
->lr_dvp
, 0, NULL
, 0, TAG_NONE
, TAG_NONE
, 0, 0);
308 * Transfer the bad seqid recovery information in recovp to mi's
309 * bad seqid queue, and mark mi as having a bad seqid request.
312 enqueue_bseqid_rqst(recov_info_t
*recovp
, mntinfo4_t
*mi
)
314 ASSERT(nfs_rw_lock_held(&mi
->mi_recovlock
, RW_READER
) ||
315 nfs_rw_lock_held(&mi
->mi_recovlock
, RW_WRITER
));
316 ASSERT(recovp
->rc_bseqid_rqst
!= NULL
);
318 mutex_enter(&mi
->mi_lock
);
319 mi
->mi_recovflags
|= MI4R_BAD_SEQID
;
320 list_insert_tail(&mi
->mi_bseqid_list
, recovp
->rc_bseqid_rqst
);
321 recovp
->rc_bseqid_rqst
= NULL
;
322 mutex_exit(&mi
->mi_lock
);
328 * The nfs4_error_t contains the return codes that triggered a recovery
329 * attempt. mi, vp1, and vp2 refer to the filesystem and files that were
330 * being operated on. vp1 and vp2 may be NULL.
332 * Multiple calls are okay. If recovery is already underway, the call
333 * updates the information about what state needs recovery but does not
334 * start a new thread. The caller should hold mi->mi_recovlock as a reader
335 * for proper synchronization with any recovery thread.
337 * This will return TRUE if recovery was aborted, and FALSE otherwise.
340 nfs4_start_recovery(nfs4_error_t
*ep
, mntinfo4_t
*mi
, vnode_t
*vp1
,
341 vnode_t
*vp2
, stateid4
*sid
, nfs4_lost_rqst_t
*lost_rqstp
, nfs_opnum4 op
,
342 nfs4_bseqid_entry_t
*bsep
, vnode_t
*moved_vp
, char *moved_nm
)
344 recov_info_t
*recovp
;
346 bool_t abort
= FALSE
;
349 ASSERT(nfs_zone() == mi
->mi_zone
);
350 mutex_enter(&mi
->mi_lock
);
352 * If there is lost state, we need to kick off recovery even if the
353 * filesystem has been unmounted or the zone is shutting down.
355 gone
= FS_OR_ZONE_GONE4(mi
->mi_vfsp
);
357 ASSERT(ep
->error
!= EINTR
|| lost_rqstp
!= NULL
);
358 if (ep
->error
== EIO
&& lost_rqstp
== NULL
) {
359 /* failed due to forced unmount, no new lost state */
362 if ((ep
->error
== 0 || ep
->error
== ETIMEDOUT
) &&
363 !(mi
->mi_recovflags
& MI4R_LOST_STATE
)) {
364 /* some other failure, no existing lost state */
368 mutex_exit(&mi
->mi_lock
);
369 NFS4_DEBUG(nfs4_client_recov_debug
, (CE_NOTE
,
370 "nfs4_start_recovery: fs unmounted"));
374 mi
->mi_in_recovery
++;
375 mutex_exit(&mi
->mi_lock
);
377 recovp
= kmem_alloc(sizeof (recov_info_t
), KM_SLEEP
);
378 recovp
->rc_orig_errors
= *ep
;
379 sp
= find_nfs4_server(mi
);
380 errs_to_action(recovp
, sp
, mi
, sid
, lost_rqstp
, gone
, op
, bsep
);
382 mutex_exit(&sp
->s_lock
);
383 start_recovery(recovp
, mi
, vp1
, vp2
, sp
, moved_vp
, moved_nm
);
385 nfs4_server_rele(sp
);
390 * Internal version of nfs4_start_recovery. The difference is that the
391 * caller specifies the recovery action, rather than the errors leading to
395 start_recovery_action(nfs4_recov_t what
, bool_t reboot
, mntinfo4_t
*mi
,
396 vnode_t
*vp1
, vnode_t
*vp2
)
398 recov_info_t
*recovp
;
400 ASSERT(nfs_zone() == mi
->mi_zone
);
401 mutex_enter(&mi
->mi_lock
);
402 mi
->mi_in_recovery
++;
403 mutex_exit(&mi
->mi_lock
);
405 recovp
= kmem_zalloc(sizeof (recov_info_t
), KM_SLEEP
);
406 recovp
->rc_action
= what
;
407 recovp
->rc_srv_reboot
= reboot
;
408 recovp
->rc_error
= EIO
;
409 start_recovery(recovp
, mi
, vp1
, vp2
, NULL
, NULL
, NULL
);
413 start_recovery(recov_info_t
*recovp
, mntinfo4_t
*mi
,
414 vnode_t
*vp1
, vnode_t
*vp2
, nfs4_server_t
*sp
,
415 vnode_t
*moved_vp
, char *moved_nm
)
417 NFS4_DEBUG(nfs4_client_recov_debug
, (CE_NOTE
,
418 "start_recovery: mi %p, what %s", (void*)mi
,
419 nfs4_recov_action_to_str(recovp
->rc_action
)));
422 * Bump the reference on the vfs so that we can pass it to the
425 VFS_HOLD(mi
->mi_vfsp
);
428 switch (recovp
->rc_action
) {
430 ASSERT(nfs_rw_lock_held(&mi
->mi_recovlock
, RW_READER
) ||
431 nfs_rw_lock_held(&mi
->mi_recovlock
, RW_WRITER
));
432 if (mi
->mi_servers
->sv_next
== NULL
)
434 mutex_enter(&mi
->mi_lock
);
435 mi
->mi_recovflags
|= MI4R_NEED_NEW_SERVER
;
436 mutex_exit(&mi
->mi_lock
);
438 if (recovp
->rc_lost_rqst
!= NULL
)
439 nfs4_enqueue_lost_rqst(recovp
, mi
);
444 * If the filesystem has been unmounted, punt.
450 * If nobody else is working on the clientid, mark the
451 * clientid as being no longer set. Then mark the specific
452 * filesystem being worked on.
454 if (!nfs4_server_in_recovery(sp
)) {
455 mutex_enter(&sp
->s_lock
);
456 sp
->s_flags
&= ~N4S_CLIENTID_SET
;
457 mutex_exit(&sp
->s_lock
);
459 ASSERT(nfs_rw_lock_held(&mi
->mi_recovlock
, RW_READER
) ||
460 nfs_rw_lock_held(&mi
->mi_recovlock
, RW_WRITER
));
461 mutex_enter(&mi
->mi_lock
);
462 mi
->mi_recovflags
|= MI4R_NEED_CLIENTID
;
463 if (recovp
->rc_srv_reboot
)
464 mi
->mi_recovflags
|= MI4R_SRV_REBOOT
;
465 mutex_exit(&mi
->mi_lock
);
469 ASSERT(nfs_rw_lock_held(&mi
->mi_recovlock
, RW_READER
) ||
470 nfs_rw_lock_held(&mi
->mi_recovlock
, RW_WRITER
));
471 mutex_enter(&mi
->mi_lock
);
472 mi
->mi_recovflags
|= MI4R_REOPEN_FILES
;
473 if (recovp
->rc_srv_reboot
)
474 mi
->mi_recovflags
|= MI4R_SRV_REBOOT
;
475 mutex_exit(&mi
->mi_lock
);
479 ASSERT(nfs_rw_lock_held(&mi
->mi_recovlock
, RW_READER
) ||
480 nfs_rw_lock_held(&mi
->mi_recovlock
, RW_WRITER
));
481 mutex_enter(&mi
->mi_lock
);
482 mi
->mi_recovflags
|= MI4R_NEED_SECINFO
;
483 mutex_exit(&mi
->mi_lock
);
488 recov_badstate(recovp
, vp1
, NFS4ERR_EXPIRED
);
490 recov_badstate(recovp
, vp2
, NFS4ERR_EXPIRED
);
491 goto out_no_thread
; /* no further recovery possible */
495 recov_badstate(recovp
, vp1
, NFS4ERR_BAD_STATEID
);
497 recov_badstate(recovp
, vp2
, NFS4ERR_BAD_STATEID
);
498 goto out_no_thread
; /* no further recovery possible */
503 recov_throttle(recovp
, vp1
);
505 recov_throttle(recovp
, vp2
);
507 * Recover the filehandle now, rather than using a
508 * separate thread. We can do this because filehandle
509 * recovery is independent of any other state, and because
510 * we know that we are not competing with the recovery
511 * thread at this time. recov_filehandle will deal with
512 * threads that are competing to recover this filehandle.
514 ASSERT(nfs_rw_lock_held(&mi
->mi_recovlock
, RW_READER
) ||
515 nfs_rw_lock_held(&mi
->mi_recovlock
, RW_WRITER
));
517 recov_filehandle(recovp
->rc_action
, mi
, vp1
);
519 recov_filehandle(recovp
->rc_action
, mi
, vp2
);
520 goto out_no_thread
; /* no further recovery needed */
524 * NFS4ERR_STALE handling
525 * recov_stale() could set MI4R_NEED_NEW_SERVER to
526 * indicate that we can and should failover.
528 ASSERT(nfs_rw_lock_held(&mi
->mi_recovlock
, RW_READER
) ||
529 nfs_rw_lock_held(&mi
->mi_recovlock
, RW_WRITER
));
532 recov_stale(mi
, vp1
);
534 recov_stale(mi
, vp2
);
535 mutex_enter(&mi
->mi_lock
);
536 if ((mi
->mi_recovflags
& MI4R_NEED_NEW_SERVER
) == 0) {
537 mutex_exit(&mi
->mi_lock
);
540 mutex_exit(&mi
->mi_lock
);
541 recovp
->rc_action
= NR_FAILOVER
;
545 if (recovp
->rc_bseqid_rqst
) {
546 enqueue_bseqid_rqst(recovp
, mi
);
551 recov_badstate(recovp
, vp1
, NFS4ERR_BAD_SEQID
);
553 recov_badstate(recovp
, vp2
, NFS4ERR_BAD_SEQID
);
554 goto out_no_thread
; /* no further recovery possible */
558 recov_badstate(recovp
, vp1
, NFS4ERR_OLD_STATEID
);
560 recov_badstate(recovp
, vp2
, NFS4ERR_OLD_STATEID
);
561 goto out_no_thread
; /* no further recovery possible */
564 nfs4_set_grace_wait(mi
);
565 goto out_no_thread
; /* no further action required for GRACE */
569 nfs4_set_delay_wait(vp1
);
570 goto out_no_thread
; /* no further action required for DELAY */
572 case NR_LOST_STATE_RQST
:
574 nfs4_enqueue_lost_rqst(recovp
, mi
);
577 nfs4_queue_event(RE_UNEXPECTED_ACTION
, mi
, NULL
,
578 recovp
->rc_action
, NULL
, NULL
, 0, NULL
, 0, TAG_NONE
,
584 * If either file recently went through the same recovery, wait
585 * awhile. This is in case there is some sort of bug; we might not
586 * be able to recover properly, but at least we won't bombard the
587 * server with calls, and we won't tie up the client.
590 recov_throttle(recovp
, vp1
);
592 recov_throttle(recovp
, vp2
);
595 * If there's already a recovery thread, don't start another one.
598 mutex_enter(&mi
->mi_lock
);
599 if (mi
->mi_flags
& MI4_RECOV_ACTIV
) {
600 mutex_exit(&mi
->mi_lock
);
603 mi
->mi_flags
|= MI4_RECOV_ACTIV
;
604 mutex_exit(&mi
->mi_lock
);
605 NFS4_DEBUG(nfs4_client_recov_debug
, (CE_NOTE
,
606 "start_recovery: starting new thread for mi %p", (void*)mi
));
609 recovp
->rc_vp1
= vp1
;
611 ASSERT(VTOMI4(vp1
) == mi
);
612 VN_HOLD(recovp
->rc_vp1
);
614 recovp
->rc_vp2
= vp2
;
616 ASSERT(VTOMI4(vp2
) == mi
);
617 VN_HOLD(recovp
->rc_vp2
);
619 recovp
->rc_moved_vp
= moved_vp
;
620 recovp
->rc_moved_nm
= moved_nm
;
622 (void) zthread_create(NULL
, 0, nfs4_recov_thread
, recovp
, 0,
626 /* not reached by thread creating call */
628 mutex_enter(&mi
->mi_lock
);
629 mi
->mi_in_recovery
--;
630 if (mi
->mi_in_recovery
== 0)
631 cv_broadcast(&mi
->mi_cv_in_recov
);
632 mutex_exit(&mi
->mi_lock
);
634 VFS_RELE(mi
->mi_vfsp
);
637 * Free up resources that were allocated for us.
639 kmem_free(recovp
, sizeof (recov_info_t
));
643 nfs4_check_recov_err(vnode_t
*vp
, nfs4_op_hint_t op
,
644 nfs4_recov_state_t
*rsp
, int retry_err_cnt
, char *str
)
653 exempt
= (op
== OH_CLOSE
|| op
== OH_LOCKU
|| op
== OH_DELEGRETURN
);
655 mutex_enter(&rp
->r_statelock
);
658 * If there was a recovery error, then allow op hints "exempt" from
659 * recov errors to retry (currently 3 times). Either r_error or
660 * EIO is returned for non-exempt op hints.
662 if (rp
->r_flags
& R4RECOVERR
) {
663 if (exempt
&& rsp
->rs_num_retry_despite_err
<=
664 nfs4_max_recov_error_retry
) {
667 * Check to make sure that we haven't already inc'd
668 * rs_num_retry_despite_err for current nfs4_start_fop
669 * instance. We don't want to double inc (if we were
670 * called with vp2, then the vp1 call could have
671 * already incremented).
673 if (retry_err_cnt
== rsp
->rs_num_retry_despite_err
)
674 rsp
->rs_num_retry_despite_err
++;
676 NFS4_DEBUG(nfs4_client_recov_debug
, (CE_NOTE
,
677 "nfs4_start_fop: %s %p DEAD, cnt=%d", str
,
678 (void *)vp
, rsp
->rs_num_retry_despite_err
));
680 error
= (rp
->r_error
? rp
->r_error
: EIO
);
682 * An ESTALE error on a non-regular file is not
683 * "sticky". Return the ESTALE error once, but
684 * clear the condition to allow future operations
685 * to go OTW. This will allow the client to
686 * recover if the server has merely unshared then
687 * re-shared the file system. For regular files,
688 * the unshare has destroyed the open state at the
689 * server and we aren't willing to do a reopen (yet).
691 if (error
== ESTALE
&& vp
->v_type
!= VREG
) {
693 ~(R4RECOVERR
|R4RECOVERRP
|R4STALE
);
697 NFS4_DEBUG(nfs4_client_recov_debug
, (CE_NOTE
,
698 "nfs4_start_fop: %s %p DEAD, cnt=%d error=%d",
700 rsp
->rs_num_retry_despite_err
, error
));
704 mutex_exit(&rp
->r_statelock
);
709 * Initial setup code that every operation should call if it might invoke
710 * client recovery. Can block waiting for recovery to finish on a
711 * filesystem. Either vnode ptr can be NULL.
713 * Returns 0 if there are no outstanding errors. Can return an
714 * errno value under various circumstances (e.g., failed recovery, or
715 * interrupted while waiting for recovery to finish).
717 * There must be a corresponding call to nfs4_end_op() to free up any locks
718 * or resources allocated by this call (assuming this call succeeded),
719 * using the same rsp that's passed in here.
721 * The open and lock seqid synchronization must be stopped before calling this
722 * function, as it could lead to deadlock when trying to reopen a file or
723 * reclaim a lock. The synchronization is obtained with calls to:
724 * nfs4_start_open_seqid_sync()
725 * nfs4_start_lock_seqid_sync()
727 * *startrecovp is set TRUE if the caller should not bother with the
728 * over-the-wire call, and just initiate recovery for the given request.
729 * This is typically used for state-releasing ops if the filesystem has
730 * been forcibly unmounted. startrecovp may be NULL for
731 * non-state-releasing ops.
735 nfs4_start_fop(mntinfo4_t
*mi
, vnode_t
*vp1
, vnode_t
*vp2
, nfs4_op_hint_t op
,
736 nfs4_recov_state_t
*rsp
, bool_t
*startrecovp
)
738 int error
= 0, rerr_cnt
;
739 nfs4_server_t
*sp
= NULL
;
741 nfs4_error_t e
= { 0, NFS4_OK
, RPC_SUCCESS
};
747 ASSERT(vp1
== NULL
|| vp1
->v_vfsp
== mi
->mi_vfsp
);
748 ASSERT(vp2
== NULL
|| vp2
->v_vfsp
== mi
->mi_vfsp
);
751 if ((fop_caller
= tsd_get(nfs4_tsd_key
)) != NULL
) {
752 cmn_err(CE_PANIC
, "Missing nfs4_end_fop: last caller %p",
755 (void) tsd_set(nfs4_tsd_key
, caller());
759 rsp
->rs_flags
&= ~NFS4_RS_RENAME_HELD
;
760 rerr_cnt
= rsp
->rs_num_retry_despite_err
;
763 * Process the items that may delay() based on server response
765 error
= nfs4_wait_for_grace(mi
, rsp
);
770 error
= nfs4_wait_for_delay(vp1
, rsp
);
775 /* Wait for a delegation recall to complete. */
777 error
= wait_for_recall(vp1
, vp2
, op
, rsp
);
782 * Wait for any current recovery actions to finish. Note that a
783 * recovery thread can still start up after wait_for_recovery()
784 * finishes. We don't block out recovery operations until we
785 * acquire s_recovlock and mi_recovlock.
787 error
= wait_for_recovery(mi
, op
);
792 * Check to see if the rnode is already marked with a
793 * recovery error. If so, return it immediately. But
794 * always pass CLOSE, LOCKU, and DELEGRETURN so we can
795 * clean up state on the server.
799 if (error
= nfs4_check_recov_err(vp1
, op
, rsp
, rerr_cnt
, "vp1"))
801 nfs4_check_remap(mi
, vp1
, NFS4_REMAP_CKATTRS
, &e
);
805 if (error
= nfs4_check_recov_err(vp2
, op
, rsp
, rerr_cnt
, "vp2"))
807 nfs4_check_remap(mi
, vp2
, NFS4_REMAP_CKATTRS
, &e
);
811 * The lock order calls for us to acquire s_recovlock before
812 * mi_recovlock, but we have to hold mi_recovlock to look up sp (to
813 * prevent races with the failover/migration code). So acquire
814 * mi_recovlock, look up sp, drop mi_recovlock, acquire
815 * s_recovlock and mi_recovlock, then verify that sp is still the
816 * right object. XXX Can we find a simpler way to deal with this?
818 if (nfs_rw_enter_sig(&mi
->mi_recovlock
, RW_READER
,
819 mi
->mi_flags
& MI4_INT
)) {
824 sp
= find_nfs4_server(mi
);
826 sp
->s_otw_call_count
++;
827 mutex_exit(&sp
->s_lock
);
828 droplock_cnt
= mi
->mi_srvset_cnt
;
830 nfs_rw_exit(&mi
->mi_recovlock
);
833 if (nfs_rw_enter_sig(&sp
->s_recovlock
, RW_READER
,
834 mi
->mi_flags
& MI4_INT
)) {
839 if (nfs_rw_enter_sig(&mi
->mi_recovlock
, RW_READER
,
840 mi
->mi_flags
& MI4_INT
)) {
842 nfs_rw_exit(&sp
->s_recovlock
);
847 * If the mntinfo4_t hasn't changed nfs4_server_ts then
848 * there's no point in double checking to make sure it
851 if (sp
== NULL
|| droplock_cnt
!= mi
->mi_srvset_cnt
) {
852 tsp
= find_nfs4_server(mi
);
856 mutex_exit(&tsp
->s_lock
);
857 nfs4_server_rele(tsp
);
861 nfs_rw_exit(&sp
->s_recovlock
);
862 mutex_enter(&sp
->s_lock
);
863 sp
->s_otw_call_count
--;
864 mutex_exit(&sp
->s_lock
);
865 nfs4_server_rele(sp
);
871 mutex_exit(&tsp
->s_lock
);
872 nfs4_server_rele(tsp
);
883 * If the filesystem uses volatile filehandles, obtain a lock so
884 * that we synchronize with renames. Exception: mount operations
885 * can change mi_fh_expire_type, which could be a problem, since
886 * the end_op code needs to be consistent with the start_op code
887 * about mi_rename_lock. Since mounts don't compete with renames,
888 * it's simpler to just not acquire the rename lock for mounts.
890 if (NFS4_VOLATILE_FH(mi
) && op
!= OH_MOUNT
) {
891 if (nfs_rw_enter_sig(&mi
->mi_rename_lock
,
892 op
== OH_VFH_RENAME
? RW_WRITER
: RW_READER
,
893 mi
->mi_flags
& MI4_INT
)) {
894 nfs_rw_exit(&mi
->mi_recovlock
);
896 nfs_rw_exit(&sp
->s_recovlock
);
900 rsp
->rs_flags
|= NFS4_RS_RENAME_HELD
;
903 if (OH_IS_STATE_RELE(op
)) {
905 * For forced unmount, letting the request proceed will
906 * almost always delay response to the user, so hand it off
907 * to the recovery thread. For exiting lwp's, we don't
908 * have a good way to tell if the request will hang. We
909 * generally want processes to handle their own requests so
910 * that they can be done in parallel, but if there is
911 * already a recovery thread, hand the request off to it.
912 * This will improve user response at no cost to overall
913 * system throughput. For zone shutdown, we'd prefer
914 * the recovery thread to handle this as well.
916 ASSERT(startrecovp
!= NULL
);
917 mutex_enter(&mi
->mi_lock
);
918 if (FS_OR_ZONE_GONE4(mi
->mi_vfsp
))
920 else if ((curthread
->t_proc_flag
& TP_LWPEXIT
) &&
921 (mi
->mi_flags
& MI4_RECOV_ACTIV
))
924 *startrecovp
= FALSE
;
925 mutex_exit(&mi
->mi_lock
);
927 if (startrecovp
!= NULL
)
928 *startrecovp
= FALSE
;
936 mutex_enter(&sp
->s_lock
);
937 sp
->s_otw_call_count
--;
938 mutex_exit(&sp
->s_lock
);
939 nfs4_server_rele(sp
);
942 nfs4_end_op_recall(vp1
, vp2
, rsp
);
945 (void) tsd_set(nfs4_tsd_key
, NULL
);
951 * It is up to the caller to determine if rsp->rs_sp being NULL
952 * is detrimental or not.
955 nfs4_start_op(mntinfo4_t
*mi
, vnode_t
*vp1
, vnode_t
*vp2
,
956 nfs4_recov_state_t
*rsp
)
958 ASSERT(rsp
->rs_num_retry_despite_err
== 0);
959 rsp
->rs_num_retry_despite_err
= 0;
960 return (nfs4_start_fop(mi
, vp1
, vp2
, OH_OTHER
, rsp
, NULL
));
964 * Release any resources acquired by nfs4_start_op().
965 * 'sp' should be the nfs4_server pointer returned by nfs4_start_op().
967 * The operation hint is used to avoid a deadlock by bypassing delegation
968 * return logic for writes, which are done while returning a delegation.
972 nfs4_end_fop(mntinfo4_t
*mi
, vnode_t
*vp1
, vnode_t
*vp2
, nfs4_op_hint_t op
,
973 nfs4_recov_state_t
*rsp
, bool_t needs_recov
)
975 nfs4_server_t
*sp
= rsp
->rs_sp
;
980 ASSERT(tsd_get(nfs4_tsd_key
) != NULL
);
981 (void) tsd_set(nfs4_tsd_key
, NULL
);
984 nfs4_end_op_recall(vp1
, vp2
, rsp
);
986 if (rsp
->rs_flags
& NFS4_RS_RENAME_HELD
)
987 nfs_rw_exit(&mi
->mi_rename_lock
);
990 if (rsp
->rs_flags
& NFS4_RS_DELAY_MSG
) {
991 /* may need to clear the delay interval */
994 mutex_enter(&rp
->r_statelock
);
995 rp
->r_delay_interval
= 0;
996 mutex_exit(&rp
->r_statelock
);
999 rsp
->rs_flags
&= ~(NFS4_RS_GRACE_MSG
|NFS4_RS_DELAY_MSG
);
1003 * If the corresponding nfs4_start_op() found a sp,
1004 * then there must still be a sp.
1007 nfs_rw_exit(&mi
->mi_recovlock
);
1008 nfs_rw_exit(&sp
->s_recovlock
);
1009 mutex_enter(&sp
->s_lock
);
1010 sp
->s_otw_call_count
--;
1011 cv_broadcast(&sp
->s_cv_otw_count
);
1012 mutex_exit(&sp
->s_lock
);
1013 nfs4_server_rele(sp
);
1015 nfs_rw_exit(&mi
->mi_recovlock
);
1020 nfs4_end_op(mntinfo4_t
*mi
, vnode_t
*vp1
, vnode_t
*vp2
,
1021 nfs4_recov_state_t
*rsp
, bool_t needrecov
)
1023 nfs4_end_fop(mi
, vp1
, vp2
, OH_OTHER
, rsp
, needrecov
);
1027 * If the filesystem is going through client recovery, block until
1030 * - state-releasing ops (CLOSE, LOCKU, DELEGRETURN) are allowed to proceed
1031 * if the filesystem has been forcibly unmounted or the lwp is exiting.
1035 * - EINTR if the call was interrupted
1036 * - EIO if the filesystem has been forcibly unmounted (non-state-releasing
1038 * - the errno value from the recovery thread, if recovery failed
1042 wait_for_recovery(mntinfo4_t
*mi
, nfs4_op_hint_t op_hint
)
1046 mutex_enter(&mi
->mi_lock
);
1048 while (mi
->mi_recovflags
!= 0) {
1049 klwp_t
*lwp
= ttolwp(curthread
);
1051 if ((mi
->mi_vfsp
->vfs_flag
& VFS_UNMOUNTED
) ||
1052 (mi
->mi_flags
& MI4_RECOV_FAIL
))
1054 if (OH_IS_STATE_RELE(op_hint
) &&
1055 (curthread
->t_proc_flag
& TP_LWPEXIT
))
1060 /* XXX - use different cv? */
1061 if (cv_wait_sig(&mi
->mi_failover_cv
, &mi
->mi_lock
) == 0) {
1071 if ((mi
->mi_vfsp
->vfs_flag
& VFS_UNMOUNTED
) &&
1072 !OH_IS_STATE_RELE(op_hint
)) {
1073 NFS4_DEBUG(nfs4_client_recov_debug
, (CE_NOTE
,
1074 "wait_for_recovery: forced unmount"));
1076 } else if (mi
->mi_flags
& MI4_RECOV_FAIL
) {
1077 NFS4_DEBUG(nfs4_client_recov_debug
, (CE_NOTE
,
1078 "wait_for_recovery: fail since RECOV FAIL"));
1079 error
= mi
->mi_error
;
1082 mutex_exit(&mi
->mi_lock
);
1088 * If the client received NFS4ERR_GRACE for this particular mount,
1089 * the client blocks here until it is time to try again.
1092 * - 0 if wait was successful
1093 * - EINTR if the call was interrupted
1097 nfs4_wait_for_grace(mntinfo4_t
*mi
, nfs4_recov_state_t
*rsp
)
1100 time_t curtime
, time_to_wait
;
1102 /* do an unprotected check to reduce mi_lock contention */
1103 if (mi
->mi_grace_wait
!= 0) {
1104 mutex_enter(&mi
->mi_lock
);
1106 if (mi
->mi_grace_wait
!= 0) {
1107 if (!(rsp
->rs_flags
& NFS4_RS_GRACE_MSG
))
1108 rsp
->rs_flags
|= NFS4_RS_GRACE_MSG
;
1110 curtime
= gethrestime_sec();
1112 if (curtime
< mi
->mi_grace_wait
) {
1114 time_to_wait
= mi
->mi_grace_wait
- curtime
;
1116 mutex_exit(&mi
->mi_lock
);
1118 ddi_sleep(time_to_wait
);
1120 curtime
= gethrestime_sec();
1122 mutex_enter(&mi
->mi_lock
);
1124 if (curtime
>= mi
->mi_grace_wait
)
1125 mi
->mi_grace_wait
= 0;
1127 mi
->mi_grace_wait
= 0;
1130 mutex_exit(&mi
->mi_lock
);
1137 * If the client received NFS4ERR_DELAY for an operation on a vnode,
1138 * the client blocks here until it is time to try again.
1141 * - 0 if wait was successful
1142 * - EINTR if the call was interrupted
1146 nfs4_wait_for_delay(vnode_t
*vp
, nfs4_recov_state_t
*rsp
)
1149 time_t curtime
, time_to_wait
;
1156 /* do an unprotected check to reduce r_statelock contention */
1157 if (rp
->r_delay_wait
!= 0) {
1158 mutex_enter(&rp
->r_statelock
);
1160 if (rp
->r_delay_wait
!= 0) {
1162 if (!(rsp
->rs_flags
& NFS4_RS_DELAY_MSG
)) {
1163 rsp
->rs_flags
|= NFS4_RS_DELAY_MSG
;
1164 nfs4_mi_kstat_inc_delay(VTOMI4(vp
));
1167 curtime
= gethrestime_sec();
1169 if (curtime
< rp
->r_delay_wait
) {
1171 time_to_wait
= rp
->r_delay_wait
- curtime
;
1173 mutex_exit(&rp
->r_statelock
);
1175 ddi_sleep(time_to_wait
);
1177 curtime
= gethrestime_sec();
1179 mutex_enter(&rp
->r_statelock
);
1181 if (curtime
>= rp
->r_delay_wait
)
1182 rp
->r_delay_wait
= 0;
1184 rp
->r_delay_wait
= 0;
1187 mutex_exit(&rp
->r_statelock
);
1194 * The recovery thread.
1198 nfs4_recov_thread(recov_info_t
*recovp
)
1200 mntinfo4_t
*mi
= recovp
->rc_mi
;
1202 int done
= 0, error
= 0;
1203 bool_t recov_fail
= FALSE
;
1204 callb_cpr_t cpr_info
;
1207 nfs4_queue_event(RE_START
, mi
, NULL
, mi
->mi_recovflags
,
1208 recovp
->rc_vp1
, recovp
->rc_vp2
, 0, NULL
, 0, TAG_NONE
, TAG_NONE
,
1211 mutex_init(&cpr_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
1212 CALLB_CPR_INIT(&cpr_info
, &cpr_lock
, callb_generic_cpr
, "nfsv4Recov");
1214 mutex_enter(&mi
->mi_lock
);
1215 mi
->mi_recovthread
= curthread
;
1216 mutex_exit(&mi
->mi_lock
);
1219 * We don't really need protection here against failover or
1220 * migration, since the current thread is the one that would make
1221 * any changes, but hold mi_recovlock anyway for completeness (and
1222 * to satisfy any ASSERTs).
1224 (void) nfs_rw_enter_sig(&mi
->mi_recovlock
, RW_READER
, 0);
1225 sp
= find_nfs4_server(mi
);
1227 mutex_exit(&sp
->s_lock
);
1228 nfs_rw_exit(&mi
->mi_recovlock
);
1231 * Do any necessary recovery, based on the information in recovp
1232 * and any recovery flags.
1236 mutex_enter(&mi
->mi_lock
);
1237 if (FS_OR_ZONE_GONE4(mi
->mi_vfsp
)) {
1240 NFS4_DEBUG(nfs4_client_recov_debug
&&
1241 mi
->mi_vfsp
->vfs_flag
& VFS_UNMOUNTED
, (CE_NOTE
,
1242 "nfs4_recov_thread: file system has been "
1244 NFS4_DEBUG(nfs4_client_recov_debug
&&
1245 zone_status_get(curproc
->p_zone
) >=
1246 ZONE_IS_SHUTTING_DOWN
, (CE_NOTE
,
1247 "nfs4_recov_thread: zone shutting down"));
1249 * If the server has lost its state for us and
1250 * the filesystem is unmounted, then the filesystem
1251 * can be tossed, even if there are lost lock or
1252 * lost state calls in the recovery queue.
1254 if (mi
->mi_recovflags
&
1255 (MI4R_NEED_CLIENTID
| MI4R_REOPEN_FILES
)) {
1256 NFS4_DEBUG(nfs4_client_recov_debug
, (CE_NOTE
,
1257 "nfs4_recov_thread: bailing out"));
1258 mi
->mi_flags
|= MI4_RECOV_FAIL
;
1259 mi
->mi_error
= recovp
->rc_error
;
1263 * We don't know if the server has any state for
1264 * us, and the filesystem has been unmounted. If
1265 * there are "lost state" recovery items, keep
1266 * trying to process them until there are no more
1267 * mounted filesystems for the server. Otherwise,
1268 * bail out. The reason we don't mark the
1269 * filesystem as failing recovery is in case we
1270 * have to do "lost state" recovery later (e.g., a
1271 * user process exits).
1273 if (!(mi
->mi_recovflags
& MI4R_LOST_STATE
)) {
1275 mutex_exit(&mi
->mi_lock
);
1278 mutex_exit(&mi
->mi_lock
);
1283 mutex_enter(&sp
->s_lock
);
1284 activesrv
= nfs4_fs_active(sp
);
1287 NFS4_DEBUG(nfs4_client_recov_debug
, (CE_NOTE
,
1288 "no active fs for server %p",
1290 mutex_enter(&mi
->mi_lock
);
1291 mi
->mi_flags
|= MI4_RECOV_FAIL
;
1292 mi
->mi_error
= recovp
->rc_error
;
1293 mutex_exit(&mi
->mi_lock
);
1297 * Mark the server instance as
1298 * dead, so that nobody will attach
1301 nfs4_mark_srv_dead(sp
);
1305 mutex_exit(&sp
->s_lock
);
1307 mutex_exit(&mi
->mi_lock
);
1311 * Check if we need to select a new server for a
1312 * failover. Choosing a new server will force at
1313 * least a check of the clientid.
1315 mutex_enter(&mi
->mi_lock
);
1317 (mi
->mi_recovflags
& MI4R_NEED_NEW_SERVER
)) {
1318 mutex_exit(&mi
->mi_lock
);
1319 recov_newserver(recovp
, &sp
, &recov_fail
);
1321 mutex_exit(&mi
->mi_lock
);
1324 * Check if we need to recover the clientid. This
1325 * must be done before file and lock recovery, and it
1326 * potentially affects the recovery threads for other
1327 * filesystems, so it gets special treatment.
1329 if (sp
!= NULL
&& recov_fail
== FALSE
) {
1330 mutex_enter(&sp
->s_lock
);
1331 if (!(sp
->s_flags
& N4S_CLIENTID_SET
)) {
1332 mutex_exit(&sp
->s_lock
);
1333 recov_clientid(recovp
, sp
);
1336 * Unset this flag in case another recovery
1337 * thread successfully recovered the clientid
1340 mutex_enter(&mi
->mi_lock
);
1341 mi
->mi_recovflags
&= ~MI4R_NEED_CLIENTID
;
1342 mutex_exit(&mi
->mi_lock
);
1343 mutex_exit(&sp
->s_lock
);
1348 * Check if we need to get the security information.
1350 mutex_enter(&mi
->mi_lock
);
1351 if ((mi
->mi_recovflags
& MI4R_NEED_SECINFO
) &&
1352 !(mi
->mi_flags
& MI4_RECOV_FAIL
)) {
1353 mutex_exit(&mi
->mi_lock
);
1354 (void) nfs_rw_enter_sig(&mi
->mi_recovlock
,
1356 error
= nfs4_secinfo_recov(recovp
->rc_mi
,
1357 recovp
->rc_vp1
, recovp
->rc_vp2
);
1359 * If error, nothing more can be done, stop
1363 mutex_enter(&mi
->mi_lock
);
1364 mi
->mi_flags
|= MI4_RECOV_FAIL
;
1365 mi
->mi_error
= recovp
->rc_error
;
1366 mutex_exit(&mi
->mi_lock
);
1367 nfs4_queue_event(RE_WRONGSEC
, mi
, NULL
,
1368 error
, recovp
->rc_vp1
, recovp
->rc_vp2
,
1369 0, NULL
, 0, TAG_NONE
, TAG_NONE
, 0, 0);
1371 nfs_rw_exit(&mi
->mi_recovlock
);
1373 mutex_exit(&mi
->mi_lock
);
1376 * Check if there's a bad seqid to recover.
1378 mutex_enter(&mi
->mi_lock
);
1379 if ((mi
->mi_recovflags
& MI4R_BAD_SEQID
) &&
1380 !(mi
->mi_flags
& MI4_RECOV_FAIL
)) {
1381 mutex_exit(&mi
->mi_lock
);
1382 (void) nfs_rw_enter_sig(&mi
->mi_recovlock
,
1384 recov_bad_seqid(recovp
);
1385 nfs_rw_exit(&mi
->mi_recovlock
);
1387 mutex_exit(&mi
->mi_lock
);
1390 * Next check for recovery that affects the entire
1394 mutex_enter(&mi
->mi_lock
);
1395 if ((mi
->mi_recovflags
& MI4R_REOPEN_FILES
) &&
1396 !(mi
->mi_flags
& MI4_RECOV_FAIL
)) {
1397 mutex_exit(&mi
->mi_lock
);
1398 recov_openfiles(recovp
, sp
);
1400 mutex_exit(&mi
->mi_lock
);
1404 * Send any queued state recovery requests.
1406 mutex_enter(&mi
->mi_lock
);
1408 (mi
->mi_recovflags
& MI4R_LOST_STATE
) &&
1409 !(mi
->mi_flags
& MI4_RECOV_FAIL
)) {
1410 mutex_exit(&mi
->mi_lock
);
1411 (void) nfs_rw_enter_sig(&mi
->mi_recovlock
,
1413 nfs4_resend_lost_rqsts(recovp
, sp
);
1414 if (list_head(&mi
->mi_lost_state
) == NULL
) {
1416 mutex_enter(&mi
->mi_lock
);
1417 mi
->mi_recovflags
&= ~MI4R_LOST_STATE
;
1418 mutex_exit(&mi
->mi_lock
);
1420 nfs_rw_exit(&mi
->mi_recovlock
);
1422 mutex_exit(&mi
->mi_lock
);
1426 * See if there is anything more to do. If not, announce
1427 * that we are done and exit.
1429 * Need mi_recovlock to keep 'sp' valid. Must grab
1430 * mi_recovlock before mi_lock to preserve lock ordering.
1432 (void) nfs_rw_enter_sig(&mi
->mi_recovlock
, RW_READER
, 0);
1433 mutex_enter(&mi
->mi_lock
);
1434 if ((mi
->mi_recovflags
& ~MI4R_SRV_REBOOT
) == 0 ||
1435 (mi
->mi_flags
& MI4_RECOV_FAIL
)) {
1436 list_t local_lost_state
;
1437 nfs4_lost_rqst_t
*lrp
;
1440 * We need to remove the lost requests before we
1441 * unmark the mi as no longer doing recovery to
1442 * avoid a race with a new thread putting new lost
1443 * requests on the same mi (and the going away
1444 * thread would remove the new lost requests).
1446 * Move the lost requests to a local list since
1447 * nfs4_remove_lost_rqst() drops mi_lock, and
1448 * dropping the mi_lock would make our check to
1449 * see if recovery is done no longer valid.
1451 list_create(&local_lost_state
,
1452 sizeof (nfs4_lost_rqst_t
),
1453 offsetof(nfs4_lost_rqst_t
, lr_node
));
1454 list_move_tail(&local_lost_state
, &mi
->mi_lost_state
);
1457 mutex_exit(&mi
->mi_lock
);
1459 * Now officially free the "moved"
1462 while ((lrp
= list_head(&local_lost_state
)) != NULL
) {
1463 list_remove(&local_lost_state
, lrp
);
1464 nfs4_free_lost_rqst(lrp
, sp
);
1466 list_destroy(&local_lost_state
);
1468 mutex_exit(&mi
->mi_lock
);
1469 nfs_rw_exit(&mi
->mi_recovlock
);
1472 * If the filesystem has been forcibly unmounted, there is
1473 * probably no point in retrying immediately. Furthermore,
1474 * there might be user processes waiting for a chance to
1475 * queue up "lost state" requests, so that they can exit.
1476 * So pause here for a moment. Same logic for zone shutdown.
1478 if (!done
&& FS_OR_ZONE_GONE4(mi
->mi_vfsp
)) {
1479 mutex_enter(&mi
->mi_lock
);
1480 cv_broadcast(&mi
->mi_failover_cv
);
1481 mutex_exit(&mi
->mi_lock
);
1482 ddi_sleep(nfs4_unmount_delay
);
1488 nfs4_server_rele(sp
);
1491 * Return all recalled delegations
1495 mutex_enter(&mi
->mi_lock
);
1496 recov_done(mi
, recovp
);
1497 mutex_exit(&mi
->mi_lock
);
1500 * Free up resources that were allocated for us.
1502 if (recovp
->rc_vp1
!= NULL
)
1503 VN_RELE(recovp
->rc_vp1
);
1504 if (recovp
->rc_vp2
!= NULL
)
1505 VN_RELE(recovp
->rc_vp2
);
1507 /* now we are done using the mi struct, signal the waiters */
1508 mutex_enter(&mi
->mi_lock
);
1509 mi
->mi_in_recovery
--;
1510 if (mi
->mi_in_recovery
== 0)
1511 cv_broadcast(&mi
->mi_cv_in_recov
);
1512 mutex_exit(&mi
->mi_lock
);
1514 VFS_RELE(mi
->mi_vfsp
);
1516 kmem_free(recovp
, sizeof (recov_info_t
));
1517 mutex_enter(&cpr_lock
);
1518 CALLB_CPR_EXIT(&cpr_info
);
1519 mutex_destroy(&cpr_lock
);
1524 * Log the end of recovery and notify any waiting threads.
1528 recov_done(mntinfo4_t
*mi
, recov_info_t
*recovp
)
1531 ASSERT(MUTEX_HELD(&mi
->mi_lock
));
1533 nfs4_queue_event(RE_END
, mi
, NULL
, 0, recovp
->rc_vp1
,
1534 recovp
->rc_vp2
, 0, NULL
, 0, TAG_NONE
, TAG_NONE
, 0, 0);
1535 mi
->mi_recovthread
= NULL
;
1536 mi
->mi_flags
&= ~MI4_RECOV_ACTIV
;
1537 mi
->mi_recovflags
&= ~MI4R_SRV_REBOOT
;
1538 cv_broadcast(&mi
->mi_failover_cv
);
1542 * State-specific recovery routines, by state.
1548 * Replaces *spp with a reference to the new server, which must
1549 * eventually be freed.
1553 recov_newserver(recov_info_t
*recovp
, nfs4_server_t
**spp
, bool_t
*recov_fail
)
1555 mntinfo4_t
*mi
= recovp
->rc_mi
;
1556 servinfo4_t
*svp
= NULL
;
1557 nfs4_server_t
*osp
= *spp
;
1559 enum clnt_stat status
;
1569 (void) nfs_rw_enter_sig(&mi
->mi_recovlock
, RW_WRITER
, 0);
1576 * Ping the null NFS procedure of every server in
1577 * the list until one responds. We always start
1578 * at the head of the list and always skip the one
1579 * that is current, since it's caused us a problem.
1581 while (svp
== NULL
) {
1582 for (svp
= mi
->mi_servers
; svp
; svp
= svp
->sv_next
) {
1584 mutex_enter(&mi
->mi_lock
);
1585 if (FS_OR_ZONE_GONE4(mi
->mi_vfsp
)) {
1586 mi
->mi_flags
|= MI4_RECOV_FAIL
;
1587 mutex_exit(&mi
->mi_lock
);
1588 (void) nfs_rw_exit(&mi
->mi_recovlock
);
1591 kmem_free(snames
, len
);
1594 mutex_exit(&mi
->mi_lock
);
1596 (void) nfs_rw_enter_sig(&svp
->sv_lock
, RW_READER
, 0);
1597 if (svp
->sv_flags
& SV4_NOTINUSE
) {
1598 nfs_rw_exit(&svp
->sv_lock
);
1601 nfs_rw_exit(&svp
->sv_lock
);
1603 if (!oncethru
&& svp
== mi
->mi_curr_serv
)
1606 error
= clnt_tli_kcreate(svp
->sv_knconf
, &svp
->sv_addr
,
1607 NFS_PROGRAM
, NFS_V4
, 0, 1, CRED(), &cl
);
1611 if (!(mi
->mi_flags
& MI4_INT
))
1612 cl
->cl_nosignal
= TRUE
;
1613 status
= CLNT_CALL(cl
, RFS_NULL
, xdr_void
, NULL
,
1614 xdr_void
, NULL
, tv
);
1615 if (!(mi
->mi_flags
& MI4_INT
))
1616 cl
->cl_nosignal
= FALSE
;
1617 AUTH_DESTROY(cl
->cl_auth
);
1619 if (status
== RPC_SUCCESS
) {
1620 nfs4_queue_event(RE_FAILOVER
, mi
,
1621 svp
== mi
->mi_curr_serv
? NULL
:
1622 svp
->sv_hostname
, 0, NULL
, NULL
, 0,
1623 NULL
, 0, TAG_NONE
, TAG_NONE
, 0, 0);
1630 snames
= nfs4_getsrvnames(mi
, &len
);
1631 nfs4_queue_fact(RF_SRVS_NOT_RESPOND
, mi
,
1632 0, 0, 0, FALSE
, snames
, 0, NULL
);
1640 nfs4_queue_fact(RF_SRVS_OK
, mi
, 0, 0, 0, FALSE
, snames
,
1642 kmem_free(snames
, len
);
1646 (void) nfs_rw_enter_sig(&svp
->sv_lock
, RW_READER
, 0);
1647 ASSERT((svp
->sv_flags
& SV4_NOTINUSE
) == 0);
1648 nfs_rw_exit(&svp
->sv_lock
);
1651 mutex_enter(&mi
->mi_lock
);
1652 mi
->mi_recovflags
&= ~MI4R_NEED_NEW_SERVER
;
1653 if (svp
!= mi
->mi_curr_serv
) {
1654 servinfo4_t
*osvp
= mi
->mi_curr_serv
;
1656 mutex_exit(&mi
->mi_lock
);
1659 * Update server-dependent fields in the root vnode.
1661 index
= rtable4hash(mi
->mi_rootfh
);
1662 rw_enter(&rtable4
[index
].r_lock
, RW_WRITER
);
1664 rp
= r4find(&rtable4
[index
], mi
->mi_rootfh
, mi
->mi_vfsp
);
1666 NFS4_DEBUG(nfs4_client_failover_debug
, (CE_NOTE
,
1667 "recov_newserver: remapping %s", rnode4info(rp
)));
1668 mutex_enter(&rp
->r_statelock
);
1670 PURGE_ATTRCACHE4_LOCKED(rp
);
1671 mutex_exit(&rp
->r_statelock
);
1672 (void) nfs4_free_data_reclaim(rp
);
1673 nfs4_purge_rddir_cache(RTOV4(rp
));
1674 rw_exit(&rtable4
[index
].r_lock
);
1675 NFS4_DEBUG(nfs4_client_failover_debug
, (CE_NOTE
,
1676 "recov_newserver: done with %s",
1680 rw_exit(&rtable4
[index
].r_lock
);
1681 (void) dnlc_purge_vfsp(mi
->mi_vfsp
, 0);
1683 mutex_enter(&mi
->mi_lock
);
1684 mi
->mi_recovflags
|= MI4R_REOPEN_FILES
| MI4R_REMAP_FILES
;
1685 if (recovp
->rc_srv_reboot
)
1686 mi
->mi_recovflags
|= MI4R_SRV_REBOOT
;
1687 mi
->mi_curr_serv
= svp
;
1689 mi
->mi_flags
&= ~MI4_BADOWNER_DEBUG
;
1690 mutex_exit(&mi
->mi_lock
);
1692 (void) nfs_rw_enter_sig(&svp
->sv_lock
, RW_READER
, 0);
1693 fh
.nfs_fh4_len
= svp
->sv_fhandle
.fh_len
;
1694 fh
.nfs_fh4_val
= svp
->sv_fhandle
.fh_buf
;
1695 sfh4_update(mi
->mi_rootfh
, &fh
);
1696 fh
.nfs_fh4_len
= svp
->sv_pfhandle
.fh_len
;
1697 fh
.nfs_fh4_val
= svp
->sv_pfhandle
.fh_buf
;
1698 sfh4_update(mi
->mi_srvparentfh
, &fh
);
1699 nfs_rw_exit(&svp
->sv_lock
);
1701 *spp
= nfs4_move_mi(mi
, osvp
, svp
);
1703 nfs4_server_rele(osp
);
1705 mutex_exit(&mi
->mi_lock
);
1706 (void) nfs_rw_exit(&mi
->mi_recovlock
);
1714 recov_clientid(recov_info_t
*recovp
, nfs4_server_t
*sp
)
1716 mntinfo4_t
*mi
= recovp
->rc_mi
;
1724 * Acquire the recovery lock and then verify that the clientid
1725 * still needs to be recovered. (Note that s_recovlock is supposed
1726 * to be acquired before s_lock.) Since the thread holds the
1727 * recovery lock, no other thread will recover the clientid.
1729 (void) nfs_rw_enter_sig(&sp
->s_recovlock
, RW_WRITER
, 0);
1730 (void) nfs_rw_enter_sig(&mi
->mi_recovlock
, RW_WRITER
, 0);
1731 mutex_enter(&sp
->s_lock
);
1732 still_stale
= ((sp
->s_flags
& N4S_CLIENTID_SET
) == 0);
1733 mutex_exit(&sp
->s_lock
);
1738 nfs4_error_zinit(&n4e
);
1739 nfs4setclientid(mi
, kcred
, TRUE
, &n4e
);
1744 * nfs4setclientid may have set MI4R_NEED_NEW_SERVER,
1745 * if so, just return and let recov_thread drive
1748 mutex_enter(&mi
->mi_lock
);
1749 need_new_s
= mi
->mi_recovflags
& MI4R_NEED_NEW_SERVER
;
1750 mutex_exit(&mi
->mi_lock
);
1753 nfs_rw_exit(&mi
->mi_recovlock
);
1754 nfs_rw_exit(&sp
->s_recovlock
);
1758 nfs4_queue_event(RE_CLIENTID
, mi
, NULL
, n4e
.error
, NULL
,
1759 NULL
, n4e
.stat
, NULL
, 0, TAG_NONE
, TAG_NONE
, 0, 0);
1760 mutex_enter(&mi
->mi_lock
);
1761 mi
->mi_flags
|= MI4_RECOV_FAIL
;
1762 mi
->mi_error
= recovp
->rc_error
;
1763 mutex_exit(&mi
->mi_lock
);
1764 /* don't destroy the nfs4_server, let umount do it */
1769 mutex_enter(&mi
->mi_lock
);
1770 mi
->mi_recovflags
&= ~MI4R_NEED_CLIENTID
;
1772 * If still_stale isn't true, then another thread already
1773 * recovered the clientid. And that thread that set the
1774 * clientid will have initiated reopening files on all the
1775 * filesystems for the server, so we should not initiate
1776 * reopening for this filesystem here.
1779 mi
->mi_recovflags
|= MI4R_REOPEN_FILES
;
1780 if (recovp
->rc_srv_reboot
)
1781 mi
->mi_recovflags
|= MI4R_SRV_REBOOT
;
1783 mutex_exit(&mi
->mi_lock
);
1786 nfs_rw_exit(&mi
->mi_recovlock
);
1789 nfs_rw_exit(&sp
->s_recovlock
);
1790 mutex_enter(&mi
->mi_lock
);
1791 if ((mi
->mi_flags
& MI4_RECOV_FAIL
) == 0)
1792 ddi_sleep(recov_err_delay
);
1793 mutex_exit(&mi
->mi_lock
);
1795 mntinfo4_t
**milist
;
1800 * Initiate recovery of open files for other filesystems.
1801 * We create an array of filesystems, rather than just
1802 * walking the filesystem list, to avoid deadlock issues
1803 * with s_lock and mi_recovlock.
1805 milist
= make_milist(sp
, &nummi
);
1806 for (i
= 0; i
< nummi
; i
++) {
1809 (void) nfs_rw_enter_sig(&tmi
->mi_recovlock
,
1811 start_recovery_action(NR_OPENFILES
, TRUE
, tmi
,
1813 nfs_rw_exit(&tmi
->mi_recovlock
);
1816 free_milist(milist
, nummi
);
1818 nfs_rw_exit(&sp
->s_recovlock
);
1823 * Return an array of filesystems associated with the given server. The
1824 * caller should call free_milist() to free the references and memory.
1827 static mntinfo4_t
**
1828 make_milist(nfs4_server_t
*sp
, int *nummip
)
1831 mntinfo4_t
**milist
;
1834 mutex_enter(&sp
->s_lock
);
1836 for (tmi
= sp
->mntinfo4_list
; tmi
!= NULL
; tmi
= tmi
->mi_clientid_next
)
1839 milist
= kmem_alloc(nummi
* sizeof (mntinfo4_t
*), KM_SLEEP
);
1841 for (i
= 0, tmi
= sp
->mntinfo4_list
; tmi
!= NULL
; i
++,
1842 tmi
= tmi
->mi_clientid_next
) {
1844 VFS_HOLD(tmi
->mi_vfsp
);
1846 mutex_exit(&sp
->s_lock
);
1853 * Free the filesystem list created by make_milist().
1857 free_milist(mntinfo4_t
**milist
, int nummi
)
1862 for (i
= 0; i
< nummi
; i
++) {
1864 VFS_RELE(tmi
->mi_vfsp
);
1866 kmem_free(milist
, nummi
* sizeof (mntinfo4_t
*));
1874 * Lookup the filehandle for the given vnode and update the rnode if it has
1878 * - if the filehandle could not be updated because of an error that
1879 * requires further recovery, initiate that recovery and return.
1880 * - if the filehandle could not be updated because of a signal, pretend we
1881 * succeeded and let someone else deal with it.
1882 * - if the filehandle could not be updated and the filesystem has been
1883 * forcibly unmounted, pretend we succeeded, and let the caller deal with
1884 * the forced unmount (to retry or not to retry, that is the question).
1885 * - if the filehandle could not be updated because of some other error,
1886 * mark the rnode bad and return.
1889 recov_filehandle(nfs4_recov_t action
, mntinfo4_t
*mi
, vnode_t
*vp
)
1891 rnode4_t
*rp
= VTOR4(vp
);
1892 nfs4_error_t e
= { 0, NFS4_OK
, RPC_SUCCESS
};
1895 mutex_enter(&rp
->r_statelock
);
1897 if (rp
->r_flags
& R4RECOVERR
) {
1898 mutex_exit(&rp
->r_statelock
);
1903 * If someone else is updating the filehandle, wait for them to
1904 * finish and then let our caller retry.
1906 if (rp
->r_flags
& R4RECEXPFH
) {
1907 while (rp
->r_flags
& R4RECEXPFH
) {
1908 cv_wait(&rp
->r_cv
, &rp
->r_statelock
);
1910 mutex_exit(&rp
->r_statelock
);
1913 rp
->r_flags
|= R4RECEXPFH
;
1914 mutex_exit(&rp
->r_statelock
);
1916 if (action
== NR_BADHANDLE
) {
1917 /* shouldn't happen */
1918 nfs4_queue_event(RE_BADHANDLE
, mi
, NULL
, 0,
1919 vp
, NULL
, 0, NULL
, 0, TAG_NONE
, TAG_NONE
, 0, 0);
1922 nfs4_remap_file(mi
, vp
, 0, &e
);
1923 needrecov
= nfs4_needs_recovery(&e
, FALSE
, mi
->mi_vfsp
);
1926 * If we get BADHANDLE, FHEXPIRED or STALE in their handler,
1927 * something is broken. Don't try to recover, just mark the
1930 DTRACE_PROBE2(recov__filehandle
, nfs4_error_t
, &e
, vnode_t
, vp
);
1934 case NFS4ERR_BADHANDLE
:
1935 case NFS4ERR_FHEXPIRED
:
1937 goto norec
; /* Unrecoverable errors */
1942 (void) nfs4_start_recovery(&e
, mi
, vp
, NULL
,
1943 NULL
, NULL
, OP_LOOKUP
, NULL
, NULL
, NULL
);
1945 } else if (e
.error
!= EINTR
&&
1946 !NFS4_FRC_UNMT_ERR(e
.error
, mi
->mi_vfsp
) &&
1947 (e
.error
!= 0 || e
.stat
!= NFS4_OK
)) {
1948 nfs4_recov_fh_fail(vp
, e
.error
, e
.stat
);
1950 * Don't set r_error to ESTALE. Higher-level code (e.g.,
1951 * cstatat_getvp()) retries on ESTALE, which would cause
1956 mutex_enter(&rp
->r_statelock
);
1957 rp
->r_flags
&= ~R4RECEXPFH
;
1958 cv_broadcast(&rp
->r_cv
);
1959 mutex_exit(&rp
->r_statelock
);
1967 * A stale filehandle can happen when an individual file has
1968 * been removed, or when an entire filesystem has been taken
1969 * offline. To distinguish these cases, we do this:
1970 * - if a GETATTR with the current filehandle is okay, we do
1971 * nothing (this can happen with two-filehandle ops)
1972 * - if the GETATTR fails, but a GETATTR of the root filehandle
1973 * succeeds, mark the rnode with R4STALE, which will stop use
1974 * - if the GETATTR fails, and a GETATTR of the root filehandle
1975 * also fails, we consider the problem filesystem-wide, so:
1976 * - if we can failover, we should
1977 * - if we can't failover, we should mark both the original
1978 * vnode and the root bad
1981 recov_stale(mntinfo4_t
*mi
, vnode_t
*vp
)
1983 rnode4_t
*rp
= VTOR4(vp
);
1984 vnode_t
*rootvp
= NULL
;
1985 nfs4_error_t e
= { 0, NFS4_OK
, RPC_SUCCESS
};
1987 char *fail_msg
= "failed to recover from NFS4ERR_STALE";
1990 mutex_enter(&rp
->r_statelock
);
1992 if (rp
->r_flags
& R4RECOVERR
) {
1993 mutex_exit(&rp
->r_statelock
);
1994 NFS4_DEBUG(nfs4_client_recov_debug
, (CE_NOTE
,
1995 "recov_stale: already marked dead, rp %s",
2000 if (rp
->r_flags
& R4STALE
) {
2001 mutex_exit(&rp
->r_statelock
);
2002 NFS4_DEBUG(nfs4_client_recov_debug
, (CE_NOTE
,
2003 "recov_stale: already marked stale, rp %s",
2008 mutex_exit(&rp
->r_statelock
);
2010 /* Try a GETATTR on this vnode */
2011 nfs4_getattr_otw_norecovery(vp
, &gar
, &e
, CRED(), 0);
2014 * Handle non-STALE recoverable errors
2016 needrecov
= nfs4_needs_recovery(&e
, FALSE
, vp
->v_vfsp
);
2021 case NFS4ERR_BADHANDLE
:
2022 goto norec
; /* Unrecoverable */
2027 (void) nfs4_start_recovery(&e
, mi
, vp
, NULL
,
2028 NULL
, NULL
, OP_GETATTR
, NULL
, NULL
, NULL
);
2032 /* Are things OK for this vnode? */
2033 if (!e
.error
&& e
.stat
== NFS4_OK
) {
2034 NFS4_DEBUG(nfs4_client_recov_debug
, (CE_NOTE
,
2035 "recov_stale: file appears fine, rp %s",
2040 /* Did we get an unrelated non-recoverable error? */
2041 if (e
.error
|| e
.stat
!= NFS4ERR_STALE
) {
2042 nfs4_fail_recov(vp
, fail_msg
, e
.error
, e
.stat
);
2043 NFS4_DEBUG(nfs4_client_recov_debug
, (CE_NOTE
,
2044 "recov_stale: unrelated fatal error, rp %s",
2050 * If we don't appear to be dealing with the root node, find it.
2052 if ((vp
->v_flag
& VROOT
) == 0) {
2053 nfs4_error_zinit(&e
);
2054 e
.error
= VFS_ROOT(vp
->v_vfsp
, &rootvp
);
2056 nfs4_fail_recov(vp
, fail_msg
, 0, NFS4ERR_STALE
);
2057 NFS4_DEBUG(nfs4_client_recov_debug
, (CE_NOTE
,
2058 "recov_stale: can't find root node for rp %s",
2064 /* Try a GETATTR on the root vnode */
2065 if (rootvp
!= NULL
) {
2066 nfs4_error_zinit(&e
);
2067 nfs4_getattr_otw_norecovery(rootvp
, &gar
, &e
, CRED(), 0);
2069 needrecov
= nfs4_needs_recovery(&e
, FALSE
, vp
->v_vfsp
);
2074 case NFS4ERR_BADHANDLE
:
2075 goto unrec
; /* Unrecoverable */
2080 (void) nfs4_start_recovery(&e
, mi
, rootvp
, NULL
,
2081 NULL
, NULL
, OP_GETATTR
, NULL
, NULL
, NULL
);
2085 * Check to see if a failover attempt is warranted
2086 * NB: nfs4_try_failover doesn't check for STALE
2087 * because recov_stale gets a shot first. Now that
2088 * recov_stale has failed, go ahead and try failover.
2090 * If the getattr on the root filehandle was successful,
2091 * then mark recovery as failed for 'vp' and exit.
2093 if (nfs4_try_failover(&e
) == 0 && e
.stat
!= NFS4ERR_STALE
) {
2095 * pass the original error to fail_recov, not
2096 * the one from trying the root vnode.
2098 nfs4_fail_recov(vp
, fail_msg
, 0, NFS4ERR_STALE
);
2099 NFS4_DEBUG(nfs4_client_recov_debug
, (CE_NOTE
,
2100 "recov_stale: root node OK, marking "
2101 "dead rp %s", rnode4info(rp
)));
2107 * Here, we know that both the original file and the
2108 * root filehandle (which may be the same) are stale.
2109 * We want to fail over if we can, and if we can't, we
2110 * want to mark everything in sight bad.
2112 if (FAILOVER_MOUNT4(mi
)) {
2113 mutex_enter(&mi
->mi_lock
);
2114 mi
->mi_recovflags
|= MI4R_NEED_NEW_SERVER
;
2115 NFS4_DEBUG(nfs4_client_recov_debug
, (CE_NOTE
,
2116 "recov_stale: failing over due to rp %s",
2118 mutex_exit(&mi
->mi_lock
);
2124 * Can't fail over, so mark things dead.
2126 * If rootvp is set, we know we have a distinct
2127 * non-root vnode which can be marked dead in
2130 * Then we want to mark the root vnode dead.
2131 * Note that if rootvp wasn't set, our vp is
2132 * actually the root vnode.
2134 if (rootvp
!= NULL
) {
2135 NFS4_DEBUG(nfs4_client_recov_debug
, (CE_NOTE
,
2136 "recov_stale: can't fail over, marking dead rp %s",
2138 nfs4_fail_recov(vp
, fail_msg
, 0, NFS4ERR_STALE
);
2145 * Mark root dead, but quietly - since
2146 * the root rnode is frequently recreated,
2147 * we can encounter this at every access.
2148 * Also mark recovery as failed on this VFS.
2150 rootrp
= VTOR4(rootvp
);
2151 NFS4_DEBUG(nfs4_client_recov_debug
, (CE_CONT
,
2152 "recov_stale: marking dead root rp %s",
2153 rnode4info(rootrp
)));
2154 mutex_enter(&rootrp
->r_statelock
);
2155 rootrp
->r_flags
|= (R4RECOVERR
| R4STALE
);
2156 rootrp
->r_error
= ESTALE
;
2157 mutex_exit(&rootrp
->r_statelock
);
2158 mutex_enter(&mi
->mi_lock
);
2159 mi
->mi_error
= ESTALE
;
2160 mutex_exit(&mi
->mi_lock
);
2162 svp
= mi
->mi_curr_serv
;
2163 (void) nfs_rw_enter_sig(&svp
->sv_lock
, RW_WRITER
, 0);
2164 svp
->sv_flags
|= SV4_ROOT_STALE
;
2165 nfs_rw_exit(&svp
->sv_lock
);
2178 * Reclaim all the active (acquired) locks for the given file.
2179 * If a process lost a lock, the process is sent a SIGLOST. This is not
2180 * considered an error.
2183 * Errors and status are returned via the nfs4_error_t parameter
2184 * If an error indicates that recovery is needed, the caller is responsible
2185 * for dealing with it.
2189 relock_file(vnode_t
*vp
, mntinfo4_t
*mi
, nfs4_error_t
*ep
,
2190 fattr4_change pre_change
)
2192 locklist_t
*locks
, *llp
;
2196 nfs4_error_zinit(ep
);
2198 if (VTOMI4(vp
)->mi_flags
& MI4_LLOCK
)
2201 nfs4_flush_lock_owners(VTOR4(vp
));
2204 * If we get an error that requires recovery actions, just bail out
2205 * and let the top-level recovery code handle it.
2207 * If we get some other error, kill the process that owned the lock
2208 * and mark its remaining locks (if any) as belonging to NOPID, so
2209 * that we don't make any more reclaim requests for that process.
2213 locks
= flk_active_locks_for_vp(vp
);
2214 for (llp
= locks
; llp
!= NULL
; llp
= llp
->ll_next
) {
2215 int did_reclaim
= 1;
2217 ASSERT(llp
->ll_vp
== vp
);
2218 if (llp
->ll_flock
.l_pid
== NOPID
)
2220 reclaim_one_lock(vp
, &llp
->ll_flock
, ep
, &did_reclaim
);
2222 * If we need to restart recovery, stop processing the
2223 * list. Some errors would be recoverable under other
2224 * circumstances, but if they happen here we just give up
2227 if (nfs4_needs_recovery(ep
, TRUE
, vp
->v_vfsp
)) {
2230 if (!nfs4_recov_marks_dead(ep
->stat
))
2234 * In case the server isn't offering us a grace period, or
2235 * if we missed it, we might have opened & locked from scratch,
2236 * rather than reopened/reclaimed.
2237 * We need to ensure that the object hadn't been otherwise
2238 * changed during this time, by comparing the changeinfo.
2239 * We get passed the changeinfo from before the reopen by our
2240 * caller, in pre_change.
2241 * The changeinfo from after the reopen is in rp->r_change,
2242 * courtesy of the GETATTR in the reopen.
2243 * If they're different, then the file has changed, and we
2244 * have to SIGLOST the app.
2246 if (ep
->error
== 0 && ep
->stat
== NFS4_OK
&& !did_reclaim
) {
2247 mutex_enter(&rp
->r_statelock
);
2248 if (pre_change
!= rp
->r_change
)
2249 ep
->stat
= NFS4ERR_NO_GRACE
;
2250 mutex_exit(&rp
->r_statelock
);
2252 if (ep
->error
!= 0 || ep
->stat
!= NFS4_OK
) {
2254 nfs4_queue_event(RE_FAIL_RELOCK
, mi
,
2255 NULL
, ep
->error
, vp
, NULL
, 0, NULL
,
2256 llp
->ll_flock
.l_pid
, TAG_NONE
, TAG_NONE
,
2259 nfs4_queue_event(RE_FAIL_RELOCK
, mi
,
2260 NULL
, 0, vp
, NULL
, ep
->stat
, NULL
,
2261 llp
->ll_flock
.l_pid
, TAG_NONE
, TAG_NONE
,
2263 nfs4_send_siglost(llp
->ll_flock
.l_pid
, mi
, vp
, TRUE
,
2264 ep
->error
, ep
->stat
);
2265 relock_skip_pid(vp
, llp
, llp
->ll_flock
.l_pid
);
2267 /* Reinitialize the nfs4_error and continue */
2268 nfs4_error_zinit(ep
);
2273 flk_free_locklist(locks
);
2277 * Reclaim the given lock.
2279 * Errors are returned via the nfs4_error_t parameter.
2282 reclaim_one_lock(vnode_t
*vp
, flock64_t
*flk
, nfs4_error_t
*ep
,
2286 rnode4_t
*rp
= VTOR4(vp
);
2288 cr
= pid_to_cr(flk
->l_pid
);
2290 nfs4_error_init(ep
, ESRCH
);
2295 mutex_enter(&rp
->r_statelock
);
2296 if (rp
->r_flags
& R4RECOVERR
) {
2297 mutex_exit(&rp
->r_statelock
);
2298 nfs4_error_init(ep
, ESTALE
);
2301 mutex_exit(&rp
->r_statelock
);
2303 nfs4frlock(NFS4_LCK_CTYPE_RECLAIM
, vp
, F_SETLK
, flk
,
2304 FREAD
|FWRITE
, 0, cr
, ep
, NULL
, did_reclaimp
);
2305 if (ep
->error
== 0 && ep
->stat
== NFS4ERR_FHEXPIRED
)
2306 start_recovery_action(NR_FHEXPIRED
, TRUE
, VTOMI4(vp
),
2308 } while (ep
->error
== 0 && ep
->stat
== NFS4ERR_FHEXPIRED
);
2318 * Verifies if the nfsstat4 is a valid error for marking this vnode dead.
2319 * Returns 1 if the error is valid; 0 otherwise.
2322 nfs4_valid_recov_err_for_vp(vnode_t
*vp
, nfsstat4 stat
)
2325 * We should not be marking non-regular files as dead,
2326 * except in very rare cases (eg: BADHANDLE or NFS4ERR_BADNAME).
2328 if (vp
->v_type
!= VREG
&& stat
!= NFS4ERR_BADHANDLE
&&
2329 stat
!= NFS4ERR_BADNAME
)
2336 * Failed attempting to recover a filehandle. If 'stat' is valid for 'vp',
2337 * then mark the object dead. Since we've had to do a lookup for
2338 * filehandle recovery, we will mark the object dead if we got NOENT.
2341 nfs4_recov_fh_fail(vnode_t
*vp
, int error
, nfsstat4 stat
)
2345 if ((error
== 0) && (stat
!= NFS4ERR_NOENT
) &&
2346 (!nfs4_valid_recov_err_for_vp(vp
, stat
)))
2349 nfs4_fail_recov(vp
, "can't recover filehandle", error
, stat
);
2353 * Recovery from a "shouldn't happen" error. In the long term, we'd like
2354 * to mark only the data structure(s) that provided the bad value as being
2355 * bad. But for now we'll just mark the entire file.
2359 recov_badstate(recov_info_t
*recovp
, vnode_t
*vp
, nfsstat4 stat
)
2362 recov_throttle(recovp
, vp
);
2364 if (!nfs4_valid_recov_err_for_vp(vp
, stat
))
2367 nfs4_fail_recov(vp
, "", 0, stat
);
2371 * Free up the information saved for a lost state request.
2374 nfs4_free_lost_rqst(nfs4_lost_rqst_t
*lrp
, nfs4_server_t
*sp
)
2377 nfs4_open_stream_t
*osp
;
2380 NFS4_DEBUG(nfs4_lost_rqst_debug
,
2381 (CE_NOTE
, "nfs4_free_lost_rqst:"));
2383 switch (lrp
->lr_op
) {
2385 filep
= &lrp
->lr_ofile
;
2386 if (filep
->utf8string_val
) {
2387 kmem_free(filep
->utf8string_val
, filep
->utf8string_len
);
2388 filep
->utf8string_val
= NULL
;
2391 case OP_DELEGRETURN
:
2392 nfs4delegreturn_cleanup(VTOR4(lrp
->lr_vp
), sp
);
2396 ASSERT(osp
!= NULL
);
2397 mutex_enter(&osp
->os_sync_lock
);
2399 if (osp
->os_pending_close
) {
2400 /* clean up the open file state. */
2401 osp
->os_pending_close
= 0;
2402 nfs4close_notw(lrp
->lr_vp
, osp
, &have_sync_lock
);
2405 mutex_exit(&osp
->os_sync_lock
);
2410 if (lrp
->lr_oop
!= NULL
) {
2411 open_owner_rele(lrp
->lr_oop
);
2414 if (lrp
->lr_osp
!= NULL
) {
2415 open_stream_rele(lrp
->lr_osp
, VTOR4(lrp
->lr_vp
));
2418 if (lrp
->lr_lop
!= NULL
) {
2419 lock_owner_rele(lrp
->lr_lop
);
2422 if (lrp
->lr_flk
!= NULL
) {
2423 kmem_free(lrp
->lr_flk
, sizeof (flock64_t
));
2426 if (lrp
->lr_vp
!= NULL
) {
2427 VN_RELE(lrp
->lr_vp
);
2430 if (lrp
->lr_dvp
!= NULL
) {
2431 VN_RELE(lrp
->lr_dvp
);
2434 if (lrp
->lr_cr
!= NULL
) {
2439 kmem_free(lrp
, sizeof (nfs4_lost_rqst_t
));
2443 * Remove any lost state requests and free them.
2446 nfs4_remove_lost_rqsts(mntinfo4_t
*mi
, nfs4_server_t
*sp
)
2448 nfs4_lost_rqst_t
*lrp
;
2450 mutex_enter(&mi
->mi_lock
);
2451 while ((lrp
= list_head(&mi
->mi_lost_state
)) != NULL
) {
2452 list_remove(&mi
->mi_lost_state
, lrp
);
2453 mutex_exit(&mi
->mi_lock
);
2454 nfs4_free_lost_rqst(lrp
, sp
);
2455 mutex_enter(&mi
->mi_lock
);
2457 mutex_exit(&mi
->mi_lock
);
2461 * Reopen all the files for the given filesystem and reclaim any locks.
2465 recov_openfiles(recov_info_t
*recovp
, nfs4_server_t
*sp
)
2467 mntinfo4_t
*mi
= recovp
->rc_mi
;
2468 nfs4_opinst_t
*reopenlist
= NULL
, *rep
;
2469 nfs4_error_t e
= { 0, NFS4_OK
, RPC_SUCCESS
};
2470 open_claim_type4 claim
;
2472 char *fail_msg
= "No such file or directory on replica";
2474 fattr4_change pre_change
;
2479 * This check is to allow a 10ms pause before we reopen files;
2480 * it should allow the server time to have received the CB_NULL
2481 * reply and update its internal structures such that (if
2482 * applicable) we are granted a delegation on reopened files.
2484 mutex_enter(&sp
->s_lock
);
2485 if ((sp
->s_flags
& (N4S_CB_PINGED
| N4S_CB_WAITER
)) == 0) {
2486 sp
->s_flags
|= N4S_CB_WAITER
;
2487 (void) cv_reltimedwait(&sp
->wait_cb_null
, &sp
->s_lock
,
2488 drv_usectohz(N4S_CB_PAUSE_TIME
), TR_CLOCK_TICK
);
2490 mutex_exit(&sp
->s_lock
);
2492 (void) nfs_rw_enter_sig(&sp
->s_recovlock
, RW_READER
, 0);
2493 (void) nfs_rw_enter_sig(&mi
->mi_recovlock
, RW_WRITER
, 0);
2495 if (NFS4_VOLATILE_FH(mi
)) {
2496 nfs4_remap_root(mi
, &e
, 0);
2497 if (nfs4_needs_recovery(&e
, FALSE
, mi
->mi_vfsp
)) {
2498 (void) nfs4_start_recovery(&e
, mi
, NULL
,
2499 NULL
, NULL
, NULL
, OP_LOOKUP
, NULL
, NULL
, NULL
);
2503 mutex_enter(&mi
->mi_lock
);
2504 if (recovp
->rc_srv_reboot
|| (mi
->mi_recovflags
& MI4R_SRV_REBOOT
))
2505 claim
= CLAIM_PREVIOUS
;
2508 mutex_exit(&mi
->mi_lock
);
2510 if (e
.error
== 0 && e
.stat
== NFS4_OK
) {
2512 * Get a snapshot of open files in the filesystem. Note
2513 * that new opens will stall until the server's grace
2516 reopenlist
= r4mkopenlist(mi
);
2518 mutex_enter(&mi
->mi_lock
);
2519 remap
= mi
->mi_recovflags
& MI4R_REMAP_FILES
;
2520 mutex_exit(&mi
->mi_lock
);
2522 * Since we are re-establishing state on the
2523 * server, it's ok to blow away the saved lost
2524 * requests since we don't need to reissue them.
2526 nfs4_remove_lost_rqsts(mi
, sp
);
2528 for (rep
= reopenlist
; rep
; rep
= rep
->re_next
) {
2531 nfs4_remap_file(mi
, rep
->re_vp
,
2532 NFS4_REMAP_CKATTRS
, &e
);
2534 DTRACE_PROBE2(recov__openfiles
, nfs4_error_t
, &e
,
2535 vnode_t
, rep
->re_vp
);
2536 if (e
.error
== ENOENT
|| e
.stat
== NFS4ERR_NOENT
) {
2538 * The current server does not have the file
2539 * that is to be remapped. This is most
2540 * likely due to an improperly maintained
2541 * replica. The files that are missing from
2542 * the server will be marked dead and logged
2543 * in order to make sys admins aware of the
2546 nfs4_fail_recov(rep
->re_vp
,
2547 fail_msg
, e
.error
, e
.stat
);
2549 * We've already handled the error so clear it.
2551 nfs4_error_zinit(&e
);
2553 } else if (e
.error
== 0 && e
.stat
== NFS4_OK
) {
2556 rp
= VTOR4(rep
->re_vp
);
2557 mutex_enter(&rp
->r_statelock
);
2558 pre_change
= rp
->r_change
;
2559 mutex_exit(&rp
->r_statelock
);
2561 for (j
= 0; j
< rep
->re_numosp
; j
++) {
2562 nfs4_reopen(rep
->re_vp
, rep
->re_osp
[j
],
2563 &e
, claim
, FALSE
, TRUE
);
2564 if (e
.error
!= 0 || e
.stat
!= NFS4_OK
)
2567 if (nfs4_needs_recovery(&e
, TRUE
,
2569 (void) nfs4_start_recovery(&e
, mi
,
2570 rep
->re_vp
, NULL
, NULL
, NULL
,
2571 OP_OPEN
, NULL
, NULL
, NULL
);
2576 if (nfs4_recovdelay
> 0)
2577 ddi_sleep(nfs4_recovdelay
);
2579 if (e
.error
== 0 && e
.stat
== NFS4_OK
) {
2580 relock_file(rep
->re_vp
, mi
, &e
, pre_change
);
2582 if (nfs4_needs_recovery(&e
, TRUE
, mi
->mi_vfsp
))
2583 (void) nfs4_start_recovery(&e
, mi
,
2584 rep
->re_vp
, NULL
, NULL
, NULL
,
2585 OP_LOCK
, NULL
, NULL
, NULL
);
2588 if (e
.error
!= 0 || e
.stat
!= NFS4_OK
)
2593 * Check to see if we need to remap files passed in
2594 * via the recovery arguments; this will have been
2595 * done for open files. A failure here is not fatal.
2598 nfs4_error_t ignore
;
2599 nfs4_check_remap(mi
, recovp
->rc_vp1
, NFS4_REMAP_CKATTRS
,
2601 nfs4_check_remap(mi
, recovp
->rc_vp2
, NFS4_REMAP_CKATTRS
,
2606 if (e
.error
== 0 && e
.stat
== NFS4_OK
) {
2607 mutex_enter(&mi
->mi_lock
);
2608 mi
->mi_recovflags
&= ~(MI4R_REOPEN_FILES
| MI4R_REMAP_FILES
);
2609 mutex_exit(&mi
->mi_lock
);
2612 nfs_rw_exit(&mi
->mi_recovlock
);
2613 nfs_rw_exit(&sp
->s_recovlock
);
2615 if (reopenlist
!= NULL
)
2616 r4releopenlist(reopenlist
);
2620 * Resend the queued state recovery requests in "rqsts".
2624 nfs4_resend_lost_rqsts(recov_info_t
*recovp
, nfs4_server_t
*sp
)
2626 nfs4_lost_rqst_t
*lrp
, *tlrp
;
2627 mntinfo4_t
*mi
= recovp
->rc_mi
;
2630 uint32_t deny_bits
= 0;
2633 NFS4_DEBUG(nfs4_lost_rqst_debug
, (CE_NOTE
, "nfs4_resend_lost_rqsts"));
2636 ASSERT(nfs_rw_lock_held(&mi
->mi_recovlock
, RW_WRITER
));
2638 mutex_enter(&mi
->mi_lock
);
2639 lrp
= list_head(&mi
->mi_lost_state
);
2640 mutex_exit(&mi
->mi_lock
);
2641 while (lrp
!= NULL
) {
2642 nfs4_error_zinit(&n4e
);
2643 resend_one_op(lrp
, &n4e
, mi
, sp
);
2644 NFS4_DEBUG(nfs4_lost_rqst_debug
, (CE_NOTE
,
2645 "nfs4_resend_lost_rqsts: resend request: for vp %p got "
2646 "error %d stat %d", (void *)lrp
->lr_vp
, n4e
.error
,
2650 * If we get a recovery error that we can actually
2651 * recover from (such as ETIMEDOUT, FHEXPIRED), we
2652 * return and let the recovery thread redrive the call.
2653 * Don't requeue unless the zone is still healthy.
2655 if (zone_status_get(curproc
->p_zone
) < ZONE_IS_SHUTTING_DOWN
&&
2656 nfs4_needs_recovery(&n4e
, TRUE
, mi
->mi_vfsp
) &&
2657 (nfs4_try_failover(&n4e
) ||
2658 NFS4_FRC_UNMT_ERR(n4e
.error
, mi
->mi_vfsp
) ||
2659 (n4e
.error
== 0 && n4e
.stat
!= NFS4ERR_BADHANDLE
&&
2660 !nfs4_recov_marks_dead(n4e
.stat
)))) {
2662 * For these three errors, we want to delay a bit
2663 * instead of pounding the server into submission.
2664 * We have to do this manually; the normal
2665 * processing for these errors only works for
2666 * non-recovery requests.
2668 if ((n4e
.error
== 0 && n4e
.stat
== NFS4ERR_DELAY
) ||
2669 (n4e
.error
== 0 && n4e
.stat
== NFS4ERR_GRACE
) ||
2670 (n4e
.error
== 0 && n4e
.stat
== NFS4ERR_RESOURCE
) ||
2671 NFS4_FRC_UNMT_ERR(n4e
.error
, mi
->mi_vfsp
)) {
2672 ddi_sleep(nfs4err_delay_time
);
2674 (void) nfs4_start_recovery(&n4e
,
2675 mi
, lrp
->lr_dvp
, lrp
->lr_vp
, NULL
, NULL
,
2676 lrp
->lr_op
, NULL
, NULL
, NULL
);
2681 mutex_enter(&mi
->mi_lock
);
2682 list_remove(&mi
->mi_lost_state
, lrp
);
2684 lrp
= list_head(&mi
->mi_lost_state
);
2685 mutex_exit(&mi
->mi_lock
);
2686 nfs4_free_lost_rqst(tlrp
, sp
);
2691 * Resend the given op, and issue any necessary undo call.
2692 * Errors are returned via the nfs4_error_t parameter.
2696 resend_one_op(nfs4_lost_rqst_t
*lrp
, nfs4_error_t
*ep
,
2697 mntinfo4_t
*mi
, nfs4_server_t
*sp
)
2700 nfs4_open_stream_t
*osp
;
2705 NFS4_DEBUG(nfs4_lost_rqst_debug
, (CE_NOTE
, "resend_one_op: "
2706 "have a lost open/close request for vp %p", (void *)vp
));
2708 switch (lrp
->lr_op
) {
2710 nfs4_resend_open_otw(&vp
, lrp
, ep
);
2712 case OP_OPEN_DOWNGRADE
:
2713 ASSERT(lrp
->lr_oop
!= NULL
);
2714 ep
->error
= nfs4_start_open_seqid_sync(lrp
->lr_oop
, mi
);
2715 ASSERT(!ep
->error
); /* recov thread always succeeds */
2716 ASSERT(lrp
->lr_osp
!= NULL
);
2717 mutex_enter(&lrp
->lr_osp
->os_sync_lock
);
2718 nfs4_open_downgrade(lrp
->lr_dg_acc
, lrp
->lr_dg_deny
,
2719 lrp
->lr_oop
, lrp
->lr_osp
, vp
, lrp
->lr_cr
, lrp
,
2721 mutex_exit(&lrp
->lr_osp
->os_sync_lock
);
2722 nfs4_end_open_seqid_sync(lrp
->lr_oop
);
2728 mutex_enter(&osp
->os_sync_lock
);
2729 if (osp
->os_share_acc_read
)
2730 acc_bits
|= OPEN4_SHARE_ACCESS_READ
;
2731 if (osp
->os_share_acc_write
)
2732 acc_bits
|= OPEN4_SHARE_ACCESS_WRITE
;
2733 mutex_exit(&osp
->os_sync_lock
);
2734 nfs4close_one(vp
, osp
, cr
, acc_bits
, lrp
, ep
,
2735 CLOSE_RESEND
, 0, 0, 0);
2739 resend_lock(lrp
, ep
);
2741 case OP_DELEGRETURN
:
2742 nfs4_resend_delegreturn(lrp
, ep
, sp
);
2746 cmn_err(CE_PANIC
, "resend_one_op: unexpected op: %d",
2749 nfs4_queue_event(RE_LOST_STATE_BAD_OP
, mi
, NULL
,
2750 lrp
->lr_op
, lrp
->lr_vp
, lrp
->lr_dvp
, NFS4_OK
, NULL
, 0,
2751 TAG_NONE
, TAG_NONE
, 0, 0);
2752 nfs4_error_init(ep
, EINVAL
);
2757 * No need to retry nor send an "undo" CLOSE in the
2758 * event the server rebooted.
2760 if (ep
->error
== 0 && (ep
->stat
== NFS4ERR_STALE_CLIENTID
||
2761 ep
->stat
== NFS4ERR_STALE_STATEID
|| ep
->stat
== NFS4ERR_EXPIRED
))
2765 * If we resent a CLOSE or OPEN_DOWNGRADE, there's nothing
2766 * to undo. Undoing locking operations was handled by
2769 if (lrp
->lr_op
== OP_OPEN_DOWNGRADE
|| lrp
->lr_op
== OP_CLOSE
)
2773 * If we get any other error for OPEN, then don't attempt
2774 * to undo the resend of the open (since it was never
2777 ASSERT(lrp
->lr_op
== OP_OPEN
);
2778 if (ep
->error
|| ep
->stat
!= NFS4_OK
)
2782 * Now let's undo our OPEN.
2784 nfs4_error_zinit(ep
);
2785 close_after_open_resend(vp
, lrp
->lr_cr
, lrp
->lr_oacc
, ep
);
2786 NFS4_DEBUG(nfs4_lost_rqst_debug
, (CE_NOTE
, "resend_one_op: "
2787 "nfs4close_one: for vp %p got error %d stat %d",
2788 (void *)vp
, ep
->error
, ep
->stat
));
2791 if (vp
!= lrp
->lr_vp
)
2796 * Close a file that was opened via a resent OPEN.
2797 * Most errors are passed back to the caller (via the return value and
2798 * *statp), except for FHEXPIRED, which is retried.
2800 * It might be conceptually cleaner to push the CLOSE request onto the
2801 * front of the resend queue, rather than sending it here. That would
2802 * match the way we undo lost lock requests. On the other
2803 * hand, we've already got something that works, and there's no reason to
2804 * change it at this time.
2808 close_after_open_resend(vnode_t
*vp
, cred_t
*cr
, uint32_t acc_bits
,
2813 nfs4close_one(vp
, NULL
, cr
, acc_bits
, NULL
, ep
,
2814 CLOSE_AFTER_RESEND
, 0, 0, 0);
2815 if (ep
->error
== 0 && ep
->stat
== NFS4_OK
)
2816 break; /* success; done */
2817 if (ep
->error
!= 0 || ep
->stat
!= NFS4ERR_FHEXPIRED
)
2819 /* else retry FHEXPIRED */
2825 * Resend the given lost lock request. The result is returned via *ep:
2826 * ep->error is the errno and, if that is zero, ep->stat is the NFS status.
2828 * Issue a SIGLOST and mark the rnode dead if we get a non-recovery error or
2829 * a recovery error that we don't actually recover from yet (eg: BAD_SEQID).
2830 * Let the recovery thread redrive the call if we get a recovery error that
2831 * we can actually recover from.
2834 resend_lock(nfs4_lost_rqst_t
*lrp
, nfs4_error_t
*ep
)
2836 bool_t send_siglost
= FALSE
;
2837 vnode_t
*vp
= lrp
->lr_vp
;
2839 NFS4_DEBUG(nfs4_lost_rqst_debug
, (CE_NOTE
, "resend_lock:"));
2840 ASSERT(lrp
->lr_ctype
== NFS4_LCK_CTYPE_REINSTATE
||
2841 lrp
->lr_ctype
== NFS4_LCK_CTYPE_RESEND
);
2843 nfs4frlock(lrp
->lr_ctype
, vp
, F_SETLK
,
2844 lrp
->lr_flk
, FREAD
|FWRITE
, 0, lrp
->lr_cr
, ep
, lrp
, NULL
);
2846 NFS4_DEBUG(nfs4_lost_rqst_debug
, (CE_NOTE
, "resend_lock: "
2847 "nfs4frlock for vp %p returned error %d, stat %d",
2848 (void *)vp
, ep
->error
, ep
->stat
));
2850 if (ep
->error
== 0 && ep
->stat
== 0)
2852 if (ep
->error
== 0 && ep
->stat
== NFS4ERR_DENIED
&&
2853 lrp
->lr_ctype
== NFS4_LCK_CTYPE_RESEND
)
2857 * If we failed with a non-recovery error, send SIGLOST and
2858 * mark the file dead.
2860 if (!nfs4_needs_recovery(ep
, TRUE
, vp
->v_vfsp
))
2861 send_siglost
= TRUE
;
2864 * Done with recovering LOST LOCK in the event the
2865 * server rebooted or we've lost the lease.
2867 if (ep
->error
== 0 && (ep
->stat
== NFS4ERR_STALE_CLIENTID
||
2868 ep
->stat
== NFS4ERR_STALE_STATEID
||
2869 ep
->stat
== NFS4ERR_EXPIRED
)) {
2874 * BAD_STATEID on an unlock indicates that the server has
2875 * forgotten about the lock anyway, so act like the call
2878 if (ep
->error
== 0 && ep
->stat
== NFS4ERR_BAD_STATEID
&&
2879 lrp
->lr_op
== OP_LOCKU
)
2883 * If we got a recovery error that we don't actually
2884 * recover from, send SIGLOST. If the filesystem was
2885 * forcibly unmounted, we skip the SIGLOST because (a) it's
2886 * unnecessary noise, and (b) there could be a new process
2887 * with the same pid as the one that had generated the lost
2890 if (ep
->error
== 0 && (ep
->stat
== NFS4ERR_BADHANDLE
||
2891 nfs4_recov_marks_dead(ep
->stat
))) {
2892 if (!(vp
->v_vfsp
->vfs_flag
& VFS_UNMOUNTED
))
2893 send_siglost
= TRUE
;
2898 * If the filesystem was forcibly unmounted, we
2899 * still need to synchronize with the server and
2900 * release state. Try again later.
2902 if (NFS4_FRC_UNMT_ERR(ep
->error
, vp
->v_vfsp
))
2906 * If we get a recovery error that we can actually
2907 * recover from (such as ETIMEDOUT, FHEXPIRED),
2908 * return and let the recovery thread redrive the call.
2910 * For the three errors below, we want to delay a bit
2911 * instead of pounding the server into submission.
2913 if ((ep
->error
== 0 && ep
->stat
== NFS4ERR_DELAY
) ||
2914 (ep
->error
== 0 && ep
->stat
== NFS4ERR_GRACE
) ||
2915 (ep
->error
== 0 && ep
->stat
== NFS4ERR_RESOURCE
))
2916 ddi_sleep(recov_err_delay
);
2925 * Must be root or the actual thread being issued the
2926 * SIGLOST for this to work, so just become root.
2928 sv_cred
= curthread
->t_cred
;
2929 curthread
->t_cred
= kcred
;
2930 nfs4_send_siglost(lrp
->lr_flk
->l_pid
, VTOMI4(vp
), vp
, FALSE
,
2931 ep
->error
, ep
->stat
);
2932 curthread
->t_cred
= sv_cred
;
2935 * Flush any additional reinstantiation requests for
2936 * this operation. Sending multiple SIGLOSTs to the user
2937 * process is unlikely to help and may cause trouble.
2939 if (lrp
->lr_ctype
== NFS4_LCK_CTYPE_REINSTATE
)
2940 flush_reinstate(lrp
);
2945 * Remove any lock reinstantiation requests that correspond to the given
2946 * lost request. We only remove items that follow lrp in the queue,
2947 * assuming that lrp will be removed by the generic lost state code.
2951 flush_reinstate(nfs4_lost_rqst_t
*lrp
)
2956 nfs4_lost_rqst_t
*nlrp
;
2960 pid
= lrp
->lr_flk
->l_pid
;
2963 * If there are any more reinstantation requests to get rid of,
2964 * they should all be clustered at the front of the lost state
2967 mutex_enter(&mi
->mi_lock
);
2968 for (lrp
= list_next(&mi
->mi_lost_state
, lrp
); lrp
!= NULL
;
2970 nlrp
= list_next(&mi
->mi_lost_state
, lrp
);
2971 if (lrp
->lr_op
!= OP_LOCK
&& lrp
->lr_op
!= OP_LOCKU
)
2973 if (lrp
->lr_ctype
!= NFS4_LCK_CTYPE_REINSTATE
)
2975 ASSERT(lrp
->lr_vp
== vp
);
2976 ASSERT(lrp
->lr_flk
->l_pid
== pid
);
2977 NFS4_DEBUG(nfs4_lost_rqst_debug
, (CE_NOTE
,
2978 "remove reinstantiation %p", (void *)lrp
));
2979 list_remove(&mi
->mi_lost_state
, lrp
);
2980 nfs4_free_lost_rqst(lrp
, NULL
);
2982 mutex_exit(&mi
->mi_lock
);
2986 * End of state-specific recovery routines.
2990 * Allocate a lost request struct, initialize it from lost_rqstp (including
2991 * bumping the reference counts for the referenced vnode, etc.), and hang
2996 nfs4_save_lost_rqst(nfs4_lost_rqst_t
*lost_rqstp
, recov_info_t
*recovp
,
2997 nfs4_recov_t
*action
, mntinfo4_t
*mi
)
2999 nfs4_lost_rqst_t
*destp
;
3001 ASSERT(recovp
->rc_lost_rqst
== NULL
);
3003 destp
= kmem_alloc(sizeof (nfs4_lost_rqst_t
), KM_SLEEP
);
3004 recovp
->rc_lost_rqst
= destp
;
3006 if (lost_rqstp
->lr_op
== OP_LOCK
||
3007 lost_rqstp
->lr_op
== OP_LOCKU
) {
3008 ASSERT(lost_rqstp
->lr_lop
);
3009 *action
= NR_LOST_LOCK
;
3010 destp
->lr_ctype
= lost_rqstp
->lr_ctype
;
3011 destp
->lr_locktype
= lost_rqstp
->lr_locktype
;
3012 } else if (lost_rqstp
->lr_op
== OP_OPEN
) {
3013 component4
*srcfp
, *destfp
;
3015 destp
->lr_oacc
= lost_rqstp
->lr_oacc
;
3016 destp
->lr_odeny
= lost_rqstp
->lr_odeny
;
3017 destp
->lr_oclaim
= lost_rqstp
->lr_oclaim
;
3018 if (lost_rqstp
->lr_oclaim
== CLAIM_DELEGATE_CUR
)
3019 destp
->lr_ostateid
= lost_rqstp
->lr_ostateid
;
3021 srcfp
= &lost_rqstp
->lr_ofile
;
3022 destfp
= &destp
->lr_ofile
;
3024 * Consume caller's utf8string
3026 destfp
->utf8string_len
= srcfp
->utf8string_len
;
3027 destfp
->utf8string_val
= srcfp
->utf8string_val
;
3028 srcfp
->utf8string_len
= 0;
3029 srcfp
->utf8string_val
= NULL
; /* make sure not reused */
3031 *action
= NR_LOST_STATE_RQST
;
3032 } else if (lost_rqstp
->lr_op
== OP_OPEN_DOWNGRADE
) {
3033 destp
->lr_dg_acc
= lost_rqstp
->lr_dg_acc
;
3034 destp
->lr_dg_deny
= lost_rqstp
->lr_dg_deny
;
3036 *action
= NR_LOST_STATE_RQST
;
3037 } else if (lost_rqstp
->lr_op
== OP_CLOSE
) {
3038 ASSERT(lost_rqstp
->lr_oop
);
3039 *action
= NR_LOST_STATE_RQST
;
3040 } else if (lost_rqstp
->lr_op
== OP_DELEGRETURN
) {
3041 *action
= NR_LOST_STATE_RQST
;
3044 cmn_err(CE_PANIC
, "nfs4_save_lost_rqst: bad op %d",
3047 nfs4_queue_event(RE_LOST_STATE_BAD_OP
, mi
, NULL
,
3048 lost_rqstp
->lr_op
, lost_rqstp
->lr_vp
, lost_rqstp
->lr_dvp
,
3049 NFS4_OK
, NULL
, curproc
->p_pid
, TAG_NONE
, TAG_NONE
, 0, 0);
3050 *action
= NR_UNUSED
;
3051 recovp
->rc_lost_rqst
= NULL
;
3052 kmem_free(destp
, sizeof (nfs4_lost_rqst_t
));
3056 destp
->lr_op
= lost_rqstp
->lr_op
;
3057 destp
->lr_vp
= lost_rqstp
->lr_vp
;
3059 VN_HOLD(destp
->lr_vp
);
3060 destp
->lr_dvp
= lost_rqstp
->lr_dvp
;
3062 VN_HOLD(destp
->lr_dvp
);
3063 destp
->lr_oop
= lost_rqstp
->lr_oop
;
3065 open_owner_hold(destp
->lr_oop
);
3066 destp
->lr_osp
= lost_rqstp
->lr_osp
;
3068 open_stream_hold(destp
->lr_osp
);
3069 destp
->lr_lop
= lost_rqstp
->lr_lop
;
3071 lock_owner_hold(destp
->lr_lop
);
3072 destp
->lr_cr
= lost_rqstp
->lr_cr
;
3074 crhold(destp
->lr_cr
);
3075 if (lost_rqstp
->lr_flk
== NULL
)
3076 destp
->lr_flk
= NULL
;
3078 destp
->lr_flk
= kmem_alloc(sizeof (flock64_t
), KM_SLEEP
);
3079 *destp
->lr_flk
= *lost_rqstp
->lr_flk
;
3081 destp
->lr_putfirst
= lost_rqstp
->lr_putfirst
;
3085 * Map the given return values (errno and nfs4 status code) to a recovery
3086 * action and fill in the following fields of recovp: rc_action,
3087 * rc_srv_reboot, rc_stateid, rc_lost_rqst.
3091 errs_to_action(recov_info_t
*recovp
,
3092 nfs4_server_t
*sp
, mntinfo4_t
*mi
, stateid4
*sidp
,
3093 nfs4_lost_rqst_t
*lost_rqstp
, int unmounted
, nfs_opnum4 op
,
3094 nfs4_bseqid_entry_t
*bsep
)
3096 nfs4_recov_t action
= NR_UNUSED
;
3097 bool_t reboot
= FALSE
;
3099 int error
= recovp
->rc_orig_errors
.error
;
3100 nfsstat4 stat
= recovp
->rc_orig_errors
.stat
;
3102 bzero(&recovp
->rc_stateid
, sizeof (stateid4
));
3103 recovp
->rc_lost_rqst
= NULL
;
3104 recovp
->rc_bseqid_rqst
= NULL
;
3106 try_f
= nfs4_try_failover(&recovp
->rc_orig_errors
) &&
3107 FAILOVER_MOUNT4(mi
);
3110 * We start recovery for EINTR only in the lost lock
3111 * or lost open/close case.
3114 if (try_f
|| error
== EINTR
|| (error
== EIO
&& unmounted
)) {
3115 recovp
->rc_error
= (error
!= 0 ? error
: geterrno4(stat
));
3117 ASSERT(lost_rqstp
->lr_op
!= 0);
3118 nfs4_save_lost_rqst(lost_rqstp
, recovp
, &action
, mi
);
3121 action
= NR_FAILOVER
;
3122 } else if (error
!= 0) {
3123 recovp
->rc_error
= error
;
3124 nfs4_queue_event(RE_UNEXPECTED_ERRNO
, mi
, NULL
, error
, NULL
,
3125 NULL
, 0, NULL
, 0, TAG_NONE
, TAG_NONE
, 0, 0);
3126 action
= NR_CLIENTID
;
3128 recovp
->rc_error
= geterrno4(stat
);
3131 case NFS4ERR_LEASE_MOVED
:
3138 case NFS4ERR_BADHANDLE
:
3139 action
= NR_BADHANDLE
;
3141 case NFS4ERR_BAD_SEQID
:
3143 save_bseqid_rqst(bsep
, recovp
);
3144 action
= NR_BAD_SEQID
;
3146 case NFS4ERR_OLD_STATEID
:
3147 action
= NR_OLDSTATEID
;
3149 case NFS4ERR_WRONGSEC
:
3150 action
= NR_WRONGSEC
;
3152 case NFS4ERR_FHEXPIRED
:
3153 action
= NR_FHEXPIRED
;
3155 case NFS4ERR_BAD_STATEID
:
3156 if (sp
== NULL
|| (sp
!= NULL
&& inlease(sp
))) {
3158 action
= NR_BAD_STATEID
;
3160 recovp
->rc_stateid
= *sidp
;
3162 action
= NR_CLIENTID
;
3164 case NFS4ERR_EXPIRED
:
3166 * The client's lease has expired, either due
3167 * to a network partition or perhaps a client
3168 * error. In either case, try an NR_CLIENTID
3169 * style recovery. reboot remains false, since
3170 * there is no evidence the server has rebooted.
3171 * This will cause CLAIM_NULL opens and lock
3172 * requests without the reclaim bit.
3174 action
= NR_CLIENTID
;
3176 DTRACE_PROBE4(nfs4__expired
,
3177 nfs4_server_t
*, sp
,
3179 stateid4
*, sidp
, int, op
);
3182 case NFS4ERR_STALE_CLIENTID
:
3183 case NFS4ERR_STALE_STATEID
:
3184 action
= NR_CLIENTID
;
3187 case NFS4ERR_RESOURCE
:
3189 * If this had been a FAILOVER mount, then
3190 * we'd have tried failover. Since it's not,
3191 * just delay a while and retry.
3205 nfs4_queue_event(RE_UNEXPECTED_STATUS
, mi
, NULL
, 0,
3206 NULL
, NULL
, stat
, NULL
, 0, TAG_NONE
, TAG_NONE
,
3208 action
= NR_CLIENTID
;
3213 /* make sure action got set */
3214 ASSERT(action
!= NR_UNUSED
);
3215 recovp
->rc_srv_reboot
= reboot
;
3216 recovp
->rc_action
= action
;
3217 nfs4_queue_fact(RF_ERR
, mi
, stat
, action
, op
, reboot
, NULL
, error
,
3222 * Return the (held) credential for the process with the given pid.
3223 * May return NULL (e.g., process not found).
3227 pid_to_cr(pid_t pid
)
3232 mutex_enter(&pidlock
);
3233 if ((p
= prfind(pid
)) == NULL
) {
3234 mutex_exit(&pidlock
);
3238 mutex_enter(&p
->p_crlock
);
3239 crhold(cr
= p
->p_cred
);
3240 mutex_exit(&p
->p_crlock
);
3241 mutex_exit(&pidlock
);
3247 * Send SIGLOST to the given process and queue the event.
3249 * The 'dump' boolean tells us whether this action should dump the
3250 * in-kernel queue of recovery messages or not.
3254 nfs4_send_siglost(pid_t pid
, mntinfo4_t
*mi
, vnode_t
*vp
, bool_t dump
,
3255 int error
, nfsstat4 stat
)
3259 mutex_enter(&pidlock
);
3262 psignal(p
, SIGLOST
);
3263 mutex_exit(&pidlock
);
3264 nfs4_queue_event(dump
? RE_SIGLOST
: RE_SIGLOST_NO_DUMP
, mi
,
3265 NULL
, error
, vp
, NULL
, stat
, NULL
, pid
, TAG_NONE
, TAG_NONE
, 0, 0);
3269 * Scan the lock list for entries that match the given pid. Unregister those
3270 * locks that do and change their pid to NOPID.
3274 relock_skip_pid(vnode_t
*vp
, locklist_t
*llp
, pid_t pid
)
3276 for (; llp
!= NULL
; llp
= llp
->ll_next
) {
3277 if (llp
->ll_flock
.l_pid
== pid
) {
3281 * Unregister the lost lock.
3283 llp
->ll_flock
.l_type
= F_UNLCK
;
3284 r
= reclock(vp
, &llp
->ll_flock
, SETFLCK
, FREAD
| FWRITE
,
3286 /* The unlock cannot fail */
3289 llp
->ll_flock
.l_pid
= NOPID
;
3295 * Mark a file as having failed recovery, after making a last-ditch effort
3296 * to return any delegation.
3298 * Sets r_error to EIO or ESTALE for the given vnode.
3301 nfs4_fail_recov(vnode_t
*vp
, char *why
, int error
, nfsstat4 stat
)
3303 rnode4_t
*rp
= VTOR4(vp
);
3306 if (nfs4_fail_recov_stop
)
3307 debug_enter("nfs4_fail_recov");
3310 mutex_enter(&rp
->r_statelock
);
3311 if (rp
->r_flags
& (R4RECOVERR
|R4RECOVERRP
)) {
3312 mutex_exit(&rp
->r_statelock
);
3317 * Set R4RECOVERRP to indicate that a recovery error is in
3318 * progress. This will shut down reads and writes at the top
3319 * half. Don't set R4RECOVERR until after we've returned the
3320 * delegation, otherwise it will fail.
3323 rp
->r_flags
|= R4RECOVERRP
;
3324 mutex_exit(&rp
->r_statelock
);
3326 nfs4delegabandon(rp
);
3328 mutex_enter(&rp
->r_statelock
);
3329 rp
->r_flags
|= (R4RECOVERR
| R4STALE
);
3330 rp
->r_error
= (error
== 0 && stat
== NFS4ERR_STALE
) ? ESTALE
: EIO
;
3331 PURGE_ATTRCACHE4_LOCKED(rp
);
3332 if (!(vp
->v_vfsp
->vfs_flag
& VFS_UNMOUNTED
))
3333 nfs4_queue_event(RE_DEAD_FILE
, VTOMI4(vp
), NULL
, error
,
3334 vp
, NULL
, stat
, why
, 0, TAG_NONE
, TAG_NONE
, 0, 0);
3335 mutex_exit(&rp
->r_statelock
);
3341 * recov_throttle: if the file had the same recovery action within the
3342 * throttle interval, wait for the throttle interval to finish before
3345 * Side effects: updates the rnode with the current recovery information.
3349 recov_throttle(recov_info_t
*recovp
, vnode_t
*vp
)
3351 time_t curtime
, time_to_wait
;
3352 rnode4_t
*rp
= VTOR4(vp
);
3354 curtime
= gethrestime_sec();
3356 mutex_enter(&rp
->r_statelock
);
3357 NFS4_DEBUG(nfs4_client_recov_debug
, (CE_NOTE
,
3358 "recov_throttle: now: (%d, %ld), last: (%d, %ld)",
3359 recovp
->rc_action
, curtime
,
3360 rp
->r_recov_act
, rp
->r_last_recov
));
3361 if (recovp
->rc_action
== rp
->r_recov_act
&&
3362 rp
->r_last_recov
+ recov_err_delay
> curtime
) {
3363 time_to_wait
= rp
->r_last_recov
+ recov_err_delay
- curtime
;
3364 mutex_exit(&rp
->r_statelock
);
3365 ddi_sleep(time_to_wait
);
3366 curtime
= gethrestime_sec();
3367 mutex_enter(&rp
->r_statelock
);
3370 rp
->r_last_recov
= curtime
;
3371 rp
->r_recov_act
= recovp
->rc_action
;
3372 mutex_exit(&rp
->r_statelock
);
3376 * React to NFS4ERR_GRACE by setting the time we'll permit
3377 * the next call to this filesystem.
3380 nfs4_set_grace_wait(mntinfo4_t
*mi
)
3382 mutex_enter(&mi
->mi_lock
);
3383 /* Mark the time for the future */
3384 mi
->mi_grace_wait
= gethrestime_sec() + nfs4err_delay_time
;
3385 mutex_exit(&mi
->mi_lock
);
3389 * React to NFS4ERR_DELAY by setting the time we'll permit
3390 * the next call to this vnode.
3393 nfs4_set_delay_wait(vnode_t
*vp
)
3395 rnode4_t
*rp
= VTOR4(vp
);
3397 mutex_enter(&rp
->r_statelock
);
3399 * Calculate amount we should delay, initial
3400 * delay will be short and then we will back off.
3402 if (rp
->r_delay_interval
== 0)
3403 rp
->r_delay_interval
= NFS4_INITIAL_DELAY_INTERVAL
;
3405 /* calculate next interval value */
3406 rp
->r_delay_interval
=
3407 MIN(NFS4_MAX_DELAY_INTERVAL
, (rp
->r_delay_interval
<< 1));
3408 rp
->r_delay_wait
= gethrestime_sec() + rp
->r_delay_interval
;
3409 mutex_exit(&rp
->r_statelock
);
3413 * The caller is responsible for freeing the returned string.
3416 nfs4_getsrvnames(mntinfo4_t
*mi
, size_t *len
)
3424 * Calculate the length of the string required to hold all
3425 * of the server names plus either a comma or a null
3426 * character following each individual one.
3429 for (svp
= mi
->mi_servers
; svp
!= NULL
; svp
= svp
->sv_next
) {
3430 (void) nfs_rw_enter_sig(&svp
->sv_lock
, RW_READER
, 0);
3431 if (svp
->sv_flags
& SV4_NOTINUSE
) {
3432 nfs_rw_exit(&svp
->sv_lock
);
3435 nfs_rw_exit(&svp
->sv_lock
);
3436 length
+= svp
->sv_hostnamelen
;
3439 srvnames
= kmem_alloc(length
, KM_SLEEP
);
3442 for (svp
= mi
->mi_servers
; svp
!= NULL
; svp
= svp
->sv_next
) {
3443 (void) nfs_rw_enter_sig(&svp
->sv_lock
, RW_READER
, 0);
3444 if (svp
->sv_flags
& SV4_NOTINUSE
) {
3445 nfs_rw_exit(&svp
->sv_lock
);
3448 nfs_rw_exit(&svp
->sv_lock
);
3449 (void) strcpy(namep
, svp
->sv_hostname
);
3450 namep
+= svp
->sv_hostnamelen
- 1;
3461 save_bseqid_rqst(nfs4_bseqid_entry_t
*bsep
, recov_info_t
*recovp
)
3463 nfs4_bseqid_entry_t
*destp
;
3465 destp
= kmem_alloc(sizeof (nfs4_bseqid_entry_t
), KM_SLEEP
);
3466 recovp
->rc_bseqid_rqst
= destp
;
3469 open_owner_hold(bsep
->bs_oop
);
3470 destp
->bs_oop
= bsep
->bs_oop
;
3472 lock_owner_hold(bsep
->bs_lop
);
3473 destp
->bs_lop
= bsep
->bs_lop
;
3475 VN_HOLD(bsep
->bs_vp
);
3476 destp
->bs_vp
= bsep
->bs_vp
;
3477 destp
->bs_pid
= bsep
->bs_pid
;
3478 destp
->bs_tag
= bsep
->bs_tag
;
3479 destp
->bs_seqid
= bsep
->bs_seqid
;
3483 free_bseqid_rqst(nfs4_bseqid_entry_t
*bsep
)
3486 open_owner_rele(bsep
->bs_oop
);
3488 lock_owner_rele(bsep
->bs_lop
);
3490 VN_RELE(bsep
->bs_vp
);
3491 kmem_free(bsep
, sizeof (nfs4_bseqid_entry_t
));
3495 * We don't actually fully recover from NFS4ERR_BAD_SEQID. We
3496 * simply mark the open owner and open stream (if provided) as "bad".
3497 * Then future uses of these data structures will be limited to basically
3498 * just cleaning up the internal client state (no going OTW).
3500 * The result of this is to return errors back to the app/usr when
3501 * we receive NFS4ERR_BAD_SEQID, but also allow future/new calls to
3502 * succeed so progress can be made.
3505 recov_bad_seqid(recov_info_t
*recovp
)
3507 mntinfo4_t
*mi
= recovp
->rc_mi
;
3508 nfs4_open_owner_t
*bad_oop
;
3509 nfs4_lock_owner_t
*bad_lop
;
3511 rnode4_t
*rp
= NULL
;
3513 nfs4_bseqid_entry_t
*bsep
, *tbsep
;
3517 ASSERT(nfs_rw_lock_held(&mi
->mi_recovlock
, RW_WRITER
));
3519 mutex_enter(&mi
->mi_lock
);
3520 bsep
= list_head(&mi
->mi_bseqid_list
);
3521 mutex_exit(&mi
->mi_lock
);
3524 * Handle all the bad seqid entries on mi's list.
3526 while (bsep
!= NULL
) {
3527 bad_oop
= bsep
->bs_oop
;
3528 bad_lop
= bsep
->bs_lop
;
3532 NFS4_DEBUG(nfs4_client_recov_debug
, (CE_NOTE
,
3533 "recov_bad_seqid: mark oop %p lop %p as bad for "
3534 "vp %p tag %s pid %d: last good seqid %d for tag %s",
3535 (void *)bad_oop
, (void *)bad_lop
, (void *)vp
,
3536 nfs4_ctags
[bsep
->bs_tag
].ct_str
, pid
,
3537 bad_oop
? bad_oop
->oo_last_good_seqid
: 0,
3538 bad_oop
? nfs4_ctags
[bad_oop
->oo_last_good_op
].ct_str
:
3539 nfs4_ctags
[TAG_NONE
].ct_str
));
3541 nfs4_queue_event(RE_BAD_SEQID
, mi
, NULL
,
3542 0, vp
, NULL
, NFS4ERR_BAD_SEQID
, NULL
, pid
, bsep
->bs_tag
,
3543 bad_oop
? bad_oop
->oo_last_good_op
: TAG_NONE
,
3544 bsep
->bs_seqid
, bad_oop
? bad_oop
->oo_last_good_seqid
: 0);
3547 /* essentially reset the open owner */
3548 error
= nfs4_start_open_seqid_sync(bad_oop
, mi
);
3549 ASSERT(!error
); /* recov thread always succeeds */
3550 bad_oop
->oo_name
= nfs4_get_new_oo_name();
3551 bad_oop
->oo_seqid
= 0;
3552 nfs4_end_open_seqid_sync(bad_oop
);
3556 mutex_enter(&bad_lop
->lo_lock
);
3557 bad_lop
->lo_flags
|= NFS4_BAD_SEQID_LOCK
;
3558 mutex_exit(&bad_lop
->lo_lock
);
3562 mutex_enter(&rp
->r_statelock
);
3563 rp
->r_flags
|= R4LODANGLERS
;
3564 mutex_exit(&rp
->r_statelock
);
3566 nfs4_send_siglost(pid
, mi
, vp
, TRUE
,
3567 0, NFS4ERR_BAD_SEQID
);
3570 mutex_enter(&mi
->mi_lock
);
3571 list_remove(&mi
->mi_bseqid_list
, bsep
);
3573 bsep
= list_head(&mi
->mi_bseqid_list
);
3574 mutex_exit(&mi
->mi_lock
);
3575 free_bseqid_rqst(tbsep
);
3578 mutex_enter(&mi
->mi_lock
);
3579 mi
->mi_recovflags
&= ~MI4R_BAD_SEQID
;
3580 mutex_exit(&mi
->mi_lock
);