kernel/fs/nfs/nfs4_recovery.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25
  26 /*
  27  * NFS Version 4 state recovery code.
  28  */
  29
  30 #include <nfs/nfs4_clnt.h>
  31 #include <nfs/nfs4.h>
  32 #include <nfs/rnode4.h>
  33 #include <sys/cmn_err.h>
  34 #include <sys/cred.h>
  35 #include <sys/systm.h>
  36 #include <sys/flock.h>
  37 #include <sys/dnlc.h>
  38 #include <sys/ddi.h>
  39 #include <sys/disp.h>
  40 #include <sys/list.h>
  41 #include <sys/sdt.h>
  42 #include <sys/mount.h>
  43 #include <sys/door.h>
  44 #include <nfs/nfssys.h>
  45 #include <nfs/nfsid_map.h>
  46 #include <nfs/nfs4_idmap_impl.h>
  47
  48 extern r4hashq_t *rtable4;
  49
  50 /*
  51  * Information that describes what needs to be done for recovery.  It is
  52  * passed to a client recovery thread as well as passed to various recovery
  53  * routines.  rc_mi, rc_vp1, and rc_vp2 refer to the filesystem and
  54  * vnode(s) affected by recovery.  rc_vp1 and rc_vp2 are references (use
  55  * VN_HOLD) or NULL.  rc_lost_rqst contains information about the lost
  56  * lock or open/close request, and it holds reference counts for the
  57  * various objects (vnode, etc.).  The recovery thread also uses flags set
  58  * in the mntinfo4_t or vnode_t to tell it what to do.  rc_error is used
  59  * to save the error that originally triggered the recovery event -- will
  60  * later be used to set mi_error if recovery doesn't work.  rc_bseqid_rqst
  61  * contains information about the request that got NFS4ERR_BAD_SEQID, and
  62  * it holds reference count for the various objects (vnode, open owner,
  63  * open stream, lock owner).
  64  */
  65
typedef struct {
        /* filesystem affected by recovery; set by start_recovery() */
        mntinfo4_t *rc_mi;
        /* vnodes affected by recovery; each is a held reference or NULL */
        vnode_t *rc_vp1;
        vnode_t *rc_vp2;
        /* which recovery action is to be carried out */
        nfs4_recov_t rc_action;
        /* NOTE(review): presumably the stateid tied to the failed op */
        stateid4 rc_stateid;
        bool_t rc_srv_reboot;           /* server has rebooted */
        /* lost lock or open/close request; holds refs on its objects */
        nfs4_lost_rqst_t *rc_lost_rqst;
        nfs4_error_t rc_orig_errors;    /* original errors causing recovery */
        /* error that triggered recovery; later used to set mi_error */
        int rc_error;
        /* request that got NFS4ERR_BAD_SEQID; holds refs on its objects */
        nfs4_bseqid_entry_t *rc_bseqid_rqst;
        /*
         * Passed through unchanged from the caller of nfs4_start_recovery();
         * start_recovery() stores them without taking a hold on rc_moved_vp.
         */
        vnode_t *rc_moved_vp;
        char *rc_moved_nm;
} recov_info_t;
  80
/*
 * How long to wait before trying again if there is an error doing
 * recovery, in seconds.  File-local; defaults to one second.
 */

static int recov_err_delay = 1;
  87
/*
 * How long to wait when processing NFS4ERR_GRACE or NFS4ERR_DELAY
 * errors.  Expressed in seconds.  Default is defined as
 * NFS4ERR_DELAY_TIME and this variable is initialized in nfs4_subr_init();
 * the 0 below is only the value it holds before that initialization.
 */
time_t nfs4err_delay_time = 0;
  94
/*
 * Tuneable to limit how many times "exempt" ops go OTW
 * after a recovery error.  Exempt op hints are OH_CLOSE,
 * OH_LOCKU, OH_DELEGRETURN.  These previously always went
 * OTW even after rnode was "dead" due to recovery errors.
 *
 * The tuneable below limits the number of times a start_fop
 * invocation will retry the exempt hints.  After the limit
 * is reached, nfs4_start_fop will return an error just like
 * it would for non-exempt op hints.
 *
 * nfs4_check_recov_err() compares the per-operation counter
 * rs_num_retry_despite_err against this value using "<=".
 */
int nfs4_max_recov_error_retry = 3;
 107
/*
 * Number of seconds the recovery thread should pause before retry when the
 * filesystem has been forcibly unmounted.  Defaults to one second.
 */

int nfs4_unmount_delay = 1;
 114
#ifdef DEBUG

/*
 * The variables in this section exist only in DEBUG builds.
 */

/*
 * How long to wait (in seconds) between recovery operations on a given
 * file.  Normally zero, but could be set longer for testing purposes.
 */
static int nfs4_recovdelay = 0;

/*
 * Switch that controls whether to go into the debugger when recovery
 * fails.  Off (zero) by default.
 */
static int nfs4_fail_recov_stop = 0;

/*
 * Tuneables to debug client namespace interaction with server
 * mount points:
 *
 *      nfs4_srvmnt_fail_cnt:
 *              number of times EACCES returned because client
 *              attempted to cross server mountpoint
 *
 *      nfs4_srvmnt_debug:
 *              trigger console printf whenever client attempts
 *              to cross server mountpoint
 *
 * Unlike the two variables above, these are not static, so they are
 * visible outside this file.
 */
int nfs4_srvmnt_fail_cnt = 0;
int nfs4_srvmnt_debug = 0;
#endif
 144
 145 extern zone_key_t       nfs4clnt_zone_key;
 146
/*
 * Forward references for the file-local (static) routines, in roughly
 * alphabetic order (nfs4_free_lost_rqst and recov_throttle are slightly
 * out of sequence).
 */
static void close_after_open_resend(vnode_t *, cred_t *, uint32_t,
        nfs4_error_t *);
static void errs_to_action(recov_info_t *,
        nfs4_server_t *, mntinfo4_t *, stateid4 *, nfs4_lost_rqst_t *, int,
        nfs_opnum4, nfs4_bseqid_entry_t *);
static void flush_reinstate(nfs4_lost_rqst_t *);
static void free_milist(mntinfo4_t **, int);
static mntinfo4_t **make_milist(nfs4_server_t *, int *);
static int nfs4_check_recov_err(vnode_t *, nfs4_op_hint_t,
        nfs4_recov_state_t *, int, char *);
static char *nfs4_getsrvnames(mntinfo4_t *, size_t *);
static void nfs4_recov_fh_fail(vnode_t *, int, nfsstat4);
static void nfs4_recov_thread(recov_info_t *);
static void nfs4_remove_lost_rqsts(mntinfo4_t *, nfs4_server_t *);
static void nfs4_resend_lost_rqsts(recov_info_t *, nfs4_server_t *);
static cred_t *pid_to_cr(pid_t);
static void reclaim_one_lock(vnode_t *, flock64_t *, nfs4_error_t *, int *);
static void recov_bad_seqid(recov_info_t *);
static void recov_badstate(recov_info_t *, vnode_t *, nfsstat4);
static void recov_clientid(recov_info_t *, nfs4_server_t *);
static void recov_done(mntinfo4_t *, recov_info_t *);
static void recov_filehandle(nfs4_recov_t, mntinfo4_t *, vnode_t *);
static void recov_newserver(recov_info_t *, nfs4_server_t **, bool_t *);
static void recov_openfiles(recov_info_t *, nfs4_server_t *);
static void recov_stale(mntinfo4_t *, vnode_t *);
static void nfs4_free_lost_rqst(nfs4_lost_rqst_t *, nfs4_server_t *);
static void recov_throttle(recov_info_t *, vnode_t *);
static void relock_skip_pid(vnode_t *, locklist_t *, pid_t);
static void resend_lock(nfs4_lost_rqst_t *, nfs4_error_t *);
static void resend_one_op(nfs4_lost_rqst_t *, nfs4_error_t *, mntinfo4_t *,
        nfs4_server_t *);
static void save_bseqid_rqst(nfs4_bseqid_entry_t *, recov_info_t *);
static void start_recovery(recov_info_t *, mntinfo4_t *, vnode_t *, vnode_t *,
        nfs4_server_t *, vnode_t *, char *);
static void start_recovery_action(nfs4_recov_t, bool_t, mntinfo4_t *, vnode_t *,
        vnode_t *);
static int wait_for_recovery(mntinfo4_t *, nfs4_op_hint_t);
 185
 186 /*
 187  * Return non-zero if the given errno, status, and rpc status codes
 188  * in the nfs4_error_t indicate that client recovery is needed.
 189  * "stateful" indicates whether the call that got the error establishes or
 190  * removes state on the server (open, close, lock, unlock, delegreturn).
 191  */
 192
 193 int
 194 nfs4_needs_recovery(nfs4_error_t *ep, bool_t stateful, vfs_t *vfsp)
 195 {
 196         int recov = 0;
 197         mntinfo4_t *mi;
 198
 199         /*
 200          * Try failover if the error values justify it and if
 201          * it's a failover mount.  Don't try if the mount is in
 202          * progress, failures are handled explicitly by nfs4rootvp.
 203          */
 204         if (nfs4_try_failover(ep)) {
 205                 mi = VFTOMI4(vfsp);
 206                 mutex_enter(&mi->mi_lock);
 207                 recov = FAILOVER_MOUNT4(mi) && !(mi->mi_flags & MI4_MOUNTING);
 208                 mutex_exit(&mi->mi_lock);
 209                 if (recov)
 210                         return (recov);
 211         }
 212
 213         if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vfsp)) {
 214                 /*
 215                  * The server may have gotten the request, so for stateful
 216                  * ops we need to resynchronize and possibly back out the
 217                  * op.
 218                  */
 219                 return (stateful);
 220         }
 221         if (ep->error != 0)
 222                 return (0);
 223
 224         /* stat values are listed alphabetically */
 225         /*
 226          * There are two lists here: the errors for which we have code, and
 227          * the errors for which we plan to have code before FCS.  For the
 228          * second list, print a warning message but don't attempt recovery.
 229          */
 230         switch (ep->stat) {
 231         case NFS4ERR_BADHANDLE:
 232         case NFS4ERR_BAD_SEQID:
 233         case NFS4ERR_BAD_STATEID:
 234         case NFS4ERR_DELAY:
 235         case NFS4ERR_EXPIRED:
 236         case NFS4ERR_FHEXPIRED:
 237         case NFS4ERR_GRACE:
 238         case NFS4ERR_OLD_STATEID:
 239         case NFS4ERR_RESOURCE:
 240         case NFS4ERR_STALE_CLIENTID:
 241         case NFS4ERR_STALE_STATEID:
 242         case NFS4ERR_WRONGSEC:
 243         case NFS4ERR_STALE:
 244                 recov = 1;
 245                 break;
 246 #ifdef DEBUG
 247         case NFS4ERR_LEASE_MOVED:
 248         case NFS4ERR_MOVED:
 249                 zcmn_err(VFTOMI4(vfsp)->mi_zone->zone_id,
 250                     CE_WARN, "!Can't yet recover from NFS status %d",
 251                     ep->stat);
 252                 break;
 253 #endif
 254         }
 255
 256         return (recov);
 257 }
 258
 259 /*
 260  * Some operations such as DELEGRETURN want to avoid invoking
 261  * recovery actions that will only mark the file dead.  If
 262  * better handlers are invoked for any of these errors, this
 263  * routine should be modified.
 264  */
 265 int
 266 nfs4_recov_marks_dead(nfsstat4 status)
 267 {
 268         if (status == NFS4ERR_BAD_SEQID ||
 269             status == NFS4ERR_EXPIRED ||
 270             status == NFS4ERR_BAD_STATEID ||
 271             status == NFS4ERR_OLD_STATEID)
 272                 return (1);
 273         return (0);
 274 }
 275
 276 /*
 277  * Transfer the state recovery information in recovp to mi's resend queue,
 278  * and mark mi as having a lost state request.
 279  */
 280 static void
 281 nfs4_enqueue_lost_rqst(recov_info_t *recovp, mntinfo4_t *mi)
 282 {
 283         nfs4_lost_rqst_t *lrp = recovp->rc_lost_rqst;
 284
 285         ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
 286             nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
 287
 288         ASSERT(lrp != NULL && lrp->lr_op != 0);
 289
 290         NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
 291             "nfs4_enqueue_lost_rqst %p, op %d",
 292             (void *)lrp, lrp->lr_op));
 293
 294         mutex_enter(&mi->mi_lock);
 295         mi->mi_recovflags |= MI4R_LOST_STATE;
 296         if (lrp->lr_putfirst)
 297                 list_insert_head(&mi->mi_lost_state, lrp);
 298         else
 299                 list_insert_tail(&mi->mi_lost_state, lrp);
 300         recovp->rc_lost_rqst = NULL;
 301         mutex_exit(&mi->mi_lock);
 302
 303         nfs4_queue_event(RE_LOST_STATE, mi, NULL, lrp->lr_op, lrp->lr_vp,
 304             lrp->lr_dvp, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
 305 }
 306
 307 /*
 308  * Transfer the bad seqid recovery information in recovp to mi's
 309  * bad seqid queue, and mark mi as having a bad seqid request.
 310  */
 311 void
 312 enqueue_bseqid_rqst(recov_info_t *recovp, mntinfo4_t *mi)
 313 {
 314         ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
 315             nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
 316         ASSERT(recovp->rc_bseqid_rqst != NULL);
 317
 318         mutex_enter(&mi->mi_lock);
 319         mi->mi_recovflags |= MI4R_BAD_SEQID;
 320         list_insert_tail(&mi->mi_bseqid_list, recovp->rc_bseqid_rqst);
 321         recovp->rc_bseqid_rqst = NULL;
 322         mutex_exit(&mi->mi_lock);
 323 }
 324
 325 /*
 326  * Initiate recovery.
 327  *
 328  * The nfs4_error_t contains the return codes that triggered a recovery
 329  * attempt.  mi, vp1, and vp2 refer to the filesystem and files that were
 330  * being operated on.  vp1 and vp2 may be NULL.
 331  *
 332  * Multiple calls are okay.  If recovery is already underway, the call
 333  * updates the information about what state needs recovery but does not
 334  * start a new thread.  The caller should hold mi->mi_recovlock as a reader
 335  * for proper synchronization with any recovery thread.
 336  *
 337  * This will return TRUE if recovery was aborted, and FALSE otherwise.
 338  */
 339 bool_t
 340 nfs4_start_recovery(nfs4_error_t *ep, mntinfo4_t *mi, vnode_t *vp1,
 341     vnode_t *vp2, stateid4 *sid, nfs4_lost_rqst_t *lost_rqstp, nfs_opnum4 op,
 342     nfs4_bseqid_entry_t *bsep, vnode_t *moved_vp, char *moved_nm)
 343 {
 344         recov_info_t *recovp;
 345         nfs4_server_t *sp;
 346         bool_t abort = FALSE;
 347         bool_t gone = FALSE;
 348
 349         ASSERT(nfs_zone() == mi->mi_zone);
 350         mutex_enter(&mi->mi_lock);
 351         /*
 352          * If there is lost state, we need to kick off recovery even if the
 353          * filesystem has been unmounted or the zone is shutting down.
 354          */
 355         gone = FS_OR_ZONE_GONE4(mi->mi_vfsp);
 356         if (gone) {
 357                 ASSERT(ep->error != EINTR || lost_rqstp != NULL);
 358                 if (ep->error == EIO && lost_rqstp == NULL) {
 359                         /* failed due to forced unmount, no new lost state */
 360                         abort = TRUE;
 361                 }
 362                 if ((ep->error == 0 || ep->error == ETIMEDOUT) &&
 363                     !(mi->mi_recovflags & MI4R_LOST_STATE)) {
 364                         /* some other failure, no existing lost state */
 365                         abort = TRUE;
 366                 }
 367                 if (abort) {
 368                         mutex_exit(&mi->mi_lock);
 369                         NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
 370                             "nfs4_start_recovery: fs unmounted"));
 371                         return (TRUE);
 372                 }
 373         }
 374         mi->mi_in_recovery++;
 375         mutex_exit(&mi->mi_lock);
 376
 377         recovp = kmem_alloc(sizeof (recov_info_t), KM_SLEEP);
 378         recovp->rc_orig_errors = *ep;
 379         sp = find_nfs4_server(mi);
 380         errs_to_action(recovp, sp, mi, sid, lost_rqstp, gone, op, bsep);
 381         if (sp != NULL)
 382                 mutex_exit(&sp->s_lock);
 383         start_recovery(recovp, mi, vp1, vp2, sp, moved_vp, moved_nm);
 384         if (sp != NULL)
 385                 nfs4_server_rele(sp);
 386         return (FALSE);
 387 }
 388
 389 /*
 390  * Internal version of nfs4_start_recovery.  The difference is that the
 391  * caller specifies the recovery action, rather than the errors leading to
 392  * recovery.
 393  */
 394 static void
 395 start_recovery_action(nfs4_recov_t what, bool_t reboot, mntinfo4_t *mi,
 396     vnode_t *vp1, vnode_t *vp2)
 397 {
 398         recov_info_t *recovp;
 399
 400         ASSERT(nfs_zone() == mi->mi_zone);
 401         mutex_enter(&mi->mi_lock);
 402         mi->mi_in_recovery++;
 403         mutex_exit(&mi->mi_lock);
 404
 405         recovp = kmem_zalloc(sizeof (recov_info_t), KM_SLEEP);
 406         recovp->rc_action = what;
 407         recovp->rc_srv_reboot = reboot;
 408         recovp->rc_error = EIO;
 409         start_recovery(recovp, mi, vp1, vp2, NULL, NULL, NULL);
 410 }
 411
/*
 * Common back end for nfs4_start_recovery() and start_recovery_action().
 *
 * Takes over recovp: it is either passed to a newly created
 * nfs4_recov_thread() or freed here.  Both callers have already bumped
 * mi->mi_in_recovery; every path that does not create a thread undoes
 * that bump (and wakes mi_cv_in_recov waiters when it reaches zero) at
 * out_no_thread.
 *
 * Depending on recovp->rc_action this routine either
 *   - records what has to be done (MI4R_* bits in mi->mi_recovflags,
 *     and/or the lost / bad-seqid request queued on mi) and then leaves
 *     the switch to start a recovery thread, unless MI4_RECOV_ACTIV shows
 *     one is already running; or
 *   - deals with the action inline (bad state, filehandle recovery,
 *     stale handling, grace/delay waits) and exits via out_no_thread.
 *
 * sp is consulted only for NR_CLIENTID and may be NULL.  moved_vp and
 * moved_nm are stored in recovp for the recovery thread; no hold is taken
 * on moved_vp here.
 */
static void
start_recovery(recov_info_t *recovp, mntinfo4_t *mi,
    vnode_t *vp1, vnode_t *vp2, nfs4_server_t *sp,
    vnode_t *moved_vp, char *moved_nm)
{
        NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
            "start_recovery: mi %p, what %s", (void*)mi,
            nfs4_recov_action_to_str(recovp->rc_action)));

        /*
         * Bump the reference on the vfs so that we can pass it to the
         * recovery thread.
         */
        VFS_HOLD(mi->mi_vfsp);
        MI4_HOLD(mi);
        /* NR_STALE may rewrite rc_action to NR_FAILOVER and come back here */
again:
        switch (recovp->rc_action) {
        case NR_FAILOVER:
                ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
                    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
                /*
                 * Only one server in the list: nothing to fail over to.
                 * NOTE(review): if rc_lost_rqst is non-NULL on this path it
                 * is neither enqueued nor released before recovp is freed
                 * at out_no_thread -- confirm this cannot happen.
                 */
                if (mi->mi_servers->sv_next == NULL)
                        goto out_no_thread;
                mutex_enter(&mi->mi_lock);
                mi->mi_recovflags |= MI4R_NEED_NEW_SERVER;
                mutex_exit(&mi->mi_lock);

                if (recovp->rc_lost_rqst != NULL)
                        nfs4_enqueue_lost_rqst(recovp, mi);
                break;

        case NR_CLIENTID:
                /*
                 * If the filesystem has been unmounted, punt.
                 */
                if (sp == NULL)
                        goto out_no_thread;

                /*
                 * If nobody else is working on the clientid, mark the
                 * clientid as being no longer set.  Then mark the specific
                 * filesystem being worked on.
                 */
                if (!nfs4_server_in_recovery(sp)) {
                        mutex_enter(&sp->s_lock);
                        sp->s_flags &= ~N4S_CLIENTID_SET;
                        mutex_exit(&sp->s_lock);
                }
                ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
                    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
                mutex_enter(&mi->mi_lock);
                mi->mi_recovflags |= MI4R_NEED_CLIENTID;
                if (recovp->rc_srv_reboot)
                        mi->mi_recovflags |= MI4R_SRV_REBOOT;
                mutex_exit(&mi->mi_lock);
                break;

        case NR_OPENFILES:
                ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
                    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
                mutex_enter(&mi->mi_lock);
                mi->mi_recovflags |= MI4R_REOPEN_FILES;
                if (recovp->rc_srv_reboot)
                        mi->mi_recovflags |= MI4R_SRV_REBOOT;
                mutex_exit(&mi->mi_lock);
                break;

        case NR_WRONGSEC:
                ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
                    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
                mutex_enter(&mi->mi_lock);
                mi->mi_recovflags |= MI4R_NEED_SECINFO;
                mutex_exit(&mi->mi_lock);
                break;

        case NR_EXPIRED:
                if (vp1 != NULL)
                        recov_badstate(recovp, vp1, NFS4ERR_EXPIRED);
                if (vp2 != NULL)
                        recov_badstate(recovp, vp2, NFS4ERR_EXPIRED);
                goto out_no_thread;     /* no further recovery possible */

        case NR_BAD_STATEID:
                if (vp1 != NULL)
                        recov_badstate(recovp, vp1, NFS4ERR_BAD_STATEID);
                if (vp2 != NULL)
                        recov_badstate(recovp, vp2, NFS4ERR_BAD_STATEID);
                goto out_no_thread;     /* no further recovery possible */

        case NR_FHEXPIRED:
        case NR_BADHANDLE:
                /*
                 * These cases never reach the common throttling code after
                 * the switch, so throttle here.
                 */
                if (vp1 != NULL)
                        recov_throttle(recovp, vp1);
                if (vp2 != NULL)
                        recov_throttle(recovp, vp2);
                /*
                 * Recover the filehandle now, rather than using a
                 * separate thread.  We can do this because filehandle
                 * recovery is independent of any other state, and because
                 * we know that we are not competing with the recovery
                 * thread at this time.  recov_filehandle will deal with
                 * threads that are competing to recover this filehandle.
                 */
                ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
                    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
                if (vp1 != NULL)
                        recov_filehandle(recovp->rc_action, mi, vp1);
                if (vp2 != NULL)
                        recov_filehandle(recovp->rc_action, mi, vp2);
                goto out_no_thread;     /* no further recovery needed */

        case NR_STALE:
                /*
                 * NFS4ERR_STALE handling
                 * recov_stale() could set MI4R_NEED_NEW_SERVER to
                 * indicate that we can and should failover.
                 */
                ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
                    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));

                if (vp1 != NULL)
                        recov_stale(mi, vp1);
                if (vp2 != NULL)
                        recov_stale(mi, vp2);
                mutex_enter(&mi->mi_lock);
                if ((mi->mi_recovflags & MI4R_NEED_NEW_SERVER) == 0) {
                        mutex_exit(&mi->mi_lock);
                        goto out_no_thread;
                }
                mutex_exit(&mi->mi_lock);
                /* failover was requested: redo the switch as NR_FAILOVER */
                recovp->rc_action = NR_FAILOVER;
                goto again;

        case NR_BAD_SEQID:
                /*
                 * With a saved bad-seqid request, queue it and let the
                 * recovery thread handle it; without one, fall back to
                 * recov_badstate() on the vnodes.
                 */
                if (recovp->rc_bseqid_rqst) {
                        enqueue_bseqid_rqst(recovp, mi);
                        break;
                }

                if (vp1 != NULL)
                        recov_badstate(recovp, vp1, NFS4ERR_BAD_SEQID);
                if (vp2 != NULL)
                        recov_badstate(recovp, vp2, NFS4ERR_BAD_SEQID);
                goto out_no_thread; /* no further recovery possible */

        case NR_OLDSTATEID:
                if (vp1 != NULL)
                        recov_badstate(recovp, vp1, NFS4ERR_OLD_STATEID);
                if (vp2 != NULL)
                        recov_badstate(recovp, vp2, NFS4ERR_OLD_STATEID);
                goto out_no_thread;     /* no further recovery possible */

        case NR_GRACE:
                nfs4_set_grace_wait(mi);
                goto out_no_thread; /* no further action required for GRACE */

        case NR_DELAY:
                /* only vp1 is given a delay wait; vp2 is not touched */
                if (vp1)
                        nfs4_set_delay_wait(vp1);
                goto out_no_thread; /* no further action required for DELAY */

        case NR_LOST_STATE_RQST:
        case NR_LOST_LOCK:
                nfs4_enqueue_lost_rqst(recovp, mi);
                break;
        default:
                /* unknown action: log it and do nothing else */
                nfs4_queue_event(RE_UNEXPECTED_ACTION, mi, NULL,
                    recovp->rc_action, NULL, NULL, 0, NULL, 0, TAG_NONE,
                    TAG_NONE, 0, 0);
                goto out_no_thread;
        }

        /*
         * If either file recently went through the same recovery, wait
         * awhile.  This is in case there is some sort of bug; we might not
         * be able to recover properly, but at least we won't bombard the
         * server with calls, and we won't tie up the client.
         */
        if (vp1 != NULL)
                recov_throttle(recovp, vp1);
        if (vp2 != NULL)
                recov_throttle(recovp, vp2);

        /*
         * If there's already a recovery thread, don't start another one.
         * The flags and queued requests recorded above remain set on mi.
         */

        mutex_enter(&mi->mi_lock);
        if (mi->mi_flags & MI4_RECOV_ACTIV) {
                mutex_exit(&mi->mi_lock);
                goto out_no_thread;
        }
        mi->mi_flags |= MI4_RECOV_ACTIV;
        mutex_exit(&mi->mi_lock);
        NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
            "start_recovery: starting new thread for mi %p", (void*)mi));

        /*
         * Fill in what the thread needs.  Each non-NULL vnode gets its own
         * hold so that it stays valid for the thread.
         */
        recovp->rc_mi = mi;
        recovp->rc_vp1 = vp1;
        if (vp1 != NULL) {
                ASSERT(VTOMI4(vp1) == mi);
                VN_HOLD(recovp->rc_vp1);
        }
        recovp->rc_vp2 = vp2;
        if (vp2 != NULL) {
                ASSERT(VTOMI4(vp2) == mi);
                VN_HOLD(recovp->rc_vp2);
        }
        recovp->rc_moved_vp = moved_vp;
        recovp->rc_moved_nm = moved_nm;

        /*
         * The vfs/mi holds, the vnode holds, the mi_in_recovery count and
         * recovp itself are not released on this path; presumably
         * nfs4_recov_thread() (not visible here) cleans them up.
         */
        (void) zthread_create(NULL, 0, nfs4_recov_thread, recovp, 0,
            minclsyspri);
        return;

        /* not reached by thread creating call */
out_no_thread:
        /* undo the caller's mi_in_recovery bump and wake any waiters */
        mutex_enter(&mi->mi_lock);
        mi->mi_in_recovery--;
        if (mi->mi_in_recovery == 0)
                cv_broadcast(&mi->mi_cv_in_recov);
        mutex_exit(&mi->mi_lock);

        /* drop the holds taken at the top of this function */
        VFS_RELE(mi->mi_vfsp);
        MI4_RELE(mi);
        /*
         * Free up resources that were allocated for us.
         */
        kmem_free(recovp, sizeof (recov_info_t));
}
 641
 642 static int
 643 nfs4_check_recov_err(vnode_t *vp, nfs4_op_hint_t op,
 644     nfs4_recov_state_t *rsp, int retry_err_cnt, char *str)
 645 {
 646         rnode4_t *rp;
 647         int error = 0;
 648         int exempt;
 649
 650         if (vp == NULL)
 651                 return (0);
 652
 653         exempt = (op == OH_CLOSE || op == OH_LOCKU || op == OH_DELEGRETURN);
 654         rp = VTOR4(vp);
 655         mutex_enter(&rp->r_statelock);
 656
 657         /*
 658          * If there was a recovery error, then allow op hints "exempt" from
 659          * recov errors to retry (currently 3 times).  Either r_error or
 660          * EIO is returned for non-exempt op hints.
 661          */
 662         if (rp->r_flags & R4RECOVERR) {
 663                 if (exempt && rsp->rs_num_retry_despite_err <=
 664                     nfs4_max_recov_error_retry) {
 665
 666                         /*
 667                          * Check to make sure that we haven't already inc'd
 668                          * rs_num_retry_despite_err for current nfs4_start_fop
 669                          * instance.  We don't want to double inc (if we were
 670                          * called with vp2, then the vp1 call could have
 671                          * already incremented.
 672                          */
 673                         if (retry_err_cnt == rsp->rs_num_retry_despite_err)
 674                                 rsp->rs_num_retry_despite_err++;
 675
 676                         NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
 677                             "nfs4_start_fop: %s %p DEAD, cnt=%d", str,
 678                             (void *)vp, rsp->rs_num_retry_despite_err));
 679                 } else {
 680                         error = (rp->r_error ? rp->r_error : EIO);
 681                         /*
 682                          * An ESTALE error on a non-regular file is not
 683                          * "sticky".  Return the ESTALE error once, but
 684                          * clear the condition to allow future operations
 685                          * to go OTW.  This will allow the client to
 686                          * recover if the server has merely unshared then
 687                          * re-shared the file system.  For regular files,
 688                          * the unshare has destroyed the open state at the
 689                          * server and we aren't willing to do a reopen (yet).
 690                          */
 691                         if (error == ESTALE && vp->v_type != VREG) {
 692                                 rp->r_flags &=
 693                                     ~(R4RECOVERR|R4RECOVERRP|R4STALE);
 694                                 rp->r_error = 0;
 695                                 error = ESTALE;
 696                         }
 697                         NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
 698                             "nfs4_start_fop: %s %p DEAD, cnt=%d error=%d",
 699                             str, (void *)vp,
 700                             rsp->rs_num_retry_despite_err, error));
 701                 }
 702         }
 703
 704         mutex_exit(&rp->r_statelock);
 705         return (error);
 706 }
 707
 708 /*
 709  * Initial setup code that every operation should call if it might invoke
 710  * client recovery.  Can block waiting for recovery to finish on a
 711  * filesystem.  Either vnode ptr can be NULL.
 712  *
 713  * Returns 0 if there are no outstanding errors.  Can return an
 714  * errno value under various circumstances (e.g., failed recovery, or
 715  * interrupted while waiting for recovery to finish).
 716  *
 717  * There must be a corresponding call to nfs4_end_op() to free up any locks
 718  * or resources allocated by this call (assuming this call succeeded),
 719  * using the same rsp that's passed in here.
 720  *
 721  * The open and lock seqid synchronization must be stopped before calling this
 722  * function, as it could lead to deadlock when trying to reopen a file or
 723  * reclaim a lock.  The synchronization is obtained with calls to:
 724  *   nfs4_start_open_seqid_sync()
 725  *   nfs4_start_lock_seqid_sync()
 726  *
 727  * *startrecovp is set TRUE if the caller should not bother with the
 728  * over-the-wire call, and just initiate recovery for the given request.
 729  * This is typically used for state-releasing ops if the filesystem has
 730  * been forcibly unmounted.  startrecovp may be NULL for
 731  * non-state-releasing ops.
 732  */
 733
 734 int
 735 nfs4_start_fop(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2, nfs4_op_hint_t op,
 736     nfs4_recov_state_t *rsp, bool_t *startrecovp)
 737 {
 738         int error = 0, rerr_cnt;
 739         nfs4_server_t *sp = NULL;
 740         nfs4_server_t *tsp;
 741         nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
 742         uint_t droplock_cnt;
 743 #ifdef DEBUG
 744         void *fop_caller;
 745 #endif
 746
 747         ASSERT(vp1 == NULL || vp1->v_vfsp == mi->mi_vfsp);
 748         ASSERT(vp2 == NULL || vp2->v_vfsp == mi->mi_vfsp);
 749
 750 #ifdef  DEBUG
 751         if ((fop_caller = tsd_get(nfs4_tsd_key)) != NULL) {
 752                 cmn_err(CE_PANIC, "Missing nfs4_end_fop: last caller %p",
 753                     fop_caller);
 754         }
 755         (void) tsd_set(nfs4_tsd_key, caller());
 756 #endif
 757
 758         rsp->rs_sp = NULL;
 759         rsp->rs_flags &= ~NFS4_RS_RENAME_HELD;
 760         rerr_cnt = rsp->rs_num_retry_despite_err;
 761
 762         /*
 763          * Process the items that may delay() based on server response
 764          */
 765         error = nfs4_wait_for_grace(mi, rsp);
 766         if (error)
 767                 goto out;
 768
 769         if (vp1 != NULL) {
 770                 error = nfs4_wait_for_delay(vp1, rsp);
 771                 if (error)
 772                         goto out;
 773         }
 774
 775         /* Wait for a delegation recall to complete. */
 776
 777         error = wait_for_recall(vp1, vp2, op, rsp);
 778         if (error)
 779                 goto out;
 780
 781         /*
 782          * Wait for any current recovery actions to finish.  Note that a
 783          * recovery thread can still start up after wait_for_recovery()
 784          * finishes.  We don't block out recovery operations until we
 785          * acquire s_recovlock and mi_recovlock.
 786          */
 787         error = wait_for_recovery(mi, op);
 788         if (error)
 789                 goto out;
 790
 791         /*
 792          * Check to see if the rnode is already marked with a
 793          * recovery error.  If so, return it immediately.  But
 794          * always pass CLOSE, LOCKU, and DELEGRETURN so we can
 795          * clean up state on the server.
 796          */
 797
 798         if (vp1 != NULL) {
 799                 if (error = nfs4_check_recov_err(vp1, op, rsp, rerr_cnt, "vp1"))
 800                         goto out;
 801                 nfs4_check_remap(mi, vp1, NFS4_REMAP_CKATTRS, &e);
 802         }
 803
 804         if (vp2 != NULL) {
 805                 if (error = nfs4_check_recov_err(vp2, op, rsp, rerr_cnt, "vp2"))
 806                         goto out;
 807                 nfs4_check_remap(mi, vp2, NFS4_REMAP_CKATTRS, &e);
 808         }
 809
 810         /*
 811          * The lock order calls for us to acquire s_recovlock before
 812          * mi_recovlock, but we have to hold mi_recovlock to look up sp (to
 813          * prevent races with the failover/migration code).  So acquire
 814          * mi_recovlock, look up sp, drop mi_recovlock, acquire
 815          * s_recovlock and mi_recovlock, then verify that sp is still the
 816          * right object.  XXX Can we find a simpler way to deal with this?
 817          */
 818         if (nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
 819             mi->mi_flags & MI4_INT)) {
 820                 error = EINTR;
 821                 goto out;
 822         }
 823 get_sp:
 824         sp = find_nfs4_server(mi);
 825         if (sp != NULL) {
 826                 sp->s_otw_call_count++;
 827                 mutex_exit(&sp->s_lock);
 828                 droplock_cnt = mi->mi_srvset_cnt;
 829         }
 830         nfs_rw_exit(&mi->mi_recovlock);
 831
 832         if (sp != NULL) {
 833                 if (nfs_rw_enter_sig(&sp->s_recovlock, RW_READER,
 834                     mi->mi_flags & MI4_INT)) {
 835                         error = EINTR;
 836                         goto out;
 837                 }
 838         }
 839         if (nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
 840             mi->mi_flags & MI4_INT)) {
 841                 if (sp != NULL)
 842                         nfs_rw_exit(&sp->s_recovlock);
 843                 error = EINTR;
 844                 goto out;
 845         }
 846         /*
 847          * If the mntinfo4_t hasn't changed nfs4_sever_ts then
 848          * there's no point in double checking to make sure it
 849          * has switched.
 850          */
 851         if (sp == NULL || droplock_cnt != mi->mi_srvset_cnt) {
 852                 tsp = find_nfs4_server(mi);
 853                 if (tsp != sp) {
 854                         /* try again */
 855                         if (tsp != NULL) {
 856                                 mutex_exit(&tsp->s_lock);
 857                                 nfs4_server_rele(tsp);
 858                                 tsp = NULL;
 859                         }
 860                         if (sp != NULL) {
 861                                 nfs_rw_exit(&sp->s_recovlock);
 862                                 mutex_enter(&sp->s_lock);
 863                                 sp->s_otw_call_count--;
 864                                 mutex_exit(&sp->s_lock);
 865                                 nfs4_server_rele(sp);
 866                                 sp = NULL;
 867                         }
 868                         goto get_sp;
 869                 } else {
 870                         if (tsp != NULL) {
 871                                 mutex_exit(&tsp->s_lock);
 872                                 nfs4_server_rele(tsp);
 873                                 tsp = NULL;
 874                         }
 875                 }
 876         }
 877
 878         if (sp != NULL) {
 879                 rsp->rs_sp = sp;
 880         }
 881
 882         /*
 883          * If the fileystem uses volatile filehandles, obtain a lock so
 884          * that we synchronize with renames.  Exception: mount operations
 885          * can change mi_fh_expire_type, which could be a problem, since
 886          * the end_op code needs to be consistent with the start_op code
 887          * about mi_rename_lock.  Since mounts don't compete with renames,
 888          * it's simpler to just not acquire the rename lock for mounts.
 889          */
 890         if (NFS4_VOLATILE_FH(mi) && op != OH_MOUNT) {
 891                 if (nfs_rw_enter_sig(&mi->mi_rename_lock,
 892                     op == OH_VFH_RENAME ? RW_WRITER : RW_READER,
 893                     mi->mi_flags & MI4_INT)) {
 894                         nfs_rw_exit(&mi->mi_recovlock);
 895                         if (sp != NULL)
 896                                 nfs_rw_exit(&sp->s_recovlock);
 897                         error = EINTR;
 898                         goto out;
 899                 }
 900                 rsp->rs_flags |= NFS4_RS_RENAME_HELD;
 901         }
 902
 903         if (OH_IS_STATE_RELE(op)) {
 904                 /*
 905                  * For forced unmount, letting the request proceed will
 906                  * almost always delay response to the user, so hand it off
 907                  * to the recovery thread.  For exiting lwp's, we don't
 908                  * have a good way to tell if the request will hang.  We
 909                  * generally want processes to handle their own requests so
 910                  * that they can be done in parallel, but if there is
 911                  * already a recovery thread, hand the request off to it.
 912                  * This will improve user response at no cost to overall
 913                  * system throughput.  For zone shutdown, we'd prefer
 914                  * the recovery thread to handle this as well.
 915                  */
 916                 ASSERT(startrecovp != NULL);
 917                 mutex_enter(&mi->mi_lock);
 918                 if (FS_OR_ZONE_GONE4(mi->mi_vfsp))
 919                         *startrecovp = TRUE;
 920                 else if ((curthread->t_proc_flag & TP_LWPEXIT) &&
 921                     (mi->mi_flags & MI4_RECOV_ACTIV))
 922                         *startrecovp = TRUE;
 923                 else
 924                         *startrecovp = FALSE;
 925                 mutex_exit(&mi->mi_lock);
 926         } else
 927                 if (startrecovp != NULL)
 928                         *startrecovp = FALSE;
 929
 930         ASSERT(error == 0);
 931         return (error);
 932
 933 out:
 934         ASSERT(error != 0);
 935         if (sp != NULL) {
 936                 mutex_enter(&sp->s_lock);
 937                 sp->s_otw_call_count--;
 938                 mutex_exit(&sp->s_lock);
 939                 nfs4_server_rele(sp);
 940                 rsp->rs_sp = NULL;
 941         }
 942         nfs4_end_op_recall(vp1, vp2, rsp);
 943
 944 #ifdef  DEBUG
 945         (void) tsd_set(nfs4_tsd_key, NULL);
 946 #endif
 947         return (error);
 948 }
 949
 950 /*
 951  * It is up to the caller to determine if rsp->rs_sp being NULL
 952  * is detrimental or not.
 953  */
 954 int
 955 nfs4_start_op(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2,
 956     nfs4_recov_state_t *rsp)
 957 {
 958         ASSERT(rsp->rs_num_retry_despite_err == 0);
 959         rsp->rs_num_retry_despite_err = 0;
 960         return (nfs4_start_fop(mi, vp1, vp2, OH_OTHER, rsp, NULL));
 961 }
 962
 963 /*
 964  * Release any resources acquired by nfs4_start_op().
 965  * 'sp' should be the nfs4_server pointer returned by nfs4_start_op().
 966  *
 967  * The operation hint is used to avoid a deadlock by bypassing delegation
 968  * return logic for writes, which are done while returning a delegation.
 969  */
 970
 971 void
 972 nfs4_end_fop(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2, nfs4_op_hint_t op,
 973     nfs4_recov_state_t *rsp, bool_t needs_recov)
 974 {
 975         nfs4_server_t *sp = rsp->rs_sp;
 976         rnode4_t *rp = NULL;
 977
 978
 979 #ifdef  DEBUG
 980         ASSERT(tsd_get(nfs4_tsd_key) != NULL);
 981         (void) tsd_set(nfs4_tsd_key, NULL);
 982 #endif
 983
 984         nfs4_end_op_recall(vp1, vp2, rsp);
 985
 986         if (rsp->rs_flags & NFS4_RS_RENAME_HELD)
 987                 nfs_rw_exit(&mi->mi_rename_lock);
 988
 989         if (!needs_recov) {
 990                 if (rsp->rs_flags & NFS4_RS_DELAY_MSG) {
 991                         /* may need to clear the delay interval */
 992                         if (vp1 != NULL) {
 993                                 rp = VTOR4(vp1);
 994                                 mutex_enter(&rp->r_statelock);
 995                                 rp->r_delay_interval = 0;
 996                                 mutex_exit(&rp->r_statelock);
 997                         }
 998                 }
 999                 rsp->rs_flags &= ~(NFS4_RS_GRACE_MSG|NFS4_RS_DELAY_MSG);
1000         }
1001
1002         /*
1003          * If the corresponding nfs4_start_op() found a sp,
1004          * then there must still be a sp.
1005          */
1006         if (sp != NULL) {
1007                 nfs_rw_exit(&mi->mi_recovlock);
1008                 nfs_rw_exit(&sp->s_recovlock);
1009                 mutex_enter(&sp->s_lock);
1010                 sp->s_otw_call_count--;
1011                 cv_broadcast(&sp->s_cv_otw_count);
1012                 mutex_exit(&sp->s_lock);
1013                 nfs4_server_rele(sp);
1014         } else {
1015                 nfs_rw_exit(&mi->mi_recovlock);
1016         }
1017 }
1018
1019 void
1020 nfs4_end_op(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2,
1021     nfs4_recov_state_t *rsp, bool_t needrecov)
1022 {
1023         nfs4_end_fop(mi, vp1, vp2, OH_OTHER, rsp, needrecov);
1024 }
1025
1026 /*
1027  * If the filesystem is going through client recovery, block until
1028  * finished.
1029  * Exceptions:
1030  * - state-releasing ops (CLOSE, LOCKU, DELEGRETURN) are allowed to proceed
1031  *   if the filesystem has been forcibly unmounted or the lwp is exiting.
1032  *
1033  * Return value:
1034  * - 0 if no errors
1035  * - EINTR if the call was interrupted
1036  * - EIO if the filesystem has been forcibly unmounted (non-state-releasing
1037  *   op)
1038  * - the errno value from the recovery thread, if recovery failed
1039  */
1040
1041 static int
1042 wait_for_recovery(mntinfo4_t *mi, nfs4_op_hint_t op_hint)
1043 {
1044         int error = 0;
1045
1046         mutex_enter(&mi->mi_lock);
1047
1048         while (mi->mi_recovflags != 0) {
1049                 klwp_t *lwp = ttolwp(curthread);
1050
1051                 if ((mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) ||
1052                     (mi->mi_flags & MI4_RECOV_FAIL))
1053                         break;
1054                 if (OH_IS_STATE_RELE(op_hint) &&
1055                     (curthread->t_proc_flag & TP_LWPEXIT))
1056                         break;
1057
1058                 if (lwp != NULL)
1059                         lwp->lwp_nostop++;
1060                 /* XXX - use different cv? */
1061                 if (cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock) == 0) {
1062                         error = EINTR;
1063                         if (lwp != NULL)
1064                                 lwp->lwp_nostop--;
1065                         break;
1066                 }
1067                 if (lwp != NULL)
1068                         lwp->lwp_nostop--;
1069         }
1070
1071         if ((mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) &&
1072             !OH_IS_STATE_RELE(op_hint)) {
1073                 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1074                     "wait_for_recovery: forced unmount"));
1075                 error = EIO;
1076         } else if (mi->mi_flags & MI4_RECOV_FAIL) {
1077                 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1078                     "wait_for_recovery: fail since RECOV FAIL"));
1079                 error = mi->mi_error;
1080         }
1081
1082         mutex_exit(&mi->mi_lock);
1083
1084         return (error);
1085 }
1086
1087 /*
1088  * If the client received NFS4ERR_GRACE for this particular mount,
1089  * the client blocks here until it is time to try again.
1090  *
1091  * Return value:
1092  * - 0 if wait was successful
1093  * - EINTR if the call was interrupted
1094  */
1095
1096 int
1097 nfs4_wait_for_grace(mntinfo4_t *mi, nfs4_recov_state_t *rsp)
1098 {
1099         int error = 0;
1100         time_t curtime, time_to_wait;
1101
1102         /* do a unprotected check to reduce mi_lock contention */
1103         if (mi->mi_grace_wait != 0) {
1104                 mutex_enter(&mi->mi_lock);
1105
1106                 if (mi->mi_grace_wait != 0) {
1107                         if (!(rsp->rs_flags & NFS4_RS_GRACE_MSG))
1108                                 rsp->rs_flags |= NFS4_RS_GRACE_MSG;
1109
1110                         curtime = gethrestime_sec();
1111
1112                         if (curtime < mi->mi_grace_wait) {
1113
1114                                 time_to_wait = mi->mi_grace_wait - curtime;
1115
1116                                 mutex_exit(&mi->mi_lock);
1117
1118                                 ddi_sleep(time_to_wait);
1119
1120                                 curtime = gethrestime_sec();
1121
1122                                 mutex_enter(&mi->mi_lock);
1123
1124                                 if (curtime >= mi->mi_grace_wait)
1125                                         mi->mi_grace_wait = 0;
1126                         } else {
1127                                 mi->mi_grace_wait = 0;
1128                         }
1129                 }
1130                 mutex_exit(&mi->mi_lock);
1131         }
1132
1133         return (error);
1134 }
1135
1136 /*
1137  * If the client received NFS4ERR_DELAY for an operation on a vnode,
1138  * the client blocks here until it is time to try again.
1139  *
1140  * Return value:
1141  * - 0 if wait was successful
1142  * - EINTR if the call was interrupted
1143  */
1144
1145 int
1146 nfs4_wait_for_delay(vnode_t *vp, nfs4_recov_state_t *rsp)
1147 {
1148         int error = 0;
1149         time_t curtime, time_to_wait;
1150         rnode4_t *rp;
1151
1152         ASSERT(vp != NULL);
1153
1154         rp = VTOR4(vp);
1155
1156         /* do a unprotected check to reduce r_statelock contention */
1157         if (rp->r_delay_wait != 0) {
1158                 mutex_enter(&rp->r_statelock);
1159
1160                 if (rp->r_delay_wait != 0) {
1161
1162                         if (!(rsp->rs_flags & NFS4_RS_DELAY_MSG)) {
1163                                 rsp->rs_flags |= NFS4_RS_DELAY_MSG;
1164                                 nfs4_mi_kstat_inc_delay(VTOMI4(vp));
1165                         }
1166
1167                         curtime = gethrestime_sec();
1168
1169                         if (curtime < rp->r_delay_wait) {
1170
1171                                 time_to_wait = rp->r_delay_wait - curtime;
1172
1173                                 mutex_exit(&rp->r_statelock);
1174
1175                                 ddi_sleep(time_to_wait);
1176
1177                                 curtime = gethrestime_sec();
1178
1179                                 mutex_enter(&rp->r_statelock);
1180
1181                                 if (curtime >= rp->r_delay_wait)
1182                                         rp->r_delay_wait = 0;
1183                         } else {
1184                                 rp->r_delay_wait = 0;
1185                         }
1186                 }
1187                 mutex_exit(&rp->r_statelock);
1188         }
1189
1190         return (error);
1191 }
1192
/*
 * The recovery thread.
 *
 * Runs recovery for the filesystem described by recovp->rc_mi.  After
 * registering itself in mi_recovthread and looking up the current
 * nfs4_server_t, it loops until there is nothing left to do (or recovery
 * has been marked as failed).  Each pass handles, in this order:
 *   1. a filesystem/zone that is gone (forced unmount, zone shutdown),
 *   2. selection of a new server (MI4R_NEED_NEW_SERVER),
 *   3. clientid recovery,
 *   4. security information (MI4R_NEED_SECINFO),
 *   5. bad seqid recovery (MI4R_BAD_SEQID),
 *   6. reopening files (MI4R_REOPEN_FILES),
 *   7. resending queued lost-state requests (MI4R_LOST_STATE),
 *   8. the "are we done?" check, which also discards any remaining lost
 *      requests when finishing.
 *
 * On the way out the thread releases everything that was handed to it:
 * the server reference, the vnode holds in recovp, the mi_in_recovery
 * count, the vfs and mntinfo4 holds, and recovp itself.  It does not
 * return; it ends in zthread_exit().
 */

static void
nfs4_recov_thread(recov_info_t *recovp)
{
        mntinfo4_t *mi = recovp->rc_mi;
        nfs4_server_t *sp;
        int done = 0, error = 0;
        bool_t recov_fail = FALSE;
        callb_cpr_t cpr_info;
        kmutex_t cpr_lock;

        /* Log the start of recovery along with the current recovery flags. */
        nfs4_queue_event(RE_START, mi, NULL, mi->mi_recovflags,
            recovp->rc_vp1, recovp->rc_vp2, 0, NULL, 0, TAG_NONE, TAG_NONE,
            0, 0);

        /* CPR callback state; torn down by CALLB_CPR_EXIT() at the end. */
        mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
        CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Recov");

        mutex_enter(&mi->mi_lock);
        mi->mi_recovthread = curthread;
        mutex_exit(&mi->mi_lock);

        /*
         * We don't really need protection here against failover or
         * migration, since the current thread is the one that would make
         * any changes, but hold mi_recovlock anyway for completeness (and
         * to satisfy any ASSERTs).
         *
         * find_nfs4_server() hands sp back with s_lock held, hence the
         * mutex_exit() here.  The reference it presumably also takes is
         * the one given back by nfs4_server_rele() after the loop.
         */
        (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
        sp = find_nfs4_server(mi);
        if (sp != NULL)
                mutex_exit(&sp->s_lock);
        nfs_rw_exit(&mi->mi_recovlock);

        /*
         * Do any necessary recovery, based on the information in recovp
         * and any recovery flags.
         */

        do {
                mutex_enter(&mi->mi_lock);
                if (FS_OR_ZONE_GONE4(mi->mi_vfsp)) {
                        bool_t activesrv;

                        NFS4_DEBUG(nfs4_client_recov_debug &&
                            mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED, (CE_NOTE,
                            "nfs4_recov_thread: file system has been "
                            "unmounted"));
                        NFS4_DEBUG(nfs4_client_recov_debug &&
                            zone_status_get(curproc->p_zone) >=
                            ZONE_IS_SHUTTING_DOWN, (CE_NOTE,
                            "nfs4_recov_thread: zone shutting down"));
                        /*
                         * If the server has lost its state for us and
                         * the filesystem is unmounted, then the filesystem
                         * can be tossed, even if there are lost lock or
                         * lost state calls in the recovery queue.
                         */
                        if (mi->mi_recovflags &
                            (MI4R_NEED_CLIENTID | MI4R_REOPEN_FILES)) {
                                NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
                                "nfs4_recov_thread: bailing out"));
                                mi->mi_flags |= MI4_RECOV_FAIL;
                                mi->mi_error = recovp->rc_error;
                                recov_fail = TRUE;
                        }
                        /*
                         * We don't know if the server has any state for
                         * us, and the filesystem has been unmounted.  If
                         * there are "lost state" recovery items, keep
                         * trying to process them until there are no more
                         * mounted filesystems for the server.  Otherwise,
                         * bail out.  The reason we don't mark the
                         * filesystem as failing recovery is in case we
                         * have to do "lost state" recovery later (e.g., a
                         * user process exits).
                         */
                        if (!(mi->mi_recovflags & MI4R_LOST_STATE)) {
                                /* Leaves the do/while loop immediately. */
                                done = 1;
                                mutex_exit(&mi->mi_lock);
                                break;
                        }
                        mutex_exit(&mi->mi_lock);

                        /*
                         * When sp is non-NULL, s_lock is taken here and
                         * stays held until the mutex_exit() at the bottom
                         * of this branch.
                         */
                        if (sp == NULL)
                                activesrv = FALSE;
                        else {
                                mutex_enter(&sp->s_lock);
                                activesrv = nfs4_fs_active(sp);
                        }
                        if (!activesrv) {
                                NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
                                    "no active fs for server %p",
                                    (void *)sp));
                                mutex_enter(&mi->mi_lock);
                                mi->mi_flags |= MI4_RECOV_FAIL;
                                mi->mi_error = recovp->rc_error;
                                mutex_exit(&mi->mi_lock);
                                recov_fail = TRUE;
                                if (sp != NULL) {
                                        /*
                                         * Mark the server instance as
                                         * dead, so that nobody will attach
                                         * a new filesystem.
                                         */
                                        nfs4_mark_srv_dead(sp);
                                }
                        }
                        if (sp != NULL)
                                mutex_exit(&sp->s_lock);
                } else {
                        mutex_exit(&mi->mi_lock);
                }

                /*
                 * Check if we need to select a new server for a
                 * failover.  Choosing a new server will force at
                 * least a check of the clientid.
                 */
                mutex_enter(&mi->mi_lock);
                if (!recov_fail &&
                    (mi->mi_recovflags & MI4R_NEED_NEW_SERVER)) {
                        mutex_exit(&mi->mi_lock);
                        /* May replace sp and may set recov_fail. */
                        recov_newserver(recovp, &sp, &recov_fail);
                } else
                        mutex_exit(&mi->mi_lock);

                /*
                 * Check if we need to recover the clientid.  This
                 * must be done before file and lock recovery, and it
                 * potentially affects the recovery threads for other
                 * filesystems, so it gets special treatment.
                 */
                if (sp != NULL && recov_fail == FALSE) {
                        mutex_enter(&sp->s_lock);
                        if (!(sp->s_flags & N4S_CLIENTID_SET)) {
                                mutex_exit(&sp->s_lock);
                                recov_clientid(recovp, sp);
                        } else {
                                /*
                                 * Unset this flag in case another recovery
                                 * thread successfully recovered the clientid
                                 * for us already.
                                 */
                                mutex_enter(&mi->mi_lock);
                                mi->mi_recovflags &= ~MI4R_NEED_CLIENTID;
                                mutex_exit(&mi->mi_lock);
                                mutex_exit(&sp->s_lock);
                        }
                }

                /*
                 * Check if we need to get the security information.
                 */
                mutex_enter(&mi->mi_lock);
                if ((mi->mi_recovflags & MI4R_NEED_SECINFO) &&
                    !(mi->mi_flags & MI4_RECOV_FAIL)) {
                        mutex_exit(&mi->mi_lock);
                        (void) nfs_rw_enter_sig(&mi->mi_recovlock,
                            RW_WRITER, 0);
                        error = nfs4_secinfo_recov(recovp->rc_mi,
                            recovp->rc_vp1, recovp->rc_vp2);
                        /*
                         * If error, nothing more can be done, stop
                         * the recovery.
                         */
                        if (error) {
                                mutex_enter(&mi->mi_lock);
                                mi->mi_flags |= MI4_RECOV_FAIL;
                                mi->mi_error = recovp->rc_error;
                                mutex_exit(&mi->mi_lock);
                                nfs4_queue_event(RE_WRONGSEC, mi, NULL,
                                    error, recovp->rc_vp1, recovp->rc_vp2,
                                    0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
                        }
                        nfs_rw_exit(&mi->mi_recovlock);
                } else
                        mutex_exit(&mi->mi_lock);

                /*
                 * Check if there's a bad seqid to recover.
                 */
                mutex_enter(&mi->mi_lock);
                if ((mi->mi_recovflags & MI4R_BAD_SEQID) &&
                    !(mi->mi_flags & MI4_RECOV_FAIL)) {
                        mutex_exit(&mi->mi_lock);
                        (void) nfs_rw_enter_sig(&mi->mi_recovlock,
                            RW_WRITER, 0);
                        recov_bad_seqid(recovp);
                        nfs_rw_exit(&mi->mi_recovlock);
                } else
                        mutex_exit(&mi->mi_lock);

                /*
                 * Next check for recovery that affects the entire
                 * filesystem.
                 */
                if (sp != NULL) {
                        mutex_enter(&mi->mi_lock);
                        if ((mi->mi_recovflags & MI4R_REOPEN_FILES) &&
                            !(mi->mi_flags & MI4_RECOV_FAIL)) {
                                mutex_exit(&mi->mi_lock);
                                recov_openfiles(recovp, sp);
                        } else
                                mutex_exit(&mi->mi_lock);
                }

                /*
                 * Send any queued state recovery requests.
                 */
                mutex_enter(&mi->mi_lock);
                if (sp != NULL &&
                    (mi->mi_recovflags & MI4R_LOST_STATE) &&
                    !(mi->mi_flags & MI4_RECOV_FAIL)) {
                        mutex_exit(&mi->mi_lock);
                        (void) nfs_rw_enter_sig(&mi->mi_recovlock,
                            RW_WRITER, 0);
                        nfs4_resend_lost_rqsts(recovp, sp);
                        /* Only clear the flag once the queue is empty. */
                        if (list_head(&mi->mi_lost_state) == NULL) {
                                /* done */
                                mutex_enter(&mi->mi_lock);
                                mi->mi_recovflags &= ~MI4R_LOST_STATE;
                                mutex_exit(&mi->mi_lock);
                        }
                        nfs_rw_exit(&mi->mi_recovlock);
                } else {
                        mutex_exit(&mi->mi_lock);
                }

                /*
                 * See if there is anything more to do.  If not, announce
                 * that we are done and exit.
                 *
                 * Need mi_recovlock to keep 'sp' valid.  Must grab
                 * mi_recovlock before mi_lock to preserve lock ordering.
                 */
                (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
                mutex_enter(&mi->mi_lock);
                /* MI4R_SRV_REBOOT alone does not keep the loop going. */
                if ((mi->mi_recovflags & ~MI4R_SRV_REBOOT) == 0 ||
                    (mi->mi_flags & MI4_RECOV_FAIL)) {
                        list_t local_lost_state;
                        nfs4_lost_rqst_t *lrp;

                        /*
                         * We need to remove the lost requests before we
                         * unmark the mi as no longer doing recovery to
                         * avoid a race with a new thread putting new lost
                         * requests on the same mi (and the going away
                         * thread would remove the new lost requests).
                         *
                         * Move the lost requests to a local list since
                         * nfs4_remove_lost_rqst() drops mi_lock, and
                         * dropping the mi_lock would make our check to
                         * see if recovery is done no longer valid.
                         */
                        list_create(&local_lost_state,
                            sizeof (nfs4_lost_rqst_t),
                            offsetof(nfs4_lost_rqst_t, lr_node));
                        list_move_tail(&local_lost_state, &mi->mi_lost_state);

                        done = 1;
                        mutex_exit(&mi->mi_lock);
                        /*
                         * Now officially free the "moved"
                         * lost requests.
                         */
                        while ((lrp = list_head(&local_lost_state)) != NULL) {
                                list_remove(&local_lost_state, lrp);
                                nfs4_free_lost_rqst(lrp, sp);
                        }
                        list_destroy(&local_lost_state);
                } else
                        mutex_exit(&mi->mi_lock);
                nfs_rw_exit(&mi->mi_recovlock);

                /*
                 * If the filesystem has been forcibly unmounted, there is
                 * probably no point in retrying immediately.  Furthermore,
                 * there might be user processes waiting for a chance to
                 * queue up "lost state" requests, so that they can exit.
                 * So pause here for a moment.  Same logic for zone shutdown.
                 */
                if (!done && FS_OR_ZONE_GONE4(mi->mi_vfsp)) {
                        /* Wake threads sleeping on mi_failover_cv first. */
                        mutex_enter(&mi->mi_lock);
                        cv_broadcast(&mi->mi_failover_cv);
                        mutex_exit(&mi->mi_lock);
                        ddi_sleep(nfs4_unmount_delay);
                }

        } while (!done);

        if (sp != NULL)
                nfs4_server_rele(sp);

        /*
         * Return all recalled delegations
         */
        nfs4_dlistclean();

        /* recov_done() asserts that mi_lock is held. */
        mutex_enter(&mi->mi_lock);
        recov_done(mi, recovp);
        mutex_exit(&mi->mi_lock);

        /*
         * Free up resources that were allocated for us.
         */
        if (recovp->rc_vp1 != NULL)
                VN_RELE(recovp->rc_vp1);
        if (recovp->rc_vp2 != NULL)
                VN_RELE(recovp->rc_vp2);

        /* now we are done using the mi struct, signal the waiters */
        mutex_enter(&mi->mi_lock);
        mi->mi_in_recovery--;
        if (mi->mi_in_recovery == 0)
                cv_broadcast(&mi->mi_cv_in_recov);
        mutex_exit(&mi->mi_lock);

        VFS_RELE(mi->mi_vfsp);
        MI4_RELE(mi);
        kmem_free(recovp, sizeof (recov_info_t));
        /* cpr_lock is entered here for CALLB_CPR_EXIT() and then destroyed. */
        mutex_enter(&cpr_lock);
        CALLB_CPR_EXIT(&cpr_info);
        mutex_destroy(&cpr_lock);
        zthread_exit();
}
1522
1523 /*
1524  * Log the end of recovery and notify any waiting threads.
1525  */
1526
1527 static void
1528 recov_done(mntinfo4_t *mi, recov_info_t *recovp)
1529 {
1530
1531         ASSERT(MUTEX_HELD(&mi->mi_lock));
1532
1533         nfs4_queue_event(RE_END, mi, NULL, 0, recovp->rc_vp1,
1534             recovp->rc_vp2, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
1535         mi->mi_recovthread = NULL;
1536         mi->mi_flags &= ~MI4_RECOV_ACTIV;
1537         mi->mi_recovflags &= ~MI4R_SRV_REBOOT;
1538         cv_broadcast(&mi->mi_failover_cv);
1539 }
1540
1541 /*
1542  * State-specific recovery routines, by state.
1543  */
1544
1545 /*
1546  * Failover.
1547  *
1548  * Replaces *spp with a reference to the new server, which must
1549  * eventually be freed.
1550  */
1551
1552 static void
1553 recov_newserver(recov_info_t *recovp, nfs4_server_t **spp, bool_t *recov_fail)
1554 {
1555         mntinfo4_t *mi = recovp->rc_mi;
1556         servinfo4_t *svp = NULL;
1557         nfs4_server_t *osp = *spp;
1558         CLIENT *cl;
1559         enum clnt_stat status;
1560         struct timeval tv;
1561         int error;
1562         int oncethru = 0;
1563         rnode4_t *rp;
1564         int index;
1565         nfs_fh4 fh;
1566         char *snames;
1567         size_t len;
1568
1569         (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);
1570
1571         tv.tv_sec = 2;
1572         tv.tv_usec = 0;
1573
1574
1575         /*
1576          * Ping the null NFS procedure of every server in
1577          * the list until one responds.  We always start
1578          * at the head of the list and always skip the one
1579          * that is current, since it's caused us a problem.
1580          */
1581         while (svp == NULL) {
1582                 for (svp = mi->mi_servers; svp; svp = svp->sv_next) {
1583
1584                         mutex_enter(&mi->mi_lock);
1585                         if (FS_OR_ZONE_GONE4(mi->mi_vfsp)) {
1586                                 mi->mi_flags |= MI4_RECOV_FAIL;
1587                                 mutex_exit(&mi->mi_lock);
1588                                 (void) nfs_rw_exit(&mi->mi_recovlock);
1589                                 *recov_fail = TRUE;
1590                                 if (oncethru)
1591                                         kmem_free(snames, len);
1592                                 return;
1593                         }
1594                         mutex_exit(&mi->mi_lock);
1595
1596                         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1597                         if (svp->sv_flags & SV4_NOTINUSE) {
1598                                 nfs_rw_exit(&svp->sv_lock);
1599                                 continue;
1600                         }
1601                         nfs_rw_exit(&svp->sv_lock);
1602
1603                         if (!oncethru && svp == mi->mi_curr_serv)
1604                                 continue;
1605
1606                         error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr,
1607                             NFS_PROGRAM, NFS_V4, 0, 1, CRED(), &cl);
1608                         if (error)
1609                                 continue;
1610
1611                         if (!(mi->mi_flags & MI4_INT))
1612                                 cl->cl_nosignal = TRUE;
1613                         status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL,
1614                             xdr_void, NULL, tv);
1615                         if (!(mi->mi_flags & MI4_INT))
1616                                 cl->cl_nosignal = FALSE;
1617                         AUTH_DESTROY(cl->cl_auth);
1618                         CLNT_DESTROY(cl);
1619                         if (status == RPC_SUCCESS) {
1620                                 nfs4_queue_event(RE_FAILOVER, mi,
1621                                     svp == mi->mi_curr_serv ? NULL :
1622                                     svp->sv_hostname, 0, NULL, NULL, 0,
1623                                     NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
1624                                 break;
1625                         }
1626                 }
1627
1628                 if (svp == NULL) {
1629                         if (!oncethru) {
1630                                 snames = nfs4_getsrvnames(mi, &len);
1631                                 nfs4_queue_fact(RF_SRVS_NOT_RESPOND, mi,
1632                                     0, 0, 0, FALSE, snames, 0, NULL);
1633                                 oncethru = 1;
1634                         }
1635                         ddi_sleep(1);
1636                 }
1637         }
1638
1639         if (oncethru) {
1640                 nfs4_queue_fact(RF_SRVS_OK, mi, 0, 0, 0, FALSE, snames,
1641                     0, NULL);
1642                 kmem_free(snames, len);
1643         }
1644
1645 #if DEBUG
1646         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1647         ASSERT((svp->sv_flags & SV4_NOTINUSE) == 0);
1648         nfs_rw_exit(&svp->sv_lock);
1649 #endif
1650
1651         mutex_enter(&mi->mi_lock);
1652         mi->mi_recovflags &= ~MI4R_NEED_NEW_SERVER;
1653         if (svp != mi->mi_curr_serv) {
1654                 servinfo4_t *osvp = mi->mi_curr_serv;
1655
1656                 mutex_exit(&mi->mi_lock);
1657
1658                 /*
1659                  * Update server-dependent fields in the root vnode.
1660                  */
1661                 index = rtable4hash(mi->mi_rootfh);
1662                 rw_enter(&rtable4[index].r_lock, RW_WRITER);
1663
1664                 rp = r4find(&rtable4[index], mi->mi_rootfh, mi->mi_vfsp);
1665                 if (rp != NULL) {
1666                         NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
1667                             "recov_newserver: remapping %s", rnode4info(rp)));
1668                         mutex_enter(&rp->r_statelock);
1669                         rp->r_server = svp;
1670                         PURGE_ATTRCACHE4_LOCKED(rp);
1671                         mutex_exit(&rp->r_statelock);
1672                         (void) nfs4_free_data_reclaim(rp);
1673                         nfs4_purge_rddir_cache(RTOV4(rp));
1674                         rw_exit(&rtable4[index].r_lock);
1675                         NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
1676                             "recov_newserver: done with %s",
1677                             rnode4info(rp)));
1678                         VN_RELE(RTOV4(rp));
1679                 } else
1680                         rw_exit(&rtable4[index].r_lock);
1681                 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
1682
1683                 mutex_enter(&mi->mi_lock);
1684                 mi->mi_recovflags |= MI4R_REOPEN_FILES | MI4R_REMAP_FILES;
1685                 if (recovp->rc_srv_reboot)
1686                         mi->mi_recovflags |= MI4R_SRV_REBOOT;
1687                 mi->mi_curr_serv = svp;
1688                 mi->mi_failover++;
1689                 mi->mi_flags &= ~MI4_BADOWNER_DEBUG;
1690                 mutex_exit(&mi->mi_lock);
1691
1692                 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1693                 fh.nfs_fh4_len = svp->sv_fhandle.fh_len;
1694                 fh.nfs_fh4_val = svp->sv_fhandle.fh_buf;
1695                 sfh4_update(mi->mi_rootfh, &fh);
1696                 fh.nfs_fh4_len = svp->sv_pfhandle.fh_len;
1697                 fh.nfs_fh4_val = svp->sv_pfhandle.fh_buf;
1698                 sfh4_update(mi->mi_srvparentfh, &fh);
1699                 nfs_rw_exit(&svp->sv_lock);
1700
1701                 *spp = nfs4_move_mi(mi, osvp, svp);
1702                 if (osp != NULL)
1703                         nfs4_server_rele(osp);
1704         } else
1705                 mutex_exit(&mi->mi_lock);
1706         (void) nfs_rw_exit(&mi->mi_recovlock);
1707 }
1708
1709 /*
1710  * Clientid.
1711  */
1712
1713 static void
1714 recov_clientid(recov_info_t *recovp, nfs4_server_t *sp)
1715 {
1716         mntinfo4_t *mi = recovp->rc_mi;
1717         int error = 0;
1718         int still_stale;
1719         int need_new_s;
1720
1721         ASSERT(sp != NULL);
1722
1723         /*
1724          * Acquire the recovery lock and then verify that the clientid
1725          * still needs to be recovered.  (Note that s_recovlock is supposed
1726          * to be acquired before s_lock.)  Since the thread holds the
1727          * recovery lock, no other thread will recover the clientid.
1728          */
1729         (void) nfs_rw_enter_sig(&sp->s_recovlock, RW_WRITER, 0);
1730         (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);
1731         mutex_enter(&sp->s_lock);
1732         still_stale = ((sp->s_flags & N4S_CLIENTID_SET) == 0);
1733         mutex_exit(&sp->s_lock);
1734
1735         if (still_stale) {
1736                 nfs4_error_t n4e;
1737
1738                 nfs4_error_zinit(&n4e);
1739                 nfs4setclientid(mi, kcred, TRUE, &n4e);
1740                 error = n4e.error;
1741                 if (error != 0) {
1742
1743                         /*
1744                          * nfs4setclientid may have set MI4R_NEED_NEW_SERVER,
1745                          * if so, just return and let recov_thread drive
1746                          * failover.
1747                          */
1748                         mutex_enter(&mi->mi_lock);
1749                         need_new_s = mi->mi_recovflags & MI4R_NEED_NEW_SERVER;
1750                         mutex_exit(&mi->mi_lock);
1751
1752                         if (need_new_s) {
1753                                 nfs_rw_exit(&mi->mi_recovlock);
1754                                 nfs_rw_exit(&sp->s_recovlock);
1755                                 return;
1756                         }
1757
1758                         nfs4_queue_event(RE_CLIENTID, mi, NULL, n4e.error, NULL,
1759                             NULL, n4e.stat, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
1760                         mutex_enter(&mi->mi_lock);
1761                         mi->mi_flags |= MI4_RECOV_FAIL;
1762                         mi->mi_error = recovp->rc_error;
1763                         mutex_exit(&mi->mi_lock);
1764                         /* don't destroy the nfs4_server, let umount do it */
1765                 }
1766         }
1767
1768         if (error == 0) {
1769                 mutex_enter(&mi->mi_lock);
1770                 mi->mi_recovflags &= ~MI4R_NEED_CLIENTID;
1771                 /*
1772                  * If still_stale isn't true, then another thread already
1773                  * recovered the clientid.  And that thread that set the
1774                  * clientid will have initiated reopening files on all the
1775                  * filesystems for the server, so we should not initiate
1776                  * reopening for this filesystem here.
1777                  */
1778                 if (still_stale) {
1779                         mi->mi_recovflags |= MI4R_REOPEN_FILES;
1780                         if (recovp->rc_srv_reboot)
1781                                 mi->mi_recovflags |= MI4R_SRV_REBOOT;
1782                 }
1783                 mutex_exit(&mi->mi_lock);
1784         }
1785
1786         nfs_rw_exit(&mi->mi_recovlock);
1787
1788         if (error != 0) {
1789                 nfs_rw_exit(&sp->s_recovlock);
1790                 mutex_enter(&mi->mi_lock);
1791                 if ((mi->mi_flags & MI4_RECOV_FAIL) == 0)
1792                         ddi_sleep(recov_err_delay);
1793                 mutex_exit(&mi->mi_lock);
1794         } else {
1795                 mntinfo4_t **milist;
1796                 mntinfo4_t *tmi;
1797                 int nummi, i;
1798
1799                 /*
1800                  * Initiate recovery of open files for other filesystems.
1801                  * We create an array of filesystems, rather than just
1802                  * walking the filesystem list, to avoid deadlock issues
1803                  * with s_lock and mi_recovlock.
1804                  */
1805                 milist = make_milist(sp, &nummi);
1806                 for (i = 0; i < nummi; i++) {
1807                         tmi = milist[i];
1808                         if (tmi != mi) {
1809                                 (void) nfs_rw_enter_sig(&tmi->mi_recovlock,
1810                                     RW_READER, 0);
1811                                 start_recovery_action(NR_OPENFILES, TRUE, tmi,
1812                                     NULL, NULL);
1813                                 nfs_rw_exit(&tmi->mi_recovlock);
1814                         }
1815                 }
1816                 free_milist(milist, nummi);
1817
1818                 nfs_rw_exit(&sp->s_recovlock);
1819         }
1820 }
1821
1822 /*
1823  * Return an array of filesystems associated with the given server.  The
1824  * caller should call free_milist() to free the references and memory.
1825  */
1826
1827 static mntinfo4_t **
1828 make_milist(nfs4_server_t *sp, int *nummip)
1829 {
1830         int nummi, i;
1831         mntinfo4_t **milist;
1832         mntinfo4_t *tmi;
1833
1834         mutex_enter(&sp->s_lock);
1835         nummi = 0;
1836         for (tmi = sp->mntinfo4_list; tmi != NULL; tmi = tmi->mi_clientid_next)
1837                 nummi++;
1838
1839         milist = kmem_alloc(nummi * sizeof (mntinfo4_t *), KM_SLEEP);
1840
1841         for (i = 0, tmi = sp->mntinfo4_list; tmi != NULL; i++,
1842             tmi = tmi->mi_clientid_next) {
1843                 milist[i] = tmi;
1844                 VFS_HOLD(tmi->mi_vfsp);
1845         }
1846         mutex_exit(&sp->s_lock);
1847
1848         *nummip = nummi;
1849         return (milist);
1850 }
1851
1852 /*
1853  * Free the filesystem list created by make_milist().
1854  */
1855
1856 static void
1857 free_milist(mntinfo4_t **milist, int nummi)
1858 {
1859         mntinfo4_t *tmi;
1860         int i;
1861
1862         for (i = 0; i < nummi; i++) {
1863                 tmi = milist[i];
1864                 VFS_RELE(tmi->mi_vfsp);
1865         }
1866         kmem_free(milist, nummi * sizeof (mntinfo4_t *));
1867 }
1868
1869 /*
1870  * Filehandle
1871  */
1872
1873 /*
1874  * Lookup the filehandle for the given vnode and update the rnode if it has
1875  * changed.
1876  *
1877  * Errors:
1878  * - if the filehandle could not be updated because of an error that
1879  *   requires further recovery, initiate that recovery and return.
1880  * - if the filehandle could not be updated because of a signal, pretend we
1881  *   succeeded and let someone else deal with it.
1882  * - if the filehandle could not be updated and the filesystem has been
1883  *   forcibly unmounted, pretend we succeeded, and let the caller deal with
1884  *   the forced unmount (to retry or not to retry, that is the question).
1885  * - if the filehandle could not be updated because of some other error,
1886  *   mark the rnode bad and return.
1887  */
1888 static void
1889 recov_filehandle(nfs4_recov_t action, mntinfo4_t *mi, vnode_t *vp)
1890 {
1891         rnode4_t *rp = VTOR4(vp);
1892         nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
1893         bool_t needrecov;
1894
1895         mutex_enter(&rp->r_statelock);
1896
1897         if (rp->r_flags & R4RECOVERR) {
1898                 mutex_exit(&rp->r_statelock);
1899                 return;
1900         }
1901
1902         /*
1903          * If someone else is updating the filehandle, wait for them to
1904          * finish and then let our caller retry.
1905          */
1906         if (rp->r_flags & R4RECEXPFH) {
1907                 while (rp->r_flags & R4RECEXPFH) {
1908                         cv_wait(&rp->r_cv, &rp->r_statelock);
1909                 }
1910                 mutex_exit(&rp->r_statelock);
1911                 return;
1912         }
1913         rp->r_flags |= R4RECEXPFH;
1914         mutex_exit(&rp->r_statelock);
1915
1916         if (action == NR_BADHANDLE) {
1917                 /* shouldn't happen */
1918                 nfs4_queue_event(RE_BADHANDLE, mi, NULL, 0,
1919                     vp, NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
1920         }
1921
1922         nfs4_remap_file(mi, vp, 0, &e);
1923         needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
1924
1925         /*
1926          * If we get BADHANDLE, FHEXPIRED or STALE in their handler,
1927          * something is broken. Don't try to recover, just mark the
1928          * file dead.
1929          */
1930         DTRACE_PROBE2(recov__filehandle, nfs4_error_t, &e, vnode_t, vp);
1931         if (needrecov) {
1932                 if (e.error == 0) {
1933                         switch (e.stat) {
1934                         case NFS4ERR_BADHANDLE:
1935                         case NFS4ERR_FHEXPIRED:
1936                         case NFS4ERR_STALE:
1937                                 goto norec;     /* Unrecoverable errors */
1938                         default:
1939                                 break;
1940                         }
1941                 }
1942                 (void) nfs4_start_recovery(&e, mi, vp, NULL,
1943                     NULL, NULL, OP_LOOKUP, NULL, NULL, NULL);
1944
1945         } else if (e.error != EINTR &&
1946             !NFS4_FRC_UNMT_ERR(e.error, mi->mi_vfsp) &&
1947             (e.error != 0 || e.stat != NFS4_OK)) {
1948                 nfs4_recov_fh_fail(vp, e.error, e.stat);
1949                 /*
1950                  * Don't set r_error to ESTALE. Higher-level code (e.g.,
1951                  * cstatat_getvp()) retries on ESTALE, which would cause
1952                  * an infinite loop.
1953                  */
1954         }
1955 norec:
1956         mutex_enter(&rp->r_statelock);
1957         rp->r_flags &= ~R4RECEXPFH;
1958         cv_broadcast(&rp->r_cv);
1959         mutex_exit(&rp->r_statelock);
1960 }
1961
1962 /*
1963  * Stale Filehandle
1964  */
1965
1966 /*
1967  * A stale filehandle can happen when an individual file has
1968  * been removed, or when an entire filesystem has been taken
1969  * offline.  To distinguish these cases, we do this:
1970  * - if a GETATTR with the current filehandle is okay, we do
1971  *   nothing (this can happen with two-filehandle ops)
1972  * - if the GETATTR fails, but a GETATTR of the root filehandle
1973  *   succeeds, mark the rnode with R4STALE, which will stop use
1974  * - if the GETATTR fails, and a GETATTR of the root filehandle
1975  *   also fails, we consider the problem filesystem-wide, so:
1976  *   - if we can failover, we should
1977  *   - if we can't failover, we should mark both the original
1978  *     vnode and the root bad
1979  */
1980 static void
1981 recov_stale(mntinfo4_t *mi, vnode_t *vp)
1982 {
1983         rnode4_t *rp = VTOR4(vp);
1984         vnode_t *rootvp = NULL;
1985         nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
1986         nfs4_ga_res_t gar;
1987         char *fail_msg = "failed to recover from NFS4ERR_STALE";
1988         bool_t needrecov;
1989
1990         mutex_enter(&rp->r_statelock);
1991
1992         if (rp->r_flags & R4RECOVERR) {
1993                 mutex_exit(&rp->r_statelock);
1994                 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1995                     "recov_stale: already marked dead, rp %s",
1996                     rnode4info(rp)));
1997                 return;
1998         }
1999
2000         if (rp->r_flags & R4STALE) {
2001                 mutex_exit(&rp->r_statelock);
2002                 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2003                     "recov_stale: already marked stale, rp %s",
2004                     rnode4info(rp)));
2005                 return;
2006         }
2007
2008         mutex_exit(&rp->r_statelock);
2009
2010         /* Try a GETATTR on this vnode */
2011         nfs4_getattr_otw_norecovery(vp, &gar, &e, CRED(), 0);
2012
2013         /*
2014          * Handle non-STALE recoverable errors
2015          */
2016         needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
2017         if (needrecov) {
2018                 if (e.error == 0) {
2019                         switch (e.stat) {
2020                         case NFS4ERR_STALE:
2021                         case NFS4ERR_BADHANDLE:
2022                                 goto norec;     /* Unrecoverable */
2023                         default:
2024                                 break;
2025                         }
2026                 }
2027                 (void) nfs4_start_recovery(&e, mi, vp, NULL,
2028                     NULL, NULL, OP_GETATTR, NULL, NULL, NULL);
2029                 goto out;
2030         }
2031 norec:
2032         /* Are things OK for this vnode? */
2033         if (!e.error && e.stat == NFS4_OK) {
2034                 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2035                     "recov_stale: file appears fine, rp %s",
2036                     rnode4info(rp)));
2037                 goto out;
2038         }
2039
2040         /* Did we get an unrelated non-recoverable error? */
2041         if (e.error || e.stat != NFS4ERR_STALE) {
2042                 nfs4_fail_recov(vp, fail_msg, e.error, e.stat);
2043                 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2044                     "recov_stale: unrelated fatal error, rp %s",
2045                     rnode4info(rp)));
2046                 goto out;
2047         }
2048
2049         /*
2050          * If we don't appear to be dealing with the root node, find it.
2051          */
2052         if ((vp->v_flag & VROOT) == 0) {
2053                 nfs4_error_zinit(&e);
2054                 e.error = VFS_ROOT(vp->v_vfsp, &rootvp);
2055                 if (e.error) {
2056                         nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE);
2057                         NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2058                             "recov_stale: can't find root node for rp %s",
2059                             rnode4info(rp)));
2060                         goto out;
2061                 }
2062         }
2063
2064         /* Try a GETATTR on the root vnode */
2065         if (rootvp != NULL) {
2066                 nfs4_error_zinit(&e);
2067                 nfs4_getattr_otw_norecovery(rootvp, &gar, &e, CRED(), 0);
2068
2069                 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
2070                 if (needrecov) {
2071                         if (e.error == 0) {
2072                                 switch (e.stat) {
2073                                 case NFS4ERR_STALE:
2074                                 case NFS4ERR_BADHANDLE:
2075                                         goto unrec;     /* Unrecoverable */
2076                                 default:
2077                                         break;
2078                                 }
2079                         }
2080                         (void) nfs4_start_recovery(&e, mi, rootvp, NULL,
2081                             NULL, NULL, OP_GETATTR, NULL, NULL, NULL);
2082                 }
2083 unrec:
2084                 /*
2085                  * Check to see if a failover attempt is warranted
2086                  * NB: nfs4_try_failover doesn't check for STALE
2087                  * because recov_stale gets a shot first.  Now that
2088                  * recov_stale has failed, go ahead and try failover.
2089                  *
2090                  * If the getattr on the root filehandle was successful,
2091                  * then mark recovery as failed for 'vp' and exit.
2092                  */
2093                 if (nfs4_try_failover(&e) == 0 && e.stat != NFS4ERR_STALE) {
2094                         /*
2095                          * pass the original error to fail_recov, not
2096                          * the one from trying the root vnode.
2097                          */
2098                         nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE);
2099                         NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2100                             "recov_stale: root node OK, marking "
2101                             "dead rp %s", rnode4info(rp)));
2102                         goto out;
2103                 }
2104         }
2105
2106         /*
2107          * Here, we know that both the original file and the
2108          * root filehandle (which may be the same) are stale.
2109          * We want to fail over if we can, and if we can't, we
2110          * want to mark everything in sight bad.
2111          */
2112         if (FAILOVER_MOUNT4(mi)) {
2113                 mutex_enter(&mi->mi_lock);
2114                 mi->mi_recovflags |= MI4R_NEED_NEW_SERVER;
2115                 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2116                     "recov_stale: failing over due to rp %s",
2117                     rnode4info(rp)));
2118                 mutex_exit(&mi->mi_lock);
2119         } else {
2120                 rnode4_t *rootrp;
2121                 servinfo4_t *svp;
2122
2123                 /*
2124                  * Can't fail over, so mark things dead.
2125                  *
2126                  * If rootvp is set, we know we have a distinct
2127                  * non-root vnode which can be marked dead in
2128                  * the usual way.
2129                  *
2130                  * Then we want to mark the root vnode dead.
2131                  * Note that if rootvp wasn't set, our vp is
2132                  * actually the root vnode.
2133                  */
2134                 if (rootvp != NULL) {
2135                         NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2136                             "recov_stale: can't fail over, marking dead rp %s",
2137                             rnode4info(rp)));
2138                         nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE);
2139                 } else {
2140                         rootvp = vp;
2141                         VN_HOLD(rootvp);
2142                 }
2143
2144                 /*
2145                  * Mark root dead, but quietly - since
2146                  * the root rnode is frequently recreated,
2147                  * we can encounter this at every access.
2148                  * Also mark recovery as failed on this VFS.
2149                  */
2150                 rootrp = VTOR4(rootvp);
2151                 NFS4_DEBUG(nfs4_client_recov_debug, (CE_CONT,
2152                     "recov_stale: marking dead root rp %s",
2153                     rnode4info(rootrp)));
2154                 mutex_enter(&rootrp->r_statelock);
2155                 rootrp->r_flags |= (R4RECOVERR | R4STALE);
2156                 rootrp->r_error = ESTALE;
2157                 mutex_exit(&rootrp->r_statelock);
2158                 mutex_enter(&mi->mi_lock);
2159                 mi->mi_error = ESTALE;
2160                 mutex_exit(&mi->mi_lock);
2161
2162                 svp = mi->mi_curr_serv;
2163                 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
2164                 svp->sv_flags |= SV4_ROOT_STALE;
2165                 nfs_rw_exit(&svp->sv_lock);
2166         }
2167
2168 out:
2169         if (rootvp)
2170                 VN_RELE(rootvp);
2171 }
2172
2173 /*
2174  * Locks.
2175  */
2176
2177 /*
2178  * Reclaim all the active (acquired) locks for the given file.
2179  * If a process lost a lock, the process is sent a SIGLOST.  This is not
2180  * considered an error.
2181  *
2182  * Return values:
2183  * Errors and status are returned via the nfs4_error_t parameter
2184  * If an error indicates that recovery is needed, the caller is responsible
2185  * for dealing with it.
2186  */
2187
2188 static void
2189 relock_file(vnode_t *vp, mntinfo4_t *mi, nfs4_error_t *ep,
2190     fattr4_change pre_change)
2191 {
2192         locklist_t *locks, *llp;
2193         rnode4_t *rp;
2194
2195         ASSERT(ep != NULL);
2196         nfs4_error_zinit(ep);
2197
2198         if (VTOMI4(vp)->mi_flags & MI4_LLOCK)
2199                 return;
2200
2201         nfs4_flush_lock_owners(VTOR4(vp));
2202
2203         /*
2204          * If we get an error that requires recovery actions, just bail out
2205          * and let the top-level recovery code handle it.
2206          *
2207          * If we get some other error, kill the process that owned the lock
2208          * and mark its remaining locks (if any) as belonging to NOPID, so
2209          * that we don't make any more reclaim requests for that process.
2210          */
2211
2212         rp = VTOR4(vp);
2213         locks = flk_active_locks_for_vp(vp);
2214         for (llp = locks; llp != NULL; llp = llp->ll_next) {
2215                 int did_reclaim = 1;
2216
2217                 ASSERT(llp->ll_vp == vp);
2218                 if (llp->ll_flock.l_pid == NOPID)
2219                         continue;
2220                 reclaim_one_lock(vp, &llp->ll_flock, ep, &did_reclaim);
2221                 /*
2222                  * If we need to restart recovery, stop processing the
2223                  * list.  Some errors would be recoverable under other
2224                  * circumstances, but if they happen here we just give up
2225                  * on the lock.
2226                  */
2227                 if (nfs4_needs_recovery(ep, TRUE, vp->v_vfsp)) {
2228                         if (ep->error != 0)
2229                                 break;
2230                         if (!nfs4_recov_marks_dead(ep->stat))
2231                                 break;
2232                 }
2233                 /*
2234                  *   In case the server isn't offering us a grace period, or
2235                  * if we missed it, we might have opened & locked from scratch,
2236                  * rather than reopened/reclaimed.
2237                  *   We need to ensure that the object hadn't been otherwise
2238                  * changed during this time, by comparing the changeinfo.
2239                  *   We get passed the changeinfo from before the reopen by our
2240                  * caller, in pre_change.
2241                  *   The changeinfo from after the reopen is in rp->r_change,
2242                  * courtesy of the GETATTR in the reopen.
2243                  *   If they're different, then the file has changed, and we
2244                  * have to SIGLOST the app.
2245                  */
2246                 if (ep->error == 0 && ep->stat == NFS4_OK && !did_reclaim) {
2247                         mutex_enter(&rp->r_statelock);
2248                         if (pre_change != rp->r_change)
2249                                 ep->stat = NFS4ERR_NO_GRACE;
2250                         mutex_exit(&rp->r_statelock);
2251                 }
2252                 if (ep->error != 0 || ep->stat != NFS4_OK) {
2253                         if (ep->error != 0)
2254                                 nfs4_queue_event(RE_FAIL_RELOCK, mi,
2255                                     NULL, ep->error, vp, NULL, 0, NULL,
2256                                     llp->ll_flock.l_pid, TAG_NONE, TAG_NONE,
2257                                     0, 0);
2258                         else
2259                                 nfs4_queue_event(RE_FAIL_RELOCK, mi,
2260                                     NULL, 0, vp, NULL, ep->stat, NULL,
2261                                     llp->ll_flock.l_pid, TAG_NONE, TAG_NONE,
2262                                     0, 0);
2263                         nfs4_send_siglost(llp->ll_flock.l_pid, mi, vp, TRUE,
2264                             ep->error, ep->stat);
2265                         relock_skip_pid(vp, llp, llp->ll_flock.l_pid);
2266
2267                         /* Reinitialize the nfs4_error and continue */
2268                         nfs4_error_zinit(ep);
2269                 }
2270         }
2271
2272         if (locks != NULL)
2273                 flk_free_locklist(locks);
2274 }
2275
2276 /*
2277  * Reclaim the given lock.
2278  *
2279  * Errors are returned via the nfs4_error_t parameter.
2280  */
2281 static void
2282 reclaim_one_lock(vnode_t *vp, flock64_t *flk, nfs4_error_t *ep,
2283     int *did_reclaimp)
2284 {
2285         cred_t *cr;
2286         rnode4_t *rp = VTOR4(vp);
2287
2288         cr = pid_to_cr(flk->l_pid);
2289         if (cr == NULL) {
2290                 nfs4_error_init(ep, ESRCH);
2291                 return;
2292         }
2293
2294         do {
2295                 mutex_enter(&rp->r_statelock);
2296                 if (rp->r_flags & R4RECOVERR) {
2297                         mutex_exit(&rp->r_statelock);
2298                         nfs4_error_init(ep, ESTALE);
2299                         break;
2300                 }
2301                 mutex_exit(&rp->r_statelock);
2302
2303                 nfs4frlock(NFS4_LCK_CTYPE_RECLAIM, vp, F_SETLK, flk,
2304                     FREAD|FWRITE, 0, cr, ep, NULL, did_reclaimp);
2305                 if (ep->error == 0 && ep->stat == NFS4ERR_FHEXPIRED)
2306                         start_recovery_action(NR_FHEXPIRED, TRUE, VTOMI4(vp),
2307                             vp, NULL);
2308         } while (ep->error == 0 && ep->stat == NFS4ERR_FHEXPIRED);
2309
2310         crfree(cr);
2311 }
2312
2313 /*
2314  * Open files.
2315  */
2316
2317 /*
2318  * Verifies if the nfsstat4 is a valid error for marking this vnode dead.
2319  * Returns 1 if the error is valid; 0 otherwise.
2320  */
2321 static int
2322 nfs4_valid_recov_err_for_vp(vnode_t *vp, nfsstat4 stat)
2323 {
2324         /*
2325          * We should not be marking non-regular files as dead,
2326          * except in very rare cases (eg: BADHANDLE or NFS4ERR_BADNAME).
2327          */
2328         if (vp->v_type != VREG && stat != NFS4ERR_BADHANDLE &&
2329             stat != NFS4ERR_BADNAME)
2330                 return (0);
2331
2332         return (1);
2333 }
2334
2335 /*
2336  * Failed attempting to recover a filehandle.  If 'stat' is valid for 'vp',
2337  * then mark the object dead.  Since we've had to do a lookup for
2338  * filehandle recovery, we will mark the object dead if we got NOENT.
2339  */
2340 static void
2341 nfs4_recov_fh_fail(vnode_t *vp, int error, nfsstat4 stat)
2342 {
2343         ASSERT(vp != NULL);
2344
2345         if ((error == 0) && (stat != NFS4ERR_NOENT) &&
2346             (!nfs4_valid_recov_err_for_vp(vp, stat)))
2347                 return;
2348
2349         nfs4_fail_recov(vp, "can't recover filehandle", error, stat);
2350 }
2351
2352 /*
2353  * Recovery from a "shouldn't happen" error.  In the long term, we'd like
2354  * to mark only the data structure(s) that provided the bad value as being
2355  * bad.  But for now we'll just mark the entire file.
2356  */
2357
2358 static void
2359 recov_badstate(recov_info_t *recovp, vnode_t *vp, nfsstat4 stat)
2360 {
2361         ASSERT(vp != NULL);
2362         recov_throttle(recovp, vp);
2363
2364         if (!nfs4_valid_recov_err_for_vp(vp, stat))
2365                 return;
2366
2367         nfs4_fail_recov(vp, "", 0, stat);
2368 }
2369
2370 /*
2371  * Free up the information saved for a lost state request.
2372  */
2373 static void
2374 nfs4_free_lost_rqst(nfs4_lost_rqst_t *lrp, nfs4_server_t *sp)
2375 {
2376         component4 *filep;
2377         nfs4_open_stream_t *osp;
2378         int have_sync_lock;
2379
2380         NFS4_DEBUG(nfs4_lost_rqst_debug,
2381             (CE_NOTE, "nfs4_free_lost_rqst:"));
2382
2383         switch (lrp->lr_op) {
2384         case OP_OPEN:
2385                 filep = &lrp->lr_ofile;
2386                 if (filep->utf8string_val) {
2387                         kmem_free(filep->utf8string_val, filep->utf8string_len);
2388                         filep->utf8string_val = NULL;
2389                 }
2390                 break;
2391         case OP_DELEGRETURN:
2392                 nfs4delegreturn_cleanup(VTOR4(lrp->lr_vp), sp);
2393                 break;
2394         case OP_CLOSE:
2395                 osp = lrp->lr_osp;
2396                 ASSERT(osp != NULL);
2397                 mutex_enter(&osp->os_sync_lock);
2398                 have_sync_lock = 1;
2399                 if (osp->os_pending_close) {
2400                         /* clean up the open file state. */
2401                         osp->os_pending_close = 0;
2402                         nfs4close_notw(lrp->lr_vp, osp, &have_sync_lock);
2403                 }
2404                 if (have_sync_lock)
2405                         mutex_exit(&osp->os_sync_lock);
2406                 break;
2407         }
2408
2409         lrp->lr_op = 0;
2410         if (lrp->lr_oop != NULL) {
2411                 open_owner_rele(lrp->lr_oop);
2412                 lrp->lr_oop = NULL;
2413         }
2414         if (lrp->lr_osp != NULL) {
2415                 open_stream_rele(lrp->lr_osp, VTOR4(lrp->lr_vp));
2416                 lrp->lr_osp = NULL;
2417         }
2418         if (lrp->lr_lop != NULL) {
2419                 lock_owner_rele(lrp->lr_lop);
2420                 lrp->lr_lop = NULL;
2421         }
2422         if (lrp->lr_flk != NULL) {
2423                 kmem_free(lrp->lr_flk, sizeof (flock64_t));
2424                 lrp->lr_flk = NULL;
2425         }
2426         if (lrp->lr_vp != NULL) {
2427                 VN_RELE(lrp->lr_vp);
2428                 lrp->lr_vp = NULL;
2429         }
2430         if (lrp->lr_dvp != NULL) {
2431                 VN_RELE(lrp->lr_dvp);
2432                 lrp->lr_dvp = NULL;
2433         }
2434         if (lrp->lr_cr != NULL) {
2435                 crfree(lrp->lr_cr);
2436                 lrp->lr_cr = NULL;
2437         }
2438
2439         kmem_free(lrp, sizeof (nfs4_lost_rqst_t));
2440 }
2441
2442 /*
2443  * Remove any lost state requests and free them.
2444  */
2445 static void
2446 nfs4_remove_lost_rqsts(mntinfo4_t *mi, nfs4_server_t *sp)
2447 {
2448         nfs4_lost_rqst_t *lrp;
2449
2450         mutex_enter(&mi->mi_lock);
2451         while ((lrp = list_head(&mi->mi_lost_state)) != NULL) {
2452                 list_remove(&mi->mi_lost_state, lrp);
2453                 mutex_exit(&mi->mi_lock);
2454                 nfs4_free_lost_rqst(lrp, sp);
2455                 mutex_enter(&mi->mi_lock);
2456         }
2457         mutex_exit(&mi->mi_lock);
2458 }
2459
2460 /*
2461  * Reopen all the files for the given filesystem and reclaim any locks.
2462  */
2463
2464 static void
2465 recov_openfiles(recov_info_t *recovp, nfs4_server_t *sp)
2466 {
2467         mntinfo4_t *mi = recovp->rc_mi;
2468         nfs4_opinst_t *reopenlist = NULL, *rep;
2469         nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
2470         open_claim_type4 claim;
2471         int remap;
2472         char *fail_msg = "No such file or directory on replica";
2473         rnode4_t *rp;
2474         fattr4_change pre_change;
2475
2476         ASSERT(sp != NULL);
2477
2478         /*
2479          * This check is to allow a 10ms pause before we reopen files
2480          * it should allow the server time to have received the CB_NULL
2481          * reply and update its internal structures such that (if
2482          * applicable) we are granted a delegation on reopened files.
2483          */
2484         mutex_enter(&sp->s_lock);
2485         if ((sp->s_flags & (N4S_CB_PINGED | N4S_CB_WAITER)) == 0) {
2486                 sp->s_flags |= N4S_CB_WAITER;
2487                 (void) cv_reltimedwait(&sp->wait_cb_null, &sp->s_lock,
2488                     drv_usectohz(N4S_CB_PAUSE_TIME), TR_CLOCK_TICK);
2489         }
2490         mutex_exit(&sp->s_lock);
2491
2492         (void) nfs_rw_enter_sig(&sp->s_recovlock, RW_READER, 0);
2493         (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);
2494
2495         if (NFS4_VOLATILE_FH(mi)) {
2496                 nfs4_remap_root(mi, &e, 0);
2497                 if (nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp)) {
2498                         (void) nfs4_start_recovery(&e, mi, NULL,
2499                             NULL, NULL, NULL, OP_LOOKUP, NULL, NULL, NULL);
2500                 }
2501         }
2502
2503         mutex_enter(&mi->mi_lock);
2504         if (recovp->rc_srv_reboot || (mi->mi_recovflags & MI4R_SRV_REBOOT))
2505                 claim = CLAIM_PREVIOUS;
2506         else
2507                 claim = CLAIM_NULL;
2508         mutex_exit(&mi->mi_lock);
2509
2510         if (e.error == 0 && e.stat == NFS4_OK) {
2511                 /*
2512                  * Get a snapshot of open files in the filesystem.  Note
2513                  * that new opens will stall until the server's grace
2514                  * period is done.
2515                  */
2516                 reopenlist = r4mkopenlist(mi);
2517
2518                 mutex_enter(&mi->mi_lock);
2519                 remap = mi->mi_recovflags & MI4R_REMAP_FILES;
2520                 mutex_exit(&mi->mi_lock);
2521                 /*
2522                  * Since we are re-establishing state on the
2523                  * server, its ok to blow away the saved lost
2524                  * requests since we don't need to reissue it.
2525                  */
2526                 nfs4_remove_lost_rqsts(mi, sp);
2527
2528                 for (rep = reopenlist; rep; rep = rep->re_next) {
2529
2530                         if (remap) {
2531                                 nfs4_remap_file(mi, rep->re_vp,
2532                                     NFS4_REMAP_CKATTRS, &e);
2533                         }
2534                         DTRACE_PROBE2(recov__openfiles, nfs4_error_t, &e,
2535                             vnode_t, rep->re_vp);
2536                         if (e.error == ENOENT || e.stat == NFS4ERR_NOENT) {
2537                                 /*
2538                                  * The current server does not have the file
2539                                  * that is to be remapped.  This is most
2540                                  * likely due to an improperly maintained
2541                                  * replica.   The files that are missing from
2542                                  * the server will be marked dead and logged
2543                                  * in order to make sys admins aware of the
2544                                  * problem.
2545                                  */
2546                                 nfs4_fail_recov(rep->re_vp,
2547                                     fail_msg, e.error, e.stat);
2548                                 /*
2549                                  * We've already handled the error so clear it.
2550                                  */
2551                                 nfs4_error_zinit(&e);
2552                                 continue;
2553                         } else if (e.error == 0 && e.stat == NFS4_OK) {
2554                                 int j;
2555
2556                                 rp = VTOR4(rep->re_vp);
2557                                 mutex_enter(&rp->r_statelock);
2558                                 pre_change = rp->r_change;
2559                                 mutex_exit(&rp->r_statelock);
2560
2561                                 for (j = 0; j < rep->re_numosp; j++) {
2562                                         nfs4_reopen(rep->re_vp, rep->re_osp[j],
2563                                             &e, claim, FALSE, TRUE);
2564                                         if (e.error != 0 || e.stat != NFS4_OK)
2565                                                 break;
2566                                 }
2567                                 if (nfs4_needs_recovery(&e, TRUE,
2568                                     mi->mi_vfsp)) {
2569                                         (void) nfs4_start_recovery(&e, mi,
2570                                             rep->re_vp, NULL, NULL, NULL,
2571                                             OP_OPEN, NULL, NULL, NULL);
2572                                         break;
2573                                 }
2574                         }
2575 #ifdef DEBUG
2576                         if (nfs4_recovdelay > 0)
2577                                 ddi_sleep(nfs4_recovdelay);
2578 #endif
2579                         if (e.error == 0 && e.stat == NFS4_OK) {
2580                                 relock_file(rep->re_vp, mi, &e, pre_change);
2581
2582                                 if (nfs4_needs_recovery(&e, TRUE, mi->mi_vfsp))
2583                                         (void) nfs4_start_recovery(&e, mi,
2584                                             rep->re_vp, NULL, NULL, NULL,
2585                                             OP_LOCK, NULL, NULL, NULL);
2586                         }
2587
2588                         if (e.error != 0 || e.stat != NFS4_OK)
2589                                 break;
2590                 }
2591
2592                 /*
2593                  * Check to see if we need to remap files passed in
2594                  * via the recovery arguments; this will have been
2595                  * done for open files.  A failure here is not fatal.
2596                  */
2597                 if (remap) {
2598                         nfs4_error_t ignore;
2599                         nfs4_check_remap(mi, recovp->rc_vp1, NFS4_REMAP_CKATTRS,
2600                             &ignore);
2601                         nfs4_check_remap(mi, recovp->rc_vp2, NFS4_REMAP_CKATTRS,
2602                             &ignore);
2603                 }
2604         }
2605
2606         if (e.error == 0 && e.stat == NFS4_OK) {
2607                 mutex_enter(&mi->mi_lock);
2608                 mi->mi_recovflags &= ~(MI4R_REOPEN_FILES | MI4R_REMAP_FILES);
2609                 mutex_exit(&mi->mi_lock);
2610         }
2611
2612         nfs_rw_exit(&mi->mi_recovlock);
2613         nfs_rw_exit(&sp->s_recovlock);
2614
2615         if (reopenlist != NULL)
2616                 r4releopenlist(reopenlist);
2617 }
2618
2619 /*
2620  * Resend the queued state recovery requests in "rqsts".
2621  */
2622
2623 static void
2624 nfs4_resend_lost_rqsts(recov_info_t *recovp, nfs4_server_t *sp)
2625 {
2626         nfs4_lost_rqst_t        *lrp, *tlrp;
2627         mntinfo4_t              *mi = recovp->rc_mi;
2628         nfs4_error_t            n4e;
2629 #ifdef NOTYET
2630         uint32_t                deny_bits = 0;
2631 #endif
2632
2633         NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_resend_lost_rqsts"));
2634
2635         ASSERT(mi != NULL);
2636         ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
2637
2638         mutex_enter(&mi->mi_lock);
2639         lrp = list_head(&mi->mi_lost_state);
2640         mutex_exit(&mi->mi_lock);
2641         while (lrp != NULL) {
2642                 nfs4_error_zinit(&n4e);
2643                 resend_one_op(lrp, &n4e, mi, sp);
2644                 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
2645                     "nfs4_resend_lost_rqsts: resend request: for vp %p got "
2646                     "error %d stat %d", (void *)lrp->lr_vp, n4e.error,
2647                     n4e.stat));
2648
2649                 /*
2650                  * If we get a recovery error that we can actually
2651                  * recover from (such as ETIMEDOUT, FHEXPIRED), we
2652                  * return and let the recovery thread redrive the call.
2653                  * Don't requeue unless the zone is still healthy.
2654                  */
2655                 if (zone_status_get(curproc->p_zone) < ZONE_IS_SHUTTING_DOWN &&
2656                     nfs4_needs_recovery(&n4e, TRUE, mi->mi_vfsp) &&
2657                     (nfs4_try_failover(&n4e) ||
2658                     NFS4_FRC_UNMT_ERR(n4e.error, mi->mi_vfsp) ||
2659                     (n4e.error == 0 && n4e.stat != NFS4ERR_BADHANDLE &&
2660                     !nfs4_recov_marks_dead(n4e.stat)))) {
2661                         /*
2662                          * For these three errors, we want to delay a bit
2663                          * instead of pounding the server into submission.
2664                          * We have to do this manually; the normal
2665                          * processing for these errors only works for
2666                          * non-recovery requests.
2667                          */
2668                         if ((n4e.error == 0 && n4e.stat == NFS4ERR_DELAY) ||
2669                             (n4e.error == 0 && n4e.stat == NFS4ERR_GRACE) ||
2670                             (n4e.error == 0 && n4e.stat == NFS4ERR_RESOURCE) ||
2671                             NFS4_FRC_UNMT_ERR(n4e.error, mi->mi_vfsp)) {
2672                                 ddi_sleep(nfs4err_delay_time);
2673                         } else {
2674                                 (void) nfs4_start_recovery(&n4e,
2675                                     mi, lrp->lr_dvp, lrp->lr_vp, NULL, NULL,
2676                                     lrp->lr_op, NULL, NULL, NULL);
2677                         }
2678                         return;
2679                 }
2680
2681                 mutex_enter(&mi->mi_lock);
2682                 list_remove(&mi->mi_lost_state, lrp);
2683                 tlrp = lrp;
2684                 lrp = list_head(&mi->mi_lost_state);
2685                 mutex_exit(&mi->mi_lock);
2686                 nfs4_free_lost_rqst(tlrp, sp);
2687         }
2688 }
2689
2690 /*
2691  * Resend the given op, and issue any necessary undo call.
2692  * errors are returned via the nfs4_error_t parameter.
2693  */
2694
2695 static void
2696 resend_one_op(nfs4_lost_rqst_t *lrp, nfs4_error_t *ep,
2697     mntinfo4_t *mi, nfs4_server_t *sp)
2698 {
2699         vnode_t *vp;
2700         nfs4_open_stream_t *osp;
2701         cred_t *cr;
2702         uint32_t acc_bits;
2703
2704         vp = lrp->lr_vp;
2705         NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_one_op: "
2706             "have a lost open/close request for vp %p", (void *)vp));
2707
2708         switch (lrp->lr_op) {
2709         case OP_OPEN:
2710                 nfs4_resend_open_otw(&vp, lrp, ep);
2711                 break;
2712         case OP_OPEN_DOWNGRADE:
2713                 ASSERT(lrp->lr_oop != NULL);
2714                 ep->error = nfs4_start_open_seqid_sync(lrp->lr_oop, mi);
2715                 ASSERT(!ep->error);     /* recov thread always succeeds */
2716                 ASSERT(lrp->lr_osp != NULL);
2717                 mutex_enter(&lrp->lr_osp->os_sync_lock);
2718                 nfs4_open_downgrade(lrp->lr_dg_acc, lrp->lr_dg_deny,
2719                     lrp->lr_oop, lrp->lr_osp, vp, lrp->lr_cr, lrp,
2720                     ep, NULL, NULL);
2721                 mutex_exit(&lrp->lr_osp->os_sync_lock);
2722                 nfs4_end_open_seqid_sync(lrp->lr_oop);
2723                 break;
2724         case OP_CLOSE:
2725                 osp = lrp->lr_osp;
2726                 cr = lrp->lr_cr;
2727                 acc_bits = 0;
2728                 mutex_enter(&osp->os_sync_lock);
2729                 if (osp->os_share_acc_read)
2730                         acc_bits |= OPEN4_SHARE_ACCESS_READ;
2731                 if (osp->os_share_acc_write)
2732                         acc_bits |= OPEN4_SHARE_ACCESS_WRITE;
2733                 mutex_exit(&osp->os_sync_lock);
2734                 nfs4close_one(vp, osp, cr, acc_bits, lrp, ep,
2735                     CLOSE_RESEND, 0, 0, 0);
2736                 break;
2737         case OP_LOCK:
2738         case OP_LOCKU:
2739                 resend_lock(lrp, ep);
2740                 goto done;
2741         case OP_DELEGRETURN:
2742                 nfs4_resend_delegreturn(lrp, ep, sp);
2743                 goto done;
2744         default:
2745 #ifdef DEBUG
2746                 cmn_err(CE_PANIC, "resend_one_op: unexpected op: %d",
2747                     lrp->lr_op);
2748 #endif
2749                 nfs4_queue_event(RE_LOST_STATE_BAD_OP, mi, NULL,
2750                     lrp->lr_op, lrp->lr_vp, lrp->lr_dvp, NFS4_OK, NULL, 0,
2751                     TAG_NONE, TAG_NONE, 0, 0);
2752                 nfs4_error_init(ep, EINVAL);
2753                 return;
2754         }
2755
2756         /*
2757          * No need to retry nor send an "undo" CLOSE in the
2758          * event the server rebooted.
2759          */
2760         if (ep->error == 0 && (ep->stat == NFS4ERR_STALE_CLIENTID ||
2761             ep->stat == NFS4ERR_STALE_STATEID || ep->stat == NFS4ERR_EXPIRED))
2762                 goto done;
2763
2764         /*
2765          * If we resent a CLOSE or OPEN_DOWNGRADE, there's nothing
2766          * to undo.  Undoing locking operations was handled by
2767          * resend_lock().
2768          */
2769         if (lrp->lr_op == OP_OPEN_DOWNGRADE || lrp->lr_op == OP_CLOSE)
2770                 goto done;
2771
2772         /*
2773          * If we get any other error for OPEN, then don't attempt
2774          * to undo the resend of the open (since it was never
2775          * successful!).
2776          */
2777         ASSERT(lrp->lr_op == OP_OPEN);
2778         if (ep->error || ep->stat != NFS4_OK)
2779                 goto done;
2780
2781         /*
2782          * Now let's undo our OPEN.
2783          */
2784         nfs4_error_zinit(ep);
2785         close_after_open_resend(vp, lrp->lr_cr, lrp->lr_oacc, ep);
2786         NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_one_op: "
2787             "nfs4close_one: for vp %p got error %d stat %d",
2788             (void *)vp, ep->error, ep->stat));
2789
2790 done:
2791         if (vp != lrp->lr_vp)
2792                 VN_RELE(vp);
2793 }
2794
2795 /*
2796  * Close a file that was opened via a resent OPEN.
2797  * Most errors are passed back to the caller (via the return value and
2798  * *statp), except for FHEXPIRED, which is retried.
2799  *
2800  * It might be conceptually cleaner to push the CLOSE request onto the
2801  * front of the resend queue, rather than sending it here.  That would
2802  * match the way we undo lost lock requests.  On the other
2803  * hand, we've already got something that works, and there's no reason to
2804  * change it at this time.
2805  */
2806
2807 static void
2808 close_after_open_resend(vnode_t *vp, cred_t *cr, uint32_t acc_bits,
2809     nfs4_error_t *ep)
2810 {
2811
2812         for (;;) {
2813                 nfs4close_one(vp, NULL, cr, acc_bits, NULL, ep,
2814                     CLOSE_AFTER_RESEND, 0, 0, 0);
2815                 if (ep->error == 0 && ep->stat == NFS4_OK)
2816                         break;          /* success; done */
2817                 if (ep->error != 0 || ep->stat != NFS4ERR_FHEXPIRED)
2818                         break;
2819                 /* else retry FHEXPIRED */
2820         }
2821
2822 }
2823
2824 /*
2825  * Resend the given lost lock request.  Return an errno value.  If zero,
2826  * *statp is set to the NFS status code for the call.
2827  *
2828  * Issue a SIGLOST and mark the rnode dead if we get a non-recovery error or
2829  * a recovery error that we don't actually recover from yet (eg: BAD_SEQID).
2830  * Let the recovery thread redrive the call if we get a recovery error that
2831  * we can actually recover from.
2832  */
2833 static void
2834 resend_lock(nfs4_lost_rqst_t *lrp, nfs4_error_t *ep)
2835 {
2836         bool_t          send_siglost = FALSE;
2837         vnode_t         *vp = lrp->lr_vp;
2838
2839         NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_lock:"));
2840         ASSERT(lrp->lr_ctype == NFS4_LCK_CTYPE_REINSTATE ||
2841             lrp->lr_ctype == NFS4_LCK_CTYPE_RESEND);
2842
2843         nfs4frlock(lrp->lr_ctype, vp, F_SETLK,
2844             lrp->lr_flk, FREAD|FWRITE, 0, lrp->lr_cr, ep, lrp, NULL);
2845
2846         NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_lock: "
2847             "nfs4frlock for vp %p returned error %d, stat %d",
2848             (void *)vp, ep->error, ep->stat));
2849
2850         if (ep->error == 0 && ep->stat == 0)
2851                 goto done;
2852         if (ep->error == 0 && ep->stat == NFS4ERR_DENIED &&
2853             lrp->lr_ctype == NFS4_LCK_CTYPE_RESEND)
2854                 goto done;
2855
2856         /*
2857          * If we failed with a non-recovery error, send SIGLOST and
2858          * mark the file dead.
2859          */
2860         if (!nfs4_needs_recovery(ep, TRUE, vp->v_vfsp))
2861                 send_siglost = TRUE;
2862         else {
2863                 /*
2864                  * Done with recovering LOST LOCK in the event the
2865                  * server rebooted or we've lost the lease.
2866                  */
2867                 if (ep->error == 0 && (ep->stat == NFS4ERR_STALE_CLIENTID ||
2868                     ep->stat == NFS4ERR_STALE_STATEID ||
2869                     ep->stat == NFS4ERR_EXPIRED)) {
2870                         goto done;
2871                 }
2872
2873                 /*
2874                  * BAD_STATEID on an unlock indicates that the server has
2875                  * forgotten about the lock anyway, so act like the call
2876                  * was successful.
2877                  */
2878                 if (ep->error == 0 && ep->stat == NFS4ERR_BAD_STATEID &&
2879                     lrp->lr_op == OP_LOCKU)
2880                         goto done;
2881
2882                 /*
2883                  * If we got a recovery error that we don't actually
2884                  * recover from, send SIGLOST.  If the filesystem was
2885                  * forcibly unmounted, we skip the SIGLOST because (a) it's
2886                  * unnecessary noise, and (b) there could be a new process
2887                  * with the same pid as the one that had generated the lost
2888                  * state request.
2889                  */
2890                 if (ep->error == 0 && (ep->stat == NFS4ERR_BADHANDLE ||
2891                     nfs4_recov_marks_dead(ep->stat))) {
2892                         if (!(vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
2893                                 send_siglost = TRUE;
2894                         goto done;
2895                 }
2896
2897                 /*
2898                  * If the filesystem was forcibly unmounted, we
2899                  * still need to synchronize with the server and
2900                  * release state.  Try again later.
2901                  */
2902                 if (NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp))
2903                         goto done;
2904
2905                 /*
2906                  * If we get a recovery error that we can actually
2907                  * recover from (such as ETIMEDOUT, FHEXPIRED),
2908                  * return and let the recovery thread redrive the call.
2909                  *
2910                  * For the three errors below, we want to delay a bit
2911                  * instead of pounding the server into submission.
2912                  */
2913                 if ((ep->error == 0 && ep->stat == NFS4ERR_DELAY) ||
2914                     (ep->error == 0 && ep->stat == NFS4ERR_GRACE) ||
2915                     (ep->error == 0 && ep->stat == NFS4ERR_RESOURCE))
2916                         ddi_sleep(recov_err_delay);
2917                 goto done;
2918         }
2919
2920 done:
2921         if (send_siglost) {
2922                 cred_t *sv_cred;
2923
2924                 /*
2925                  * Must be root or the actual thread being issued the
2926                  * SIGLOST for this to work, so just become root.
2927                  */
2928                 sv_cred = curthread->t_cred;
2929                 curthread->t_cred = kcred;
2930                 nfs4_send_siglost(lrp->lr_flk->l_pid, VTOMI4(vp), vp, FALSE,
2931                     ep->error, ep->stat);
2932                 curthread->t_cred = sv_cred;
2933
2934                 /*
2935                  * Flush any additional reinstantiation requests for
2936                  * this operation.  Sending multiple SIGLOSTs to the user
2937                  * process is unlikely to help and may cause trouble.
2938                  */
2939                 if (lrp->lr_ctype == NFS4_LCK_CTYPE_REINSTATE)
2940                         flush_reinstate(lrp);
2941         }
2942 }
2943
2944 /*
2945  * Remove any lock reinstantiation requests that correspond to the given
2946  * lost request.  We only remove items that follow lrp in the queue,
2947  * assuming that lrp will be removed by the generic lost state code.
2948  */
2949
2950 static void
2951 flush_reinstate(nfs4_lost_rqst_t *lrp)
2952 {
2953         vnode_t *vp;
2954         pid_t pid;
2955         mntinfo4_t *mi;
2956         nfs4_lost_rqst_t *nlrp;
2957
2958         vp = lrp->lr_vp;
2959         mi = VTOMI4(vp);
2960         pid = lrp->lr_flk->l_pid;
2961
2962         /*
2963          * If there are any more reinstantation requests to get rid of,
2964          * they should all be clustered at the front of the lost state
2965          * queue.
2966          */
2967         mutex_enter(&mi->mi_lock);
2968         for (lrp = list_next(&mi->mi_lost_state, lrp); lrp != NULL;
2969             lrp = nlrp) {
2970                 nlrp = list_next(&mi->mi_lost_state, lrp);
2971                 if (lrp->lr_op != OP_LOCK && lrp->lr_op != OP_LOCKU)
2972                         break;
2973                 if (lrp->lr_ctype != NFS4_LCK_CTYPE_REINSTATE)
2974                         break;
2975                 ASSERT(lrp->lr_vp == vp);
2976                 ASSERT(lrp->lr_flk->l_pid == pid);
2977                 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
2978                     "remove reinstantiation %p", (void *)lrp));
2979                 list_remove(&mi->mi_lost_state, lrp);
2980                 nfs4_free_lost_rqst(lrp, NULL);
2981         }
2982         mutex_exit(&mi->mi_lock);
2983 }
2984
2985 /*
2986  * End of state-specific recovery routines.
2987  */
2988
2989 /*
2990  * Allocate a lost request struct, initialize it from lost_rqstp (including
2991  * bumping the reference counts for the referenced vnode, etc.), and hang
2992  * it off of recovp.
2993  */
2994
2995 static void
2996 nfs4_save_lost_rqst(nfs4_lost_rqst_t *lost_rqstp, recov_info_t *recovp,
2997     nfs4_recov_t *action, mntinfo4_t *mi)
2998 {
2999         nfs4_lost_rqst_t *destp;
3000
3001         ASSERT(recovp->rc_lost_rqst == NULL);
3002
3003         destp = kmem_alloc(sizeof (nfs4_lost_rqst_t), KM_SLEEP);
3004         recovp->rc_lost_rqst = destp;
3005
3006         if (lost_rqstp->lr_op == OP_LOCK ||
3007             lost_rqstp->lr_op == OP_LOCKU) {
3008                 ASSERT(lost_rqstp->lr_lop);
3009                 *action = NR_LOST_LOCK;
3010                 destp->lr_ctype = lost_rqstp->lr_ctype;
3011                 destp->lr_locktype = lost_rqstp->lr_locktype;
3012         } else if (lost_rqstp->lr_op == OP_OPEN) {
3013                 component4 *srcfp, *destfp;
3014
3015                 destp->lr_oacc = lost_rqstp->lr_oacc;
3016                 destp->lr_odeny = lost_rqstp->lr_odeny;
3017                 destp->lr_oclaim = lost_rqstp->lr_oclaim;
3018                 if (lost_rqstp->lr_oclaim == CLAIM_DELEGATE_CUR)
3019                         destp->lr_ostateid = lost_rqstp->lr_ostateid;
3020
3021                 srcfp = &lost_rqstp->lr_ofile;
3022                 destfp = &destp->lr_ofile;
3023                 /*
3024                  * Consume caller's utf8string
3025                  */
3026                 destfp->utf8string_len = srcfp->utf8string_len;
3027                 destfp->utf8string_val = srcfp->utf8string_val;
3028                 srcfp->utf8string_len = 0;
3029                 srcfp->utf8string_val = NULL;   /* make sure not reused */
3030
3031                 *action = NR_LOST_STATE_RQST;
3032         } else if (lost_rqstp->lr_op == OP_OPEN_DOWNGRADE) {
3033                 destp->lr_dg_acc = lost_rqstp->lr_dg_acc;
3034                 destp->lr_dg_deny = lost_rqstp->lr_dg_deny;
3035
3036                 *action = NR_LOST_STATE_RQST;
3037         } else if (lost_rqstp->lr_op == OP_CLOSE) {
3038                 ASSERT(lost_rqstp->lr_oop);
3039                 *action = NR_LOST_STATE_RQST;
3040         } else if (lost_rqstp->lr_op == OP_DELEGRETURN) {
3041                 *action = NR_LOST_STATE_RQST;
3042         } else {
3043 #ifdef DEBUG
3044                 cmn_err(CE_PANIC, "nfs4_save_lost_rqst: bad op %d",
3045                     lost_rqstp->lr_op);
3046 #endif
3047                 nfs4_queue_event(RE_LOST_STATE_BAD_OP, mi, NULL,
3048                     lost_rqstp->lr_op, lost_rqstp->lr_vp, lost_rqstp->lr_dvp,
3049                     NFS4_OK, NULL, curproc->p_pid, TAG_NONE, TAG_NONE, 0, 0);
3050                 *action = NR_UNUSED;
3051                 recovp->rc_lost_rqst = NULL;
3052                 kmem_free(destp, sizeof (nfs4_lost_rqst_t));
3053                 return;
3054         }
3055
3056         destp->lr_op = lost_rqstp->lr_op;
3057         destp->lr_vp = lost_rqstp->lr_vp;
3058         if (destp->lr_vp)
3059                 VN_HOLD(destp->lr_vp);
3060         destp->lr_dvp = lost_rqstp->lr_dvp;
3061         if (destp->lr_dvp)
3062                 VN_HOLD(destp->lr_dvp);
3063         destp->lr_oop = lost_rqstp->lr_oop;
3064         if (destp->lr_oop)
3065                 open_owner_hold(destp->lr_oop);
3066         destp->lr_osp = lost_rqstp->lr_osp;
3067         if (destp->lr_osp)
3068                 open_stream_hold(destp->lr_osp);
3069         destp->lr_lop = lost_rqstp->lr_lop;
3070         if (destp->lr_lop)
3071                 lock_owner_hold(destp->lr_lop);
3072         destp->lr_cr = lost_rqstp->lr_cr;
3073         if (destp->lr_cr)
3074                 crhold(destp->lr_cr);
3075         if (lost_rqstp->lr_flk == NULL)
3076                 destp->lr_flk = NULL;
3077         else {
3078                 destp->lr_flk = kmem_alloc(sizeof (flock64_t), KM_SLEEP);
3079                 *destp->lr_flk = *lost_rqstp->lr_flk;
3080         }
3081         destp->lr_putfirst = lost_rqstp->lr_putfirst;
3082 }
3083
3084 /*
3085  * Map the given return values (errno and nfs4 status code) to a recovery
3086  * action and fill in the following fields of recovp: rc_action,
3087  * rc_srv_reboot, rc_stateid, rc_lost_rqst.
3088  */
3089
3090 void
3091 errs_to_action(recov_info_t *recovp,
3092     nfs4_server_t *sp, mntinfo4_t *mi, stateid4 *sidp,
3093     nfs4_lost_rqst_t *lost_rqstp, int unmounted, nfs_opnum4 op,
3094     nfs4_bseqid_entry_t *bsep)
3095 {
3096         nfs4_recov_t action = NR_UNUSED;
3097         bool_t reboot = FALSE;
3098         int try_f;
3099         int error = recovp->rc_orig_errors.error;
3100         nfsstat4 stat = recovp->rc_orig_errors.stat;
3101
3102         bzero(&recovp->rc_stateid, sizeof (stateid4));
3103         recovp->rc_lost_rqst = NULL;
3104         recovp->rc_bseqid_rqst = NULL;
3105
3106         try_f = nfs4_try_failover(&recovp->rc_orig_errors) &&
3107             FAILOVER_MOUNT4(mi);
3108
3109         /*
3110          * We start recovery for EINTR only in the lost lock
3111          * or lost open/close case.
3112          */
3113
3114         if (try_f || error == EINTR || (error == EIO && unmounted)) {
3115                 recovp->rc_error = (error != 0 ? error : geterrno4(stat));
3116                 if (lost_rqstp) {
3117                         ASSERT(lost_rqstp->lr_op != 0);
3118                         nfs4_save_lost_rqst(lost_rqstp, recovp, &action, mi);
3119                 }
3120                 if (try_f)
3121                         action = NR_FAILOVER;
3122         } else if (error != 0) {
3123                 recovp->rc_error = error;
3124                 nfs4_queue_event(RE_UNEXPECTED_ERRNO, mi, NULL, error, NULL,
3125                     NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
3126                 action = NR_CLIENTID;
3127         } else {
3128                 recovp->rc_error = geterrno4(stat);
3129                 switch (stat) {
3130 #ifdef notyet
3131                 case NFS4ERR_LEASE_MOVED:
3132                         action = xxx;
3133                         break;
3134 #endif
3135                 case NFS4ERR_MOVED:
3136                         action = NR_MOVED;
3137                         break;
3138                 case NFS4ERR_BADHANDLE:
3139                         action = NR_BADHANDLE;
3140                         break;
3141                 case NFS4ERR_BAD_SEQID:
3142                         if (bsep)
3143                                 save_bseqid_rqst(bsep, recovp);
3144                         action = NR_BAD_SEQID;
3145                         break;
3146                 case NFS4ERR_OLD_STATEID:
3147                         action = NR_OLDSTATEID;
3148                         break;
3149                 case NFS4ERR_WRONGSEC:
3150                         action = NR_WRONGSEC;
3151                         break;
3152                 case NFS4ERR_FHEXPIRED:
3153                         action = NR_FHEXPIRED;
3154                         break;
3155                 case NFS4ERR_BAD_STATEID:
3156                         if (sp == NULL || (sp != NULL && inlease(sp))) {
3157
3158                                 action = NR_BAD_STATEID;
3159                                 if (sidp)
3160                                         recovp->rc_stateid = *sidp;
3161                         } else
3162                                 action = NR_CLIENTID;
3163                         break;
3164                 case NFS4ERR_EXPIRED:
3165                         /*
3166                          * The client's lease has expired, either due
3167                          * to a network partition or perhaps a client
3168                          * error.  In either case, try an NR_CLIENTID
3169                          * style recovery.  reboot remains false, since
3170                          * there is no evidence the server has rebooted.
3171                          * This will cause CLAIM_NULL opens and lock
3172                          * requests without the reclaim bit.
3173                          */
3174                         action = NR_CLIENTID;
3175
3176                         DTRACE_PROBE4(nfs4__expired,
3177                             nfs4_server_t *, sp,
3178                             mntinfo4_t *, mi,
3179                             stateid4 *, sidp, int, op);
3180
3181                         break;
3182                 case NFS4ERR_STALE_CLIENTID:
3183                 case NFS4ERR_STALE_STATEID:
3184                         action = NR_CLIENTID;
3185                         reboot = TRUE;
3186                         break;
3187                 case NFS4ERR_RESOURCE:
3188                         /*
3189                          * If this had been a FAILOVER mount, then
3190                          * we'd have tried failover.  Since it's not,
3191                          * just delay a while and retry.
3192                          */
3193                         action = NR_DELAY;
3194                         break;
3195                 case NFS4ERR_GRACE:
3196                         action = NR_GRACE;
3197                         break;
3198                 case NFS4ERR_DELAY:
3199                         action = NR_DELAY;
3200                         break;
3201                 case NFS4ERR_STALE:
3202                         action = NR_STALE;
3203                         break;
3204                 default:
3205                         nfs4_queue_event(RE_UNEXPECTED_STATUS, mi, NULL, 0,
3206                             NULL, NULL, stat, NULL, 0, TAG_NONE, TAG_NONE,
3207                             0, 0);
3208                         action = NR_CLIENTID;
3209                         break;
3210                 }
3211         }
3212
3213         /* make sure action got set */
3214         ASSERT(action != NR_UNUSED);
3215         recovp->rc_srv_reboot = reboot;
3216         recovp->rc_action = action;
3217         nfs4_queue_fact(RF_ERR, mi, stat, action, op, reboot, NULL, error,
3218             NULL);
3219 }
3220
3221 /*
3222  * Return the (held) credential for the process with the given pid.
3223  * May return NULL (e.g., process not found).
3224  */
3225
3226 static cred_t *
3227 pid_to_cr(pid_t pid)
3228 {
3229         proc_t *p;
3230         cred_t *cr;
3231
3232         mutex_enter(&pidlock);
3233         if ((p = prfind(pid)) == NULL) {
3234                 mutex_exit(&pidlock);
3235                 return (NULL);
3236         }
3237
3238         mutex_enter(&p->p_crlock);
3239         crhold(cr = p->p_cred);
3240         mutex_exit(&p->p_crlock);
3241         mutex_exit(&pidlock);
3242
3243         return (cr);
3244 }
3245
3246 /*
3247  * Send SIGLOST to the given process and queue the event.
3248  *
3249  * The 'dump' boolean tells us whether this action should dump the
3250  * in-kernel queue of recovery messages or not.
3251  */
3252
3253 void
3254 nfs4_send_siglost(pid_t pid, mntinfo4_t *mi, vnode_t *vp, bool_t dump,
3255     int error, nfsstat4 stat)
3256 {
3257         proc_t *p;
3258
3259         mutex_enter(&pidlock);
3260         p = prfind(pid);
3261         if (p)
3262                 psignal(p, SIGLOST);
3263         mutex_exit(&pidlock);
3264         nfs4_queue_event(dump ? RE_SIGLOST : RE_SIGLOST_NO_DUMP, mi,
3265             NULL, error, vp, NULL, stat, NULL, pid, TAG_NONE, TAG_NONE, 0, 0);
3266 }
3267
3268 /*
3269  * Scan the lock list for entries that match the given pid.  Unregister those
3270  * locks that do and change their pid to NOPID.
3271  */
3272
3273 static void
3274 relock_skip_pid(vnode_t *vp, locklist_t *llp, pid_t pid)
3275 {
3276         for (; llp != NULL; llp = llp->ll_next) {
3277                 if (llp->ll_flock.l_pid == pid) {
3278                         int r;
3279
3280                         /*
3281                          * Unregister the lost lock.
3282                          */
3283                         llp->ll_flock.l_type = F_UNLCK;
3284                         r = reclock(vp, &llp->ll_flock, SETFLCK, FREAD | FWRITE,
3285                             0, NULL);
3286                         /* The unlock cannot fail */
3287                         ASSERT(r == 0);
3288
3289                         llp->ll_flock.l_pid = NOPID;
3290                 }
3291         }
3292 }
3293
3294 /*
3295  * Mark a file as having failed recovery, after making a last-ditch effort
3296  * to return any delegation.
3297  *
3298  * Sets r_error to EIO or ESTALE for the given vnode.
3299  */
3300 void
3301 nfs4_fail_recov(vnode_t *vp, char *why, int error, nfsstat4 stat)
3302 {
3303         rnode4_t *rp = VTOR4(vp);
3304
3305 #ifdef DEBUG
3306         if (nfs4_fail_recov_stop)
3307                 debug_enter("nfs4_fail_recov");
3308 #endif
3309
3310         mutex_enter(&rp->r_statelock);
3311         if (rp->r_flags & (R4RECOVERR|R4RECOVERRP)) {
3312                 mutex_exit(&rp->r_statelock);
3313                 return;
3314         }
3315
3316         /*
3317          * Set R4RECOVERRP to indicate that a recovery error is in
3318          * progress.  This will shut down reads and writes at the top
3319          * half.  Don't set R4RECOVERR until after we've returned the
3320          * delegation, otherwise it will fail.
3321          */
3322
3323         rp->r_flags |= R4RECOVERRP;
3324         mutex_exit(&rp->r_statelock);
3325
3326         nfs4delegabandon(rp);
3327
3328         mutex_enter(&rp->r_statelock);
3329         rp->r_flags |= (R4RECOVERR | R4STALE);
3330         rp->r_error = (error == 0 && stat == NFS4ERR_STALE) ? ESTALE : EIO;
3331         PURGE_ATTRCACHE4_LOCKED(rp);
3332         if (!(vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
3333                 nfs4_queue_event(RE_DEAD_FILE, VTOMI4(vp), NULL, error,
3334                     vp, NULL, stat, why, 0, TAG_NONE, TAG_NONE, 0, 0);
3335         mutex_exit(&rp->r_statelock);
3336
3337         dnlc_purge_vp(vp);
3338 }
3339
3340 /*
3341  * recov_throttle: if the file had the same recovery action within the
3342  * throttle interval, wait for the throttle interval to finish before
3343  * proceeding.
3344  *
3345  * Side effects: updates the rnode with the current recovery information.
3346  */
3347
3348 static void
3349 recov_throttle(recov_info_t *recovp, vnode_t *vp)
3350 {
3351         time_t curtime, time_to_wait;
3352         rnode4_t *rp = VTOR4(vp);
3353
3354         curtime = gethrestime_sec();
3355
3356         mutex_enter(&rp->r_statelock);
3357         NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3358             "recov_throttle: now: (%d, %ld), last: (%d, %ld)",
3359             recovp->rc_action, curtime,
3360             rp->r_recov_act, rp->r_last_recov));
3361         if (recovp->rc_action == rp->r_recov_act &&
3362             rp->r_last_recov + recov_err_delay > curtime) {
3363                 time_to_wait = rp->r_last_recov + recov_err_delay - curtime;
3364                 mutex_exit(&rp->r_statelock);
3365                 ddi_sleep(time_to_wait);
3366                 curtime = gethrestime_sec();
3367                 mutex_enter(&rp->r_statelock);
3368         }
3369
3370         rp->r_last_recov = curtime;
3371         rp->r_recov_act = recovp->rc_action;
3372         mutex_exit(&rp->r_statelock);
3373 }
3374
3375 /*
3376  * React to NFS4ERR_GRACE by setting the time we'll permit
3377  * the next call to this filesystem.
3378  */
3379 void
3380 nfs4_set_grace_wait(mntinfo4_t *mi)
3381 {
3382         mutex_enter(&mi->mi_lock);
3383         /* Mark the time for the future */
3384         mi->mi_grace_wait = gethrestime_sec() + nfs4err_delay_time;
3385         mutex_exit(&mi->mi_lock);
3386 }
3387
3388 /*
3389  * React to MFS4ERR_DELAY by setting the time we'll permit
3390  * the next call to this vnode.
3391  */
3392 void
3393 nfs4_set_delay_wait(vnode_t *vp)
3394 {
3395         rnode4_t *rp = VTOR4(vp);
3396
3397         mutex_enter(&rp->r_statelock);
3398         /*
3399          * Calculate amount we should delay, initial
3400          * delay will be short and then we will back off.
3401          */
3402         if (rp->r_delay_interval == 0)
3403                 rp->r_delay_interval = NFS4_INITIAL_DELAY_INTERVAL;
3404         else
3405                 /* calculate next interval value */
3406                 rp->r_delay_interval =
3407                     MIN(NFS4_MAX_DELAY_INTERVAL, (rp->r_delay_interval << 1));
3408         rp->r_delay_wait = gethrestime_sec() + rp->r_delay_interval;
3409         mutex_exit(&rp->r_statelock);
3410 }
3411
3412 /*
3413  * The caller is responsible for freeing the returned string.
3414  */
3415 static char *
3416 nfs4_getsrvnames(mntinfo4_t *mi, size_t *len)
3417 {
3418         servinfo4_t *svp;
3419         char *srvnames;
3420         char *namep;
3421         size_t length;
3422
3423         /*
3424          * Calculate the length of the string required to hold all
3425          * of the server names plus either a comma or a null
3426          * character following each individual one.
3427          */
3428         length = 0;
3429         for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) {
3430                 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
3431                 if (svp->sv_flags & SV4_NOTINUSE) {
3432                         nfs_rw_exit(&svp->sv_lock);
3433                         continue;
3434                 }
3435                 nfs_rw_exit(&svp->sv_lock);
3436                 length += svp->sv_hostnamelen;
3437         }
3438
3439         srvnames = kmem_alloc(length, KM_SLEEP);
3440
3441         namep = srvnames;
3442         for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) {
3443                 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
3444                 if (svp->sv_flags & SV4_NOTINUSE) {
3445                         nfs_rw_exit(&svp->sv_lock);
3446                         continue;
3447                 }
3448                 nfs_rw_exit(&svp->sv_lock);
3449                 (void) strcpy(namep, svp->sv_hostname);
3450                 namep += svp->sv_hostnamelen - 1;
3451                 *namep++ = ',';
3452         }
3453         *--namep = '\0';
3454
3455         *len = length;
3456
3457         return (srvnames);
3458 }
3459
3460 static void
3461 save_bseqid_rqst(nfs4_bseqid_entry_t *bsep, recov_info_t *recovp)
3462 {
3463         nfs4_bseqid_entry_t *destp;
3464
3465         destp = kmem_alloc(sizeof (nfs4_bseqid_entry_t), KM_SLEEP);
3466         recovp->rc_bseqid_rqst = destp;
3467
3468         if (bsep->bs_oop)
3469                 open_owner_hold(bsep->bs_oop);
3470         destp->bs_oop = bsep->bs_oop;
3471         if (bsep->bs_lop)
3472                 lock_owner_hold(bsep->bs_lop);
3473         destp->bs_lop = bsep->bs_lop;
3474         if (bsep->bs_vp)
3475                 VN_HOLD(bsep->bs_vp);
3476         destp->bs_vp = bsep->bs_vp;
3477         destp->bs_pid = bsep->bs_pid;
3478         destp->bs_tag = bsep->bs_tag;
3479         destp->bs_seqid = bsep->bs_seqid;
3480 }
3481
3482 static void
3483 free_bseqid_rqst(nfs4_bseqid_entry_t *bsep)
3484 {
3485         if (bsep->bs_oop)
3486                 open_owner_rele(bsep->bs_oop);
3487         if (bsep->bs_lop)
3488                 lock_owner_rele(bsep->bs_lop);
3489         if (bsep->bs_vp)
3490                 VN_RELE(bsep->bs_vp);
3491         kmem_free(bsep, sizeof (nfs4_bseqid_entry_t));
3492 }
3493
3494 /*
3495  * We don't actually fully recover from NFS4ERR_BAD_SEQID.  We
3496  * simply mark the open owner and open stream (if provided) as "bad".
3497  * Then future uses of these data structures will be limited to basically
3498  * just cleaning up the internal client state (no going OTW).
3499  *
3500  * The result of this is to return errors back to the app/usr when
3501  * we receive NFS4ERR_BAD_SEQID, but also allow future/new calls to
3502  * succeed so progress can be made.
3503  */
3504 void
3505 recov_bad_seqid(recov_info_t *recovp)
3506 {
3507         mntinfo4_t              *mi = recovp->rc_mi;
3508         nfs4_open_owner_t       *bad_oop;
3509         nfs4_lock_owner_t       *bad_lop;
3510         vnode_t                 *vp;
3511         rnode4_t                *rp = NULL;
3512         pid_t                   pid;
3513         nfs4_bseqid_entry_t     *bsep, *tbsep;
3514         int                     error;
3515
3516         ASSERT(mi != NULL);
3517         ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
3518
3519         mutex_enter(&mi->mi_lock);
3520         bsep = list_head(&mi->mi_bseqid_list);
3521         mutex_exit(&mi->mi_lock);
3522
3523         /*
3524          * Handle all the bad seqid entries on mi's list.
3525          */
3526         while (bsep != NULL) {
3527                 bad_oop = bsep->bs_oop;
3528                 bad_lop = bsep->bs_lop;
3529                 vp = bsep->bs_vp;
3530                 pid = bsep->bs_pid;
3531
3532                 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3533                     "recov_bad_seqid: mark oop %p lop %p as bad for "
3534                     "vp %p tag %s pid %d: last good seqid %d for tag %s",
3535                     (void *)bad_oop, (void *)bad_lop, (void *)vp,
3536                     nfs4_ctags[bsep->bs_tag].ct_str, pid,
3537                     bad_oop ?  bad_oop->oo_last_good_seqid : 0,
3538                     bad_oop ? nfs4_ctags[bad_oop->oo_last_good_op].ct_str :
3539                     nfs4_ctags[TAG_NONE].ct_str));
3540
3541                 nfs4_queue_event(RE_BAD_SEQID, mi, NULL,
3542                     0, vp, NULL, NFS4ERR_BAD_SEQID, NULL, pid, bsep->bs_tag,
3543                     bad_oop ? bad_oop->oo_last_good_op : TAG_NONE,
3544                     bsep->bs_seqid, bad_oop ? bad_oop->oo_last_good_seqid : 0);
3545
3546                 if (bad_oop) {
3547                         /* essentially reset the open owner */
3548                         error = nfs4_start_open_seqid_sync(bad_oop, mi);
3549                         ASSERT(!error); /* recov thread always succeeds */
3550                         bad_oop->oo_name = nfs4_get_new_oo_name();
3551                         bad_oop->oo_seqid = 0;
3552                         nfs4_end_open_seqid_sync(bad_oop);
3553                 }
3554
3555                 if (bad_lop) {
3556                         mutex_enter(&bad_lop->lo_lock);
3557                         bad_lop->lo_flags |= NFS4_BAD_SEQID_LOCK;
3558                         mutex_exit(&bad_lop->lo_lock);
3559
3560                         ASSERT(vp != NULL);
3561                         rp = VTOR4(vp);
3562                         mutex_enter(&rp->r_statelock);
3563                         rp->r_flags |= R4LODANGLERS;
3564                         mutex_exit(&rp->r_statelock);
3565
3566                         nfs4_send_siglost(pid, mi, vp, TRUE,
3567                             0, NFS4ERR_BAD_SEQID);
3568                 }
3569
3570                 mutex_enter(&mi->mi_lock);
3571                 list_remove(&mi->mi_bseqid_list, bsep);
3572                 tbsep = bsep;
3573                 bsep = list_head(&mi->mi_bseqid_list);
3574                 mutex_exit(&mi->mi_lock);
3575                 free_bseqid_rqst(tbsep);
3576         }
3577
3578         mutex_enter(&mi->mi_lock);
3579         mi->mi_recovflags &= ~MI4R_BAD_SEQID;
3580         mutex_exit(&mi->mi_lock);
3581 }