4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
27 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
28 * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
31 #include <sys/param.h>
32 #include <sys/types.h>
33 #include <sys/systm.h>
40 #include <sys/vnode.h>
41 #include <sys/socket.h>
43 #include <sys/tiuser.h>
45 #include <sys/errno.h>
46 #include <sys/debug.h>
48 #include <sys/kstat.h>
49 #include <sys/cmn_err.h>
50 #include <sys/vtrace.h>
51 #include <sys/session.h>
53 #include <sys/bitmap.h>
56 #include <sys/pathname.h>
57 #include <sys/flock.h>
58 #include <sys/dirent.h>
59 #include <sys/flock.h>
60 #include <sys/callb.h>
61 #include <sys/atomic.h>
69 #include <rpc/types.h>
76 #include <nfs/nfs_clnt.h>
77 #include <nfs/rnode.h>
78 #include <nfs/nfs_acl.h>
81 * The hash queues for the access to active and cached rnodes
82 * are organized as doubly linked lists. A reader/writer lock
83 * for each hash bucket is used to control access and to synchronize
84 * lookups, additions, and deletions from the hash queue.
86 * The rnode freelist is organized as a doubly linked list with
87 * a head pointer. Additions and deletions are synchronized via
90 * In order to add an rnode to the free list, it must be hashed into
91 * a hash queue and the exclusive lock to the hash queue be held.
92 * If an rnode is not hashed into a hash queue, then it is destroyed
93 * because it represents no valuable information that can be reused
94 * about the file. The exclusive lock to the hash queue must be
95 * held in order to prevent a lookup in the hash queue from finding
96 * the rnode and using it and assuming that the rnode is not on the
97 * freelist. The lookup in the hash queue will have the hash queue
98 * locked, either exclusive or shared.
100 * The vnode reference count for each rnode is not allowed to drop
101 * below 1. This prevents external entities, such as the VM
102 * subsystem, from acquiring references to vnodes already on the
103 * freelist and then trying to place them back on the freelist
104 * when their reference is released. This means that when an
105 * rnode is looked up in the hash queues, then either the rnode
106 * is removed from the freelist and that reference is transferred to
107 * the new reference or the vnode reference count must be incremented
108 * accordingly. The mutex for the freelist must be held in order to
109 * accurately test to see if the rnode is on the freelist or not.
110 * The hash queue lock might be held shared and it is possible that
111 * two different threads may race to remove the rnode from the
112 * freelist. This race can be resolved by holding the mutex for the
113 * freelist. Please note that the mutex for the freelist does not
114 * need to be held if the rnode is not on the freelist. It can not be
115 * placed on the freelist due to the requirement that the thread
116 * putting the rnode on the freelist must hold the exclusive lock
117 * to the hash queue and the thread doing the lookup in the hash
118 * queue is holding either a shared or exclusive lock to the hash
121 * The lock ordering is:
123 * hash bucket lock -> vnode lock
124 * hash bucket lock -> freelist lock
/*
 * Global state for the rnode cache described in the comment block above:
 * the hash table of active/cached rnodes and the freelist of rnodes that
 * are hashed but currently unreferenced.
 * NOTE(review): extraction left each declaration split across lines; only
 * comments have been added here.
 */
/* hash buckets ("hash queues") for active and cached rnodes */
126 static rhashq_t
*rtable
;
/* presumably protects rpfreelist and rnew -- TODO confirm */
128 static kmutex_t rpfreelist_lock
;
/* head of the doubly linked rnode freelist (see block comment above) */
129 static rnode_t
*rpfreelist
= NULL
;
/* counter of rnodes -- exact semantics not visible here; TODO confirm */
130 static long rnew
= 0;
/* number of buckets in rtable; rtablemask is presumably rtablesize - 1 */
133 static int rtablesize
;
134 static int rtablemask
;
/* target average hash-chain length, presumably used to size rtable */
136 static int hashlen
= 4;
/* kmem cache from which rnode_t structures are allocated */
138 static struct kmem_cache
*rnode_cache
;
141 * Mutex to protect the following variables:
145 kmutex_t nfs_minor_lock
;
149 /* Do we allow preepoch (negative) time values otw? */
150 bool_t nfs_allow_preepoch_time
= FALSE
; /* default: do not allow preepoch */
/*
 * Access-cache global state -- presumably caches NFS ACCESS results
 * (cf. the "access ... size of access cache" kstat below); the sizing
 * variables parallel the rnode table above.
 */
155 static acache_hash_t
*acache
;
156 static long nacache
; /* used strictly to size the number of hash queues */
158 static int acachesize
;
159 static int acachemask
;
/* kmem cache for access-cache entries */
160 static struct kmem_cache
*acache_cache
;
163 * Client side utilities
167 * client side statistics
169 static const struct clstat clstat_tmpl
= {
170 { "calls", KSTAT_DATA_UINT64
},
171 { "badcalls", KSTAT_DATA_UINT64
},
172 { "clgets", KSTAT_DATA_UINT64
},
173 { "cltoomany", KSTAT_DATA_UINT64
},
175 { "clalloc", KSTAT_DATA_UINT64
},
176 { "noresponse", KSTAT_DATA_UINT64
},
177 { "failover", KSTAT_DATA_UINT64
},
178 { "remap", KSTAT_DATA_UINT64
},
183 * The following are statistics that describe behavior of the system as a whole
184 * and don't correspond to any one particular zone.
187 static struct clstat_debug
{
188 kstat_named_t nrnode
; /* number of allocated rnodes */
189 kstat_named_t access
; /* size of access cache */
190 kstat_named_t dirent
; /* size of readdir cache */
191 kstat_named_t dirents
; /* size of readdir buf cache */
192 kstat_named_t reclaim
; /* number of reclaims */
193 kstat_named_t clreclaim
; /* number of cl reclaims */
194 kstat_named_t f_reclaim
; /* number of free reclaims */
195 kstat_named_t a_reclaim
; /* number of active reclaims */
196 kstat_named_t r_reclaim
; /* number of rnode reclaims */
197 kstat_named_t rpath
; /* bytes used to store rpaths */
199 { "nrnode", KSTAT_DATA_UINT64
},
200 { "access", KSTAT_DATA_UINT64
},
201 { "dirent", KSTAT_DATA_UINT64
},
202 { "dirents", KSTAT_DATA_UINT64
},
203 { "reclaim", KSTAT_DATA_UINT64
},
204 { "clreclaim", KSTAT_DATA_UINT64
},
205 { "f_reclaim", KSTAT_DATA_UINT64
},
206 { "a_reclaim", KSTAT_DATA_UINT64
},
207 { "r_reclaim", KSTAT_DATA_UINT64
},
208 { "r_path", KSTAT_DATA_UINT64
},
213 * We keep a global list of per-zone client data, so we can clean up all zones
214 * if we get low on memory.
/* list of per-zone nfs_clnt structures; protected by nfs_clnt_list_lock */
216 static list_t nfs_clnt_list
;
217 static kmutex_t nfs_clnt_list_lock
;
/* zone key used with zone_getspecific() to find the per-zone nfs_clnt */
218 static zone_key_t nfsclnt_zone_key
;
/* kmem cache for client-handle table (chtab) entries */
220 static struct kmem_cache
*chtab_cache
;
223 * Some servers do not properly update the attributes of the
224 * directory when changes are made. To allow interoperability
225 * with these broken servers, the nfs_disable_rddir_cache
226 * parameter must be set in /etc/system
/* 0 (default) = readdir caching enabled; non-zero disables it */
228 int nfs_disable_rddir_cache
= 0;
230 int clget(clinfo_t
*, servinfo_t
*, cred_t
*, CLIENT
**,
232 void clfree(CLIENT
*, struct chtab
*);
233 static int acl_clget(mntinfo_t
*, servinfo_t
*, cred_t
*, CLIENT
**,
234 struct chtab
**, struct nfs_clnt
*);
235 static int nfs_clget(mntinfo_t
*, servinfo_t
*, cred_t
*, CLIENT
**,
236 struct chtab
**, struct nfs_clnt
*);
237 static void clreclaim(void *);
238 static int nfs_feedback(int, int, mntinfo_t
*);
239 static int rfscall(mntinfo_t
*, rpcproc_t
, xdrproc_t
, caddr_t
, xdrproc_t
,
240 caddr_t
, cred_t
*, int *, enum clnt_stat
*, int,
242 static int aclcall(mntinfo_t
*, rpcproc_t
, xdrproc_t
, caddr_t
, xdrproc_t
,
243 caddr_t
, cred_t
*, int *, int, failinfo_t
*);
244 static void rinactive(rnode_t
*, cred_t
*);
245 static int rtablehash(nfs_fhandle
*);
246 static vnode_t
*make_rnode(nfs_fhandle
*, rhashq_t
*, struct vfs
*,
247 const struct vnodeops
*,
248 int (*)(vnode_t
*, page_t
*, uoff_t
*, size_t *, int,
250 int (*)(const void *, const void *), int *, cred_t
*,
252 static void rp_rmfree(rnode_t
*);
253 static void rp_addhash(rnode_t
*);
254 static void rp_rmhash_locked(rnode_t
*);
255 static rnode_t
*rfind(rhashq_t
*, nfs_fhandle
*, struct vfs
*);
256 static void destroy_rnode(rnode_t
*);
257 static void rddir_cache_free(rddir_cache
*);
258 static int nfs_free_data_reclaim(rnode_t
*);
259 static int nfs_active_data_reclaim(rnode_t
*);
260 static int nfs_free_reclaim(void);
261 static int nfs_active_reclaim(void);
262 static int nfs_rnode_reclaim(void);
263 static void nfs_reclaim(void *);
264 static int failover_safe(failinfo_t
*);
265 static void failover_newserver(mntinfo_t
*mi
);
266 static void failover_thread(mntinfo_t
*mi
);
267 static int failover_wait(mntinfo_t
*);
268 static int failover_remap(failinfo_t
*);
269 static int failover_lookup(char *, vnode_t
*,
270 int (*)(vnode_t
*, char *, vnode_t
**,
271 struct pathname
*, int, vnode_t
*, cred_t
*, int),
272 int (*)(vnode_t
*, vnode_t
**, bool_t
, cred_t
*, int),
274 static void nfs_free_r_path(rnode_t
*);
275 static void nfs_set_vroot(vnode_t
*);
276 static char *nfs_getsrvnames(mntinfo_t
*, size_t *);
279 * from rpcsec module (common/rpcsec)
281 extern int sec_clnt_geth(CLIENT
*, struct sec_data
*, cred_t
*, AUTH
**);
282 extern void sec_clnt_freeh(AUTH
*);
283 extern void sec_clnt_freeinfo(struct sec_data
*);
286 * EIO or EINTR are not recoverable errors.
/*
 * NOTE(review): 'error' is not parenthesized in the expansion below;
 * callers are expected to pass a simple lvalue.
 */
288 #define IS_RECOVERABLE_ERROR(error) !((error == EINTR) || (error == EIO))
/*
 * Console messages used when a server stops responding.  Two variants
 * exist; the second pair (without the NFS version number) presumably
 * sits behind a preprocessor conditional that was lost in extraction --
 * the duplicate #defines would not compile as shown.
 */
291 #define SRV_QFULL_MSG "send queue to NFS%d server %s is full; still trying\n"
292 #define SRV_NOTRESP_MSG "NFS%d server %s not responding still trying\n"
294 #define SRV_QFULL_MSG "send queue to NFS server %s is full still trying\n"
295 #define SRV_NOTRESP_MSG "NFS server %s not responding still trying\n"
298 * Common handle get program for NFS, NFS ACL, and NFS AUTH client.
301 clget_impl(clinfo_t
*ci
, servinfo_t
*svp
, cred_t
*cr
, CLIENT
**newcl
,
302 struct chtab
**chp
, struct nfs_clnt
*nfscl
)
304 struct chhead
*ch
, *newch
;
305 struct chhead
**plistp
;
310 if (newcl
== NULL
|| chp
== NULL
|| ci
== NULL
)
317 * Find an unused handle or create one
320 nfscl
->nfscl_stat
.clgets
.value
.ui64
++;
323 * Find the correct entry in the cache to check for free
324 * client handles. The search is based on the RPC program
325 * number, program version number, dev_t for the transport
326 * device, and the protocol family.
328 mutex_enter(&nfscl
->nfscl_chtable_lock
);
329 plistp
= &nfscl
->nfscl_chtable
;
330 for (ch
= nfscl
->nfscl_chtable
; ch
!= NULL
; ch
= ch
->ch_next
) {
331 if (ch
->ch_prog
== ci
->cl_prog
&&
332 ch
->ch_vers
== ci
->cl_vers
&&
333 ch
->ch_dev
== svp
->sv_knconf
->knc_rdev
&&
334 (strcmp(ch
->ch_protofmly
,
335 svp
->sv_knconf
->knc_protofmly
) == 0))
337 plistp
= &ch
->ch_next
;
341 * If we didn't find a cache entry for this quadruple, then
342 * create one. If we don't have one already preallocated,
343 * then drop the cache lock, create one, and then start over.
344 * If we did have a preallocated entry, then just add it to
345 * the front of the list.
349 mutex_exit(&nfscl
->nfscl_chtable_lock
);
350 newch
= kmem_alloc(sizeof (*newch
), KM_SLEEP
);
351 newch
->ch_timesused
= 0;
352 newch
->ch_prog
= ci
->cl_prog
;
353 newch
->ch_vers
= ci
->cl_vers
;
354 newch
->ch_dev
= svp
->sv_knconf
->knc_rdev
;
355 newch
->ch_protofmly
= kmem_alloc(
356 strlen(svp
->sv_knconf
->knc_protofmly
) + 1,
358 (void) strcpy(newch
->ch_protofmly
,
359 svp
->sv_knconf
->knc_protofmly
);
360 newch
->ch_list
= NULL
;
365 ch
->ch_next
= nfscl
->nfscl_chtable
;
366 nfscl
->nfscl_chtable
= ch
;
368 * We found a cache entry, but if it isn't on the front of the
369 * list, then move it to the front of the list to try to take
370 * advantage of locality of operations.
372 } else if (ch
!= nfscl
->nfscl_chtable
) {
373 *plistp
= ch
->ch_next
;
374 ch
->ch_next
= nfscl
->nfscl_chtable
;
375 nfscl
->nfscl_chtable
= ch
;
379 * If there was a free client handle cached, then remove it
380 * from the list, init it, and use it.
382 if (ch
->ch_list
!= NULL
) {
384 ch
->ch_list
= cp
->ch_list
;
385 mutex_exit(&nfscl
->nfscl_chtable_lock
);
387 kmem_free(newch
->ch_protofmly
,
388 strlen(newch
->ch_protofmly
) + 1);
389 kmem_free(newch
, sizeof (*newch
));
391 (void) clnt_tli_kinit(cp
->ch_client
, svp
->sv_knconf
,
392 &svp
->sv_addr
, ci
->cl_readsize
, ci
->cl_retrans
, cr
);
393 error
= sec_clnt_geth(cp
->ch_client
, svp
->sv_secdata
, cr
,
394 &cp
->ch_client
->cl_auth
);
395 if (error
|| cp
->ch_client
->cl_auth
== NULL
) {
396 CLNT_DESTROY(cp
->ch_client
);
397 kmem_cache_free(chtab_cache
, cp
);
398 return ((error
!= 0) ? error
: EINTR
);
401 *newcl
= cp
->ch_client
;
407 * There weren't any free client handles which fit, so allocate
408 * a new one and use that.
411 atomic_inc_64(&nfscl
->nfscl_stat
.clalloc
.value
.ui64
);
413 mutex_exit(&nfscl
->nfscl_chtable_lock
);
415 nfscl
->nfscl_stat
.cltoomany
.value
.ui64
++;
417 kmem_free(newch
->ch_protofmly
, strlen(newch
->ch_protofmly
) + 1);
418 kmem_free(newch
, sizeof (*newch
));
421 cp
= kmem_cache_alloc(chtab_cache
, KM_SLEEP
);
424 sigintr(&smask
, (int)ci
->cl_flags
& MI_INT
);
425 error
= clnt_tli_kcreate(svp
->sv_knconf
, &svp
->sv_addr
, ci
->cl_prog
,
426 ci
->cl_vers
, ci
->cl_readsize
, ci
->cl_retrans
, cr
, &cp
->ch_client
);
430 kmem_cache_free(chtab_cache
, cp
);
432 atomic_dec_64(&nfscl
->nfscl_stat
.clalloc
.value
.ui64
);
435 * Warning is unnecessary if error is EINTR.
437 if (error
!= EINTR
) {
438 nfs_cmn_err(error
, CE_WARN
,
439 "clget: couldn't create handle: %m\n");
443 (void) CLNT_CONTROL(cp
->ch_client
, CLSET_PROGRESS
, NULL
);
444 auth_destroy(cp
->ch_client
->cl_auth
);
445 error
= sec_clnt_geth(cp
->ch_client
, svp
->sv_secdata
, cr
,
446 &cp
->ch_client
->cl_auth
);
447 if (error
|| cp
->ch_client
->cl_auth
== NULL
) {
448 CLNT_DESTROY(cp
->ch_client
);
449 kmem_cache_free(chtab_cache
, cp
);
451 atomic_dec_64(&nfscl
->nfscl_stat
.clalloc
.value
.ui64
);
453 return ((error
!= 0) ? error
: EINTR
);
456 *newcl
= cp
->ch_client
;
457 ASSERT(cp
->ch_client
->cl_nosignal
== FALSE
);
463 clget(clinfo_t
*ci
, servinfo_t
*svp
, cred_t
*cr
, CLIENT
**newcl
,
466 struct nfs_clnt
*nfscl
;
468 nfscl
= zone_getspecific(nfsclnt_zone_key
, nfs_zone());
469 ASSERT(nfscl
!= NULL
);
471 return (clget_impl(ci
, svp
, cr
, newcl
, chp
, nfscl
));
475 acl_clget(mntinfo_t
*mi
, servinfo_t
*svp
, cred_t
*cr
, CLIENT
**newcl
,
476 struct chtab
**chp
, struct nfs_clnt
*nfscl
)
482 * Set read buffer size to rsize
483 * and add room for RPC headers.
485 ci
.cl_readsize
= mi
->mi_tsize
;
486 if (ci
.cl_readsize
!= 0)
487 ci
.cl_readsize
+= (RPC_MAXDATASIZE
- NFS_MAXDATA
);
490 * If soft mount and server is down just try once.
491 * meaning: do not retransmit.
493 if (!(mi
->mi_flags
& MI_HARD
) && (mi
->mi_flags
& MI_DOWN
))
496 ci
.cl_retrans
= mi
->mi_retrans
;
498 ci
.cl_prog
= NFS_ACL_PROGRAM
;
499 ci
.cl_vers
= mi
->mi_vers
;
500 ci
.cl_flags
= mi
->mi_flags
;
503 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
504 * security flavor, the client tries to establish a security context
505 * by contacting the server. If the connection is timed out or reset,
506 * e.g. server reboot, we will try again.
509 error
= clget_impl(&ci
, svp
, cr
, newcl
, chp
, nfscl
);
515 * For forced unmount or zone shutdown, bail out, no retry.
517 if (FS_OR_ZONE_GONE(mi
->mi_vfsp
)) {
522 /* do not retry for softmount */
523 if (!(mi
->mi_flags
& MI_HARD
))
526 /* let the caller deal with the failover case */
527 if (FAILOVER_MOUNT(mi
))
530 } while (error
== ETIMEDOUT
|| error
== ECONNRESET
);
536 nfs_clget(mntinfo_t
*mi
, servinfo_t
*svp
, cred_t
*cr
, CLIENT
**newcl
,
537 struct chtab
**chp
, struct nfs_clnt
*nfscl
)
543 * Set read buffer size to rsize
544 * and add room for RPC headers.
546 ci
.cl_readsize
= mi
->mi_tsize
;
547 if (ci
.cl_readsize
!= 0)
548 ci
.cl_readsize
+= (RPC_MAXDATASIZE
- NFS_MAXDATA
);
551 * If soft mount and server is down just try once.
552 * meaning: do not retransmit.
554 if (!(mi
->mi_flags
& MI_HARD
) && (mi
->mi_flags
& MI_DOWN
))
557 ci
.cl_retrans
= mi
->mi_retrans
;
559 ci
.cl_prog
= mi
->mi_prog
;
560 ci
.cl_vers
= mi
->mi_vers
;
561 ci
.cl_flags
= mi
->mi_flags
;
564 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
565 * security flavor, the client tries to establish a security context
566 * by contacting the server. If the connection is timed out or reset,
567 * e.g. server reboot, we will try again.
570 error
= clget_impl(&ci
, svp
, cr
, newcl
, chp
, nfscl
);
576 * For forced unmount or zone shutdown, bail out, no retry.
578 if (FS_OR_ZONE_GONE(mi
->mi_vfsp
)) {
583 /* do not retry for softmount */
584 if (!(mi
->mi_flags
& MI_HARD
))
587 /* let the caller deal with the failover case */
588 if (FAILOVER_MOUNT(mi
))
591 } while (error
== ETIMEDOUT
|| error
== ECONNRESET
);
597 clfree_impl(CLIENT
*cl
, struct chtab
*cp
, struct nfs_clnt
*nfscl
)
599 if (cl
->cl_auth
!= NULL
) {
600 sec_clnt_freeh(cl
->cl_auth
);
605 * Timestamp this cache entry so that we know when it was last
608 cp
->ch_freed
= gethrestime_sec();
611 * Add the free client handle to the front of the list.
612 * This way, the list will be sorted in youngest to oldest
615 mutex_enter(&nfscl
->nfscl_chtable_lock
);
616 cp
->ch_list
= cp
->ch_head
->ch_list
;
617 cp
->ch_head
->ch_list
= cp
;
618 mutex_exit(&nfscl
->nfscl_chtable_lock
);
622 clfree(CLIENT
*cl
, struct chtab
*cp
)
624 struct nfs_clnt
*nfscl
;
626 nfscl
= zone_getspecific(nfsclnt_zone_key
, nfs_zone());
627 ASSERT(nfscl
!= NULL
);
629 clfree_impl(cl
, cp
, nfscl
);
632 #define CL_HOLDTIME 60 /* time to hold client handles */
635 clreclaim_zone(struct nfs_clnt
*nfscl
, uint_t cl_holdtime
)
638 struct chtab
*cp
; /* list of objects that can be reclaimed */
647 * Need to reclaim some memory, so step through the cache
648 * looking through the lists for entries which can be freed.
652 mutex_enter(&nfscl
->nfscl_chtable_lock
);
655 * Here we step through each non-NULL quadruple and start to
656 * construct the reclaim list pointed to by cp. Note that
657 * cp will contain all eligible chtab entries. When this traversal
658 * completes, chtab entries from the last quadruple will be at the
659 * front of cp and entries from previously inspected quadruples have
660 * been appended to the rear of cp.
662 for (ch
= nfscl
->nfscl_chtable
; ch
!= NULL
; ch
= ch
->ch_next
) {
663 if (ch
->ch_list
== NULL
)
666 * Search each list for entries older then
667 * cl_holdtime seconds. The lists are maintained
668 * in youngest to oldest order so that when the
669 * first entry is found which is old enough, then
670 * all of the rest of the entries on the list will
671 * be old enough as well.
675 while (cpl
!= NULL
&&
676 cpl
->ch_freed
+ cl_holdtime
> gethrestime_sec()) {
684 while (cpe
->ch_list
!= NULL
)
692 mutex_exit(&nfscl
->nfscl_chtable_lock
);
695 * If cp is empty, then there is nothing to reclaim here.
701 * Step through the list of entries to free, destroying each client
702 * handle and kmem_free'ing the memory for each entry.
708 CLNT_DESTROY(cp
->ch_client
);
710 kmem_cache_free(chtab_cache
, cp
);
716 * Update clalloc so that nfsstat shows the current number
717 * of allocated client handles.
719 atomic_add_64(&nfscl
->nfscl_stat
.clalloc
.value
.ui64
, -n
);
727 struct nfs_clnt
*nfscl
;
730 clstat_debug
.clreclaim
.value
.ui64
++;
733 * The system is low on memory; go through and try to reclaim some from
734 * every zone on the system.
736 mutex_enter(&nfs_clnt_list_lock
);
737 nfscl
= list_head(&nfs_clnt_list
);
738 for (; nfscl
!= NULL
; nfscl
= list_next(&nfs_clnt_list
, nfscl
))
739 clreclaim_zone(nfscl
, CL_HOLDTIME
);
740 mutex_exit(&nfs_clnt_list_lock
);
744 * Minimum time-out values indexed by call type
745 * These units are in "eights" of a second to avoid multiplies
747 static unsigned int minimum_timeo
[] = {
752 * Back off for retransmission timeout, MAXTIMO is in hz of a sec
/* cap for the retransmit timer: 20 seconds, expressed in clock ticks */
754 #define MAXTIMO (20*hz)
/* double 'tim' on each retransmission until it reaches MAXTIMO, then hold */
755 #define backoff(tim) (((tim) < MAXTIMO) ? dobackoff(tim) : (tim))
756 #define dobackoff(tim) ((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1))
758 #define MIN_NFS_TSIZE 512 /* minimum "chunk" of NFS IO */
759 #define REDUCE_NFS_TIME (hz/2) /* rtxcur we try to keep under */
760 #define INCREASE_NFS_TIME (hz/3*8) /* srtt we try to keep under (scaled*8) */
763 * Function called when rfscall notices that we have been
764 * re-transmitting, or when we get a response without retransmissions.
765 * Return 1 if the transfer size was adjusted down - 0 if no change.
768 nfs_feedback(int flag
, int which
, mntinfo_t
*mi
)
773 mutex_enter(&mi
->mi_lock
);
774 if (flag
== FEEDBACK_REXMIT1
) {
775 if (mi
->mi_timers
[NFS_CALLTYPES
].rt_rtxcur
!= 0 &&
776 mi
->mi_timers
[NFS_CALLTYPES
].rt_rtxcur
< REDUCE_NFS_TIME
)
778 if (mi
->mi_curread
> MIN_NFS_TSIZE
) {
780 if (mi
->mi_curread
< MIN_NFS_TSIZE
)
781 mi
->mi_curread
= MIN_NFS_TSIZE
;
785 if (mi
->mi_curwrite
> MIN_NFS_TSIZE
) {
786 mi
->mi_curwrite
/= 2;
787 if (mi
->mi_curwrite
< MIN_NFS_TSIZE
)
788 mi
->mi_curwrite
= MIN_NFS_TSIZE
;
791 } else if (flag
== FEEDBACK_OK
) {
792 kind
= mi
->mi_timer_type
[which
];
794 mi
->mi_timers
[kind
].rt_srtt
>= INCREASE_NFS_TIME
)
797 if (mi
->mi_curread
>= mi
->mi_tsize
)
799 mi
->mi_curread
+= MIN_NFS_TSIZE
;
800 if (mi
->mi_curread
> mi
->mi_tsize
/2)
801 mi
->mi_curread
= mi
->mi_tsize
;
802 } else if (kind
== 2) {
803 if (mi
->mi_curwrite
>= mi
->mi_stsize
)
805 mi
->mi_curwrite
+= MIN_NFS_TSIZE
;
806 if (mi
->mi_curwrite
> mi
->mi_stsize
/2)
807 mi
->mi_curwrite
= mi
->mi_stsize
;
811 mutex_exit(&mi
->mi_lock
);
/*
 * Debug counters -- presumably track how often the crnetadjust()
 * retry path in rfs2call succeeds (hits) vs. fails (misses);
 * TODO confirm against the (not visible) DEBUG code that updates them.
 */
816 static int rfs2call_hits
= 0;
817 static int rfs2call_misses
= 0;
821 rfs2call(mntinfo_t
*mi
, rpcproc_t which
, xdrproc_t xdrargs
, caddr_t argsp
,
822 xdrproc_t xdrres
, caddr_t resp
, cred_t
*cr
, int *douprintf
,
823 enum nfsstat
*statusp
, int flags
, failinfo_t
*fi
)
826 enum clnt_stat rpc_status
;
828 ASSERT(statusp
!= NULL
);
830 rpcerror
= rfscall(mi
, which
, xdrargs
, argsp
, xdrres
, resp
,
831 cr
, douprintf
, &rpc_status
, flags
, fi
);
834 * See crnetadjust() for comments.
836 if (*statusp
== NFSERR_ACCES
&&
837 (cr
= crnetadjust(cr
)) != NULL
) {
841 rpcerror
= rfscall(mi
, which
, xdrargs
, argsp
, xdrres
,
842 resp
, cr
, douprintf
, NULL
, flags
, fi
);
845 if (*statusp
== NFSERR_ACCES
)
849 } else if (rpc_status
== RPC_PROCUNAVAIL
) {
850 *statusp
= NFSERR_OPNOTSUPP
;
/* delay before retrying after NFS3ERR_JUKEBOX, in clock ticks (10 s) */
857 #define NFS3_JUKEBOX_DELAY 10 * hz
/*
 * Value actually passed to delay() in rfs3call; it is 0 here, so it is
 * presumably initialized to NFS3_JUKEBOX_DELAY elsewhere -- TODO confirm.
 * NOTE(review): the macro expansion '10 * hz' is unparenthesized.
 */
859 static clock_t nfs3_jukebox_delay
= 0;
/* debug counters for the crnetadjust() retry path in rfs3call -- TODO confirm */
862 static int rfs3call_hits
= 0;
863 static int rfs3call_misses
= 0;
867 rfs3call(mntinfo_t
*mi
, rpcproc_t which
, xdrproc_t xdrargs
, caddr_t argsp
,
868 xdrproc_t xdrres
, caddr_t resp
, cred_t
*cr
, int *douprintf
,
869 nfsstat3
*statusp
, int flags
, failinfo_t
*fi
)
876 rpcerror
= rfscall(mi
, which
, xdrargs
, argsp
, xdrres
, resp
,
877 cr
, douprintf
, NULL
, flags
, fi
);
880 if (*statusp
== NFS3ERR_JUKEBOX
) {
881 if (ttoproc(curthread
) == &p0
) {
885 if (!user_informed
) {
888 "file temporarily unavailable on the server, retrying...\n");
890 delay(nfs3_jukebox_delay
);
893 * See crnetadjust() for comments.
895 else if (*statusp
== NFS3ERR_ACCES
&&
896 (crr
= crnetadjust(cr
)) != NULL
) {
900 rpcerror
= rfscall(mi
, which
, xdrargs
, argsp
,
901 xdrres
, resp
, crr
, douprintf
,
906 if (*statusp
== NFS3ERR_ACCES
)
911 } while (!rpcerror
&& *statusp
== NFS3ERR_JUKEBOX
);
916 #define VALID_FH(fi) (VTOR(fi->vp)->r_server == VTOMI(fi->vp)->mi_curr_serv)
917 #define INC_READERS(mi) { \
920 #define DEC_READERS(mi) { \
922 if (mi->mi_readers == 0) \
923 cv_broadcast(&mi->mi_failover_cv); \
927 rfscall(mntinfo_t
*mi
, rpcproc_t which
, xdrproc_t xdrargs
, caddr_t argsp
,
928 xdrproc_t xdrres
, caddr_t resp
, cred_t
*icr
, int *douprintf
,
929 enum clnt_stat
*rpc_status
, int flags
, failinfo_t
*fi
)
934 enum clnt_stat status
;
935 struct rpc_err rpcerr
, rpcerr_tmp
;
937 int timeo
; /* in units of hz */
938 int my_rsize
, my_wsize
;
940 bool_t cred_cloned
= FALSE
;
943 struct nfs_clnt
*nfscl
;
944 zoneid_t zoneid
= getzoneid();
951 TRACE_2(TR_FAC_NFS
, TR_RFSCALL_START
,
952 "rfscall_start:which %d mi %p", which
, mi
);
954 nfscl
= zone_getspecific(nfsclnt_zone_key
, nfs_zone());
955 ASSERT(nfscl
!= NULL
);
957 nfscl
->nfscl_stat
.calls
.value
.ui64
++;
958 mi
->mi_reqs
[which
].value
.ui64
++;
960 rpcerr
.re_status
= RPC_SUCCESS
;
963 * In case of forced unmount or zone shutdown, return EIO.
966 if (FS_OR_ZONE_GONE(mi
->mi_vfsp
)) {
967 rpcerr
.re_status
= RPC_FAILED
;
968 rpcerr
.re_errno
= EIO
;
969 return (rpcerr
.re_errno
);
973 * Remember the transfer sizes in case
974 * nfs_feedback changes them underneath us.
976 my_rsize
= mi
->mi_curread
;
977 my_wsize
= mi
->mi_curwrite
;
980 * NFS client failover support
982 * If this rnode is not in sync with the current server (VALID_FH),
983 * we'd like to do a remap to get in sync. We can be interrupted
984 * in failover_remap(), and if so we'll bail. Otherwise, we'll
985 * use the best info we have to try the RPC. Part of that is
986 * unconditionally updating the filehandle copy kept for V3.
988 * Locking: INC_READERS/DEC_READERS is a poor man's interruptible
989 * rw_enter(); we're trying to keep the current server from being
990 * changed on us until we're done with the remapping and have a
991 * matching client handle. We don't want to sending a filehandle
995 if (FAILOVER_MOUNT(mi
)) {
996 mutex_enter(&mi
->mi_lock
);
997 if (!(flags
& RFSCALL_SOFT
) && failover_safe(fi
)) {
998 if (failover_wait(mi
)) {
999 mutex_exit(&mi
->mi_lock
);
1004 mutex_exit(&mi
->mi_lock
);
1006 if (!VALID_FH(fi
) &&
1007 !(flags
& RFSCALL_SOFT
) && failover_safe(fi
)) {
1010 svp
= mi
->mi_curr_serv
;
1011 remaperr
= failover_remap(fi
);
1012 if (remaperr
!= 0) {
1014 if (remaperr
!= EINTR
)
1015 nfs_cmn_err(remaperr
, CE_WARN
,
1016 "rfscall couldn't failover: %m");
1018 mutex_enter(&mi
->mi_lock
);
1020 mutex_exit(&mi
->mi_lock
);
1022 * If failover_remap returns ETIMEDOUT
1023 * and the filesystem is hard mounted
1024 * we have to retry the call with a new
1027 if ((mi
->mi_flags
& MI_HARD
) &&
1028 IS_RECOVERABLE_ERROR(remaperr
)) {
1029 if (svp
== mi
->mi_curr_serv
)
1030 failover_newserver(mi
);
1031 rpcerr
.re_status
= RPC_SUCCESS
;
1034 rpcerr
.re_errno
= remaperr
;
1038 if (fi
->fhp
&& fi
->copyproc
)
1039 (*fi
->copyproc
)(fi
->fhp
, fi
->vp
);
1044 * clget() calls clnt_tli_kinit() which clears the xid, so we
1045 * are guaranteed to reprocess the retry as a new request.
1047 svp
= mi
->mi_curr_serv
;
1048 rpcerr
.re_errno
= nfs_clget(mi
, svp
, cr
, &client
, &ch
, nfscl
);
1050 if (FAILOVER_MOUNT(mi
)) {
1051 mutex_enter(&mi
->mi_lock
);
1053 mutex_exit(&mi
->mi_lock
);
1055 if ((rpcerr
.re_errno
== ETIMEDOUT
||
1056 rpcerr
.re_errno
== ECONNRESET
) &&
1057 failover_safe(fi
)) {
1058 if (svp
== mi
->mi_curr_serv
)
1059 failover_newserver(mi
);
1063 if (rpcerr
.re_errno
!= 0)
1064 return (rpcerr
.re_errno
);
1066 if (svp
->sv_knconf
->knc_semantics
== NC_TPI_COTS_ORD
||
1067 svp
->sv_knconf
->knc_semantics
== NC_TPI_COTS
) {
1068 timeo
= (mi
->mi_timeo
* hz
) / 10;
1070 mutex_enter(&mi
->mi_lock
);
1071 timeo
= CLNT_SETTIMERS(client
,
1072 &(mi
->mi_timers
[mi
->mi_timer_type
[which
]]),
1073 &(mi
->mi_timers
[NFS_CALLTYPES
]),
1074 (minimum_timeo
[mi
->mi_call_type
[which
]]*hz
)>>3,
1075 (void (*)())NULL
, (caddr_t
)mi
, 0);
1076 mutex_exit(&mi
->mi_lock
);
1080 * If hard mounted fs, retry call forever unless hard error occurs.
1085 if (FS_OR_ZONE_GONE(mi
->mi_vfsp
)) {
1086 status
= RPC_FAILED
;
1087 rpcerr
.re_status
= RPC_FAILED
;
1088 rpcerr
.re_errno
= EIO
;
1092 TICK_TO_TIMEVAL(timeo
, &wait
);
1095 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1096 * and SIGTERM. (Preserving the existing masks).
1097 * Mask out SIGINT if mount option nointr is specified.
1099 sigintr(&smask
, (int)mi
->mi_flags
& MI_INT
);
1100 if (!(mi
->mi_flags
& MI_INT
))
1101 client
->cl_nosignal
= TRUE
;
1104 * If there is a current signal, then don't bother
1105 * even trying to send out the request because we
1106 * won't be able to block waiting for the response.
1107 * Simply assume RPC_INTR and get on with it.
1109 if (ttolwp(curthread
) != NULL
&& ISSIG(curthread
, JUSTLOOKING
))
1112 status
= CLNT_CALL(client
, which
, xdrargs
, argsp
,
1113 xdrres
, resp
, wait
);
1116 if (!(mi
->mi_flags
& MI_INT
))
1117 client
->cl_nosignal
= FALSE
;
1119 * restore original signal mask
1125 if ((mi
->mi_flags
& MI_DYNAMIC
) &&
1126 mi
->mi_timer_type
[which
] != 0 &&
1127 (mi
->mi_curread
!= my_rsize
||
1128 mi
->mi_curwrite
!= my_wsize
))
1129 (void) nfs_feedback(FEEDBACK_OK
, which
, mi
);
1134 * There is no way to recover from this error,
1135 * even if mount option nointr is specified.
1136 * SIGKILL, for example, cannot be blocked.
1138 rpcerr
.re_status
= RPC_INTR
;
1139 rpcerr
.re_errno
= EINTR
;
1144 * If the NFS server is local (vold) and
1145 * it goes away then we get RPC_UDERROR.
1146 * This is a retryable error, so we would
1147 * loop, so check to see if the specific
1148 * error was ECONNRESET, indicating that
1149 * target did not exist at all. If so,
1150 * return with RPC_PROGUNAVAIL and
1151 * ECONNRESET to indicate why.
1153 CLNT_GETERR(client
, &rpcerr
);
1154 if (rpcerr
.re_errno
== ECONNRESET
) {
1155 rpcerr
.re_status
= RPC_PROGUNAVAIL
;
1156 rpcerr
.re_errno
= ECONNRESET
;
1161 default: /* probably RPC_TIMEDOUT */
1162 if (IS_UNRECOVERABLE_RPC(status
))
1166 * increment server not responding count
1168 mutex_enter(&mi
->mi_lock
);
1169 mi
->mi_noresponse
++;
1170 mutex_exit(&mi
->mi_lock
);
1172 nfscl
->nfscl_stat
.noresponse
.value
.ui64
++;
1175 if (!(mi
->mi_flags
& MI_HARD
)) {
1176 if (!(mi
->mi_flags
& MI_SEMISOFT
) ||
1177 (mi
->mi_ss_call_type
[which
] == 0))
1182 * The call is in progress (over COTS).
1183 * Try the CLNT_CALL again, but don't
1184 * print a noisy error message.
1186 if (status
== RPC_INPROGRESS
) {
1191 if (flags
& RFSCALL_SOFT
)
1195 * On zone shutdown, just move on.
1197 if (zone_status_get(curproc
->p_zone
) >=
1198 ZONE_IS_SHUTTING_DOWN
) {
1199 rpcerr
.re_status
= RPC_FAILED
;
1200 rpcerr
.re_errno
= EIO
;
1205 * NFS client failover support
1207 * If the current server just failed us, we'll
1208 * start the process of finding a new server.
1209 * After that, we can just retry.
1211 if (FAILOVER_MOUNT(mi
) && failover_safe(fi
)) {
1212 if (svp
== mi
->mi_curr_serv
)
1213 failover_newserver(mi
);
1214 clfree_impl(client
, ch
, nfscl
);
1219 timeo
= backoff(timeo
);
1221 CLNT_GETERR(client
, &rpcerr_tmp
);
1222 if ((status
== RPC_CANTSEND
) &&
1223 (rpcerr_tmp
.re_errno
== ENOBUFS
))
1224 msg
= SRV_QFULL_MSG
;
1226 msg
= SRV_NOTRESP_MSG
;
1228 mutex_enter(&mi
->mi_lock
);
1229 if (!(mi
->mi_flags
& MI_PRINTED
)) {
1230 mi
->mi_flags
|= MI_PRINTED
;
1231 mutex_exit(&mi
->mi_lock
);
1233 zprintf(zoneid
, msg
, mi
->mi_vers
,
1236 zprintf(zoneid
, msg
, svp
->sv_hostname
);
1239 mutex_exit(&mi
->mi_lock
);
1240 if (*douprintf
&& nfs_has_ctty()) {
1242 if (!(mi
->mi_flags
& MI_NOPRINT
))
1244 uprintf(msg
, mi
->mi_vers
,
1247 uprintf(msg
, svp
->sv_hostname
);
1252 * If doing dynamic adjustment of transfer
1253 * size and if it's a read or write call
1254 * and if the transfer size changed while
1255 * retransmitting or if the feedback routine
1256 * changed the transfer size,
1257 * then exit rfscall so that the transfer
1258 * size can be adjusted at the vnops level.
1260 if ((mi
->mi_flags
& MI_DYNAMIC
) &&
1261 mi
->mi_timer_type
[which
] != 0 &&
1262 (mi
->mi_curread
!= my_rsize
||
1263 mi
->mi_curwrite
!= my_wsize
||
1264 nfs_feedback(FEEDBACK_REXMIT1
, which
, mi
))) {
1266 * On read or write calls, return
1267 * back to the vnode ops level if
1268 * the transfer size changed.
1270 clfree_impl(client
, ch
, nfscl
);
1273 return (ENFS_TRYAGAIN
);
1278 if (status
!= RPC_SUCCESS
) {
1280 * Let soft mounts use the timed out message.
1282 if (status
== RPC_INPROGRESS
)
1283 status
= RPC_TIMEDOUT
;
1284 nfscl
->nfscl_stat
.badcalls
.value
.ui64
++;
1285 if (status
!= RPC_INTR
) {
1286 mutex_enter(&mi
->mi_lock
);
1287 mi
->mi_flags
|= MI_DOWN
;
1288 mutex_exit(&mi
->mi_lock
);
1289 CLNT_GETERR(client
, &rpcerr
);
1291 bufp
= clnt_sperror(client
, svp
->sv_hostname
);
1292 zprintf(zoneid
, "NFS%d %s failed for %s\n",
1293 mi
->mi_vers
, mi
->mi_rfsnames
[which
], bufp
);
1294 if (nfs_has_ctty()) {
1295 if (!(mi
->mi_flags
& MI_NOPRINT
)) {
1296 uprintf("NFS%d %s failed for %s\n",
1297 mi
->mi_vers
, mi
->mi_rfsnames
[which
],
1301 kmem_free(bufp
, MAXPATHLEN
);
1304 "NFS %s failed for server %s: error %d (%s)\n",
1305 mi
->mi_rfsnames
[which
], svp
->sv_hostname
,
1306 status
, clnt_sperrno(status
));
1307 if (nfs_has_ctty()) {
1308 if (!(mi
->mi_flags
& MI_NOPRINT
)) {
1310 "NFS %s failed for server %s: error %d (%s)\n",
1311 mi
->mi_rfsnames
[which
],
1312 svp
->sv_hostname
, status
,
1313 clnt_sperrno(status
));
1318 * when CLNT_CALL() fails with RPC_AUTHERROR,
1319 * re_errno is set appropriately depending on
1320 * the authentication error
1322 if (status
== RPC_VERSMISMATCH
||
1323 status
== RPC_PROGVERSMISMATCH
)
1324 rpcerr
.re_errno
= EIO
;
1328 * Test the value of mi_down and mi_printed without
1329 * holding the mi_lock mutex. If they are both zero,
1330 * then it is okay to skip the down and printed
1331 * processing. This saves on a mutex_enter and
1332 * mutex_exit pair for a normal, successful RPC.
1333 * This was just complete overhead.
1335 if (mi
->mi_flags
& (MI_DOWN
| MI_PRINTED
)) {
1336 mutex_enter(&mi
->mi_lock
);
1337 mi
->mi_flags
&= ~MI_DOWN
;
1338 if (mi
->mi_flags
& MI_PRINTED
) {
1339 mi
->mi_flags
&= ~MI_PRINTED
;
1340 mutex_exit(&mi
->mi_lock
);
1342 if (!(mi
->mi_vfsp
->vfs_flag
& VFS_UNMOUNTED
))
1343 zprintf(zoneid
, "NFS%d server %s ok\n",
1344 mi
->mi_vers
, svp
->sv_hostname
);
1346 if (!(mi
->mi_vfsp
->vfs_flag
& VFS_UNMOUNTED
))
1347 zprintf(zoneid
, "NFS server %s ok\n",
1351 mutex_exit(&mi
->mi_lock
);
1354 if (*douprintf
== 0) {
1355 if (!(mi
->mi_flags
& MI_NOPRINT
))
1357 if (!(mi
->mi_vfsp
->vfs_flag
& VFS_UNMOUNTED
))
1358 uprintf("NFS%d server %s ok\n",
1359 mi
->mi_vers
, svp
->sv_hostname
);
1361 if (!(mi
->mi_vfsp
->vfs_flag
& VFS_UNMOUNTED
))
1362 uprintf("NFS server %s ok\n", svp
->sv_hostname
);
1368 clfree_impl(client
, ch
, nfscl
);
1372 ASSERT(rpcerr
.re_status
== RPC_SUCCESS
|| rpcerr
.re_errno
!= 0);
1374 if (rpc_status
!= NULL
)
1375 *rpc_status
= rpcerr
.re_status
;
1377 TRACE_1(TR_FAC_NFS
, TR_RFSCALL_END
, "rfscall_end:errno %d",
1380 return (rpcerr
.re_errno
);
1384 static int acl2call_hits
= 0;
1385 static int acl2call_misses
= 0;
1389 acl2call(mntinfo_t
*mi
, rpcproc_t which
, xdrproc_t xdrargs
, caddr_t argsp
,
1390 xdrproc_t xdrres
, caddr_t resp
, cred_t
*cr
, int *douprintf
,
1391 enum nfsstat
*statusp
, int flags
, failinfo_t
*fi
)
1395 rpcerror
= aclcall(mi
, which
, xdrargs
, argsp
, xdrres
, resp
,
1396 cr
, douprintf
, flags
, fi
);
1399 * See comments with crnetadjust().
1401 if (*statusp
== NFSERR_ACCES
&&
1402 (cr
= crnetadjust(cr
)) != NULL
) {
1406 rpcerror
= aclcall(mi
, which
, xdrargs
, argsp
, xdrres
,
1407 resp
, cr
, douprintf
, flags
, fi
);
1410 if (*statusp
== NFSERR_ACCES
)
1420 static int acl3call_hits
= 0;
1421 static int acl3call_misses
= 0;
1425 acl3call(mntinfo_t
*mi
, rpcproc_t which
, xdrproc_t xdrargs
, caddr_t argsp
,
1426 xdrproc_t xdrres
, caddr_t resp
, cred_t
*cr
, int *douprintf
,
1427 nfsstat3
*statusp
, int flags
, failinfo_t
*fi
)
1435 rpcerror
= aclcall(mi
, which
, xdrargs
, argsp
, xdrres
, resp
,
1436 cr
, douprintf
, flags
, fi
);
1439 if (*statusp
== NFS3ERR_JUKEBOX
) {
1440 if (!user_informed
) {
1443 "file temporarily unavailable on the server, retrying...\n");
1445 delay(nfs3_jukebox_delay
);
1448 * See crnetadjust() for comments.
1450 else if (*statusp
== NFS3ERR_ACCES
&&
1451 (crr
= crnetadjust(cr
)) != NULL
) {
1455 rpcerror
= aclcall(mi
, which
, xdrargs
, argsp
,
1456 xdrres
, resp
, crr
, douprintf
, flags
, fi
);
1460 if (*statusp
== NFS3ERR_ACCES
)
1465 } while (!rpcerror
&& *statusp
== NFS3ERR_JUKEBOX
);
1471 aclcall(mntinfo_t
*mi
, rpcproc_t which
, xdrproc_t xdrargs
, caddr_t argsp
,
1472 xdrproc_t xdrres
, caddr_t resp
, cred_t
*icr
, int *douprintf
,
1473 int flags
, failinfo_t
*fi
)
1478 bool_t cred_cloned
= FALSE
;
1479 enum clnt_stat status
;
1480 struct rpc_err rpcerr
;
1481 struct timeval wait
;
1482 int timeo
; /* in units of hz */
1484 int my_rsize
, my_wsize
;
1489 struct nfs_clnt
*nfscl
;
1490 zoneid_t zoneid
= getzoneid();
1496 TRACE_2(TR_FAC_NFS
, TR_RFSCALL_START
,
1497 "rfscall_start:which %d mi %p", which
, mi
);
1500 nfscl
= zone_getspecific(nfsclnt_zone_key
, nfs_zone());
1501 ASSERT(nfscl
!= NULL
);
1503 nfscl
->nfscl_stat
.calls
.value
.ui64
++;
1504 mi
->mi_aclreqs
[which
].value
.ui64
++;
1506 rpcerr
.re_status
= RPC_SUCCESS
;
1508 if (FS_OR_ZONE_GONE(mi
->mi_vfsp
)) {
1509 rpcerr
.re_status
= RPC_FAILED
;
1510 rpcerr
.re_errno
= EIO
;
1511 return (rpcerr
.re_errno
);
1516 * Remember the transfer sizes in case
1517 * nfs_feedback changes them underneath us.
1519 my_rsize
= mi
->mi_curread
;
1520 my_wsize
= mi
->mi_curwrite
;
1524 * NFS client failover support
1526 * If this rnode is not in sync with the current server (VALID_FH),
1527 * we'd like to do a remap to get in sync. We can be interrupted
1528 * in failover_remap(), and if so we'll bail. Otherwise, we'll
1529 * use the best info we have to try the RPC. Part of that is
1530 * unconditionally updating the filehandle copy kept for V3.
1532 * Locking: INC_READERS/DEC_READERS is a poor man's interrruptible
1533 * rw_enter(); we're trying to keep the current server from being
1534 * changed on us until we're done with the remapping and have a
1535 * matching client handle. We don't want to sending a filehandle
1536 * to the wrong host.
1539 if (FAILOVER_MOUNT(mi
)) {
1540 mutex_enter(&mi
->mi_lock
);
1541 if (!(flags
& RFSCALL_SOFT
) && failover_safe(fi
)) {
1542 if (failover_wait(mi
)) {
1543 mutex_exit(&mi
->mi_lock
);
1548 mutex_exit(&mi
->mi_lock
);
1550 if (!VALID_FH(fi
) &&
1551 !(flags
& RFSCALL_SOFT
) && failover_safe(fi
)) {
1554 svp
= mi
->mi_curr_serv
;
1555 remaperr
= failover_remap(fi
);
1556 if (remaperr
!= 0) {
1558 if (remaperr
!= EINTR
)
1559 nfs_cmn_err(remaperr
, CE_WARN
,
1560 "aclcall couldn't failover: %m");
1562 mutex_enter(&mi
->mi_lock
);
1564 mutex_exit(&mi
->mi_lock
);
1567 * If failover_remap returns ETIMEDOUT
1568 * and the filesystem is hard mounted
1569 * we have to retry the call with a new
1572 if ((mi
->mi_flags
& MI_HARD
) &&
1573 IS_RECOVERABLE_ERROR(remaperr
)) {
1574 if (svp
== mi
->mi_curr_serv
)
1575 failover_newserver(mi
);
1576 rpcerr
.re_status
= RPC_SUCCESS
;
1582 if (fi
->fhp
&& fi
->copyproc
)
1583 (*fi
->copyproc
)(fi
->fhp
, fi
->vp
);
1588 * acl_clget() calls clnt_tli_kinit() which clears the xid, so we
1589 * are guaranteed to reprocess the retry as a new request.
1591 svp
= mi
->mi_curr_serv
;
1592 rpcerr
.re_errno
= acl_clget(mi
, svp
, cr
, &client
, &ch
, nfscl
);
1593 if (FAILOVER_MOUNT(mi
)) {
1594 mutex_enter(&mi
->mi_lock
);
1596 mutex_exit(&mi
->mi_lock
);
1598 if ((rpcerr
.re_errno
== ETIMEDOUT
||
1599 rpcerr
.re_errno
== ECONNRESET
) &&
1600 failover_safe(fi
)) {
1601 if (svp
== mi
->mi_curr_serv
)
1602 failover_newserver(mi
);
1606 if (rpcerr
.re_errno
!= 0) {
1609 return (rpcerr
.re_errno
);
1612 if (svp
->sv_knconf
->knc_semantics
== NC_TPI_COTS_ORD
||
1613 svp
->sv_knconf
->knc_semantics
== NC_TPI_COTS
) {
1614 timeo
= (mi
->mi_timeo
* hz
) / 10;
1616 mutex_enter(&mi
->mi_lock
);
1617 timeo
= CLNT_SETTIMERS(client
,
1618 &(mi
->mi_timers
[mi
->mi_acl_timer_type
[which
]]),
1619 &(mi
->mi_timers
[NFS_CALLTYPES
]),
1620 (minimum_timeo
[mi
->mi_acl_call_type
[which
]]*hz
)>>3,
1621 (void (*)()) 0, (caddr_t
)mi
, 0);
1622 mutex_exit(&mi
->mi_lock
);
1626 * If hard mounted fs, retry call forever unless hard error occurs.
1631 if (FS_OR_ZONE_GONE(mi
->mi_vfsp
)) {
1632 status
= RPC_FAILED
;
1633 rpcerr
.re_status
= RPC_FAILED
;
1634 rpcerr
.re_errno
= EIO
;
1638 TICK_TO_TIMEVAL(timeo
, &wait
);
1641 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1642 * and SIGTERM. (Preserving the existing masks).
1643 * Mask out SIGINT if mount option nointr is specified.
1645 sigintr(&smask
, (int)mi
->mi_flags
& MI_INT
);
1646 if (!(mi
->mi_flags
& MI_INT
))
1647 client
->cl_nosignal
= TRUE
;
1650 * If there is a current signal, then don't bother
1651 * even trying to send out the request because we
1652 * won't be able to block waiting for the response.
1653 * Simply assume RPC_INTR and get on with it.
1655 if (ttolwp(curthread
) != NULL
&& ISSIG(curthread
, JUSTLOOKING
))
1658 status
= CLNT_CALL(client
, which
, xdrargs
, argsp
,
1659 xdrres
, resp
, wait
);
1662 if (!(mi
->mi_flags
& MI_INT
))
1663 client
->cl_nosignal
= FALSE
;
1665 * restore original signal mask
1672 if ((mi
->mi_flags
& MI_DYNAMIC
) &&
1673 mi
->mi_timer_type
[which
] != 0 &&
1674 (mi
->mi_curread
!= my_rsize
||
1675 mi
->mi_curwrite
!= my_wsize
))
1676 (void) nfs_feedback(FEEDBACK_OK
, which
, mi
);
1681 * Unfortunately, there are servers in the world which
1682 * are not coded correctly. They are not prepared to
1683 * handle RPC requests to the NFS port which are not
1684 * NFS requests. Thus, they may try to process the
1685 * NFS_ACL request as if it were an NFS request. This
1686 * does not work. Generally, an error will be generated
1687 * on the client because it will not be able to decode
1688 * the response from the server. However, it seems
1689 * possible that the server may not be able to decode
1690 * the arguments. Thus, the criteria for deciding
1691 * whether the server supports NFS_ACL or not is whether
1692 * the following RPC errors are returned from CLNT_CALL.
1694 case RPC_CANTDECODERES
:
1695 case RPC_PROGUNAVAIL
:
1696 case RPC_CANTDECODEARGS
:
1697 case RPC_PROGVERSMISMATCH
:
1698 mutex_enter(&mi
->mi_lock
);
1699 mi
->mi_flags
&= ~(MI_ACL
| MI_EXTATTR
);
1700 mutex_exit(&mi
->mi_lock
);
1704 * If the server supports NFS_ACL but not the new ops
1705 * for extended attributes, make sure we don't retry.
1707 case RPC_PROCUNAVAIL
:
1708 mutex_enter(&mi
->mi_lock
);
1709 mi
->mi_flags
&= ~MI_EXTATTR
;
1710 mutex_exit(&mi
->mi_lock
);
1715 * There is no way to recover from this error,
1716 * even if mount option nointr is specified.
1717 * SIGKILL, for example, cannot be blocked.
1719 rpcerr
.re_status
= RPC_INTR
;
1720 rpcerr
.re_errno
= EINTR
;
1725 * If the NFS server is local (vold) and
1726 * it goes away then we get RPC_UDERROR.
1727 * This is a retryable error, so we would
1728 * loop, so check to see if the specific
1729 * error was ECONNRESET, indicating that
1730 * target did not exist at all. If so,
1731 * return with RPC_PROGUNAVAIL and
1732 * ECONNRESET to indicate why.
1734 CLNT_GETERR(client
, &rpcerr
);
1735 if (rpcerr
.re_errno
== ECONNRESET
) {
1736 rpcerr
.re_status
= RPC_PROGUNAVAIL
;
1737 rpcerr
.re_errno
= ECONNRESET
;
1742 default: /* probably RPC_TIMEDOUT */
1743 if (IS_UNRECOVERABLE_RPC(status
))
1747 * increment server not responding count
1749 mutex_enter(&mi
->mi_lock
);
1750 mi
->mi_noresponse
++;
1751 mutex_exit(&mi
->mi_lock
);
1753 nfscl
->nfscl_stat
.noresponse
.value
.ui64
++;
1756 if (!(mi
->mi_flags
& MI_HARD
)) {
1757 if (!(mi
->mi_flags
& MI_SEMISOFT
) ||
1758 (mi
->mi_acl_ss_call_type
[which
] == 0))
1763 * The call is in progress (over COTS).
1764 * Try the CLNT_CALL again, but don't
1765 * print a noisy error message.
1767 if (status
== RPC_INPROGRESS
) {
1772 if (flags
& RFSCALL_SOFT
)
1776 * On zone shutdown, just move on.
1778 if (zone_status_get(curproc
->p_zone
) >=
1779 ZONE_IS_SHUTTING_DOWN
) {
1780 rpcerr
.re_status
= RPC_FAILED
;
1781 rpcerr
.re_errno
= EIO
;
1786 * NFS client failover support
1788 * If the current server just failed us, we'll
1789 * start the process of finding a new server.
1790 * After that, we can just retry.
1792 if (FAILOVER_MOUNT(mi
) && failover_safe(fi
)) {
1793 if (svp
== mi
->mi_curr_serv
)
1794 failover_newserver(mi
);
1795 clfree_impl(client
, ch
, nfscl
);
1800 timeo
= backoff(timeo
);
1801 mutex_enter(&mi
->mi_lock
);
1802 if (!(mi
->mi_flags
& MI_PRINTED
)) {
1803 mi
->mi_flags
|= MI_PRINTED
;
1804 mutex_exit(&mi
->mi_lock
);
1807 "NFS_ACL%d server %s not responding still trying\n",
1808 mi
->mi_vers
, svp
->sv_hostname
);
1811 "NFS server %s not responding still trying\n",
1815 mutex_exit(&mi
->mi_lock
);
1816 if (*douprintf
&& nfs_has_ctty()) {
1818 if (!(mi
->mi_flags
& MI_NOPRINT
))
1821 "NFS_ACL%d server %s not responding still trying\n",
1822 mi
->mi_vers
, svp
->sv_hostname
);
1825 "NFS server %s not responding still trying\n",
1832 * If doing dynamic adjustment of transfer
1833 * size and if it's a read or write call
1834 * and if the transfer size changed while
1835 * retransmitting or if the feedback routine
1836 * changed the transfer size,
1837 * then exit rfscall so that the transfer
1838 * size can be adjusted at the vnops level.
1840 if ((mi
->mi_flags
& MI_DYNAMIC
) &&
1841 mi
->mi_acl_timer_type
[which
] != 0 &&
1842 (mi
->mi_curread
!= my_rsize
||
1843 mi
->mi_curwrite
!= my_wsize
||
1844 nfs_feedback(FEEDBACK_REXMIT1
, which
, mi
))) {
1846 * On read or write calls, return
1847 * back to the vnode ops level if
1848 * the transfer size changed.
1850 clfree_impl(client
, ch
, nfscl
);
1853 return (ENFS_TRYAGAIN
);
1859 if (status
!= RPC_SUCCESS
) {
1861 * Let soft mounts use the timed out message.
1863 if (status
== RPC_INPROGRESS
)
1864 status
= RPC_TIMEDOUT
;
1865 nfscl
->nfscl_stat
.badcalls
.value
.ui64
++;
1866 if (status
== RPC_CANTDECODERES
||
1867 status
== RPC_PROGUNAVAIL
||
1868 status
== RPC_PROCUNAVAIL
||
1869 status
== RPC_CANTDECODEARGS
||
1870 status
== RPC_PROGVERSMISMATCH
)
1871 CLNT_GETERR(client
, &rpcerr
);
1872 else if (status
!= RPC_INTR
) {
1873 mutex_enter(&mi
->mi_lock
);
1874 mi
->mi_flags
|= MI_DOWN
;
1875 mutex_exit(&mi
->mi_lock
);
1876 CLNT_GETERR(client
, &rpcerr
);
1878 bufp
= clnt_sperror(client
, svp
->sv_hostname
);
1879 zprintf(zoneid
, "NFS_ACL%d %s failed for %s\n",
1880 mi
->mi_vers
, mi
->mi_aclnames
[which
], bufp
);
1881 if (nfs_has_ctty()) {
1882 if (!(mi
->mi_flags
& MI_NOPRINT
)) {
1883 uprintf("NFS_ACL%d %s failed for %s\n",
1884 mi
->mi_vers
, mi
->mi_aclnames
[which
],
1888 kmem_free(bufp
, MAXPATHLEN
);
1891 "NFS %s failed for server %s: error %d (%s)\n",
1892 mi
->mi_aclnames
[which
], svp
->sv_hostname
,
1893 status
, clnt_sperrno(status
));
1894 if (nfs_has_ctty()) {
1895 if (!(mi
->mi_flags
& MI_NOPRINT
))
1897 "NFS %s failed for server %s: error %d (%s)\n",
1898 mi
->mi_aclnames
[which
],
1899 svp
->sv_hostname
, status
,
1900 clnt_sperrno(status
));
1904 * when CLNT_CALL() fails with RPC_AUTHERROR,
1905 * re_errno is set appropriately depending on
1906 * the authentication error
1908 if (status
== RPC_VERSMISMATCH
||
1909 status
== RPC_PROGVERSMISMATCH
)
1910 rpcerr
.re_errno
= EIO
;
1914 * Test the value of mi_down and mi_printed without
1915 * holding the mi_lock mutex. If they are both zero,
1916 * then it is okay to skip the down and printed
1917 * processing. This saves on a mutex_enter and
1918 * mutex_exit pair for a normal, successful RPC.
1919 * This was just complete overhead.
1921 if (mi
->mi_flags
& (MI_DOWN
| MI_PRINTED
)) {
1922 mutex_enter(&mi
->mi_lock
);
1923 mi
->mi_flags
&= ~MI_DOWN
;
1924 if (mi
->mi_flags
& MI_PRINTED
) {
1925 mi
->mi_flags
&= ~MI_PRINTED
;
1926 mutex_exit(&mi
->mi_lock
);
1928 zprintf(zoneid
, "NFS_ACL%d server %s ok\n",
1929 mi
->mi_vers
, svp
->sv_hostname
);
1931 zprintf(zoneid
, "NFS server %s ok\n",
1935 mutex_exit(&mi
->mi_lock
);
1938 if (*douprintf
== 0) {
1939 if (!(mi
->mi_flags
& MI_NOPRINT
))
1941 uprintf("NFS_ACL%d server %s ok\n",
1942 mi
->mi_vers
, svp
->sv_hostname
);
1944 uprintf("NFS server %s ok\n", svp
->sv_hostname
);
1950 clfree_impl(client
, ch
, nfscl
);
1954 ASSERT(rpcerr
.re_status
== RPC_SUCCESS
|| rpcerr
.re_errno
!= 0);
1957 TRACE_1(TR_FAC_NFS
, TR_RFSCALL_END
, "rfscall_end:errno %d",
1961 return (rpcerr
.re_errno
);
1965 vattr_to_sattr(struct vattr
*vap
, struct nfssattr
*sa
)
1967 uint_t mask
= vap
->va_mask
;
1969 if (!(mask
& VATTR_MODE
))
1970 sa
->sa_mode
= (uint32_t)-1;
1972 sa
->sa_mode
= vap
->va_mode
;
1973 if (!(mask
& VATTR_UID
))
1974 sa
->sa_uid
= (uint32_t)-1;
1976 sa
->sa_uid
= (uint32_t)vap
->va_uid
;
1977 if (!(mask
& VATTR_GID
))
1978 sa
->sa_gid
= (uint32_t)-1;
1980 sa
->sa_gid
= (uint32_t)vap
->va_gid
;
1981 if (!(mask
& VATTR_SIZE
))
1982 sa
->sa_size
= (uint32_t)-1;
1984 sa
->sa_size
= (uint32_t)vap
->va_size
;
1985 if (!(mask
& VATTR_ATIME
))
1986 sa
->sa_atime
.tv_sec
= sa
->sa_atime
.tv_usec
= (int32_t)-1;
1988 /* check time validity */
1989 if (! NFS_TIME_T_OK(vap
->va_atime
.tv_sec
)) {
1992 sa
->sa_atime
.tv_sec
= vap
->va_atime
.tv_sec
;
1993 sa
->sa_atime
.tv_usec
= vap
->va_atime
.tv_nsec
/ 1000;
1995 if (!(mask
& VATTR_MTIME
))
1996 sa
->sa_mtime
.tv_sec
= sa
->sa_mtime
.tv_usec
= (int32_t)-1;
1998 /* check time validity */
1999 if (! NFS_TIME_T_OK(vap
->va_mtime
.tv_sec
)) {
2002 sa
->sa_mtime
.tv_sec
= vap
->va_mtime
.tv_sec
;
2003 sa
->sa_mtime
.tv_usec
= vap
->va_mtime
.tv_nsec
/ 1000;
2009 vattr_to_sattr3(struct vattr
*vap
, sattr3
*sa
)
2011 uint_t mask
= vap
->va_mask
;
2013 if (!(mask
& VATTR_MODE
))
2014 sa
->mode
.set_it
= FALSE
;
2016 sa
->mode
.set_it
= TRUE
;
2017 sa
->mode
.mode
= (mode3
)vap
->va_mode
;
2019 if (!(mask
& VATTR_UID
))
2020 sa
->uid
.set_it
= FALSE
;
2022 sa
->uid
.set_it
= TRUE
;
2023 sa
->uid
.uid
= (uid3
)vap
->va_uid
;
2025 if (!(mask
& VATTR_GID
))
2026 sa
->gid
.set_it
= FALSE
;
2028 sa
->gid
.set_it
= TRUE
;
2029 sa
->gid
.gid
= (gid3
)vap
->va_gid
;
2031 if (!(mask
& VATTR_SIZE
))
2032 sa
->size
.set_it
= FALSE
;
2034 sa
->size
.set_it
= TRUE
;
2035 sa
->size
.size
= (size3
)vap
->va_size
;
2037 if (!(mask
& VATTR_ATIME
))
2038 sa
->atime
.set_it
= DONT_CHANGE
;
2040 /* check time validity */
2041 if (! NFS_TIME_T_OK(vap
->va_atime
.tv_sec
)) {
2044 sa
->atime
.set_it
= SET_TO_CLIENT_TIME
;
2045 sa
->atime
.atime
.seconds
= (uint32
)vap
->va_atime
.tv_sec
;
2046 sa
->atime
.atime
.nseconds
= (uint32
)vap
->va_atime
.tv_nsec
;
2048 if (!(mask
& VATTR_MTIME
))
2049 sa
->mtime
.set_it
= DONT_CHANGE
;
2051 /* check time validity */
2052 if (! NFS_TIME_T_OK(vap
->va_mtime
.tv_sec
)) {
2055 sa
->mtime
.set_it
= SET_TO_CLIENT_TIME
;
2056 sa
->mtime
.mtime
.seconds
= (uint32
)vap
->va_mtime
.tv_sec
;
2057 sa
->mtime
.mtime
.nseconds
= (uint32
)vap
->va_mtime
.tv_nsec
;
2063 setdiropargs(struct nfsdiropargs
*da
, char *nm
, vnode_t
*dvp
)
2066 da
->da_fhandle
= VTOFH(dvp
);
2072 setdiropargs3(diropargs3
*da
, char *nm
, vnode_t
*dvp
)
2075 da
->dirp
= VTOFH3(dvp
);
2080 setdirgid(vnode_t
*dvp
, gid_t
*gidp
, cred_t
*cr
)
2086 va
.va_mask
= VATTR_MODE
| VATTR_GID
;
2087 error
= fop_getattr(dvp
, &va
, 0, cr
, NULL
);
2092 * To determine the expected group-id of the created file:
2093 * 1) If the filesystem was not mounted with the Old-BSD-compatible
2094 * GRPID option, and the directory's set-gid bit is clear,
2095 * then use the process's gid.
2096 * 2) Otherwise, set the group-id to the gid of the parent directory.
2099 mutex_enter(&rp
->r_statelock
);
2100 if (!(VTOMI(dvp
)->mi_flags
& MI_GRPID
) && !(va
.va_mode
& VSGID
))
2101 *gidp
= crgetgid(cr
);
2104 mutex_exit(&rp
->r_statelock
);
2109 setdirmode(vnode_t
*dvp
, mode_t
*omp
, cred_t
*cr
)
2114 va
.va_mask
= VATTR_MODE
;
2115 error
= fop_getattr(dvp
, &va
, 0, cr
, NULL
);
2120 * Modify the expected mode (om) so that the set-gid bit matches
2121 * that of the parent directory (dvp).
2123 if (va
.va_mode
& VSGID
)
2131 nfs_setswaplike(vnode_t
*vp
, vattr_t
*vap
)
2134 if (vp
->v_type
== VREG
&& (vap
->va_mode
& (VEXEC
| VSVTX
)) == VSVTX
) {
2135 if (!(vp
->v_flag
& VSWAPLIKE
)) {
2136 mutex_enter(&vp
->v_lock
);
2137 vp
->v_flag
|= VSWAPLIKE
;
2138 mutex_exit(&vp
->v_lock
);
2141 if (vp
->v_flag
& VSWAPLIKE
) {
2142 mutex_enter(&vp
->v_lock
);
2143 vp
->v_flag
&= ~VSWAPLIKE
;
2144 mutex_exit(&vp
->v_lock
);
2150 * Free the resources associated with an rnode.
2153 rinactive(rnode_t
*rp
, cred_t
*cr
)
2161 nfs3_pathconf_info
*info
;
2164 * Before freeing anything, wait until all asynchronous
2165 * activity is done on this rnode. This will allow all
2166 * asynchronous read ahead and write behind i/o's to
2169 mutex_enter(&rp
->r_statelock
);
2170 while (rp
->r_count
> 0)
2171 cv_wait(&rp
->r_cv
, &rp
->r_statelock
);
2172 mutex_exit(&rp
->r_statelock
);
2175 * Flush and invalidate all pages associated with the vnode.
2178 if (vn_has_cached_data(vp
)) {
2179 ASSERT(vp
->v_type
!= VCHR
);
2180 if ((rp
->r_flags
& RDIRTY
) && !rp
->r_error
) {
2181 error
= fop_putpage(vp
, 0, 0, 0, cr
, NULL
);
2182 if (error
&& (error
== ENOSPC
|| error
== EDQUOT
)) {
2183 mutex_enter(&rp
->r_statelock
);
2185 rp
->r_error
= error
;
2186 mutex_exit(&rp
->r_statelock
);
2189 nfs_invalidate_pages(vp
, 0, cr
);
2193 * Free any held credentials and caches which may be associated
2196 mutex_enter(&rp
->r_statelock
);
2199 contents
= rp
->r_symlink
.contents
;
2200 size
= rp
->r_symlink
.size
;
2201 rp
->r_symlink
.contents
= NULL
;
2202 vsp
= rp
->r_secattr
;
2203 rp
->r_secattr
= NULL
;
2204 info
= rp
->r_pathconf
;
2205 rp
->r_pathconf
= NULL
;
2206 mutex_exit(&rp
->r_statelock
);
2209 * Free the held credential.
2215 * Free the access cache entries.
2217 (void) nfs_access_purge_rp(rp
);
2220 * Free the readdir cache entries.
2222 if (HAVE_RDDIR_CACHE(rp
))
2223 nfs_purge_rddir_cache(vp
);
2226 * Free the symbolic link cache.
2228 if (contents
!= NULL
) {
2230 kmem_free((void *)contents
, size
);
2234 * Free any cached ACL.
2240 * Free any cached pathconf information.
2243 kmem_free(info
, sizeof (*info
));
2247 * Return a vnode for the given NFS Version 2 file handle.
2248 * If no rnode exists for this fhandle, create one and put it
2249 * into the hash queues. If the rnode for this fhandle
2250 * already exists, return it.
2252 * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
2255 makenfsnode(fhandle_t
*fh
, struct nfsfattr
*attr
, struct vfs
*vfsp
,
2256 hrtime_t t
, cred_t
*cr
, char *dnm
, char *nm
)
2264 nfh
.fh_len
= NFS_FHSIZE
;
2265 bcopy(fh
, nfh
.fh_buf
, NFS_FHSIZE
);
2267 index
= rtablehash(&nfh
);
2268 rw_enter(&rtable
[index
].r_lock
, RW_READER
);
2270 vp
= make_rnode(&nfh
, &rtable
[index
], vfsp
, &nfs_vnodeops
,
2271 nfs_putapage
, nfs_rddir_compar
, &newnode
, cr
, dnm
, nm
);
2275 rw_exit(&rtable
[index
].r_lock
);
2276 (void) nfs_cache_fattr(vp
, attr
, &va
, t
, cr
);
2278 if (attr
->na_type
< NFNON
|| attr
->na_type
> NFSOC
)
2281 vp
->v_type
= n2v_type(attr
);
2283 * A translation here seems to be necessary
2284 * because this function can be called
2285 * with `attr' that has come from the wire,
2286 * and been operated on by vattr_to_nattr().
2287 * See nfsrootvp()->VOP_GETTATTR()->nfsgetattr()
2288 * ->nfs_getattr_otw()->rfscall()->vattr_to_nattr()
2291 if ((attr
->na_rdev
& 0xffff0000) == 0)
2292 vp
->v_rdev
= nfsv2_expdev(attr
->na_rdev
);
2294 vp
->v_rdev
= expldev(n2v_rdev(attr
));
2295 nfs_attrcache(vp
, attr
, t
);
2296 rw_exit(&rtable
[index
].r_lock
);
2300 PURGE_ATTRCACHE(vp
);
2302 rw_exit(&rtable
[index
].r_lock
);
2309 * Return a vnode for the given NFS Version 3 file handle.
2310 * If no rnode exists for this fhandle, create one and put it
2311 * into the hash queues. If the rnode for this fhandle
2312 * already exists, return it.
2314 * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
2317 makenfs3node_va(nfs_fh3
*fh
, vattr_t
*vap
, struct vfs
*vfsp
, hrtime_t t
,
2318 cred_t
*cr
, char *dnm
, char *nm
)
2324 index
= rtablehash((nfs_fhandle
*)fh
);
2325 rw_enter(&rtable
[index
].r_lock
, RW_READER
);
2327 vp
= make_rnode((nfs_fhandle
*)fh
, &rtable
[index
], vfsp
,
2328 &nfs3_vnodeops
, nfs3_putapage
, nfs3_rddir_compar
, &newnode
, cr
,
2333 PURGE_ATTRCACHE(vp
);
2335 rw_exit(&rtable
[index
].r_lock
);
2340 rw_exit(&rtable
[index
].r_lock
);
2341 nfs_attr_cache(vp
, vap
, t
, cr
);
2343 rnode_t
*rp
= VTOR(vp
);
2345 vp
->v_type
= vap
->va_type
;
2346 vp
->v_rdev
= vap
->va_rdev
;
2348 mutex_enter(&rp
->r_statelock
);
2349 if (rp
->r_mtime
<= t
)
2350 nfs_attrcache_va(vp
, vap
);
2351 mutex_exit(&rp
->r_statelock
);
2352 rw_exit(&rtable
[index
].r_lock
);
2359 makenfs3node(nfs_fh3
*fh
, fattr3
*attr
, struct vfs
*vfsp
, hrtime_t t
,
2360 cred_t
*cr
, char *dnm
, char *nm
)
2367 index
= rtablehash((nfs_fhandle
*)fh
);
2368 rw_enter(&rtable
[index
].r_lock
, RW_READER
);
2370 vp
= make_rnode((nfs_fhandle
*)fh
, &rtable
[index
], vfsp
,
2371 &nfs3_vnodeops
, nfs3_putapage
, nfs3_rddir_compar
, &newnode
, cr
,
2376 PURGE_ATTRCACHE(vp
);
2378 rw_exit(&rtable
[index
].r_lock
);
2383 rw_exit(&rtable
[index
].r_lock
);
2384 (void) nfs3_cache_fattr3(vp
, attr
, &va
, t
, cr
);
2386 if (attr
->type
< NF3REG
|| attr
->type
> NF3FIFO
)
2389 vp
->v_type
= nf3_to_vt
[attr
->type
];
2390 vp
->v_rdev
= makedevice(attr
->rdev
.specdata1
,
2391 attr
->rdev
.specdata2
);
2392 nfs3_attrcache(vp
, attr
, t
);
2393 rw_exit(&rtable
[index
].r_lock
);
2400 * Read this comment before making changes to rtablehash()!
2401 * This is a hash function in which seemingly obvious and harmless
2402 * changes can cause escalations costing million dollars!
2403 * Know what you are doing.
2405 * rtablehash() implements Jenkins' one-at-a-time hash algorithm. The
2406 * algorithm is currently detailed here:
2408 * http://burtleburtle.net/bob/hash/doobs.html
2410 * Of course, the above link may not be valid by the time you are reading
2411 * this, but suffice it to say that the one-at-a-time algorithm works well in
2412 * almost all cases. If you are changing the algorithm be sure to verify that
2413 * the hash algorithm still provides even distribution in all cases and with
2414 * any server returning filehandles in whatever order (sequential or random).
2417 rtablehash(nfs_fhandle
*fh
)
2419 ulong_t hash
, len
, i
;
2423 len
= (ulong_t
)fh
->fh_len
;
2424 for (hash
= 0, i
= 0; i
< len
; i
++) {
2426 hash
+= (hash
<< 10);
2427 hash
^= (hash
>> 6);
2429 hash
+= (hash
<< 3);
2430 hash
^= (hash
>> 11);
2431 hash
+= (hash
<< 15);
2432 return (hash
& rtablemask
);
2436 make_rnode(nfs_fhandle
*fh
, rhashq_t
*rhtp
, struct vfs
*vfsp
,
2437 const struct vnodeops
*vops
,
2438 int (*putapage
)(vnode_t
*, page_t
*, uoff_t
*, size_t *, int, cred_t
*),
2439 int (*compar
)(const void *, const void *),
2440 int *newnode
, cred_t
*cr
, char *dnm
, char *nm
)
2447 ASSERT(RW_READ_HELD(&rhtp
->r_lock
));
2451 if ((rp
= rfind(rhtp
, fh
, vfsp
)) != NULL
) {
2457 rw_exit(&rhtp
->r_lock
);
2459 mutex_enter(&rpfreelist_lock
);
2460 if (rpfreelist
!= NULL
&& rnew
>= nrnode
) {
2463 mutex_exit(&rpfreelist_lock
);
2467 if (rp
->r_flags
& RHASHED
) {
2468 rw_enter(&rp
->r_hashq
->r_lock
, RW_WRITER
);
2469 mutex_enter(&vp
->v_lock
);
2470 if (vp
->v_count
> 1) {
2472 mutex_exit(&vp
->v_lock
);
2473 rw_exit(&rp
->r_hashq
->r_lock
);
2474 rw_enter(&rhtp
->r_lock
, RW_READER
);
2477 mutex_exit(&vp
->v_lock
);
2478 rp_rmhash_locked(rp
);
2479 rw_exit(&rp
->r_hashq
->r_lock
);
2484 mutex_enter(&vp
->v_lock
);
2485 if (vp
->v_count
> 1) {
2487 mutex_exit(&vp
->v_lock
);
2488 rw_enter(&rhtp
->r_lock
, RW_READER
);
2491 mutex_exit(&vp
->v_lock
);
2494 * destroy old locks before bzero'ing and
2495 * recreating the locks below.
2497 nfs_rw_destroy(&rp
->r_rwlock
);
2498 nfs_rw_destroy(&rp
->r_lkserlock
);
2499 mutex_destroy(&rp
->r_statelock
);
2500 cv_destroy(&rp
->r_cv
);
2501 cv_destroy(&rp
->r_commit
.c_cv
);
2502 nfs_free_r_path(rp
);
2503 avl_destroy(&rp
->r_dir
);
2505 * Make sure that if rnode is recycled then
2506 * VFS count is decremented properly before
2509 VFS_RELE(vp
->v_vfsp
);
2514 mutex_exit(&rpfreelist_lock
);
2516 rp
= kmem_cache_alloc(rnode_cache
, KM_SLEEP
);
2517 new_vp
= vn_alloc(KM_SLEEP
);
2519 atomic_inc_ulong((ulong_t
*)&rnew
);
2521 clstat_debug
.nrnode
.value
.ui64
++;
2526 bzero(rp
, sizeof (*rp
));
2528 nfs_rw_init(&rp
->r_rwlock
, NULL
, RW_DEFAULT
, NULL
);
2529 nfs_rw_init(&rp
->r_lkserlock
, NULL
, RW_DEFAULT
, NULL
);
2530 mutex_init(&rp
->r_statelock
, NULL
, MUTEX_DEFAULT
, NULL
);
2531 cv_init(&rp
->r_cv
, NULL
, CV_DEFAULT
, NULL
);
2532 cv_init(&rp
->r_commit
.c_cv
, NULL
, CV_DEFAULT
, NULL
);
2533 rp
->r_fh
.fh_len
= fh
->fh_len
;
2534 bcopy(fh
->fh_buf
, rp
->r_fh
.fh_buf
, fh
->fh_len
);
2535 rp
->r_server
= mi
->mi_curr_serv
;
2536 if (FAILOVER_MOUNT(mi
)) {
2538 * If replicated servers, stash pathnames
2540 if (dnm
!= NULL
&& nm
!= NULL
) {
2544 len
= (uint_t
)(strlen(dnm
) + strlen(nm
) + 2);
2545 rp
->r_path
= kmem_alloc(len
, KM_SLEEP
);
2547 clstat_debug
.rpath
.value
.ui64
+= len
;
2550 for (p
= dnm
; *p
; p
++)
2553 for (p
= nm
; *p
; p
++)
2557 /* special case for root */
2558 rp
->r_path
= kmem_alloc(2, KM_SLEEP
);
2560 clstat_debug
.rpath
.value
.ui64
+= 2;
2563 *(rp
->r_path
+ 1) = '\0';
2567 rp
->r_putapage
= putapage
;
2569 rp
->r_flags
= RREADDIRPLUS
;
2570 avl_create(&rp
->r_dir
, compar
, sizeof (rddir_cache
),
2571 offsetof(rddir_cache
, tree
));
2572 vn_setops(vp
, vops
);
2573 vp
->v_data
= (caddr_t
)rp
;
2576 vp
->v_flag
|= VMODSORT
;
2580 * There is a race condition if someone else
2581 * alloc's the rnode while no locks are held, so we
2582 * check again and recover if found.
2584 rw_enter(&rhtp
->r_lock
, RW_WRITER
);
2585 if ((trp
= rfind(rhtp
, fh
, vfsp
)) != NULL
) {
2589 rw_exit(&rhtp
->r_lock
);
2591 rw_enter(&rhtp
->r_lock
, RW_READER
);
2600 * Callback function to check if the page should be marked as
2601 * modified. In the positive case, p_fsdata is set to C_NOCOMMIT.
2604 nfs_setmod_check(page_t
*pp
)
2606 if (pp
->p_fsdata
!= C_NOCOMMIT
) {
2607 pp
->p_fsdata
= C_NOCOMMIT
;
2614 nfs_set_vroot(vnode_t
*vp
)
2617 nfs_fhandle
*rootfh
;
2620 rootfh
= &rp
->r_server
->sv_fhandle
;
2621 if (rootfh
->fh_len
== rp
->r_fh
.fh_len
&&
2622 bcmp(rootfh
->fh_buf
, rp
->r_fh
.fh_buf
, rp
->r_fh
.fh_len
) == 0) {
2623 if (!(vp
->v_flag
& VROOT
)) {
2624 mutex_enter(&vp
->v_lock
);
2625 vp
->v_flag
|= VROOT
;
2626 mutex_exit(&vp
->v_lock
);
2632 nfs_free_r_path(rnode_t
*rp
)
2640 len
= strlen(path
) + 1;
2641 kmem_free(path
, len
);
2643 clstat_debug
.rpath
.value
.ui64
-= len
;
2649 * Put an rnode on the free list.
2651 * Rnodes which were allocated above and beyond the normal limit
2652 * are immediately freed.
2655 rp_addfree(rnode_t
*rp
, cred_t
*cr
)
2661 ASSERT(vp
->v_count
>= 1);
2662 ASSERT(rp
->r_freef
== NULL
&& rp
->r_freeb
== NULL
);
2665 * If we have too many rnodes allocated and there are no
2666 * references to this rnode, or if the rnode is no longer
2667 * accessible by it does not reside in the hash queues,
2668 * or if an i/o error occurred while writing to the file,
2669 * then just free it instead of putting it on the rnode
2673 if (((rnew
> nrnode
|| !(rp
->r_flags
& RHASHED
) || rp
->r_error
||
2674 (vfsp
->vfs_flag
& VFS_UNMOUNTED
)) && rp
->r_count
== 0)) {
2675 if (rp
->r_flags
& RHASHED
) {
2676 rw_enter(&rp
->r_hashq
->r_lock
, RW_WRITER
);
2677 mutex_enter(&vp
->v_lock
);
2678 if (vp
->v_count
> 1) {
2680 mutex_exit(&vp
->v_lock
);
2681 rw_exit(&rp
->r_hashq
->r_lock
);
2684 mutex_exit(&vp
->v_lock
);
2685 rp_rmhash_locked(rp
);
2686 rw_exit(&rp
->r_hashq
->r_lock
);
2692 * Recheck the vnode reference count. We need to
2693 * make sure that another reference has not been
2694 * acquired while we were not holding v_lock. The
2695 * rnode is not in the rnode hash queues, so the
2696 * only way for a reference to have been acquired
2697 * is for a fop_putpage because the rnode was marked
2698 * with RDIRTY or for a modified page. This
2699 * reference may have been acquired before our call
2700 * to rinactive. The i/o may have been completed,
2701 * thus allowing rinactive to complete, but the
2702 * reference to the vnode may not have been released
2703 * yet. In any case, the rnode can not be destroyed
2704 * until the other references to this vnode have been
2705 * released. The other references will take care of
2706 * either destroying the rnode or placing it on the
2707 * rnode freelist. If there are no other references,
2708 * then the rnode may be safely destroyed.
2710 mutex_enter(&vp
->v_lock
);
2711 if (vp
->v_count
> 1) {
2713 mutex_exit(&vp
->v_lock
);
2716 mutex_exit(&vp
->v_lock
);
2723 * Lock the hash queue and then recheck the reference count
2724 * to ensure that no other threads have acquired a reference
2725 * to indicate that the rnode should not be placed on the
2726 * freelist. If another reference has been acquired, then
2727 * just release this one and let the other thread complete
2728 * the processing of adding this rnode to the freelist.
2730 rw_enter(&rp
->r_hashq
->r_lock
, RW_WRITER
);
2732 mutex_enter(&vp
->v_lock
);
2733 if (vp
->v_count
> 1) {
2735 mutex_exit(&vp
->v_lock
);
2736 rw_exit(&rp
->r_hashq
->r_lock
);
2739 mutex_exit(&vp
->v_lock
);
2742 * If there is no cached data or metadata for this file, then
2743 * put the rnode on the front of the freelist so that it will
2744 * be reused before other rnodes which may have cached data or
2745 * metadata associated with them.
2747 mutex_enter(&rpfreelist_lock
);
2748 if (rpfreelist
== NULL
) {
2753 rp
->r_freef
= rpfreelist
;
2754 rp
->r_freeb
= rpfreelist
->r_freeb
;
2755 rpfreelist
->r_freeb
->r_freef
= rp
;
2756 rpfreelist
->r_freeb
= rp
;
2757 if (!vn_has_cached_data(vp
) &&
2758 !HAVE_RDDIR_CACHE(rp
) &&
2759 rp
->r_symlink
.contents
== NULL
&&
2760 rp
->r_secattr
== NULL
&&
2761 rp
->r_pathconf
== NULL
)
2764 mutex_exit(&rpfreelist_lock
);
2766 rw_exit(&rp
->r_hashq
->r_lock
);
2770 * Remove an rnode from the free list.
2772 * The caller must be holding rpfreelist_lock and the rnode
2773 * must be on the freelist.
2776 rp_rmfree(rnode_t
*rp
)
2779 ASSERT(MUTEX_HELD(&rpfreelist_lock
));
2780 ASSERT(rp
->r_freef
!= NULL
&& rp
->r_freeb
!= NULL
);
2782 if (rp
== rpfreelist
) {
2783 rpfreelist
= rp
->r_freef
;
2784 if (rp
== rpfreelist
)
2788 rp
->r_freeb
->r_freef
= rp
->r_freef
;
2789 rp
->r_freef
->r_freeb
= rp
->r_freeb
;
2791 rp
->r_freef
= rp
->r_freeb
= NULL
;
2795 * Put a rnode in the hash table.
2797 * The caller must be holding the exclusive hash queue lock.
2800 rp_addhash(rnode_t
*rp
)
2804 ASSERT(RW_WRITE_HELD(&rp
->r_hashq
->r_lock
));
2805 ASSERT(!(rp
->r_flags
& RHASHED
));
2807 rp
->r_hashf
= rp
->r_hashq
->r_hashf
;
2808 rp
->r_hashq
->r_hashf
= rp
;
2809 rp
->r_hashb
= (rnode_t
*)rp
->r_hashq
;
2810 rp
->r_hashf
->r_hashb
= rp
;
2812 mutex_enter(&rp
->r_statelock
);
2813 rp
->r_flags
|= RHASHED
;
2814 mutex_exit(&rp
->r_statelock
);
2816 mi
= VTOMI(RTOV(rp
));
2817 mutex_enter(&mi
->mi_rnodes_lock
);
2818 list_insert_tail(&mi
->mi_rnodes
, rp
);
2819 mutex_exit(&mi
->mi_rnodes_lock
);
2823 * Remove a rnode from the hash table.
2825 * The caller must be holding the hash queue lock.
2828 rp_rmhash_locked(rnode_t
*rp
)
2832 ASSERT(RW_WRITE_HELD(&rp
->r_hashq
->r_lock
));
2833 ASSERT(rp
->r_flags
& RHASHED
);
2835 rp
->r_hashb
->r_hashf
= rp
->r_hashf
;
2836 rp
->r_hashf
->r_hashb
= rp
->r_hashb
;
2838 mutex_enter(&rp
->r_statelock
);
2839 rp
->r_flags
&= ~RHASHED
;
2840 mutex_exit(&rp
->r_statelock
);
2842 mi
= VTOMI(RTOV(rp
));
2843 mutex_enter(&mi
->mi_rnodes_lock
);
2844 if (list_link_active(&rp
->r_mi_link
))
2845 list_remove(&mi
->mi_rnodes
, rp
);
2846 mutex_exit(&mi
->mi_rnodes_lock
);
2850 * Remove a rnode from the hash table.
2852 * The caller must not be holding the hash queue lock.
2855 rp_rmhash(rnode_t
*rp
)
2858 rw_enter(&rp
->r_hashq
->r_lock
, RW_WRITER
);
2859 rp_rmhash_locked(rp
);
2860 rw_exit(&rp
->r_hashq
->r_lock
);
2864 * Lookup a rnode by fhandle.
2866 * The caller must be holding the hash queue lock, either shared or exclusive.
2869 rfind(rhashq_t
*rhtp
, nfs_fhandle
*fh
, struct vfs
*vfsp
)
2874 ASSERT(RW_LOCK_HELD(&rhtp
->r_lock
));
2876 for (rp
= rhtp
->r_hashf
; rp
!= (rnode_t
*)rhtp
; rp
= rp
->r_hashf
) {
2878 if (vp
->v_vfsp
== vfsp
&&
2879 rp
->r_fh
.fh_len
== fh
->fh_len
&&
2880 bcmp(rp
->r_fh
.fh_buf
, fh
->fh_buf
, fh
->fh_len
) == 0) {
2882 * remove rnode from free list, if necessary.
2884 if (rp
->r_freef
!= NULL
) {
2885 mutex_enter(&rpfreelist_lock
);
2887 * If the rnode is on the freelist,
2888 * then remove it and use that reference
2889 * as the new reference. Otherwise,
2890 * need to increment the reference count.
2892 if (rp
->r_freef
!= NULL
) {
2894 mutex_exit(&rpfreelist_lock
);
2896 mutex_exit(&rpfreelist_lock
);
2908 * Return 1 if there is an active vnode belonging to this vfs in the
2911 * Several of these checks are done without holding the usual
2912 * locks. This is safe because destroy_rtable(), rp_addfree(),
2913 * etc. will redo the necessary checks before actually destroying
2917 check_rtable(struct vfs
*vfsp
)
2923 ASSERT(vfsp
!= NULL
);
2926 mutex_enter(&mi
->mi_rnodes_lock
);
2927 for (rp
= list_head(&mi
->mi_rnodes
); rp
!= NULL
;
2928 rp
= list_next(&mi
->mi_rnodes
, rp
)) {
2931 if (rp
->r_freef
== NULL
||
2932 (vn_has_cached_data(vp
) && (rp
->r_flags
& RDIRTY
)) ||
2934 mutex_exit(&mi
->mi_rnodes_lock
);
2938 mutex_exit(&mi
->mi_rnodes_lock
);
2944 * Destroy inactive vnodes from the hash queues which belong to this
2945 * vfs. It is essential that we destroy all inactive vnodes during a
2946 * forced unmount as well as during a normal unmount.
2949 destroy_rtable(struct vfs
*vfsp
, cred_t
*cr
)
2954 ASSERT(vfsp
!= NULL
);
2958 mutex_enter(&rpfreelist_lock
);
2959 mutex_enter(&mi
->mi_rnodes_lock
);
2960 while ((rp
= list_remove_head(&mi
->mi_rnodes
)) != NULL
) {
2962 * If the rnode is no longer on the freelist it is not
2963 * ours and it will be handled by some other thread, so
2966 if (rp
->r_freef
== NULL
)
2968 mutex_exit(&mi
->mi_rnodes_lock
);
2971 mutex_exit(&rpfreelist_lock
);
2976 * This call to rp_addfree will end up destroying the
2977 * rnode, but in a safe way with the appropriate set
2982 mutex_enter(&rpfreelist_lock
);
2983 mutex_enter(&mi
->mi_rnodes_lock
);
2985 mutex_exit(&mi
->mi_rnodes_lock
);
2986 mutex_exit(&rpfreelist_lock
);
2990 * This routine destroys all the resources associated with the rnode
2991 * and then the rnode itself.
2994 destroy_rnode(rnode_t
*rp
)
3002 ASSERT(vp
->v_count
== 1);
3003 ASSERT(rp
->r_count
== 0);
3004 ASSERT(rp
->r_lmpl
== NULL
);
3005 ASSERT(rp
->r_mapcnt
== 0);
3006 ASSERT(!(rp
->r_flags
& RHASHED
));
3007 ASSERT(rp
->r_freef
== NULL
&& rp
->r_freeb
== NULL
);
3008 atomic_dec_ulong((ulong_t
*)&rnew
);
3010 clstat_debug
.nrnode
.value
.ui64
--;
3012 nfs_rw_destroy(&rp
->r_rwlock
);
3013 nfs_rw_destroy(&rp
->r_lkserlock
);
3014 mutex_destroy(&rp
->r_statelock
);
3015 cv_destroy(&rp
->r_cv
);
3016 cv_destroy(&rp
->r_commit
.c_cv
);
3017 if (rp
->r_flags
& RDELMAPLIST
)
3018 list_destroy(&rp
->r_indelmap
);
3019 nfs_free_r_path(rp
);
3020 avl_destroy(&rp
->r_dir
);
3023 kmem_cache_free(rnode_cache
, rp
);
3028 * Flush all vnodes in this (or every) vfs.
3029 * Used by nfs_sync and by nfs_unmount.
3032 rflush(struct vfs
*vfsp
, cred_t
*cr
)
3036 vnode_t
*vp
, **vplist
;
3040 * Check to see whether there is anything to do.
3047 * Allocate a slot for all currently active rnodes on the
3048 * supposition that they all may need flushing.
3050 vplist
= kmem_alloc(num
* sizeof (*vplist
), KM_SLEEP
);
3054 * If the vfs is known we can do fast path by iterating all rnodes that
3055 * belongs to this vfs. This is much faster than the traditional way
3056 * of iterating rtable (below) in a case there is a lot of rnodes that
3057 * does not belong to our vfs.
3060 mntinfo_t
*mi
= VFTOMI(vfsp
);
3062 mutex_enter(&mi
->mi_rnodes_lock
);
3063 for (rp
= list_head(&mi
->mi_rnodes
); rp
!= NULL
;
3064 rp
= list_next(&mi
->mi_rnodes
, rp
)) {
3067 * Don't bother sync'ing a vp if it
3068 * is part of virtual swap device or
3069 * if VFS is read-only
3071 if (IS_SWAPVP(vp
) || vn_is_readonly(vp
))
3074 * If the vnode has pages and is marked as either dirty
3075 * or mmap'd, hold and add this vnode to the list of
3078 ASSERT(vp
->v_vfsp
== vfsp
);
3079 if (vn_has_cached_data(vp
) &&
3080 ((rp
->r_flags
& RDIRTY
) || rp
->r_mapcnt
> 0)) {
3085 * The vplist is full because there is
3086 * too many rnodes. We are done for
3093 mutex_exit(&mi
->mi_rnodes_lock
);
3098 ASSERT(vfsp
== NULL
);
3101 * Walk the hash queues looking for rnodes with page
3102 * lists associated with them. Make a list of these
3105 for (index
= 0; index
< rtablesize
; index
++) {
3106 rw_enter(&rtable
[index
].r_lock
, RW_READER
);
3107 for (rp
= rtable
[index
].r_hashf
;
3108 rp
!= (rnode_t
*)(&rtable
[index
]);
3112 * Don't bother sync'ing a vp if it
3113 * is part of virtual swap device or
3114 * if VFS is read-only
3116 if (IS_SWAPVP(vp
) || vn_is_readonly(vp
))
3119 * If the vnode has pages and is marked as either dirty
3120 * or mmap'd, hold and add this vnode to the list of
3123 if (vn_has_cached_data(vp
) &&
3124 ((rp
->r_flags
& RDIRTY
) || rp
->r_mapcnt
> 0)) {
3128 rw_exit(&rtable
[index
].r_lock
);
3130 * The vplist is full because there is
3131 * too many rnodes. We are done for
3138 rw_exit(&rtable
[index
].r_lock
);
3144 * Flush and release all of the files on the list.
3148 (void) fop_putpage(vp
, 0, 0, B_ASYNC
, cr
, NULL
);
3153 * Free the space allocated to hold the list.
3155 kmem_free(vplist
, num
* sizeof (*vplist
));
3159 * This probably needs to be larger than or equal to
3160 * log2(sizeof (struct rnode)) due to the way that rnodes are
3163 #define ACACHE_SHIFT_BITS 9
3166 acachehash(rnode_t
*rp
, cred_t
*cr
)
3169 return ((((intptr_t)rp
>> ACACHE_SHIFT_BITS
) + crgetuid(cr
)) &
3174 static long nfs_access_cache_hits
= 0;
3175 static long nfs_access_cache_misses
= 0;
3179 nfs_access_check(rnode_t
*rp
, uint32_t acc
, cred_t
*cr
)
3184 nfs_access_type_t all
;
3187 if (!ATTRCACHE_VALID(vp
) || nfs_waitfor_purge_complete(vp
))
3188 return (NFS_ACCESS_UNKNOWN
);
3190 if (rp
->r_acache
!= NULL
) {
3191 hp
= &acache
[acachehash(rp
, cr
)];
3192 rw_enter(&hp
->lock
, RW_READER
);
3194 while (ap
!= (acache_t
*)hp
) {
3195 if (crcmp(ap
->cred
, cr
) == 0 && ap
->rnode
== rp
) {
3196 if ((ap
->known
& acc
) == acc
) {
3198 nfs_access_cache_hits
++;
3200 if ((ap
->allowed
& acc
) == acc
)
3201 all
= NFS_ACCESS_ALLOWED
;
3203 all
= NFS_ACCESS_DENIED
;
3206 nfs_access_cache_misses
++;
3208 all
= NFS_ACCESS_UNKNOWN
;
3219 nfs_access_cache_misses
++;
3221 return (NFS_ACCESS_UNKNOWN
);
3225 nfs_access_cache(rnode_t
*rp
, uint32_t acc
, uint32_t resacc
, cred_t
*cr
)
3231 hp
= &acache
[acachehash(rp
, cr
)];
3234 * Allocate now assuming that mostly an allocation will be
3235 * required. This allows the allocation to happen without
3236 * holding the hash bucket locked.
3238 nap
= kmem_cache_alloc(acache_cache
, KM_NOSLEEP
);
3241 nap
->allowed
= resacc
;
3248 rw_enter(&hp
->lock
, RW_WRITER
);
3250 if (rp
->r_acache
!= NULL
) {
3252 while (ap
!= (acache_t
*)hp
) {
3253 if (crcmp(ap
->cred
, cr
) == 0 && ap
->rnode
== rp
) {
3255 ap
->allowed
&= ~acc
;
3256 ap
->allowed
|= resacc
;
3260 kmem_cache_free(acache_cache
, nap
);
3270 clstat_debug
.access
.value
.ui64
++;
3272 nap
->next
= hp
->next
;
3274 nap
->next
->prev
= nap
;
3275 nap
->prev
= (acache_t
*)hp
;
3277 mutex_enter(&rp
->r_statelock
);
3278 nap
->list
= rp
->r_acache
;
3280 mutex_exit(&rp
->r_statelock
);
3287 nfs_access_purge_rp(rnode_t
*rp
)
3294 * If there aren't any cached entries, then there is nothing
3297 if (rp
->r_acache
== NULL
)
3300 mutex_enter(&rp
->r_statelock
);
3301 rplist
= rp
->r_acache
;
3302 rp
->r_acache
= NULL
;
3303 mutex_exit(&rp
->r_statelock
);
3306 * Loop through each entry in the list pointed to in the
3307 * rnode. Remove each of these entries from the hash
3308 * queue that it is on and remove it from the list in
3311 for (ap
= rplist
; ap
!= NULL
; ap
= tmpap
) {
3312 rw_enter(&ap
->hashq
->lock
, RW_WRITER
);
3313 ap
->prev
->next
= ap
->next
;
3314 ap
->next
->prev
= ap
->prev
;
3315 rw_exit(&ap
->hashq
->lock
);
3319 kmem_cache_free(acache_cache
, ap
);
3321 clstat_debug
.access
.value
.ui64
--;
3328 static const char prefix
[] = ".nfs";
3330 static kmutex_t newnum_lock
;
3335 static uint_t newnum
= 0;
3338 mutex_enter(&newnum_lock
);
3340 newnum
= gethrestime_sec() & 0xffff;
3342 mutex_exit(&newnum_lock
);
3355 news
= kmem_alloc(MAXNAMELEN
, KM_SLEEP
);
3361 *s
++ = "0123456789ABCDEF"[id
& 0x0f];
3369 * Snapshot callback for nfs:0:nfs_client as registered with the kstat
3373 cl_snapshot(kstat_t
*ksp
, void *buf
, int rw
)
3375 ksp
->ks_snaptime
= gethrtime();
3376 if (rw
== KSTAT_WRITE
) {
3377 bcopy(buf
, ksp
->ks_private
, sizeof (clstat_tmpl
));
3380 * Currently only the global zone can write to kstats, but we
3381 * add the check just for paranoia.
3383 if (INGLOBALZONE(curproc
))
3384 bcopy((char *)buf
+ sizeof (clstat_tmpl
), &clstat_debug
,
3385 sizeof (clstat_debug
));
3388 bcopy(ksp
->ks_private
, buf
, sizeof (clstat_tmpl
));
3391 * If we're displaying the "global" debug kstat values, we
3392 * display them as-is to all zones since in fact they apply to
3393 * the system as a whole.
3395 bcopy(&clstat_debug
, (char *)buf
+ sizeof (clstat_tmpl
),
3396 sizeof (clstat_debug
));
3403 clinit_zone(zoneid_t zoneid
)
3405 kstat_t
*nfs_client_kstat
;
3406 struct nfs_clnt
*nfscl
;
3409 nfscl
= kmem_alloc(sizeof (*nfscl
), KM_SLEEP
);
3410 mutex_init(&nfscl
->nfscl_chtable_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
3411 nfscl
->nfscl_chtable
= NULL
;
3412 nfscl
->nfscl_zoneid
= zoneid
;
3414 bcopy(&clstat_tmpl
, &nfscl
->nfscl_stat
, sizeof (clstat_tmpl
));
3415 ndata
= sizeof (clstat_tmpl
) / sizeof (kstat_named_t
);
3417 ndata
+= sizeof (clstat_debug
) / sizeof (kstat_named_t
);
3419 if ((nfs_client_kstat
= kstat_create_zone("nfs", 0, "nfs_client",
3420 "misc", KSTAT_TYPE_NAMED
, ndata
,
3421 KSTAT_FLAG_VIRTUAL
| KSTAT_FLAG_WRITABLE
, zoneid
)) != NULL
) {
3422 nfs_client_kstat
->ks_private
= &nfscl
->nfscl_stat
;
3423 nfs_client_kstat
->ks_snapshot
= cl_snapshot
;
3424 kstat_install(nfs_client_kstat
);
3426 mutex_enter(&nfs_clnt_list_lock
);
3427 list_insert_head(&nfs_clnt_list
, nfscl
);
3428 mutex_exit(&nfs_clnt_list_lock
);
3434 clfini_zone(zoneid_t zoneid
, void *arg
)
3436 struct nfs_clnt
*nfscl
= arg
;
3437 chhead_t
*chp
, *next
;
3441 mutex_enter(&nfs_clnt_list_lock
);
3442 list_remove(&nfs_clnt_list
, nfscl
);
3443 mutex_exit(&nfs_clnt_list_lock
);
3444 clreclaim_zone(nfscl
, 0);
3445 for (chp
= nfscl
->nfscl_chtable
; chp
!= NULL
; chp
= next
) {
3446 ASSERT(chp
->ch_list
== NULL
);
3447 kmem_free(chp
->ch_protofmly
, strlen(chp
->ch_protofmly
) + 1);
3448 next
= chp
->ch_next
;
3449 kmem_free(chp
, sizeof (*chp
));
3451 kstat_delete_byname_zone("nfs", 0, "nfs_client", zoneid
);
3452 mutex_destroy(&nfscl
->nfscl_chtable_lock
);
3453 kmem_free(nfscl
, sizeof (*nfscl
));
3457 * Called by endpnt_destructor to make sure the client handles are
3458 * cleaned up before the RPC endpoints. This becomes a no-op if
3459 * clfini_zone (above) is called first. This function is needed
3460 * (rather than relying on clfini_zone to clean up) because the ZSD
3461 * callbacks have no ordering mechanism, so we have no way to ensure
3462 * that clfini_zone is called before endpnt_destructor.
3465 clcleanup_zone(zoneid_t zoneid
)
3467 struct nfs_clnt
*nfscl
;
3469 mutex_enter(&nfs_clnt_list_lock
);
3470 nfscl
= list_head(&nfs_clnt_list
);
3471 for (; nfscl
!= NULL
; nfscl
= list_next(&nfs_clnt_list
, nfscl
)) {
3472 if (nfscl
->nfscl_zoneid
== zoneid
) {
3473 clreclaim_zone(nfscl
, 0);
3477 mutex_exit(&nfs_clnt_list_lock
);
3487 * Allocate and initialize the rnode hash queues
3491 nrnode_max
= (ulong_t
)((kmem_maxavail() >> 2) / sizeof (struct rnode
));
3492 if (nrnode
> nrnode_max
|| (nrnode
== 0 && ncsize
== 0)) {
3493 zcmn_err(GLOBAL_ZONEID
, CE_NOTE
,
3494 "!setting nrnode to max value of %ld", nrnode_max
);
3495 nrnode
= nrnode_max
;
3498 rtablesize
= 1 << highbit(nrnode
/ hashlen
);
3499 rtablemask
= rtablesize
- 1;
3500 rtable
= kmem_alloc(rtablesize
* sizeof (*rtable
), KM_SLEEP
);
3501 for (i
= 0; i
< rtablesize
; i
++) {
3502 rtable
[i
].r_hashf
= (rnode_t
*)(&rtable
[i
]);
3503 rtable
[i
].r_hashb
= (rnode_t
*)(&rtable
[i
]);
3504 rw_init(&rtable
[i
].r_lock
, NULL
, RW_DEFAULT
, NULL
);
3506 rnode_cache
= kmem_cache_create("rnode_cache", sizeof (rnode_t
),
3507 0, NULL
, NULL
, nfs_reclaim
, NULL
, NULL
, 0);
3510 * Allocate and initialize the access cache
3514 * Initial guess is one access cache entry per rnode unless
3515 * nacache is set to a non-zero value and then it is used to
3516 * indicate a guess at the number of access cache entries.
3519 acachesize
= 1 << highbit(nacache
/ hashlen
);
3521 acachesize
= rtablesize
;
3522 acachemask
= acachesize
- 1;
3523 acache
= kmem_alloc(acachesize
* sizeof (*acache
), KM_SLEEP
);
3524 for (i
= 0; i
< acachesize
; i
++) {
3525 acache
[i
].next
= (acache_t
*)&acache
[i
];
3526 acache
[i
].prev
= (acache_t
*)&acache
[i
];
3527 rw_init(&acache
[i
].lock
, NULL
, RW_DEFAULT
, NULL
);
3529 acache_cache
= kmem_cache_create("nfs_access_cache",
3530 sizeof (acache_t
), 0, NULL
, NULL
, NULL
, NULL
, NULL
, 0);
3532 * Allocate and initialize the client handle cache
3534 chtab_cache
= kmem_cache_create("client_handle_cache",
3535 sizeof (struct chtab
), 0, NULL
, NULL
, clreclaim
, NULL
, NULL
, 0);
3537 * Initialize the list of per-zone client handles (and associated data).
3538 * This needs to be done before we call zone_key_create().
3540 list_create(&nfs_clnt_list
, sizeof (struct nfs_clnt
),
3541 offsetof(struct nfs_clnt
, nfscl_node
));
3543 * Initialize the zone_key for per-zone client handle lists.
3545 zone_key_create(&nfsclnt_zone_key
, clinit_zone
, NULL
, clfini_zone
);
3547 * Initialize the various mutexes and reader/writer locks
3549 mutex_init(&rpfreelist_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
3550 mutex_init(&newnum_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
3551 mutex_init(&nfs_minor_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
3554 * Assign unique major number for all nfs mounts
3556 if ((nfs_major
= getudev()) == -1) {
3557 zcmn_err(GLOBAL_ZONEID
, CE_WARN
,
3558 "nfs: init: can't get unique device number");
3563 if (nfs3_jukebox_delay
== 0)
3564 nfs3_jukebox_delay
= NFS3_JUKEBOX_DELAY
;
3575 * Deallocate the rnode hash queues
3577 kmem_cache_destroy(rnode_cache
);
3579 for (i
= 0; i
< rtablesize
; i
++)
3580 rw_destroy(&rtable
[i
].r_lock
);
3581 kmem_free(rtable
, rtablesize
* sizeof (*rtable
));
3584 * Deallocated the access cache
3586 kmem_cache_destroy(acache_cache
);
3588 for (i
= 0; i
< acachesize
; i
++)
3589 rw_destroy(&acache
[i
].lock
);
3590 kmem_free(acache
, acachesize
* sizeof (*acache
));
3593 * Deallocate the client handle cache
3595 kmem_cache_destroy(chtab_cache
);
3598 * Destroy the various mutexes and reader/writer locks
3600 mutex_destroy(&rpfreelist_lock
);
3601 mutex_destroy(&newnum_lock
);
3602 mutex_destroy(&nfs_minor_lock
);
3603 (void) zone_key_delete(nfsclnt_zone_key
);
3612 return (NFSERR_OPNOTSUPP
);
3614 return (NFSERR_NAMETOOLONG
);
3616 return (NFSERR_NOTEMPTY
);
3618 return (NFSERR_DQUOT
);
3620 return (NFSERR_STALE
);
3622 return (NFSERR_REMOTE
);
3624 return (NFSERR_OPNOTSUPP
);
3626 return (NFSERR_INVAL
);
3628 return ((enum nfsstat
)error
);
3634 geterrno(enum nfsstat status
)
3638 case NFSERR_OPNOTSUPP
:
3639 return (EOPNOTSUPP
);
3640 case NFSERR_NAMETOOLONG
:
3641 return (ENAMETOOLONG
);
3642 case NFSERR_NOTEMPTY
:
3653 return ((int)status
);
3659 puterrno3(int error
)
3667 return (NFS3ERR_PERM
);
3669 return (NFS3ERR_NOENT
);
3671 return (NFS3ERR_IO
);
3673 return (NFS3ERR_NXIO
);
3675 return (NFS3ERR_ACCES
);
3677 return (NFS3ERR_EXIST
);
3679 return (NFS3ERR_XDEV
);
3681 return (NFS3ERR_NODEV
);
3683 return (NFS3ERR_NOTDIR
);
3685 return (NFS3ERR_ISDIR
);
3687 return (NFS3ERR_INVAL
);
3689 return (NFS3ERR_FBIG
);
3691 return (NFS3ERR_NOSPC
);
3693 return (NFS3ERR_ROFS
);
3695 return (NFS3ERR_MLINK
);
3697 return (NFS3ERR_NAMETOOLONG
);
3699 return (NFS3ERR_NOTEMPTY
);
3701 return (NFS3ERR_DQUOT
);
3703 return (NFS3ERR_STALE
);
3705 return (NFS3ERR_REMOTE
);
3708 return (NFS3ERR_NOTSUPP
);
3710 return (NFS3ERR_INVAL
);
3712 zcmn_err(getzoneid(), CE_WARN
,
3713 "puterrno3: got error %d", error
);
3714 return ((enum nfsstat3
)error
);
3719 return (NFS3ERR_NAMETOOLONG
);
3721 return (NFS3ERR_NOTEMPTY
);
3723 return (NFS3ERR_DQUOT
);
3725 return (NFS3ERR_STALE
);
3728 return (NFS3ERR_NOTSUPP
);
3730 return (NFS3ERR_REMOTE
);
3732 return (NFS3ERR_INVAL
);
3734 return ((enum nfsstat3
)error
);
3740 geterrno3(enum nfsstat3 status
)
3763 case NFS3ERR_NOTDIR
:
3777 case NFS3ERR_NAMETOOLONG
:
3778 return (ENAMETOOLONG
);
3779 case NFS3ERR_NOTEMPTY
:
3785 case NFS3ERR_REMOTE
:
3787 case NFS3ERR_BADHANDLE
:
3789 case NFS3ERR_NOT_SYNC
:
3791 case NFS3ERR_BAD_COOKIE
:
3793 case NFS3ERR_NOTSUPP
:
3794 return (EOPNOTSUPP
);
3795 case NFS3ERR_TOOSMALL
:
3797 case NFS3ERR_SERVERFAULT
:
3799 case NFS3ERR_BADTYPE
:
3801 case NFS3ERR_JUKEBOX
:
3804 zcmn_err(getzoneid(), CE_WARN
,
3805 "geterrno3: got status %d", status
);
3806 return ((int)status
);
3810 case NFS3ERR_NAMETOOLONG
:
3811 return (ENAMETOOLONG
);
3812 case NFS3ERR_NOTEMPTY
:
3817 case NFS3ERR_BADHANDLE
:
3819 case NFS3ERR_NOTSUPP
:
3820 return (EOPNOTSUPP
);
3821 case NFS3ERR_REMOTE
:
3823 case NFS3ERR_NOT_SYNC
:
3824 case NFS3ERR_TOOSMALL
:
3825 case NFS3ERR_BADTYPE
:
3827 case NFS3ERR_BAD_COOKIE
:
3829 case NFS3ERR_SERVERFAULT
:
3831 case NFS3ERR_JUKEBOX
:
3834 return ((int)status
);
3840 rddir_cache_alloc(int flags
)
3844 rc
= kmem_alloc(sizeof (*rc
), flags
);
3848 cv_init(&rc
->cv
, NULL
, CV_DEFAULT
, NULL
);
3849 mutex_init(&rc
->lock
, NULL
, MUTEX_DEFAULT
, NULL
);
3852 atomic_inc_64(&clstat_debug
.dirent
.value
.ui64
);
3859 rddir_cache_free(rddir_cache
*rc
)
3863 atomic_dec_64(&clstat_debug
.dirent
.value
.ui64
);
3865 if (rc
->entries
!= NULL
) {
3867 rddir_cache_buf_free(rc
->entries
, rc
->buflen
);
3869 kmem_free(rc
->entries
, rc
->buflen
);
3872 cv_destroy(&rc
->cv
);
3873 mutex_destroy(&rc
->lock
);
3874 kmem_free(rc
, sizeof (*rc
));
3878 rddir_cache_hold(rddir_cache
*rc
)
3881 mutex_enter(&rc
->lock
);
3883 mutex_exit(&rc
->lock
);
3887 rddir_cache_rele(rddir_cache
*rc
)
3890 mutex_enter(&rc
->lock
);
3891 ASSERT(rc
->count
> 0);
3892 if (--rc
->count
== 0) {
3893 mutex_exit(&rc
->lock
);
3894 rddir_cache_free(rc
);
3896 mutex_exit(&rc
->lock
);
3901 rddir_cache_buf_alloc(size_t size
, int flags
)
3905 rc
= kmem_alloc(size
, flags
);
3907 atomic_add_64(&clstat_debug
.dirents
.value
.ui64
, size
);
3912 rddir_cache_buf_free(void *addr
, size_t size
)
3915 atomic_add_64(&clstat_debug
.dirents
.value
.ui64
, -(int64_t)size
);
3916 kmem_free(addr
, size
);
3921 nfs_free_data_reclaim(rnode_t
*rp
)
3926 nfs3_pathconf_info
*info
;
3931 * Free any held credentials and caches which
3932 * may be associated with this rnode.
3934 mutex_enter(&rp
->r_statelock
);
3937 contents
= rp
->r_symlink
.contents
;
3938 size
= rp
->r_symlink
.size
;
3939 rp
->r_symlink
.contents
= NULL
;
3940 vsp
= rp
->r_secattr
;
3941 rp
->r_secattr
= NULL
;
3942 info
= rp
->r_pathconf
;
3943 rp
->r_pathconf
= NULL
;
3944 mutex_exit(&rp
->r_statelock
);
3950 * Free the access cache entries.
3952 freed
= nfs_access_purge_rp(rp
);
3954 if (!HAVE_RDDIR_CACHE(rp
) &&
3961 * Free the readdir cache entries
3963 if (HAVE_RDDIR_CACHE(rp
))
3964 nfs_purge_rddir_cache(RTOV(rp
));
3967 * Free the symbolic link cache.
3969 if (contents
!= NULL
) {
3971 kmem_free((void *)contents
, size
);
3975 * Free any cached ACL.
3981 * Free any cached pathconf information.
3984 kmem_free(info
, sizeof (*info
));
3990 nfs_active_data_reclaim(rnode_t
*rp
)
3995 nfs3_pathconf_info
*info
;
3999 * Free any held credentials and caches which
4000 * may be associated with this rnode.
4002 if (!mutex_tryenter(&rp
->r_statelock
))
4004 contents
= rp
->r_symlink
.contents
;
4005 size
= rp
->r_symlink
.size
;
4006 rp
->r_symlink
.contents
= NULL
;
4007 vsp
= rp
->r_secattr
;
4008 rp
->r_secattr
= NULL
;
4009 info
= rp
->r_pathconf
;
4010 rp
->r_pathconf
= NULL
;
4011 mutex_exit(&rp
->r_statelock
);
4014 * Free the access cache entries.
4016 freed
= nfs_access_purge_rp(rp
);
4018 if (!HAVE_RDDIR_CACHE(rp
) &&
4025 * Free the readdir cache entries
4027 if (HAVE_RDDIR_CACHE(rp
))
4028 nfs_purge_rddir_cache(RTOV(rp
));
4031 * Free the symbolic link cache.
4033 if (contents
!= NULL
) {
4035 kmem_free((void *)contents
, size
);
4039 * Free any cached ACL.
4045 * Free any cached pathconf information.
4048 kmem_free(info
, sizeof (*info
));
4054 nfs_free_reclaim(void)
4060 clstat_debug
.f_reclaim
.value
.ui64
++;
4063 mutex_enter(&rpfreelist_lock
);
4067 if (nfs_free_data_reclaim(rp
))
4069 } while ((rp
= rp
->r_freef
) != rpfreelist
);
4071 mutex_exit(&rpfreelist_lock
);
4076 nfs_active_reclaim(void)
4083 clstat_debug
.a_reclaim
.value
.ui64
++;
4086 for (index
= 0; index
< rtablesize
; index
++) {
4087 rw_enter(&rtable
[index
].r_lock
, RW_READER
);
4088 for (rp
= rtable
[index
].r_hashf
;
4089 rp
!= (rnode_t
*)(&rtable
[index
]);
4091 if (nfs_active_data_reclaim(rp
))
4094 rw_exit(&rtable
[index
].r_lock
);
4100 nfs_rnode_reclaim(void)
4107 clstat_debug
.r_reclaim
.value
.ui64
++;
4110 mutex_enter(&rpfreelist_lock
);
4111 while ((rp
= rpfreelist
) != NULL
) {
4113 mutex_exit(&rpfreelist_lock
);
4114 if (rp
->r_flags
& RHASHED
) {
4116 rw_enter(&rp
->r_hashq
->r_lock
, RW_WRITER
);
4117 mutex_enter(&vp
->v_lock
);
4118 if (vp
->v_count
> 1) {
4120 mutex_exit(&vp
->v_lock
);
4121 rw_exit(&rp
->r_hashq
->r_lock
);
4122 mutex_enter(&rpfreelist_lock
);
4125 mutex_exit(&vp
->v_lock
);
4126 rp_rmhash_locked(rp
);
4127 rw_exit(&rp
->r_hashq
->r_lock
);
4130 * This call to rp_addfree will end up destroying the
4131 * rnode, but in a safe way with the appropriate set
4134 rp_addfree(rp
, CRED());
4135 mutex_enter(&rpfreelist_lock
);
4137 mutex_exit(&rpfreelist_lock
);
4143 nfs_reclaim(void *cdrarg
)
4147 clstat_debug
.reclaim
.value
.ui64
++;
4149 if (nfs_free_reclaim())
4152 if (nfs_active_reclaim())
4155 (void) nfs_rnode_reclaim();
4159 * NFS client failover support
4161 * Routines to copy filehandles
4164 nfscopyfh(caddr_t fhp
, vnode_t
*vp
)
4166 fhandle_t
*dest
= (fhandle_t
*)fhp
;
4173 nfs3copyfh(caddr_t fhp
, vnode_t
*vp
)
4175 nfs_fh3
*dest
= (nfs_fh3
*)fhp
;
4178 *dest
= *VTOFH3(vp
);
4182 * NFS client failover support
4184 * failover_safe() will test various conditions to ensure that
4185 * failover is permitted for this vnode. It will be denied
4187 * 1) the operation in progress does not support failover (NULL fi)
4188 * 2) there are no available replicas (NULL mi_servers->sv_next)
4189 * 3) any locks are outstanding on this file
4192 failover_safe(failinfo_t
*fi
)
4196 * Does this op permit failover?
4198 if (fi
== NULL
|| fi
->vp
== NULL
)
4202 * Are there any alternates to failover to?
4204 if (VTOMI(fi
->vp
)->mi_servers
->sv_next
== NULL
)
4208 * Disable check; we've forced local locking
4210 * if (flk_has_remote_locks(fi->vp))
4215 * If we have no partial path, we can't do anything
4217 if (VTOR(fi
->vp
)->r_path
== NULL
)
4223 #include <sys/thread.h>
4226 * NFS client failover support
4228 * failover_newserver() will start a search for a new server,
4229 * preferably by starting an async thread to do the work. If
4230 * someone is already doing this (recognizable by MI_BINDINPROG
4231 * being set), it will simply return and the calling thread
4232 * will queue on the mi_failover_cv condition variable.
4235 failover_newserver(mntinfo_t
*mi
)
4238 * Check if someone else is doing this already
4240 mutex_enter(&mi
->mi_lock
);
4241 if (mi
->mi_flags
& MI_BINDINPROG
) {
4242 mutex_exit(&mi
->mi_lock
);
4245 mi
->mi_flags
|= MI_BINDINPROG
;
4248 * Need to hold the vfs struct so that it can't be released
4249 * while the failover thread is selecting a new server.
4251 VFS_HOLD(mi
->mi_vfsp
);
4254 * Start a thread to do the real searching.
4256 (void) zthread_create(NULL
, 0, failover_thread
, mi
, 0, minclsyspri
);
4258 mutex_exit(&mi
->mi_lock
);
4262 * NFS client failover support
4264 * failover_thread() will find a new server to replace the one
4265 * currently in use, wake up other threads waiting on this mount
4266 * point, and die. It will start at the head of the server list
4267 * and poll servers until it finds one with an NFS server which is
4268 * registered and responds to a NULL procedure ping.
4270 * XXX failover_thread is unsafe within the scope of the
4271 * present model defined for cpr to suspend the system.
4272 * Specifically, over-the-wire calls made by the thread
4273 * are unsafe. The thread needs to be reevaluated in case of
4274 * future updates to the cpr suspend model.
4277 failover_thread(mntinfo_t
*mi
)
4279 servinfo_t
*svp
= NULL
;
4281 enum clnt_stat status
;
4285 callb_cpr_t cprinfo
;
4289 size_t srvnames_len
;
4290 struct nfs_clnt
*nfscl
= NULL
;
4291 zoneid_t zoneid
= getzoneid();
4295 * This is currently only needed to access counters which exist on
4296 * DEBUG kernels, hence we don't want to pay the penalty of the lookup
4297 * on non-DEBUG kernels.
4299 nfscl
= zone_getspecific(nfsclnt_zone_key
, nfs_zone());
4300 ASSERT(nfscl
!= NULL
);
4304 * Its safe to piggyback on the mi_lock since failover_newserver()
4305 * code guarantees that there will be only one failover thread
4306 * per mountinfo at any instance.
4308 CALLB_CPR_INIT(&cprinfo
, &mi
->mi_lock
, callb_generic_cpr
,
4311 mutex_enter(&mi
->mi_lock
);
4312 while (mi
->mi_readers
) {
4313 CALLB_CPR_SAFE_BEGIN(&cprinfo
);
4314 cv_wait(&mi
->mi_failover_cv
, &mi
->mi_lock
);
4315 CALLB_CPR_SAFE_END(&cprinfo
, &mi
->mi_lock
);
4317 mutex_exit(&mi
->mi_lock
);
4323 * Ping the null NFS procedure of every server in
4324 * the list until one responds. We always start
4325 * at the head of the list and always skip the one
4326 * that is current, since it's caused us a problem.
4328 while (svp
== NULL
) {
4329 for (svp
= mi
->mi_servers
; svp
; svp
= svp
->sv_next
) {
4330 if (!oncethru
&& svp
== mi
->mi_curr_serv
)
4334 * If the file system was forcibly umounted
4335 * while trying to do a failover, then just
4336 * give up on the failover. It won't matter
4337 * what the server is.
4339 if (FS_OR_ZONE_GONE(mi
->mi_vfsp
)) {
4344 error
= clnt_tli_kcreate(svp
->sv_knconf
, &svp
->sv_addr
,
4345 NFS_PROGRAM
, NFS_VERSION
, 0, 1, CRED(), &cl
);
4349 if (!(mi
->mi_flags
& MI_INT
))
4350 cl
->cl_nosignal
= TRUE
;
4351 status
= CLNT_CALL(cl
, RFS_NULL
, xdr_void
, NULL
,
4352 xdr_void
, NULL
, tv
);
4353 if (!(mi
->mi_flags
& MI_INT
))
4354 cl
->cl_nosignal
= FALSE
;
4355 AUTH_DESTROY(cl
->cl_auth
);
4357 if (status
== RPC_SUCCESS
) {
4358 if (svp
== mi
->mi_curr_serv
) {
4360 zcmn_err(zoneid
, CE_NOTE
,
4361 "NFS%d: failing over: selecting original server %s",
4362 mi
->mi_vers
, svp
->sv_hostname
);
4364 zcmn_err(zoneid
, CE_NOTE
,
4365 "NFS: failing over: selecting original server %s",
4370 zcmn_err(zoneid
, CE_NOTE
,
4371 "NFS%d: failing over from %s to %s",
4373 mi
->mi_curr_serv
->sv_hostname
,
4376 zcmn_err(zoneid
, CE_NOTE
,
4377 "NFS: failing over from %s to %s",
4378 mi
->mi_curr_serv
->sv_hostname
,
4388 srvnames
= nfs_getsrvnames(mi
, &srvnames_len
);
4391 "NFS%d servers %s not responding "
4392 "still trying\n", mi
->mi_vers
, srvnames
);
4394 zprintf(zoneid
, "NFS servers %s not responding "
4395 "still trying\n", srvnames
);
4399 mutex_enter(&mi
->mi_lock
);
4400 CALLB_CPR_SAFE_BEGIN(&cprinfo
);
4401 mutex_exit(&mi
->mi_lock
);
4403 mutex_enter(&mi
->mi_lock
);
4404 CALLB_CPR_SAFE_END(&cprinfo
, &mi
->mi_lock
);
4405 mutex_exit(&mi
->mi_lock
);
4411 zprintf(zoneid
, "NFS%d servers %s ok\n", mi
->mi_vers
, srvnames
);
4413 zprintf(zoneid
, "NFS servers %s ok\n", srvnames
);
4417 if (svp
!= mi
->mi_curr_serv
) {
4418 (void) dnlc_purge_vfsp(mi
->mi_vfsp
, 0);
4419 index
= rtablehash(&mi
->mi_curr_serv
->sv_fhandle
);
4420 rw_enter(&rtable
[index
].r_lock
, RW_WRITER
);
4421 rp
= rfind(&rtable
[index
], &mi
->mi_curr_serv
->sv_fhandle
,
4424 if (rp
->r_flags
& RHASHED
)
4425 rp_rmhash_locked(rp
);
4426 rw_exit(&rtable
[index
].r_lock
);
4428 rp
->r_fh
= svp
->sv_fhandle
;
4429 (void) nfs_free_data_reclaim(rp
);
4430 index
= rtablehash(&rp
->r_fh
);
4431 rp
->r_hashq
= &rtable
[index
];
4432 rw_enter(&rp
->r_hashq
->r_lock
, RW_WRITER
);
4433 vn_exists(RTOV(rp
));
4435 rw_exit(&rp
->r_hashq
->r_lock
);
4438 rw_exit(&rtable
[index
].r_lock
);
4443 kmem_free(srvnames
, srvnames_len
);
4444 mutex_enter(&mi
->mi_lock
);
4445 mi
->mi_flags
&= ~MI_BINDINPROG
;
4447 mi
->mi_curr_serv
= svp
;
4450 nfscl
->nfscl_stat
.failover
.value
.ui64
++;
4453 cv_broadcast(&mi
->mi_failover_cv
);
4454 CALLB_CPR_EXIT(&cprinfo
);
4455 VFS_RELE(mi
->mi_vfsp
);
4461 * NFS client failover support
4463 * failover_wait() will put the thread to sleep until MI_BINDINPROG
4464 * is cleared, meaning that failover is complete. Called with
4465 * mi_lock mutex held.
4468 failover_wait(mntinfo_t
*mi
)
4473 * If someone else is hunting for a living server,
4474 * sleep until it's done. After our sleep, we may
4475 * be bound to the right server and get off cheaply.
4477 while (mi
->mi_flags
& MI_BINDINPROG
) {
4479 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
4480 * and SIGTERM. (Preserving the existing masks).
4481 * Mask out SIGINT if mount option nointr is specified.
4483 sigintr(&smask
, (int)mi
->mi_flags
& MI_INT
);
4484 if (!cv_wait_sig(&mi
->mi_failover_cv
, &mi
->mi_lock
)) {
4486 * restore original signal mask
4492 * restore original signal mask
4500 * NFS client failover support
4502 * failover_remap() will do a partial pathname lookup and find the
4503 * desired vnode on the current server. The interim vnode will be
4504 * discarded after we pilfer the new filehandle.
4507 * - This routine will also update the filehandle in the args structure
4508 * pointed to by the fi->fhp pointer if it is non-NULL.
4512 failover_remap(failinfo_t
*fi
)
4514 vnode_t
*vp
, *nvp
, *rootvp
;
4519 struct nfs_clnt
*nfscl
;
4521 nfscl
= zone_getspecific(nfsclnt_zone_key
, nfs_zone());
4522 ASSERT(nfscl
!= NULL
);
4527 if (fi
== NULL
|| fi
->vp
== NULL
|| fi
->lookupproc
== NULL
)
4533 if (!(vp
->v_flag
& VROOT
)) {
4535 * Given the root fh, use the path stored in
4536 * the rnode to find the fh for the new server.
4538 error
= VFS_ROOT(mi
->mi_vfsp
, &rootvp
);
4542 error
= failover_lookup(rp
->r_path
, rootvp
,
4543 fi
->lookupproc
, fi
->xattrdirproc
, &nvp
);
4551 * If we found the same rnode, we're done now
4555 * Failed and the new server may physically be same
4556 * OR may share a same disk subsystem. In this case
4557 * file handle for a particular file path is not going
4558 * to change, given the same filehandle lookup will
4559 * always locate the same rnode as the existing one.
4560 * All we might need to do is to update the r_server
4561 * with the current servinfo.
4563 if (!VALID_FH(fi
)) {
4564 rp
->r_server
= mi
->mi_curr_serv
;
4571 * Try to make it so that no one else will find this
4572 * vnode because it is just a temporary to hold the
4573 * new file handle until that file handle can be
4574 * copied to the original vnode/rnode.
4577 mutex_enter(&mi
->mi_remap_lock
);
4579 * Some other thread could have raced in here and could
4580 * have done the remap for this particular rnode before
4581 * this thread here. Check for rp->r_server and
4582 * mi->mi_curr_serv and return if they are same.
4585 mutex_exit(&mi
->mi_remap_lock
);
4590 if (nrp
->r_flags
& RHASHED
)
4594 * As a heuristic check on the validity of the new
4595 * file, check that the size and type match against
4596 * that we remember from the old version.
4598 if (rp
->r_size
!= nrp
->r_size
|| vp
->v_type
!= nvp
->v_type
) {
4599 mutex_exit(&mi
->mi_remap_lock
);
4600 zcmn_err(mi
->mi_zone
->zone_id
, CE_WARN
,
4601 "NFS replicas %s and %s: file %s not same.",
4602 rp
->r_server
->sv_hostname
,
4603 nrp
->r_server
->sv_hostname
, rp
->r_path
);
4609 * snarf the filehandle from the new rnode
4610 * then release it, again while updating the
4611 * hash queues for the rnode.
4613 if (rp
->r_flags
& RHASHED
)
4615 rp
->r_server
= mi
->mi_curr_serv
;
4616 rp
->r_fh
= nrp
->r_fh
;
4617 rp
->r_hashq
= nrp
->r_hashq
;
4619 * Copy the attributes from the new rnode to the old
4620 * rnode. This will help to reduce unnecessary page
4623 rp
->r_attr
= nrp
->r_attr
;
4624 rp
->r_attrtime
= nrp
->r_attrtime
;
4625 rp
->r_mtime
= nrp
->r_mtime
;
4626 (void) nfs_free_data_reclaim(rp
);
4627 nfs_setswaplike(vp
, &rp
->r_attr
);
4628 rw_enter(&rp
->r_hashq
->r_lock
, RW_WRITER
);
4630 rw_exit(&rp
->r_hashq
->r_lock
);
4631 mutex_exit(&mi
->mi_remap_lock
);
4636 * Update successful failover remap count
4638 mutex_enter(&mi
->mi_lock
);
4640 mutex_exit(&mi
->mi_lock
);
4642 nfscl
->nfscl_stat
.remap
.value
.ui64
++;
4646 * If we have a copied filehandle to update, do it now.
4648 if (fi
->fhp
!= NULL
&& fi
->copyproc
!= NULL
)
4649 (*fi
->copyproc
)(fi
->fhp
, vp
);
4655 * NFS client failover support
4657 * We want a simple pathname lookup routine to parse the pieces
4658 * of path in rp->r_path. We know that the path was a created
4659 * as rnodes were made, so we know we have only to deal with
4660 * paths that look like:
4661 * dir1/dir2/dir3/file
4662 * Any evidence of anything like .., symlinks, and ENOTDIR
4663 * are hard errors, because they mean something in this filesystem
4664 * is different from the one we came from, or has changed under
4665 * us in some way. If this is true, we want the failure.
4667 * Extended attributes: if the filesystem is mounted with extended
4668 * attributes enabled (-o xattr), the attribute directory will be
4669 * represented in the r_path as the magic name XATTR_RPATH. So if
4670 * we see that name in the pathname, is must be because this node
4671 * is an extended attribute. Therefore, look it up that way.
4674 failover_lookup(char *path
, vnode_t
*root
,
4675 int (*lookupproc
)(vnode_t
*, char *, vnode_t
**, struct pathname
*, int,
4676 vnode_t
*, cred_t
*, int),
4677 int (*xattrdirproc
)(vnode_t
*, vnode_t
**, bool_t
, cred_t
*, int),
4682 char *s
, *p
, *tmppath
;
4687 /* Make local copy of path */
4688 len
= strlen(path
) + 1;
4689 tmppath
= kmem_alloc(len
, KM_SLEEP
);
4690 (void) strcpy(tmppath
, path
);
4696 xattr
= mi
->mi_flags
& MI_EXTATTR
;
4702 if (xattr
&& strcmp(s
, XATTR_RPATH
) == 0) {
4703 error
= (*xattrdirproc
)(dvp
, &nvp
, FALSE
, CRED(),
4706 error
= (*lookupproc
)(dvp
, s
, &nvp
, NULL
, 0, NULL
,
4707 CRED(), RFSCALL_SOFT
);
4713 kmem_free(tmppath
, len
);
4719 } while (p
!= NULL
);
4721 if (nvp
!= NULL
&& new != NULL
)
4723 kmem_free(tmppath
, len
);
4728 * NFS client failover support
4730 * sv_free() frees the malloc'd portion of a "servinfo_t".
4733 sv_free(servinfo_t
*svp
)
4736 struct knetconfig
*knconf
;
4738 while (svp
!= NULL
) {
4739 next
= svp
->sv_next
;
4740 if (svp
->sv_secdata
)
4741 sec_clnt_freeinfo(svp
->sv_secdata
);
4742 if (svp
->sv_hostname
&& svp
->sv_hostnamelen
> 0)
4743 kmem_free(svp
->sv_hostname
, svp
->sv_hostnamelen
);
4744 knconf
= svp
->sv_knconf
;
4745 if (knconf
!= NULL
) {
4746 if (knconf
->knc_protofmly
!= NULL
)
4747 kmem_free(knconf
->knc_protofmly
, KNC_STRSIZE
);
4748 if (knconf
->knc_proto
!= NULL
)
4749 kmem_free(knconf
->knc_proto
, KNC_STRSIZE
);
4750 kmem_free(knconf
, sizeof (*knconf
));
4752 knconf
= svp
->sv_origknconf
;
4753 if (knconf
!= NULL
) {
4754 if (knconf
->knc_protofmly
!= NULL
)
4755 kmem_free(knconf
->knc_protofmly
, KNC_STRSIZE
);
4756 if (knconf
->knc_proto
!= NULL
)
4757 kmem_free(knconf
->knc_proto
, KNC_STRSIZE
);
4758 kmem_free(knconf
, sizeof (*knconf
));
4760 if (svp
->sv_addr
.buf
!= NULL
&& svp
->sv_addr
.maxlen
!= 0)
4761 kmem_free(svp
->sv_addr
.buf
, svp
->sv_addr
.maxlen
);
4762 mutex_destroy(&svp
->sv_lock
);
4763 kmem_free(svp
, sizeof (*svp
));
4769 * Only can return non-zero if intr != 0.
4772 nfs_rw_enter_sig(nfs_rwlock_t
*l
, krw_t rw
, int intr
)
4775 mutex_enter(&l
->lock
);
4778 * If this is a nested enter, then allow it. There
4779 * must be as many exits as enters through.
4781 if (l
->owner
== curthread
) {
4782 /* lock is held for writing by current thread */
4783 ASSERT(rw
== RW_READER
|| rw
== RW_WRITER
);
4785 } else if (rw
== RW_READER
) {
4787 * While there is a writer active or writers waiting,
4788 * then wait for them to finish up and move on. Then,
4789 * increment the count to indicate that a reader is
4792 while (l
->count
< 0 || l
->waiters
> 0) {
4794 klwp_t
*lwp
= ttolwp(curthread
);
4798 if (cv_wait_sig(&l
->cv_rd
, &l
->lock
) == 0) {
4801 mutex_exit(&l
->lock
);
4807 cv_wait(&l
->cv_rd
, &l
->lock
);
4809 ASSERT(l
->count
< INT_MAX
);
4811 if ((l
->count
% 10000) == 9999)
4812 cmn_err(CE_WARN
, "nfs_rw_enter_sig: count %d on"
4813 "rwlock @ %p\n", l
->count
, (void *)&l
);
4817 ASSERT(rw
== RW_WRITER
);
4819 * While there are readers active or a writer
4820 * active, then wait for all of the readers
4821 * to finish or for the writer to finish.
4822 * Then, set the owner field to curthread and
4823 * decrement count to indicate that a writer
4826 while (l
->count
!= 0) {
4829 klwp_t
*lwp
= ttolwp(curthread
);
4833 if (cv_wait_sig(&l
->cv
, &l
->lock
) == 0) {
4838 * If there are readers active and no
4839 * writers waiting then wake up all of
4840 * the waiting readers (if any).
4842 if (l
->count
> 0 && l
->waiters
== 0)
4843 cv_broadcast(&l
->cv_rd
);
4844 mutex_exit(&l
->lock
);
4850 cv_wait(&l
->cv
, &l
->lock
);
4853 ASSERT(l
->owner
== NULL
);
4854 l
->owner
= curthread
;
4858 mutex_exit(&l
->lock
);
4864 * If the lock is available, obtain it and return non-zero. If there is
4865 * already a conflicting lock, return 0 immediately.
4869 nfs_rw_tryenter(nfs_rwlock_t
*l
, krw_t rw
)
4871 mutex_enter(&l
->lock
);
4874 * If this is a nested enter, then allow it. There
4875 * must be as many exits as enters through.
4877 if (l
->owner
== curthread
) {
4878 /* lock is held for writing by current thread */
4879 ASSERT(rw
== RW_READER
|| rw
== RW_WRITER
);
4881 } else if (rw
== RW_READER
) {
4883 * If there is a writer active or writers waiting, deny the
4884 * lock. Otherwise, bump the count of readers.
4886 if (l
->count
< 0 || l
->waiters
> 0) {
4887 mutex_exit(&l
->lock
);
4892 ASSERT(rw
== RW_WRITER
);
4894 * If there are readers active or a writer active, deny the
4895 * lock. Otherwise, set the owner field to curthread and
4896 * decrement count to indicate that a writer is active.
4898 if (l
->count
!= 0) {
4899 mutex_exit(&l
->lock
);
4902 ASSERT(l
->owner
== NULL
);
4903 l
->owner
= curthread
;
4907 mutex_exit(&l
->lock
);
4913 nfs_rw_exit(nfs_rwlock_t
*l
)
4916 mutex_enter(&l
->lock
);
4918 if (l
->owner
!= NULL
) {
4919 ASSERT(l
->owner
== curthread
);
4922 * To release a writer lock increment count to indicate that
4923 * there is one less writer active. If this was the last of
4924 * possibly nested writer locks, then clear the owner field as
4925 * well to indicate that there is no writer active.
4927 ASSERT(l
->count
< 0);
4929 if (l
->count
== 0) {
4933 * If there are no writers waiting then wakeup all of
4934 * the waiting readers (if any).
4936 if (l
->waiters
== 0)
4937 cv_broadcast(&l
->cv_rd
);
4941 * To release a reader lock just decrement count to indicate
4942 * that there is one less reader active.
4944 ASSERT(l
->count
> 0);
4949 * If there are no readers active nor a writer active and there is a
4950 * writer waiting we need to wake up it.
4952 if (l
->count
== 0 && l
->waiters
> 0)
4954 mutex_exit(&l
->lock
);
4958 nfs_rw_lock_held(nfs_rwlock_t
*l
, krw_t rw
)
4961 if (rw
== RW_READER
)
4962 return (l
->count
> 0);
4963 ASSERT(rw
== RW_WRITER
);
4964 return (l
->count
< 0);
4969 nfs_rw_init(nfs_rwlock_t
*l
, char *name
, krw_type_t type
, void *arg
)
4975 mutex_init(&l
->lock
, NULL
, MUTEX_DEFAULT
, NULL
);
4976 cv_init(&l
->cv
, NULL
, CV_DEFAULT
, NULL
);
4977 cv_init(&l
->cv_rd
, NULL
, CV_DEFAULT
, NULL
);
4981 nfs_rw_destroy(nfs_rwlock_t
*l
)
4984 mutex_destroy(&l
->lock
);
4986 cv_destroy(&l
->cv_rd
);
4990 nfs3_rddir_compar(const void *x
, const void *y
)
4992 rddir_cache
*a
= (rddir_cache
*)x
;
4993 rddir_cache
*b
= (rddir_cache
*)y
;
4995 if (a
->nfs3_cookie
== b
->nfs3_cookie
) {
4996 if (a
->buflen
== b
->buflen
)
4998 if (a
->buflen
< b
->buflen
)
5003 if (a
->nfs3_cookie
< b
->nfs3_cookie
)
5010 nfs_rddir_compar(const void *x
, const void *y
)
5012 rddir_cache
*a
= (rddir_cache
*)x
;
5013 rddir_cache
*b
= (rddir_cache
*)y
;
5015 if (a
->nfs_cookie
== b
->nfs_cookie
) {
5016 if (a
->buflen
== b
->buflen
)
5018 if (a
->buflen
< b
->buflen
)
5023 if (a
->nfs_cookie
< b
->nfs_cookie
)
5030 nfs_getsrvnames(mntinfo_t
*mi
, size_t *len
)
5038 * Calculate the length of the string required to hold all
5039 * of the server names plus either a comma or a null
5040 * character following each individual one.
5043 for (s
= mi
->mi_servers
; s
!= NULL
; s
= s
->sv_next
)
5044 length
+= s
->sv_hostnamelen
;
5046 srvnames
= kmem_alloc(length
, KM_SLEEP
);
5049 for (s
= mi
->mi_servers
; s
!= NULL
; s
= s
->sv_next
) {
5050 (void) strcpy(namep
, s
->sv_hostname
);
5051 namep
+= s
->sv_hostnamelen
- 1;
5062 * These two functions are temporary and designed for the upgrade-workaround
5063 * only. They cannot be used for general zone-crossing NFS client support, and
5064 * will be removed shortly.
5066 * When the workaround is enabled, all NFS traffic is forced into the global
5067 * zone. These functions are called when the code needs to refer to the state
5068 * of the underlying network connection. They're not called when the function
5069 * needs to refer to the state of the process that invoked the system call.
5070 * (E.g., when checking whether the zone is shutting down during the mount()
5077 return (nfs_global_client_only
!= 0 ? global_zone
: curproc
->p_zone
);
5083 return (nfs_global_client_only
!= 0 ? GLOBAL_ZONEID
: getzoneid());
5090 mutex_enter(&curproc
->p_splock
);
5091 rv
= (curproc
->p_sessp
->s_vp
!= NULL
);
5092 mutex_exit(&curproc
->p_splock
);
5097 * See if xattr directory to see if it has any generic user attributes
5100 do_xattr_exists_check(vnode_t
*vp
, ulong_t
*valp
, cred_t
*cr
)
5106 size_t dlen
= 8 * 1024;
5112 dbuf
= kmem_alloc(dlen
, KM_SLEEP
);
5115 uio
.uio_segflg
= UIO_SYSSPACE
;
5117 uio
.uio_extflg
= UIO_COPY_CACHED
;
5118 uio
.uio_loffset
= 0;
5119 uio
.uio_resid
= dlen
;
5120 iov
.iov_base
= dbuf
;
5122 (void) fop_rwlock(vp
, V_WRITELOCK_FALSE
, NULL
);
5123 error
= fop_readdir(vp
, &uio
, cr
, &eof
, NULL
, 0);
5124 fop_rwunlock(vp
, V_WRITELOCK_FALSE
, NULL
);
5126 dbuflen
= dlen
- uio
.uio_resid
;
5128 if (error
|| dbuflen
== 0) {
5129 kmem_free(dbuf
, dlen
);
5133 dp
= (dirent_t
*)dbuf
;
5135 while ((intptr_t)dp
< (intptr_t)dbuf
+ dbuflen
) {
5136 if (strcmp(dp
->d_name
, ".") == 0 ||
5137 strcmp(dp
->d_name
, "..") == 0 || strcmp(dp
->d_name
,
5138 VIEW_READWRITE
) == 0 || strcmp(dp
->d_name
,
5139 VIEW_READONLY
) == 0) {
5140 dp
= (dirent_t
*)((intptr_t)dp
+ dp
->d_reclen
);
5147 kmem_free(dbuf
, dlen
);