kernel/fs/nfs/nfs3_vnops.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25
  26 /*
  27  *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
  28  *      All rights reserved.
  29  */
  30
  31 /*
  32  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  33  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  34  */
  35
  36 #include <sys/param.h>
  37 #include <sys/types.h>
  38 #include <sys/systm.h>
  39 #include <sys/cred.h>
  40 #include <sys/time.h>
  41 #include <sys/vnode.h>
  42 #include <sys/vfs.h>
  43 #include <sys/file.h>
  44 #include <sys/filio.h>
  45 #include <sys/uio.h>
  46 #include <sys/buf.h>
  47 #include <sys/mman.h>
  48 #include <sys/pathname.h>
  49 #include <sys/dirent.h>
  50 #include <sys/debug.h>
  51 #include <sys/vmsystm.h>
  52 #include <sys/fcntl.h>
  53 #include <sys/flock.h>
  54 #include <sys/swap.h>
  55 #include <sys/errno.h>
  56 #include <sys/strsubr.h>
  57 #include <sys/sysmacros.h>
  58 #include <sys/kmem.h>
  59 #include <sys/cmn_err.h>
  60 #include <sys/pathconf.h>
  61 #include <sys/utsname.h>
  62 #include <sys/dnlc.h>
  63 #include <sys/acl.h>
  64 #include <sys/systeminfo.h>
  65 #include <sys/atomic.h>
  66 #include <sys/policy.h>
  67 #include <sys/sdt.h>
  68 #include <sys/zone.h>
  69
  70 #include <rpc/types.h>
  71 #include <rpc/auth.h>
  72 #include <rpc/clnt.h>
  73 #include <rpc/rpc_rdma.h>
  74
  75 #include <nfs/nfs.h>
  76 #include <nfs/nfs_clnt.h>
  77 #include <nfs/rnode.h>
  78 #include <nfs/nfs_acl.h>
  79 #include <nfs/lm.h>
  80
  81 #include <vm/hat.h>
  82 #include <vm/as.h>
  83 #include <vm/page.h>
  84 #include <vm/pvn.h>
  85 #include <vm/seg.h>
  86 #include <vm/seg_map.h>
  87 #include <vm/seg_kpm.h>
  88 #include <vm/seg_vn.h>
  89
  90 #include <sys/fs_subr.h>
  91
  92 #include <sys/ddi.h>
  93
  94 static int      nfs3_rdwrlbn(vnode_t *, page_t *, uoff_t, size_t, int,
  95                         cred_t *);
  96 static int      nfs3write(vnode_t *, caddr_t, uoff_t, int, cred_t *,
  97                         stable_how *);
  98 static int      nfs3read(vnode_t *, caddr_t, offset_t, int, size_t *, cred_t *);
  99 static int      nfs3setattr(vnode_t *, struct vattr *, int, cred_t *);
 100 static int      nfs3_accessx(void *, int, cred_t *);
 101 static int      nfs3lookup_dnlc(vnode_t *, char *, vnode_t **, cred_t *);
 102 static int      nfs3lookup_otw(vnode_t *, char *, vnode_t **, cred_t *, int);
 103 static int      nfs3create(vnode_t *, char *, struct vattr *, enum vcexcl,
 104                         int, vnode_t **, cred_t *);
 105 static int      nfs3excl_create_settimes(vnode_t *, struct vattr *, cred_t *);
 106 static int      nfs3mknod(vnode_t *, char *, struct vattr *, enum vcexcl,
 107                         int, vnode_t **, cred_t *);
 108 static int      nfs3rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
 109                         caller_context_t *);
 110 static int      do_nfs3readdir(vnode_t *, rddir_cache *, cred_t *);
 111 static void     nfs3readdir(vnode_t *, rddir_cache *, cred_t *);
 112 static void     nfs3readdirplus(vnode_t *, rddir_cache *, cred_t *);
 113 static int      nfs3_bio(struct buf *, stable_how *, cred_t *);
 114 static int      nfs3_getapage(vnode_t *, uoff_t, size_t, uint_t *,
 115                         page_t *[], size_t, struct seg *, caddr_t,
 116                         enum seg_rw, cred_t *);
 117 static void     nfs3_readahead(vnode_t *, uoff_t, caddr_t, struct seg *,
 118                         cred_t *);
 119 static int      nfs3_sync_putapage(vnode_t *, page_t *, uoff_t, size_t,
 120                         int, cred_t *);
 121 static int      nfs3_sync_pageio(vnode_t *, page_t *, uoff_t, size_t,
 122                         int, cred_t *);
 123 static int      nfs3_commit(vnode_t *, offset3, count3, cred_t *);
 124 static void     nfs3_set_mod(vnode_t *);
 125 static void     nfs3_get_commit(vnode_t *);
 126 static void     nfs3_get_commit_range(vnode_t *, uoff_t, size_t);
 127 static int      nfs3_putpage_commit(vnode_t *, offset_t, size_t, cred_t *);
 128 static int      nfs3_commit_vp(vnode_t *, uoff_t, size_t,  cred_t *);
 129 static int      nfs3_sync_commit(vnode_t *, page_t *, offset3, count3,
 130                         cred_t *);
 131 static void     nfs3_async_commit(vnode_t *, page_t *, offset3, count3,
 132                         cred_t *);
 133 static void     nfs3_delmap_callback(struct as *, void *, uint_t);
 134
 135 /*
 136  * Error flags used to pass information about certain special errors
 137  * which need to be handled specially.
 138  */
 139 #define NFS_EOF                 -98
 140 #define NFS_VERF_MISMATCH       -97
 141
 142 /* ALIGN64 aligns the given buffer and adjust buffer size to 64 bit */
 143 #define ALIGN64(x, ptr, sz)                                             \
 144         x = ((uintptr_t)(ptr)) & (sizeof (uint64_t) - 1);               \
 145         if (x) {                                                        \
 146                 x = sizeof (uint64_t) - (x);                            \
 147                 sz -= (x);                                              \
 148                 ptr += (x);                                             \
 149         }
 150
 151 /*
 152  * These are the vnode ops routines which implement the vnode interface to
 153  * the networked file system.  These routines just take their parameters,
 154  * make them look networkish by putting the right info into interface structs,
 155  * and then calling the appropriate remote routine(s) to do the work.
 156  *
  157  * Note on directory name lookup caching:  If we detect a stale fhandle,
 158  * we purge the directory cache relative to that vnode.  This way, the
 159  * user won't get burned by the cache repeatedly.  See <nfs/rnode.h> for
 160  * more details on rnode locking.
 161  */
 162
 163 static int      nfs3_open(vnode_t **, int, cred_t *, caller_context_t *);
 164 static int      nfs3_close(vnode_t *, int, int, offset_t, cred_t *,
 165                         caller_context_t *);
 166 static int      nfs3_read(vnode_t *, struct uio *, int, cred_t *,
 167                         caller_context_t *);
 168 static int      nfs3_write(vnode_t *, struct uio *, int, cred_t *,
 169                         caller_context_t *);
 170 static int      nfs3_ioctl(vnode_t *, int, intptr_t, int, cred_t *, int *,
 171                         caller_context_t *);
 172 static int      nfs3_getattr(vnode_t *, struct vattr *, int, cred_t *,
 173                         caller_context_t *);
 174 static int      nfs3_setattr(vnode_t *, struct vattr *, int, cred_t *,
 175                         caller_context_t *);
 176 static int      nfs3_access(vnode_t *, int, int, cred_t *, caller_context_t *);
 177 static int      nfs3_readlink(vnode_t *, struct uio *, cred_t *,
 178                         caller_context_t *);
 179 static int      nfs3_fsync(vnode_t *, int, cred_t *, caller_context_t *);
 180 static void     nfs3_inactive(vnode_t *, cred_t *, caller_context_t *);
 181 static int      nfs3_lookup(vnode_t *, char *, vnode_t **,
 182                         struct pathname *, int, vnode_t *, cred_t *,
 183                         caller_context_t *, int *, pathname_t *);
 184 static int      nfs3_create(vnode_t *, char *, struct vattr *, enum vcexcl,
 185                         int, vnode_t **, cred_t *, int, caller_context_t *,
 186                         vsecattr_t *);
 187 static int      nfs3_remove(vnode_t *, char *, cred_t *, caller_context_t *,
 188                         int);
 189 static int      nfs3_link(vnode_t *, vnode_t *, char *, cred_t *,
 190                         caller_context_t *, int);
 191 static int      nfs3_rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
 192                         caller_context_t *, int);
 193 static int      nfs3_mkdir(vnode_t *, char *, struct vattr *, vnode_t **,
 194                         cred_t *, caller_context_t *, int, vsecattr_t *);
 195 static int      nfs3_rmdir(vnode_t *, char *, vnode_t *, cred_t *,
 196                         caller_context_t *, int);
 197 static int      nfs3_symlink(vnode_t *, char *, struct vattr *, char *,
 198                         cred_t *, caller_context_t *, int);
 199 static int      nfs3_readdir(vnode_t *, struct uio *, cred_t *, int *,
 200                         caller_context_t *, int);
 201 static int      nfs3_fid(vnode_t *, fid_t *, caller_context_t *);
 202 static int      nfs3_rwlock(vnode_t *, int, caller_context_t *);
 203 static void     nfs3_rwunlock(vnode_t *, int, caller_context_t *);
 204 static int      nfs3_seek(vnode_t *, offset_t, offset_t *, caller_context_t *);
 205 static int      nfs3_getpage(vnode_t *, offset_t, size_t, uint_t *,
 206                         page_t *[], size_t, struct seg *, caddr_t,
 207                         enum seg_rw, cred_t *, caller_context_t *);
 208 static int      nfs3_putpage(vnode_t *, offset_t, size_t, int, cred_t *,
 209                         caller_context_t *);
 210 static int      nfs3_map(vnode_t *, offset_t, struct as *, caddr_t *, size_t,
 211                         uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
 212 static int      nfs3_addmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
 213                         uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
 214 static int      nfs3_frlock(vnode_t *, int, struct flock64 *, int, offset_t,
 215                         struct flk_callback *, cred_t *, caller_context_t *);
 216 static int      nfs3_space(vnode_t *, int, struct flock64 *, int, offset_t,
 217                         cred_t *, caller_context_t *);
 218 static int      nfs3_realvp(vnode_t *, vnode_t **, caller_context_t *);
 219 static int      nfs3_delmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
 220                         uint_t, uint_t, uint_t, cred_t *, caller_context_t *);
 221 static int      nfs3_pathconf(vnode_t *, int, ulong_t *, cred_t *,
 222                         caller_context_t *);
 223 static int      nfs3_pageio(vnode_t *, page_t *, uoff_t, size_t, int,
 224                         cred_t *, caller_context_t *);
 225 static void     nfs3_dispose(vnode_t *, page_t *, int, int, cred_t *,
 226                         caller_context_t *);
 227 static int      nfs3_setsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
 228                         caller_context_t *);
 229 static int      nfs3_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
 230                         caller_context_t *);
 231 static int      nfs3_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *,
 232                         caller_context_t *);
 233
 234 const struct vnodeops nfs3_vnodeops = {
 235         .vnop_name = "nfs3",
 236         .vop_open = nfs3_open,
 237         .vop_close = nfs3_close,
 238         .vop_read = nfs3_read,
 239         .vop_write = nfs3_write,
 240         .vop_ioctl = nfs3_ioctl,
 241         .vop_getattr = nfs3_getattr,
 242         .vop_setattr = nfs3_setattr,
 243         .vop_access = nfs3_access,
 244         .vop_lookup = nfs3_lookup,
 245         .vop_create = nfs3_create,
 246         .vop_remove = nfs3_remove,
 247         .vop_link = nfs3_link,
 248         .vop_rename = nfs3_rename,
 249         .vop_mkdir = nfs3_mkdir,
 250         .vop_rmdir = nfs3_rmdir,
 251         .vop_readdir = nfs3_readdir,
 252         .vop_symlink = nfs3_symlink,
 253         .vop_readlink = nfs3_readlink,
 254         .vop_fsync = nfs3_fsync,
 255         .vop_inactive = nfs3_inactive,
 256         .vop_fid = nfs3_fid,
 257         .vop_rwlock = nfs3_rwlock,
 258         .vop_rwunlock = nfs3_rwunlock,
 259         .vop_seek = nfs3_seek,
 260         .vop_frlock = nfs3_frlock,
 261         .vop_space = nfs3_space,
 262         .vop_realvp = nfs3_realvp,
 263         .vop_getpage = nfs3_getpage,
 264         .vop_putpage = nfs3_putpage,
 265         .vop_map = nfs3_map,
 266         .vop_addmap = nfs3_addmap,
 267         .vop_delmap = nfs3_delmap,
 268         /* no separate nfs3_dump */
 269         .vop_dump = nfs_dump,
 270         .vop_pathconf = nfs3_pathconf,
 271         .vop_pageio = nfs3_pageio,
 272         .vop_dispose = nfs3_dispose,
 273         .vop_setsecattr = nfs3_setsecattr,
 274         .vop_getsecattr = nfs3_getsecattr,
 275         .vop_shrlock = nfs3_shrlock,
 276         .vop_vnevent = fs_vnevent_support,
 277 };
 278
 279 /*
 280  * XXX:  This is referenced in modstubs.s
 281  */
 282 const struct vnodeops *
 283 nfs3_getvnodeops(void)
 284 {
 285         return (&nfs3_vnodeops);
 286 }
 287
 288 /* ARGSUSED */
 289 static int
 290 nfs3_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
 291 {
 292         int error;
 293         struct vattr va;
 294         rnode_t *rp;
 295         vnode_t *vp;
 296
 297         vp = *vpp;
 298         if (nfs_zone() != VTOMI(vp)->mi_zone)
 299                 return (EIO);
 300         rp = VTOR(vp);
 301         mutex_enter(&rp->r_statelock);
 302         if (rp->r_cred == NULL) {
 303                 crhold(cr);
 304                 rp->r_cred = cr;
 305         }
 306         mutex_exit(&rp->r_statelock);
 307
 308         /*
 309          * If there is no cached data or if close-to-open
 310          * consistency checking is turned off, we can avoid
 311          * the over the wire getattr.  Otherwise, if the
 312          * file system is mounted readonly, then just verify
 313          * the caches are up to date using the normal mechanism.
 314          * Else, if the file is not mmap'd, then just mark
 315          * the attributes as timed out.  They will be refreshed
 316          * and the caches validated prior to being used.
 317          * Else, the file system is mounted writeable so
 318          * force an over the wire GETATTR in order to ensure
 319          * that all cached data is valid.
 320          */
 321         if (vp->v_count > 1 ||
 322             ((vn_has_cached_data(vp) || HAVE_RDDIR_CACHE(rp)) &&
 323             !(VTOMI(vp)->mi_flags & MI_NOCTO))) {
 324                 if (vn_is_readonly(vp))
 325                         error = nfs3_validate_caches(vp, cr);
 326                 else if (rp->r_mapcnt == 0 && vp->v_count == 1) {
 327                         PURGE_ATTRCACHE(vp);
 328                         error = 0;
 329                 } else {
 330                         va.va_mask = VATTR_ALL;
 331                         error = nfs3_getattr_otw(vp, &va, cr);
 332                 }
 333         } else
 334                 error = 0;
 335
 336         return (error);
 337 }
 338
 339 /* ARGSUSED */
 340 static int
 341 nfs3_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
 342                 caller_context_t *ct)
 343 {
 344         rnode_t *rp;
 345         int error;
 346         struct vattr va;
 347
 348         /*
 349          * zone_enter(2) prevents processes from changing zones with NFS files
 350          * open; if we happen to get here from the wrong zone we can't do
 351          * anything over the wire.
 352          */
 353         if (VTOMI(vp)->mi_zone != nfs_zone()) {
 354                 /*
 355                  * We could attempt to clean up locks, except we're sure
 356                  * that the current process didn't acquire any locks on
 357                  * the file: any attempt to lock a file belong to another zone
 358                  * will fail, and one can't lock an NFS file and then change
 359                  * zones, as that fails too.
 360                  *
 361                  * Returning an error here is the sane thing to do.  A
 362                  * subsequent call to VN_RELE() which translates to a
 363                  * nfs3_inactive() will clean up state: if the zone of the
 364                  * vnode's origin is still alive and kicking, an async worker
 365                  * thread will handle the request (from the correct zone), and
 366                  * everything (minus the commit and final nfs3_getattr_otw()
 367                  * call) should be OK. If the zone is going away
 368                  * nfs_async_inactive() will throw away cached pages inline.
 369                  */
 370                 return (EIO);
 371         }
 372
 373         /*
 374          * If we are using local locking for this filesystem, then
 375          * release all of the SYSV style record locks.  Otherwise,
 376          * we are doing network locking and we need to release all
 377          * of the network locks.  All of the locks held by this
 378          * process on this file are released no matter what the
 379          * incoming reference count is.
 380          */
 381         if (VTOMI(vp)->mi_flags & MI_LLOCK) {
 382                 cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
 383                 cleanshares(vp, ttoproc(curthread)->p_pid);
 384         } else
 385                 nfs_lockrelease(vp, flag, offset, cr);
 386
 387         if (count > 1)
 388                 return (0);
 389
 390         /*
 391          * If the file has been `unlinked', then purge the
 392          * DNLC so that this vnode will get reycled quicker
 393          * and the .nfs* file on the server will get removed.
 394          */
 395         rp = VTOR(vp);
 396         if (rp->r_unldvp != NULL)
 397                 dnlc_purge_vp(vp);
 398
 399         /*
 400          * If the file was open for write and there are pages,
 401          * then if the file system was mounted using the "no-close-
 402          *      to-open" semantics, then start an asynchronous flush
 403          *      of the all of the pages in the file.
 404          * else the file system was not mounted using the "no-close-
 405          *      to-open" semantics, then do a synchronous flush and
 406          *      commit of all of the dirty and uncommitted pages.
 407          *
 408          * The asynchronous flush of the pages in the "nocto" path
 409          * mostly just associates a cred pointer with the rnode so
 410          * writes which happen later will have a better chance of
 411          * working.  It also starts the data being written to the
 412          * server, but without unnecessarily delaying the application.
 413          */
 414         if ((flag & FWRITE) && vn_has_cached_data(vp)) {
 415                 if (VTOMI(vp)->mi_flags & MI_NOCTO) {
 416                         error = nfs3_putpage(vp, 0, 0, B_ASYNC,
 417                             cr, ct);
 418                         if (error == EAGAIN)
 419                                 error = 0;
 420                 } else
 421                         error = nfs3_putpage_commit(vp, 0, 0, cr);
 422                 if (!error) {
 423                         mutex_enter(&rp->r_statelock);
 424                         error = rp->r_error;
 425                         rp->r_error = 0;
 426                         mutex_exit(&rp->r_statelock);
 427                 }
 428         } else {
 429                 mutex_enter(&rp->r_statelock);
 430                 error = rp->r_error;
 431                 rp->r_error = 0;
 432                 mutex_exit(&rp->r_statelock);
 433         }
 434
 435         /*
 436          * If RWRITEATTR is set, then issue an over the wire GETATTR to
 437          * refresh the attribute cache with a set of attributes which
 438          * weren't returned from a WRITE.  This will enable the close-
 439          * to-open processing to work.
 440          */
 441         if (rp->r_flags & RWRITEATTR)
 442                 (void) nfs3_getattr_otw(vp, &va, cr);
 443
 444         return (error);
 445 }
 446
 447 /* ARGSUSED */
 448 static int
 449 nfs3_directio_read(vnode_t *vp, struct uio *uiop, cred_t *cr)
 450 {
 451         mntinfo_t *mi;
 452         READ3args args;
 453         READ3uiores res;
 454         int tsize;
 455         offset_t offset;
 456         ssize_t count;
 457         int error;
 458         int douprintf;
 459         failinfo_t fi;
 460         char *sv_hostname;
 461
 462         mi = VTOMI(vp);
 463         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
 464         sv_hostname = VTOR(vp)->r_server->sv_hostname;
 465
 466         douprintf = 1;
 467         args.file = *VTOFH3(vp);
 468         fi.vp = vp;
 469         fi.fhp = (caddr_t)&args.file;
 470         fi.copyproc = nfs3copyfh;
 471         fi.lookupproc = nfs3lookup;
 472         fi.xattrdirproc = acl_getxattrdir3;
 473
 474         res.uiop = uiop;
 475
 476         res.wlist = NULL;
 477
 478         offset = uiop->uio_loffset;
 479         count = uiop->uio_resid;
 480
 481         do {
 482                 if (mi->mi_io_kstats) {
 483                         mutex_enter(&mi->mi_lock);
 484                         kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
 485                         mutex_exit(&mi->mi_lock);
 486                 }
 487
 488                 do {
 489                         tsize = MIN(mi->mi_tsize, count);
 490                         args.offset = (offset3)offset;
 491                         args.count = (count3)tsize;
 492                         res.size = (uint_t)tsize;
 493                         args.res_uiop = uiop;
 494                         args.res_data_val_alt = NULL;
 495
 496                         error = rfs3call(mi, NFSPROC3_READ,
 497                             xdr_READ3args, (caddr_t)&args,
 498                             xdr_READ3uiores, (caddr_t)&res, cr,
 499                             &douprintf, &res.status, 0, &fi);
 500                 } while (error == ENFS_TRYAGAIN);
 501
 502                 if (mi->mi_io_kstats) {
 503                         mutex_enter(&mi->mi_lock);
 504                         kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
 505                         mutex_exit(&mi->mi_lock);
 506                 }
 507
 508                 if (error)
 509                         return (error);
 510
 511                 error = geterrno3(res.status);
 512                 if (error)
 513                         return (error);
 514
 515                 if (res.count != res.size) {
 516                         zcmn_err(getzoneid(), CE_WARN,
 517 "nfs3_directio_read: server %s returned incorrect amount",
 518                             sv_hostname);
 519                         return (EIO);
 520                 }
 521                 count -= res.count;
 522                 offset += res.count;
 523                 if (mi->mi_io_kstats) {
 524                         mutex_enter(&mi->mi_lock);
 525                         KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
 526                         KSTAT_IO_PTR(mi->mi_io_kstats)->nread += res.count;
 527                         mutex_exit(&mi->mi_lock);
 528                 }
 529                 lwp_stat_update(LWP_STAT_INBLK, 1);
 530         } while (count && !res.eof);
 531
 532         return (0);
 533 }
 534
 535 /* ARGSUSED */
 536 static int
 537 nfs3_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
 538         caller_context_t *ct)
 539 {
 540         rnode_t *rp;
 541         uoff_t off;
 542         offset_t diff;
 543         int on;
 544         size_t n;
 545         caddr_t base;
 546         uint_t flags;
 547         int error = 0;
 548         mntinfo_t *mi;
 549
 550         rp = VTOR(vp);
 551         mi = VTOMI(vp);
 552
 553         ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
 554
 555         if (nfs_zone() != mi->mi_zone)
 556                 return (EIO);
 557
 558         if (vp->v_type != VREG)
 559                 return (EISDIR);
 560
 561         if (uiop->uio_resid == 0)
 562                 return (0);
 563
 564         if (uiop->uio_loffset < 0 || uiop->uio_loffset + uiop->uio_resid < 0)
 565                 return (EINVAL);
 566
 567         /*
 568          * Bypass VM if caching has been disabled (e.g., locking) or if
 569          * using client-side direct I/O and the file is not mmap'd and
 570          * there are no cached pages.
 571          */
 572         if ((vp->v_flag & VNOCACHE) ||
 573             (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
 574             rp->r_mapcnt == 0 && rp->r_inmap == 0 &&
 575             !vn_has_cached_data(vp))) {
 576                 return (nfs3_directio_read(vp, uiop, cr));
 577         }
 578
 579         do {
 580                 off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
 581                 on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
 582                 n = MIN(MAXBSIZE - on, uiop->uio_resid);
 583
 584                 error = nfs3_validate_caches(vp, cr);
 585                 if (error)
 586                         break;
 587
 588                 mutex_enter(&rp->r_statelock);
 589                 while (rp->r_flags & RINCACHEPURGE) {
 590                         if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
 591                                 mutex_exit(&rp->r_statelock);
 592                                 return (EINTR);
 593                         }
 594                 }
 595                 diff = rp->r_size - uiop->uio_loffset;
 596                 mutex_exit(&rp->r_statelock);
 597                 if (diff <= 0)
 598                         break;
 599                 if (diff < n)
 600                         n = (size_t)diff;
 601
 602                 if (vpm_enable) {
 603                         /*
 604                          * Copy data.
 605                          */
 606                         error = vpm_data_copy(vp, off + on, n, uiop,
 607                             1, NULL, 0, S_READ);
 608                 } else {
 609                         base = segmap_getmapflt(segkmap, vp, off + on, n, 1,
 610                             S_READ);
 611
 612                         error = uiomove(base + on, n, UIO_READ, uiop);
 613                 }
 614
 615                 if (!error) {
 616                         /*
 617                          * If read a whole block or read to eof,
 618                          * won't need this buffer again soon.
 619                          */
 620                         mutex_enter(&rp->r_statelock);
 621                         if (n + on == MAXBSIZE ||
 622                             uiop->uio_loffset == rp->r_size)
 623                                 flags = SM_DONTNEED;
 624                         else
 625                                 flags = 0;
 626                         mutex_exit(&rp->r_statelock);
 627                         if (vpm_enable) {
 628                                 error = vpm_sync_pages(vp, off, n, flags);
 629                         } else {
 630                                 error = segmap_release(segkmap, base, flags);
 631                         }
 632                 } else {
 633                         if (vpm_enable) {
 634                                 (void) vpm_sync_pages(vp, off, n, 0);
 635                         } else {
 636                                 (void) segmap_release(segkmap, base, 0);
 637                         }
 638                 }
 639         } while (!error && uiop->uio_resid > 0);
 640
 641         return (error);
 642 }
 643
 644 /* ARGSUSED */
 645 static int
 646 nfs3_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
 647         caller_context_t *ct)
 648 {
 649         rlim_t limit = uiop->uio_llimit;
 650         rnode_t *rp;
 651         uoff_t off;
 652         caddr_t base;
 653         uint_t flags;
 654         int remainder;
 655         size_t n;
 656         int on;
 657         int error;
 658         int resid;
 659         offset_t offset;
 660         mntinfo_t *mi;
 661         uint_t bsize;
 662
 663         rp = VTOR(vp);
 664
 665         if (vp->v_type != VREG)
 666                 return (EISDIR);
 667
 668         mi = VTOMI(vp);
 669         if (nfs_zone() != mi->mi_zone)
 670                 return (EIO);
 671         if (uiop->uio_resid == 0)
 672                 return (0);
 673
 674         if (ioflag & FAPPEND) {
 675                 struct vattr va;
 676
 677                 /*
 678                  * Must serialize if appending.
 679                  */
 680                 if (nfs_rw_lock_held(&rp->r_rwlock, RW_READER)) {
 681                         nfs_rw_exit(&rp->r_rwlock);
 682                         if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER,
 683                             INTR(vp)))
 684                                 return (EINTR);
 685                 }
 686
 687                 va.va_mask = VATTR_SIZE;
 688                 error = nfs3getattr(vp, &va, cr);
 689                 if (error)
 690                         return (error);
 691                 uiop->uio_loffset = va.va_size;
 692         }
 693
 694         offset = uiop->uio_loffset + uiop->uio_resid;
 695
 696         if (uiop->uio_loffset < 0 || offset < 0)
 697                 return (EINVAL);
 698
 699         if (limit == RLIM_INFINITY || limit > MAXOFFSET_T)
 700                 limit = MAXOFFSET_T;
 701
 702         /*
 703          * Check to make sure that the process will not exceed
 704          * its limit on file size.  It is okay to write up to
 705          * the limit, but not beyond.  Thus, the write which
 706          * reaches the limit will be short and the next write
 707          * will return an error.
 708          */
 709         remainder = 0;
 710         if (offset > limit) {
 711                 remainder = offset - limit;
 712                 uiop->uio_resid = limit - uiop->uio_loffset;
 713                 if (uiop->uio_resid <= 0) {
 714                         proc_t *p = ttoproc(curthread);
 715
 716                         uiop->uio_resid += remainder;
 717                         mutex_enter(&p->p_lock);
 718                         (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
 719                             p->p_rctls, p, RCA_UNSAFE_SIGINFO);
 720                         mutex_exit(&p->p_lock);
 721                         return (EFBIG);
 722                 }
 723         }
 724
 725         if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp)))
 726                 return (EINTR);
 727
 728         /*
 729          * Bypass VM if caching has been disabled (e.g., locking) or if
 730          * using client-side direct I/O and the file is not mmap'd and
 731          * there are no cached pages.
 732          */
 733         if ((vp->v_flag & VNOCACHE) ||
 734             (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
 735             rp->r_mapcnt == 0 && rp->r_inmap == 0 &&
 736             !vn_has_cached_data(vp))) {
 737                 size_t bufsize;
 738                 int count;
 739                 uoff_t org_offset;
 740                 stable_how stab_comm;
 741
 742 nfs3_fwrite:
 743                 if (rp->r_flags & RSTALE) {
 744                         resid = uiop->uio_resid;
 745                         offset = uiop->uio_loffset;
 746                         error = rp->r_error;
 747                         /*
 748                          * A close may have cleared r_error, if so,
 749                          * propagate ESTALE error return properly
 750                          */
 751                         if (error == 0)
 752                                 error = ESTALE;
 753                         goto bottom;
 754                 }
 755                 bufsize = MIN(uiop->uio_resid, mi->mi_stsize);
 756                 base = kmem_alloc(bufsize, KM_SLEEP);
 757                 do {
 758                         if (ioflag & FDSYNC)
 759                                 stab_comm = DATA_SYNC;
 760                         else
 761                                 stab_comm = FILE_SYNC;
 762                         resid = uiop->uio_resid;
 763                         offset = uiop->uio_loffset;
 764                         count = MIN(uiop->uio_resid, bufsize);
 765                         org_offset = uiop->uio_loffset;
 766                         error = uiomove(base, count, UIO_WRITE, uiop);
 767                         if (!error) {
 768                                 error = nfs3write(vp, base, org_offset,
 769                                     count, cr, &stab_comm);
 770                         }
 771                 } while (!error && uiop->uio_resid > 0);
 772                 kmem_free(base, bufsize);
 773                 goto bottom;
 774         }
 775
 776
 777         bsize = vp->v_vfsp->vfs_bsize;
 778
 779         do {
 780                 off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
 781                 on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
 782                 n = MIN(MAXBSIZE - on, uiop->uio_resid);
 783
 784                 resid = uiop->uio_resid;
 785                 offset = uiop->uio_loffset;
 786
 787                 if (rp->r_flags & RSTALE) {
 788                         error = rp->r_error;
 789                         /*
 790                          * A close may have cleared r_error, if so,
 791                          * propagate ESTALE error return properly
 792                          */
 793                         if (error == 0)
 794                                 error = ESTALE;
 795                         break;
 796                 }
 797
 798                 /*
 799                  * Don't create dirty pages faster than they
 800                  * can be cleaned so that the system doesn't
 801                  * get imbalanced.  If the async queue is
 802                  * maxed out, then wait for it to drain before
 803                  * creating more dirty pages.  Also, wait for
 804                  * any threads doing pagewalks in the vop_getattr
 805                  * entry points so that they don't block for
 806                  * long periods.
 807                  */
 808                 mutex_enter(&rp->r_statelock);
 809                 while ((mi->mi_max_threads != 0 &&
 810                     rp->r_awcount > 2 * mi->mi_max_threads) ||
 811                     rp->r_gcount > 0) {
 812                         if (INTR(vp)) {
 813                                 klwp_t *lwp = ttolwp(curthread);
 814
 815                                 if (lwp != NULL)
 816                                         lwp->lwp_nostop++;
 817                                 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
 818                                         mutex_exit(&rp->r_statelock);
 819                                         if (lwp != NULL)
 820                                                 lwp->lwp_nostop--;
 821                                         error = EINTR;
 822                                         goto bottom;
 823                                 }
 824                                 if (lwp != NULL)
 825                                         lwp->lwp_nostop--;
 826                         } else
 827                                 cv_wait(&rp->r_cv, &rp->r_statelock);
 828                 }
 829                 mutex_exit(&rp->r_statelock);
 830
 831                 /*
 832                  * Touch the page and fault it in if it is not in core
 833                  * before segmap_getmapflt or vpm_data_copy can lock it.
 834                  * This is to avoid the deadlock if the buffer is mapped
 835                  * to the same file through mmap which we want to write.
 836                  */
 837                 uio_prefaultpages((long)n, uiop);
 838
 839                 if (vpm_enable) {
 840                         /*
 841                          * It will use kpm mappings, so no need to
 842                          * pass an address.
 843                          */
 844                         error = writerp(rp, NULL, n, uiop, 0);
 845                 } else  {
 846                         if (segmap_kpm) {
 847                                 int pon = uiop->uio_loffset & PAGEOFFSET;
 848                                 size_t pn = MIN(PAGESIZE - pon,
 849                                     uiop->uio_resid);
 850                                 int pagecreate;
 851
 852                                 mutex_enter(&rp->r_statelock);
 853                                 pagecreate = (pon == 0) && (pn == PAGESIZE ||
 854                                     uiop->uio_loffset + pn >= rp->r_size);
 855                                 mutex_exit(&rp->r_statelock);
 856
 857                                 base = segmap_getmapflt(segkmap, vp, off + on,
 858                                     pn, !pagecreate, S_WRITE);
 859
 860                                 error = writerp(rp, base + pon, n, uiop,
 861                                     pagecreate);
 862
 863                         } else {
 864                                 base = segmap_getmapflt(segkmap, vp, off + on,
 865                                     n, 0, S_READ);
 866                                 error = writerp(rp, base + on, n, uiop, 0);
 867                         }
 868                 }
 869
 870                 if (!error) {
 871                         if (mi->mi_flags & MI_NOAC)
 872                                 flags = SM_WRITE;
 873                         else if ((uiop->uio_loffset % bsize) == 0 ||
 874                             IS_SWAPVP(vp)) {
 875                                 /*
 876                                  * Have written a whole block.
 877                                  * Start an asynchronous write
 878                                  * and mark the buffer to
 879                                  * indicate that it won't be
 880                                  * needed again soon.
 881                                  */
 882                                 flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
 883                         } else
 884                                 flags = 0;
 885                         if ((ioflag & (FSYNC|FDSYNC)) ||
 886                             (rp->r_flags & ROUTOFSPACE)) {
 887                                 flags &= ~SM_ASYNC;
 888                                 flags |= SM_WRITE;
 889                         }
 890                         if (vpm_enable) {
 891                                 error = vpm_sync_pages(vp, off, n, flags);
 892                         } else {
 893                                 error = segmap_release(segkmap, base, flags);
 894                         }
 895                 } else {
 896                         if (vpm_enable) {
 897                                 (void) vpm_sync_pages(vp, off, n, 0);
 898                         } else {
 899                                 (void) segmap_release(segkmap, base, 0);
 900                         }
 901                         /*
 902                          * In the event that we got an access error while
 903                          * faulting in a page for a write-only file just
 904                          * force a write.
 905                          */
 906                         if (error == EACCES)
 907                                 goto nfs3_fwrite;
 908                 }
 909         } while (!error && uiop->uio_resid > 0);
 910
 911 bottom:
 912         if (error) {
 913                 uiop->uio_resid = resid + remainder;
 914                 uiop->uio_loffset = offset;
 915         } else
 916                 uiop->uio_resid += remainder;
 917
 918         nfs_rw_exit(&rp->r_lkserlock);
 919
 920         return (error);
 921 }
 922
 923 /*
 924  * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED}
 925  */
 926 static int
 927 nfs3_rdwrlbn(vnode_t *vp, page_t *pp, uoff_t off, size_t len,
 928         int flags, cred_t *cr)
 929 {
 930         struct buf *bp;
 931         int error;
 932         page_t *savepp;
 933         uchar_t fsdata;
 934         stable_how stab_comm;
 935
 936         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
 937         bp = pageio_setup(pp, len, vp, flags);
 938         ASSERT(bp != NULL);
 939
 940         /*
 941          * pageio_setup should have set b_addr to 0.  This
 942          * is correct since we want to do I/O on a page
 943          * boundary.  bp_mapin will use this addr to calculate
 944          * an offset, and then set b_addr to the kernel virtual
 945          * address it allocated for us.
 946          */
 947         ASSERT(bp->b_un.b_addr == 0);
 948
 949         bp->b_edev = 0;
 950         bp->b_dev = 0;
 951         bp->b_lblkno = lbtodb(off);
 952         bp->b_file = vp;
 953         bp->b_offset = (offset_t)off;
 954         bp_mapin(bp);
 955
 956         /*
 957          * Calculate the desired level of stability to write data
 958          * on the server and then mark all of the pages to reflect
 959          * this.
 960          */
 961         if ((flags & (B_WRITE|B_ASYNC)) == (B_WRITE|B_ASYNC) &&
 962             freemem > desfree) {
 963                 stab_comm = UNSTABLE;
 964                 fsdata = C_DELAYCOMMIT;
 965         } else {
 966                 stab_comm = FILE_SYNC;
 967                 fsdata = C_NOCOMMIT;
 968         }
 969
 970         savepp = pp;
 971         do {
 972                 pp->p_fsdata = fsdata;
 973         } while ((pp = pp->p_next) != savepp);
 974
 975         error = nfs3_bio(bp, &stab_comm, cr);
 976
 977         bp_mapout(bp);
 978         pageio_done(bp);
 979
 980         /*
 981          * If the server wrote pages in a more stable fashion than
 982          * was requested, then clear all of the marks in the pages
 983          * indicating that COMMIT operations were required.
 984          */
 985         if (stab_comm != UNSTABLE && fsdata == C_DELAYCOMMIT) {
 986                 do {
 987                         pp->p_fsdata = C_NOCOMMIT;
 988                 } while ((pp = pp->p_next) != savepp);
 989         }
 990
 991         return (error);
 992 }
 993
 994 /*
 995  * Write to file.  Writes to remote server in largest size
 996  * chunks that the server can handle.  Write is synchronous.
 997  */
 998 static int
 999 nfs3write(vnode_t *vp, caddr_t base, uoff_t offset, int count, cred_t *cr,
1000         stable_how *stab_comm)
1001 {
1002         mntinfo_t *mi;
1003         WRITE3args args;
1004         WRITE3res res;
1005         int error;
1006         int tsize;
1007         rnode_t *rp;
1008         int douprintf;
1009
1010         rp = VTOR(vp);
1011         mi = VTOMI(vp);
1012
1013         ASSERT(nfs_zone() == mi->mi_zone);
1014
1015         args.file = *VTOFH3(vp);
1016         args.stable = *stab_comm;
1017
1018         *stab_comm = FILE_SYNC;
1019
1020         douprintf = 1;
1021
1022         do {
1023                 if ((vp->v_flag & VNOCACHE) ||
1024                     (rp->r_flags & RDIRECTIO) ||
1025                     (mi->mi_flags & MI_DIRECTIO))
1026                         tsize = MIN(mi->mi_stsize, count);
1027                 else
1028                         tsize = MIN(mi->mi_curwrite, count);
1029                 args.offset = (offset3)offset;
1030                 args.count = (count3)tsize;
1031                 args.data.data_len = (uint_t)tsize;
1032                 args.data.data_val = base;
1033
1034                 if (mi->mi_io_kstats) {
1035                         mutex_enter(&mi->mi_lock);
1036                         kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1037                         mutex_exit(&mi->mi_lock);
1038                 }
1039                 args.mblk = NULL;
1040                 do {
1041                         error = rfs3call(mi, NFSPROC3_WRITE,
1042                             xdr_WRITE3args, (caddr_t)&args,
1043                             xdr_WRITE3res, (caddr_t)&res, cr,
1044                             &douprintf, &res.status, 0, NULL);
1045                 } while (error == ENFS_TRYAGAIN);
1046                 if (mi->mi_io_kstats) {
1047                         mutex_enter(&mi->mi_lock);
1048                         kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
1049                         mutex_exit(&mi->mi_lock);
1050                 }
1051
1052                 if (error)
1053                         return (error);
1054                 error = geterrno3(res.status);
1055                 if (!error) {
1056                         if (res.resok.count > args.count) {
1057                                 zcmn_err(getzoneid(), CE_WARN,
1058                                     "nfs3write: server %s wrote %u, "
1059                                     "requested was %u",
1060                                     rp->r_server->sv_hostname,
1061                                     res.resok.count, args.count);
1062                                 return (EIO);
1063                         }
1064                         if (res.resok.committed == UNSTABLE) {
1065                                 *stab_comm = UNSTABLE;
1066                                 if (args.stable == DATA_SYNC ||
1067                                     args.stable == FILE_SYNC) {
1068                                         zcmn_err(getzoneid(), CE_WARN,
1069                         "nfs3write: server %s did not commit to stable storage",
1070                                             rp->r_server->sv_hostname);
1071                                         return (EIO);
1072                                 }
1073                         }
1074                         tsize = (int)res.resok.count;
1075                         count -= tsize;
1076                         base += tsize;
1077                         offset += tsize;
1078                         if (mi->mi_io_kstats) {
1079                                 mutex_enter(&mi->mi_lock);
1080                                 KSTAT_IO_PTR(mi->mi_io_kstats)->writes++;
1081                                 KSTAT_IO_PTR(mi->mi_io_kstats)->nwritten +=
1082                                     tsize;
1083                                 mutex_exit(&mi->mi_lock);
1084                         }
1085                         lwp_stat_update(LWP_STAT_OUBLK, 1);
1086                         mutex_enter(&rp->r_statelock);
1087                         if (rp->r_flags & RHAVEVERF) {
1088                                 if (rp->r_verf != res.resok.verf) {
1089                                         nfs3_set_mod(vp);
1090                                         rp->r_verf = res.resok.verf;
1091                                         /*
1092                                          * If the data was written UNSTABLE,
1093                                          * then might as well stop because
1094                                          * the whole block will have to get
1095                                          * rewritten anyway.
1096                                          */
1097                                         if (*stab_comm == UNSTABLE) {
1098                                                 mutex_exit(&rp->r_statelock);
1099                                                 break;
1100                                         }
1101                                 }
1102                         } else {
1103                                 rp->r_verf = res.resok.verf;
1104                                 rp->r_flags |= RHAVEVERF;
1105                         }
1106                         /*
1107                          * Mark the attribute cache as timed out and
1108                          * set RWRITEATTR to indicate that the file
1109                          * was modified with a WRITE operation and
1110                          * that the attributes can not be trusted.
1111                          */
1112                         PURGE_ATTRCACHE_LOCKED(rp);
1113                         rp->r_flags |= RWRITEATTR;
1114                         mutex_exit(&rp->r_statelock);
1115                 }
1116         } while (!error && count);
1117
1118         return (error);
1119 }
1120
1121 /*
1122  * Read from a file.  Reads data in largest chunks our interface can handle.
1123  */
1124 static int
1125 nfs3read(vnode_t *vp, caddr_t base, offset_t offset, int count,
1126         size_t *residp, cred_t *cr)
1127 {
1128         mntinfo_t *mi;
1129         READ3args args;
1130         READ3vres res;
1131         int tsize;
1132         int error;
1133         int douprintf;
1134         failinfo_t fi;
1135         rnode_t *rp;
1136         struct vattr va;
1137         hrtime_t t;
1138
1139         rp = VTOR(vp);
1140         mi = VTOMI(vp);
1141         ASSERT(nfs_zone() == mi->mi_zone);
1142         douprintf = 1;
1143
1144         args.file = *VTOFH3(vp);
1145         fi.vp = vp;
1146         fi.fhp = (caddr_t)&args.file;
1147         fi.copyproc = nfs3copyfh;
1148         fi.lookupproc = nfs3lookup;
1149         fi.xattrdirproc = acl_getxattrdir3;
1150
1151         res.pov.fres.vp = vp;
1152         res.pov.fres.vap = &va;
1153
1154         res.wlist = NULL;
1155         *residp = count;
1156         do {
1157                 if (mi->mi_io_kstats) {
1158                         mutex_enter(&mi->mi_lock);
1159                         kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1160                         mutex_exit(&mi->mi_lock);
1161                 }
1162
1163                 do {
1164                         if ((vp->v_flag & VNOCACHE) ||
1165                             (rp->r_flags & RDIRECTIO) ||
1166                             (mi->mi_flags & MI_DIRECTIO))
1167                                 tsize = MIN(mi->mi_tsize, count);
1168                         else
1169                                 tsize = MIN(mi->mi_curread, count);
1170                         res.data.data_val = base;
1171                         res.data.data_len = tsize;
1172                         args.offset = (offset3)offset;
1173                         args.count = (count3)tsize;
1174                         args.res_uiop = NULL;
1175                         args.res_data_val_alt = base;
1176
1177                         t = gethrtime();
1178                         error = rfs3call(mi, NFSPROC3_READ,
1179                             xdr_READ3args, (caddr_t)&args,
1180                             xdr_READ3vres, (caddr_t)&res, cr,
1181                             &douprintf, &res.status, 0, &fi);
1182                 } while (error == ENFS_TRYAGAIN);
1183
1184                 if (mi->mi_io_kstats) {
1185                         mutex_enter(&mi->mi_lock);
1186                         kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
1187                         mutex_exit(&mi->mi_lock);
1188                 }
1189
1190                 if (error)
1191                         return (error);
1192
1193                 error = geterrno3(res.status);
1194                 if (error)
1195                         return (error);
1196
1197                 if (res.count != res.data.data_len) {
1198                         zcmn_err(getzoneid(), CE_WARN,
1199                             "nfs3read: server %s returned incorrect amount",
1200                             rp->r_server->sv_hostname);
1201                         return (EIO);
1202                 }
1203
1204                 count -= res.count;
1205                 *residp = count;
1206                 base += res.count;
1207                 offset += res.count;
1208                 if (mi->mi_io_kstats) {
1209                         mutex_enter(&mi->mi_lock);
1210                         KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
1211                         KSTAT_IO_PTR(mi->mi_io_kstats)->nread += res.count;
1212                         mutex_exit(&mi->mi_lock);
1213                 }
1214                 lwp_stat_update(LWP_STAT_INBLK, 1);
1215         } while (count && !res.eof);
1216
1217         if (res.pov.attributes) {
1218                 mutex_enter(&rp->r_statelock);
1219                 if (!CACHE_VALID(rp, va.va_mtime, va.va_size)) {
1220                         mutex_exit(&rp->r_statelock);
1221                         PURGE_ATTRCACHE(vp);
1222                 } else {
1223                         if (rp->r_mtime <= t)
1224                                 nfs_attrcache_va(vp, &va);
1225                         mutex_exit(&rp->r_statelock);
1226                 }
1227         }
1228
1229         return (0);
1230 }
1231
1232 /* ARGSUSED */
1233 static int
1234 nfs3_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp,
1235         caller_context_t *ct)
1236 {
1237
1238         if (nfs_zone() != VTOMI(vp)->mi_zone)
1239                 return (EIO);
1240         switch (cmd) {
1241                 case _FIODIRECTIO:
1242                         return (nfs_directio(vp, (int)arg, cr));
1243                 default:
1244                         return (ENOTTY);
1245         }
1246 }
1247
1248 /* ARGSUSED */
1249 static int
1250 nfs3_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
1251         caller_context_t *ct)
1252 {
1253         int error;
1254         rnode_t *rp;
1255
1256         if (nfs_zone() != VTOMI(vp)->mi_zone)
1257                 return (EIO);
1258         /*
1259          * If it has been specified that the return value will
1260          * just be used as a hint, and we are only being asked
1261          * for size, fsid or rdevid, then return the client's
1262          * notion of these values without checking to make sure
1263          * that the attribute cache is up to date.
1264          * The whole point is to avoid an over the wire GETATTR
1265          * call.
1266          */
1267         rp = VTOR(vp);
1268         if (flags & ATTR_HINT) {
1269                 if (vap->va_mask ==
1270                     (vap->va_mask & (VATTR_SIZE | VATTR_FSID | VATTR_RDEV))) {
1271                         mutex_enter(&rp->r_statelock);
1272                         if (vap->va_mask | VATTR_SIZE)
1273                                 vap->va_size = rp->r_size;
1274                         if (vap->va_mask | VATTR_FSID)
1275                                 vap->va_fsid = rp->r_attr.va_fsid;
1276                         if (vap->va_mask | VATTR_RDEV)
1277                                 vap->va_rdev = rp->r_attr.va_rdev;
1278                         mutex_exit(&rp->r_statelock);
1279                         return (0);
1280                 }
1281         }
1282
1283         /*
1284          * Only need to flush pages if asking for the mtime
1285          * and if there any dirty pages or any outstanding
1286          * asynchronous (write) requests for this file.
1287          */
1288         if (vap->va_mask & VATTR_MTIME) {
1289                 if (vn_has_cached_data(vp) &&
1290                     ((rp->r_flags & RDIRTY) || rp->r_awcount > 0)) {
1291                         mutex_enter(&rp->r_statelock);
1292                         rp->r_gcount++;
1293                         mutex_exit(&rp->r_statelock);
1294                         error = nfs3_putpage(vp, 0, 0, 0, cr, ct);
1295                         mutex_enter(&rp->r_statelock);
1296                         if (error && (error == ENOSPC || error == EDQUOT)) {
1297                                 if (!rp->r_error)
1298                                         rp->r_error = error;
1299                         }
1300                         if (--rp->r_gcount == 0)
1301                                 cv_broadcast(&rp->r_cv);
1302                         mutex_exit(&rp->r_statelock);
1303                 }
1304         }
1305
1306         return (nfs3getattr(vp, vap, cr));
1307 }
1308
1309 /*ARGSUSED4*/
1310 static int
1311 nfs3_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
1312                 caller_context_t *ct)
1313 {
1314         int error;
1315         struct vattr va;
1316
1317         if (vap->va_mask & VATTR_NOSET)
1318                 return (EINVAL);
1319         if (nfs_zone() != VTOMI(vp)->mi_zone)
1320                 return (EIO);
1321
1322         va.va_mask = VATTR_UID | VATTR_MODE;
1323         error = nfs3getattr(vp, &va, cr);
1324         if (error)
1325                 return (error);
1326
1327         error = secpolicy_vnode_setattr(cr, vp, vap, &va, flags, nfs3_accessx,
1328             vp);
1329         if (error)
1330                 return (error);
1331
1332         error = nfs3setattr(vp, vap, flags, cr);
1333
1334         if (error == 0 && (vap->va_mask & VATTR_SIZE) && vap->va_size == 0)
1335                 vnevent_truncate(vp, ct);
1336
1337         return (error);
1338 }
1339
1340 static int
1341 nfs3setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr)
1342 {
1343         int error;
1344         uint_t mask;
1345         SETATTR3args args;
1346         SETATTR3res res;
1347         int douprintf;
1348         rnode_t *rp;
1349         struct vattr va;
1350         mode_t omode;
1351         vsecattr_t *vsp;
1352         hrtime_t t;
1353
1354         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
1355         mask = vap->va_mask;
1356
1357         rp = VTOR(vp);
1358
1359         /*
1360          * Only need to flush pages if there are any pages and
1361          * if the file is marked as dirty in some fashion.  The
1362          * file must be flushed so that we can accurately
1363          * determine the size of the file and the cached data
1364          * after the SETATTR returns.  A file is considered to
1365          * be dirty if it is either marked with RDIRTY, has
1366          * outstanding i/o's active, or is mmap'd.  In this
1367          * last case, we can't tell whether there are dirty
1368          * pages, so we flush just to be sure.
1369          */
1370         if (vn_has_cached_data(vp) &&
1371             ((rp->r_flags & RDIRTY) ||
1372             rp->r_count > 0 ||
1373             rp->r_mapcnt > 0)) {
1374                 ASSERT(vp->v_type != VCHR);
1375                 error = nfs3_putpage(vp, 0, 0, 0, cr, NULL);
1376                 if (error && (error == ENOSPC || error == EDQUOT)) {
1377                         mutex_enter(&rp->r_statelock);
1378                         if (!rp->r_error)
1379                                 rp->r_error = error;
1380                         mutex_exit(&rp->r_statelock);
1381                 }
1382         }
1383
1384         args.object = *RTOFH3(rp);
1385         /*
1386          * If the intent is for the server to set the times,
1387          * there is no point in have the mask indicating set mtime or
1388          * atime, because the vap values may be junk, and so result
1389          * in an overflow error. Remove these flags from the vap mask
1390          * before calling in this case, and restore them afterwards.
1391          */
1392         if ((mask & (VATTR_ATIME | VATTR_MTIME)) && !(flags & ATTR_UTIME)) {
1393                 /* Use server times, so don't set the args time fields */
1394                 vap->va_mask &= ~(VATTR_ATIME | VATTR_MTIME);
1395                 error = vattr_to_sattr3(vap, &args.new_attributes);
1396                 vap->va_mask |= (mask & (VATTR_ATIME | VATTR_MTIME));
1397                 if (mask & VATTR_ATIME) {
1398                         args.new_attributes.atime.set_it = SET_TO_SERVER_TIME;
1399                 }
1400                 if (mask & VATTR_MTIME) {
1401                         args.new_attributes.mtime.set_it = SET_TO_SERVER_TIME;
1402                 }
1403         } else {
1404                 /* Either do not set times or use the client specified times */
1405                 error = vattr_to_sattr3(vap, &args.new_attributes);
1406         }
1407
1408         if (error) {
1409                 /* req time field(s) overflow - return immediately */
1410                 return (error);
1411         }
1412
1413         va.va_mask = VATTR_MODE | VATTR_CTIME;
1414         error = nfs3getattr(vp, &va, cr);
1415         if (error)
1416                 return (error);
1417         omode = va.va_mode;
1418
1419 tryagain:
1420         if (mask & VATTR_SIZE) {
1421                 args.guard.check = TRUE;
1422                 args.guard.obj_ctime.seconds = va.va_ctime.tv_sec;
1423                 args.guard.obj_ctime.nseconds = va.va_ctime.tv_nsec;
1424         } else
1425                 args.guard.check = FALSE;
1426
1427         douprintf = 1;
1428
1429         t = gethrtime();
1430
1431         error = rfs3call(VTOMI(vp), NFSPROC3_SETATTR,
1432             xdr_SETATTR3args, (caddr_t)&args,
1433             xdr_SETATTR3res, (caddr_t)&res, cr,
1434             &douprintf, &res.status, 0, NULL);
1435
1436         /*
1437          * Purge the access cache and ACL cache if changing either the
1438          * owner of the file, the group owner, or the mode.  These may
1439          * change the access permissions of the file, so purge old
1440          * information and start over again.
1441          */
1442         if (mask & (VATTR_UID | VATTR_GID | VATTR_MODE)) {
1443                 (void) nfs_access_purge_rp(rp);
1444                 if (rp->r_secattr != NULL) {
1445                         mutex_enter(&rp->r_statelock);
1446                         vsp = rp->r_secattr;
1447                         rp->r_secattr = NULL;
1448                         mutex_exit(&rp->r_statelock);
1449                         if (vsp != NULL)
1450                                 nfs_acl_free(vsp);
1451                 }
1452         }
1453
1454         if (error) {
1455                 PURGE_ATTRCACHE(vp);
1456                 return (error);
1457         }
1458
1459         error = geterrno3(res.status);
1460         if (!error) {
1461                 /*
1462                  * If changing the size of the file, invalidate
1463                  * any local cached data which is no longer part
1464                  * of the file.  We also possibly invalidate the
1465                  * last page in the file.  We could use
1466                  * pvn_vpzero(), but this would mark the page as
1467                  * modified and require it to be written back to
1468                  * the server for no particularly good reason.
1469                  * This way, if we access it, then we bring it
1470                  * back in.  A read should be cheaper than a
1471                  * write.
1472                  */
1473                 if (mask & VATTR_SIZE) {
1474                         nfs_invalidate_pages(vp,
1475                             (vap->va_size & PAGEMASK), cr);
1476                 }
1477                 nfs3_cache_wcc_data(vp, &res.resok.obj_wcc, t, cr);
1478                 /*
1479                  * Some servers will change the mode to clear the setuid
1480                  * and setgid bits when changing the uid or gid.  The
1481                  * client needs to compensate appropriately.
1482                  */
1483                 if (mask & (VATTR_UID | VATTR_GID)) {
1484                         int terror;
1485
1486                         va.va_mask = VATTR_MODE;
1487                         terror = nfs3getattr(vp, &va, cr);
1488                         if (!terror &&
1489                             (((mask & VATTR_MODE) && va.va_mode != vap->va_mode) ||
1490                             (!(mask & VATTR_MODE) && va.va_mode != omode))) {
1491                                 va.va_mask = VATTR_MODE;
1492                                 if (mask & VATTR_MODE)
1493                                         va.va_mode = vap->va_mode;
1494                                 else
1495                                         va.va_mode = omode;
1496                                 (void) nfs3setattr(vp, &va, 0, cr);
1497                         }
1498                 }
1499         } else {
1500                 nfs3_cache_wcc_data(vp, &res.resfail.obj_wcc, t, cr);
1501                 /*
1502                  * If we got back a "not synchronized" error, then
1503                  * we need to retry with a new guard value.  The
1504                  * guard value used is the change time.  If the
1505                  * server returned post_op_attr, then we can just
1506                  * retry because we have the latest attributes.
1507                  * Otherwise, we issue a GETATTR to get the latest
1508                  * attributes and then retry.  If we couldn't get
1509                  * the attributes this way either, then we give
1510                  * up because we can't complete the operation as
1511                  * required.
1512                  */
1513                 if (res.status == NFS3ERR_NOT_SYNC) {
1514                         va.va_mask = VATTR_CTIME;
1515                         if (nfs3getattr(vp, &va, cr) == 0)
1516                                 goto tryagain;
1517                 }
1518                 PURGE_STALE_FH(error, vp, cr);
1519         }
1520
1521         return (error);
1522 }
1523
     /*
      * Thin adapter around nfs3_access() that takes the vnode as an untyped
      * pointer.  It asserts that the caller's zone (nfs_zone()) matches the
      * mi_zone reached through VTOMI() for the vnode, then forwards the
      * check with flags of 0 and a NULL caller context, returning the
      * result of nfs3_access() unchanged.
      */
1524 static int
1525 nfs3_accessx(void *vp, int mode, cred_t *cr)
1526 {
1527         ASSERT(nfs_zone() == VTOMI((vnode_t *)vp)->mi_zone);
1528         return (nfs3_access(vp, mode, 0, cr, NULL));
1529 }
1530
     /*
      * Check whether "cr" may access "vp" in the ways requested by "mode"
      * (any combination of VREAD, VWRITE and VEXEC).
      *
      * The mode bits are translated into ACCESS3_* bits.  The rnode's access
      * cache is consulted first; if it gives no definite answer, an ACCESS
      * request is sent to the server and the reply is cached.  If access is
      * denied for the caller's cred and crnetadjust() supplied an alternate
      * cred, the check is repeated once with that cred.
      *
      * Returns 0 if access is allowed, EIO when called from the wrong zone,
      * EROFS for a write check on a read-only vnode that is not a device,
      * EACCES if access is denied, or the error from rfs3call() or from the
      * server status (geterrno3()).
      */
1531 /* ARGSUSED */
1532 static int
1533 nfs3_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct)
1534 {
1535         int error;
1536         ACCESS3args args;
1537         ACCESS3res res;
1538         int douprintf;
1539         uint32 acc;
1540         rnode_t *rp;
1541         cred_t *cred, *ncr, *ncrfree = NULL;
1542         failinfo_t fi;
1543         nfs_access_type_t cacc;
1544         hrtime_t t;
1545
             /*
              * Build "acc", the set of ACCESS3_* bits that must all be
              * granted for the requested mode.
              */
1546         acc = 0;
1547         if (nfs_zone() != VTOMI(vp)->mi_zone)
1548                 return (EIO);
1549         if (mode & VREAD)
1550                 acc |= ACCESS3_READ;
1551         if (mode & VWRITE) {
1552                 if (vn_is_readonly(vp) && !IS_DEVVP(vp))
1553                         return (EROFS);
1554                 if (vp->v_type == VDIR)
1555                         acc |= ACCESS3_DELETE;
1556                 acc |= ACCESS3_MODIFY | ACCESS3_EXTEND;
1557         }
1558         if (mode & VEXEC) {
1559                 if (vp->v_type == VDIR)
1560                         acc |= ACCESS3_LOOKUP;
1561                 else
1562                         acc |= ACCESS3_EXECUTE;
1563         }
1564
1565         rp = VTOR(vp);
1566         args.object = *VTOFH3(vp);
             /*
              * The request asks about every access bit that applies to this
              * vnode type, not just the bits in "acc"; the full reply is what
              * nfs_access_cache() records below.
              */
1567         if (vp->v_type == VDIR) {
1568                 args.access = ACCESS3_READ | ACCESS3_DELETE | ACCESS3_MODIFY |
1569                     ACCESS3_EXTEND | ACCESS3_LOOKUP;
1570         } else {
1571                 args.access = ACCESS3_READ | ACCESS3_MODIFY | ACCESS3_EXTEND |
1572                     ACCESS3_EXECUTE;
1573         }
             /*
              * fi is handed to rfs3call() below.
              * NOTE(review): presumably used for failover recovery -- confirm.
              */
1574         fi.vp = vp;
1575         fi.fhp = (caddr_t)&args.object;
1576         fi.copyproc = nfs3copyfh;
1577         fi.lookupproc = nfs3lookup;
1578         fi.xattrdirproc = acl_getxattrdir3;
1579
1580         cred = cr;
1581         /*
1582          * ncr and ncrfree both initially
1583          * point to the memory area returned
1584          * by crnetadjust();
1585          * ncrfree not NULL when exiting means
1586          * that we need to release it
1587          */
1588         ncr = crnetadjust(cred);
1589         ncrfree = ncr;
1590 tryagain:
             /* Try to answer from the rnode's access cache first. */
1591         if (rp->r_acache != NULL) {
1592                 cacc = nfs_access_check(rp, acc, cred);
1593                 if (cacc == NFS_ACCESS_ALLOWED) {
1594                         if (ncrfree != NULL)
1595                                 crfree(ncrfree);
1596                         return (0);
1597                 }
1598                 if (cacc == NFS_ACCESS_DENIED) {
1599                         /*
1600                          * If the cred can be adjusted, try again
1601                          * with the new cred.
1602                          */
1603                         if (ncr != NULL) {
1604                                 cred = ncr;
1605                                 ncr = NULL;
1606                                 goto tryagain;
1607                         }
1608                         if (ncrfree != NULL)
1609                                 crfree(ncrfree);
1610                         return (EACCES);
1611                 }
1612         }
1613
             /*
              * The cache gave neither "allowed" nor "denied": ask the server.
              * "t" is sampled before the call and passed to the attribute
              * caching routines afterwards.
              */
1614         douprintf = 1;
1615
1616         t = gethrtime();
1617
1618         error = rfs3call(VTOMI(vp), NFSPROC3_ACCESS,
1619             xdr_ACCESS3args, (caddr_t)&args,
1620             xdr_ACCESS3res, (caddr_t)&res, cred,
1621             &douprintf, &res.status, 0, &fi);
1622
1623         if (error) {
1624                 if (ncrfree != NULL)
1625                         crfree(ncrfree);
1626                 return (error);
1627         }
1628
1629         error = geterrno3(res.status);
1630         if (!error) {
1631                 nfs3_cache_post_op_attr(vp, &res.resok.obj_attributes, t, cr);
1632                 nfs_access_cache(rp, args.access, res.resok.access, cred);
1633                 /*
1634                  * we just cached results with cred; if cred is the
1635                  * adjusted credentials from crnetadjust, we do not want
1636                  * to release them before exiting: hence setting ncrfree
1637                  * to NULL
1638                  */
1639                 if (cred != cr)
1640                         ncrfree = NULL;
                     /* Every bit in "acc" must have been granted. */
1641                 if ((acc & res.resok.access) != acc) {
1642                         /*
1643                          * If the cred can be adjusted, try again
1644                          * with the new cred.
1645                          */
1646                         if (ncr != NULL) {
1647                                 cred = ncr;
1648                                 ncr = NULL;
1649                                 goto tryagain;
1650                         }
1651                         error = EACCES;
1652                 }
1653         } else {
1654                 nfs3_cache_post_op_attr(vp, &res.resfail.obj_attributes, t, cr);
1655                 PURGE_STALE_FH(error, vp, cr);
1656         }
1657
1658         if (ncrfree != NULL)
1659                 crfree(ncrfree);
1660
1661         return (error);
1662 }
1663
     /*
      * When non-zero, nfs3_readlink() serves link contents from, and stores
      * them into, the rnode's r_symlink cache.
      */
1664 static int nfs3_do_symlink_cache = 1;
1665
     /*
      * Read the target of symbolic link "vp" into "uiop".
      *
      * If symlink caching is enabled and the rnode already holds the link
      * contents, the caches are validated and the cached copy is returned
      * without an RPC.  Otherwise a READLINK request is sent with a
      * MAXPATHLEN reply buffer; on success the text is copied out and the
      * buffer is either installed as the rnode's cached contents or freed.
      *
      * Returns 0 on success, EINVAL if vp is not a symbolic link (including
      * a server reply that maps to ENXIO) or if the reply data is
      * nfs3nametoolong, EIO when called from the wrong zone, or the error
      * from nfs3_validate_caches(), uiomove(), rfs3call() or the server
      * status.
      */
1666 /* ARGSUSED */
1667 static int
1668 nfs3_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr, caller_context_t *ct)
1669 {
1670         int error;
1671         READLINK3args args;
1672         READLINK3res res;
1673         nfspath3 resdata_backup;
1674         rnode_t *rp;
1675         int douprintf;
1676         int len;
1677         failinfo_t fi;
1678         hrtime_t t;
1679
1680         /*
1681          * Can't readlink anything other than a symbolic link.
1682          */
1683         if (vp->v_type != VLNK)
1684                 return (EINVAL);
1685         if (nfs_zone() != VTOMI(vp)->mi_zone)
1686                 return (EIO);
1687
1688         rp = VTOR(vp);
             /*
              * Fast path: serve the link text from the rnode's cache.  The
              * contents pointer is tested again under r_statelock before it
              * is used; if it is gone by then, fall through to the RPC.
              */
1689         if (nfs3_do_symlink_cache && rp->r_symlink.contents != NULL) {
1690                 error = nfs3_validate_caches(vp, cr);
1691                 if (error)
1692                         return (error);
1693                 mutex_enter(&rp->r_statelock);
1694                 if (rp->r_symlink.contents != NULL) {
1695                         error = uiomove(rp->r_symlink.contents,
1696                             rp->r_symlink.len, UIO_READ, uiop);
1697                         mutex_exit(&rp->r_statelock);
1698                         return (error);
1699                 }
1700                 mutex_exit(&rp->r_statelock);
1701         }
1702
1703         args.symlink = *VTOFH3(vp);
1704         fi.vp = vp;
1705         fi.fhp = (caddr_t)&args.symlink;
1706         fi.copyproc = nfs3copyfh;
1707         fi.lookupproc = nfs3lookup;
1708         fi.xattrdirproc = acl_getxattrdir3;
1709
             /*
              * Buffer for the reply.  A second copy of the pointer is kept
              * because res.resok.data is compared against nfs3nametoolong
              * after the call; the error path frees the allocation through
              * resdata_backup rather than through res.resok.data.
              */
1710         res.resok.data = kmem_alloc(MAXPATHLEN, KM_SLEEP);
1711
1712         resdata_backup = res.resok.data;
1713
1714         douprintf = 1;
1715
1716         t = gethrtime();
1717
1718         error = rfs3call(VTOMI(vp), NFSPROC3_READLINK,
1719             xdr_READLINK3args, (caddr_t)&args,
1720             xdr_READLINK3res, (caddr_t)&res, cr,
1721             &douprintf, &res.status, 0, &fi);
1722
1723         if (res.resok.data == nfs3nametoolong)
1724                 error = EINVAL;
1725
1726         if (error) {
1727                 kmem_free(resdata_backup, MAXPATHLEN);
1728                 return (error);
1729         }
1730
1731         error = geterrno3(res.status);
1732         if (!error) {
1733                 nfs3_cache_post_op_attr(vp, &res.resok.symlink_attributes, t,
1734                     cr);
1735                 len = strlen(res.resok.data);
1736                 error = uiomove(res.resok.data, len, UIO_READ, uiop);
                     /*
                      * Hand the buffer to the rnode as its cached link
                      * contents if none are installed yet (re-tested under
                      * r_statelock); otherwise free it.  This is done
                      * regardless of the uiomove() result held in "error".
                      */
1737                 if (nfs3_do_symlink_cache && rp->r_symlink.contents == NULL) {
1738                         mutex_enter(&rp->r_statelock);
1739                                 if (rp->r_symlink.contents == NULL) {
1740                                 rp->r_symlink.contents = res.resok.data;
1741                                 rp->r_symlink.len = len;
1742                                 rp->r_symlink.size = MAXPATHLEN;
1743                                 mutex_exit(&rp->r_statelock);
1744                         } else {
1745                                 mutex_exit(&rp->r_statelock);
1746
1747                                 kmem_free((void *)res.resok.data, MAXPATHLEN);
1748                         }
1749                 } else {
1750                         kmem_free((void *)res.resok.data, MAXPATHLEN);
1751                 }
1752         } else {
1753                 nfs3_cache_post_op_attr(vp,
1754                     &res.resfail.symlink_attributes, t, cr);
1755                 PURGE_STALE_FH(error, vp, cr);
1756
1757                 kmem_free((void *)res.resok.data, MAXPATHLEN);
1758
1759         }
1760
1761         /*
1762          * The over the wire error for attempting to readlink something
1763          * other than a symbolic link is ENXIO.  However, we need to
1764          * return EINVAL instead of ENXIO, so we map it here.
1765          */
1766         return (error == ENXIO ? EINVAL : error);
1767 }
1768
1769 /*
1770  * Flush local dirty pages to stable storage on the server.
1771  *
1772  * If FNODSYNC is specified, then there is nothing to do because
1773  * metadata changes are not cached on the client before being
1774  * sent to the server.
      *
      * Swap vnodes (IS_SWAPVP) are skipped in the same way.  Returns 0 in
      * those two cases, EIO when called from the wrong zone, the error from
      * nfs3_putpage_commit(), or, if that call succeeded, the value of the
      * rnode's r_error field.
1775  */
1776 /* ARGSUSED */
1777 static int
1778 nfs3_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
1779 {
1780         int error;
1781
1782         if ((syncflag & FNODSYNC) || IS_SWAPVP(vp))
1783                 return (0);
1784         if (nfs_zone() != VTOMI(vp)->mi_zone)
1785                 return (EIO);
1786
1787         error = nfs3_putpage_commit(vp, 0, 0, cr);
             /* A clean flush still reports any error recorded on the rnode. */
1788         if (!error)
1789                 error = VTOR(vp)->r_error;
1790         return (error);
1791 }
1792
1793 /*
1794  * Weirdness: if the file was removed or the target of a rename
1795  * operation while it was open, it got renamed instead.  Here we
1796  * remove the renamed file.
      *
      * When called from the wrong zone the work is handed to
      * nfs_async_inactive() instead.  Otherwise, for as long as r_unldvp
      * is set, any cached dirty pages are pushed with nfs3_putpage(), a
      * REMOVE is sent for the saved name in the saved directory using the
      * saved cred, and the saved directory hold, name buffer and cred are
      * released.  Finally the rnode is passed to rp_addfree().
1797  */
1798 /* ARGSUSED */
1799 static void
1800 nfs3_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
1801 {
1802         rnode_t *rp;
1803
1804         ASSERT(vp != DNLC_NO_VNODE);
1805
1806         /*
1807          * If this is coming from the wrong zone, we let someone in the right
1808          * zone take care of it asynchronously.  We can get here due to
1809          * VN_RELE() being called from pageout() or fsflush().  This call may
1810          * potentially turn into an expensive no-op if, for instance, v_count
1811          * gets incremented in the meantime, but it's still correct.
1812          */
1813         if (nfs_zone() != VTOMI(vp)->mi_zone) {
1814                 nfs_async_inactive(vp, cr, nfs3_inactive);
1815                 return;
1816         }
1817
1818         rp = VTOR(vp);
1819 redo:
1820         if (rp->r_unldvp != NULL) {
1821                 /*
1822                  * Save the vnode pointer for the directory where the
1823                  * unlinked-open file got renamed, then set it to NULL
1824                  * to prevent another thread from getting here before
1825                  * we're done with the remove.  While we have the
1826                  * statelock, make local copies of the pertinent rnode
1827                  * fields.  If we weren't to do this in an atomic way, the
1828                  * unl* fields could become inconsistent with respect
1829                  * to each other due to a race condition between this
1830                  * code and nfs_remove().  See bug report 1034328.
1831                  */
1832                 mutex_enter(&rp->r_statelock);
1833                 if (rp->r_unldvp != NULL) {
1834                         vnode_t *unldvp;
1835                         char *unlname;
1836                         cred_t *unlcred;
1837                         REMOVE3args args;
1838                         REMOVE3res res;
1839                         int douprintf;
1840                         int error;
1841                         hrtime_t t;
1842
1843                         unldvp = rp->r_unldvp;
1844                         rp->r_unldvp = NULL;
1845                         unlname = rp->r_unlname;
1846                         rp->r_unlname = NULL;
1847                         unlcred = rp->r_unlcred;
1848                         rp->r_unlcred = NULL;
1849                         mutex_exit(&rp->r_statelock);
1850
1851                         /*
1852                          * If there are any dirty pages left, then flush
1853                          * them.  This is unfortunate because they just
1854                          * may get thrown away during the remove operation,
1855                          * but we have to do this for correctness.
1856                          */
1857                         if (vn_has_cached_data(vp) &&
1858                             ((rp->r_flags & RDIRTY) || rp->r_count > 0)) {
1859                                 ASSERT(vp->v_type != VCHR);
1860                                 error = nfs3_putpage(vp, 0, 0, 0,
1861                                     cr, ct);
                                     /*
                                      * Record only the first error on the
                                      * rnode; an existing r_error is kept.
                                      */
1862                                 if (error) {
1863                                         mutex_enter(&rp->r_statelock);
1864                                         if (!rp->r_error)
1865                                                 rp->r_error = error;
1866                                         mutex_exit(&rp->r_statelock);
1867                                 }
1868                         }
1869
1870                         /*
1871                          * Do the remove operation on the renamed file
1872                          */
1873                         setdiropargs3(&args.object, unlname, unldvp);
1874
1875                         douprintf = 1;
1876
1877                         t = gethrtime();
1878
                             /*
                              * The REMOVE itself is issued with the saved
                              * unlcred; the cache updates below use the
                              * caller's cr.
                              */
1879                         error = rfs3call(VTOMI(unldvp), NFSPROC3_REMOVE,
1880                             xdr_diropargs3, (caddr_t)&args,
1881                             xdr_REMOVE3res, (caddr_t)&res, unlcred,
1882                             &douprintf, &res.status, 0, NULL);
1883
1884                         if (error) {
1885                                 PURGE_ATTRCACHE(unldvp);
1886                         } else {
1887                                 error = geterrno3(res.status);
1888                                 if (!error) {
1889                                         nfs3_cache_wcc_data(unldvp,
1890                                             &res.resok.dir_wcc, t, cr);
1891                                         if (HAVE_RDDIR_CACHE(VTOR(unldvp)))
1892                                                 nfs_purge_rddir_cache(unldvp);
1893                                 } else {
1894                                         nfs3_cache_wcc_data(unldvp,
1895                                             &res.resfail.dir_wcc, t, cr);
1896                                         PURGE_STALE_FH(error, unldvp, cr);
1897                                 }
1898                         }
1899
1900                         /*
1901                          * Release stuff held for the remove
1902                          */
1903                         VN_RELE(unldvp);
1904                         kmem_free(unlname, MAXNAMELEN);
1905                         crfree(unlcred);
                             /*
                              * Test r_unldvp again from the top rather than
                              * assuming it stayed NULL while the lock was
                              * dropped.
                              */
1906                         goto redo;
1907                 }
1908                 mutex_exit(&rp->r_statelock);
1909         }
1910
1911         rp_addfree(rp, cr);
1912 }
1913
1914 /*
1915  * Remote file system operations having to do with directory manipulation.
1916  */
1917
     /*
      * Lookup entry point: look up "nm" in directory "dvp" and return the
      * resulting vnode in *vpp.
      *
      * With LOOKUP_XATTR set in "flags", the hidden extended attribute
      * directory of dvp is located first (DNLC, then acl_getxattrdir3())
      * and the name is looked up in that directory instead.  The r_rwlock
      * of the directory being searched is held as reader around each step.
      * If the result is a device vnode it is replaced by the vnode
      * returned from specvp().
      *
      * Returns 0 on success, EPERM when called from the wrong zone, EINVAL
      * if extended attributes are requested but MI_EXTATTR is not set in
      * the mount's mi_flags, EINTR if nfs_rw_enter_sig() reports an
      * interruption, or the error from acl_getxattrdir3() or nfs3lookup().
      */
1918 /* ARGSUSED */
1919 static int
1920 nfs3_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
1921         int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
1922         int *direntflags, pathname_t *realpnp)
1923 {
1924         int error;
1925         vnode_t *vp;
1926         vnode_t *avp = NULL;
1927         rnode_t *drp;
1928
1929         if (nfs_zone() != VTOMI(dvp)->mi_zone)
1930                 return (EPERM);
1931
1932         drp = VTOR(dvp);
1933
1934         /*
1935          * Are we looking up extended attributes?  If so, "dvp" is
1936          * the file or directory for which we want attributes, and
1937          * we need a lookup of the hidden attribute directory
1938          * before we lookup the rest of the path.
1939          */
1940         if (flags & LOOKUP_XATTR) {
1941                 bool_t cflag = ((flags & CREATE_XATTR_DIR) != 0);
1942                 mntinfo_t *mi;
1943
1944                 mi = VTOMI(dvp);
1945                 if (!(mi->mi_flags & MI_EXTATTR))
1946                         return (EINVAL);
1947
1948                 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp)))
1949                         return (EINTR);
1950
                     /*
                      * Try the DNLC first; its return value is ignored and
                      * only a non-NULL avp counts as success.
                      */
1951                 (void) nfs3lookup_dnlc(dvp, XATTR_DIR_NAME, &avp, cr);
1952                 if (avp == NULL)
1953                         error = acl_getxattrdir3(dvp, &avp, cflag, cr, 0);
1954                 else
1955                         error = 0;
1956
1957                 nfs_rw_exit(&drp->r_rwlock);
1958
                     /*
                      * MI_EXTATTR was set when tested above and is tested
                      * again here.  NOTE(review): presumably
                      * acl_getxattrdir3() can clear it when the server
                      * turns out not to support extended attributes --
                      * confirm.
                      */
1959                 if (error) {
1960                         if (mi->mi_flags & MI_EXTATTR)
1961                                 return (error);
1962                         return (EINVAL);
1963                 }
                     /* From here on, search the attribute directory. */
1964                 dvp = avp;
1965                 drp = VTOR(dvp);
1966         }
1967
1968         if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp))) {
1969                 error = EINTR;
1970                 goto out;
1971         }
1972
1973         error = nfs3lookup(dvp, nm, vpp, pnp, flags, rdir, cr, 0);
1974
1975         nfs_rw_exit(&drp->r_rwlock);
1976
1977         /*
1978          * If vnode is a device, create special vnode.
1979          */
1980         if (!error && IS_DEVVP(*vpp)) {
1981                 vp = *vpp;
1982                 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
1983                 VN_RELE(vp);
1984         }
1985
1986 out:
             /* Drop the attribute directory vnode if one was obtained. */
1987         if (avp != NULL)
1988                 VN_RELE(avp);
1989
1990         return (error);
1991 }
1992
     /*
      * When non-zero, an ENOENT result from an over-the-wire lookup is
      * entered in the DNLC as a negative entry (DNLC_NO_VNODE); see
      * nfs3lookup_otw().
      */
1993 static int nfs3_lookup_neg_cache = 1;
1994
1995 #ifdef DEBUG
     /* DNLC lookup statistics, incremented in nfs3lookup_dnlc(). */
1996 static int nfs3_lookup_dnlc_hits = 0;
1997 static int nfs3_lookup_dnlc_misses = 0;
1998 static int nfs3_lookup_dnlc_neg_hits = 0;
1999 static int nfs3_lookup_dnlc_disappears = 0;
2000 static int nfs3_lookup_dnlc_lookups = 0;
2001 #endif
2002
     /*
      * Look up "nm" in directory "dvp" and return the vnode in *vpp.
      *
      * An empty name returns dvp itself; "." returns dvp after an access
      * check; any other name is tried in the DNLC and then over the wire.
      * When RFSCALL_SOFT is set in rfscall_flags, the "." and DNLC steps
      * are skipped and the request goes straight to nfs3lookup_otw().
      *
      * Returns 0 on success, ENOTDIR if dvp is not a directory (and nm is
      * not empty), or the error from nfs3_access(), nfs3lookup_dnlc() or
      * nfs3lookup_otw().  pnp, flags and rdir are not used here.
      */
2003 /* ARGSUSED */
2004 int
2005 nfs3lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
2006         int flags, vnode_t *rdir, cred_t *cr, int rfscall_flags)
2007 {
2008         int error;
2009         rnode_t *drp;
2010
2011         ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
2012         /*
2013          * If lookup is for "", just return dvp.  Don't need
2014          * to send it over the wire, look it up in the dnlc,
2015          * or perform any access checks.
2016          */
2017         if (*nm == '\0') {
2018                 VN_HOLD(dvp);
2019                 *vpp = dvp;
2020                 return (0);
2021         }
2022
2023         /*
2024          * Can't do lookups in non-directories.
2025          */
2026         if (dvp->v_type != VDIR)
2027                 return (ENOTDIR);
2028
2029         /*
2030          * If we're called with RFSCALL_SOFT, it's important that
2031          * the only rfscall is one we make directly; if we permit
2032          * an access call because we're looking up "." or validating
2033          * a dnlc hit, we'll deadlock because that rfscall will not
2034          * have the RFSCALL_SOFT set.
2035          */
2036         if (rfscall_flags & RFSCALL_SOFT)
2037                 goto callit;
2038
2039         /*
2040          * If lookup is for ".", just return dvp.  Don't need
2041          * to send it over the wire or look it up in the dnlc,
2042          * just need to check access.
2043          */
2044         if (strcmp(nm, ".") == 0) {
2045                 error = nfs3_access(dvp, VEXEC, 0, cr, NULL);
2046                 if (error)
2047                         return (error);
2048                 VN_HOLD(dvp);
2049                 *vpp = dvp;
2050                 return (0);
2051         }
2052
             /*
              * Set RLOOKUP in the directory rnode's flags.  The flag is
              * tested without the lock first, so r_statelock is only taken
              * when the flag is not yet set.
              */
2053         drp = VTOR(dvp);
2054         if (!(drp->r_flags & RLOOKUP)) {
2055                 mutex_enter(&drp->r_statelock);
2056                 drp->r_flags |= RLOOKUP;
2057                 mutex_exit(&drp->r_statelock);
2058         }
2059
2060         /*
2061          * Lookup this name in the DNLC.  If there was a valid entry,
2062          * then return the results of the lookup.
2063          */
2064         error = nfs3lookup_dnlc(dvp, nm, vpp, cr);
2065         if (error || *vpp != NULL)
2066                 return (error);
2067
2068 callit:
2069         error = nfs3lookup_otw(dvp, nm, vpp, cr, rfscall_flags);
2070
2071         return (error);
2072 }
2073
     /*
      * Try to satisfy a lookup of "nm" in directory "dvp" from the DNLC.
      *
      * Returns 0 with *vpp set to the vnode on a hit, 0 with *vpp set to
      * NULL when the DNLC cannot answer, ENOENT on a negative
      * (DNLC_NO_VNODE) hit, or the error from nfs3_validate_caches() or
      * nfs3_access().  On the non-zero returns *vpp is not written.
      */
2074 static int
2075 nfs3lookup_dnlc(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
2076 {
2077         int error;
2078         vnode_t *vp;
2079
2080         ASSERT(*nm != '\0');
2081         ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
2082         /*
2083          * Lookup this name in the DNLC.  If successful, then validate
2084          * the caches and then recheck the DNLC.  The DNLC is rechecked
2085          * just in case this entry got invalidated during the call
2086          * to nfs3_validate_caches.
2087          *
2088          * An assumption is being made that it is safe to say that a
2089          * file exists which may not on the server.  Any operations to
2090          * the server will fail with ESTALE.
2091          */
2092 #ifdef DEBUG
2093         nfs3_lookup_dnlc_lookups++;
2094 #endif
2095         vp = dnlc_lookup(dvp, nm);
2096         if (vp != NULL) {
                     /*
                      * Release the result of this first lookup right away;
                      * below, vp is only compared against DNLC_NO_VNODE
                      * before being looked up again.
                      */
2097                 VN_RELE(vp);
                     /*
                      * Negative entry in a directory that is not read-only:
                      * purge the attribute cache before validating.
                      * NOTE(review): presumably so the validation below
                      * goes to the server for fresh attributes -- confirm.
                      */
2098                 if (vp == DNLC_NO_VNODE && !vn_is_readonly(dvp)) {
2099                         PURGE_ATTRCACHE(dvp);
2100                 }
2101                 error = nfs3_validate_caches(dvp, cr);
2102                 if (error)
2103                         return (error);
2104                 vp = dnlc_lookup(dvp, nm);
2105                 if (vp != NULL) {
2106                         error = nfs3_access(dvp, VEXEC, 0, cr, NULL);
2107                         if (error) {
2108                                 VN_RELE(vp);
2109                                 return (error);
2110                         }
2111                         if (vp == DNLC_NO_VNODE) {
2112                                 VN_RELE(vp);
2113 #ifdef DEBUG
2114                                 nfs3_lookup_dnlc_neg_hits++;
2115 #endif
2116                                 return (ENOENT);
2117                         }
2118                         *vpp = vp;
2119 #ifdef DEBUG
2120                         nfs3_lookup_dnlc_hits++;
2121 #endif
2122                         return (0);
2123                 }
                     /* The entry vanished between the two DNLC lookups. */
2124 #ifdef DEBUG
2125                 nfs3_lookup_dnlc_disappears++;
2126 #endif
2127         }
2128 #ifdef DEBUG
2129         else
2130                 nfs3_lookup_dnlc_misses++;
2131 #endif
2132
             /* No answer from the DNLC; the caller decides what to do next. */
2133         *vpp = NULL;
2134
2135         return (0);
2136 }
2137
     /*
      * Look up "nm" in directory "dvp" with an over-the-wire LOOKUP request.
      *
      * If the RPC itself succeeds, the directory attributes from the reply
      * are cached.  On a successful status a vnode is made for the returned
      * file handle (fetching the type with nfs3getattr() when the reply
      * carried no object attributes and the vnode type is still VNON),
      * entered in the DNLC unless RFSCALL_SOFT is set in rfscall_flags,
      * and returned in *vpp.  On ENOENT a negative DNLC entry is added
      * when nfs3_lookup_neg_cache is set.
      *
      * Returns 0 on success, or the error from rfs3call(), the server
      * status (geterrno3()) or nfs3getattr().
      */
2138 static int
2139 nfs3lookup_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr,
2140         int rfscall_flags)
2141 {
2142         int error;
2143         LOOKUP3args args;
2144         LOOKUP3vres res;
2145         int douprintf;
2146         struct vattr vattr;
2147         struct vattr dvattr;
2148         vnode_t *vp;
2149         failinfo_t fi;
2150         hrtime_t t;
2151
2152         ASSERT(*nm != '\0');
2153         ASSERT(dvp->v_type == VDIR);
2154         ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
2155
2156         setdiropargs3(&args.what, nm, dvp);
2157
2158         fi.vp = dvp;
2159         fi.fhp = (caddr_t)&args.what.dir;
2160         fi.copyproc = nfs3copyfh;
2161         fi.lookupproc = nfs3lookup;
2162         fi.xattrdirproc = acl_getxattrdir3;
             /*
              * Point the result structure at the local vattr buffers for the
              * object (vattr) and directory (dvattr) attributes.
              */
2163         res.obj_attributes.fres.vp = dvp;
2164         res.obj_attributes.fres.vap = &vattr;
2165         res.dir_attributes.fres.vp = dvp;
2166         res.dir_attributes.fres.vap = &dvattr;
2167
2168         douprintf = 1;
2169
2170         t = gethrtime();
2171
2172         error = rfs3call(VTOMI(dvp), NFSPROC3_LOOKUP,
2173             xdr_diropargs3, (caddr_t)&args,
2174             xdr_LOOKUP3vres, (caddr_t)&res, cr,
2175             &douprintf, &res.status, rfscall_flags, &fi);
2176
2177         if (error)
2178                 return (error);
2179
             /* Done for both success and failure statuses from the server. */
2180         nfs3_cache_post_op_vattr(dvp, &res.dir_attributes, t, cr);
2181
2182         error = geterrno3(res.status);
2183         if (error) {
2184                 PURGE_STALE_FH(error, dvp, cr);
2185                 if (error == ENOENT && nfs3_lookup_neg_cache)
2186                         dnlc_enter(dvp, nm, DNLC_NO_VNODE);
2187                 return (error);
2188         }
2189
2190         if (res.obj_attributes.attributes) {
2191                 vp = makenfs3node_va(&res.object, res.obj_attributes.fres.vap,
2192                     dvp->v_vfsp, t, cr, VTOR(dvp)->r_path, nm);
2193         } else {
2194                 vp = makenfs3node_va(&res.object, NULL,
2195                     dvp->v_vfsp, t, cr, VTOR(dvp)->r_path, nm);
                     /*
                      * No attributes in the reply and the vnode has no type
                      * yet: fetch the type explicitly.
                      */
2196                 if (vp->v_type == VNON) {
2197                         vattr.va_mask = VATTR_TYPE;
2198                         error = nfs3getattr(vp, &vattr, cr);
2199                         if (error) {
2200                                 VN_RELE(vp);
2201                                 return (error);
2202                         }
2203                         vp->v_type = vattr.va_type;
2204                 }
2205         }
2206
2207         if (!(rfscall_flags & RFSCALL_SOFT))
2208                 dnlc_update(dvp, nm, vp);
2209
2210         *vpp = vp;
2211
2212         return (error);
2213 }
2214
2215 #ifdef DEBUG
     /*
      * DEBUG-only counter.  NOTE(review): its updates are not in the code
      * visible here; presumably it is bumped in the create path -- confirm.
      */
2216 static int nfs3_create_misses = 0;
2217 #endif
2218
2219 /* ARGSUSED */
2220 static int
2221 nfs3_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
2222         int mode, vnode_t **vpp, cred_t *cr, int flags, caller_context_t *ct,
2223         vsecattr_t *vsecp)
2224 {
2225         int error;
2226         vnode_t *vp;
2227         rnode_t *rp;
2228         struct vattr vattr;
2229         rnode_t *drp;
2230         vnode_t *tempvp;
2231
2232         drp = VTOR(dvp);
2233         if (nfs_zone() != VTOMI(dvp)->mi_zone)
2234                 return (EPERM);
2235         if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2236                 return (EINTR);
2237
2238 top:
2239         /*
2240          * We make a copy of the attributes because the caller does not
2241          * expect us to change what va points to.
2242          */
2243         vattr = *va;
2244
2245         /*
2246          * If the pathname is "", just use dvp.  Don't need
2247          * to send it over the wire, look it up in the dnlc,
2248          * or perform any access checks.
2249          */
2250         if (*nm == '\0') {
2251                 error = 0;
2252                 VN_HOLD(dvp);
2253                 vp = dvp;
2254         /*
2255          * If the pathname is ".", just use dvp.  Don't need
2256          * to send it over the wire or look it up in the dnlc,
2257          * just need to check access.
2258          */
2259         } else if (strcmp(nm, ".") == 0) {
2260                 error = nfs3_access(dvp, VEXEC, 0, cr, ct);
2261                 if (error) {
2262                         nfs_rw_exit(&drp->r_rwlock);
2263                         return (error);
2264                 }
2265                 VN_HOLD(dvp);
2266                 vp = dvp;
2267         /*
2268          * We need to go over the wire, just to be sure whether the
2269          * file exists or not.  Using the DNLC can be dangerous in
2270          * this case when making a decision regarding existence.
2271          */
2272         } else {
2273                 error = nfs3lookup_otw(dvp, nm, &vp, cr, 0);
2274         }
2275         if (!error) {
2276                 if (exclusive == EXCL)
2277                         error = EEXIST;
2278                 else if (vp->v_type == VDIR && (mode & VWRITE))
2279                         error = EISDIR;
2280                 else {
2281                         /*
2282                          * If vnode is a device, create special vnode.
2283                          */
2284                         if (IS_DEVVP(vp)) {
2285                                 tempvp = vp;
2286                                 vp = specvp(vp, vp->v_rdev, vp->v_type, cr);
2287                                 VN_RELE(tempvp);
2288                         }
2289                         if (!(error = fop_access(vp, mode, 0, cr, ct))) {
2290                                 if ((vattr.va_mask & VATTR_SIZE) &&
2291                                     vp->v_type == VREG) {
2292                                         vattr.va_mask = VATTR_SIZE;
2293                                         error = nfs3setattr(vp, &vattr, 0, cr);
2294
2295                                         /*
2296                                          * Existing file was truncated; emit a
2297                                          * create event.
2298                                          */
2299                                         vnevent_create(vp, ct);
2300                                 }
2301                         }
2302                 }
2303                 nfs_rw_exit(&drp->r_rwlock);
2304                 if (error) {
2305                         VN_RELE(vp);
2306                 } else {
2307                         *vpp = vp;
2308                 }
2309
2310                 return (error);
2311         }
2312
2313         dnlc_remove(dvp, nm);
2314
2315         /*
2316          * Decide what the group-id of the created file should be.
2317          * Set it in attribute list as advisory...
2318          */
2319         error = setdirgid(dvp, &vattr.va_gid, cr);
2320         if (error) {
2321                 nfs_rw_exit(&drp->r_rwlock);
2322                 return (error);
2323         }
2324         vattr.va_mask |= VATTR_GID;
2325
2326         ASSERT(vattr.va_mask & VATTR_TYPE);
2327         if (vattr.va_type == VREG) {
2328                 ASSERT(vattr.va_mask & VATTR_MODE);
2329                 if (MANDMODE(vattr.va_mode)) {
2330                         nfs_rw_exit(&drp->r_rwlock);
2331                         return (EACCES);
2332                 }
2333                 error = nfs3create(dvp, nm, &vattr, exclusive, mode, vpp, cr);
2334                 /*
2335                  * If this is not an exclusive create, then the CREATE
2336                  * request will be made with the GUARDED mode set.  This
2337                  * means that the server will return EEXIST if the file
2338                  * exists.  The file could exist because of a retransmitted
2339                  * request.  In this case, we recover by starting over and
2340                  * checking to see whether the file exists.  This second
2341                  * time through it should and a CREATE request will not be
2342                  * sent.
2343                  *
2344                  * This handles the problem of a dangling CREATE request
2345                  * which contains attributes which indicate that the file
2346                  * should be truncated.  This retransmitted request could
2347                  * possibly truncate valid data in the file if not caught
2348                  * by the duplicate request mechanism on the server or if
2349                  * not caught by other means.  The scenario is:
2350                  *
2351                  * Client transmits CREATE request with size = 0
2352                  * Client times out, retransmits request.
2353                  * Response to the first request arrives from the server
2354                  *  and the client proceeds on.
2355                  * Client writes data to the file.
2356                  * The server now processes retransmitted CREATE request
2357                  *  and truncates file.
2358                  *
2359                  * The use of the GUARDED CREATE request prevents this from
2360                  * happening because the retransmitted CREATE would fail
2361                  * with EEXIST and would not truncate the file.
2362                  */
2363                 if (error == EEXIST && exclusive == NONEXCL) {
2364 #ifdef DEBUG
2365                         nfs3_create_misses++;
2366 #endif
2367                         goto top;
2368                 }
2369                 nfs_rw_exit(&drp->r_rwlock);
2370                 return (error);
2371         }
2372         error = nfs3mknod(dvp, nm, &vattr, exclusive, mode, vpp, cr);
2373         nfs_rw_exit(&drp->r_rwlock);
2374         return (error);
2375 }
2376
2377 /* ARGSUSED */
2378 static int
2379 nfs3create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
2380         int mode, vnode_t **vpp, cred_t *cr)
2381 {
2382         int error;
2383         CREATE3args args;
2384         CREATE3res res;
2385         int douprintf;
2386         vnode_t *vp;
2387         struct vattr vattr;
2388         nfstime3 *verfp;
2389         rnode_t *rp;
2390         timestruc_t now;
2391         hrtime_t t;
2392
2393         ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
2394         setdiropargs3(&args.where, nm, dvp);
2395         if (exclusive == EXCL) {
2396                 args.how.mode = EXCLUSIVE;
2397                 /*
2398                  * Construct the create verifier.  This verifier needs
2399                  * to be unique between different clients.  It also needs
2400                  * to vary for each exclusive create request generated
2401                  * from the client to the server.
2402                  *
2403                  * The first attempt is made to use the hostid and a
2404                  * unique number on the client.  If the hostid has not
2405                  * been set, the high resolution time that the exclusive
2406                  * create request is being made is used.  This will work
2407                  * unless two different clients, both with the hostid
2408                  * not set, attempt an exclusive create request on the
2409                  * same file, at exactly the same clock time.  The
2410                  * chances of this happening seem small enough to be
2411                  * reasonable.
2412                  */
2413                 verfp = (nfstime3 *)&args.how.createhow3_u.verf;
2414                 verfp->seconds = zone_get_hostid(NULL);
2415                 if (verfp->seconds != 0)
2416                         verfp->nseconds = newnum();
2417                 else {
2418                         gethrestime(&now);
2419                         verfp->seconds = now.tv_sec;
2420                         verfp->nseconds = now.tv_nsec;
2421                 }
2422                 /*
2423                  * Since the server will use this value for the mtime,
2424                  * make sure that it can't overflow. Zero out the MSB.
2425                  * The actual value does not matter here, only its uniqeness.
2426                  */
2427                 verfp->seconds %= INT32_MAX;
2428         } else {
2429                 /*
2430                  * Issue the non-exclusive create in guarded mode.  This
2431                  * may result in some false EEXIST responses for
2432                  * retransmitted requests, but these will be handled at
2433                  * a higher level.  By using GUARDED, duplicate requests
2434                  * to do file truncation and possible access problems
2435                  * can be avoided.
2436                  */
2437                 args.how.mode = GUARDED;
2438                 error = vattr_to_sattr3(va,
2439                     &args.how.createhow3_u.obj_attributes);
2440                 if (error) {
2441                         /* req time field(s) overflow - return immediately */
2442                         return (error);
2443                 }
2444         }
2445
2446         douprintf = 1;
2447
2448         t = gethrtime();
2449
2450         error = rfs3call(VTOMI(dvp), NFSPROC3_CREATE,
2451             xdr_CREATE3args, (caddr_t)&args,
2452             xdr_CREATE3res, (caddr_t)&res, cr,
2453             &douprintf, &res.status, 0, NULL);
2454
2455         if (error) {
2456                 PURGE_ATTRCACHE(dvp);
2457                 return (error);
2458         }
2459
2460         error = geterrno3(res.status);
2461         if (!error) {
2462                 nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, cr);
2463                 if (HAVE_RDDIR_CACHE(VTOR(dvp)))
2464                         nfs_purge_rddir_cache(dvp);
2465
2466                 /*
2467                  * On exclusive create the times need to be explicitly
2468                  * set to clear any potential verifier that may be stored
2469                  * in one of these fields (see comment below).  This
2470                  * is done here to cover the case where no post op attrs
2471                  * were returned or a 'invalid' time was returned in
2472                  * the attributes.
2473                  */
2474                 if (exclusive == EXCL)
2475                         va->va_mask |= (VATTR_MTIME | VATTR_ATIME);
2476
2477                 if (!res.resok.obj.handle_follows) {
2478                         error = nfs3lookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
2479                         if (error)
2480                                 return (error);
2481                 } else {
2482                         if (res.resok.obj_attributes.attributes) {
2483                                 vp = makenfs3node(&res.resok.obj.handle,
2484                                     &res.resok.obj_attributes.attr,
2485                                     dvp->v_vfsp, t, cr, NULL, NULL);
2486                         } else {
2487                                 vp = makenfs3node(&res.resok.obj.handle, NULL,
2488                                     dvp->v_vfsp, t, cr, NULL, NULL);
2489
2490                                 /*
2491                                  * On an exclusive create, it is possible
2492                                  * that attributes were returned but those
2493                                  * postop attributes failed to decode
2494                                  * properly.  If this is the case,
2495                                  * then most likely the atime or mtime
2496                                  * were invalid for our client; this
2497                                  * is caused by the server storing the
2498                                  * create verifier in one of the time
2499                                  * fields(most likely mtime).
2500                                  * So... we are going to setattr just the
2501                                  * atime/mtime to clear things up.
2502                                  */
2503                                 if (exclusive == EXCL) {
2504                                         if (error =
2505                                             nfs3excl_create_settimes(vp,
2506                                             va, cr)) {
2507                                                 /*
2508                                                  * Setting the times failed.
2509                                                  * Remove the file and return
2510                                                  * the error.
2511                                                  */
2512                                                 VN_RELE(vp);
2513                                                 (void) nfs3_remove(dvp,
2514                                                     nm, cr, NULL, 0);
2515                                                 return (error);
2516                                         }
2517                                 }
2518
2519                                 /*
2520                                  * This handles the non-exclusive case
2521                                  * and the exclusive case where no post op
2522                                  * attrs were returned.
2523                                  */
2524                                 if (vp->v_type == VNON) {
2525                                         vattr.va_mask = VATTR_TYPE;
2526                                         error = nfs3getattr(vp, &vattr, cr);
2527                                         if (error) {
2528                                                 VN_RELE(vp);
2529                                                 return (error);
2530                                         }
2531                                         vp->v_type = vattr.va_type;
2532                                 }
2533                         }
2534                         dnlc_update(dvp, nm, vp);
2535                 }
2536
2537                 if (exclusive == EXCL &&
2538                     (va->va_mask & ~(VATTR_GID | VATTR_SIZE))) {
2539                         /*
2540                          * If doing an exclusive create, then generate
2541                          * a SETATTR to set the initial attributes.
2542                          * Try to set the mtime and the atime to the
2543                          * server's current time.  It is somewhat
2544                          * expected that these fields will be used to
2545                          * store the exclusive create cookie.  If not,
2546                          * server implementors will need to know that
2547                          * a SETATTR will follow an exclusive create
2548                          * and the cookie should be destroyed if
2549                          * appropriate. This work may have been done
2550                          * earlier in this function if post op attrs
2551                          * were not available.
2552                          *
2553                          * The VATTR_GID and VATTR_SIZE bits are turned off
2554                          * so that the SETATTR request will not attempt
2555                          * to process these.  The gid will be set
2556                          * separately if appropriate.  The size is turned
2557                          * off because it is assumed that a new file will
2558                          * be created empty and if the file wasn't empty,
2559                          * then the exclusive create will have failed
2560                          * because the file must have existed already.
2561                          * Therefore, no truncate operation is needed.
2562                          */
2563                         va->va_mask &= ~(VATTR_GID | VATTR_SIZE);
2564                         error = nfs3setattr(vp, va, 0, cr);
2565                         if (error) {
2566                                 /*
2567                                  * Couldn't correct the attributes of
2568                                  * the newly created file and the
2569                                  * attributes are wrong.  Remove the
2570                                  * file and return an error to the
2571                                  * application.
2572                                  */
2573                                 VN_RELE(vp);
2574                                 (void) nfs3_remove(dvp, nm, cr, NULL, 0);
2575                                 return (error);
2576                         }
2577                 }
2578
2579                 rp = VTOR(vp);
2580                 if (va->va_gid != rp->r_attr.va_gid) {
2581                         /*
2582                          * If the gid on the file isn't right, then
2583                          * generate a SETATTR to attempt to change
2584                          * it.  This may or may not work, depending
2585                          * upon the server's semantics for allowing
2586                          * file ownership changes.
2587                          */
2588                         va->va_mask = VATTR_GID;
2589                         (void) nfs3setattr(vp, va, 0, cr);
2590                 }
2591
2592                 /*
2593                  * If vnode is a device create special vnode
2594                  */
2595                 if (IS_DEVVP(vp)) {
2596                         *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
2597                         VN_RELE(vp);
2598                 } else
2599                         *vpp = vp;
2600         } else {
2601                 nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, t, cr);
2602                 PURGE_STALE_FH(error, dvp, cr);
2603         }
2604
2605         return (error);
2606 }
2607
2608 /*
2609  * Special setattr function to take care of rest of atime/mtime
2610  * after successful exclusive create.  This function exists to avoid
2611  * handling attributes from the server; exclusive the atime/mtime fields
2612  * may be 'invalid' in client's view and therefore can not be trusted.
2613  */
2614 static int
2615 nfs3excl_create_settimes(vnode_t *vp, struct vattr *vap, cred_t *cr)
2616 {
2617         int error;
2618         uint_t mask;
2619         SETATTR3args args;
2620         SETATTR3res res;
2621         int douprintf;
2622         rnode_t *rp;
2623         hrtime_t t;
2624
2625         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
2626         /* save the caller's mask so that it can be reset later */
2627         mask = vap->va_mask;
2628
2629         rp = VTOR(vp);
2630
2631         args.object = *RTOFH3(rp);
2632         args.guard.check = FALSE;
2633
2634         /* Use the mask to initialize the arguments */
2635         vap->va_mask = 0;
2636         error = vattr_to_sattr3(vap, &args.new_attributes);
2637
2638         /* We want to set just atime/mtime on this request */
2639         args.new_attributes.atime.set_it = SET_TO_SERVER_TIME;
2640         args.new_attributes.mtime.set_it = SET_TO_SERVER_TIME;
2641
2642         douprintf = 1;
2643
2644         t = gethrtime();
2645
2646         error = rfs3call(VTOMI(vp), NFSPROC3_SETATTR,
2647             xdr_SETATTR3args, (caddr_t)&args,
2648             xdr_SETATTR3res, (caddr_t)&res, cr,
2649             &douprintf, &res.status, 0, NULL);
2650
2651         if (error) {
2652                 vap->va_mask = mask;
2653                 return (error);
2654         }
2655
2656         error = geterrno3(res.status);
2657         if (!error) {
2658                 /*
2659                  * It is important to pick up the attributes.
2660                  * Since this is the exclusive create path, the
2661                  * attributes on the initial create were ignored
2662                  * and we need these to have the correct info.
2663                  */
2664                 nfs3_cache_wcc_data(vp, &res.resok.obj_wcc, t, cr);
2665                 /*
2666                  * No need to do the atime/mtime work again so clear
2667                  * the bits.
2668                  */
2669                 mask &= ~(VATTR_ATIME | VATTR_MTIME);
2670         } else {
2671                 nfs3_cache_wcc_data(vp, &res.resfail.obj_wcc, t, cr);
2672         }
2673
2674         vap->va_mask = mask;
2675
2676         return (error);
2677 }
2678
2679 /* ARGSUSED */
2680 static int
2681 nfs3mknod(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
2682         int mode, vnode_t **vpp, cred_t *cr)
2683 {
2684         int error;
2685         MKNOD3args args;
2686         MKNOD3res res;
2687         int douprintf;
2688         vnode_t *vp;
2689         struct vattr vattr;
2690         hrtime_t t;
2691
2692         ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
2693         switch (va->va_type) {
2694         case VCHR:
2695         case VBLK:
2696                 setdiropargs3(&args.where, nm, dvp);
2697                 args.what.type = (va->va_type == VCHR) ? NF3CHR : NF3BLK;
2698                 error = vattr_to_sattr3(va,
2699                     &args.what.mknoddata3_u.device.dev_attributes);
2700                 if (error) {
2701                         /* req time field(s) overflow - return immediately */
2702                         return (error);
2703                 }
2704                 args.what.mknoddata3_u.device.spec.specdata1 =
2705                     getmajor(va->va_rdev);
2706                 args.what.mknoddata3_u.device.spec.specdata2 =
2707                     getminor(va->va_rdev);
2708                 break;
2709
2710         case VFIFO:
2711         case VSOCK:
2712                 setdiropargs3(&args.where, nm, dvp);
2713                 args.what.type = (va->va_type == VFIFO) ? NF3FIFO : NF3SOCK;
2714                 error = vattr_to_sattr3(va,
2715                     &args.what.mknoddata3_u.pipe_attributes);
2716                 if (error) {
2717                         /* req time field(s) overflow - return immediately */
2718                         return (error);
2719                 }
2720                 break;
2721
2722         default:
2723                 return (EINVAL);
2724         }
2725
2726         douprintf = 1;
2727
2728         t = gethrtime();
2729
2730         error = rfs3call(VTOMI(dvp), NFSPROC3_MKNOD,
2731             xdr_MKNOD3args, (caddr_t)&args,
2732             xdr_MKNOD3res, (caddr_t)&res, cr,
2733             &douprintf, &res.status, 0, NULL);
2734
2735         if (error) {
2736                 PURGE_ATTRCACHE(dvp);
2737                 return (error);
2738         }
2739
2740         error = geterrno3(res.status);
2741         if (!error) {
2742                 nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, cr);
2743                 if (HAVE_RDDIR_CACHE(VTOR(dvp)))
2744                         nfs_purge_rddir_cache(dvp);
2745
2746                 if (!res.resok.obj.handle_follows) {
2747                         error = nfs3lookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
2748                         if (error)
2749                                 return (error);
2750                 } else {
2751                         if (res.resok.obj_attributes.attributes) {
2752                                 vp = makenfs3node(&res.resok.obj.handle,
2753                                     &res.resok.obj_attributes.attr,
2754                                     dvp->v_vfsp, t, cr, NULL, NULL);
2755                         } else {
2756                                 vp = makenfs3node(&res.resok.obj.handle, NULL,
2757                                     dvp->v_vfsp, t, cr, NULL, NULL);
2758                                 if (vp->v_type == VNON) {
2759                                         vattr.va_mask = VATTR_TYPE;
2760                                         error = nfs3getattr(vp, &vattr, cr);
2761                                         if (error) {
2762                                                 VN_RELE(vp);
2763                                                 return (error);
2764                                         }
2765                                         vp->v_type = vattr.va_type;
2766                                 }
2767
2768                         }
2769                         dnlc_update(dvp, nm, vp);
2770                 }
2771
2772                 if (va->va_gid != VTOR(vp)->r_attr.va_gid) {
2773                         va->va_mask = VATTR_GID;
2774                         (void) nfs3setattr(vp, va, 0, cr);
2775                 }
2776
2777                 /*
2778                  * If vnode is a device create special vnode
2779                  */
2780                 if (IS_DEVVP(vp)) {
2781                         *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
2782                         VN_RELE(vp);
2783                 } else
2784                         *vpp = vp;
2785         } else {
2786                 nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, t, cr);
2787                 PURGE_STALE_FH(error, dvp, cr);
2788         }
2789         return (error);
2790 }
2791
2792 /*
2793  * Weirdness: if the vnode to be removed is open
2794  * we rename it instead of removing it and nfs_inactive
2795  * will remove the new name.
2796  */
2797 /* ARGSUSED */
2798 static int
2799 nfs3_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct, int flags)
2800 {
2801         int error;
2802         REMOVE3args args;
2803         REMOVE3res res;
2804         vnode_t *vp;
2805         char *tmpname;
2806         int douprintf;
2807         rnode_t *rp;
2808         rnode_t *drp;
2809         hrtime_t t;
2810
2811         if (nfs_zone() != VTOMI(dvp)->mi_zone)
2812                 return (EPERM);
2813         drp = VTOR(dvp);
2814         if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2815                 return (EINTR);
2816
2817         error = nfs3lookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
2818         if (error) {
2819                 nfs_rw_exit(&drp->r_rwlock);
2820                 return (error);
2821         }
2822
2823         if (vp->v_type == VDIR && secpolicy_fs_linkdir(cr, dvp->v_vfsp)) {
2824                 VN_RELE(vp);
2825                 nfs_rw_exit(&drp->r_rwlock);
2826                 return (EPERM);
2827         }
2828
2829         /*
2830          * First just remove the entry from the name cache, as it
2831          * is most likely the only entry for this vp.
2832          */
2833         dnlc_remove(dvp, nm);
2834
2835         /*
2836          * If the file has a v_count > 1 then there may be more than one
2837          * entry in the name cache due multiple links or an open file,
2838          * but we don't have the real reference count so flush all
2839          * possible entries.
2840          */
2841         if (vp->v_count > 1)
2842                 dnlc_purge_vp(vp);
2843
2844         /*
2845          * Now we have the real reference count on the vnode
2846          */
2847         rp = VTOR(vp);
2848         mutex_enter(&rp->r_statelock);
2849         if (vp->v_count > 1 &&
2850             (rp->r_unldvp == NULL || strcmp(nm, rp->r_unlname) == 0)) {
2851                 mutex_exit(&rp->r_statelock);
2852                 tmpname = newname();
2853                 error = nfs3rename(dvp, nm, dvp, tmpname, cr, ct);
2854                 if (error)
2855                         kmem_free(tmpname, MAXNAMELEN);
2856                 else {
2857                         mutex_enter(&rp->r_statelock);
2858                         if (rp->r_unldvp == NULL) {
2859                                 VN_HOLD(dvp);
2860                                 rp->r_unldvp = dvp;
2861                                 if (rp->r_unlcred != NULL)
2862                                         crfree(rp->r_unlcred);
2863                                 crhold(cr);
2864                                 rp->r_unlcred = cr;
2865                                 rp->r_unlname = tmpname;
2866                         } else {
2867                                 kmem_free(rp->r_unlname, MAXNAMELEN);
2868                                 rp->r_unlname = tmpname;
2869                         }
2870                         mutex_exit(&rp->r_statelock);
2871                 }
2872         } else {
2873                 mutex_exit(&rp->r_statelock);
2874                 /*
2875                  * We need to flush any dirty pages which happen to
2876                  * be hanging around before removing the file.  This
2877                  * shouldn't happen very often and mostly on file
2878                  * systems mounted "nocto".
2879                  */
2880                 if (vn_has_cached_data(vp) &&
2881                     ((rp->r_flags & RDIRTY) || rp->r_count > 0)) {
2882                         error = nfs3_putpage(vp, 0, 0, 0, cr, ct);
2883                         if (error && (error == ENOSPC || error == EDQUOT)) {
2884                                 mutex_enter(&rp->r_statelock);
2885                                 if (!rp->r_error)
2886                                         rp->r_error = error;
2887                                 mutex_exit(&rp->r_statelock);
2888                         }
2889                 }
2890
2891                 setdiropargs3(&args.object, nm, dvp);
2892
2893                 douprintf = 1;
2894
2895                 t = gethrtime();
2896
2897                 error = rfs3call(VTOMI(dvp), NFSPROC3_REMOVE,
2898                     xdr_diropargs3, (caddr_t)&args,
2899                     xdr_REMOVE3res, (caddr_t)&res, cr,
2900                     &douprintf, &res.status, 0, NULL);
2901
2902                 /*
2903                  * The xattr dir may be gone after last attr is removed,
2904                  * so flush it from dnlc.
2905                  */
2906                 if (dvp->v_flag & V_XATTRDIR)
2907                         dnlc_purge_vp(dvp);
2908
2909                 PURGE_ATTRCACHE(vp);
2910
2911                 if (error) {
2912                         PURGE_ATTRCACHE(dvp);
2913                 } else {
2914                         error = geterrno3(res.status);
2915                         if (!error) {
2916                                 nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t,
2917                                     cr);
2918                                 if (HAVE_RDDIR_CACHE(drp))
2919                                         nfs_purge_rddir_cache(dvp);
2920                         } else {
2921                                 nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc,
2922                                     t, cr);
2923                                 PURGE_STALE_FH(error, dvp, cr);
2924                         }
2925                 }
2926         }
2927
2928         if (error == 0) {
2929                 vnevent_remove(vp, dvp, nm, ct);
2930         }
2931         VN_RELE(vp);
2932
2933         nfs_rw_exit(&drp->r_rwlock);
2934
2935         return (error);
2936 }
2937
2938 /* ARGSUSED */
2939 static int
2940 nfs3_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr,
2941         caller_context_t *ct, int flags)
2942 {
2943         int error;
2944         LINK3args args;
2945         LINK3res res;
2946         vnode_t *realvp;
2947         int douprintf;
2948         mntinfo_t *mi;
2949         rnode_t *tdrp;
2950         hrtime_t t;
2951
2952         if (nfs_zone() != VTOMI(tdvp)->mi_zone)
2953                 return (EPERM);
2954         if (fop_realvp(svp, &realvp, ct) == 0)
2955                 svp = realvp;
2956
2957         mi = VTOMI(svp);
2958
2959         if (!(mi->mi_flags & MI_LINK))
2960                 return (EOPNOTSUPP);
2961
2962         args.file = *VTOFH3(svp);
2963         setdiropargs3(&args.link, tnm, tdvp);
2964
2965         tdrp = VTOR(tdvp);
2966         if (nfs_rw_enter_sig(&tdrp->r_rwlock, RW_WRITER, INTR(tdvp)))
2967                 return (EINTR);
2968
2969         dnlc_remove(tdvp, tnm);
2970
2971         douprintf = 1;
2972
2973         t = gethrtime();
2974
2975         error = rfs3call(mi, NFSPROC3_LINK,
2976             xdr_LINK3args, (caddr_t)&args,
2977             xdr_LINK3res, (caddr_t)&res, cr,
2978             &douprintf, &res.status, 0, NULL);
2979
2980         if (error) {
2981                 PURGE_ATTRCACHE(tdvp);
2982                 PURGE_ATTRCACHE(svp);
2983                 nfs_rw_exit(&tdrp->r_rwlock);
2984                 return (error);
2985         }
2986
2987         error = geterrno3(res.status);
2988
2989         if (!error) {
2990                 nfs3_cache_post_op_attr(svp, &res.resok.file_attributes, t, cr);
2991                 nfs3_cache_wcc_data(tdvp, &res.resok.linkdir_wcc, t, cr);
2992                 if (HAVE_RDDIR_CACHE(tdrp))
2993                         nfs_purge_rddir_cache(tdvp);
2994                 dnlc_update(tdvp, tnm, svp);
2995         } else {
2996                 nfs3_cache_post_op_attr(svp, &res.resfail.file_attributes, t,
2997                     cr);
2998                 nfs3_cache_wcc_data(tdvp, &res.resfail.linkdir_wcc, t, cr);
2999                 if (error == EOPNOTSUPP) {
3000                         mutex_enter(&mi->mi_lock);
3001                         mi->mi_flags &= ~MI_LINK;
3002                         mutex_exit(&mi->mi_lock);
3003                 }
3004         }
3005
3006         nfs_rw_exit(&tdrp->r_rwlock);
3007
3008         if (!error) {
3009                 /*
3010                  * Notify the source file of this link operation.
3011                  */
3012                 vnevent_link(svp, ct);
3013         }
3014         return (error);
3015 }
3016
3017 /* ARGSUSED */
3018 static int
3019 nfs3_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
3020         caller_context_t *ct, int flags)
3021 {
3022         vnode_t *realvp;
3023
3024         if (nfs_zone() != VTOMI(odvp)->mi_zone)
3025                 return (EPERM);
3026         if (fop_realvp(ndvp, &realvp, ct) == 0)
3027                 ndvp = realvp;
3028
3029         return (nfs3rename(odvp, onm, ndvp, nnm, cr, ct));
3030 }
3031
3032 /*
3033  * nfs3rename does the real work of renaming in NFS Version 3.
      *
      * Renames the entry onm in directory odvp to nnm in directory ndvp by
      * issuing an NFSPROC3_RENAME call with credentials cr.  ct is only
      * passed through to the vnevent_*() notifications posted on success.
      *
      * The r_rwlock of both directory rnodes is taken as writer before any
      * work is done and is dropped again on every return path.
      *
      * If the target name already exists, is not a directory and still has
      * more than one hold after its name cache entries were purged, it is
      * first linked (or, when the link fails with EOPNOTSUPP, renamed) to a
      * temporary name.  That name is recorded in the target's rnode
      * (r_unldvp, r_unlcred, r_unlname) so that the file remains reachable
      * once the rename has replaced its original name.
      *
      * Returns 0 on success, otherwise an errno value:
      *   EINVAL  - onm or nnm is "." or "..".
      *   EINTR   - nfs_rw_enter_sig() failed for one of the directory locks.
      *   EBUSY   - the target has a file system mounted on it.
      *   ENOTDIR - the source is a directory while the active target is not.
      *   EEXIST  - the server answered ENOTEMPTY.
      * Errors from nfs3lookup(), nfs3_link(), nfs3_rename() and rfs3call(),
      * and any other status translated by geterrno3(), are returned as is.
3034  */
3035 static int
3036 nfs3rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
3037     caller_context_t *ct)
3038 {
3039         int error;
3040         RENAME3args args;
3041         RENAME3res res;
3042         int douprintf;
             /* nvp: existing target, ovp: source; non-NULL means we hold it. */
3043         vnode_t *nvp = NULL;
3044         vnode_t *ovp = NULL;
3045         char *tmpname;
3046         rnode_t *rp;
3047         rnode_t *odrp;
3048         rnode_t *ndrp;
3049         hrtime_t t;
3050
3051         ASSERT(nfs_zone() == VTOMI(odvp)->mi_zone);
3052
3053         if (strcmp(onm, ".") == 0 || strcmp(onm, "..") == 0 ||
3054             strcmp(nnm, ".") == 0 || strcmp(nnm, "..") == 0)
3055                 return (EINVAL);
3056
3057         odrp = VTOR(odvp);
3058         ndrp = VTOR(ndvp);
             /*
              * Take both directory locks as writer, always locking the rnode
              * with the lower address first.  NOTE(review): presumably this
              * fixed order is what keeps concurrent renames between the same
              * two directories from deadlocking - confirm.  When odvp and
              * ndvp are the same directory the else branch enters the same
              * lock twice (and every exit path releases it twice); this
              * relies on nfs_rw_enter_sig() accepting re-entry by the current
              * writer - TODO confirm.
              */
3059         if ((intptr_t)odrp < (intptr_t)ndrp) {
3060                 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR(odvp)))
3061                         return (EINTR);
3062                 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR(ndvp))) {
3063                         nfs_rw_exit(&odrp->r_rwlock);
3064                         return (EINTR);
3065                 }
3066         } else {
3067                 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR(ndvp)))
3068                         return (EINTR);
3069                 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR(odvp))) {
3070                         nfs_rw_exit(&ndrp->r_rwlock);
3071                         return (EINTR);
3072                 }
3073         }
3074
3075         /*
3076          * Lookup the target file.  If it exists, it needs to be
3077          * checked to see whether it is a mount point and whether
3078          * it is active (open).
              *
              * A failed lookup is not an error here: nvp simply stays NULL
              * and the rename proceeds without the special handling below.
3079          */
3080         error = nfs3lookup(ndvp, nnm, &nvp, NULL, 0, NULL, cr, 0);
3081         if (!error) {
3082                 /*
3083                  * If this file has been mounted on, then just
3084                  * return busy because renaming to it would remove
3085                  * the mounted file system from the name space.
3086                  */
3087                 if (vn_mountedvfs(nvp) != NULL) {
3088                         VN_RELE(nvp);
3089                         nfs_rw_exit(&odrp->r_rwlock);
3090                         nfs_rw_exit(&ndrp->r_rwlock);
3091                         return (EBUSY);
3092                 }
3093
3094                 /*
3095                  * Purge the name cache of all references to this vnode
3096                  * so that we can check the reference count to infer
3097                  * whether it is active or not.
3098                  */
3099                 /*
3100                  * First just remove the entry from the name cache, as it
3101                  * is most likely the only entry for this vp.
3102                  */
3103                 dnlc_remove(ndvp, nnm);
3104                 /*
3105                  * If the file has a v_count > 1 then there may be more
3106                  * than one entry in the name cache due to multiple links
3107                  * or an open file, but we don't have the real reference
3108                  * count so flush all possible entries.
3109                  */
3110                 if (nvp->v_count > 1)
3111                         dnlc_purge_vp(nvp);
3112
3113                 /*
3114                  * If the vnode is active and is not a directory,
3115                  * arrange to rename it to a
3116                  * temporary file so that it will continue to be
3117                  * accessible.  This implements the "unlink-open-file"
3118                  * semantics for the target of a rename operation.
3119                  * Before doing this though, make sure that the
3120                  * source and target files are not already the same.
3121                  */
3122                 if (nvp->v_count > 1 && nvp->v_type != VDIR) {
3123                         /*
3124                          * Lookup the source name.
3125                          */
3126                         error = nfs3lookup(odvp, onm, &ovp, NULL, 0, NULL,
3127                             cr, 0);
3128
3129                         /*
3130                          * The source name *should* already exist.
3131                          */
3132                         if (error) {
3133                                 VN_RELE(nvp);
3134                                 nfs_rw_exit(&odrp->r_rwlock);
3135                                 nfs_rw_exit(&ndrp->r_rwlock);
3136                                 return (error);
3137                         }
3138
3139                         /*
3140                          * Compare the two vnodes.  If they are the same,
3141                          * just release all held vnodes and return success.
3142                          */
3143                         if (ovp == nvp) {
3144                                 VN_RELE(ovp);
3145                                 VN_RELE(nvp);
3146                                 nfs_rw_exit(&odrp->r_rwlock);
3147                                 nfs_rw_exit(&ndrp->r_rwlock);
3148                                 return (0);
3149                         }
3150
3151                         /*
3152                          * Can't mix and match directories and non-
3153                          * directories in rename operations.  We already
3154                          * know that the target is not a directory.  If
3155                          * the source is a directory, return an error.
3156                          */
3157                         if (ovp->v_type == VDIR) {
3158                                 VN_RELE(ovp);
3159                                 VN_RELE(nvp);
3160                                 nfs_rw_exit(&odrp->r_rwlock);
3161                                 nfs_rw_exit(&ndrp->r_rwlock);
3162                                 return (ENOTDIR);
3163                         }
3164
3165                         /*
3166                          * The target file exists, is not the same as
3167                          * the source file, and is active.  Link it
3168                          * to a temporary filename to avoid having
3169                          * the server removing the file completely.
                              *
                              * If the link attempt reports EOPNOTSUPP, the
                              * target is renamed to the temporary name
                              * instead.  Both helpers are called with a NULL
                              * caller context and no flags.
3170                          */
3171                         tmpname = newname();
3172                         error = nfs3_link(ndvp, nvp, tmpname, cr, NULL, 0);
3173                         if (error == EOPNOTSUPP) {
3174                                 error = nfs3_rename(ndvp, nnm, ndvp, tmpname,
3175                                     cr, NULL, 0);
3176                         }
3177                         if (error) {
                                     /* tmpname is a MAXNAMELEN sized buffer. */
3178                                 kmem_free(tmpname, MAXNAMELEN);
3179                                 VN_RELE(ovp);
3180                                 VN_RELE(nvp);
3181                                 nfs_rw_exit(&odrp->r_rwlock);
3182                                 nfs_rw_exit(&ndrp->r_rwlock);
3183                                 return (error);
3184                         }
                             /*
                              * Record the temporary name in the target's rnode;
                              * from here on the rnode owns the tmpname buffer.
                              * With no record present, the directory (with a
                              * new hold), the credentials (with a new hold,
                              * dropping any stale ones) and the name are all
                              * stored.  With a record already present, only
                              * the name is replaced and the old buffer freed.
                              * NOTE(review): presumably this record is used
                              * later to remove the temporary name - confirm
                              * against the consumer of r_unldvp.
                              */
3185                         rp = VTOR(nvp);
3186                         mutex_enter(&rp->r_statelock);
3187                         if (rp->r_unldvp == NULL) {
3188                                 VN_HOLD(ndvp);
3189                                 rp->r_unldvp = ndvp;
3190                                 if (rp->r_unlcred != NULL)
3191                                         crfree(rp->r_unlcred);
3192                                 crhold(cr);
3193                                 rp->r_unlcred = cr;
3194                                 rp->r_unlname = tmpname;
3195                         } else {
3196                                 kmem_free(rp->r_unlname, MAXNAMELEN);
3197                                 rp->r_unlname = tmpname;
3198                         }
3199                         mutex_exit(&rp->r_statelock);
3200                 }
3201         }
3202
3203         if (ovp == NULL) {
3204                 /*
3205                  * When renaming directories to be a subdirectory of a
3206                  * different parent, the dnlc entry for ".." will no
3207                  * longer be valid, so it must be removed.
3208                  *
3209                  * We do a lookup here to determine whether we are renaming
3210                  * a directory and we need to check if we are renaming
3211                  * an unlinked file.  This might have already been done
3212                  * in previous code, so we check ovp == NULL to avoid
3213                  * doing it twice.
3214                  */
3215
3216                 error = nfs3lookup(odvp, onm, &ovp, NULL, 0, NULL, cr, 0);
3217                 /*
3218                  * The source name *should* already exist.
3219                  */
3220                 if (error) {
3221                         nfs_rw_exit(&odrp->r_rwlock);
3222                         nfs_rw_exit(&ndrp->r_rwlock);
3223                         if (nvp) {
3224                                 VN_RELE(nvp);
3225                         }
3226                         return (error);
3227                 }
3228                 ASSERT(ovp != NULL);
3229         }
3230
             /*
              * Drop the name cache entries for both names before going over
              * the wire.
              */
3231         dnlc_remove(odvp, onm);
3232         dnlc_remove(ndvp, nnm);
3233
3234         setdiropargs3(&args.from, onm, odvp);
3235         setdiropargs3(&args.to, nnm, ndvp);
3236
3237         douprintf = 1;
3238
             /* t is handed to nfs3_cache_wcc_data() together with the reply. */
3239         t = gethrtime();
3240
3241         error = rfs3call(VTOMI(odvp), NFSPROC3_RENAME,
3242             xdr_RENAME3args, (caddr_t)&args,
3243             xdr_RENAME3res, (caddr_t)&res, cr,
3244             &douprintf, &res.status, 0, NULL);
3245
             /*
              * The call itself failed, so res.status is not examined:
              * PURGE_ATTRCACHE both directories, release everything and
              * return the error.
              */
3246         if (error) {
3247                 PURGE_ATTRCACHE(odvp);
3248                 PURGE_ATTRCACHE(ndvp);
3249                 VN_RELE(ovp);
3250                 nfs_rw_exit(&odrp->r_rwlock);
3251                 nfs_rw_exit(&ndrp->r_rwlock);
3252                 if (nvp) {
3253                         VN_RELE(nvp);
3254                 }
3255                 return (error);
3256         }
3257
3258         error = geterrno3(res.status);
3259
3260         if (!error) {
3261                 nfs3_cache_wcc_data(odvp, &res.resok.fromdir_wcc, t, cr);
3262                 if (HAVE_RDDIR_CACHE(odrp))
3263                         nfs_purge_rddir_cache(odvp);
3264                 if (ndvp != odvp) {
3265                         nfs3_cache_wcc_data(ndvp, &res.resok.todir_wcc, t, cr);
3266                         if (HAVE_RDDIR_CACHE(ndrp))
3267                                 nfs_purge_rddir_cache(ndvp);
3268                 }
3269                 /*
3270                  * when renaming directories to be a subdirectory of a
3271                  * different parent, the dnlc entry for ".." will no
3272                  * longer be valid, so it must be removed
3273                  */
3274                 rp = VTOR(ovp);
3275                 if (ndvp != odvp) {
3276                         if (ovp->v_type == VDIR) {
3277                                 dnlc_remove(ovp, "..");
3278                                 if (HAVE_RDDIR_CACHE(rp))
3279                                         nfs_purge_rddir_cache(ovp);
3280                         }
3281                 }
3282
3283                 /*
3284                  * If we are renaming the unlinked file, update the
3285                  * r_unldvp and r_unlname as needed.
                      *
                      * The recorded name is overwritten in place.  strncpy()
                      * does not terminate the destination when the source
                      * fills it, hence the explicit terminator.  The hold on
                      * the recorded directory is moved to ndvp if it changed.
3286                  */
3287                 mutex_enter(&rp->r_statelock);
3288                 if (rp->r_unldvp != NULL) {
3289                         if (strcmp(rp->r_unlname, onm) == 0) {
3290                                 (void) strncpy(rp->r_unlname, nnm, MAXNAMELEN);
3291                                 rp->r_unlname[MAXNAMELEN - 1] = '\0';
3292
3293                                 if (ndvp != rp->r_unldvp) {
3294                                         VN_RELE(rp->r_unldvp);
3295                                         rp->r_unldvp = ndvp;
3296                                         VN_HOLD(ndvp);
3297                                 }
3298                         }
3299                 }
3300                 mutex_exit(&rp->r_statelock);
3301         } else {
3302                 nfs3_cache_wcc_data(odvp, &res.resfail.fromdir_wcc, t, cr);
3303                 if (ndvp != odvp) {
3304                         nfs3_cache_wcc_data(ndvp, &res.resfail.todir_wcc, t,
3305                             cr);
3306                 }
3307                 /*
3308                  * System V defines rename to return EEXIST, not
3309                  * ENOTEMPTY if the target directory is not empty.
3310                  * Over the wire, the error is NFSERR_ENOTEMPTY
3311                  * which geterrno maps to ENOTEMPTY.
3312                  */
3313                 if (error == ENOTEMPTY)
3314                         error = EEXIST;
3315         }
3316
             /*
              * Post the rename events only for a successful rename: one for
              * a replaced target (if there was one), one for the destination
              * directory when it differs from the source directory, and one
              * for the source.
              */
3317         if (error == 0) {
3318                 if (nvp)
3319                         vnevent_rename_dest(nvp, ndvp, nnm, ct);
3320
3321                 if (odvp != ndvp)
3322                         vnevent_rename_dest_dir(ndvp, ct);
3323                 ASSERT(ovp != NULL);
3324                 vnevent_rename_src(ovp, odvp, onm, ct);
3325         }
3326
3327         if (nvp) {
3328                 VN_RELE(nvp);
3329         }
3330         VN_RELE(ovp);
3331
3332         nfs_rw_exit(&odrp->r_rwlock);
3333         nfs_rw_exit(&ndrp->r_rwlock);
3334
3335         return (error);
3336 }
3337
3338 /* ARGSUSED */
3339 static int
3340 nfs3_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, cred_t *cr,
3341         caller_context_t *ct, int flags, vsecattr_t *vsecp)
3342 {
3343         int error;
3344         MKDIR3args args;
3345         MKDIR3res res;
3346         int douprintf;
3347         struct vattr vattr;
3348         vnode_t *vp;
3349         rnode_t *drp;
3350         hrtime_t t;
3351
3352         if (nfs_zone() != VTOMI(dvp)->mi_zone)
3353                 return (EPERM);
3354         setdiropargs3(&args.where, nm, dvp);
3355
3356         /*
3357          * Decide what the group-id and set-gid bit of the created directory
3358          * should be.  May have to do a setattr to get the gid right.
3359          */
3360         error = setdirgid(dvp, &va->va_gid, cr);
3361         if (error)
3362                 return (error);
3363         error = setdirmode(dvp, &va->va_mode, cr);
3364         if (error)
3365                 return (error);
3366         va->va_mask |= VATTR_MODE|VATTR_GID;
3367
3368         error = vattr_to_sattr3(va, &args.attributes);
3369         if (error) {
3370                 /* req time field(s) overflow - return immediately */
3371                 return (error);
3372         }
3373
3374         drp = VTOR(dvp);
3375         if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
3376                 return (EINTR);
3377
3378         dnlc_remove(dvp, nm);
3379
3380         douprintf = 1;
3381
3382         t = gethrtime();
3383
3384         error = rfs3call(VTOMI(dvp), NFSPROC3_MKDIR,
3385             xdr_MKDIR3args, (caddr_t)&args,
3386             xdr_MKDIR3res, (caddr_t)&res, cr,
3387             &douprintf, &res.status, 0, NULL);
3388
3389         if (error) {
3390                 PURGE_ATTRCACHE(dvp);
3391                 nfs_rw_exit(&drp->r_rwlock);
3392                 return (error);
3393         }
3394
3395         error = geterrno3(res.status);
3396         if (!error) {
3397                 nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, cr);
3398                 if (HAVE_RDDIR_CACHE(drp))
3399                         nfs_purge_rddir_cache(dvp);
3400
3401                 if (!res.resok.obj.handle_follows) {
3402                         error = nfs3lookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
3403                         if (error) {
3404                                 nfs_rw_exit(&drp->r_rwlock);
3405                                 return (error);
3406                         }
3407                 } else {
3408                         if (res.resok.obj_attributes.attributes) {
3409                                 vp = makenfs3node(&res.resok.obj.handle,
3410                                     &res.resok.obj_attributes.attr,
3411                                     dvp->v_vfsp, t, cr, NULL, NULL);
3412                         } else {
3413                                 vp = makenfs3node(&res.resok.obj.handle, NULL,
3414                                     dvp->v_vfsp, t, cr, NULL, NULL);
3415                                 if (vp->v_type == VNON) {
3416                                         vattr.va_mask = VATTR_TYPE;
3417                                         error = nfs3getattr(vp, &vattr, cr);
3418                                         if (error) {
3419                                                 VN_RELE(vp);
3420                                                 nfs_rw_exit(&drp->r_rwlock);
3421                                                 return (error);
3422                                         }
3423                                         vp->v_type = vattr.va_type;
3424                                 }
3425                         }
3426                         dnlc_update(dvp, nm, vp);
3427                 }
3428                 if (va->va_gid != VTOR(vp)->r_attr.va_gid) {
3429                         va->va_mask = VATTR_GID;
3430                         (void) nfs3setattr(vp, va, 0, cr);
3431                 }
3432                 *vpp = vp;
3433         } else {
3434                 nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, t, cr);
3435                 PURGE_STALE_FH(error, dvp, cr);
3436         }
3437
3438         nfs_rw_exit(&drp->r_rwlock);
3439
3440         return (error);
3441 }
3442
3443 /* ARGSUSED */
3444 static int
3445 nfs3_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
3446         caller_context_t *ct, int flags)
3447 {
3448         int error;
3449         RMDIR3args args;
3450         RMDIR3res res;
3451         vnode_t *vp;
3452         int douprintf;
3453         rnode_t *drp;
3454         hrtime_t t;
3455
3456         if (nfs_zone() != VTOMI(dvp)->mi_zone)
3457                 return (EPERM);
3458         drp = VTOR(dvp);
3459         if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
3460                 return (EINTR);
3461
3462         /*
3463          * Attempt to prevent a rmdir(".") from succeeding.
3464          */
3465         error = nfs3lookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
3466         if (error) {
3467                 nfs_rw_exit(&drp->r_rwlock);
3468                 return (error);
3469         }
3470
3471         if (vp == cdir) {
3472                 VN_RELE(vp);
3473                 nfs_rw_exit(&drp->r_rwlock);
3474                 return (EINVAL);
3475         }
3476
3477         setdiropargs3(&args.object, nm, dvp);
3478
3479         /*
3480          * First just remove the entry from the name cache, as it
3481          * is most likely an entry for this vp.
3482          */
3483         dnlc_remove(dvp, nm);
3484
3485         /*
3486          * If there vnode reference count is greater than one, then
3487          * there may be additional references in the DNLC which will
3488          * need to be purged.  First, trying removing the entry for
3489          * the parent directory and see if that removes the additional
3490          * reference(s).  If that doesn't do it, then use dnlc_purge_vp
3491          * to completely remove any references to the directory which
3492          * might still exist in the DNLC.
3493          */
3494         if (vp->v_count > 1) {
3495                 dnlc_remove(vp, "..");
3496                 if (vp->v_count > 1)
3497                         dnlc_purge_vp(vp);
3498         }
3499
3500         douprintf = 1;
3501
3502         t = gethrtime();
3503
3504         error = rfs3call(VTOMI(dvp), NFSPROC3_RMDIR,
3505             xdr_diropargs3, (caddr_t)&args,
3506             xdr_RMDIR3res, (caddr_t)&res, cr,
3507             &douprintf, &res.status, 0, NULL);
3508
3509         PURGE_ATTRCACHE(vp);
3510
3511         if (error) {
3512                 PURGE_ATTRCACHE(dvp);
3513                 VN_RELE(vp);
3514                 nfs_rw_exit(&drp->r_rwlock);
3515                 return (error);
3516         }
3517
3518         error = geterrno3(res.status);
3519         if (!error) {
3520                 nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, cr);
3521                 if (HAVE_RDDIR_CACHE(drp))
3522                         nfs_purge_rddir_cache(dvp);
3523                 if (HAVE_RDDIR_CACHE(VTOR(vp)))
3524                         nfs_purge_rddir_cache(vp);
3525         } else {
3526                 nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, t, cr);
3527                 PURGE_STALE_FH(error, dvp, cr);
3528                 /*
3529                  * System V defines rmdir to return EEXIST, not
3530                  * ENOTEMPTY if the directory is not empty.  Over
3531                  * the wire, the error is NFSERR_ENOTEMPTY which
3532                  * geterrno maps to ENOTEMPTY.
3533                  */
3534                 if (error == ENOTEMPTY)
3535                         error = EEXIST;
3536         }
3537
3538         if (error == 0) {
3539                 vnevent_rmdir(vp, dvp, nm, ct);
3540         }
3541         VN_RELE(vp);
3542
3543         nfs_rw_exit(&drp->r_rwlock);
3544
3545         return (error);
3546 }
3547
3548 /* ARGSUSED */
3549 static int
3550 nfs3_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm, cred_t *cr,
3551         caller_context_t *ct, int flags)
3552 {
3553         int error;
3554         SYMLINK3args args;
3555         SYMLINK3res res;
3556         int douprintf;
3557         mntinfo_t *mi;
3558         vnode_t *vp;
3559         rnode_t *rp;
3560         char *contents;
3561         rnode_t *drp;
3562         hrtime_t t;
3563
3564         mi = VTOMI(dvp);
3565
3566         if (nfs_zone() != mi->mi_zone)
3567                 return (EPERM);
3568         if (!(mi->mi_flags & MI_SYMLINK))
3569                 return (EOPNOTSUPP);
3570
3571         setdiropargs3(&args.where, lnm, dvp);
3572         error = vattr_to_sattr3(tva, &args.symlink.symlink_attributes);
3573         if (error) {
3574                 /* req time field(s) overflow - return immediately */
3575                 return (error);
3576         }
3577         args.symlink.symlink_data = tnm;
3578
3579         drp = VTOR(dvp);
3580         if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
3581                 return (EINTR);
3582
3583         dnlc_remove(dvp, lnm);
3584
3585         douprintf = 1;
3586
3587         t = gethrtime();
3588
3589         error = rfs3call(mi, NFSPROC3_SYMLINK,
3590             xdr_SYMLINK3args, (caddr_t)&args,
3591             xdr_SYMLINK3res, (caddr_t)&res, cr,
3592             &douprintf, &res.status, 0, NULL);
3593
3594         if (error) {
3595                 PURGE_ATTRCACHE(dvp);
3596                 nfs_rw_exit(&drp->r_rwlock);
3597                 return (error);
3598         }
3599
3600         error = geterrno3(res.status);
3601         if (!error) {
3602                 nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, cr);
3603                 if (HAVE_RDDIR_CACHE(drp))
3604                         nfs_purge_rddir_cache(dvp);
3605
3606                 if (res.resok.obj.handle_follows) {
3607                         if (res.resok.obj_attributes.attributes) {
3608                                 vp = makenfs3node(&res.resok.obj.handle,
3609                                     &res.resok.obj_attributes.attr,
3610                                     dvp->v_vfsp, t, cr, NULL, NULL);
3611                         } else {
3612                                 vp = makenfs3node(&res.resok.obj.handle, NULL,
3613                                     dvp->v_vfsp, t, cr, NULL, NULL);
3614                                 vp->v_type = VLNK;
3615                                 vp->v_rdev = 0;
3616                         }
3617                         dnlc_update(dvp, lnm, vp);
3618                         rp = VTOR(vp);
3619                         if (nfs3_do_symlink_cache &&
3620                             rp->r_symlink.contents == NULL) {
3621
3622                                 contents = kmem_alloc(MAXPATHLEN,
3623                                     KM_NOSLEEP);
3624
3625                                 if (contents != NULL) {
3626                                         mutex_enter(&rp->r_statelock);
3627                                         if (rp->r_symlink.contents == NULL) {
3628                                                 rp->r_symlink.len = strlen(tnm);
3629                                                 bcopy(tnm, contents,
3630                                                     rp->r_symlink.len);
3631                                                 rp->r_symlink.contents =
3632                                                     contents;
3633                                                 rp->r_symlink.size = MAXPATHLEN;
3634                                                 mutex_exit(&rp->r_statelock);
3635                                         } else {
3636                                                 mutex_exit(&rp->r_statelock);
3637                                                 kmem_free((void *)contents,
3638                                                     MAXPATHLEN);
3639                                         }
3640                                 }
3641                         }
3642                         VN_RELE(vp);
3643                 }
3644         } else {
3645                 nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, t, cr);
3646                 PURGE_STALE_FH(error, dvp, cr);
3647                 if (error == EOPNOTSUPP) {
3648                         mutex_enter(&mi->mi_lock);
3649                         mi->mi_flags &= ~MI_SYMLINK;
3650                         mutex_exit(&mi->mi_lock);
3651                 }
3652         }
3653
3654         nfs_rw_exit(&drp->r_rwlock);
3655
3656         return (error);
3657 }
3658
     /*
      * Counters recording how nfs3_readdir() used the readdir cache.  They
      * exist only when DEBUG is defined.
      */
3659 #ifdef DEBUG
     /* Lookups that found a usable cache entry with no miss recorded. */
3660 static int nfs3_readdir_cache_hits = 0;
     /* Requests answered at once because the offset is the r_direof cookie. */
3661 static int nfs3_readdir_cache_shorts = 0;
     /* Times a caller had to wait for an entry that was still being filled. */
3662 static int nfs3_readdir_cache_waits = 0;
     /* NOTE(review): updated outside the code reviewed; presumably misses. */
3663 static int nfs3_readdir_cache_misses = 0;
     /* NOTE(review): updated outside the code reviewed; presumably readaheads. */
3664 static int nfs3_readdir_readahead = 0;
3665 #endif
3666
     /*
      * When nonzero, nfs3_readdir() limits its request size to 1024 instead
      * of MAXBSIZE.
      */
3667 static int nfs3_shrinkreaddir = 0;
3668
3669 /*
3670  * Read directory entries.
3671  * There are some weird things to look out for here.  The uio_loffset
3672  * field is either 0 or it is the offset returned from a previous
3673  * readdir.  It is an opaque value used by the server to find the
3674  * correct directory block to read. The count field is the number
3675  * of blocks to read on the server.  This is advisory only, the server
3676  * may return only one block's worth of entries.  Entries may be compressed
3677  * on the server.
3678  */
3679 /* ARGSUSED */
3680 static int
3681 nfs3_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp,
3682         caller_context_t *ct, int flags)
3683 {
3684         int error;
3685         size_t count;
3686         rnode_t *rp;
3687         rddir_cache *rdc;
3688         rddir_cache *nrdc;
3689         rddir_cache *rrdc;
3690 #ifdef DEBUG
3691         int missed;
3692 #endif
3693         int doreadahead;
3694         rddir_cache srdc;
3695         avl_index_t where;
3696
3697         if (nfs_zone() != VTOMI(vp)->mi_zone)
3698                 return (EIO);
3699         rp = VTOR(vp);
3700
3701         ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
3702
3703         /*
3704          * Make sure that the directory cache is valid.
3705          */
3706         if (HAVE_RDDIR_CACHE(rp)) {
3707                 if (nfs_disable_rddir_cache) {
3708                         /*
3709                          * Setting nfs_disable_rddir_cache in /etc/system
3710                          * allows interoperability with servers that do not
3711                          * properly update the attributes of directories.
3712                          * Any cached information gets purged before an
3713                          * access is made to it.
3714                          */
3715                         nfs_purge_rddir_cache(vp);
3716                 } else {
3717                         error = nfs3_validate_caches(vp, cr);
3718                         if (error)
3719                                 return (error);
3720                 }
3721         }
3722
3723         /*
3724          * It is possible that some servers may not be able to correctly
3725          * handle a large READDIR or READDIRPLUS request due to bugs in
3726          * their implementation.  In order to continue to interoperate
3727          * with them, this workaround is provided to limit the maximum
3728          * size of a READDIRPLUS request to 1024.  In any case, the request
3729          * size is limited to MAXBSIZE.
3730          */
3731         count = MIN(uiop->uio_iov->iov_len,
3732             nfs3_shrinkreaddir ? 1024 : MAXBSIZE);
3733
3734         nrdc = NULL;
3735 #ifdef DEBUG
3736         missed = 0;
3737 #endif
3738 top:
3739         /*
3740          * Short circuit last readdir which always returns 0 bytes.
3741          * This can be done after the directory has been read through
3742          * completely at least once.  This will set r_direof which
3743          * can be used to find the value of the last cookie.
3744          */
3745         mutex_enter(&rp->r_statelock);
3746         if (rp->r_direof != NULL &&
3747             uiop->uio_loffset == rp->r_direof->nfs3_ncookie) {
3748                 mutex_exit(&rp->r_statelock);
3749 #ifdef DEBUG
3750                 nfs3_readdir_cache_shorts++;
3751 #endif
3752                 if (eofp)
3753                         *eofp = 1;
3754                 if (nrdc != NULL)
3755                         rddir_cache_rele(nrdc);
3756                 return (0);
3757         }
3758         /*
3759          * Look for a cache entry.  Cache entries are identified
3760          * by the NFS cookie value and the byte count requested.
3761          */
3762         srdc.nfs3_cookie = uiop->uio_loffset;
3763         srdc.buflen = count;
3764         rdc = avl_find(&rp->r_dir, &srdc, &where);
3765         if (rdc != NULL) {
3766                 rddir_cache_hold(rdc);
3767                 /*
3768                  * If the cache entry is in the process of being
3769                  * filled in, wait until this completes.  The
3770                  * RDDIRWAIT bit is set to indicate that someone
3771                  * is waiting and then the thread currently
3772                  * filling the entry is done, it should do a
3773                  * cv_broadcast to wakeup all of the threads
3774                  * waiting for it to finish.
3775                  */
3776                 if (rdc->flags & RDDIR) {
3777                         nfs_rw_exit(&rp->r_rwlock);
3778                         rdc->flags |= RDDIRWAIT;
3779 #ifdef DEBUG
3780                         nfs3_readdir_cache_waits++;
3781 #endif
3782                         if (!cv_wait_sig(&rdc->cv, &rp->r_statelock)) {
3783                                 /*
3784                                  * We got interrupted, probably
3785                                  * the user typed ^C or an alarm
3786                                  * fired.  We free the new entry
3787                                  * if we allocated one.
3788                                  */
3789                                 mutex_exit(&rp->r_statelock);
3790                                 (void) nfs_rw_enter_sig(&rp->r_rwlock,
3791                                     RW_READER, FALSE);
3792                                 rddir_cache_rele(rdc);
3793                                 if (nrdc != NULL)
3794                                         rddir_cache_rele(nrdc);
3795                                 return (EINTR);
3796                         }
3797                         mutex_exit(&rp->r_statelock);
3798                         (void) nfs_rw_enter_sig(&rp->r_rwlock,
3799                             RW_READER, FALSE);
3800                         rddir_cache_rele(rdc);
3801                         goto top;
3802                 }
3803                 /*
3804                  * Check to see if a readdir is required to
3805                  * fill the entry.  If so, mark this entry
3806                  * as being filled, remove our reference,
3807                  * and branch to the code to fill the entry.
3808                  */
3809                 if (rdc->flags & RDDIRREQ) {
3810                         rdc->flags &= ~RDDIRREQ;
3811                         rdc->flags |= RDDIR;
3812                         if (nrdc != NULL)
3813                                 rddir_cache_rele(nrdc);
3814                         nrdc = rdc;
3815                         mutex_exit(&rp->r_statelock);
3816                         goto bottom;
3817                 }
3818 #ifdef DEBUG
3819                 if (!missed)
3820                         nfs3_readdir_cache_hits++;
3821 #endif
3822                 /*
3823                  * If an error occurred while attempting
3824                  * to fill the cache entry, just return it.
3825                  */
3826                 if (rdc->error) {
3827                         error = rdc->error;
3828                         mutex_exit(&rp->r_statelock);
3829                         rddir_cache_rele(rdc);
3830                         if (nrdc != NULL)
3831                                 rddir_cache_rele(nrdc);
3832                         return (error);
3833                 }
3834
3835                 /*
3836                  * The cache entry is complete and good,
3837                  * copyout the dirent structs to the calling
3838                  * thread.
3839                  */
3840                 error = uiomove(rdc->entries, rdc->entlen, UIO_READ, uiop);
3841
3842                 /*
3843                  * If no error occurred during the copyout,
3844                  * update the offset in the uio struct to
3845                  * contain the value of the next cookie
3846                  * and set the eof value appropriately.
3847                  */
3848                 if (!error) {
3849                         uiop->uio_loffset = rdc->nfs3_ncookie;
3850                         if (eofp)
3851                                 *eofp = rdc->eof;
3852                 }
3853
3854                 /*
3855                  * Decide whether to do readahead.
3856                  *
3857                  * Don't if have already read to the end of
3858                  * directory.  There is nothing more to read.
3859                  *
3860                  * Don't if the application is not doing
3861                  * lookups in the directory.  The readahead
3862                  * is only effective if the application can
3863                  * be doing work while an async thread is
3864                  * handling the over the wire request.
3865                  */
3866                 if (rdc->eof) {
3867                         rp->r_direof = rdc;
3868                         doreadahead = FALSE;
3869                 } else if (!(rp->r_flags & RLOOKUP))
3870                         doreadahead = FALSE;
3871                 else
3872                         doreadahead = TRUE;
3873
3874                 if (!doreadahead) {
3875                         mutex_exit(&rp->r_statelock);
3876                         rddir_cache_rele(rdc);
3877                         if (nrdc != NULL)
3878                                 rddir_cache_rele(nrdc);
3879                         return (error);
3880                 }
3881
3882                 /*
3883                  * Check to see whether we found an entry
3884                  * for the readahead.  If so, we don't need
3885                  * to do anything further, so free the new
3886                  * entry if one was allocated.  Otherwise,
3887                  * allocate a new entry, add it to the cache,
3888                  * and then initiate an asynchronous readdir
3889                  * operation to fill it.
3890                  */
3891                 srdc.nfs3_cookie = rdc->nfs3_ncookie;
3892                 srdc.buflen = count;
3893                 rrdc = avl_find(&rp->r_dir, &srdc, &where);
3894                 if (rrdc != NULL) {
3895                         if (nrdc != NULL)
3896                                 rddir_cache_rele(nrdc);
3897                 } else {
3898                         if (nrdc != NULL)
3899                                 rrdc = nrdc;
3900                         else {
3901                                 rrdc = rddir_cache_alloc(KM_NOSLEEP);
3902                         }
3903                         if (rrdc != NULL) {
3904                                 rrdc->nfs3_cookie = rdc->nfs3_ncookie;
3905                                 rrdc->buflen = count;
3906                                 avl_insert(&rp->r_dir, rrdc, where);
3907                                 rddir_cache_hold(rrdc);
3908                                 mutex_exit(&rp->r_statelock);
3909                                 rddir_cache_rele(rdc);
3910 #ifdef DEBUG
3911                                 nfs3_readdir_readahead++;
3912 #endif
3913                                 nfs_async_readdir(vp, rrdc, cr, do_nfs3readdir);
3914                                 return (error);
3915                         }
3916                 }
3917
3918                 mutex_exit(&rp->r_statelock);
3919                 rddir_cache_rele(rdc);
3920                 return (error);
3921         }
3922
3923         /*
3924          * Didn't find an entry in the cache.  Construct a new empty
3925          * entry and link it into the cache.  Other processes attempting
3926          * to access this entry will need to wait until it is filled in.
3927          *
3928          * Since kmem_alloc may block, another pass through the cache
3929          * will need to be taken to make sure that another process
3930          * hasn't already added an entry to the cache for this request.
3931          */
3932         if (nrdc == NULL) {
3933                 mutex_exit(&rp->r_statelock);
3934                 nrdc = rddir_cache_alloc(KM_SLEEP);
3935                 nrdc->nfs3_cookie = uiop->uio_loffset;
3936                 nrdc->buflen = count;
3937                 goto top;
3938         }
3939
3940         /*
3941          * Add this entry to the cache.
3942          */
3943         avl_insert(&rp->r_dir, nrdc, where);
3944         rddir_cache_hold(nrdc);
3945         mutex_exit(&rp->r_statelock);
3946
3947 bottom:
3948 #ifdef DEBUG
3949         missed = 1;
3950         nfs3_readdir_cache_misses++;
3951 #endif
3952         /*
3953          * Do the readdir.  This routine decides whether to use
3954          * READDIR or READDIRPLUS.
3955          */
3956         error = do_nfs3readdir(vp, nrdc, cr);
3957
3958         /*
3959          * If this operation failed, just return the error which occurred.
3960          */
3961         if (error != 0)
3962                 return (error);
3963
3964         /*
3965          * Since the RPC operation will have taken sometime and blocked
3966          * this process, another pass through the cache will need to be
3967          * taken to find the correct cache entry.  It is possible that
3968          * the correct cache entry will not be there (although one was
3969          * added) because the directory changed during the RPC operation
3970          * and the readdir cache was flushed.  In this case, just start
3971          * over.  It is hoped that this will not happen too often... :-)
3972          */
3973         nrdc = NULL;
3974         goto top;
3975         /* NOTREACHED */
3976 }
3977
3978 static int
3979 do_nfs3readdir(vnode_t *vp, rddir_cache *rdc, cred_t *cr)
3980 {
3981         int error;
3982         rnode_t *rp;
3983         mntinfo_t *mi;
3984
3985         rp = VTOR(vp);
3986         mi = VTOMI(vp);
3987         ASSERT(nfs_zone() == mi->mi_zone);
3988         /*
3989          * Issue the proper request.
3990          *
3991          * If the server does not support READDIRPLUS, then use READDIR.
3992          *
3993          * Otherwise --
3994          * Issue a READDIRPLUS if reading to fill an empty cache or if
3995          * an application has performed a lookup in the directory which
3996          * required an over the wire lookup.  The use of READDIRPLUS
3997          * will help to (re)populate the DNLC.
3998          */
3999         if (!(mi->mi_flags & MI_READDIRONLY) &&
4000             (rp->r_flags & (RLOOKUP | RREADDIRPLUS))) {
4001                 if (rp->r_flags & RREADDIRPLUS) {
4002                         mutex_enter(&rp->r_statelock);
4003                         rp->r_flags &= ~RREADDIRPLUS;
4004                         mutex_exit(&rp->r_statelock);
4005                 }
4006                 nfs3readdirplus(vp, rdc, cr);
4007                 if (rdc->error == EOPNOTSUPP)
4008                         nfs3readdir(vp, rdc, cr);
4009         } else
4010                 nfs3readdir(vp, rdc, cr);
4011
4012         mutex_enter(&rp->r_statelock);
4013         rdc->flags &= ~RDDIR;
4014         if (rdc->flags & RDDIRWAIT) {
4015                 rdc->flags &= ~RDDIRWAIT;
4016                 cv_broadcast(&rdc->cv);
4017         }
4018         error = rdc->error;
4019         if (error)
4020                 rdc->flags |= RDDIRREQ;
4021         mutex_exit(&rp->r_statelock);
4022
4023         rddir_cache_rele(rdc);
4024
4025         return (error);
4026 }
4027
4028 static void
4029 nfs3readdir(vnode_t *vp, rddir_cache *rdc, cred_t *cr)
4030 {
4031         int error;
4032         READDIR3args args;
4033         READDIR3vres res;
4034         vattr_t dva;
4035         rnode_t *rp;
4036         int douprintf;
4037         failinfo_t fi, *fip = NULL;
4038         mntinfo_t *mi;
4039         hrtime_t t;
4040
4041         rp = VTOR(vp);
4042         mi = VTOMI(vp);
4043         ASSERT(nfs_zone() == mi->mi_zone);
4044
4045         args.dir = *RTOFH3(rp);
4046         args.cookie = (cookie3)rdc->nfs3_cookie;
4047         args.cookieverf = rp->r_cookieverf;
4048         args.count = rdc->buflen;
4049
4050         /*
4051          * NFS client failover support
4052          * suppress failover unless we have a zero cookie
4053          */
4054         if (args.cookie == (cookie3) 0) {
4055                 fi.vp = vp;
4056                 fi.fhp = (caddr_t)&args.dir;
4057                 fi.copyproc = nfs3copyfh;
4058                 fi.lookupproc = nfs3lookup;
4059                 fi.xattrdirproc = acl_getxattrdir3;
4060                 fip = &fi;
4061         }
4062
4063 #ifdef DEBUG
4064         rdc->entries = rddir_cache_buf_alloc(rdc->buflen, KM_SLEEP);
4065 #else
4066         rdc->entries = kmem_alloc(rdc->buflen, KM_SLEEP);
4067 #endif
4068
4069         res.entries = (dirent_t *)rdc->entries;
4070         res.entries_size = rdc->buflen;
4071         res.dir_attributes.fres.vap = &dva;
4072         res.dir_attributes.fres.vp = vp;
4073         res.loff = rdc->nfs3_cookie;
4074
4075         douprintf = 1;
4076
4077         if (mi->mi_io_kstats) {
4078                 mutex_enter(&mi->mi_lock);
4079                 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
4080                 mutex_exit(&mi->mi_lock);
4081         }
4082
4083         t = gethrtime();
4084
4085         error = rfs3call(VTOMI(vp), NFSPROC3_READDIR,
4086             xdr_READDIR3args, (caddr_t)&args,
4087             xdr_READDIR3vres, (caddr_t)&res, cr,
4088             &douprintf, &res.status, 0, fip);
4089
4090         if (mi->mi_io_kstats) {
4091                 mutex_enter(&mi->mi_lock);
4092                 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
4093                 mutex_exit(&mi->mi_lock);
4094         }
4095
4096         if (error)
4097                 goto err;
4098
4099         nfs3_cache_post_op_vattr(vp, &res.dir_attributes, t, cr);
4100
4101         error = geterrno3(res.status);
4102         if (error) {
4103                 PURGE_STALE_FH(error, vp, cr);
4104                 goto err;
4105         }
4106
4107         if (mi->mi_io_kstats) {
4108                 mutex_enter(&mi->mi_lock);
4109                 KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
4110                 KSTAT_IO_PTR(mi->mi_io_kstats)->nread += res.size;
4111                 mutex_exit(&mi->mi_lock);
4112         }
4113
4114         rdc->nfs3_ncookie = res.loff;
4115         rp->r_cookieverf = res.cookieverf;
4116         rdc->eof = res.eof ? 1 : 0;
4117         rdc->entlen = res.size;
4118         ASSERT(rdc->entlen <= rdc->buflen);
4119         rdc->error = 0;
4120         return;
4121
4122 err:
4123         kmem_free(rdc->entries, rdc->buflen);
4124         rdc->entries = NULL;
4125         rdc->error = error;
4126 }
4127
4128 /*
4129  * Read directory entries.
4130  * There are some weird things to look out for here.  The uio_loffset
4131  * field is either 0 or it is the offset returned from a previous
4132  * readdir.  It is an opaque value used by the server to find the
4133  * correct directory block to read. The count field is the number
4134  * of blocks to read on the server.  This is advisory only, the server
4135  * may return only one block's worth of entries.  Entries may be compressed
4136  * on the server.
4137  */
4138 static void
4139 nfs3readdirplus(vnode_t *vp, rddir_cache *rdc, cred_t *cr)
4140 {
4141         int error;
4142         READDIRPLUS3args args;
4143         READDIRPLUS3vres res;
4144         vattr_t dva;
4145         rnode_t *rp;
4146         mntinfo_t *mi;
4147         int douprintf;
4148         failinfo_t fi, *fip = NULL;
4149
4150         rp = VTOR(vp);
4151         mi = VTOMI(vp);
4152         ASSERT(nfs_zone() == mi->mi_zone);
4153
4154         args.dir = *RTOFH3(rp);
4155         args.cookie = (cookie3)rdc->nfs3_cookie;
4156         args.cookieverf = rp->r_cookieverf;
4157         args.dircount = rdc->buflen;
4158         args.maxcount = mi->mi_tsize;
4159
4160         /*
4161          * NFS client failover support
4162          * suppress failover unless we have a zero cookie
4163          */
4164         if (args.cookie == (cookie3)0) {
4165                 fi.vp = vp;
4166                 fi.fhp = (caddr_t)&args.dir;
4167                 fi.copyproc = nfs3copyfh;
4168                 fi.lookupproc = nfs3lookup;
4169                 fi.xattrdirproc = acl_getxattrdir3;
4170                 fip = &fi;
4171         }
4172
4173 #ifdef DEBUG
4174         rdc->entries = rddir_cache_buf_alloc(rdc->buflen, KM_SLEEP);
4175 #else
4176         rdc->entries = kmem_alloc(rdc->buflen, KM_SLEEP);
4177 #endif
4178
4179         res.entries = (dirent_t *)rdc->entries;
4180         res.entries_size = rdc->buflen;
4181         res.dir_attributes.fres.vap = &dva;
4182         res.dir_attributes.fres.vp = vp;
4183         res.loff = rdc->nfs3_cookie;
4184         res.credentials = cr;
4185
4186         douprintf = 1;
4187
4188         if (mi->mi_io_kstats) {
4189                 mutex_enter(&mi->mi_lock);
4190                 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
4191                 mutex_exit(&mi->mi_lock);
4192         }
4193
4194         res.time = gethrtime();
4195
4196         error = rfs3call(mi, NFSPROC3_READDIRPLUS,
4197             xdr_READDIRPLUS3args, (caddr_t)&args,
4198             xdr_READDIRPLUS3vres, (caddr_t)&res, cr,
4199             &douprintf, &res.status, 0, fip);
4200
4201         if (mi->mi_io_kstats) {
4202                 mutex_enter(&mi->mi_lock);
4203                 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
4204                 mutex_exit(&mi->mi_lock);
4205         }
4206
4207         if (error) {
4208                 goto err;
4209         }
4210
4211         nfs3_cache_post_op_vattr(vp, &res.dir_attributes, res.time, cr);
4212
4213         error = geterrno3(res.status);
4214         if (error) {
4215                 PURGE_STALE_FH(error, vp, cr);
4216                 if (error == EOPNOTSUPP) {
4217                         mutex_enter(&mi->mi_lock);
4218                         mi->mi_flags |= MI_READDIRONLY;
4219                         mutex_exit(&mi->mi_lock);
4220                 }
4221                 goto err;
4222         }
4223
4224         if (mi->mi_io_kstats) {
4225                 mutex_enter(&mi->mi_lock);
4226                 KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
4227                 KSTAT_IO_PTR(mi->mi_io_kstats)->nread += res.size;
4228                 mutex_exit(&mi->mi_lock);
4229         }
4230
4231         rdc->nfs3_ncookie = res.loff;
4232         rp->r_cookieverf = res.cookieverf;
4233         rdc->eof = res.eof ? 1 : 0;
4234         rdc->entlen = res.size;
4235         ASSERT(rdc->entlen <= rdc->buflen);
4236         rdc->error = 0;
4237
4238         return;
4239
4240 err:
4241         kmem_free(rdc->entries, rdc->buflen);
4242         rdc->entries = NULL;
4243         rdc->error = error;
4244 }
4245
4246 #ifdef DEBUG
4247 static int nfs3_bio_do_stop = 0;
4248 #endif
4249
4250 static int
4251 nfs3_bio(struct buf *bp, stable_how *stab_comm, cred_t *cr)
4252 {
4253         rnode_t *rp = VTOR(bp->b_vp);
4254         int count;
4255         int error;
4256         cred_t *cred;
4257         offset_t offset;
4258
4259         ASSERT(nfs_zone() == VTOMI(bp->b_vp)->mi_zone);
4260         offset = ldbtob(bp->b_lblkno);
4261
4262         DTRACE_IO1(start, struct buf *, bp);
4263
4264         if (bp->b_flags & B_READ) {
4265                 mutex_enter(&rp->r_statelock);
4266                 if (rp->r_cred != NULL) {
4267                         cred = rp->r_cred;
4268                         crhold(cred);
4269                 } else {
4270                         rp->r_cred = cr;
4271                         crhold(cr);
4272                         cred = cr;
4273                         crhold(cred);
4274                 }
4275                 mutex_exit(&rp->r_statelock);
4276         read_again:
4277                 error = bp->b_error = nfs3read(bp->b_vp, bp->b_un.b_addr,
4278                     offset, bp->b_bcount, &bp->b_resid, cred);
4279                 crfree(cred);
4280                 if (!error) {
4281                         if (bp->b_resid) {
4282                                 /*
4283                                  * Didn't get it all because we hit EOF,
4284                                  * zero all the memory beyond the EOF.
4285                                  */
4286                                 /* bzero(rdaddr + */
4287                                 bzero(bp->b_un.b_addr +
4288                                     bp->b_bcount - bp->b_resid, bp->b_resid);
4289                         }
4290                         mutex_enter(&rp->r_statelock);
4291                         if (bp->b_resid == bp->b_bcount &&
4292                             offset >= rp->r_size) {
4293                                 /*
4294                                  * We didn't read anything at all as we are
4295                                  * past EOF.  Return an error indicator back
4296                                  * but don't destroy the pages (yet).
4297                                  */
4298                                 error = NFS_EOF;
4299                         }
4300                         mutex_exit(&rp->r_statelock);
4301                 } else if (error == EACCES) {
4302                         mutex_enter(&rp->r_statelock);
4303                         if (cred != cr) {
4304                                 if (rp->r_cred != NULL)
4305                                         crfree(rp->r_cred);
4306                                 rp->r_cred = cr;
4307                                 crhold(cr);
4308                                 cred = cr;
4309                                 crhold(cred);
4310                                 mutex_exit(&rp->r_statelock);
4311                                 goto read_again;
4312                         }
4313                         mutex_exit(&rp->r_statelock);
4314                 }
4315         } else {
4316                 if (!(rp->r_flags & RSTALE)) {
4317                         mutex_enter(&rp->r_statelock);
4318                         if (rp->r_cred != NULL) {
4319                                 cred = rp->r_cred;
4320                                 crhold(cred);
4321                         } else {
4322                                 rp->r_cred = cr;
4323                                 crhold(cr);
4324                                 cred = cr;
4325                                 crhold(cred);
4326                         }
4327                         mutex_exit(&rp->r_statelock);
4328                 write_again:
4329                         mutex_enter(&rp->r_statelock);
4330                         count = MIN(bp->b_bcount, rp->r_size - offset);
4331                         mutex_exit(&rp->r_statelock);
4332                         if (count < 0)
4333                                 cmn_err(CE_PANIC, "nfs3_bio: write count < 0");
4334 #ifdef DEBUG
4335                         if (count == 0) {
4336                                 zcmn_err(getzoneid(), CE_WARN,
4337                                     "nfs3_bio: zero length write at %lld",
4338                                     offset);
4339                                 nfs_printfhandle(&rp->r_fh);
4340                                 if (nfs3_bio_do_stop)
4341                                         debug_enter("nfs3_bio");
4342                         }
4343 #endif
4344                         error = nfs3write(bp->b_vp, bp->b_un.b_addr, offset,
4345                             count, cred, stab_comm);
4346                         if (error == EACCES) {
4347                                 mutex_enter(&rp->r_statelock);
4348                                 if (cred != cr) {
4349                                         if (rp->r_cred != NULL)
4350                                                 crfree(rp->r_cred);
4351                                         rp->r_cred = cr;
4352                                         crhold(cr);
4353                                         crfree(cred);
4354                                         cred = cr;
4355                                         crhold(cred);
4356                                         mutex_exit(&rp->r_statelock);
4357                                         goto write_again;
4358                                 }
4359                                 mutex_exit(&rp->r_statelock);
4360                         }
4361                         bp->b_error = error;
4362                         if (error && error != EINTR) {
4363                                 /*
4364                                  * Don't print EDQUOT errors on the console.
4365                                  * Don't print asynchronous EACCES errors.
4366                                  * Don't print EFBIG errors.
4367                                  * Print all other write errors.
4368                                  */
4369                                 if (error != EDQUOT && error != EFBIG &&
4370                                     (error != EACCES ||
4371                                     !(bp->b_flags & B_ASYNC)))
4372                                         nfs_write_error(bp->b_vp, error, cred);
4373                                 /*
4374                                  * Update r_error and r_flags as appropriate.
4375                                  * If the error was ESTALE, then mark the
4376                                  * rnode as not being writeable and save
4377                                  * the error status.  Otherwise, save any
4378                                  * errors which occur from asynchronous
4379                                  * page invalidations.  Any errors occurring
4380                                  * from other operations should be saved
4381                                  * by the caller.
4382                                  */
4383                                 mutex_enter(&rp->r_statelock);
4384                                 if (error == ESTALE) {
4385                                         rp->r_flags |= RSTALE;
4386                                         if (!rp->r_error)
4387                                                 rp->r_error = error;
4388                                 } else if (!rp->r_error &&
4389                                     (bp->b_flags &
4390                                     (B_INVAL|B_FORCE|B_ASYNC)) ==
4391                                     (B_INVAL|B_FORCE|B_ASYNC)) {
4392                                         rp->r_error = error;
4393                                 }
4394                                 mutex_exit(&rp->r_statelock);
4395                         }
4396                         crfree(cred);
4397                 } else {
4398                         error = rp->r_error;
4399                         /*
4400                          * A close may have cleared r_error, if so,
4401                          * propagate ESTALE error return properly
4402                          */
4403                         if (error == 0)
4404                                 error = ESTALE;
4405                 }
4406         }
4407
4408         if (error != 0 && error != NFS_EOF)
4409                 bp->b_flags |= B_ERROR;
4410
4411         DTRACE_IO1(done, struct buf *, bp);
4412
4413         return (error);
4414 }
4415
4416 /* ARGSUSED */
4417 static int
4418 nfs3_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
4419 {
4420         rnode_t *rp;
4421
4422         if (nfs_zone() != VTOMI(vp)->mi_zone)
4423                 return (EIO);
4424         rp = VTOR(vp);
4425
4426         if (fidp->fid_len < (ushort_t)rp->r_fh.fh_len) {
4427                 fidp->fid_len = rp->r_fh.fh_len;
4428                 return (ENOSPC);
4429         }
4430         fidp->fid_len = rp->r_fh.fh_len;
4431         bcopy(rp->r_fh.fh_buf, fidp->fid_data, fidp->fid_len);
4432         return (0);
4433 }
4434
4435 /* ARGSUSED2 */
4436 static int
4437 nfs3_rwlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
4438 {
4439         rnode_t *rp = VTOR(vp);
4440
4441         if (!write_lock) {
4442                 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
4443                 return (V_WRITELOCK_FALSE);
4444         }
4445
4446         if ((rp->r_flags & RDIRECTIO) || (VTOMI(vp)->mi_flags & MI_DIRECTIO)) {
4447                 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
4448                 if (rp->r_mapcnt == 0 && !vn_has_cached_data(vp))
4449                         return (V_WRITELOCK_FALSE);
4450                 nfs_rw_exit(&rp->r_rwlock);
4451         }
4452
4453         (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, FALSE);
4454         return (V_WRITELOCK_TRUE);
4455 }
4456
4457 /* ARGSUSED */
4458 static void
4459 nfs3_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
4460 {
4461         rnode_t *rp = VTOR(vp);
4462
4463         nfs_rw_exit(&rp->r_rwlock);
4464 }
4465
4466 /* ARGSUSED */
4467 static int
4468 nfs3_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
4469 {
4470
4471         /*
4472          * Because we stuff the readdir cookie into the offset field
4473          * someone may attempt to do an lseek with the cookie which
4474          * we want to succeed.
4475          */
4476         if (vp->v_type == VDIR)
4477                 return (0);
4478         if (*noffp < 0)
4479                 return (EINVAL);
4480         return (0);
4481 }
4482
4483 /*
4484  * number of nfs3_bsize blocks to read ahead.
4485  */
4486 static int nfs3_nra = 4;
4487
4488 #ifdef DEBUG
4489 static int nfs3_lostpage = 0;   /* number of times we lost original page */
4490 #endif
4491
4492 /*
4493  * Return all the pages from [off..off+len) in file
4494  */
4495 /* ARGSUSED */
4496 static int
4497 nfs3_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
4498         page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
4499         enum seg_rw rw, cred_t *cr, caller_context_t *ct)
4500 {
4501         rnode_t *rp;
4502         int error;
4503         mntinfo_t *mi;
4504
4505         if (vp->v_flag & VNOMAP)
4506                 return (ENOSYS);
4507
4508         if (nfs_zone() != VTOMI(vp)->mi_zone)
4509                 return (EIO);
4510         if (protp != NULL)
4511                 *protp = PROT_ALL;
4512
4513         /*
4514          * Now valididate that the caches are up to date.
4515          */
4516         error = nfs3_validate_caches(vp, cr);
4517         if (error)
4518                 return (error);
4519
4520         rp = VTOR(vp);
4521         mi = VTOMI(vp);
4522 retry:
4523         mutex_enter(&rp->r_statelock);
4524
4525         /*
4526          * Don't create dirty pages faster than they
4527          * can be cleaned so that the system doesn't
4528          * get imbalanced.  If the async queue is
4529          * maxed out, then wait for it to drain before
4530          * creating more dirty pages.  Also, wait for
4531          * any threads doing pagewalks in the vop_getattr
4532          * entry points so that they don't block for
4533          * long periods.
4534          */
4535         if (rw == S_CREATE) {
4536                 while ((mi->mi_max_threads != 0 &&
4537                     rp->r_awcount > 2 * mi->mi_max_threads) ||
4538                     rp->r_gcount > 0)
4539                         cv_wait(&rp->r_cv, &rp->r_statelock);
4540         }
4541
4542         /*
4543          * If we are getting called as a side effect of an nfs_write()
4544          * operation the local file size might not be extended yet.
4545          * In this case we want to be able to return pages of zeroes.
4546          */
4547         if (off + len > rp->r_size + PAGEOFFSET && seg != segkmap) {
4548                 mutex_exit(&rp->r_statelock);
4549                 return (EFAULT);                /* beyond EOF */
4550         }
4551
4552         mutex_exit(&rp->r_statelock);
4553
4554         error = pvn_getpages(nfs3_getapage, vp, off, len, protp,
4555             pl, plsz, seg, addr, rw, cr);
4556
4557         switch (error) {
4558         case NFS_EOF:
4559                 nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr);
4560                 goto retry;
4561         case ESTALE:
4562                 PURGE_STALE_FH(error, vp, cr);
4563         }
4564
4565         return (error);
4566 }
4567
/*
 * Called from pvn_getpages to get a particular page.
 *
 * Looks up, creates, or reads in the page of vp at offset off.
 *
 *   pl         - if NULL, no page is handed back; a page that is not yet
 *                cached is merely queued for asynchronous readahead.
 *                Otherwise, on success the page(s) are returned via pl.
 *   plsz       - passed through to pvn_plist_init() together with pl.
 *   seg, addr  - passed to the page creation / klustering routines; seg
 *                is also compared against segkmap to decide how accesses
 *                at or beyond EOF are treated.
 *   rw         - S_CREATE causes a missing page to be created without
 *                reading it from the server.
 *   len, protp - not referenced in this function (hence ARGSUSED).
 *
 * Returns 0 on success, EIO when nfs_zone() does not match the mount's
 * mi_zone, EFAULT when a read through a segment other than segkmap hits
 * NFS_EOF, or whatever other error nfs3_bio() reports.
 */
/* ARGSUSED */
static int
nfs3_getapage(vnode_t *vp, uoff_t off, size_t len, uint_t *protp,
        page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
        enum seg_rw rw, cred_t *cr)
{
        rnode_t *rp;
        uint_t bsize;
        struct buf *bp;
        page_t *pp;
        uoff_t lbn;
        uoff_t io_off;
        uoff_t blkoff;
        uoff_t rablkoff;
        size_t io_len;
        uint_t blksize;
        int error;
        int readahead;
        int readahead_issued = 0;
        int ra_window; /* readahead window */
        page_t *pagefound;
        page_t *savepp;

        if (nfs_zone() != VTOMI(vp)->mi_zone)
                return (EIO);
        rp = VTOR(vp);
        /* Work in units of at least one page. */
        bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);

        /*
         * Restart point used when a page that page_exists() reported
         * could not be locked by page_lookup() further below.
         */
reread:
        bp = NULL;
        pp = NULL;
        pagefound = NULL;

        if (pl != NULL)
                pl[0] = NULL;

        error = 0;
        /* Start offset of the block that contains off. */
        lbn = off / bsize;
        blkoff = lbn * bsize;

        /*
         * Queueing up the readahead before doing the synchronous read
         * results in a significant increase in read throughput because
         * of the increased parallelism between the async threads and
         * the process context.
         */
        if ((off & ((vp->v_vfsp->vfs_bsize) - 1)) == 0 &&
            rw != S_CREATE &&
            !(vp->v_flag & VNOCACHE)) {
                mutex_enter(&rp->r_statelock);

                /*
                 * Calculate the number of readaheads to do.
                 * a) No readaheads at offset = 0.
                 * b) Do maximum(nfs3_nra) readaheads when the readahead
                 *    window is closed.
                 * c) Do readaheads between 1 to (nfs3_nra - 1) depending
                 *    upon how far the readahead window is open or close.
                 * d) No readaheads if rp->r_nextr is not within the scope
                 *    of the readahead window (random i/o).
                 */

                if (off == 0)
                        readahead = 0;
                else if (blkoff == rp->r_nextr)
                        readahead = nfs3_nra;
                else if (rp->r_nextr > blkoff &&
                    ((ra_window = (rp->r_nextr - blkoff) / bsize)
                    <= (nfs3_nra - 1)))
                        readahead = nfs3_nra - ra_window;
                else
                        readahead = 0;

                rablkoff = rp->r_nextr;
                /*
                 * r_statelock is dropped around each
                 * nfs_async_readahead() call and re-taken afterwards, so
                 * it is held both when the loop condition reads r_size
                 * and when the loop is left (by either path).
                 */
                while (readahead > 0 && rablkoff + bsize < rp->r_size) {
                        mutex_exit(&rp->r_statelock);
                        if (nfs_async_readahead(vp, rablkoff + bsize,
                            addr + (rablkoff + bsize - off), seg, cr,
                            nfs3_readahead) < 0) {
                                mutex_enter(&rp->r_statelock);
                                break;
                        }
                        readahead--;
                        rablkoff += bsize;
                        /*
                         * Indicate that we did a readahead so
                         * readahead offset is not updated
                         * by the synchronous read below.
                         */
                        readahead_issued = 1;
                        mutex_enter(&rp->r_statelock);
                        /*
                         * set readahead offset to
                         * offset of last async readahead
                         * request.
                         */
                        rp->r_nextr = rablkoff;
                }
                mutex_exit(&rp->r_statelock);
        }

        /*
         * Retry point used when pvn_read_kluster() returns NULL because
         * another thread entered the page first.
         */
again:
        if ((pagefound = page_exists(&vp->v_object, off)) == NULL) {
                if (pl == NULL) {
                        /*
                         * The caller wants no page back: just queue an
                         * asynchronous read of the block; the result of
                         * the queueing attempt is deliberately ignored.
                         */
                        (void) nfs_async_readahead(vp, blkoff, addr, seg, cr,
                            nfs3_readahead);
                } else if (rw == S_CREATE) {
                        /*
                         * Block for this page is not allocated, or the offset
                         * is beyond the current allocation size, or we're
                         * allocating a swap slot and the page was not found,
                         * so allocate it and return a zero page.
                         */
                        if ((pp = page_create_va(&vp->v_object, off,
                            PAGESIZE, PG_WAIT, seg, addr)) == NULL)
                                cmn_err(CE_PANIC, "nfs3_getapage: page_create");
                        io_len = PAGESIZE;
                        mutex_enter(&rp->r_statelock);
                        rp->r_nextr = off + PAGESIZE;
                        mutex_exit(&rp->r_statelock);
                } else {
                        /*
                         * Need to go to server to get a BLOCK, exception to
                         * that being while reading at offset = 0 or doing
                         * random i/o, in that case read only a PAGE.
                         */
                        mutex_enter(&rp->r_statelock);
                        if (blkoff < rp->r_size &&
                            blkoff + bsize >= rp->r_size) {
                                /*
                                 * If only a block or less is left in
                                 * the file, read all that is remaining.
                                 */
                                if (rp->r_size <= off) {
                                        /*
                                         * Trying to access beyond EOF,
                                         * set up to get at least one page.
                                         */
                                        blksize = off + PAGESIZE - blkoff;
                                } else
                                        blksize = rp->r_size - blkoff;
                        } else if ((off == 0) ||
                            (off != rp->r_nextr && !readahead_issued)) {
                                blksize = PAGESIZE;
                                blkoff = off; /* block = page here */
                        } else
                                blksize = bsize;
                        mutex_exit(&rp->r_statelock);

                        pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
                            &io_len, blkoff, blksize, 0);

                        /*
                         * Some other thread has entered the page,
                         * so just use it.
                         */
                        if (pp == NULL)
                                goto again;

                        /*
                         * Now round the request size up to page boundaries.
                         * This ensures that the entire page will be
                         * initialized to zeroes if EOF is encountered.
                         */
                        io_len = ptob(btopr(io_len));

                        bp = pageio_setup(pp, io_len, vp, B_READ);
                        ASSERT(bp != NULL);

                        /*
                         * pageio_setup should have set b_addr to 0.  This
                         * is correct since we want to do I/O on a page
                         * boundary.  bp_mapin will use this addr to calculate
                         * an offset, and then set b_addr to the kernel virtual
                         * address it allocated for us.
                         */
                        ASSERT(bp->b_un.b_addr == 0);

                        bp->b_edev = 0;
                        bp->b_dev = 0;
                        bp->b_lblkno = lbtodb(io_off);
                        bp->b_file = vp;
                        bp->b_offset = (offset_t)off;
                        bp_mapin(bp);

                        /*
                         * If doing a write beyond what we believe is EOF,
                         * don't bother trying to read the pages from the
                         * server, we'll just zero the pages here.  We
                         * don't check that the rw flag is S_WRITE here
                         * because some implementations may attempt a
                         * read access to the buffer before copying data.
                         */
                        mutex_enter(&rp->r_statelock);
                        if (io_off >= rp->r_size && seg == segkmap) {
                                mutex_exit(&rp->r_statelock);
                                bzero(bp->b_un.b_addr, io_len);
                        } else {
                                mutex_exit(&rp->r_statelock);
                                error = nfs3_bio(bp, NULL, cr);
                        }

                        /*
                         * Unmap the buffer before freeing it.
                         */
                        bp_mapout(bp);
                        pageio_done(bp);

                        /*
                         * Walk the circular p_next list once and tag every
                         * page of the kluster with C_NOCOMMIT; the walk
                         * ends with pp back at the list head.
                         */
                        savepp = pp;
                        do {
                                pp->p_fsdata = C_NOCOMMIT;
                        } while ((pp = pp->p_next) != savepp);

                        if (error == NFS_EOF) {
                                /*
                                 * If doing a write system call just return
                                 * zeroed pages, else user tried to get pages
                                 * beyond EOF, return error.  We don't check
                                 * that the rw flag is S_WRITE here because
                                 * some implementations may attempt a read
                                 * access to the buffer before copying data.
                                 */
                                if (seg == segkmap)
                                        error = 0;
                                else
                                        error = EFAULT;
                        }

                        /*
                         * Only a successful synchronous read that was not
                         * preceded by queued readaheads advances the
                         * readahead offset; otherwise the value stored by
                         * the readahead loop above is kept.
                         */
                        if (!readahead_issued && !error) {
                                mutex_enter(&rp->r_statelock);
                                rp->r_nextr = io_off + io_len;
                                mutex_exit(&rp->r_statelock);
                        }
                }
        }

        /*
         * NOTE(review): no goto in this function targets the label below;
         * control only reaches it by falling through.
         */
out:
        /* Nothing to hand back when the caller supplied no page list. */
        if (pl == NULL)
                return (error);

        if (error) {
                /* Dispose of the pages set up for the failed read. */
                if (pp != NULL)
                        pvn_read_done(pp, B_ERROR);
                return (error);
        }

        if (pagefound) {
                se_t se = (rw == S_CREATE ? SE_EXCL : SE_SHARED);

                /*
                 * Page exists in the cache, acquire the appropriate lock.
                 * If this fails, start all over again.
                 */
                if ((pp = page_lookup(&vp->v_object, off, se)) == NULL) {
#ifdef DEBUG
                        nfs3_lostpage++;
#endif
                        goto reread;
                }
                pl[0] = pp;
                pl[1] = NULL;
                return (0);
        }

        /*
         * pp is non-NULL here only after the S_CREATE or the read path
         * above, both of which have set io_len.
         */
        if (pp != NULL)
                pvn_plist_init(pp, pl, plsz, off, io_len, rw);

        return (error);
}
4840
4841 static void
4842 nfs3_readahead(vnode_t *vp, uoff_t blkoff, caddr_t addr, struct seg *seg,
4843         cred_t *cr)
4844 {
4845         int error;
4846         page_t *pp;
4847         uoff_t io_off;
4848         size_t io_len;
4849         struct buf *bp;
4850         uint_t bsize, blksize;
4851         rnode_t *rp = VTOR(vp);
4852         page_t *savepp;
4853
4854         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
4855         bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
4856
4857         mutex_enter(&rp->r_statelock);
4858         if (blkoff < rp->r_size && blkoff + bsize > rp->r_size) {
4859                 /*
4860                  * If less than a block left in file read less
4861                  * than a block.
4862                  */
4863                 blksize = rp->r_size - blkoff;
4864         } else
4865                 blksize = bsize;
4866         mutex_exit(&rp->r_statelock);
4867
4868         pp = pvn_read_kluster(vp, blkoff, segkmap, addr,
4869             &io_off, &io_len, blkoff, blksize, 1);
4870         /*
4871          * The isra flag passed to the kluster function is 1, we may have
4872          * gotten a return value of NULL for a variety of reasons (# of free
4873          * pages < minfree, someone entered the page on the vnode etc). In all
4874          * cases, we want to punt on the readahead.
4875          */
4876         if (pp == NULL)
4877                 return;
4878
4879         /*
4880          * Now round the request size up to page boundaries.
4881          * This ensures that the entire page will be
4882          * initialized to zeroes if EOF is encountered.
4883          */
4884         io_len = ptob(btopr(io_len));
4885
4886         bp = pageio_setup(pp, io_len, vp, B_READ);
4887         ASSERT(bp != NULL);
4888
4889         /*
4890          * pageio_setup should have set b_addr to 0.  This is correct since
4891          * we want to do I/O on a page boundary. bp_mapin() will use this addr
4892          * to calculate an offset, and then set b_addr to the kernel virtual
4893          * address it allocated for us.
4894          */
4895         ASSERT(bp->b_un.b_addr == 0);
4896
4897         bp->b_edev = 0;
4898         bp->b_dev = 0;
4899         bp->b_lblkno = lbtodb(io_off);
4900         bp->b_file = vp;
4901         bp->b_offset = (offset_t)blkoff;
4902         bp_mapin(bp);
4903
4904         /*
4905          * If doing a write beyond what we believe is EOF, don't bother trying
4906          * to read the pages from the server, we'll just zero the pages here.
4907          * We don't check that the rw flag is S_WRITE here because some
4908          * implementations may attempt a read access to the buffer before
4909          * copying data.
4910          */
4911         mutex_enter(&rp->r_statelock);
4912         if (io_off >= rp->r_size && seg == segkmap) {
4913                 mutex_exit(&rp->r_statelock);
4914                 bzero(bp->b_un.b_addr, io_len);
4915                 error = 0;
4916         } else {
4917                 mutex_exit(&rp->r_statelock);
4918                 error = nfs3_bio(bp, NULL, cr);
4919                 if (error == NFS_EOF)
4920                         error = 0;
4921         }
4922
4923         /*
4924          * Unmap the buffer before freeing it.
4925          */
4926         bp_mapout(bp);
4927         pageio_done(bp);
4928
4929         savepp = pp;
4930         do {
4931                 pp->p_fsdata = C_NOCOMMIT;
4932         } while ((pp = pp->p_next) != savepp);
4933
4934         pvn_read_done(pp, error ? B_READ | B_ERROR : B_READ);
4935
4936         /*
4937          * In case of error set readahead offset
4938          * to the lowest offset.
4939          * pvn_read_done() calls VN_DISPOSE to destroy the pages
4940          */
4941         if (error && rp->r_nextr > io_off) {
4942                 mutex_enter(&rp->r_statelock);
4943                 if (rp->r_nextr > io_off)
4944                         rp->r_nextr = io_off;
4945                 mutex_exit(&rp->r_statelock);
4946         }
4947 }
4948
4949 /*
4950  * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE}
4951  * If len == 0, do from off to EOF.
4952  *
4953  * The normal cases should be len == 0 && off == 0 (entire vp list),
4954  * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE
4955  * (from pageout).
4956  */
4957 /* ARGSUSED */
4958 static int
4959 nfs3_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
4960         caller_context_t *ct)
4961 {
4962         int error;
4963         rnode_t *rp;
4964
4965         ASSERT(cr != NULL);
4966
4967         /*
4968          * XXX - Why should this check be made here?
4969          */
4970         if (vp->v_flag & VNOMAP)
4971                 return (ENOSYS);
4972         if (len == 0 && !(flags & B_INVAL) && vn_is_readonly(vp))
4973                 return (0);
4974         if (!(flags & B_ASYNC) && nfs_zone() != VTOMI(vp)->mi_zone)
4975                 return (EIO);
4976
4977         rp = VTOR(vp);
4978         mutex_enter(&rp->r_statelock);
4979         rp->r_count++;
4980         mutex_exit(&rp->r_statelock);
4981         error = nfs_putpages(vp, off, len, flags, cr);
4982         mutex_enter(&rp->r_statelock);
4983         rp->r_count--;
4984         cv_broadcast(&rp->r_cv);
4985         mutex_exit(&rp->r_statelock);
4986
4987         return (error);
4988 }
4989
4990 /*
4991  * Write out a single page, possibly klustering adjacent dirty pages.
4992  */
4993 int
4994 nfs3_putapage(vnode_t *vp, page_t *pp, uoff_t *offp, size_t *lenp,
4995         int flags, cred_t *cr)
4996 {
4997         uoff_t io_off;
4998         uoff_t lbn_off;
4999         uoff_t lbn;
5000         size_t io_len;
5001         uint_t bsize;
5002         int error;
5003         rnode_t *rp;
5004
5005         ASSERT(!vn_is_readonly(vp));
5006         ASSERT(pp != NULL);
5007         ASSERT(cr != NULL);
5008         ASSERT((flags & B_ASYNC) || nfs_zone() == VTOMI(vp)->mi_zone);
5009
5010         rp = VTOR(vp);
5011         ASSERT(rp->r_count > 0);
5012
5013         bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
5014         lbn = pp->p_offset / bsize;
5015         lbn_off = lbn * bsize;
5016
5017         /*
5018          * Find a kluster that fits in one block, or in
5019          * one page if pages are bigger than blocks.  If
5020          * there is less file space allocated than a whole
5021          * page, we'll shorten the i/o request below.
5022          */
5023         pp = pvn_write_kluster(vp, pp, &io_off, &io_len, lbn_off,
5024             roundup(bsize, PAGESIZE), flags);
5025
5026         /*
5027          * pvn_write_kluster shouldn't have returned a page with offset
5028          * behind the original page we were given.  Verify that.
5029          */
5030         ASSERT((pp->p_offset / bsize) >= lbn);
5031
5032         /*
5033          * Now pp will have the list of kept dirty pages marked for
5034          * write back.  It will also handle invalidation and freeing
5035          * of pages that are not dirty.  Check for page length rounding
5036          * problems.
5037          */
5038         if (io_off + io_len > lbn_off + bsize) {
5039                 ASSERT((io_off + io_len) - (lbn_off + bsize) < PAGESIZE);
5040                 io_len = lbn_off + bsize - io_off;
5041         }
5042         /*
5043          * The RMODINPROGRESS flag makes sure that nfs(3)_bio() sees a
5044          * consistent value of r_size. RMODINPROGRESS is set in writerp().
5045          * When RMODINPROGRESS is set it indicates that a uiomove() is in
5046          * progress and the r_size has not been made consistent with the
5047          * new size of the file. When the uiomove() completes the r_size is
5048          * updated and the RMODINPROGRESS flag is cleared.
5049          *
5050          * The RMODINPROGRESS flag makes sure that nfs(3)_bio() sees a
5051          * consistent value of r_size. Without this handshaking, it is
5052          * possible that nfs(3)_bio() picks  up the old value of r_size
5053          * before the uiomove() in writerp() completes. This will result
5054          * in the write through nfs(3)_bio() being dropped.
5055          *
5056          * More precisely, there is a window between the time the uiomove()
5057          * completes and the time the r_size is updated. If a fop_putpage()
5058          * operation intervenes in this window, the page will be picked up,
5059          * because it is dirty (it will be unlocked, unless it was
5060          * pagecreate'd). When the page is picked up as dirty, the dirty
5061          * bit is reset (pvn_getdirty()). In nfs(3)write(), r_size is
5062          * checked. This will still be the old size. Therefore the page will
5063          * not be written out. When segmap_release() calls fop_putpage(),
5064          * the page will be found to be clean and the write will be dropped.
5065          */
5066         if (rp->r_flags & RMODINPROGRESS) {
5067                 mutex_enter(&rp->r_statelock);
5068                 if ((rp->r_flags & RMODINPROGRESS) &&
5069                     rp->r_modaddr + MAXBSIZE > io_off &&
5070                     rp->r_modaddr < io_off + io_len) {
5071                         page_t *plist;
5072                         /*
5073                          * A write is in progress for this region of the file.
5074                          * If we did not detect RMODINPROGRESS here then this
5075                          * path through nfs_putapage() would eventually go to
5076                          * nfs(3)_bio() and may not write out all of the data
5077                          * in the pages. We end up losing data. So we decide
5078                          * to set the modified bit on each page in the page
5079                          * list and mark the rnode with RDIRTY. This write
5080                          * will be restarted at some later time.
5081                          */
5082                         plist = pp;
5083                         while (plist != NULL) {
5084                                 pp = plist;
5085                                 page_sub(&plist, pp);
5086                                 hat_setmod(pp);
5087                                 page_io_unlock(pp);
5088                                 page_unlock(pp);
5089                         }
5090                         rp->r_flags |= RDIRTY;
5091                         mutex_exit(&rp->r_statelock);
5092                         if (offp)
5093                                 *offp = io_off;
5094                         if (lenp)
5095                                 *lenp = io_len;
5096                         return (0);
5097                 }
5098                 mutex_exit(&rp->r_statelock);
5099         }
5100
5101         if (flags & B_ASYNC) {
5102                 error = nfs_async_putapage(vp, pp, io_off, io_len, flags, cr,
5103                     nfs3_sync_putapage);
5104         } else
5105                 error = nfs3_sync_putapage(vp, pp, io_off, io_len, flags, cr);
5106
5107         if (offp)
5108                 *offp = io_off;
5109         if (lenp)
5110                 *lenp = io_len;
5111         return (error);
5112 }
5113
5114 static int
5115 nfs3_sync_putapage(vnode_t *vp, page_t *pp, uoff_t io_off, size_t io_len,
5116         int flags, cred_t *cr)
5117 {
5118         int error;
5119         rnode_t *rp;
5120
5121         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
5122
5123         flags |= B_WRITE;
5124
5125         error = nfs3_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
5126
5127         rp = VTOR(vp);
5128
5129         if ((error == ENOSPC || error == EDQUOT || error == EFBIG ||
5130             error == EACCES) &&
5131             (flags & (B_INVAL|B_FORCE)) != (B_INVAL|B_FORCE)) {
5132                 if (!(rp->r_flags & ROUTOFSPACE)) {
5133                         mutex_enter(&rp->r_statelock);
5134                         rp->r_flags |= ROUTOFSPACE;
5135                         mutex_exit(&rp->r_statelock);
5136                 }
5137                 flags |= B_ERROR;
5138                 pvn_write_done(pp, flags);
5139                 /*
5140                  * If this was not an async thread, then try again to
5141                  * write out the pages, but this time, also destroy
5142                  * them whether or not the write is successful.  This
5143                  * will prevent memory from filling up with these
5144                  * pages and destroying them is the only alternative
5145                  * if they can't be written out.
5146                  *
5147                  * Don't do this if this is an async thread because
5148                  * when the pages are unlocked in pvn_write_done,
5149                  * some other thread could have come along, locked
5150                  * them, and queued for an async thread.  It would be
5151                  * possible for all of the async threads to be tied
5152                  * up waiting to lock the pages again and they would
5153                  * all already be locked and waiting for an async
5154                  * thread to handle them.  Deadlock.
5155                  */
5156                 if (!(flags & B_ASYNC)) {
5157                         error = nfs3_putpage(vp, io_off, io_len,
5158                             B_INVAL | B_FORCE, cr, NULL);
5159                 }
5160         } else {
5161                 if (error)
5162                         flags |= B_ERROR;
5163                 else if (rp->r_flags & ROUTOFSPACE) {
5164                         mutex_enter(&rp->r_statelock);
5165                         rp->r_flags &= ~ROUTOFSPACE;
5166                         mutex_exit(&rp->r_statelock);
5167                 }
5168                 pvn_write_done(pp, flags);
5169                 if (freemem < desfree)
5170                         (void) nfs3_commit_vp(vp, 0, 0, cr);
5171         }
5172
5173         return (error);
5174 }
5175
5176 /* ARGSUSED */
5177 static int
5178 nfs3_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
5179         size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
5180         cred_t *cr, caller_context_t *ct)
5181 {
5182         struct segvn_crargs vn_a;
5183         int error;
5184         rnode_t *rp;
5185         struct vattr va;
5186
5187         if (nfs_zone() != VTOMI(vp)->mi_zone)
5188                 return (EIO);
5189
5190         if (vp->v_flag & VNOMAP)
5191                 return (ENOSYS);
5192
5193         if (off < 0 || off + len < 0)
5194                 return (ENXIO);
5195
5196         if (vp->v_type != VREG)
5197                 return (ENODEV);
5198
5199         /*
5200          * If there is cached data and if close-to-open consistency
5201          * checking is not turned off and if the file system is not
5202          * mounted readonly, then force an over the wire getattr.
5203          * Otherwise, just invoke nfs3getattr to get a copy of the
5204          * attributes.  The attribute cache will be used unless it
5205          * is timed out and if it is, then an over the wire getattr
5206          * will be issued.
5207          */
5208         va.va_mask = VATTR_ALL;
5209         if (vn_has_cached_data(vp) &&
5210             !(VTOMI(vp)->mi_flags & MI_NOCTO) && !vn_is_readonly(vp))
5211                 error = nfs3_getattr_otw(vp, &va, cr);
5212         else
5213                 error = nfs3getattr(vp, &va, cr);
5214         if (error)
5215                 return (error);
5216
5217         /*
5218          * Check to see if the vnode is currently marked as not cachable.
5219          * This means portions of the file are locked (through fop_frlock).
5220          * In this case the map request must be refused.  We use
5221          * rp->r_lkserlock to avoid a race with concurrent lock requests.
5222          */
5223         rp = VTOR(vp);
5224
5225         /*
5226          * Atomically increment r_inmap after acquiring r_rwlock. The
5227          * idea here is to acquire r_rwlock to block read/write and
5228          * not to protect r_inmap. r_inmap will inform nfs3_read/write()
5229          * that we are in nfs3_map(). Now, r_rwlock is acquired in order
5230          * and we can prevent the deadlock that would have occurred
5231          * when nfs3_addmap() would have acquired it out of order.
5232          *
5233          * Since we are not protecting r_inmap by any lock, we do not
5234          * hold any lock when we decrement it. We atomically decrement
5235          * r_inmap after we release r_lkserlock.
5236          */
5237
5238         if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR(vp)))
5239                 return (EINTR);
5240         atomic_inc_uint(&rp->r_inmap);
5241         nfs_rw_exit(&rp->r_rwlock);
5242
5243         if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp))) {
5244                 atomic_dec_uint(&rp->r_inmap);
5245                 return (EINTR);
5246         }
5247
5248         if (vp->v_flag & VNOCACHE) {
5249                 error = EAGAIN;
5250                 goto done;
5251         }
5252
5253         /*
5254          * Don't allow concurrent locks and mapping if mandatory locking is
5255          * enabled.
5256          */
5257         if ((flk_has_remote_locks(vp) || lm_has_sleep(vp)) &&
5258             MANDLOCK(vp, va.va_mode)) {
5259                 error = EAGAIN;
5260                 goto done;
5261         }
5262
5263         as_rangelock(as);
5264         error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
5265         if (error != 0) {
5266                 as_rangeunlock(as);
5267                 goto done;
5268         }
5269
5270         vn_a.vp = vp;
5271         vn_a.offset = off;
5272         vn_a.type = (flags & MAP_TYPE);
5273         vn_a.prot = (uchar_t)prot;
5274         vn_a.maxprot = (uchar_t)maxprot;
5275         vn_a.flags = (flags & ~MAP_TYPE);
5276         vn_a.cred = cr;
5277         vn_a.amp = NULL;
5278         vn_a.szc = 0;
5279         vn_a.lgrp_mem_policy_flags = 0;
5280
5281         error = as_map(as, *addrp, len, segvn_create, &vn_a);
5282         as_rangeunlock(as);
5283
5284 done:
5285         nfs_rw_exit(&rp->r_lkserlock);
5286         atomic_dec_uint(&rp->r_inmap);
5287         return (error);
5288 }
5289
5290 /* ARGSUSED */
5291 static int
5292 nfs3_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
5293         size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
5294         cred_t *cr, caller_context_t *ct)
5295 {
5296         rnode_t *rp;
5297
5298         if (vp->v_flag & VNOMAP)
5299                 return (ENOSYS);
5300         if (nfs_zone() != VTOMI(vp)->mi_zone)
5301                 return (EIO);
5302
5303         rp = VTOR(vp);
5304         atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len));
5305
5306         return (0);
5307 }
5308
5309 /* ARGSUSED */
5310 static int
5311 nfs3_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
5312         offset_t offset, struct flk_callback *flk_cbp, cred_t *cr,
5313         caller_context_t *ct)
5314 {
5315         netobj lm_fh3;
5316         int rc;
5317         uoff_t start, end;
5318         rnode_t *rp;
5319         int error = 0, intr = INTR(vp);
5320
5321         if (nfs_zone() != VTOMI(vp)->mi_zone)
5322                 return (EIO);
5323         /* check for valid cmd parameter */
5324         if (cmd != F_GETLK && cmd != F_SETLK && cmd != F_SETLKW)
5325                 return (EINVAL);
5326
5327         /* Verify l_type. */
5328         switch (bfp->l_type) {
5329         case F_RDLCK:
5330                 if (cmd != F_GETLK && !(flag & FREAD))
5331                         return (EBADF);
5332                 break;
5333         case F_WRLCK:
5334                 if (cmd != F_GETLK && !(flag & FWRITE))
5335                         return (EBADF);
5336                 break;
5337         case F_UNLCK:
5338                 intr = 0;
5339                 break;
5340
5341         default:
5342                 return (EINVAL);
5343         }
5344
5345         /* check the validity of the lock range */
5346         if (rc = flk_convert_lock_data(vp, bfp, &start, &end, offset))
5347                 return (rc);
5348         if (rc = flk_check_lock_data(start, end, MAXEND))
5349                 return (rc);
5350
5351         /*
5352          * If the filesystem is mounted using local locking, pass the
5353          * request off to the local locking code.
5354          */
5355         if (VTOMI(vp)->mi_flags & MI_LLOCK) {
5356                 if (cmd == F_SETLK || cmd == F_SETLKW) {
5357                         /*
5358                          * For complete safety, we should be holding
5359                          * r_lkserlock.  However, we can't call
5360                          * lm_safelock and then fs_frlock while
5361                          * holding r_lkserlock, so just invoke
5362                          * lm_safelock and expect that this will
5363                          * catch enough of the cases.
5364                          */
5365                         if (!lm_safelock(vp, bfp, cr))
5366                                 return (EAGAIN);
5367                 }
5368                 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
5369         }
5370
5371         rp = VTOR(vp);
5372
5373         /*
5374          * Check whether the given lock request can proceed, given the
5375          * current file mappings.
5376          */
5377         if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, intr))
5378                 return (EINTR);
5379         if (cmd == F_SETLK || cmd == F_SETLKW) {
5380                 if (!lm_safelock(vp, bfp, cr)) {
5381                         rc = EAGAIN;
5382                         goto done;
5383                 }
5384         }
5385
5386         /*
5387          * Flush the cache after waiting for async I/O to finish.  For new
5388          * locks, this is so that the process gets the latest bits from the
5389          * server.  For unlocks, this is so that other clients see the
5390          * latest bits once the file has been unlocked.  If currently dirty
5391          * pages can't be flushed, then don't allow a lock to be set.  But
5392          * allow unlocks to succeed, to avoid having orphan locks on the
5393          * server.
5394          */
5395         if (cmd != F_GETLK) {
5396                 mutex_enter(&rp->r_statelock);
5397                 while (rp->r_count > 0) {
5398                         if (intr) {
5399                                 klwp_t *lwp = ttolwp(curthread);
5400
5401                                 if (lwp != NULL)
5402                                         lwp->lwp_nostop++;
5403                                 if (cv_wait_sig(&rp->r_cv,
5404                                     &rp->r_statelock) == 0) {
5405                                         if (lwp != NULL)
5406                                                 lwp->lwp_nostop--;
5407                                         rc = EINTR;
5408                                         break;
5409                                 }
5410                                 if (lwp != NULL)
5411                                         lwp->lwp_nostop--;
5412                         } else
5413                                 cv_wait(&rp->r_cv, &rp->r_statelock);
5414                 }
5415                 mutex_exit(&rp->r_statelock);
5416                 if (rc != 0)
5417                         goto done;
5418                 error = nfs3_putpage(vp, 0, 0, B_INVAL, cr, ct);
5419                 if (error) {
5420                         if (error == ENOSPC || error == EDQUOT) {
5421                                 mutex_enter(&rp->r_statelock);
5422                                 if (!rp->r_error)
5423                                         rp->r_error = error;
5424                                 mutex_exit(&rp->r_statelock);
5425                         }
5426                         if (bfp->l_type != F_UNLCK) {
5427                                 rc = ENOLCK;
5428                                 goto done;
5429                         }
5430                 }
5431         }
5432
5433         lm_fh3.n_len = VTOFH3(vp)->fh3_length;
5434         lm_fh3.n_bytes = (char *)&(VTOFH3(vp)->fh3_u.data);
5435
5436         /*
5437          * Call the lock manager to do the real work of contacting
5438          * the server and obtaining the lock.
5439          */
5440         rc = lm4_frlock(vp, cmd, bfp, flag, offset, cr, &lm_fh3, flk_cbp);
5441
5442         if (rc == 0)
5443                 nfs_lockcompletion(vp, cmd);
5444
5445 done:
5446         nfs_rw_exit(&rp->r_lkserlock);
5447         return (rc);
5448 }
5449
5450 /*
5451  * Free storage space associated with the specified vnode.  The portion
5452  * to be freed is specified by bfp->l_start and bfp->l_len (already
5453  * normalized to a "whence" of 0).
5454  *
5455  * This is an experimental facility whose continued existence is not
5456  * guaranteed.  Currently, we only support the special case
5457  * of l_len == 0, meaning free to end of file.
5458  */
5459 /* ARGSUSED */
5460 static int
5461 nfs3_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
5462         offset_t offset, cred_t *cr, caller_context_t *ct)
5463 {
5464         int error;
5465
5466         ASSERT(vp->v_type == VREG);
5467         if (cmd != F_FREESP)
5468                 return (EINVAL);
5469         if (nfs_zone() != VTOMI(vp)->mi_zone)
5470                 return (EIO);
5471
5472         error = convoff(vp, bfp, 0, offset);
5473         if (!error) {
5474                 ASSERT(bfp->l_start >= 0);
5475                 if (bfp->l_len == 0) {
5476                         struct vattr va;
5477
5478                         /*
5479                          * ftruncate should not change the ctime and
5480                          * mtime if we truncate the file to its
5481                          * previous size.
5482                          */
5483                         va.va_mask = VATTR_SIZE;
5484                         error = nfs3getattr(vp, &va, cr);
5485                         if (error || va.va_size == bfp->l_start)
5486                                 return (error);
5487                         va.va_mask = VATTR_SIZE;
5488                         va.va_size = bfp->l_start;
5489                         error = nfs3setattr(vp, &va, 0, cr);
5490
5491                         if (error == 0 && bfp->l_start == 0)
5492                                 vnevent_truncate(vp, ct);
5493                 } else
5494                         error = EINVAL;
5495         }
5496
5497         return (error);
5498 }
5499
5500 /* ARGSUSED */
5501 static int
5502 nfs3_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct)
5503 {
5504
5505         return (EINVAL);
5506 }
5507
5508 /*
5509  * Setup and add an address space callback to do the work of the delmap call.
5510  * The callback will (and must be) deleted in the actual callback function.
5511  *
5512  * This is done in order to take care of the problem that we have with holding
5513  * the address space's a_lock for a long period of time (e.g. if the NFS server
5514  * is down).  Callbacks will be executed in the address space code while the
5515  * a_lock is not held.  Holding the address space's a_lock causes things such
5516  * as ps and fork to hang because they are trying to acquire this lock as well.
5517  */
5518 /* ARGSUSED */
5519 static int
5520 nfs3_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
5521         size_t len, uint_t prot, uint_t maxprot, uint_t flags,
5522         cred_t *cr, caller_context_t *ct)
5523 {
5524         int                     caller_found;
5525         int                     error;
5526         rnode_t                 *rp;
5527         nfs_delmap_args_t       *dmapp;
5528         nfs_delmapcall_t        *delmap_call;
5529
5530         if (vp->v_flag & VNOMAP)
5531                 return (ENOSYS);
5532         /*
5533          * A process may not change zones if it has NFS pages mmap'ed
5534          * in, so we can't legitimately get here from the wrong zone.
5535          */
5536         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
5537
5538         rp = VTOR(vp);
5539
5540         /*
5541          * The way that the address space of this process deletes its mapping
5542          * of this file is via the following call chains:
5543          * - as_free()->segop_unmap()/segvn_unmap()->fop_delmap()/nfs3_delmap()
5544          * - as_unmap()->segop_unmap()/segvn_unmap()->fop_delmap()/nfs3_delmap()
5545          *
5546          * With the use of address space callbacks we are allowed to drop the
5547          * address space lock, a_lock, while executing the NFS operations that
5548          * need to go over the wire.  Returning EAGAIN to the caller of this
5549          * function is what drives the execution of the callback that we add
5550          * below.  The callback will be executed by the address space code
5551          * after dropping the a_lock.  When the callback is finished, since
5552          * we dropped the a_lock, it must be re-acquired and segvn_unmap()
5553          * is called again on the same segment to finish the rest of the work
5554          * that needs to happen during unmapping.
5555          *
5556          * This action of calling back into the segment driver causes
5557          * nfs3_delmap() to get called again, but since the callback was
5558          * already executed at this point, it already did the work and there
5559          * is nothing left for us to do.
5560          *
5561          * To Summarize:
5562          * - The first time nfs3_delmap is called by the current thread is when
5563          * we add the caller associated with this delmap to the delmap caller
5564          * list, add the callback, and return EAGAIN.
5565          * - The second time in this call chain when nfs3_delmap is called we
5566          * will find this caller in the delmap caller list and realize there
5567          * is no more work to do thus removing this caller from the list and
5568          * returning the error that was set in the callback execution.
5569          */
5570         caller_found = nfs_find_and_delete_delmapcall(rp, &error);
5571         if (caller_found) {
5572                 /*
5573                  * 'error' is from the actual delmap operations.  To avoid
5574                  * hangs, we need to handle the return of EAGAIN differently
5575                  * since this is what drives the callback execution.
5576                  * In this case, we don't want to return EAGAIN and do the
5577                  * callback execution because there are none to execute.
5578                  */
5579                 if (error == EAGAIN)
5580                         return (0);
5581                 else
5582                         return (error);
5583         }
5584
5585         /* current caller was not in the list */
5586         delmap_call = nfs_init_delmapcall();
5587
5588         mutex_enter(&rp->r_statelock);
5589         list_insert_tail(&rp->r_indelmap, delmap_call);
5590         mutex_exit(&rp->r_statelock);
5591
5592         dmapp = kmem_alloc(sizeof (nfs_delmap_args_t), KM_SLEEP);
5593
5594         dmapp->vp = vp;
5595         dmapp->off = off;
5596         dmapp->addr = addr;
5597         dmapp->len = len;
5598         dmapp->prot = prot;
5599         dmapp->maxprot = maxprot;
5600         dmapp->flags = flags;
5601         dmapp->cr = cr;
5602         dmapp->caller = delmap_call;
5603
5604         error = as_add_callback(as, nfs3_delmap_callback, dmapp,
5605             AS_UNMAP_EVENT, addr, len, KM_SLEEP);
5606
5607         return (error ? error : EAGAIN);
5608 }
5609
5610 /*
5611  * Remove some pages from an mmap'd vnode.  Just update the
5612  * count of pages.  If doing close-to-open, then flush and
5613  * commit all of the pages associated with this file.
5614  * Otherwise, start an asynchronous page flush to write out
5615  * any dirty pages.  This will also associate a credential
5616  * with the rnode which can be used to write the pages.
5617  */
5618 /* ARGSUSED */
5619 static void
5620 nfs3_delmap_callback(struct as *as, void *arg, uint_t event)
5621 {
5622         int                     error;
5623         rnode_t                 *rp;
5624         mntinfo_t               *mi;
5625         nfs_delmap_args_t       *dmapp = (nfs_delmap_args_t *)arg;
5626
5627         rp = VTOR(dmapp->vp);
5628         mi = VTOMI(dmapp->vp);
5629
5630         atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(dmapp->len));
5631         ASSERT(rp->r_mapcnt >= 0);
5632
5633         /*
5634          * Initiate a page flush and potential commit if there are
5635          * pages, the file system was not mounted readonly, the segment
5636          * was mapped shared, and the pages themselves were writeable.
5637          */
5638         if (vn_has_cached_data(dmapp->vp) && !vn_is_readonly(dmapp->vp) &&
5639             dmapp->flags == MAP_SHARED && (dmapp->maxprot & PROT_WRITE)) {
5640                 mutex_enter(&rp->r_statelock);
5641                 rp->r_flags |= RDIRTY;
5642                 mutex_exit(&rp->r_statelock);
5643                 /*
5644                  * If this is a cross-zone access a sync putpage won't work, so
5645                  * the best we can do is try an async putpage.  That seems
5646                  * better than something more draconian such as discarding the
5647                  * dirty pages.
5648                  */
5649                 if ((mi->mi_flags & MI_NOCTO) ||
5650                     nfs_zone() != mi->mi_zone)
5651                         error = nfs3_putpage(dmapp->vp, dmapp->off, dmapp->len,
5652                             B_ASYNC, dmapp->cr, NULL);
5653                 else
5654                         error = nfs3_putpage_commit(dmapp->vp, dmapp->off,
5655                             dmapp->len, dmapp->cr);
5656                 if (!error) {
5657                         mutex_enter(&rp->r_statelock);
5658                         error = rp->r_error;
5659                         rp->r_error = 0;
5660                         mutex_exit(&rp->r_statelock);
5661                 }
5662         } else
5663                 error = 0;
5664
5665         if ((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO))
5666                 (void) nfs3_putpage(dmapp->vp, dmapp->off, dmapp->len,
5667                     B_INVAL, dmapp->cr, NULL);
5668
5669         dmapp->caller->error = error;
5670         (void) as_delete_callback(as, arg);
5671         kmem_free(dmapp, sizeof (nfs_delmap_args_t));
5672 }
5673
5674 static int nfs3_pathconf_disable_cache = 0;
5675
5676 #ifdef DEBUG
5677 static int nfs3_pathconf_cache_hits = 0;
5678 static int nfs3_pathconf_cache_misses = 0;
5679 #endif
5680
5681 /* ARGSUSED */
5682 static int
5683 nfs3_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
5684         caller_context_t *ct)
5685 {
5686         int error;
5687         PATHCONF3args args;
5688         PATHCONF3res res;
5689         int douprintf;
5690         failinfo_t fi;
5691         rnode_t *rp;
5692         hrtime_t t;
5693
5694         if (nfs_zone() != VTOMI(vp)->mi_zone)
5695                 return (EIO);
5696         /*
5697          * Large file spec - need to base answer on info stored
5698          * on original FSINFO response.
5699          */
5700         if (cmd == _PC_FILESIZEBITS) {
5701                 unsigned long long ll;
5702                 long l = 1;
5703
5704                 ll = VTOMI(vp)->mi_maxfilesize;
5705
5706                 if (ll == 0) {
5707                         *valp = 0;
5708                         return (0);
5709                 }
5710
5711                 if (ll & 0xffffffff00000000) {
5712                         l += 32; ll >>= 32;
5713                 }
5714                 if (ll & 0xffff0000) {
5715                         l += 16; ll >>= 16;
5716                 }
5717                 if (ll & 0xff00) {
5718                         l += 8; ll >>= 8;
5719                 }
5720                 if (ll & 0xf0) {
5721                         l += 4; ll >>= 4;
5722                 }
5723                 if (ll & 0xc) {
5724                         l += 2; ll >>= 2;
5725                 }
5726                 if (ll & 0x2)
5727                         l += 2;
5728                 else if (ll & 0x1)
5729                         l += 1;
5730                 *valp = l;
5731                 return (0);
5732         }
5733
5734         if (cmd == _PC_ACL_ENABLED) {
5735                 *valp = _ACL_ACLENT_ENABLED;
5736                 return (0);
5737         }
5738
5739         if (cmd == _PC_XATTR_EXISTS) {
5740                 error = 0;
5741                 *valp = 0;
5742                 if (vp->v_vfsp->vfs_flag & VFS_XATTR) {
5743                         vnode_t *avp;
5744                         rnode_t *rp;
5745                         int error = 0;
5746                         mntinfo_t *mi = VTOMI(vp);
5747
5748                         if (!(mi->mi_flags & MI_EXTATTR))
5749                                 return (0);
5750
5751                         rp = VTOR(vp);
5752                         if (nfs_rw_enter_sig(&rp->r_rwlock, RW_READER,
5753                             INTR(vp)))
5754                                 return (EINTR);
5755
5756                         error = nfs3lookup_dnlc(vp, XATTR_DIR_NAME, &avp, cr);
5757                         if (error || avp == NULL)
5758                                 error = acl_getxattrdir3(vp, &avp, 0, cr, 0);
5759
5760                         nfs_rw_exit(&rp->r_rwlock);
5761
5762                         if (error == 0 && avp != NULL) {
5763                                 error = do_xattr_exists_check(avp, valp, cr);
5764                                 VN_RELE(avp);
5765                         } else if (error == ENOENT) {
5766                                 error = 0;
5767                                 *valp = 0;
5768                         }
5769                 }
5770                 return (error);
5771         }
5772
5773         rp = VTOR(vp);
5774         if (rp->r_pathconf != NULL) {
5775                 mutex_enter(&rp->r_statelock);
5776                 if (rp->r_pathconf != NULL && nfs3_pathconf_disable_cache) {
5777                         kmem_free(rp->r_pathconf, sizeof (*rp->r_pathconf));
5778                         rp->r_pathconf = NULL;
5779                 }
5780                 if (rp->r_pathconf != NULL) {
5781                         error = 0;
5782                         switch (cmd) {
5783                         case _PC_LINK_MAX:
5784                                 *valp = rp->r_pathconf->link_max;
5785                                 break;
5786                         case _PC_NAME_MAX:
5787                                 *valp = rp->r_pathconf->name_max;
5788                                 break;
5789                         case _PC_PATH_MAX:
5790                         case _PC_SYMLINK_MAX:
5791                                 *valp = MAXPATHLEN;
5792                                 break;
5793                         case _PC_CHOWN_RESTRICTED:
5794                                 *valp = rp->r_pathconf->chown_restricted;
5795                                 break;
5796                         case _PC_NO_TRUNC:
5797                                 *valp = rp->r_pathconf->no_trunc;
5798                                 break;
5799                         default:
5800                                 error = EINVAL;
5801                                 break;
5802                         }
5803                         mutex_exit(&rp->r_statelock);
5804 #ifdef DEBUG
5805                         nfs3_pathconf_cache_hits++;
5806 #endif
5807                         return (error);
5808                 }
5809                 mutex_exit(&rp->r_statelock);
5810         }
5811 #ifdef DEBUG
5812         nfs3_pathconf_cache_misses++;
5813 #endif
5814
5815         args.object = *VTOFH3(vp);
5816         fi.vp = vp;
5817         fi.fhp = (caddr_t)&args.object;
5818         fi.copyproc = nfs3copyfh;
5819         fi.lookupproc = nfs3lookup;
5820         fi.xattrdirproc = acl_getxattrdir3;
5821
5822         douprintf = 1;
5823
5824         t = gethrtime();
5825
5826         error = rfs3call(VTOMI(vp), NFSPROC3_PATHCONF,
5827             xdr_nfs_fh3, (caddr_t)&args,
5828             xdr_PATHCONF3res, (caddr_t)&res, cr,
5829             &douprintf, &res.status, 0, &fi);
5830
5831         if (error)
5832                 return (error);
5833
5834         error = geterrno3(res.status);
5835
5836         if (!error) {
5837                 nfs3_cache_post_op_attr(vp, &res.resok.obj_attributes, t, cr);
5838                 if (!nfs3_pathconf_disable_cache) {
5839                         mutex_enter(&rp->r_statelock);
5840                         if (rp->r_pathconf == NULL) {
5841                                 rp->r_pathconf = kmem_alloc(
5842                                     sizeof (*rp->r_pathconf), KM_NOSLEEP);
5843                                 if (rp->r_pathconf != NULL)
5844                                         *rp->r_pathconf = res.resok.info;
5845                         }
5846                         mutex_exit(&rp->r_statelock);
5847                 }
5848                 switch (cmd) {
5849                 case _PC_LINK_MAX:
5850                         *valp = res.resok.info.link_max;
5851                         break;
5852                 case _PC_NAME_MAX:
5853                         *valp = res.resok.info.name_max;
5854                         break;
5855                 case _PC_PATH_MAX:
5856                 case _PC_SYMLINK_MAX:
5857                         *valp = MAXPATHLEN;
5858                         break;
5859                 case _PC_CHOWN_RESTRICTED:
5860                         *valp = res.resok.info.chown_restricted;
5861                         break;
5862                 case _PC_NO_TRUNC:
5863                         *valp = res.resok.info.no_trunc;
5864                         break;
5865                 default:
5866                         return (EINVAL);
5867                 }
5868         } else {
5869                 nfs3_cache_post_op_attr(vp, &res.resfail.obj_attributes, t, cr);
5870                 PURGE_STALE_FH(error, vp, cr);
5871         }
5872
5873         return (error);
5874 }
5875
5876 /*
5877  * Called by async thread to do synchronous pageio. Do the i/o, wait
5878  * for it to complete, and cleanup the page list when done.
5879  */
5880 static int
5881 nfs3_sync_pageio(vnode_t *vp, page_t *pp, uoff_t io_off, size_t io_len,
5882         int flags, cred_t *cr)
5883 {
5884         int error;
5885
5886         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
5887         error = nfs3_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
5888         if (flags & B_READ)
5889                 pvn_read_done(pp, (error ? B_ERROR : 0) | flags);
5890         else
5891                 pvn_write_done(pp, (error ? B_ERROR : 0) | flags);
5892         return (error);
5893 }
5894
5895 /* ARGSUSED */
5896 static int
5897 nfs3_pageio(vnode_t *vp, page_t *pp, uoff_t io_off, size_t io_len,
5898         int flags, cred_t *cr, caller_context_t *ct)
5899 {
5900         int error;
5901         rnode_t *rp;
5902
5903         if (pp == NULL)
5904                 return (EINVAL);
5905         if (!(flags & B_ASYNC) && nfs_zone() != VTOMI(vp)->mi_zone)
5906                 return (EIO);
5907
5908         rp = VTOR(vp);
5909         mutex_enter(&rp->r_statelock);
5910         rp->r_count++;
5911         mutex_exit(&rp->r_statelock);
5912
5913         if (flags & B_ASYNC) {
5914                 error = nfs_async_pageio(vp, pp, io_off, io_len, flags, cr,
5915                     nfs3_sync_pageio);
5916         } else
5917                 error = nfs3_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
5918         mutex_enter(&rp->r_statelock);
5919         rp->r_count--;
5920         cv_broadcast(&rp->r_cv);
5921         mutex_exit(&rp->r_statelock);
5922         return (error);
5923 }
5924
5925 /* ARGSUSED */
5926 static void
5927 nfs3_dispose(vnode_t *vp, page_t *pp, int fl, int dn, cred_t *cr,
5928         caller_context_t *ct)
5929 {
5930         int error;
5931         rnode_t *rp;
5932         page_t *plist;
5933         page_t *pptr;
5934         offset3 offset;
5935         count3 len;
5936         k_sigset_t smask;
5937
5938         /*
5939          * We should get called with fl equal to either B_FREE or
5940          * B_INVAL.  Any other value is illegal.
5941          *
5942          * The page that we are either supposed to free or destroy
5943          * should be exclusive locked and its io lock should not
5944          * be held.
5945          */
5946         ASSERT(fl == B_FREE || fl == B_INVAL);
5947         ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr);
5948         rp = VTOR(vp);
5949
5950         /*
5951          * If the page doesn't need to be committed or we shouldn't
5952          * even bother attempting to commit it, then just make sure
5953          * that the p_fsdata byte is clear and then either free or
5954          * destroy the page as appropriate.
5955          */
5956         if (pp->p_fsdata == C_NOCOMMIT || (rp->r_flags & RSTALE)) {
5957                 pp->p_fsdata = C_NOCOMMIT;
5958                 if (fl == B_FREE)
5959                         page_free(pp, dn);
5960                 else
5961                         page_destroy(pp, dn);
5962                 return;
5963         }
5964
5965         /*
5966          * If there is a page invalidation operation going on, then
5967          * if this is one of the pages being destroyed, then just
5968          * clear the p_fsdata byte and then either free or destroy
5969          * the page as appropriate.
5970          */
5971         mutex_enter(&rp->r_statelock);
5972         if ((rp->r_flags & RTRUNCATE) && pp->p_offset >= rp->r_truncaddr) {
5973                 mutex_exit(&rp->r_statelock);
5974                 pp->p_fsdata = C_NOCOMMIT;
5975                 if (fl == B_FREE)
5976                         page_free(pp, dn);
5977                 else
5978                         page_destroy(pp, dn);
5979                 return;
5980         }
5981
5982         /*
5983          * If we are freeing this page and someone else is already
5984          * waiting to do a commit, then just unlock the page and
5985          * return.  That other thread will take care of commiting
5986          * this page.  The page can be freed sometime after the
5987          * commit has finished.  Otherwise, if the page is marked
5988          * as delay commit, then we may be getting called from
5989          * pvn_write_done, one page at a time.   This could result
5990          * in one commit per page, so we end up doing lots of small
5991          * commits instead of fewer larger commits.  This is bad,
5992          * we want do as few commits as possible.
5993          */
5994         if (fl == B_FREE) {
5995                 if (rp->r_flags & RCOMMITWAIT) {
5996                         page_unlock(pp);
5997                         mutex_exit(&rp->r_statelock);
5998                         return;
5999                 }
6000                 if (pp->p_fsdata == C_DELAYCOMMIT) {
6001                         pp->p_fsdata = C_COMMIT;
6002                         page_unlock(pp);
6003                         mutex_exit(&rp->r_statelock);
6004                         return;
6005                 }
6006         }
6007
6008         /*
6009          * Check to see if there is a signal which would prevent an
6010          * attempt to commit the pages from being successful.  If so,
6011          * then don't bother with all of the work to gather pages and
6012          * generate the unsuccessful RPC.  Just return from here and
6013          * let the page be committed at some later time.
6014          */
6015         sigintr(&smask, VTOMI(vp)->mi_flags & MI_INT);
6016         if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) {
6017                 sigunintr(&smask);
6018                 page_unlock(pp);
6019                 mutex_exit(&rp->r_statelock);
6020                 return;
6021         }
6022         sigunintr(&smask);
6023
6024         /*
6025          * We are starting to need to commit pages, so let's try
6026          * to commit as many as possible at once to reduce the
6027          * overhead.
6028          *
6029          * Set the `commit inprogress' state bit.  We must
6030          * first wait until any current one finishes.  Then
6031          * we initialize the c_pages list with this page.
6032          */
6033         while (rp->r_flags & RCOMMIT) {
6034                 rp->r_flags |= RCOMMITWAIT;
6035                 cv_wait(&rp->r_commit.c_cv, &rp->r_statelock);
6036                 rp->r_flags &= ~RCOMMITWAIT;
6037         }
6038         rp->r_flags |= RCOMMIT;
6039         mutex_exit(&rp->r_statelock);
6040         ASSERT(rp->r_commit.c_pages == NULL);
6041         rp->r_commit.c_pages = pp;
6042         rp->r_commit.c_commbase = (offset3)pp->p_offset;
6043         rp->r_commit.c_commlen = PAGESIZE;
6044
6045         /*
6046          * Gather together all other pages which can be committed.
6047          * They will all be chained off r_commit.c_pages.
6048          */
6049         nfs3_get_commit(vp);
6050
6051         /*
6052          * Clear the `commit inprogress' status and disconnect
6053          * the list of pages to be committed from the rnode.
6054          * At this same time, we also save the starting offset
6055          * and length of data to be committed on the server.
6056          */
6057         plist = rp->r_commit.c_pages;
6058         rp->r_commit.c_pages = NULL;
6059         offset = rp->r_commit.c_commbase;
6060         len = rp->r_commit.c_commlen;
6061         mutex_enter(&rp->r_statelock);
6062         rp->r_flags &= ~RCOMMIT;
6063         cv_broadcast(&rp->r_commit.c_cv);
6064         mutex_exit(&rp->r_statelock);
6065
6066         if (curproc == proc_pageout || curproc == proc_fsflush ||
6067             nfs_zone() != VTOMI(vp)->mi_zone) {
6068                 nfs_async_commit(vp, plist, offset, len, cr, nfs3_async_commit);
6069                 return;
6070         }
6071
6072         /*
6073          * Actually generate the COMMIT3 over the wire operation.
6074          */
6075         error = nfs3_commit(vp, offset, len, cr);
6076
6077         /*
6078          * If we got an error during the commit, just unlock all
6079          * of the pages.  The pages will get retransmitted to the
6080          * server during a putpage operation.
6081          */
6082         if (error) {
6083                 while (plist != NULL) {
6084                         pptr = plist;
6085                         page_sub(&plist, pptr);
6086                         page_unlock(pptr);
6087                 }
6088                 return;
6089         }
6090
6091         /*
6092          * We've tried as hard as we can to commit the data to stable
6093          * storage on the server.  We release the rest of the pages
6094          * and clear the commit required state.  They will be put
6095          * onto the tail of the cachelist if they are nolonger
6096          * mapped.
6097          */
6098         while (plist != pp) {
6099                 pptr = plist;
6100                 page_sub(&plist, pptr);
6101                 pptr->p_fsdata = C_NOCOMMIT;
6102                 (void) page_release(pptr, 1);
6103         }
6104
6105         /*
6106          * It is possible that nfs3_commit didn't return error but
6107          * some other thread has modified the page we are going
6108          * to free/destroy.
6109          *    In this case we need to rewrite the page. Do an explicit check
6110          * before attempting to free/destroy the page. If modified, needs to
6111          * be rewritten so unlock the page and return.
6112          */
6113         if (hat_ismod(pp)) {
6114                 pp->p_fsdata = C_NOCOMMIT;
6115                 page_unlock(pp);
6116                 return;
6117         }
6118
6119         /*
6120          * Now, as appropriate, either free or destroy the page
6121          * that we were called with.
6122          */
6123         pp->p_fsdata = C_NOCOMMIT;
6124         if (fl == B_FREE)
6125                 page_free(pp, dn);
6126         else
6127                 page_destroy(pp, dn);
6128 }
6129
6130 static int
6131 nfs3_commit(vnode_t *vp, offset3 offset, count3 count, cred_t *cr)
6132 {
6133         int error;
6134         rnode_t *rp;
6135         COMMIT3args args;
6136         COMMIT3res res;
6137         int douprintf;
6138         cred_t *cred;
6139
6140         rp = VTOR(vp);
6141         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
6142
6143         mutex_enter(&rp->r_statelock);
6144         if (rp->r_cred != NULL) {
6145                 cred = rp->r_cred;
6146                 crhold(cred);
6147         } else {
6148                 rp->r_cred = cr;
6149                 crhold(cr);
6150                 cred = cr;
6151                 crhold(cred);
6152         }
6153         mutex_exit(&rp->r_statelock);
6154
6155         args.file = *VTOFH3(vp);
6156         args.offset = offset;
6157         args.count = count;
6158
6159 doitagain:
6160         douprintf = 1;
6161         error = rfs3call(VTOMI(vp), NFSPROC3_COMMIT,
6162             xdr_COMMIT3args, (caddr_t)&args,
6163             xdr_COMMIT3res, (caddr_t)&res, cred,
6164             &douprintf, &res.status, 0, NULL);
6165
6166         crfree(cred);
6167
6168         if (error)
6169                 return (error);
6170
6171         error = geterrno3(res.status);
6172         if (!error) {
6173                 ASSERT(rp->r_flags & RHAVEVERF);
6174                 mutex_enter(&rp->r_statelock);
6175                 if (rp->r_verf == res.resok.verf) {
6176                         mutex_exit(&rp->r_statelock);
6177                         return (0);
6178                 }
6179                 nfs3_set_mod(vp);
6180                 rp->r_verf = res.resok.verf;
6181                 mutex_exit(&rp->r_statelock);
6182                 error = NFS_VERF_MISMATCH;
6183         } else {
6184                 if (error == EACCES) {
6185                         mutex_enter(&rp->r_statelock);
6186                         if (cred != cr) {
6187                                 if (rp->r_cred != NULL)
6188                                         crfree(rp->r_cred);
6189                                 rp->r_cred = cr;
6190                                 crhold(cr);
6191                                 cred = cr;
6192                                 crhold(cred);
6193                                 mutex_exit(&rp->r_statelock);
6194                                 goto doitagain;
6195                         }
6196                         mutex_exit(&rp->r_statelock);
6197                 }
6198                 /*
6199                  * Can't do a PURGE_STALE_FH here because this
6200                  * can cause a deadlock.  nfs3_commit can
6201                  * be called from nfs3_dispose which can be called
6202                  * indirectly via pvn_vplist_dirty.  PURGE_STALE_FH
6203                  * can call back to pvn_vplist_dirty.
6204                  */
6205                 if (error == ESTALE) {
6206                         mutex_enter(&rp->r_statelock);
6207                         rp->r_flags |= RSTALE;
6208                         if (!rp->r_error)
6209                                 rp->r_error = error;
6210                         mutex_exit(&rp->r_statelock);
6211                         PURGE_ATTRCACHE(vp);
6212                 } else {
6213                         mutex_enter(&rp->r_statelock);
6214                         if (!rp->r_error)
6215                                 rp->r_error = error;
6216                         mutex_exit(&rp->r_statelock);
6217                 }
6218         }
6219
6220         return (error);
6221 }
6222
6223 static void
6224 nfs3_set_mod(vnode_t *vp)
6225 {
6226         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
6227
6228         pvn_vplist_setdirty(vp, nfs_setmod_check);
6229 }
6230
6231 /*
6232  * This routine is used to gather together a page list of the pages
6233  * which are to be committed on the server.  This routine must not
6234  * be called if the calling thread holds any locked pages.
6235  *
6236  * The calling thread must have set RCOMMIT.  This bit is used to
6237  * serialize access to the commit structure in the rnode.  As long
6238  * as the thread has set RCOMMIT, then it can manipulate the commit
6239  * structure without requiring any other locks.
6240  */
6241 static void
6242 nfs3_get_commit(vnode_t *vp)
6243 {
6244         rnode_t *rp;
6245         page_t *pp;
6246
6247         rp = VTOR(vp);
6248
6249         ASSERT(rp->r_flags & RCOMMIT);
6250
6251         vmobject_lock(&vp->v_object);
6252
6253         /*
6254          * Step through all of the pages associated with this vnode
6255          * looking for pages which need to be committed.
6256          */
6257         for (pp = vmobject_get_head(&vp->v_object);
6258              pp != NULL;
6259              pp = vmobject_get_next(&vp->v_object, pp)) {
6260                 /* Skip marker pages. */
6261                 if (PP_ISPVN_TAG(pp))
6262                         continue;
6263
6264                 /*
6265                  * If this page does not need to be committed or is
6266                  * modified, then just skip it.
6267                  */
6268                 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp))
6269                         continue;
6270
6271                 /*
6272                  * Attempt to lock the page.  If we can't, then
6273                  * someone else is messing with it and we will
6274                  * just skip it.
6275                  */
6276                 if (!page_trylock(pp, SE_EXCL))
6277                         continue;
6278
6279                 /*
6280                  * If this page does not need to be committed or is
6281                  * modified, then just skip it.  Recheck now that
6282                  * the page is locked.
6283                  */
6284                 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) {
6285                         page_unlock(pp);
6286                         continue;
6287                 }
6288
6289                 if (PP_ISFREE(pp)) {
6290                         cmn_err(CE_PANIC, "nfs3_get_commit: %p is free",
6291                             (void *)pp);
6292                 }
6293
6294                 /*
6295                  * The page needs to be committed and we locked it.
6296                  * Update the base and length parameters and add it
6297                  * to r_pages.
6298                  */
6299                 if (rp->r_commit.c_pages == NULL) {
6300                         rp->r_commit.c_commbase = (offset3)pp->p_offset;
6301                         rp->r_commit.c_commlen = PAGESIZE;
6302                 } else if (pp->p_offset < rp->r_commit.c_commbase) {
6303                         rp->r_commit.c_commlen = rp->r_commit.c_commbase -
6304                             (offset3)pp->p_offset + rp->r_commit.c_commlen;
6305                         rp->r_commit.c_commbase = (offset3)pp->p_offset;
6306                 } else if ((rp->r_commit.c_commbase + rp->r_commit.c_commlen)
6307                     <= pp->p_offset) {
6308                         rp->r_commit.c_commlen = (offset3)pp->p_offset -
6309                             rp->r_commit.c_commbase + PAGESIZE;
6310                 }
6311                 page_add(&rp->r_commit.c_pages, pp);
6312         }
6313
6314         vmobject_unlock(&vp->v_object);
6315 }
6316
6317 /*
6318  * This routine is used to gather together a page list of the pages
6319  * which are to be committed on the server.  This routine must not
6320  * be called if the calling thread holds any locked pages.
6321  *
6322  * The calling thread must have set RCOMMIT.  This bit is used to
6323  * serialize access to the commit structure in the rnode.  As long
6324  * as the thread has set RCOMMIT, then it can manipulate the commit
6325  * structure without requiring any other locks.
6326  */
6327 static void
6328 nfs3_get_commit_range(vnode_t *vp, uoff_t soff, size_t len)
6329 {
6330
6331         rnode_t *rp;
6332         page_t *pp;
6333         uoff_t end;
6334         uoff_t off;
6335
6336         ASSERT(len != 0);
6337
6338         rp = VTOR(vp);
6339
6340         ASSERT(rp->r_flags & RCOMMIT);
6341         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
6342
6343         /*
6344          * If there are no pages associated with this vnode, then
6345          * just return.
6346          */
6347         if (!vn_has_cached_data(vp))
6348                 return;
6349
6350         /*
6351          * Calculate the ending offset.
6352          */
6353         end = soff + len;
6354
6355         for (off = soff; off < end; off += PAGESIZE) {
6356                 /*
6357                  * Lookup each page by vp, offset.
6358                  */
6359                 if ((pp = page_lookup_nowait(&vp->v_object, off, SE_EXCL)) == NULL)
6360                         continue;
6361
6362                 /*
6363                  * If this page does not need to be committed or is
6364                  * modified, then just skip it.
6365                  */
6366                 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) {
6367                         page_unlock(pp);
6368                         continue;
6369                 }
6370
6371                 ASSERT(PP_ISFREE(pp) == 0);
6372
6373                 /*
6374                  * The page needs to be committed and we locked it.
6375                  * Update the base and length parameters and add it
6376                  * to r_pages.
6377                  */
6378                 if (rp->r_commit.c_pages == NULL) {
6379                         rp->r_commit.c_commbase = (offset3)pp->p_offset;
6380                         rp->r_commit.c_commlen = PAGESIZE;
6381                 } else {
6382                         rp->r_commit.c_commlen = (offset3)pp->p_offset -
6383                             rp->r_commit.c_commbase + PAGESIZE;
6384                 }
6385                 page_add(&rp->r_commit.c_pages, pp);
6386         }
6387 }
6388
6389 static int
6390 nfs3_putpage_commit(vnode_t *vp, offset_t poff, size_t plen, cred_t *cr)
6391 {
6392         int error;
6393         writeverf3 write_verf;
6394         rnode_t *rp = VTOR(vp);
6395
6396         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
6397         /*
6398          * Flush the data portion of the file and then commit any
6399          * portions which need to be committed.  This may need to
6400          * be done twice if the server has changed state since
6401          * data was last written.  The data will need to be
6402          * rewritten to the server and then a new commit done.
6403          *
6404          * In fact, this may need to be done several times if the
6405          * server is having problems and crashing while we are
6406          * attempting to do this.
6407          */
6408
6409 top:
6410         /*
6411          * Do a flush based on the poff and plen arguments.  This
6412          * will asynchronously write out any modified pages in the
6413          * range specified by (poff, plen).  This starts all of the
6414          * i/o operations which will be waited for in the next
6415          * call to nfs3_putpage
6416          */
6417
6418         mutex_enter(&rp->r_statelock);
6419         write_verf = rp->r_verf;
6420         mutex_exit(&rp->r_statelock);
6421
6422         error = nfs3_putpage(vp, poff, plen, B_ASYNC, cr, NULL);
6423         if (error == EAGAIN)
6424                 error = 0;
6425
6426         /*
6427          * Do a flush based on the poff and plen arguments.  This
6428          * will synchronously write out any modified pages in the
6429          * range specified by (poff, plen) and wait until all of
6430          * the asynchronous i/o's in that range are done as well.
6431          */
6432         if (!error)
6433                 error = nfs3_putpage(vp, poff, plen, 0, cr, NULL);
6434
6435         if (error)
6436                 return (error);
6437
6438         mutex_enter(&rp->r_statelock);
6439         if (rp->r_verf != write_verf) {
6440                 mutex_exit(&rp->r_statelock);
6441                 goto top;
6442         }
6443         mutex_exit(&rp->r_statelock);
6444
6445         /*
6446          * Now commit any pages which might need to be committed.
6447          * If the error, NFS_VERF_MISMATCH, is returned, then
6448          * start over with the flush operation.
6449          */
6450
6451         error = nfs3_commit_vp(vp, poff, plen, cr);
6452
6453         if (error == NFS_VERF_MISMATCH)
6454                 goto top;
6455
6456         return (error);
6457 }
6458
6459 static int
6460 nfs3_commit_vp(vnode_t *vp, uoff_t poff, size_t plen, cred_t *cr)
6461 {
6462         rnode_t *rp;
6463         page_t *plist;
6464         offset3 offset;
6465         count3 len;
6466
6467
6468         rp = VTOR(vp);
6469
6470         if (nfs_zone() != VTOMI(vp)->mi_zone)
6471                 return (EIO);
6472         /*
6473          * Set the `commit inprogress' state bit.  We must
6474          * first wait until any current one finishes.
6475          */
6476         mutex_enter(&rp->r_statelock);
6477         while (rp->r_flags & RCOMMIT) {
6478                 rp->r_flags |= RCOMMITWAIT;
6479                 cv_wait(&rp->r_commit.c_cv, &rp->r_statelock);
6480                 rp->r_flags &= ~RCOMMITWAIT;
6481         }
6482         rp->r_flags |= RCOMMIT;
6483         mutex_exit(&rp->r_statelock);
6484
6485         /*
6486          * Gather together all of the pages which need to be
6487          * committed.
6488          */
6489         if (plen == 0)
6490                 nfs3_get_commit(vp);
6491         else
6492                 nfs3_get_commit_range(vp, poff, plen);
6493
6494         /*
6495          * Clear the `commit inprogress' bit and disconnect the
6496          * page list which was gathered together in nfs3_get_commit.
6497          */
6498         plist = rp->r_commit.c_pages;
6499         rp->r_commit.c_pages = NULL;
6500         offset = rp->r_commit.c_commbase;
6501         len = rp->r_commit.c_commlen;
6502         mutex_enter(&rp->r_statelock);
6503         rp->r_flags &= ~RCOMMIT;
6504         cv_broadcast(&rp->r_commit.c_cv);
6505         mutex_exit(&rp->r_statelock);
6506
6507         /*
6508          * If any pages need to be committed, commit them and
6509          * then unlock them so that they can be freed some
6510          * time later.
6511          */
6512         if (plist != NULL) {
6513                 /*
6514                  * No error occurred during the flush portion
6515                  * of this operation, so now attempt to commit
6516                  * the data to stable storage on the server.
6517                  *
6518                  * This will unlock all of the pages on the list.
6519                  */
6520                 return (nfs3_sync_commit(vp, plist, offset, len, cr));
6521         }
6522         return (0);
6523 }
6524
6525 static int
6526 nfs3_sync_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
6527         cred_t *cr)
6528 {
6529         int error;
6530         page_t *pp;
6531
6532         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
6533         error = nfs3_commit(vp, offset, count, cr);
6534
6535         /*
6536          * If we got an error, then just unlock all of the pages
6537          * on the list.
6538          */
6539         if (error) {
6540                 while (plist != NULL) {
6541                         pp = plist;
6542                         page_sub(&plist, pp);
6543                         page_unlock(pp);
6544                 }
6545                 return (error);
6546         }
6547         /*
6548          * We've tried as hard as we can to commit the data to stable
6549          * storage on the server.  We just unlock the pages and clear
6550          * the commit required state.  They will get freed later.
6551          */
6552         while (plist != NULL) {
6553                 pp = plist;
6554                 page_sub(&plist, pp);
6555                 pp->p_fsdata = C_NOCOMMIT;
6556                 page_unlock(pp);
6557         }
6558
6559         return (error);
6560 }
6561
6562 static void
6563 nfs3_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
6564         cred_t *cr)
6565 {
6566         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
6567         (void) nfs3_sync_commit(vp, plist, offset, count, cr);
6568 }
6569
6570 /* ARGSUSED */
6571 static int
6572 nfs3_setsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
6573         caller_context_t *ct)
6574 {
6575         int error;
6576         mntinfo_t *mi;
6577
6578         mi = VTOMI(vp);
6579
6580         if (nfs_zone() != mi->mi_zone)
6581                 return (EIO);
6582
6583         if (mi->mi_flags & MI_ACL) {
6584                 error = acl_setacl3(vp, vsecattr, flag, cr);
6585                 if (mi->mi_flags & MI_ACL)
6586                         return (error);
6587         }
6588
6589         return (ENOSYS);
6590 }
6591
6592 /* ARGSUSED */
6593 static int
6594 nfs3_getsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
6595         caller_context_t *ct)
6596 {
6597         int error;
6598         mntinfo_t *mi;
6599
6600         mi = VTOMI(vp);
6601
6602         if (nfs_zone() != mi->mi_zone)
6603                 return (EIO);
6604
6605         if (mi->mi_flags & MI_ACL) {
6606                 error = acl_getacl3(vp, vsecattr, flag, cr);
6607                 if (mi->mi_flags & MI_ACL)
6608                         return (error);
6609         }
6610
6611         return (fs_fab_acl(vp, vsecattr, flag, cr, ct));
6612 }
6613
6614 /* ARGSUSED */
6615 static int
6616 nfs3_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr,
6617         caller_context_t *ct)
6618 {
6619         int error;
6620         struct shrlock nshr;
6621         struct nfs_owner nfs_owner;
6622         netobj lm_fh3;
6623
6624         if (nfs_zone() != VTOMI(vp)->mi_zone)
6625                 return (EIO);
6626
6627         /*
6628          * check for valid cmd parameter
6629          */
6630         if (cmd != F_SHARE && cmd != F_UNSHARE && cmd != F_HASREMOTELOCKS)
6631                 return (EINVAL);
6632
6633         /*
6634          * Check access permissions
6635          */
6636         if (cmd == F_SHARE &&
6637             (((shr->s_access & F_RDACC) && !(flag & FREAD)) ||
6638             ((shr->s_access & F_WRACC) && !(flag & FWRITE))))
6639                 return (EBADF);
6640
6641         /*
6642          * If the filesystem is mounted using local locking, pass the
6643          * request off to the local share code.
6644          */
6645         if (VTOMI(vp)->mi_flags & MI_LLOCK)
6646                 return (fs_shrlock(vp, cmd, shr, flag, cr, ct));
6647
6648         switch (cmd) {
6649         case F_SHARE:
6650         case F_UNSHARE:
6651                 lm_fh3.n_len = VTOFH3(vp)->fh3_length;
6652                 lm_fh3.n_bytes = (char *)&(VTOFH3(vp)->fh3_u.data);
6653
6654                 /*
6655                  * If passed an owner that is too large to fit in an
6656                  * nfs_owner it is likely a recursive call from the
6657                  * lock manager client and pass it straight through.  If
6658                  * it is not a nfs_owner then simply return an error.
6659                  */
6660                 if (shr->s_own_len > sizeof (nfs_owner.lowner)) {
6661                         if (((struct nfs_owner *)shr->s_owner)->magic !=
6662                             NFS_OWNER_MAGIC)
6663                                 return (EINVAL);
6664
6665                         if (error = lm4_shrlock(vp, cmd, shr, flag, &lm_fh3)) {
6666                                 error = set_errno(error);
6667                         }
6668                         return (error);
6669                 }
6670                 /*
6671                  * Remote share reservations owner is a combination of
6672                  * a magic number, hostname, and the local owner
6673                  */
6674                 bzero(&nfs_owner, sizeof (nfs_owner));
6675                 nfs_owner.magic = NFS_OWNER_MAGIC;
6676                 (void) strncpy(nfs_owner.hname, uts_nodename(),
6677                     sizeof (nfs_owner.hname));
6678                 bcopy(shr->s_owner, nfs_owner.lowner, shr->s_own_len);
6679                 nshr.s_access = shr->s_access;
6680                 nshr.s_deny = shr->s_deny;
6681                 nshr.s_sysid = 0;
6682                 nshr.s_pid = ttoproc(curthread)->p_pid;
6683                 nshr.s_own_len = sizeof (nfs_owner);
6684                 nshr.s_owner = (caddr_t)&nfs_owner;
6685
6686                 if (error = lm4_shrlock(vp, cmd, &nshr, flag, &lm_fh3)) {
6687                         error = set_errno(error);
6688                 }
6689
6690                 break;
6691
6692         case F_HASREMOTELOCKS:
6693                 /*
6694                  * NFS client can't store remote locks itself
6695                  */
6696                 shr->s_access = 0;
6697                 error = 0;
6698                 break;
6699
6700         default:
6701                 error = EINVAL;
6702                 break;
6703         }
6704
6705         return (error);
6706 }