kernel/fs/nfs/nfs3_vnops.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25
  26 /*
  27  *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
  28  *      All rights reserved.
  29  */
  30
  31 /*
  32  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  33  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  34  */
  35
  36 #include <sys/param.h>
  37 #include <sys/types.h>
  38 #include <sys/systm.h>
  39 #include <sys/cred.h>
  40 #include <sys/time.h>
  41 #include <sys/vnode.h>
  42 #include <sys/vfs.h>
  43 #include <sys/file.h>
  44 #include <sys/filio.h>
  45 #include <sys/uio.h>
  46 #include <sys/buf.h>
  47 #include <sys/mman.h>
  48 #include <sys/pathname.h>
  49 #include <sys/dirent.h>
  50 #include <sys/debug.h>
  51 #include <sys/vmsystm.h>
  52 #include <sys/fcntl.h>
  53 #include <sys/flock.h>
  54 #include <sys/swap.h>
  55 #include <sys/errno.h>
  56 #include <sys/strsubr.h>
  57 #include <sys/sysmacros.h>
  58 #include <sys/kmem.h>
  59 #include <sys/cmn_err.h>
  60 #include <sys/pathconf.h>
  61 #include <sys/utsname.h>
  62 #include <sys/dnlc.h>
  63 #include <sys/acl.h>
  64 #include <sys/systeminfo.h>
  65 #include <sys/atomic.h>
  66 #include <sys/policy.h>
  67 #include <sys/sdt.h>
  68 #include <sys/zone.h>
  69
  70 #include <rpc/types.h>
  71 #include <rpc/auth.h>
  72 #include <rpc/clnt.h>
  73 #include <rpc/rpc_rdma.h>
  74
  75 #include <nfs/nfs.h>
  76 #include <nfs/nfs_clnt.h>
  77 #include <nfs/rnode.h>
  78 #include <nfs/nfs_acl.h>
  79 #include <nfs/lm.h>
  80
  81 #include <vm/hat.h>
  82 #include <vm/as.h>
  83 #include <vm/page.h>
  84 #include <vm/pvn.h>
  85 #include <vm/seg.h>
  86 #include <vm/seg_map.h>
  87 #include <vm/seg_kpm.h>
  88 #include <vm/seg_vn.h>
  89
  90 #include <sys/fs_subr.h>
  91
  92 #include <sys/ddi.h>
  93
  94 static int      nfs3_rdwrlbn(vnode_t *, page_t *, uoff_t, size_t, int,
  95                         cred_t *);
  96 static int      nfs3write(vnode_t *, caddr_t, uoff_t, int, cred_t *,
  97                         stable_how *);
  98 static int      nfs3read(vnode_t *, caddr_t, offset_t, int, size_t *, cred_t *);
  99 static int      nfs3setattr(vnode_t *, struct vattr *, int, cred_t *);
 100 static int      nfs3_accessx(void *, int, cred_t *);
 101 static int      nfs3lookup_dnlc(vnode_t *, char *, vnode_t **, cred_t *);
 102 static int      nfs3lookup_otw(vnode_t *, char *, vnode_t **, cred_t *, int);
 103 static int      nfs3create(vnode_t *, char *, struct vattr *, enum vcexcl,
 104                         int, vnode_t **, cred_t *, int);
 105 static int      nfs3excl_create_settimes(vnode_t *, struct vattr *, cred_t *);
 106 static int      nfs3mknod(vnode_t *, char *, struct vattr *, enum vcexcl,
 107                         int, vnode_t **, cred_t *);
 108 static int      nfs3rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
 109                         caller_context_t *);
 110 static int      do_nfs3readdir(vnode_t *, rddir_cache *, cred_t *);
 111 static void     nfs3readdir(vnode_t *, rddir_cache *, cred_t *);
 112 static void     nfs3readdirplus(vnode_t *, rddir_cache *, cred_t *);
 113 static int      nfs3_bio(struct buf *, stable_how *, cred_t *);
 114 static int      nfs3_getapage(vnode_t *, uoff_t, size_t, uint_t *,
 115                         page_t *[], size_t, struct seg *, caddr_t,
 116                         enum seg_rw, cred_t *);
 117 static void     nfs3_readahead(vnode_t *, uoff_t, caddr_t, struct seg *,
 118                         cred_t *);
 119 static int      nfs3_sync_putapage(vnode_t *, page_t *, uoff_t, size_t,
 120                         int, cred_t *);
 121 static int      nfs3_sync_pageio(vnode_t *, page_t *, uoff_t, size_t,
 122                         int, cred_t *);
 123 static int      nfs3_commit(vnode_t *, offset3, count3, cred_t *);
 124 static void     nfs3_set_mod(vnode_t *);
 125 static void     nfs3_get_commit(vnode_t *);
 126 static void     nfs3_get_commit_range(vnode_t *, uoff_t, size_t);
 127 static int      nfs3_putpage_commit(vnode_t *, offset_t, size_t, cred_t *);
 128 static int      nfs3_commit_vp(vnode_t *, uoff_t, size_t,  cred_t *);
 129 static int      nfs3_sync_commit(vnode_t *, page_t *, offset3, count3,
 130                         cred_t *);
 131 static void     nfs3_async_commit(vnode_t *, page_t *, offset3, count3,
 132                         cred_t *);
 133 static void     nfs3_delmap_callback(struct as *, void *, uint_t);
 134
 135 /*
 136  * Error flags used to pass information about certain special errors
 137  * which need to be handled specially.
 138  */
 139 #define NFS_EOF                 -98
 140 #define NFS_VERF_MISMATCH       -97
 141
 142 /* ALIGN64 aligns the given buffer and adjust buffer size to 64 bit */
 143 #define ALIGN64(x, ptr, sz)                                             \
 144         x = ((uintptr_t)(ptr)) & (sizeof (uint64_t) - 1);               \
 145         if (x) {                                                        \
 146                 x = sizeof (uint64_t) - (x);                            \
 147                 sz -= (x);                                              \
 148                 ptr += (x);                                             \
 149         }
 150
/*
 * These are the vnode ops routines which implement the vnode interface to
 * the networked file system.  These routines just take their parameters,
 * make them look networkish by putting the right info into interface structs,
 * and then call the appropriate remote routine(s) to do the work.
 *
 * Note on directory name lookup caching:  If we detect a stale fhandle,
 * we purge the directory cache relative to that vnode.  This way, the
 * user won't get burned by the cache repeatedly.  See <nfs/rnode.h> for
 * more details on rnode locking.
 */
 162
 163 static int      nfs3_open(vnode_t **, int, cred_t *, caller_context_t *);
 164 static int      nfs3_close(vnode_t *, int, int, offset_t, cred_t *,
 165                         caller_context_t *);
 166 static int      nfs3_read(vnode_t *, struct uio *, int, cred_t *,
 167                         caller_context_t *);
 168 static int      nfs3_write(vnode_t *, struct uio *, int, cred_t *,
 169                         caller_context_t *);
 170 static int      nfs3_ioctl(vnode_t *, int, intptr_t, int, cred_t *, int *,
 171                         caller_context_t *);
 172 static int      nfs3_getattr(vnode_t *, struct vattr *, int, cred_t *,
 173                         caller_context_t *);
 174 static int      nfs3_setattr(vnode_t *, struct vattr *, int, cred_t *,
 175                         caller_context_t *);
 176 static int      nfs3_access(vnode_t *, int, int, cred_t *, caller_context_t *);
 177 static int      nfs3_readlink(vnode_t *, struct uio *, cred_t *,
 178                         caller_context_t *);
 179 static int      nfs3_fsync(vnode_t *, int, cred_t *, caller_context_t *);
 180 static void     nfs3_inactive(vnode_t *, cred_t *, caller_context_t *);
 181 static int      nfs3_lookup(vnode_t *, char *, vnode_t **,
 182                         struct pathname *, int, vnode_t *, cred_t *,
 183                         caller_context_t *, int *, pathname_t *);
 184 static int      nfs3_create(vnode_t *, char *, struct vattr *, enum vcexcl,
 185                         int, vnode_t **, cred_t *, int, caller_context_t *,
 186                         vsecattr_t *);
 187 static int      nfs3_remove(vnode_t *, char *, cred_t *, caller_context_t *,
 188                         int);
 189 static int      nfs3_link(vnode_t *, vnode_t *, char *, cred_t *,
 190                         caller_context_t *, int);
 191 static int      nfs3_rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
 192                         caller_context_t *, int);
 193 static int      nfs3_mkdir(vnode_t *, char *, struct vattr *, vnode_t **,
 194                         cred_t *, caller_context_t *, int, vsecattr_t *);
 195 static int      nfs3_rmdir(vnode_t *, char *, vnode_t *, cred_t *,
 196                         caller_context_t *, int);
 197 static int      nfs3_symlink(vnode_t *, char *, struct vattr *, char *,
 198                         cred_t *, caller_context_t *, int);
 199 static int      nfs3_readdir(vnode_t *, struct uio *, cred_t *, int *,
 200                         caller_context_t *, int);
 201 static int      nfs3_fid(vnode_t *, fid_t *, caller_context_t *);
 202 static int      nfs3_rwlock(vnode_t *, int, caller_context_t *);
 203 static void     nfs3_rwunlock(vnode_t *, int, caller_context_t *);
 204 static int      nfs3_seek(vnode_t *, offset_t, offset_t *, caller_context_t *);
 205 static int      nfs3_getpage(vnode_t *, offset_t, size_t, uint_t *,
 206                         page_t *[], size_t, struct seg *, caddr_t,
 207                         enum seg_rw, cred_t *, caller_context_t *);
 208 static int      nfs3_putpage(vnode_t *, offset_t, size_t, int, cred_t *,
 209                         caller_context_t *);
 210 static int      nfs3_map(vnode_t *, offset_t, struct as *, caddr_t *, size_t,
 211                         uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
 212 static int      nfs3_addmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
 213                         uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
 214 static int      nfs3_frlock(vnode_t *, int, struct flock64 *, int, offset_t,
 215                         struct flk_callback *, cred_t *, caller_context_t *);
 216 static int      nfs3_space(vnode_t *, int, struct flock64 *, int, offset_t,
 217                         cred_t *, caller_context_t *);
 218 static int      nfs3_realvp(vnode_t *, vnode_t **, caller_context_t *);
 219 static int      nfs3_delmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
 220                         uint_t, uint_t, uint_t, cred_t *, caller_context_t *);
 221 static int      nfs3_pathconf(vnode_t *, int, ulong_t *, cred_t *,
 222                         caller_context_t *);
 223 static int      nfs3_pageio(vnode_t *, page_t *, uoff_t, size_t, int,
 224                         cred_t *, caller_context_t *);
 225 static void     nfs3_dispose(vnode_t *, page_t *, int, int, cred_t *,
 226                         caller_context_t *);
 227 static int      nfs3_setsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
 228                         caller_context_t *);
 229 static int      nfs3_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
 230                         caller_context_t *);
 231 static int      nfs3_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *,
 232                         caller_context_t *);
 233
 234 const struct vnodeops nfs3_vnodeops = {
 235         .vnop_name = "nfs3",
 236         .vop_open = nfs3_open,
 237         .vop_close = nfs3_close,
 238         .vop_read = nfs3_read,
 239         .vop_write = nfs3_write,
 240         .vop_ioctl = nfs3_ioctl,
 241         .vop_getattr = nfs3_getattr,
 242         .vop_setattr = nfs3_setattr,
 243         .vop_access = nfs3_access,
 244         .vop_lookup = nfs3_lookup,
 245         .vop_create = nfs3_create,
 246         .vop_remove = nfs3_remove,
 247         .vop_link = nfs3_link,
 248         .vop_rename = nfs3_rename,
 249         .vop_mkdir = nfs3_mkdir,
 250         .vop_rmdir = nfs3_rmdir,
 251         .vop_readdir = nfs3_readdir,
 252         .vop_symlink = nfs3_symlink,
 253         .vop_readlink = nfs3_readlink,
 254         .vop_fsync = nfs3_fsync,
 255         .vop_inactive = nfs3_inactive,
 256         .vop_fid = nfs3_fid,
 257         .vop_rwlock = nfs3_rwlock,
 258         .vop_rwunlock = nfs3_rwunlock,
 259         .vop_seek = nfs3_seek,
 260         .vop_frlock = nfs3_frlock,
 261         .vop_space = nfs3_space,
 262         .vop_realvp = nfs3_realvp,
 263         .vop_getpage = nfs3_getpage,
 264         .vop_putpage = nfs3_putpage,
 265         .vop_map = nfs3_map,
 266         .vop_addmap = nfs3_addmap,
 267         .vop_delmap = nfs3_delmap,
 268         /* no separate nfs3_dump */
 269         .vop_dump = nfs_dump,
 270         .vop_pathconf = nfs3_pathconf,
 271         .vop_pageio = nfs3_pageio,
 272         .vop_dispose = nfs3_dispose,
 273         .vop_setsecattr = nfs3_setsecattr,
 274         .vop_getsecattr = nfs3_getsecattr,
 275         .vop_shrlock = nfs3_shrlock,
 276         .vop_vnevent = fs_vnevent_support,
 277 };
 278
 279 /*
 280  * XXX:  This is referenced in modstubs.s
 281  */
 282 const struct vnodeops *
 283 nfs3_getvnodeops(void)
 284 {
 285         return (&nfs3_vnodeops);
 286 }
 287
 288 /* ARGSUSED */
 289 static int
 290 nfs3_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
 291 {
 292         int error;
 293         struct vattr va;
 294         rnode_t *rp;
 295         vnode_t *vp;
 296
 297         vp = *vpp;
 298         if (nfs_zone() != VTOMI(vp)->mi_zone)
 299                 return (EIO);
 300         rp = VTOR(vp);
 301         mutex_enter(&rp->r_statelock);
 302         if (rp->r_cred == NULL) {
 303                 crhold(cr);
 304                 rp->r_cred = cr;
 305         }
 306         mutex_exit(&rp->r_statelock);
 307
 308         /*
 309          * If there is no cached data or if close-to-open
 310          * consistency checking is turned off, we can avoid
 311          * the over the wire getattr.  Otherwise, if the
 312          * file system is mounted readonly, then just verify
 313          * the caches are up to date using the normal mechanism.
 314          * Else, if the file is not mmap'd, then just mark
 315          * the attributes as timed out.  They will be refreshed
 316          * and the caches validated prior to being used.
 317          * Else, the file system is mounted writeable so
 318          * force an over the wire GETATTR in order to ensure
 319          * that all cached data is valid.
 320          */
 321         if (vp->v_count > 1 ||
 322             ((vn_has_cached_data(vp) || HAVE_RDDIR_CACHE(rp)) &&
 323             !(VTOMI(vp)->mi_flags & MI_NOCTO))) {
 324                 if (vn_is_readonly(vp))
 325                         error = nfs3_validate_caches(vp, cr);
 326                 else if (rp->r_mapcnt == 0 && vp->v_count == 1) {
 327                         PURGE_ATTRCACHE(vp);
 328                         error = 0;
 329                 } else {
 330                         va.va_mask = AT_ALL;
 331                         error = nfs3_getattr_otw(vp, &va, cr);
 332                 }
 333         } else
 334                 error = 0;
 335
 336         return (error);
 337 }
 338
 339 /* ARGSUSED */
 340 static int
 341 nfs3_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
 342                 caller_context_t *ct)
 343 {
 344         rnode_t *rp;
 345         int error;
 346         struct vattr va;
 347
 348         /*
 349          * zone_enter(2) prevents processes from changing zones with NFS files
 350          * open; if we happen to get here from the wrong zone we can't do
 351          * anything over the wire.
 352          */
 353         if (VTOMI(vp)->mi_zone != nfs_zone()) {
 354                 /*
 355                  * We could attempt to clean up locks, except we're sure
 356                  * that the current process didn't acquire any locks on
 357                  * the file: any attempt to lock a file belong to another zone
 358                  * will fail, and one can't lock an NFS file and then change
 359                  * zones, as that fails too.
 360                  *
 361                  * Returning an error here is the sane thing to do.  A
 362                  * subsequent call to VN_RELE() which translates to a
 363                  * nfs3_inactive() will clean up state: if the zone of the
 364                  * vnode's origin is still alive and kicking, an async worker
 365                  * thread will handle the request (from the correct zone), and
 366                  * everything (minus the commit and final nfs3_getattr_otw()
 367                  * call) should be OK. If the zone is going away
 368                  * nfs_async_inactive() will throw away cached pages inline.
 369                  */
 370                 return (EIO);
 371         }
 372
 373         /*
 374          * If we are using local locking for this filesystem, then
 375          * release all of the SYSV style record locks.  Otherwise,
 376          * we are doing network locking and we need to release all
 377          * of the network locks.  All of the locks held by this
 378          * process on this file are released no matter what the
 379          * incoming reference count is.
 380          */
 381         if (VTOMI(vp)->mi_flags & MI_LLOCK) {
 382                 cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
 383                 cleanshares(vp, ttoproc(curthread)->p_pid);
 384         } else
 385                 nfs_lockrelease(vp, flag, offset, cr);
 386
 387         if (count > 1)
 388                 return (0);
 389
 390         /*
 391          * If the file has been `unlinked', then purge the
 392          * DNLC so that this vnode will get reycled quicker
 393          * and the .nfs* file on the server will get removed.
 394          */
 395         rp = VTOR(vp);
 396         if (rp->r_unldvp != NULL)
 397                 dnlc_purge_vp(vp);
 398
 399         /*
 400          * If the file was open for write and there are pages,
 401          * then if the file system was mounted using the "no-close-
 402          *      to-open" semantics, then start an asynchronous flush
 403          *      of the all of the pages in the file.
 404          * else the file system was not mounted using the "no-close-
 405          *      to-open" semantics, then do a synchronous flush and
 406          *      commit of all of the dirty and uncommitted pages.
 407          *
 408          * The asynchronous flush of the pages in the "nocto" path
 409          * mostly just associates a cred pointer with the rnode so
 410          * writes which happen later will have a better chance of
 411          * working.  It also starts the data being written to the
 412          * server, but without unnecessarily delaying the application.
 413          */
 414         if ((flag & FWRITE) && vn_has_cached_data(vp)) {
 415                 if (VTOMI(vp)->mi_flags & MI_NOCTO) {
 416                         error = nfs3_putpage(vp, (offset_t)0, 0, B_ASYNC,
 417                             cr, ct);
 418                         if (error == EAGAIN)
 419                                 error = 0;
 420                 } else
 421                         error = nfs3_putpage_commit(vp, (offset_t)0, 0, cr);
 422                 if (!error) {
 423                         mutex_enter(&rp->r_statelock);
 424                         error = rp->r_error;
 425                         rp->r_error = 0;
 426                         mutex_exit(&rp->r_statelock);
 427                 }
 428         } else {
 429                 mutex_enter(&rp->r_statelock);
 430                 error = rp->r_error;
 431                 rp->r_error = 0;
 432                 mutex_exit(&rp->r_statelock);
 433         }
 434
 435         /*
 436          * If RWRITEATTR is set, then issue an over the wire GETATTR to
 437          * refresh the attribute cache with a set of attributes which
 438          * weren't returned from a WRITE.  This will enable the close-
 439          * to-open processing to work.
 440          */
 441         if (rp->r_flags & RWRITEATTR)
 442                 (void) nfs3_getattr_otw(vp, &va, cr);
 443
 444         return (error);
 445 }
 446
 447 /* ARGSUSED */
 448 static int
 449 nfs3_directio_read(vnode_t *vp, struct uio *uiop, cred_t *cr)
 450 {
 451         mntinfo_t *mi;
 452         READ3args args;
 453         READ3uiores res;
 454         int tsize;
 455         offset_t offset;
 456         ssize_t count;
 457         int error;
 458         int douprintf;
 459         failinfo_t fi;
 460         char *sv_hostname;
 461
 462         mi = VTOMI(vp);
 463         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
 464         sv_hostname = VTOR(vp)->r_server->sv_hostname;
 465
 466         douprintf = 1;
 467         args.file = *VTOFH3(vp);
 468         fi.vp = vp;
 469         fi.fhp = (caddr_t)&args.file;
 470         fi.copyproc = nfs3copyfh;
 471         fi.lookupproc = nfs3lookup;
 472         fi.xattrdirproc = acl_getxattrdir3;
 473
 474         res.uiop = uiop;
 475
 476         res.wlist = NULL;
 477
 478         offset = uiop->uio_loffset;
 479         count = uiop->uio_resid;
 480
 481         do {
 482                 if (mi->mi_io_kstats) {
 483                         mutex_enter(&mi->mi_lock);
 484                         kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
 485                         mutex_exit(&mi->mi_lock);
 486                 }
 487
 488                 do {
 489                         tsize = MIN(mi->mi_tsize, count);
 490                         args.offset = (offset3)offset;
 491                         args.count = (count3)tsize;
 492                         res.size = (uint_t)tsize;
 493                         args.res_uiop = uiop;
 494                         args.res_data_val_alt = NULL;
 495
 496                         error = rfs3call(mi, NFSPROC3_READ,
 497                             xdr_READ3args, (caddr_t)&args,
 498                             xdr_READ3uiores, (caddr_t)&res, cr,
 499                             &douprintf, &res.status, 0, &fi);
 500                 } while (error == ENFS_TRYAGAIN);
 501
 502                 if (mi->mi_io_kstats) {
 503                         mutex_enter(&mi->mi_lock);
 504                         kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
 505                         mutex_exit(&mi->mi_lock);
 506                 }
 507
 508                 if (error)
 509                         return (error);
 510
 511                 error = geterrno3(res.status);
 512                 if (error)
 513                         return (error);
 514
 515                 if (res.count != res.size) {
 516                         zcmn_err(getzoneid(), CE_WARN,
 517 "nfs3_directio_read: server %s returned incorrect amount",
 518                             sv_hostname);
 519                         return (EIO);
 520                 }
 521                 count -= res.count;
 522                 offset += res.count;
 523                 if (mi->mi_io_kstats) {
 524                         mutex_enter(&mi->mi_lock);
 525                         KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
 526                         KSTAT_IO_PTR(mi->mi_io_kstats)->nread += res.count;
 527                         mutex_exit(&mi->mi_lock);
 528                 }
 529                 lwp_stat_update(LWP_STAT_INBLK, 1);
 530         } while (count && !res.eof);
 531
 532         return (0);
 533 }
 534
 535 /* ARGSUSED */
 536 static int
 537 nfs3_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
 538         caller_context_t *ct)
 539 {
 540         rnode_t *rp;
 541         uoff_t off;
 542         offset_t diff;
 543         int on;
 544         size_t n;
 545         caddr_t base;
 546         uint_t flags;
 547         int error = 0;
 548         mntinfo_t *mi;
 549
 550         rp = VTOR(vp);
 551         mi = VTOMI(vp);
 552
 553         ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
 554
 555         if (nfs_zone() != mi->mi_zone)
 556                 return (EIO);
 557
 558         if (vp->v_type != VREG)
 559                 return (EISDIR);
 560
 561         if (uiop->uio_resid == 0)
 562                 return (0);
 563
 564         if (uiop->uio_loffset < 0 || uiop->uio_loffset + uiop->uio_resid < 0)
 565                 return (EINVAL);
 566
 567         /*
 568          * Bypass VM if caching has been disabled (e.g., locking) or if
 569          * using client-side direct I/O and the file is not mmap'd and
 570          * there are no cached pages.
 571          */
 572         if ((vp->v_flag & VNOCACHE) ||
 573             (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
 574             rp->r_mapcnt == 0 && rp->r_inmap == 0 &&
 575             !vn_has_cached_data(vp))) {
 576                 return (nfs3_directio_read(vp, uiop, cr));
 577         }
 578
 579         do {
 580                 off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
 581                 on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
 582                 n = MIN(MAXBSIZE - on, uiop->uio_resid);
 583
 584                 error = nfs3_validate_caches(vp, cr);
 585                 if (error)
 586                         break;
 587
 588                 mutex_enter(&rp->r_statelock);
 589                 while (rp->r_flags & RINCACHEPURGE) {
 590                         if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
 591                                 mutex_exit(&rp->r_statelock);
 592                                 return (EINTR);
 593                         }
 594                 }
 595                 diff = rp->r_size - uiop->uio_loffset;
 596                 mutex_exit(&rp->r_statelock);
 597                 if (diff <= 0)
 598                         break;
 599                 if (diff < n)
 600                         n = (size_t)diff;
 601
 602                 if (vpm_enable) {
 603                         /*
 604                          * Copy data.
 605                          */
 606                         error = vpm_data_copy(vp, off + on, n, uiop,
 607                             1, NULL, 0, S_READ);
 608                 } else {
 609                         base = segmap_getmapflt(segkmap, vp, off + on, n, 1,
 610                             S_READ);
 611
 612                         error = uiomove(base + on, n, UIO_READ, uiop);
 613                 }
 614
 615                 if (!error) {
 616                         /*
 617                          * If read a whole block or read to eof,
 618                          * won't need this buffer again soon.
 619                          */
 620                         mutex_enter(&rp->r_statelock);
 621                         if (n + on == MAXBSIZE ||
 622                             uiop->uio_loffset == rp->r_size)
 623                                 flags = SM_DONTNEED;
 624                         else
 625                                 flags = 0;
 626                         mutex_exit(&rp->r_statelock);
 627                         if (vpm_enable) {
 628                                 error = vpm_sync_pages(vp, off, n, flags);
 629                         } else {
 630                                 error = segmap_release(segkmap, base, flags);
 631                         }
 632                 } else {
 633                         if (vpm_enable) {
 634                                 (void) vpm_sync_pages(vp, off, n, 0);
 635                         } else {
 636                                 (void) segmap_release(segkmap, base, 0);
 637                         }
 638                 }
 639         } while (!error && uiop->uio_resid > 0);
 640
 641         return (error);
 642 }
 643
 644 /* ARGSUSED */
 645 static int
 646 nfs3_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
 647         caller_context_t *ct)
 648 {
 649         rlim64_t limit = uiop->uio_llimit;
 650         rnode_t *rp;
 651         uoff_t off;
 652         caddr_t base;
 653         uint_t flags;
 654         int remainder;
 655         size_t n;
 656         int on;
 657         int error;
 658         int resid;
 659         offset_t offset;
 660         mntinfo_t *mi;
 661         uint_t bsize;
 662
 663         rp = VTOR(vp);
 664
 665         if (vp->v_type != VREG)
 666                 return (EISDIR);
 667
 668         mi = VTOMI(vp);
 669         if (nfs_zone() != mi->mi_zone)
 670                 return (EIO);
 671         if (uiop->uio_resid == 0)
 672                 return (0);
 673
 674         if (ioflag & FAPPEND) {
 675                 struct vattr va;
 676
 677                 /*
 678                  * Must serialize if appending.
 679                  */
 680                 if (nfs_rw_lock_held(&rp->r_rwlock, RW_READER)) {
 681                         nfs_rw_exit(&rp->r_rwlock);
 682                         if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER,
 683                             INTR(vp)))
 684                                 return (EINTR);
 685                 }
 686
 687                 va.va_mask = AT_SIZE;
 688                 error = nfs3getattr(vp, &va, cr);
 689                 if (error)
 690                         return (error);
 691                 uiop->uio_loffset = va.va_size;
 692         }
 693
 694         offset = uiop->uio_loffset + uiop->uio_resid;
 695
 696         if (uiop->uio_loffset < 0 || offset < 0)
 697                 return (EINVAL);
 698
 699         if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
 700                 limit = MAXOFFSET_T;
 701
 702         /*
 703          * Check to make sure that the process will not exceed
 704          * its limit on file size.  It is okay to write up to
 705          * the limit, but not beyond.  Thus, the write which
 706          * reaches the limit will be short and the next write
 707          * will return an error.
 708          */
 709         remainder = 0;
 710         if (offset > limit) {
 711                 remainder = offset - limit;
 712                 uiop->uio_resid = limit - uiop->uio_loffset;
 713                 if (uiop->uio_resid <= 0) {
 714                         proc_t *p = ttoproc(curthread);
 715
 716                         uiop->uio_resid += remainder;
 717                         mutex_enter(&p->p_lock);
 718                         (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
 719                             p->p_rctls, p, RCA_UNSAFE_SIGINFO);
 720                         mutex_exit(&p->p_lock);
 721                         return (EFBIG);
 722                 }
 723         }
 724
 725         if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp)))
 726                 return (EINTR);
 727
 728         /*
 729          * Bypass VM if caching has been disabled (e.g., locking) or if
 730          * using client-side direct I/O and the file is not mmap'd and
 731          * there are no cached pages.
 732          */
 733         if ((vp->v_flag & VNOCACHE) ||
 734             (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
 735             rp->r_mapcnt == 0 && rp->r_inmap == 0 &&
 736             !vn_has_cached_data(vp))) {
 737                 size_t bufsize;
 738                 int count;
 739                 uoff_t org_offset;
 740                 stable_how stab_comm;
 741
 742 nfs3_fwrite:
 743                 if (rp->r_flags & RSTALE) {
 744                         resid = uiop->uio_resid;
 745                         offset = uiop->uio_loffset;
 746                         error = rp->r_error;
 747                         /*
 748                          * A close may have cleared r_error, if so,
 749                          * propagate ESTALE error return properly
 750                          */
 751                         if (error == 0)
 752                                 error = ESTALE;
 753                         goto bottom;
 754                 }
 755                 bufsize = MIN(uiop->uio_resid, mi->mi_stsize);
 756                 base = kmem_alloc(bufsize, KM_SLEEP);
 757                 do {
 758                         if (ioflag & FDSYNC)
 759                                 stab_comm = DATA_SYNC;
 760                         else
 761                                 stab_comm = FILE_SYNC;
 762                         resid = uiop->uio_resid;
 763                         offset = uiop->uio_loffset;
 764                         count = MIN(uiop->uio_resid, bufsize);
 765                         org_offset = uiop->uio_loffset;
 766                         error = uiomove(base, count, UIO_WRITE, uiop);
 767                         if (!error) {
 768                                 error = nfs3write(vp, base, org_offset,
 769                                     count, cr, &stab_comm);
 770                         }
 771                 } while (!error && uiop->uio_resid > 0);
 772                 kmem_free(base, bufsize);
 773                 goto bottom;
 774         }
 775
 776
 777         bsize = vp->v_vfsp->vfs_bsize;
 778
 779         do {
 780                 off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
 781                 on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
 782                 n = MIN(MAXBSIZE - on, uiop->uio_resid);
 783
 784                 resid = uiop->uio_resid;
 785                 offset = uiop->uio_loffset;
 786
 787                 if (rp->r_flags & RSTALE) {
 788                         error = rp->r_error;
 789                         /*
 790                          * A close may have cleared r_error, if so,
 791                          * propagate ESTALE error return properly
 792                          */
 793                         if (error == 0)
 794                                 error = ESTALE;
 795                         break;
 796                 }
 797
 798                 /*
 799                  * Don't create dirty pages faster than they
 800                  * can be cleaned so that the system doesn't
 801                  * get imbalanced.  If the async queue is
 802                  * maxed out, then wait for it to drain before
 803                  * creating more dirty pages.  Also, wait for
 804                  * any threads doing pagewalks in the vop_getattr
 805                  * entry points so that they don't block for
 806                  * long periods.
 807                  */
 808                 mutex_enter(&rp->r_statelock);
 809                 while ((mi->mi_max_threads != 0 &&
 810                     rp->r_awcount > 2 * mi->mi_max_threads) ||
 811                     rp->r_gcount > 0) {
 812                         if (INTR(vp)) {
 813                                 klwp_t *lwp = ttolwp(curthread);
 814
 815                                 if (lwp != NULL)
 816                                         lwp->lwp_nostop++;
 817                                 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
 818                                         mutex_exit(&rp->r_statelock);
 819                                         if (lwp != NULL)
 820                                                 lwp->lwp_nostop--;
 821                                         error = EINTR;
 822                                         goto bottom;
 823                                 }
 824                                 if (lwp != NULL)
 825                                         lwp->lwp_nostop--;
 826                         } else
 827                                 cv_wait(&rp->r_cv, &rp->r_statelock);
 828                 }
 829                 mutex_exit(&rp->r_statelock);
 830
 831                 /*
 832                  * Touch the page and fault it in if it is not in core
 833                  * before segmap_getmapflt or vpm_data_copy can lock it.
 834                  * This is to avoid the deadlock if the buffer is mapped
 835                  * to the same file through mmap which we want to write.
 836                  */
 837                 uio_prefaultpages((long)n, uiop);
 838
 839                 if (vpm_enable) {
 840                         /*
 841                          * It will use kpm mappings, so no need to
 842                          * pass an address.
 843                          */
 844                         error = writerp(rp, NULL, n, uiop, 0);
 845                 } else  {
 846                         if (segmap_kpm) {
 847                                 int pon = uiop->uio_loffset & PAGEOFFSET;
 848                                 size_t pn = MIN(PAGESIZE - pon,
 849                                     uiop->uio_resid);
 850                                 int pagecreate;
 851
 852                                 mutex_enter(&rp->r_statelock);
 853                                 pagecreate = (pon == 0) && (pn == PAGESIZE ||
 854                                     uiop->uio_loffset + pn >= rp->r_size);
 855                                 mutex_exit(&rp->r_statelock);
 856
 857                                 base = segmap_getmapflt(segkmap, vp, off + on,
 858                                     pn, !pagecreate, S_WRITE);
 859
 860                                 error = writerp(rp, base + pon, n, uiop,
 861                                     pagecreate);
 862
 863                         } else {
 864                                 base = segmap_getmapflt(segkmap, vp, off + on,
 865                                     n, 0, S_READ);
 866                                 error = writerp(rp, base + on, n, uiop, 0);
 867                         }
 868                 }
 869
 870                 if (!error) {
 871                         if (mi->mi_flags & MI_NOAC)
 872                                 flags = SM_WRITE;
 873                         else if ((uiop->uio_loffset % bsize) == 0 ||
 874                             IS_SWAPVP(vp)) {
 875                                 /*
 876                                  * Have written a whole block.
 877                                  * Start an asynchronous write
 878                                  * and mark the buffer to
 879                                  * indicate that it won't be
 880                                  * needed again soon.
 881                                  */
 882                                 flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
 883                         } else
 884                                 flags = 0;
 885                         if ((ioflag & (FSYNC|FDSYNC)) ||
 886                             (rp->r_flags & ROUTOFSPACE)) {
 887                                 flags &= ~SM_ASYNC;
 888                                 flags |= SM_WRITE;
 889                         }
 890                         if (vpm_enable) {
 891                                 error = vpm_sync_pages(vp, off, n, flags);
 892                         } else {
 893                                 error = segmap_release(segkmap, base, flags);
 894                         }
 895                 } else {
 896                         if (vpm_enable) {
 897                                 (void) vpm_sync_pages(vp, off, n, 0);
 898                         } else {
 899                                 (void) segmap_release(segkmap, base, 0);
 900                         }
 901                         /*
 902                          * In the event that we got an access error while
 903                          * faulting in a page for a write-only file just
 904                          * force a write.
 905                          */
 906                         if (error == EACCES)
 907                                 goto nfs3_fwrite;
 908                 }
 909         } while (!error && uiop->uio_resid > 0);
 910
 911 bottom:
 912         if (error) {
 913                 uiop->uio_resid = resid + remainder;
 914                 uiop->uio_loffset = offset;
 915         } else
 916                 uiop->uio_resid += remainder;
 917
 918         nfs_rw_exit(&rp->r_lkserlock);
 919
 920         return (error);
 921 }
 922
 923 /*
 924  * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED}
 925  */
 926 static int
 927 nfs3_rdwrlbn(vnode_t *vp, page_t *pp, uoff_t off, size_t len,
 928         int flags, cred_t *cr)
 929 {
 930         struct buf *bp;
 931         int error;
 932         page_t *savepp;
 933         uchar_t fsdata;
 934         stable_how stab_comm;
 935
 936         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
 937         bp = pageio_setup(pp, len, vp, flags);
 938         ASSERT(bp != NULL);
 939
 940         /*
 941          * pageio_setup should have set b_addr to 0.  This
 942          * is correct since we want to do I/O on a page
 943          * boundary.  bp_mapin will use this addr to calculate
 944          * an offset, and then set b_addr to the kernel virtual
 945          * address it allocated for us.
 946          */
 947         ASSERT(bp->b_un.b_addr == 0);
 948
 949         bp->b_edev = 0;
 950         bp->b_dev = 0;
 951         bp->b_lblkno = lbtodb(off);
 952         bp->b_file = vp;
 953         bp->b_offset = (offset_t)off;
 954         bp_mapin(bp);
 955
 956         /*
 957          * Calculate the desired level of stability to write data
 958          * on the server and then mark all of the pages to reflect
 959          * this.
 960          */
 961         if ((flags & (B_WRITE|B_ASYNC)) == (B_WRITE|B_ASYNC) &&
 962             freemem > desfree) {
 963                 stab_comm = UNSTABLE;
 964                 fsdata = C_DELAYCOMMIT;
 965         } else {
 966                 stab_comm = FILE_SYNC;
 967                 fsdata = C_NOCOMMIT;
 968         }
 969
 970         savepp = pp;
 971         do {
 972                 pp->p_fsdata = fsdata;
 973         } while ((pp = pp->p_next) != savepp);
 974
 975         error = nfs3_bio(bp, &stab_comm, cr);
 976
 977         bp_mapout(bp);
 978         pageio_done(bp);
 979
 980         /*
 981          * If the server wrote pages in a more stable fashion than
 982          * was requested, then clear all of the marks in the pages
 983          * indicating that COMMIT operations were required.
 984          */
 985         if (stab_comm != UNSTABLE && fsdata == C_DELAYCOMMIT) {
 986                 do {
 987                         pp->p_fsdata = C_NOCOMMIT;
 988                 } while ((pp = pp->p_next) != savepp);
 989         }
 990
 991         return (error);
 992 }
 993
 994 /*
 995  * Write to file.  Writes to remote server in largest size
 996  * chunks that the server can handle.  Write is synchronous.
 997  */
 998 static int
 999 nfs3write(vnode_t *vp, caddr_t base, uoff_t offset, int count, cred_t *cr,
1000         stable_how *stab_comm)
1001 {
1002         mntinfo_t *mi;
1003         WRITE3args args;
1004         WRITE3res res;
1005         int error;
1006         int tsize;
1007         rnode_t *rp;
1008         int douprintf;
1009
1010         rp = VTOR(vp);
1011         mi = VTOMI(vp);
1012
1013         ASSERT(nfs_zone() == mi->mi_zone);
1014
1015         args.file = *VTOFH3(vp);
1016         args.stable = *stab_comm;
1017
1018         *stab_comm = FILE_SYNC;
1019
1020         douprintf = 1;
1021
1022         do {
1023                 if ((vp->v_flag & VNOCACHE) ||
1024                     (rp->r_flags & RDIRECTIO) ||
1025                     (mi->mi_flags & MI_DIRECTIO))
1026                         tsize = MIN(mi->mi_stsize, count);
1027                 else
1028                         tsize = MIN(mi->mi_curwrite, count);
1029                 args.offset = (offset3)offset;
1030                 args.count = (count3)tsize;
1031                 args.data.data_len = (uint_t)tsize;
1032                 args.data.data_val = base;
1033
1034                 if (mi->mi_io_kstats) {
1035                         mutex_enter(&mi->mi_lock);
1036                         kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1037                         mutex_exit(&mi->mi_lock);
1038                 }
1039                 args.mblk = NULL;
1040                 do {
1041                         error = rfs3call(mi, NFSPROC3_WRITE,
1042                             xdr_WRITE3args, (caddr_t)&args,
1043                             xdr_WRITE3res, (caddr_t)&res, cr,
1044                             &douprintf, &res.status, 0, NULL);
1045                 } while (error == ENFS_TRYAGAIN);
1046                 if (mi->mi_io_kstats) {
1047                         mutex_enter(&mi->mi_lock);
1048                         kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
1049                         mutex_exit(&mi->mi_lock);
1050                 }
1051
1052                 if (error)
1053                         return (error);
1054                 error = geterrno3(res.status);
1055                 if (!error) {
1056                         if (res.resok.count > args.count) {
1057                                 zcmn_err(getzoneid(), CE_WARN,
1058                                     "nfs3write: server %s wrote %u, "
1059                                     "requested was %u",
1060                                     rp->r_server->sv_hostname,
1061                                     res.resok.count, args.count);
1062                                 return (EIO);
1063                         }
1064                         if (res.resok.committed == UNSTABLE) {
1065                                 *stab_comm = UNSTABLE;
1066                                 if (args.stable == DATA_SYNC ||
1067                                     args.stable == FILE_SYNC) {
1068                                         zcmn_err(getzoneid(), CE_WARN,
1069                         "nfs3write: server %s did not commit to stable storage",
1070                                             rp->r_server->sv_hostname);
1071                                         return (EIO);
1072                                 }
1073                         }
1074                         tsize = (int)res.resok.count;
1075                         count -= tsize;
1076                         base += tsize;
1077                         offset += tsize;
1078                         if (mi->mi_io_kstats) {
1079                                 mutex_enter(&mi->mi_lock);
1080                                 KSTAT_IO_PTR(mi->mi_io_kstats)->writes++;
1081                                 KSTAT_IO_PTR(mi->mi_io_kstats)->nwritten +=
1082                                     tsize;
1083                                 mutex_exit(&mi->mi_lock);
1084                         }
1085                         lwp_stat_update(LWP_STAT_OUBLK, 1);
1086                         mutex_enter(&rp->r_statelock);
1087                         if (rp->r_flags & RHAVEVERF) {
1088                                 if (rp->r_verf != res.resok.verf) {
1089                                         nfs3_set_mod(vp);
1090                                         rp->r_verf = res.resok.verf;
1091                                         /*
1092                                          * If the data was written UNSTABLE,
1093                                          * then might as well stop because
1094                                          * the whole block will have to get
1095                                          * rewritten anyway.
1096                                          */
1097                                         if (*stab_comm == UNSTABLE) {
1098                                                 mutex_exit(&rp->r_statelock);
1099                                                 break;
1100                                         }
1101                                 }
1102                         } else {
1103                                 rp->r_verf = res.resok.verf;
1104                                 rp->r_flags |= RHAVEVERF;
1105                         }
1106                         /*
1107                          * Mark the attribute cache as timed out and
1108                          * set RWRITEATTR to indicate that the file
1109                          * was modified with a WRITE operation and
1110                          * that the attributes can not be trusted.
1111                          */
1112                         PURGE_ATTRCACHE_LOCKED(rp);
1113                         rp->r_flags |= RWRITEATTR;
1114                         mutex_exit(&rp->r_statelock);
1115                 }
1116         } while (!error && count);
1117
1118         return (error);
1119 }
1120
1121 /*
1122  * Read from a file.  Reads data in largest chunks our interface can handle.
1123  */
1124 static int
1125 nfs3read(vnode_t *vp, caddr_t base, offset_t offset, int count,
1126         size_t *residp, cred_t *cr)
1127 {
1128         mntinfo_t *mi;
1129         READ3args args;
1130         READ3vres res;
1131         int tsize;
1132         int error;
1133         int douprintf;
1134         failinfo_t fi;
1135         rnode_t *rp;
1136         struct vattr va;
1137         hrtime_t t;
1138
1139         rp = VTOR(vp);
1140         mi = VTOMI(vp);
1141         ASSERT(nfs_zone() == mi->mi_zone);
1142         douprintf = 1;
1143
1144         args.file = *VTOFH3(vp);
1145         fi.vp = vp;
1146         fi.fhp = (caddr_t)&args.file;
1147         fi.copyproc = nfs3copyfh;
1148         fi.lookupproc = nfs3lookup;
1149         fi.xattrdirproc = acl_getxattrdir3;
1150
1151         res.pov.fres.vp = vp;
1152         res.pov.fres.vap = &va;
1153
1154         res.wlist = NULL;
1155         *residp = count;
1156         do {
1157                 if (mi->mi_io_kstats) {
1158                         mutex_enter(&mi->mi_lock);
1159                         kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1160                         mutex_exit(&mi->mi_lock);
1161                 }
1162
1163                 do {
1164                         if ((vp->v_flag & VNOCACHE) ||
1165                             (rp->r_flags & RDIRECTIO) ||
1166                             (mi->mi_flags & MI_DIRECTIO))
1167                                 tsize = MIN(mi->mi_tsize, count);
1168                         else
1169                                 tsize = MIN(mi->mi_curread, count);
1170                         res.data.data_val = base;
1171                         res.data.data_len = tsize;
1172                         args.offset = (offset3)offset;
1173                         args.count = (count3)tsize;
1174                         args.res_uiop = NULL;
1175                         args.res_data_val_alt = base;
1176
1177                         t = gethrtime();
1178                         error = rfs3call(mi, NFSPROC3_READ,
1179                             xdr_READ3args, (caddr_t)&args,
1180                             xdr_READ3vres, (caddr_t)&res, cr,
1181                             &douprintf, &res.status, 0, &fi);
1182                 } while (error == ENFS_TRYAGAIN);
1183
1184                 if (mi->mi_io_kstats) {
1185                         mutex_enter(&mi->mi_lock);
1186                         kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
1187                         mutex_exit(&mi->mi_lock);
1188                 }
1189
1190                 if (error)
1191                         return (error);
1192
1193                 error = geterrno3(res.status);
1194                 if (error)
1195                         return (error);
1196
1197                 if (res.count != res.data.data_len) {
1198                         zcmn_err(getzoneid(), CE_WARN,
1199                             "nfs3read: server %s returned incorrect amount",
1200                             rp->r_server->sv_hostname);
1201                         return (EIO);
1202                 }
1203
1204                 count -= res.count;
1205                 *residp = count;
1206                 base += res.count;
1207                 offset += res.count;
1208                 if (mi->mi_io_kstats) {
1209                         mutex_enter(&mi->mi_lock);
1210                         KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
1211                         KSTAT_IO_PTR(mi->mi_io_kstats)->nread += res.count;
1212                         mutex_exit(&mi->mi_lock);
1213                 }
1214                 lwp_stat_update(LWP_STAT_INBLK, 1);
1215         } while (count && !res.eof);
1216
1217         if (res.pov.attributes) {
1218                 mutex_enter(&rp->r_statelock);
1219                 if (!CACHE_VALID(rp, va.va_mtime, va.va_size)) {
1220                         mutex_exit(&rp->r_statelock);
1221                         PURGE_ATTRCACHE(vp);
1222                 } else {
1223                         if (rp->r_mtime <= t)
1224                                 nfs_attrcache_va(vp, &va);
1225                         mutex_exit(&rp->r_statelock);
1226                 }
1227         }
1228
1229         return (0);
1230 }
1231
1232 /* ARGSUSED */
1233 static int
1234 nfs3_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp,
1235         caller_context_t *ct)
1236 {
1237
1238         if (nfs_zone() != VTOMI(vp)->mi_zone)
1239                 return (EIO);
1240         switch (cmd) {
1241                 case _FIODIRECTIO:
1242                         return (nfs_directio(vp, (int)arg, cr));
1243                 default:
1244                         return (ENOTTY);
1245         }
1246 }
1247
1248 /* ARGSUSED */
1249 static int
1250 nfs3_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
1251         caller_context_t *ct)
1252 {
1253         int error;
1254         rnode_t *rp;
1255
1256         if (nfs_zone() != VTOMI(vp)->mi_zone)
1257                 return (EIO);
1258         /*
1259          * If it has been specified that the return value will
1260          * just be used as a hint, and we are only being asked
1261          * for size, fsid or rdevid, then return the client's
1262          * notion of these values without checking to make sure
1263          * that the attribute cache is up to date.
1264          * The whole point is to avoid an over the wire GETATTR
1265          * call.
1266          */
1267         rp = VTOR(vp);
1268         if (flags & ATTR_HINT) {
1269                 if (vap->va_mask ==
1270                     (vap->va_mask & (AT_SIZE | AT_FSID | AT_RDEV))) {
1271                         mutex_enter(&rp->r_statelock);
1272                         if (vap->va_mask | AT_SIZE)
1273                                 vap->va_size = rp->r_size;
1274                         if (vap->va_mask | AT_FSID)
1275                                 vap->va_fsid = rp->r_attr.va_fsid;
1276                         if (vap->va_mask | AT_RDEV)
1277                                 vap->va_rdev = rp->r_attr.va_rdev;
1278                         mutex_exit(&rp->r_statelock);
1279                         return (0);
1280                 }
1281         }
1282
1283         /*
1284          * Only need to flush pages if asking for the mtime
1285          * and if there any dirty pages or any outstanding
1286          * asynchronous (write) requests for this file.
1287          */
1288         if (vap->va_mask & AT_MTIME) {
1289                 if (vn_has_cached_data(vp) &&
1290                     ((rp->r_flags & RDIRTY) || rp->r_awcount > 0)) {
1291                         mutex_enter(&rp->r_statelock);
1292                         rp->r_gcount++;
1293                         mutex_exit(&rp->r_statelock);
1294                         error = nfs3_putpage(vp, (offset_t)0, 0, 0, cr, ct);
1295                         mutex_enter(&rp->r_statelock);
1296                         if (error && (error == ENOSPC || error == EDQUOT)) {
1297                                 if (!rp->r_error)
1298                                         rp->r_error = error;
1299                         }
1300                         if (--rp->r_gcount == 0)
1301                                 cv_broadcast(&rp->r_cv);
1302                         mutex_exit(&rp->r_statelock);
1303                 }
1304         }
1305
1306         return (nfs3getattr(vp, vap, cr));
1307 }
1308
1309 /*ARGSUSED4*/
1310 static int
1311 nfs3_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
1312                 caller_context_t *ct)
1313 {
1314         int error;
1315         struct vattr va;
1316
1317         if (vap->va_mask & AT_NOSET)
1318                 return (EINVAL);
1319         if (nfs_zone() != VTOMI(vp)->mi_zone)
1320                 return (EIO);
1321
1322         va.va_mask = AT_UID | AT_MODE;
1323         error = nfs3getattr(vp, &va, cr);
1324         if (error)
1325                 return (error);
1326
1327         error = secpolicy_vnode_setattr(cr, vp, vap, &va, flags, nfs3_accessx,
1328             vp);
1329         if (error)
1330                 return (error);
1331
1332         error = nfs3setattr(vp, vap, flags, cr);
1333
1334         if (error == 0 && (vap->va_mask & AT_SIZE) && vap->va_size == 0)
1335                 vnevent_truncate(vp, ct);
1336
1337         return (error);
1338 }
1339
1340 static int
1341 nfs3setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr)
1342 {
1343         int error;
1344         uint_t mask;
1345         SETATTR3args args;
1346         SETATTR3res res;
1347         int douprintf;
1348         rnode_t *rp;
1349         struct vattr va;
1350         mode_t omode;
1351         vsecattr_t *vsp;
1352         hrtime_t t;
1353
1354         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
1355         mask = vap->va_mask;
1356
1357         rp = VTOR(vp);
1358
1359         /*
1360          * Only need to flush pages if there are any pages and
1361          * if the file is marked as dirty in some fashion.  The
1362          * file must be flushed so that we can accurately
1363          * determine the size of the file and the cached data
1364          * after the SETATTR returns.  A file is considered to
1365          * be dirty if it is either marked with RDIRTY, has
1366          * outstanding i/o's active, or is mmap'd.  In this
1367          * last case, we can't tell whether there are dirty
1368          * pages, so we flush just to be sure.
1369          */
1370         if (vn_has_cached_data(vp) &&
1371             ((rp->r_flags & RDIRTY) ||
1372             rp->r_count > 0 ||
1373             rp->r_mapcnt > 0)) {
1374                 ASSERT(vp->v_type != VCHR);
1375                 error = nfs3_putpage(vp, (offset_t)0, 0, 0, cr, NULL);
1376                 if (error && (error == ENOSPC || error == EDQUOT)) {
1377                         mutex_enter(&rp->r_statelock);
1378                         if (!rp->r_error)
1379                                 rp->r_error = error;
1380                         mutex_exit(&rp->r_statelock);
1381                 }
1382         }
1383
1384         args.object = *RTOFH3(rp);
1385         /*
1386          * If the intent is for the server to set the times,
1387          * there is no point in have the mask indicating set mtime or
1388          * atime, because the vap values may be junk, and so result
1389          * in an overflow error. Remove these flags from the vap mask
1390          * before calling in this case, and restore them afterwards.
1391          */
1392         if ((mask & (AT_ATIME | AT_MTIME)) && !(flags & ATTR_UTIME)) {
1393                 /* Use server times, so don't set the args time fields */
1394                 vap->va_mask &= ~(AT_ATIME | AT_MTIME);
1395                 error = vattr_to_sattr3(vap, &args.new_attributes);
1396                 vap->va_mask |= (mask & (AT_ATIME | AT_MTIME));
1397                 if (mask & AT_ATIME) {
1398                         args.new_attributes.atime.set_it = SET_TO_SERVER_TIME;
1399                 }
1400                 if (mask & AT_MTIME) {
1401                         args.new_attributes.mtime.set_it = SET_TO_SERVER_TIME;
1402                 }
1403         } else {
1404                 /* Either do not set times or use the client specified times */
1405                 error = vattr_to_sattr3(vap, &args.new_attributes);
1406         }
1407
1408         if (error) {
1409                 /* req time field(s) overflow - return immediately */
1410                 return (error);
1411         }
1412
1413         va.va_mask = AT_MODE | AT_CTIME;
1414         error = nfs3getattr(vp, &va, cr);
1415         if (error)
1416                 return (error);
1417         omode = va.va_mode;
1418
1419 tryagain:
1420         if (mask & AT_SIZE) {
1421                 args.guard.check = TRUE;
1422                 args.guard.obj_ctime.seconds = va.va_ctime.tv_sec;
1423                 args.guard.obj_ctime.nseconds = va.va_ctime.tv_nsec;
1424         } else
1425                 args.guard.check = FALSE;
1426
1427         douprintf = 1;
1428
1429         t = gethrtime();
1430
1431         error = rfs3call(VTOMI(vp), NFSPROC3_SETATTR,
1432             xdr_SETATTR3args, (caddr_t)&args,
1433             xdr_SETATTR3res, (caddr_t)&res, cr,
1434             &douprintf, &res.status, 0, NULL);
1435
1436         /*
1437          * Purge the access cache and ACL cache if changing either the
1438          * owner of the file, the group owner, or the mode.  These may
1439          * change the access permissions of the file, so purge old
1440          * information and start over again.
1441          */
1442         if (mask & (AT_UID | AT_GID | AT_MODE)) {
1443                 (void) nfs_access_purge_rp(rp);
1444                 if (rp->r_secattr != NULL) {
1445                         mutex_enter(&rp->r_statelock);
1446                         vsp = rp->r_secattr;
1447                         rp->r_secattr = NULL;
1448                         mutex_exit(&rp->r_statelock);
1449                         if (vsp != NULL)
1450                                 nfs_acl_free(vsp);
1451                 }
1452         }
1453
1454         if (error) {
1455                 PURGE_ATTRCACHE(vp);
1456                 return (error);
1457         }
1458
1459         error = geterrno3(res.status);
1460         if (!error) {
1461                 /*
1462                  * If changing the size of the file, invalidate
1463                  * any local cached data which is no longer part
1464                  * of the file.  We also possibly invalidate the
1465                  * last page in the file.  We could use
1466                  * pvn_vpzero(), but this would mark the page as
1467                  * modified and require it to be written back to
1468                  * the server for no particularly good reason.
1469                  * This way, if we access it, then we bring it
1470                  * back in.  A read should be cheaper than a
1471                  * write.
1472                  */
1473                 if (mask & AT_SIZE) {
1474                         nfs_invalidate_pages(vp,
1475                             (vap->va_size & PAGEMASK), cr);
1476                 }
1477                 nfs3_cache_wcc_data(vp, &res.resok.obj_wcc, t, cr);
1478                 /*
1479                  * Some servers will change the mode to clear the setuid
1480                  * and setgid bits when changing the uid or gid.  The
1481                  * client needs to compensate appropriately.
1482                  */
1483                 if (mask & (AT_UID | AT_GID)) {
1484                         int terror;
1485
1486                         va.va_mask = AT_MODE;
1487                         terror = nfs3getattr(vp, &va, cr);
1488                         if (!terror &&
1489                             (((mask & AT_MODE) && va.va_mode != vap->va_mode) ||
1490                             (!(mask & AT_MODE) && va.va_mode != omode))) {
1491                                 va.va_mask = AT_MODE;
1492                                 if (mask & AT_MODE)
1493                                         va.va_mode = vap->va_mode;
1494                                 else
1495                                         va.va_mode = omode;
1496                                 (void) nfs3setattr(vp, &va, 0, cr);
1497                         }
1498                 }
1499         } else {
1500                 nfs3_cache_wcc_data(vp, &res.resfail.obj_wcc, t, cr);
1501                 /*
1502                  * If we got back a "not synchronized" error, then
1503                  * we need to retry with a new guard value.  The
1504                  * guard value used is the change time.  If the
1505                  * server returned post_op_attr, then we can just
1506                  * retry because we have the latest attributes.
1507                  * Otherwise, we issue a GETATTR to get the latest
1508                  * attributes and then retry.  If we couldn't get
1509                  * the attributes this way either, then we give
1510                  * up because we can't complete the operation as
1511                  * required.
1512                  */
1513                 if (res.status == NFS3ERR_NOT_SYNC) {
1514                         va.va_mask = AT_CTIME;
1515                         if (nfs3getattr(vp, &va, cr) == 0)
1516                                 goto tryagain;
1517                 }
1518                 PURGE_STALE_FH(error, vp, cr);
1519         }
1520
1521         return (error);
1522 }
1523
1524 static int
1525 nfs3_accessx(void *vp, int mode, cred_t *cr)
1526 {
1527         ASSERT(nfs_zone() == VTOMI((vnode_t *)vp)->mi_zone);
1528         return (nfs3_access(vp, mode, 0, cr, NULL));
1529 }
1530
1531 /* ARGSUSED */
1532 static int
1533 nfs3_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct)
1534 {
1535         int error;
1536         ACCESS3args args;
1537         ACCESS3res res;
1538         int douprintf;
1539         uint32 acc;
1540         rnode_t *rp;
1541         cred_t *cred, *ncr, *ncrfree = NULL;
1542         failinfo_t fi;
1543         nfs_access_type_t cacc;
1544         hrtime_t t;
1545
1546         acc = 0;
1547         if (nfs_zone() != VTOMI(vp)->mi_zone)
1548                 return (EIO);
1549         if (mode & VREAD)
1550                 acc |= ACCESS3_READ;
1551         if (mode & VWRITE) {
1552                 if (vn_is_readonly(vp) && !IS_DEVVP(vp))
1553                         return (EROFS);
1554                 if (vp->v_type == VDIR)
1555                         acc |= ACCESS3_DELETE;
1556                 acc |= ACCESS3_MODIFY | ACCESS3_EXTEND;
1557         }
1558         if (mode & VEXEC) {
1559                 if (vp->v_type == VDIR)
1560                         acc |= ACCESS3_LOOKUP;
1561                 else
1562                         acc |= ACCESS3_EXECUTE;
1563         }
1564
1565         rp = VTOR(vp);
1566         args.object = *VTOFH3(vp);
1567         if (vp->v_type == VDIR) {
1568                 args.access = ACCESS3_READ | ACCESS3_DELETE | ACCESS3_MODIFY |
1569                     ACCESS3_EXTEND | ACCESS3_LOOKUP;
1570         } else {
1571                 args.access = ACCESS3_READ | ACCESS3_MODIFY | ACCESS3_EXTEND |
1572                     ACCESS3_EXECUTE;
1573         }
1574         fi.vp = vp;
1575         fi.fhp = (caddr_t)&args.object;
1576         fi.copyproc = nfs3copyfh;
1577         fi.lookupproc = nfs3lookup;
1578         fi.xattrdirproc = acl_getxattrdir3;
1579
1580         cred = cr;
1581         /*
1582          * ncr and ncrfree both initially
1583          * point to the memory area returned
1584          * by crnetadjust();
1585          * ncrfree not NULL when exiting means
1586          * that we need to release it
1587          */
1588         ncr = crnetadjust(cred);
1589         ncrfree = ncr;
1590 tryagain:
1591         if (rp->r_acache != NULL) {
1592                 cacc = nfs_access_check(rp, acc, cred);
1593                 if (cacc == NFS_ACCESS_ALLOWED) {
1594                         if (ncrfree != NULL)
1595                                 crfree(ncrfree);
1596                         return (0);
1597                 }
1598                 if (cacc == NFS_ACCESS_DENIED) {
1599                         /*
1600                          * If the cred can be adjusted, try again
1601                          * with the new cred.
1602                          */
1603                         if (ncr != NULL) {
1604                                 cred = ncr;
1605                                 ncr = NULL;
1606                                 goto tryagain;
1607                         }
1608                         if (ncrfree != NULL)
1609                                 crfree(ncrfree);
1610                         return (EACCES);
1611                 }
1612         }
1613
1614         douprintf = 1;
1615
1616         t = gethrtime();
1617
1618         error = rfs3call(VTOMI(vp), NFSPROC3_ACCESS,
1619             xdr_ACCESS3args, (caddr_t)&args,
1620             xdr_ACCESS3res, (caddr_t)&res, cred,
1621             &douprintf, &res.status, 0, &fi);
1622
1623         if (error) {
1624                 if (ncrfree != NULL)
1625                         crfree(ncrfree);
1626                 return (error);
1627         }
1628
1629         error = geterrno3(res.status);
1630         if (!error) {
1631                 nfs3_cache_post_op_attr(vp, &res.resok.obj_attributes, t, cr);
1632                 nfs_access_cache(rp, args.access, res.resok.access, cred);
1633                 /*
1634                  * we just cached results with cred; if cred is the
1635                  * adjusted credentials from crnetadjust, we do not want
1636                  * to release them before exiting: hence setting ncrfree
1637                  * to NULL
1638                  */
1639                 if (cred != cr)
1640                         ncrfree = NULL;
1641                 if ((acc & res.resok.access) != acc) {
1642                         /*
1643                          * If the cred can be adjusted, try again
1644                          * with the new cred.
1645                          */
1646                         if (ncr != NULL) {
1647                                 cred = ncr;
1648                                 ncr = NULL;
1649                                 goto tryagain;
1650                         }
1651                         error = EACCES;
1652                 }
1653         } else {
1654                 nfs3_cache_post_op_attr(vp, &res.resfail.obj_attributes, t, cr);
1655                 PURGE_STALE_FH(error, vp, cr);
1656         }
1657
1658         if (ncrfree != NULL)
1659                 crfree(ncrfree);
1660
1661         return (error);
1662 }
1663
1664 static int nfs3_do_symlink_cache = 1;
1665
1666 /* ARGSUSED */
1667 static int
1668 nfs3_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr, caller_context_t *ct)
1669 {
1670         int error;
1671         READLINK3args args;
1672         READLINK3res res;
1673         nfspath3 resdata_backup;
1674         rnode_t *rp;
1675         int douprintf;
1676         int len;
1677         failinfo_t fi;
1678         hrtime_t t;
1679
1680         /*
1681          * Can't readlink anything other than a symbolic link.
1682          */
1683         if (vp->v_type != VLNK)
1684                 return (EINVAL);
1685         if (nfs_zone() != VTOMI(vp)->mi_zone)
1686                 return (EIO);
1687
1688         rp = VTOR(vp);
1689         if (nfs3_do_symlink_cache && rp->r_symlink.contents != NULL) {
1690                 error = nfs3_validate_caches(vp, cr);
1691                 if (error)
1692                         return (error);
1693                 mutex_enter(&rp->r_statelock);
1694                 if (rp->r_symlink.contents != NULL) {
1695                         error = uiomove(rp->r_symlink.contents,
1696                             rp->r_symlink.len, UIO_READ, uiop);
1697                         mutex_exit(&rp->r_statelock);
1698                         return (error);
1699                 }
1700                 mutex_exit(&rp->r_statelock);
1701         }
1702
1703         args.symlink = *VTOFH3(vp);
1704         fi.vp = vp;
1705         fi.fhp = (caddr_t)&args.symlink;
1706         fi.copyproc = nfs3copyfh;
1707         fi.lookupproc = nfs3lookup;
1708         fi.xattrdirproc = acl_getxattrdir3;
1709
1710         res.resok.data = kmem_alloc(MAXPATHLEN, KM_SLEEP);
1711
1712         resdata_backup = res.resok.data;
1713
1714         douprintf = 1;
1715
1716         t = gethrtime();
1717
1718         error = rfs3call(VTOMI(vp), NFSPROC3_READLINK,
1719             xdr_READLINK3args, (caddr_t)&args,
1720             xdr_READLINK3res, (caddr_t)&res, cr,
1721             &douprintf, &res.status, 0, &fi);
1722
1723         if (res.resok.data == nfs3nametoolong)
1724                 error = EINVAL;
1725
1726         if (error) {
1727                 kmem_free(resdata_backup, MAXPATHLEN);
1728                 return (error);
1729         }
1730
1731         error = geterrno3(res.status);
1732         if (!error) {
1733                 nfs3_cache_post_op_attr(vp, &res.resok.symlink_attributes, t,
1734                     cr);
1735                 len = strlen(res.resok.data);
1736                 error = uiomove(res.resok.data, len, UIO_READ, uiop);
1737                 if (nfs3_do_symlink_cache && rp->r_symlink.contents == NULL) {
1738                         mutex_enter(&rp->r_statelock);
1739                                 if (rp->r_symlink.contents == NULL) {
1740                                 rp->r_symlink.contents = res.resok.data;
1741                                 rp->r_symlink.len = len;
1742                                 rp->r_symlink.size = MAXPATHLEN;
1743                                 mutex_exit(&rp->r_statelock);
1744                         } else {
1745                                 mutex_exit(&rp->r_statelock);
1746
1747                                 kmem_free((void *)res.resok.data, MAXPATHLEN);
1748                         }
1749                 } else {
1750                         kmem_free((void *)res.resok.data, MAXPATHLEN);
1751                 }
1752         } else {
1753                 nfs3_cache_post_op_attr(vp,
1754                     &res.resfail.symlink_attributes, t, cr);
1755                 PURGE_STALE_FH(error, vp, cr);
1756
1757                 kmem_free((void *)res.resok.data, MAXPATHLEN);
1758
1759         }
1760
1761         /*
1762          * The over the wire error for attempting to readlink something
1763          * other than a symbolic link is ENXIO.  However, we need to
1764          * return EINVAL instead of ENXIO, so we map it here.
1765          */
1766         return (error == ENXIO ? EINVAL : error);
1767 }
1768
1769 /*
1770  * Flush local dirty pages to stable storage on the server.
1771  *
1772  * If FNODSYNC is specified, then there is nothing to do because
1773  * metadata changes are not cached on the client before being
1774  * sent to the server.
1775  */
1776 /* ARGSUSED */
1777 static int
1778 nfs3_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
1779 {
1780         int error;
1781
1782         if ((syncflag & FNODSYNC) || IS_SWAPVP(vp))
1783                 return (0);
1784         if (nfs_zone() != VTOMI(vp)->mi_zone)
1785                 return (EIO);
1786
1787         error = nfs3_putpage_commit(vp, (offset_t)0, 0, cr);
1788         if (!error)
1789                 error = VTOR(vp)->r_error;
1790         return (error);
1791 }
1792
1793 /*
1794  * Weirdness: if the file was removed or the target of a rename
1795  * operation while it was open, it got renamed instead.  Here we
1796  * remove the renamed file.
1797  */
1798 /* ARGSUSED */
1799 static void
1800 nfs3_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
1801 {
1802         rnode_t *rp;
1803
1804         ASSERT(vp != DNLC_NO_VNODE);
1805
1806         /*
1807          * If this is coming from the wrong zone, we let someone in the right
1808          * zone take care of it asynchronously.  We can get here due to
1809          * VN_RELE() being called from pageout() or fsflush().  This call may
1810          * potentially turn into an expensive no-op if, for instance, v_count
1811          * gets incremented in the meantime, but it's still correct.
1812          */
1813         if (nfs_zone() != VTOMI(vp)->mi_zone) {
1814                 nfs_async_inactive(vp, cr, nfs3_inactive);
1815                 return;
1816         }
1817
1818         rp = VTOR(vp);
1819 redo:
1820         if (rp->r_unldvp != NULL) {
1821                 /*
1822                  * Save the vnode pointer for the directory where the
1823                  * unlinked-open file got renamed, then set it to NULL
1824                  * to prevent another thread from getting here before
1825                  * we're done with the remove.  While we have the
1826                  * statelock, make local copies of the pertinent rnode
1827                  * fields.  If we weren't to do this in an atomic way, the
1828                  * the unl* fields could become inconsistent with respect
1829                  * to each other due to a race condition between this
1830                  * code and nfs_remove().  See bug report 1034328.
1831                  */
1832                 mutex_enter(&rp->r_statelock);
1833                 if (rp->r_unldvp != NULL) {
1834                         vnode_t *unldvp;
1835                         char *unlname;
1836                         cred_t *unlcred;
1837                         REMOVE3args args;
1838                         REMOVE3res res;
1839                         int douprintf;
1840                         int error;
1841                         hrtime_t t;
1842
1843                         unldvp = rp->r_unldvp;
1844                         rp->r_unldvp = NULL;
1845                         unlname = rp->r_unlname;
1846                         rp->r_unlname = NULL;
1847                         unlcred = rp->r_unlcred;
1848                         rp->r_unlcred = NULL;
1849                         mutex_exit(&rp->r_statelock);
1850
1851                         /*
1852                          * If there are any dirty pages left, then flush
1853                          * them.  This is unfortunate because they just
1854                          * may get thrown away during the remove operation,
1855                          * but we have to do this for correctness.
1856                          */
1857                         if (vn_has_cached_data(vp) &&
1858                             ((rp->r_flags & RDIRTY) || rp->r_count > 0)) {
1859                                 ASSERT(vp->v_type != VCHR);
1860                                 error = nfs3_putpage(vp, (offset_t)0, 0, 0,
1861                                     cr, ct);
1862                                 if (error) {
1863                                         mutex_enter(&rp->r_statelock);
1864                                         if (!rp->r_error)
1865                                                 rp->r_error = error;
1866                                         mutex_exit(&rp->r_statelock);
1867                                 }
1868                         }
1869
1870                         /*
1871                          * Do the remove operation on the renamed file
1872                          */
1873                         setdiropargs3(&args.object, unlname, unldvp);
1874
1875                         douprintf = 1;
1876
1877                         t = gethrtime();
1878
1879                         error = rfs3call(VTOMI(unldvp), NFSPROC3_REMOVE,
1880                             xdr_diropargs3, (caddr_t)&args,
1881                             xdr_REMOVE3res, (caddr_t)&res, unlcred,
1882                             &douprintf, &res.status, 0, NULL);
1883
1884                         if (error) {
1885                                 PURGE_ATTRCACHE(unldvp);
1886                         } else {
1887                                 error = geterrno3(res.status);
1888                                 if (!error) {
1889                                         nfs3_cache_wcc_data(unldvp,
1890                                             &res.resok.dir_wcc, t, cr);
1891                                         if (HAVE_RDDIR_CACHE(VTOR(unldvp)))
1892                                                 nfs_purge_rddir_cache(unldvp);
1893                                 } else {
1894                                         nfs3_cache_wcc_data(unldvp,
1895                                             &res.resfail.dir_wcc, t, cr);
1896                                         PURGE_STALE_FH(error, unldvp, cr);
1897                                 }
1898                         }
1899
1900                         /*
1901                          * Release stuff held for the remove
1902                          */
1903                         VN_RELE(unldvp);
1904                         kmem_free(unlname, MAXNAMELEN);
1905                         crfree(unlcred);
1906                         goto redo;
1907                 }
1908                 mutex_exit(&rp->r_statelock);
1909         }
1910
1911         rp_addfree(rp, cr);
1912 }
1913
1914 /*
1915  * Remote file system operations having to do with directory manipulation.
1916  */
1917
1918 /* ARGSUSED */
1919 static int
1920 nfs3_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
1921         int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
1922         int *direntflags, pathname_t *realpnp)
1923 {
1924         int error;
1925         vnode_t *vp;
1926         vnode_t *avp = NULL;
1927         rnode_t *drp;
1928
1929         if (nfs_zone() != VTOMI(dvp)->mi_zone)
1930                 return (EPERM);
1931
1932         drp = VTOR(dvp);
1933
1934         /*
1935          * Are we looking up extended attributes?  If so, "dvp" is
1936          * the file or directory for which we want attributes, and
1937          * we need a lookup of the hidden attribute directory
1938          * before we lookup the rest of the path.
1939          */
1940         if (flags & LOOKUP_XATTR) {
1941                 bool_t cflag = ((flags & CREATE_XATTR_DIR) != 0);
1942                 mntinfo_t *mi;
1943
1944                 mi = VTOMI(dvp);
1945                 if (!(mi->mi_flags & MI_EXTATTR))
1946                         return (EINVAL);
1947
1948                 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp)))
1949                         return (EINTR);
1950
1951                 (void) nfs3lookup_dnlc(dvp, XATTR_DIR_NAME, &avp, cr);
1952                 if (avp == NULL)
1953                         error = acl_getxattrdir3(dvp, &avp, cflag, cr, 0);
1954                 else
1955                         error = 0;
1956
1957                 nfs_rw_exit(&drp->r_rwlock);
1958
1959                 if (error) {
1960                         if (mi->mi_flags & MI_EXTATTR)
1961                                 return (error);
1962                         return (EINVAL);
1963                 }
1964                 dvp = avp;
1965                 drp = VTOR(dvp);
1966         }
1967
1968         if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp))) {
1969                 error = EINTR;
1970                 goto out;
1971         }
1972
1973         error = nfs3lookup(dvp, nm, vpp, pnp, flags, rdir, cr, 0);
1974
1975         nfs_rw_exit(&drp->r_rwlock);
1976
1977         /*
1978          * If vnode is a device, create special vnode.
1979          */
1980         if (!error && IS_DEVVP(*vpp)) {
1981                 vp = *vpp;
1982                 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
1983                 VN_RELE(vp);
1984         }
1985
1986 out:
1987         if (avp != NULL)
1988                 VN_RELE(avp);
1989
1990         return (error);
1991 }
1992
1993 static int nfs3_lookup_neg_cache = 1;
1994
1995 #ifdef DEBUG
1996 static int nfs3_lookup_dnlc_hits = 0;
1997 static int nfs3_lookup_dnlc_misses = 0;
1998 static int nfs3_lookup_dnlc_neg_hits = 0;
1999 static int nfs3_lookup_dnlc_disappears = 0;
2000 static int nfs3_lookup_dnlc_lookups = 0;
2001 #endif
2002
2003 /* ARGSUSED */
2004 int
2005 nfs3lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
2006         int flags, vnode_t *rdir, cred_t *cr, int rfscall_flags)
2007 {
2008         int error;
2009         rnode_t *drp;
2010
2011         ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
2012         /*
2013          * If lookup is for "", just return dvp.  Don't need
2014          * to send it over the wire, look it up in the dnlc,
2015          * or perform any access checks.
2016          */
2017         if (*nm == '\0') {
2018                 VN_HOLD(dvp);
2019                 *vpp = dvp;
2020                 return (0);
2021         }
2022
2023         /*
2024          * Can't do lookups in non-directories.
2025          */
2026         if (dvp->v_type != VDIR)
2027                 return (ENOTDIR);
2028
2029         /*
2030          * If we're called with RFSCALL_SOFT, it's important that
2031          * the only rfscall is one we make directly; if we permit
2032          * an access call because we're looking up "." or validating
2033          * a dnlc hit, we'll deadlock because that rfscall will not
2034          * have the RFSCALL_SOFT set.
2035          */
2036         if (rfscall_flags & RFSCALL_SOFT)
2037                 goto callit;
2038
2039         /*
2040          * If lookup is for ".", just return dvp.  Don't need
2041          * to send it over the wire or look it up in the dnlc,
2042          * just need to check access.
2043          */
2044         if (strcmp(nm, ".") == 0) {
2045                 error = nfs3_access(dvp, VEXEC, 0, cr, NULL);
2046                 if (error)
2047                         return (error);
2048                 VN_HOLD(dvp);
2049                 *vpp = dvp;
2050                 return (0);
2051         }
2052
2053         drp = VTOR(dvp);
2054         if (!(drp->r_flags & RLOOKUP)) {
2055                 mutex_enter(&drp->r_statelock);
2056                 drp->r_flags |= RLOOKUP;
2057                 mutex_exit(&drp->r_statelock);
2058         }
2059
2060         /*
2061          * Lookup this name in the DNLC.  If there was a valid entry,
2062          * then return the results of the lookup.
2063          */
2064         error = nfs3lookup_dnlc(dvp, nm, vpp, cr);
2065         if (error || *vpp != NULL)
2066                 return (error);
2067
2068 callit:
2069         error = nfs3lookup_otw(dvp, nm, vpp, cr, rfscall_flags);
2070
2071         return (error);
2072 }
2073
2074 static int
2075 nfs3lookup_dnlc(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
2076 {
2077         int error;
2078         vnode_t *vp;
2079
2080         ASSERT(*nm != '\0');
2081         ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
2082         /*
2083          * Lookup this name in the DNLC.  If successful, then validate
2084          * the caches and then recheck the DNLC.  The DNLC is rechecked
2085          * just in case this entry got invalidated during the call
2086          * to nfs3_validate_caches.
2087          *
2088          * An assumption is being made that it is safe to say that a
2089          * file exists which may not on the server.  Any operations to
2090          * the server will fail with ESTALE.
2091          */
2092 #ifdef DEBUG
2093         nfs3_lookup_dnlc_lookups++;
2094 #endif
2095         vp = dnlc_lookup(dvp, nm);
2096         if (vp != NULL) {
2097                 VN_RELE(vp);
2098                 if (vp == DNLC_NO_VNODE && !vn_is_readonly(dvp)) {
2099                         PURGE_ATTRCACHE(dvp);
2100                 }
2101                 error = nfs3_validate_caches(dvp, cr);
2102                 if (error)
2103                         return (error);
2104                 vp = dnlc_lookup(dvp, nm);
2105                 if (vp != NULL) {
2106                         error = nfs3_access(dvp, VEXEC, 0, cr, NULL);
2107                         if (error) {
2108                                 VN_RELE(vp);
2109                                 return (error);
2110                         }
2111                         if (vp == DNLC_NO_VNODE) {
2112                                 VN_RELE(vp);
2113 #ifdef DEBUG
2114                                 nfs3_lookup_dnlc_neg_hits++;
2115 #endif
2116                                 return (ENOENT);
2117                         }
2118                         *vpp = vp;
2119 #ifdef DEBUG
2120                         nfs3_lookup_dnlc_hits++;
2121 #endif
2122                         return (0);
2123                 }
2124 #ifdef DEBUG
2125                 nfs3_lookup_dnlc_disappears++;
2126 #endif
2127         }
2128 #ifdef DEBUG
2129         else
2130                 nfs3_lookup_dnlc_misses++;
2131 #endif
2132
2133         *vpp = NULL;
2134
2135         return (0);
2136 }
2137
2138 static int
2139 nfs3lookup_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr,
2140         int rfscall_flags)
2141 {
2142         int error;
2143         LOOKUP3args args;
2144         LOOKUP3vres res;
2145         int douprintf;
2146         struct vattr vattr;
2147         struct vattr dvattr;
2148         vnode_t *vp;
2149         failinfo_t fi;
2150         hrtime_t t;
2151
2152         ASSERT(*nm != '\0');
2153         ASSERT(dvp->v_type == VDIR);
2154         ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
2155
2156         setdiropargs3(&args.what, nm, dvp);
2157
2158         fi.vp = dvp;
2159         fi.fhp = (caddr_t)&args.what.dir;
2160         fi.copyproc = nfs3copyfh;
2161         fi.lookupproc = nfs3lookup;
2162         fi.xattrdirproc = acl_getxattrdir3;
2163         res.obj_attributes.fres.vp = dvp;
2164         res.obj_attributes.fres.vap = &vattr;
2165         res.dir_attributes.fres.vp = dvp;
2166         res.dir_attributes.fres.vap = &dvattr;
2167
2168         douprintf = 1;
2169
2170         t = gethrtime();
2171
2172         error = rfs3call(VTOMI(dvp), NFSPROC3_LOOKUP,
2173             xdr_diropargs3, (caddr_t)&args,
2174             xdr_LOOKUP3vres, (caddr_t)&res, cr,
2175             &douprintf, &res.status, rfscall_flags, &fi);
2176
2177         if (error)
2178                 return (error);
2179
2180         nfs3_cache_post_op_vattr(dvp, &res.dir_attributes, t, cr);
2181
2182         error = geterrno3(res.status);
2183         if (error) {
2184                 PURGE_STALE_FH(error, dvp, cr);
2185                 if (error == ENOENT && nfs3_lookup_neg_cache)
2186                         dnlc_enter(dvp, nm, DNLC_NO_VNODE);
2187                 return (error);
2188         }
2189
2190         if (res.obj_attributes.attributes) {
2191                 vp = makenfs3node_va(&res.object, res.obj_attributes.fres.vap,
2192                     dvp->v_vfsp, t, cr, VTOR(dvp)->r_path, nm);
2193         } else {
2194                 vp = makenfs3node_va(&res.object, NULL,
2195                     dvp->v_vfsp, t, cr, VTOR(dvp)->r_path, nm);
2196                 if (vp->v_type == VNON) {
2197                         vattr.va_mask = AT_TYPE;
2198                         error = nfs3getattr(vp, &vattr, cr);
2199                         if (error) {
2200                                 VN_RELE(vp);
2201                                 return (error);
2202                         }
2203                         vp->v_type = vattr.va_type;
2204                 }
2205         }
2206
2207         if (!(rfscall_flags & RFSCALL_SOFT))
2208                 dnlc_update(dvp, nm, vp);
2209
2210         *vpp = vp;
2211
2212         return (error);
2213 }
2214
2215 #ifdef DEBUG
2216 static int nfs3_create_misses = 0;
2217 #endif
2218
2219 /* ARGSUSED */
2220 static int
2221 nfs3_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
2222         int mode, vnode_t **vpp, cred_t *cr, int lfaware, caller_context_t *ct,
2223         vsecattr_t *vsecp)
2224 {
2225         int error;
2226         vnode_t *vp;
2227         rnode_t *rp;
2228         struct vattr vattr;
2229         rnode_t *drp;
2230         vnode_t *tempvp;
2231
2232         drp = VTOR(dvp);
2233         if (nfs_zone() != VTOMI(dvp)->mi_zone)
2234                 return (EPERM);
2235         if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2236                 return (EINTR);
2237
2238 top:
2239         /*
2240          * We make a copy of the attributes because the caller does not
2241          * expect us to change what va points to.
2242          */
2243         vattr = *va;
2244
2245         /*
2246          * If the pathname is "", just use dvp.  Don't need
2247          * to send it over the wire, look it up in the dnlc,
2248          * or perform any access checks.
2249          */
2250         if (*nm == '\0') {
2251                 error = 0;
2252                 VN_HOLD(dvp);
2253                 vp = dvp;
2254         /*
2255          * If the pathname is ".", just use dvp.  Don't need
2256          * to send it over the wire or look it up in the dnlc,
2257          * just need to check access.
2258          */
2259         } else if (strcmp(nm, ".") == 0) {
2260                 error = nfs3_access(dvp, VEXEC, 0, cr, ct);
2261                 if (error) {
2262                         nfs_rw_exit(&drp->r_rwlock);
2263                         return (error);
2264                 }
2265                 VN_HOLD(dvp);
2266                 vp = dvp;
2267         /*
2268          * We need to go over the wire, just to be sure whether the
2269          * file exists or not.  Using the DNLC can be dangerous in
2270          * this case when making a decision regarding existence.
2271          */
2272         } else {
2273                 error = nfs3lookup_otw(dvp, nm, &vp, cr, 0);
2274         }
2275         if (!error) {
2276                 if (exclusive == EXCL)
2277                         error = EEXIST;
2278                 else if (vp->v_type == VDIR && (mode & VWRITE))
2279                         error = EISDIR;
2280                 else {
2281                         /*
2282                          * If vnode is a device, create special vnode.
2283                          */
2284                         if (IS_DEVVP(vp)) {
2285                                 tempvp = vp;
2286                                 vp = specvp(vp, vp->v_rdev, vp->v_type, cr);
2287                                 VN_RELE(tempvp);
2288                         }
2289                         if (!(error = fop_access(vp, mode, 0, cr, ct))) {
2290                                 if ((vattr.va_mask & AT_SIZE) &&
2291                                     vp->v_type == VREG) {
2292                                         rp = VTOR(vp);
2293                                         /*
2294                                          * Check here for large file handled
2295                                          * by LF-unaware process (as
2296                                          * ufs_create() does)
2297                                          */
2298                                         if (!(lfaware & FOFFMAX)) {
2299                                                 mutex_enter(&rp->r_statelock);
2300                                                 if (rp->r_size > MAXOFF32_T)
2301                                                         error = EOVERFLOW;
2302                                                 mutex_exit(&rp->r_statelock);
2303                                         }
2304                                         if (!error) {
2305                                                 vattr.va_mask = AT_SIZE;
2306                                                 error = nfs3setattr(vp,
2307                                                     &vattr, 0, cr);
2308
2309                                                 /*
2310                                                  * Existing file was truncated;
2311                                                  * emit a create event.
2312                                                  */
2313                                                 vnevent_create(vp, ct);
2314                                         }
2315                                 }
2316                         }
2317                 }
2318                 nfs_rw_exit(&drp->r_rwlock);
2319                 if (error) {
2320                         VN_RELE(vp);
2321                 } else {
2322                         *vpp = vp;
2323                 }
2324
2325                 return (error);
2326         }
2327
2328         dnlc_remove(dvp, nm);
2329
2330         /*
2331          * Decide what the group-id of the created file should be.
2332          * Set it in attribute list as advisory...
2333          */
2334         error = setdirgid(dvp, &vattr.va_gid, cr);
2335         if (error) {
2336                 nfs_rw_exit(&drp->r_rwlock);
2337                 return (error);
2338         }
2339         vattr.va_mask |= AT_GID;
2340
2341         ASSERT(vattr.va_mask & AT_TYPE);
2342         if (vattr.va_type == VREG) {
2343                 ASSERT(vattr.va_mask & AT_MODE);
2344                 if (MANDMODE(vattr.va_mode)) {
2345                         nfs_rw_exit(&drp->r_rwlock);
2346                         return (EACCES);
2347                 }
2348                 error = nfs3create(dvp, nm, &vattr, exclusive, mode, vpp, cr,
2349                     lfaware);
2350                 /*
2351                  * If this is not an exclusive create, then the CREATE
2352                  * request will be made with the GUARDED mode set.  This
2353                  * means that the server will return EEXIST if the file
2354                  * exists.  The file could exist because of a retransmitted
2355                  * request.  In this case, we recover by starting over and
2356                  * checking to see whether the file exists.  This second
2357                  * time through it should and a CREATE request will not be
2358                  * sent.
2359                  *
2360                  * This handles the problem of a dangling CREATE request
2361                  * which contains attributes which indicate that the file
2362                  * should be truncated.  This retransmitted request could
2363                  * possibly truncate valid data in the file if not caught
2364                  * by the duplicate request mechanism on the server or if
2365                  * not caught by other means.  The scenario is:
2366                  *
2367                  * Client transmits CREATE request with size = 0
2368                  * Client times out, retransmits request.
2369                  * Response to the first request arrives from the server
2370                  *  and the client proceeds on.
2371                  * Client writes data to the file.
2372                  * The server now processes retransmitted CREATE request
2373                  *  and truncates file.
2374                  *
2375                  * The use of the GUARDED CREATE request prevents this from
2376                  * happening because the retransmitted CREATE would fail
2377                  * with EEXIST and would not truncate the file.
2378                  */
2379                 if (error == EEXIST && exclusive == NONEXCL) {
2380 #ifdef DEBUG
2381                         nfs3_create_misses++;
2382 #endif
2383                         goto top;
2384                 }
2385                 nfs_rw_exit(&drp->r_rwlock);
2386                 return (error);
2387         }
2388         error = nfs3mknod(dvp, nm, &vattr, exclusive, mode, vpp, cr);
2389         nfs_rw_exit(&drp->r_rwlock);
2390         return (error);
2391 }
2392
2393 /* ARGSUSED */
2394 static int
2395 nfs3create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
2396         int mode, vnode_t **vpp, cred_t *cr, int lfaware)
2397 {
2398         int error;
2399         CREATE3args args;
2400         CREATE3res res;
2401         int douprintf;
2402         vnode_t *vp;
2403         struct vattr vattr;
2404         nfstime3 *verfp;
2405         rnode_t *rp;
2406         timestruc_t now;
2407         hrtime_t t;
2408
2409         ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
2410         setdiropargs3(&args.where, nm, dvp);
2411         if (exclusive == EXCL) {
2412                 args.how.mode = EXCLUSIVE;
2413                 /*
2414                  * Construct the create verifier.  This verifier needs
2415                  * to be unique between different clients.  It also needs
2416                  * to vary for each exclusive create request generated
2417                  * from the client to the server.
2418                  *
2419                  * The first attempt is made to use the hostid and a
2420                  * unique number on the client.  If the hostid has not
2421                  * been set, the high resolution time that the exclusive
2422                  * create request is being made is used.  This will work
2423                  * unless two different clients, both with the hostid
2424                  * not set, attempt an exclusive create request on the
2425                  * same file, at exactly the same clock time.  The
2426                  * chances of this happening seem small enough to be
2427                  * reasonable.
2428                  */
2429                 verfp = (nfstime3 *)&args.how.createhow3_u.verf;
2430                 verfp->seconds = zone_get_hostid(NULL);
2431                 if (verfp->seconds != 0)
2432                         verfp->nseconds = newnum();
2433                 else {
2434                         gethrestime(&now);
2435                         verfp->seconds = now.tv_sec;
2436                         verfp->nseconds = now.tv_nsec;
2437                 }
2438                 /*
2439                  * Since the server will use this value for the mtime,
2440                  * make sure that it can't overflow. Zero out the MSB.
2441                  * The actual value does not matter here, only its uniqeness.
2442                  */
2443                 verfp->seconds %= INT32_MAX;
2444         } else {
2445                 /*
2446                  * Issue the non-exclusive create in guarded mode.  This
2447                  * may result in some false EEXIST responses for
2448                  * retransmitted requests, but these will be handled at
2449                  * a higher level.  By using GUARDED, duplicate requests
2450                  * to do file truncation and possible access problems
2451                  * can be avoided.
2452                  */
2453                 args.how.mode = GUARDED;
2454                 error = vattr_to_sattr3(va,
2455                     &args.how.createhow3_u.obj_attributes);
2456                 if (error) {
2457                         /* req time field(s) overflow - return immediately */
2458                         return (error);
2459                 }
2460         }
2461
2462         douprintf = 1;
2463
2464         t = gethrtime();
2465
2466         error = rfs3call(VTOMI(dvp), NFSPROC3_CREATE,
2467             xdr_CREATE3args, (caddr_t)&args,
2468             xdr_CREATE3res, (caddr_t)&res, cr,
2469             &douprintf, &res.status, 0, NULL);
2470
2471         if (error) {
2472                 PURGE_ATTRCACHE(dvp);
2473                 return (error);
2474         }
2475
2476         error = geterrno3(res.status);
2477         if (!error) {
2478                 nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, cr);
2479                 if (HAVE_RDDIR_CACHE(VTOR(dvp)))
2480                         nfs_purge_rddir_cache(dvp);
2481
2482                 /*
2483                  * On exclusive create the times need to be explicitly
2484                  * set to clear any potential verifier that may be stored
2485                  * in one of these fields (see comment below).  This
2486                  * is done here to cover the case where no post op attrs
2487                  * were returned or a 'invalid' time was returned in
2488                  * the attributes.
2489                  */
2490                 if (exclusive == EXCL)
2491                         va->va_mask |= (AT_MTIME | AT_ATIME);
2492
2493                 if (!res.resok.obj.handle_follows) {
2494                         error = nfs3lookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
2495                         if (error)
2496                                 return (error);
2497                 } else {
2498                         if (res.resok.obj_attributes.attributes) {
2499                                 vp = makenfs3node(&res.resok.obj.handle,
2500                                     &res.resok.obj_attributes.attr,
2501                                     dvp->v_vfsp, t, cr, NULL, NULL);
2502                         } else {
2503                                 vp = makenfs3node(&res.resok.obj.handle, NULL,
2504                                     dvp->v_vfsp, t, cr, NULL, NULL);
2505
2506                                 /*
2507                                  * On an exclusive create, it is possible
2508                                  * that attributes were returned but those
2509                                  * postop attributes failed to decode
2510                                  * properly.  If this is the case,
2511                                  * then most likely the atime or mtime
2512                                  * were invalid for our client; this
2513                                  * is caused by the server storing the
2514                                  * create verifier in one of the time
2515                                  * fields(most likely mtime).
2516                                  * So... we are going to setattr just the
2517                                  * atime/mtime to clear things up.
2518                                  */
2519                                 if (exclusive == EXCL) {
2520                                         if (error =
2521                                             nfs3excl_create_settimes(vp,
2522                                             va, cr)) {
2523                                                 /*
2524                                                  * Setting the times failed.
2525                                                  * Remove the file and return
2526                                                  * the error.
2527                                                  */
2528                                                 VN_RELE(vp);
2529                                                 (void) nfs3_remove(dvp,
2530                                                     nm, cr, NULL, 0);
2531                                                 return (error);
2532                                         }
2533                                 }
2534
2535                                 /*
2536                                  * This handles the non-exclusive case
2537                                  * and the exclusive case where no post op
2538                                  * attrs were returned.
2539                                  */
2540                                 if (vp->v_type == VNON) {
2541                                         vattr.va_mask = AT_TYPE;
2542                                         error = nfs3getattr(vp, &vattr, cr);
2543                                         if (error) {
2544                                                 VN_RELE(vp);
2545                                                 return (error);
2546                                         }
2547                                         vp->v_type = vattr.va_type;
2548                                 }
2549                         }
2550                         dnlc_update(dvp, nm, vp);
2551                 }
2552
2553                 rp = VTOR(vp);
2554
2555                 /*
2556                  * Check here for large file handled by
2557                  * LF-unaware process (as ufs_create() does)
2558                  */
2559                 if ((va->va_mask & AT_SIZE) && vp->v_type == VREG &&
2560                     !(lfaware & FOFFMAX)) {
2561                         mutex_enter(&rp->r_statelock);
2562                         if (rp->r_size > MAXOFF32_T) {
2563                                 mutex_exit(&rp->r_statelock);
2564                                 VN_RELE(vp);
2565                                 return (EOVERFLOW);
2566                         }
2567                         mutex_exit(&rp->r_statelock);
2568                 }
2569
2570                 if (exclusive == EXCL &&
2571                     (va->va_mask & ~(AT_GID | AT_SIZE))) {
2572                         /*
2573                          * If doing an exclusive create, then generate
2574                          * a SETATTR to set the initial attributes.
2575                          * Try to set the mtime and the atime to the
2576                          * server's current time.  It is somewhat
2577                          * expected that these fields will be used to
2578                          * store the exclusive create cookie.  If not,
2579                          * server implementors will need to know that
2580                          * a SETATTR will follow an exclusive create
2581                          * and the cookie should be destroyed if
2582                          * appropriate. This work may have been done
2583                          * earlier in this function if post op attrs
2584                          * were not available.
2585                          *
2586                          * The AT_GID and AT_SIZE bits are turned off
2587                          * so that the SETATTR request will not attempt
2588                          * to process these.  The gid will be set
2589                          * separately if appropriate.  The size is turned
2590                          * off because it is assumed that a new file will
2591                          * be created empty and if the file wasn't empty,
2592                          * then the exclusive create will have failed
2593                          * because the file must have existed already.
2594                          * Therefore, no truncate operation is needed.
2595                          */
2596                         va->va_mask &= ~(AT_GID | AT_SIZE);
2597                         error = nfs3setattr(vp, va, 0, cr);
2598                         if (error) {
2599                                 /*
2600                                  * Couldn't correct the attributes of
2601                                  * the newly created file and the
2602                                  * attributes are wrong.  Remove the
2603                                  * file and return an error to the
2604                                  * application.
2605                                  */
2606                                 VN_RELE(vp);
2607                                 (void) nfs3_remove(dvp, nm, cr, NULL, 0);
2608                                 return (error);
2609                         }
2610                 }
2611
2612                 if (va->va_gid != rp->r_attr.va_gid) {
2613                         /*
2614                          * If the gid on the file isn't right, then
2615                          * generate a SETATTR to attempt to change
2616                          * it.  This may or may not work, depending
2617                          * upon the server's semantics for allowing
2618                          * file ownership changes.
2619                          */
2620                         va->va_mask = AT_GID;
2621                         (void) nfs3setattr(vp, va, 0, cr);
2622                 }
2623
2624                 /*
2625                  * If vnode is a device create special vnode
2626                  */
2627                 if (IS_DEVVP(vp)) {
2628                         *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
2629                         VN_RELE(vp);
2630                 } else
2631                         *vpp = vp;
2632         } else {
2633                 nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, t, cr);
2634                 PURGE_STALE_FH(error, dvp, cr);
2635         }
2636
2637         return (error);
2638 }
2639
2640 /*
2641  * Special setattr function to take care of rest of atime/mtime
2642  * after successful exclusive create.  This function exists to avoid
2643  * handling attributes from the server; exclusive the atime/mtime fields
2644  * may be 'invalid' in client's view and therefore can not be trusted.
2645  */
2646 static int
2647 nfs3excl_create_settimes(vnode_t *vp, struct vattr *vap, cred_t *cr)
2648 {
2649         int error;
2650         uint_t mask;
2651         SETATTR3args args;
2652         SETATTR3res res;
2653         int douprintf;
2654         rnode_t *rp;
2655         hrtime_t t;
2656
2657         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
2658         /* save the caller's mask so that it can be reset later */
2659         mask = vap->va_mask;
2660
2661         rp = VTOR(vp);
2662
2663         args.object = *RTOFH3(rp);
2664         args.guard.check = FALSE;
2665
2666         /* Use the mask to initialize the arguments */
2667         vap->va_mask = 0;
2668         error = vattr_to_sattr3(vap, &args.new_attributes);
2669
2670         /* We want to set just atime/mtime on this request */
2671         args.new_attributes.atime.set_it = SET_TO_SERVER_TIME;
2672         args.new_attributes.mtime.set_it = SET_TO_SERVER_TIME;
2673
2674         douprintf = 1;
2675
2676         t = gethrtime();
2677
2678         error = rfs3call(VTOMI(vp), NFSPROC3_SETATTR,
2679             xdr_SETATTR3args, (caddr_t)&args,
2680             xdr_SETATTR3res, (caddr_t)&res, cr,
2681             &douprintf, &res.status, 0, NULL);
2682
2683         if (error) {
2684                 vap->va_mask = mask;
2685                 return (error);
2686         }
2687
2688         error = geterrno3(res.status);
2689         if (!error) {
2690                 /*
2691                  * It is important to pick up the attributes.
2692                  * Since this is the exclusive create path, the
2693                  * attributes on the initial create were ignored
2694                  * and we need these to have the correct info.
2695                  */
2696                 nfs3_cache_wcc_data(vp, &res.resok.obj_wcc, t, cr);
2697                 /*
2698                  * No need to do the atime/mtime work again so clear
2699                  * the bits.
2700                  */
2701                 mask &= ~(AT_ATIME | AT_MTIME);
2702         } else {
2703                 nfs3_cache_wcc_data(vp, &res.resfail.obj_wcc, t, cr);
2704         }
2705
2706         vap->va_mask = mask;
2707
2708         return (error);
2709 }
2710
2711 /* ARGSUSED */
2712 static int
2713 nfs3mknod(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
2714         int mode, vnode_t **vpp, cred_t *cr)
2715 {
2716         int error;
2717         MKNOD3args args;
2718         MKNOD3res res;
2719         int douprintf;
2720         vnode_t *vp;
2721         struct vattr vattr;
2722         hrtime_t t;
2723
2724         ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
2725         switch (va->va_type) {
2726         case VCHR:
2727         case VBLK:
2728                 setdiropargs3(&args.where, nm, dvp);
2729                 args.what.type = (va->va_type == VCHR) ? NF3CHR : NF3BLK;
2730                 error = vattr_to_sattr3(va,
2731                     &args.what.mknoddata3_u.device.dev_attributes);
2732                 if (error) {
2733                         /* req time field(s) overflow - return immediately */
2734                         return (error);
2735                 }
2736                 args.what.mknoddata3_u.device.spec.specdata1 =
2737                     getmajor(va->va_rdev);
2738                 args.what.mknoddata3_u.device.spec.specdata2 =
2739                     getminor(va->va_rdev);
2740                 break;
2741
2742         case VFIFO:
2743         case VSOCK:
2744                 setdiropargs3(&args.where, nm, dvp);
2745                 args.what.type = (va->va_type == VFIFO) ? NF3FIFO : NF3SOCK;
2746                 error = vattr_to_sattr3(va,
2747                     &args.what.mknoddata3_u.pipe_attributes);
2748                 if (error) {
2749                         /* req time field(s) overflow - return immediately */
2750                         return (error);
2751                 }
2752                 break;
2753
2754         default:
2755                 return (EINVAL);
2756         }
2757
2758         douprintf = 1;
2759
2760         t = gethrtime();
2761
2762         error = rfs3call(VTOMI(dvp), NFSPROC3_MKNOD,
2763             xdr_MKNOD3args, (caddr_t)&args,
2764             xdr_MKNOD3res, (caddr_t)&res, cr,
2765             &douprintf, &res.status, 0, NULL);
2766
2767         if (error) {
2768                 PURGE_ATTRCACHE(dvp);
2769                 return (error);
2770         }
2771
2772         error = geterrno3(res.status);
2773         if (!error) {
2774                 nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, cr);
2775                 if (HAVE_RDDIR_CACHE(VTOR(dvp)))
2776                         nfs_purge_rddir_cache(dvp);
2777
2778                 if (!res.resok.obj.handle_follows) {
2779                         error = nfs3lookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
2780                         if (error)
2781                                 return (error);
2782                 } else {
2783                         if (res.resok.obj_attributes.attributes) {
2784                                 vp = makenfs3node(&res.resok.obj.handle,
2785                                     &res.resok.obj_attributes.attr,
2786                                     dvp->v_vfsp, t, cr, NULL, NULL);
2787                         } else {
2788                                 vp = makenfs3node(&res.resok.obj.handle, NULL,
2789                                     dvp->v_vfsp, t, cr, NULL, NULL);
2790                                 if (vp->v_type == VNON) {
2791                                         vattr.va_mask = AT_TYPE;
2792                                         error = nfs3getattr(vp, &vattr, cr);
2793                                         if (error) {
2794                                                 VN_RELE(vp);
2795                                                 return (error);
2796                                         }
2797                                         vp->v_type = vattr.va_type;
2798                                 }
2799
2800                         }
2801                         dnlc_update(dvp, nm, vp);
2802                 }
2803
2804                 if (va->va_gid != VTOR(vp)->r_attr.va_gid) {
2805                         va->va_mask = AT_GID;
2806                         (void) nfs3setattr(vp, va, 0, cr);
2807                 }
2808
2809                 /*
2810                  * If vnode is a device create special vnode
2811                  */
2812                 if (IS_DEVVP(vp)) {
2813                         *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
2814                         VN_RELE(vp);
2815                 } else
2816                         *vpp = vp;
2817         } else {
2818                 nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, t, cr);
2819                 PURGE_STALE_FH(error, dvp, cr);
2820         }
2821         return (error);
2822 }
2823
2824 /*
2825  * Weirdness: if the vnode to be removed is open
2826  * we rename it instead of removing it and nfs_inactive
2827  * will remove the new name.
2828  */
2829 /* ARGSUSED */
2830 static int
2831 nfs3_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct, int flags)
2832 {
2833         int error;
2834         REMOVE3args args;
2835         REMOVE3res res;
2836         vnode_t *vp;
2837         char *tmpname;
2838         int douprintf;
2839         rnode_t *rp;
2840         rnode_t *drp;
2841         hrtime_t t;
2842
2843         if (nfs_zone() != VTOMI(dvp)->mi_zone)
2844                 return (EPERM);
2845         drp = VTOR(dvp);
2846         if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2847                 return (EINTR);
2848
2849         error = nfs3lookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
2850         if (error) {
2851                 nfs_rw_exit(&drp->r_rwlock);
2852                 return (error);
2853         }
2854
2855         if (vp->v_type == VDIR && secpolicy_fs_linkdir(cr, dvp->v_vfsp)) {
2856                 VN_RELE(vp);
2857                 nfs_rw_exit(&drp->r_rwlock);
2858                 return (EPERM);
2859         }
2860
2861         /*
2862          * First just remove the entry from the name cache, as it
2863          * is most likely the only entry for this vp.
2864          */
2865         dnlc_remove(dvp, nm);
2866
2867         /*
2868          * If the file has a v_count > 1 then there may be more than one
2869          * entry in the name cache due multiple links or an open file,
2870          * but we don't have the real reference count so flush all
2871          * possible entries.
2872          */
2873         if (vp->v_count > 1)
2874                 dnlc_purge_vp(vp);
2875
2876         /*
2877          * Now we have the real reference count on the vnode
2878          */
2879         rp = VTOR(vp);
2880         mutex_enter(&rp->r_statelock);
2881         if (vp->v_count > 1 &&
2882             (rp->r_unldvp == NULL || strcmp(nm, rp->r_unlname) == 0)) {
2883                 mutex_exit(&rp->r_statelock);
2884                 tmpname = newname();
2885                 error = nfs3rename(dvp, nm, dvp, tmpname, cr, ct);
2886                 if (error)
2887                         kmem_free(tmpname, MAXNAMELEN);
2888                 else {
2889                         mutex_enter(&rp->r_statelock);
2890                         if (rp->r_unldvp == NULL) {
2891                                 VN_HOLD(dvp);
2892                                 rp->r_unldvp = dvp;
2893                                 if (rp->r_unlcred != NULL)
2894                                         crfree(rp->r_unlcred);
2895                                 crhold(cr);
2896                                 rp->r_unlcred = cr;
2897                                 rp->r_unlname = tmpname;
2898                         } else {
2899                                 kmem_free(rp->r_unlname, MAXNAMELEN);
2900                                 rp->r_unlname = tmpname;
2901                         }
2902                         mutex_exit(&rp->r_statelock);
2903                 }
2904         } else {
2905                 mutex_exit(&rp->r_statelock);
2906                 /*
2907                  * We need to flush any dirty pages which happen to
2908                  * be hanging around before removing the file.  This
2909                  * shouldn't happen very often and mostly on file
2910                  * systems mounted "nocto".
2911                  */
2912                 if (vn_has_cached_data(vp) &&
2913                     ((rp->r_flags & RDIRTY) || rp->r_count > 0)) {
2914                         error = nfs3_putpage(vp, (offset_t)0, 0, 0, cr, ct);
2915                         if (error && (error == ENOSPC || error == EDQUOT)) {
2916                                 mutex_enter(&rp->r_statelock);
2917                                 if (!rp->r_error)
2918                                         rp->r_error = error;
2919                                 mutex_exit(&rp->r_statelock);
2920                         }
2921                 }
2922
2923                 setdiropargs3(&args.object, nm, dvp);
2924
2925                 douprintf = 1;
2926
2927                 t = gethrtime();
2928
2929                 error = rfs3call(VTOMI(dvp), NFSPROC3_REMOVE,
2930                     xdr_diropargs3, (caddr_t)&args,
2931                     xdr_REMOVE3res, (caddr_t)&res, cr,
2932                     &douprintf, &res.status, 0, NULL);
2933
2934                 /*
2935                  * The xattr dir may be gone after last attr is removed,
2936                  * so flush it from dnlc.
2937                  */
2938                 if (dvp->v_flag & V_XATTRDIR)
2939                         dnlc_purge_vp(dvp);
2940
2941                 PURGE_ATTRCACHE(vp);
2942
2943                 if (error) {
2944                         PURGE_ATTRCACHE(dvp);
2945                 } else {
2946                         error = geterrno3(res.status);
2947                         if (!error) {
2948                                 nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t,
2949                                     cr);
2950                                 if (HAVE_RDDIR_CACHE(drp))
2951                                         nfs_purge_rddir_cache(dvp);
2952                         } else {
2953                                 nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc,
2954                                     t, cr);
2955                                 PURGE_STALE_FH(error, dvp, cr);
2956                         }
2957                 }
2958         }
2959
2960         if (error == 0) {
2961                 vnevent_remove(vp, dvp, nm, ct);
2962         }
2963         VN_RELE(vp);
2964
2965         nfs_rw_exit(&drp->r_rwlock);
2966
2967         return (error);
2968 }
2969
2970 /* ARGSUSED */
2971 static int
2972 nfs3_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr,
2973         caller_context_t *ct, int flags)
2974 {
2975         int error;
2976         LINK3args args;
2977         LINK3res res;
2978         vnode_t *realvp;
2979         int douprintf;
2980         mntinfo_t *mi;
2981         rnode_t *tdrp;
2982         hrtime_t t;
2983
2984         if (nfs_zone() != VTOMI(tdvp)->mi_zone)
2985                 return (EPERM);
2986         if (fop_realvp(svp, &realvp, ct) == 0)
2987                 svp = realvp;
2988
2989         mi = VTOMI(svp);
2990
2991         if (!(mi->mi_flags & MI_LINK))
2992                 return (EOPNOTSUPP);
2993
2994         args.file = *VTOFH3(svp);
2995         setdiropargs3(&args.link, tnm, tdvp);
2996
2997         tdrp = VTOR(tdvp);
2998         if (nfs_rw_enter_sig(&tdrp->r_rwlock, RW_WRITER, INTR(tdvp)))
2999                 return (EINTR);
3000
3001         dnlc_remove(tdvp, tnm);
3002
3003         douprintf = 1;
3004
3005         t = gethrtime();
3006
3007         error = rfs3call(mi, NFSPROC3_LINK,
3008             xdr_LINK3args, (caddr_t)&args,
3009             xdr_LINK3res, (caddr_t)&res, cr,
3010             &douprintf, &res.status, 0, NULL);
3011
3012         if (error) {
3013                 PURGE_ATTRCACHE(tdvp);
3014                 PURGE_ATTRCACHE(svp);
3015                 nfs_rw_exit(&tdrp->r_rwlock);
3016                 return (error);
3017         }
3018
3019         error = geterrno3(res.status);
3020
3021         if (!error) {
3022                 nfs3_cache_post_op_attr(svp, &res.resok.file_attributes, t, cr);
3023                 nfs3_cache_wcc_data(tdvp, &res.resok.linkdir_wcc, t, cr);
3024                 if (HAVE_RDDIR_CACHE(tdrp))
3025                         nfs_purge_rddir_cache(tdvp);
3026                 dnlc_update(tdvp, tnm, svp);
3027         } else {
3028                 nfs3_cache_post_op_attr(svp, &res.resfail.file_attributes, t,
3029                     cr);
3030                 nfs3_cache_wcc_data(tdvp, &res.resfail.linkdir_wcc, t, cr);
3031                 if (error == EOPNOTSUPP) {
3032                         mutex_enter(&mi->mi_lock);
3033                         mi->mi_flags &= ~MI_LINK;
3034                         mutex_exit(&mi->mi_lock);
3035                 }
3036         }
3037
3038         nfs_rw_exit(&tdrp->r_rwlock);
3039
3040         if (!error) {
3041                 /*
3042                  * Notify the source file of this link operation.
3043                  */
3044                 vnevent_link(svp, ct);
3045         }
3046         return (error);
3047 }
3048
3049 /* ARGSUSED */
     /*
      * Rename entry point taking the vnode-operation style argument list
      * (NOTE(review): presumably the NFSv3 rename handler installed in the
      * vnode ops table -- the table is not visible here; confirm).
      *
      * Returns EPERM when nfs_zone() does not match the zone recorded in the
      * mount info of odvp.  Otherwise the target directory is replaced by the
      * vnode reported by fop_realvp(), when that call succeeds, and the
      * request is handed to nfs3rename(), whose result is returned unchanged.
      * The flags argument is not used.
      */
3050 static int
3051 nfs3_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
3052         caller_context_t *ct, int flags)
3053 {
3054         vnode_t *realvp;
3055
3056         if (nfs_zone() != VTOMI(odvp)->mi_zone)
3057                 return (EPERM);
             /* Work on the vnode underneath ndvp if fop_realvp() finds one. */
3058         if (fop_realvp(ndvp, &realvp, ct) == 0)
3059                 ndvp = realvp;
3060
3061         return (nfs3rename(odvp, onm, ndvp, nnm, cr, ct));
3062 }
3063
3064 /*
3065  * nfs3rename does the real work of renaming in NFS Version 3.
      *
      * Renames onm in directory odvp to nnm in directory ndvp with an
      * NFSPROC3_RENAME call.  Returns 0 on success, otherwise an errno value:
      *   EINVAL   either name is "." or "..";
      *   EINTR    nfs_rw_enter_sig() returned non-zero for a directory rwlock;
      *   EBUSY    the existing target has a file system mounted on it;
      *   ENOTDIR  the source is a directory but the in-use target is not;
      *   EEXIST   the server answered with ENOTEMPTY;
      *   or whatever nfs3lookup(), nfs3_link(), nfs3_rename(), rfs3call()
      *   or geterrno3() reported.
      *
      * The r_rwlock of both directories is taken as writer before any other
      * work and is dropped again on every return path.  If the target name
      * already exists, is not a directory and still has v_count > 1 after its
      * name cache entries were purged, it is first given a temporary name so
      * that it stays reachable; the temporary name is recorded in the target's
      * rnode (r_unldvp, r_unlname, r_unlcred).  If source and target turn out
      * to be the same vnode in that case, 0 is returned without any rename
      * being sent.
3066  */
3067 static int
3068 nfs3rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
3069     caller_context_t *ct)
3070 {
3071         int error;
3072         RENAME3args args;
3073         RENAME3res res;
3074         int douprintf;
3075         vnode_t *nvp = NULL;
3076         vnode_t *ovp = NULL;
3077         char *tmpname;
3078         rnode_t *rp;
3079         rnode_t *odrp;
3080         rnode_t *ndrp;
3081         hrtime_t t;
3082
3083         ASSERT(nfs_zone() == VTOMI(odvp)->mi_zone);
3084
             /* "." and ".." may be neither the source nor the target name. */
3085         if (strcmp(onm, ".") == 0 || strcmp(onm, "..") == 0 ||
3086             strcmp(nnm, ".") == 0 || strcmp(nnm, "..") == 0)
3087                 return (EINVAL);
3088
             /*
              * Take both directory rwlocks as writer, always starting with
              * the rnode at the lower address.
              * NOTE(review): presumably the fixed order keeps concurrent
              * renames over the same pair of directories from deadlocking.
              * When odvp and ndvp share an rnode the else branch enters the
              * same lock twice, which relies on nfs_rw_enter_sig() allowing
              * re-entry by the current holder -- TODO confirm.
              */
3089         odrp = VTOR(odvp);
3090         ndrp = VTOR(ndvp);
3091         if ((intptr_t)odrp < (intptr_t)ndrp) {
3092                 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR(odvp)))
3093                         return (EINTR);
3094                 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR(ndvp))) {
3095                         nfs_rw_exit(&odrp->r_rwlock);
3096                         return (EINTR);
3097                 }
3098         } else {
3099                 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR(ndvp)))
3100                         return (EINTR);
3101                 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR(odvp))) {
3102                         nfs_rw_exit(&ndrp->r_rwlock);
3103                         return (EINTR);
3104                 }
3105         }
3106
3107         /*
3108          * Lookup the target file.  If it exists, it needs to be
3109          * checked to see whether it is a mount point and whether
3110          * it is active (open).
              * A failed lookup is not treated as an error here: nvp simply
              * stays NULL and the rename proceeds.
3111          */
3112         error = nfs3lookup(ndvp, nnm, &nvp, NULL, 0, NULL, cr, 0);
3113         if (!error) {
3114                 /*
3115                  * If this file has been mounted on, then just
3116                  * return busy because renaming to it would remove
3117                  * the mounted file system from the name space.
3118                  */
3119                 if (vn_mountedvfs(nvp) != NULL) {
3120                         VN_RELE(nvp);
3121                         nfs_rw_exit(&odrp->r_rwlock);
3122                         nfs_rw_exit(&ndrp->r_rwlock);
3123                         return (EBUSY);
3124                 }
3125
3126                 /*
3127                  * Purge the name cache of all references to this vnode
3128                  * so that we can check the reference count to infer
3129                  * whether it is active or not.
3130                  */
3131                 /*
3132                  * First just remove the entry from the name cache, as it
3133                  * is most likely the only entry for this vp.
3134                  */
3135                 dnlc_remove(ndvp, nnm);
3136                 /*
3137                  * If the file has a v_count > 1 then there may be more
3138                  * than one entry in the name cache due to multiple links
3139                  * or an open file, but we don't have the real reference
3140                  * count so flush all possible entries.
3141                  */
3142                 if (nvp->v_count > 1)
3143                         dnlc_purge_vp(nvp);
3144
3145                 /*
3146                  * If the vnode is active and is not a directory,
3147                  * arrange to rename it to a
3148                  * temporary file so that it will continue to be
3149                  * accessible.  This implements the "unlink-open-file"
3150                  * semantics for the target of a rename operation.
3151                  * Before doing this though, make sure that the
3152                  * source and target files are not already the same.
3153                  */
3154                 if (nvp->v_count > 1 && nvp->v_type != VDIR) {
3155                         /*
3156                          * Lookup the source name.
3157                          */
3158                         error = nfs3lookup(odvp, onm, &ovp, NULL, 0, NULL,
3159                             cr, 0);
3160
3161                         /*
3162                          * The source name *should* already exist.
3163                          */
3164                         if (error) {
3165                                 VN_RELE(nvp);
3166                                 nfs_rw_exit(&odrp->r_rwlock);
3167                                 nfs_rw_exit(&ndrp->r_rwlock);
3168                                 return (error);
3169                         }
3170
3171                         /*
3172                          * Compare the two vnodes.  If they are the same,
3173                          * just release all held vnodes and return success.
3174                          */
3175                         if (ovp == nvp) {
3176                                 VN_RELE(ovp);
3177                                 VN_RELE(nvp);
3178                                 nfs_rw_exit(&odrp->r_rwlock);
3179                                 nfs_rw_exit(&ndrp->r_rwlock);
3180                                 return (0);
3181                         }
3182
3183                         /*
3184                          * Can't mix and match directories and non-
3185                          * directories in rename operations.  We already
3186                          * know that the target is not a directory.  If
3187                          * the source is a directory, return an error.
3188                          */
3189                         if (ovp->v_type == VDIR) {
3190                                 VN_RELE(ovp);
3191                                 VN_RELE(nvp);
3192                                 nfs_rw_exit(&odrp->r_rwlock);
3193                                 nfs_rw_exit(&ndrp->r_rwlock);
3194                                 return (ENOTDIR);
3195                         }
3196
3197                         /*
3198                          * The target file exists, is not the same as
3199                          * the source file, and is active.  Link it
3200                          * to a temporary filename to avoid having
3201                          * the server removing the file completely.
                              * If the link attempt reports EOPNOTSUPP, the
                              * target is renamed to the temporary name
                              * instead.
3202                          */
3203                         tmpname = newname();
3204                         error = nfs3_link(ndvp, nvp, tmpname, cr, NULL, 0);
3205                         if (error == EOPNOTSUPP) {
3206                                 error = nfs3_rename(ndvp, nnm, ndvp, tmpname,
3207                                     cr, NULL, 0);
3208                         }
3209                         if (error) {
                                     /* tmpname is released as MAXNAMELEN bytes. */
3210                                 kmem_free(tmpname, MAXNAMELEN);
3211                                 VN_RELE(ovp);
3212                                 VN_RELE(nvp);
3213                                 nfs_rw_exit(&odrp->r_rwlock);
3214                                 nfs_rw_exit(&ndrp->r_rwlock);
3215                                 return (error);
3216                         }
                             /*
                              * Record the temporary name in the target's rnode.
                              * If nothing was recorded before, also remember
                              * the directory (with a hold) and the credentials
                              * (with a hold); otherwise only the old name
                              * buffer is replaced.  tmpname now belongs to the
                              * rnode.
                              * NOTE(review): presumably these fields are used
                              * later to remove the temporary name once the
                              * file is no longer in use -- confirm.
                              */
3217                         rp = VTOR(nvp);
3218                         mutex_enter(&rp->r_statelock);
3219                         if (rp->r_unldvp == NULL) {
3220                                 VN_HOLD(ndvp);
3221                                 rp->r_unldvp = ndvp;
3222                                 if (rp->r_unlcred != NULL)
3223                                         crfree(rp->r_unlcred);
3224                                 crhold(cr);
3225                                 rp->r_unlcred = cr;
3226                                 rp->r_unlname = tmpname;
3227                         } else {
3228                                 kmem_free(rp->r_unlname, MAXNAMELEN);
3229                                 rp->r_unlname = tmpname;
3230                         }
3231                         mutex_exit(&rp->r_statelock);
3232                 }
3233         }
3234
3235         if (ovp == NULL) {
3236                 /*
3237                  * When renaming directories to be a subdirectory of a
3238                  * different parent, the dnlc entry for ".." will no
3239                  * longer be valid, so it must be removed.
3240                  *
3241                  * We do a lookup here to determine whether we are renaming
3242                  * a directory and we need to check if we are renaming
3243                  * an unlinked file.  This might have already been done
3244                  * in previous code, so we check ovp == NULL to avoid
3245                  * doing it twice.
3246                  */
3247
3248                 error = nfs3lookup(odvp, onm, &ovp, NULL, 0, NULL, cr, 0);
3249                 /*
3250                  * The source name *should* already exist.
3251                  */
3252                 if (error) {
3253                         nfs_rw_exit(&odrp->r_rwlock);
3254                         nfs_rw_exit(&ndrp->r_rwlock);
3255                         if (nvp) {
3256                                 VN_RELE(nvp);
3257                         }
3258                         return (error);
3259                 }
3260                 ASSERT(ovp != NULL);
3261         }
3262
             /* Drop any cached name entries for both names before the call. */
3263         dnlc_remove(odvp, onm);
3264         dnlc_remove(ndvp, nnm);
3265
3266         setdiropargs3(&args.from, onm, odvp);
3267         setdiropargs3(&args.to, nnm, ndvp);
3268
3269         douprintf = 1;
3270
             /* Taken before the call; handed to the cache updates below. */
3271         t = gethrtime();
3272
3273         error = rfs3call(VTOMI(odvp), NFSPROC3_RENAME,
3274             xdr_RENAME3args, (caddr_t)&args,
3275             xdr_RENAME3res, (caddr_t)&res, cr,
3276             &douprintf, &res.status, 0, NULL);
3277
             /*
              * A non-zero return from rfs3call() means the call itself
              * failed; res is not examined in that case and the cached
              * attributes of both directories are purged.
              */
3278         if (error) {
3279                 PURGE_ATTRCACHE(odvp);
3280                 PURGE_ATTRCACHE(ndvp);
3281                 VN_RELE(ovp);
3282                 nfs_rw_exit(&odrp->r_rwlock);
3283                 nfs_rw_exit(&ndrp->r_rwlock);
3284                 if (nvp) {
3285                         VN_RELE(nvp);
3286                 }
3287                 return (error);
3288         }
3289
             /* The call completed; translate the NFS status into an errno. */
3290         error = geterrno3(res.status);
3291
3292         if (!error) {
3293                 nfs3_cache_wcc_data(odvp, &res.resok.fromdir_wcc, t, cr);
3294                 if (HAVE_RDDIR_CACHE(odrp))
3295                         nfs_purge_rddir_cache(odvp);
3296                 if (ndvp != odvp) {
3297                         nfs3_cache_wcc_data(ndvp, &res.resok.todir_wcc, t, cr);
3298                         if (HAVE_RDDIR_CACHE(ndrp))
3299                                 nfs_purge_rddir_cache(ndvp);
3300                 }
3301                 /*
3302                  * when renaming directories to be a subdirectory of a
3303                  * different parent, the dnlc entry for ".." will no
3304                  * longer be valid, so it must be removed
3305                  */
3306                 rp = VTOR(ovp);
3307                 if (ndvp != odvp) {
3308                         if (ovp->v_type == VDIR) {
3309                                 dnlc_remove(ovp, "..");
3310                                 if (HAVE_RDDIR_CACHE(rp))
3311                                         nfs_purge_rddir_cache(ovp);
3312                         }
3313                 }
3314
3315                 /*
3316                  * If we are renaming the unlinked file, update the
3317                  * r_unldvp and r_unlname as needed.
                      * The recorded name is only touched when it equals onm;
                      * the copy is cut off at MAXNAMELEN - 1 characters and
                      * always NUL-terminated.  The directory hold moves to
                      * ndvp when the recorded directory differs from it.
3318                  */
3319                 mutex_enter(&rp->r_statelock);
3320                 if (rp->r_unldvp != NULL) {
3321                         if (strcmp(rp->r_unlname, onm) == 0) {
3322                                 (void) strncpy(rp->r_unlname, nnm, MAXNAMELEN);
3323                                 rp->r_unlname[MAXNAMELEN - 1] = '\0';
3324
3325                                 if (ndvp != rp->r_unldvp) {
3326                                         VN_RELE(rp->r_unldvp);
3327                                         rp->r_unldvp = ndvp;
3328                                         VN_HOLD(ndvp);
3329                                 }
3330                         }
3331                 }
3332                 mutex_exit(&rp->r_statelock);
3333         } else {
3334                 nfs3_cache_wcc_data(odvp, &res.resfail.fromdir_wcc, t, cr);
3335                 if (ndvp != odvp) {
3336                         nfs3_cache_wcc_data(ndvp, &res.resfail.todir_wcc, t,
3337                             cr);
3338                 }
3339                 /*
3340                  * System V defines rename to return EEXIST, not
3341                  * ENOTEMPTY if the target directory is not empty.
3342                  * Over the wire, the error is NFSERR_ENOTEMPTY
3343                  * which geterrno maps to ENOTEMPTY.
3344                  */
3345                 if (error == ENOTEMPTY)
3346                         error = EEXIST;
3347         }
3348
             /*
              * On success, post the rename events: for the replaced target
              * (if there was one), for the target directory when it differs
              * from the source directory, and for the source.
              */
3349         if (error == 0) {
3350                 if (nvp)
3351                         vnevent_rename_dest(nvp, ndvp, nnm, ct);
3352
3353                 if (odvp != ndvp)
3354                         vnevent_rename_dest_dir(ndvp, ct);
3355                 ASSERT(ovp != NULL);
3356                 vnevent_rename_src(ovp, odvp, onm, ct);
3357         }
3358
             /* Common exit: drop the vnode holds, then both rwlocks. */
3359         if (nvp) {
3360                 VN_RELE(nvp);
3361         }
3362         VN_RELE(ovp);
3363
3364         nfs_rw_exit(&odrp->r_rwlock);
3365         nfs_rw_exit(&ndrp->r_rwlock);
3366
3367         return (error);
3368 }
3369
3370 /* ARGSUSED */
     /*
      * Create directory nm in dvp with an NFSPROC3_MKDIR call and, on
      * success, store the vnode of the new directory in *vpp.
      *
      * Returns 0 on success, otherwise an errno value: EPERM when nfs_zone()
      * does not match the zone of the mount, EINTR when nfs_rw_enter_sig()
      * returns non-zero for the directory rwlock, or an error reported by
      * setdirgid(), setdirmode(), vattr_to_sattr3(), rfs3call(), geterrno3(),
      * nfs3lookup() or nfs3getattr().  *vpp is only written on success.
      *
      * The caller's va is modified: setdirgid() and setdirmode() are given
      * pointers to va_gid and va_mode, AT_MODE|AT_GID is OR-ed into va_mask,
      * and va_mask is overwritten with AT_GID if the group has to be fixed
      * up after creation.  ct, flags and vsecp are not used.
      */
3371 static int
3372 nfs3_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, cred_t *cr,
3373         caller_context_t *ct, int flags, vsecattr_t *vsecp)
3374 {
3375         int error;
3376         MKDIR3args args;
3377         MKDIR3res res;
3378         int douprintf;
3379         struct vattr vattr;
3380         vnode_t *vp;
3381         rnode_t *drp;
3382         hrtime_t t;
3383
3384         if (nfs_zone() != VTOMI(dvp)->mi_zone)
3385                 return (EPERM);
3386         setdiropargs3(&args.where, nm, dvp);
3387
3388         /*
3389          * Decide what the group-id and set-gid bit of the created directory
3390          * should be.  May have to do a setattr to get the gid right.
              * No lock is held yet, so the error returns below need no cleanup.
3391          */
3392         error = setdirgid(dvp, &va->va_gid, cr);
3393         if (error)
3394                 return (error);
3395         error = setdirmode(dvp, &va->va_mode, cr);
3396         if (error)
3397                 return (error);
3398         va->va_mask |= AT_MODE|AT_GID;
3399
3400         error = vattr_to_sattr3(va, &args.attributes);
3401         if (error) {
3402                 /* req time field(s) overflow - return immediately */
3403                 return (error);
3404         }
3405
             /* The directory rwlock is held as writer from here to the end. */
3406         drp = VTOR(dvp);
3407         if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
3408                 return (EINTR);
3409
3410         dnlc_remove(dvp, nm);
3411
3412         douprintf = 1;
3413
             /* Taken before the call; handed to the cache updates below. */
3414         t = gethrtime();
3415
3416         error = rfs3call(VTOMI(dvp), NFSPROC3_MKDIR,
3417             xdr_MKDIR3args, (caddr_t)&args,
3418             xdr_MKDIR3res, (caddr_t)&res, cr,
3419             &douprintf, &res.status, 0, NULL);
3420
             /* The call itself failed; res is not examined. */
3421         if (error) {
3422                 PURGE_ATTRCACHE(dvp);
3423                 nfs_rw_exit(&drp->r_rwlock);
3424                 return (error);
3425         }
3426
3427         error = geterrno3(res.status);
3428         if (!error) {
3429                 nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, cr);
3430                 if (HAVE_RDDIR_CACHE(drp))
3431                         nfs_purge_rddir_cache(dvp);
3432
                     /*
                      * Obtain a vnode for the new directory: by name when the
                      * reply carries no file handle, otherwise from the
                      * returned handle (and attributes, when present).  A
                      * failure on this path is returned to the caller even
                      * though the server reported the mkdir as successful.
                      */
3433                 if (!res.resok.obj.handle_follows) {
3434                         error = nfs3lookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
3435                         if (error) {
3436                                 nfs_rw_exit(&drp->r_rwlock);
3437                                 return (error);
3438                         }
3439                 } else {
3440                         if (res.resok.obj_attributes.attributes) {
3441                                 vp = makenfs3node(&res.resok.obj.handle,
3442                                     &res.resok.obj_attributes.attr,
3443                                     dvp->v_vfsp, t, cr, NULL, NULL);
3444                         } else {
3445                                 vp = makenfs3node(&res.resok.obj.handle, NULL,
3446                                     dvp->v_vfsp, t, cr, NULL, NULL);
                                     /*
                                      * No attributes were returned and the
                                      * vnode type is still unset: fetch the
                                      * type with a getattr.
                                      */
3447                                 if (vp->v_type == VNON) {
3448                                         vattr.va_mask = AT_TYPE;
3449                                         error = nfs3getattr(vp, &vattr, cr);
3450                                         if (error) {
3451                                                 VN_RELE(vp);
3452                                                 nfs_rw_exit(&drp->r_rwlock);
3453                                                 return (error);
3454                                         }
3455                                         vp->v_type = vattr.va_type;
3456                                 }
3457                         }
3458                         dnlc_update(dvp, nm, vp);
3459                 }
                     /*
                      * The group in the rnode's cached attributes differs from
                      * the requested one: try to set it.  The result of the
                      * setattr is deliberately ignored.
                      */
3460                 if (va->va_gid != VTOR(vp)->r_attr.va_gid) {
3461                         va->va_mask = AT_GID;
3462                         (void) nfs3setattr(vp, va, 0, cr);
3463                 }
3464                 *vpp = vp;
3465         } else {
3466                 nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, t, cr);
3467                 PURGE_STALE_FH(error, dvp, cr);
3468         }
3469
3470         nfs_rw_exit(&drp->r_rwlock);
3471
3472         return (error);
3473 }
3474
3475 /* ARGSUSED */
     /*
      * Remove directory nm from dvp with an NFSPROC3_RMDIR call.
      *
      * Returns 0 on success, otherwise an errno value: EPERM when nfs_zone()
      * does not match the zone of the mount, EINTR when nfs_rw_enter_sig()
      * returns non-zero for the directory rwlock, EINVAL when nm resolves to
      * the vnode passed as cdir, EEXIST when the server answers ENOTEMPTY,
      * or an error reported by nfs3lookup(), rfs3call() or geterrno3().
      *
      * The rwlock of dvp is held as writer for the whole operation and the
      * looked-up vnode is released on every path.  flags is not used.
      */
3476 static int
3477 nfs3_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
3478         caller_context_t *ct, int flags)
3479 {
3480         int error;
3481         RMDIR3args args;
3482         RMDIR3res res;
3483         vnode_t *vp;
3484         int douprintf;
3485         rnode_t *drp;
3486         hrtime_t t;
3487
3488         if (nfs_zone() != VTOMI(dvp)->mi_zone)
3489                 return (EPERM);
3490         drp = VTOR(dvp);
3491         if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
3492                 return (EINTR);
3493
3494         /*
3495          * Attempt to prevent a rmdir(".") from succeeding.
              * The name is looked up and the result compared with cdir.
3496          */
3497         error = nfs3lookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
3498         if (error) {
3499                 nfs_rw_exit(&drp->r_rwlock);
3500                 return (error);
3501         }
3502
3503         if (vp == cdir) {
3504                 VN_RELE(vp);
3505                 nfs_rw_exit(&drp->r_rwlock);
3506                 return (EINVAL);
3507         }
3508
3509         setdiropargs3(&args.object, nm, dvp);
3510
3511         /*
3512          * First just remove the entry from the name cache, as it
3513          * is most likely an entry for this vp.
3514          */
3515         dnlc_remove(dvp, nm);
3516
3517         /*
3518          * If the vnode reference count is greater than one, then
3519          * there may be additional references in the DNLC which will
3520          * need to be purged.  First, try removing the entry for
3521          * the parent directory and see if that removes the additional
3522          * reference(s).  If that doesn't do it, then use dnlc_purge_vp
3523          * to completely remove any references to the directory which
3524          * might still exist in the DNLC.
3525          */
3526         if (vp->v_count > 1) {
3527                 dnlc_remove(vp, "..");
3528                 if (vp->v_count > 1)
3529                         dnlc_purge_vp(vp);
3530         }
3531
3532         douprintf = 1;
3533
             /* Taken before the call; handed to the cache updates below. */
3534         t = gethrtime();
3535
3536         error = rfs3call(VTOMI(dvp), NFSPROC3_RMDIR,
3537             xdr_diropargs3, (caddr_t)&args,
3538             xdr_RMDIR3res, (caddr_t)&res, cr,
3539             &douprintf, &res.status, 0, NULL);
3540
             /* Done whether or not the call succeeded. */
3541         PURGE_ATTRCACHE(vp);
3542
             /* The call itself failed; res is not examined. */
3543         if (error) {
3544                 PURGE_ATTRCACHE(dvp);
3545                 VN_RELE(vp);
3546                 nfs_rw_exit(&drp->r_rwlock);
3547                 return (error);
3548         }
3549
3550         error = geterrno3(res.status);
3551         if (!error) {
3552                 nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, cr);
3553                 if (HAVE_RDDIR_CACHE(drp))
3554                         nfs_purge_rddir_cache(dvp);
3555                 if (HAVE_RDDIR_CACHE(VTOR(vp)))
3556                         nfs_purge_rddir_cache(vp);
3557         } else {
3558                 nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, t, cr);
3559                 PURGE_STALE_FH(error, dvp, cr);
3560                 /*
3561                  * System V defines rmdir to return EEXIST, not
3562                  * ENOTEMPTY if the directory is not empty.  Over
3563                  * the wire, the error is NFSERR_ENOTEMPTY which
3564                  * geterrno maps to ENOTEMPTY.
3565                  */
3566                 if (error == ENOTEMPTY)
3567                         error = EEXIST;
3568         }
3569
             /* Post the rmdir event only when the removal succeeded. */
3570         if (error == 0) {
3571                 vnevent_rmdir(vp, dvp, nm, ct);
3572         }
3573         VN_RELE(vp);
3574
3575         nfs_rw_exit(&drp->r_rwlock);
3576
3577         return (error);
3578 }
3579
3580 /* ARGSUSED */
     /*
      * Create symbolic link lnm in dvp whose contents are tnm, using an
      * NFSPROC3_SYMLINK call with the attributes taken from tva.
      *
      * Returns 0 on success, otherwise an errno value: EPERM when nfs_zone()
      * does not match the zone of the mount, EOPNOTSUPP when MI_SYMLINK is
      * not set in the mount flags, EINTR when nfs_rw_enter_sig() returns
      * non-zero for the directory rwlock, or an error reported by
      * vattr_to_sattr3(), rfs3call() or geterrno3().
      *
      * When the server itself answers EOPNOTSUPP, MI_SYMLINK is cleared so
      * that, while the flag stays clear, later calls on this mount are
      * rejected by the check at the top without issuing the call.
      * No vnode is returned to the caller; ct and flags are not used.
      */
3581 static int
3582 nfs3_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm, cred_t *cr,
3583         caller_context_t *ct, int flags)
3584 {
3585         int error;
3586         SYMLINK3args args;
3587         SYMLINK3res res;
3588         int douprintf;
3589         mntinfo_t *mi;
3590         vnode_t *vp;
3591         rnode_t *rp;
3592         char *contents;
3593         rnode_t *drp;
3594         hrtime_t t;
3595
3596         mi = VTOMI(dvp);
3597
3598         if (nfs_zone() != mi->mi_zone)
3599                 return (EPERM);
3600         if (!(mi->mi_flags & MI_SYMLINK))
3601                 return (EOPNOTSUPP);
3602
3603         setdiropargs3(&args.where, lnm, dvp);
3604         error = vattr_to_sattr3(tva, &args.symlink.symlink_attributes);
3605         if (error) {
3606                 /* req time field(s) overflow - return immediately */
3607                 return (error);
3608         }
3609         args.symlink.symlink_data = tnm;
3610
             /* The directory rwlock is held as writer from here to the end. */
3611         drp = VTOR(dvp);
3612         if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
3613                 return (EINTR);
3614
3615         dnlc_remove(dvp, lnm);
3616
3617         douprintf = 1;
3618
             /* Taken before the call; handed to the cache updates below. */
3619         t = gethrtime();
3620
3621         error = rfs3call(mi, NFSPROC3_SYMLINK,
3622             xdr_SYMLINK3args, (caddr_t)&args,
3623             xdr_SYMLINK3res, (caddr_t)&res, cr,
3624             &douprintf, &res.status, 0, NULL);
3625
             /* The call itself failed; res is not examined. */
3626         if (error) {
3627                 PURGE_ATTRCACHE(dvp);
3628                 nfs_rw_exit(&drp->r_rwlock);
3629                 return (error);
3630         }
3631
3632         error = geterrno3(res.status);
3633         if (!error) {
3634                 nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, cr);
3635                 if (HAVE_RDDIR_CACHE(drp))
3636                         nfs_purge_rddir_cache(dvp);
3637
                     /*
                      * Only when the reply carries a file handle is a vnode
                      * built and entered into the name cache; without a
                      * handle nothing further is done.
                      */
3638                 if (res.resok.obj.handle_follows) {
3639                         if (res.resok.obj_attributes.attributes) {
3640                                 vp = makenfs3node(&res.resok.obj.handle,
3641                                     &res.resok.obj_attributes.attr,
3642                                     dvp->v_vfsp, t, cr, NULL, NULL);
3643                         } else {
3644                                 vp = makenfs3node(&res.resok.obj.handle, NULL,
3645                                     dvp->v_vfsp, t, cr, NULL, NULL);
                                     /* No attributes returned: set the type here. */
3646                                 vp->v_type = VLNK;
3647                                 vp->v_rdev = 0;
3648                         }
3649                         dnlc_update(dvp, lnm, vp);
3650                         rp = VTOR(vp);
                             /*
                              * Optionally remember the link text in the rnode.
                              * The buffer is allocated with KM_NOSLEEP before
                              * r_statelock is taken; if the allocation fails
                              * the text is simply not cached.  The contents
                              * pointer is checked again under the lock and the
                              * new buffer is freed if one is already set.
                              * Only strlen(tnm) bytes are copied, without a
                              * terminating NUL; r_symlink.len holds the count.
                              * NOTE(review): assumes strlen(tnm) <= MAXPATHLEN;
                              * nothing here checks it -- confirm callers do.
                              */
3651                         if (nfs3_do_symlink_cache &&
3652                             rp->r_symlink.contents == NULL) {
3653
3654                                 contents = kmem_alloc(MAXPATHLEN,
3655                                     KM_NOSLEEP);
3656
3657                                 if (contents != NULL) {
3658                                         mutex_enter(&rp->r_statelock);
3659                                         if (rp->r_symlink.contents == NULL) {
3660                                                 rp->r_symlink.len = strlen(tnm);
3661                                                 bcopy(tnm, contents,
3662                                                     rp->r_symlink.len);
3663                                                 rp->r_symlink.contents =
3664                                                     contents;
3665                                                 rp->r_symlink.size = MAXPATHLEN;
3666                                                 mutex_exit(&rp->r_statelock);
3667                                         } else {
3668                                                 mutex_exit(&rp->r_statelock);
3669                                                 kmem_free((void *)contents,
3670                                                     MAXPATHLEN);
3671                                         }
3672                                 }
3673                         }
3674                         VN_RELE(vp);
3675                 }
3676         } else {
3677                 nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, t, cr);
3678                 PURGE_STALE_FH(error, dvp, cr);
                     /* Remember that symlinks are unsupported on this mount. */
3679                 if (error == EOPNOTSUPP) {
3680                         mutex_enter(&mi->mi_lock);
3681                         mi->mi_flags &= ~MI_SYMLINK;
3682                         mutex_exit(&mi->mi_lock);
3683                 }
3684         }
3685
3686         nfs_rw_exit(&drp->r_rwlock);
3687
3688         return (error);
3689 }
3690
3691 #ifdef DEBUG
     /*
      * Counters compiled in only for DEBUG builds.  nfs3_readdir() increments
      * nfs3_readdir_cache_shorts when it answers an end-of-directory request
      * without further work, and nfs3_readdir_cache_waits when it has to wait
      * for a cache entry that is being filled.
      * NOTE(review): the remaining counters are presumably maintained
      * elsewhere in the readdir path -- confirm.
      */
3692 static int nfs3_readdir_cache_hits = 0;
3693 static int nfs3_readdir_cache_shorts = 0;
3694 static int nfs3_readdir_cache_waits = 0;
3695 static int nfs3_readdir_cache_misses = 0;
3696 static int nfs3_readdir_readahead = 0;
3697 #endif
3698
     /*
      * When non-zero, nfs3_readdir() limits the size it requests to 1024
      * instead of MAXBSIZE.  Defaults to 0 (no shrinking).
      */
3699 static int nfs3_shrinkreaddir = 0;
3700
3701 /*
3702  * Read directory entries.
3703  * There are some weird things to look out for here.  The uio_loffset
3704  * field is either 0 or it is the offset returned from a previous
3705  * readdir.  It is an opaque value used by the server to find the
3706  * correct directory block to read. The count field is the number
3707  * of blocks to read on the server.  This is advisory only, the server
3708  * may return only one block's worth of entries.  Entries may be compressed
3709  * on the server.
3710  */
3711 /* ARGSUSED */
3712 static int
3713 nfs3_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp,
3714         caller_context_t *ct, int flags)
3715 {
3716         int error;
3717         size_t count;
3718         rnode_t *rp;
3719         rddir_cache *rdc;
3720         rddir_cache *nrdc;
3721         rddir_cache *rrdc;
3722 #ifdef DEBUG
3723         int missed;
3724 #endif
3725         int doreadahead;
3726         rddir_cache srdc;
3727         avl_index_t where;
3728
3729         if (nfs_zone() != VTOMI(vp)->mi_zone)
3730                 return (EIO);
3731         rp = VTOR(vp);
3732
3733         ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
3734
3735         /*
3736          * Make sure that the directory cache is valid.
3737          */
3738         if (HAVE_RDDIR_CACHE(rp)) {
3739                 if (nfs_disable_rddir_cache) {
3740                         /*
3741                          * Setting nfs_disable_rddir_cache in /etc/system
3742                          * allows interoperability with servers that do not
3743                          * properly update the attributes of directories.
3744                          * Any cached information gets purged before an
3745                          * access is made to it.
3746                          */
3747                         nfs_purge_rddir_cache(vp);
3748                 } else {
3749                         error = nfs3_validate_caches(vp, cr);
3750                         if (error)
3751                                 return (error);
3752                 }
3753         }
3754
3755         /*
3756          * It is possible that some servers may not be able to correctly
3757          * handle a large READDIR or READDIRPLUS request due to bugs in
3758          * their implementation.  In order to continue to interoperate
3759          * with them, this workaround is provided to limit the maximum
3760          * size of a READDIRPLUS request to 1024.  In any case, the request
3761          * size is limited to MAXBSIZE.
3762          */
3763         count = MIN(uiop->uio_iov->iov_len,
3764             nfs3_shrinkreaddir ? 1024 : MAXBSIZE);
3765
3766         nrdc = NULL;
3767 #ifdef DEBUG
3768         missed = 0;
3769 #endif
3770 top:
3771         /*
3772          * Short circuit last readdir which always returns 0 bytes.
3773          * This can be done after the directory has been read through
3774          * completely at least once.  This will set r_direof which
3775          * can be used to find the value of the last cookie.
3776          */
3777         mutex_enter(&rp->r_statelock);
3778         if (rp->r_direof != NULL &&
3779             uiop->uio_loffset == rp->r_direof->nfs3_ncookie) {
3780                 mutex_exit(&rp->r_statelock);
3781 #ifdef DEBUG
3782                 nfs3_readdir_cache_shorts++;
3783 #endif
3784                 if (eofp)
3785                         *eofp = 1;
3786                 if (nrdc != NULL)
3787                         rddir_cache_rele(nrdc);
3788                 return (0);
3789         }
3790         /*
3791          * Look for a cache entry.  Cache entries are identified
3792          * by the NFS cookie value and the byte count requested.
3793          */
3794         srdc.nfs3_cookie = uiop->uio_loffset;
3795         srdc.buflen = count;
3796         rdc = avl_find(&rp->r_dir, &srdc, &where);
3797         if (rdc != NULL) {
3798                 rddir_cache_hold(rdc);
3799                 /*
3800                  * If the cache entry is in the process of being
3801                  * filled in, wait until this completes.  The
3802                  * RDDIRWAIT bit is set to indicate that someone
3803                  * is waiting and then the thread currently
3804                  * filling the entry is done, it should do a
3805                  * cv_broadcast to wakeup all of the threads
3806                  * waiting for it to finish.
3807                  */
3808                 if (rdc->flags & RDDIR) {
3809                         nfs_rw_exit(&rp->r_rwlock);
3810                         rdc->flags |= RDDIRWAIT;
3811 #ifdef DEBUG
3812                         nfs3_readdir_cache_waits++;
3813 #endif
3814                         if (!cv_wait_sig(&rdc->cv, &rp->r_statelock)) {
3815                                 /*
3816                                  * We got interrupted, probably
3817                                  * the user typed ^C or an alarm
3818                                  * fired.  We free the new entry
3819                                  * if we allocated one.
3820                                  */
3821                                 mutex_exit(&rp->r_statelock);
3822                                 (void) nfs_rw_enter_sig(&rp->r_rwlock,
3823                                     RW_READER, FALSE);
3824                                 rddir_cache_rele(rdc);
3825                                 if (nrdc != NULL)
3826                                         rddir_cache_rele(nrdc);
3827                                 return (EINTR);
3828                         }
3829                         mutex_exit(&rp->r_statelock);
3830                         (void) nfs_rw_enter_sig(&rp->r_rwlock,
3831                             RW_READER, FALSE);
3832                         rddir_cache_rele(rdc);
3833                         goto top;
3834                 }
3835                 /*
3836                  * Check to see if a readdir is required to
3837                  * fill the entry.  If so, mark this entry
3838                  * as being filled, remove our reference,
3839                  * and branch to the code to fill the entry.
3840                  */
3841                 if (rdc->flags & RDDIRREQ) {
3842                         rdc->flags &= ~RDDIRREQ;
3843                         rdc->flags |= RDDIR;
3844                         if (nrdc != NULL)
3845                                 rddir_cache_rele(nrdc);
3846                         nrdc = rdc;
3847                         mutex_exit(&rp->r_statelock);
3848                         goto bottom;
3849                 }
3850 #ifdef DEBUG
3851                 if (!missed)
3852                         nfs3_readdir_cache_hits++;
3853 #endif
3854                 /*
3855                  * If an error occurred while attempting
3856                  * to fill the cache entry, just return it.
3857                  */
3858                 if (rdc->error) {
3859                         error = rdc->error;
3860                         mutex_exit(&rp->r_statelock);
3861                         rddir_cache_rele(rdc);
3862                         if (nrdc != NULL)
3863                                 rddir_cache_rele(nrdc);
3864                         return (error);
3865                 }
3866
3867                 /*
3868                  * The cache entry is complete and good,
3869                  * copyout the dirent structs to the calling
3870                  * thread.
3871                  */
3872                 error = uiomove(rdc->entries, rdc->entlen, UIO_READ, uiop);
3873
3874                 /*
3875                  * If no error occurred during the copyout,
3876                  * update the offset in the uio struct to
3877                  * contain the value of the next cookie
3878                  * and set the eof value appropriately.
3879                  */
3880                 if (!error) {
3881                         uiop->uio_loffset = rdc->nfs3_ncookie;
3882                         if (eofp)
3883                                 *eofp = rdc->eof;
3884                 }
3885
3886                 /*
3887                  * Decide whether to do readahead.
3888                  *
3889                  * Don't if have already read to the end of
3890                  * directory.  There is nothing more to read.
3891                  *
3892                  * Don't if the application is not doing
3893                  * lookups in the directory.  The readahead
3894                  * is only effective if the application can
3895                  * be doing work while an async thread is
3896                  * handling the over the wire request.
3897                  */
3898                 if (rdc->eof) {
3899                         rp->r_direof = rdc;
3900                         doreadahead = FALSE;
3901                 } else if (!(rp->r_flags & RLOOKUP))
3902                         doreadahead = FALSE;
3903                 else
3904                         doreadahead = TRUE;
3905
3906                 if (!doreadahead) {
3907                         mutex_exit(&rp->r_statelock);
3908                         rddir_cache_rele(rdc);
3909                         if (nrdc != NULL)
3910                                 rddir_cache_rele(nrdc);
3911                         return (error);
3912                 }
3913
3914                 /*
3915                  * Check to see whether we found an entry
3916                  * for the readahead.  If so, we don't need
3917                  * to do anything further, so free the new
3918                  * entry if one was allocated.  Otherwise,
3919                  * allocate a new entry, add it to the cache,
3920                  * and then initiate an asynchronous readdir
3921                  * operation to fill it.
3922                  */
3923                 srdc.nfs3_cookie = rdc->nfs3_ncookie;
3924                 srdc.buflen = count;
3925                 rrdc = avl_find(&rp->r_dir, &srdc, &where);
3926                 if (rrdc != NULL) {
3927                         if (nrdc != NULL)
3928                                 rddir_cache_rele(nrdc);
3929                 } else {
3930                         if (nrdc != NULL)
3931                                 rrdc = nrdc;
3932                         else {
3933                                 rrdc = rddir_cache_alloc(KM_NOSLEEP);
3934                         }
3935                         if (rrdc != NULL) {
3936                                 rrdc->nfs3_cookie = rdc->nfs3_ncookie;
3937                                 rrdc->buflen = count;
3938                                 avl_insert(&rp->r_dir, rrdc, where);
3939                                 rddir_cache_hold(rrdc);
3940                                 mutex_exit(&rp->r_statelock);
3941                                 rddir_cache_rele(rdc);
3942 #ifdef DEBUG
3943                                 nfs3_readdir_readahead++;
3944 #endif
3945                                 nfs_async_readdir(vp, rrdc, cr, do_nfs3readdir);
3946                                 return (error);
3947                         }
3948                 }
3949
3950                 mutex_exit(&rp->r_statelock);
3951                 rddir_cache_rele(rdc);
3952                 return (error);
3953         }
3954
3955         /*
3956          * Didn't find an entry in the cache.  Construct a new empty
3957          * entry and link it into the cache.  Other processes attempting
3958          * to access this entry will need to wait until it is filled in.
3959          *
3960          * Since kmem_alloc may block, another pass through the cache
3961          * will need to be taken to make sure that another process
3962          * hasn't already added an entry to the cache for this request.
3963          */
3964         if (nrdc == NULL) {
3965                 mutex_exit(&rp->r_statelock);
3966                 nrdc = rddir_cache_alloc(KM_SLEEP);
3967                 nrdc->nfs3_cookie = uiop->uio_loffset;
3968                 nrdc->buflen = count;
3969                 goto top;
3970         }
3971
3972         /*
3973          * Add this entry to the cache.
3974          */
3975         avl_insert(&rp->r_dir, nrdc, where);
3976         rddir_cache_hold(nrdc);
3977         mutex_exit(&rp->r_statelock);
3978
3979 bottom:
3980 #ifdef DEBUG
3981         missed = 1;
3982         nfs3_readdir_cache_misses++;
3983 #endif
3984         /*
3985          * Do the readdir.  This routine decides whether to use
3986          * READDIR or READDIRPLUS.
3987          */
3988         error = do_nfs3readdir(vp, nrdc, cr);
3989
3990         /*
3991          * If this operation failed, just return the error which occurred.
3992          */
3993         if (error != 0)
3994                 return (error);
3995
3996         /*
3997          * Since the RPC operation will have taken sometime and blocked
3998          * this process, another pass through the cache will need to be
3999          * taken to find the correct cache entry.  It is possible that
4000          * the correct cache entry will not be there (although one was
4001          * added) because the directory changed during the RPC operation
4002          * and the readdir cache was flushed.  In this case, just start
4003          * over.  It is hoped that this will not happen too often... :-)
4004          */
4005         nrdc = NULL;
4006         goto top;
4007         /* NOTREACHED */
4008 }
4009
4010 /*
4011  * Fill the readdir cache entry rdc over the wire and publish the result.
4012  *
4013  * READDIRPLUS is issued when the mount is not flagged MI_READDIRONLY and
4014  * the rnode has RLOOKUP or RREADDIRPLUS set; if that attempt leaves
4015  * EOPNOTSUPP in rdc->error, plain READDIR is tried.  In every other case
4016  * READDIR is used directly.
4017  *
4018  * Afterwards, under r_statelock, RDDIR is cleared, any RDDIRWAIT waiters
4019  * are woken, and RDDIRREQ is set on failure.  rddir_cache_rele() is then
4020  * called on rdc and rdc->error is returned.
4021  */
4022 static int
4023 do_nfs3readdir(vnode_t *vp, rddir_cache *rdc, cred_t *cr)
4024 {
4025         rnode_t *rp = VTOR(vp);
4026         mntinfo_t *mi = VTOMI(vp);
4027         int plus_ok;
4028         int error;
4029
4030         ASSERT(nfs_zone() == mi->mi_zone);
4031         plus_ok = (mi->mi_flags & MI_READDIRONLY) == 0 &&
4032             (rp->r_flags & (RLOOKUP | RREADDIRPLUS)) != 0;
4033         if (!plus_ok) {
4034                 nfs3readdir(vp, rdc, cr);
4035         } else {
4036                 /* Clear the RREADDIRPLUS request before acting on it. */
4037                 if ((rp->r_flags & RREADDIRPLUS) != 0) {
4038                         mutex_enter(&rp->r_statelock);
4039                         rp->r_flags &= ~RREADDIRPLUS;
4040                         mutex_exit(&rp->r_statelock);
4041                 }
4042                 nfs3readdirplus(vp, rdc, cr);
4043                 if (rdc->error == EOPNOTSUPP)
4044                         nfs3readdir(vp, rdc, cr);
4045         }
4046
4047         mutex_enter(&rp->r_statelock);
4048         rdc->flags &= ~RDDIR;
4049         if ((rdc->flags & RDDIRWAIT) != 0) {
4050                 rdc->flags &= ~RDDIRWAIT;
4051                 cv_broadcast(&rdc->cv);
4052         }
4053         if ((error = rdc->error) != 0)
4054                 rdc->flags |= RDDIRREQ;
4055         mutex_exit(&rp->r_statelock);
4056         rddir_cache_rele(rdc);
4057         return (error);
4058 }
4059
4060 /*
4061  * Issue a single NFSv3 READDIR for the cache entry rdc.
4062  *
4063  * A buffer of rdc->buflen bytes is allocated to receive the entries and
4064  * the request starts at cookie rdc->nfs3_cookie.  On success rdc receives
4065  * the next cookie, the eof flag and the entry length, and rdc->error is
4066  * zeroed.  On failure the buffer is freed, rdc->entries is reset to NULL
4067  * and rdc->error holds the errno.  Nothing is returned directly.
4068  *
4069  * NOTE(review): DEBUG kernels allocate the buffer with
4070  * rddir_cache_buf_alloc() yet the failure path always uses kmem_free();
4071  * confirm that this pairing is intended.
4072  */
4073 static void
4074 nfs3readdir(vnode_t *vp, rddir_cache *rdc, cred_t *cr)
4075 {
4076         rnode_t *rp = VTOR(vp);
4077         mntinfo_t *mi = VTOMI(vp);
4078         READDIR3args args;
4079         READDIR3vres res;
4080         vattr_t dirattr;
4081         failinfo_t failinfo, *failp = NULL;
4082         int douprintf = 1;
4083         hrtime_t start;
4084         int error;
4085
4086         ASSERT(nfs_zone() == mi->mi_zone);
4087
4088         args.dir = *RTOFH3(rp);
4089         args.cookie = (cookie3)rdc->nfs3_cookie;
4090         args.cookieverf = rp->r_cookieverf;
4091         args.count = rdc->buflen;
4092
4093         /* NFS client failover is suppressed unless the cookie is zero. */
4094         if (args.cookie == (cookie3)0) {
4095                 failinfo.vp = vp;
4096                 failinfo.fhp = (caddr_t)&args.dir;
4097                 failinfo.copyproc = nfs3copyfh;
4098                 failinfo.lookupproc = nfs3lookup;
4099                 failinfo.xattrdirproc = acl_getxattrdir3;
4100                 failp = &failinfo;
4101         }
4102
4103 #ifdef DEBUG
4104         rdc->entries = rddir_cache_buf_alloc(rdc->buflen, KM_SLEEP);
4105 #else
4106         rdc->entries = kmem_alloc(rdc->buflen, KM_SLEEP);
4107 #endif
4108
4109         res.entries = (dirent64_t *)rdc->entries;
4110         res.entries_size = rdc->buflen;
4111         res.dir_attributes.fres.vap = &dirattr;
4112         res.dir_attributes.fres.vp = vp;
4113         res.loff = rdc->nfs3_cookie;
4114
4115         if (mi->mi_io_kstats != NULL) {
4116                 mutex_enter(&mi->mi_lock);
4117                 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
4118                 mutex_exit(&mi->mi_lock);
4119         }
4120         start = gethrtime();
4121         error = rfs3call(mi, NFSPROC3_READDIR,
4122             xdr_READDIR3args, (caddr_t)&args,
4123             xdr_READDIR3vres, (caddr_t)&res, cr,
4124             &douprintf, &res.status, 0, failp);
4125         if (mi->mi_io_kstats != NULL) {
4126                 mutex_enter(&mi->mi_lock);
4127                 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
4128                 mutex_exit(&mi->mi_lock);
4129         }
4130
4131         if (error == 0) {
4132                 nfs3_cache_post_op_vattr(vp, &res.dir_attributes, start, cr);
4133                 error = geterrno3(res.status);
4134                 if (error != 0) {
4135                         PURGE_STALE_FH(error, vp, cr);
4136                 }
4137         }
4138         if (error != 0) {
4139                 kmem_free(rdc->entries, rdc->buflen);
4140                 rdc->entries = NULL;
4141                 rdc->error = error;
4142                 return;
4143         }
4144
4145         if (mi->mi_io_kstats != NULL) {
4146                 mutex_enter(&mi->mi_lock);
4147                 KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
4148                 KSTAT_IO_PTR(mi->mi_io_kstats)->nread += res.size;
4149                 mutex_exit(&mi->mi_lock);
4150         }
4151
4152         rdc->nfs3_ncookie = res.loff;
4153         rp->r_cookieverf = res.cookieverf;
4154         rdc->eof = res.eof ? 1 : 0;
4155         rdc->entlen = res.size;
4156         ASSERT(rdc->entlen <= rdc->buflen);
4157         rdc->error = 0;
4158 }
4159
4160 /*
4161  * Issue a single NFSv3 READDIRPLUS for the cache entry rdc.
4162  *
4163  * The starting cookie is either 0 or the value handed back by a previous
4164  * readdir; it is an opaque value used by the server to find the correct
4165  * directory block to read.  The count sent is advisory only: the server
4166  * may return only one block's worth of entries, and entries may be
4167  * compressed on the server.
4168  *
4169  * Results are reported through rdc: on success the next cookie, the eof
4170  * flag and the entry length are stored and rdc->error is 0; on failure
4171  * the entry buffer is freed and rdc->error holds the errno.  A reply of
4172  * EOPNOTSUPP additionally sets MI_READDIRONLY on the mount.
4173  */
4174 static void
4175 nfs3readdirplus(vnode_t *vp, rddir_cache *rdc, cred_t *cr)
4176 {
4177         rnode_t *rp = VTOR(vp);
4178         mntinfo_t *mi = VTOMI(vp);
4179         READDIRPLUS3args args;
4180         READDIRPLUS3vres res;
4181         vattr_t dirattr;
4182         failinfo_t failinfo, *failp = NULL;
4183         int douprintf = 1;
4184         int error;
4185
4186         ASSERT(nfs_zone() == mi->mi_zone);
4187
4188         args.dir = *RTOFH3(rp);
4189         args.cookie = (cookie3)rdc->nfs3_cookie;
4190         args.cookieverf = rp->r_cookieverf;
4191         args.dircount = rdc->buflen;
4192         args.maxcount = mi->mi_tsize;
4193
4194         /*
4195          * NFS client failover support: failover is suppressed unless
4196          * the request starts from a zero cookie.
4197          */
4198         if (args.cookie == (cookie3)0) {
4199                 failinfo.vp = vp;
4200                 failinfo.fhp = (caddr_t)&args.dir;
4201                 failinfo.copyproc = nfs3copyfh;
4202                 failinfo.lookupproc = nfs3lookup;
4203                 failinfo.xattrdirproc = acl_getxattrdir3;
4204                 failp = &failinfo;
4205         }
4206
4207         /* Entry buffer handed to the reply structure; freed on failure. */
4208 #ifdef DEBUG
4209         rdc->entries = rddir_cache_buf_alloc(rdc->buflen, KM_SLEEP);
4210 #else
4211         rdc->entries = kmem_alloc(rdc->buflen, KM_SLEEP);
4212 #endif
4213
4214         res.entries = (dirent64_t *)rdc->entries;
4215         res.entries_size = rdc->buflen;
4216         res.dir_attributes.fres.vap = &dirattr;
4217         res.dir_attributes.fres.vp = vp;
4218         res.loff = rdc->nfs3_cookie;
4219         res.credentials = cr;
4220
4221         if (mi->mi_io_kstats != NULL) {
4222                 mutex_enter(&mi->mi_lock);
4223                 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
4224                 mutex_exit(&mi->mi_lock);
4225         }
4226
4227         res.time = gethrtime();
4228
4229         error = rfs3call(mi, NFSPROC3_READDIRPLUS,
4230             xdr_READDIRPLUS3args, (caddr_t)&args,
4231             xdr_READDIRPLUS3vres, (caddr_t)&res, cr,
4232             &douprintf, &res.status, 0, failp);
4233
4234         if (mi->mi_io_kstats != NULL) {
4235                 mutex_enter(&mi->mi_lock);
4236                 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
4237                 mutex_exit(&mi->mi_lock);
4238         }
4239
4240         if (error == 0) {
4241                 nfs3_cache_post_op_vattr(vp, &res.dir_attributes, res.time,
4242                     cr);
4243                 error = geterrno3(res.status);
4244                 if (error != 0) {
4245                         PURGE_STALE_FH(error, vp, cr);
4246                         /* Remember that READDIRPLUS is unsupported. */
4247                         if (error == EOPNOTSUPP) {
4248                                 mutex_enter(&mi->mi_lock);
4249                                 mi->mi_flags |= MI_READDIRONLY;
4250                                 mutex_exit(&mi->mi_lock);
4251                         }
4252                 }
4253         }
4254
4255         if (error != 0) {
4256                 kmem_free(rdc->entries, rdc->buflen);
4257                 rdc->entries = NULL;
4258                 rdc->error = error;
4259                 return;
4260         }
4261
4262         if (mi->mi_io_kstats != NULL) {
4263                 mutex_enter(&mi->mi_lock);
4264                 KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
4265                 KSTAT_IO_PTR(mi->mi_io_kstats)->nread += res.size;
4266                 mutex_exit(&mi->mi_lock);
4267         }
4268
4269         /* Success: publish the results in the cache entry. */
4270         rdc->nfs3_ncookie = res.loff;
4271         rp->r_cookieverf = res.cookieverf;
4272         rdc->eof = res.eof ? 1 : 0;
4273         rdc->entlen = res.size;
4274         ASSERT(rdc->entlen <= rdc->buflen);
4275         rdc->error = 0;
4276 }
4277
4278 #ifdef DEBUG
4279 static int nfs3_bio_do_stop = 0;   /* call debug_enter() on 0-length write */
4280 #endif
4281
4282 /*
4283  * Perform the over-the-wire read or write described by the buf bp.
4284  *
4285  * The rnode's cached credential is tried first (it is seeded from cr when
4286  * empty).  If the server answers EACCES and the credential used was not
4287  * cr, the cached credential is replaced by cr and the call is retried.
4288  * Returns 0, an errno, or NFS_EOF for a read at or past r_size which got
4289  * no data.  B_ERROR is set in bp->b_flags for any failure but NFS_EOF.
4290  */
4291 static int
4292 nfs3_bio(struct buf *bp, stable_how *stab_comm, cred_t *cr)
4293 {
4294         rnode_t *rp = VTOR(bp->b_vp);
4295         offset_t fileoff;
4296         cred_t *iocred;
4297         int wlen;
4298         int error;
4299
4300         ASSERT(nfs_zone() == VTOMI(bp->b_vp)->mi_zone);
4301         fileoff = ldbtob(bp->b_lblkno);
4302
4303         DTRACE_IO1(start, struct buf *, bp);
4304
4305         if ((bp->b_flags & B_READ) != 0) {
4306                 mutex_enter(&rp->r_statelock);
4307                 if (rp->r_cred == NULL) {
4308                         rp->r_cred = cr;
4309                         crhold(cr);
4310                         iocred = cr;
4311                         crhold(iocred);
4312                 } else {
4313                         iocred = rp->r_cred;
4314                         crhold(iocred);
4315                 }
4316                 mutex_exit(&rp->r_statelock);
4317                 for (;;) {
4318                         error = bp->b_error = nfs3read(bp->b_vp,
4319                             bp->b_un.b_addr, fileoff, bp->b_bcount,
4320                             &bp->b_resid, iocred);
4321                         crfree(iocred);
4322                         if (error != EACCES)
4323                                 break;
4324                         /* Denied: fall back to the caller's credential. */
4325                         mutex_enter(&rp->r_statelock);
4326                         if (iocred == cr) {
4327                                 mutex_exit(&rp->r_statelock);
4328                                 break;
4329                         }
4330                         if (rp->r_cred != NULL)
4331                                 crfree(rp->r_cred);
4332                         rp->r_cred = cr;
4333                         crhold(cr);
4334                         iocred = cr;
4335                         crhold(iocred);
4336                         mutex_exit(&rp->r_statelock);
4337                 }
4338                 if (error == 0) {
4339                         /* Short read (EOF): zero the rest of the buffer. */
4340                         if (bp->b_resid != 0)
4341                                 bzero(bp->b_un.b_addr +
4342                                     bp->b_bcount - bp->b_resid, bp->b_resid);
4343                         /*
4344                          * If nothing at all was read because we are past
4345                          * EOF, return an error indicator back but don't
4346                          * destroy the pages (yet).
4347                          */
4348                         mutex_enter(&rp->r_statelock);
4349                         if (bp->b_resid == bp->b_bcount &&
4350                             fileoff >= rp->r_size)
4351                                 error = NFS_EOF;
4352                         mutex_exit(&rp->r_statelock);
4353                 }
4354         } else if ((rp->r_flags & RSTALE) != 0) {
4355                 /*
4356                  * A close may have cleared r_error; if so, propagate
4357                  * the ESTALE error return properly.
4358                  */
4359                 error = rp->r_error;
4360                 if (error == 0)
4361                         error = ESTALE;
4362         } else {
4363                 mutex_enter(&rp->r_statelock);
4364                 if (rp->r_cred == NULL) {
4365                         rp->r_cred = cr;
4366                         crhold(cr);
4367                         iocred = cr;
4368                         crhold(iocred);
4369                 } else {
4370                         iocred = rp->r_cred;
4371                         crhold(iocred);
4372                 }
4373                 mutex_exit(&rp->r_statelock);
4374                 for (;;) {
4375                         mutex_enter(&rp->r_statelock);
4376                         wlen = MIN(bp->b_bcount, rp->r_size - fileoff);
4377                         mutex_exit(&rp->r_statelock);
4378                         if (wlen < 0)
4379                                 cmn_err(CE_PANIC, "nfs3_bio: write count < 0");
4380 #ifdef DEBUG
4381                         if (wlen == 0) {
4382                                 zcmn_err(getzoneid(), CE_WARN,
4383                                     "nfs3_bio: zero length write at %lld",
4384                                     fileoff);
4385                                 nfs_printfhandle(&rp->r_fh);
4386                                 if (nfs3_bio_do_stop)
4387                                         debug_enter("nfs3_bio");
4388                         }
4389 #endif
4390                         error = nfs3write(bp->b_vp, bp->b_un.b_addr,
4391                             fileoff, wlen, iocred, stab_comm);
4392                         if (error != EACCES)
4393                                 break;
4394                         /* Denied: fall back to the caller's credential. */
4395                         mutex_enter(&rp->r_statelock);
4396                         if (iocred == cr) {
4397                                 mutex_exit(&rp->r_statelock);
4398                                 break;
4399                         }
4400                         if (rp->r_cred != NULL)
4401                                 crfree(rp->r_cred);
4402                         rp->r_cred = cr;
4403                         crhold(cr);
4404                         crfree(iocred);
4405                         iocred = cr;
4406                         crhold(iocred);
4407                         mutex_exit(&rp->r_statelock);
4408                 }
4409                 bp->b_error = error;
4410                 if (error != 0 && error != EINTR) {
4411                         /*
4412                          * Print every write error on the console except
4413                          * EDQUOT, EFBIG and asynchronous EACCES.
4414                          */
4415                         if (error != EDQUOT && error != EFBIG &&
4416                             (error != EACCES ||
4417                             !(bp->b_flags & B_ASYNC)))
4418                                 nfs_write_error(bp->b_vp, error, iocred);
4419                         /*
4420                          * ESTALE marks the rnode as no longer writeable
4421                          * and is saved in r_error.  Otherwise only errors
4422                          * from asynchronous page invalidations are saved
4423                          * here; others should be saved by the caller.
4424                          */
4425                         mutex_enter(&rp->r_statelock);
4426                         if (error == ESTALE) {
4427                                 rp->r_flags |= RSTALE;
4428                                 if (!rp->r_error)
4429                                         rp->r_error = error;
4430                         } else if (!rp->r_error &&
4431                             (bp->b_flags & (B_INVAL|B_FORCE|B_ASYNC)) ==
4432                             (B_INVAL|B_FORCE|B_ASYNC)) {
4433                                 rp->r_error = error;
4434                         }
4435                         mutex_exit(&rp->r_statelock);
4436                 }
4437                 crfree(iocred);
4438         }
4439
4440         if (error != 0 && error != NFS_EOF)
4441                 bp->b_flags |= B_ERROR;
4442
4443         DTRACE_IO1(done, struct buf *, bp);
4444
4445         return (error);
4446 }
4447
4448 /* Copy vp's file handle into fidp; ENOSPC (fid_len updated) if too small. */
4449 /* ARGSUSED */
4450 static int
4451 nfs3_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
4452 {
4453         rnode_t *rp;
4454
4455         if (nfs_zone() != VTOMI(vp)->mi_zone)
4456                 return (EIO);
4457         rp = VTOR(vp);
4458         if (fidp->fid_len >= (ushort_t)rp->r_fh.fh_len) {
4459                 fidp->fid_len = rp->r_fh.fh_len;
4460                 bcopy(rp->r_fh.fh_buf, fidp->fid_data, fidp->fid_len);
4461                 return (0);
4462         }
4463         fidp->fid_len = rp->r_fh.fh_len;
4464         return (ENOSPC);
4465 }
4466
4467 /*
4468  * Lock r_rwlock.  Readers, and direct I/O writers for which r_mapcnt is 0
4469  * and no data is cached, take it as RW_READER; other writers as RW_WRITER.
4470  */
4471 /* ARGSUSED2 */
4472 static int
4473 nfs3_rwlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
4474 {
4475         rnode_t *rp = VTOR(vp);
4476
4477         if (write_lock && ((rp->r_flags & RDIRECTIO) ||
4478             (VTOMI(vp)->mi_flags & MI_DIRECTIO))) {
4479                 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
4480                 if (rp->r_mapcnt == 0 && !vn_has_cached_data(vp))
4481                         return (V_WRITELOCK_FALSE);
4482                 nfs_rw_exit(&rp->r_rwlock);
4483         }
4484         (void) nfs_rw_enter_sig(&rp->r_rwlock,
4485             write_lock ? RW_WRITER : RW_READER, FALSE);
4486         return (write_lock ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE);
4487 }
4488
4489 /* Release r_rwlock, the lock which nfs3_rwlock() acquires. */
4490 /* ARGSUSED */
4491 static void
4492 nfs3_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
4493 {
4494         /* One call serves both modes; write_lock is not consulted. */
4495         nfs_rw_exit(&(VTOR(vp))->r_rwlock);
4496 }
4497
4498 /*
4499  * Validate a seek to *noffp.  Negative offsets are rejected with EINVAL,
4500  * except on directories: because the readdir cookie is stuffed into the
4501  * offset field, an lseek with any cookie value is allowed to succeed.
4502  */
4503 /* ARGSUSED */
4504 static int
4505 nfs3_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
4506 {
4507         int is_dir = (vp->v_type == VDIR);
4508
4509         if (!is_dir && *noffp < 0)
4510                 return (EINVAL);
4511
4512         return (0);
4513 }
4514
4515 /*
4516  * Maximum number of nfs3_bsize blocks to read ahead (see nfs3_getapage()).
4517  */
4518 static int nfs3_nra = 4;
4519
4520 #ifdef DEBUG
4521 static int nfs3_lostpage = 0;   /* number of times we lost original page */
4522 #endif
4523
4524 /*
4525  * Return all the pages from [off..off+len) in the file.
4526  *
4527  * Fails with ENOSYS when the vnode is flagged VNOMAP, EIO when called from
4528  * a zone other than the mount's, and EFAULT when the range extends beyond
4529  * EOF for anything but segkmap.  If pvn_getpages() reports NFS_EOF the
4530  * caches are purged and the whole request is tried again.
4531  */
4532 /* ARGSUSED */
4533 static int
4534 nfs3_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
4535         page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
4536         enum seg_rw rw, cred_t *cr, caller_context_t *ct)
4537 {
4538         rnode_t *rp;
4539         mntinfo_t *mi;
4540         int error;
4541
4542         if ((vp->v_flag & VNOMAP) != 0)
4543                 return (ENOSYS);
4544         if (nfs_zone() != VTOMI(vp)->mi_zone)
4545                 return (EIO);
4546
4547         if (protp != NULL)
4548                 *protp = PROT_ALL;
4549
4550         /* Now validate that the caches are up to date. */
4551         if ((error = nfs3_validate_caches(vp, cr)) != 0)
4552                 return (error);
4553
4554         rp = VTOR(vp);
4555         mi = VTOMI(vp);
4556
4557         do {
4558                 mutex_enter(&rp->r_statelock);
4559
4560                 /*
4561                  * Don't create dirty pages faster than they can be
4562                  * cleaned so that the system doesn't get imbalanced.
4563                  * If the async queue is maxed out, then wait for it
4564                  * to drain before creating more dirty pages.  Also,
4565                  * wait for any threads doing pagewalks in the
4566                  * vop_getattr entry points so that they don't block
4567                  * for long periods.
4568                  */
4569                 while (rw == S_CREATE &&
4570                     ((mi->mi_max_threads != 0 &&
4571                     rp->r_awcount > 2 * mi->mi_max_threads) ||
4572                     rp->r_gcount > 0))
4573                         cv_wait(&rp->r_cv, &rp->r_statelock);
4574
4575                 /*
4576                  * If we are getting called as a side effect of an
4577                  * nfs_write() operation the local file size might not
4578                  * be extended yet.  In this case we want to be able to
4579                  * return pages of zeroes.
4580                  */
4581                 if (seg != segkmap &&
4582                     off + len > rp->r_size + PAGEOFFSET) {
4583                         mutex_exit(&rp->r_statelock);
4584                         return (EFAULT);        /* beyond EOF */
4585                 }
4586                 mutex_exit(&rp->r_statelock);
4587
4588                 error = pvn_getpages(nfs3_getapage, vp, off, len, protp,
4589                     pl, plsz, seg, addr, rw, cr);
4590                 if (error == NFS_EOF)
4591                         nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr);
4592         } while (error == NFS_EOF);
4593
4594         if (error == ESTALE) {
4595                 PURGE_STALE_FH(error, vp, cr);
4596         }
4597         return (error);
4598 }
4599
4600 /*
4601  * Called from pvn_getpages to get a particular page.
4602  */
4603 /* ARGSUSED */
4604 static int
4605 nfs3_getapage(vnode_t *vp, uoff_t off, size_t len, uint_t *protp,
4606         page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
4607         enum seg_rw rw, cred_t *cr)
4608 {
4609         rnode_t *rp;
4610         uint_t bsize;
4611         struct buf *bp;
4612         page_t *pp;
4613         uoff_t lbn;
4614         uoff_t io_off;
4615         uoff_t blkoff;
4616         uoff_t rablkoff;
4617         size_t io_len;
4618         uint_t blksize;
4619         int error;
4620         int readahead;
4621         int readahead_issued = 0;
4622         int ra_window; /* readahead window */
4623         page_t *pagefound;
4624         page_t *savepp;
4625
4626         if (nfs_zone() != VTOMI(vp)->mi_zone)
4627                 return (EIO);
4628         rp = VTOR(vp);
4629         bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
4630
4631 reread:
4632         bp = NULL;
4633         pp = NULL;
4634         pagefound = NULL;
4635
4636         if (pl != NULL)
4637                 pl[0] = NULL;
4638
4639         error = 0;
4640         lbn = off / bsize;
4641         blkoff = lbn * bsize;
4642
4643         /*
4644          * Queueing up the readahead before doing the synchronous read
4645          * results in a significant increase in read throughput because
4646          * of the increased parallelism between the async threads and
4647          * the process context.
4648          */
4649         if ((off & ((vp->v_vfsp->vfs_bsize) - 1)) == 0 &&
4650             rw != S_CREATE &&
4651             !(vp->v_flag & VNOCACHE)) {
4652                 mutex_enter(&rp->r_statelock);
4653
4654                 /*
4655                  * Calculate the number of readaheads to do.
4656                  * a) No readaheads at offset = 0.
4657                  * b) Do maximum(nfs3_nra) readaheads when the readahead
4658                  *    window is closed.
4659                  * c) Do readaheads between 1 to (nfs3_nra - 1) depending
4660                  *    upon how far the readahead window is open or close.
4661                  * d) No readaheads if rp->r_nextr is not within the scope
4662                  *    of the readahead window (random i/o).
4663                  */
4664
4665                 if (off == 0)
4666                         readahead = 0;
4667                 else if (blkoff == rp->r_nextr)
4668                         readahead = nfs3_nra;
4669                 else if (rp->r_nextr > blkoff &&
4670                     ((ra_window = (rp->r_nextr - blkoff) / bsize)
4671                     <= (nfs3_nra - 1)))
4672                         readahead = nfs3_nra - ra_window;
4673                 else
4674                         readahead = 0;
4675
4676                 rablkoff = rp->r_nextr;
4677                 while (readahead > 0 && rablkoff + bsize < rp->r_size) {
4678                         mutex_exit(&rp->r_statelock);
4679                         if (nfs_async_readahead(vp, rablkoff + bsize,
4680                             addr + (rablkoff + bsize - off), seg, cr,
4681                             nfs3_readahead) < 0) {
4682                                 mutex_enter(&rp->r_statelock);
4683                                 break;
4684                         }
4685                         readahead--;
4686                         rablkoff += bsize;
4687                         /*
4688                          * Indicate that we did a readahead so
4689                          * readahead offset is not updated
4690                          * by the synchronous read below.
4691                          */
4692                         readahead_issued = 1;
4693                         mutex_enter(&rp->r_statelock);
4694                         /*
4695                          * set readahead offset to
4696                          * offset of last async readahead
4697                          * request.
4698                          */
4699                         rp->r_nextr = rablkoff;
4700                 }
4701                 mutex_exit(&rp->r_statelock);
4702         }
4703
4704 again:
4705         if ((pagefound = page_exists(&vp->v_object, off)) == NULL) {
4706                 if (pl == NULL) {
4707                         (void) nfs_async_readahead(vp, blkoff, addr, seg, cr,
4708                             nfs3_readahead);
4709                 } else if (rw == S_CREATE) {
4710                         /*
4711                          * Block for this page is not allocated, or the offset
4712                          * is beyond the current allocation size, or we're
4713                          * allocating a swap slot and the page was not found,
4714                          * so allocate it and return a zero page.
4715                          */
4716                         if ((pp = page_create_va(&vp->v_object, off,
4717                             PAGESIZE, PG_WAIT, seg, addr)) == NULL)
4718                                 cmn_err(CE_PANIC, "nfs3_getapage: page_create");
4719                         io_len = PAGESIZE;
4720                         mutex_enter(&rp->r_statelock);
4721                         rp->r_nextr = off + PAGESIZE;
4722                         mutex_exit(&rp->r_statelock);
4723                 } else {
4724                         /*
4725                          * Need to go to server to get a BLOCK, exception to
4726                          * that being while reading at offset = 0 or doing
4727                          * random i/o, in that case read only a PAGE.
4728                          */
4729                         mutex_enter(&rp->r_statelock);
4730                         if (blkoff < rp->r_size &&
4731                             blkoff + bsize >= rp->r_size) {
4732                                 /*
4733                                  * If only a block or less is left in
4734                                  * the file, read all that is remaining.
4735                                  */
4736                                 if (rp->r_size <= off) {
4737                                         /*
4738                                          * Trying to access beyond EOF,
4739                                          * set up to get at least one page.
4740                                          */
4741                                         blksize = off + PAGESIZE - blkoff;
4742                                 } else
4743                                         blksize = rp->r_size - blkoff;
4744                         } else if ((off == 0) ||
4745                             (off != rp->r_nextr && !readahead_issued)) {
4746                                 blksize = PAGESIZE;
4747                                 blkoff = off; /* block = page here */
4748                         } else
4749                                 blksize = bsize;
4750                         mutex_exit(&rp->r_statelock);
4751
4752                         pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
4753                             &io_len, blkoff, blksize, 0);
4754
4755                         /*
4756                          * Some other thread has entered the page,
4757                          * so just use it.
4758                          */
4759                         if (pp == NULL)
4760                                 goto again;
4761
4762                         /*
4763                          * Now round the request size up to page boundaries.
4764                          * This ensures that the entire page will be
4765                          * initialized to zeroes if EOF is encountered.
4766                          */
4767                         io_len = ptob(btopr(io_len));
4768
4769                         bp = pageio_setup(pp, io_len, vp, B_READ);
4770                         ASSERT(bp != NULL);
4771
4772                         /*
4773                          * pageio_setup should have set b_addr to 0.  This
4774                          * is correct since we want to do I/O on a page
4775                          * boundary.  bp_mapin will use this addr to calculate
4776                          * an offset, and then set b_addr to the kernel virtual
4777                          * address it allocated for us.
4778                          */
4779                         ASSERT(bp->b_un.b_addr == 0);
4780
4781                         bp->b_edev = 0;
4782                         bp->b_dev = 0;
4783                         bp->b_lblkno = lbtodb(io_off);
4784                         bp->b_file = vp;
4785                         bp->b_offset = (offset_t)off;
4786                         bp_mapin(bp);
4787
4788                         /*
4789                          * If doing a write beyond what we believe is EOF,
4790                          * don't bother trying to read the pages from the
4791                          * server, we'll just zero the pages here.  We
4792                          * don't check that the rw flag is S_WRITE here
4793                          * because some implementations may attempt a
4794                          * read access to the buffer before copying data.
4795                          */
4796                         mutex_enter(&rp->r_statelock);
4797                         if (io_off >= rp->r_size && seg == segkmap) {
4798                                 mutex_exit(&rp->r_statelock);
4799                                 bzero(bp->b_un.b_addr, io_len);
4800                         } else {
4801                                 mutex_exit(&rp->r_statelock);
4802                                 error = nfs3_bio(bp, NULL, cr);
4803                         }
4804
4805                         /*
4806                          * Unmap the buffer before freeing it.
4807                          */
4808                         bp_mapout(bp);
4809                         pageio_done(bp);
4810
4811                         savepp = pp;
4812                         do {
4813                                 pp->p_fsdata = C_NOCOMMIT;
4814                         } while ((pp = pp->p_next) != savepp);
4815
4816                         if (error == NFS_EOF) {
4817                                 /*
4818                                  * If doing a write system call just return
4819                                  * zeroed pages, else user tried to get pages
4820                                  * beyond EOF, return error.  We don't check
4821                                  * that the rw flag is S_WRITE here because
4822                                  * some implementations may attempt a read
4823                                  * access to the buffer before copying data.
4824                                  */
4825                                 if (seg == segkmap)
4826                                         error = 0;
4827                                 else
4828                                         error = EFAULT;
4829                         }
4830
4831                         if (!readahead_issued && !error) {
4832                                 mutex_enter(&rp->r_statelock);
4833                                 rp->r_nextr = io_off + io_len;
4834                                 mutex_exit(&rp->r_statelock);
4835                         }
4836                 }
4837         }
4838
4839 out:
4840         if (pl == NULL)
4841                 return (error);
4842
4843         if (error) {
4844                 if (pp != NULL)
4845                         pvn_read_done(pp, B_ERROR);
4846                 return (error);
4847         }
4848
4849         if (pagefound) {
4850                 se_t se = (rw == S_CREATE ? SE_EXCL : SE_SHARED);
4851
4852                 /*
4853                  * Page exists in the cache, acquire the appropriate lock.
4854                  * If this fails, start all over again.
4855                  */
4856                 if ((pp = page_lookup(&vp->v_object, off, se)) == NULL) {
4857 #ifdef DEBUG
4858                         nfs3_lostpage++;
4859 #endif
4860                         goto reread;
4861                 }
4862                 pl[0] = pp;
4863                 pl[1] = NULL;
4864                 return (0);
4865         }
4866
4867         if (pp != NULL)
4868                 pvn_plist_init(pp, pl, plsz, off, io_len, rw);
4869
4870         return (error);
4871 }
4872
4873 static void
4874 nfs3_readahead(vnode_t *vp, uoff_t blkoff, caddr_t addr, struct seg *seg,
4875         cred_t *cr)
4876 {
4877         int error;
4878         page_t *pp;
4879         uoff_t io_off;
4880         size_t io_len;
4881         struct buf *bp;
4882         uint_t bsize, blksize;
4883         rnode_t *rp = VTOR(vp);
4884         page_t *savepp;
4885
4886         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
4887         bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
4888
4889         mutex_enter(&rp->r_statelock);
4890         if (blkoff < rp->r_size && blkoff + bsize > rp->r_size) {
4891                 /*
4892                  * If less than a block left in file read less
4893                  * than a block.
4894                  */
4895                 blksize = rp->r_size - blkoff;
4896         } else
4897                 blksize = bsize;
4898         mutex_exit(&rp->r_statelock);
4899
4900         pp = pvn_read_kluster(vp, blkoff, segkmap, addr,
4901             &io_off, &io_len, blkoff, blksize, 1);
4902         /*
4903          * The isra flag passed to the kluster function is 1, we may have
4904          * gotten a return value of NULL for a variety of reasons (# of free
4905          * pages < minfree, someone entered the page on the vnode etc). In all
4906          * cases, we want to punt on the readahead.
4907          */
4908         if (pp == NULL)
4909                 return;
4910
4911         /*
4912          * Now round the request size up to page boundaries.
4913          * This ensures that the entire page will be
4914          * initialized to zeroes if EOF is encountered.
4915          */
4916         io_len = ptob(btopr(io_len));
4917
4918         bp = pageio_setup(pp, io_len, vp, B_READ);
4919         ASSERT(bp != NULL);
4920
4921         /*
4922          * pageio_setup should have set b_addr to 0.  This is correct since
4923          * we want to do I/O on a page boundary. bp_mapin() will use this addr
4924          * to calculate an offset, and then set b_addr to the kernel virtual
4925          * address it allocated for us.
4926          */
4927         ASSERT(bp->b_un.b_addr == 0);
4928
4929         bp->b_edev = 0;
4930         bp->b_dev = 0;
4931         bp->b_lblkno = lbtodb(io_off);
4932         bp->b_file = vp;
4933         bp->b_offset = (offset_t)blkoff;
4934         bp_mapin(bp);
4935
4936         /*
4937          * If doing a write beyond what we believe is EOF, don't bother trying
4938          * to read the pages from the server, we'll just zero the pages here.
4939          * We don't check that the rw flag is S_WRITE here because some
4940          * implementations may attempt a read access to the buffer before
4941          * copying data.
4942          */
4943         mutex_enter(&rp->r_statelock);
4944         if (io_off >= rp->r_size && seg == segkmap) {
4945                 mutex_exit(&rp->r_statelock);
4946                 bzero(bp->b_un.b_addr, io_len);
4947                 error = 0;
4948         } else {
4949                 mutex_exit(&rp->r_statelock);
4950                 error = nfs3_bio(bp, NULL, cr);
4951                 if (error == NFS_EOF)
4952                         error = 0;
4953         }
4954
4955         /*
4956          * Unmap the buffer before freeing it.
4957          */
4958         bp_mapout(bp);
4959         pageio_done(bp);
4960
4961         savepp = pp;
4962         do {
4963                 pp->p_fsdata = C_NOCOMMIT;
4964         } while ((pp = pp->p_next) != savepp);
4965
4966         pvn_read_done(pp, error ? B_READ | B_ERROR : B_READ);
4967
4968         /*
4969          * In case of error set readahead offset
4970          * to the lowest offset.
4971          * pvn_read_done() calls VN_DISPOSE to destroy the pages
4972          */
4973         if (error && rp->r_nextr > io_off) {
4974                 mutex_enter(&rp->r_statelock);
4975                 if (rp->r_nextr > io_off)
4976                         rp->r_nextr = io_off;
4977                 mutex_exit(&rp->r_statelock);
4978         }
4979 }
4980
4981 /*
4982  * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE}
4983  * If len == 0, do from off to EOF.
4984  *
4985  * The normal cases should be len == 0 && off == 0 (entire vp list),
4986  * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE
4987  * (from pageout).
4988  */
4989 /* ARGSUSED */
4990 static int
4991 nfs3_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
4992         caller_context_t *ct)
4993 {
4994         int error;
4995         rnode_t *rp;
4996
4997         ASSERT(cr != NULL);
4998
4999         /*
5000          * XXX - Why should this check be made here?
5001          */
5002         if (vp->v_flag & VNOMAP)
5003                 return (ENOSYS);
5004         if (len == 0 && !(flags & B_INVAL) && vn_is_readonly(vp))
5005                 return (0);
5006         if (!(flags & B_ASYNC) && nfs_zone() != VTOMI(vp)->mi_zone)
5007                 return (EIO);
5008
5009         rp = VTOR(vp);
5010         mutex_enter(&rp->r_statelock);
5011         rp->r_count++;
5012         mutex_exit(&rp->r_statelock);
5013         error = nfs_putpages(vp, off, len, flags, cr);
5014         mutex_enter(&rp->r_statelock);
5015         rp->r_count--;
5016         cv_broadcast(&rp->r_cv);
5017         mutex_exit(&rp->r_statelock);
5018
5019         return (error);
5020 }
5021
5022 /*
5023  * Write out a single page, possibly klustering adjacent dirty pages.
5024  */
5025 int
5026 nfs3_putapage(vnode_t *vp, page_t *pp, uoff_t *offp, size_t *lenp,
5027         int flags, cred_t *cr)
5028 {
5029         uoff_t io_off;
5030         uoff_t lbn_off;
5031         uoff_t lbn;
5032         size_t io_len;
5033         uint_t bsize;
5034         int error;
5035         rnode_t *rp;
5036
5037         ASSERT(!vn_is_readonly(vp));
5038         ASSERT(pp != NULL);
5039         ASSERT(cr != NULL);
5040         ASSERT((flags & B_ASYNC) || nfs_zone() == VTOMI(vp)->mi_zone);
5041
5042         rp = VTOR(vp);
5043         ASSERT(rp->r_count > 0);
5044
5045         bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
5046         lbn = pp->p_offset / bsize;
5047         lbn_off = lbn * bsize;
5048
5049         /*
5050          * Find a kluster that fits in one block, or in
5051          * one page if pages are bigger than blocks.  If
5052          * there is less file space allocated than a whole
5053          * page, we'll shorten the i/o request below.
5054          */
5055         pp = pvn_write_kluster(vp, pp, &io_off, &io_len, lbn_off,
5056             roundup(bsize, PAGESIZE), flags);
5057
5058         /*
5059          * pvn_write_kluster shouldn't have returned a page with offset
5060          * behind the original page we were given.  Verify that.
5061          */
5062         ASSERT((pp->p_offset / bsize) >= lbn);
5063
5064         /*
5065          * Now pp will have the list of kept dirty pages marked for
5066          * write back.  It will also handle invalidation and freeing
5067          * of pages that are not dirty.  Check for page length rounding
5068          * problems.
5069          */
5070         if (io_off + io_len > lbn_off + bsize) {
5071                 ASSERT((io_off + io_len) - (lbn_off + bsize) < PAGESIZE);
5072                 io_len = lbn_off + bsize - io_off;
5073         }
5074         /*
5075          * The RMODINPROGRESS flag makes sure that nfs(3)_bio() sees a
5076          * consistent value of r_size. RMODINPROGRESS is set in writerp().
5077          * When RMODINPROGRESS is set it indicates that a uiomove() is in
5078          * progress and the r_size has not been made consistent with the
5079          * new size of the file. When the uiomove() completes the r_size is
5080          * updated and the RMODINPROGRESS flag is cleared.
5081          *
5082          * The RMODINPROGRESS flag makes sure that nfs(3)_bio() sees a
5083          * consistent value of r_size. Without this handshaking, it is
5084          * possible that nfs(3)_bio() picks  up the old value of r_size
5085          * before the uiomove() in writerp() completes. This will result
5086          * in the write through nfs(3)_bio() being dropped.
5087          *
5088          * More precisely, there is a window between the time the uiomove()
5089          * completes and the time the r_size is updated. If a fop_putpage()
5090          * operation intervenes in this window, the page will be picked up,
5091          * because it is dirty (it will be unlocked, unless it was
5092          * pagecreate'd). When the page is picked up as dirty, the dirty
5093          * bit is reset (pvn_getdirty()). In nfs(3)write(), r_size is
5094          * checked. This will still be the old size. Therefore the page will
5095          * not be written out. When segmap_release() calls fop_putpage(),
5096          * the page will be found to be clean and the write will be dropped.
5097          */
5098         if (rp->r_flags & RMODINPROGRESS) {
5099                 mutex_enter(&rp->r_statelock);
5100                 if ((rp->r_flags & RMODINPROGRESS) &&
5101                     rp->r_modaddr + MAXBSIZE > io_off &&
5102                     rp->r_modaddr < io_off + io_len) {
5103                         page_t *plist;
5104                         /*
5105                          * A write is in progress for this region of the file.
5106                          * If we did not detect RMODINPROGRESS here then this
5107                          * path through nfs_putapage() would eventually go to
5108                          * nfs(3)_bio() and may not write out all of the data
5109                          * in the pages. We end up losing data. So we decide
5110                          * to set the modified bit on each page in the page
5111                          * list and mark the rnode with RDIRTY. This write
5112                          * will be restarted at some later time.
5113                          */
5114                         plist = pp;
5115                         while (plist != NULL) {
5116                                 pp = plist;
5117                                 page_sub(&plist, pp);
5118                                 hat_setmod(pp);
5119                                 page_io_unlock(pp);
5120                                 page_unlock(pp);
5121                         }
5122                         rp->r_flags |= RDIRTY;
5123                         mutex_exit(&rp->r_statelock);
5124                         if (offp)
5125                                 *offp = io_off;
5126                         if (lenp)
5127                                 *lenp = io_len;
5128                         return (0);
5129                 }
5130                 mutex_exit(&rp->r_statelock);
5131         }
5132
5133         if (flags & B_ASYNC) {
5134                 error = nfs_async_putapage(vp, pp, io_off, io_len, flags, cr,
5135                     nfs3_sync_putapage);
5136         } else
5137                 error = nfs3_sync_putapage(vp, pp, io_off, io_len, flags, cr);
5138
5139         if (offp)
5140                 *offp = io_off;
5141         if (lenp)
5142                 *lenp = io_len;
5143         return (error);
5144 }
5145
5146 static int
5147 nfs3_sync_putapage(vnode_t *vp, page_t *pp, uoff_t io_off, size_t io_len,
5148         int flags, cred_t *cr)
5149 {
5150         int error;
5151         rnode_t *rp;
5152
5153         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
5154
5155         flags |= B_WRITE;
5156
5157         error = nfs3_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
5158
5159         rp = VTOR(vp);
5160
5161         if ((error == ENOSPC || error == EDQUOT || error == EFBIG ||
5162             error == EACCES) &&
5163             (flags & (B_INVAL|B_FORCE)) != (B_INVAL|B_FORCE)) {
5164                 if (!(rp->r_flags & ROUTOFSPACE)) {
5165                         mutex_enter(&rp->r_statelock);
5166                         rp->r_flags |= ROUTOFSPACE;
5167                         mutex_exit(&rp->r_statelock);
5168                 }
5169                 flags |= B_ERROR;
5170                 pvn_write_done(pp, flags);
5171                 /*
5172                  * If this was not an async thread, then try again to
5173                  * write out the pages, but this time, also destroy
5174                  * them whether or not the write is successful.  This
5175                  * will prevent memory from filling up with these
5176                  * pages and destroying them is the only alternative
5177                  * if they can't be written out.
5178                  *
5179                  * Don't do this if this is an async thread because
5180                  * when the pages are unlocked in pvn_write_done,
5181                  * some other thread could have come along, locked
5182                  * them, and queued for an async thread.  It would be
5183                  * possible for all of the async threads to be tied
5184                  * up waiting to lock the pages again and they would
5185                  * all already be locked and waiting for an async
5186                  * thread to handle them.  Deadlock.
5187                  */
5188                 if (!(flags & B_ASYNC)) {
5189                         error = nfs3_putpage(vp, io_off, io_len,
5190                             B_INVAL | B_FORCE, cr, NULL);
5191                 }
5192         } else {
5193                 if (error)
5194                         flags |= B_ERROR;
5195                 else if (rp->r_flags & ROUTOFSPACE) {
5196                         mutex_enter(&rp->r_statelock);
5197                         rp->r_flags &= ~ROUTOFSPACE;
5198                         mutex_exit(&rp->r_statelock);
5199                 }
5200                 pvn_write_done(pp, flags);
5201                 if (freemem < desfree)
5202                         (void) nfs3_commit_vp(vp, 0, 0, cr);
5203         }
5204
5205         return (error);
5206 }
5207
5208 /* ARGSUSED */
5209 static int
5210 nfs3_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
5211         size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
5212         cred_t *cr, caller_context_t *ct)
5213 {
5214         struct segvn_crargs vn_a;
5215         int error;
5216         rnode_t *rp;
5217         struct vattr va;
5218
5219         if (nfs_zone() != VTOMI(vp)->mi_zone)
5220                 return (EIO);
5221
5222         if (vp->v_flag & VNOMAP)
5223                 return (ENOSYS);
5224
5225         if (off < 0 || off + len < 0)
5226                 return (ENXIO);
5227
5228         if (vp->v_type != VREG)
5229                 return (ENODEV);
5230
5231         /*
5232          * If there is cached data and if close-to-open consistency
5233          * checking is not turned off and if the file system is not
5234          * mounted readonly, then force an over the wire getattr.
5235          * Otherwise, just invoke nfs3getattr to get a copy of the
5236          * attributes.  The attribute cache will be used unless it
5237          * is timed out and if it is, then an over the wire getattr
5238          * will be issued.
5239          */
5240         va.va_mask = AT_ALL;
5241         if (vn_has_cached_data(vp) &&
5242             !(VTOMI(vp)->mi_flags & MI_NOCTO) && !vn_is_readonly(vp))
5243                 error = nfs3_getattr_otw(vp, &va, cr);
5244         else
5245                 error = nfs3getattr(vp, &va, cr);
5246         if (error)
5247                 return (error);
5248
5249         /*
5250          * Check to see if the vnode is currently marked as not cachable.
5251          * This means portions of the file are locked (through fop_frlock).
5252          * In this case the map request must be refused.  We use
5253          * rp->r_lkserlock to avoid a race with concurrent lock requests.
5254          */
5255         rp = VTOR(vp);
5256
5257         /*
5258          * Atomically increment r_inmap after acquiring r_rwlock. The
5259          * idea here is to acquire r_rwlock to block read/write and
5260          * not to protect r_inmap. r_inmap will inform nfs3_read/write()
5261          * that we are in nfs3_map(). Now, r_rwlock is acquired in order
5262          * and we can prevent the deadlock that would have occurred
5263          * when nfs3_addmap() would have acquired it out of order.
5264          *
5265          * Since we are not protecting r_inmap by any lock, we do not
5266          * hold any lock when we decrement it. We atomically decrement
5267          * r_inmap after we release r_lkserlock.
5268          */
5269
5270         if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR(vp)))
5271                 return (EINTR);
5272         atomic_inc_uint(&rp->r_inmap);
5273         nfs_rw_exit(&rp->r_rwlock);
5274
5275         if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp))) {
5276                 atomic_dec_uint(&rp->r_inmap);
5277                 return (EINTR);
5278         }
5279
5280         if (vp->v_flag & VNOCACHE) {
5281                 error = EAGAIN;
5282                 goto done;
5283         }
5284
5285         /*
5286          * Don't allow concurrent locks and mapping if mandatory locking is
5287          * enabled.
5288          */
5289         if ((flk_has_remote_locks(vp) || lm_has_sleep(vp)) &&
5290             MANDLOCK(vp, va.va_mode)) {
5291                 error = EAGAIN;
5292                 goto done;
5293         }
5294
5295         as_rangelock(as);
5296         error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
5297         if (error != 0) {
5298                 as_rangeunlock(as);
5299                 goto done;
5300         }
5301
5302         vn_a.vp = vp;
5303         vn_a.offset = off;
5304         vn_a.type = (flags & MAP_TYPE);
5305         vn_a.prot = (uchar_t)prot;
5306         vn_a.maxprot = (uchar_t)maxprot;
5307         vn_a.flags = (flags & ~MAP_TYPE);
5308         vn_a.cred = cr;
5309         vn_a.amp = NULL;
5310         vn_a.szc = 0;
5311         vn_a.lgrp_mem_policy_flags = 0;
5312
5313         error = as_map(as, *addrp, len, segvn_create, &vn_a);
5314         as_rangeunlock(as);
5315
5316 done:
5317         nfs_rw_exit(&rp->r_lkserlock);
5318         atomic_dec_uint(&rp->r_inmap);
5319         return (error);
5320 }
5321
5322 /* ARGSUSED */
5323 static int
5324 nfs3_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
5325         size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
5326         cred_t *cr, caller_context_t *ct)
5327 {
5328         rnode_t *rp;
5329
5330         if (vp->v_flag & VNOMAP)
5331                 return (ENOSYS);
5332         if (nfs_zone() != VTOMI(vp)->mi_zone)
5333                 return (EIO);
5334
5335         rp = VTOR(vp);
5336         atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len));
5337
5338         return (0);
5339 }
5340
5341 /* ARGSUSED */
5342 static int
5343 nfs3_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
5344         offset_t offset, struct flk_callback *flk_cbp, cred_t *cr,
5345         caller_context_t *ct)
5346 {
5347         netobj lm_fh3;
5348         int rc;
5349         uoff_t start, end;
5350         rnode_t *rp;
5351         int error = 0, intr = INTR(vp);
5352
5353         if (nfs_zone() != VTOMI(vp)->mi_zone)
5354                 return (EIO);
5355         /* check for valid cmd parameter */
5356         if (cmd != F_GETLK && cmd != F_SETLK && cmd != F_SETLKW)
5357                 return (EINVAL);
5358
5359         /* Verify l_type. */
5360         switch (bfp->l_type) {
5361         case F_RDLCK:
5362                 if (cmd != F_GETLK && !(flag & FREAD))
5363                         return (EBADF);
5364                 break;
5365         case F_WRLCK:
5366                 if (cmd != F_GETLK && !(flag & FWRITE))
5367                         return (EBADF);
5368                 break;
5369         case F_UNLCK:
5370                 intr = 0;
5371                 break;
5372
5373         default:
5374                 return (EINVAL);
5375         }
5376
5377         /* check the validity of the lock range */
5378         if (rc = flk_convert_lock_data(vp, bfp, &start, &end, offset))
5379                 return (rc);
5380         if (rc = flk_check_lock_data(start, end, MAXEND))
5381                 return (rc);
5382
5383         /*
5384          * If the filesystem is mounted using local locking, pass the
5385          * request off to the local locking code.
5386          */
5387         if (VTOMI(vp)->mi_flags & MI_LLOCK) {
5388                 if (cmd == F_SETLK || cmd == F_SETLKW) {
5389                         /*
5390                          * For complete safety, we should be holding
5391                          * r_lkserlock.  However, we can't call
5392                          * lm_safelock and then fs_frlock while
5393                          * holding r_lkserlock, so just invoke
5394                          * lm_safelock and expect that this will
5395                          * catch enough of the cases.
5396                          */
5397                         if (!lm_safelock(vp, bfp, cr))
5398                                 return (EAGAIN);
5399                 }
5400                 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
5401         }
5402
5403         rp = VTOR(vp);
5404
5405         /*
5406          * Check whether the given lock request can proceed, given the
5407          * current file mappings.
5408          */
5409         if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, intr))
5410                 return (EINTR);
5411         if (cmd == F_SETLK || cmd == F_SETLKW) {
5412                 if (!lm_safelock(vp, bfp, cr)) {
5413                         rc = EAGAIN;
5414                         goto done;
5415                 }
5416         }
5417
5418         /*
5419          * Flush the cache after waiting for async I/O to finish.  For new
5420          * locks, this is so that the process gets the latest bits from the
5421          * server.  For unlocks, this is so that other clients see the
5422          * latest bits once the file has been unlocked.  If currently dirty
5423          * pages can't be flushed, then don't allow a lock to be set.  But
5424          * allow unlocks to succeed, to avoid having orphan locks on the
5425          * server.
5426          */
5427         if (cmd != F_GETLK) {
5428                 mutex_enter(&rp->r_statelock);
5429                 while (rp->r_count > 0) {
5430                         if (intr) {
5431                                 klwp_t *lwp = ttolwp(curthread);
5432
5433                                 if (lwp != NULL)
5434                                         lwp->lwp_nostop++;
5435                                 if (cv_wait_sig(&rp->r_cv,
5436                                     &rp->r_statelock) == 0) {
5437                                         if (lwp != NULL)
5438                                                 lwp->lwp_nostop--;
5439                                         rc = EINTR;
5440                                         break;
5441                                 }
5442                                 if (lwp != NULL)
5443                                         lwp->lwp_nostop--;
5444                         } else
5445                                 cv_wait(&rp->r_cv, &rp->r_statelock);
5446                 }
5447                 mutex_exit(&rp->r_statelock);
5448                 if (rc != 0)
5449                         goto done;
5450                 error = nfs3_putpage(vp, (offset_t)0, 0, B_INVAL, cr, ct);
5451                 if (error) {
5452                         if (error == ENOSPC || error == EDQUOT) {
5453                                 mutex_enter(&rp->r_statelock);
5454                                 if (!rp->r_error)
5455                                         rp->r_error = error;
5456                                 mutex_exit(&rp->r_statelock);
5457                         }
5458                         if (bfp->l_type != F_UNLCK) {
5459                                 rc = ENOLCK;
5460                                 goto done;
5461                         }
5462                 }
5463         }
5464
5465         lm_fh3.n_len = VTOFH3(vp)->fh3_length;
5466         lm_fh3.n_bytes = (char *)&(VTOFH3(vp)->fh3_u.data);
5467
5468         /*
5469          * Call the lock manager to do the real work of contacting
5470          * the server and obtaining the lock.
5471          */
5472         rc = lm4_frlock(vp, cmd, bfp, flag, offset, cr, &lm_fh3, flk_cbp);
5473
5474         if (rc == 0)
5475                 nfs_lockcompletion(vp, cmd);
5476
5477 done:
5478         nfs_rw_exit(&rp->r_lkserlock);
5479         return (rc);
5480 }
5481
5482 /*
5483  * Free storage space associated with the specified vnode.  The portion
5484  * to be freed is specified by bfp->l_start and bfp->l_len (already
5485  * normalized to a "whence" of 0).
5486  *
5487  * This is an experimental facility whose continued existence is not
5488  * guaranteed.  Currently, we only support the special case
5489  * of l_len == 0, meaning free to end of file.
5490  */
5491 /* ARGSUSED */
5492 static int
5493 nfs3_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
5494         offset_t offset, cred_t *cr, caller_context_t *ct)
5495 {
5496         int error;
5497
5498         ASSERT(vp->v_type == VREG);
5499         if (cmd != F_FREESP)
5500                 return (EINVAL);
5501         if (nfs_zone() != VTOMI(vp)->mi_zone)
5502                 return (EIO);
5503
5504         error = convoff(vp, bfp, 0, offset);
5505         if (!error) {
5506                 ASSERT(bfp->l_start >= 0);
5507                 if (bfp->l_len == 0) {
5508                         struct vattr va;
5509
5510                         /*
5511                          * ftruncate should not change the ctime and
5512                          * mtime if we truncate the file to its
5513                          * previous size.
5514                          */
5515                         va.va_mask = AT_SIZE;
5516                         error = nfs3getattr(vp, &va, cr);
5517                         if (error || va.va_size == bfp->l_start)
5518                                 return (error);
5519                         va.va_mask = AT_SIZE;
5520                         va.va_size = bfp->l_start;
5521                         error = nfs3setattr(vp, &va, 0, cr);
5522
5523                         if (error == 0 && bfp->l_start == 0)
5524                                 vnevent_truncate(vp, ct);
5525                 } else
5526                         error = EINVAL;
5527         }
5528
5529         return (error);
5530 }
5531
5532 /* ARGSUSED */
5533 static int
5534 nfs3_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct)
5535 {
5536
5537         return (EINVAL);
5538 }
5539
5540 /*
5541  * Setup and add an address space callback to do the work of the delmap call.
5542  * The callback will (and must be) deleted in the actual callback function.
5543  *
5544  * This is done in order to take care of the problem that we have with holding
5545  * the address space's a_lock for a long period of time (e.g. if the NFS server
5546  * is down).  Callbacks will be executed in the address space code while the
5547  * a_lock is not held.  Holding the address space's a_lock causes things such
5548  * as ps and fork to hang because they are trying to acquire this lock as well.
5549  */
5550 /* ARGSUSED */
5551 static int
5552 nfs3_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
5553         size_t len, uint_t prot, uint_t maxprot, uint_t flags,
5554         cred_t *cr, caller_context_t *ct)
5555 {
5556         int                     caller_found;
5557         int                     error;
5558         rnode_t                 *rp;
5559         nfs_delmap_args_t       *dmapp;
5560         nfs_delmapcall_t        *delmap_call;
5561
5562         if (vp->v_flag & VNOMAP)
5563                 return (ENOSYS);
5564         /*
5565          * A process may not change zones if it has NFS pages mmap'ed
5566          * in, so we can't legitimately get here from the wrong zone.
5567          */
5568         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
5569
5570         rp = VTOR(vp);
5571
5572         /*
5573          * The way that the address space of this process deletes its mapping
5574          * of this file is via the following call chains:
5575          * - as_free()->segop_unmap()/segvn_unmap()->fop_delmap()/nfs3_delmap()
5576          * - as_unmap()->segop_unmap()/segvn_unmap()->fop_delmap()/nfs3_delmap()
5577          *
5578          * With the use of address space callbacks we are allowed to drop the
5579          * address space lock, a_lock, while executing the NFS operations that
5580          * need to go over the wire.  Returning EAGAIN to the caller of this
5581          * function is what drives the execution of the callback that we add
5582          * below.  The callback will be executed by the address space code
5583          * after dropping the a_lock.  When the callback is finished, since
5584          * we dropped the a_lock, it must be re-acquired and segvn_unmap()
5585          * is called again on the same segment to finish the rest of the work
5586          * that needs to happen during unmapping.
5587          *
5588          * This action of calling back into the segment driver causes
5589          * nfs3_delmap() to get called again, but since the callback was
5590          * already executed at this point, it already did the work and there
5591          * is nothing left for us to do.
5592          *
5593          * To Summarize:
5594          * - The first time nfs3_delmap is called by the current thread is when
5595          * we add the caller associated with this delmap to the delmap caller
5596          * list, add the callback, and return EAGAIN.
5597          * - The second time in this call chain when nfs3_delmap is called we
5598          * will find this caller in the delmap caller list and realize there
5599          * is no more work to do thus removing this caller from the list and
5600          * returning the error that was set in the callback execution.
5601          */
5602         caller_found = nfs_find_and_delete_delmapcall(rp, &error);
5603         if (caller_found) {
5604                 /*
5605                  * 'error' is from the actual delmap operations.  To avoid
5606                  * hangs, we need to handle the return of EAGAIN differently
5607                  * since this is what drives the callback execution.
5608                  * In this case, we don't want to return EAGAIN and do the
5609                  * callback execution because there are none to execute.
5610                  */
5611                 if (error == EAGAIN)
5612                         return (0);
5613                 else
5614                         return (error);
5615         }
5616
5617         /* current caller was not in the list */
5618         delmap_call = nfs_init_delmapcall();
5619
5620         mutex_enter(&rp->r_statelock);
5621         list_insert_tail(&rp->r_indelmap, delmap_call);
5622         mutex_exit(&rp->r_statelock);
5623
5624         dmapp = kmem_alloc(sizeof (nfs_delmap_args_t), KM_SLEEP);
5625
5626         dmapp->vp = vp;
5627         dmapp->off = off;
5628         dmapp->addr = addr;
5629         dmapp->len = len;
5630         dmapp->prot = prot;
5631         dmapp->maxprot = maxprot;
5632         dmapp->flags = flags;
5633         dmapp->cr = cr;
5634         dmapp->caller = delmap_call;
5635
5636         error = as_add_callback(as, nfs3_delmap_callback, dmapp,
5637             AS_UNMAP_EVENT, addr, len, KM_SLEEP);
5638
5639         return (error ? error : EAGAIN);
5640 }
5641
5642 /*
5643  * Remove some pages from an mmap'd vnode.  Just update the
5644  * count of pages.  If doing close-to-open, then flush and
5645  * commit all of the pages associated with this file.
5646  * Otherwise, start an asynchronous page flush to write out
5647  * any dirty pages.  This will also associate a credential
5648  * with the rnode which can be used to write the pages.
5649  */
5650 /* ARGSUSED */
5651 static void
5652 nfs3_delmap_callback(struct as *as, void *arg, uint_t event)
5653 {
5654         int                     error;
5655         rnode_t                 *rp;
5656         mntinfo_t               *mi;
5657         nfs_delmap_args_t       *dmapp = (nfs_delmap_args_t *)arg;
5658
5659         rp = VTOR(dmapp->vp);
5660         mi = VTOMI(dmapp->vp);
5661
5662         atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(dmapp->len));
5663         ASSERT(rp->r_mapcnt >= 0);
5664
5665         /*
5666          * Initiate a page flush and potential commit if there are
5667          * pages, the file system was not mounted readonly, the segment
5668          * was mapped shared, and the pages themselves were writeable.
5669          */
5670         if (vn_has_cached_data(dmapp->vp) && !vn_is_readonly(dmapp->vp) &&
5671             dmapp->flags == MAP_SHARED && (dmapp->maxprot & PROT_WRITE)) {
5672                 mutex_enter(&rp->r_statelock);
5673                 rp->r_flags |= RDIRTY;
5674                 mutex_exit(&rp->r_statelock);
5675                 /*
5676                  * If this is a cross-zone access a sync putpage won't work, so
5677                  * the best we can do is try an async putpage.  That seems
5678                  * better than something more draconian such as discarding the
5679                  * dirty pages.
5680                  */
5681                 if ((mi->mi_flags & MI_NOCTO) ||
5682                     nfs_zone() != mi->mi_zone)
5683                         error = nfs3_putpage(dmapp->vp, dmapp->off, dmapp->len,
5684                             B_ASYNC, dmapp->cr, NULL);
5685                 else
5686                         error = nfs3_putpage_commit(dmapp->vp, dmapp->off,
5687                             dmapp->len, dmapp->cr);
5688                 if (!error) {
5689                         mutex_enter(&rp->r_statelock);
5690                         error = rp->r_error;
5691                         rp->r_error = 0;
5692                         mutex_exit(&rp->r_statelock);
5693                 }
5694         } else
5695                 error = 0;
5696
5697         if ((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO))
5698                 (void) nfs3_putpage(dmapp->vp, dmapp->off, dmapp->len,
5699                     B_INVAL, dmapp->cr, NULL);
5700
5701         dmapp->caller->error = error;
5702         (void) as_delete_callback(as, arg);
5703         kmem_free(dmapp, sizeof (nfs_delmap_args_t));
5704 }
5705
5706 static int nfs3_pathconf_disable_cache = 0;
5707
5708 #ifdef DEBUG
5709 static int nfs3_pathconf_cache_hits = 0;
5710 static int nfs3_pathconf_cache_misses = 0;
5711 #endif
5712
5713 /* ARGSUSED */
5714 static int
5715 nfs3_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
5716         caller_context_t *ct)
5717 {
5718         int error;
5719         PATHCONF3args args;
5720         PATHCONF3res res;
5721         int douprintf;
5722         failinfo_t fi;
5723         rnode_t *rp;
5724         hrtime_t t;
5725
5726         if (nfs_zone() != VTOMI(vp)->mi_zone)
5727                 return (EIO);
5728         /*
5729          * Large file spec - need to base answer on info stored
5730          * on original FSINFO response.
5731          */
5732         if (cmd == _PC_FILESIZEBITS) {
5733                 unsigned long long ll;
5734                 long l = 1;
5735
5736                 ll = VTOMI(vp)->mi_maxfilesize;
5737
5738                 if (ll == 0) {
5739                         *valp = 0;
5740                         return (0);
5741                 }
5742
5743                 if (ll & 0xffffffff00000000) {
5744                         l += 32; ll >>= 32;
5745                 }
5746                 if (ll & 0xffff0000) {
5747                         l += 16; ll >>= 16;
5748                 }
5749                 if (ll & 0xff00) {
5750                         l += 8; ll >>= 8;
5751                 }
5752                 if (ll & 0xf0) {
5753                         l += 4; ll >>= 4;
5754                 }
5755                 if (ll & 0xc) {
5756                         l += 2; ll >>= 2;
5757                 }
5758                 if (ll & 0x2)
5759                         l += 2;
5760                 else if (ll & 0x1)
5761                         l += 1;
5762                 *valp = l;
5763                 return (0);
5764         }
5765
5766         if (cmd == _PC_ACL_ENABLED) {
5767                 *valp = _ACL_ACLENT_ENABLED;
5768                 return (0);
5769         }
5770
5771         if (cmd == _PC_XATTR_EXISTS) {
5772                 error = 0;
5773                 *valp = 0;
5774                 if (vp->v_vfsp->vfs_flag & VFS_XATTR) {
5775                         vnode_t *avp;
5776                         rnode_t *rp;
5777                         int error = 0;
5778                         mntinfo_t *mi = VTOMI(vp);
5779
5780                         if (!(mi->mi_flags & MI_EXTATTR))
5781                                 return (0);
5782
5783                         rp = VTOR(vp);
5784                         if (nfs_rw_enter_sig(&rp->r_rwlock, RW_READER,
5785                             INTR(vp)))
5786                                 return (EINTR);
5787
5788                         error = nfs3lookup_dnlc(vp, XATTR_DIR_NAME, &avp, cr);
5789                         if (error || avp == NULL)
5790                                 error = acl_getxattrdir3(vp, &avp, 0, cr, 0);
5791
5792                         nfs_rw_exit(&rp->r_rwlock);
5793
5794                         if (error == 0 && avp != NULL) {
5795                                 error = do_xattr_exists_check(avp, valp, cr);
5796                                 VN_RELE(avp);
5797                         } else if (error == ENOENT) {
5798                                 error = 0;
5799                                 *valp = 0;
5800                         }
5801                 }
5802                 return (error);
5803         }
5804
5805         rp = VTOR(vp);
5806         if (rp->r_pathconf != NULL) {
5807                 mutex_enter(&rp->r_statelock);
5808                 if (rp->r_pathconf != NULL && nfs3_pathconf_disable_cache) {
5809                         kmem_free(rp->r_pathconf, sizeof (*rp->r_pathconf));
5810                         rp->r_pathconf = NULL;
5811                 }
5812                 if (rp->r_pathconf != NULL) {
5813                         error = 0;
5814                         switch (cmd) {
5815                         case _PC_LINK_MAX:
5816                                 *valp = rp->r_pathconf->link_max;
5817                                 break;
5818                         case _PC_NAME_MAX:
5819                                 *valp = rp->r_pathconf->name_max;
5820                                 break;
5821                         case _PC_PATH_MAX:
5822                         case _PC_SYMLINK_MAX:
5823                                 *valp = MAXPATHLEN;
5824                                 break;
5825                         case _PC_CHOWN_RESTRICTED:
5826                                 *valp = rp->r_pathconf->chown_restricted;
5827                                 break;
5828                         case _PC_NO_TRUNC:
5829                                 *valp = rp->r_pathconf->no_trunc;
5830                                 break;
5831                         default:
5832                                 error = EINVAL;
5833                                 break;
5834                         }
5835                         mutex_exit(&rp->r_statelock);
5836 #ifdef DEBUG
5837                         nfs3_pathconf_cache_hits++;
5838 #endif
5839                         return (error);
5840                 }
5841                 mutex_exit(&rp->r_statelock);
5842         }
5843 #ifdef DEBUG
5844         nfs3_pathconf_cache_misses++;
5845 #endif
5846
5847         args.object = *VTOFH3(vp);
5848         fi.vp = vp;
5849         fi.fhp = (caddr_t)&args.object;
5850         fi.copyproc = nfs3copyfh;
5851         fi.lookupproc = nfs3lookup;
5852         fi.xattrdirproc = acl_getxattrdir3;
5853
5854         douprintf = 1;
5855
5856         t = gethrtime();
5857
5858         error = rfs3call(VTOMI(vp), NFSPROC3_PATHCONF,
5859             xdr_nfs_fh3, (caddr_t)&args,
5860             xdr_PATHCONF3res, (caddr_t)&res, cr,
5861             &douprintf, &res.status, 0, &fi);
5862
5863         if (error)
5864                 return (error);
5865
5866         error = geterrno3(res.status);
5867
5868         if (!error) {
5869                 nfs3_cache_post_op_attr(vp, &res.resok.obj_attributes, t, cr);
5870                 if (!nfs3_pathconf_disable_cache) {
5871                         mutex_enter(&rp->r_statelock);
5872                         if (rp->r_pathconf == NULL) {
5873                                 rp->r_pathconf = kmem_alloc(
5874                                     sizeof (*rp->r_pathconf), KM_NOSLEEP);
5875                                 if (rp->r_pathconf != NULL)
5876                                         *rp->r_pathconf = res.resok.info;
5877                         }
5878                         mutex_exit(&rp->r_statelock);
5879                 }
5880                 switch (cmd) {
5881                 case _PC_LINK_MAX:
5882                         *valp = res.resok.info.link_max;
5883                         break;
5884                 case _PC_NAME_MAX:
5885                         *valp = res.resok.info.name_max;
5886                         break;
5887                 case _PC_PATH_MAX:
5888                 case _PC_SYMLINK_MAX:
5889                         *valp = MAXPATHLEN;
5890                         break;
5891                 case _PC_CHOWN_RESTRICTED:
5892                         *valp = res.resok.info.chown_restricted;
5893                         break;
5894                 case _PC_NO_TRUNC:
5895                         *valp = res.resok.info.no_trunc;
5896                         break;
5897                 default:
5898                         return (EINVAL);
5899                 }
5900         } else {
5901                 nfs3_cache_post_op_attr(vp, &res.resfail.obj_attributes, t, cr);
5902                 PURGE_STALE_FH(error, vp, cr);
5903         }
5904
5905         return (error);
5906 }
5907
5908 /*
5909  * Called by async thread to do synchronous pageio. Do the i/o, wait
5910  * for it to complete, and cleanup the page list when done.
5911  */
5912 static int
5913 nfs3_sync_pageio(vnode_t *vp, page_t *pp, uoff_t io_off, size_t io_len,
5914         int flags, cred_t *cr)
5915 {
5916         int error;
5917
5918         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
5919         error = nfs3_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
5920         if (flags & B_READ)
5921                 pvn_read_done(pp, (error ? B_ERROR : 0) | flags);
5922         else
5923                 pvn_write_done(pp, (error ? B_ERROR : 0) | flags);
5924         return (error);
5925 }
5926
5927 /* ARGSUSED */
5928 static int
5929 nfs3_pageio(vnode_t *vp, page_t *pp, uoff_t io_off, size_t io_len,
5930         int flags, cred_t *cr, caller_context_t *ct)
5931 {
5932         int error;
5933         rnode_t *rp;
5934
5935         if (pp == NULL)
5936                 return (EINVAL);
5937         if (!(flags & B_ASYNC) && nfs_zone() != VTOMI(vp)->mi_zone)
5938                 return (EIO);
5939
5940         rp = VTOR(vp);
5941         mutex_enter(&rp->r_statelock);
5942         rp->r_count++;
5943         mutex_exit(&rp->r_statelock);
5944
5945         if (flags & B_ASYNC) {
5946                 error = nfs_async_pageio(vp, pp, io_off, io_len, flags, cr,
5947                     nfs3_sync_pageio);
5948         } else
5949                 error = nfs3_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
5950         mutex_enter(&rp->r_statelock);
5951         rp->r_count--;
5952         cv_broadcast(&rp->r_cv);
5953         mutex_exit(&rp->r_statelock);
5954         return (error);
5955 }
5956
5957 /* ARGSUSED */
5958 static void
5959 nfs3_dispose(vnode_t *vp, page_t *pp, int fl, int dn, cred_t *cr,
5960         caller_context_t *ct)
5961 {
5962         int error;
5963         rnode_t *rp;
5964         page_t *plist;
5965         page_t *pptr;
5966         offset3 offset;
5967         count3 len;
5968         k_sigset_t smask;
5969
5970         /*
5971          * We should get called with fl equal to either B_FREE or
5972          * B_INVAL.  Any other value is illegal.
5973          *
5974          * The page that we are either supposed to free or destroy
5975          * should be exclusive locked and its io lock should not
5976          * be held.
5977          */
5978         ASSERT(fl == B_FREE || fl == B_INVAL);
5979         ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr);
5980         rp = VTOR(vp);
5981
5982         /*
5983          * If the page doesn't need to be committed or we shouldn't
5984          * even bother attempting to commit it, then just make sure
5985          * that the p_fsdata byte is clear and then either free or
5986          * destroy the page as appropriate.
5987          */
5988         if (pp->p_fsdata == C_NOCOMMIT || (rp->r_flags & RSTALE)) {
5989                 pp->p_fsdata = C_NOCOMMIT;
5990                 if (fl == B_FREE)
5991                         page_free(pp, dn);
5992                 else
5993                         page_destroy(pp, dn);
5994                 return;
5995         }
5996
5997         /*
5998          * If there is a page invalidation operation going on, then
5999          * if this is one of the pages being destroyed, then just
6000          * clear the p_fsdata byte and then either free or destroy
6001          * the page as appropriate.
6002          */
6003         mutex_enter(&rp->r_statelock);
6004         if ((rp->r_flags & RTRUNCATE) && pp->p_offset >= rp->r_truncaddr) {
6005                 mutex_exit(&rp->r_statelock);
6006                 pp->p_fsdata = C_NOCOMMIT;
6007                 if (fl == B_FREE)
6008                         page_free(pp, dn);
6009                 else
6010                         page_destroy(pp, dn);
6011                 return;
6012         }
6013
6014         /*
6015          * If we are freeing this page and someone else is already
6016          * waiting to do a commit, then just unlock the page and
6017          * return.  That other thread will take care of commiting
6018          * this page.  The page can be freed sometime after the
6019          * commit has finished.  Otherwise, if the page is marked
6020          * as delay commit, then we may be getting called from
6021          * pvn_write_done, one page at a time.   This could result
6022          * in one commit per page, so we end up doing lots of small
6023          * commits instead of fewer larger commits.  This is bad,
6024          * we want do as few commits as possible.
6025          */
6026         if (fl == B_FREE) {
6027                 if (rp->r_flags & RCOMMITWAIT) {
6028                         page_unlock(pp);
6029                         mutex_exit(&rp->r_statelock);
6030                         return;
6031                 }
6032                 if (pp->p_fsdata == C_DELAYCOMMIT) {
6033                         pp->p_fsdata = C_COMMIT;
6034                         page_unlock(pp);
6035                         mutex_exit(&rp->r_statelock);
6036                         return;
6037                 }
6038         }
6039
6040         /*
6041          * Check to see if there is a signal which would prevent an
6042          * attempt to commit the pages from being successful.  If so,
6043          * then don't bother with all of the work to gather pages and
6044          * generate the unsuccessful RPC.  Just return from here and
6045          * let the page be committed at some later time.
6046          */
6047         sigintr(&smask, VTOMI(vp)->mi_flags & MI_INT);
6048         if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) {
6049                 sigunintr(&smask);
6050                 page_unlock(pp);
6051                 mutex_exit(&rp->r_statelock);
6052                 return;
6053         }
6054         sigunintr(&smask);
6055
6056         /*
6057          * We are starting to need to commit pages, so let's try
6058          * to commit as many as possible at once to reduce the
6059          * overhead.
6060          *
6061          * Set the `commit inprogress' state bit.  We must
6062          * first wait until any current one finishes.  Then
6063          * we initialize the c_pages list with this page.
6064          */
6065         while (rp->r_flags & RCOMMIT) {
6066                 rp->r_flags |= RCOMMITWAIT;
6067                 cv_wait(&rp->r_commit.c_cv, &rp->r_statelock);
6068                 rp->r_flags &= ~RCOMMITWAIT;
6069         }
6070         rp->r_flags |= RCOMMIT;
6071         mutex_exit(&rp->r_statelock);
6072         ASSERT(rp->r_commit.c_pages == NULL);
6073         rp->r_commit.c_pages = pp;
6074         rp->r_commit.c_commbase = (offset3)pp->p_offset;
6075         rp->r_commit.c_commlen = PAGESIZE;
6076
6077         /*
6078          * Gather together all other pages which can be committed.
6079          * They will all be chained off r_commit.c_pages.
6080          */
6081         nfs3_get_commit(vp);
6082
6083         /*
6084          * Clear the `commit inprogress' status and disconnect
6085          * the list of pages to be committed from the rnode.
6086          * At this same time, we also save the starting offset
6087          * and length of data to be committed on the server.
6088          */
6089         plist = rp->r_commit.c_pages;
6090         rp->r_commit.c_pages = NULL;
6091         offset = rp->r_commit.c_commbase;
6092         len = rp->r_commit.c_commlen;
6093         mutex_enter(&rp->r_statelock);
6094         rp->r_flags &= ~RCOMMIT;
6095         cv_broadcast(&rp->r_commit.c_cv);
6096         mutex_exit(&rp->r_statelock);
6097
6098         if (curproc == proc_pageout || curproc == proc_fsflush ||
6099             nfs_zone() != VTOMI(vp)->mi_zone) {
6100                 nfs_async_commit(vp, plist, offset, len, cr, nfs3_async_commit);
6101                 return;
6102         }
6103
6104         /*
6105          * Actually generate the COMMIT3 over the wire operation.
6106          */
6107         error = nfs3_commit(vp, offset, len, cr);
6108
6109         /*
6110          * If we got an error during the commit, just unlock all
6111          * of the pages.  The pages will get retransmitted to the
6112          * server during a putpage operation.
6113          */
6114         if (error) {
6115                 while (plist != NULL) {
6116                         pptr = plist;
6117                         page_sub(&plist, pptr);
6118                         page_unlock(pptr);
6119                 }
6120                 return;
6121         }
6122
6123         /*
6124          * We've tried as hard as we can to commit the data to stable
6125          * storage on the server.  We release the rest of the pages
6126          * and clear the commit required state.  They will be put
6127          * onto the tail of the cachelist if they are nolonger
6128          * mapped.
6129          */
6130         while (plist != pp) {
6131                 pptr = plist;
6132                 page_sub(&plist, pptr);
6133                 pptr->p_fsdata = C_NOCOMMIT;
6134                 (void) page_release(pptr, 1);
6135         }
6136
6137         /*
6138          * It is possible that nfs3_commit didn't return error but
6139          * some other thread has modified the page we are going
6140          * to free/destroy.
6141          *    In this case we need to rewrite the page. Do an explicit check
6142          * before attempting to free/destroy the page. If modified, needs to
6143          * be rewritten so unlock the page and return.
6144          */
6145         if (hat_ismod(pp)) {
6146                 pp->p_fsdata = C_NOCOMMIT;
6147                 page_unlock(pp);
6148                 return;
6149         }
6150
6151         /*
6152          * Now, as appropriate, either free or destroy the page
6153          * that we were called with.
6154          */
6155         pp->p_fsdata = C_NOCOMMIT;
6156         if (fl == B_FREE)
6157                 page_free(pp, dn);
6158         else
6159                 page_destroy(pp, dn);
6160 }
6161
/*
 * Issue an over-the-wire NFSPROC3_COMMIT for the byte range
 * (offset, count) of the file behind vp.
 *
 * The call is made with the credential cached in the rnode (r_cred)
 * when there is one; otherwise cr is installed as r_cred and used.
 * If the server answers EACCES and the credential used was not cr,
 * cr replaces r_cred and the call is retried once with cr.
 *
 * Returns:
 *      0                       the commit succeeded and the verifier in
 *                              the reply matches the cached r_verf
 *      NFS_VERF_MISMATCH       the commit succeeded but the verifier in
 *                              the reply differs from r_verf; r_verf is
 *                              updated and nfs3_set_mod() is called
 *      other non-zero          error from rfs3call() or the errno derived
 *                              from the reply status by geterrno3()
 *
 * An errno derived from the reply status is also recorded in r_error
 * if no error is recorded there yet; ESTALE additionally sets RSTALE
 * and purges the attribute cache.
 */
static int
nfs3_commit(vnode_t *vp, offset3 offset, count3 count, cred_t *cr)
{
        int error;
        rnode_t *rp;
        COMMIT3args args;
        COMMIT3res res;
        int douprintf;
        cred_t *cred;

        rp = VTOR(vp);
        ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);

        /*
         * Pick the credential for the call.  One hold is taken for the
         * local use of `cred' (dropped by the crfree() after the call);
         * when cr is newly stored in r_cred a second hold is taken for
         * that stored reference.
         */
        mutex_enter(&rp->r_statelock);
        if (rp->r_cred != NULL) {
                cred = rp->r_cred;
                crhold(cred);
        } else {
                rp->r_cred = cr;
                crhold(cr);
                cred = cr;
                crhold(cred);
        }
        mutex_exit(&rp->r_statelock);

        args.file = *VTOFH3(vp);
        args.offset = offset;
        args.count = count;

doitagain:
        douprintf = 1;
        error = rfs3call(VTOMI(vp), NFSPROC3_COMMIT,
            xdr_COMMIT3args, (caddr_t)&args,
            xdr_COMMIT3res, (caddr_t)&res, cred,
            &douprintf, &res.status, 0, NULL);

        /*
         * Drop the local hold.  `cred' is only compared by address
         * below, never dereferenced, until it is reassigned.
         */
        crfree(cred);

        /* The call itself failed; res was not examined. */
        if (error)
                return (error);

        error = geterrno3(res.status);
        if (!error) {
                ASSERT(rp->r_flags & RHAVEVERF);
                mutex_enter(&rp->r_statelock);
                if (rp->r_verf == res.resok.verf) {
                        mutex_exit(&rp->r_statelock);
                        return (0);
                }
                /*
                 * The verifier changed.  Remember the new one and report
                 * the mismatch; nfs3_putpage_commit() restarts its flush
                 * when it sees NFS_VERF_MISMATCH.
                 * NOTE(review): nfs3_set_mod() presumably marks the
                 * uncommitted pages as modified so they get rewritten --
                 * confirm against nfs_setmod_check.
                 */
                nfs3_set_mod(vp);
                rp->r_verf = res.resok.verf;
                mutex_exit(&rp->r_statelock);
                error = NFS_VERF_MISMATCH;
        } else {
                if (error == EACCES) {
                        mutex_enter(&rp->r_statelock);
                        /*
                         * The cached credential was refused.  Switch the
                         * rnode over to the caller's credential and try
                         * again; if cr was already the one used, give up.
                         */
                        if (cred != cr) {
                                if (rp->r_cred != NULL)
                                        crfree(rp->r_cred);
                                rp->r_cred = cr;
                                crhold(cr);
                                cred = cr;
                                crhold(cred);
                                mutex_exit(&rp->r_statelock);
                                goto doitagain;
                        }
                        mutex_exit(&rp->r_statelock);
                }
                /*
                 * Can't do a PURGE_STALE_FH here because this
                 * can cause a deadlock.  nfs3_commit can
                 * be called from nfs3_dispose which can be called
                 * indirectly via pvn_vplist_dirty.  PURGE_STALE_FH
                 * can call back to pvn_vplist_dirty.
                 */
                if (error == ESTALE) {
                        mutex_enter(&rp->r_statelock);
                        rp->r_flags |= RSTALE;
                        /* Keep the first recorded error. */
                        if (!rp->r_error)
                                rp->r_error = error;
                        mutex_exit(&rp->r_statelock);
                        PURGE_ATTRCACHE(vp);
                } else {
                        mutex_enter(&rp->r_statelock);
                        /* Keep the first recorded error. */
                        if (!rp->r_error)
                                rp->r_error = error;
                        mutex_exit(&rp->r_statelock);
                }
        }

        return (error);
}
6254
/*
 * Run pvn_vplist_setdirty() over the pages of vp with nfs_setmod_check
 * as the per-page callback.  nfs3_commit() calls this when the write
 * verifier in a COMMIT reply differs from the cached one.
 * NOTE(review): presumably this re-marks pages that were awaiting a
 * commit as modified so they are written again -- confirm against
 * nfs_setmod_check.
 */
static void
nfs3_set_mod(vnode_t *vp)
{
        ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);

        pvn_vplist_setdirty(vp, nfs_setmod_check);
}
6262
6263 /*
6264  * This routine is used to gather together a page list of the pages
6265  * which are to be committed on the server.  This routine must not
6266  * be called if the calling thread holds any locked pages.
6267  *
6268  * The calling thread must have set RCOMMIT.  This bit is used to
6269  * serialize access to the commit structure in the rnode.  As long
6270  * as the thread has set RCOMMIT, then it can manipulate the commit
6271  * structure without requiring any other locks.
6272  */
6273 static void
6274 nfs3_get_commit(vnode_t *vp)
6275 {
6276         rnode_t *rp;
6277         page_t *pp;
6278
6279         rp = VTOR(vp);
6280
6281         ASSERT(rp->r_flags & RCOMMIT);
6282
6283         vmobject_lock(&vp->v_object);
6284
6285         /*
6286          * Step through all of the pages associated with this vnode
6287          * looking for pages which need to be committed.
6288          */
6289         for (pp = vmobject_get_head(&vp->v_object);
6290              pp != NULL;
6291              pp = vmobject_get_next(&vp->v_object, pp)) {
6292                 /* Skip marker pages. */
6293                 if (PP_ISPVN_TAG(pp))
6294                         continue;
6295
6296                 /*
6297                  * If this page does not need to be committed or is
6298                  * modified, then just skip it.
6299                  */
6300                 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp))
6301                         continue;
6302
6303                 /*
6304                  * Attempt to lock the page.  If we can't, then
6305                  * someone else is messing with it and we will
6306                  * just skip it.
6307                  */
6308                 if (!page_trylock(pp, SE_EXCL))
6309                         continue;
6310
6311                 /*
6312                  * If this page does not need to be committed or is
6313                  * modified, then just skip it.  Recheck now that
6314                  * the page is locked.
6315                  */
6316                 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) {
6317                         page_unlock(pp);
6318                         continue;
6319                 }
6320
6321                 if (PP_ISFREE(pp)) {
6322                         cmn_err(CE_PANIC, "nfs3_get_commit: %p is free",
6323                             (void *)pp);
6324                 }
6325
6326                 /*
6327                  * The page needs to be committed and we locked it.
6328                  * Update the base and length parameters and add it
6329                  * to r_pages.
6330                  */
6331                 if (rp->r_commit.c_pages == NULL) {
6332                         rp->r_commit.c_commbase = (offset3)pp->p_offset;
6333                         rp->r_commit.c_commlen = PAGESIZE;
6334                 } else if (pp->p_offset < rp->r_commit.c_commbase) {
6335                         rp->r_commit.c_commlen = rp->r_commit.c_commbase -
6336                             (offset3)pp->p_offset + rp->r_commit.c_commlen;
6337                         rp->r_commit.c_commbase = (offset3)pp->p_offset;
6338                 } else if ((rp->r_commit.c_commbase + rp->r_commit.c_commlen)
6339                     <= pp->p_offset) {
6340                         rp->r_commit.c_commlen = (offset3)pp->p_offset -
6341                             rp->r_commit.c_commbase + PAGESIZE;
6342                 }
6343                 page_add(&rp->r_commit.c_pages, pp);
6344         }
6345
6346         vmobject_unlock(&vp->v_object);
6347 }
6348
6349 /*
6350  * This routine is used to gather together a page list of the pages
6351  * which are to be committed on the server.  This routine must not
6352  * be called if the calling thread holds any locked pages.
6353  *
6354  * The calling thread must have set RCOMMIT.  This bit is used to
6355  * serialize access to the commit structure in the rnode.  As long
6356  * as the thread has set RCOMMIT, then it can manipulate the commit
6357  * structure without requiring any other locks.
6358  */
6359 static void
6360 nfs3_get_commit_range(vnode_t *vp, uoff_t soff, size_t len)
6361 {
6362
6363         rnode_t *rp;
6364         page_t *pp;
6365         uoff_t end;
6366         uoff_t off;
6367
6368         ASSERT(len != 0);
6369
6370         rp = VTOR(vp);
6371
6372         ASSERT(rp->r_flags & RCOMMIT);
6373         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
6374
6375         /*
6376          * If there are no pages associated with this vnode, then
6377          * just return.
6378          */
6379         if (!vn_has_cached_data(vp))
6380                 return;
6381
6382         /*
6383          * Calculate the ending offset.
6384          */
6385         end = soff + len;
6386
6387         for (off = soff; off < end; off += PAGESIZE) {
6388                 /*
6389                  * Lookup each page by vp, offset.
6390                  */
6391                 if ((pp = page_lookup_nowait(&vp->v_object, off, SE_EXCL)) == NULL)
6392                         continue;
6393
6394                 /*
6395                  * If this page does not need to be committed or is
6396                  * modified, then just skip it.
6397                  */
6398                 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) {
6399                         page_unlock(pp);
6400                         continue;
6401                 }
6402
6403                 ASSERT(PP_ISFREE(pp) == 0);
6404
6405                 /*
6406                  * The page needs to be committed and we locked it.
6407                  * Update the base and length parameters and add it
6408                  * to r_pages.
6409                  */
6410                 if (rp->r_commit.c_pages == NULL) {
6411                         rp->r_commit.c_commbase = (offset3)pp->p_offset;
6412                         rp->r_commit.c_commlen = PAGESIZE;
6413                 } else {
6414                         rp->r_commit.c_commlen = (offset3)pp->p_offset -
6415                             rp->r_commit.c_commbase + PAGESIZE;
6416                 }
6417                 page_add(&rp->r_commit.c_pages, pp);
6418         }
6419 }
6420
6421 static int
6422 nfs3_putpage_commit(vnode_t *vp, offset_t poff, size_t plen, cred_t *cr)
6423 {
6424         int error;
6425         writeverf3 write_verf;
6426         rnode_t *rp = VTOR(vp);
6427
6428         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
6429         /*
6430          * Flush the data portion of the file and then commit any
6431          * portions which need to be committed.  This may need to
6432          * be done twice if the server has changed state since
6433          * data was last written.  The data will need to be
6434          * rewritten to the server and then a new commit done.
6435          *
6436          * In fact, this may need to be done several times if the
6437          * server is having problems and crashing while we are
6438          * attempting to do this.
6439          */
6440
6441 top:
6442         /*
6443          * Do a flush based on the poff and plen arguments.  This
6444          * will asynchronously write out any modified pages in the
6445          * range specified by (poff, plen).  This starts all of the
6446          * i/o operations which will be waited for in the next
6447          * call to nfs3_putpage
6448          */
6449
6450         mutex_enter(&rp->r_statelock);
6451         write_verf = rp->r_verf;
6452         mutex_exit(&rp->r_statelock);
6453
6454         error = nfs3_putpage(vp, poff, plen, B_ASYNC, cr, NULL);
6455         if (error == EAGAIN)
6456                 error = 0;
6457
6458         /*
6459          * Do a flush based on the poff and plen arguments.  This
6460          * will synchronously write out any modified pages in the
6461          * range specified by (poff, plen) and wait until all of
6462          * the asynchronous i/o's in that range are done as well.
6463          */
6464         if (!error)
6465                 error = nfs3_putpage(vp, poff, plen, 0, cr, NULL);
6466
6467         if (error)
6468                 return (error);
6469
6470         mutex_enter(&rp->r_statelock);
6471         if (rp->r_verf != write_verf) {
6472                 mutex_exit(&rp->r_statelock);
6473                 goto top;
6474         }
6475         mutex_exit(&rp->r_statelock);
6476
6477         /*
6478          * Now commit any pages which might need to be committed.
6479          * If the error, NFS_VERF_MISMATCH, is returned, then
6480          * start over with the flush operation.
6481          */
6482
6483         error = nfs3_commit_vp(vp, poff, plen, cr);
6484
6485         if (error == NFS_VERF_MISMATCH)
6486                 goto top;
6487
6488         return (error);
6489 }
6490
6491 static int
6492 nfs3_commit_vp(vnode_t *vp, uoff_t poff, size_t plen, cred_t *cr)
6493 {
6494         rnode_t *rp;
6495         page_t *plist;
6496         offset3 offset;
6497         count3 len;
6498
6499
6500         rp = VTOR(vp);
6501
6502         if (nfs_zone() != VTOMI(vp)->mi_zone)
6503                 return (EIO);
6504         /*
6505          * Set the `commit inprogress' state bit.  We must
6506          * first wait until any current one finishes.
6507          */
6508         mutex_enter(&rp->r_statelock);
6509         while (rp->r_flags & RCOMMIT) {
6510                 rp->r_flags |= RCOMMITWAIT;
6511                 cv_wait(&rp->r_commit.c_cv, &rp->r_statelock);
6512                 rp->r_flags &= ~RCOMMITWAIT;
6513         }
6514         rp->r_flags |= RCOMMIT;
6515         mutex_exit(&rp->r_statelock);
6516
6517         /*
6518          * Gather together all of the pages which need to be
6519          * committed.
6520          */
6521         if (plen == 0)
6522                 nfs3_get_commit(vp);
6523         else
6524                 nfs3_get_commit_range(vp, poff, plen);
6525
6526         /*
6527          * Clear the `commit inprogress' bit and disconnect the
6528          * page list which was gathered together in nfs3_get_commit.
6529          */
6530         plist = rp->r_commit.c_pages;
6531         rp->r_commit.c_pages = NULL;
6532         offset = rp->r_commit.c_commbase;
6533         len = rp->r_commit.c_commlen;
6534         mutex_enter(&rp->r_statelock);
6535         rp->r_flags &= ~RCOMMIT;
6536         cv_broadcast(&rp->r_commit.c_cv);
6537         mutex_exit(&rp->r_statelock);
6538
6539         /*
6540          * If any pages need to be committed, commit them and
6541          * then unlock them so that they can be freed some
6542          * time later.
6543          */
6544         if (plist != NULL) {
6545                 /*
6546                  * No error occurred during the flush portion
6547                  * of this operation, so now attempt to commit
6548                  * the data to stable storage on the server.
6549                  *
6550                  * This will unlock all of the pages on the list.
6551                  */
6552                 return (nfs3_sync_commit(vp, plist, offset, len, cr));
6553         }
6554         return (0);
6555 }
6556
6557 static int
6558 nfs3_sync_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
6559         cred_t *cr)
6560 {
6561         int error;
6562         page_t *pp;
6563
6564         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
6565         error = nfs3_commit(vp, offset, count, cr);
6566
6567         /*
6568          * If we got an error, then just unlock all of the pages
6569          * on the list.
6570          */
6571         if (error) {
6572                 while (plist != NULL) {
6573                         pp = plist;
6574                         page_sub(&plist, pp);
6575                         page_unlock(pp);
6576                 }
6577                 return (error);
6578         }
6579         /*
6580          * We've tried as hard as we can to commit the data to stable
6581          * storage on the server.  We just unlock the pages and clear
6582          * the commit required state.  They will get freed later.
6583          */
6584         while (plist != NULL) {
6585                 pp = plist;
6586                 page_sub(&plist, pp);
6587                 pp->p_fsdata = C_NOCOMMIT;
6588                 page_unlock(pp);
6589         }
6590
6591         return (error);
6592 }
6593
/*
 * Commit callback handed to nfs_async_commit(): performs the same work
 * as nfs3_sync_commit() on the given page list and range, discarding
 * the result since there is no caller to report it to.
 */
static void
nfs3_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
        cred_t *cr)
{
        ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
        (void) nfs3_sync_commit(vp, plist, offset, count, cr);
}
6601
6602 /* ARGSUSED */
6603 static int
6604 nfs3_setsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
6605         caller_context_t *ct)
6606 {
6607         int error;
6608         mntinfo_t *mi;
6609
6610         mi = VTOMI(vp);
6611
6612         if (nfs_zone() != mi->mi_zone)
6613                 return (EIO);
6614
6615         if (mi->mi_flags & MI_ACL) {
6616                 error = acl_setacl3(vp, vsecattr, flag, cr);
6617                 if (mi->mi_flags & MI_ACL)
6618                         return (error);
6619         }
6620
6621         return (ENOSYS);
6622 }
6623
6624 /* ARGSUSED */
6625 static int
6626 nfs3_getsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
6627         caller_context_t *ct)
6628 {
6629         int error;
6630         mntinfo_t *mi;
6631
6632         mi = VTOMI(vp);
6633
6634         if (nfs_zone() != mi->mi_zone)
6635                 return (EIO);
6636
6637         if (mi->mi_flags & MI_ACL) {
6638                 error = acl_getacl3(vp, vsecattr, flag, cr);
6639                 if (mi->mi_flags & MI_ACL)
6640                         return (error);
6641         }
6642
6643         return (fs_fab_acl(vp, vsecattr, flag, cr, ct));
6644 }
6645
6646 /* ARGSUSED */
6647 static int
6648 nfs3_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr,
6649         caller_context_t *ct)
6650 {
6651         int error;
6652         struct shrlock nshr;
6653         struct nfs_owner nfs_owner;
6654         netobj lm_fh3;
6655
6656         if (nfs_zone() != VTOMI(vp)->mi_zone)
6657                 return (EIO);
6658
6659         /*
6660          * check for valid cmd parameter
6661          */
6662         if (cmd != F_SHARE && cmd != F_UNSHARE && cmd != F_HASREMOTELOCKS)
6663                 return (EINVAL);
6664
6665         /*
6666          * Check access permissions
6667          */
6668         if (cmd == F_SHARE &&
6669             (((shr->s_access & F_RDACC) && !(flag & FREAD)) ||
6670             ((shr->s_access & F_WRACC) && !(flag & FWRITE))))
6671                 return (EBADF);
6672
6673         /*
6674          * If the filesystem is mounted using local locking, pass the
6675          * request off to the local share code.
6676          */
6677         if (VTOMI(vp)->mi_flags & MI_LLOCK)
6678                 return (fs_shrlock(vp, cmd, shr, flag, cr, ct));
6679
6680         switch (cmd) {
6681         case F_SHARE:
6682         case F_UNSHARE:
6683                 lm_fh3.n_len = VTOFH3(vp)->fh3_length;
6684                 lm_fh3.n_bytes = (char *)&(VTOFH3(vp)->fh3_u.data);
6685
6686                 /*
6687                  * If passed an owner that is too large to fit in an
6688                  * nfs_owner it is likely a recursive call from the
6689                  * lock manager client and pass it straight through.  If
6690                  * it is not a nfs_owner then simply return an error.
6691                  */
6692                 if (shr->s_own_len > sizeof (nfs_owner.lowner)) {
6693                         if (((struct nfs_owner *)shr->s_owner)->magic !=
6694                             NFS_OWNER_MAGIC)
6695                                 return (EINVAL);
6696
6697                         if (error = lm4_shrlock(vp, cmd, shr, flag, &lm_fh3)) {
6698                                 error = set_errno(error);
6699                         }
6700                         return (error);
6701                 }
6702                 /*
6703                  * Remote share reservations owner is a combination of
6704                  * a magic number, hostname, and the local owner
6705                  */
6706                 bzero(&nfs_owner, sizeof (nfs_owner));
6707                 nfs_owner.magic = NFS_OWNER_MAGIC;
6708                 (void) strncpy(nfs_owner.hname, uts_nodename(),
6709                     sizeof (nfs_owner.hname));
6710                 bcopy(shr->s_owner, nfs_owner.lowner, shr->s_own_len);
6711                 nshr.s_access = shr->s_access;
6712                 nshr.s_deny = shr->s_deny;
6713                 nshr.s_sysid = 0;
6714                 nshr.s_pid = ttoproc(curthread)->p_pid;
6715                 nshr.s_own_len = sizeof (nfs_owner);
6716                 nshr.s_owner = (caddr_t)&nfs_owner;
6717
6718                 if (error = lm4_shrlock(vp, cmd, &nshr, flag, &lm_fh3)) {
6719                         error = set_errno(error);
6720                 }
6721
6722                 break;
6723
6724         case F_HASREMOTELOCKS:
6725                 /*
6726                  * NFS client can't store remote locks itself
6727                  */
6728                 shr->s_access = 0;
6729                 error = 0;
6730                 break;
6731
6732         default:
6733                 error = EINVAL;
6734                 break;
6735         }
6736
6737         return (error);
6738 }