sys/nfs/nfs_bio.c

   1 /*      $NetBSD: nfs_bio.c,v 1.182 2009/03/13 15:00:34 yamt Exp $       */
   2
   3 /*
   4  * Copyright (c) 1989, 1993
   5  *      The Regents of the University of California.  All rights reserved.
   6  *
   7  * This code is derived from software contributed to Berkeley by
   8  * Rick Macklem at The University of Guelph.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 3. Neither the name of the University nor the names of its contributors
  19  *    may be used to endorse or promote products derived from this software
  20  *    without specific prior written permission.
  21  *
  22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  *
  34  *      @(#)nfs_bio.c   8.9 (Berkeley) 3/30/95
  35  */
  36
  37 #include <sys/cdefs.h>
  38 __KERNEL_RCSID(0, "$NetBSD: nfs_bio.c,v 1.182 2009/03/13 15:00:34 yamt Exp $");
  39
  40 #ifdef _KERNEL_OPT
  41 #include "opt_nfs.h"
  42 #include "opt_ddb.h"
  43 #endif
  44
  45 #include <sys/param.h>
  46 #include <sys/systm.h>
  47 #include <sys/resourcevar.h>
  48 #include <sys/signalvar.h>
  49 #include <sys/proc.h>
  50 #include <sys/buf.h>
  51 #include <sys/vnode.h>
  52 #include <sys/mount.h>
  53 #include <sys/kernel.h>
  54 #include <sys/namei.h>
  55 #include <sys/dirent.h>
  56 #include <sys/kauth.h>
  57
  58 #include <uvm/uvm_extern.h>
  59 #include <uvm/uvm.h>
  60
  61 #include <nfs/rpcv2.h>
  62 #include <nfs/nfsproto.h>
  63 #include <nfs/nfs.h>
  64 #include <nfs/nfsmount.h>
  65 #include <nfs/nfsnode.h>
  66 #include <nfs/nfs_var.h>
  67
   68 extern int nfs_numasync;        /* 0 makes nfs_asyncio() return EIO */
   69 extern int nfs_commitsize;      /* not referenced in the visible part */
   70 extern struct nfsstats nfsstats;        /* biocache_* counters bumped below */
   71 /* helpers declared here; their definitions are not in the visible part */
   72 static int nfs_doio_read(struct buf *, struct uio *);
   73 static int nfs_doio_write(struct buf *, struct uio *);
   74 static int nfs_doio_phys(struct buf *, struct uio *);
  75
   76 /*
   77  * Vnode op for read using bio
   78  * Any similarity to readip() is purely coincidental
       *
       * Reads from an NFS vnode into uio, dispatching on vp->v_type:
       *   VREG  data is moved with ubc_uiomove(), never past np->n_size.
       *   VLNK  the link is read into cache block 0 (NFS_MAXPATHLEN bytes)
       *         and copied out with uiomove().
       *   VDIR  uio->uio_offset is looked up as a directory cookie; the
       *         matching NFS_DIRBLKSIZ block is read, whole entries are
       *         copied out with uiomove(), and uio->uio_offset is replaced
       *         by curoff, the cookie computed for the last entry copied.
       *
       * ioflag supplies the access advice for the VREG path (IO_ADV_DECODE).
       * If cflag contains NFSBIO_CACHECOOKIES, a directory cache entry is
       * entered for every entry that is copied out.
       *
       * Returns 0 on success or end of file, otherwise an errno value
       * (EINVAL, EFBIG, EINTR, or whatever a lower-level call returned).
   79  */
   80 int
   81 nfs_bioread(struct vnode *vp, struct uio *uio, int ioflag,
   82             kauth_cred_t cred, int cflag)
   83 {
   84         struct nfsnode *np = VTONFS(vp);
   85         struct buf *bp = NULL, *rabp;
   86         struct nfsmount *nmp = VFSTONFS(vp->v_mount);
   87         struct nfsdircache *ndp = NULL, *nndp = NULL;
   88         void *baddr;
   89         int got_buf = 0, error = 0, n = 0, on = 0, en, enn;
   90         int enough = 0;
   91         struct dirent *dp, *pdp, *edp, *ep;
   92         off_t curoff = 0;
   93         int advice;
   94         struct lwp *l = curlwp;
   95
   96 #ifdef DIAGNOSTIC
   97         if (uio->uio_rw != UIO_READ)
   98                 panic("nfs_read mode");
   99 #endif
  100         if (uio->uio_resid == 0)
  101                 return (0);
              /*
               * Directory offsets are handled as cookies (see the VDIR
               * case), so only non-directories are range-checked here and
               * against nm_maxfilesize below.
               */
  102         if (vp->v_type != VDIR && uio->uio_offset < 0)
  103                 return (EINVAL);
  104 #ifndef NFS_V2_ONLY
              /*
               * NFSv3 mount that has not set NFSMNT_GOTFSINFO yet: call
               * nfs_fsinfo(); its return value is deliberately ignored.
               */
  105         if ((nmp->nm_flag & NFSMNT_NFSV3) &&
  106             !(nmp->nm_iflag & NFSMNT_GOTFSINFO))
  107                 (void)nfs_fsinfo(nmp, vp, cred, l);
  108 #endif
  109         if (vp->v_type != VDIR &&
  110             (uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
  111                 return (EFBIG);
  112
  113         /*
  114          * For nfs, cache consistency can only be maintained approximately.
  115          * Although RFC1094 does not specify the criteria, the following is
  116          * believed to be compatible with the reference port.
  117          *
  118          * If the file's modify time on the server has changed since the
  119          * last read rpc or you have written to the file,
  120          * you may have lost data cache consistency with the
  121          * server, so flush all of the file's data out of the cache.
  122          * Then force a getattr rpc to ensure that you have up to date
  123          * attributes.
  124          * NB: This implies that cache data can be read when up to
  125          * nfs_attrtimeo seconds out of date. If you find that you need current
  126          * attributes this could be forced by setting n_attrstamp to 0 before
  127          * the VOP_GETATTR() call.
  128          */
  129
              /* Symbolic links are exempt from the stale-buffer check. */
  130         if (vp->v_type != VLNK) {
  131                 error = nfs_flushstalebuf(vp, cred, l,
  132                     NFS_FLUSHSTALEBUF_MYWRITE);
  133                 if (error)
  134                         return error;
  135         }
  136
  137         do {
  138             /*
  139              * Don't cache symlinks.
                   *
                   * NOTE(review): only a symlink vnode that also has VV_ROOT
                   * set takes this uncached path; any other symlink goes
                   * through the cache block in the VLNK case below.
  140              */
  141             if ((vp->v_vflag & VV_ROOT) && vp->v_type == VLNK) {
  142                 return (nfs_readlinkrpc(vp, uio, cred));
  143             }
                  /*
                   * baddr is not assigned anywhere else in this function, so
                   * the copy after the switch always starts from bp->b_data.
                   */
  144             baddr = (void *)0;
  145             switch (vp->v_type) {
  146             case VREG:
  147                 nfsstats.biocache_reads++;
  148
  149                 advice = IO_ADV_DECODE(ioflag);
  150                 error = 0;
  151                 while (uio->uio_resid > 0) {
  152                         vsize_t bytelen;
  153
  154                         nfs_delayedtruncate(vp);
                              /*
                               * Stop at np->n_size; otherwise move at most
                               * the bytes that remain before it.
                               */
  155                         if (np->n_size <= uio->uio_offset) {
  156                                 break;
  157                         }
  158                         bytelen =
  159                             MIN(np->n_size - uio->uio_offset, uio->uio_resid);
  160                         error = ubc_uiomove(&vp->v_uobj, uio, bytelen,
  161                             advice, UBC_READ | UBC_PARTIALOK |
  162                             (UBC_WANT_UNMAP(vp) ? UBC_UNMAP : 0));
  163                         if (error) {
  164                                 /*
  165                                  * XXXkludge
  166                                  * the file has been truncated on the server.
  167                                  * there isn't much we can do.
  168                                  */
  169                                 if (uio->uio_offset >= np->n_size) {
  170                                         /* end of file */
  171                                         error = 0;
  172                                 } else {
  173                                         break;
  174                                 }
  175                         }
  176                 }
  177                 break;
  178
  179             case VLNK:
  180                 nfsstats.biocache_readlinks++;
  181                 bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, l);
  182                 if (!bp)
  183                         return (EINTR);
                      /* Buffer not marked BO_DONE: fill it via nfs_doio(). */
  184                 if ((bp->b_oflags & BO_DONE) == 0) {
  185                         bp->b_flags |= B_READ;
  186                         error = nfs_doio(bp);
  187                         if (error) {
  188                                 brelse(bp, 0);
  189                                 return (error);
  190                         }
  191                 }
                      /*
                       * Copy out at most NFS_MAXPATHLEN - b_resid bytes,
                       * starting at offset 0 of the block.
                       */
  192                 n = MIN(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
  193                 got_buf = 1;
  194                 on = 0;
  195                 break;
  196             case VDIR:
  197 diragain:
  198                 nfsstats.biocache_readdirs++;
  199                 ndp = nfs_searchdircache(vp, uio->uio_offset,
  200                         (nmp->nm_flag & NFSMNT_XLATECOOKIE), 0);
  201                 if (!ndp) {
  202                         /*
  203                          * We've been handed a cookie that is not
  204                          * in the cache. If we're not translating
  205                          * 32 <-> 64, it may be a value that was
  206                          * flushed out of the cache because it grew
  207                          * too big. Let the server judge if it's
  208                          * valid or not. In the translation case,
  209                          * we have no way of validating this value,
  210                          * so punt.
  211                          */
  212                         if (nmp->nm_flag & NFSMNT_XLATECOOKIE)
  213                                 return (EINVAL);
  214                         ndp = nfs_enterdircache(vp, uio->uio_offset,
  215                                 uio->uio_offset, 0, 0);
  216                 }
  217
                      /* Requested cookie is the recorded EOF offset: done. */
  218                 if (NFS_EOFVALID(np) &&
  219                     ndp->dc_cookie == np->n_direofoffset) {
  220                         nfs_putdircache(np, ndp);
  221                         nfsstats.direofcache_hits++;
  222                         return (0);
  223                 }
  224
  225                 bp = nfs_getcacheblk(vp, NFSDC_BLKNO(ndp), NFS_DIRBLKSIZ, l);
  226                 if (!bp)
  227                     return (EINTR);
  228                 if ((bp->b_oflags & BO_DONE) == 0) {
  229                     bp->b_flags |= B_READ;
  230                     bp->b_dcookie = ndp->dc_blkcookie;
  231                     error = nfs_doio(bp);
  232                     if (error) {
  233                         /*
  234                          * Yuck! The directory has been modified on the
  235                          * server. Punt and let the userland code
  236                          * deal with it.
  237                          */
  238                         nfs_putdircache(np, ndp);
  239                         brelse(bp, 0);
  240                         /*
  241                          * nfs_request maps NFSERR_BAD_COOKIE to EINVAL.
  242                          */
  243                         if (error == EINVAL) { /* NFSERR_BAD_COOKIE */
  244                             nfs_invaldircache(vp, 0);
  245                             nfs_vinvalbuf(vp, 0, cred, l, 1);
  246                         }
  247                         return (error);
  248                     }
  249                 }
  250
  251                 /*
  252                  * Just return if we hit EOF right away with this
  253                  * block. Always check here, because direofoffset
  254                  * may have been set by an nfsiod since the last
  255                  * check.
  256                  *
  257                  * also, empty block implies EOF.
  258                  */
  259
  260                 if (bp->b_bcount == bp->b_resid ||
  261                     (NFS_EOFVALID(np) &&
  262                     ndp->dc_blkcookie == np->n_direofoffset)) {
  263                         KASSERT(bp->b_bcount != bp->b_resid ||
  264                             ndp->dc_blkcookie == bp->b_dcookie);
  265                         nfs_putdircache(np, ndp);
  266                         brelse(bp, BC_NOCACHE);
  267                         return 0;
  268                 }
  269
  270                 /*
  271                  * Find the entry we were looking for in the block.
  272                  */
  273
  274                 en = ndp->dc_entry;
  275
                      /* edp marks the end of the data (b_bcount - b_resid). */
  276                 pdp = dp = (struct dirent *)bp->b_data;
  277                 edp = (struct dirent *)(void *)((char *)bp->b_data + bp->b_bcount -
  278                     bp->b_resid);
  279                 enn = 0;
                      /*
                       * Step forward en entries (or to the end of the data);
                       * pdp trails dp by one entry.
                       */
  280                 while (enn < en && dp < edp) {
  281                         pdp = dp;
  282                         dp = _DIRENT_NEXT(dp);
  283                         enn++;
  284                 }
  285
  286                 /*
  287                  * If the entry number was bigger than the number of
  288                  * entries in the block, or the cookie of the previous
  289                  * entry doesn't match, the directory cache is
  290                  * stale. Flush it and try again (i.e. go to
  291                  * the server).
  292                  */
  293                 if (dp >= edp || (struct dirent *)_DIRENT_NEXT(dp) > edp ||
  294                     (en > 0 && NFS_GETCOOKIE(pdp) != ndp->dc_cookie)) {
  295 #ifdef DEBUG
  296                         printf("invalid cache: %p %p %p off %jx %jx\n",
  297                                 pdp, dp, edp,
  298                                 (uintmax_t)uio->uio_offset,
  299                                 (uintmax_t)NFS_GETCOOKIE(pdp));
  300 #endif
  301                         nfs_putdircache(np, ndp);
  302                         brelse(bp, 0);
  303                         nfs_invaldircache(vp, 0);
  304                         nfs_vinvalbuf(vp, 0, cred, l, 0);
  305                         goto diragain;
  306                 }
  307
                      /* on: byte offset of the first entry to copy. */
  308                 on = (char *)dp - (char *)bp->b_data;
  309
  310                 /*
  311                  * Cache all entries that may be exported to the
  312                  * user, as they may be thrown back at us. The
  313                  * NFSBIO_CACHECOOKIES flag indicates that all
  314                  * entries are being 'exported', so cache them all.
  315                  */
  316
                      /*
                       * No previous entry (start of the block): advance dp so
                       * that pdp and dp are consecutive entries for the loop
                       * below.
                       */
  317                 if (en == 0 && pdp == dp) {
  318                         dp = _DIRENT_NEXT(dp);
  319                         enn++;
  320                 }
  321
                      /*
                       * enough is set when the caller's buffer is smaller
                       * than what is left in this block; the outer loop is
                       * then ended after this copy (n is zeroed below).
                       */
  322                 if (uio->uio_resid < (bp->b_bcount - bp->b_resid - on)) {
  323                         n = uio->uio_resid;
  324                         enough = 1;
  325                 } else
  326                         n = bp->b_bcount - bp->b_resid - on;
  327
  328                 ep = (struct dirent *)(void *)((char *)bp->b_data + on + n);
  329
  330                 /*
  331                  * Find last complete entry to copy, caching entries
  332                  * (if requested) as we go.
  333                  */
  334
  335                 while (dp < ep && (struct dirent *)_DIRENT_NEXT(dp) <= ep) {
  336                         if (cflag & NFSBIO_CACHECOOKIES) {
  337                                 nndp = nfs_enterdircache(vp, NFS_GETCOOKIE(pdp),
  338                                     ndp->dc_blkcookie, enn, bp->b_lblkno);
  339                                 if (nmp->nm_flag & NFSMNT_XLATECOOKIE) {
  340                                         NFS_STASHCOOKIE32(pdp,
  341                                             nndp->dc_cookie32);
  342                                 }
  343                                 nfs_putdircache(np, nndp);
  344                         }
  345                         pdp = dp;
  346                         dp = _DIRENT_NEXT(dp);
  347                         enn++;
  348                 }
  349                 nfs_putdircache(np, ndp);
  350
  351                 /*
  352                  * If the last requested entry was not the last in the
  353                  * buffer (happens if NFS_DIRFRAGSIZ < NFS_DIRBLKSIZ),
  354                  * cache the cookie of the last requested one, and
  355                  * set the offset to it.
  356                  */
  357
  358                 if ((on + n) < bp->b_bcount - bp->b_resid) {
  359                         curoff = NFS_GETCOOKIE(pdp);
  360                         nndp = nfs_enterdircache(vp, curoff, ndp->dc_blkcookie,
  361                             enn, bp->b_lblkno);
  362                         if (nmp->nm_flag & NFSMNT_XLATECOOKIE) {
  363                                 NFS_STASHCOOKIE32(pdp, nndp->dc_cookie32);
  364                                 curoff = nndp->dc_cookie32;
  365                         }
  366                         nfs_putdircache(np, nndp);
  367                 } else
  368                         curoff = bp->b_dcookie;
  369
  370                 /*
  371                  * Always cache the entry for the next block,
  372                  * so that readaheads can use it.
  373                  */
  374                 nndp = nfs_enterdircache(vp, bp->b_dcookie, bp->b_dcookie, 0,0);
  375                 if (nmp->nm_flag & NFSMNT_XLATECOOKIE) {
  376                         if (curoff == bp->b_dcookie) {
  377                                 NFS_STASHCOOKIE32(pdp, nndp->dc_cookie32);
  378                                 curoff = nndp->dc_cookie32;
  379                         }
  380                 }
  381
                      /*
                       * Trim n so the copy ends exactly after pdp, i.e. only
                       * whole entries are handed to the caller.
                       */
  382                 n = (char *)_DIRENT_NEXT(pdp) - ((char *)bp->b_data + on);
  383
  384                 /*
  385                  * If not eof and read aheads are enabled, start one.
  386                  * (You need the current block first, so that you have the
  387                  *  directory offset cookie of the next block.)
  388                  */
  389                 if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
  390                     !NFS_EOFVALID(np)) {
  391                         rabp = nfs_getcacheblk(vp, NFSDC_BLKNO(nndp),
  392                                                 NFS_DIRBLKSIZ, l);
  393                         if (rabp) {
  394                             if ((rabp->b_oflags & (BO_DONE | BO_DELWRI)) == 0) {
  395                                 rabp->b_dcookie = nndp->dc_cookie;
  396                                 rabp->b_flags |= (B_READ | B_ASYNC);
  397                                 if (nfs_asyncio(rabp)) {
  398                                     brelse(rabp, BC_INVAL);
  399                                 }
  400                             } else
  401                                 brelse(rabp, 0);
  402                         }
  403                 }
  404                 nfs_putdircache(np, nndp);
  405                 got_buf = 1;
  406                 break;
  407             default:
  408                 printf(" nfsbioread: type %x unexpected\n",vp->v_type);
  409                 break;
  410             }
  411
                  /*
                   * The VREG case never assigns n (its data was already moved
                   * by ubc_uiomove()), so this copy runs for VLNK and VDIR.
                   */
  412             if (n > 0) {
  413                 if (!baddr)
  414                         baddr = bp->b_data;
  415                 error = uiomove((char *)baddr + on, (int)n, uio);
  416             }
  417             switch (vp->v_type) {
  418             case VREG:
  419                 break;
  420             case VLNK:
                      /* zeroing n ends the outer loop after this pass */
  421                 n = 0;
  422                 break;
  423             case VDIR:
                      /* continue from curoff on the next pass or call */
  424                 uio->uio_offset = curoff;
  425                 if (enough)
  426                         n = 0;
  427                 break;
  428             default:
  429                 printf(" nfsbioread: type %x unexpected\n",vp->v_type);
  430             }
                  /* release the block obtained in the VLNK / VDIR case */
  431             if (got_buf)
  432                 brelse(bp, 0);
  433         } while (error == 0 && uio->uio_resid > 0 && n > 0);
  434         return (error);
  435 }
 436
  437 /*
  438  * Vnode op for write using bio
       *
       * v points to a struct vop_write_args (a_vp, a_uio, a_ioflag, a_cred).
       * Only VREG vnodes are accepted (EIO otherwise).  The data is copied
       * into the vnode's pages with ubc_uiomove(); VOP_PUTPAGES() is called
       * with PGO_CLEANIT whenever a pass crosses an nm_wsize boundary, and
       * once more with PGO_SYNCIO over the whole range when IO_SYNC is set.
       *
       * Returns 0 or an errno value.  If NWRITEERR is set on the node, the
       * flag is cleared and np->n_error is returned without writing.
  439  */
  440 int
  441 nfs_write(void *v)
  442 {
  443         struct vop_write_args /* {
  444                 struct vnode *a_vp;
  445                 struct uio *a_uio;
  446                 int  a_ioflag;
  447                 kauth_cred_t a_cred;
  448         } */ *ap = v;
  449         struct uio *uio = ap->a_uio;
  450         struct lwp *l = curlwp;
  451         struct vnode *vp = ap->a_vp;
  452         struct nfsnode *np = VTONFS(vp);
  453         kauth_cred_t cred = ap->a_cred;
  454         struct nfsmount *nmp = VFSTONFS(vp->v_mount);
  455         voff_t oldoff, origoff;
  456         vsize_t bytelen;
  457         int error = 0;
  458         int ioflag = ap->a_ioflag;
  459         int extended = 0, wrotedata = 0;
  460
  461 #ifdef DIAGNOSTIC
  462         if (uio->uio_rw != UIO_WRITE)
  463                 panic("nfs_write mode");
  464 #endif
  465         if (vp->v_type != VREG)
  466                 return (EIO);
              /* pending NWRITEERR: clear the flag, fail with np->n_error */
  467         if (np->n_flag & NWRITEERR) {
  468                 np->n_flag &= ~NWRITEERR;
  469                 return (np->n_error);
  470         }
  471 #ifndef NFS_V2_ONLY
              /*
               * NFSv3 mount that has not set NFSMNT_GOTFSINFO yet: call
               * nfs_fsinfo(); its return value is deliberately ignored.
               */
  472         if ((nmp->nm_flag & NFSMNT_NFSV3) &&
  473             !(nmp->nm_iflag & NFSMNT_GOTFSINFO))
  474                 (void)nfs_fsinfo(nmp, vp, cred, l);
  475 #endif
              /*
               * Append: invalidate the cached attributes, run the stale
               * buffer check, then start writing at np->n_size.
               */
  476         if (ioflag & IO_APPEND) {
  477                 NFS_INVALIDATE_ATTRCACHE(np);
  478                 error = nfs_flushstalebuf(vp, cred, l,
  479                     NFS_FLUSHSTALEBUF_MYWRITE);
  480                 if (error)
  481                         return (error);
  482                 uio->uio_offset = np->n_size;
  483         }
  484         if (uio->uio_offset < 0)
  485                 return (EINVAL);
  486         if ((uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
  487                 return (EFBIG);
  488         if (uio->uio_resid == 0)
  489                 return (0);
  490         /*
  491          * Maybe this should be above the vnode op call, but so long as
  492          * file servers have no limits, i don't think it matters
               *
               * A write ending beyond the RLIMIT_FSIZE soft limit posts
               * SIGXFSZ to the process and fails with EFBIG.
  493          */
  494         if (l && l->l_proc && uio->uio_offset + uio->uio_resid >
  495               l->l_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
  496                 mutex_enter(proc_lock);
  497                 psignal(l->l_proc, SIGXFSZ);
  498                 mutex_exit(proc_lock);
  499                 return (EFBIG);
  500         }
  501
              /* origoff: start of the whole request, used for IO_SYNC below */
  502         origoff = uio->uio_offset;
  503         do {
  504                 bool overwrite; /* if we are overwriting whole pages */
  505                 u_quad_t oldsize;
  506                 oldoff = uio->uio_offset;
  507                 bytelen = uio->uio_resid;
  508
  509                 nfsstats.biocache_writes++;
  510
                      /*
                       * Mark the node modified and grow n_size before the
                       * copy; oldsize allows backing the size out on error.
                       */
  511                 oldsize = np->n_size;
  512                 np->n_flag |= NMODIFIED;
  513                 if (np->n_size < uio->uio_offset + bytelen) {
  514                         np->n_size = uio->uio_offset + bytelen;
  515                 }
                      /*
                       * overwrite is set only for page-aligned offsets, when
                       * either the vnode lacks VV_MAPPED and more than one
                       * page is to be written (bytelen is then trimmed to
                       * whole pages), or bytelen is a whole number of pages
                       * and the offset is at or beyond vp->v_size.  It adds
                       * UBC_FAULTBUSY to the ubc_uiomove() flags.
                       */
  516                 overwrite = false;
  517                 if ((uio->uio_offset & PAGE_MASK) == 0) {
  518                         if ((vp->v_vflag & VV_MAPPED) == 0 &&
  519                             bytelen > PAGE_SIZE) {
  520                                 bytelen = trunc_page(bytelen);
  521                                 overwrite = true;
  522                         } else if ((bytelen & PAGE_MASK) == 0 &&
  523                             uio->uio_offset >= vp->v_size) {
  524                                 overwrite = true;
  525                         }
  526                 }
  527                 if (vp->v_size < uio->uio_offset + bytelen) {
  528                         uvm_vnp_setwritesize(vp, uio->uio_offset + bytelen);
  529                 }
  530                 error = ubc_uiomove(&vp->v_uobj, uio, bytelen,
  531                     UVM_ADV_RANDOM, UBC_WRITE | UBC_PARTIALOK |
  532                     (overwrite ? UBC_FAULTBUSY : 0) |
  533                     (UBC_WANT_UNMAP(vp) ? UBC_UNMAP : 0));
  534                 if (error) {
                              /* undo the write size set before the copy */
  535                         uvm_vnp_setwritesize(vp, vp->v_size);
  536                         if (overwrite && np->n_size != oldsize) {
  537                                 /*
  538                                  * backout size and free pages past eof.
                                       *
                                       * NOTE(review): v_interlock is taken
                                       * here and before the other
                                       * VOP_PUTPAGES() calls with no
                                       * mutex_exit() in this function;
                                       * presumably VOP_PUTPAGES() releases
                                       * it -- confirm.
  539                                  */
  540                                 np->n_size = oldsize;
  541                                 mutex_enter(&vp->v_interlock);
  542                                 (void)VOP_PUTPAGES(vp, round_page(vp->v_size),
  543                                     0, PGO_SYNCIO | PGO_FREE);
  544                         }
  545                         break;
  546                 }
  547                 wrotedata = 1;
  548
  549                 /*
  550                  * update UVM's notion of the size now that we've
  551                  * copied the data into the vnode's pages.
  552                  */
  553
  554                 if (vp->v_size < uio->uio_offset) {
  555                         uvm_vnp_setsize(vp, uio->uio_offset);
  556                         extended = 1;
  557                 }
  558
                      /*
                       * The pass moved the offset into a different
                       * nm_wsize-aligned chunk: call VOP_PUTPAGES() with
                       * PGO_CLEANIT (no PGO_SYNCIO) on the aligned range
                       * covering oldoff up to the new offset.  The masks
                       * assume nm_wsize is a power of two -- TODO confirm.
                       *
                       * NOTE(review): a non-zero error from this call does
                       * not end the loop; if another pass runs, error is
                       * overwritten by the next ubc_uiomove().
                       */
  559                 if ((oldoff & ~(nmp->nm_wsize - 1)) !=
  560                     (uio->uio_offset & ~(nmp->nm_wsize - 1))) {
  561                         mutex_enter(&vp->v_interlock);
  562                         error = VOP_PUTPAGES(vp,
  563                             trunc_page(oldoff & ~(nmp->nm_wsize - 1)),
  564                             round_page((uio->uio_offset + nmp->nm_wsize - 1) &
  565                                        ~(nmp->nm_wsize - 1)), PGO_CLEANIT);
  566                 }
  567         } while (uio->uio_resid > 0);
              /* post the knote only if at least one pass copied data */
  568         if (wrotedata)
  569                 VN_KNOTE(vp, NOTE_WRITE | (extended ? NOTE_EXTEND : 0));
              /* IO_SYNC: clean the range written by this call with PGO_SYNCIO */
  570         if (error == 0 && (ioflag & IO_SYNC) != 0) {
  571                 mutex_enter(&vp->v_interlock);
  572                 error = VOP_PUTPAGES(vp,
  573                     trunc_page(origoff & ~(nmp->nm_wsize - 1)),
  574                     round_page((uio->uio_offset + nmp->nm_wsize - 1) &
  575                                ~(nmp->nm_wsize - 1)),
  576                     PGO_CLEANIT | PGO_SYNCIO);
  577         }
  578         return error;
  579 }
 580
 581 /*
 582  * Get an nfs cache block.
 583  * Allocate a new one if the block isn't currently in the cache
 584  * and return the block marked busy. If the calling process is
 585  * interrupted by a signal for an interruptible mount point, return
 586  * NULL.
 587  */
 588 struct buf *
 589 nfs_getcacheblk(struct vnode *vp, daddr_t bn, int size, struct lwp *l)
 590 {
 591         struct buf *bp;
 592         struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 593
 594         if (nmp->nm_flag & NFSMNT_INT) {
 595                 bp = getblk(vp, bn, size, PCATCH, 0);
 596                 while (bp == NULL) {
 597                         if (nfs_sigintr(nmp, NULL, l))
 598                                 return (NULL);
 599                         bp = getblk(vp, bn, size, 0, 2 * hz);
 600                 }
 601         } else
 602                 bp = getblk(vp, bn, size, 0, 0);
 603         return (bp);
 604 }
 605
 606 /*
 607  * Flush and invalidate all dirty buffers. If another process is already
 608  * doing the flush, just wait for completion.
 609  */
 610 int
 611 nfs_vinvalbuf(struct vnode *vp, int flags, kauth_cred_t cred,
 612                 struct lwp *l, int intrflg)
 613 {
 614         struct nfsnode *np = VTONFS(vp);
 615         struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 616         int error = 0, slptimeo;
 617         bool catch;
 618
 619         if ((nmp->nm_flag & NFSMNT_INT) == 0)
 620                 intrflg = 0;
 621         if (intrflg) {
 622                 catch = true;
 623                 slptimeo = 2 * hz;
 624         } else {
 625                 catch = false;
 626                 slptimeo = 0;
 627         }
 628         /*
 629          * First wait for any other process doing a flush to complete.
 630          */
 631         mutex_enter(&vp->v_interlock);
 632         while (np->n_flag & NFLUSHINPROG) {
 633                 np->n_flag |= NFLUSHWANT;
 634                 error = mtsleep(&np->n_flag, PRIBIO + 2, "nfsvinval",
 635                         slptimeo, &vp->v_interlock);
 636                 if (error && intrflg && nfs_sigintr(nmp, NULL, l)) {
 637                         mutex_exit(&vp->v_interlock);
 638                         return EINTR;
 639                 }
 640         }
 641
 642         /*
 643          * Now, flush as required.
 644          */
 645         np->n_flag |= NFLUSHINPROG;
 646         mutex_exit(&vp->v_interlock);
 647         error = vinvalbuf(vp, flags, cred, l, catch, 0);
 648         while (error) {
 649                 if (intrflg && nfs_sigintr(nmp, NULL, l)) {
 650                         error = EINTR;
 651                         break;
 652                 }
 653                 error = vinvalbuf(vp, flags, cred, l, 0, slptimeo);
 654         }
 655         mutex_enter(&vp->v_interlock);
 656         if (error == 0)
 657                 np->n_flag &= ~NMODIFIED;
 658         np->n_flag &= ~NFLUSHINPROG;
 659         if (np->n_flag & NFLUSHWANT) {
 660                 np->n_flag &= ~NFLUSHWANT;
 661                 wakeup(&np->n_flag);
 662         }
 663         mutex_exit(&vp->v_interlock);
 664         return error;
 665 }
 666
 667 /*
 668  * nfs_flushstalebuf: flush cache if it's stale.
 669  *
 670  * => caller shouldn't own any pages or buffers which belong to the vnode.
 671  */
 672
 673 int
 674 nfs_flushstalebuf(struct vnode *vp, kauth_cred_t cred, struct lwp *l,
 675     int flags)
 676 {
 677         struct nfsnode *np = VTONFS(vp);
 678         struct vattr vattr;
 679         int error;
 680
 681         if (np->n_flag & NMODIFIED) {
 682                 if ((flags & NFS_FLUSHSTALEBUF_MYWRITE) == 0
 683                     || vp->v_type != VREG) {
 684                         error = nfs_vinvalbuf(vp, V_SAVE, cred, l, 1);
 685                         if (error)
 686                                 return error;
 687                         if (vp->v_type == VDIR) {
 688                                 nfs_invaldircache(vp, 0);
 689                         }
 690                 } else {
 691                         /*
 692                          * XXX assuming writes are ours.
 693                          */
 694                 }
 695                 NFS_INVALIDATE_ATTRCACHE(np);
 696                 error = VOP_GETATTR(vp, &vattr, cred);
 697                 if (error)
 698                         return error;
 699                 np->n_mtime = vattr.va_mtime;
 700         } else {
 701                 error = VOP_GETATTR(vp, &vattr, cred);
 702                 if (error)
 703                         return error;
 704                 if (timespeccmp(&np->n_mtime, &vattr.va_mtime, !=)) {
 705                         if (vp->v_type == VDIR) {
 706                                 nfs_invaldircache(vp, 0);
 707                         }
 708                         error = nfs_vinvalbuf(vp, V_SAVE, cred, l, 1);
 709                         if (error)
 710                                 return error;
 711                         np->n_mtime = vattr.va_mtime;
 712                 }
 713         }
 714
 715         return error;
 716 }
 717
 718 /*
 719  * Initiate asynchronous I/O. Return an error if no nfsiods are available.
 720  * This is mainly to avoid queueing async I/O requests when the nfsiods
 721  * are all hung on a dead server.
 722  */
 723
 724 int
 725 nfs_asyncio(struct buf *bp)
 726 {
 727         struct nfs_iod *iod;
 728         struct nfsmount *nmp;
 729         int slptimeo = 0, error;
 730         bool catch = false;
 731
 732         if (nfs_numasync == 0)
 733                 return (EIO);
 734
 735         nmp = VFSTONFS(bp->b_vp->v_mount);
 736 again:
 737         if (nmp->nm_flag & NFSMNT_INT)
 738                 catch = true;
 739
 740         /*
 741          * Find a free iod to process this request.
 742          */
 743
 744         mutex_enter(&nfs_iodlist_lock);
 745         iod = LIST_FIRST(&nfs_iodlist_idle);
 746         if (iod) {
 747                 /*
 748                  * Found one, so wake it up and tell it which
 749                  * mount to process.
 750                  */
 751                 LIST_REMOVE(iod, nid_idle);
 752                 mutex_enter(&iod->nid_lock);
 753                 mutex_exit(&nfs_iodlist_lock);
 754                 KASSERT(iod->nid_mount == NULL);
 755                 iod->nid_mount = nmp;
 756                 cv_signal(&iod->nid_cv);
 757                 mutex_enter(&nmp->nm_lock);
 758                 mutex_exit(&iod->nid_lock);
 759                 nmp->nm_bufqiods++;
 760                 if (nmp->nm_bufqlen < 2 * nmp->nm_bufqiods) {
 761                         cv_broadcast(&nmp->nm_aiocv);
 762                 }
 763         } else {
 764                 mutex_exit(&nfs_iodlist_lock);
 765                 mutex_enter(&nmp->nm_lock);
 766         }
 767
 768         KASSERT(mutex_owned(&nmp->nm_lock));
 769
 770         /*
 771          * If we have an iod which can process the request, then queue
 772          * the buffer.  However, even if we have an iod, do not initiate
 773          * queue cleaning if curproc is the pageout daemon. if the NFS mount
 774          * is via local loopback, we may put curproc (pagedaemon) to sleep
 775          * waiting for the writes to complete. But the server (ourself)
 776          * may block the write, waiting for its (ie., our) pagedaemon
 777          * to produce clean pages to handle the write: deadlock.
 778          * XXX: start non-loopback mounts straight away?  If "lots free",
 779          * let pagedaemon start loopback writes anyway?
 780          */
 781         if (nmp->nm_bufqiods > 0) {
 782
 783                 /*
 784                  * Ensure that the queue never grows too large.
 785                  */
 786                 if (curlwp == uvm.pagedaemon_lwp) {
 787                         /* Enque for later, to avoid free-page deadlock */
 788                 } else while (nmp->nm_bufqlen >= 2 * nmp->nm_bufqiods) {
 789                         if (catch) {
 790                                 error = cv_timedwait_sig(&nmp->nm_aiocv,
 791                                     &nmp->nm_lock, slptimeo);
 792                         } else {
 793                                 error = cv_timedwait(&nmp->nm_aiocv,
 794                                     &nmp->nm_lock, slptimeo);
 795                         }
 796                         if (error) {
 797                                 if (nfs_sigintr(nmp, NULL, curlwp)) {
 798                                         mutex_exit(&nmp->nm_lock);
 799                                         return (EINTR);
 800                                 }
 801                                 if (catch) {
 802                                         catch = false;
 803                                         slptimeo = 2 * hz;
 804                                 }
 805                         }
 806
 807                         /*
 808                          * We might have lost our iod while sleeping,
 809                          * so check and loop if necessary.
 810                          */
 811
 812                         if (nmp->nm_bufqiods == 0) {
 813                                 mutex_exit(&nmp->nm_lock);
 814                                 goto again;
 815                         }
 816                 }
 817                 TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
 818                 nmp->nm_bufqlen++;
 819                 mutex_exit(&nmp->nm_lock);
 820                 return (0);
 821         }
 822         mutex_exit(&nmp->nm_lock);
 823
 824         /*
 825          * All the iods are busy on other mounts, so return EIO to
 826          * force the caller to process the i/o synchronously.
 827          */
 828
 829         return (EIO);
 830 }
 831
 832 /*
 833  * nfs_doio for read.
 834  */
 835 static int
 836 nfs_doio_read(struct buf *bp, struct uio *uiop)
 837 {
 838         struct vnode *vp = bp->b_vp;
 839         struct nfsnode *np = VTONFS(vp);
 840         struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 841         int error = 0;
 842
 843         uiop->uio_rw = UIO_READ;
 844         switch (vp->v_type) {
 845         case VREG:
 846                 nfsstats.read_bios++;
 847                 error = nfs_readrpc(vp, uiop);
 848                 if (!error && uiop->uio_resid) {
 849                         int diff, len;
 850
 851                         /*
 852                          * If uio_resid > 0, there is a hole in the file and
 853                          * no writes after the hole have been pushed to
 854                          * the server yet or the file has been truncated
 855                          * on the server.
 856                          * Just zero fill the rest of the valid area.
 857                          */
 858
 859                         KASSERT(vp->v_size >=
 860                             uiop->uio_offset + uiop->uio_resid);
 861                         diff = bp->b_bcount - uiop->uio_resid;
 862                         len = uiop->uio_resid;
 863                         memset((char *)bp->b_data + diff, 0, len);
 864                         uiop->uio_resid = 0;
 865                 }
 866 #if 0
 867                 if (uiop->uio_lwp && (vp->v_iflag & VI_TEXT) &&
 868                     timespeccmp(&np->n_mtime, &np->n_vattr->va_mtime, !=)) {
 869                         mutex_enter(proc_lock);
 870                         killproc(uiop->uio_lwp->l_proc, "process text file was modified");
 871                         mutex_exit(proc_lock);
 872 #if 0 /* XXX NJWLWP */
 873                         uiop->uio_lwp->l_proc->p_holdcnt++;
 874 #endif
 875                 }
 876 #endif
 877                 break;
 878         case VLNK:
 879                 KASSERT(uiop->uio_offset == (off_t)0);
 880                 nfsstats.readlink_bios++;
 881                 error = nfs_readlinkrpc(vp, uiop, np->n_rcred);
 882                 break;
 883         case VDIR:
 884                 nfsstats.readdir_bios++;
 885                 uiop->uio_offset = bp->b_dcookie;
 886 #ifndef NFS_V2_ONLY
 887                 if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
 888                         error = nfs_readdirplusrpc(vp, uiop,
 889                             curlwp->l_cred);
 890                         /*
 891                          * nfs_request maps NFSERR_NOTSUPP to ENOTSUP.
 892                          */
 893                         if (error == ENOTSUP)
 894                                 nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
 895                 }
 896 #else
 897                 nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
 898 #endif
 899                 if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
 900                         error = nfs_readdirrpc(vp, uiop,
 901                             curlwp->l_cred);
 902                 if (!error) {
 903                         bp->b_dcookie = uiop->uio_offset;
 904                 }
 905                 break;
 906         default:
 907                 printf("nfs_doio:  type %x unexpected\n", vp->v_type);
 908                 break;
 909         }
 910         bp->b_error = error;
 911         return error;
 912 }
 913
 914 /*
 915  * nfs_doio for write.
 916  */
 917 static int
 918 nfs_doio_write(struct buf *bp, struct uio *uiop)
 919 {
 920         struct vnode *vp = bp->b_vp;
 921         struct nfsnode *np = VTONFS(vp);
 922         struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 923         int iomode;
 924         bool stalewriteverf = false;
 925         int i, npages = (bp->b_bcount + PAGE_SIZE - 1) >> PAGE_SHIFT;
 926         struct vm_page **pgs, *spgs[UBC_MAX_PAGES];
 927 #ifndef NFS_V2_ONLY
 928         bool needcommit = true; /* need only COMMIT RPC */
 929 #else
 930         bool needcommit = false; /* need only COMMIT RPC */
 931 #endif
 932         bool pageprotected;
 933         struct uvm_object *uobj = &vp->v_uobj;
 934         int error;
 935         off_t off, cnt;
 936
 937         if (npages < __arraycount(spgs))
 938                 pgs = spgs;
 939         else {
 940                 if ((pgs = kmem_alloc(sizeof(*pgs) * npages, KM_NOSLEEP)) ==
 941                     NULL)
 942                         return ENOMEM;
 943         }
 944
 945         if ((bp->b_flags & B_ASYNC) != 0 && NFS_ISV3(vp)) {
 946                 iomode = NFSV3WRITE_UNSTABLE;
 947         } else {
 948                 iomode = NFSV3WRITE_FILESYNC;
 949         }
 950
 951 #ifndef NFS_V2_ONLY
 952 again:
 953 #endif
 954         rw_enter(&nmp->nm_writeverflock, RW_READER);
 955
 956         for (i = 0; i < npages; i++) {
 957                 pgs[i] = uvm_pageratop((vaddr_t)bp->b_data + (i << PAGE_SHIFT));
 958                 if (pgs[i]->uobject == uobj &&
 959                     pgs[i]->offset == uiop->uio_offset + (i << PAGE_SHIFT)) {
 960                         KASSERT(pgs[i]->flags & PG_BUSY);
 961                         /*
 962                          * this page belongs to our object.
 963                          */
 964                         mutex_enter(&uobj->vmobjlock);
 965                         /*
 966                          * write out the page stably if it's about to
 967                          * be released because we can't resend it
 968                          * on the server crash.
 969                          *
 970                          * XXX assuming PG_RELEASE|PG_PAGEOUT won't be
 971                          * changed until unbusy the page.
 972                          */
 973                         if (pgs[i]->flags & (PG_RELEASED|PG_PAGEOUT))
 974                                 iomode = NFSV3WRITE_FILESYNC;
 975                         /*
 976                          * if we met a page which hasn't been sent yet,
 977                          * we need do WRITE RPC.
 978                          */
 979                         if ((pgs[i]->flags & PG_NEEDCOMMIT) == 0)
 980                                 needcommit = false;
 981                         mutex_exit(&uobj->vmobjlock);
 982                 } else {
 983                         iomode = NFSV3WRITE_FILESYNC;
 984                         needcommit = false;
 985                 }
 986         }
 987         if (!needcommit && iomode == NFSV3WRITE_UNSTABLE) {
 988                 mutex_enter(&uobj->vmobjlock);
 989                 for (i = 0; i < npages; i++) {
 990                         pgs[i]->flags |= PG_NEEDCOMMIT | PG_RDONLY;
 991                         pmap_page_protect(pgs[i], VM_PROT_READ);
 992                 }
 993                 mutex_exit(&uobj->vmobjlock);
 994                 pageprotected = true; /* pages can't be modified during i/o. */
 995         } else
 996                 pageprotected = false;
 997
 998         /*
 999          * Send the data to the server if necessary,
1000          * otherwise just send a commit rpc.
1001          */
1002 #ifndef NFS_V2_ONLY
1003         if (needcommit) {
1004
1005                 /*
1006                  * If the buffer is in the range that we already committed,
1007                  * there's nothing to do.
1008                  *
1009                  * If it's in the range that we need to commit, push the
1010                  * whole range at once, otherwise only push the buffer.
1011                  * In both these cases, acquire the commit lock to avoid
1012                  * other processes modifying the range.
1013                  */
1014
1015                 off = uiop->uio_offset;
1016                 cnt = bp->b_bcount;
1017                 mutex_enter(&np->n_commitlock);
1018                 if (!nfs_in_committed_range(vp, off, bp->b_bcount)) {
1019                         bool pushedrange;
1020                         if (nfs_in_tobecommitted_range(vp, off, bp->b_bcount)) {
1021                                 pushedrange = true;
1022                                 off = np->n_pushlo;
1023                                 cnt = np->n_pushhi - np->n_pushlo;
1024                         } else {
1025                                 pushedrange = false;
1026                         }
1027                         error = nfs_commit(vp, off, cnt, curlwp);
1028                         if (error == 0) {
1029                                 if (pushedrange) {
1030                                         nfs_merge_commit_ranges(vp);
1031                                 } else {
1032                                         nfs_add_committed_range(vp, off, cnt);
1033                                 }
1034                         }
1035                 } else {
1036                         error = 0;
1037                 }
1038                 mutex_exit(&np->n_commitlock);
1039                 rw_exit(&nmp->nm_writeverflock);
1040                 if (!error) {
1041                         /*
1042                          * pages are now on stable storage.
1043                          */
1044                         uiop->uio_resid = 0;
1045                         mutex_enter(&uobj->vmobjlock);
1046                         for (i = 0; i < npages; i++) {
1047                                 pgs[i]->flags &= ~(PG_NEEDCOMMIT | PG_RDONLY);
1048                         }
1049                         mutex_exit(&uobj->vmobjlock);
1050                         return 0;
1051                 } else if (error == NFSERR_STALEWRITEVERF) {
1052                         nfs_clearcommit(vp->v_mount);
1053                         goto again;
1054                 }
1055                 if (error) {
1056                         bp->b_error = np->n_error = error;
1057                         np->n_flag |= NWRITEERR;
1058                 }
1059                 goto out;
1060         }
1061 #endif
1062         off = uiop->uio_offset;
1063         cnt = bp->b_bcount;
1064         uiop->uio_rw = UIO_WRITE;
1065         nfsstats.write_bios++;
1066         error = nfs_writerpc(vp, uiop, &iomode, pageprotected, &stalewriteverf);
1067 #ifndef NFS_V2_ONLY
1068         if (!error && iomode == NFSV3WRITE_UNSTABLE) {
1069                 /*
1070                  * we need to commit pages later.
1071                  */
1072                 mutex_enter(&np->n_commitlock);
1073                 nfs_add_tobecommitted_range(vp, off, cnt);
1074                 /*
1075                  * if there can be too many uncommitted pages, commit them now.
1076                  */
1077                 if (np->n_pushhi - np->n_pushlo > nfs_commitsize) {
1078                         off = np->n_pushlo;
1079                         cnt = nfs_commitsize >> 1;
1080                         error = nfs_commit(vp, off, cnt, curlwp);
1081                         if (!error) {
1082                                 nfs_add_committed_range(vp, off, cnt);
1083                                 nfs_del_tobecommitted_range(vp, off, cnt);
1084                         }
1085                         if (error == NFSERR_STALEWRITEVERF) {
1086                                 stalewriteverf = true;
1087                                 error = 0; /* it isn't a real error */
1088                         }
1089                 } else {
1090                         /*
1091                          * re-dirty pages so that they will be passed
1092                          * to us later again.
1093                          */
1094                         mutex_enter(&uobj->vmobjlock);
1095                         for (i = 0; i < npages; i++) {
1096                                 pgs[i]->flags &= ~PG_CLEAN;
1097                         }
1098                         mutex_exit(&uobj->vmobjlock);
1099                 }
1100                 mutex_exit(&np->n_commitlock);
1101         } else
1102 #endif
1103         if (!error) {
1104                 /*
1105                  * pages are now on stable storage.
1106                  */
1107                 mutex_enter(&np->n_commitlock);
1108                 nfs_del_committed_range(vp, off, cnt);
1109                 mutex_exit(&np->n_commitlock);
1110                 mutex_enter(&uobj->vmobjlock);
1111                 for (i = 0; i < npages; i++) {
1112                         pgs[i]->flags &= ~(PG_NEEDCOMMIT | PG_RDONLY);
1113                 }
1114                 mutex_exit(&uobj->vmobjlock);
1115         } else {
1116                 /*
1117                  * we got an error.
1118                  */
1119                 bp->b_error = np->n_error = error;
1120                 np->n_flag |= NWRITEERR;
1121         }
1122
1123         rw_exit(&nmp->nm_writeverflock);
1124
1125
1126         if (stalewriteverf) {
1127                 nfs_clearcommit(vp->v_mount);
1128         }
1129 #ifndef NFS_V2_ONLY
1130 out:
1131 #endif
1132         if (pgs != spgs)
1133                 kmem_free(pgs, sizeof(*pgs) * npages);
1134         return error;
1135 }
1136
1137 /*
1138  * nfs_doio for B_PHYS.
1139  */
1140 static int
1141 nfs_doio_phys(struct buf *bp, struct uio *uiop)
1142 {
1143         struct vnode *vp = bp->b_vp;
1144         int error;
1145
1146         uiop->uio_offset = ((off_t)bp->b_blkno) << DEV_BSHIFT;
1147         if (bp->b_flags & B_READ) {
1148                 uiop->uio_rw = UIO_READ;
1149                 nfsstats.read_physios++;
1150                 error = nfs_readrpc(vp, uiop);
1151         } else {
1152                 int iomode = NFSV3WRITE_DATASYNC;
1153                 bool stalewriteverf;
1154                 struct nfsmount *nmp = VFSTONFS(vp->v_mount);
1155
1156                 uiop->uio_rw = UIO_WRITE;
1157                 nfsstats.write_physios++;
1158                 rw_enter(&nmp->nm_writeverflock, RW_READER);
1159                 error = nfs_writerpc(vp, uiop, &iomode, false, &stalewriteverf);
1160                 rw_exit(&nmp->nm_writeverflock);
1161                 if (stalewriteverf) {
1162                         nfs_clearcommit(bp->b_vp->v_mount);
1163                 }
1164         }
1165         bp->b_error = error;
1166         return error;
1167 }
1168
1169 /*
1170  * Do an I/O operation to/from a cache block. This may be called
1171  * synchronously or from an nfsiod.
1172  */
1173 int
1174 nfs_doio(struct buf *bp)
1175 {
1176         int error;
1177         struct uio uio;
1178         struct uio *uiop = &uio;
1179         struct iovec io;
1180         UVMHIST_FUNC("nfs_doio"); UVMHIST_CALLED(ubchist);
1181
1182         uiop->uio_iov = &io;
1183         uiop->uio_iovcnt = 1;
1184         uiop->uio_offset = (((off_t)bp->b_blkno) << DEV_BSHIFT);
1185         UIO_SETUP_SYSSPACE(uiop);
1186         io.iov_base = bp->b_data;
1187         io.iov_len = uiop->uio_resid = bp->b_bcount;
1188
1189         /*
1190          * Historically, paging was done with physio, but no more...
1191          */
1192         if (bp->b_flags & B_PHYS) {
1193                 /*
1194                  * ...though reading /dev/drum still gets us here.
1195                  */
1196                 error = nfs_doio_phys(bp, uiop);
1197         } else if (bp->b_flags & B_READ) {
1198                 error = nfs_doio_read(bp, uiop);
1199         } else {
1200                 error = nfs_doio_write(bp, uiop);
1201         }
1202         bp->b_resid = uiop->uio_resid;
1203         biodone(bp);
1204         return (error);
1205 }
1206
1207 /*
1208  * Vnode op for VM getpages.
1209  */
1210
1211 int
1212 nfs_getpages(void *v)
1213 {
1214         struct vop_getpages_args /* {
1215                 struct vnode *a_vp;
1216                 voff_t a_offset;
1217                 struct vm_page **a_m;
1218                 int *a_count;
1219                 int a_centeridx;
1220                 vm_prot_t a_access_type;
1221                 int a_advice;
1222                 int a_flags;
1223         } */ *ap = v;
1224
1225         struct vnode *vp = ap->a_vp;
1226         struct uvm_object *uobj = &vp->v_uobj;
1227         struct nfsnode *np = VTONFS(vp);
1228         const int npages = *ap->a_count;
1229         struct vm_page *pg, **pgs, **opgs, *spgs[UBC_MAX_PAGES];
1230         off_t origoffset, len;
1231         int i, error;
1232         bool v3 = NFS_ISV3(vp);
1233         bool write = (ap->a_access_type & VM_PROT_WRITE) != 0;
1234         bool locked = (ap->a_flags & PGO_LOCKED) != 0;
1235
1236         /*
1237          * If we are not locked we are not really using opgs,
1238          * so just initialize it
1239          */
1240         if (!locked || npages < __arraycount(spgs))
1241                 opgs = spgs;
1242         else {
1243                 if ((opgs = kmem_alloc(npages * sizeof(*opgs), KM_NOSLEEP)) ==
1244                     NULL)
1245                         return ENOMEM;
1246         }
1247
1248         /*
1249          * call the genfs code to get the pages.  `pgs' may be NULL
1250          * when doing read-ahead.
1251          */
1252         pgs = ap->a_m;
1253         if (write && locked && v3) {
1254                 KASSERT(pgs != NULL);
1255 #ifdef DEBUG
1256
1257                 /*
1258                  * If PGO_LOCKED is set, real pages shouldn't exists
1259                  * in the array.
1260                  */
1261
1262                 for (i = 0; i < npages; i++)
1263                         KDASSERT(pgs[i] == NULL || pgs[i] == PGO_DONTCARE);
1264 #endif
1265                 memcpy(opgs, pgs, npages * sizeof(struct vm_pages *));
1266         }
1267         error = genfs_getpages(v);
1268         if (error)
1269                 goto out;
1270
1271         /*
1272          * for read faults where the nfs node is not yet marked NMODIFIED,
1273          * set PG_RDONLY on the pages so that we come back here if someone
1274          * tries to modify later via the mapping that will be entered for
1275          * this fault.
1276          */
1277
1278         if (!write && (np->n_flag & NMODIFIED) == 0 && pgs != NULL) {
1279                 if (!locked) {
1280                         mutex_enter(&uobj->vmobjlock);
1281                 }
1282                 for (i = 0; i < npages; i++) {
1283                         pg = pgs[i];
1284                         if (pg == NULL || pg == PGO_DONTCARE) {
1285                                 continue;
1286                         }
1287                         pg->flags |= PG_RDONLY;
1288                 }
1289                 if (!locked) {
1290                         mutex_exit(&uobj->vmobjlock);
1291                 }
1292         }
1293         if (!write)
1294                 goto out;
1295
1296         /*
1297          * this is a write fault, update the commit info.
1298          */
1299
1300         origoffset = ap->a_offset;
1301         len = npages << PAGE_SHIFT;
1302
1303         if (v3) {
1304                 if (!locked) {
1305                         mutex_enter(&np->n_commitlock);
1306                 } else {
1307                         if (!mutex_tryenter(&np->n_commitlock)) {
1308
1309                                 /*
1310                                  * Since PGO_LOCKED is set, we need to unbusy
1311                                  * all pages fetched by genfs_getpages() above,
1312                                  * tell the caller that there are no pages
1313                                  * available and put back original pgs array.
1314                                  */
1315
1316                                 mutex_enter(&uvm_pageqlock);
1317                                 uvm_page_unbusy(pgs, npages);
1318                                 mutex_exit(&uvm_pageqlock);
1319                                 *ap->a_count = 0;
1320                                 memcpy(pgs, opgs,
1321                                     npages * sizeof(struct vm_pages *));
1322                                 error = EBUSY;
1323                                 goto out;
1324                         }
1325                 }
1326                 nfs_del_committed_range(vp, origoffset, len);
1327                 nfs_del_tobecommitted_range(vp, origoffset, len);
1328         }
1329         np->n_flag |= NMODIFIED;
1330         if (!locked) {
1331                 mutex_enter(&uobj->vmobjlock);
1332         }
1333         for (i = 0; i < npages; i++) {
1334                 pg = pgs[i];
1335                 if (pg == NULL || pg == PGO_DONTCARE) {
1336                         continue;
1337                 }
1338                 pg->flags &= ~(PG_NEEDCOMMIT | PG_RDONLY);
1339         }
1340         if (!locked) {
1341                 mutex_exit(&uobj->vmobjlock);
1342         }
1343         if (v3) {
1344                 mutex_exit(&np->n_commitlock);
1345         }
1346 out:
1347         if (opgs != spgs)
1348                 kmem_free(opgs, sizeof(*opgs) * npages);
1349         return error;
1350 }