/*	$NetBSD: nfs_bio.c,v 1.182 2009/03/13 15:00:34 yamt Exp $	*/

/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
 */
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: nfs_bio.c,v 1.182 2009/03/13 15:00:34 yamt Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/namei.h>
#include <sys/dirent.h>
#include <sys/kmem.h>
#include <sys/kauth.h>

#include <uvm/uvm_extern.h>
#include <uvm/uvm.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/nfsmount.h>
#include <nfs/nfsnode.h>
#include <nfs/nfs_var.h>
extern int nfs_numasync;
extern int nfs_commitsize;
extern struct nfsstats nfsstats;

static int nfs_doio_read(struct buf *, struct uio *);
static int nfs_doio_write(struct buf *, struct uio *);
static int nfs_doio_phys(struct buf *, struct uio *);
/*
 * Vnode op for read using bio
 * Any similarity to readip() is purely coincidental
 */
int
nfs_bioread(struct vnode *vp, struct uio *uio, int ioflag,
    kauth_cred_t cred, int cflag)
{
	struct nfsnode *np = VTONFS(vp);
	struct buf *bp = NULL, *rabp;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	struct nfsdircache *ndp = NULL, *nndp = NULL;
	void *baddr;
	int got_buf = 0, error = 0, n = 0, on = 0, en, enn;
	struct dirent *dp, *pdp, *edp, *ep;
	off_t curoff = 0;
	int advice;
	struct lwp *l = curlwp;
#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("nfs_read mode");
#endif
	if (uio->uio_resid == 0)
		return (0);
	if (vp->v_type != VDIR && uio->uio_offset < 0)
		return (EINVAL);
	if ((nmp->nm_flag & NFSMNT_NFSV3) &&
	    !(nmp->nm_iflag & NFSMNT_GOTFSINFO))
		(void)nfs_fsinfo(nmp, vp, cred, l);
	if (vp->v_type != VDIR &&
	    (uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
		return (EFBIG);
	/*
	 * For nfs, cache consistency can only be maintained approximately.
	 * Although RFC1094 does not specify the criteria, the following is
	 * believed to be compatible with the reference port.
	 *
	 * If the file's modify time on the server has changed since the
	 * last read rpc or you have written to the file,
	 * you may have lost data cache consistency with the
	 * server, so flush all of the file's data out of the cache.
	 * Then force a getattr rpc to ensure that you have up to date
	 * attributes.
	 *
	 * NB: This implies that cache data can be read when up to
	 * nfs_attrtimeo seconds out of date.  If you find that you need
	 * current attributes this could be forced by setting n_attrstamp
	 * to 0 before the VOP_GETATTR() call.
	 */
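	/*
	 * Illustrative sketch (not part of the original logic): a caller
	 * that cannot tolerate nfs_attrtimeo-stale attributes could force
	 * a fresh GETATTR first, along the lines of
	 *
	 *	NFS_INVALIDATE_ATTRCACHE(np);	zeroes n_attrstamp
	 *	error = VOP_GETATTR(vp, &vattr, cred);
	 *
	 * The path below instead relies on nfs_flushstalebuf() to do the
	 * flush-and-getattr dance described above.
	 */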
	if (vp->v_type != VLNK) {
		error = nfs_flushstalebuf(vp, cred, l,
		    NFS_FLUSHSTALEBUF_MYWRITE);
		if (error)
			return error;
	}

	do {
		/*
		 * Don't cache symlinks.
		 */
		if ((vp->v_vflag & VV_ROOT) && vp->v_type == VLNK) {
			return (nfs_readlinkrpc(vp, uio, cred));
		}
		baddr = NULL;
		switch (vp->v_type) {
		case VREG:
			nfsstats.biocache_reads++;

			advice = IO_ADV_DECODE(ioflag);
			while (uio->uio_resid > 0) {
				vsize_t bytelen;

				nfs_delayedtruncate(vp);
				if (np->n_size <= uio->uio_offset) {
					break;
				}
				bytelen =
				    MIN(np->n_size - uio->uio_offset,
				    uio->uio_resid);
				error = ubc_uiomove(&vp->v_uobj, uio, bytelen,
				    advice, UBC_READ | UBC_PARTIALOK |
				    (UBC_WANT_UNMAP(vp) ? UBC_UNMAP : 0));
				if (error) {
					/*
					 * the file has been truncated on the
					 * server.  there isn't much we can do.
					 */
					if (uio->uio_offset >= np->n_size) {
						/* end of file */
						error = 0;
					} else {
						break;
					}
				}
			}
			break;
		case VLNK:
			nfsstats.biocache_readlinks++;
			bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, l);
			if (!bp)
				return (EINTR);
			if ((bp->b_oflags & BO_DONE) == 0) {
				bp->b_flags |= B_READ;
				error = nfs_doio(bp);
				if (error) {
					brelse(bp, 0);
					return (error);
				}
			}
			n = MIN(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
			got_buf = 1;
			on = 0;
			break;
		case VDIR:
diragain:
			nfsstats.biocache_readdirs++;
			ndp = nfs_searchdircache(vp, uio->uio_offset,
			    (nmp->nm_flag & NFSMNT_XLATECOOKIE), 0);
			if (!ndp) {
				/*
				 * We've been handed a cookie that is not
				 * in the cache. If we're not translating
				 * 32 <-> 64, it may be a value that was
				 * flushed out of the cache because it grew
				 * too big. Let the server judge if it's
				 * valid or not. In the translation case,
				 * we have no way of validating this value,
				 * so punt.
				 */
				if (nmp->nm_flag & NFSMNT_XLATECOOKIE)
					return (EINVAL);
				ndp = nfs_enterdircache(vp, uio->uio_offset,
				    uio->uio_offset, 0, 0);
			}

			if (NFS_EOFVALID(np) &&
			    ndp->dc_cookie == np->n_direofoffset) {
				nfs_putdircache(np, ndp);
				nfsstats.direofcache_hits++;
				return (0);
			}
			bp = nfs_getcacheblk(vp, NFSDC_BLKNO(ndp),
			    NFS_DIRBLKSIZ, l);
			if (!bp)
				return (EINTR);
			if ((bp->b_oflags & BO_DONE) == 0) {
				bp->b_flags |= B_READ;
				bp->b_dcookie = ndp->dc_blkcookie;
				error = nfs_doio(bp);
				if (error) {
					/*
					 * Yuck! The directory has been
					 * modified on the server. Punt and
					 * let the userland code deal with it.
					 */
					nfs_putdircache(np, ndp);
					brelse(bp, 0);
					/*
					 * nfs_request maps NFSERR_BAD_COOKIE
					 * to EINVAL.
					 */
					if (error == EINVAL) { /* NFSERR_BAD_COOKIE */
						nfs_invaldircache(vp, 0);
						nfs_vinvalbuf(vp, 0, cred, l, 1);
					}
					return (error);
				}
			}

			/*
			 * Just return if we hit EOF right away with this
			 * block. Always check here, because direofoffset
			 * may have been set by an nfsiod since the last
			 * check.
			 *
			 * also, empty block implies EOF.
			 */
			if (bp->b_bcount == bp->b_resid ||
			    (NFS_EOFVALID(np) &&
			    ndp->dc_blkcookie == np->n_direofoffset)) {
				KASSERT(bp->b_bcount != bp->b_resid ||
				    ndp->dc_blkcookie == bp->b_dcookie);
				nfs_putdircache(np, ndp);
				brelse(bp, BC_NOCACHE);
				return (0);
			}
			/*
			 * Find the entry we were looking for in the block.
			 */

			en = ndp->dc_entry;

			pdp = dp = (struct dirent *)bp->b_data;
			edp = (struct dirent *)(void *)((char *)bp->b_data +
			    bp->b_bcount - bp->b_resid);
			enn = 0;
			while (enn < en && dp < edp) {
				pdp = dp;
				dp = _DIRENT_NEXT(dp);
				enn++;
			}

			/*
			 * If the entry number was bigger than the number of
			 * entries in the block, or the cookie of the previous
			 * entry doesn't match, the directory cache is
			 * stale. Flush it and try again (i.e. go to
			 * diragain).
			 */
			if (dp >= edp ||
			    (struct dirent *)_DIRENT_NEXT(dp) > edp ||
			    (en > 0 && NFS_GETCOOKIE(pdp) != ndp->dc_cookie)) {
				printf("invalid cache: %p %p %p off %jx %jx\n",
				    pdp, dp, edp,
				    (uintmax_t)uio->uio_offset,
				    (uintmax_t)NFS_GETCOOKIE(pdp));
				nfs_putdircache(np, ndp);
				brelse(bp, 0);
				nfs_invaldircache(vp, 0);
				nfs_vinvalbuf(vp, 0, cred, l, 0);
				goto diragain;
			}

			on = (char *)dp - (char *)bp->b_data;
			/*
			 * Cache all entries that may be exported to the
			 * user, as they may be thrown back at us. The
			 * NFSBIO_CACHECOOKIES flag indicates that all
			 * entries are being 'exported', so cache them all.
			 */

			if (en == 0 && pdp == dp) {
				dp = _DIRENT_NEXT(dp);
				enn++;
			}

			if (uio->uio_resid <
			    (bp->b_bcount - bp->b_resid - on)) {
				n = uio->uio_resid;
			} else
				n = bp->b_bcount - bp->b_resid - on;

			ep = (struct dirent *)(void *)
			    ((char *)bp->b_data + on + n);

			/*
			 * Find last complete entry to copy, caching entries
			 * (if requested) as we go.
			 */

			while (dp < ep &&
			    (struct dirent *)_DIRENT_NEXT(dp) <= ep) {
				if (cflag & NFSBIO_CACHECOOKIES) {
					nndp = nfs_enterdircache(vp,
					    NFS_GETCOOKIE(pdp),
					    ndp->dc_blkcookie, enn,
					    bp->b_lblkno);
					if (nmp->nm_flag & NFSMNT_XLATECOOKIE) {
						NFS_STASHCOOKIE32(pdp,
						    nndp->dc_cookie32);
					}
					nfs_putdircache(np, nndp);
				}
				pdp = dp;
				dp = _DIRENT_NEXT(dp);
				enn++;
			}
			nfs_putdircache(np, ndp);
			/*
			 * If the last requested entry was not the last in the
			 * buffer (happens if NFS_DIRFRAGSIZ < NFS_DIRBLKSIZ),
			 * cache the cookie of the last requested one, and
			 * set the offset to it.
			 */

			if ((on + n) < bp->b_bcount - bp->b_resid) {
				curoff = NFS_GETCOOKIE(pdp);
				nndp = nfs_enterdircache(vp, curoff,
				    ndp->dc_blkcookie, enn, bp->b_lblkno);
				if (nmp->nm_flag & NFSMNT_XLATECOOKIE) {
					NFS_STASHCOOKIE32(pdp,
					    nndp->dc_cookie32);
					curoff = nndp->dc_cookie32;
				}
				nfs_putdircache(np, nndp);
			} else
				curoff = bp->b_dcookie;

			/*
			 * Always cache the entry for the next block,
			 * so that readaheads can use it.
			 */
			nndp = nfs_enterdircache(vp, bp->b_dcookie,
			    bp->b_dcookie, 0, 0);
			if (nmp->nm_flag & NFSMNT_XLATECOOKIE) {
				if (curoff == bp->b_dcookie) {
					NFS_STASHCOOKIE32(pdp,
					    nndp->dc_cookie32);
					curoff = nndp->dc_cookie32;
				}
			}

			n = (char *)_DIRENT_NEXT(pdp) -
			    ((char *)bp->b_data + on);
			/*
			 * If not eof and read aheads are enabled, start one.
			 * (You need the current block first, so that you have
			 *  the directory offset cookie of the next block.)
			 */
			if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
			    !NFS_EOFVALID(np)) {
				rabp = nfs_getcacheblk(vp, NFSDC_BLKNO(nndp),
				    NFS_DIRBLKSIZ, l);
				if (rabp) {
					if ((rabp->b_oflags &
					    (BO_DONE | BO_DELWRI)) == 0) {
						rabp->b_dcookie = nndp->dc_cookie;
						rabp->b_flags |=
						    (B_READ | B_ASYNC);
						if (nfs_asyncio(rabp)) {
							brelse(rabp, BC_INVAL);
						}
					} else
						brelse(rabp, 0);
				}
			}
			nfs_putdircache(np, nndp);
			got_buf = 1;
			break;
		default:
			printf(" nfsbioread: type %x unexpected\n",
			    vp->v_type);
			break;
		}

		if (n > 0) {
			if (!baddr)
				baddr = bp->b_data;
			error = uiomove((char *)baddr + on, (int)n, uio);
		}
		switch (vp->v_type) {
		case VREG:
			break;
		case VLNK:
			n = 0;
			break;
		case VDIR:
			uio->uio_offset = curoff;
			break;
		default:
			printf(" nfsbioread: type %x unexpected\n",
			    vp->v_type);
		}
		if (got_buf)
			brelse(bp, 0);
	} while (error == 0 && uio->uio_resid > 0 && n > 0);
	return (error);
}
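/*
 * Note on the directory cookie cache used above (an explanatory summary,
 * not original code): each cached NFS_DIRBLKSIZ block is found again by
 * the cookie it starts at, e.g.
 *
 *	ndp = nfs_searchdircache(vp, uio->uio_offset,
 *	    (nmp->nm_flag & NFSMNT_XLATECOOKIE), 0);
 *
 * and when NFSMNT_XLATECOOKIE is set the 64-bit server cookies are mapped
 * to 32-bit ones with NFS_STASHCOOKIE32(), so that 32-bit consumers can
 * hand the very same offsets back to us on their next readdir call.
 */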
/*
 * Vnode op for write using bio
 */
int
nfs_write(void *v)
{
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		kauth_cred_t a_cred;
	} */ *ap = v;
	struct uio *uio = ap->a_uio;
	struct lwp *l = curlwp;
	struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	kauth_cred_t cred = ap->a_cred;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	voff_t oldoff, origoff;
	vsize_t bytelen;
	int error = 0;
	int ioflag = ap->a_ioflag;
	int extended = 0, wrotedata = 0;
#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("nfs_write mode");
#endif
	if (vp->v_type != VREG)
		return (EIO);
	if (np->n_flag & NWRITEERR) {
		np->n_flag &= ~NWRITEERR;
		return (np->n_error);
	}
	if ((nmp->nm_flag & NFSMNT_NFSV3) &&
	    !(nmp->nm_iflag & NFSMNT_GOTFSINFO))
		(void)nfs_fsinfo(nmp, vp, cred, l);
	if (ioflag & IO_APPEND) {
		NFS_INVALIDATE_ATTRCACHE(np);
		error = nfs_flushstalebuf(vp, cred, l,
		    NFS_FLUSHSTALEBUF_MYWRITE);
		if (error)
			return (error);
		uio->uio_offset = np->n_size;
	}
	if (uio->uio_offset < 0)
		return (EINVAL);
	if ((uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
		return (EFBIG);
	if (uio->uio_resid == 0)
		return (0);
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, i don't think it matters
	 */
	if (l && l->l_proc && uio->uio_offset + uio->uio_resid >
	    l->l_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		mutex_enter(proc_lock);
		psignal(l->l_proc, SIGXFSZ);
		mutex_exit(proc_lock);
		return (EFBIG);
	}

	origoff = uio->uio_offset;
	do {
		bool overwrite; /* if we are overwriting whole pages */
		u_quad_t oldsize;

		oldoff = uio->uio_offset;
		bytelen = uio->uio_resid;

		nfsstats.biocache_writes++;

		oldsize = np->n_size;
		np->n_flag |= NMODIFIED;
		if (np->n_size < uio->uio_offset + bytelen) {
			np->n_size = uio->uio_offset + bytelen;
		}
		overwrite = false;
		if ((uio->uio_offset & PAGE_MASK) == 0) {
			if ((vp->v_vflag & VV_MAPPED) == 0 &&
			    bytelen > PAGE_SIZE) {
				bytelen = trunc_page(bytelen);
				overwrite = true;
			} else if ((bytelen & PAGE_MASK) == 0 &&
			    uio->uio_offset >= vp->v_size) {
				overwrite = true;
			}
		}
		if (vp->v_size < uio->uio_offset + bytelen) {
			uvm_vnp_setwritesize(vp, uio->uio_offset + bytelen);
		}
		error = ubc_uiomove(&vp->v_uobj, uio, bytelen,
		    UVM_ADV_RANDOM, UBC_WRITE | UBC_PARTIALOK |
		    (overwrite ? UBC_FAULTBUSY : 0) |
		    (UBC_WANT_UNMAP(vp) ? UBC_UNMAP : 0));
		if (error) {
			uvm_vnp_setwritesize(vp, vp->v_size);
			if (overwrite && np->n_size != oldsize) {
				/*
				 * backout size and free pages past eof.
				 */
				np->n_size = oldsize;
				mutex_enter(&vp->v_interlock);
				(void)VOP_PUTPAGES(vp, round_page(vp->v_size),
				    0, PGO_SYNCIO | PGO_FREE);
			}
			break;
		}
		wrotedata = 1;

		/*
		 * update UVM's notion of the size now that we've
		 * copied the data into the vnode's pages.
		 */

		if (vp->v_size < uio->uio_offset) {
			uvm_vnp_setsize(vp, uio->uio_offset);
			extended = 1;
		}
		if ((oldoff & ~(nmp->nm_wsize - 1)) !=
		    (uio->uio_offset & ~(nmp->nm_wsize - 1))) {
			mutex_enter(&vp->v_interlock);
			error = VOP_PUTPAGES(vp,
			    trunc_page(oldoff & ~(nmp->nm_wsize - 1)),
			    round_page((uio->uio_offset + nmp->nm_wsize - 1) &
			    ~(nmp->nm_wsize - 1)), PGO_CLEANIT);
		}
	} while (uio->uio_resid > 0);
	if (wrotedata)
		VN_KNOTE(vp, NOTE_WRITE | (extended ? NOTE_EXTEND : 0));
	if (error == 0 && (ioflag & IO_SYNC) != 0) {
		mutex_enter(&vp->v_interlock);
		error = VOP_PUTPAGES(vp,
		    trunc_page(origoff & ~(nmp->nm_wsize - 1)),
		    round_page((uio->uio_offset + nmp->nm_wsize - 1) &
		    ~(nmp->nm_wsize - 1)),
		    PGO_CLEANIT | PGO_SYNCIO);
	}
	return error;
}
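/*
 * Worked example (illustrative only) of the nm_wsize alignment used in
 * nfs_write() above: with nm_wsize = 32768, oldoff = 40000 and the new
 * uio_offset = 70000,
 *
 *	oldoff          & ~(nm_wsize - 1) == 32768
 *	uio->uio_offset & ~(nm_wsize - 1) == 65536
 *
 * so the copy crossed a 32KB write-size boundary, and VOP_PUTPAGES() is
 * asked to clean [trunc_page(32768), round_page((70000 + 32767) & ~32767))
 * == [32768, 98304), i.e. the whole-wsize windows the write touched.
 */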
/*
 * Get an nfs cache block.
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy. If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
 */
struct buf *
nfs_getcacheblk(struct vnode *vp, daddr_t bn, int size, struct lwp *l)
{
	struct buf *bp;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);

	if (nmp->nm_flag & NFSMNT_INT) {
		bp = getblk(vp, bn, size, PCATCH, 0);
		while (bp == NULL) {
			if (nfs_sigintr(nmp, NULL, l))
				return (NULL);
			bp = getblk(vp, bn, size, 0, 2 * hz);
		}
	} else
		bp = getblk(vp, bn, size, 0, 0);
	return (bp);
}
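/*
 * Typical use (a sketch mirroring the callers in nfs_bioread() above, not
 * new logic): a NULL return means the sleep in getblk() was interrupted
 * on an interruptible mount, which callers turn into EINTR:
 *
 *	bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, l);
 *	if (!bp)
 *		return (EINTR);
 */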
/*
 * Flush and invalidate all dirty buffers. If another process is already
 * doing the flush, just wait for completion.
 */
int
nfs_vinvalbuf(struct vnode *vp, int flags, kauth_cred_t cred,
    struct lwp *l, int intrflg)
{
	struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, slptimeo;
	bool catch;

	if ((nmp->nm_flag & NFSMNT_INT) == 0)
		intrflg = 0;
	if (intrflg) {
		catch = true;
		slptimeo = 0;
	} else {
		catch = false;
		slptimeo = 2 * hz;
	}
	/*
	 * First wait for any other process doing a flush to complete.
	 */
	mutex_enter(&vp->v_interlock);
	while (np->n_flag & NFLUSHINPROG) {
		np->n_flag |= NFLUSHWANT;
		error = mtsleep(&np->n_flag, PRIBIO + 2, "nfsvinval",
			slptimeo, &vp->v_interlock);
		if (error && intrflg && nfs_sigintr(nmp, NULL, l)) {
			mutex_exit(&vp->v_interlock);
			return EINTR;
		}
	}

	/*
	 * Now, flush as required.
	 */
	np->n_flag |= NFLUSHINPROG;
	mutex_exit(&vp->v_interlock);
	error = vinvalbuf(vp, flags, cred, l, catch, 0);
	while (error) {
		if (intrflg && nfs_sigintr(nmp, NULL, l)) {
			error = EINTR;
			break;
		}
		error = vinvalbuf(vp, flags, cred, l, 0, slptimeo);
	}
	mutex_enter(&vp->v_interlock);
	np->n_flag &= ~NMODIFIED;
	np->n_flag &= ~NFLUSHINPROG;
	if (np->n_flag & NFLUSHWANT) {
		np->n_flag &= ~NFLUSHWANT;
		wakeup(&np->n_flag);
	}
	mutex_exit(&vp->v_interlock);
	return error;
}
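/*
 * Typical invocation (a sketch mirroring nfs_flushstalebuf() below rather
 * than new behaviour): flush dirty buffers, saving their contents, and
 * allow the wait to be interrupted:
 *
 *	error = nfs_vinvalbuf(vp, V_SAVE, cred, l, 1);
 *	if (error)
 *		return error;
 */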
/*
 * nfs_flushstalebuf: flush cache if it's stale.
 *
 * => caller shouldn't own any pages or buffers which belong to the vnode.
 */
int
nfs_flushstalebuf(struct vnode *vp, kauth_cred_t cred, struct lwp *l,
    int flags)
{
	struct nfsnode *np = VTONFS(vp);
	struct vattr vattr;
	int error = 0;

	if (np->n_flag & NMODIFIED) {
		if ((flags & NFS_FLUSHSTALEBUF_MYWRITE) == 0
		    || vp->v_type != VREG) {
			error = nfs_vinvalbuf(vp, V_SAVE, cred, l, 1);
			if (error)
				return error;
			if (vp->v_type == VDIR) {
				nfs_invaldircache(vp, 0);
			}
		} else {
			/*
			 * XXX assuming writes are ours.
			 */
		}
		NFS_INVALIDATE_ATTRCACHE(np);
		error = VOP_GETATTR(vp, &vattr, cred);
		if (error)
			return error;
		np->n_mtime = vattr.va_mtime;
	} else {
		error = VOP_GETATTR(vp, &vattr, cred);
		if (error)
			return error;
		if (timespeccmp(&np->n_mtime, &vattr.va_mtime, !=)) {
			if (vp->v_type == VDIR) {
				nfs_invaldircache(vp, 0);
			}
			error = nfs_vinvalbuf(vp, V_SAVE, cred, l, 1);
			if (error)
				return error;
			np->n_mtime = vattr.va_mtime;
		}
	}

	return error;
}
/*
 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
 * This is mainly to avoid queueing async I/O requests when the nfsiods
 * are all hung on a dead server.
 */
int
nfs_asyncio(struct buf *bp)
{
	struct nfs_iod *iod;
	struct nfsmount *nmp;
	int slptimeo = 0, error;
	bool catch;

	if (nfs_numasync == 0)
		return (EIO);

	nmp = VFSTONFS(bp->b_vp->v_mount);

	if (nmp->nm_flag & NFSMNT_INT)
		catch = true;
	else
		catch = false;

	/*
	 * Find a free iod to process this request.
	 */
again:
	mutex_enter(&nfs_iodlist_lock);
	iod = LIST_FIRST(&nfs_iodlist_idle);
	if (iod) {
		/*
		 * Found one, so wake it up and tell it which
		 * mount to process.
		 */
		LIST_REMOVE(iod, nid_idle);
		mutex_enter(&iod->nid_lock);
		mutex_exit(&nfs_iodlist_lock);
		KASSERT(iod->nid_mount == NULL);
		iod->nid_mount = nmp;
		cv_signal(&iod->nid_cv);
		mutex_enter(&nmp->nm_lock);
		mutex_exit(&iod->nid_lock);
		nmp->nm_bufqiods++;
		if (nmp->nm_bufqlen < 2 * nmp->nm_bufqiods) {
			cv_broadcast(&nmp->nm_aiocv);
		}
	} else {
		mutex_exit(&nfs_iodlist_lock);
		mutex_enter(&nmp->nm_lock);
	}

	KASSERT(mutex_owned(&nmp->nm_lock));

	/*
	 * If we have an iod which can process the request, then queue
	 * the buffer.  However, even if we have an iod, do not initiate
	 * queue cleaning if curproc is the pageout daemon. if the NFS mount
	 * is via local loopback, we may put curproc (pagedaemon) to sleep
	 * waiting for the writes to complete. But the server (ourself)
	 * may block the write, waiting for its (ie., our) pagedaemon
	 * to produce clean pages to handle the write: deadlock.
	 * XXX: start non-loopback mounts straight away?  If "lots free",
	 * let pagedaemon start loopback writes anyway?
	 */
	if (nmp->nm_bufqiods > 0) {

		/*
		 * Ensure that the queue never grows too large.
		 */
		if (curlwp == uvm.pagedaemon_lwp) {
			/* Enque for later, to avoid free-page deadlock */
		} else while (nmp->nm_bufqlen >= 2 * nmp->nm_bufqiods) {
			if (catch) {
				error = cv_timedwait_sig(&nmp->nm_aiocv,
				    &nmp->nm_lock, slptimeo);
			} else {
				error = cv_timedwait(&nmp->nm_aiocv,
				    &nmp->nm_lock, slptimeo);
			}
			if (error) {
				if (nfs_sigintr(nmp, NULL, curlwp)) {
					mutex_exit(&nmp->nm_lock);
					return (EINTR);
				}
				if (catch) {
					catch = false;
					slptimeo = 2 * hz;
				}
			}

			/*
			 * We might have lost our iod while sleeping,
			 * so check and loop if necessary.
			 */

			if (nmp->nm_bufqiods == 0) {
				mutex_exit(&nmp->nm_lock);
				goto again;
			}
		}
		TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
		nmp->nm_bufqlen++;
		mutex_exit(&nmp->nm_lock);
		return (0);
	}
	mutex_exit(&nmp->nm_lock);

	/*
	 * All the iods are busy on other mounts, so return EIO to
	 * force the caller to process the i/o synchronously.
	 */
	return (EIO);
}
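/*
 * Caller-side sketch (illustrative, based on the comment above, not code
 * from this file): when nfs_asyncio() cannot queue the buffer, a caller
 * typically falls back to synchronous I/O,
 *
 *	if (nfs_asyncio(bp))
 *		error = nfs_doio(bp);
 *
 * while a pure readahead caller, as in nfs_bioread(), simply discards the
 * buffer with brelse(rabp, BC_INVAL) instead.
 */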
/*
 * nfs_doio for read.
 */
static int
nfs_doio_read(struct buf *bp, struct uio *uiop)
{
	struct vnode *vp = bp->b_vp;
	struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0;

	uiop->uio_rw = UIO_READ;
	switch (vp->v_type) {
	case VREG:
		nfsstats.read_bios++;
		error = nfs_readrpc(vp, uiop);
		if (!error && uiop->uio_resid) {
			int diff, len;

			/*
			 * If uio_resid > 0, there is a hole in the file and
			 * no writes after the hole have been pushed to
			 * the server yet or the file has been truncated
			 * on the server.
			 * Just zero fill the rest of the valid area.
			 */

			KASSERT(vp->v_size >=
			    uiop->uio_offset + uiop->uio_resid);
			diff = bp->b_bcount - uiop->uio_resid;
			len = uiop->uio_resid;
			memset((char *)bp->b_data + diff, 0, len);
			uiop->uio_resid = 0;
		}
		if (uiop->uio_lwp && (vp->v_iflag & VI_TEXT) &&
		    timespeccmp(&np->n_mtime, &np->n_vattr->va_mtime, !=)) {
			mutex_enter(proc_lock);
			killproc(uiop->uio_lwp->l_proc,
			    "process text file was modified");
			mutex_exit(proc_lock);
#if 0 /* XXX NJWLWP */
			uiop->uio_lwp->l_proc->p_holdcnt++;
#endif
		}
		break;
	case VLNK:
		KASSERT(uiop->uio_offset == (off_t)0);
		nfsstats.readlink_bios++;
		error = nfs_readlinkrpc(vp, uiop, np->n_rcred);
		break;
	case VDIR:
		nfsstats.readdir_bios++;
		uiop->uio_offset = bp->b_dcookie;
#ifndef NFS_V2_ONLY
		if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
			error = nfs_readdirplusrpc(vp, uiop,
			    curlwp->l_cred);
			/*
			 * nfs_request maps NFSERR_NOTSUPP to ENOTSUP.
			 */
			if (error == ENOTSUP)
				nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
		}
#else
		nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
#endif
		if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
			error = nfs_readdirrpc(vp, uiop,
			    curlwp->l_cred);
		if (!error) {
			bp->b_dcookie = uiop->uio_offset;
		}
		break;
	default:
		printf("nfs_doio: type %x unexpected\n", vp->v_type);
		break;
	}
	bp->b_error = error;
	return error;
}
/*
 * nfs_doio for write.
 */
static int
nfs_doio_write(struct buf *bp, struct uio *uiop)
{
	struct vnode *vp = bp->b_vp;
	struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int iomode;
	bool stalewriteverf = false;
	int i, npages = (bp->b_bcount + PAGE_SIZE - 1) >> PAGE_SHIFT;
	struct vm_page **pgs, *spgs[UBC_MAX_PAGES];
#ifndef NFS_V2_ONLY
	bool needcommit = true; /* need only COMMIT RPC */
#else
	bool needcommit = false; /* need only COMMIT RPC */
#endif
	bool pageprotected;
	struct uvm_object *uobj = &vp->v_uobj;
	int error;
	off_t off, cnt;

	if (npages < __arraycount(spgs))
		pgs = spgs;
	else {
		if ((pgs = kmem_alloc(sizeof(*pgs) * npages,
		    KM_NOSLEEP)) == NULL)
			return ENOMEM;
	}

	if ((bp->b_flags & B_ASYNC) != 0 && NFS_ISV3(vp)) {
		iomode = NFSV3WRITE_UNSTABLE;
	} else {
		iomode = NFSV3WRITE_FILESYNC;
	}

again:
	rw_enter(&nmp->nm_writeverflock, RW_READER);
	for (i = 0; i < npages; i++) {
		pgs[i] = uvm_pageratop((vaddr_t)bp->b_data +
		    (i << PAGE_SHIFT));
		if (pgs[i]->uobject == uobj &&
		    pgs[i]->offset == uiop->uio_offset + (i << PAGE_SHIFT)) {
			KASSERT(pgs[i]->flags & PG_BUSY);
			/*
			 * this page belongs to our object.
			 */
			mutex_enter(&uobj->vmobjlock);
			/*
			 * write out the page stably if it's about to
			 * be released because we can't resend it
			 * on the server crash.
			 *
			 * XXX assuming PG_RELEASE|PG_PAGEOUT won't be
			 * changed until unbusy the page.
			 */
			if (pgs[i]->flags & (PG_RELEASED|PG_PAGEOUT))
				iomode = NFSV3WRITE_FILESYNC;
			/*
			 * if we met a page which hasn't been sent yet,
			 * we need do WRITE RPC.
			 */
			if ((pgs[i]->flags & PG_NEEDCOMMIT) == 0)
				needcommit = false;
			mutex_exit(&uobj->vmobjlock);
		} else {
			iomode = NFSV3WRITE_FILESYNC;
			needcommit = false;
		}
	}
	if (!needcommit && iomode == NFSV3WRITE_UNSTABLE) {
		mutex_enter(&uobj->vmobjlock);
		for (i = 0; i < npages; i++) {
			pgs[i]->flags |= PG_NEEDCOMMIT | PG_RDONLY;
			pmap_page_protect(pgs[i], VM_PROT_READ);
		}
		mutex_exit(&uobj->vmobjlock);
		pageprotected = true; /* pages can't be modified during i/o. */
	} else {
		pageprotected = false;
	}
	/*
	 * Send the data to the server if necessary,
	 * otherwise just send a commit rpc.
	 */
	if (needcommit) {
		/*
		 * If the buffer is in the range that we already committed,
		 * there's nothing to do.
		 *
		 * If it's in the range that we need to commit, push the
		 * whole range at once, otherwise only push the buffer.
		 * In both these cases, acquire the commit lock to avoid
		 * other processes modifying the range.
		 */

		off = uiop->uio_offset;
		cnt = bp->b_bcount;
		mutex_enter(&np->n_commitlock);
		if (!nfs_in_committed_range(vp, off, bp->b_bcount)) {
			bool pushedrange;

			if (nfs_in_tobecommitted_range(vp, off, bp->b_bcount)) {
				pushedrange = true;
				off = np->n_pushlo;
				cnt = np->n_pushhi - np->n_pushlo;
			} else {
				pushedrange = false;
			}
			error = nfs_commit(vp, off, cnt, curlwp);
			if (error == 0) {
				if (pushedrange) {
					nfs_merge_commit_ranges(vp);
				} else {
					nfs_add_committed_range(vp, off, cnt);
				}
			}
		} else {
			error = 0;
		}
		mutex_exit(&np->n_commitlock);
		rw_exit(&nmp->nm_writeverflock);
		if (!error) {
			/*
			 * pages are now on stable storage.
			 */
			uiop->uio_resid = 0;
			mutex_enter(&uobj->vmobjlock);
			for (i = 0; i < npages; i++) {
				pgs[i]->flags &= ~(PG_NEEDCOMMIT | PG_RDONLY);
			}
			mutex_exit(&uobj->vmobjlock);
			goto out;
		} else if (error == NFSERR_STALEWRITEVERF) {
			nfs_clearcommit(vp->v_mount);
			goto again;
		}
		if (error) {
			bp->b_error = np->n_error = error;
			np->n_flag |= NWRITEERR;
		}
		goto out;
	}
	off = uiop->uio_offset;
	cnt = bp->b_bcount;
	uiop->uio_rw = UIO_WRITE;
	nfsstats.write_bios++;
	error = nfs_writerpc(vp, uiop, &iomode, pageprotected,
	    &stalewriteverf);
	if (!error && iomode == NFSV3WRITE_UNSTABLE) {
		/*
		 * we need to commit pages later.
		 */
		mutex_enter(&np->n_commitlock);
		nfs_add_tobecommitted_range(vp, off, cnt);
		/*
		 * if there can be too many uncommitted pages, commit them now.
		 */
		if (np->n_pushhi - np->n_pushlo > nfs_commitsize) {
			off = np->n_pushlo;
			cnt = nfs_commitsize >> 1;
			error = nfs_commit(vp, off, cnt, curlwp);
			if (!error) {
				nfs_add_committed_range(vp, off, cnt);
				nfs_del_tobecommitted_range(vp, off, cnt);
			}
			if (error == NFSERR_STALEWRITEVERF) {
				stalewriteverf = true;
				error = 0; /* it isn't a real error */
			}
		} else {
			/*
			 * re-dirty pages so that they will be passed
			 * to us later again.
			 */
			mutex_enter(&uobj->vmobjlock);
			for (i = 0; i < npages; i++) {
				pgs[i]->flags &= ~PG_CLEAN;
			}
			mutex_exit(&uobj->vmobjlock);
		}
		mutex_exit(&np->n_commitlock);
	} else if (!error) {
		/*
		 * pages are now on stable storage.
		 */
		mutex_enter(&np->n_commitlock);
		nfs_del_committed_range(vp, off, cnt);
		mutex_exit(&np->n_commitlock);
		mutex_enter(&uobj->vmobjlock);
		for (i = 0; i < npages; i++) {
			pgs[i]->flags &= ~(PG_NEEDCOMMIT | PG_RDONLY);
		}
		mutex_exit(&uobj->vmobjlock);
	} else {
		bp->b_error = np->n_error = error;
		np->n_flag |= NWRITEERR;
	}

	rw_exit(&nmp->nm_writeverflock);

	if (stalewriteverf) {
		nfs_clearcommit(vp->v_mount);
	}
out:
	if (pgs != spgs)
		kmem_free(pgs, sizeof(*pgs) * npages);
	return error;
}
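/*
 * Illustrative summary (not original code) of the NFSv3 two-phase write
 * implemented above: an async buffer is first sent with iomode ==
 * NFSV3WRITE_UNSTABLE, its pages are marked PG_NEEDCOMMIT | PG_RDONLY and
 * the byte range is recorded with nfs_add_tobecommitted_range().  Later a
 * single
 *
 *	error = nfs_commit(vp, off, cnt, curlwp);
 *
 * covering the accumulated [n_pushlo, n_pushhi) range moves the data to
 * stable storage, after which the range migrates to the committed list
 * and PG_NEEDCOMMIT is cleared.  If the server reboots in between, the
 * write verifier changes (NFSERR_STALEWRITEVERF) and nfs_clearcommit()
 * forces the affected pages to be written again.
 */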
/*
 * nfs_doio for B_PHYS.
 */
static int
nfs_doio_phys(struct buf *bp, struct uio *uiop)
{
	struct vnode *vp = bp->b_vp;
	int error;

	uiop->uio_offset = ((off_t)bp->b_blkno) << DEV_BSHIFT;
	if (bp->b_flags & B_READ) {
		uiop->uio_rw = UIO_READ;
		nfsstats.read_physios++;
		error = nfs_readrpc(vp, uiop);
	} else {
		int iomode = NFSV3WRITE_DATASYNC;
		bool stalewriteverf;
		struct nfsmount *nmp = VFSTONFS(vp->v_mount);

		uiop->uio_rw = UIO_WRITE;
		nfsstats.write_physios++;
		rw_enter(&nmp->nm_writeverflock, RW_READER);
		error = nfs_writerpc(vp, uiop, &iomode, false, &stalewriteverf);
		rw_exit(&nmp->nm_writeverflock);
		if (stalewriteverf) {
			nfs_clearcommit(bp->b_vp->v_mount);
		}
	}
	bp->b_error = error;
	return error;
}
/*
 * Do an I/O operation to/from a cache block. This may be called
 * synchronously or from an nfsiod.
 */
int
nfs_doio(struct buf *bp)
{
	int error;
	struct uio uio;
	struct uio *uiop = &uio;
	struct iovec io;
	UVMHIST_FUNC("nfs_doio"); UVMHIST_CALLED(ubchist);

	uiop->uio_iov = &io;
	uiop->uio_iovcnt = 1;
	uiop->uio_offset = (((off_t)bp->b_blkno) << DEV_BSHIFT);
	UIO_SETUP_SYSSPACE(uiop);
	io.iov_base = bp->b_data;
	io.iov_len = uiop->uio_resid = bp->b_bcount;

	/*
	 * Historically, paging was done with physio, but no more...
	 */
	if (bp->b_flags & B_PHYS) {
		/*
		 * ...though reading /dev/drum still gets us here.
		 */
		error = nfs_doio_phys(bp, uiop);
	} else if (bp->b_flags & B_READ) {
		error = nfs_doio_read(bp, uiop);
	} else {
		error = nfs_doio_write(bp, uiop);
	}
	bp->b_resid = uiop->uio_resid;
	biodone(bp);
	return (error);
}
/*
 * Vnode op for VM getpages.
 */
int
nfs_getpages(void *v)
{
	struct vop_getpages_args /* {
		struct vnode *a_vp;
		voff_t a_offset;
		struct vm_page **a_m;
		int *a_count;
		int a_centeridx;
		vm_prot_t a_access_type;
		int a_advice;
		int a_flags;
	} */ *ap = v;

	struct vnode *vp = ap->a_vp;
	struct uvm_object *uobj = &vp->v_uobj;
	struct nfsnode *np = VTONFS(vp);
	const int npages = *ap->a_count;
	struct vm_page *pg, **pgs, **opgs, *spgs[UBC_MAX_PAGES];
	off_t origoffset, len;
	int i, error;
	bool v3 = NFS_ISV3(vp);
	bool write = (ap->a_access_type & VM_PROT_WRITE) != 0;
	bool locked = (ap->a_flags & PGO_LOCKED) != 0;

	/*
	 * If we are not locked we are not really using opgs,
	 * so just initialize it
	 */
	if (!locked || npages < __arraycount(spgs))
		opgs = spgs;
	else {
		if ((opgs = kmem_alloc(npages * sizeof(*opgs),
		    KM_NOSLEEP)) == NULL)
			return ENOMEM;
	}

	/*
	 * call the genfs code to get the pages.  `pgs' may be NULL
	 * when doing read-ahead.
	 */
	pgs = ap->a_m;
	if (write && locked && v3) {
		KASSERT(pgs != NULL);
#ifdef DEBUG
		/*
		 * If PGO_LOCKED is set, real pages shouldn't exists
		 * in the array.
		 */
		for (i = 0; i < npages; i++)
			KDASSERT(pgs[i] == NULL || pgs[i] == PGO_DONTCARE);
#endif
		memcpy(opgs, pgs, npages * sizeof(struct vm_page *));
	}
	error = genfs_getpages(v);
	if (error)
		goto out;

	/*
	 * for read faults where the nfs node is not yet marked NMODIFIED,
	 * set PG_RDONLY on the pages so that we come back here if someone
	 * tries to modify later via the mapping that will be entered for
	 * the fault.
	 */

	if (!write && (np->n_flag & NMODIFIED) == 0 && pgs != NULL) {
		if (!locked) {
			mutex_enter(&uobj->vmobjlock);
		}
		for (i = 0; i < npages; i++) {
			pg = pgs[i];
			if (pg == NULL || pg == PGO_DONTCARE) {
				continue;
			}
			pg->flags |= PG_RDONLY;
		}
		if (!locked) {
			mutex_exit(&uobj->vmobjlock);
		}
	}
	if (!write)
		goto out;

	/*
	 * this is a write fault, update the commit info.
	 */

	origoffset = ap->a_offset;
	len = npages << PAGE_SHIFT;

	if (v3) {
		if (!locked) {
			mutex_enter(&np->n_commitlock);
		} else {
			if (!mutex_tryenter(&np->n_commitlock)) {

				/*
				 * Since PGO_LOCKED is set, we need to unbusy
				 * all pages fetched by genfs_getpages() above,
				 * tell the caller that there are no pages
				 * available and put back original pgs array.
				 */

				mutex_enter(&uvm_pageqlock);
				uvm_page_unbusy(pgs, npages);
				mutex_exit(&uvm_pageqlock);
				*ap->a_count = 0;
				memcpy(pgs, opgs,
				    npages * sizeof(struct vm_page *));
				error = EBUSY;
				goto out;
			}
		}
		nfs_del_committed_range(vp, origoffset, len);
		nfs_del_tobecommitted_range(vp, origoffset, len);
	}
	np->n_flag |= NMODIFIED;
	if (!locked) {
		mutex_enter(&uobj->vmobjlock);
	}
	for (i = 0; i < npages; i++) {
		pg = pgs[i];
		if (pg == NULL || pg == PGO_DONTCARE) {
			continue;
		}
		pg->flags &= ~(PG_NEEDCOMMIT | PG_RDONLY);
	}
	if (!locked) {
		mutex_exit(&uobj->vmobjlock);
	}
	if (v3) {
		mutex_exit(&np->n_commitlock);
	}
out:
	if (opgs != spgs)
		kmem_free(opgs, sizeof(*opgs) * npages);
	return error;
}
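/*
 * Closing note (an explanatory summary of nfs_getpages() above, not
 * original code): read faults on a node without NMODIFIED hand out pages
 * with PG_RDONLY set, so a later write access faults again and reaches
 * the write path above, where the commit bookkeeping
 * (nfs_del_committed_range()/nfs_del_tobecommitted_range()) is updated
 * and PG_NEEDCOMMIT | PG_RDONLY are cleared before the pages are mapped
 * writable.
 */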