1 /* $NetBSD: genfs_io.c,v 1.20 2009/04/18 15:40:33 pooka Exp $ */
4 * Copyright (c) 1982, 1986, 1989, 1993
5 * The Regents of the University of California. All rights reserved.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 * 3. Neither the name of the University nor the names of its contributors
16 * may be used to endorse or promote products derived from this software
17 * without specific prior written permission.
19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 #include <sys/cdefs.h>
34 __KERNEL_RCSID(0, "$NetBSD: genfs_io.c,v 1.20 2009/04/18 15:40:33 pooka Exp $");
36 #include <sys/param.h>
37 #include <sys/systm.h>
39 #include <sys/kernel.h>
40 #include <sys/mount.h>
41 #include <sys/namei.h>
42 #include <sys/vnode.h>
43 #include <sys/fcntl.h>
48 #include <sys/kauth.h>
49 #include <sys/fstrans.h>
52 #include <miscfs/genfs/genfs.h>
53 #include <miscfs/genfs/genfs_node.h>
54 #include <miscfs/specfs/specdev.h>
57 #include <uvm/uvm_pager.h>
59 static int genfs_do_directio(struct vmspace
*, vaddr_t
, size_t, struct vnode
*,
61 static void genfs_dio_iodone(struct buf
*);
63 static int genfs_do_io(struct vnode
*, off_t
, vaddr_t
, size_t, int, enum uio_rw
,
64 void (*)(struct buf
*));
65 static inline void genfs_rel_pages(struct vm_page
**, int);
67 int genfs_maxdio
= MAXPHYS
;
70 genfs_rel_pages(struct vm_page
**pgs
, int npages
)
74 for (i
= 0; i
< npages
; i
++) {
75 struct vm_page
*pg
= pgs
[i
];
77 if (pg
== NULL
|| pg
== PGO_DONTCARE
)
79 if (pg
->flags
& PG_FAKE
) {
80 pg
->flags
|= PG_RELEASED
;
83 mutex_enter(&uvm_pageqlock
);
84 uvm_page_unbusy(pgs
, npages
);
85 mutex_exit(&uvm_pageqlock
);
89 * generic VM getpages routine.
90 * Return PG_BUSY pages for the given range,
91 * reading from backing store if necessary.
95 genfs_getpages(void *v
)
97 struct vop_getpages_args
/* {
100 struct vm_page **a_m;
103 vm_prot_t a_access_type;
108 off_t newsize
, diskeof
, memeof
;
109 off_t offset
, origoffset
, startoffset
, endoffset
;
111 int i
, error
, npages
, orignpages
, npgs
, run
, ridx
, pidx
, pcount
;
112 int fs_bshift
, fs_bsize
, dev_bshift
;
113 const int flags
= ap
->a_flags
;
114 size_t bytes
, iobytes
, tailstart
, tailbytes
, totalbytes
, skipbytes
;
116 struct buf
*bp
, *mbp
;
117 struct vnode
*vp
= ap
->a_vp
;
119 struct genfs_node
*gp
= VTOG(vp
);
120 struct uvm_object
*uobj
= &vp
->v_uobj
;
121 struct vm_page
*pg
, **pgs
, *pgs_onstack
[UBC_MAX_PAGES
];
123 kauth_cred_t cred
= curlwp
->l_cred
; /* XXXUBC curlwp */
124 const bool async
= (flags
& PGO_SYNCIO
) == 0;
125 const bool write
= (ap
->a_access_type
& VM_PROT_WRITE
) != 0;
126 bool sawhole
= false;
127 bool has_trans
= false;
128 const bool overwrite
= (flags
& PGO_OVERWRITE
) != 0;
129 const bool blockalloc
= write
&& (flags
& PGO_NOBLOCKALLOC
) == 0;
131 UVMHIST_FUNC("genfs_getpages"); UVMHIST_CALLED(ubchist
);
133 UVMHIST_LOG(ubchist
, "vp %p off 0x%x/%x count %d",
134 vp
, ap
->a_offset
>> 32, ap
->a_offset
, *ap
->a_count
);
136 KASSERT(vp
->v_type
== VREG
|| vp
->v_type
== VDIR
||
137 vp
->v_type
== VLNK
|| vp
->v_type
== VBLK
);
144 origvsize
= vp
->v_size
;
145 origoffset
= ap
->a_offset
;
146 orignpages
= *ap
->a_count
;
147 GOP_SIZE(vp
, origvsize
, &diskeof
, 0);
148 if (flags
& PGO_PASTEOF
) {
149 #if defined(DIAGNOSTIC)
151 #endif /* defined(DIAGNOSTIC) */
153 newsize
= MAX(origvsize
,
154 origoffset
+ (orignpages
<< PAGE_SHIFT
));
155 GOP_SIZE(vp
, newsize
, &memeof
, GOP_SIZE_MEM
);
156 #if defined(DIAGNOSTIC)
157 GOP_SIZE(vp
, vp
->v_writesize
, &writeeof
, GOP_SIZE_MEM
);
158 if (newsize
> round_page(writeeof
)) {
159 panic("%s: past eof", __func__
);
161 #endif /* defined(DIAGNOSTIC) */
163 GOP_SIZE(vp
, origvsize
, &memeof
, GOP_SIZE_MEM
);
165 KASSERT(ap
->a_centeridx
>= 0 || ap
->a_centeridx
<= orignpages
);
166 KASSERT((origoffset
& (PAGE_SIZE
- 1)) == 0 && origoffset
>= 0);
167 KASSERT(orignpages
> 0);
170 * Bounds-check the request.
173 if (origoffset
+ (ap
->a_centeridx
<< PAGE_SHIFT
) >= memeof
) {
174 if ((flags
& PGO_LOCKED
) == 0) {
175 mutex_exit(&uobj
->vmobjlock
);
177 UVMHIST_LOG(ubchist
, "off 0x%x count %d goes past EOF 0x%x",
178 origoffset
, *ap
->a_count
, memeof
,0);
185 if ((flags
& PGO_NOTIMESTAMP
) == 0 &&
186 (vp
->v_type
!= VBLK
||
187 (vp
->v_mount
->mnt_flag
& MNT_NODEVMTIME
) == 0)) {
190 if ((vp
->v_mount
->mnt_flag
& MNT_NOATIME
) == 0) {
191 updflags
= GOP_UPDATE_ACCESSED
;
194 updflags
|= GOP_UPDATE_MODIFIED
;
197 GOP_MARKUPDATE(vp
, updflags
);
203 if ((vp
->v_iflag
& VI_ONWORKLST
) == 0) {
204 vn_syncer_add_to_worklist(vp
, filedelay
);
206 if ((vp
->v_iflag
& (VI_WRMAP
|VI_WRMAPDIRTY
)) == VI_WRMAP
) {
207 vp
->v_iflag
|= VI_WRMAPDIRTY
;
212 * For PGO_LOCKED requests, just return whatever's in memory.
215 if (flags
& PGO_LOCKED
) {
218 npages
= *ap
->a_count
;
220 for (i
= 0; i
< npages
; i
++) {
222 KASSERT(pg
== NULL
|| pg
== PGO_DONTCARE
);
224 #endif /* defined(DEBUG) */
225 nfound
= uvn_findpages(uobj
, origoffset
, &npages
,
226 ap
->a_m
, UFP_NOWAIT
|UFP_NOALLOC
|(write
? UFP_NORDONLY
: 0));
227 KASSERT(npages
== *ap
->a_count
);
232 if (!rw_tryenter(&gp
->g_glock
, RW_READER
)) {
233 genfs_rel_pages(ap
->a_m
, npages
);
239 for (i
= 0; i
< npages
; i
++) {
242 if (pg
!= NULL
|| pg
!= PGO_DONTCARE
) {
247 rw_exit(&gp
->g_glock
);
249 error
= (ap
->a_m
[ap
->a_centeridx
] == NULL
? EBUSY
: 0);
252 mutex_exit(&uobj
->vmobjlock
);
255 * find the requested pages and make some simple checks.
256 * leave space in the page array for a whole block.
259 if (vp
->v_type
!= VBLK
) {
260 fs_bshift
= vp
->v_mount
->mnt_fs_bshift
;
261 dev_bshift
= vp
->v_mount
->mnt_dev_bshift
;
263 fs_bshift
= DEV_BSHIFT
;
264 dev_bshift
= DEV_BSHIFT
;
266 fs_bsize
= 1 << fs_bshift
;
268 orignpages
= MIN(orignpages
,
269 round_page(memeof
- origoffset
) >> PAGE_SHIFT
);
271 startoffset
= origoffset
& ~(fs_bsize
- 1);
272 endoffset
= round_page((origoffset
+ (npages
<< PAGE_SHIFT
) +
273 fs_bsize
- 1) & ~(fs_bsize
- 1));
274 endoffset
= MIN(endoffset
, round_page(memeof
));
275 ridx
= (origoffset
- startoffset
) >> PAGE_SHIFT
;
277 pgs_size
= sizeof(struct vm_page
*) *
278 ((endoffset
- startoffset
) >> PAGE_SHIFT
);
279 if (pgs_size
> sizeof(pgs_onstack
)) {
280 pgs
= kmem_zalloc(pgs_size
, async
? KM_NOSLEEP
: KM_SLEEP
);
288 (void)memset(pgs
, 0, pgs_size
);
292 UVMHIST_LOG(ubchist
, "ridx %d npages %d startoff %ld endoff %ld",
293 ridx
, npages
, startoffset
, endoffset
);
296 fstrans_start(vp
->v_mount
, FSTRANS_SHARED
);
301 * hold g_glock to prevent a race with truncate.
303 * check if our idea of v_size is still valid.
307 rw_enter(&gp
->g_glock
, RW_WRITER
);
309 rw_enter(&gp
->g_glock
, RW_READER
);
311 mutex_enter(&uobj
->vmobjlock
);
312 if (vp
->v_size
< origvsize
) {
313 rw_exit(&gp
->g_glock
);
314 if (pgs
!= pgs_onstack
)
315 kmem_free(pgs
, pgs_size
);
319 if (uvn_findpages(uobj
, origoffset
, &npages
, &pgs
[ridx
],
320 async
? UFP_NOWAIT
: UFP_ALL
) != orignpages
) {
321 rw_exit(&gp
->g_glock
);
323 genfs_rel_pages(&pgs
[ridx
], orignpages
);
324 mutex_exit(&uobj
->vmobjlock
);
330 * if the pages are already resident, just return them.
333 for (i
= 0; i
< npages
; i
++) {
334 struct vm_page
*pg1
= pgs
[ridx
+ i
];
336 if ((pg1
->flags
& PG_FAKE
) ||
337 (blockalloc
&& (pg1
->flags
& PG_RDONLY
))) {
342 rw_exit(&gp
->g_glock
);
343 UVMHIST_LOG(ubchist
, "returning cached pages", 0,0,0,0);
349 * if PGO_OVERWRITE is set, don't bother reading the pages.
353 rw_exit(&gp
->g_glock
);
354 UVMHIST_LOG(ubchist
, "PGO_OVERWRITE",0,0,0,0);
356 for (i
= 0; i
< npages
; i
++) {
357 struct vm_page
*pg1
= pgs
[ridx
+ i
];
359 pg1
->flags
&= ~(PG_RDONLY
|PG_CLEAN
);
366 * the page wasn't resident and we're not overwriting,
367 * so we're going to have to do some i/o.
368 * find any additional pages needed to cover the expanded range.
371 npages
= (endoffset
- startoffset
) >> PAGE_SHIFT
;
372 if (startoffset
!= origoffset
|| npages
!= orignpages
) {
375 * we need to avoid deadlocks caused by locking
376 * additional pages at lower offsets than pages we
377 * already have locked. unlock them all and start over.
380 genfs_rel_pages(&pgs
[ridx
], orignpages
);
381 memset(pgs
, 0, pgs_size
);
383 UVMHIST_LOG(ubchist
, "reset npages start 0x%x end 0x%x",
384 startoffset
, endoffset
, 0,0);
386 if (uvn_findpages(uobj
, startoffset
, &npgs
, pgs
,
387 async
? UFP_NOWAIT
: UFP_ALL
) != npages
) {
388 rw_exit(&gp
->g_glock
);
390 genfs_rel_pages(pgs
, npages
);
391 mutex_exit(&uobj
->vmobjlock
);
396 mutex_exit(&uobj
->vmobjlock
);
399 * read the desired page(s).
402 totalbytes
= npages
<< PAGE_SHIFT
;
403 bytes
= MIN(totalbytes
, MAX(diskeof
- startoffset
, 0));
404 tailbytes
= totalbytes
- bytes
;
407 kva
= uvm_pagermapin(pgs
, npages
,
408 UVMPAGER_MAPIN_READ
| UVMPAGER_MAPIN_WAITOK
);
410 mbp
= getiobuf(vp
, true);
411 mbp
->b_bufsize
= totalbytes
;
412 mbp
->b_data
= (void *)kva
;
413 mbp
->b_resid
= mbp
->b_bcount
= bytes
;
414 mbp
->b_cflags
= BC_BUSY
;
416 mbp
->b_flags
= B_READ
| B_ASYNC
;
417 mbp
->b_iodone
= uvm_aio_biodone
;
419 mbp
->b_flags
= B_READ
;
420 mbp
->b_iodone
= NULL
;
423 BIO_SETPRIO(mbp
, BPRIO_TIMELIMITED
);
425 BIO_SETPRIO(mbp
, BPRIO_TIMECRITICAL
);
428 * if EOF is in the middle of the range, zero the part past EOF.
429 * skip over pages which are not PG_FAKE since in that case they have
430 * valid data that we need to preserve.
434 while (tailbytes
> 0) {
435 const int len
= PAGE_SIZE
- (tailstart
& PAGE_MASK
);
437 KASSERT(len
<= tailbytes
);
438 if ((pgs
[tailstart
>> PAGE_SHIFT
]->flags
& PG_FAKE
) != 0) {
439 memset((void *)(kva
+ tailstart
), 0, len
);
440 UVMHIST_LOG(ubchist
, "tailbytes %p 0x%x 0x%x",
441 kva
, tailstart
, len
, 0);
448 * now loop over the pages, reading as needed.
452 for (offset
= startoffset
;
454 offset
+= iobytes
, bytes
-= iobytes
) {
457 * skip pages which don't need to be read.
460 pidx
= (offset
- startoffset
) >> PAGE_SHIFT
;
461 while ((pgs
[pidx
]->flags
& PG_FAKE
) == 0) {
464 KASSERT((offset
& (PAGE_SIZE
- 1)) == 0);
465 if ((pgs
[pidx
]->flags
& PG_RDONLY
)) {
468 b
= MIN(PAGE_SIZE
, bytes
);
473 UVMHIST_LOG(ubchist
, "skipping, new offset 0x%x",
481 * bmap the file to find out the blkno to read from and
482 * how much we can read in one i/o. if bmap returns an error,
483 * skip the rest of the top-level i/o.
486 lbn
= offset
>> fs_bshift
;
487 error
= VOP_BMAP(vp
, lbn
, &devvp
, &blkno
, &run
);
489 UVMHIST_LOG(ubchist
, "VOP_BMAP lbn 0x%x -> %d\n",
496 * see how many pages can be read with this i/o.
497 * reduce the i/o size if necessary to avoid
498 * overwriting pages with valid data.
501 iobytes
= MIN((((off_t
)lbn
+ 1 + run
) << fs_bshift
) - offset
,
503 if (offset
+ iobytes
> round_page(offset
)) {
505 while (pidx
+ pcount
< npages
&&
506 pgs
[pidx
+ pcount
]->flags
& PG_FAKE
) {
509 iobytes
= MIN(iobytes
, (pcount
<< PAGE_SHIFT
) -
510 (offset
- trunc_page(offset
)));
514 * if this block isn't allocated, zero it instead of
515 * reading it. unless we are going to allocate blocks,
516 * mark the pages we zeroed PG_RDONLY.
520 int holepages
= (round_page(offset
+ iobytes
) -
521 trunc_page(offset
)) >> PAGE_SHIFT
;
522 UVMHIST_LOG(ubchist
, "lbn 0x%x -> HOLE", lbn
,0,0,0);
525 memset((char *)kva
+ (offset
- startoffset
), 0,
527 skipbytes
+= iobytes
;
529 for (i
= 0; i
< holepages
; i
++) {
531 pgs
[pidx
+ i
]->flags
&= ~PG_CLEAN
;
534 pgs
[pidx
+ i
]->flags
|= PG_RDONLY
;
541 * allocate a sub-buf for this piece of the i/o
542 * (or just use mbp if there's only 1 piece),
543 * and start it going.
546 if (offset
== startoffset
&& iobytes
== bytes
) {
549 bp
= getiobuf(vp
, true);
550 nestiobuf_setup(mbp
, bp
, offset
- startoffset
, iobytes
);
554 /* adjust physical blkno for partial blocks */
555 bp
->b_blkno
= blkno
+ ((offset
- ((off_t
)lbn
<< fs_bshift
)) >>
559 "bp %p offset 0x%x bcount 0x%x blkno 0x%x",
560 bp
, offset
, iobytes
, bp
->b_blkno
);
562 VOP_STRATEGY(devvp
, bp
);
566 nestiobuf_done(mbp
, skipbytes
, error
);
568 UVMHIST_LOG(ubchist
, "returning 0 (async)",0,0,0,0);
569 rw_exit(&gp
->g_glock
);
574 error
= biowait(mbp
);
577 /* Remove the mapping (make KVA available as soon as possible) */
578 uvm_pagermapout(kva
, npages
);
581 * if we encountered a hole then we have to do a little more work.
582 * for read faults, we marked the page PG_RDONLY so that future
583 * write accesses to the page will fault again.
584 * for write faults, we must make sure that the backing store for
585 * the page is completely allocated while the pages are locked.
588 if (!error
&& sawhole
&& blockalloc
) {
590 * XXX: This assumes that we come here only via
593 if (vp
->v_mount
->mnt_wapbl
) {
594 error
= WAPBL_BEGIN(vp
->v_mount
);
598 error
= GOP_ALLOC(vp
, startoffset
,
599 npages
<< PAGE_SHIFT
, 0, cred
);
600 if (vp
->v_mount
->mnt_wapbl
) {
601 WAPBL_END(vp
->v_mount
);
605 UVMHIST_LOG(ubchist
, "gop_alloc off 0x%x/0x%x -> %d",
606 startoffset
, npages
<< PAGE_SHIFT
, error
,0);
608 for (i
= 0; i
< npages
; i
++) {
609 if (pgs
[i
] == NULL
) {
612 pgs
[i
]->flags
&= ~(PG_CLEAN
|PG_RDONLY
);
613 UVMHIST_LOG(ubchist
, "mark dirty pg %p",
618 rw_exit(&gp
->g_glock
);
622 mutex_enter(&uobj
->vmobjlock
);
625 * we're almost done! release the pages...
626 * for errors, we free the pages.
627 * otherwise we activate them and mark them as valid and clean.
628 * also, unbusy pages that were not actually requested.
632 for (i
= 0; i
< npages
; i
++) {
633 if (pgs
[i
] == NULL
) {
636 UVMHIST_LOG(ubchist
, "examining pg %p flags 0x%x",
637 pgs
[i
], pgs
[i
]->flags
, 0,0);
638 if (pgs
[i
]->flags
& PG_FAKE
) {
639 pgs
[i
]->flags
|= PG_RELEASED
;
642 mutex_enter(&uvm_pageqlock
);
643 uvm_page_unbusy(pgs
, npages
);
644 mutex_exit(&uvm_pageqlock
);
645 mutex_exit(&uobj
->vmobjlock
);
646 UVMHIST_LOG(ubchist
, "returning error %d", error
,0,0,0);
651 UVMHIST_LOG(ubchist
, "succeeding, npages %d", npages
,0,0,0);
653 mutex_enter(&uvm_pageqlock
);
654 for (i
= 0; i
< npages
; i
++) {
659 UVMHIST_LOG(ubchist
, "examining pg %p flags 0x%x",
661 if (pg
->flags
& PG_FAKE
&& !overwrite
) {
662 pg
->flags
&= ~(PG_FAKE
);
663 pmap_clear_modify(pgs
[i
]);
665 KASSERT(!write
|| !blockalloc
|| (pg
->flags
& PG_RDONLY
) == 0);
666 if (i
< ridx
|| i
>= ridx
+ orignpages
|| async
) {
667 UVMHIST_LOG(ubchist
, "unbusy pg %p offset 0x%x",
669 if (pg
->flags
& PG_WANTED
) {
672 if (pg
->flags
& PG_FAKE
) {
676 if (pg
->flags
& PG_RELEASED
) {
681 pg
->flags
&= ~(PG_WANTED
|PG_BUSY
|PG_FAKE
);
682 UVM_PAGE_OWN(pg
, NULL
);
685 mutex_exit(&uvm_pageqlock
);
686 mutex_exit(&uobj
->vmobjlock
);
687 if (ap
->a_m
!= NULL
) {
688 memcpy(ap
->a_m
, &pgs
[ridx
],
689 orignpages
* sizeof(struct vm_page
*));
693 if (pgs
!= NULL
&& pgs
!= pgs_onstack
)
694 kmem_free(pgs
, pgs_size
);
696 fstrans_done(vp
->v_mount
);
701 * generic VM putpages routine.
702 * Write the given range of pages to backing store.
704 * => "offhi == 0" means flush all pages at or after "offlo".
705 * => object should be locked by caller. we return with the
707 * => if PGO_CLEANIT or PGO_SYNCIO is set, we may block (due to I/O).
708 * thus, a caller might want to unlock higher level resources
709 * (e.g. vm_map) before calling flush.
710 * => if neither PGO_CLEANIT nor PGO_SYNCIO is set, we will not block
711 * => if PGO_ALLPAGES is set, then all pages in the object will be processed.
712 * => NOTE: we rely on the fact that the object's memq is a TAILQ and
713 * that new pages are inserted on the tail end of the list. thus,
714 * we can make a complete pass through the object in one go by starting
715 * at the head and working towards the tail (new pages are put in
717 * => NOTE: we are allowed to lock the page queues, so the caller
718 * must not be holding the page queue lock.
720 * note on "cleaning" object and PG_BUSY pages:
721 * this routine is holding the lock on the object. the only time
722 * that it can run into a PG_BUSY page that it does not own is if
723 * some other process has started I/O on the page (e.g. either
724 * a pagein, or a pageout). if the PG_BUSY page is being paged
725 * in, then it can not be dirty (!PG_CLEAN) because no one has
726 * had a chance to modify it yet. if the PG_BUSY page is being
727 * paged out then it means that someone else has already started
728 * cleaning the page for us (how nice!). in this case, if we
729 * have syncio specified, then after we make our pass through the
730 * object we need to wait for the other PG_BUSY pages to clear
731 * off (i.e. we need to do an iosync). also note that once a
732 * page is PG_BUSY it must stay in its object until it is un-busyed.
734 * note on page traversal:
735 * we can traverse the pages in an object either by going down the
736 * linked list in "uobj->memq", or we can go over the address range
737 * by page doing hash table lookups for each address. depending
738 * on how many pages are in the object it may be cheaper to do one
739 * or the other. we set "by_list" to true if we are using memq.
740 * if the cost of a hash lookup was equal to the cost of the list
741 * traversal we could compare the number of pages in the start->stop
742 * range to the total number of pages in the object. however, it
743 * seems that a hash table lookup is more expensive than the linked
744 * list traversal, so we multiply the number of pages in the
745 * range by an estimate of the relatively higher cost of the hash lookup.
749 genfs_putpages(void *v
)
751 struct vop_putpages_args
/* {
758 return genfs_do_putpages(ap
->a_vp
, ap
->a_offlo
, ap
->a_offhi
,
763 genfs_do_putpages(struct vnode
*vp
, off_t startoff
, off_t endoff
,
764 int origflags
, struct vm_page
**busypg
)
766 struct uvm_object
*uobj
= &vp
->v_uobj
;
767 kmutex_t
*slock
= &uobj
->vmobjlock
;
769 /* Even for strange MAXPHYS, the shift rounds down to a page */
770 #define maxpages (MAXPHYS >> PAGE_SHIFT)
771 int i
, error
, npages
, nback
;
773 struct vm_page
*pgs
[maxpages
], *pg
, *nextpg
, *tpg
, curmp
, endmp
;
774 bool wasclean
, by_list
, needs_clean
, yld
;
775 bool async
= (origflags
& PGO_SYNCIO
) == 0;
776 bool pagedaemon
= curlwp
== uvm
.pagedaemon_lwp
;
777 struct lwp
*l
= curlwp
? curlwp
: &lwp0
;
778 struct genfs_node
*gp
= VTOG(vp
);
787 UVMHIST_FUNC("genfs_putpages"); UVMHIST_CALLED(ubchist
);
789 KASSERT(origflags
& (PGO_CLEANIT
|PGO_FREE
|PGO_DEACTIVATE
));
790 KASSERT((startoff
& PAGE_MASK
) == 0 && (endoff
& PAGE_MASK
) == 0);
791 KASSERT(startoff
< endoff
|| endoff
== 0);
793 UVMHIST_LOG(ubchist
, "vp %p pages %d off 0x%x len 0x%x",
794 vp
, uobj
->uo_npages
, startoff
, endoff
- startoff
);
797 need_wapbl
= (!pagedaemon
&& vp
->v_mount
&& vp
->v_mount
->mnt_wapbl
&&
798 (origflags
& PGO_JOURNALLOCKED
) == 0);
803 KASSERT((vp
->v_iflag
& VI_ONWORKLST
) != 0 ||
804 (vp
->v_iflag
& VI_WRMAPDIRTY
) == 0);
805 if (uobj
->uo_npages
== 0) {
806 if (vp
->v_iflag
& VI_ONWORKLST
) {
807 vp
->v_iflag
&= ~VI_WRMAPDIRTY
;
808 if (LIST_FIRST(&vp
->v_dirtyblkhd
) == NULL
)
809 vn_syncer_remove_from_worklist(vp
);
813 WAPBL_END(vp
->v_mount
);
814 fstrans_done(vp
->v_mount
);
821 * the vnode has pages, set up to process the request.
824 if (!has_trans
&& (flags
& PGO_CLEANIT
) != 0) {
827 error
= fstrans_start_nowait(vp
->v_mount
, FSTRANS_LAZY
);
831 fstrans_start(vp
->v_mount
, FSTRANS_LAZY
);
833 error
= WAPBL_BEGIN(vp
->v_mount
);
835 fstrans_done(vp
->v_mount
);
845 wasclean
= (vp
->v_numoutput
== 0);
847 if (endoff
== 0 || flags
& PGO_ALLPAGES
) {
848 endoff
= trunc_page(LLONG_MAX
);
850 by_list
= (uobj
->uo_npages
<=
851 ((endoff
- startoff
) >> PAGE_SHIFT
) * UVM_PAGE_TREE_PENALTY
);
855 * if this vnode is known not to have dirty pages,
856 * don't bother to clean it out.
859 if ((vp
->v_iflag
& VI_ONWORKLST
) == 0) {
860 if ((flags
& (PGO_FREE
|PGO_DEACTIVATE
)) == 0) {
863 flags
&= ~PGO_CLEANIT
;
865 #endif /* !defined(DEBUG) */
868 * start the loop. when scanning by list, hold the last page
869 * in the list before we start. pages allocated after we start
870 * will be added to the end of the list, so we can stop at the
874 cleanall
= (flags
& PGO_CLEANIT
) != 0 && wasclean
&&
875 startoff
== 0 && endoff
== trunc_page(LLONG_MAX
) &&
876 (vp
->v_iflag
& VI_ONWORKLST
) != 0;
877 dirtygen
= gp
->g_dirtygen
;
878 freeflag
= pagedaemon
? PG_PAGEOUT
: PG_RELEASED
;
880 curmp
.uobject
= uobj
;
881 curmp
.offset
= (voff_t
)-1;
882 curmp
.flags
= PG_BUSY
;
883 endmp
.uobject
= uobj
;
884 endmp
.offset
= (voff_t
)-1;
885 endmp
.flags
= PG_BUSY
;
886 pg
= TAILQ_FIRST(&uobj
->memq
);
887 TAILQ_INSERT_TAIL(&uobj
->memq
, &endmp
, listq
.queue
);
889 pg
= uvm_pagelookup(uobj
, off
);
892 while (by_list
|| off
< endoff
) {
895 * if the current page is not interesting, move on to the next.
898 KASSERT(pg
== NULL
|| pg
->uobject
== uobj
);
899 KASSERT(pg
== NULL
||
900 (pg
->flags
& (PG_RELEASED
|PG_PAGEOUT
)) == 0 ||
901 (pg
->flags
& PG_BUSY
) != 0);
906 if (pg
->offset
< startoff
|| pg
->offset
>= endoff
||
907 pg
->flags
& (PG_RELEASED
|PG_PAGEOUT
)) {
908 if (pg
->flags
& (PG_RELEASED
|PG_PAGEOUT
)) {
911 pg
= TAILQ_NEXT(pg
, listq
.queue
);
915 } else if (pg
== NULL
|| pg
->flags
& (PG_RELEASED
|PG_PAGEOUT
)) {
921 pg
= uvm_pagelookup(uobj
, off
);
927 * if the current page needs to be cleaned and it's busy,
928 * wait for it to become unbusy.
931 yld
= (l
->l_cpu
->ci_schedstate
.spc_flags
&
932 SPCF_SHOULDYIELD
) && !pagedaemon
;
933 if (pg
->flags
& PG_BUSY
|| yld
) {
934 UVMHIST_LOG(ubchist
, "busy %p", pg
,0,0,0);
935 if (flags
& PGO_BUSYFAIL
&& pg
->flags
& PG_BUSY
) {
936 UVMHIST_LOG(ubchist
, "busyfail %p", pg
, 0,0,0);
944 * someone has taken the page while we
945 * dropped the lock for fstrans_start.
950 TAILQ_INSERT_BEFORE(pg
, &curmp
, listq
.queue
);
951 UVMHIST_LOG(ubchist
, "curmp next %p",
952 TAILQ_NEXT(&curmp
, listq
.queue
), 0,0,0);
959 pg
->flags
|= PG_WANTED
;
960 UVM_UNLOCK_AND_WAIT(pg
, slock
, 0, "genput", 0);
964 UVMHIST_LOG(ubchist
, "after next %p",
965 TAILQ_NEXT(&curmp
, listq
.queue
), 0,0,0);
966 pg
= TAILQ_NEXT(&curmp
, listq
.queue
);
967 TAILQ_REMOVE(&uobj
->memq
, &curmp
, listq
.queue
);
969 pg
= uvm_pagelookup(uobj
, off
);
975 * if we're freeing, remove all mappings of the page now.
976 * if we're cleaning, check if the page needs to be cleaned.
979 if (flags
& PGO_FREE
) {
980 pmap_page_protect(pg
, VM_PROT_NONE
);
981 } else if (flags
& PGO_CLEANIT
) {
984 * if we still have some hope to pull this vnode off
985 * from the syncer queue, write-protect the page.
988 if (cleanall
&& wasclean
&&
989 gp
->g_dirtygen
== dirtygen
) {
992 * uobj pages get wired only by uvm_fault
993 * where uobj is locked.
996 if (pg
->wire_count
== 0) {
997 pmap_page_protect(pg
,
998 VM_PROT_READ
|VM_PROT_EXECUTE
);
1005 if (flags
& PGO_CLEANIT
) {
1006 needs_clean
= pmap_clear_modify(pg
) ||
1007 (pg
->flags
& PG_CLEAN
) == 0;
1008 pg
->flags
|= PG_CLEAN
;
1010 needs_clean
= false;
1014 * if we're cleaning, build a cluster.
1015 * the cluster will consist of pages which are currently dirty,
1016 * but they will be returned to us marked clean.
1017 * if not cleaning, just operate on the one page.
1021 KDASSERT((vp
->v_iflag
& VI_ONWORKLST
));
1023 memset(pgs
, 0, sizeof(pgs
));
1024 pg
->flags
|= PG_BUSY
;
1025 UVM_PAGE_OWN(pg
, "genfs_putpages");
1028 * first look backward.
1031 npages
= MIN(maxpages
>> 1, off
>> PAGE_SHIFT
);
1033 uvn_findpages(uobj
, off
- PAGE_SIZE
, &nback
, &pgs
[0],
1034 UFP_NOWAIT
|UFP_NOALLOC
|UFP_DIRTYONLY
|UFP_BACKWARD
);
1036 memmove(&pgs
[0], &pgs
[npages
- nback
],
1037 nback
* sizeof(pgs
[0]));
1038 if (npages
- nback
< nback
)
1039 memset(&pgs
[nback
], 0,
1040 (npages
- nback
) * sizeof(pgs
[0]));
1042 memset(&pgs
[npages
- nback
], 0,
1043 nback
* sizeof(pgs
[0]));
1047 * then plug in our page of interest.
1053 * then look forward to fill in the remaining space in
1054 * the array of pages.
1057 npages
= maxpages
- nback
- 1;
1058 uvn_findpages(uobj
, off
+ PAGE_SIZE
, &npages
,
1060 UFP_NOWAIT
|UFP_NOALLOC
|UFP_DIRTYONLY
);
1061 npages
+= nback
+ 1;
1069 * apply FREE or DEACTIVATE options if requested.
1072 if (flags
& (PGO_DEACTIVATE
|PGO_FREE
)) {
1073 mutex_enter(&uvm_pageqlock
);
1075 for (i
= 0; i
< npages
; i
++) {
1077 KASSERT(tpg
->uobject
== uobj
);
1078 if (by_list
&& tpg
== TAILQ_NEXT(pg
, listq
.queue
))
1080 if (tpg
->offset
< startoff
|| tpg
->offset
>= endoff
)
1082 if (flags
& PGO_DEACTIVATE
&& tpg
->wire_count
== 0) {
1083 uvm_pagedeactivate(tpg
);
1084 } else if (flags
& PGO_FREE
) {
1085 pmap_page_protect(tpg
, VM_PROT_NONE
);
1086 if (tpg
->flags
& PG_BUSY
) {
1087 tpg
->flags
|= freeflag
;
1089 uvm_pageout_start(1);
1090 uvm_pagedequeue(tpg
);
1095 * ``page is not busy''
1096 * implies that npages is 1
1097 * and needs_clean is false.
1100 nextpg
= TAILQ_NEXT(tpg
, listq
.queue
);
1107 if (flags
& (PGO_DEACTIVATE
|PGO_FREE
)) {
1108 mutex_exit(&uvm_pageqlock
);
1114 * start the i/o. if we're traversing by list,
1115 * keep our place in the list with a marker page.
1119 TAILQ_INSERT_AFTER(&uobj
->memq
, pg
, &curmp
,
1123 error
= GOP_WRITE(vp
, pgs
, npages
, flags
);
1126 pg
= TAILQ_NEXT(&curmp
, listq
.queue
);
1127 TAILQ_REMOVE(&uobj
->memq
, &curmp
, listq
.queue
);
1138 * find the next page and continue if there was no error.
1146 pg
= TAILQ_NEXT(pg
, listq
.queue
);
1149 off
+= (npages
- nback
) << PAGE_SHIFT
;
1151 pg
= uvm_pagelookup(uobj
, off
);
1156 TAILQ_REMOVE(&uobj
->memq
, &endmp
, listq
.queue
);
1159 if (modified
&& (vp
->v_iflag
& VI_WRMAPDIRTY
) != 0 &&
1160 (vp
->v_type
!= VBLK
||
1161 (vp
->v_mount
->mnt_flag
& MNT_NODEVMTIME
) == 0)) {
1162 GOP_MARKUPDATE(vp
, GOP_UPDATE_MODIFIED
);
1166 * if we're cleaning and there was nothing to clean,
1167 * take us off the syncer list. if we started any i/o
1168 * and we're doing sync i/o, wait for all writes to finish.
1171 if (cleanall
&& wasclean
&& gp
->g_dirtygen
== dirtygen
&&
1172 (vp
->v_iflag
& VI_ONWORKLST
) != 0) {
1174 TAILQ_FOREACH(pg
, &uobj
->memq
, listq
.queue
) {
1175 if ((pg
->flags
& PG_CLEAN
) == 0) {
1176 printf("%s: %p: !CLEAN\n", __func__
, pg
);
1178 if (pmap_is_modified(pg
)) {
1179 printf("%s: %p: modified\n", __func__
, pg
);
1182 #endif /* defined(DEBUG) */
1183 vp
->v_iflag
&= ~VI_WRMAPDIRTY
;
1184 if (LIST_FIRST(&vp
->v_dirtyblkhd
) == NULL
)
1185 vn_syncer_remove_from_worklist(vp
);
1190 #endif /* !defined(DEBUG) */
1192 /* Wait for output to complete. */
1193 if (!wasclean
&& !async
&& vp
->v_numoutput
!= 0) {
1194 while (vp
->v_numoutput
!= 0)
1195 cv_wait(&vp
->v_cv
, slock
);
1197 onworklst
= (vp
->v_iflag
& VI_ONWORKLST
) != 0;
1200 if ((flags
& PGO_RECLAIM
) != 0 && onworklst
) {
1202 * in the case of PGO_RECLAIM, ensure to make the vnode clean.
1203 * retrying is not a big deal because, in many cases,
1204 * uobj->uo_npages is already 0 here.
1212 WAPBL_END(vp
->v_mount
);
1213 fstrans_done(vp
->v_mount
);
1220 genfs_gop_write(struct vnode
*vp
, struct vm_page
**pgs
, int npages
, int flags
)
1226 UVMHIST_FUNC(__func__
); UVMHIST_CALLED(ubchist
);
1228 UVMHIST_LOG(ubchist
, "vp %p pgs %p npages %d flags 0x%x",
1229 vp
, pgs
, npages
, flags
);
1231 off
= pgs
[0]->offset
;
1232 kva
= uvm_pagermapin(pgs
, npages
,
1233 UVMPAGER_MAPIN_WRITE
| UVMPAGER_MAPIN_WAITOK
);
1234 len
= npages
<< PAGE_SHIFT
;
1236 error
= genfs_do_io(vp
, off
, kva
, len
, flags
, UIO_WRITE
,
1243 genfs_gop_write_rwmap(struct vnode
*vp
, struct vm_page
**pgs
, int npages
, int flags
)
1249 UVMHIST_FUNC(__func__
); UVMHIST_CALLED(ubchist
);
1251 UVMHIST_LOG(ubchist
, "vp %p pgs %p npages %d flags 0x%x",
1252 vp
, pgs
, npages
, flags
);
1254 off
= pgs
[0]->offset
;
1255 kva
= uvm_pagermapin(pgs
, npages
,
1256 UVMPAGER_MAPIN_READ
| UVMPAGER_MAPIN_WAITOK
);
1257 len
= npages
<< PAGE_SHIFT
;
1259 error
= genfs_do_io(vp
, off
, kva
, len
, flags
, UIO_WRITE
,
1266 * Backend routine for doing I/O to vnode pages. Pages are already locked
1267 * and mapped into kernel memory. Here we just look up the underlying
1268 * device block addresses and call the strategy routine.
1272 genfs_do_io(struct vnode
*vp
, off_t off
, vaddr_t kva
, size_t len
, int flags
,
1273 enum uio_rw rw
, void (*iodone
)(struct buf
*))
1276 int fs_bshift
, dev_bshift
;
1277 off_t eof
, offset
, startoffset
;
1278 size_t bytes
, iobytes
, skipbytes
;
1280 struct buf
*mbp
, *bp
;
1281 struct vnode
*devvp
;
1282 bool async
= (flags
& PGO_SYNCIO
) == 0;
1283 bool write
= rw
== UIO_WRITE
;
1284 int brw
= write
? B_WRITE
: B_READ
;
1285 UVMHIST_FUNC(__func__
); UVMHIST_CALLED(ubchist
);
1287 UVMHIST_LOG(ubchist
, "vp %p kva %p len 0x%x flags 0x%x",
1288 vp
, kva
, len
, flags
);
1290 KASSERT(vp
->v_size
<= vp
->v_writesize
);
1291 GOP_SIZE(vp
, vp
->v_writesize
, &eof
, 0);
1292 if (vp
->v_type
!= VBLK
) {
1293 fs_bshift
= vp
->v_mount
->mnt_fs_bshift
;
1294 dev_bshift
= vp
->v_mount
->mnt_dev_bshift
;
1296 fs_bshift
= DEV_BSHIFT
;
1297 dev_bshift
= DEV_BSHIFT
;
1301 bytes
= MIN(len
, eof
- startoffset
);
1303 KASSERT(bytes
!= 0);
1306 mutex_enter(&vp
->v_interlock
);
1307 vp
->v_numoutput
+= 2;
1308 mutex_exit(&vp
->v_interlock
);
1310 mbp
= getiobuf(vp
, true);
1311 UVMHIST_LOG(ubchist
, "vp %p mbp %p num now %d bytes 0x%x",
1312 vp
, mbp
, vp
->v_numoutput
, bytes
);
1313 mbp
->b_bufsize
= len
;
1314 mbp
->b_data
= (void *)kva
;
1315 mbp
->b_resid
= mbp
->b_bcount
= bytes
;
1316 mbp
->b_cflags
= BC_BUSY
| BC_AGE
;
1318 mbp
->b_flags
= brw
| B_ASYNC
;
1319 mbp
->b_iodone
= iodone
;
1322 mbp
->b_iodone
= NULL
;
1324 if (curlwp
== uvm
.pagedaemon_lwp
)
1325 BIO_SETPRIO(mbp
, BPRIO_TIMELIMITED
);
1327 BIO_SETPRIO(mbp
, BPRIO_TIMENONCRITICAL
);
1329 BIO_SETPRIO(mbp
, BPRIO_TIMECRITICAL
);
1332 for (offset
= startoffset
;
1334 offset
+= iobytes
, bytes
-= iobytes
) {
1335 lbn
= offset
>> fs_bshift
;
1336 error
= VOP_BMAP(vp
, lbn
, &devvp
, &blkno
, &run
);
1338 UVMHIST_LOG(ubchist
, "VOP_BMAP() -> %d", error
,0,0,0);
1344 iobytes
= MIN((((off_t
)lbn
+ 1 + run
) << fs_bshift
) - offset
,
1346 if (blkno
== (daddr_t
)-1) {
1348 memset((char *)kva
+ (offset
- startoffset
), 0,
1351 skipbytes
+= iobytes
;
1355 /* if it's really one i/o, don't make a second buf */
1356 if (offset
== startoffset
&& iobytes
== bytes
) {
1359 UVMHIST_LOG(ubchist
, "vp %p bp %p num now %d",
1360 vp
, bp
, vp
->v_numoutput
, 0);
1361 bp
= getiobuf(vp
, true);
1362 nestiobuf_setup(mbp
, bp
, offset
- startoffset
, iobytes
);
1366 /* adjust physical blkno for partial blocks */
1367 bp
->b_blkno
= blkno
+ ((offset
- ((off_t
)lbn
<< fs_bshift
)) >>
1369 UVMHIST_LOG(ubchist
,
1370 "vp %p offset 0x%x bcount 0x%x blkno 0x%x",
1371 vp
, offset
, bp
->b_bcount
, bp
->b_blkno
);
1373 VOP_STRATEGY(devvp
, bp
);
1376 UVMHIST_LOG(ubchist
, "skipbytes %d", skipbytes
, 0,0,0);
1378 nestiobuf_done(mbp
, skipbytes
, error
);
1380 UVMHIST_LOG(ubchist
, "returning 0 (async)", 0,0,0,0);
1383 UVMHIST_LOG(ubchist
, "waiting for mbp %p", mbp
,0,0,0);
1384 error
= biowait(mbp
);
1388 UVMHIST_LOG(ubchist
, "returning, error %d", error
,0,0,0);
1393 genfs_compat_getpages(void *v
)
1395 struct vop_getpages_args
/* {
1398 struct vm_page **a_m;
1401 vm_prot_t a_access_type;
1407 struct vnode
*vp
= ap
->a_vp
;
1408 struct uvm_object
*uobj
= &vp
->v_uobj
;
1409 struct vm_page
*pg
, **pgs
;
1411 int i
, error
, orignpages
, npages
;
1414 kauth_cred_t cred
= curlwp
->l_cred
;
1415 bool write
= (ap
->a_access_type
& VM_PROT_WRITE
) != 0;
1418 origoffset
= ap
->a_offset
;
1419 orignpages
= *ap
->a_count
;
1422 if (write
&& (vp
->v_iflag
& VI_ONWORKLST
) == 0) {
1423 vn_syncer_add_to_worklist(vp
, filedelay
);
1425 if (ap
->a_flags
& PGO_LOCKED
) {
1426 uvn_findpages(uobj
, origoffset
, ap
->a_count
, ap
->a_m
,
1427 UFP_NOWAIT
|UFP_NOALLOC
| (write
? UFP_NORDONLY
: 0));
1429 return (ap
->a_m
[ap
->a_centeridx
] == NULL
? EBUSY
: 0);
1431 if (origoffset
+ (ap
->a_centeridx
<< PAGE_SHIFT
) >= vp
->v_size
) {
1432 mutex_exit(&uobj
->vmobjlock
);
1435 if ((ap
->a_flags
& PGO_SYNCIO
) == 0) {
1436 mutex_exit(&uobj
->vmobjlock
);
1439 npages
= orignpages
;
1440 uvn_findpages(uobj
, origoffset
, &npages
, pgs
, UFP_ALL
);
1441 mutex_exit(&uobj
->vmobjlock
);
1442 kva
= uvm_pagermapin(pgs
, npages
,
1443 UVMPAGER_MAPIN_READ
| UVMPAGER_MAPIN_WAITOK
);
1444 for (i
= 0; i
< npages
; i
++) {
1446 if ((pg
->flags
& PG_FAKE
) == 0) {
1449 iov
.iov_base
= (char *)kva
+ (i
<< PAGE_SHIFT
);
1450 iov
.iov_len
= PAGE_SIZE
;
1453 uio
.uio_offset
= origoffset
+ (i
<< PAGE_SHIFT
);
1454 uio
.uio_rw
= UIO_READ
;
1455 uio
.uio_resid
= PAGE_SIZE
;
1456 UIO_SETUP_SYSSPACE(&uio
);
1458 error
= VOP_READ(vp
, &uio
, 0, cred
);
1462 if (uio
.uio_resid
) {
1463 memset(iov
.iov_base
, 0, uio
.uio_resid
);
1466 uvm_pagermapout(kva
, npages
);
1467 mutex_enter(&uobj
->vmobjlock
);
1468 mutex_enter(&uvm_pageqlock
);
1469 for (i
= 0; i
< npages
; i
++) {
1471 if (error
&& (pg
->flags
& PG_FAKE
) != 0) {
1472 pg
->flags
|= PG_RELEASED
;
1474 pmap_clear_modify(pg
);
1475 uvm_pageactivate(pg
);
1479 uvm_page_unbusy(pgs
, npages
);
1481 mutex_exit(&uvm_pageqlock
);
1482 mutex_exit(&uobj
->vmobjlock
);
1487 genfs_compat_gop_write(struct vnode
*vp
, struct vm_page
**pgs
, int npages
,
1493 kauth_cred_t cred
= curlwp
->l_cred
;
1498 offset
= pgs
[0]->offset
;
1499 kva
= uvm_pagermapin(pgs
, npages
,
1500 UVMPAGER_MAPIN_WRITE
| UVMPAGER_MAPIN_WAITOK
);
1502 iov
.iov_base
= (void *)kva
;
1503 iov
.iov_len
= npages
<< PAGE_SHIFT
;
1506 uio
.uio_offset
= offset
;
1507 uio
.uio_rw
= UIO_WRITE
;
1508 uio
.uio_resid
= npages
<< PAGE_SHIFT
;
1509 UIO_SETUP_SYSSPACE(&uio
);
1511 error
= VOP_WRITE(vp
, &uio
, 0, cred
);
1513 mutex_enter(&vp
->v_interlock
);
1515 mutex_exit(&vp
->v_interlock
);
1517 bp
= getiobuf(vp
, true);
1518 bp
->b_cflags
= BC_BUSY
| BC_AGE
;
1519 bp
->b_lblkno
= offset
>> vp
->v_mount
->mnt_fs_bshift
;
1520 bp
->b_data
= (char *)kva
;
1521 bp
->b_bcount
= npages
<< PAGE_SHIFT
;
1522 bp
->b_bufsize
= npages
<< PAGE_SHIFT
;
1524 bp
->b_error
= error
;
1525 uvm_aio_aiodone(bp
);
1530 * Process a uio using direct I/O. If we reach a part of the request
1531 * which cannot be processed in this fashion for some reason, just return.
1532 * The caller must handle some additional part of the request using
1533 * buffered I/O before trying direct I/O again.
1537 genfs_directio(struct vnode
*vp
, struct uio
*uio
, int ioflag
)
1543 const int mask
= DEV_BSIZE
- 1;
1545 bool need_wapbl
= (vp
->v_mount
&& vp
->v_mount
->mnt_wapbl
&&
1546 (ioflag
& IO_JOURNALLOCKED
) == 0);
1549 * We only support direct I/O to user space for now.
1552 if (VMSPACE_IS_KERNEL_P(uio
->uio_vmspace
)) {
1557 * If the vnode is mapped, we would need to get the getpages lock
1558 * to stabilize the bmap, but then we would get into trouble while
1559 * locking the pages if the pages belong to this same vnode (or a
1560 * multi-vnode cascade to the same effect). Just fall back to
1561 * buffered I/O if the vnode is mapped to avoid this mess.
1564 if (vp
->v_vflag
& VV_MAPPED
) {
1569 error
= WAPBL_BEGIN(vp
->v_mount
);
1575 * Do as much of the uio as possible with direct I/O.
1578 vs
= uio
->uio_vmspace
;
1579 while (uio
->uio_resid
) {
1581 if (iov
->iov_len
== 0) {
1586 va
= (vaddr_t
)iov
->iov_base
;
1587 len
= MIN(iov
->iov_len
, genfs_maxdio
);
1591 * If the next chunk is smaller than DEV_BSIZE or extends past
1592 * the current EOF, then fall back to buffered I/O.
1595 if (len
== 0 || uio
->uio_offset
+ len
> vp
->v_size
) {
1600 * Check alignment. The file offset must be at least
1601 * sector-aligned. The exact constraint on memory alignment
1602 * is very hardware-dependent, but requiring sector-aligned
1603 * addresses there too is safe.
1606 if (uio
->uio_offset
& mask
|| va
& mask
) {
1609 error
= genfs_do_directio(vs
, va
, len
, vp
, uio
->uio_offset
,
1614 iov
->iov_base
= (char *)iov
->iov_base
+ len
;
1615 iov
->iov_len
-= len
;
1616 uio
->uio_offset
+= len
;
1617 uio
->uio_resid
-= len
;
1621 WAPBL_END(vp
->v_mount
);
1625 * Iodone routine for direct I/O. We don't do much here since the request is
1626 * always synchronous, so the caller will do most of the work after biowait().
1630 genfs_dio_iodone(struct buf
*bp
)
1633 KASSERT((bp
->b_flags
& B_ASYNC
) == 0);
1634 if ((bp
->b_flags
& B_READ
) == 0 && (bp
->b_cflags
& BC_AGE
) != 0) {
1635 mutex_enter(bp
->b_objlock
);
1637 mutex_exit(bp
->b_objlock
);
1643 * Process one chunk of a direct I/O request.
1647 genfs_do_directio(struct vmspace
*vs
, vaddr_t uva
, size_t len
, struct vnode
*vp
,
1648 off_t off
, enum uio_rw rw
)
1651 struct pmap
*upm
, *kpm
;
1652 size_t klen
= round_page(uva
+ len
) - trunc_page(uva
);
1657 int error
, rv
, poff
, koff
;
1658 const int pgoflags
= PGO_CLEANIT
| PGO_SYNCIO
| PGO_JOURNALLOCKED
|
1659 (rw
== UIO_WRITE
? PGO_FREE
: 0);
1662 * For writes, verify that this range of the file already has fully
1663 * allocated backing store. If there are any holes, just punt and
1664 * make the caller take the buffered write path.
1667 if (rw
== UIO_WRITE
) {
1668 daddr_t lbn
, elbn
, blkno
;
1669 int bsize
, bshift
, run
;
1671 bshift
= vp
->v_mount
->mnt_fs_bshift
;
1672 bsize
= 1 << bshift
;
1673 lbn
= off
>> bshift
;
1674 elbn
= (off
+ len
+ bsize
- 1) >> bshift
;
1675 while (lbn
< elbn
) {
1676 error
= VOP_BMAP(vp
, lbn
, NULL
, &blkno
, &run
);
1680 if (blkno
== (daddr_t
)-1) {
1688 * Flush any cached pages for parts of the file that we're about to
1689 * access. If we're writing, invalidate pages as well.
1692 spoff
= trunc_page(off
);
1693 epoff
= round_page(off
+ len
);
1694 mutex_enter(&vp
->v_interlock
);
1695 error
= VOP_PUTPAGES(vp
, spoff
, epoff
, pgoflags
);
1701 * Wire the user pages and remap them into kernel memory.
1704 prot
= rw
== UIO_READ
? VM_PROT_READ
| VM_PROT_WRITE
: VM_PROT_READ
;
1705 error
= uvm_vslock(vs
, (void *)uva
, len
, prot
);
1711 upm
= vm_map_pmap(map
);
1712 kpm
= vm_map_pmap(kernel_map
);
1713 kva
= uvm_km_alloc(kernel_map
, klen
, 0,
1714 UVM_KMF_VAONLY
| UVM_KMF_WAITVA
);
1715 puva
= trunc_page(uva
);
1716 for (poff
= 0; poff
< klen
; poff
+= PAGE_SIZE
) {
1717 rv
= pmap_extract(upm
, puva
+ poff
, &pa
);
1719 pmap_enter(kpm
, kva
+ poff
, pa
, prot
, prot
| PMAP_WIRED
);
1727 koff
= uva
- trunc_page(uva
);
1728 error
= genfs_do_io(vp
, off
, kva
+ koff
, len
, PGO_SYNCIO
, rw
,
1732 * Tear down the kernel mapping.
1735 pmap_remove(kpm
, kva
, kva
+ klen
);
1737 uvm_km_free(kernel_map
, kva
, klen
, UVM_KMF_VAONLY
);
1740 * Unwire the user pages.
1743 uvm_vsunlock(vs
, (void *)uva
, len
);