/*	$NetBSD: kern_physio.c,v 1.90 2009/05/18 21:12:33 ad Exp $	*/

/*-
 * Copyright (c) 1982, 1986, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_physio.c	8.1 (Berkeley) 6/10/93
 */

/*
 * Copyright (c) 1994 Christopher G. Demetriou
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by the University of
 *      California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_physio.c	8.1 (Berkeley) 6/10/93
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_physio.c,v 1.90 2009/05/18 21:12:33 ad Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/once.h>
#include <sys/workqueue.h>
#include <sys/kmem.h>

#include <uvm/uvm_extern.h>

/* one-time initialization state and the completion workqueue */
ONCE_DECL(physio_initialized);
struct workqueue *physio_workqueue;

/* default cap on the number of transfers a single physio() keeps in flight */
int physio_concurrency = 16;

/* #define PHYSIO_DEBUG */
#if defined(PHYSIO_DEBUG)
#define	DPRINTF(a)	printf a
#else /* defined(PHYSIO_DEBUG) */
#define	DPRINTF(a)	/* nothing */
#endif /* defined(PHYSIO_DEBUG) */
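
/*
 * Per-request bookkeeping shared by physio() and its completion handler.
 * The sketch below is reconstructed from how the fields are used later in
 * this file; treat it as an illustration, not the authoritative layout.
 */
struct physio_stat {
	kmutex_t ps_lock;		/* protects the fields below */
	kcondvar_t ps_cv;		/* signalled as transfers complete */
	int ps_running;			/* buffers currently in flight */
	int ps_failed;			/* number of failed transfers */
	int ps_error;			/* error from the earliest failure */
	off_t ps_endoffset;		/* offset of the earliest failure, or -1 */
	struct buf *ps_orig_bp;		/* caller-supplied buffer, if any */
};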

/*
 * physio_done: workqueue handler that completes one transfer.  It runs in
 * the physiod thread, unwires the user memory involved and records any
 * error in the shared physio_stat.
 */
static void
physio_done(struct work *wk, void *dummy)
{
	struct buf *bp = (void *)wk;
	size_t todo = bp->b_bufsize;
	size_t done = bp->b_bcount - bp->b_resid;
	struct physio_stat *ps = bp->b_private;
	bool is_iobuf;

	KASSERT(&bp->b_work == wk);
	KASSERT(bp->b_bcount <= todo);
	KASSERT(bp->b_resid <= bp->b_bcount);
	KASSERT((bp->b_flags & B_PHYS) != 0);
	KASSERT(dummy == NULL);

	/* Undo the mapping and wiring set up in physio(). */
	vunmapbuf(bp, todo);
	uvm_vsunlock(bp->b_proc->p_vmspace, bp->b_data, todo);

	mutex_enter(&ps->ps_lock);
	is_iobuf = (bp != ps->ps_orig_bp);
	if (__predict_false(done != todo)) {
		off_t endoffset = dbtob(bp->b_blkno) + done;

		/*
		 * We got an error or hit EOM.
		 *
		 * We only care about the first one,
		 * i.e. the one at the lowest offset.
		 */
		KASSERT(ps->ps_endoffset != endoffset);
		DPRINTF(("%s: error=%d at %" PRIu64 " - %" PRIu64
		    ", blkno=%" PRIu64 ", bcount=%d, flags=0x%x\n",
		    __func__, bp->b_error, dbtob(bp->b_blkno), endoffset,
		    bp->b_blkno, bp->b_bcount, bp->b_flags));

		if (ps->ps_endoffset == -1 || endoffset < ps->ps_endoffset) {
			DPRINTF(("%s: ps=%p, error %d -> %d, endoff %" PRIu64
			    " -> %" PRIu64 "\n",
			    __func__, ps,
			    ps->ps_error, bp->b_error,
			    ps->ps_endoffset, endoffset));

			ps->ps_endoffset = endoffset;
			ps->ps_error = bp->b_error;
		}
		ps->ps_failed++;
	} else {
		KASSERT(bp->b_error == 0);
	}

	ps->ps_running--;
	cv_signal(&ps->ps_cv);
	mutex_exit(&ps->ps_lock);

	/* Free the buffer if we allocated it ourselves (see getiobuf()). */
	if (is_iobuf)
		putiobuf(bp);
}
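
/*
 * Completion callback installed as b_iodone on every buffer issued by
 * physio().  It is kept minimal because it may run in interrupt context:
 * after some sanity checks it hands the buffer to the physiod workqueue,
 * where physio_done() finishes the bookkeeping.
 */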
static void
physio_biodone(struct buf *bp)
{
#if defined(DIAGNOSTIC)
	struct physio_stat *ps = bp->b_private;
	size_t todo = bp->b_bufsize;
	size_t done = bp->b_bcount - bp->b_resid;

	KASSERT(ps->ps_running > 0);
	KASSERT(bp->b_bcount <= todo);
	KASSERT(bp->b_resid <= bp->b_bcount);
	if (done == todo)
		KASSERT(bp->b_error == 0);
#endif /* defined(DIAGNOSTIC) */

	workqueue_enqueue(physio_workqueue, &bp->b_work, NULL);
}
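
/*
 * Wait, with ps->ps_lock held, until at most "n" transfers started through
 * this physio_stat are still outstanding.
 */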
static void
physio_wait(struct physio_stat *ps, int n)
{

	KASSERT(mutex_owned(&ps->ps_lock));

	while (ps->ps_running > n)
		cv_wait(&ps->ps_cv, &ps->ps_lock);
}
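
/*
 * One-time initialization, run via RUN_ONCE() from physio(): create the
 * MP-safe "physiod" workqueue on which physio_done() completions run.
 */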
static int
physio_init(void)
{
	int error;

	KASSERT(physio_workqueue == NULL);

	error = workqueue_create(&physio_workqueue, "physiod",
	    physio_done, NULL, PRI_BIO, IPL_BIO, WQ_MPSAFE);

	return error;
}
209 * Do "physical I/O" on behalf of a user. "Physical I/O" is I/O directly
210 * from the raw device to user buffers, and bypasses the buffer cache.
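
/*
 * Typical use (an illustrative sketch, not code from this file): a
 * character device's read entry point wraps its strategy routine with
 * physio().  "mydev_read" and "mydev_strategy" are hypothetical names.
 *
 *	int
 *	mydev_read(dev_t dev, struct uio *uio, int ioflag)
 *	{
 *
 *		return physio(mydev_strategy, NULL, dev, B_READ,
 *		    minphys, uio);
 *	}
 */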
int
physio(void (*strategy)(struct buf *), struct buf *obp, dev_t dev, int flags,
    void (*min_phys)(struct buf *), struct uio *uio)
{
	struct iovec *iovp;
	struct lwp *l = curlwp;
	struct proc *p = l->l_proc;
	int i, error;
	struct buf *bp = NULL;
	struct physio_stat *ps;
	int concurrency = physio_concurrency - 1;

	error = RUN_ONCE(&physio_initialized, physio_init);
	if (__predict_false(error != 0)) {
		return error;
	}

	DPRINTF(("%s: called: off=%" PRIu64 ", resid=%zu\n",
	    __func__, uio->uio_offset, uio->uio_resid));

	flags &= B_READ | B_WRITE;

	if ((ps = kmem_zalloc(sizeof(*ps), KM_SLEEP)) == NULL)
		return ENOMEM;
	/* ps->ps_running = 0; */
	/* ps->ps_error = 0; */
	/* ps->ps_failed = 0; */
	ps->ps_orig_bp = obp;
	ps->ps_endoffset = -1;
	mutex_init(&ps->ps_lock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&ps->ps_cv, "physio");

	/* Make sure we have a buffer, creating one if necessary. */
	if (obp != NULL) {
		mutex_enter(&bufcache_lock);
		/* Mark it busy, so nobody else will use it. */
		while (bbusy(obp, false, 0, NULL) == EPASSTHROUGH)
			;
		mutex_exit(&bufcache_lock);
		concurrency = 0; /* see "XXXkludge" comment below */
	}

	for (i = 0; i < uio->uio_iovcnt; i++) {
		bool sync = true;

		iovp = &uio->uio_iov[i];
		while (iovp->iov_len > 0) {
			size_t todo;
			vaddr_t endp;

			mutex_enter(&ps->ps_lock);
			if (ps->ps_failed != 0) {
				goto done_locked;
			}
			physio_wait(ps, sync ? 0 : concurrency);
			mutex_exit(&ps->ps_lock);
			if (obp != NULL) {
				/*
				 * XXXkludge: "concurrency" was forced to 0
				 * above because some drivers use "obp" as
				 * an identifier.
				 */
				bp = obp;
			} else {
				bp = getiobuf(NULL, true);
				bp->b_cflags = BC_BUSY;
			}
			bp->b_dev = dev;
			bp->b_proc = p;
			bp->b_private = ps;

			/*
			 * Mark the buffer busy for physical I/O.  Also set
			 * B_PHYS because it's an I/O to user memory, and
			 * B_RAW because B_RAW is to be "set by physio for
			 * raw transfers".
			 */
			bp->b_cflags = BC_BUSY;
			bp->b_flags = flags | B_PHYS | B_RAW;
			bp->b_iodone = physio_biodone;

			/* Set up the buffer for a maximum-sized transfer. */
			bp->b_blkno = btodb(uio->uio_offset);
			if (dbtob(bp->b_blkno) != uio->uio_offset) {
				error = EINVAL;
				goto done;
			}
			bp->b_bcount = MIN(MAXPHYS, iovp->iov_len);
			bp->b_data = iovp->iov_base;

			/*
			 * Call minphys to bound the transfer size,
			 * and remember the amount of data to transfer,
			 * for later comparison.
			 */
			(*min_phys)(bp);
			todo = bp->b_bufsize = bp->b_bcount;
#if defined(DIAGNOSTIC)
			if (todo > MAXPHYS)
				panic("todo(%zu) > MAXPHYS; minphys broken",
				    todo);
#endif /* defined(DIAGNOSTIC) */

			sync = false;
			endp = (vaddr_t)bp->b_data + todo;
			if (trunc_page(endp) != endp) {
				/*
				 * Following requests can overlap.
				 * note that uvm_vslock does round_page.
				 */
				sync = true;
			}

			/*
			 * Lock the part of the user address space involved
			 * in the transfer.  Beware vmapbuf(); it clobbers
			 * b_data and saves it in b_saveaddr.  However,
			 * vunmapbuf() restores it.
			 */
			error = uvm_vslock(p->p_vmspace, bp->b_data, todo,
			    (flags & B_READ) ? VM_PROT_WRITE : VM_PROT_READ);
			if (error) {
				goto done;
			}
			vmapbuf(bp, todo);

			BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);

			mutex_enter(&ps->ps_lock);
			ps->ps_running++;
			mutex_exit(&ps->ps_lock);

			/* Call strategy to start the transfer. */
			(*strategy)(bp);
			bp = NULL;

			iovp->iov_len -= todo;
			iovp->iov_base = (char *)iovp->iov_base + todo;
			uio->uio_offset += todo;
			uio->uio_resid -= todo;
		}
	}

done:
	mutex_enter(&ps->ps_lock);
done_locked:
	physio_wait(ps, 0);
	mutex_exit(&ps->ps_lock);

	if (ps->ps_failed != 0) {
		off_t delta;

		delta = uio->uio_offset - ps->ps_endoffset;
		KASSERT(delta > 0);
		uio->uio_resid += delta;
		/* uio->uio_offset = ps->ps_endoffset; */
	} else {
		KASSERT(ps->ps_endoffset == -1);
	}
	if (bp != NULL && bp != obp) {
		putiobuf(bp);
	}
	if (error == 0) {
		error = ps->ps_error;
	}
	mutex_destroy(&ps->ps_lock);
	cv_destroy(&ps->ps_cv);
	kmem_free(ps, sizeof(*ps));

	/*
	 * Clean up the state of the buffer.  Remember if somebody wants
	 * it, so we can wake them up below.  Also, if we had to steal it,
	 * give it back.
	 */
	if (obp != NULL) {
		KASSERT((obp->b_cflags & BC_BUSY) != 0);

		/*
		 * If another process is waiting for the raw I/O buffer,
		 * wake up processes waiting to do physical I/O;
		 */
		mutex_enter(&bufcache_lock);
		obp->b_cflags &= ~(BC_BUSY | BC_WANTED);
		obp->b_flags &= ~(B_PHYS | B_RAW);
		obp->b_iodone = NULL;
		cv_broadcast(&obp->b_busy);
		mutex_exit(&bufcache_lock);
	}

	DPRINTF(("%s: done: off=%" PRIu64 ", resid=%zu\n",
	    __func__, uio->uio_offset, uio->uio_resid));

	return error;
}

/*
 * A minphys() routine is called by physio() to adjust the size of each
 * I/O transfer before the latter is passed to the strategy routine.
 *
 * This minphys() is a default that must be called to enforce limits
 * that are applicable to all devices, because of limitations in the
 * kernel or the hardware platform.
 */
void
minphys(struct buf *bp)
{

	if (bp->b_bcount > MAXPHYS)
		bp->b_bcount = MAXPHYS;
}
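
/*
 * Drivers with a stricter transfer limit typically provide their own
 * minphys routine that clamps b_bcount and then applies the global limit
 * above.  The sketch below is only an illustration; "mydev_minphys" and
 * MYDEV_MAXXFER are hypothetical names, not part of this file.
 *
 *	void
 *	mydev_minphys(struct buf *bp)
 *	{
 *
 *		if (bp->b_bcount > MYDEV_MAXXFER)
 *			bp->b_bcount = MYDEV_MAXXFER;
 *		minphys(bp);
 *	}
 */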