 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 */

/*
 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */
#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/sysmacros.h>
#include <sys/inline.h>
#include <sys/systm.h>
#include <sys/vmsystm.h>
#include <sys/cpuvar.h>
#include <sys/vnode.h>
#include <sys/vtrace.h>
#include <sys/tnf_probe.h>
#include <sys/fs/snode.h>
#include <sys/copyops.h>
#include <vm/seg_vn.h>
#include <vm/seg_kmem.h>
void
minphys(struct buf *bp)
{
	if (bp->b_bcount > maxphys)
		bp->b_bcount = maxphys;
}
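
/*
 * Illustration (not from the original file): minphys() is the default
 * "mincnt" clamp applied to each raw transfer.  A hypothetical driver with
 * a smaller per-transfer limit can supply its own routine of the same
 * shape and still chain to minphys(), for example:
 *
 *	static void
 *	xx_minphys(struct buf *bp)
 *	{
 *		if (bp->b_bcount > XX_MAXXFER)
 *			bp->b_bcount = XX_MAXXFER;
 *		minphys(bp);
 *	}
 *
 * xx_minphys and XX_MAXXFER are made-up names used only for this sketch.
 */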
/*
 * use kmem_cache_create for physio buffers. This has shown
 * a better cache distribution compared to buffers on the
 * stack. It also avoids semaphore construction/deconstruction
 */

static struct kmem_cache *physio_buf_cache;
/* ARGSUSED */
static int
physio_buf_constructor(void *buf, void *cdrarg, int kmflags)
{
	bioinit((struct buf *)buf);
	return (0);
}

/* ARGSUSED */
static void
physio_buf_destructor(void *buf, void *cdrarg)
{
	biofini((struct buf *)buf);
}
void
physio_bufs_init(void)
{
	physio_buf_cache = kmem_cache_create("physio_buf_cache",
	    sizeof (struct buf), 0, physio_buf_constructor,
	    physio_buf_destructor, NULL, NULL, NULL, 0);
}
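
/*
 * Usage sketch (restating the code below, not a new mechanism): buf headers
 * for raw I/O come from physio_buf_cache rather than the stack, and the
 * constructor/destructor pair above means bioinit()/biofini() run once per
 * cached object instead of once per request.  The request path therefore
 * reduces to:
 *
 *	bp = kmem_cache_alloc(physio_buf_cache, KM_SLEEP);
 *	...
 *	kmem_cache_free(physio_buf_cache, bp);
 */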
/*
 * initiate raw I/O request
 *
 * allocate buf header if necessary
 * adjust max size of each I/O request
 * lock down user pages and verify access protections
 * call driver's strategy routine to submit request
 * wait for I/O completion
 * unlock user pages and free allocated buf header
 */
default_physio(int (*strat)(struct buf *), struct buf *bp, dev_t dev,
	int rw, void (*mincnt)(struct buf *), struct uio *uio)
	TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_START, "physio_start: bp %p", bp);
	TNF_PROBE_4(physio_start, "io rawio", /* CSTYLED */,
	    tnf_device, device, dev,
	    tnf_offset, offset, uio->uio_loffset,
	    tnf_size, size, uio->uio_resid,
	    tnf_bioflags, rw, rw);
		CPU_STATS_ADD_K(sys, phread, 1);
		CPU_STATS_ADD_K(sys, phwrite, 1);
	TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_GETBUF_START,
	    "getbuf_start: bp %p", bp);
		bp = kmem_cache_alloc(physio_buf_cache, KM_SLEEP);
	TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_GETBUF_END, "getbuf_end: bp %p", bp);
	if (uio->uio_segflg == UIO_USERSPACE) {
		procp = ttoproc(curthread);
	ASSERT(SEMA_HELD(&bp->b_sem));
	/*
	 * We need to prepare this buffer for the io:::start probe, including
	 * NULL'ing out the file, clearing the offset, and filling in the
	 */
	(void) devopsp[getmajor(dev)]->devo_getinfo(NULL,
	    DDI_INFO_DEVT2DEVINFO, (void *)dev, (void **)&bp->b_dip);
	while (uio->uio_iovcnt > 0) {

		while (iov->iov_len > 0) {
			if (uio->uio_resid == 0)
			if (uio->uio_loffset < 0) {
			/*
			 * For 32-bit kernels, check against SPEC_MAXOFFSET_T
			 * which represents the maximum size that can be
			 * supported by the IO subsystem.
			 * XXX this code assumes a D_64BIT driver.
			 */
			if (uio->uio_loffset > SPEC_MAXOFFSET_T) {
			bp->b_flags = B_BUSY | B_PHYS | rw;
			bp->b_lblkno = btodt(uio->uio_loffset);
			/*
			 * Don't count on b_addr remaining untouched by the
			 * code below (it may be reset because someone does
			 * a bp_mapin on the buffer) -- reset from the iov
			 * each time through, updating the iov's base address
			 */
			a = bp->b_un.b_addr = iov->iov_base;
			bp->b_bcount = MIN(iov->iov_len, uio->uio_resid);
			TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_LOCK_START,
			    "as_pagelock_start: bp %p", bp);

			error = as_pagelock(asp, &pplist, a,
			    c, rw == B_READ ? S_WRITE : S_READ);
			TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_LOCK_END,
				bp->b_flags |= B_ERROR;
				bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS);
			bp->b_shadow = pplist;
			if (pplist != NULL) {
				bp->b_flags |= B_SHADOW;
			DTRACE_IO1(start, struct buf *, bp);
			bp->b_flags |= B_STARTED;
			TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_UNLOCK_START,
			    "as_pageunlock_start: bp %p", bp);

			as_pageunlock(asp, pplist, a, c,
			    rw == B_READ ? S_WRITE : S_READ);

			TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_UNLOCK_END,
			    "as_pageunlock_end:");
			uio->uio_loffset += c;
			/* bp->b_resid - temp kludge for tape drives */
			if (bp->b_resid || error)
		bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
		/* bp->b_resid - temp kludge for tape drives */
		if (bp->b_resid || error)
		kmem_cache_free(physio_buf_cache, bp);
	TNF_PROBE_1(physio_end, "io rawio", /* CSTYLED */,
	    tnf_device, device, dev);
	TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_END, "physio_end: bp %p", bp);
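
/*
 * Illustration (not from the original file): default_physio() is normally
 * reached through physio(9F) from a character driver's read(9E) or
 * write(9E) entry point.  A hypothetical driver might do:
 *
 *	static int
 *	xx_read(dev_t dev, struct uio *uio, cred_t *credp)
 *	{
 *		return (physio(xx_strategy, NULL, dev, B_READ,
 *		    minphys, uio));
 *	}
 *
 * Passing NULL for the buf header asks physio() to allocate one (here,
 * from physio_buf_cache); xx_read and xx_strategy are made-up names.
 */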
/*
 * Returns 0 on success, or an error on failure.
 *
 * This function is no longer a part of the DDI/DKI.
 * However, for compatibility, its interface should not
 * be changed and it should not be removed from the kernel.
 */
int
useracc(void *addr, size_t count, int access)
{
	uint_t prot;

	prot = PROT_USER | ((access == B_READ) ? PROT_READ : PROT_WRITE);
	return (as_checkprot(ttoproc(curthread)->p_as, addr, count, prot));
}
#define	MAX_MAPIN_PAGES	8
/*
 * This function temporarily "borrows" user pages for kernel use. If
 * "cow" is on, it also sets up copy-on-write protection (only feasible
 * on MAP_PRIVATE segment) on the user mappings, to protect the borrowed
 * pages from any changes by the user. The caller is responsible for
 * unlocking and tearing down cow settings when it's done with the pages.
 * For an example, see kcfree().
 *
 * Pages behind [uaddr..uaddr+*lenp] under address space "as" are locked
 * (shared), and mapped into kernel address range [kaddr..kaddr+*lenp] if
 * kaddr != -1. On entering this function, cached_ppp contains a list
 * of pages that are mapped into [kaddr..kaddr+*lenp] already (from a
 * previous call). Thus if the same pages remain behind [uaddr..uaddr+*lenp],
 * the kernel map won't need to be reloaded again.
 *
 * For cow == 1, if the pages are anonymous pages, it also bumps the anon
 * reference count, and changes the user-mapping to read-only. This
 * scheme should work on all types of segment drivers. But to be safe,
 * we check against segvn here.
 *
 * Since this function is used to emulate copyin() semantics, it checks
 * to make sure the user-mappings allow "user-read".
 *
 * On exit "lenp" contains the number of bytes successfully locked and
 * mapped in. For the unsuccessful ones, the caller can fall back to
 * copyin().
 *
 * ENOTSUP - operation like this is not supported either on this segment
 * type, or on this platform type.
 */
cow_mapin(struct as *as, caddr_t uaddr, caddr_t kaddr, struct page **cached_ppp,
    struct anon **app, size_t *lenp, int cow)
	page_t *pp, *ppp[MAX_MAPIN_PAGES];
	size_t size, total = *lenp;
	AS_LOCK_ENTER(as, RW_WRITER);
	seg = as_findseg(as, uaddr, 0);
	if ((seg == NULL) || ((base = seg->s_base) > uaddr) ||
	    (uaddr + total) > base + seg->s_size) {
	/*
	 * The COW scheme should work for all segment types.
	 * But to be safe, we check against segvn.
	 */
	if (seg->s_ops != &segvn_ops) {
	} else if ((SEGOP_GETTYPE(seg, uaddr) & MAP_PRIVATE) == 0) {
	/*
	 * If (cow), hat_softlock will also change the usr protection to RO.
	 * This is the first step toward setting up cow. Before we
	 * bump up an_refcnt, we can't allow any cow-fault on this
	 * address. Otherwise segvn_fault will change the protection back
	 * to RW upon seeing an_refcnt == 1.
	 * The solution is to hold the writer lock on "as".
	 */
	res = hat_softlock(hat, uaddr, &size, &ppp[0], cow ? HAT_COW : 0);
	size = size >> PAGESHIFT;
			/*
			 * Another solution is to hold SE_EXCL on pp, and
			 * disable PROT_WRITE. This also works for MAP_SHARED
			 * segment. The disadvantage is that it locks the
			 * page from being used by anybody else.
			 */
			ahm = AH_MUTEX(pp->p_vnode, pp->p_offset);
			*app = swap_anon(pp->p_vnode, pp->p_offset);
			/*
			 * Since we are holding the as lock, this avoids a
			 * potential race with anon_decref. (segvn_unmap and
			 * segvn_free need the as writer lock to do anon_free.)
			 */
			if ((*app)->an_refcnt == 0)
				/*
				 * Consider the following scenario (unlikely
				 * 2. we softlock the page.
				 * 3. cow occurs on this addr. So a new ap,
				 *    page and mapping is established on addr.
				 * 4. an_refcnt drops to 1 (segvn_faultpage
				 *    -> anon_decref(oldap))
				 * 5. the last ref to ap also drops (from
				 *    another as). It ends up blocked inside
				 *    anon_decref trying to get page's excl lock.
				 * 6. Later kcfree unlocks the page, calls
				 *    anon_decref -> oops, ap is gone already.
				 *
				 * Holding as writer lock solves all problems.
				 */
		if (kaddr != (caddr_t)-1) {
			if (pp != *cached_ppp) {
				if (*cached_ppp == NULL)
					flags = HAT_LOAD_LOCK | HAT_NOSYNC |
					flags = HAT_LOAD_REMAP |
				/*
				 * In order to cache the kernel mapping after
				 * the user page is unlocked, we call
				 * hat_devload instead of hat_memload so
				 * that the kernel mapping we set up here is
				 * "invisible" to the rest of the world. This
				 * is not very pretty. But as long as the
				 * caller bears the responsibility of keeping
				 * cache consistency, we should be ok -
				 * HAT_NOCONSIST will get us an uncached
				 * mapping on VAC. hat_softlock will flush
				 * a VAC_WRITEBACK cache. Therefore the kaddr
				 * doesn't have to be of the same vcolor as
				 * uaddr.
				 * The alternative is - change hat_devload
				 * to get a cached mapping. Allocate a kaddr
				 * with the same vcolor as uaddr. Then
				 * hat_softlock won't need to flush the VAC.
				 */
				hat_devload(kas.a_hat, kaddr, PAGESIZE,
				    page_pptonum(pp), PROT_READ, flags);
	if (first && res == FC_NOMAP) {
		/*
		 * If the address is not mapped yet, we call as_fault to
		 * fault the pages in. We could've fallen back to copy and
		 * let it fault in the pages. But for a mapped file, we
		 * normally reference each page only once. For zero-copy to
		 * be of any use, we'd better fault in the page now and try
		 */
		size = size << PAGESHIFT;
		res = as_fault(as->a_hat, as, uaddr, size, F_INVAL, S_READ);
		AS_LOCK_ENTER(as, RW_WRITER);
	case FC_PROT:	/* Pretend we don't know about it. This will be */
			/* caught by the caller when uiomove fails. */