/*-
 * Copyright (c)2003, 2005, 2009 YAMAMOTO Takashi,
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * uvm_object read-ahead
 *
 * TODO:
 *	- handle multiple streams.
 *	- find a better way to deal with PGO_LOCKED pager requests.
 *	  (currently just ignored)
 *	- consider the amount of memory in the system.
 *	- consider the speed of the underlying device.
 *	- consider filesystem block size / block layout.
 */
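
/*
 * In outline: uvm_ra_request() keeps a per-context sliding window
 * [ra_winstart, ra_winstart + ra_winsize) over the object.  A request
 * falling inside the window is a read-ahead hit: the window is advanced
 * and enlarged, and ra_startio() is asked to bring the part of the window
 * beyond ra_next into core.  A request outside the window resets the
 * context.
 */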
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD$");

#include <sys/param.h>
#include <sys/pool.h>

#include <uvm/uvm.h>
#include <uvm/uvm_readahead.h>
#if defined(READAHEAD_DEBUG)
#define DPRINTF(a)	printf a
#else /* defined(READAHEAD_DEBUG) */
#define DPRINTF(a)	/* nothing */
#endif /* defined(READAHEAD_DEBUG) */
/*
 * uvm_ractx: read-ahead context.
 */

struct uvm_ractx {
	int ra_flags;
#define RA_VALID	1
	off_t ra_winstart;	/* window start offset */
	size_t ra_winsize;	/* window size */
	off_t ra_next;		/* next offset to read-ahead */
};
#if defined(sun2) || (defined(sun3) && defined(_SUN3_))
/* XXX: on sun2 and sun3 (but not sun3x) MAXPHYS is 0xe000 */
#undef MAXPHYS
#define MAXPHYS		0x8000	/* XXX */
#endif
#define RA_WINSIZE_INIT		MAXPHYS		/* initial window size */
#define RA_WINSIZE_MAX		(MAXPHYS * 8)	/* max window size */
#define RA_WINSIZE_SEQENTIAL	RA_WINSIZE_MAX	/* fixed window size used for
						   SEQUENTIAL hint */
#define RA_MINSIZE		(MAXPHYS * 2)	/* min size to start i/o */
#define RA_IOCHUNK		MAXPHYS		/* read-ahead i/o chunk size */
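
/*
 * For example, assuming MAXPHYS is 64KB (the exact value is
 * machine-dependent), the window starts at 64KB and may grow to 512KB,
 * i/o is issued in 64KB chunks, and no read-ahead is started until at
 * least 128KB of it can be issued at once.
 */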
static off_t ra_startio(struct uvm_object *, off_t, size_t);
static struct uvm_ractx *ra_allocctx(void);
static void ra_freectx(struct uvm_ractx *);

static struct pool_cache ractx_cache;
/*
 * uvm_ra_init: initialize readahead module.
 */

void
uvm_ra_init(void)
{

	pool_cache_bootstrap(&ractx_cache, sizeof(struct uvm_ractx), 0, 0, 0,
	    "ractx", NULL, IPL_NONE, NULL, NULL, NULL);
}
static struct uvm_ractx *
ra_allocctx(void)
{

	return pool_cache_get(&ractx_cache, PR_NOWAIT);
}

static void
ra_freectx(struct uvm_ractx *ra)
{

	pool_cache_put(&ractx_cache, ra);
}
/*
 * ra_startio: start i/o for read-ahead.
 *
 * => start i/o for each RA_IOCHUNK sized chunk.
 * => return offset to which we started i/o.
 */
static off_t
ra_startio(struct uvm_object *uobj, off_t off, size_t sz)
{
	const off_t endoff = off + sz;

	DPRINTF(("%s: uobj=%p, off=%" PRIu64 ", endoff=%" PRIu64 "\n",
	    __func__, uobj, off, endoff));
	off = trunc_page(off);
	while (off < endoff) {
		const size_t chunksize = RA_IOCHUNK;
		int error;
		size_t donebytes;
		int npages;
		int orignpages;
		size_t bytelen;

		KASSERT((chunksize & (chunksize - 1)) == 0);
		KASSERT((off & PAGE_MASK) == 0);
		bytelen = ((off + chunksize) & -(off_t)chunksize) - off;
		KASSERT((bytelen & PAGE_MASK) == 0);
		npages = orignpages = bytelen >> PAGE_SHIFT;
		KASSERT(npages != 0);
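
		/*
		 * e.g. assuming chunksize is 0x10000: for off 0x3000 the
		 * expression above yields bytelen 0xd000, so the first
		 * chunk is trimmed and later iterations start on
		 * chunksize-aligned offsets with full-size chunks.
		 */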

		/*
		 * use UVM_ADV_RANDOM to avoid recursion.
		 */

		mutex_enter(&uobj->vmobjlock);
		error = (*uobj->pgops->pgo_get)(uobj, off, NULL,
		    &npages, 0, VM_PROT_READ, UVM_ADV_RANDOM, 0);
		DPRINTF(("%s: off=%" PRIu64 ", bytelen=%zu -> %d\n",
		    __func__, off, bytelen, error));
		if (error != 0 && error != EBUSY) {
			if (error != EINVAL) { /* maybe past EOF */
				DPRINTF(("%s: error=%d\n", __func__, error));
			}
			break;
		}
		KASSERT(orignpages == npages);
		donebytes = orignpages << PAGE_SHIFT;
		off += donebytes;
	}

	return off;
}
/* ------------------------------------------------------------ */

/*
 * uvm_ra_allocctx: allocate a context.
 */

struct uvm_ractx *
uvm_ra_allocctx(void)
{
	struct uvm_ractx *ra;

	ra = ra_allocctx();
	if (ra != NULL) {
		ra->ra_flags = 0;
	}
	return ra;
}
/*
 * uvm_ra_freectx: free a context.
 */

void
uvm_ra_freectx(struct uvm_ractx *ra)
{

	KASSERT(ra != NULL);
	ra_freectx(ra);
}
/*
 * uvm_ra_request: update a read-ahead context and start i/o if appropriate.
 *
 * => called when [reqoff, reqoff+reqsize) is requested.
 * => object must be locked by caller, will return locked.
 */
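
/*
 * Typical usage (a sketch; the identifiers below are illustrative, not
 * actual call sites): a file read path keeps a uvm_ractx per open file
 * and, while holding the object lock, calls
 *
 *	uvm_ra_request(file_ractx, advice, uobj, readoff, readsize);
 *
 * before doing the synchronous pgo_get for [readoff, readoff+readsize).
 */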
void
uvm_ra_request(struct uvm_ractx *ra, int advice, struct uvm_object *uobj,
    off_t reqoff, size_t reqsize)
{

	KASSERT(mutex_owned(&uobj->vmobjlock));

	if (ra == NULL || advice == UVM_ADV_RANDOM) {
		return;
	}
	if (advice == UVM_ADV_SEQUENTIAL) {

		/*
		 * always do read-ahead with a large window.
		 */

		if ((ra->ra_flags & RA_VALID) == 0) {
			ra->ra_winstart = ra->ra_next = 0;
			ra->ra_flags |= RA_VALID;
		}
		if (reqoff < ra->ra_winstart) {
			ra->ra_next = reqoff;
		}
		ra->ra_winsize = RA_WINSIZE_SEQENTIAL;
		goto do_readahead;
	}
	/*
	 * a request with UVM_ADV_NORMAL hint.  (ie. no hint)
	 *
	 * we keep a sliding window in order to determine:
	 *	- if the previous read-ahead was successful or not.
	 *	- how many bytes to read-ahead.
	 */
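
	/*
	 * Illustration (assuming MAXPHYS is 64KB): for an application doing
	 * sequential 16KB read(2)s, the first request sets up a 64KB window.
	 * Each later request is a hit and grows the window by 16KB (up to
	 * 512KB); once the not-yet-prefetched part of the window reaches
	 * RA_MINSIZE (128KB), ra_startio() is called below.
	 */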
	/*
	 * if it's the first request for this context,
	 * initialize context and return.
	 */

	if ((ra->ra_flags & RA_VALID) == 0) {
initialize:
		ra->ra_winstart = ra->ra_next = reqoff + reqsize;
		ra->ra_winsize = RA_WINSIZE_INIT;
		ra->ra_flags |= RA_VALID;
		goto done;
	}
	/*
	 * if it isn't in our window,
	 * initialize context and return.
	 */

	if (reqoff < ra->ra_winstart ||
	    ra->ra_winstart + ra->ra_winsize < reqoff) {

		/*
		 * ... unless we seem to be reading the same chunk repeatedly.
		 *
		 * XXX should have some margin?
		 */

		if (reqoff + reqsize == ra->ra_winstart) {
			DPRINTF(("%s: %p: same block: off=%" PRIu64
			    ", size=%zd, winstart=%" PRIu64 "\n",
			    __func__, ra, reqoff, reqsize, ra->ra_winstart));
			goto done;
		}
		goto initialize;
	}
	/*
	 * it's in our window. (read-ahead hit)
	 *	- start read-ahead i/o if appropriate.
	 *	- advance and enlarge window.
	 */

do_readahead:

	/*
	 * don't bother to read-ahead behind current request.
	 */

	if (reqoff > ra->ra_next) {
		ra->ra_next = reqoff;
	}
	/*
	 * try to make [reqoff, reqoff+ra_winsize) in-core.
	 * note that [reqoff, ra_next) is considered already done.
	 */
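
	/*
	 * Since ra_next >= reqoff after the clamp above, raoff below is
	 * simply ra_next and rasize is the not-yet-prefetched remainder of
	 * the window, reqoff + ra_winsize - ra_next.
	 */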
	if (reqoff + ra->ra_winsize > ra->ra_next) {
		off_t raoff = MAX(reqoff, ra->ra_next);
		size_t rasize = reqoff + ra->ra_winsize - ra->ra_next;

#if defined(DIAGNOSTIC)
		if (rasize > RA_WINSIZE_MAX) {
			printf("%s: corrupted context", __func__);
			rasize = RA_WINSIZE_MAX;
		}
#endif /* defined(DIAGNOSTIC) */

		/*
		 * issue read-ahead only if we can start big enough i/o.
		 * otherwise we end up with a stream of small i/o.
		 */

		if (rasize >= RA_MINSIZE) {
			off_t next;

			mutex_exit(&uobj->vmobjlock);
			next = ra_startio(uobj, raoff, rasize);
			mutex_enter(&uobj->vmobjlock);
			ra->ra_next = next;
		}
	}
	/*
	 * update window.
	 *
	 * enlarge window by reqsize, so that it grows in a predictable manner
	 * regardless of the size of each read(2).
	 */

	ra->ra_winstart = reqoff + reqsize;
	ra->ra_winsize = MIN(RA_WINSIZE_MAX, ra->ra_winsize + reqsize);

done:;
}
int
uvm_readahead(struct uvm_object *uobj, off_t off, off_t size)
{

	/*
	 * don't allow too much read-ahead.
	 */

	if (size > RA_WINSIZE_MAX) {
		size = RA_WINSIZE_MAX;
	}
	ra_startio(uobj, off, size);
	return 0;
}