/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
#pragma ident	"%Z%%M%	%I%	%E% SMI"
#include <sys/zfs_context.h>
#include <sys/dnode.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_zfetch.h>
#include <sys/dmu.h>
#include <sys/dbuf.h>
/*
 * I'm against tune-ables, but these should probably exist as tweakable globals
 * until we can get this working the way we want it to.
 */
int zfs_prefetch_disable = 0;

/* max # of streams per zfetch */
uint32_t	zfetch_max_streams = 8;
/* min time before stream reclaim */
uint32_t	zfetch_min_sec_reap = 2;
/* max number of blocks to fetch at a time */
uint32_t	zfetch_block_cap = 256;
/* number of bytes in an array_read at which we stop prefetching (1Mb) */
uint64_t	zfetch_array_rd_sz = 1024 * 1024;
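
/*
 * Sizing example (illustrative numbers, assuming a 128K block size):
 * zfetch_block_cap allows a single stream to issue up to
 * 256 * 128K = 32M of prefetch I/O in one batch, while array reads
 * larger than zfetch_array_rd_sz (1M) are not prefetched at all,
 * per the comment above.
 */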
/* forward decls for static routines */
static int		dmu_zfetch_colinear(zfetch_t *, zstream_t *);
static void		dmu_zfetch_dofetch(zfetch_t *, zstream_t *);
static uint64_t		dmu_zfetch_fetch(dnode_t *, uint64_t, uint64_t);
static uint64_t		dmu_zfetch_fetchsz(dnode_t *, uint64_t, uint64_t);
static int		dmu_zfetch_find(zfetch_t *, zstream_t *, int);
static int		dmu_zfetch_stream_insert(zfetch_t *, zstream_t *);
static zstream_t	*dmu_zfetch_stream_reclaim(zfetch_t *);
static void		dmu_zfetch_stream_remove(zfetch_t *, zstream_t *);
static int		dmu_zfetch_streams_equal(zstream_t *, zstream_t *);
/*
 * Given a zfetch structure and a zstream structure, determine whether the
 * blocks to be read are part of a co-linear pair of existing prefetch
 * streams.  If a set is found, coalesce the streams, removing one, and
 * configure the prefetch so it looks for a strided access pattern.
 *
 * In other words: if we find two sequential access streams that are
 * the same length and distance N apart, and this read is N from the
 * last stream, then we are probably in a strided access pattern.  So
 * combine the two sequential streams into a single strided stream.
 *
 * If no co-linear streams are found, return 0.
 */
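/*
 * Worked example (illustrative numbers): suppose stream A covers blocks
 * [0, 4) and stream B covers [10, 14), both purely sequential
 * (len == stride == 4).  Then diff = 10 - 0 = 10, and a new read at
 * block 20 satisfies B->zst_offset + diff == 20, so A and B coalesce
 * into one strided stream: zst_offset = 20, zst_stride = 10, and the
 * prefetch-ahead offset becomes 20 + 10 = 30.
 */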
static int
dmu_zfetch_colinear(zfetch_t *zf, zstream_t *zh)
{
	zstream_t	*z_walk;
	zstream_t	*z_comp;

	if (! rw_tryenter(&zf->zf_rwlock, RW_WRITER))
		return (0);

	if (zh == NULL) {
		rw_exit(&zf->zf_rwlock);
		return (0);
	}
	for (z_walk = list_head(&zf->zf_stream); z_walk;
	    z_walk = list_next(&zf->zf_stream, z_walk)) {
		for (z_comp = list_next(&zf->zf_stream, z_walk); z_comp;
		    z_comp = list_next(&zf->zf_stream, z_comp)) {
			int64_t		diff;

			/* only coalesce purely sequential streams */
			if (z_walk->zst_len != z_walk->zst_stride ||
			    z_comp->zst_len != z_comp->zst_stride) {
				continue;
			}
			diff = z_comp->zst_offset - z_walk->zst_offset;
			if (z_comp->zst_offset + diff == zh->zst_offset) {
				z_walk->zst_offset = zh->zst_offset;
				z_walk->zst_direction = diff < 0 ? -1 : 1;
				z_walk->zst_stride =
				    diff * z_walk->zst_direction;
				z_walk->zst_ph_offset =
				    zh->zst_offset + z_walk->zst_stride;
				dmu_zfetch_stream_remove(zf, z_comp);
				mutex_destroy(&z_comp->zst_lock);
				kmem_free(z_comp, sizeof (zstream_t));

				dmu_zfetch_dofetch(zf, z_walk);

				rw_exit(&zf->zf_rwlock);
				return (1);
			}
			diff = z_walk->zst_offset - z_comp->zst_offset;
			if (z_walk->zst_offset + diff == zh->zst_offset) {
				z_walk->zst_offset = zh->zst_offset;
				z_walk->zst_direction = diff < 0 ? -1 : 1;
				z_walk->zst_stride =
				    diff * z_walk->zst_direction;
				z_walk->zst_ph_offset =
				    zh->zst_offset + z_walk->zst_stride;
				dmu_zfetch_stream_remove(zf, z_comp);
				mutex_destroy(&z_comp->zst_lock);
				kmem_free(z_comp, sizeof (zstream_t));

				dmu_zfetch_dofetch(zf, z_walk);

				rw_exit(&zf->zf_rwlock);
				return (1);
			}
		}
	}
	rw_exit(&zf->zf_rwlock);
	return (0);
}
/*
 * Given a zstream_t, determine the bounds of the prefetch.  Then call the
 * routine that actually prefetches the individual blocks.
 */
static void
dmu_zfetch_dofetch(zfetch_t *zf, zstream_t *zs)
{
	uint64_t	prefetch_tail;
	uint64_t	prefetch_limit;
	uint64_t	prefetch_ofst;
	uint64_t	prefetch_len;
	uint64_t	blocks_fetched;

	zs->zst_stride = MAX((int64_t)zs->zst_stride, zs->zst_len);
	zs->zst_cap = MIN(zfetch_block_cap, 2 * zs->zst_cap);
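
	/*
	 * Note: zst_cap doubles on every call until it reaches
	 * zfetch_block_cap, so the fetch-ahead window ramps up roughly
	 * like TCP slow-start, e.g. 1, 2, 4, ... 256 blocks.
	 */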
	prefetch_tail = MAX((int64_t)zs->zst_ph_offset,
	    (int64_t)(zs->zst_offset + zs->zst_stride));
	/*
	 * XXX: use a faster division method?
	 */
	prefetch_limit = zs->zst_offset + zs->zst_len +
	    (zs->zst_cap * zs->zst_stride) / zs->zst_len;
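
	/*
	 * Worked example (illustrative numbers): for a sequential stream
	 * with zst_offset = 0, zst_len = 2, zst_stride = 2, zst_cap = 8
	 * and zst_ph_offset = 2, prefetch_limit = 0 + 2 + (8 * 2) / 2 = 10,
	 * so the loop below issues fetches at tails 2, 4, 6 and 8 --
	 * four strides of two blocks, i.e. the full cap of 8 blocks.
	 */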
	while (prefetch_tail < prefetch_limit) {
		prefetch_ofst = zs->zst_offset + zs->zst_direction *
		    (prefetch_tail - zs->zst_offset);

		prefetch_len = zs->zst_len;

		/*
		 * Don't prefetch beyond the beginning of the file, if working
		 * backwards.
		 */
		if ((zs->zst_direction == ZFETCH_BACKWARD) &&
		    (prefetch_ofst > prefetch_tail)) {
			prefetch_len += prefetch_ofst;
			prefetch_ofst = 0;
		}

		/* don't prefetch more than we're supposed to */
		if (prefetch_len > zs->zst_len)
			break;

		blocks_fetched = dmu_zfetch_fetch(zf->zf_dnode,
		    prefetch_ofst, zs->zst_len);

		prefetch_tail += zs->zst_stride;
		/* stop if we've run out of stuff to prefetch */
		if (blocks_fetched < zs->zst_len)
			break;
	}

	zs->zst_ph_offset = prefetch_tail;
	zs->zst_last = lbolt;
}
/*
 * This takes a pointer to a zfetch structure and a dnode.  It performs the
 * necessary setup for the zfetch structure, grokking data from the
 * associated dnode.
 */
void
dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
{
	if (zf == NULL) {
		return;
	}

	zf->zf_dnode = dno;
	zf->zf_stream_cnt = 0;
	zf->zf_alloc_fail = 0;

	list_create(&zf->zf_stream, sizeof (zstream_t),
	    offsetof(zstream_t, zst_node));

	rw_init(&zf->zf_rwlock, NULL, RW_DEFAULT, NULL);
}
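
/*
 * N.B.: a zfetch_t set up here is expected to be torn down with
 * dmu_zfetch_rele() below, which empties zf_stream and destroys the
 * list and rwlock created above, but leaves freeing the zfetch_t
 * itself to the caller.
 */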
/*
 * This function computes the actual size, in blocks, that can be prefetched,
 * and fetches it.
 */
static uint64_t
dmu_zfetch_fetch(dnode_t *dn, uint64_t blkid, uint64_t nblks)
{
	uint64_t	fetchsz;
	uint64_t	i;

	fetchsz = dmu_zfetch_fetchsz(dn, blkid, nblks);

	for (i = 0; i < fetchsz; i++) {
		dbuf_prefetch(dn, blkid + i);
	}

	return (fetchsz);
}
/*
 * This function returns the number of blocks that would be prefetched, based
 * upon the supplied dnode, blockid, and nblks.  This is used so that we can
 * update streams in place, and then prefetch with their old value after the
 * fact.  This way, we can delay the prefetch, but subsequent accesses to the
 * stream won't result in the same data being prefetched multiple times.
 */
static uint64_t
dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks)
{
	uint64_t	fetchsz;

	if (blkid > dn->dn_maxblkid) {
		return (0);
	}

	/* compute fetch size */
	if (blkid + nblks + 1 > dn->dn_maxblkid) {
		fetchsz = (dn->dn_maxblkid - blkid) + 1;
		ASSERT(blkid + fetchsz - 1 <= dn->dn_maxblkid);
	} else {
		fetchsz = nblks;
	}

	return (fetchsz);
}
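
/*
 * Worked example (illustrative numbers): with dn_maxblkid = 100, a
 * request for blkid = 98, nblks = 5 is clamped to
 * fetchsz = (100 - 98) + 1 = 3, covering blocks 98, 99 and 100.
 */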
/*
 * Given a zfetch and a zsearch structure, see if there is an associated
 * zstream for this block read.  If so, start a prefetch for the stream it
 * located and return 1; otherwise return 0.
 */
static int
dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched)
{
	zstream_t	*zs;
	int64_t		diff;
	int		reset = !prefetched;
	int		rc = 0;

	if (zh == NULL)
		return (0);
	/*
	 * XXX: This locking strategy is a bit coarse; however, its impact has
	 * yet to be tested.  If this turns out to be an issue, it can be
	 * modified in a number of different ways.
	 */

	rw_enter(&zf->zf_rwlock, RW_READER);
top:
	for (zs = list_head(&zf->zf_stream); zs;
	    zs = list_next(&zf->zf_stream, zs)) {

		/*
		 * XXX - should this be an assert?
		 */
		if (zs->zst_len == 0) {
			/* bogus stream */
			continue;
		}
		/*
		 * We hit this case when we are in a strided prefetch stream:
		 * we will read "len" blocks before "striding".
		 */
		if (zh->zst_offset >= zs->zst_offset &&
		    zh->zst_offset < zs->zst_offset + zs->zst_len) {
			/* already fetched */
			rc = 1;
			goto out;
		}
		/*
		 * This is the forward sequential read case: we increment
		 * len by one each time we hit here, so we will enter this
		 * case on every read.
		 */
		if (zh->zst_offset == zs->zst_offset + zs->zst_len) {

			reset = !prefetched && zs->zst_len > 1;

			mutex_enter(&zs->zst_lock);

			/* recheck under the lock; the stream may have moved */
			if (zh->zst_offset != zs->zst_offset + zs->zst_len) {
				mutex_exit(&zs->zst_lock);
				goto top;
			}
			zs->zst_len += zh->zst_len;
			diff = zs->zst_len - zfetch_block_cap;
			if (diff > 0) {
				zs->zst_offset += diff;
				zs->zst_len = zs->zst_len > diff ?
				    zs->zst_len - diff : 0;
			}
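			/*
			 * Worked example (illustrative numbers): if the
			 * stream grows to zst_len = 260 with
			 * zfetch_block_cap = 256, then diff = 4, so the
			 * window slides forward: zst_offset += 4 and
			 * zst_len is trimmed back to 256.
			 */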
			zs->zst_direction = ZFETCH_FORWARD;

			break;

		/*
		 * Same as above, but reading backwards through the file.
		 */
		} else if (zh->zst_offset == zs->zst_offset - zh->zst_len) {
			/* backwards sequential access */

			reset = !prefetched && zs->zst_len > 1;

			mutex_enter(&zs->zst_lock);

			/* recheck under the lock; the stream may have moved */
			if (zh->zst_offset != zs->zst_offset - zh->zst_len) {
				mutex_exit(&zs->zst_lock);
				goto top;
			}

			zs->zst_offset = zs->zst_offset > zh->zst_len ?
			    zs->zst_offset - zh->zst_len : 0;
			zs->zst_ph_offset = zs->zst_ph_offset > zh->zst_len ?
			    zs->zst_ph_offset - zh->zst_len : 0;
			zs->zst_len += zh->zst_len;

			diff = zs->zst_len - zfetch_block_cap;
			if (diff > 0) {
				zs->zst_ph_offset = zs->zst_ph_offset > diff ?
				    zs->zst_ph_offset - diff : 0;
				zs->zst_len = zs->zst_len > diff ?
				    zs->zst_len - diff : zs->zst_len;
			}
			zs->zst_direction = ZFETCH_BACKWARD;

			break;

		} else if ((zh->zst_offset - zs->zst_offset - zs->zst_stride <
		    zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
			/* strided forward access */
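			/*
			 * Worked example (illustrative numbers): for a
			 * strided stream with zst_offset = 20, zst_len = 2
			 * and zst_stride = 10, a read at block 30 gives
			 * 30 - 20 - 10 = 0 < 2, so it matches, and the
			 * stream steps forward to zst_offset = 30 below.
			 */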
			mutex_enter(&zs->zst_lock);

			/* recheck under the lock; the stream may have moved */
			if ((zh->zst_offset - zs->zst_offset -
			    zs->zst_stride >= zs->zst_len) ||
			    (zs->zst_len == zs->zst_stride)) {
				mutex_exit(&zs->zst_lock);
				goto top;
			}

			zs->zst_offset += zs->zst_stride;
			zs->zst_direction = ZFETCH_FORWARD;

			break;

		} else if ((zh->zst_offset - zs->zst_offset + zs->zst_stride <
		    zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
			/* strided reverse access */

			mutex_enter(&zs->zst_lock);

			/* recheck under the lock; the stream may have moved */
			if ((zh->zst_offset - zs->zst_offset +
			    zs->zst_stride >= zs->zst_len) ||
			    (zs->zst_len == zs->zst_stride)) {
				mutex_exit(&zs->zst_lock);
				goto top;
			}

			zs->zst_offset = zs->zst_offset > zs->zst_stride ?
			    zs->zst_offset - zs->zst_stride : 0;
			zs->zst_ph_offset = (zs->zst_ph_offset >
			    (2 * zs->zst_stride)) ?
			    (zs->zst_ph_offset - (2 * zs->zst_stride)) : 0;
			zs->zst_direction = ZFETCH_BACKWARD;

			break;
		}
	}
	if (zs) {
		if (reset) {
			zstream_t *remove = zs;

			rc = 0;
			mutex_exit(&zs->zst_lock);
			rw_exit(&zf->zf_rwlock);
			rw_enter(&zf->zf_rwlock, RW_WRITER);
			/*
			 * Relocate the stream, in case someone removes
			 * it while we were acquiring the WRITER lock.
			 */
			for (zs = list_head(&zf->zf_stream); zs;
			    zs = list_next(&zf->zf_stream, zs)) {
				if (zs == remove) {
					dmu_zfetch_stream_remove(zf, zs);
					mutex_destroy(&zs->zst_lock);
					kmem_free(zs, sizeof (zstream_t));
					break;
				}
			}
		} else {
			rc = 1;
			dmu_zfetch_dofetch(zf, zs);
			mutex_exit(&zs->zst_lock);
		}
	}
out:
	rw_exit(&zf->zf_rwlock);
	return (rc);
}
/*
 * Clean-up state associated with a zfetch structure.  This frees allocated
 * structure members, empties the zf_stream list, and generally makes things
 * nice.  This doesn't free the zfetch_t itself, that's left to the caller.
 */
void
dmu_zfetch_rele(zfetch_t *zf)
{
	zstream_t	*zs;
	zstream_t	*zs_next;

	ASSERT(!RW_LOCK_HELD(&zf->zf_rwlock));

	for (zs = list_head(&zf->zf_stream); zs; zs = zs_next) {
		zs_next = list_next(&zf->zf_stream, zs);

		list_remove(&zf->zf_stream, zs);
		mutex_destroy(&zs->zst_lock);
		kmem_free(zs, sizeof (zstream_t));
	}
	list_destroy(&zf->zf_stream);
	rw_destroy(&zf->zf_rwlock);

	zf->zf_dnode = NULL;
}
/*
 * Given a zfetch and zstream structure, insert the zstream structure into the
 * stream list contained within the zfetch structure.  Perform the appropriate
 * book-keeping.  It is possible that another thread has inserted a stream
 * which matches one that we are about to insert, so we must be sure to check
 * for this case.  If one is found, return failure, and let the caller clean
 * up the duplicates.
 */
static int
dmu_zfetch_stream_insert(zfetch_t *zf, zstream_t *zs)
{
	zstream_t	*zs_walk;
	zstream_t	*zs_next;

	ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));

	for (zs_walk = list_head(&zf->zf_stream); zs_walk; zs_walk = zs_next) {
		zs_next = list_next(&zf->zf_stream, zs_walk);

		if (dmu_zfetch_streams_equal(zs_walk, zs)) {
			return (0);
		}
	}

	list_insert_head(&zf->zf_stream, zs);
	zf->zf_stream_cnt++;

	return (1);
}
/*
 * Walk the list of zstreams in the given zfetch, find an old one (by time),
 * and reclaim it for use by the caller.
 */
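/*
 * A stream is "old" once it has been idle for more than
 * zfetch_min_sec_reap seconds: (lbolt - zst_last) is an age in clock
 * ticks, and dividing by hz converts it to whole seconds, so with the
 * default of 2 a stream becomes reclaimable after roughly 3 idle seconds.
 */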
static zstream_t *
dmu_zfetch_stream_reclaim(zfetch_t *zf)
{
	zstream_t	*zs;

	if (! rw_tryenter(&zf->zf_rwlock, RW_WRITER))
		return (NULL);

	for (zs = list_head(&zf->zf_stream); zs;
	    zs = list_next(&zf->zf_stream, zs)) {

		if (((lbolt - zs->zst_last) / hz) > zfetch_min_sec_reap)
			break;
	}

	if (zs) {
		dmu_zfetch_stream_remove(zf, zs);
		mutex_destroy(&zs->zst_lock);
		bzero(zs, sizeof (zstream_t));
	} else {
		zf->zf_alloc_fail++;
	}
	rw_exit(&zf->zf_rwlock);

	return (zs);
}
/*
 * Given a zfetch and zstream structure, remove the zstream structure from its
 * container in the zfetch structure.  Perform the appropriate book-keeping.
 */
static void
dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
{
	ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));

	list_remove(&zf->zf_stream, zs);
	zf->zf_stream_cnt--;
}
static int
dmu_zfetch_streams_equal(zstream_t *zs1, zstream_t *zs2)
{
	if (zs1->zst_offset != zs2->zst_offset)
		return (0);

	if (zs1->zst_len != zs2->zst_len)
		return (0);

	if (zs1->zst_stride != zs2->zst_stride)
		return (0);

	if (zs1->zst_ph_offset != zs2->zst_ph_offset)
		return (0);

	if (zs1->zst_cap != zs2->zst_cap)
		return (0);

	if (zs1->zst_direction != zs2->zst_direction)
		return (0);

	return (1);
}
/*
 * This is the prefetch entry point.  It calls all of the other dmu_zfetch
 * routines to create, delete, find, or operate upon prefetch streams.
 */
void
dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched)
{
	zstream_t	zst;
	zstream_t	*newstream;
	int		fetched;
	int		inserted;
	unsigned int	blkshft;
	uint64_t	blksz;
;
583 if (zfs_prefetch_disable
)
586 /* files that aren't ln2 blocksz are only one block -- nothing to do */
587 if (!zf
->zf_dnode
->dn_datablkshift
)
	/* convert offset and size into blockid and nblocks */
	blkshft = zf->zf_dnode->dn_datablkshift;
	blksz = (1 << blkshft);

	bzero(&zst, sizeof (zstream_t));
	zst.zst_offset = offset >> blkshft;
	zst.zst_len = (P2ROUNDUP(offset + size, blksz) -
	    P2ALIGN(offset, blksz)) >> blkshft;
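
	/*
	 * Worked example (illustrative numbers): with a 128K block size
	 * (blkshft = 17), a read of size = 50000 at offset = 100000 gives
	 * zst_offset = 100000 >> 17 = 0 and zst_len =
	 * (P2ROUNDUP(150000, 131072) - P2ALIGN(100000, 131072)) >> 17 =
	 * (262144 - 0) >> 17 = 2, i.e. the access touches blocks 0 and 1.
	 */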
	fetched = dmu_zfetch_find(zf, &zst, prefetched);
	if (!fetched) {
		fetched = dmu_zfetch_colinear(zf, &zst);
	}
	if (!fetched) {
		newstream = dmu_zfetch_stream_reclaim(zf);

		/*
		 * We still couldn't find a stream, drop the lock, and allocate
		 * one if possible.  Otherwise, give up and go home.
		 */
		if (newstream == NULL) {
			uint64_t	maxblocks;
			uint32_t	max_streams;
			uint32_t	cur_streams;

			cur_streams = zf->zf_stream_cnt;
			maxblocks = zf->zf_dnode->dn_maxblkid;

			max_streams = MIN(zfetch_max_streams,
			    (maxblocks / zfetch_block_cap));
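			/*
			 * Worked example (illustrative numbers): a file of
			 * 512 blocks allows MIN(8, 512 / 256) = 2 streams,
			 * so small files get proportionally fewer streams.
			 */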
			if (max_streams == 0) {
				max_streams++;
			}

			if (cur_streams >= max_streams) {
				return;
			}

			newstream = kmem_zalloc(sizeof (zstream_t), KM_SLEEP);
		}
		newstream->zst_offset = zst.zst_offset;
		newstream->zst_len = zst.zst_len;
		newstream->zst_stride = zst.zst_len;
		newstream->zst_ph_offset = zst.zst_len + zst.zst_offset;
		newstream->zst_cap = zst.zst_len;
		newstream->zst_direction = ZFETCH_FORWARD;
		newstream->zst_last = lbolt;

		mutex_init(&newstream->zst_lock, NULL, MUTEX_DEFAULT, NULL);

		rw_enter(&zf->zf_rwlock, RW_WRITER);
		inserted = dmu_zfetch_stream_insert(zf, newstream);
		rw_exit(&zf->zf_rwlock);

		if (!inserted) {
			mutex_destroy(&newstream->zst_lock);
			kmem_free(newstream, sizeof (zstream_t));
		}
	}
}