4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
27 * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
30 #include <sys/zfs_context.h>
31 #include <sys/arc_impl.h>
32 #include <sys/dnode.h>
33 #include <sys/dmu_objset.h>
34 #include <sys/dmu_zfetch.h>
37 #include <sys/kstat.h>
38 #include <sys/wmsum.h>
41 * This tunable disables predictive prefetch. Note that it leaves "prescient"
42 * prefetch (e.g. prefetch for zfs send) intact. Unlike predictive prefetch,
43 * prescient prefetch never issues i/os that end up not being needed,
44 * so it can't hurt performance.
47 static int zfs_prefetch_disable
= B_FALSE
;
/* Upper bound on the number of prefetch streams kept per zfetch. */
static unsigned int zfetch_max_streams = 8;

/* Minimum time, in seconds, before a stream may be reclaimed. */
static unsigned int zfetch_min_sec_reap = 1;

/* Maximum time, in seconds, before a stream is deleted. */
static unsigned int zfetch_max_sec_reap = 2;
/*
 * Per-stream data prefetch window, in bytes.  Two default pairs exist: a
 * smaller one (2MB minimum, 8MB maximum) and a larger one (4MB minimum,
 * 64MB maximum).  Both pairs define the same two variables, so exactly one
 * of them may be compiled in.
 *
 * NOTE(review): the selector is assumed to be _ILP32, i.e. 32-bit builds get
 * the smaller window -- confirm against the platform's data-model macros.
 */
#ifdef	_ILP32
/* min bytes to prefetch per stream (default 2MB) */
static unsigned int	zfetch_min_distance = 2 * 1024 * 1024;
/* max bytes to prefetch per stream (default 8MB) */
unsigned int	zfetch_max_distance = 8 * 1024 * 1024;
#else
/* min bytes to prefetch per stream (default 4MB) */
static unsigned int	zfetch_min_distance = 4 * 1024 * 1024;
/* max bytes to prefetch per stream (default 64MB) */
unsigned int	zfetch_max_distance = 64 * 1024 * 1024;
#endif
/*
 * Per-stream cap, in bytes, on the prefetch window used for indirect
 * blocks (64MB by default).
 */
unsigned int zfetch_max_idistance = 64 * 1024 * 1024;
69 typedef struct zfetch_stats
{
70 kstat_named_t zfetchstat_hits
;
71 kstat_named_t zfetchstat_misses
;
72 kstat_named_t zfetchstat_max_streams
;
73 kstat_named_t zfetchstat_io_issued
;
74 kstat_named_t zfetchstat_io_active
;
77 static zfetch_stats_t zfetch_stats
= {
78 { "hits", KSTAT_DATA_UINT64
},
79 { "misses", KSTAT_DATA_UINT64
},
80 { "max_streams", KSTAT_DATA_UINT64
},
81 { "io_issued", KSTAT_DATA_UINT64
},
82 { "io_active", KSTAT_DATA_UINT64
},
86 wmsum_t zfetchstat_hits
;
87 wmsum_t zfetchstat_misses
;
88 wmsum_t zfetchstat_max_streams
;
89 wmsum_t zfetchstat_io_issued
;
90 aggsum_t zfetchstat_io_active
;
/* Add val to the counter named stat inside zfetch_sums. */
#define	ZFETCHSTAT_ADD(stat, val) \
	wmsum_add(&zfetch_sums.stat, val)
/* Increment the counter named stat inside zfetch_sums by one. */
#define	ZFETCHSTAT_BUMP(stat) \
	ZFETCHSTAT_ADD(stat, 1)
99 static kstat_t
*zfetch_ksp
;
102 zfetch_kstats_update(kstat_t
*ksp
, int rw
)
104 zfetch_stats_t
*zs
= ksp
->ks_data
;
106 if (rw
== KSTAT_WRITE
)
108 zs
->zfetchstat_hits
.value
.ui64
=
109 wmsum_value(&zfetch_sums
.zfetchstat_hits
);
110 zs
->zfetchstat_misses
.value
.ui64
=
111 wmsum_value(&zfetch_sums
.zfetchstat_misses
);
112 zs
->zfetchstat_max_streams
.value
.ui64
=
113 wmsum_value(&zfetch_sums
.zfetchstat_max_streams
);
114 zs
->zfetchstat_io_issued
.value
.ui64
=
115 wmsum_value(&zfetch_sums
.zfetchstat_io_issued
);
116 zs
->zfetchstat_io_active
.value
.ui64
=
117 aggsum_value(&zfetch_sums
.zfetchstat_io_active
);
124 wmsum_init(&zfetch_sums
.zfetchstat_hits
, 0);
125 wmsum_init(&zfetch_sums
.zfetchstat_misses
, 0);
126 wmsum_init(&zfetch_sums
.zfetchstat_max_streams
, 0);
127 wmsum_init(&zfetch_sums
.zfetchstat_io_issued
, 0);
128 aggsum_init(&zfetch_sums
.zfetchstat_io_active
, 0);
130 zfetch_ksp
= kstat_create("zfs", 0, "zfetchstats", "misc",
131 KSTAT_TYPE_NAMED
, sizeof (zfetch_stats
) / sizeof (kstat_named_t
),
134 if (zfetch_ksp
!= NULL
) {
135 zfetch_ksp
->ks_data
= &zfetch_stats
;
136 zfetch_ksp
->ks_update
= zfetch_kstats_update
;
137 kstat_install(zfetch_ksp
);
144 if (zfetch_ksp
!= NULL
) {
145 kstat_delete(zfetch_ksp
);
149 wmsum_fini(&zfetch_sums
.zfetchstat_hits
);
150 wmsum_fini(&zfetch_sums
.zfetchstat_misses
);
151 wmsum_fini(&zfetch_sums
.zfetchstat_max_streams
);
152 wmsum_fini(&zfetch_sums
.zfetchstat_io_issued
);
153 ASSERT0(aggsum_value(&zfetch_sums
.zfetchstat_io_active
));
154 aggsum_fini(&zfetch_sums
.zfetchstat_io_active
);
158 * This takes a pointer to a zfetch structure and a dnode. It performs the
159 * necessary setup for the zfetch structure, grokking data from the
163 dmu_zfetch_init(zfetch_t
*zf
, dnode_t
*dno
)
168 zf
->zf_numstreams
= 0;
170 list_create(&zf
->zf_stream
, sizeof (zstream_t
),
171 offsetof(zstream_t
, zs_node
));
173 mutex_init(&zf
->zf_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
177 dmu_zfetch_stream_fini(zstream_t
*zs
)
179 ASSERT(!list_link_active(&zs
->zs_node
));
180 zfs_refcount_destroy(&zs
->zs_callers
);
181 zfs_refcount_destroy(&zs
->zs_refs
);
182 kmem_free(zs
, sizeof (*zs
));
186 dmu_zfetch_stream_remove(zfetch_t
*zf
, zstream_t
*zs
)
188 ASSERT(MUTEX_HELD(&zf
->zf_lock
));
189 list_remove(&zf
->zf_stream
, zs
);
192 if (zfs_refcount_remove(&zs
->zs_refs
, NULL
) == 0)
193 dmu_zfetch_stream_fini(zs
);
197 * Clean-up state associated with a zfetch structure (e.g. destroy the
198 * streams). This doesn't free the zfetch_t itself, that's left to the caller.
201 dmu_zfetch_fini(zfetch_t
*zf
)
205 mutex_enter(&zf
->zf_lock
);
206 while ((zs
= list_head(&zf
->zf_stream
)) != NULL
)
207 dmu_zfetch_stream_remove(zf
, zs
);
208 mutex_exit(&zf
->zf_lock
);
209 list_destroy(&zf
->zf_stream
);
210 mutex_destroy(&zf
->zf_lock
);
216 * If there aren't too many active streams already, create one more.
217 * In the process, delete/reuse all streams without hits for zfetch_max_sec_reap seconds.
218 * If needed, reuse the oldest stream without hits for zfetch_min_sec_reap seconds, or without any hits at all.
219 * The "blkid" argument is the next block that we expect this stream to access.
222 dmu_zfetch_stream_create(zfetch_t
*zf
, uint64_t blkid
)
224 zstream_t
*zs
, *zs_next
, *zs_old
= NULL
;
225 hrtime_t now
= gethrtime(), t
;
227 ASSERT(MUTEX_HELD(&zf
->zf_lock
));
230 * Delete too old streams, reusing the first found one.
232 t
= now
- SEC2NSEC(zfetch_max_sec_reap
);
233 for (zs
= list_head(&zf
->zf_stream
); zs
!= NULL
; zs
= zs_next
) {
234 zs_next
= list_next(&zf
->zf_stream
, zs
);
236 * Skip if still active. 1 -- zf_stream reference.
238 if (zfs_refcount_count(&zs
->zs_refs
) != 1)
240 if (zs
->zs_atime
> t
)
243 dmu_zfetch_stream_remove(zf
, zs
);
253 * The maximum number of streams is normally zfetch_max_streams,
254 * but for small files we lower it such that it's at least possible
255 * for all the streams to be non-overlapping.
257 uint32_t max_streams
= MAX(1, MIN(zfetch_max_streams
,
258 zf
->zf_dnode
->dn_maxblkid
* zf
->zf_dnode
->dn_datablksz
/
259 zfetch_max_distance
));
260 if (zf
->zf_numstreams
>= max_streams
) {
261 t
= now
- SEC2NSEC(zfetch_min_sec_reap
);
262 for (zs
= list_head(&zf
->zf_stream
); zs
!= NULL
;
263 zs
= list_next(&zf
->zf_stream
, zs
)) {
264 if (zfs_refcount_count(&zs
->zs_refs
) != 1)
266 if (zs
->zs_atime
> t
)
268 if (zs_old
== NULL
|| zs
->zs_atime
< zs_old
->zs_atime
)
275 ZFETCHSTAT_BUMP(zfetchstat_max_streams
);
279 zs
= kmem_zalloc(sizeof (*zs
), KM_SLEEP
);
281 zfs_refcount_create(&zs
->zs_callers
);
282 zfs_refcount_create(&zs
->zs_refs
);
283 /* One reference for zf_stream. */
284 zfs_refcount_add(&zs
->zs_refs
, NULL
);
286 list_insert_head(&zf
->zf_stream
, zs
);
289 zs
->zs_blkid
= blkid
;
291 zs
->zs_pf_start
= blkid
;
292 zs
->zs_pf_end
= blkid
;
294 zs
->zs_ipf_start
= blkid
;
295 zs
->zs_ipf_end
= blkid
;
296 /* Allow immediate stream reuse until first hit. */
297 zs
->zs_atime
= now
- SEC2NSEC(zfetch_min_sec_reap
);
298 zs
->zs_missed
= B_FALSE
;
299 zs
->zs_more
= B_FALSE
;
303 dmu_zfetch_done(void *arg
, uint64_t level
, uint64_t blkid
, boolean_t io_issued
)
307 if (io_issued
&& level
== 0 && blkid
< zs
->zs_blkid
)
308 zs
->zs_more
= B_TRUE
;
309 if (zfs_refcount_remove(&zs
->zs_refs
, NULL
) == 0)
310 dmu_zfetch_stream_fini(zs
);
311 aggsum_add(&zfetch_sums
.zfetchstat_io_active
, -1);
315 * This is the predictive prefetch entry point. dmu_zfetch_prepare()
316 * associates dnode access specified with blkid and nblks arguments with
317 * prefetch stream, predicts further accesses based on those stats and returns
318 * the stream pointer on success. That pointer must later be passed to
319 * dmu_zfetch_run() to initiate the speculative prefetch for the stream and
320 * release it. dmu_zfetch() is a wrapper for simple cases when window between
321 * prediction and prefetch initiation is not needed.
322 * fetch_data argument specifies whether actual data blocks should be fetched:
323 * FALSE -- prefetch only indirect blocks for predicted data blocks;
324 * TRUE -- prefetch predicted data blocks plus following indirect blocks.
327 dmu_zfetch_prepare(zfetch_t
*zf
, uint64_t blkid
, uint64_t nblks
,
328 boolean_t fetch_data
, boolean_t have_lock
)
331 spa_t
*spa
= zf
->zf_dnode
->dn_objset
->os_spa
;
333 if (zfs_prefetch_disable
)
336 * If we haven't yet loaded the indirect vdevs' mappings, we
337 * can only read from blocks that we carefully ensure are on
338 * concrete vdevs (or previously-loaded indirect vdevs). So we
339 * can't allow the predictive prefetcher to attempt reads of other
340 * blocks (e.g. of the MOS's dnode object).
342 if (!spa_indirect_vdevs_loaded(spa
))
346 * As a fast path for small (single-block) files, ignore access
347 * to the first block.
349 if (!have_lock
&& blkid
== 0)
353 rw_enter(&zf
->zf_dnode
->dn_struct_rwlock
, RW_READER
);
356 * A fast path for small files for which no prefetch will
359 uint64_t maxblkid
= zf
->zf_dnode
->dn_maxblkid
;
362 rw_exit(&zf
->zf_dnode
->dn_struct_rwlock
);
365 mutex_enter(&zf
->zf_lock
);
368 * Find matching prefetch stream. Depending on whether the accesses
369 * are block-aligned, first block of the new access may either follow
370 * the last block of the previous access, or be equal to it.
372 for (zs
= list_head(&zf
->zf_stream
); zs
!= NULL
;
373 zs
= list_next(&zf
->zf_stream
, zs
)) {
374 if (blkid
== zs
->zs_blkid
) {
376 } else if (blkid
+ 1 == zs
->zs_blkid
) {
384 * If the file is ending, remove the matching stream if found.
385 * If not found then it is too late to create a new one now.
387 uint64_t end_of_access_blkid
= blkid
+ nblks
;
388 if (end_of_access_blkid
>= maxblkid
) {
390 dmu_zfetch_stream_remove(zf
, zs
);
391 mutex_exit(&zf
->zf_lock
);
393 rw_exit(&zf
->zf_dnode
->dn_struct_rwlock
);
397 /* Exit if we already prefetched this block before. */
399 mutex_exit(&zf
->zf_lock
);
401 rw_exit(&zf
->zf_dnode
->dn_struct_rwlock
);
407 * This access is not part of any existing stream. Create
408 * a new stream for it.
410 dmu_zfetch_stream_create(zf
, end_of_access_blkid
);
411 mutex_exit(&zf
->zf_lock
);
413 rw_exit(&zf
->zf_dnode
->dn_struct_rwlock
);
414 ZFETCHSTAT_BUMP(zfetchstat_misses
);
419 * This access was to a block that we issued a prefetch for on
420 * behalf of this stream. Calculate further prefetch distances.
422 * Start prefetch from the demand access size (nblks). Double the
423 * distance every access up to zfetch_min_distance. After that only
424 * if needed increase the distance by 1/8 up to zfetch_max_distance.
426 * Don't double the distance beyond single block if we have more
427 * than ~6% of ARC held by active prefetches. It should help with
428 * getting out of RAM on some badly mispredicted read patterns.
430 unsigned int dbs
= zf
->zf_dnode
->dn_datablkshift
;
431 unsigned int nbytes
= nblks
<< dbs
;
432 unsigned int pf_nblks
;
434 if (unlikely(zs
->zs_pf_dist
< nbytes
))
435 zs
->zs_pf_dist
= nbytes
;
436 else if (zs
->zs_pf_dist
< zfetch_min_distance
&&
437 (zs
->zs_pf_dist
< (1 << dbs
) ||
438 aggsum_compare(&zfetch_sums
.zfetchstat_io_active
,
439 arc_c_max
>> (4 + dbs
)) < 0))
441 else if (zs
->zs_more
)
442 zs
->zs_pf_dist
+= zs
->zs_pf_dist
/ 8;
443 zs
->zs_more
= B_FALSE
;
444 if (zs
->zs_pf_dist
> zfetch_max_distance
)
445 zs
->zs_pf_dist
= zfetch_max_distance
;
446 pf_nblks
= zs
->zs_pf_dist
>> dbs
;
450 if (zs
->zs_pf_start
< end_of_access_blkid
)
451 zs
->zs_pf_start
= end_of_access_blkid
;
452 if (zs
->zs_pf_end
< end_of_access_blkid
+ pf_nblks
)
453 zs
->zs_pf_end
= end_of_access_blkid
+ pf_nblks
;
456 * Do the same for indirects, starting where we will stop reading
457 * data blocks (and the indirects that point to them).
459 if (unlikely(zs
->zs_ipf_dist
< nbytes
))
460 zs
->zs_ipf_dist
= nbytes
;
462 zs
->zs_ipf_dist
*= 2;
463 if (zs
->zs_ipf_dist
> zfetch_max_idistance
)
464 zs
->zs_ipf_dist
= zfetch_max_idistance
;
465 pf_nblks
= zs
->zs_ipf_dist
>> dbs
;
466 if (zs
->zs_ipf_start
< zs
->zs_pf_end
)
467 zs
->zs_ipf_start
= zs
->zs_pf_end
;
468 if (zs
->zs_ipf_end
< zs
->zs_pf_end
+ pf_nblks
)
469 zs
->zs_ipf_end
= zs
->zs_pf_end
+ pf_nblks
;
471 zs
->zs_blkid
= end_of_access_blkid
;
472 /* Protect the stream from reclamation. */
473 zs
->zs_atime
= gethrtime();
474 zfs_refcount_add(&zs
->zs_refs
, NULL
);
475 /* Count concurrent callers. */
476 zfs_refcount_add(&zs
->zs_callers
, NULL
);
477 mutex_exit(&zf
->zf_lock
);
480 rw_exit(&zf
->zf_dnode
->dn_struct_rwlock
);
482 ZFETCHSTAT_BUMP(zfetchstat_hits
);
487 dmu_zfetch_run(zstream_t
*zs
, boolean_t missed
, boolean_t have_lock
)
489 zfetch_t
*zf
= zs
->zs_fetch
;
490 int64_t pf_start
, pf_end
, ipf_start
, ipf_end
;
494 zs
->zs_missed
= missed
;
497 * Postpone the prefetch if there are more concurrent callers.
498 * It happens when multiple requests are waiting for the same
499 * indirect block. The last one will run the prefetch for all.
501 if (zfs_refcount_remove(&zs
->zs_callers
, NULL
) != 0) {
502 /* Drop reference taken in dmu_zfetch_prepare(). */
503 if (zfs_refcount_remove(&zs
->zs_refs
, NULL
) == 0)
504 dmu_zfetch_stream_fini(zs
);
508 mutex_enter(&zf
->zf_lock
);
510 pf_start
= zs
->zs_pf_start
;
511 pf_end
= zs
->zs_pf_start
= zs
->zs_pf_end
;
513 pf_start
= pf_end
= 0;
515 ipf_start
= zs
->zs_ipf_start
;
516 ipf_end
= zs
->zs_ipf_start
= zs
->zs_ipf_end
;
517 mutex_exit(&zf
->zf_lock
);
518 ASSERT3S(pf_start
, <=, pf_end
);
519 ASSERT3S(ipf_start
, <=, ipf_end
);
521 epbs
= zf
->zf_dnode
->dn_indblkshift
- SPA_BLKPTRSHIFT
;
522 ipf_start
= P2ROUNDUP(ipf_start
, 1 << epbs
) >> epbs
;
523 ipf_end
= P2ROUNDUP(ipf_end
, 1 << epbs
) >> epbs
;
524 ASSERT3S(ipf_start
, <=, ipf_end
);
525 issued
= pf_end
- pf_start
+ ipf_end
- ipf_start
;
527 /* More references on top of taken in dmu_zfetch_prepare(). */
528 zfs_refcount_add_few(&zs
->zs_refs
, issued
- 1, NULL
);
529 } else if (issued
== 0) {
530 /* Some other thread has done our work, so drop the ref. */
531 if (zfs_refcount_remove(&zs
->zs_refs
, NULL
) == 0)
532 dmu_zfetch_stream_fini(zs
);
535 aggsum_add(&zfetch_sums
.zfetchstat_io_active
, issued
);
538 rw_enter(&zf
->zf_dnode
->dn_struct_rwlock
, RW_READER
);
541 for (int64_t blk
= pf_start
; blk
< pf_end
; blk
++) {
542 issued
+= dbuf_prefetch_impl(zf
->zf_dnode
, 0, blk
,
543 ZIO_PRIORITY_ASYNC_READ
, 0, dmu_zfetch_done
, zs
);
545 for (int64_t iblk
= ipf_start
; iblk
< ipf_end
; iblk
++) {
546 issued
+= dbuf_prefetch_impl(zf
->zf_dnode
, 1, iblk
,
547 ZIO_PRIORITY_ASYNC_READ
, 0, dmu_zfetch_done
, zs
);
551 rw_exit(&zf
->zf_dnode
->dn_struct_rwlock
);
554 ZFETCHSTAT_ADD(zfetchstat_io_issued
, issued
);
558 dmu_zfetch(zfetch_t
*zf
, uint64_t blkid
, uint64_t nblks
, boolean_t fetch_data
,
559 boolean_t missed
, boolean_t have_lock
)
563 zs
= dmu_zfetch_prepare(zf
, blkid
, nblks
, fetch_data
, have_lock
);
565 dmu_zfetch_run(zs
, missed
, have_lock
);
568 ZFS_MODULE_PARAM(zfs_prefetch
, zfs_prefetch_
, disable
, INT
, ZMOD_RW
,
569 "Disable all ZFS prefetching");
571 ZFS_MODULE_PARAM(zfs_prefetch
, zfetch_
, max_streams
, UINT
, ZMOD_RW
,
572 "Max number of streams per zfetch");
574 ZFS_MODULE_PARAM(zfs_prefetch
, zfetch_
, min_sec_reap
, UINT
, ZMOD_RW
,
575 "Min time before stream reclaim");
577 ZFS_MODULE_PARAM(zfs_prefetch
, zfetch_
, max_sec_reap
, UINT
, ZMOD_RW
,
578 "Max time before stream delete");
580 ZFS_MODULE_PARAM(zfs_prefetch
, zfetch_
, min_distance
, UINT
, ZMOD_RW
,
581 "Min bytes to prefetch per stream");
583 ZFS_MODULE_PARAM(zfs_prefetch
, zfetch_
, max_distance
, UINT
, ZMOD_RW
,
584 "Max bytes to prefetch per stream");
586 ZFS_MODULE_PARAM(zfs_prefetch
, zfetch_
, max_idistance
, UINT
, ZMOD_RW
,
587 "Max bytes to prefetch indirects for per stream");