/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
 */
#include <sys/zfs_context.h>
#include <sys/arc_impl.h>
#include <sys/dnode.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_zfetch.h>
#include <sys/dmu.h>
#include <sys/dbuf.h>
#include <sys/kstat.h>
#include <sys/wmsum.h>
/*
 * This tunable disables predictive prefetch.  Note that it leaves "prescient"
 * prefetch (e.g. prefetch for zfs send) intact.  Unlike predictive prefetch,
 * prescient prefetch never issues i/os that end up not being needed,
 * so it can't hurt performance.
 */

static int zfs_prefetch_disable = B_FALSE;
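
/*
 * Illustrative note (not from the original file): the tunable above is
 * registered as a module parameter at the bottom of this file, so on Linux
 * builds predictive prefetch can be toggled at runtime, e.g.:
 *
 *	echo 1 > /sys/module/zfs/parameters/zfs_prefetch_disable
 */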
/* max # of streams per zfetch */
static unsigned int	zfetch_max_streams = 8;
/* min time before stream reclaim */
static unsigned int	zfetch_min_sec_reap = 1;
/* max time before stream delete */
static unsigned int	zfetch_max_sec_reap = 2;
#ifdef _ILP32
/* min bytes to prefetch per stream (default 2MB) */
static unsigned int	zfetch_min_distance = 2 * 1024 * 1024;
/* max bytes to prefetch per stream (default 8MB) */
unsigned int	zfetch_max_distance = 8 * 1024 * 1024;
#else
/* min bytes to prefetch per stream (default 4MB) */
static unsigned int	zfetch_min_distance = 4 * 1024 * 1024;
/* max bytes to prefetch per stream (default 64MB) */
unsigned int	zfetch_max_distance = 64 * 1024 * 1024;
#endif
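/*
 * Worked example (assuming the 64-bit defaults and a 128KB block size): a
 * sequential stream starts prefetching at the demand access size, doubles
 * the distance on each hit up to zfetch_min_distance (4MB), then grows it
 * by 1/8 per hit only when needed, capped at zfetch_max_distance (64MB);
 * see the distance calculation in dmu_zfetch_prepare() below.
 */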
/* max bytes to prefetch indirects for per stream (default 128MB) */
unsigned int	zfetch_max_idistance = 128 * 1024 * 1024;
/* max request reorder distance within a stream (default 16MB) */
unsigned int	zfetch_max_reorder = 16 * 1024 * 1024;
/* Max log2 fraction of holes in a stream */
unsigned int	zfetch_hole_shift = 2;
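/*
 * E.g. with zfetch_hole_shift = 2 a reordered stream still counts as
 * sequential while its holes cover at most 1/(1 << 2) = 1/4 of the blocks
 * up to a given range end; see dmu_zfetch_future() below.
 */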
typedef struct zfetch_stats {
	kstat_named_t zfetchstat_hits;
	kstat_named_t zfetchstat_future;
	kstat_named_t zfetchstat_stride;
	kstat_named_t zfetchstat_past;
	kstat_named_t zfetchstat_misses;
	kstat_named_t zfetchstat_max_streams;
	kstat_named_t zfetchstat_io_issued;
	kstat_named_t zfetchstat_io_active;
} zfetch_stats_t;
static zfetch_stats_t zfetch_stats = {
	{ "hits",		KSTAT_DATA_UINT64 },
	{ "future",		KSTAT_DATA_UINT64 },
	{ "stride",		KSTAT_DATA_UINT64 },
	{ "past",		KSTAT_DATA_UINT64 },
	{ "misses",		KSTAT_DATA_UINT64 },
	{ "max_streams",	KSTAT_DATA_UINT64 },
	{ "io_issued",		KSTAT_DATA_UINT64 },
	{ "io_active",		KSTAT_DATA_UINT64 },
};
struct {
	wmsum_t zfetchstat_hits;
	wmsum_t zfetchstat_future;
	wmsum_t zfetchstat_stride;
	wmsum_t zfetchstat_past;
	wmsum_t zfetchstat_misses;
	wmsum_t zfetchstat_max_streams;
	wmsum_t zfetchstat_io_issued;
	aggsum_t zfetchstat_io_active;
} zfetch_sums;
#define	ZFETCHSTAT_BUMP(stat)	\
	wmsum_add(&zfetch_sums.stat, 1)
#define	ZFETCHSTAT_ADD(stat, val)	\
	wmsum_add(&zfetch_sums.stat, val)
static kstat_t *zfetch_ksp;
static int
zfetch_kstats_update(kstat_t *ksp, int rw)
{
	zfetch_stats_t *zs = ksp->ks_data;

	if (rw == KSTAT_WRITE)
		return (EACCES);
	zs->zfetchstat_hits.value.ui64 =
	    wmsum_value(&zfetch_sums.zfetchstat_hits);
	zs->zfetchstat_future.value.ui64 =
	    wmsum_value(&zfetch_sums.zfetchstat_future);
	zs->zfetchstat_stride.value.ui64 =
	    wmsum_value(&zfetch_sums.zfetchstat_stride);
	zs->zfetchstat_past.value.ui64 =
	    wmsum_value(&zfetch_sums.zfetchstat_past);
	zs->zfetchstat_misses.value.ui64 =
	    wmsum_value(&zfetch_sums.zfetchstat_misses);
	zs->zfetchstat_max_streams.value.ui64 =
	    wmsum_value(&zfetch_sums.zfetchstat_max_streams);
	zs->zfetchstat_io_issued.value.ui64 =
	    wmsum_value(&zfetch_sums.zfetchstat_io_issued);
	zs->zfetchstat_io_active.value.ui64 =
	    aggsum_value(&zfetch_sums.zfetchstat_io_active);
	return (0);
}
void
zfetch_init(void)
{
	wmsum_init(&zfetch_sums.zfetchstat_hits, 0);
	wmsum_init(&zfetch_sums.zfetchstat_future, 0);
	wmsum_init(&zfetch_sums.zfetchstat_stride, 0);
	wmsum_init(&zfetch_sums.zfetchstat_past, 0);
	wmsum_init(&zfetch_sums.zfetchstat_misses, 0);
	wmsum_init(&zfetch_sums.zfetchstat_max_streams, 0);
	wmsum_init(&zfetch_sums.zfetchstat_io_issued, 0);
	aggsum_init(&zfetch_sums.zfetchstat_io_active, 0);

	zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc",
	    KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);

	if (zfetch_ksp != NULL) {
		zfetch_ksp->ks_data = &zfetch_stats;
		zfetch_ksp->ks_update = zfetch_kstats_update;
		kstat_install(zfetch_ksp);
	}
}
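
/*
 * Illustrative note (not from the original file): the kstat installed above
 * is readable from userland; on Linux, for example:
 *
 *	cat /proc/spl/kstat/zfs/zfetchstats
 */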
void
zfetch_fini(void)
{
	if (zfetch_ksp != NULL) {
		kstat_delete(zfetch_ksp);
		zfetch_ksp = NULL;
	}

	wmsum_fini(&zfetch_sums.zfetchstat_hits);
	wmsum_fini(&zfetch_sums.zfetchstat_future);
	wmsum_fini(&zfetch_sums.zfetchstat_stride);
	wmsum_fini(&zfetch_sums.zfetchstat_past);
	wmsum_fini(&zfetch_sums.zfetchstat_misses);
	wmsum_fini(&zfetch_sums.zfetchstat_max_streams);
	wmsum_fini(&zfetch_sums.zfetchstat_io_issued);
	ASSERT0(aggsum_value(&zfetch_sums.zfetchstat_io_active));
	aggsum_fini(&zfetch_sums.zfetchstat_io_active);
}
/*
 * This takes a pointer to a zfetch structure and a dnode.  It performs the
 * necessary setup for the zfetch structure, grokking data from the
 * associated dnode.
 */
void
dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
{
	if (zf == NULL)
		return;
	zf->zf_dnode = dno;
	zf->zf_numstreams = 0;

	list_create(&zf->zf_stream, sizeof (zstream_t),
	    offsetof(zstream_t, zs_node));

	mutex_init(&zf->zf_lock, NULL, MUTEX_DEFAULT, NULL);
}
static void
dmu_zfetch_stream_fini(zstream_t *zs)
{
	ASSERT(!list_link_active(&zs->zs_node));
	zfs_refcount_destroy(&zs->zs_callers);
	zfs_refcount_destroy(&zs->zs_refs);
	kmem_free(zs, sizeof (*zs));
}
static void
dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
{
	ASSERT(MUTEX_HELD(&zf->zf_lock));
	list_remove(&zf->zf_stream, zs);
	zf->zf_numstreams--;
	membar_producer();
	if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
		dmu_zfetch_stream_fini(zs);
}
/*
 * Clean-up state associated with a zfetch structure (e.g. destroy the
 * streams).  This doesn't free the zfetch_t itself, that's left to the caller.
 */
void
dmu_zfetch_fini(zfetch_t *zf)
{
	zstream_t *zs;

	mutex_enter(&zf->zf_lock);
	while ((zs = list_head(&zf->zf_stream)) != NULL)
		dmu_zfetch_stream_remove(zf, zs);
	mutex_exit(&zf->zf_lock);
	list_destroy(&zf->zf_stream);
	mutex_destroy(&zf->zf_lock);

	zf->zf_dnode = NULL;
}
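
/*
 * Illustrative lifecycle sketch (hypothetical caller, not part of this
 * file): the dnode code embeds a zfetch_t and brackets its lifetime with
 * the two functions above:
 *
 *	dmu_zfetch_init(&dn->dn_zfetch, dn);	// at dnode creation
 *	...					// demand reads feed dmu_zfetch()
 *	dmu_zfetch_fini(&dn->dn_zfetch);	// at dnode destruction
 */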
/*
 * If there aren't too many active streams already, create one more.
 * In the process, delete/reuse all streams without hits for
 * zfetch_max_sec_reap.  If needed, reuse the oldest stream without hits
 * for zfetch_min_sec_reap or ever.
 * The "blkid" argument is the next block that we expect this stream to access.
 */
static void
dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
{
	zstream_t *zs, *zs_next, *zs_old = NULL;
	uint_t now = gethrestime_sec(), t;

	ASSERT(MUTEX_HELD(&zf->zf_lock));

	/*
	 * Delete too old streams, reusing the first found one.
	 */
	t = now - zfetch_max_sec_reap;
	for (zs = list_head(&zf->zf_stream); zs != NULL; zs = zs_next) {
		zs_next = list_next(&zf->zf_stream, zs);
		/*
		 * Skip if still active.  1 -- zf_stream reference.
		 */
		if ((int)(zs->zs_atime - t) >= 0)
			continue;
		if (zfs_refcount_count(&zs->zs_refs) != 1)
			continue;
		if (zs_old != NULL)
			dmu_zfetch_stream_remove(zf, zs);
		else
			zs_old = zs;
	}
	if (zs_old != NULL) {
		zs = zs_old;
		list_remove(&zf->zf_stream, zs);
		goto reuse;
	}

	/*
	 * The maximum number of streams is normally zfetch_max_streams,
	 * but for small files we lower it such that it's at least possible
	 * for all the streams to be non-overlapping.
	 */
	uint32_t max_streams = MAX(1, MIN(zfetch_max_streams,
	    (zf->zf_dnode->dn_maxblkid << zf->zf_dnode->dn_datablkshift) /
	    zfetch_max_distance));
	if (zf->zf_numstreams >= max_streams) {
		t = now - zfetch_min_sec_reap;
		for (zs = list_head(&zf->zf_stream); zs != NULL;
		    zs = list_next(&zf->zf_stream, zs)) {
			if ((int)(zs->zs_atime - t) >= 0)
				continue;
			if (zfs_refcount_count(&zs->zs_refs) != 1)
				continue;
			if (zs_old == NULL ||
			    (int)(zs_old->zs_atime - zs->zs_atime) >= 0)
				zs_old = zs;
		}
		if (zs_old != NULL) {
			zs = zs_old;
			list_remove(&zf->zf_stream, zs);
			goto reuse;
		}
		ZFETCHSTAT_BUMP(zfetchstat_max_streams);
		return;
	}

	zs = kmem_zalloc(sizeof (*zs), KM_SLEEP);
	zfs_refcount_create(&zs->zs_callers);
	zfs_refcount_create(&zs->zs_refs);
	/* One reference for zf_stream. */
	zfs_refcount_add(&zs->zs_refs, NULL);
	zf->zf_numstreams++;

reuse:
	list_insert_head(&zf->zf_stream, zs);
	zs->zs_blkid = blkid;
	/* Allow immediate stream reuse until first hit. */
	zs->zs_atime = now - zfetch_min_sec_reap;
	memset(zs->zs_ranges, 0, sizeof (zs->zs_ranges));
	zs->zs_pf_dist = 0;
	zs->zs_ipf_dist = 0;
	zs->zs_pf_start = blkid;
	zs->zs_pf_end = blkid;
	zs->zs_ipf_start = blkid;
	zs->zs_ipf_end = blkid;
	zs->zs_missed = B_FALSE;
	zs->zs_more = B_FALSE;
}
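
/*
 * Worked example of the reclaim policy above (assuming default tunables):
 * any stream with no hits for more than zfetch_max_sec_reap (2s) is deleted
 * outright, while at the stream limit the oldest stream with no hits for
 * more than zfetch_min_sec_reap (1s) is reused for the new access instead.
 */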
static void
dmu_zfetch_done(void *arg, uint64_t level, uint64_t blkid, boolean_t io_issued)
{
	zstream_t *zs = arg;

	if (io_issued && level == 0 && blkid < zs->zs_blkid)
		zs->zs_more = B_TRUE;
	if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
		dmu_zfetch_stream_fini(zs);
	aggsum_add(&zfetch_sums.zfetchstat_io_active, -1);
}
/*
 * Process stream hit access for nblks blocks starting at zs_blkid.  Return
 * the number of blocks to proceed for after aggregation with future ranges.
 */
static uint64_t
dmu_zfetch_hit(zstream_t *zs, uint64_t nblks)
{
	uint_t i, j;

	/* Optimize sequential accesses (no future ranges). */
	if (zs->zs_ranges[0].start == 0)
		goto done;

	/* Look for intersections with further ranges. */
	for (i = 0; i < ZFETCH_RANGES; i++) {
		zsrange_t *r = &zs->zs_ranges[i];
		if (r->start == 0 || r->start > nblks)
			break;
		if (r->end >= nblks) {
			nblks = r->end;
			i++;
			break;
		}
	}

	/* Delete all found intersecting ranges, updating the remaining. */
	for (j = 0; i < ZFETCH_RANGES; i++, j++) {
		if (zs->zs_ranges[i].start == 0)
			break;
		ASSERT3U(zs->zs_ranges[i].start, >, nblks);
		ASSERT3U(zs->zs_ranges[i].end, >, nblks);
		zs->zs_ranges[j].start = zs->zs_ranges[i].start - nblks;
		zs->zs_ranges[j].end = zs->zs_ranges[i].end - nblks;
	}
	if (j < ZFETCH_RANGES) {
		zs->zs_ranges[j].start = 0;
		zs->zs_ranges[j].end = 0;
	}

done:
	zs->zs_blkid += nblks;
	return (nblks);
}
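
/*
 * Worked example (hypothetical numbers): with a pending future range
 * [2, 5) relative to zs_blkid, a hit of nblks = 3 intersects it, so the
 * hit is extended to 5 blocks, the consumed range is dropped, and
 * zs_blkid advances by 5.
 */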
/*
 * Process future stream access for nblks blocks starting at blkid.  Return
 * the number of blocks to proceed for if the future ranges reach the fill
 * threshold.
 */
static uint64_t
dmu_zfetch_future(zstream_t *zs, uint64_t blkid, uint64_t nblks)
{
	ASSERT3U(blkid, >, zs->zs_blkid);
	blkid -= zs->zs_blkid;
	ASSERT3U(blkid + nblks, <=, UINT16_MAX);

	/* Search for first and last intersection or insert point. */
	uint_t f = ZFETCH_RANGES, l = 0, i;
	for (i = 0; i < ZFETCH_RANGES; i++) {
		zsrange_t *r = &zs->zs_ranges[i];
		if (r->start == 0 || r->start > blkid + nblks)
			break;
		if (r->end < blkid)
			continue;
		if (f > i)
			f = i;
		if (l < i)
			l = i;
	}
	if (f <= l) {
		/* Got some intersecting range, expand it if needed. */
		if (zs->zs_ranges[f].start > blkid)
			zs->zs_ranges[f].start = blkid;
		zs->zs_ranges[f].end = MAX(zs->zs_ranges[l].end, blkid + nblks);
		if (f < l) {
			/* Got more than one intersection, remove others. */
			for (f++, l++; l < ZFETCH_RANGES; f++, l++) {
				zs->zs_ranges[f].start = zs->zs_ranges[l].start;
				zs->zs_ranges[f].end = zs->zs_ranges[l].end;
			}
			zs->zs_ranges[f].start = 0;
			zs->zs_ranges[f].end = 0;
		}
	} else if (i < ZFETCH_RANGES) {
		/* Got no intersecting ranges, insert new one. */
		for (l = ZFETCH_RANGES - 1; l > i; l--) {
			zs->zs_ranges[l].start = zs->zs_ranges[l - 1].start;
			zs->zs_ranges[l].end = zs->zs_ranges[l - 1].end;
		}
		zs->zs_ranges[i].start = blkid;
		zs->zs_ranges[i].end = blkid + nblks;
	} else {
		/* No space left to insert.  Drop the range. */
		return (0);
	}

	/* Check if with the new access addition we reached fill threshold. */
	if (zfetch_hole_shift >= 16)
		return (0);
	uint_t hole = 0;
	for (i = f = l = 0; i < ZFETCH_RANGES; i++) {
		zsrange_t *r = &zs->zs_ranges[i];
		if (r->start == 0)
			break;
		hole += r->start - f;
		f = r->end;
		if (hole <= r->end >> zfetch_hole_shift)
			l = r->end;
	}
	if (l > 0)
		return (dmu_zfetch_hit(zs, l));

	return (0);
}
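
/*
 * Worked example (hypothetical numbers): with recorded ranges [1, 3) and
 * [4, 8), the holes total 2 blocks by block 8; with the default
 * zfetch_hole_shift = 2 the allowance at that point is 8 >> 2 = 2, so the
 * stream counts as filled through block 8 and proceeds via dmu_zfetch_hit().
 */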
/*
 * This is the predictive prefetch entry point.  dmu_zfetch_prepare()
 * associates the dnode access specified with the blkid and nblks arguments
 * with a prefetch stream, predicts further accesses based on that stream's
 * stats and returns the stream pointer on success.  That pointer must later
 * be passed to dmu_zfetch_run() to initiate the speculative prefetch for
 * the stream and release it.  dmu_zfetch() is a wrapper for simple cases
 * when a window between prediction and prefetch initiation is not needed.
 * The fetch_data argument specifies whether actual data blocks should be
 * fetched:
 *   FALSE -- prefetch only indirect blocks for predicted data blocks;
 *   TRUE -- prefetch predicted data blocks plus following indirect blocks.
 */
zstream_t *
dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
    boolean_t fetch_data, boolean_t have_lock)
{
	zstream_t *zs;
	spa_t *spa = zf->zf_dnode->dn_objset->os_spa;
	zfs_prefetch_type_t os_prefetch = zf->zf_dnode->dn_objset->os_prefetch;
	int64_t ipf_start, ipf_end;

	if (zfs_prefetch_disable || os_prefetch == ZFS_PREFETCH_NONE)
		return (NULL);

	if (os_prefetch == ZFS_PREFETCH_METADATA)
		fetch_data = B_FALSE;

	/*
	 * If we haven't yet loaded the indirect vdevs' mappings, we
	 * can only read from blocks that we carefully ensure are on
	 * concrete vdevs (or previously-loaded indirect vdevs).  So we
	 * can't allow the predictive prefetcher to attempt reads of other
	 * blocks (e.g. of the MOS's dnode object).
	 */
	if (!spa_indirect_vdevs_loaded(spa))
		return (NULL);

	/*
	 * As a fast path for small (single-block) files, ignore access
	 * to the first block.
	 */
	if (!have_lock && blkid == 0)
		return (NULL);

	if (!have_lock)
		rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER);

	/*
	 * A fast path for small files for which no prefetch will
	 * happen.
	 */
	uint64_t maxblkid = zf->zf_dnode->dn_maxblkid;
	if (maxblkid < 2) {
		if (!have_lock)
			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
		return (NULL);
	}
	mutex_enter(&zf->zf_lock);

	/*
	 * Find a perfect prefetch stream.  Depending on whether the accesses
	 * are block-aligned, the first block of the new access may either
	 * follow the last block of the previous access, or be equal to it.
	 */
	unsigned int dbs = zf->zf_dnode->dn_datablkshift;
	uint64_t end_blkid = blkid + nblks;
	for (zs = list_head(&zf->zf_stream); zs != NULL;
	    zs = list_next(&zf->zf_stream, zs)) {
		if (blkid == zs->zs_blkid) {
			goto hit;
		} else if (blkid + 1 == zs->zs_blkid) {
			blkid++;
			nblks--;
			goto hit;
		}
	}

	/*
	 * Find a close enough prefetch stream.  An access crossing the
	 * stream position is a hit in its new part.  An access ahead of the
	 * stream position is considered a hit for metadata prefetch, since
	 * we do not care about fill percent, or is stored for the future
	 * otherwise.  An access behind the stream position is silently
	 * ignored, since we already skipped it when reaching fill percent.
	 */
	uint_t max_reorder = MIN((zfetch_max_reorder >> dbs) + 1, UINT16_MAX);
	uint_t t = gethrestime_sec() - zfetch_max_sec_reap;
	for (zs = list_head(&zf->zf_stream); zs != NULL;
	    zs = list_next(&zf->zf_stream, zs)) {
		if (blkid > zs->zs_blkid) {
			if (end_blkid <= zs->zs_blkid + max_reorder) {
				if (!fetch_data) {
					nblks = dmu_zfetch_hit(zs,
					    end_blkid - zs->zs_blkid);
					ZFETCHSTAT_BUMP(zfetchstat_stride);
					goto future;
				}
				nblks = dmu_zfetch_future(zs, blkid, nblks);
				if (nblks > 0)
					ZFETCHSTAT_BUMP(zfetchstat_stride);
				else
					ZFETCHSTAT_BUMP(zfetchstat_future);
				goto future;
			}
		} else if (end_blkid >= zs->zs_blkid) {
			nblks -= zs->zs_blkid - blkid;
			blkid += zs->zs_blkid - blkid;
			goto hit;
		} else if (end_blkid + max_reorder > zs->zs_blkid &&
		    (int)(zs->zs_atime - t) >= 0) {
			ZFETCHSTAT_BUMP(zfetchstat_past);
			zs->zs_atime = gethrestime_sec();
			goto out;
		}
	}

	/*
	 * This access is not part of any existing stream.  Create a new
	 * stream for it unless we are at the end of file.
	 */
	if (end_blkid < maxblkid)
		dmu_zfetch_stream_create(zf, end_blkid);
	mutex_exit(&zf->zf_lock);
	ZFETCHSTAT_BUMP(zfetchstat_misses);
	if (!have_lock)
		rw_exit(&zf->zf_dnode->dn_struct_rwlock);
	return (NULL);

hit:
	nblks = dmu_zfetch_hit(zs, nblks);
	ZFETCHSTAT_BUMP(zfetchstat_hits);

future:
	zs->zs_atime = gethrestime_sec();

	/* Exit if we already prefetched for this position before. */
	if (nblks == 0)
		goto out;

	/* If the file is ending, remove the stream. */
	end_blkid = zs->zs_blkid;
	if (end_blkid >= maxblkid) {
		dmu_zfetch_stream_remove(zf, zs);
out:
		mutex_exit(&zf->zf_lock);
		if (!have_lock)
			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
		return (NULL);
	}

	/*
	 * This access was to a block that we issued a prefetch for on
	 * behalf of this stream.  Calculate further prefetch distances.
	 *
	 * Start prefetch from the demand access size (nblks).  Double the
	 * distance every access up to zfetch_min_distance.  After that only
	 * if needed increase the distance by 1/8 up to zfetch_max_distance.
	 *
	 * Don't double the distance beyond a single block if we have more
	 * than ~6% of ARC held by active prefetches.  It should help with
	 * getting out of RAM on some badly mispredicted read patterns.
	 */
	unsigned int nbytes = nblks << dbs;
	unsigned int pf_nblks;
	if (fetch_data) {
		if (unlikely(zs->zs_pf_dist < nbytes))
			zs->zs_pf_dist = nbytes;
		else if (zs->zs_pf_dist < zfetch_min_distance &&
		    (zs->zs_pf_dist < (1 << dbs) ||
		    aggsum_compare(&zfetch_sums.zfetchstat_io_active,
		    arc_c_max >> (4 + dbs)) < 0))
			zs->zs_pf_dist *= 2;
		else if (zs->zs_more)
			zs->zs_pf_dist += zs->zs_pf_dist / 8;
		zs->zs_more = B_FALSE;
		if (zs->zs_pf_dist > zfetch_max_distance)
			zs->zs_pf_dist = zfetch_max_distance;
		pf_nblks = zs->zs_pf_dist >> dbs;
	} else {
		pf_nblks = 0;
	}
	if (zs->zs_pf_start < end_blkid)
		zs->zs_pf_start = end_blkid;
	if (zs->zs_pf_end < end_blkid + pf_nblks)
		zs->zs_pf_end = end_blkid + pf_nblks;

	/*
	 * Do the same for indirects, starting where we will stop reading
	 * data blocks (and the indirects that point to them).
	 */
	if (unlikely(zs->zs_ipf_dist < nbytes))
		zs->zs_ipf_dist = nbytes;
	else
		zs->zs_ipf_dist *= 2;
	if (zs->zs_ipf_dist > zfetch_max_idistance)
		zs->zs_ipf_dist = zfetch_max_idistance;
	pf_nblks = zs->zs_ipf_dist >> dbs;
	if (zs->zs_ipf_start < zs->zs_pf_end)
		zs->zs_ipf_start = zs->zs_pf_end;
	ipf_start = zs->zs_ipf_end;
	if (zs->zs_ipf_end < zs->zs_pf_end + pf_nblks)
		zs->zs_ipf_end = zs->zs_pf_end + pf_nblks;

	zfs_refcount_add(&zs->zs_refs, NULL);
	/* Count concurrent callers. */
	zfs_refcount_add(&zs->zs_callers, NULL);
	mutex_exit(&zf->zf_lock);

	/*
	 * Prefetch the following indirect blocks for this access to reduce
	 * dbuf_hold() sync read delays in dmu_buf_hold_array_by_dnode().
	 * This covers the gap during the first couple accesses when we can
	 * not predict the future yet, but know what is needed right now.
	 * This should be very rare for reads/writes to need more than one
	 * indirect, but more useful for cloning due to much bigger accesses.
	 */
	ipf_start = MAX(ipf_start, blkid + 1);
	int epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
	ipf_start = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs;
	ipf_end = P2ROUNDUP(end_blkid, 1 << epbs) >> epbs;

	int issued = 0;
	for (int64_t iblk = ipf_start; iblk < ipf_end; iblk++) {
		issued += dbuf_prefetch(zf->zf_dnode, 1, iblk,
		    ZIO_PRIORITY_SYNC_READ, ARC_FLAG_PRESCIENT_PREFETCH);
	}

	if (!have_lock)
		rw_exit(&zf->zf_dnode->dn_struct_rwlock);
	if (issued)
		ZFETCHSTAT_ADD(zfetchstat_io_issued, issued);
	return (zs);
}
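
/*
 * Usage sketch (hypothetical caller, not part of this file): a demand read
 * path separates prediction from prefetch issue so the speculative I/O can
 * start after the demand I/O has been set up:
 *
 *	zstream_t *zs = dmu_zfetch_prepare(&dn->dn_zfetch, blkid, nblks,
 *	    B_TRUE, B_FALSE);
 *	// ... issue the demand reads themselves ...
 *	if (zs != NULL)
 *		dmu_zfetch_run(&dn->dn_zfetch, zs, missed, B_FALSE);
 */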
void
dmu_zfetch_run(zfetch_t *zf, zstream_t *zs, boolean_t missed,
    boolean_t have_lock)
{
	int64_t pf_start, pf_end, ipf_start, ipf_end;
	int epbs, issued;

	if (missed)
		zs->zs_missed = missed;

	/*
	 * Postpone the prefetch if there are more concurrent callers.
	 * It happens when multiple requests are waiting for the same
	 * indirect block.  The last one will run the prefetch for all.
	 */
	if (zfs_refcount_remove(&zs->zs_callers, NULL) != 0) {
		/* Drop reference taken in dmu_zfetch_prepare(). */
		if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
			dmu_zfetch_stream_fini(zs);
		return;
	}

	mutex_enter(&zf->zf_lock);
	if (zs->zs_missed) {
		pf_start = zs->zs_pf_start;
		pf_end = zs->zs_pf_start = zs->zs_pf_end;
	} else {
		pf_start = pf_end = 0;
	}
	ipf_start = zs->zs_ipf_start;
	ipf_end = zs->zs_ipf_start = zs->zs_ipf_end;
	mutex_exit(&zf->zf_lock);
	ASSERT3S(pf_start, <=, pf_end);
	ASSERT3S(ipf_start, <=, ipf_end);

	epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
	ipf_start = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs;
	ipf_end = P2ROUNDUP(ipf_end, 1 << epbs) >> epbs;
	ASSERT3S(ipf_start, <=, ipf_end);
	issued = pf_end - pf_start + ipf_end - ipf_start;
	if (issued > 1) {
		/* More references on top of taken in dmu_zfetch_prepare(). */
		zfs_refcount_add_few(&zs->zs_refs, issued - 1, NULL);
	} else if (issued == 0) {
		/* Some other thread has done our work, so drop the ref. */
		if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
			dmu_zfetch_stream_fini(zs);
		return;
	}
	aggsum_add(&zfetch_sums.zfetchstat_io_active, issued);

	if (!have_lock)
		rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER);

	issued = 0;
	for (int64_t blk = pf_start; blk < pf_end; blk++) {
		issued += dbuf_prefetch_impl(zf->zf_dnode, 0, blk,
		    ZIO_PRIORITY_ASYNC_READ, 0, dmu_zfetch_done, zs);
	}
	for (int64_t iblk = ipf_start; iblk < ipf_end; iblk++) {
		issued += dbuf_prefetch_impl(zf->zf_dnode, 1, iblk,
		    ZIO_PRIORITY_ASYNC_READ, 0, dmu_zfetch_done, zs);
	}

	if (!have_lock)
		rw_exit(&zf->zf_dnode->dn_struct_rwlock);

	if (issued)
		ZFETCHSTAT_ADD(zfetchstat_io_issued, issued);
}
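
/*
 * Illustrative scenario for the postponement above (hypothetical): if two
 * threads blocked on the same indirect block, both called
 * dmu_zfetch_prepare() and bumped zs_callers; the first thread to reach
 * dmu_zfetch_run() only drops its reference, and the last one issues the
 * combined prefetch on behalf of both.
 */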
void
dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
    boolean_t missed, boolean_t have_lock)
{
	zstream_t *zs;

	zs = dmu_zfetch_prepare(zf, blkid, nblks, fetch_data, have_lock);
	if (zs)
		dmu_zfetch_run(zf, zs, missed, have_lock);
}
ZFS_MODULE_PARAM(zfs_prefetch, zfs_prefetch_, disable, INT, ZMOD_RW,
	"Disable all ZFS prefetching");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_streams, UINT, ZMOD_RW,
	"Max number of streams per zfetch");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, min_sec_reap, UINT, ZMOD_RW,
	"Min time before stream reclaim");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_sec_reap, UINT, ZMOD_RW,
	"Max time before stream delete");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, min_distance, UINT, ZMOD_RW,
	"Min bytes to prefetch per stream");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_distance, UINT, ZMOD_RW,
	"Max bytes to prefetch per stream");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_idistance, UINT, ZMOD_RW,
	"Max bytes to prefetch indirects for per stream");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_reorder, UINT, ZMOD_RW,
	"Max request reorder distance within a stream");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, hole_shift, UINT, ZMOD_RW,
	"Max log2 fraction of holes in a stream");