ZIL: Call brt_pending_add() replaying TX_CLONE_RANGE
[zfs.git] / module / zfs / dmu_zfetch.c
blobd0acaf5020664b0c8e31c3160edc8131d4d013a9
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
27 * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
30 #include <sys/zfs_context.h>
31 #include <sys/arc_impl.h>
32 #include <sys/dnode.h>
33 #include <sys/dmu_objset.h>
34 #include <sys/dmu_zfetch.h>
35 #include <sys/dmu.h>
36 #include <sys/dbuf.h>
37 #include <sys/kstat.h>
38 #include <sys/wmsum.h>
41 * This tunable disables predictive prefetch. Note that it leaves "prescient"
42 * prefetch (e.g. prefetch for zfs send) intact. Unlike predictive prefetch,
43 * prescient prefetch never issues i/os that end up not being needed,
44 * so it can't hurt performance.
47 static int zfs_prefetch_disable = B_FALSE;
49 /* max # of streams per zfetch */
50 static unsigned int zfetch_max_streams = 8;
51 /* min time before stream reclaim */
52 static unsigned int zfetch_min_sec_reap = 1;
53 /* max time before stream delete */
54 static unsigned int zfetch_max_sec_reap = 2;
55 #ifdef _ILP32
56 /* min bytes to prefetch per stream (default 2MB) */
57 static unsigned int zfetch_min_distance = 2 * 1024 * 1024;
58 /* max bytes to prefetch per stream (default 8MB) */
59 unsigned int zfetch_max_distance = 8 * 1024 * 1024;
60 #else
61 /* min bytes to prefetch per stream (default 4MB) */
62 static unsigned int zfetch_min_distance = 4 * 1024 * 1024;
63 /* max bytes to prefetch per stream (default 64MB) */
64 unsigned int zfetch_max_distance = 64 * 1024 * 1024;
65 #endif
66 /* max bytes to prefetch indirects for per stream (default 64MB) */
67 unsigned int zfetch_max_idistance = 64 * 1024 * 1024;
69 typedef struct zfetch_stats {
70 kstat_named_t zfetchstat_hits;
71 kstat_named_t zfetchstat_misses;
72 kstat_named_t zfetchstat_max_streams;
73 kstat_named_t zfetchstat_io_issued;
74 kstat_named_t zfetchstat_io_active;
75 } zfetch_stats_t;
77 static zfetch_stats_t zfetch_stats = {
78 { "hits", KSTAT_DATA_UINT64 },
79 { "misses", KSTAT_DATA_UINT64 },
80 { "max_streams", KSTAT_DATA_UINT64 },
81 { "io_issued", KSTAT_DATA_UINT64 },
82 { "io_active", KSTAT_DATA_UINT64 },
85 struct {
86 wmsum_t zfetchstat_hits;
87 wmsum_t zfetchstat_misses;
88 wmsum_t zfetchstat_max_streams;
89 wmsum_t zfetchstat_io_issued;
90 aggsum_t zfetchstat_io_active;
91 } zfetch_sums;
93 #define ZFETCHSTAT_BUMP(stat) \
94 wmsum_add(&zfetch_sums.stat, 1)
95 #define ZFETCHSTAT_ADD(stat, val) \
96 wmsum_add(&zfetch_sums.stat, val)
99 static kstat_t *zfetch_ksp;
101 static int
102 zfetch_kstats_update(kstat_t *ksp, int rw)
104 zfetch_stats_t *zs = ksp->ks_data;
106 if (rw == KSTAT_WRITE)
107 return (EACCES);
108 zs->zfetchstat_hits.value.ui64 =
109 wmsum_value(&zfetch_sums.zfetchstat_hits);
110 zs->zfetchstat_misses.value.ui64 =
111 wmsum_value(&zfetch_sums.zfetchstat_misses);
112 zs->zfetchstat_max_streams.value.ui64 =
113 wmsum_value(&zfetch_sums.zfetchstat_max_streams);
114 zs->zfetchstat_io_issued.value.ui64 =
115 wmsum_value(&zfetch_sums.zfetchstat_io_issued);
116 zs->zfetchstat_io_active.value.ui64 =
117 aggsum_value(&zfetch_sums.zfetchstat_io_active);
118 return (0);
121 void
122 zfetch_init(void)
124 wmsum_init(&zfetch_sums.zfetchstat_hits, 0);
125 wmsum_init(&zfetch_sums.zfetchstat_misses, 0);
126 wmsum_init(&zfetch_sums.zfetchstat_max_streams, 0);
127 wmsum_init(&zfetch_sums.zfetchstat_io_issued, 0);
128 aggsum_init(&zfetch_sums.zfetchstat_io_active, 0);
130 zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc",
131 KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t),
132 KSTAT_FLAG_VIRTUAL);
134 if (zfetch_ksp != NULL) {
135 zfetch_ksp->ks_data = &zfetch_stats;
136 zfetch_ksp->ks_update = zfetch_kstats_update;
137 kstat_install(zfetch_ksp);
141 void
142 zfetch_fini(void)
144 if (zfetch_ksp != NULL) {
145 kstat_delete(zfetch_ksp);
146 zfetch_ksp = NULL;
149 wmsum_fini(&zfetch_sums.zfetchstat_hits);
150 wmsum_fini(&zfetch_sums.zfetchstat_misses);
151 wmsum_fini(&zfetch_sums.zfetchstat_max_streams);
152 wmsum_fini(&zfetch_sums.zfetchstat_io_issued);
153 ASSERT0(aggsum_value(&zfetch_sums.zfetchstat_io_active));
154 aggsum_fini(&zfetch_sums.zfetchstat_io_active);
158 * This takes a pointer to a zfetch structure and a dnode. It performs the
159 * necessary setup for the zfetch structure, grokking data from the
160 * associated dnode.
162 void
163 dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
165 if (zf == NULL)
166 return;
167 zf->zf_dnode = dno;
168 zf->zf_numstreams = 0;
170 list_create(&zf->zf_stream, sizeof (zstream_t),
171 offsetof(zstream_t, zs_node));
173 mutex_init(&zf->zf_lock, NULL, MUTEX_DEFAULT, NULL);
176 static void
177 dmu_zfetch_stream_fini(zstream_t *zs)
179 ASSERT(!list_link_active(&zs->zs_node));
180 zfs_refcount_destroy(&zs->zs_callers);
181 zfs_refcount_destroy(&zs->zs_refs);
182 kmem_free(zs, sizeof (*zs));
185 static void
186 dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
188 ASSERT(MUTEX_HELD(&zf->zf_lock));
189 list_remove(&zf->zf_stream, zs);
190 zf->zf_numstreams--;
191 membar_producer();
192 if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
193 dmu_zfetch_stream_fini(zs);
197 * Clean-up state associated with a zfetch structure (e.g. destroy the
198 * streams). This doesn't free the zfetch_t itself, that's left to the caller.
200 void
201 dmu_zfetch_fini(zfetch_t *zf)
203 zstream_t *zs;
205 mutex_enter(&zf->zf_lock);
206 while ((zs = list_head(&zf->zf_stream)) != NULL)
207 dmu_zfetch_stream_remove(zf, zs);
208 mutex_exit(&zf->zf_lock);
209 list_destroy(&zf->zf_stream);
210 mutex_destroy(&zf->zf_lock);
212 zf->zf_dnode = NULL;
216 * If there aren't too many active streams already, create one more.
217 * In process delete/reuse all streams without hits for zfetch_max_sec_reap.
218 * If needed, reuse oldest stream without hits for zfetch_min_sec_reap or ever.
219 * The "blkid" argument is the next block that we expect this stream to access.
221 static void
222 dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
224 zstream_t *zs, *zs_next, *zs_old = NULL;
225 hrtime_t now = gethrtime(), t;
227 ASSERT(MUTEX_HELD(&zf->zf_lock));
230 * Delete too old streams, reusing the first found one.
232 t = now - SEC2NSEC(zfetch_max_sec_reap);
233 for (zs = list_head(&zf->zf_stream); zs != NULL; zs = zs_next) {
234 zs_next = list_next(&zf->zf_stream, zs);
236 * Skip if still active. 1 -- zf_stream reference.
238 if (zfs_refcount_count(&zs->zs_refs) != 1)
239 continue;
240 if (zs->zs_atime > t)
241 continue;
242 if (zs_old)
243 dmu_zfetch_stream_remove(zf, zs);
244 else
245 zs_old = zs;
247 if (zs_old) {
248 zs = zs_old;
249 goto reuse;
253 * The maximum number of streams is normally zfetch_max_streams,
254 * but for small files we lower it such that it's at least possible
255 * for all the streams to be non-overlapping.
257 uint32_t max_streams = MAX(1, MIN(zfetch_max_streams,
258 zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz /
259 zfetch_max_distance));
260 if (zf->zf_numstreams >= max_streams) {
261 t = now - SEC2NSEC(zfetch_min_sec_reap);
262 for (zs = list_head(&zf->zf_stream); zs != NULL;
263 zs = list_next(&zf->zf_stream, zs)) {
264 if (zfs_refcount_count(&zs->zs_refs) != 1)
265 continue;
266 if (zs->zs_atime > t)
267 continue;
268 if (zs_old == NULL || zs->zs_atime < zs_old->zs_atime)
269 zs_old = zs;
271 if (zs_old) {
272 zs = zs_old;
273 goto reuse;
275 ZFETCHSTAT_BUMP(zfetchstat_max_streams);
276 return;
279 zs = kmem_zalloc(sizeof (*zs), KM_SLEEP);
280 zs->zs_fetch = zf;
281 zfs_refcount_create(&zs->zs_callers);
282 zfs_refcount_create(&zs->zs_refs);
283 /* One reference for zf_stream. */
284 zfs_refcount_add(&zs->zs_refs, NULL);
285 zf->zf_numstreams++;
286 list_insert_head(&zf->zf_stream, zs);
288 reuse:
289 zs->zs_blkid = blkid;
290 zs->zs_pf_dist = 0;
291 zs->zs_pf_start = blkid;
292 zs->zs_pf_end = blkid;
293 zs->zs_ipf_dist = 0;
294 zs->zs_ipf_start = blkid;
295 zs->zs_ipf_end = blkid;
296 /* Allow immediate stream reuse until first hit. */
297 zs->zs_atime = now - SEC2NSEC(zfetch_min_sec_reap);
298 zs->zs_missed = B_FALSE;
299 zs->zs_more = B_FALSE;
302 static void
303 dmu_zfetch_done(void *arg, uint64_t level, uint64_t blkid, boolean_t io_issued)
305 zstream_t *zs = arg;
307 if (io_issued && level == 0 && blkid < zs->zs_blkid)
308 zs->zs_more = B_TRUE;
309 if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
310 dmu_zfetch_stream_fini(zs);
311 aggsum_add(&zfetch_sums.zfetchstat_io_active, -1);
315 * This is the predictive prefetch entry point. dmu_zfetch_prepare()
316 * associates dnode access specified with blkid and nblks arguments with
317 * prefetch stream, predicts further accesses based on that stats and returns
318 * the stream pointer on success. That pointer must later be passed to
319 * dmu_zfetch_run() to initiate the speculative prefetch for the stream and
320 * release it. dmu_zfetch() is a wrapper for simple cases when window between
321 * prediction and prefetch initiation is not needed.
322 * fetch_data argument specifies whether actual data blocks should be fetched:
323 * FALSE -- prefetch only indirect blocks for predicted data blocks;
324 * TRUE -- prefetch predicted data blocks plus following indirect blocks.
326 zstream_t *
327 dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
328 boolean_t fetch_data, boolean_t have_lock)
330 zstream_t *zs;
331 spa_t *spa = zf->zf_dnode->dn_objset->os_spa;
333 if (zfs_prefetch_disable)
334 return (NULL);
336 * If we haven't yet loaded the indirect vdevs' mappings, we
337 * can only read from blocks that we carefully ensure are on
338 * concrete vdevs (or previously-loaded indirect vdevs). So we
339 * can't allow the predictive prefetcher to attempt reads of other
340 * blocks (e.g. of the MOS's dnode object).
342 if (!spa_indirect_vdevs_loaded(spa))
343 return (NULL);
346 * As a fast path for small (single-block) files, ignore access
347 * to the first block.
349 if (!have_lock && blkid == 0)
350 return (NULL);
352 if (!have_lock)
353 rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER);
356 * A fast path for small files for which no prefetch will
357 * happen.
359 uint64_t maxblkid = zf->zf_dnode->dn_maxblkid;
360 if (maxblkid < 2) {
361 if (!have_lock)
362 rw_exit(&zf->zf_dnode->dn_struct_rwlock);
363 return (NULL);
365 mutex_enter(&zf->zf_lock);
368 * Find matching prefetch stream. Depending on whether the accesses
369 * are block-aligned, first block of the new access may either follow
370 * the last block of the previous access, or be equal to it.
372 for (zs = list_head(&zf->zf_stream); zs != NULL;
373 zs = list_next(&zf->zf_stream, zs)) {
374 if (blkid == zs->zs_blkid) {
375 break;
376 } else if (blkid + 1 == zs->zs_blkid) {
377 blkid++;
378 nblks--;
379 break;
384 * If the file is ending, remove the matching stream if found.
385 * If not found then it is too late to create a new one now.
387 uint64_t end_of_access_blkid = blkid + nblks;
388 if (end_of_access_blkid >= maxblkid) {
389 if (zs != NULL)
390 dmu_zfetch_stream_remove(zf, zs);
391 mutex_exit(&zf->zf_lock);
392 if (!have_lock)
393 rw_exit(&zf->zf_dnode->dn_struct_rwlock);
394 return (NULL);
397 /* Exit if we already prefetched this block before. */
398 if (nblks == 0) {
399 mutex_exit(&zf->zf_lock);
400 if (!have_lock)
401 rw_exit(&zf->zf_dnode->dn_struct_rwlock);
402 return (NULL);
405 if (zs == NULL) {
407 * This access is not part of any existing stream. Create
408 * a new stream for it.
410 dmu_zfetch_stream_create(zf, end_of_access_blkid);
411 mutex_exit(&zf->zf_lock);
412 if (!have_lock)
413 rw_exit(&zf->zf_dnode->dn_struct_rwlock);
414 ZFETCHSTAT_BUMP(zfetchstat_misses);
415 return (NULL);
419 * This access was to a block that we issued a prefetch for on
420 * behalf of this stream. Calculate further prefetch distances.
422 * Start prefetch from the demand access size (nblks). Double the
423 * distance every access up to zfetch_min_distance. After that only
424 * if needed increase the distance by 1/8 up to zfetch_max_distance.
426 * Don't double the distance beyond single block if we have more
427 * than ~6% of ARC held by active prefetches. It should help with
428 * getting out of RAM on some badly mispredicted read patterns.
430 unsigned int dbs = zf->zf_dnode->dn_datablkshift;
431 unsigned int nbytes = nblks << dbs;
432 unsigned int pf_nblks;
433 if (fetch_data) {
434 if (unlikely(zs->zs_pf_dist < nbytes))
435 zs->zs_pf_dist = nbytes;
436 else if (zs->zs_pf_dist < zfetch_min_distance &&
437 (zs->zs_pf_dist < (1 << dbs) ||
438 aggsum_compare(&zfetch_sums.zfetchstat_io_active,
439 arc_c_max >> (4 + dbs)) < 0))
440 zs->zs_pf_dist *= 2;
441 else if (zs->zs_more)
442 zs->zs_pf_dist += zs->zs_pf_dist / 8;
443 zs->zs_more = B_FALSE;
444 if (zs->zs_pf_dist > zfetch_max_distance)
445 zs->zs_pf_dist = zfetch_max_distance;
446 pf_nblks = zs->zs_pf_dist >> dbs;
447 } else {
448 pf_nblks = 0;
450 if (zs->zs_pf_start < end_of_access_blkid)
451 zs->zs_pf_start = end_of_access_blkid;
452 if (zs->zs_pf_end < end_of_access_blkid + pf_nblks)
453 zs->zs_pf_end = end_of_access_blkid + pf_nblks;
456 * Do the same for indirects, starting where we will stop reading
457 * data blocks (and the indirects that point to them).
459 if (unlikely(zs->zs_ipf_dist < nbytes))
460 zs->zs_ipf_dist = nbytes;
461 else
462 zs->zs_ipf_dist *= 2;
463 if (zs->zs_ipf_dist > zfetch_max_idistance)
464 zs->zs_ipf_dist = zfetch_max_idistance;
465 pf_nblks = zs->zs_ipf_dist >> dbs;
466 if (zs->zs_ipf_start < zs->zs_pf_end)
467 zs->zs_ipf_start = zs->zs_pf_end;
468 if (zs->zs_ipf_end < zs->zs_pf_end + pf_nblks)
469 zs->zs_ipf_end = zs->zs_pf_end + pf_nblks;
471 zs->zs_blkid = end_of_access_blkid;
472 /* Protect the stream from reclamation. */
473 zs->zs_atime = gethrtime();
474 zfs_refcount_add(&zs->zs_refs, NULL);
475 /* Count concurrent callers. */
476 zfs_refcount_add(&zs->zs_callers, NULL);
477 mutex_exit(&zf->zf_lock);
479 if (!have_lock)
480 rw_exit(&zf->zf_dnode->dn_struct_rwlock);
482 ZFETCHSTAT_BUMP(zfetchstat_hits);
483 return (zs);
486 void
487 dmu_zfetch_run(zstream_t *zs, boolean_t missed, boolean_t have_lock)
489 zfetch_t *zf = zs->zs_fetch;
490 int64_t pf_start, pf_end, ipf_start, ipf_end;
491 int epbs, issued;
493 if (missed)
494 zs->zs_missed = missed;
497 * Postpone the prefetch if there are more concurrent callers.
498 * It happens when multiple requests are waiting for the same
499 * indirect block. The last one will run the prefetch for all.
501 if (zfs_refcount_remove(&zs->zs_callers, NULL) != 0) {
502 /* Drop reference taken in dmu_zfetch_prepare(). */
503 if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
504 dmu_zfetch_stream_fini(zs);
505 return;
508 mutex_enter(&zf->zf_lock);
509 if (zs->zs_missed) {
510 pf_start = zs->zs_pf_start;
511 pf_end = zs->zs_pf_start = zs->zs_pf_end;
512 } else {
513 pf_start = pf_end = 0;
515 ipf_start = zs->zs_ipf_start;
516 ipf_end = zs->zs_ipf_start = zs->zs_ipf_end;
517 mutex_exit(&zf->zf_lock);
518 ASSERT3S(pf_start, <=, pf_end);
519 ASSERT3S(ipf_start, <=, ipf_end);
521 epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
522 ipf_start = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs;
523 ipf_end = P2ROUNDUP(ipf_end, 1 << epbs) >> epbs;
524 ASSERT3S(ipf_start, <=, ipf_end);
525 issued = pf_end - pf_start + ipf_end - ipf_start;
526 if (issued > 1) {
527 /* More references on top of taken in dmu_zfetch_prepare(). */
528 zfs_refcount_add_few(&zs->zs_refs, issued - 1, NULL);
529 } else if (issued == 0) {
530 /* Some other thread has done our work, so drop the ref. */
531 if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
532 dmu_zfetch_stream_fini(zs);
533 return;
535 aggsum_add(&zfetch_sums.zfetchstat_io_active, issued);
537 if (!have_lock)
538 rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER);
540 issued = 0;
541 for (int64_t blk = pf_start; blk < pf_end; blk++) {
542 issued += dbuf_prefetch_impl(zf->zf_dnode, 0, blk,
543 ZIO_PRIORITY_ASYNC_READ, 0, dmu_zfetch_done, zs);
545 for (int64_t iblk = ipf_start; iblk < ipf_end; iblk++) {
546 issued += dbuf_prefetch_impl(zf->zf_dnode, 1, iblk,
547 ZIO_PRIORITY_ASYNC_READ, 0, dmu_zfetch_done, zs);
550 if (!have_lock)
551 rw_exit(&zf->zf_dnode->dn_struct_rwlock);
553 if (issued)
554 ZFETCHSTAT_ADD(zfetchstat_io_issued, issued);
557 void
558 dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
559 boolean_t missed, boolean_t have_lock)
561 zstream_t *zs;
563 zs = dmu_zfetch_prepare(zf, blkid, nblks, fetch_data, have_lock);
564 if (zs)
565 dmu_zfetch_run(zs, missed, have_lock);
568 ZFS_MODULE_PARAM(zfs_prefetch, zfs_prefetch_, disable, INT, ZMOD_RW,
569 "Disable all ZFS prefetching");
571 ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_streams, UINT, ZMOD_RW,
572 "Max number of streams per zfetch");
574 ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, min_sec_reap, UINT, ZMOD_RW,
575 "Min time before stream reclaim");
577 ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_sec_reap, UINT, ZMOD_RW,
578 "Max time before stream delete");
580 ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, min_distance, UINT, ZMOD_RW,
581 "Min bytes to prefetch per stream");
583 ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_distance, UINT, ZMOD_RW,
584 "Max bytes to prefetch per stream");
586 ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_idistance, UINT, ZMOD_RW,
587 "Max bytes to prefetch indirects for per stream");