/*
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_pool.h>
#include <sys/dnode.h>
#include <sys/spa_impl.h>
#include <sys/dmu_impl.h>
#include <sys/sa_impl.h>
#include <sys/callb.h>
#include <sys/zfeature.h>
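
/*
 * Tunables for traversal prefetch (see the module parameter descriptions at
 * the bottom of this file): zfs_pd_bytes_max bounds how many bytes the data
 * prefetch thread may run ahead of the main traversal;
 * zfs_traverse_indirect_prefetch_limit bounds how many children of an
 * indirect block are prefetched at once in traverse_visitbp(); and
 * send_holes_without_birth_time controls whether holes with a birth time of
 * zero are still visited (see the hole-birth comment in traverse_visitbp()).
 */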
static int32_t zfs_pd_bytes_max = 50 * 1024 * 1024;	/* 50MB */
static int32_t send_holes_without_birth_time = 1;
static uint_t zfs_traverse_indirect_prefetch_limit = 32;

typedef struct prefetch_data {
	kmutex_t pd_mtx;
	kcondvar_t pd_cv;
	int32_t pd_bytes_fetched;
	int pd_flags;
	boolean_t pd_cancel;
	boolean_t pd_exited;
	zbookmark_phys_t pd_resume;
} prefetch_data_t;

typedef struct traverse_data {
	spa_t *td_spa;
	uint64_t td_objset;
	blkptr_t *td_rootbp;
	uint64_t td_min_txg;
	zbookmark_phys_t *td_resume;
	int td_flags;
	prefetch_data_t *td_pfd;
	boolean_t td_paused;
	uint64_t td_hole_birth_enabled_txg;
	blkptr_cb_t *td_func;
	void *td_arg;
	boolean_t td_realloc_possible;
} traverse_data_t;

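/*
 * A traversal is described by a traverse_data_t.  When TRAVERSE_PREFETCH_DATA
 * is requested, a second copy of the traversal runs ahead of the main one in
 * traverse_prefetch_thread(), and the prefetch_data_t shared through td_pfd
 * is how the two threads throttle each other (see traverse_visitbp() and
 * traverse_prefetcher()).
 */
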
static int traverse_dnode(traverse_data_t *td, const blkptr_t *bp,
    const dnode_phys_t *dnp, uint64_t objset, uint64_t object);
static void prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *,
    uint64_t objset, uint64_t object);

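/*
 * Callbacks handed to zil_parse() by traverse_zil(): traverse_zil_block()
 * visits the ZIL blocks themselves, and traverse_zil_record() visits the
 * block pointers embedded in TX_WRITE log records.
 */
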
static int
traverse_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg,
    uint64_t claim_txg)
{
	traverse_data_t *td = arg;
	zbookmark_phys_t zb;

	if (BP_IS_HOLE(bp))
		return (0);

	if (claim_txg == 0 &&
	    BP_GET_LOGICAL_BIRTH(bp) >= spa_min_claim_txg(td->td_spa))
		return (-1);

	SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
	    bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);

	(void) td->td_func(td->td_spa, zilog, bp, &zb, NULL, td->td_arg);

	return (0);
}

static int
traverse_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg,
    uint64_t claim_txg)
{
	traverse_data_t *td = arg;

	if (lrc->lrc_txtype == TX_WRITE) {
		lr_write_t *lr = (lr_write_t *)lrc;
		blkptr_t *bp = &lr->lr_blkptr;
		zbookmark_phys_t zb;

		if (BP_IS_HOLE(bp))
			return (0);

		if (claim_txg == 0 || BP_GET_LOGICAL_BIRTH(bp) < claim_txg)
			return (0);

		ASSERT3U(BP_GET_LSIZE(bp), !=, 0);
		SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid,
		    ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));

		(void) td->td_func(td->td_spa, zilog, bp, &zb, NULL,
		    td->td_arg);
	}
	return (0);
}

static void
traverse_zil(traverse_data_t *td, zil_header_t *zh)
{
	uint64_t claim_txg = zh->zh_claim_txg;

	/*
	 * We only want to visit blocks that have been claimed but not yet
	 * replayed; plus blocks that are already stable in read-only mode.
	 */
	if (claim_txg == 0 && spa_writeable(td->td_spa))
		return;

	zilog_t *zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh);
	(void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td,
	    claim_txg, !(td->td_flags & TRAVERSE_NO_DECRYPT));
	zil_free(zilog);
}

typedef enum resume_skip {
	RESUME_SKIP_ALL,
	RESUME_SKIP_NONE,
	RESUME_SKIP_CHILDREN
} resume_skip_t;

/*
 * Returns RESUME_SKIP_ALL if td indicates that we are resuming a traversal and
 * the block indicated by zb does not need to be visited at all. Returns
 * RESUME_SKIP_CHILDREN if we are resuming a post traversal and we reach the
 * resume point. This indicates that this block should be visited but not its
 * children (since they must have been visited in a previous traversal).
 * Otherwise returns RESUME_SKIP_NONE.
 */
static resume_skip_t
resume_skip_check(const traverse_data_t *td, const dnode_phys_t *dnp,
    const zbookmark_phys_t *zb)
{
	if (td->td_resume != NULL) {
		/*
		 * If we already visited this bp & everything below,
		 * don't bother doing it again.
		 */
		if (zbookmark_subtree_completed(dnp, zb, td->td_resume))
			return (RESUME_SKIP_ALL);

		if (memcmp(zb, td->td_resume, sizeof (*zb)) == 0) {
			if (td->td_flags & TRAVERSE_POST)
				return (RESUME_SKIP_CHILDREN);
		}
	}
	return (RESUME_SKIP_NONE);
}

/*
 * Returns B_TRUE if a prefetch read is issued, otherwise B_FALSE.
 */
static boolean_t
traverse_prefetch_metadata(traverse_data_t *td, const dnode_phys_t *dnp,
    const blkptr_t *bp, const zbookmark_phys_t *zb)
{
	arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH |
	    ARC_FLAG_PRESCIENT_PREFETCH;
	int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;

	if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA))
		return (B_FALSE);
	/*
	 * If this bp is before the resume point, it may have already been
	 * visited.
	 */
	if (resume_skip_check(td, dnp, zb) != RESUME_SKIP_NONE)
		return (B_FALSE);
	if (BP_IS_HOLE(bp) || BP_GET_LOGICAL_BIRTH(bp) <= td->td_min_txg)
		return (B_FALSE);
	if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)
		return (B_FALSE);
	ASSERT(!BP_IS_REDACTED(bp));

	if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
		zio_flags |= ZIO_FLAG_RAW;

	(void) arc_read(NULL, td->td_spa, bp, NULL, NULL,
	    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);

	return (B_TRUE);
}

static boolean_t
prefetch_needed(prefetch_data_t *pfd, const blkptr_t *bp)
{
	ASSERT(pfd->pd_flags & TRAVERSE_PREFETCH_DATA);
	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) ||
	    BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG || BP_IS_REDACTED(bp))
		return (B_FALSE);
	return (B_TRUE);
}

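/*
 * Visit bp and then (recursively) everything it points to: the children of
 * an indirect block, the dnodes in a dnode block, or the special dnodes of
 * an objset block.  Depending on td_flags, td_func is invoked before
 * (TRAVERSE_PRE) and/or after (TRAVERSE_POST) the children are visited.
 */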
static int
traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
    const blkptr_t *bp, const zbookmark_phys_t *zb)
{
	int err = 0;
	arc_buf_t *buf = NULL;
	prefetch_data_t *pd = td->td_pfd;

	switch (resume_skip_check(td, dnp, zb)) {
	case RESUME_SKIP_ALL:
		return (0);
	case RESUME_SKIP_CHILDREN:
		goto post;
	case RESUME_SKIP_NONE:
		break;
	default:
		ASSERT(0);
		break;
	}

	if (BP_GET_LOGICAL_BIRTH(bp) == 0) {
		/*
		 * Since this block has a birth time of 0 it must be one of
		 * two things: a hole created before the
		 * SPA_FEATURE_HOLE_BIRTH feature was enabled, or a hole
		 * which has always been a hole in an object.
		 *
		 * If a file is written sparsely, then the unwritten parts of
		 * the file were "always holes" -- that is, they have been
		 * holes since this object was allocated.  However, we (and
		 * our callers) can not necessarily tell when an object was
		 * allocated.  Therefore, if it's possible that this object
		 * was freed and then its object number reused, we need to
		 * visit all the holes with birth==0.
		 *
		 * If it isn't possible that the object number was reused,
		 * then if SPA_FEATURE_HOLE_BIRTH was enabled before we wrote
		 * all the blocks we will visit as part of this traversal,
		 * then this hole must have always existed, so we can skip
		 * it.  We visit blocks born after (exclusive) td_min_txg.
		 *
		 * Note that the meta-dnode cannot be reallocated.
		 */
		if (!send_holes_without_birth_time &&
		    (!td->td_realloc_possible ||
		    zb->zb_object == DMU_META_DNODE_OBJECT) &&
		    td->td_hole_birth_enabled_txg <= td->td_min_txg)
			return (0);
	} else if (BP_GET_LOGICAL_BIRTH(bp) <= td->td_min_txg) {
		return (0);
	}

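	/*
	 * If a data prefetch thread is running ahead of us (td_pfd), wait
	 * until it has issued reads for at least this block's worth of data,
	 * then debit its byte budget so it can move further ahead.
	 */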
	if (pd != NULL && !pd->pd_exited && prefetch_needed(pd, bp)) {
		uint64_t size = BP_GET_LSIZE(bp);
		mutex_enter(&pd->pd_mtx);
		ASSERT(pd->pd_bytes_fetched >= 0);
		while (pd->pd_bytes_fetched < size && !pd->pd_exited)
			cv_wait_sig(&pd->pd_cv, &pd->pd_mtx);
		pd->pd_bytes_fetched -= size;
		cv_broadcast(&pd->pd_cv);
		mutex_exit(&pd->pd_mtx);
	}

	if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) {
		err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
		if (err != 0)
			goto post;
		return (0);
	}

	if (td->td_flags & TRAVERSE_PRE) {
		err = td->td_func(td->td_spa, NULL, bp, zb, dnp,
		    td->td_arg);
		if (err == TRAVERSE_VISIT_NO_CHILDREN)
			return (0);
		if (err != 0)
			goto post;
	}

	if (BP_GET_LEVEL(bp) > 0) {
		uint32_t flags = ARC_FLAG_WAIT;
		int32_t i, ptidx, pidx;
		uint32_t prefetchlimit;
		int32_t epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
		zbookmark_phys_t *czb;

		ASSERT(!BP_IS_PROTECTED(bp));

		err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
		if (err != 0)
			goto post;

		czb = kmem_alloc(sizeof (zbookmark_phys_t), KM_SLEEP);

		/*
		 * When performing a traversal it is beneficial to
		 * asynchronously read-ahead the upcoming indirect
		 * blocks since they will be needed shortly. However,
		 * since a 128k indirect (non-L0) block may contain up
		 * to 1024 128-byte block pointers, it's preferable to not
		 * prefetch them all at once. Issuing a large number of
		 * async reads may affect performance, and the earlier
		 * the indirect blocks are prefetched the less likely
		 * they are to still be resident in the ARC when needed.
		 * Therefore, prefetching indirect blocks is limited to
		 * zfs_traverse_indirect_prefetch_limit=32 blocks by
		 * default.
		 *
		 * pidx: Index for which next prefetch to be issued.
		 * ptidx: Index at which next prefetch to be triggered.
		 */
		ptidx = 0;
		pidx = 1;
		prefetchlimit = zfs_traverse_indirect_prefetch_limit;
		for (i = 0; i < epb; i++) {
			if (prefetchlimit && i == ptidx) {
				ASSERT3S(ptidx, <=, pidx);
				for (uint32_t prefetched = 0; pidx < epb &&
				    prefetched < prefetchlimit; pidx++) {
					SET_BOOKMARK(czb, zb->zb_objset,
					    zb->zb_object, zb->zb_level - 1,
					    zb->zb_blkid * epb + pidx);
					if (traverse_prefetch_metadata(td, dnp,
					    &((blkptr_t *)buf->b_data)[pidx],
					    czb) == B_TRUE) {
						prefetched++;
						if (prefetched ==
						    MAX(prefetchlimit / 2, 1))
							ptidx = pidx;
					}
				}
			}

			/* recursively visitbp() blocks below this */
			SET_BOOKMARK(czb, zb->zb_objset, zb->zb_object,
			    zb->zb_level - 1,
			    zb->zb_blkid * epb + i);
			err = traverse_visitbp(td, dnp,
			    &((blkptr_t *)buf->b_data)[i], czb);
			if (err != 0)
				break;
		}

		kmem_free(czb, sizeof (zbookmark_phys_t));

	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
		uint32_t flags = ARC_FLAG_WAIT;
		uint32_t zio_flags = ZIO_FLAG_CANFAIL;
		int32_t i;
		int32_t epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
		dnode_phys_t *child_dnp;

		/*
		 * dnode blocks might have their bonus buffers encrypted, so
		 * we must be careful to honor TRAVERSE_NO_DECRYPT.
		 */
		if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
			zio_flags |= ZIO_FLAG_RAW;

		err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
		if (err != 0)
			goto post;

		child_dnp = buf->b_data;

		for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) {
			prefetch_dnode_metadata(td, &child_dnp[i],
			    zb->zb_objset, zb->zb_blkid * epb + i);
		}

		/* recursively visitbp() blocks below this */
		for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) {
			err = traverse_dnode(td, bp, &child_dnp[i],
			    zb->zb_objset, zb->zb_blkid * epb + i);
			if (err != 0)
				break;
		}
	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
		uint32_t zio_flags = ZIO_FLAG_CANFAIL;
		arc_flags_t flags = ARC_FLAG_WAIT;
		objset_phys_t *osp;

		if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
			zio_flags |= ZIO_FLAG_RAW;

		err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
		if (err != 0)
			goto post;

		osp = buf->b_data;
		prefetch_dnode_metadata(td, &osp->os_meta_dnode, zb->zb_objset,
		    DMU_META_DNODE_OBJECT);
		/*
		 * See the block comment above for the goal of this variable.
		 * If the maxblkid of the meta-dnode is 0, then we know that
		 * we've never had more than DNODES_PER_BLOCK objects in the
		 * dataset, which means we can't have reused any object ids.
		 */
		if (osp->os_meta_dnode.dn_maxblkid == 0)
			td->td_realloc_possible = B_FALSE;

		if (OBJSET_BUF_HAS_USERUSED(buf)) {
			if (OBJSET_BUF_HAS_PROJECTUSED(buf))
				prefetch_dnode_metadata(td,
				    &osp->os_projectused_dnode,
				    zb->zb_objset, DMU_PROJECTUSED_OBJECT);
			prefetch_dnode_metadata(td, &osp->os_groupused_dnode,
			    zb->zb_objset, DMU_GROUPUSED_OBJECT);
			prefetch_dnode_metadata(td, &osp->os_userused_dnode,
			    zb->zb_objset, DMU_USERUSED_OBJECT);
		}

		err = traverse_dnode(td, bp, &osp->os_meta_dnode, zb->zb_objset,
		    DMU_META_DNODE_OBJECT);
		if (err == 0 && OBJSET_BUF_HAS_USERUSED(buf)) {
			if (OBJSET_BUF_HAS_PROJECTUSED(buf))
				err = traverse_dnode(td, bp,
				    &osp->os_projectused_dnode, zb->zb_objset,
				    DMU_PROJECTUSED_OBJECT);
			if (err == 0)
				err = traverse_dnode(td, bp,
				    &osp->os_groupused_dnode, zb->zb_objset,
				    DMU_GROUPUSED_OBJECT);
			if (err == 0)
				err = traverse_dnode(td, bp,
				    &osp->os_userused_dnode, zb->zb_objset,
				    DMU_USERUSED_OBJECT);
		}
	}

	if (buf)
		arc_buf_destroy(buf, &buf);

post:
	if (err == 0 && (td->td_flags & TRAVERSE_POST))
		err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);

	if ((td->td_flags & TRAVERSE_HARD) && (err == EIO || err == ECKSUM)) {
		/*
		 * Ignore this disk error as requested by the HARD flag,
		 * and continue traversal.
		 */
		err = 0;
	}

	/*
	 * If we are stopping here, set td_resume.
	 */
	if (td->td_resume != NULL && err != 0 && !td->td_paused) {
		td->td_resume->zb_objset = zb->zb_objset;
		td->td_resume->zb_object = zb->zb_object;
		td->td_resume->zb_level = 0;
		/*
		 * If we have stopped on an indirect block (e.g. due to
		 * i/o error), we have not visited anything below it.
		 * Set the bookmark to the first level-0 block that we need
		 * to visit.  This way, the resuming code does not need to
		 * deal with resuming from indirect blocks.
		 *
		 * Note, if zb_level <= 0, dnp may be NULL, so we don't want
		 * to dereference it.
		 */
		td->td_resume->zb_blkid = zb->zb_blkid;
		if (zb->zb_level > 0) {
			td->td_resume->zb_blkid <<= zb->zb_level *
			    (dnp->dn_indblkshift - SPA_BLKPTRSHIFT);
		}
		td->td_paused = B_TRUE;
	}

	return (err);
}

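/*
 * Issue metadata prefetches for every block pointer of a dnode (and its
 * spill block, if any) so that the subsequent traverse_dnode() finds the
 * indirect blocks already in flight.
 */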
static void
prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *dnp,
    uint64_t objset, uint64_t object)
{
	int j;
	zbookmark_phys_t czb;

	for (j = 0; j < dnp->dn_nblkptr; j++) {
		SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
		traverse_prefetch_metadata(td, dnp, &dnp->dn_blkptr[j], &czb);
	}

	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
		SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
		traverse_prefetch_metadata(td, dnp, DN_SPILL_BLKPTR(dnp), &czb);
	}
}

static int
traverse_dnode(traverse_data_t *td, const blkptr_t *bp, const dnode_phys_t *dnp,
    uint64_t objset, uint64_t object)
{
	int j, err = 0;
	zbookmark_phys_t czb;

	if (object != DMU_META_DNODE_OBJECT && td->td_resume != NULL &&
	    object < td->td_resume->zb_object)
		return (0);

	if (td->td_flags & TRAVERSE_PRE) {
		SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL,
		    ZB_DNODE_BLKID);
		err = td->td_func(td->td_spa, NULL, bp, &czb, dnp,
		    td->td_arg);
		if (err == TRAVERSE_VISIT_NO_CHILDREN)
			return (0);
		if (err != 0)
			return (err);
	}

	for (j = 0; j < dnp->dn_nblkptr; j++) {
		SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
		err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb);
		if (err != 0)
			break;
	}

	if (err == 0 && (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
		SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
		err = traverse_visitbp(td, dnp, DN_SPILL_BLKPTR(dnp), &czb);
	}

	if (err == 0 && (td->td_flags & TRAVERSE_POST)) {
		SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL,
		    ZB_DNODE_BLKID);
		err = td->td_func(td->td_spa, NULL, bp, &czb, dnp,
		    td->td_arg);
		if (err == TRAVERSE_VISIT_NO_CHILDREN)
			return (0);
	}

	return (err);
}

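/*
 * Callback used by the prefetch traversal: it issues speculative, no-wait
 * ARC reads and accounts the bytes in pd_bytes_fetched, blocking once it is
 * zfs_pd_bytes_max bytes ahead of the main traversal.  Returning EINTR when
 * pd_cancel is set aborts the prefetch traversal.
 */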
static int
traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
	(void) zilog, (void) dnp;
	prefetch_data_t *pfd = arg;
	int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
	arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH |
	    ARC_FLAG_PRESCIENT_PREFETCH;

	ASSERT(pfd->pd_bytes_fetched >= 0);
	if (zb->zb_level == ZB_DNODE_LEVEL)
		return (0);
	if (pfd->pd_cancel)
		return (SET_ERROR(EINTR));

	if (!prefetch_needed(pfd, bp))
		return (0);

	mutex_enter(&pfd->pd_mtx);
	while (!pfd->pd_cancel && pfd->pd_bytes_fetched >= zfs_pd_bytes_max)
		cv_wait_sig(&pfd->pd_cv, &pfd->pd_mtx);
	pfd->pd_bytes_fetched += BP_GET_LSIZE(bp);
	cv_broadcast(&pfd->pd_cv);
	mutex_exit(&pfd->pd_mtx);

	if ((pfd->pd_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
		zio_flags |= ZIO_FLAG_RAW;

	(void) arc_read(NULL, spa, bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
	    zio_flags, &aflags, zb);

	return (0);
}

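/*
 * Body of the prefetch task: re-runs the same traversal with
 * traverse_prefetcher() as the callback and signals pd_exited when done.
 */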
static void
traverse_prefetch_thread(void *arg)
{
	traverse_data_t *td_main = arg;
	traverse_data_t td = *td_main;
	zbookmark_phys_t czb;
	fstrans_cookie_t cookie = spl_fstrans_mark();

	td.td_func = traverse_prefetcher;
	td.td_arg = td_main->td_pfd;
	td.td_pfd = NULL;
	td.td_resume = &td_main->td_pfd->pd_resume;

	SET_BOOKMARK(&czb, td.td_objset,
	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
	(void) traverse_visitbp(&td, NULL, td.td_rootbp, &czb);

	mutex_enter(&td_main->td_pfd->pd_mtx);
	td_main->td_pfd->pd_exited = B_TRUE;
	cv_broadcast(&td_main->td_pfd->pd_cv);
	mutex_exit(&td_main->td_pfd->pd_mtx);
	spl_fstrans_unmark(cookie);
}

/*
 * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
 * in syncing context).
 */
static int
traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
    uint64_t txg_start, zbookmark_phys_t *resume, int flags,
    blkptr_cb_t func, void *arg)
{
	int err;
	traverse_data_t *td;
	prefetch_data_t *pd;
	zbookmark_phys_t *czb;

	ASSERT(ds == NULL || objset == ds->ds_object);
	ASSERT(!(flags & TRAVERSE_PRE) || !(flags & TRAVERSE_POST));

	td = kmem_alloc(sizeof (traverse_data_t), KM_SLEEP);
	pd = kmem_zalloc(sizeof (prefetch_data_t), KM_SLEEP);
	czb = kmem_alloc(sizeof (zbookmark_phys_t), KM_SLEEP);

	td->td_spa = spa;
	td->td_objset = objset;
	td->td_rootbp = rootbp;
	td->td_min_txg = txg_start;
	td->td_resume = resume;
	td->td_func = func;
	td->td_arg = arg;
	td->td_pfd = pd;
	td->td_flags = flags;
	td->td_paused = B_FALSE;
	td->td_realloc_possible = (txg_start == 0 ? B_FALSE : B_TRUE);

	if (spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
		VERIFY(spa_feature_enabled_txg(spa,
		    SPA_FEATURE_HOLE_BIRTH, &td->td_hole_birth_enabled_txg));
	} else {
		td->td_hole_birth_enabled_txg = UINT64_MAX;
	}

	pd->pd_flags = flags;
	if (resume != NULL)
		pd->pd_resume = *resume;
	mutex_init(&pd->pd_mtx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&pd->pd_cv, NULL, CV_DEFAULT, NULL);

	SET_BOOKMARK(czb, td->td_objset,
	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);

	/* See comment on ZIL traversal in dsl_scan_visitds. */
	if (ds != NULL && !ds->ds_is_snapshot && !BP_IS_HOLE(rootbp)) {
		zio_flag_t zio_flags = ZIO_FLAG_CANFAIL;
		uint32_t flags = ARC_FLAG_WAIT;
		objset_phys_t *osp;
		arc_buf_t *buf;
		ASSERT(!BP_IS_REDACTED(rootbp));

		if ((td->td_flags & TRAVERSE_NO_DECRYPT) &&
		    BP_IS_PROTECTED(rootbp))
			zio_flags |= ZIO_FLAG_RAW;

		err = arc_read(NULL, td->td_spa, rootbp, arc_getbuf_func,
		    &buf, ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, czb);
		if (err != 0) {
			/*
			 * If both TRAVERSE_HARD and TRAVERSE_PRE are set,
			 * continue to visitbp so that td_func can be called
			 * in pre stage, and err will reset to zero.
			 */
			if (!(td->td_flags & TRAVERSE_HARD) ||
			    !(td->td_flags & TRAVERSE_PRE))
				goto out;
		} else {
			osp = buf->b_data;
			traverse_zil(td, &osp->os_zil_header);
			arc_buf_destroy(buf, &buf);
		}
	}

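	/*
	 * Start the data prefetch task, if requested.  If the dispatch fails
	 * (TQ_NOQUEUE), mark the prefetcher as already exited so the main
	 * traversal never waits on it.
	 */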
	if (!(flags & TRAVERSE_PREFETCH_DATA) ||
	    taskq_dispatch(spa->spa_prefetch_taskq, traverse_prefetch_thread,
	    td, TQ_NOQUEUE) == TASKQID_INVALID)
		pd->pd_exited = B_TRUE;

	err = traverse_visitbp(td, NULL, rootbp, czb);

	mutex_enter(&pd->pd_mtx);
	pd->pd_cancel = B_TRUE;
	cv_broadcast(&pd->pd_cv);
	while (!pd->pd_exited)
		cv_wait_sig(&pd->pd_cv, &pd->pd_mtx);
	mutex_exit(&pd->pd_mtx);
out:
	mutex_destroy(&pd->pd_mtx);
	cv_destroy(&pd->pd_cv);

	kmem_free(czb, sizeof (zbookmark_phys_t));
	kmem_free(pd, sizeof (struct prefetch_data));
	kmem_free(td, sizeof (struct traverse_data));

	return (err);
}

/*
 * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
 * in syncing context).
 */
int
traverse_dataset_resume(dsl_dataset_t *ds, uint64_t txg_start,
    zbookmark_phys_t *resume,
    int flags, blkptr_cb_t func, void *arg)
{
	return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, ds->ds_object,
	    &dsl_dataset_phys(ds)->ds_bp, txg_start, resume, flags, func, arg));
}

int
traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start,
    int flags, blkptr_cb_t func, void *arg)
{
	return (traverse_dataset_resume(ds, txg_start, NULL, flags, func, arg));
}

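/*
 * Example usage (sketch): a blkptr_cb_t that counts the level-0 blocks of a
 * dataset.  The callback name and counter are hypothetical; the callback
 * signature and TRAVERSE_* flags come from dmu_traverse.h.
 *
 *	static int
 *	count_l0_blocks_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 *	    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 *	{
 *		uint64_t *count = arg;
 *
 *		if (bp != NULL && !BP_IS_HOLE(bp) && zb->zb_level == 0)
 *			(*count)++;
 *		return (0);
 *	}
 *
 *	uint64_t count = 0;
 *	(void) traverse_dataset(ds, 0,
 *	    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
 *	    count_l0_blocks_cb, &count);
 */
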
int
traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr,
    uint64_t txg_start, zbookmark_phys_t *resume, int flags,
    blkptr_cb_t func, void *arg)
{
	return (traverse_impl(spa, NULL, ZB_DESTROYED_OBJSET,
	    blkptr, txg_start, resume, flags, func, arg));
}

/*
 * NB: pool must not be changing on-disk (eg, from zdb or sync context).
 */
int
traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
    blkptr_cb_t func, void *arg)
{
	int err;
	dsl_pool_t *dp = spa_get_dsl(spa);
	objset_t *mos = dp->dp_meta_objset;
	boolean_t hard = (flags & TRAVERSE_HARD);

	/* visit the MOS */
	err = traverse_impl(spa, NULL, 0, spa_get_rootblkptr(spa),
	    txg_start, NULL, flags, func, arg);
	if (err != 0)
		return (err);

	/* visit each dataset */
	for (uint64_t obj = 1; err == 0;
	    err = dmu_object_next(mos, &obj, B_FALSE, txg_start)) {
		dmu_object_info_t doi;

		err = dmu_object_info(mos, obj, &doi);
		if (err != 0) {
			if (hard)
				continue;
			break;
		}

		if (doi.doi_bonus_type == DMU_OT_DSL_DATASET) {
			dsl_dataset_t *ds;
			uint64_t txg = txg_start;

			dsl_pool_config_enter(dp, FTAG);
			err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
			dsl_pool_config_exit(dp, FTAG);
			if (err != 0) {
				if (hard)
					continue;
				break;
			}
			if (dsl_dataset_phys(ds)->ds_prev_snap_txg > txg)
				txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
			err = traverse_dataset(ds, txg, flags, func, arg);
			dsl_dataset_rele(ds, FTAG);
			if (err != 0)
				break;
		}
	}
	if (err == ESRCH)
		err = 0;
	return (err);
}

);
807 EXPORT_SYMBOL(traverse_pool
);
809 ZFS_MODULE_PARAM(zfs
, zfs_
, pd_bytes_max
, INT
, ZMOD_RW
,
810 "Max number of bytes to prefetch");
812 ZFS_MODULE_PARAM(zfs
, zfs_
, traverse_indirect_prefetch_limit
, UINT
, ZMOD_RW
,
813 "Traverse prefetch number of blocks pointed by indirect block");
816 module_param_named(ignore_hole_birth
, send_holes_without_birth_time
, int, 0644);
817 MODULE_PARM_DESC(ignore_hole_birth
,
818 "Alias for send_holes_without_birth_time");
822 ZFS_MODULE_PARAM(zfs
, , send_holes_without_birth_time
, INT
, ZMOD_RW
,
823 "Ignore hole_birth txg for zfs send");