/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_pool.h>
#include <sys/dnode.h>
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/dmu_impl.h>
#include <sys/callb.h>
/*
 * Fill in a zbookmark in place.  Wrapped in do/while(0) so the macro
 * behaves as a single statement (safe in unbraced if/else bodies);
 * the extraction had lost the enclosing braces entirely.
 */
#define	SET_BOOKMARK(zb, objset, object, level, blkid)	\
	do {						\
		(zb)->zb_objset = objset;		\
		(zb)->zb_object = object;		\
		(zb)->zb_level = level;			\
		(zb)->zb_blkid = blkid;			\
	} while (0)
46 struct prefetch_data
{
56 struct traverse_data
{
62 struct prefetch_data
*td_pfd
;
69 traverse_zil_block(zilog_t
*zilog
, blkptr_t
*bp
, void *arg
, uint64_t claim_txg
)
71 struct traverse_data
*td
= arg
;
74 if (bp
->blk_birth
== 0)
77 if (claim_txg
== 0 && bp
->blk_birth
>= spa_first_txg(td
->td_spa
))
80 zb
.zb_objset
= td
->td_objset
;
83 zb
.zb_blkid
= bp
->blk_cksum
.zc_word
[ZIL_ZC_SEQ
];
84 VERIFY(0 == td
->td_func(td
->td_spa
, bp
, &zb
, NULL
, td
->td_arg
));
89 traverse_zil_record(zilog_t
*zilog
, lr_t
*lrc
, void *arg
, uint64_t claim_txg
)
91 struct traverse_data
*td
= arg
;
93 if (lrc
->lrc_txtype
== TX_WRITE
) {
94 lr_write_t
*lr
= (lr_write_t
*)lrc
;
95 blkptr_t
*bp
= &lr
->lr_blkptr
;
98 if (bp
->blk_birth
== 0)
101 if (claim_txg
== 0 || bp
->blk_birth
< claim_txg
)
104 zb
.zb_objset
= td
->td_objset
;
105 zb
.zb_object
= lr
->lr_foid
;
106 zb
.zb_level
= BP_GET_LEVEL(bp
);
107 zb
.zb_blkid
= lr
->lr_offset
/ BP_GET_LSIZE(bp
);
108 VERIFY(0 == td
->td_func(td
->td_spa
, bp
, &zb
, NULL
, td
->td_arg
));
113 traverse_zil(struct traverse_data
*td
, zil_header_t
*zh
)
115 uint64_t claim_txg
= zh
->zh_claim_txg
;
119 * We only want to visit blocks that have been claimed but not yet
120 * replayed (or, in read-only mode, blocks that *would* be claimed).
122 if (claim_txg
== 0 && (spa_mode
& FWRITE
))
125 zilog
= zil_alloc(spa_get_dsl(td
->td_spa
)->dp_meta_objset
, zh
);
127 (void) zil_parse(zilog
, traverse_zil_block
, traverse_zil_record
, td
,
134 traverse_visitbp(struct traverse_data
*td
, const dnode_phys_t
*dnp
,
135 arc_buf_t
*pbuf
, blkptr_t
*bp
, const zbookmark_t
*zb
)
139 arc_buf_t
*buf
= NULL
;
140 struct prefetch_data
*pd
= td
->td_pfd
;
142 if (bp
->blk_birth
== 0) {
143 err
= td
->td_func(td
->td_spa
, NULL
, zb
, dnp
, td
->td_arg
);
147 if (bp
->blk_birth
<= td
->td_min_txg
)
150 if (pd
&& !pd
->pd_exited
&&
151 ((pd
->pd_flags
& TRAVERSE_PREFETCH_DATA
) ||
152 BP_GET_TYPE(bp
) == DMU_OT_DNODE
|| BP_GET_LEVEL(bp
) > 0)) {
153 mutex_enter(&pd
->pd_mtx
);
154 ASSERT(pd
->pd_blks_fetched
>= 0);
155 while (pd
->pd_blks_fetched
== 0 && !pd
->pd_exited
)
156 cv_wait(&pd
->pd_cv
, &pd
->pd_mtx
);
157 pd
->pd_blks_fetched
--;
158 cv_broadcast(&pd
->pd_cv
);
159 mutex_exit(&pd
->pd_mtx
);
162 if (td
->td_flags
& TRAVERSE_PRE
) {
163 err
= td
->td_func(td
->td_spa
, bp
, zb
, dnp
, td
->td_arg
);
168 if (BP_GET_LEVEL(bp
) > 0) {
169 uint32_t flags
= ARC_WAIT
;
172 int epb
= BP_GET_LSIZE(bp
) >> SPA_BLKPTRSHIFT
;
174 err
= arc_read(NULL
, td
->td_spa
, bp
, pbuf
,
175 arc_getbuf_func
, &buf
,
176 ZIO_PRIORITY_ASYNC_READ
, ZIO_FLAG_CANFAIL
, &flags
, zb
);
180 /* recursively visitbp() blocks below this */
182 for (i
= 0; i
< epb
; i
++, cbp
++) {
183 SET_BOOKMARK(&czb
, zb
->zb_objset
, zb
->zb_object
,
185 zb
->zb_blkid
* epb
+ i
);
186 err
= traverse_visitbp(td
, dnp
, buf
, cbp
, &czb
);
190 } else if (BP_GET_TYPE(bp
) == DMU_OT_DNODE
) {
191 uint32_t flags
= ARC_WAIT
;
193 int epb
= BP_GET_LSIZE(bp
) >> DNODE_SHIFT
;
195 err
= arc_read(NULL
, td
->td_spa
, bp
, pbuf
,
196 arc_getbuf_func
, &buf
,
197 ZIO_PRIORITY_ASYNC_READ
, ZIO_FLAG_CANFAIL
, &flags
, zb
);
201 /* recursively visitbp() blocks below this */
203 for (i
= 0; i
< epb
&& err
== 0; i
++, dnp
++) {
204 for (j
= 0; j
< dnp
->dn_nblkptr
; j
++) {
205 SET_BOOKMARK(&czb
, zb
->zb_objset
,
206 zb
->zb_blkid
* epb
+ i
,
207 dnp
->dn_nlevels
- 1, j
);
208 err
= traverse_visitbp(td
, dnp
, buf
,
209 (blkptr_t
*)&dnp
->dn_blkptr
[j
], &czb
);
214 } else if (BP_GET_TYPE(bp
) == DMU_OT_OBJSET
) {
215 uint32_t flags
= ARC_WAIT
;
219 err
= arc_read_nolock(NULL
, td
->td_spa
, bp
,
220 arc_getbuf_func
, &buf
,
221 ZIO_PRIORITY_ASYNC_READ
, ZIO_FLAG_CANFAIL
, &flags
, zb
);
227 * traverse_zil is just here for zdb's leak checking.
228 * For other consumers, there will be no ZIL blocks.
230 traverse_zil(td
, &osp
->os_zil_header
);
232 for (j
= 0; j
< osp
->os_meta_dnode
.dn_nblkptr
; j
++) {
233 SET_BOOKMARK(&czb
, zb
->zb_objset
, 0,
234 osp
->os_meta_dnode
.dn_nlevels
- 1, j
);
235 err
= traverse_visitbp(td
, &osp
->os_meta_dnode
, buf
,
236 (blkptr_t
*)&osp
->os_meta_dnode
.dn_blkptr
[j
],
244 (void) arc_buf_remove_ref(buf
, &buf
);
246 if (err
== 0 && (td
->td_flags
& TRAVERSE_POST
))
247 err
= td
->td_func(td
->td_spa
, bp
, zb
, dnp
, td
->td_arg
);
254 traverse_prefetcher(spa_t
*spa
, blkptr_t
*bp
, const zbookmark_t
*zb
,
255 const dnode_phys_t
*dnp
, void *arg
)
257 struct prefetch_data
*pfd
= arg
;
258 uint32_t aflags
= ARC_NOWAIT
| ARC_PREFETCH
;
260 ASSERT(pfd
->pd_blks_fetched
>= 0);
264 if (bp
== NULL
|| !((pfd
->pd_flags
& TRAVERSE_PREFETCH_DATA
) ||
265 BP_GET_TYPE(bp
) == DMU_OT_DNODE
|| BP_GET_LEVEL(bp
) > 0))
268 mutex_enter(&pfd
->pd_mtx
);
269 while (!pfd
->pd_cancel
&& pfd
->pd_blks_fetched
>= pfd
->pd_blks_max
)
270 cv_wait(&pfd
->pd_cv
, &pfd
->pd_mtx
);
271 pfd
->pd_blks_fetched
++;
272 cv_broadcast(&pfd
->pd_cv
);
273 mutex_exit(&pfd
->pd_mtx
);
275 (void) arc_read_nolock(NULL
, spa
, bp
, NULL
, NULL
,
276 ZIO_PRIORITY_ASYNC_READ
,
277 ZIO_FLAG_CANFAIL
| ZIO_FLAG_SPECULATIVE
,
284 traverse_prefetch_thread(void *arg
)
286 struct traverse_data
*td_main
= arg
;
287 struct traverse_data td
= *td_main
;
290 td
.td_func
= traverse_prefetcher
;
291 td
.td_arg
= td_main
->td_pfd
;
294 SET_BOOKMARK(&czb
, td
.td_objset
, 0, -1, 0);
295 (void) traverse_visitbp(&td
, NULL
, NULL
, td
.td_rootbp
, &czb
);
297 mutex_enter(&td_main
->td_pfd
->pd_mtx
);
298 td_main
->td_pfd
->pd_exited
= B_TRUE
;
299 cv_broadcast(&td_main
->td_pfd
->pd_cv
);
300 mutex_exit(&td_main
->td_pfd
->pd_mtx
);
304 * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
305 * in syncing context).
308 traverse_impl(spa_t
*spa
, uint64_t objset
, blkptr_t
*rootbp
,
309 uint64_t txg_start
, int flags
, blkptr_cb_t func
, void *arg
)
311 struct traverse_data td
;
312 struct prefetch_data pd
= { 0 };
317 td
.td_objset
= objset
;
318 td
.td_rootbp
= rootbp
;
319 td
.td_min_txg
= txg_start
;
325 pd
.pd_blks_max
= 100;
327 mutex_init(&pd
.pd_mtx
, NULL
, MUTEX_DEFAULT
, NULL
);
328 cv_init(&pd
.pd_cv
, NULL
, CV_DEFAULT
, NULL
);
330 if (!(flags
& TRAVERSE_PREFETCH
) ||
331 0 == taskq_dispatch(system_taskq
, traverse_prefetch_thread
,
333 pd
.pd_exited
= B_TRUE
;
335 SET_BOOKMARK(&czb
, objset
, 0, -1, 0);
336 err
= traverse_visitbp(&td
, NULL
, NULL
, rootbp
, &czb
);
338 mutex_enter(&pd
.pd_mtx
);
339 pd
.pd_cancel
= B_TRUE
;
340 cv_broadcast(&pd
.pd_cv
);
341 while (!pd
.pd_exited
)
342 cv_wait(&pd
.pd_cv
, &pd
.pd_mtx
);
343 mutex_exit(&pd
.pd_mtx
);
345 mutex_destroy(&pd
.pd_mtx
);
346 cv_destroy(&pd
.pd_cv
);
352 * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
353 * in syncing context).
356 traverse_dataset(dsl_dataset_t
*ds
, uint64_t txg_start
, int flags
,
357 blkptr_cb_t func
, void *arg
)
359 return (traverse_impl(ds
->ds_dir
->dd_pool
->dp_spa
, ds
->ds_object
,
360 &ds
->ds_phys
->ds_bp
, txg_start
, flags
, func
, arg
));
364 * NB: pool must not be changing on-disk (eg, from zdb or sync context).
367 traverse_pool(spa_t
*spa
, blkptr_cb_t func
, void *arg
)
371 dsl_pool_t
*dp
= spa_get_dsl(spa
);
372 objset_t
*mos
= dp
->dp_meta_objset
;
375 err
= traverse_impl(spa
, 0, spa_get_rootblkptr(spa
),
376 0, TRAVERSE_PRE
, func
, arg
);
380 /* visit each dataset */
381 for (obj
= 1; err
== 0; err
= dmu_object_next(mos
, &obj
, FALSE
, 0)) {
382 dmu_object_info_t doi
;
384 err
= dmu_object_info(mos
, obj
, &doi
);
388 if (doi
.doi_type
== DMU_OT_DSL_DATASET
) {
390 rw_enter(&dp
->dp_config_rwlock
, RW_READER
);
391 err
= dsl_dataset_hold_obj(dp
, obj
, FTAG
, &ds
);
392 rw_exit(&dp
->dp_config_rwlock
);
395 err
= traverse_dataset(ds
,
396 ds
->ds_phys
->ds_prev_snap_txg
, TRAVERSE_PRE
,
398 dsl_dataset_rele(ds
, FTAG
);