/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2017 Datto Inc.
 */
#include <sys/bpobj.h>
#include <sys/zfs_context.h>
#include <sys/zfs_refcount.h>
#include <sys/dsl_pool.h>
#include <sys/zfeature.h>
#include <sys/zap.h>
/*
 * Return an empty bpobj, preferably the empty dummy one (dp_empty_bpobj).
 */
uint64_t
bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx)
{
	spa_t *spa = dmu_objset_spa(os);
	dsl_pool_t *dp = dmu_objset_pool(os);

	if (spa_feature_is_enabled(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
		if (!spa_feature_is_active(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
			ASSERT0(dp->dp_empty_bpobj);
			dp->dp_empty_bpobj =
			    bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx);
			VERIFY(zap_add(os,
			    DMU_POOL_DIRECTORY_OBJECT,
			    DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
			    &dp->dp_empty_bpobj, tx) == 0);
		}
		spa_feature_incr(spa, SPA_FEATURE_EMPTY_BPOBJ, tx);
		ASSERT(dp->dp_empty_bpobj != 0);
		return (dp->dp_empty_bpobj);
	} else {
		return (bpobj_alloc(os, blocksize, tx));
	}
}
void
bpobj_decr_empty(objset_t *os, dmu_tx_t *tx)
{
	dsl_pool_t *dp = dmu_objset_pool(os);

	spa_feature_decr(dmu_objset_spa(os), SPA_FEATURE_EMPTY_BPOBJ, tx);
	if (!spa_feature_is_active(dmu_objset_spa(os),
	    SPA_FEATURE_EMPTY_BPOBJ)) {
		VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_EMPTY_BPOBJ, tx));
		VERIFY3U(0, ==, dmu_object_free(os, dp->dp_empty_bpobj, tx));
		dp->dp_empty_bpobj = 0;
	}
}
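
/*
 * Note: dp_empty_bpobj is a single, shared, always-empty bpobj.  Callers of
 * bpobj_alloc_empty() above get a reference on it (tracked by the
 * SPA_FEATURE_EMPTY_BPOBJ refcount) rather than a private object, and
 * bpobj_decr_empty() destroys it once the last reference is dropped.
 */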
uint64_t
bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
{
	int size;

	if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT)
		size = BPOBJ_SIZE_V0;
	else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
		size = BPOBJ_SIZE_V1;
	else if (!spa_feature_is_active(dmu_objset_spa(os),
	    SPA_FEATURE_LIVELIST))
		size = BPOBJ_SIZE_V2;
	else
		size = sizeof (bpobj_phys_t);

	return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize,
	    DMU_OT_BPOBJ_HDR, size, tx));
}
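
/*
 * The bonus length chosen above determines which bpobj_phys_t fields exist
 * on disk: BPOBJ_SIZE_V0 covers only bpo_num_blkptrs and bpo_bytes,
 * BPOBJ_SIZE_V1 adds the comp/uncomp accounting, BPOBJ_SIZE_V2 adds the
 * subobj list, and the full struct adds bpo_num_freed for livelists (see
 * sys/bpobj.h).  bpobj_open() derives the bpo_have* flags from these sizes.
 */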
void
bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
{
	int64_t i;
	bpobj_t bpo;
	dmu_object_info_t doi;
	int epb;
	dmu_buf_t *dbuf = NULL;

	ASSERT(obj != dmu_objset_pool(os)->dp_empty_bpobj);
	VERIFY3U(0, ==, bpobj_open(&bpo, os, obj));

	mutex_enter(&bpo.bpo_lock);

	if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0)
		goto out;

	VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi));
	epb = doi.doi_data_block_size / sizeof (uint64_t);

	for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
		uint64_t *objarray;
		uint64_t offset, blkoff;

		offset = i * sizeof (uint64_t);
		blkoff = P2PHASE(i, epb);

		if (dbuf == NULL || dbuf->db_offset > offset) {
			if (dbuf)
				dmu_buf_rele(dbuf, FTAG);
			VERIFY3U(0, ==, dmu_buf_hold(os,
			    bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0));
		}

		ASSERT3U(offset, >=, dbuf->db_offset);
		ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);

		objarray = dbuf->db_data;
		bpobj_free(os, objarray[blkoff], tx);
	}
	if (dbuf) {
		dmu_buf_rele(dbuf, FTAG);
		dbuf = NULL;
	}

	VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx));

out:
	mutex_exit(&bpo.bpo_lock);
	bpobj_close(&bpo);

	VERIFY3U(0, ==, dmu_object_free(os, obj, tx));
}
int
bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object)
{
	dmu_object_info_t doi;
	int err;

	err = dmu_object_info(os, object, &doi);
	if (err)
		return (err);

	memset(bpo, 0, sizeof (*bpo));
	mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL);

	ASSERT(bpo->bpo_dbuf == NULL);
	ASSERT(bpo->bpo_phys == NULL);
	ASSERT(object != 0);
	ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ);
	ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR);

	err = dmu_bonus_hold(os, object, bpo, &bpo->bpo_dbuf);
	if (err)
		return (err);

	bpo->bpo_os = os;
	bpo->bpo_object = object;
	bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT;
	bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0);
	bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1);
	bpo->bpo_havefreed = (doi.doi_bonus_size > BPOBJ_SIZE_V2);
	bpo->bpo_phys = bpo->bpo_dbuf->db_data;
	return (0);
}
boolean_t
bpobj_is_open(const bpobj_t *bpo)
{
	return (bpo->bpo_object != 0);
}
void
bpobj_close(bpobj_t *bpo)
{
	/* Lame workaround for closing a bpobj that was never opened. */
	if (bpo->bpo_object == 0)
		return;

	dmu_buf_rele(bpo->bpo_dbuf, bpo);
	if (bpo->bpo_cached_dbuf != NULL)
		dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
	bpo->bpo_dbuf = NULL;
	bpo->bpo_phys = NULL;
	bpo->bpo_cached_dbuf = NULL;
	bpo->bpo_object = 0;

	mutex_destroy(&bpo->bpo_lock);
}
static boolean_t
bpobj_is_empty_impl(bpobj_t *bpo)
{
	ASSERT(MUTEX_HELD(&bpo->bpo_lock));
	return (bpo->bpo_phys->bpo_num_blkptrs == 0 &&
	    (!bpo->bpo_havesubobj || bpo->bpo_phys->bpo_num_subobjs == 0));
}

boolean_t
bpobj_is_empty(bpobj_t *bpo)
{
	mutex_enter(&bpo->bpo_lock);
	boolean_t is_empty = bpobj_is_empty_impl(bpo);
	mutex_exit(&bpo->bpo_lock);
	return (is_empty);
}
/*
 * A recursive iteration of the bpobjs would be nice here but we run the risk
 * of overflowing function stack space.  Instead, find each subobj and add it
 * to the head of our list so it can be scanned for subobjs.  Like a
 * recursive implementation, the "deepest" subobjs will be freed first.
 * When a subobj is found to have no additional subobjs, free it.
 */
typedef struct bpobj_info {
	bpobj_t *bpi_bpo;
	/*
	 * This object is a subobj of bpi_parent,
	 * at bpi_index in its subobj array.
	 */
	struct bpobj_info *bpi_parent;
	uint64_t bpi_index;
	/* How many of our subobj's are left to process. */
	uint64_t bpi_unprocessed_subobjs;
	/* True after having visited this bpo's directly referenced BPs. */
	boolean_t bpi_visited;
	list_node_t bpi_node;
} bpobj_info_t;

static bpobj_info_t *
bpi_alloc(bpobj_t *bpo, bpobj_info_t *parent, uint64_t index)
{
	bpobj_info_t *bpi = kmem_zalloc(sizeof (bpobj_info_t), KM_SLEEP);
	bpi->bpi_bpo = bpo;
	bpi->bpi_parent = parent;
	bpi->bpi_index = index;
	if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
		bpi->bpi_unprocessed_subobjs = bpo->bpo_phys->bpo_num_subobjs;
	}
	return (bpi);
}
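
/*
 * For example, given a bpobj A with subobjs [B, C] where C itself has a
 * subobj D, bpobj_iterate_impl() below visits blkptrs in the order
 * A, C, D, B: each bpobj's own blkptrs are processed first, then its last
 * unprocessed subobj is pushed, so the deepest objects are emptied (and,
 * when freeing, destroyed) before their parents.
 */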
/*
 * Update bpobj and all of its parents with new space accounting.
 */
static void
propagate_space_reduction(bpobj_info_t *bpi, int64_t freed,
    int64_t comp_freed, int64_t uncomp_freed, dmu_tx_t *tx)
{

	for (; bpi != NULL; bpi = bpi->bpi_parent) {
		bpobj_t *p = bpi->bpi_bpo;
		ASSERT(dmu_buf_is_dirty(p->bpo_dbuf, tx));
		p->bpo_phys->bpo_bytes -= freed;
		ASSERT3S(p->bpo_phys->bpo_bytes, >=, 0);
		if (p->bpo_havecomp) {
			p->bpo_phys->bpo_comp -= comp_freed;
			p->bpo_phys->bpo_uncomp -= uncomp_freed;
		}
	}
}
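
/*
 * The dirty assertion above holds because bpobj_iterate_impl() calls
 * dmu_buf_will_dirty() on every bpobj it pushes onto its stack when freeing,
 * before any of that bpobj's descendants can propagate space back up to it.
 */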
static int
bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg,
    int64_t start, dmu_tx_t *tx, boolean_t free)
{
	int err = 0;
	int64_t freed = 0, comp_freed = 0, uncomp_freed = 0;
	dmu_buf_t *dbuf = NULL;
	bpobj_t *bpo = bpi->bpi_bpo;

	int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1;
	uint64_t pe = P2ALIGN_TYPED(i, bpo->bpo_epb, uint64_t) *
	    sizeof (blkptr_t);
	uint64_t ps = start * sizeof (blkptr_t);
	uint64_t pb = MAX((pe > dmu_prefetch_max) ? pe - dmu_prefetch_max : 0,
	    ps);
	if (pe > pb) {
		dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0, pb, pe - pb,
		    ZIO_PRIORITY_ASYNC_READ);
	}

	for (; i >= start; i--) {
		uint64_t offset = i * sizeof (blkptr_t);
		uint64_t blkoff = P2PHASE(i, bpo->bpo_epb);

		if (dbuf == NULL || dbuf->db_offset > offset) {
			if (dbuf)
				dmu_buf_rele(dbuf, FTAG);
			err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
			    offset, FTAG, &dbuf, DMU_READ_NO_PREFETCH);
			if (err)
				break;

			pe = pb;
			pb = MAX((dbuf->db_offset > dmu_prefetch_max) ?
			    dbuf->db_offset - dmu_prefetch_max : 0, ps);
			if (pe > pb) {
				dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0,
				    pb, pe - pb, ZIO_PRIORITY_ASYNC_READ);
			}
		}

		ASSERT3U(offset, >=, dbuf->db_offset);
		ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);

		blkptr_t *bparray = dbuf->db_data;
		blkptr_t *bp = &bparray[blkoff];

		boolean_t bp_freed = BP_GET_FREE(bp);
		err = func(arg, bp, bp_freed, tx);
		if (err)
			break;

		if (free) {
			int sign = bp_freed ? -1 : +1;
			spa_t *spa = dmu_objset_spa(bpo->bpo_os);
			freed += sign * bp_get_dsize_sync(spa, bp);
			comp_freed += sign * BP_GET_PSIZE(bp);
			uncomp_freed += sign * BP_GET_UCSIZE(bp);
			ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf, tx));
			bpo->bpo_phys->bpo_num_blkptrs--;
			ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0);
			if (bp_freed) {
				ASSERT(bpo->bpo_havefreed);
				bpo->bpo_phys->bpo_num_freed--;
				ASSERT3S(bpo->bpo_phys->bpo_num_freed, >=, 0);
			}
		}
	}
	if (free) {
		propagate_space_reduction(bpi, freed, comp_freed,
		    uncomp_freed, tx);
		VERIFY0(dmu_free_range(bpo->bpo_os,
		    bpo->bpo_object,
		    bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t),
		    DMU_OBJECT_END, tx));
	}
	if (dbuf) {
		dmu_buf_rele(dbuf, FTAG);
		dbuf = NULL;
	}
	return (err);
}
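
/*
 * Note on the prefetch logic above: blkptrs are visited from the end of the
 * object back toward "start", so each time a new block is held the next
 * (lower) window of up to dmu_prefetch_max bytes is issued, keeping reads
 * ahead of the backwards scan without prefetching the entire object at once.
 */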
/*
 * Given an initial bpo, start by freeing the BPs that are directly referenced
 * by that bpo. If the bpo has subobjs, read in its last subobj and push the
 * subobj to our stack. By popping items off our stack, eventually we will
 * encounter a bpo that has no subobjs.  We can free its bpobj_info_t, and if
 * requested also free the now-empty bpo from disk and decrement
 * its parent's subobj count. We continue popping each subobj from our stack,
 * visiting its last subobj until they too have no more subobjs, and so on.
 */
static int
bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
    dmu_tx_t *tx, boolean_t free, uint64_t *bpobj_size)
{
	list_t stack;
	bpobj_info_t *bpi;
	int err = 0;

	/*
	 * Create a "stack" for us to work with without worrying about
	 * stack overflows. Initialize it with the initial_bpo.
	 */
	list_create(&stack, sizeof (bpobj_info_t),
	    offsetof(bpobj_info_t, bpi_node));
	mutex_enter(&initial_bpo->bpo_lock);

	if (bpobj_size != NULL)
		*bpobj_size = initial_bpo->bpo_phys->bpo_num_blkptrs;

	list_insert_head(&stack, bpi_alloc(initial_bpo, NULL, 0));

	while ((bpi = list_head(&stack)) != NULL) {
		bpobj_t *bpo = bpi->bpi_bpo;

		ASSERT3P(bpo, !=, NULL);
		ASSERT(MUTEX_HELD(&bpo->bpo_lock));
		ASSERT(bpobj_is_open(bpo));

		if (free)
			dmu_buf_will_dirty(bpo->bpo_dbuf, tx);

		if (bpi->bpi_visited == B_FALSE) {
			err = bpobj_iterate_blkptrs(bpi, func, arg, 0, tx,
			    free);
			bpi->bpi_visited = B_TRUE;
			if (err != 0)
				break;
		}
		/*
		 * We've finished with this bpo's directly-referenced BP's and
		 * it has no more unprocessed subobjs. We can free its
		 * bpobj_info_t (unless it is the topmost, initial_bpo).
		 * If we are freeing from disk, we can also do that.
		 */
		if (bpi->bpi_unprocessed_subobjs == 0) {
			/*
			 * If there are no entries, there should
			 * be no bytes.
			 */
			if (bpobj_is_empty_impl(bpo)) {
				ASSERT0(bpo->bpo_phys->bpo_bytes);
				ASSERT0(bpo->bpo_phys->bpo_comp);
				ASSERT0(bpo->bpo_phys->bpo_uncomp);
			}

			/* The initial_bpo has no parent and is not closed. */
			if (bpi->bpi_parent != NULL) {
				if (free) {
					bpobj_t *p = bpi->bpi_parent->bpi_bpo;

					ASSERT0(bpo->bpo_phys->bpo_num_blkptrs);
					ASSERT3U(p->bpo_phys->bpo_num_subobjs,
					    >, 0);
					ASSERT3U(bpi->bpi_index, ==,
					    p->bpo_phys->bpo_num_subobjs - 1);
					ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf,
					    tx));

					p->bpo_phys->bpo_num_subobjs--;

					VERIFY0(dmu_free_range(p->bpo_os,
					    p->bpo_phys->bpo_subobjs,
					    bpi->bpi_index * sizeof (uint64_t),
					    sizeof (uint64_t), tx));

					/* eliminate the empty subobj list */
					if (bpo->bpo_havesubobj &&
					    bpo->bpo_phys->bpo_subobjs != 0) {
						ASSERT0(bpo->bpo_phys->
						    bpo_num_subobjs);
						err = dmu_object_free(
						    bpo->bpo_os,
						    bpo->bpo_phys->bpo_subobjs,
						    tx);
						if (err)
							break;
						bpo->bpo_phys->bpo_subobjs = 0;
					}
					err = dmu_object_free(p->bpo_os,
					    bpo->bpo_object, tx);
					if (err)
						break;
				}

				mutex_exit(&bpo->bpo_lock);
				bpobj_close(bpo);
				kmem_free(bpo, sizeof (bpobj_t));
			} else {
				mutex_exit(&bpo->bpo_lock);
			}

			/*
			 * Finished processing this bpo. Unlock, and free
			 * our "stack" info.
			 */
			list_remove_head(&stack);
			kmem_free(bpi, sizeof (bpobj_info_t));
		} else {
			/*
			 * We have unprocessed subobjs. Process the next one.
			 */
			ASSERT(bpo->bpo_havecomp);
			ASSERT3P(bpobj_size, ==, NULL);

			/* Add the last subobj to stack. */
			int64_t i = bpi->bpi_unprocessed_subobjs - 1;
			uint64_t offset = i * sizeof (uint64_t);

			uint64_t subobj;
			err = dmu_read(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
			    offset, sizeof (uint64_t), &subobj,
			    DMU_READ_NO_PREFETCH);
			if (err)
				break;

			bpobj_t *subbpo = kmem_alloc(sizeof (bpobj_t),
			    KM_SLEEP);
			err = bpobj_open(subbpo, bpo->bpo_os, subobj);
			if (err) {
				kmem_free(subbpo, sizeof (bpobj_t));
				break;
			}

			if (subbpo->bpo_havesubobj &&
			    subbpo->bpo_phys->bpo_subobjs != 0) {
				dmu_prefetch(subbpo->bpo_os,
				    subbpo->bpo_phys->bpo_subobjs, 0, 0, 0,
				    ZIO_PRIORITY_ASYNC_READ);
			}

			list_insert_head(&stack, bpi_alloc(subbpo, bpi, i));
			mutex_enter(&subbpo->bpo_lock);
			bpi->bpi_unprocessed_subobjs--;
		}
	}
	/*
	 * Cleanup anything left on the "stack" after we left the loop.
	 * Every bpo on the stack is locked so we must remember to undo
	 * that now (in LIFO order).
	 */
	while ((bpi = list_remove_head(&stack)) != NULL) {
		bpobj_t *bpo = bpi->bpi_bpo;
		ASSERT(err != 0);
		ASSERT3P(bpo, !=, NULL);

		mutex_exit(&bpo->bpo_lock);

		/* do not free the initial_bpo */
		if (bpi->bpi_parent != NULL) {
			bpobj_close(bpi->bpi_bpo);
			kmem_free(bpi->bpi_bpo, sizeof (bpobj_t));
		}
		kmem_free(bpi, sizeof (bpobj_info_t));
	}

	list_destroy(&stack);

	return (err);
}
/*
 * Iterate and remove the entries.  If func returns nonzero, iteration
 * will stop and that entry will not be removed.
 */
int
bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
{
	return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE, NULL));
}
/*
 * Iterate the entries.  If func returns nonzero, iteration will stop.
 *
 * If there are no subobjs:
 *
 * *bpobj_size can be used to return the number of block pointers in the
 * bpobj.  Note that this may be different from the number of block pointers
 * that are iterated over, if iteration is terminated early (e.g. by the func
 * returning nonzero).
 *
 * If there are concurrent (or subsequent) modifications to the bpobj then the
 * returned *bpobj_size can be passed as "start" to
 * livelist_bpobj_iterate_from_nofree() to iterate the newly added entries.
 */
int
bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg,
    uint64_t *bpobj_size)
{
	return (bpobj_iterate_impl(bpo, func, arg, NULL, B_FALSE, bpobj_size));
}
/*
 * Iterate over the blkptrs in the bpobj beginning at index start. If func
 * returns nonzero, iteration will stop. This is a livelist specific function
 * since it assumes that there are no subobjs present.
 */
int
livelist_bpobj_iterate_from_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg,
    int64_t start)
{
	if (bpo->bpo_havesubobj)
		VERIFY0(bpo->bpo_phys->bpo_subobjs);
	bpobj_info_t *bpi = bpi_alloc(bpo, NULL, 0);
	int err = bpobj_iterate_blkptrs(bpi, func, arg, start, NULL, B_FALSE);
	kmem_free(bpi, sizeof (bpobj_info_t));
	return (err);
}
/*
 * Logically add subobj's contents to the parent bpobj.
 *
 * In the most general case, this is accomplished in constant time by adding
 * a reference to subobj.  This case is used when enqueuing a large subobj:
 * +--------------+                        +--------------+
 * | bpobj        |----------------------->| subobj list  |
 * +----+----+----+----+----+              +-----+-----+--+--+
 * | bp | bp | bp | bp | bp |              | obj | obj | obj |
 * +----+----+----+----+----+              +-----+-----+-----+
 *
 * +--------------+                        +--------------+
 * | sub-bpobj    |----------------------> | subsubobj    |
 * +----+----+----+----+---------+----+    +-----+-----+--+--------+-----+
 * | bp | bp | bp | bp |   ...   | bp |    | obj | obj |    ...    | obj |
 * +----+----+----+----+---------+----+    +-----+-----+-----------+-----+
 *
 * Result: sub-bpobj added to parent's subobj list.
 * +--------------+                        +--------------+
 * | bpobj        |----------------------->| subobj list  |
 * +----+----+----+----+----+              +-----+-----+--+--+-----+
 * | bp | bp | bp | bp | bp |              | obj | obj | obj | OBJ |
 * +----+----+----+----+----+              +-----+-----+-----+--|--+
 *                                                              |
 *       /-----------------------------------------------------/
 *       v
 * +--------------+                        +--------------+
 * | sub-bpobj    |----------------------> | subsubobj    |
 * +----+----+----+----+---------+----+    +-----+-----+--+--------+-----+
 * | bp | bp | bp | bp |   ...   | bp |    | obj | obj |    ...    | obj |
 * +----+----+----+----+---------+----+    +-----+-----+-----------+-----+
 *
 *
 * In a common case, the subobj is small: its bp's and its list of subobj's
 * are each stored in a single block.  In this case we copy the subobj's
 * contents to the parent:
 * +--------------+                        +--------------+
 * | bpobj        |----------------------->| subobj list  |
 * +----+----+----+----+----+              +-----+-----+--+--+
 * | bp | bp | bp | bp | bp |              | obj | obj | obj |
 * +----+----+----+----+----+              +-----+-----+-----+
 *                           ^                               ^
 * +--------------+          |             +--------------+  |
 * | sub-bpobj    |---------^------------> | subsubobj    |  ^
 * +----+----+----+         |              +-----+-----+--+  |
 * | BP | BP |-->-->-->-->-/               | OBJ | OBJ |-->-/
 * +----+----+                             +-----+-----+
 *
 * Result: subobj destroyed, contents copied to parent:
 * +--------------+                        +--------------+
 * | bpobj        |----------------------->| subobj list  |
 * +----+----+----+----+----+----+----+    +-----+-----+--+--+-----+-----+
 * | bp | bp | bp | bp | bp | BP | BP |    | obj | obj | obj | OBJ | OBJ |
 * +----+----+----+----+----+----+----+    +-----+-----+-----+-----+-----+
 *
 *
 * If the subobj has many BP's but few subobj's, we can copy the sub-subobj's
 * but retain the sub-bpobj:
 * +--------------+                        +--------------+
 * | bpobj        |----------------------->| subobj list  |
 * +----+----+----+----+----+              +-----+-----+--+--+
 * | bp | bp | bp | bp | bp |              | obj | obj | obj |
 * +----+----+----+----+----+              +-----+-----+-----+
 *                                                            ^
 * +--------------+                        +--------------+   |
 * | sub-bpobj    |----------------------> | subsubobj    |   ^
 * +----+----+----+----+---------+----+    +-----+-----+--+   |
 * | bp | bp | bp | bp |   ...   | bp |    | OBJ | OBJ |-->-/
 * +----+----+----+----+---------+----+    +-----+-----+
 *
 * Result: sub-sub-bpobjs and subobj added to parent's subobj list.
 * +--------------+                     +--------------+
 * | bpobj        |-------------------->| subobj list  |
 * +----+----+----+----+----+           +-----+-----+--+--+-----+-----+------+
 * | bp | bp | bp | bp | bp |           | obj | obj | obj | OBJ | OBJ | OBJ* |
 * +----+----+----+----+----+           +-----+-----+-----+-----+-----+--|---+
 *                                                                        |
 *       /--------------------------------------------------------------/
 *       v
 * +--------------+
 * | sub-bpobj    |
 * +----+----+----+----+---------+----+
 * | bp | bp | bp | bp |   ...   | bp |
 * +----+----+----+----+---------+----+
 */
void
bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
{
	bpobj_t subbpo;
	uint64_t used, comp, uncomp, subsubobjs;
	boolean_t copy_subsub = B_TRUE;
	boolean_t copy_bps = B_TRUE;

	ASSERT(bpobj_is_open(bpo));
	ASSERT(subobj != 0);
	ASSERT(bpo->bpo_havesubobj);
	ASSERT(bpo->bpo_havecomp);
	ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);

	if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj) {
		bpobj_decr_empty(bpo->bpo_os, tx);
		return;
	}

	VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
	if (bpobj_is_empty(&subbpo)) {
		/* No point in having an empty subobj. */
		bpobj_close(&subbpo);
		bpobj_free(bpo->bpo_os, subobj, tx);
		return;
	}
	VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));

	mutex_enter(&bpo->bpo_lock);
	dmu_buf_will_dirty(bpo->bpo_dbuf, tx);

	dmu_object_info_t doi;

	if (bpo->bpo_phys->bpo_subobjs != 0) {
		ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
		    &doi));
		ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ);
	}

	/*
	 * If subobj has only one block of subobjs, then move subobj's
	 * subobjs to bpo's subobj list directly. This reduces recursion in
	 * bpobj_iterate due to nested subobjs.
	 */
	subsubobjs = subbpo.bpo_phys->bpo_subobjs;
	if (subsubobjs != 0) {
		VERIFY0(dmu_object_info(bpo->bpo_os, subsubobjs, &doi));
		if (doi.doi_max_offset > doi.doi_data_block_size) {
			copy_subsub = B_FALSE;
		}
	}

	/*
	 * If, in addition to having only one block of subobj's, subobj has
	 * only one block of bp's, then move subobj's bp's to bpo's bp list
	 * directly. This reduces recursion in bpobj_iterate due to nested
	 * subobjs.
	 */
	VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subobj, &doi));
	if (doi.doi_max_offset > doi.doi_data_block_size || !copy_subsub) {
		copy_bps = B_FALSE;
	}

	if (copy_subsub && subsubobjs != 0) {
		dmu_buf_t *subdb;
		uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs;

		VERIFY0(dmu_buf_hold(bpo->bpo_os, subsubobjs,
		    0, FTAG, &subdb, 0));
		/*
		 * Make sure that we are not asking dmu_write()
		 * to write more data than we have in our buffer.
		 */
		VERIFY3U(subdb->db_size, >=,
		    numsubsub * sizeof (subobj));
		if (bpo->bpo_phys->bpo_subobjs == 0) {
			bpo->bpo_phys->bpo_subobjs =
			    dmu_object_alloc(bpo->bpo_os,
			    DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
			    DMU_OT_NONE, 0, tx);
		}
		dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
		    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
		    numsubsub * sizeof (subobj), subdb->db_data, tx);
		dmu_buf_rele(subdb, FTAG);
		bpo->bpo_phys->bpo_num_subobjs += numsubsub;

		dmu_buf_will_dirty(subbpo.bpo_dbuf, tx);
		subbpo.bpo_phys->bpo_subobjs = 0;
		VERIFY0(dmu_object_free(bpo->bpo_os, subsubobjs, tx));
	}

	if (copy_bps) {
		dmu_buf_t *bps;
		uint64_t numbps = subbpo.bpo_phys->bpo_num_blkptrs;

		VERIFY0(dmu_buf_hold(bpo->bpo_os, subobj,
		    0, FTAG, &bps, 0));
		/*
		 * Make sure that we are not asking dmu_write()
		 * to write more data than we have in our buffer.
		 */
		VERIFY3U(bps->db_size, >=, numbps * sizeof (blkptr_t));
		dmu_write(bpo->bpo_os, bpo->bpo_object,
		    bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t),
		    numbps * sizeof (blkptr_t),
		    bps->db_data, tx);
		dmu_buf_rele(bps, FTAG);
		bpo->bpo_phys->bpo_num_blkptrs += numbps;

		bpobj_close(&subbpo);
		VERIFY0(dmu_object_free(bpo->bpo_os, subobj, tx));
	} else {
		bpobj_close(&subbpo);
		if (bpo->bpo_phys->bpo_subobjs == 0) {
			bpo->bpo_phys->bpo_subobjs =
			    dmu_object_alloc(bpo->bpo_os,
			    DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
			    DMU_OT_NONE, 0, tx);
		}

		dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
		    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
		    sizeof (subobj), &subobj, tx);
		bpo->bpo_phys->bpo_num_subobjs++;
	}

	bpo->bpo_phys->bpo_bytes += used;
	bpo->bpo_phys->bpo_comp += comp;
	bpo->bpo_phys->bpo_uncomp += uncomp;
	mutex_exit(&bpo->bpo_lock);
}
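
/*
 * bpobj_prefetch_subobj() below repeats the same single-block checks as the
 * copy paths above, so the blocks that bpobj_enqueue_subobj() will read and
 * rewrite are likely already cached by the time it runs.
 */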
/*
 * Prefetch metadata required for bpobj_enqueue_subobj().
 */
void
bpobj_prefetch_subobj(bpobj_t *bpo, uint64_t subobj)
{
	dmu_object_info_t doi;
	bpobj_t subbpo;
	uint64_t subsubobjs;
	boolean_t copy_subsub = B_TRUE;
	boolean_t copy_bps = B_TRUE;

	ASSERT(bpobj_is_open(bpo));
	ASSERT(subobj != 0);

	if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj)
		return;

	if (bpobj_open(&subbpo, bpo->bpo_os, subobj) != 0)
		return;
	if (bpobj_is_empty(&subbpo)) {
		bpobj_close(&subbpo);
		return;
	}
	subsubobjs = subbpo.bpo_phys->bpo_subobjs;
	bpobj_close(&subbpo);

	if (subsubobjs != 0) {
		if (dmu_object_info(bpo->bpo_os, subsubobjs, &doi) != 0)
			return;
		if (doi.doi_max_offset > doi.doi_data_block_size)
			copy_subsub = B_FALSE;
	}

	if (dmu_object_info(bpo->bpo_os, subobj, &doi) != 0)
		return;
	if (doi.doi_max_offset > doi.doi_data_block_size || !copy_subsub)
		copy_bps = B_FALSE;

	if (copy_subsub && subsubobjs != 0) {
		if (bpo->bpo_phys->bpo_subobjs) {
			dmu_prefetch(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 0,
			    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), 1,
			    ZIO_PRIORITY_ASYNC_READ);
		}
		dmu_prefetch(bpo->bpo_os, subsubobjs, 0, 0, 1,
		    ZIO_PRIORITY_ASYNC_READ);
	}

	if (copy_bps) {
		dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0,
		    bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t), 1,
		    ZIO_PRIORITY_ASYNC_READ);
		dmu_prefetch(bpo->bpo_os, subobj, 0, 0, 1,
		    ZIO_PRIORITY_ASYNC_READ);
	} else if (bpo->bpo_phys->bpo_subobjs) {
		dmu_prefetch(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 0,
		    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), 1,
		    ZIO_PRIORITY_ASYNC_READ);
	}
}
void
bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed,
    dmu_tx_t *tx)
{
	blkptr_t stored_bp = *bp;
	uint64_t offset;
	int blkoff;
	blkptr_t *bparray;

	ASSERT(bpobj_is_open(bpo));
	ASSERT(!BP_IS_HOLE(bp));
	ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);

	if (BP_IS_EMBEDDED(bp)) {
		/*
		 * The bpobj will compress better without the payload.
		 *
		 * Note that we store EMBEDDED bp's because they have an
		 * uncompressed size, which must be accounted for.  An
		 * alternative would be to add their size to bpo_uncomp
		 * without storing the bp, but that would create additional
		 * complications: bpo_uncomp would be inconsistent with the
		 * set of BP's stored, and bpobj_iterate() wouldn't visit
		 * all the space accounted for in the bpobj.
		 */
		memset(&stored_bp, 0, sizeof (stored_bp));
		stored_bp.blk_prop = bp->blk_prop;
		BP_SET_LOGICAL_BIRTH(&stored_bp, BP_GET_LOGICAL_BIRTH(bp));
	} else if (!BP_GET_DEDUP(bp)) {
		/* The bpobj will compress better without the checksum */
		memset(&stored_bp.blk_cksum, 0, sizeof (stored_bp.blk_cksum));
	}

	stored_bp.blk_fill = 0;
	BP_SET_FREE(&stored_bp, bp_freed);

	mutex_enter(&bpo->bpo_lock);

	offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp);
	blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb);

	if (bpo->bpo_cached_dbuf == NULL ||
	    offset < bpo->bpo_cached_dbuf->db_offset ||
	    offset >= bpo->bpo_cached_dbuf->db_offset +
	    bpo->bpo_cached_dbuf->db_size) {
		if (bpo->bpo_cached_dbuf)
			dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
		VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
		    offset, bpo, &bpo->bpo_cached_dbuf, 0));
		ASSERT3P(bpo->bpo_cached_dbuf, !=, NULL);
	}

	dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx);
	bparray = bpo->bpo_cached_dbuf->db_data;
	bparray[blkoff] = stored_bp;

	dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
	bpo->bpo_phys->bpo_num_blkptrs++;
	int sign = bp_freed ? -1 : +1;
	bpo->bpo_phys->bpo_bytes += sign *
	    bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
	if (bpo->bpo_havecomp) {
		bpo->bpo_phys->bpo_comp += sign * BP_GET_PSIZE(bp);
		bpo->bpo_phys->bpo_uncomp += sign * BP_GET_UCSIZE(bp);
	}
	if (bp_freed) {
		ASSERT(bpo->bpo_havefreed);
		bpo->bpo_phys->bpo_num_freed++;
	}
	mutex_exit(&bpo->bpo_lock);
}
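
/*
 * Note that FREE entries (bp_freed set, as used by livelists) still occupy a
 * slot in the blkptr array, but their size is subtracted from the space
 * accounting above, so the totals reflect allocations minus frees.
 */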
struct space_range_arg {
	spa_t *spa;
	uint64_t mintxg;
	uint64_t maxtxg;
	uint64_t used;
	uint64_t comp;
	uint64_t uncomp;
};

static int
space_range_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
{
	(void) bp_freed, (void) tx;
	struct space_range_arg *sra = arg;

	if (BP_GET_LOGICAL_BIRTH(bp) > sra->mintxg &&
	    BP_GET_LOGICAL_BIRTH(bp) <= sra->maxtxg) {
		if (dsl_pool_sync_context(spa_get_dsl(sra->spa)))
			sra->used += bp_get_dsize_sync(sra->spa, bp);
		else
			sra->used += bp_get_dsize(sra->spa, bp);
		sra->comp += BP_GET_PSIZE(bp);
		sra->uncomp += BP_GET_UCSIZE(bp);
	}
	return (0);
}
int
bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
	ASSERT(bpobj_is_open(bpo));
	mutex_enter(&bpo->bpo_lock);

	*usedp = bpo->bpo_phys->bpo_bytes;
	if (bpo->bpo_havecomp) {
		*compp = bpo->bpo_phys->bpo_comp;
		*uncompp = bpo->bpo_phys->bpo_uncomp;
		mutex_exit(&bpo->bpo_lock);
		return (0);
	} else {
		mutex_exit(&bpo->bpo_lock);
		return (bpobj_space_range(bpo, 0, UINT64_MAX,
		    usedp, compp, uncompp));
	}
}
/*
 * Return the amount of space in the bpobj which is:
 * mintxg < logical birth <= maxtxg
 */
int
bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
	struct space_range_arg sra = { 0 };
	int err;

	ASSERT(bpobj_is_open(bpo));

	/*
	 * As an optimization, if they want the whole txg range, just
	 * get bpo_bytes rather than iterating over the bps.
	 */
	if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX && bpo->bpo_havecomp)
		return (bpobj_space(bpo, usedp, compp, uncompp));

	sra.spa = dmu_objset_spa(bpo->bpo_os);
	sra.mintxg = mintxg;
	sra.maxtxg = maxtxg;

	err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL);
	*usedp = sra.used;
	*compp = sra.comp;
	*uncompp = sra.uncomp;
	return (err);
}
/*
 * A bpobj_itor_t to append blkptrs to a bplist. Note that while blkptrs in a
 * bpobj are designated as free or allocated that information is not preserved
 * in bplists.
 */
int
bplist_append_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
    dmu_tx_t *tx)
{
	(void) bp_freed, (void) tx;
	bplist_t *bpl = arg;
	bplist_append(bpl, bp);
	return (0);
}