/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2017 Datto Inc.
 */
#include <sys/bpobj.h>
#include <sys/zfs_context.h>
#include <sys/zfs_refcount.h>
#include <sys/dsl_pool.h>
#include <sys/zfeature.h>
#include <sys/zap.h>
/*
 * Return an empty bpobj, preferably the empty dummy one (dp_empty_bpobj).
 */
uint64_t
bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx)
{
	spa_t *spa = dmu_objset_spa(os);
	dsl_pool_t *dp = dmu_objset_pool(os);

	if (spa_feature_is_enabled(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
		if (!spa_feature_is_active(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
			ASSERT0(dp->dp_empty_bpobj);
			dp->dp_empty_bpobj =
			    bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx);
			VERIFY(zap_add(os,
			    DMU_POOL_DIRECTORY_OBJECT,
			    DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
			    &dp->dp_empty_bpobj, tx) == 0);
		}
		spa_feature_incr(spa, SPA_FEATURE_EMPTY_BPOBJ, tx);
		ASSERT(dp->dp_empty_bpobj != 0);
		return (dp->dp_empty_bpobj);
	} else {
		return (bpobj_alloc(os, blocksize, tx));
	}
}
void
bpobj_decr_empty(objset_t *os, dmu_tx_t *tx)
{
	dsl_pool_t *dp = dmu_objset_pool(os);

	spa_feature_decr(dmu_objset_spa(os), SPA_FEATURE_EMPTY_BPOBJ, tx);
	if (!spa_feature_is_active(dmu_objset_spa(os),
	    SPA_FEATURE_EMPTY_BPOBJ)) {
		VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_EMPTY_BPOBJ, tx));
		VERIFY3U(0, ==, dmu_object_free(os, dp->dp_empty_bpobj, tx));
		dp->dp_empty_bpobj = 0;
	}
}
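
/*
 * Allocate a new bpobj object.  The size of its bonus buffer (and therefore
 * which optional fields are available) depends on the pool version and on
 * whether the livelist feature is active.
 */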
uint64_t
bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
{
	int size;

	if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT)
		size = BPOBJ_SIZE_V0;
	else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
		size = BPOBJ_SIZE_V1;
	else if (!spa_feature_is_active(dmu_objset_spa(os),
	    SPA_FEATURE_LIVELIST))
		size = BPOBJ_SIZE_V2;
	else
		size = sizeof (bpobj_phys_t);

	return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize,
	    DMU_OT_BPOBJ_HDR, size, tx));
}
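
/*
 * Free a bpobj: recursively free each subobj listed in its subobj array,
 * then the subobj array itself, and finally the bpobj object.
 */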
void
bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
{
	int64_t i;
	bpobj_t bpo;
	dmu_object_info_t doi;
	int epb;
	dmu_buf_t *dbuf = NULL;

	ASSERT(obj != dmu_objset_pool(os)->dp_empty_bpobj);
	VERIFY3U(0, ==, bpobj_open(&bpo, os, obj));

	mutex_enter(&bpo.bpo_lock);

	if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0)
		goto out;

	VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi));
	epb = doi.doi_data_block_size / sizeof (uint64_t);

	for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
		uint64_t *objarray;
		uint64_t offset, blkoff;

		offset = i * sizeof (uint64_t);
		blkoff = P2PHASE(i, epb);

		if (dbuf == NULL || dbuf->db_offset > offset) {
			if (dbuf)
				dmu_buf_rele(dbuf, FTAG);
			VERIFY3U(0, ==, dmu_buf_hold(os,
			    bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0));
		}

		ASSERT3U(offset, >=, dbuf->db_offset);
		ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);

		objarray = dbuf->db_data;
		bpobj_free(os, objarray[blkoff], tx);
	}
	if (dbuf) {
		dmu_buf_rele(dbuf, FTAG);
		dbuf = NULL;
	}

	VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx));

out:
	mutex_exit(&bpo.bpo_lock);
	bpobj_close(&bpo);

	VERIFY3U(0, ==, dmu_object_free(os, obj, tx));
}
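
/*
 * Open a bpobj by object number: hold its bonus buffer and record which
 * optional on-disk fields (comp/uncomp, subobjs, freed counts) it has,
 * based on the size of that bonus buffer.
 */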
int
bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object)
{
	dmu_object_info_t doi;
	int err;

	err = dmu_object_info(os, object, &doi);
	if (err)
		return (err);

	bzero(bpo, sizeof (*bpo));
	mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL);

	ASSERT(bpo->bpo_dbuf == NULL);
	ASSERT(bpo->bpo_phys == NULL);
	ASSERT(object != 0);
	ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ);
	ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR);

	err = dmu_bonus_hold(os, object, bpo, &bpo->bpo_dbuf);
	if (err)
		return (err);

	bpo->bpo_os = os;
	bpo->bpo_object = object;
	bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT;
	bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0);
	bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1);
	bpo->bpo_havefreed = (doi.doi_bonus_size > BPOBJ_SIZE_V2);
	bpo->bpo_phys = bpo->bpo_dbuf->db_data;
	return (0);
}
boolean_t
bpobj_is_open(const bpobj_t *bpo)
{
	return (bpo->bpo_object != 0);
}
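
/*
 * Release the dbufs held by this bpobj and tear down its lock.  Safe to call
 * on a bpobj that was never successfully opened.
 */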
void
bpobj_close(bpobj_t *bpo)
{
	/* Lame workaround for closing a bpobj that was never opened. */
	if (bpo->bpo_object == 0)
		return;

	dmu_buf_rele(bpo->bpo_dbuf, bpo);
	if (bpo->bpo_cached_dbuf != NULL)
		dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
	bpo->bpo_dbuf = NULL;
	bpo->bpo_phys = NULL;
	bpo->bpo_cached_dbuf = NULL;
	bpo->bpo_object = 0;

	mutex_destroy(&bpo->bpo_lock);
}
static boolean_t
bpobj_is_empty_impl(bpobj_t *bpo)
{
	ASSERT(MUTEX_HELD(&bpo->bpo_lock));
	return (bpo->bpo_phys->bpo_num_blkptrs == 0 &&
	    (!bpo->bpo_havesubobj || bpo->bpo_phys->bpo_num_subobjs == 0));
}
boolean_t
bpobj_is_empty(bpobj_t *bpo)
{
	mutex_enter(&bpo->bpo_lock);
	boolean_t is_empty = bpobj_is_empty_impl(bpo);
	mutex_exit(&bpo->bpo_lock);
	return (is_empty);
}
/*
 * A recursive iteration of the bpobjs would be nice here but we run the risk
 * of overflowing function stack space.  Instead, find each subobj and add it
 * to the head of our list so it can be scanned for subobjs.  Like a
 * recursive implementation, the "deepest" subobjs will be freed first.
 * When a subobj is found to have no additional subobjs, free it.
 */
typedef struct bpobj_info {
	bpobj_t		*bpi_bpo;
	/*
	 * This object is a subobj of bpi_parent,
	 * at bpi_index in its subobj array.
	 */
	struct bpobj_info *bpi_parent;
	uint64_t	bpi_index;
	/* How many of our subobjs are left to process. */
	uint64_t	bpi_unprocessed_subobjs;
	/* True after having visited this bpo's directly referenced BPs. */
	boolean_t	bpi_visited;
	list_node_t	bpi_node;
} bpobj_info_t;
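
/*
 * Allocate a bpobj_info_t for bpo, recording its parent and its index in the
 * parent's subobj array, along with how many subobjs remain to be processed.
 */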
static bpobj_info_t *
bpi_alloc(bpobj_t *bpo, bpobj_info_t *parent, uint64_t index)
{
	bpobj_info_t *bpi = kmem_zalloc(sizeof (bpobj_info_t), KM_SLEEP);
	bpi->bpi_bpo = bpo;
	bpi->bpi_parent = parent;
	bpi->bpi_index = index;
	if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
		bpi->bpi_unprocessed_subobjs = bpo->bpo_phys->bpo_num_subobjs;
	}
	return (bpi);
}
/*
 * Update bpobj and all of its parents with new space accounting.
 */
static void
propagate_space_reduction(bpobj_info_t *bpi, int64_t freed,
    int64_t comp_freed, int64_t uncomp_freed, dmu_tx_t *tx)
{
	for (; bpi != NULL; bpi = bpi->bpi_parent) {
		bpobj_t *p = bpi->bpi_bpo;
		ASSERT(dmu_buf_is_dirty(p->bpo_dbuf, tx));
		p->bpo_phys->bpo_bytes -= freed;
		ASSERT3S(p->bpo_phys->bpo_bytes, >=, 0);
		if (p->bpo_havecomp) {
			p->bpo_phys->bpo_comp -= comp_freed;
			p->bpo_phys->bpo_uncomp -= uncomp_freed;
		}
	}
}
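
/*
 * Visit the block pointers directly referenced by bpi's bpobj, from the last
 * entry down to index "start", calling func on each one.  If "free" is set,
 * the visited entries are removed and the space accounting of this bpobj and
 * all of its parents is updated accordingly.
 */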
static int
bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg,
    int64_t start, dmu_tx_t *tx, boolean_t free)
{
	int err = 0;
	int64_t freed = 0, comp_freed = 0, uncomp_freed = 0;
	dmu_buf_t *dbuf = NULL;
	bpobj_t *bpo = bpi->bpi_bpo;

	for (int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= start; i--) {
		uint64_t offset = i * sizeof (blkptr_t);
		uint64_t blkoff = P2PHASE(i, bpo->bpo_epb);

		if (dbuf == NULL || dbuf->db_offset > offset) {
			if (dbuf)
				dmu_buf_rele(dbuf, FTAG);
			err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
			    offset, FTAG, &dbuf, 0);
			if (err)
				break;
		}

		ASSERT3U(offset, >=, dbuf->db_offset);
		ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);

		blkptr_t *bparray = dbuf->db_data;
		blkptr_t *bp = &bparray[blkoff];

		boolean_t bp_freed = BP_GET_FREE(bp);
		err = func(arg, bp, bp_freed, tx);
		if (err)
			break;

		if (free) {
			int sign = bp_freed ? -1 : +1;
			spa_t *spa = dmu_objset_spa(bpo->bpo_os);
			freed += sign * bp_get_dsize_sync(spa, bp);
			comp_freed += sign * BP_GET_PSIZE(bp);
			uncomp_freed += sign * BP_GET_UCSIZE(bp);
			ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf, tx));
			bpo->bpo_phys->bpo_num_blkptrs--;
			ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0);
			if (bp_freed) {
				ASSERT(bpo->bpo_havefreed);
				bpo->bpo_phys->bpo_num_freed--;
				ASSERT3S(bpo->bpo_phys->bpo_num_freed, >=, 0);
			}
		}
	}
	if (free) {
		propagate_space_reduction(bpi, freed, comp_freed,
		    uncomp_freed, tx);
		VERIFY0(dmu_free_range(bpo->bpo_os, bpo->bpo_object,
		    bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t),
		    DMU_OBJECT_END, tx));
	}
	if (dbuf) {
		dmu_buf_rele(dbuf, FTAG);
		dbuf = NULL;
	}
	return (err);
}
/*
 * Given an initial bpo, start by freeing the BPs that are directly referenced
 * by that bpo. If the bpo has subobjs, read in its last subobj and push the
 * subobj to our stack. By popping items off our stack, eventually we will
 * encounter a bpo that has no subobjs.  We can free its bpobj_info_t, and if
 * requested also free the now-empty bpo from disk and decrement
 * its parent's subobj count. We continue popping each subobj from our stack,
 * visiting its last subobj until they too have no more subobjs, and so on.
 */
static int
bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
    dmu_tx_t *tx, boolean_t free, uint64_t *bpobj_size)
{
	list_t stack;
	bpobj_info_t *bpi;
	int err = 0;

	/*
	 * Create a "stack" for us to work with without worrying about
	 * stack overflows. Initialize it with the initial_bpo.
	 */
	list_create(&stack, sizeof (bpobj_info_t),
	    offsetof(bpobj_info_t, bpi_node));
	mutex_enter(&initial_bpo->bpo_lock);

	if (bpobj_size != NULL)
		*bpobj_size = initial_bpo->bpo_phys->bpo_num_blkptrs;

	list_insert_head(&stack, bpi_alloc(initial_bpo, NULL, 0));

	while ((bpi = list_head(&stack)) != NULL) {
		bpobj_t *bpo = bpi->bpi_bpo;

		ASSERT3P(bpo, !=, NULL);
		ASSERT(MUTEX_HELD(&bpo->bpo_lock));
		ASSERT(bpobj_is_open(bpo));

		if (free)
			dmu_buf_will_dirty(bpo->bpo_dbuf, tx);

		if (bpi->bpi_visited == B_FALSE) {
			err = bpobj_iterate_blkptrs(bpi, func, arg, 0, tx,
			    free);
			bpi->bpi_visited = B_TRUE;
			if (err != 0)
				break;
		}
		/*
		 * We've finished with this bpo's directly-referenced BP's and
		 * it has no more unprocessed subobjs. We can free its
		 * bpobj_info_t (unless it is the topmost, initial_bpo).
		 * If we are freeing from disk, we can also do that.
		 */
		if (bpi->bpi_unprocessed_subobjs == 0) {
			/*
			 * If there are no entries, there should
			 * be no bytes.
			 */
			if (bpobj_is_empty_impl(bpo)) {
				ASSERT0(bpo->bpo_phys->bpo_bytes);
				ASSERT0(bpo->bpo_phys->bpo_comp);
				ASSERT0(bpo->bpo_phys->bpo_uncomp);
			}

			/* The initial_bpo has no parent and is not closed. */
			if (bpi->bpi_parent != NULL) {
				if (free) {
					bpobj_t *p = bpi->bpi_parent->bpi_bpo;

					ASSERT0(bpo->bpo_phys->bpo_num_blkptrs);
					ASSERT3U(p->bpo_phys->bpo_num_subobjs,
					    >, 0);
					ASSERT3U(bpi->bpi_index, ==,
					    p->bpo_phys->bpo_num_subobjs - 1);
					ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf,
					    tx));

					p->bpo_phys->bpo_num_subobjs--;

					VERIFY0(dmu_free_range(p->bpo_os,
					    p->bpo_phys->bpo_subobjs,
					    bpi->bpi_index * sizeof (uint64_t),
					    sizeof (uint64_t), tx));

					/* eliminate the empty subobj list */
					if (bpo->bpo_havesubobj &&
					    bpo->bpo_phys->bpo_subobjs != 0) {
						ASSERT0(bpo->bpo_phys->
						    bpo_num_subobjs);
						err = dmu_object_free(
						    bpo->bpo_os,
						    bpo->bpo_phys->bpo_subobjs,
						    tx);
						if (err)
							break;
						bpo->bpo_phys->bpo_subobjs = 0;
					}
					err = dmu_object_free(p->bpo_os,
					    bpo->bpo_object, tx);
					if (err)
						break;
				}

				mutex_exit(&bpo->bpo_lock);
				bpobj_close(bpo);
				kmem_free(bpo, sizeof (bpobj_t));
			} else {
				mutex_exit(&bpo->bpo_lock);
			}

			/*
			 * Finished processing this bpo. Unlock, and free
			 * our "stack" info.
			 */
			list_remove_head(&stack);
			kmem_free(bpi, sizeof (bpobj_info_t));
		} else {
			/*
			 * We have unprocessed subobjs. Process the next one.
			 */
			ASSERT(bpo->bpo_havecomp);
			ASSERT3P(bpobj_size, ==, NULL);

			/* Add the last subobj to stack. */
			int64_t i = bpi->bpi_unprocessed_subobjs - 1;
			uint64_t offset = i * sizeof (uint64_t);

			uint64_t obj_from_sublist;
			err = dmu_read(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
			    offset, sizeof (uint64_t), &obj_from_sublist,
			    DMU_READ_PREFETCH);
			if (err)
				break;
			bpobj_t *sublist = kmem_alloc(sizeof (bpobj_t),
			    KM_SLEEP);

			err = bpobj_open(sublist, bpo->bpo_os,
			    obj_from_sublist);
			if (err)
				break;

			list_insert_head(&stack, bpi_alloc(sublist, bpi, i));
			mutex_enter(&sublist->bpo_lock);
			bpi->bpi_unprocessed_subobjs--;
		}
	}
	/*
	 * Cleanup anything left on the "stack" after we left the loop.
	 * Every bpo on the stack is locked so we must remember to undo
	 * that now (in LIFO order).
	 */
	while ((bpi = list_remove_head(&stack)) != NULL) {
		bpobj_t *bpo = bpi->bpi_bpo;
		ASSERT(err != 0);
		ASSERT3P(bpo, !=, NULL);

		mutex_exit(&bpo->bpo_lock);

		/* do not free the initial_bpo */
		if (bpi->bpi_parent != NULL) {
			bpobj_close(bpi->bpi_bpo);
			kmem_free(bpi->bpi_bpo, sizeof (bpobj_t));
		}
		kmem_free(bpi, sizeof (bpobj_info_t));
	}

	list_destroy(&stack);

	return (err);
}
/*
 * Iterate and remove the entries.  If func returns nonzero, iteration
 * will stop and that entry will not be removed.
 */
int
bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
{
	return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE, NULL));
}
/*
 * Iterate the entries.  If func returns nonzero, iteration will stop.
 *
 * If there are no subobjs:
 *
 * *bpobj_size can be used to return the number of block pointers in the
 * bpobj.  Note that this may be different from the number of block pointers
 * that are iterated over, if iteration is terminated early (e.g. by the func
 * returning nonzero).
 *
 * If there are concurrent (or subsequent) modifications to the bpobj then the
 * returned *bpobj_size can be passed as "start" to
 * livelist_bpobj_iterate_from_nofree() to iterate the newly added entries.
 */
int
bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg,
    uint64_t *bpobj_size)
{
	return (bpobj_iterate_impl(bpo, func, arg, NULL, B_FALSE, bpobj_size));
}
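
/*
 * Illustrative sketch (not part of this file's interfaces): a caller could
 * count entries with a hypothetical bpobj_itor_t such as count_cb() below,
 * relying on the nonzero-return-stops-iteration contract described above.
 *
 *	static int
 *	count_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
 *	    dmu_tx_t *tx)
 *	{
 *		uint64_t *countp = arg;
 *		(*countp)++;
 *		return (0);
 *	}
 *
 *	uint64_t count = 0, size;
 *	(void) bpobj_iterate_nofree(bpo, count_cb, &count, &size);
 */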
/*
 * Iterate over the blkptrs in the bpobj beginning at index start. If func
 * returns nonzero, iteration will stop. This is a livelist specific function
 * since it assumes that there are no subobjs present.
 */
int
livelist_bpobj_iterate_from_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg,
    int64_t start)
{
	if (bpo->bpo_havesubobj)
		VERIFY0(bpo->bpo_phys->bpo_subobjs);
	bpobj_info_t *bpi = bpi_alloc(bpo, NULL, 0);
	int err = bpobj_iterate_blkptrs(bpi, func, arg, start, NULL, B_FALSE);
	kmem_free(bpi, sizeof (bpobj_info_t));
	return (err);
}
/*
 * Logically add subobj's contents to the parent bpobj.
 *
 * In the most general case, this is accomplished in constant time by adding
 * a reference to subobj.  This case is used when enqueuing a large subobj:
 *
 * +--------------+                        +--------------+
 * | bpobj        |----------------------->| subobj list  |
 * +----+----+----+----+----+              +-----+-----+--+--+
 * | bp | bp | bp | bp | bp |              | obj | obj | obj |
 * +----+----+----+----+----+              +-----+-----+-----+
 *
 * +--------------+                        +--------------+
 * | sub-bpobj    |----------------------> | subsubobj    |
 * +----+----+----+----+---------+----+    +-----+-----+--+--------+-----+
 * | bp | bp | bp | bp |   ...   | bp |    | obj | obj |    ...    | obj |
 * +----+----+----+----+---------+----+    +-----+-----+-----------+-----+
 *
 * Result: sub-bpobj added to parent's subobj list.
 * +--------------+                        +--------------+
 * | bpobj        |----------------------->| subobj list  |
 * +----+----+----+----+----+              +-----+-----+--+--+-----+
 * | bp | bp | bp | bp | bp |              | obj | obj | obj | OBJ |
 * +----+----+----+----+----+              +-----+-----+-----+--|--+
 *                                                              |
 *       /-----------------------------------------------------/
 *       v
 * +--------------+                        +--------------+
 * | sub-bpobj    |----------------------> | subsubobj    |
 * +----+----+----+----+---------+----+    +-----+-----+--+--------+-----+
 * | bp | bp | bp | bp |   ...   | bp |    | obj | obj |    ...    | obj |
 * +----+----+----+----+---------+----+    +-----+-----+-----------+-----+
 *
 *
 * In a common case, the subobj is small: its bp's and its list of subobj's
 * are each stored in a single block.  In this case we copy the subobj's
 * contents to the parent:
 *
 * +--------------+                        +--------------+
 * | bpobj        |----------------------->| subobj list  |
 * +----+----+----+----+----+              +-----+-----+--+--+
 * | bp | bp | bp | bp | bp |              | obj | obj | obj |
 * +----+----+----+----+----+              +-----+-----+-----+
 *                            ^                               ^
 * +--------------+           |            +--------------+   |
 * | sub-bpobj    |---------^------------> | subsubobj    |   ^
 * +----+----+----+         |              +-----+-----+--+   |
 * | BP | BP |-->-->-->-->-/               | OBJ | OBJ |-->-/
 * +----+----+                             +-----+-----+
 *
 * Result: subobj destroyed, contents copied to parent:
 *
 * +--------------+                        +--------------+
 * | bpobj        |----------------------->| subobj list  |
 * +----+----+----+----+----+----+----+    +-----+-----+--+--+-----+-----+
 * | bp | bp | bp | bp | bp | BP | BP |    | obj | obj | obj | OBJ | OBJ |
 * +----+----+----+----+----+----+----+    +-----+-----+-----+-----+-----+
 *
 *
 * If the subobj has many BP's but few subobj's, we can copy the sub-subobj's
 * but retain the sub-bpobj:
 *
 * +--------------+                        +--------------+
 * | bpobj        |----------------------->| subobj list  |
 * +----+----+----+----+----+              +-----+-----+--+--+
 * | bp | bp | bp | bp | bp |              | obj | obj | obj |
 * +----+----+----+----+----+              +-----+-----+-----+
 *                                                             ^
 * +--------------+                        +--------------+    |
 * | sub-bpobj    |----------------------> | subsubobj    |    ^
 * +----+----+----+----+---------+----+    +-----+-----+--+    |
 * | bp | bp | bp | bp |   ...   | bp |    | OBJ | OBJ |-->-/
 * +----+----+----+----+---------+----+    +-----+-----+
 *
 * Result: sub-sub-bpobjs and subobj added to parent's subobj list.
 * +--------------+                     +--------------+
 * | bpobj        |-------------------->| subobj list  |
 * +----+----+----+----+----+           +-----+-----+--+--+-----+-----+------+
 * | bp | bp | bp | bp | bp |           | obj | obj | obj | OBJ | OBJ | OBJ* |
 * +----+----+----+----+----+           +-----+-----+-----+-----+-----+--|---+
 *                                                                        |
 *       /--------------------------------------------------------------/
 *       v
 * +--------------+
 * | sub-bpobj    |
 * +----+----+----+----+---------+----+
 * | bp | bp | bp | bp |   ...   | bp |
 * +----+----+----+----+---------+----+
 */
void
bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
{
	bpobj_t subbpo;
	uint64_t used, comp, uncomp, subsubobjs;
	boolean_t copy_subsub = B_TRUE;
	boolean_t copy_bps = B_TRUE;

	ASSERT(bpobj_is_open(bpo));
	ASSERT(subobj != 0);
	ASSERT(bpo->bpo_havesubobj);
	ASSERT(bpo->bpo_havecomp);
	ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);

	if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj) {
		bpobj_decr_empty(bpo->bpo_os, tx);
		return;
	}

	VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
	VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));

	if (bpobj_is_empty(&subbpo)) {
		/* No point in having an empty subobj. */
		bpobj_close(&subbpo);
		bpobj_free(bpo->bpo_os, subobj, tx);
		return;
	}

	mutex_enter(&bpo->bpo_lock);
	dmu_buf_will_dirty(bpo->bpo_dbuf, tx);

	dmu_object_info_t doi;

	if (bpo->bpo_phys->bpo_subobjs != 0) {
		ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
		    &doi));
		ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ);
	}

	/*
	 * If subobj has only one block of subobjs, then move subobj's
	 * subobjs to bpo's subobj list directly.  This reduces recursion in
	 * bpobj_iterate due to nested subobjs.
	 */
	subsubobjs = subbpo.bpo_phys->bpo_subobjs;
	if (subsubobjs != 0) {
		VERIFY0(dmu_object_info(bpo->bpo_os, subsubobjs, &doi));
		if (doi.doi_max_offset > doi.doi_data_block_size) {
			copy_subsub = B_FALSE;
		}
	}

	/*
	 * If, in addition to having only one block of subobj's, subobj has
	 * only one block of bp's, then move subobj's bp's to bpo's bp list
	 * directly. This reduces recursion in bpobj_iterate due to nested
	 * subobjs.
	 */
	VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subobj, &doi));
	if (doi.doi_max_offset > doi.doi_data_block_size || !copy_subsub) {
		copy_bps = B_FALSE;
	}

	if (copy_subsub && subsubobjs != 0) {
		dmu_buf_t *subdb;
		uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs;

		VERIFY0(dmu_buf_hold(bpo->bpo_os, subsubobjs,
		    0, FTAG, &subdb, 0));
		/*
		 * Make sure that we are not asking dmu_write()
		 * to write more data than we have in our buffer.
		 */
		VERIFY3U(subdb->db_size, >=,
		    numsubsub * sizeof (subobj));
		if (bpo->bpo_phys->bpo_subobjs == 0) {
			bpo->bpo_phys->bpo_subobjs =
			    dmu_object_alloc(bpo->bpo_os,
			    DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
			    DMU_OT_NONE, 0, tx);
		}
		dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
		    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
		    numsubsub * sizeof (subobj), subdb->db_data, tx);
		dmu_buf_rele(subdb, FTAG);
		bpo->bpo_phys->bpo_num_subobjs += numsubsub;

		dmu_buf_will_dirty(subbpo.bpo_dbuf, tx);
		subbpo.bpo_phys->bpo_subobjs = 0;
		VERIFY0(dmu_object_free(bpo->bpo_os, subsubobjs, tx));
	}

	if (copy_bps) {
		dmu_buf_t *bps;
		uint64_t numbps = subbpo.bpo_phys->bpo_num_blkptrs;

		VERIFY0(dmu_buf_hold(bpo->bpo_os, subobj,
		    0, FTAG, &bps, 0));
		/*
		 * Make sure that we are not asking dmu_write()
		 * to write more data than we have in our buffer.
		 */
		VERIFY3U(bps->db_size, >=, numbps * sizeof (blkptr_t));
		dmu_write(bpo->bpo_os, bpo->bpo_object,
		    bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t),
		    numbps * sizeof (blkptr_t),
		    bps->db_data, tx);
		dmu_buf_rele(bps, FTAG);
		bpo->bpo_phys->bpo_num_blkptrs += numbps;

		bpobj_close(&subbpo);
		VERIFY0(dmu_object_free(bpo->bpo_os, subobj, tx));
	} else {
		bpobj_close(&subbpo);
		if (bpo->bpo_phys->bpo_subobjs == 0) {
			bpo->bpo_phys->bpo_subobjs =
			    dmu_object_alloc(bpo->bpo_os,
			    DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
			    DMU_OT_NONE, 0, tx);
		}

		dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
		    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
		    sizeof (subobj), &subobj, tx);
		bpo->bpo_phys->bpo_num_subobjs++;
	}

	bpo->bpo_phys->bpo_bytes += used;
	bpo->bpo_phys->bpo_comp += comp;
	bpo->bpo_phys->bpo_uncomp += uncomp;
	mutex_exit(&bpo->bpo_lock);
}
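
/*
 * Add a single block pointer to the bpobj.  The stored copy is stripped of
 * data that does not need to be preserved (the payload of embedded BPs, the
 * checksum of non-dedup BPs) so that the bpobj compresses better.
 */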
void
bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed,
    dmu_tx_t *tx)
{
	blkptr_t stored_bp = *bp;
	uint64_t offset;
	int blkoff;
	blkptr_t *bparray;

	ASSERT(bpobj_is_open(bpo));
	ASSERT(!BP_IS_HOLE(bp));
	ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);

	if (BP_IS_EMBEDDED(bp)) {
		/*
		 * The bpobj will compress better without the payload.
		 *
		 * Note that we store EMBEDDED bp's because they have an
		 * uncompressed size, which must be accounted for.  An
		 * alternative would be to add their size to bpo_uncomp
		 * without storing the bp, but that would create additional
		 * complications: bpo_uncomp would be inconsistent with the
		 * set of BP's stored, and bpobj_iterate() wouldn't visit
		 * all the space accounted for in the bpobj.
		 */
		bzero(&stored_bp, sizeof (stored_bp));
		stored_bp.blk_prop = bp->blk_prop;
		stored_bp.blk_birth = bp->blk_birth;
	} else if (!BP_GET_DEDUP(bp)) {
		/* The bpobj will compress better without the checksum */
		bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum));
	}

	stored_bp.blk_fill = 0;
	BP_SET_FREE(&stored_bp, bp_freed);

	mutex_enter(&bpo->bpo_lock);

	offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp);
	blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb);

	if (bpo->bpo_cached_dbuf == NULL ||
	    offset < bpo->bpo_cached_dbuf->db_offset ||
	    offset >= bpo->bpo_cached_dbuf->db_offset +
	    bpo->bpo_cached_dbuf->db_size) {
		if (bpo->bpo_cached_dbuf)
			dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
		VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
		    offset, bpo, &bpo->bpo_cached_dbuf, 0));
	}

	dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx);
	bparray = bpo->bpo_cached_dbuf->db_data;
	bparray[blkoff] = stored_bp;

	dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
	bpo->bpo_phys->bpo_num_blkptrs++;
	int sign = bp_freed ? -1 : +1;
	bpo->bpo_phys->bpo_bytes += sign *
	    bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
	if (bpo->bpo_havecomp) {
		bpo->bpo_phys->bpo_comp += sign * BP_GET_PSIZE(bp);
		bpo->bpo_phys->bpo_uncomp += sign * BP_GET_UCSIZE(bp);
	}
	if (bp_freed) {
		ASSERT(bpo->bpo_havefreed);
		bpo->bpo_phys->bpo_num_freed++;
	}
	mutex_exit(&bpo->bpo_lock);
}
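
/*
 * Callback state and bpobj_itor_t used by bpobj_space_range() to accumulate
 * the space referenced by BPs born in (mintxg, maxtxg].
 */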
struct space_range_arg {
	spa_t *spa;
	uint64_t mintxg;
	uint64_t maxtxg;
	uint64_t used;
	uint64_t comp;
	uint64_t uncomp;
};

static int
space_range_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
{
	struct space_range_arg *sra = arg;

	if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) {
		if (dsl_pool_sync_context(spa_get_dsl(sra->spa)))
			sra->used += bp_get_dsize_sync(sra->spa, bp);
		else
			sra->used += bp_get_dsize(sra->spa, bp);
		sra->comp += BP_GET_PSIZE(bp);
		sra->uncomp += BP_GET_UCSIZE(bp);
	}
	return (0);
}
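
/*
 * Return the total space referenced by the bpobj.  If the on-disk format
 * predates comp/uncomp accounting, fall back to iterating the entries.
 */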
int
bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
	ASSERT(bpobj_is_open(bpo));
	mutex_enter(&bpo->bpo_lock);

	*usedp = bpo->bpo_phys->bpo_bytes;
	if (bpo->bpo_havecomp) {
		*compp = bpo->bpo_phys->bpo_comp;
		*uncompp = bpo->bpo_phys->bpo_uncomp;
		mutex_exit(&bpo->bpo_lock);
		return (0);
	} else {
		mutex_exit(&bpo->bpo_lock);
		return (bpobj_space_range(bpo, 0, UINT64_MAX,
		    usedp, compp, uncompp));
	}
}
/*
 * Return the amount of space in the bpobj which is:
 * mintxg < blk_birth <= maxtxg
 */
int
bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
	struct space_range_arg sra = { 0 };
	int err;

	ASSERT(bpobj_is_open(bpo));

	/*
	 * As an optimization, if they want the whole txg range, just
	 * get bpo_bytes rather than iterating over the bps.
	 */
	if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX && bpo->bpo_havecomp)
		return (bpobj_space(bpo, usedp, compp, uncompp));

	sra.spa = dmu_objset_spa(bpo->bpo_os);
	sra.mintxg = mintxg;
	sra.maxtxg = maxtxg;

	err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL);
	*usedp = sra.used;
	*compp = sra.comp;
	*uncompp = sra.uncomp;
	return (err);
}
/*
 * A bpobj_itor_t to append blkptrs to a bplist. Note that while blkptrs in a
 * bpobj are designated as free or allocated that information is not preserved
 * in bplists.
 */
int
bplist_append_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
    dmu_tx_t *tx)
{
	bplist_t *bpl = arg;
	bplist_append(bpl, bp);
	return (0);
}