/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2017 Datto Inc.
 */
#include <sys/bpobj.h>
#include <sys/zfs_context.h>
#include <sys/refcount.h>
#include <sys/dsl_pool.h>
#include <sys/zfeature.h>
/*
 * Return an empty bpobj, preferably the empty dummy one (dp_empty_bpobj).
 */
uint64_t
bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx)
{
	spa_t *spa = dmu_objset_spa(os);
	dsl_pool_t *dp = dmu_objset_pool(os);

	if (spa_feature_is_enabled(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
		if (!spa_feature_is_active(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
			ASSERT0(dp->dp_empty_bpobj);
			dp->dp_empty_bpobj =
			    bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx);
			VERIFY(zap_add(os,
			    DMU_POOL_DIRECTORY_OBJECT,
			    DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
			    &dp->dp_empty_bpobj, tx) == 0);
		}
		spa_feature_incr(spa, SPA_FEATURE_EMPTY_BPOBJ, tx);
		ASSERT(dp->dp_empty_bpobj != 0);
		return (dp->dp_empty_bpobj);
	} else {
		return (bpobj_alloc(os, blocksize, tx));
	}
}
void
bpobj_decr_empty(objset_t *os, dmu_tx_t *tx)
{
	dsl_pool_t *dp = dmu_objset_pool(os);

	spa_feature_decr(dmu_objset_spa(os), SPA_FEATURE_EMPTY_BPOBJ, tx);
	if (!spa_feature_is_active(dmu_objset_spa(os),
	    SPA_FEATURE_EMPTY_BPOBJ)) {
		VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_EMPTY_BPOBJ, tx));
		VERIFY3U(0, ==, dmu_object_free(os, dp->dp_empty_bpobj, tx));
		dp->dp_empty_bpobj = 0;
	}
}
uint64_t
bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
{
	int size;

	if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT)
		size = BPOBJ_SIZE_V0;
	else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
		size = BPOBJ_SIZE_V1;
	else if (!spa_feature_is_active(dmu_objset_spa(os),
	    SPA_FEATURE_LIVELIST))
		size = BPOBJ_SIZE_V2;
	else
		size = sizeof (bpobj_phys_t);

	return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize,
	    DMU_OT_BPOBJ_HDR, size, tx));
}
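
/*
 * Note: the bonus length chosen above determines which optional fields of
 * bpobj_phys_t exist on disk.  bpobj_open() recovers this information by
 * comparing doi_bonus_size against BPOBJ_SIZE_V0, BPOBJ_SIZE_V1 and
 * BPOBJ_SIZE_V2 to set bpo_havecomp, bpo_havesubobj and bpo_havefreed.
 */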
void
bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
{
	int64_t i;
	bpobj_t bpo;
	dmu_object_info_t doi;
	int epb;
	dmu_buf_t *dbuf = NULL;

	ASSERT(obj != dmu_objset_pool(os)->dp_empty_bpobj);
	VERIFY3U(0, ==, bpobj_open(&bpo, os, obj));

	mutex_enter(&bpo.bpo_lock);

	if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0)
		goto out;

	VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi));
	epb = doi.doi_data_block_size / sizeof (uint64_t);

	for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
		uint64_t *objarray;
		uint64_t offset, blkoff;

		offset = i * sizeof (uint64_t);
		blkoff = P2PHASE(i, epb);

		if (dbuf == NULL || dbuf->db_offset > offset) {
			if (dbuf)
				dmu_buf_rele(dbuf, FTAG);
			VERIFY3U(0, ==, dmu_buf_hold(os,
			    bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0));
		}

		ASSERT3U(offset, >=, dbuf->db_offset);
		ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);

		objarray = dbuf->db_data;
		bpobj_free(os, objarray[blkoff], tx);
	}
	if (dbuf) {
		dmu_buf_rele(dbuf, FTAG);
		dbuf = NULL;
	}

	VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx));

out:
	mutex_exit(&bpo.bpo_lock);
	bpobj_close(&bpo);

	VERIFY3U(0, ==, dmu_object_free(os, obj, tx));
}
int
bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object)
{
	dmu_object_info_t doi;
	int err;

	err = dmu_object_info(os, object, &doi);
	if (err)
		return (err);

	bzero(bpo, sizeof (*bpo));
	mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL);

	ASSERT(bpo->bpo_dbuf == NULL);
	ASSERT(bpo->bpo_phys == NULL);
	ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ);
	ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR);

	err = dmu_bonus_hold(os, object, bpo, &bpo->bpo_dbuf);
	if (err)
		return (err);

	bpo->bpo_os = os;
	bpo->bpo_object = object;
	bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT;
	bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0);
	bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1);
	bpo->bpo_havefreed = (doi.doi_bonus_size > BPOBJ_SIZE_V2);
	bpo->bpo_phys = bpo->bpo_dbuf->db_data;
	return (0);
}
boolean_t
bpobj_is_open(const bpobj_t *bpo)
{
	return (bpo->bpo_object != 0);
}
void
bpobj_close(bpobj_t *bpo)
{
	/* Lame workaround for closing a bpobj that was never opened. */
	if (bpo->bpo_object == 0)
		return;

	dmu_buf_rele(bpo->bpo_dbuf, bpo);
	if (bpo->bpo_cached_dbuf != NULL)
		dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
	bpo->bpo_dbuf = NULL;
	bpo->bpo_phys = NULL;
	bpo->bpo_cached_dbuf = NULL;
	bpo->bpo_object = 0;

	mutex_destroy(&bpo->bpo_lock);
}
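
/*
 * Illustrative sketch (not called anywhere in ZFS): the usual open/query/
 * close lifecycle for a bpobj identified by object number.  The "os" and
 * "obj" names below are placeholders for the caller's objset and object.
 *
 *	bpobj_t bpo;
 *	uint64_t used, comp, uncomp;
 *
 *	VERIFY0(bpobj_open(&bpo, os, obj));
 *	VERIFY0(bpobj_space(&bpo, &used, &comp, &uncomp));
 *	... use the bpobj ...
 *	bpobj_close(&bpo);
 */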
boolean_t
bpobj_is_empty(bpobj_t *bpo)
{
	return (bpo->bpo_phys->bpo_num_blkptrs == 0 &&
	    (!bpo->bpo_havesubobj || bpo->bpo_phys->bpo_num_subobjs == 0));
}
/*
 * A recursive iteration of the bpobjs would be nice here but we run the risk
 * of overflowing function stack space.  Instead, find each subobj and add it
 * to the head of our list so it can be scanned for subobjs.  Like a
 * recursive implementation, the "deepest" subobjs will be freed first.
 * When a subobj is found to have no additional subobjs, free it.
 */
typedef struct bpobj_info {
	bpobj_t *bpi_bpo;
	/*
	 * This object is a subobj of bpi_parent,
	 * at bpi_index in its subobj array.
	 */
	struct bpobj_info *bpi_parent;
	uint64_t bpi_index;
	/* How many of our subobj's are left to process. */
	uint64_t bpi_unprocessed_subobjs;
	/* True after having visited this bpo's directly referenced BPs. */
	boolean_t bpi_visited;
	list_node_t bpi_node;
} bpobj_info_t;

static bpobj_info_t *
bpi_alloc(bpobj_t *bpo, bpobj_info_t *parent, uint64_t index)
{
	bpobj_info_t *bpi = kmem_zalloc(sizeof (bpobj_info_t), KM_SLEEP);
	bpi->bpi_bpo = bpo;
	bpi->bpi_parent = parent;
	bpi->bpi_index = index;
	if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
		bpi->bpi_unprocessed_subobjs = bpo->bpo_phys->bpo_num_subobjs;
	}
	return (bpi);
}
/*
 * Update bpobj and all of its parents with new space accounting.
 */
static void
propagate_space_reduction(bpobj_info_t *bpi, int64_t freed,
    int64_t comp_freed, int64_t uncomp_freed, dmu_tx_t *tx)
{
	for (; bpi != NULL; bpi = bpi->bpi_parent) {
		bpobj_t *p = bpi->bpi_bpo;
		ASSERT(dmu_buf_is_dirty(p->bpo_dbuf, tx));
		p->bpo_phys->bpo_bytes -= freed;
		ASSERT3S(p->bpo_phys->bpo_bytes, >=, 0);
		if (p->bpo_havecomp) {
			p->bpo_phys->bpo_comp -= comp_freed;
			p->bpo_phys->bpo_uncomp -= uncomp_freed;
		}
	}
}
static int
bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg,
    int64_t start, dmu_tx_t *tx, boolean_t free)
{
	int err = 0;
	int64_t freed = 0, comp_freed = 0, uncomp_freed = 0;
	dmu_buf_t *dbuf = NULL;
	bpobj_t *bpo = bpi->bpi_bpo;

	for (int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= start; i--) {
		uint64_t offset = i * sizeof (blkptr_t);
		uint64_t blkoff = P2PHASE(i, bpo->bpo_epb);

		if (dbuf == NULL || dbuf->db_offset > offset) {
			if (dbuf)
				dmu_buf_rele(dbuf, FTAG);
			err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
			    offset, FTAG, &dbuf, 0);
			if (err)
				break;
		}

		ASSERT3U(offset, >=, dbuf->db_offset);
		ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);

		blkptr_t *bparray = dbuf->db_data;
		blkptr_t *bp = &bparray[blkoff];

		boolean_t bp_freed = BP_GET_FREE(bp);
		err = func(arg, bp, bp_freed, tx);
		if (err)
			break;

		if (free) {
			int sign = bp_freed ? -1 : +1;
			spa_t *spa = dmu_objset_spa(bpo->bpo_os);
			freed += sign * bp_get_dsize_sync(spa, bp);
			comp_freed += sign * BP_GET_PSIZE(bp);
			uncomp_freed += sign * BP_GET_UCSIZE(bp);
			ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf, tx));
			bpo->bpo_phys->bpo_num_blkptrs--;
			ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0);
			if (bp_freed) {
				ASSERT(bpo->bpo_havefreed);
				bpo->bpo_phys->bpo_num_freed--;
				ASSERT3S(bpo->bpo_phys->bpo_num_freed, >=, 0);
			}
		}
	}
	if (free) {
		propagate_space_reduction(bpi, freed, comp_freed,
		    uncomp_freed, tx);
		VERIFY0(dmu_free_range(bpo->bpo_os,
		    bpo->bpo_object,
		    bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t),
		    DMU_OBJECT_END, tx));
	}
	if (dbuf) {
		dmu_buf_rele(dbuf, FTAG);
		dbuf = NULL;
	}
	return (err);
}
/*
 * Given an initial bpo, start by freeing the BPs that are directly referenced
 * by that bpo. If the bpo has subobjs, read in its last subobj and push the
 * subobj to our stack. By popping items off our stack, eventually we will
 * encounter a bpo that has no subobjs.  We can free its bpobj_info_t, and if
 * requested also free the now-empty bpo from disk and decrement
 * its parent's subobj count. We continue popping each subobj from our stack,
 * visiting its last subobj until they too have no more subobjs, and so on.
 */
static int
bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
    dmu_tx_t *tx, boolean_t free, uint64_t *bpobj_size)
{
	list_t stack;
	bpobj_info_t *bpi;
	int err = 0;

	/*
	 * Create a "stack" for us to work with without worrying about
	 * stack overflows. Initialize it with the initial_bpo.
	 */
	list_create(&stack, sizeof (bpobj_info_t),
	    offsetof(bpobj_info_t, bpi_node));
	mutex_enter(&initial_bpo->bpo_lock);

	if (bpobj_size != NULL)
		*bpobj_size = initial_bpo->bpo_phys->bpo_num_blkptrs;

	list_insert_head(&stack, bpi_alloc(initial_bpo, NULL, 0));

	while ((bpi = list_head(&stack)) != NULL) {
		bpobj_t *bpo = bpi->bpi_bpo;

		ASSERT3P(bpo, !=, NULL);
		ASSERT(MUTEX_HELD(&bpo->bpo_lock));
		ASSERT(bpobj_is_open(bpo));

		if (free)
			dmu_buf_will_dirty(bpo->bpo_dbuf, tx);

		if (bpi->bpi_visited == B_FALSE) {
			err = bpobj_iterate_blkptrs(bpi, func, arg, 0, tx,
			    free);
			bpi->bpi_visited = B_TRUE;
			if (err != 0)
				break;
		}
		/*
		 * We've finished with this bpo's directly-referenced BP's and
		 * it has no more unprocessed subobjs. We can free its
		 * bpobj_info_t (unless it is the topmost, initial_bpo).
		 * If we are freeing from disk, we can also do that.
		 */
		if (bpi->bpi_unprocessed_subobjs == 0) {
			/*
			 * If there are no entries, there should
			 * be no bytes.
			 */
			if (bpobj_is_empty(bpo)) {
				ASSERT0(bpo->bpo_phys->bpo_bytes);
				ASSERT0(bpo->bpo_phys->bpo_comp);
				ASSERT0(bpo->bpo_phys->bpo_uncomp);
			}

			/* The initial_bpo has no parent and is not closed. */
			if (bpi->bpi_parent != NULL) {
				if (free) {
					bpobj_t *p = bpi->bpi_parent->bpi_bpo;

					ASSERT0(bpo->bpo_phys->bpo_num_blkptrs);
					ASSERT3U(p->bpo_phys->bpo_num_subobjs,
					    >, 0);
					ASSERT3U(bpi->bpi_index, ==,
					    p->bpo_phys->bpo_num_subobjs - 1);
					ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf,
					    tx));

					p->bpo_phys->bpo_num_subobjs--;

					VERIFY0(dmu_free_range(p->bpo_os,
					    p->bpo_phys->bpo_subobjs,
					    bpi->bpi_index * sizeof (uint64_t),
					    sizeof (uint64_t), tx));

					/* eliminate the empty subobj list */
					if (bpo->bpo_havesubobj &&
					    bpo->bpo_phys->bpo_subobjs != 0) {
						ASSERT0(bpo->bpo_phys->
						    bpo_num_subobjs);
						err = dmu_object_free(
						    bpo->bpo_os,
						    bpo->bpo_phys->bpo_subobjs,
						    tx);
						if (err)
							break;
						bpo->bpo_phys->bpo_subobjs = 0;
					}

					err = dmu_object_free(p->bpo_os,
					    bpo->bpo_object, tx);
					if (err)
						break;
				}

				mutex_exit(&bpo->bpo_lock);
				bpobj_close(bpo);
				kmem_free(bpo, sizeof (bpobj_t));
			} else {
				mutex_exit(&bpo->bpo_lock);
			}

			/*
			 * Finished processing this bpo. Unlock, and free
			 * our "stack" info.
			 */
			list_remove_head(&stack);
			kmem_free(bpi, sizeof (bpobj_info_t));
		} else {
			/*
			 * We have unprocessed subobjs. Process the next one.
			 */
			ASSERT(bpo->bpo_havecomp);
			ASSERT3P(bpobj_size, ==, NULL);

			/* Add the last subobj to stack. */
			int64_t i = bpi->bpi_unprocessed_subobjs - 1;
			uint64_t offset = i * sizeof (uint64_t);

			uint64_t obj_from_sublist;
			err = dmu_read(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
			    offset, sizeof (uint64_t), &obj_from_sublist,
			    DMU_READ_PREFETCH);
			if (err)
				break;
			bpobj_t *sublist = kmem_alloc(sizeof (bpobj_t),
			    KM_SLEEP);

			err = bpobj_open(sublist, bpo->bpo_os,
			    obj_from_sublist);
			if (err)
				break;

			list_insert_head(&stack, bpi_alloc(sublist, bpi, i));
			mutex_enter(&sublist->bpo_lock);
			bpi->bpi_unprocessed_subobjs--;
		}
	}
	/*
	 * Cleanup anything left on the "stack" after we left the loop.
	 * Every bpo on the stack is locked so we must remember to undo
	 * that now (in LIFO order).
	 */
	while ((bpi = list_remove_head(&stack)) != NULL) {
		bpobj_t *bpo = bpi->bpi_bpo;

		ASSERT3P(bpo, !=, NULL);

		mutex_exit(&bpo->bpo_lock);

		/* do not free the initial_bpo */
		if (bpi->bpi_parent != NULL) {
			bpobj_close(bpi->bpi_bpo);
			kmem_free(bpi->bpi_bpo, sizeof (bpobj_t));
		}
		kmem_free(bpi, sizeof (bpobj_info_t));
	}

	list_destroy(&stack);

	return (err);
}
/*
 * Iterate and remove the entries.  If func returns nonzero, iteration
 * will stop and that entry will not be removed.
 */
int
bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
{
	return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE, NULL));
}
/*
 * Iterate the entries.  If func returns nonzero, iteration will stop.
 *
 * If there are no subobjs:
 *
 * *bpobj_size can be used to return the number of block pointers in the
 * bpobj.  Note that this may be different from the number of block pointers
 * that are iterated over, if iteration is terminated early (e.g. by the func
 * returning nonzero).
 *
 * If there are concurrent (or subsequent) modifications to the bpobj then the
 * returned *bpobj_size can be passed as "start" to
 * livelist_bpobj_iterate_from_nofree() to iterate the newly added entries.
 */
int
bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg,
    uint64_t *bpobj_size)
{
	return (bpobj_iterate_impl(bpo, func, arg, NULL, B_FALSE, bpobj_size));
}
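
/*
 * Illustrative sketch (not part of ZFS): a minimal bpobj_itor_t that counts
 * the block pointers visited by bpobj_iterate_nofree().  The callback and
 * variable names are placeholders.
 *
 *	static int
 *	count_bps_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
 *	    dmu_tx_t *tx)
 *	{
 *		uint64_t *countp = arg;
 *		(*countp)++;
 *		return (0);
 *	}
 *
 *	uint64_t count = 0, size;
 *	int err = bpobj_iterate_nofree(bpo, count_bps_cb, &count, &size);
 */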
/*
 * Iterate over the blkptrs in the bpobj beginning at index start. If func
 * returns nonzero, iteration will stop. This is a livelist specific function
 * since it assumes that there are no subobjs present.
 */
int
livelist_bpobj_iterate_from_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg,
    int64_t start)
{
	if (bpo->bpo_havesubobj)
		VERIFY0(bpo->bpo_phys->bpo_subobjs);
	bpobj_info_t *bpi = bpi_alloc(bpo, NULL, 0);
	int err = bpobj_iterate_blkptrs(bpi, func, arg, start, NULL, B_FALSE);
	kmem_free(bpi, sizeof (bpobj_info_t));
	return (err);
}
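
/*
 * Illustrative sketch (not part of ZFS): resuming iteration after new
 * entries were enqueued, as described above bpobj_iterate_nofree().  The
 * callback name "my_cb" is a placeholder.
 *
 *	uint64_t size;
 *	VERIFY0(bpobj_iterate_nofree(bpo, my_cb, arg, &size));
 *	... more entries are appended to the bpobj ...
 *	VERIFY0(livelist_bpobj_iterate_from_nofree(bpo, my_cb, arg,
 *	    (int64_t)size));
 */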
/*
 * Logically add subobj's contents to the parent bpobj.
 *
 * In the most general case, this is accomplished in constant time by adding
 * a reference to subobj.  This case is used when enqueuing a large subobj:
 * +--------------+                        +--------------+
 * | bpobj        |----------------------->| subobj list  |
 * +----+----+----+----+----+              +-----+-----+--+--+
 * | bp | bp | bp | bp | bp |              | obj | obj | obj |
 * +----+----+----+----+----+              +-----+-----+-----+
 *
 * +--------------+                        +--------------+
 * | sub-bpobj    |----------------------> | subsubobj    |
 * +----+----+----+----+---------+----+    +-----+-----+--+--------+-----+
 * | bp | bp | bp | bp |   ...   | bp |    | obj | obj |    ...    | obj |
 * +----+----+----+----+---------+----+    +-----+-----+-----------+-----+
 *
 * Result: sub-bpobj added to parent's subobj list.
 * +--------------+                        +--------------+
 * | bpobj        |----------------------->| subobj list  |
 * +----+----+----+----+----+              +-----+-----+--+--+-----+
 * | bp | bp | bp | bp | bp |              | obj | obj | obj | OBJ |
 * +----+----+----+----+----+              +-----+-----+-----+--|--+
 *                                                              |
 *       /-----------------------------------------------------/
 *       v
 * +--------------+                        +--------------+
 * | sub-bpobj    |----------------------> | subsubobj    |
 * +----+----+----+----+---------+----+    +-----+-----+--+--------+-----+
 * | bp | bp | bp | bp |   ...   | bp |    | obj | obj |    ...    | obj |
 * +----+----+----+----+---------+----+    +-----+-----+-----------+-----+
 *
 *
 * In a common case, the subobj is small: its bp's and its list of subobj's
 * are each stored in a single block.  In this case we copy the subobj's
 * contents to the parent:
 * +--------------+                        +--------------+
 * | bpobj        |----------------------->| subobj list  |
 * +----+----+----+----+----+              +-----+-----+--+--+
 * | bp | bp | bp | bp | bp |              | obj | obj | obj |
 * +----+----+----+----+----+              +-----+-----+-----+
 *                          ^                                ^
 * +--------------+         |              +--------------+  |
 * | sub-bpobj    |---------^------------> | subsubobj    |  ^
 * +----+----+----+         |              +-----+-----+--+  |
 * | BP | BP |-->-->-->-->-/               | OBJ | OBJ |-->-/
 * +----+----+                             +-----+-----+
 *
 * Result: subobj destroyed, contents copied to parent:
 * +--------------+                        +--------------+
 * | bpobj        |----------------------->| subobj list  |
 * +----+----+----+----+----+----+----+    +-----+-----+--+--+-----+-----+
 * | bp | bp | bp | bp | bp | BP | BP |    | obj | obj | obj | OBJ | OBJ |
 * +----+----+----+----+----+----+----+    +-----+-----+-----+-----+-----+
 *
 *
 * If the subobj has many BP's but few subobj's, we can copy the sub-subobj's
 * but retain the sub-bpobj:
 * +--------------+                        +--------------+
 * | bpobj        |----------------------->| subobj list  |
 * +----+----+----+----+----+              +-----+-----+--+--+
 * | bp | bp | bp | bp | bp |              | obj | obj | obj |
 * +----+----+----+----+----+              +-----+-----+-----+
 *                                                           ^
 * +--------------+                        +--------------+  |
 * | sub-bpobj    |----------------------> | subsubobj    |  ^
 * +----+----+----+----+---------+----+    +-----+-----+--+  |
 * | bp | bp | bp | bp |   ...   | bp |    | OBJ | OBJ |-->-/
 * +----+----+----+----+---------+----+    +-----+-----+
 *
 * Result: sub-sub-bpobjs and subobj added to parent's subobj list.
 * +--------------+                     +--------------+
 * | bpobj        |-------------------->| subobj list  |
 * +----+----+----+----+----+           +-----+-----+--+--+-----+-----+------+
 * | bp | bp | bp | bp | bp |           | obj | obj | obj | OBJ | OBJ | OBJ* |
 * +----+----+----+----+----+           +-----+-----+-----+-----+-----+--|---+
 *                                                                       |
 *       /--------------------------------------------------------------/
 *       v
 * +--------------+
 * | sub-bpobj    |
 * +----+----+----+----+---------+----+
 * | bp | bp | bp | bp |   ...   | bp |
 * +----+----+----+----+---------+----+
 */
void
bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
{
	bpobj_t subbpo;
	uint64_t used, comp, uncomp, subsubobjs;
	boolean_t copy_subsub = B_TRUE;
	boolean_t copy_bps = B_TRUE;

	ASSERT(bpobj_is_open(bpo));
	ASSERT(bpo->bpo_havesubobj);
	ASSERT(bpo->bpo_havecomp);
	ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);

	if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj) {
		bpobj_decr_empty(bpo->bpo_os, tx);
		return;
	}

	VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
	VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));

	if (bpobj_is_empty(&subbpo)) {
		/* No point in having an empty subobj. */
		bpobj_close(&subbpo);
		bpobj_free(bpo->bpo_os, subobj, tx);
		return;
	}

	mutex_enter(&bpo->bpo_lock);
	dmu_buf_will_dirty(bpo->bpo_dbuf, tx);

	dmu_object_info_t doi;

	if (bpo->bpo_phys->bpo_subobjs != 0) {
		ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
		    &doi));
		ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ);
	}

	/*
	 * If subobj has only one block of subobjs, then move subobj's
	 * subobjs to bpo's subobj list directly.  This reduces recursion in
	 * bpobj_iterate due to nested subobjs.
	 */
	subsubobjs = subbpo.bpo_phys->bpo_subobjs;
	if (subsubobjs != 0) {
		VERIFY0(dmu_object_info(bpo->bpo_os, subsubobjs, &doi));
		if (doi.doi_max_offset > doi.doi_data_block_size) {
			copy_subsub = B_FALSE;
		}
	}

	/*
	 * If, in addition to having only one block of subobj's, subobj has
	 * only one block of bp's, then move subobj's bp's to bpo's bp list
	 * directly.  This reduces recursion in bpobj_iterate due to nested
	 * subobjs.
	 */
	VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subobj, &doi));
	if (doi.doi_max_offset > doi.doi_data_block_size || !copy_subsub) {
		copy_bps = B_FALSE;
	}

	if (copy_subsub && subsubobjs != 0) {
		dmu_buf_t *subdb;
		uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs;

		VERIFY0(dmu_buf_hold(bpo->bpo_os, subsubobjs,
		    0, FTAG, &subdb, 0));
		/*
		 * Make sure that we are not asking dmu_write()
		 * to write more data than we have in our buffer.
		 */
		VERIFY3U(subdb->db_size, >=,
		    numsubsub * sizeof (subobj));
		if (bpo->bpo_phys->bpo_subobjs == 0) {
			bpo->bpo_phys->bpo_subobjs =
			    dmu_object_alloc(bpo->bpo_os,
			    DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
			    DMU_OT_NONE, 0, tx);
		}
		dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
		    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
		    numsubsub * sizeof (subobj), subdb->db_data, tx);
		dmu_buf_rele(subdb, FTAG);
		bpo->bpo_phys->bpo_num_subobjs += numsubsub;

		dmu_buf_will_dirty(subbpo.bpo_dbuf, tx);
		subbpo.bpo_phys->bpo_subobjs = 0;
		VERIFY0(dmu_object_free(bpo->bpo_os, subsubobjs, tx));
	}

	if (copy_bps) {
		dmu_buf_t *bps;
		uint64_t numbps = subbpo.bpo_phys->bpo_num_blkptrs;

		VERIFY0(dmu_buf_hold(bpo->bpo_os, subobj,
		    0, FTAG, &bps, 0));
		/*
		 * Make sure that we are not asking dmu_write()
		 * to write more data than we have in our buffer.
		 */
		VERIFY3U(bps->db_size, >=, numbps * sizeof (blkptr_t));
		dmu_write(bpo->bpo_os, bpo->bpo_object,
		    bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t),
		    numbps * sizeof (blkptr_t),
		    bps->db_data, tx);
		dmu_buf_rele(bps, FTAG);
		bpo->bpo_phys->bpo_num_blkptrs += numbps;

		bpobj_close(&subbpo);
		VERIFY0(dmu_object_free(bpo->bpo_os, subobj, tx));
	} else {
		bpobj_close(&subbpo);
		if (bpo->bpo_phys->bpo_subobjs == 0) {
			bpo->bpo_phys->bpo_subobjs =
			    dmu_object_alloc(bpo->bpo_os,
			    DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
			    DMU_OT_NONE, 0, tx);
		}

		dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
		    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
		    sizeof (subobj), &subobj, tx);
		bpo->bpo_phys->bpo_num_subobjs++;
	}

	bpo->bpo_phys->bpo_bytes += used;
	bpo->bpo_phys->bpo_comp += comp;
	bpo->bpo_phys->bpo_uncomp += uncomp;
	mutex_exit(&bpo->bpo_lock);
}
void
bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed,
    dmu_tx_t *tx)
{
	blkptr_t stored_bp = *bp;
	uint64_t offset;
	int blkoff;
	blkptr_t *bparray;

	ASSERT(bpobj_is_open(bpo));
	ASSERT(!BP_IS_HOLE(bp));
	ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);

	if (BP_IS_EMBEDDED(bp)) {
		/*
		 * The bpobj will compress better without the payload.
		 *
		 * Note that we store EMBEDDED bp's because they have an
		 * uncompressed size, which must be accounted for.  An
		 * alternative would be to add their size to bpo_uncomp
		 * without storing the bp, but that would create additional
		 * complications: bpo_uncomp would be inconsistent with the
		 * set of BP's stored, and bpobj_iterate() wouldn't visit
		 * all the space accounted for in the bpobj.
		 */
		bzero(&stored_bp, sizeof (stored_bp));
		stored_bp.blk_prop = bp->blk_prop;
		stored_bp.blk_birth = bp->blk_birth;
	} else if (!BP_GET_DEDUP(bp)) {
		/* The bpobj will compress better without the checksum */
		bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum));
	}

	stored_bp.blk_fill = 0;
	BP_SET_FREE(&stored_bp, bp_freed);

	mutex_enter(&bpo->bpo_lock);

	offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp);
	blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb);

	if (bpo->bpo_cached_dbuf == NULL ||
	    offset < bpo->bpo_cached_dbuf->db_offset ||
	    offset >= bpo->bpo_cached_dbuf->db_offset +
	    bpo->bpo_cached_dbuf->db_size) {
		if (bpo->bpo_cached_dbuf)
			dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
		VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
		    offset, bpo, &bpo->bpo_cached_dbuf, 0));
	}

	dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx);
	bparray = bpo->bpo_cached_dbuf->db_data;
	bparray[blkoff] = stored_bp;

	dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
	bpo->bpo_phys->bpo_num_blkptrs++;
	int sign = bp_freed ? -1 : +1;
	bpo->bpo_phys->bpo_bytes += sign *
	    bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
	if (bpo->bpo_havecomp) {
		bpo->bpo_phys->bpo_comp += sign * BP_GET_PSIZE(bp);
		bpo->bpo_phys->bpo_uncomp += sign * BP_GET_UCSIZE(bp);
	}
	if (bp_freed) {
		ASSERT(bpo->bpo_havefreed);
		bpo->bpo_phys->bpo_num_freed++;
	}
	mutex_exit(&bpo->bpo_lock);
}
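
/*
 * Illustrative sketch (not part of ZFS): recording a block pointer from
 * syncing context.  "bp" and "tx" are placeholders supplied by the caller;
 * the bpobj must already be open.  Passing bp_freed == B_TRUE requires a
 * bpobj that tracks bpo_num_freed (bpo_havefreed), as used by livelists.
 *
 *	bpobj_enqueue(bpo, bp, B_FALSE, tx);	(record an allocated BP)
 *	bpobj_enqueue(bpo, bp, B_TRUE, tx);	(record a freed BP)
 */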
struct space_range_arg {
	spa_t *spa;
	uint64_t mintxg;
	uint64_t maxtxg;
	uint64_t used;
	uint64_t comp;
	uint64_t uncomp;
};

static int
space_range_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
{
	struct space_range_arg *sra = arg;

	if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) {
		if (dsl_pool_sync_context(spa_get_dsl(sra->spa)))
			sra->used += bp_get_dsize_sync(sra->spa, bp);
		else
			sra->used += bp_get_dsize(sra->spa, bp);
		sra->comp += BP_GET_PSIZE(bp);
		sra->uncomp += BP_GET_UCSIZE(bp);
	}
	return (0);
}
int
bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
	ASSERT(bpobj_is_open(bpo));
	mutex_enter(&bpo->bpo_lock);

	*usedp = bpo->bpo_phys->bpo_bytes;
	if (bpo->bpo_havecomp) {
		*compp = bpo->bpo_phys->bpo_comp;
		*uncompp = bpo->bpo_phys->bpo_uncomp;
		mutex_exit(&bpo->bpo_lock);
		return (0);
	} else {
		mutex_exit(&bpo->bpo_lock);
		return (bpobj_space_range(bpo, 0, UINT64_MAX,
		    usedp, compp, uncompp));
	}
}
/*
 * Return the amount of space in the bpobj which is:
 * mintxg < blk_birth <= maxtxg
 */
int
bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
	struct space_range_arg sra = { 0 };
	int err;

	ASSERT(bpobj_is_open(bpo));

	/*
	 * As an optimization, if they want the whole txg range, just
	 * get bpo_bytes rather than iterating over the bps.
	 */
	if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX && bpo->bpo_havecomp)
		return (bpobj_space(bpo, usedp, compp, uncompp));

	sra.spa = dmu_objset_spa(bpo->bpo_os);
	sra.mintxg = mintxg;
	sra.maxtxg = maxtxg;

	err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL);
	*usedp = sra.used;
	*compp = sra.comp;
	*uncompp = sra.uncomp;
	return (err);
}
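
/*
 * Illustrative sketch (not part of ZFS): querying the space whose blk_birth
 * falls in a specific txg range.  The txg bounds are placeholders.
 *
 *	uint64_t used, comp, uncomp;
 *	VERIFY0(bpobj_space_range(bpo, mintxg, maxtxg,
 *	    &used, &comp, &uncomp));
 */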
/*
 * A bpobj_itor_t to append blkptrs to a bplist.  Note that while blkptrs in a
 * bpobj are designated as free or allocated that information is not preserved
 * in bplists.
 */
int
bplist_append_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
    dmu_tx_t *tx)
{
	bplist_t *bpl = arg;
	bplist_append(bpl, bp);
	return (0);
}