4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek
26 #include <sys/zfs_context.h>
28 #include <sys/spa_impl.h>
31 #include <sys/brt_impl.h>
33 #include <sys/bitmap.h>
35 #include <sys/dmu_tx.h>
37 #include <sys/dsl_pool.h>
38 #include <sys/dsl_scan.h>
39 #include <sys/vdev_impl.h>
40 #include <sys/kstat.h>
41 #include <sys/wmsum.h>
44 * Block Cloning design.
46 * Block Cloning allows one to manually clone a file (or a subset of its blocks)
47 * into another (or the same) file by just creating additional references to
48 * the data blocks without copying the data itself. Those references are kept
49 * in the Block Reference Tables (BRTs).
51 * In many ways this is similar to the existing deduplication, but there are
52 * some important differences:
54 * - Deduplication is automatic and Block Cloning is not - one has to use a
55 * dedicated system call(s) to clone the given file/blocks.
56 * - Deduplication keeps all data blocks in its table, even those referenced
57 * just once. Block Cloning creates an entry in its tables only when there
58 * are at least two references to the given data block. If the block was
59 * never explicitly cloned or the second to last reference was dropped,
60 * there will be neither space nor performance overhead.
61 * - Deduplication needs data to work - one needs to pass real data to the
62 * write(2) syscall, so hash can be calculated. Block Cloning doesn't require
63 * data, just block pointers to the data, so it is extremely fast, as we pay
64 * neither the cost of reading the data, nor the cost of writing the data -
65 * we operate exclusively on metadata.
66 * - If the D (dedup) bit is not set in the block pointer, it means that
67 * the block is not in the dedup table (DDT) and we won't consult the DDT
68 * when we need to free the block. Block Cloning must be consulted on every
69 * free, because we cannot modify the source BP (eg. by setting something
70 * similar to the D bit), thus we have no hint if the block is in the
71 * Block Reference Table (BRT), so we need to look into the BRT. There is
72 * an optimization in place that allows us to eliminate the majority of BRT
73 * lookups which is described below in the "Minimizing free penalty" section.
74 * - The BRT entry is much smaller than the DDT entry - for BRT we only store
75 * 64bit offset and 64bit reference counter.
76 * - Dedup keys are cryptographic hashes, so two blocks that are close to each
77 * other on disk are most likely in totally different parts of the DDT.
78 * The BRT entry keys are offsets into a single top-level VDEV, so data blocks
79 * from one file should have BRT entries close to each other.
80 * - Scrub will only do a single pass over a block that is referenced multiple
81 * times in the DDT. Unfortunately it is not currently (if at all) possible
82 * with Block Cloning and block referenced multiple times will be scrubbed
83 * multiple times. The new, sorted scrub should be able to eliminate
84 * duplicated reads given enough memory.
85 * - Deduplication requires cryptographically strong hash as a checksum or
86 * additional data verification. Block Cloning works with any checksum
87 * algorithm or even with checksumming disabled.
89 * As mentioned above, the BRT entries are much smaller than the DDT entries.
90 * To uniquely identify a block we just need its vdev id and offset. We also
91 * need to maintain a reference counter. The vdev id will often repeat, as there
92 * is a small number of top-level VDEVs and a large number of blocks stored in
93 * each VDEV. We take advantage of that to reduce the BRT entry size further by
94 * maintaining one BRT for each top-level VDEV, so we can then have only offset
95 * and counter as the BRT entry.
97 * Minimizing free penalty.
99 * Block Cloning allows creating additional references to any existing block.
100 * When we free a block there is no hint in the block pointer whether the block
101 * was cloned or not, so on each free we have to check if there is a
102 * corresponding entry in the BRT or not. If there is, we need to decrease
103 * the reference counter. Doing BRT lookup on every free can potentially be
104 * expensive by requiring additional I/Os if the BRT doesn't fit into memory.
105 * This is the main problem with deduplication, so we've learned our lesson and
106 * try not to repeat the same mistake here. How do we do that? We divide each
107 * top-level VDEV into 16MB regions. For each region we maintain a counter that
108 * is a sum of all the BRT entries that have offsets within the region. This
109 * creates the entries count array of 16bit numbers for each top-level VDEV.
110 * The entries count array is always kept in memory and updated on disk in the
111 * same transaction group as the BRT updates to keep everything in-sync. We can
112 * keep the array in memory, because it is very small. With 16MB regions and
113 * 1TB VDEV the array requires only 128kB of memory (we may decide to decrease
114 * the region size even further in the future). Now, when we want to free
115 * a block, we first consult the array. If the counter for the whole region is
116 * zero, there is no need to look for the BRT entry, as there isn't one for
117 * sure. If the counter for the region is greater than zero, only then we will
118 * do a BRT lookup and if an entry is found we will decrease the reference
119 * counter in the BRT entry and in the entry counters array.
121 * The entry counters array is small, but can potentially be larger for very
122 * large VDEVs or smaller regions. In this case we don't want to rewrite entire
123 * array on every change. We then divide the array into 32kB blocks and keep
124 * a bitmap of dirty blocks within a transaction group. When we sync the
125 * transaction group we can only update the parts of the entry counters array
126 * that were modified. Note: Keeping track of the dirty parts of the entry
127 * counters array is implemented, but updating only parts of the array on disk
128 * is not yet implemented - for now we will update entire array if there was
131 * The implementation tries to be economic: if BRT is not used, or no longer
132 * used, there will be no entries in the MOS and no additional memory used (eg.
133 * the entry counters array is only allocated if needed).
135 * Interaction between Deduplication and Block Cloning.
137 * If both functionalities are in use, we could end up with a block that is
138 * referenced multiple times in both DDT and BRT. When we free one of the
139 * references we couldn't tell where it belongs, so we would have to decide
140 * what table takes the precedence: do we first clear DDT references or BRT
141 * references? To avoid this dilemma BRT cooperates with DDT - if a given block
142 * is being cloned using BRT and the BP has the D (dedup) bit set, BRT will
143 * lookup DDT entry instead and increase the counter there. No BRT entry
144 * will be created for a block which has the D (dedup) bit set.
145 * BRT may be more efficient for manual deduplication, but if the block is
146 * already in the DDT, then creating additional BRT entry would be less
147 * efficient. This clever idea was proposed by Allan Jude.
149 * Block Cloning across datasets.
151 * Block Cloning is not limited to cloning blocks within the same dataset.
152 * It is possible (and very useful) to clone blocks between different datasets.
153 * One use case is recovering files from snapshots. By cloning the files into
154 * dataset we need no additional storage. Without Block Cloning we would need
155 * additional space for those files.
156 * Another interesting use case is moving the files between datasets
157 * (copying the file content to the new dataset and removing the source file).
158 * In that case Block Cloning will only be used briefly, because the BRT entries
159 * will be removed when the source is removed.
160 * Block Cloning across encrypted datasets is supported as long as both
161 * datasets share the same master key (e.g. snapshots and clones)
163 * Block Cloning flow through ZFS layers.
165 * Note: Block Cloning can be used both for cloning file system blocks and ZVOL
166 * blocks. As of this writing no interface is implemented that allows for block
167 * cloning within a ZVOL.
168 * FreeBSD and Linux provide the copy_file_range(2) system call and we will
169 * use it for block cloning.
172 * copy_file_range(int infd, off_t *inoffp, int outfd, off_t *outoffp,
173 * size_t len, unsigned int flags);
175 * Even though offsets and length represent bytes, they have to be
176 * block-aligned or we will return an error so the upper layer can
177 * fallback to the generic mechanism that will just copy the data.
178 * Using copy_file_range(2) will call OS-independent zfs_clone_range() function.
179 * This function was implemented based on zfs_write(), but instead of writing
180 * the given data we first read block pointers using the new dmu_read_l0_bps()
181 * function from the source file. Once we have BPs from the source file we call
182 * the dmu_brt_clone() function on the destination file. This function
183 * allocates BPs for us. We iterate over all source BPs. If the given BP is
184 * a hole or an embedded block, we just copy BP as-is. If it points to a real
185 * data we place this BP on a BRT pending list using the brt_pending_add()
188 * We use this pending list to keep track of all BPs that got new references
189 * within this transaction group.
191 * Some special cases to consider and how we address them:
192 * - The block we want to clone may have been created within the same
193 * transaction group that we are trying to clone. Such block has no BP
194 * allocated yet, so cannot be immediately cloned. We return EAGAIN.
195 * - The block we want to clone may have been modified within the same
196 * transaction group. We return EAGAIN.
197 * - A block may be cloned multiple times during one transaction group (that's
198 * why pending list is actually a tree and not an append-only list - this
199 * way we can figure out faster if this block is cloned for the first time
200 * in this txg or consecutive time).
201 * - A block may be cloned and freed within the same transaction group
202 * (see dbuf_undirty()).
203 * - A block may be cloned and within the same transaction group the clone
204 * can be cloned again (see dmu_read_l0_bps()).
205 * - A file might have been deleted, but the caller still has a file descriptor
206 * open to this file and clones it.
208 * When we free a block we have an additional step in the ZIO pipeline where we
209 * call the zio_brt_free() function. We then call the brt_entry_decref()
210 * that loads the corresponding BRT entry (if one exists) and decreases
211 * reference counter. If this is not the last reference we will stop ZIO
212 * pipeline here. If this is the last reference or the block is not in the
213 * BRT, we continue the pipeline and free the block as usual.
215 * At the beginning of spa_sync() where there can be no more block cloning,
216 * but before issuing frees we call brt_pending_apply(). This function applies
217 * all the new clones to the BRT table - we load BRT entries and update
218 * reference counters. To sync new BRT entries to disk, we use brt_sync()
219 * function. This function will sync all dirty per-top-level-vdev BRTs,
220 * the entry counters arrays, etc.
222 * Block Cloning and ZIL.
224 * Every clone operation is divided into chunks (similar to write) and each
225 * chunk is cloned in a separate transaction. The chunk size is determined by
226 * how many BPs we can fit into a single ZIL entry.
227 * Replaying clone operation is different from the regular clone operation,
228 * as when we log clone operations we cannot use the source object - it may
229 * reside on a different dataset, so we log BPs we want to clone.
230 * The ZIL is replayed when we mount the given dataset, not when the pool is
231 * imported. Taking this into account it is possible that the pool is imported
232 * without mounting datasets and the source dataset is destroyed before the
233 * destination dataset is mounted and its ZIL replayed.
234 * To address this situation we leverage zil_claim() mechanism where ZFS will
235 * parse all the ZILs on pool import. When we come across TX_CLONE_RANGE
236 * entries, we will bump reference counters for their BPs in the BRT. Then
237 * on mount and ZIL replay we bump the reference counters once more, while the
238 * first references are dropped during ZIL destroy by zil_free_clone_range().
239 * It is possible that after zil_claim() we never mount the destination, so
240 * we never replay its ZIL and just destroy it. In this case the only taken
241 * references will be dropped by zil_free_clone_range(), since the cloning is
242 * not going to ever take place.
245 static kmem_cache_t
*brt_entry_cache
;
248 * Enable/disable prefetching of BRT entries that we are going to modify.
250 static int brt_zap_prefetch
= 1;
253 #define BRT_DEBUG(...) do { \
254 if ((zfs_flags & ZFS_DEBUG_BRT) != 0) { \
255 __dprintf(B_TRUE, __FILE__, __func__, __LINE__, __VA_ARGS__); \
259 #define BRT_DEBUG(...) do { } while (0)
262 static int brt_zap_default_bs
= 12;
263 static int brt_zap_default_ibs
= 12;
265 static kstat_t
*brt_ksp
;
267 typedef struct brt_stats
{
268 kstat_named_t brt_addref_entry_not_on_disk
;
269 kstat_named_t brt_addref_entry_on_disk
;
270 kstat_named_t brt_decref_entry_in_memory
;
271 kstat_named_t brt_decref_entry_loaded_from_disk
;
272 kstat_named_t brt_decref_entry_not_in_memory
;
273 kstat_named_t brt_decref_entry_read_lost_race
;
274 kstat_named_t brt_decref_entry_still_referenced
;
275 kstat_named_t brt_decref_free_data_later
;
276 kstat_named_t brt_decref_free_data_now
;
277 kstat_named_t brt_decref_no_entry
;
280 static brt_stats_t brt_stats
= {
281 { "addref_entry_not_on_disk", KSTAT_DATA_UINT64
},
282 { "addref_entry_on_disk", KSTAT_DATA_UINT64
},
283 { "decref_entry_in_memory", KSTAT_DATA_UINT64
},
284 { "decref_entry_loaded_from_disk", KSTAT_DATA_UINT64
},
285 { "decref_entry_not_in_memory", KSTAT_DATA_UINT64
},
286 { "decref_entry_read_lost_race", KSTAT_DATA_UINT64
},
287 { "decref_entry_still_referenced", KSTAT_DATA_UINT64
},
288 { "decref_free_data_later", KSTAT_DATA_UINT64
},
289 { "decref_free_data_now", KSTAT_DATA_UINT64
},
290 { "decref_no_entry", KSTAT_DATA_UINT64
}
294 wmsum_t brt_addref_entry_not_on_disk
;
295 wmsum_t brt_addref_entry_on_disk
;
296 wmsum_t brt_decref_entry_in_memory
;
297 wmsum_t brt_decref_entry_loaded_from_disk
;
298 wmsum_t brt_decref_entry_not_in_memory
;
299 wmsum_t brt_decref_entry_read_lost_race
;
300 wmsum_t brt_decref_entry_still_referenced
;
301 wmsum_t brt_decref_free_data_later
;
302 wmsum_t brt_decref_free_data_now
;
303 wmsum_t brt_decref_no_entry
;
306 #define BRTSTAT_BUMP(stat) wmsum_add(&brt_sums.stat, 1)
308 static int brt_entry_compare(const void *x1
, const void *x2
);
309 static void brt_vdevs_expand(spa_t
*spa
, uint64_t nvdevs
);
312 brt_rlock(spa_t
*spa
)
314 rw_enter(&spa
->spa_brt_lock
, RW_READER
);
318 brt_wlock(spa_t
*spa
)
320 rw_enter(&spa
->spa_brt_lock
, RW_WRITER
);
324 brt_unlock(spa_t
*spa
)
326 rw_exit(&spa
->spa_brt_lock
);
330 brt_vdev_entcount_get(const brt_vdev_t
*brtvd
, uint64_t idx
)
333 ASSERT3U(idx
, <, brtvd
->bv_size
);
335 if (unlikely(brtvd
->bv_need_byteswap
)) {
336 return (BSWAP_16(brtvd
->bv_entcount
[idx
]));
338 return (brtvd
->bv_entcount
[idx
]);
343 brt_vdev_entcount_set(brt_vdev_t
*brtvd
, uint64_t idx
, uint16_t entcnt
)
346 ASSERT3U(idx
, <, brtvd
->bv_size
);
348 if (unlikely(brtvd
->bv_need_byteswap
)) {
349 brtvd
->bv_entcount
[idx
] = BSWAP_16(entcnt
);
351 brtvd
->bv_entcount
[idx
] = entcnt
;
356 brt_vdev_entcount_inc(brt_vdev_t
*brtvd
, uint64_t idx
)
360 ASSERT3U(idx
, <, brtvd
->bv_size
);
362 entcnt
= brt_vdev_entcount_get(brtvd
, idx
);
363 ASSERT(entcnt
< UINT16_MAX
);
365 brt_vdev_entcount_set(brtvd
, idx
, entcnt
+ 1);
369 brt_vdev_entcount_dec(brt_vdev_t
*brtvd
, uint64_t idx
)
373 ASSERT3U(idx
, <, brtvd
->bv_size
);
375 entcnt
= brt_vdev_entcount_get(brtvd
, idx
);
378 brt_vdev_entcount_set(brtvd
, idx
, entcnt
- 1);
383 brt_vdev_dump(brt_vdev_t
*brtvd
)
387 uint64_t nblocks
= BRT_RANGESIZE_TO_NBLOCKS(brtvd
->bv_size
);
388 zfs_dbgmsg(" BRT vdevid=%llu meta_dirty=%d entcount_dirty=%d "
389 "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu",
390 (u_longlong_t
)brtvd
->bv_vdevid
,
391 brtvd
->bv_meta_dirty
, brtvd
->bv_entcount_dirty
,
392 (u_longlong_t
)brtvd
->bv_size
,
393 (u_longlong_t
)brtvd
->bv_totalcount
,
394 (u_longlong_t
)nblocks
,
395 (size_t)BT_SIZEOFMAP(nblocks
));
396 if (brtvd
->bv_totalcount
> 0) {
397 zfs_dbgmsg(" entcounts:");
398 for (idx
= 0; idx
< brtvd
->bv_size
; idx
++) {
399 uint16_t entcnt
= brt_vdev_entcount_get(brtvd
, idx
);
401 zfs_dbgmsg(" [%04llu] %hu",
402 (u_longlong_t
)idx
, entcnt
);
406 if (brtvd
->bv_entcount_dirty
) {
409 bitmap
= kmem_alloc(nblocks
+ 1, KM_SLEEP
);
410 for (idx
= 0; idx
< nblocks
; idx
++) {
412 BT_TEST(brtvd
->bv_bitmap
, idx
) ? 'x' : '.';
415 zfs_dbgmsg(" dirty: %s", bitmap
);
416 kmem_free(bitmap
, nblocks
+ 1);
422 brt_vdev(spa_t
*spa
, uint64_t vdevid
, boolean_t alloc
)
424 brt_vdev_t
*brtvd
= NULL
;
427 if (vdevid
< spa
->spa_brt_nvdevs
) {
428 brtvd
= spa
->spa_brt_vdevs
[vdevid
];
430 /* New VDEV was added. */
433 if (vdevid
>= spa
->spa_brt_nvdevs
)
434 brt_vdevs_expand(spa
, vdevid
+ 1);
435 brtvd
= spa
->spa_brt_vdevs
[vdevid
];
442 brt_vdev_create(spa_t
*spa
, brt_vdev_t
*brtvd
, dmu_tx_t
*tx
)
446 ASSERT(brtvd
->bv_initiated
);
447 ASSERT0(brtvd
->bv_mos_brtvdev
);
448 ASSERT0(brtvd
->bv_mos_entries
);
450 uint64_t mos_entries
= zap_create_flags(spa
->spa_meta_objset
, 0,
451 ZAP_FLAG_HASH64
| ZAP_FLAG_UINT64_KEY
, DMU_OTN_ZAP_METADATA
,
452 brt_zap_default_bs
, brt_zap_default_ibs
, DMU_OT_NONE
, 0, tx
);
453 VERIFY(mos_entries
!= 0);
454 VERIFY0(dnode_hold(spa
->spa_meta_objset
, mos_entries
, brtvd
,
455 &brtvd
->bv_mos_entries_dnode
));
456 rw_enter(&brtvd
->bv_mos_entries_lock
, RW_WRITER
);
457 brtvd
->bv_mos_entries
= mos_entries
;
458 rw_exit(&brtvd
->bv_mos_entries_lock
);
459 BRT_DEBUG("MOS entries created, object=%llu",
460 (u_longlong_t
)brtvd
->bv_mos_entries
);
463 * We allocate DMU buffer to store the bv_entcount[] array.
464 * We will keep array size (bv_size) and cummulative count for all
465 * bv_entcount[]s (bv_totalcount) in the bonus buffer.
467 brtvd
->bv_mos_brtvdev
= dmu_object_alloc(spa
->spa_meta_objset
,
468 DMU_OTN_UINT64_METADATA
, BRT_BLOCKSIZE
,
469 DMU_OTN_UINT64_METADATA
, sizeof (brt_vdev_phys_t
), tx
);
470 VERIFY(brtvd
->bv_mos_brtvdev
!= 0);
471 BRT_DEBUG("MOS BRT VDEV created, object=%llu",
472 (u_longlong_t
)brtvd
->bv_mos_brtvdev
);
474 snprintf(name
, sizeof (name
), "%s%llu", BRT_OBJECT_VDEV_PREFIX
,
475 (u_longlong_t
)brtvd
->bv_vdevid
);
476 VERIFY0(zap_add(spa
->spa_meta_objset
, DMU_POOL_DIRECTORY_OBJECT
, name
,
477 sizeof (uint64_t), 1, &brtvd
->bv_mos_brtvdev
, tx
));
478 BRT_DEBUG("Pool directory object created, object=%s", name
);
480 spa_feature_incr(spa
, SPA_FEATURE_BLOCK_CLONING
, tx
);
484 brt_vdev_realloc(spa_t
*spa
, brt_vdev_t
*brtvd
)
489 uint64_t nblocks
, onblocks
, size
;
491 ASSERT(RW_WRITE_HELD(&brtvd
->bv_lock
));
493 spa_config_enter(spa
, SCL_VDEV
, FTAG
, RW_READER
);
494 vd
= vdev_lookup_top(spa
, brtvd
->bv_vdevid
);
495 size
= (vdev_get_min_asize(vd
) - 1) / spa
->spa_brt_rangesize
+ 1;
496 spa_config_exit(spa
, SCL_VDEV
, FTAG
);
498 entcount
= vmem_zalloc(sizeof (entcount
[0]) * size
, KM_SLEEP
);
499 nblocks
= BRT_RANGESIZE_TO_NBLOCKS(size
);
500 bitmap
= kmem_zalloc(BT_SIZEOFMAP(nblocks
), KM_SLEEP
);
502 if (!brtvd
->bv_initiated
) {
503 ASSERT0(brtvd
->bv_size
);
504 ASSERT0P(brtvd
->bv_entcount
);
505 ASSERT0P(brtvd
->bv_bitmap
);
507 ASSERT(brtvd
->bv_size
> 0);
508 ASSERT(brtvd
->bv_entcount
!= NULL
);
509 ASSERT(brtvd
->bv_bitmap
!= NULL
);
511 * TODO: Allow vdev shrinking. We only need to implement
512 * shrinking the on-disk BRT VDEV object.
513 * dmu_free_range(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,
516 ASSERT3U(brtvd
->bv_size
, <=, size
);
518 memcpy(entcount
, brtvd
->bv_entcount
,
519 sizeof (entcount
[0]) * MIN(size
, brtvd
->bv_size
));
520 vmem_free(brtvd
->bv_entcount
,
521 sizeof (entcount
[0]) * brtvd
->bv_size
);
522 onblocks
= BRT_RANGESIZE_TO_NBLOCKS(brtvd
->bv_size
);
523 memcpy(bitmap
, brtvd
->bv_bitmap
, MIN(BT_SIZEOFMAP(nblocks
),
524 BT_SIZEOFMAP(onblocks
)));
525 kmem_free(brtvd
->bv_bitmap
, BT_SIZEOFMAP(onblocks
));
528 brtvd
->bv_size
= size
;
529 brtvd
->bv_entcount
= entcount
;
530 brtvd
->bv_bitmap
= bitmap
;
531 if (!brtvd
->bv_initiated
) {
532 brtvd
->bv_need_byteswap
= FALSE
;
533 brtvd
->bv_initiated
= TRUE
;
534 BRT_DEBUG("BRT VDEV %llu initiated.",
535 (u_longlong_t
)brtvd
->bv_vdevid
);
540 brt_vdev_load(spa_t
*spa
, brt_vdev_t
*brtvd
)
543 brt_vdev_phys_t
*bvphys
;
546 ASSERT(!brtvd
->bv_initiated
);
547 ASSERT(brtvd
->bv_mos_brtvdev
!= 0);
549 error
= dmu_bonus_hold(spa
->spa_meta_objset
, brtvd
->bv_mos_brtvdev
,
554 bvphys
= db
->db_data
;
555 if (spa
->spa_brt_rangesize
== 0) {
556 spa
->spa_brt_rangesize
= bvphys
->bvp_rangesize
;
558 ASSERT3U(spa
->spa_brt_rangesize
, ==, bvphys
->bvp_rangesize
);
561 brt_vdev_realloc(spa
, brtvd
);
563 /* TODO: We don't support VDEV shrinking. */
564 ASSERT3U(bvphys
->bvp_size
, <=, brtvd
->bv_size
);
567 * If VDEV grew, we will leave new bv_entcount[] entries zeroed out.
569 error
= dmu_read(spa
->spa_meta_objset
, brtvd
->bv_mos_brtvdev
, 0,
570 MIN(brtvd
->bv_size
, bvphys
->bvp_size
) * sizeof (uint16_t),
571 brtvd
->bv_entcount
, DMU_READ_NO_PREFETCH
);
575 ASSERT(bvphys
->bvp_mos_entries
!= 0);
576 VERIFY0(dnode_hold(spa
->spa_meta_objset
, bvphys
->bvp_mos_entries
, brtvd
,
577 &brtvd
->bv_mos_entries_dnode
));
578 rw_enter(&brtvd
->bv_mos_entries_lock
, RW_WRITER
);
579 brtvd
->bv_mos_entries
= bvphys
->bvp_mos_entries
;
580 rw_exit(&brtvd
->bv_mos_entries_lock
);
581 brtvd
->bv_need_byteswap
=
582 (bvphys
->bvp_byteorder
!= BRT_NATIVE_BYTEORDER
);
583 brtvd
->bv_totalcount
= bvphys
->bvp_totalcount
;
584 brtvd
->bv_usedspace
= bvphys
->bvp_usedspace
;
585 brtvd
->bv_savedspace
= bvphys
->bvp_savedspace
;
587 dmu_buf_rele(db
, FTAG
);
589 BRT_DEBUG("BRT VDEV %llu loaded: mos_brtvdev=%llu, mos_entries=%llu",
590 (u_longlong_t
)brtvd
->bv_vdevid
,
591 (u_longlong_t
)brtvd
->bv_mos_brtvdev
,
592 (u_longlong_t
)brtvd
->bv_mos_entries
);
597 brt_vdev_dealloc(brt_vdev_t
*brtvd
)
599 ASSERT(RW_WRITE_HELD(&brtvd
->bv_lock
));
600 ASSERT(brtvd
->bv_initiated
);
601 ASSERT0(avl_numnodes(&brtvd
->bv_tree
));
603 vmem_free(brtvd
->bv_entcount
, sizeof (uint16_t) * brtvd
->bv_size
);
604 brtvd
->bv_entcount
= NULL
;
605 uint64_t nblocks
= BRT_RANGESIZE_TO_NBLOCKS(brtvd
->bv_size
);
606 kmem_free(brtvd
->bv_bitmap
, BT_SIZEOFMAP(nblocks
));
607 brtvd
->bv_bitmap
= NULL
;
611 brtvd
->bv_initiated
= FALSE
;
612 BRT_DEBUG("BRT VDEV %llu deallocated.", (u_longlong_t
)brtvd
->bv_vdevid
);
616 brt_vdev_destroy(spa_t
*spa
, brt_vdev_t
*brtvd
, dmu_tx_t
*tx
)
621 ASSERT(brtvd
->bv_initiated
);
622 ASSERT(brtvd
->bv_mos_brtvdev
!= 0);
623 ASSERT(brtvd
->bv_mos_entries
!= 0);
624 ASSERT0(brtvd
->bv_totalcount
);
625 ASSERT0(brtvd
->bv_usedspace
);
626 ASSERT0(brtvd
->bv_savedspace
);
628 uint64_t mos_entries
= brtvd
->bv_mos_entries
;
629 rw_enter(&brtvd
->bv_mos_entries_lock
, RW_WRITER
);
630 brtvd
->bv_mos_entries
= 0;
631 rw_exit(&brtvd
->bv_mos_entries_lock
);
632 dnode_rele(brtvd
->bv_mos_entries_dnode
, brtvd
);
633 brtvd
->bv_mos_entries_dnode
= NULL
;
634 ASSERT0(zap_count(spa
->spa_meta_objset
, mos_entries
, &count
));
636 VERIFY0(zap_destroy(spa
->spa_meta_objset
, mos_entries
, tx
));
637 BRT_DEBUG("MOS entries destroyed, object=%llu",
638 (u_longlong_t
)mos_entries
);
640 VERIFY0(dmu_object_free(spa
->spa_meta_objset
, brtvd
->bv_mos_brtvdev
,
642 BRT_DEBUG("MOS BRT VDEV destroyed, object=%llu",
643 (u_longlong_t
)brtvd
->bv_mos_brtvdev
);
644 brtvd
->bv_mos_brtvdev
= 0;
645 brtvd
->bv_entcount_dirty
= FALSE
;
647 snprintf(name
, sizeof (name
), "%s%llu", BRT_OBJECT_VDEV_PREFIX
,
648 (u_longlong_t
)brtvd
->bv_vdevid
);
649 VERIFY0(zap_remove(spa
->spa_meta_objset
, DMU_POOL_DIRECTORY_OBJECT
,
651 BRT_DEBUG("Pool directory object removed, object=%s", name
);
653 brtvd
->bv_meta_dirty
= FALSE
;
655 rw_enter(&brtvd
->bv_lock
, RW_WRITER
);
656 brt_vdev_dealloc(brtvd
);
657 rw_exit(&brtvd
->bv_lock
);
659 spa_feature_decr(spa
, SPA_FEATURE_BLOCK_CLONING
, tx
);
663 brt_vdevs_expand(spa_t
*spa
, uint64_t nvdevs
)
667 ASSERT(RW_WRITE_HELD(&spa
->spa_brt_lock
));
668 ASSERT3U(nvdevs
, >=, spa
->spa_brt_nvdevs
);
670 if (nvdevs
== spa
->spa_brt_nvdevs
)
673 vdevs
= kmem_zalloc(sizeof (*spa
->spa_brt_vdevs
) * nvdevs
, KM_SLEEP
);
674 if (spa
->spa_brt_nvdevs
> 0) {
675 ASSERT(spa
->spa_brt_vdevs
!= NULL
);
677 memcpy(vdevs
, spa
->spa_brt_vdevs
,
678 sizeof (*spa
->spa_brt_vdevs
) * spa
->spa_brt_nvdevs
);
679 kmem_free(spa
->spa_brt_vdevs
,
680 sizeof (*spa
->spa_brt_vdevs
) * spa
->spa_brt_nvdevs
);
682 spa
->spa_brt_vdevs
= vdevs
;
684 for (uint64_t vdevid
= spa
->spa_brt_nvdevs
; vdevid
< nvdevs
; vdevid
++) {
685 brt_vdev_t
*brtvd
= kmem_zalloc(sizeof (*brtvd
), KM_SLEEP
);
686 rw_init(&brtvd
->bv_lock
, NULL
, RW_DEFAULT
, NULL
);
687 brtvd
->bv_vdevid
= vdevid
;
688 brtvd
->bv_initiated
= FALSE
;
689 rw_init(&brtvd
->bv_mos_entries_lock
, NULL
, RW_DEFAULT
, NULL
);
690 avl_create(&brtvd
->bv_tree
, brt_entry_compare
,
691 sizeof (brt_entry_t
), offsetof(brt_entry_t
, bre_node
));
692 for (int i
= 0; i
< TXG_SIZE
; i
++) {
693 avl_create(&brtvd
->bv_pending_tree
[i
],
694 brt_entry_compare
, sizeof (brt_entry_t
),
695 offsetof(brt_entry_t
, bre_node
));
697 mutex_init(&brtvd
->bv_pending_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
698 spa
->spa_brt_vdevs
[vdevid
] = brtvd
;
701 BRT_DEBUG("BRT VDEVs expanded from %llu to %llu.",
702 (u_longlong_t
)spa
->spa_brt_nvdevs
, (u_longlong_t
)nvdevs
);
703 spa
->spa_brt_nvdevs
= nvdevs
;
707 brt_vdev_lookup(spa_t
*spa
, brt_vdev_t
*brtvd
, uint64_t offset
)
709 uint64_t idx
= offset
/ spa
->spa_brt_rangesize
;
710 if (idx
< brtvd
->bv_size
) {
711 /* VDEV wasn't expanded. */
712 return (brt_vdev_entcount_get(brtvd
, idx
) > 0);
718 brt_vdev_addref(spa_t
*spa
, brt_vdev_t
*brtvd
, const brt_entry_t
*bre
,
719 uint64_t dsize
, uint64_t count
)
723 ASSERT(brtvd
->bv_initiated
);
725 brtvd
->bv_savedspace
+= dsize
* count
;
726 brtvd
->bv_meta_dirty
= TRUE
;
728 if (bre
->bre_count
> 0)
731 brtvd
->bv_usedspace
+= dsize
;
733 idx
= BRE_OFFSET(bre
) / spa
->spa_brt_rangesize
;
734 if (idx
>= brtvd
->bv_size
) {
735 /* VDEV has been expanded. */
736 rw_enter(&brtvd
->bv_lock
, RW_WRITER
);
737 brt_vdev_realloc(spa
, brtvd
);
738 rw_exit(&brtvd
->bv_lock
);
741 ASSERT3U(idx
, <, brtvd
->bv_size
);
743 brtvd
->bv_totalcount
++;
744 brt_vdev_entcount_inc(brtvd
, idx
);
745 brtvd
->bv_entcount_dirty
= TRUE
;
746 idx
= idx
/ BRT_BLOCKSIZE
/ 8;
747 BT_SET(brtvd
->bv_bitmap
, idx
);
751 brt_vdev_decref(spa_t
*spa
, brt_vdev_t
*brtvd
, const brt_entry_t
*bre
,
756 ASSERT(RW_WRITE_HELD(&brtvd
->bv_lock
));
757 ASSERT(brtvd
->bv_initiated
);
759 brtvd
->bv_savedspace
-= dsize
;
760 brtvd
->bv_meta_dirty
= TRUE
;
762 if (bre
->bre_count
> 0)
765 brtvd
->bv_usedspace
-= dsize
;
767 idx
= BRE_OFFSET(bre
) / spa
->spa_brt_rangesize
;
768 ASSERT3U(idx
, <, brtvd
->bv_size
);
770 ASSERT(brtvd
->bv_totalcount
> 0);
771 brtvd
->bv_totalcount
--;
772 brt_vdev_entcount_dec(brtvd
, idx
);
773 brtvd
->bv_entcount_dirty
= TRUE
;
774 idx
= idx
/ BRT_BLOCKSIZE
/ 8;
775 BT_SET(brtvd
->bv_bitmap
, idx
);
779 brt_vdev_sync(spa_t
*spa
, brt_vdev_t
*brtvd
, dmu_tx_t
*tx
)
782 brt_vdev_phys_t
*bvphys
;
784 ASSERT(brtvd
->bv_meta_dirty
);
785 ASSERT(brtvd
->bv_mos_brtvdev
!= 0);
786 ASSERT(dmu_tx_is_syncing(tx
));
788 VERIFY0(dmu_bonus_hold(spa
->spa_meta_objset
, brtvd
->bv_mos_brtvdev
,
791 if (brtvd
->bv_entcount_dirty
) {
793 * TODO: Walk brtvd->bv_bitmap and write only the dirty blocks.
795 dmu_write(spa
->spa_meta_objset
, brtvd
->bv_mos_brtvdev
, 0,
796 brtvd
->bv_size
* sizeof (brtvd
->bv_entcount
[0]),
797 brtvd
->bv_entcount
, tx
);
798 uint64_t nblocks
= BRT_RANGESIZE_TO_NBLOCKS(brtvd
->bv_size
);
799 memset(brtvd
->bv_bitmap
, 0, BT_SIZEOFMAP(nblocks
));
800 brtvd
->bv_entcount_dirty
= FALSE
;
803 dmu_buf_will_dirty(db
, tx
);
804 bvphys
= db
->db_data
;
805 bvphys
->bvp_mos_entries
= brtvd
->bv_mos_entries
;
806 bvphys
->bvp_size
= brtvd
->bv_size
;
807 if (brtvd
->bv_need_byteswap
) {
808 bvphys
->bvp_byteorder
= BRT_NON_NATIVE_BYTEORDER
;
810 bvphys
->bvp_byteorder
= BRT_NATIVE_BYTEORDER
;
812 bvphys
->bvp_totalcount
= brtvd
->bv_totalcount
;
813 bvphys
->bvp_rangesize
= spa
->spa_brt_rangesize
;
814 bvphys
->bvp_usedspace
= brtvd
->bv_usedspace
;
815 bvphys
->bvp_savedspace
= brtvd
->bv_savedspace
;
816 dmu_buf_rele(db
, FTAG
);
818 brtvd
->bv_meta_dirty
= FALSE
;
822 brt_vdevs_free(spa_t
*spa
)
824 if (spa
->spa_brt_vdevs
== 0)
826 for (uint64_t vdevid
= 0; vdevid
< spa
->spa_brt_nvdevs
; vdevid
++) {
827 brt_vdev_t
*brtvd
= spa
->spa_brt_vdevs
[vdevid
];
828 rw_enter(&brtvd
->bv_lock
, RW_WRITER
);
829 if (brtvd
->bv_initiated
)
830 brt_vdev_dealloc(brtvd
);
831 rw_exit(&brtvd
->bv_lock
);
832 rw_destroy(&brtvd
->bv_lock
);
833 if (brtvd
->bv_mos_entries
!= 0)
834 dnode_rele(brtvd
->bv_mos_entries_dnode
, brtvd
);
835 rw_destroy(&brtvd
->bv_mos_entries_lock
);
836 avl_destroy(&brtvd
->bv_tree
);
837 for (int i
= 0; i
< TXG_SIZE
; i
++)
838 avl_destroy(&brtvd
->bv_pending_tree
[i
]);
839 mutex_destroy(&brtvd
->bv_pending_lock
);
840 kmem_free(brtvd
, sizeof (*brtvd
));
842 kmem_free(spa
->spa_brt_vdevs
, sizeof (*spa
->spa_brt_vdevs
) *
843 spa
->spa_brt_nvdevs
);
847 brt_entry_fill(const blkptr_t
*bp
, brt_entry_t
*bre
, uint64_t *vdevidp
)
854 *vdevidp
= DVA_GET_VDEV(&bp
->blk_dva
[0]);
858 brt_entry_lookup(brt_vdev_t
*brtvd
, brt_entry_t
*bre
)
860 uint64_t off
= BRE_OFFSET(bre
);
862 return (zap_lookup_uint64_by_dnode(brtvd
->bv_mos_entries_dnode
,
863 &off
, BRT_KEY_WORDS
, 1, sizeof (bre
->bre_count
), &bre
->bre_count
));
867 * Return TRUE if we _can_ have BRT entry for this bp. It might be false
868 * positive, but gives us quick answer if we should look into BRT, which
869 * may require reads and thus will be more expensive.
872 brt_maybe_exists(spa_t
*spa
, const blkptr_t
*bp
)
875 if (spa
->spa_brt_nvdevs
== 0)
878 uint64_t vdevid
= DVA_GET_VDEV(&bp
->blk_dva
[0]);
879 brt_vdev_t
*brtvd
= brt_vdev(spa
, vdevid
, B_FALSE
);
880 if (brtvd
== NULL
|| !brtvd
->bv_initiated
)
884 * We don't need locks here, since bv_entcount pointer must be
885 * stable at this point, and we don't care about false positive
886 * races here, while false negative should be impossible, since
887 * all brt_vdev_addref() have already completed by this point.
889 uint64_t off
= DVA_GET_OFFSET(&bp
->blk_dva
[0]);
890 return (brt_vdev_lookup(spa
, brtvd
, off
));
894 brt_get_dspace(spa_t
*spa
)
896 if (spa
->spa_brt_nvdevs
== 0)
901 for (uint64_t vdevid
= 0; vdevid
< spa
->spa_brt_nvdevs
; vdevid
++)
902 s
+= spa
->spa_brt_vdevs
[vdevid
]->bv_savedspace
;
/*
 * Sum bv_usedspace across all per-vdev BRT structures, i.e. the total
 * space physically occupied by cloned blocks on this pool.
 * NOTE(review): the early return for zero vdevs, the accumulator
 * declaration and the final return are elided in this chunk.
 */
908 brt_get_used(spa_t
*spa
)
910 if (spa
->spa_brt_nvdevs
== 0)
915 for (uint64_t vdevid
= 0; vdevid
< spa
->spa_brt_nvdevs
; vdevid
++)
916 s
+= spa
->spa_brt_vdevs
[vdevid
]->bv_usedspace
;
922 brt_get_saved(spa_t
*spa
)
924 return (brt_get_dspace(spa
));
928 brt_get_ratio(spa_t
*spa
)
930 uint64_t used
= brt_get_used(spa
);
933 return ((used
+ brt_get_saved(spa
)) * 100 / used
);
/*
 * kstat ks_update callback: snapshot the per-CPU wmsum counters into the
 * named-kstat structure so userland sees consistent 64-bit values.
 */
937 brt_kstats_update(kstat_t
*ksp
, int rw
)
939 brt_stats_t
*bs
= ksp
->ks_data
;
/*
 * Writes to the kstat are rejected; the rejection return is elided in
 * this chunk.
 */
941 if (rw
== KSTAT_WRITE
)
/* Copy every wmsum counter into the exported kstat record. */
944 bs
->brt_addref_entry_not_on_disk
.value
.ui64
=
945 wmsum_value(&brt_sums
.brt_addref_entry_not_on_disk
);
946 bs
->brt_addref_entry_on_disk
.value
.ui64
=
947 wmsum_value(&brt_sums
.brt_addref_entry_on_disk
);
948 bs
->brt_decref_entry_in_memory
.value
.ui64
=
949 wmsum_value(&brt_sums
.brt_decref_entry_in_memory
);
950 bs
->brt_decref_entry_loaded_from_disk
.value
.ui64
=
951 wmsum_value(&brt_sums
.brt_decref_entry_loaded_from_disk
);
952 bs
->brt_decref_entry_not_in_memory
.value
.ui64
=
953 wmsum_value(&brt_sums
.brt_decref_entry_not_in_memory
);
954 bs
->brt_decref_entry_read_lost_race
.value
.ui64
=
955 wmsum_value(&brt_sums
.brt_decref_entry_read_lost_race
);
956 bs
->brt_decref_entry_still_referenced
.value
.ui64
=
957 wmsum_value(&brt_sums
.brt_decref_entry_still_referenced
);
958 bs
->brt_decref_free_data_later
.value
.ui64
=
959 wmsum_value(&brt_sums
.brt_decref_free_data_later
);
960 bs
->brt_decref_free_data_now
.value
.ui64
=
961 wmsum_value(&brt_sums
.brt_decref_free_data_now
);
962 bs
->brt_decref_no_entry
.value
.ui64
=
963 wmsum_value(&brt_sums
.brt_decref_no_entry
);
/*
 * Statistics setup (function signature elided in this chunk):
 * initialize every wmsum counter to zero, then create and install the
 * "zfs/brtstats" named kstat wired to brt_kstats_update().
 */
972 wmsum_init(&brt_sums
.brt_addref_entry_not_on_disk
, 0);
973 wmsum_init(&brt_sums
.brt_addref_entry_on_disk
, 0);
974 wmsum_init(&brt_sums
.brt_decref_entry_in_memory
, 0);
975 wmsum_init(&brt_sums
.brt_decref_entry_loaded_from_disk
, 0);
976 wmsum_init(&brt_sums
.brt_decref_entry_not_in_memory
, 0);
977 wmsum_init(&brt_sums
.brt_decref_entry_read_lost_race
, 0);
978 wmsum_init(&brt_sums
.brt_decref_entry_still_referenced
, 0);
979 wmsum_init(&brt_sums
.brt_decref_free_data_later
, 0);
980 wmsum_init(&brt_sums
.brt_decref_free_data_now
, 0);
981 wmsum_init(&brt_sums
.brt_decref_no_entry
, 0);
/* Virtual kstat: data is supplied on demand by brt_kstats_update(). */
983 brt_ksp
= kstat_create("zfs", 0, "brtstats", "misc", KSTAT_TYPE_NAMED
,
984 sizeof (brt_stats
) / sizeof (kstat_named_t
), KSTAT_FLAG_VIRTUAL
);
/* kstat_create() may fail; stats remain internal-only in that case. */
985 if (brt_ksp
!= NULL
) {
986 brt_ksp
->ks_data
= &brt_stats
;
987 brt_ksp
->ks_update
= brt_kstats_update
;
988 kstat_install(brt_ksp
);
/*
 * Statistics teardown (function signature elided in this chunk):
 * remove the kstat if it was installed, then destroy every wmsum
 * counter, mirroring the init order above.
 */
995 if (brt_ksp
!= NULL
) {
996 kstat_delete(brt_ksp
);
1000 wmsum_fini(&brt_sums
.brt_addref_entry_not_on_disk
);
1001 wmsum_fini(&brt_sums
.brt_addref_entry_on_disk
);
1002 wmsum_fini(&brt_sums
.brt_decref_entry_in_memory
);
1003 wmsum_fini(&brt_sums
.brt_decref_entry_loaded_from_disk
);
1004 wmsum_fini(&brt_sums
.brt_decref_entry_not_in_memory
);
1005 wmsum_fini(&brt_sums
.brt_decref_entry_read_lost_race
);
1006 wmsum_fini(&brt_sums
.brt_decref_entry_still_referenced
);
1007 wmsum_fini(&brt_sums
.brt_decref_free_data_later
);
1008 wmsum_fini(&brt_sums
.brt_decref_free_data_now
);
1009 wmsum_fini(&brt_sums
.brt_decref_no_entry
);
/*
 * Module init (function signature elided in this chunk): create the
 * kmem cache used for all in-core brt_entry_t allocations.
 */
1015 brt_entry_cache
= kmem_cache_create("brt_entry_cache",
1016 sizeof (brt_entry_t
), 0, NULL
, NULL
, NULL
, NULL
, NULL
, 0);
/*
 * Module teardown (function signature elided in this chunk): destroy
 * the brt_entry_t kmem cache created at init.
 */
1026 kmem_cache_destroy(brt_entry_cache
);
1029 /* Return TRUE if block should be freed immediately. */
1031 brt_entry_decref(spa_t
*spa
, const blkptr_t
*bp
)
1033 brt_entry_t
*bre
, *racebre
;
1034 brt_entry_t bre_search
;
/* Build the search key and locate the per-vdev BRT for bp's first DVA. */
1039 brt_entry_fill(bp
, &bre_search
, &vdevid
);
1041 brt_vdev_t
*brtvd
= brt_vdev(spa
, vdevid
, B_FALSE
);
1042 ASSERT(brtvd
!= NULL
);
1044 rw_enter(&brtvd
->bv_lock
, RW_WRITER
);
1045 ASSERT(brtvd
->bv_initiated
);
/* Fast path: the entry may already be cached in the in-core AVL tree. */
1046 bre
= avl_find(&brtvd
->bv_tree
, &bre_search
, NULL
);
1048 BRTSTAT_BUMP(brt_decref_entry_in_memory
);
1051 BRTSTAT_BUMP(brt_decref_entry_not_in_memory
);
/*
 * Not in memory: drop the lock to read the ZAP (may block on I/O),
 * then re-take it and re-check for a racing insert.
 */
1053 rw_exit(&brtvd
->bv_lock
);
1055 error
= brt_entry_lookup(brtvd
, &bre_search
);
1056 /* bre_search now contains correct bre_count */
1057 if (error
== ENOENT
) {
/* No BRT entry on disk either: block is singly referenced. */
1058 BRTSTAT_BUMP(brt_decref_no_entry
);
1063 rw_enter(&brtvd
->bv_lock
, RW_WRITER
);
1064 racebre
= avl_find(&brtvd
->bv_tree
, &bre_search
, &where
);
1065 if (racebre
!= NULL
) {
1066 /* The entry was added when the lock was dropped. */
1067 BRTSTAT_BUMP(brt_decref_entry_read_lost_race
);
/* Cache the on-disk entry in the AVL tree at the remembered spot. */
1072 BRTSTAT_BUMP(brt_decref_entry_loaded_from_disk
);
1073 bre
= kmem_cache_alloc(brt_entry_cache
, KM_SLEEP
);
1074 bre
->bre_bp
= bre_search
.bre_bp
;
1075 bre
->bre_count
= bre_search
.bre_count
;
1076 bre
->bre_pcount
= 0;
1077 avl_insert(&brtvd
->bv_tree
, bre
, where
);
/* Already at zero: the data can be freed right away. */
1080 if (bre
->bre_count
== 0) {
1081 rw_exit(&brtvd
->bv_lock
);
1082 BRTSTAT_BUMP(brt_decref_free_data_now
);
1087 ASSERT(bre
->bre_count
> 0);
/*
 * NOTE(review): the decrement of bre_count is elided in this chunk;
 * the branches below classify its result.
 */
1089 if (bre
->bre_count
== 0)
1090 BRTSTAT_BUMP(brt_decref_free_data_later
);
1092 BRTSTAT_BUMP(brt_decref_entry_still_referenced
);
/* Account the released space against the per-vdev counters. */
1093 brt_vdev_decref(spa
, brtvd
, bre
, bp_get_dsize_sync(spa
, bp
));
1095 rw_exit(&brtvd
->bv_lock
);
/*
 * Return the current BRT reference count for block *bp, preferring the
 * in-core AVL entry and falling back to the on-disk ZAP.
 */
1101 brt_entry_get_refcount(spa_t
*spa
, const blkptr_t
*bp
)
1103 brt_entry_t bre_search
, *bre
;
1104 uint64_t vdevid
, refcnt
;
1107 brt_entry_fill(bp
, &bre_search
, &vdevid
);
1109 brt_vdev_t
*brtvd
= brt_vdev(spa
, vdevid
, B_FALSE
);
1110 ASSERT(brtvd
!= NULL
);
/* Reader lock suffices: we only inspect the tree. */
1112 rw_enter(&brtvd
->bv_lock
, RW_READER
);
1113 ASSERT(brtvd
->bv_initiated
);
1114 bre
= avl_find(&brtvd
->bv_tree
, &bre_search
, NULL
);
/*
 * Not cached in memory: drop the lock and consult the ZAP.
 * NOTE(review): the branch structure around these lines (the
 * "bre == NULL" test and the ENOENT handling body) is elided in
 * this chunk.
 */
1116 rw_exit(&brtvd
->bv_lock
);
1117 error
= brt_entry_lookup(brtvd
, &bre_search
);
1118 if (error
== ENOENT
) {
1122 refcnt
= bre_search
.bre_count
;
/* Cached entry: read the count and release the lock. */
1125 refcnt
= bre
->bre_count
;
1126 rw_exit(&brtvd
->bv_lock
);
/*
 * Best-effort prefetch of the BRT ZAP blocks covering *bp, so the later
 * syncing-context lookup is warm.  A no-op when prefetch is disabled or
 * the vdev has no MOS entries object.
 */
1133 brt_prefetch(brt_vdev_t
*brtvd
, const blkptr_t
*bp
)
1135 if (!brt_zap_prefetch
|| brtvd
->bv_mos_entries
== 0)
1138 uint64_t off
= DVA_GET_OFFSET(&bp
->blk_dva
[0]);
/*
 * bv_mos_entries is re-checked under the reader lock since it may be
 * torn down concurrently; the prefetch result is deliberately ignored.
 */
1139 rw_enter(&brtvd
->bv_mos_entries_lock
, RW_READER
);
1140 if (brtvd
->bv_mos_entries
!= 0) {
1141 (void) zap_prefetch_uint64_by_dnode(brtvd
->bv_mos_entries_dnode
,
1142 &off
, BRT_KEY_WORDS
);
1144 rw_exit(&brtvd
->bv_mos_entries_lock
);
1148 brt_entry_compare(const void *x1
, const void *x2
)
1150 const brt_entry_t
*bre1
= x1
, *bre2
= x2
;
1151 const blkptr_t
*bp1
= &bre1
->bre_bp
, *bp2
= &bre2
->bre_bp
;
1153 return (TREE_CMP(DVA_GET_OFFSET(&bp1
->blk_dva
[0]),
1154 DVA_GET_OFFSET(&bp2
->blk_dva
[0])));
/*
 * Record a pending clone reference for block *bp in the per-vdev,
 * per-TXG pending tree; it is converted into a real BRT reference in
 * syncing context by brt_pending_apply_vdev().
 */
1158 brt_pending_add(spa_t
*spa
, const blkptr_t
*bp
, dmu_tx_t
*tx
)
1160 brt_entry_t
*bre
, *newbre
;
1164 txg
= dmu_tx_get_txg(tx
);
1165 ASSERT3U(txg
, !=, 0);
/* B_TRUE: create the per-vdev BRT state if it does not exist yet. */
1167 uint64_t vdevid
= DVA_GET_VDEV(&bp
->blk_dva
[0]);
1168 brt_vdev_t
*brtvd
= brt_vdev(spa
, vdevid
, B_TRUE
);
1169 avl_tree_t
*pending_tree
= &brtvd
->bv_pending_tree
[txg
& TXG_MASK
];
/* Pre-allocate outside the lock; may be freed below if we lose a race. */
1171 newbre
= kmem_cache_alloc(brt_entry_cache
, KM_SLEEP
);
1172 newbre
->bre_bp
= *bp
;
1173 newbre
->bre_count
= 0;
1174 newbre
->bre_pcount
= 1;
1176 mutex_enter(&brtvd
->bv_pending_lock
);
1177 bre
= avl_find(pending_tree
, newbre
, &where
);
/*
 * NOTE(review): the "bre == NULL" test and the else-branch that bumps
 * the existing entry's pending count are elided in this chunk.
 */
1179 avl_insert(pending_tree
, newbre
, where
);
1184 mutex_exit(&brtvd
->bv_pending_lock
);
/* An entry already existed: release the unused pre-allocation. */
1186 if (newbre
!= NULL
) {
1187 ASSERT(bre
!= NULL
);
1188 ASSERT(bre
!= newbre
);
1189 kmem_cache_free(brt_entry_cache
, newbre
);
1193 /* Prefetch BRT entry for the syncing context. */
1194 brt_prefetch(brtvd
, bp
);
/*
 * Undo one pending clone reference for block *bp in this TXG's pending
 * tree (counterpart of brt_pending_add() when a clone is rolled back).
 */
1199 brt_pending_remove(spa_t
*spa
, const blkptr_t
*bp
, dmu_tx_t
*tx
)
1201 brt_entry_t
*bre
, bre_search
;
1204 txg
= dmu_tx_get_txg(tx
);
1205 ASSERT3U(txg
, !=, 0);
/* The entry must exist, so the per-vdev state must too (B_FALSE). */
1207 uint64_t vdevid
= DVA_GET_VDEV(&bp
->blk_dva
[0]);
1208 brt_vdev_t
*brtvd
= brt_vdev(spa
, vdevid
, B_FALSE
);
1209 ASSERT(brtvd
!= NULL
);
1210 avl_tree_t
*pending_tree
= &brtvd
->bv_pending_tree
[txg
& TXG_MASK
];
1212 bre_search
.bre_bp
= *bp
;
1214 mutex_enter(&brtvd
->bv_pending_lock
);
1215 bre
= avl_find(pending_tree
, &bre_search
, NULL
);
1216 ASSERT(bre
!= NULL
);
1217 ASSERT(bre
->bre_pcount
> 0);
/*
 * NOTE(review): the decrement of bre_pcount is elided in this chunk;
 * when it reaches zero the node is unlinked and freed below.
 */
1219 if (bre
->bre_pcount
== 0)
1220 avl_remove(pending_tree
, bre
);
1223 mutex_exit(&brtvd
->bv_pending_lock
);
1226 kmem_cache_free(brt_entry_cache
, bre
);
/*
 * Syncing-context worker: turn this TXG's pending clone references for
 * one vdev into real BRT references (or DDT references for dedup
 * blocks), filling in on-disk counts where entries already exist.
 */
1230 brt_pending_apply_vdev(spa_t
*spa
, brt_vdev_t
*brtvd
, uint64_t txg
)
1232 brt_entry_t
*bre
, *nbre
;
1235 * We are in syncing context, so no other bv_pending_tree accesses
1236 * are possible for the TXG. So we don't need bv_pending_lock.
1238 ASSERT(avl_is_empty(&brtvd
->bv_tree
));
/* Adopt the whole pending tree for this TXG in O(1). */
1239 avl_swap(&brtvd
->bv_tree
, &brtvd
->bv_pending_tree
[txg
& TXG_MASK
]);
/* nbre is fetched up front because the loop may remove the node. */
1241 for (bre
= avl_first(&brtvd
->bv_tree
); bre
; bre
= nbre
) {
1242 nbre
= AVL_NEXT(&brtvd
->bv_tree
, bre
);
1245 * If the block has DEDUP bit set, it means that it
1246 * already exists in the DEDUP table, so we can just
1247 * use that instead of creating new entry in the BRT.
1249 if (BP_GET_DEDUP(&bre
->bre_bp
)) {
1250 while (bre
->bre_pcount
> 0) {
1251 if (!ddt_addref(spa
, &bre
->bre_bp
))
/* All pending refs absorbed by the DDT: drop the node. */
1255 if (bre
->bre_pcount
== 0) {
1256 avl_remove(&brtvd
->bv_tree
, bre
);
1257 kmem_cache_free(brt_entry_cache
, bre
);
1263 * Unless we know that the block is definitely not in ZAP,
1264 * try to get its reference count from there.
1266 uint64_t off
= BRE_OFFSET(bre
);
1267 if (brtvd
->bv_mos_entries
!= 0 &&
1268 brt_vdev_lookup(spa
, brtvd
, off
)) {
1269 int error
= zap_lookup_uint64_by_dnode(
1270 brtvd
->bv_mos_entries_dnode
, &off
,
1271 BRT_KEY_WORDS
, 1, sizeof (bre
->bre_count
),
/*
 * NOTE(review): the destination argument of the lookup and the
 * "error == 0" test are elided in this chunk.
 */
1274 BRTSTAT_BUMP(brt_addref_entry_on_disk
);
1276 ASSERT3U(error
, ==, ENOENT
);
1277 BRTSTAT_BUMP(brt_addref_entry_not_on_disk
);
1283 * If all the cloned blocks we had were handled by DDT, we don't need
1284 * to initiate the vdev.
1286 if (avl_is_empty(&brtvd
->bv_tree
))
/* First real BRT use of this vdev: allocate its in-core structures. */
1289 if (!brtvd
->bv_initiated
) {
1290 rw_enter(&brtvd
->bv_lock
, RW_WRITER
);
1291 brt_vdev_realloc(spa
, brtvd
);
1292 rw_exit(&brtvd
->bv_lock
);
1296 * Convert pending references into proper ones. This has to be a
1297 * separate loop, since entcount modifications would cause false
1298 * positives for brt_vdev_lookup() on following iterations.
1300 for (bre
= avl_first(&brtvd
->bv_tree
); bre
;
1301 bre
= AVL_NEXT(&brtvd
->bv_tree
, bre
)) {
1302 brt_vdev_addref(spa
, brtvd
, bre
,
1303 bp_get_dsize(spa
, &bre
->bre_bp
), bre
->bre_pcount
);
1304 bre
->bre_count
+= bre
->bre_pcount
;
/*
 * Apply this TXG's pending clone references for every vdev that has
 * per-vdev BRT state.
 */
1309 brt_pending_apply(spa_t
*spa
, uint64_t txg
)
1313 for (uint64_t vdevid
= 0; vdevid
< spa
->spa_brt_nvdevs
; vdevid
++) {
1314 brt_vdev_t
*brtvd
= spa
->spa_brt_vdevs
[vdevid
];
1317 brt_pending_apply_vdev(spa
, brtvd
, txg
);
/*
 * Write one in-core BRT entry out to the per-vdev ZAP: remove the ZAP
 * entry when its count dropped to zero, otherwise update it with the
 * current count.  Entries whose pending count netted out to zero need
 * no ZAP change at all.
 */
1325 brt_sync_entry(dnode_t
*dn
, brt_entry_t
*bre
, dmu_tx_t
*tx
)
1327 uint64_t off
= BRE_OFFSET(bre
);
1329 if (bre
->bre_pcount
== 0) {
1330 /* The net change is zero, nothing to do in ZAP. */
1331 } else if (bre
->bre_count
== 0) {
/* ENOENT is fine: the entry may never have reached the ZAP. */
1332 int error
= zap_remove_uint64_by_dnode(dn
, &off
,
1334 VERIFY(error
== 0 || error
== ENOENT
);
1336 VERIFY0(zap_update_uint64_by_dnode(dn
, &off
,
1337 BRT_KEY_WORDS
, 1, sizeof (bre
->bre_count
),
1338 &bre
->bre_count
, tx
));
/*
 * Sync all dirty per-vdev BRT state for this TXG: create missing MOS
 * objects, flush every in-core entry to the ZAP, destroy per-vdev
 * state that became empty, and write back the vdev metadata.
 */
1343 brt_sync_table(spa_t
*spa
, dmu_tx_t
*tx
)
1348 for (uint64_t vdevid
= 0; vdevid
< spa
->spa_brt_nvdevs
; vdevid
++) {
1349 brt_vdev_t
*brtvd
= spa
->spa_brt_vdevs
[vdevid
];
/* Clean vdevs must have no dirty entcounts and an empty tree. */
1352 if (!brtvd
->bv_meta_dirty
) {
1353 ASSERT(!brtvd
->bv_entcount_dirty
);
1354 ASSERT0(avl_numnodes(&brtvd
->bv_tree
));
1359 ASSERT(!brtvd
->bv_entcount_dirty
||
1360 avl_numnodes(&brtvd
->bv_tree
) != 0);
/* Lazily create the MOS objects on first dirtying. */
1362 if (brtvd
->bv_mos_brtvdev
== 0)
1363 brt_vdev_create(spa
, brtvd
, tx
);
/* Drain the in-core tree, pushing each entry to the ZAP. */
1366 while ((bre
= avl_destroy_nodes(&brtvd
->bv_tree
, &c
)) != NULL
) {
1367 brt_sync_entry(brtvd
->bv_mos_entries_dnode
, bre
, tx
);
1368 kmem_cache_free(brt_entry_cache
, bre
);
1372 if (zfs_flags
& ZFS_DEBUG_BRT
)
1373 brt_vdev_dump(brtvd
);
/* Tear down per-vdev state when the last reference disappeared. */
1375 if (brtvd
->bv_totalcount
== 0)
1376 brt_vdev_destroy(spa
, brtvd
, tx
);
1378 brt_vdev_sync(spa
, brtvd
, tx
);
/*
 * Per-TXG entry point called from the syncing thread: if any per-vdev
 * BRT state is dirty, open an assigned transaction and flush it via
 * brt_sync_table().
 */
1385 brt_sync(spa_t
*spa
, uint64_t txg
)
1390 ASSERT3U(spa_syncing_txg(spa
), ==, txg
);
/* Scan for at least one dirty vdev before creating a transaction. */
1393 for (vdevid
= 0; vdevid
< spa
->spa_brt_nvdevs
; vdevid
++) {
1394 if (spa
->spa_brt_vdevs
[vdevid
]->bv_meta_dirty
)
/* Nothing dirty: the early-return body is elided in this chunk. */
1397 if (vdevid
>= spa
->spa_brt_nvdevs
) {
1403 tx
= dmu_tx_create_assigned(spa
->spa_dsl_pool
, txg
);
1404 brt_sync_table(spa
, tx
);
1409 brt_alloc(spa_t
*spa
)
1411 rw_init(&spa
->spa_brt_lock
, NULL
, RW_DEFAULT
, NULL
);
1412 spa
->spa_brt_vdevs
= NULL
;
1413 spa
->spa_brt_nvdevs
= 0;
1414 spa
->spa_brt_rangesize
= 0;
/*
 * Initialize BRT state for a newly created pool, using the default
 * BRT_RANGESIZE.
 * NOTE(review): the line between the signature and the assignment is
 * elided in this chunk — presumably the brt_alloc() call; confirm
 * against the full file.
 */
1418 brt_create(spa_t
*spa
)
1421 spa
->spa_brt_rangesize
= BRT_RANGESIZE
;
/*
 * Load BRT state for an imported pool: probe the MOS directory for a
 * per-vdev BRT object under each child of the root vdev and load any
 * that exist.
 */
1425 brt_load(spa_t
*spa
)
1431 for (uint64_t vdevid
= 0; vdevid
< spa
->spa_root_vdev
->vdev_children
;
1434 uint64_t mos_brtvdev
;
1436 /* Look if this vdev had active block cloning. */
1437 snprintf(name
, sizeof (name
), "%s%llu", BRT_OBJECT_VDEV_PREFIX
,
1438 (u_longlong_t
)vdevid
);
1439 error
= zap_lookup(spa
->spa_meta_objset
,
1440 DMU_POOL_DIRECTORY_OBJECT
, name
, sizeof (uint64_t), 1,
/* Missing directory entry just means this vdev was never cloned. */
1442 if (error
== ENOENT
) {
1449 /* If it did, then allocate them all and load this one. */
1450 brt_vdevs_expand(spa
, spa
->spa_root_vdev
->vdev_children
);
1451 brt_vdev_t
*brtvd
= spa
->spa_brt_vdevs
[vdevid
];
1452 rw_enter(&brtvd
->bv_lock
, RW_WRITER
);
1453 brtvd
->bv_mos_brtvdev
= mos_brtvdev
;
1454 error
= brt_vdev_load(spa
, brtvd
);
1455 rw_exit(&brtvd
->bv_lock
);
/* Fall back to the default rangesize if none was recorded on disk. */
1460 if (spa
->spa_brt_rangesize
== 0)
1461 spa
->spa_brt_rangesize
= BRT_RANGESIZE
;
/*
 * Tear down all in-core BRT state for the pool.  A zero rangesize means
 * the BRT was never set up, so there is nothing to free (the early
 * return is elided in this chunk).
 */
1467 brt_unload(spa_t
*spa
)
1469 if (spa
->spa_brt_rangesize
== 0)
1471 brt_vdevs_free(spa
);
1472 rw_destroy(&spa
->spa_brt_lock
);
1473 spa
->spa_brt_rangesize
= 0;
/* Tunables exported under the zfs_brt module-parameter namespace. */
1477 ZFS_MODULE_PARAM(zfs_brt
, , brt_zap_prefetch
, INT
, ZMOD_RW
,
1478 "Enable prefetching of BRT ZAP entries");
1479 ZFS_MODULE_PARAM(zfs_brt
, , brt_zap_default_bs
, UINT
, ZMOD_RW
,
1480 "BRT ZAP leaf blockshift");
1481 ZFS_MODULE_PARAM(zfs_brt
, , brt_zap_default_ibs
, UINT
, ZMOD_RW
,
1482 "BRT ZAP indirect blockshift");