4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek
26 #include <sys/zfs_context.h>
28 #include <sys/spa_impl.h>
31 #include <sys/brt_impl.h>
33 #include <sys/bitmap.h>
35 #include <sys/dmu_tx.h>
37 #include <sys/dsl_pool.h>
38 #include <sys/dsl_scan.h>
39 #include <sys/vdev_impl.h>
40 #include <sys/kstat.h>
41 #include <sys/wmsum.h>
44 * Block Cloning design.
46 * Block Cloning allows one to manually clone a file (or a subset of its blocks)
47 * into another (or the same) file by just creating additional references to
48 * the data blocks without copying the data itself. Those references are kept
49 * in the Block Reference Tables (BRTs).
51 * In many ways this is similar to the existing deduplication, but there are
52 * some important differences:
54 * - Deduplication is automatic and Block Cloning is not - one has to use a
55 * dedicated system call(s) to clone the given file/blocks.
56 * - Deduplication keeps all data blocks in its table, even those referenced
57 * just once. Block Cloning creates an entry in its tables only when there
58 * are at least two references to the given data block. If the block was
59 * never explicitly cloned or the second to last reference was dropped,
60 * there will be neither space nor performance overhead.
61 * - Deduplication needs data to work - one needs to pass real data to the
62 * write(2) syscall, so hash can be calculated. Block Cloning doesn't require
63 * data, just block pointers to the data, so it is extremely fast, as we pay
64 * neither the cost of reading the data, nor the cost of writing the data -
65 * we operate exclusively on metadata.
66 * - If the D (dedup) bit is not set in the block pointer, it means that
67 * the block is not in the dedup table (DDT) and we won't consult the DDT
68 * when we need to free the block. Block Cloning must be consulted on every
69 * free, because we cannot modify the source BP (eg. by setting something
70 * similar to the D bit), thus we have no hint if the block is in the
71 * Block Reference Table (BRT), so we need to look into the BRT. There is
72 * an optimization in place that allows us to eliminate the majority of BRT
73 * lookups which is described below in the "Minimizing free penalty" section.
74 * - The BRT entry is much smaller than the DDT entry - for BRT we only store
75 * 64bit offset and 64bit reference counter.
76 * - Dedup keys are cryptographic hashes, so two blocks that are close to each
77 * other on disk are most likely in totally different parts of the DDT.
78 * The BRT entry keys are offsets into a single top-level VDEV, so data blocks
79 * from one file should have BRT entries close to each other.
80 * - Scrub will only do a single pass over a block that is referenced multiple
81 * times in the DDT. Unfortunately it is not currently (if at all) possible
82 * with Block Cloning and a block referenced multiple times will be scrubbed
83 * multiple times. The new, sorted scrub should be able to eliminate
84 * duplicated reads given enough memory.
85 * - Deduplication requires cryptographically strong hash as a checksum or
86 * additional data verification. Block Cloning works with any checksum
87 * algorithm or even with checksumming disabled.
89 * As mentioned above, the BRT entries are much smaller than the DDT entries.
90 * To uniquely identify a block we just need its vdev id and offset. We also
91 * need to maintain a reference counter. The vdev id will often repeat, as there
92 * is a small number of top-level VDEVs and a large number of blocks stored in
93 * each VDEV. We take advantage of that to reduce the BRT entry size further by
94 * maintaining one BRT for each top-level VDEV, so we can then have only offset
95 * and counter as the BRT entry.
97 * Minimizing free penalty.
99 * Block Cloning allows creating additional references to any existing block.
100 * When we free a block there is no hint in the block pointer whether the block
101 * was cloned or not, so on each free we have to check if there is a
102 * corresponding entry in the BRT or not. If there is, we need to decrease
103 * the reference counter. Doing BRT lookup on every free can potentially be
104 * expensive by requiring additional I/Os if the BRT doesn't fit into memory.
105 * This is the main problem with deduplication, so we've learned our lesson and
106 * try not to repeat the same mistake here. How do we do that? We divide each
107 * top-level VDEV into 16MB regions. For each region we maintain a counter that
108 * is a sum of all the BRT entries that have offsets within the region. This
109 * creates the entries count array of 16bit numbers for each top-level VDEV.
110 * The entries count array is always kept in memory and updated on disk in the
111 * same transaction group as the BRT updates to keep everything in-sync. We can
112 * keep the array in memory, because it is very small. With 16MB regions and
113 * 1TB VDEV the array requires only 128kB of memory (we may decide to decrease
114 * the region size even further in the future). Now, when we want to free
115 * a block, we first consult the array. If the counter for the whole region is
116 * zero, there is no need to look for the BRT entry, as there isn't one for
117 * sure. If the counter for the region is greater than zero, only then we will
118 * do a BRT lookup and if an entry is found we will decrease the reference
119 * counter in the BRT entry and in the entry counters array.
121 * The entry counters array is small, but can potentially be larger for very
122 * large VDEVs or smaller regions. In this case we don't want to rewrite entire
123 * array on every change. We then divide the array into 32kB blocks and keep
124 * a bitmap of dirty blocks within a transaction group. When we sync the
125 * transaction group we can update only the parts of the entry counters array
126 * that were modified. Note: Keeping track of the dirty parts of the entry
127 * counters array is implemented, but updating only parts of the array on disk
128 * is not yet implemented - for now we will update the entire array if there
128 * was any change.
131 * The implementation tries to be economic: if BRT is not used, or no longer
132 * used, there will be no entries in the MOS and no additional memory used (eg.
133 * the entry counters array is only allocated if needed).
135 * Interaction between Deduplication and Block Cloning.
137 * If both functionalities are in use, we could end up with a block that is
138 * referenced multiple times in both DDT and BRT. When we free one of the
139 * references we couldn't tell where it belongs, so we would have to decide
140 * what table takes the precedence: do we first clear DDT references or BRT
141 * references? To avoid this dilemma BRT cooperates with DDT - if a given block
142 * is being cloned using BRT and the BP has the D (dedup) bit set, BRT will
143 * lookup DDT entry instead and increase the counter there. No BRT entry
144 * will be created for a block which has the D (dedup) bit set.
145 * BRT may be more efficient for manual deduplication, but if the block is
146 * already in the DDT, then creating additional BRT entry would be less
147 * efficient. This clever idea was proposed by Allan Jude.
149 * Block Cloning across datasets.
151 * Block Cloning is not limited to cloning blocks within the same dataset.
152 * It is possible (and very useful) to clone blocks between different datasets.
153 * One use case is recovering files from snapshots. By cloning the files into
154 * the dataset we need no additional storage. Without Block Cloning we would need
155 * additional space for those files.
156 * Another interesting use case is moving the files between datasets
157 * (copying the file content to the new dataset and removing the source file).
158 * In that case Block Cloning will only be used briefly, because the BRT entries
159 * will be removed when the source is removed.
160 * Note: currently it is not possible to clone blocks between encrypted
161 * datasets, even if those datasets use the same encryption key (this includes
162 * snapshots of encrypted datasets). Cloning blocks between datasets that use
163 * the same keys should be possible and should be implemented in the future.
165 * Block Cloning flow through ZFS layers.
167 * Note: Block Cloning can be used both for cloning file system blocks and ZVOL
168 * blocks. As of this writing no interface is implemented that allows for block
169 * cloning within a ZVOL.
170 * FreeBSD and Linux provide the copy_file_range(2) system call and we will use
171 * it for block cloning.
174 * copy_file_range(int infd, off_t *inoffp, int outfd, off_t *outoffp,
175 * size_t len, unsigned int flags);
177 * Even though offsets and length represent bytes, they have to be
178 * block-aligned or we will return an error so the upper layer can
179 * fallback to the generic mechanism that will just copy the data.
180 * Using copy_file_range(2) will call OS-independent zfs_clone_range() function.
181 * This function was implemented based on zfs_write(), but instead of writing
182 * the given data we first read block pointers using the new dmu_read_l0_bps()
183 * function from the source file. Once we have BPs from the source file we call
184 * the dmu_brt_clone() function on the destination file. This function
185 * allocates BPs for us. We iterate over all source BPs. If the given BP is
186 * a hole or an embedded block, we just copy the BP as-is. If it points to real
187 * data we place this BP on a BRT pending list using the brt_pending_add()
187 * function.
190 * We use this pending list to keep track of all BPs that got new references
191 * within this transaction group.
193 * Some special cases to consider and how we address them:
194 * - The block we want to clone may have been created within the same
195 * transaction group in which we are trying to clone it. Such a block has no BP
196 * allocated yet, so it cannot be immediately cloned. We return EAGAIN.
197 * - The block we want to clone may have been modified within the same
198 * transaction group. We return EAGAIN.
199 * - A block may be cloned multiple times during one transaction group (that's
200 * why pending list is actually a tree and not an append-only list - this
201 * way we can figure out faster if this block is cloned for the first time
202 * in this txg or consecutive time).
203 * - A block may be cloned and freed within the same transaction group
204 * (see dbuf_undirty()).
205 * - A block may be cloned and within the same transaction group the clone
206 * can be cloned again (see dmu_read_l0_bps()).
207 * - A file might have been deleted, but the caller still has a file descriptor
208 * open to this file and clones it.
210 * When we free a block we have an additional step in the ZIO pipeline where we
211 * call the zio_brt_free() function. We then call the brt_entry_decref()
212 * that loads the corresponding BRT entry (if one exists) and decreases
213 * reference counter. If this is not the last reference we will stop ZIO
214 * pipeline here. If this is the last reference or the block is not in the
215 * BRT, we continue the pipeline and free the block as usual.
217 * At the beginning of spa_sync() where there can be no more block cloning,
218 * but before issuing frees we call brt_pending_apply(). This function applies
219 * all the new clones to the BRT table - we load BRT entries and update
220 * reference counters. To sync new BRT entries to disk, we use brt_sync()
221 * function. This function will sync all dirty per-top-level-vdev BRTs,
222 * the entry counters arrays, etc.
224 * Block Cloning and ZIL.
226 * Every clone operation is divided into chunks (similar to write) and each
227 * chunk is cloned in a separate transaction. The chunk size is determined by
228 * how many BPs we can fit into a single ZIL entry.
229 * Replaying clone operation is different from the regular clone operation,
230 * as when we log clone operations we cannot use the source object - it may
231 * reside on a different dataset, so we log BPs we want to clone.
232 * The ZIL is replayed when we mount the given dataset, not when the pool is
233 * imported. Taking this into account it is possible that the pool is imported
234 * without mounting datasets and the source dataset is destroyed before the
235 * destination dataset is mounted and its ZIL replayed.
236 * To address this situation we leverage zil_claim() mechanism where ZFS will
237 * parse all the ZILs on pool import. When we come across TX_CLONE_RANGE
238 * entries, we will bump reference counters for their BPs in the BRT. Then
239 * on mount and ZIL replay we bump the reference counters once more, while the
240 * first references are dropped during ZIL destroy by zil_free_clone_range().
241 * It is possible that after zil_claim() we never mount the destination, so
242 * we never replay its ZIL and just destroy it. In this case the only taken
243 * references will be dropped by zil_free_clone_range(), since the cloning is
244 * not going to ever take place.
/*
 * File-scope kmem cache handles.
 * NOTE(review): judging by the names these back BRT entry and pending BRT
 * entry allocations; the code that creates and uses them is not visible in
 * this part of the file - confirm.
 */
247 static kmem_cache_t
*brt_entry_cache
;
248 static kmem_cache_t
*brt_pending_entry_cache
;
251 * Enable/disable prefetching of BRT entries that we are going to modify.
/*
 * Defaults to 1 (enabled). Not static, so it is visible outside this file.
 * NOTE(review): the code consulting this flag is not visible here.
 */
253 int zfs_brt_prefetch
= 1;
/*
 * BRT_DEBUG(fmt, ...): debug logging helper.
 * The first definition forwards its arguments to __dprintf(), tagged with
 * the current file, function and line, but only when ZFS_DEBUG_BRT is set
 * in zfs_flags. The second definition expands to an empty statement.
 * The preprocessor conditional choosing between the two is not visible here.
 */
256 #define BRT_DEBUG(...) do { \
257 if ((zfs_flags & ZFS_DEBUG_BRT) != 0) { \
258 __dprintf(B_TRUE, __FILE__, __func__, __LINE__, __VA_ARGS__); \
262 #define BRT_DEBUG(...) do { } while (0)
/*
 * Leaf and indirect block shifts handed to zap_create_flags() when the
 * per-vdev entries ZAP is created in brt_vdev_create(). Both default to 12.
 */
265 int brt_zap_leaf_blockshift
= 12;
266 int brt_zap_indirect_blockshift
= 12;
/*
 * kstat handle for this file.
 * NOTE(review): presumably the kstat exporting brt_stats; its setup and
 * teardown are not visible in this part of the file.
 */
268 static kstat_t
*brt_ksp
;
/*
 * Named kstat counters, one per addref/decref outcome.
 * The brt_stats initializer is positional, so the field order here has to
 * stay in sync with the order of the entries in that initializer.
 */
270 typedef struct brt_stats
{
271 kstat_named_t brt_addref_entry_in_memory
;
272 kstat_named_t brt_addref_entry_not_on_disk
;
273 kstat_named_t brt_addref_entry_on_disk
;
274 kstat_named_t brt_addref_entry_read_lost_race
;
275 kstat_named_t brt_decref_entry_in_memory
;
276 kstat_named_t brt_decref_entry_loaded_from_disk
;
277 kstat_named_t brt_decref_entry_not_in_memory
;
278 kstat_named_t brt_decref_entry_not_on_disk
;
279 kstat_named_t brt_decref_entry_read_lost_race
;
280 kstat_named_t brt_decref_entry_still_referenced
;
281 kstat_named_t brt_decref_free_data_later
;
282 kstat_named_t brt_decref_free_data_now
;
283 kstat_named_t brt_decref_no_entry
;
/*
 * Positional initializer for the kstat counters: each entry supplies the
 * exported counter name and its data type (all KSTAT_DATA_UINT64). The
 * entries appear in the same order as the fields of struct brt_stats.
 */
286 static brt_stats_t brt_stats
= {
287 { "addref_entry_in_memory", KSTAT_DATA_UINT64
},
288 { "addref_entry_not_on_disk", KSTAT_DATA_UINT64
},
289 { "addref_entry_on_disk", KSTAT_DATA_UINT64
},
290 { "addref_entry_read_lost_race", KSTAT_DATA_UINT64
},
291 { "decref_entry_in_memory", KSTAT_DATA_UINT64
},
292 { "decref_entry_loaded_from_disk", KSTAT_DATA_UINT64
},
293 { "decref_entry_not_in_memory", KSTAT_DATA_UINT64
},
294 { "decref_entry_not_on_disk", KSTAT_DATA_UINT64
},
295 { "decref_entry_read_lost_race", KSTAT_DATA_UINT64
},
296 { "decref_entry_still_referenced", KSTAT_DATA_UINT64
},
297 { "decref_free_data_later", KSTAT_DATA_UINT64
},
298 { "decref_free_data_now", KSTAT_DATA_UINT64
},
299 { "decref_no_entry", KSTAT_DATA_UINT64
}
/*
 * wmsum counters, one for each field of struct brt_stats (same names).
 * BRTSTAT_BUMP() adds to these through the brt_sums object.
 * NOTE(review): the enclosing struct's opening and closing lines are not
 * visible here.
 */
303 wmsum_t brt_addref_entry_in_memory
;
304 wmsum_t brt_addref_entry_not_on_disk
;
305 wmsum_t brt_addref_entry_on_disk
;
306 wmsum_t brt_addref_entry_read_lost_race
;
307 wmsum_t brt_decref_entry_in_memory
;
308 wmsum_t brt_decref_entry_loaded_from_disk
;
309 wmsum_t brt_decref_entry_not_in_memory
;
310 wmsum_t brt_decref_entry_not_on_disk
;
311 wmsum_t brt_decref_entry_read_lost_race
;
312 wmsum_t brt_decref_entry_still_referenced
;
313 wmsum_t brt_decref_free_data_later
;
314 wmsum_t brt_decref_free_data_now
;
315 wmsum_t brt_decref_no_entry
;
/* Add 1 to the wmsum counter named `stat` inside brt_sums. */
318 #define BRTSTAT_BUMP(stat) wmsum_add(&brt_sums.stat, 1)
/*
 * Forward declarations of the tree comparators. brt_entry_compare() is
 * passed to avl_create() in brt_vdev_realloc(), ahead of its definition.
 */
320 static int brt_entry_compare(const void *x1
, const void *x2
);
321 static int brt_pending_entry_compare(const void *x1
, const void *x2
);
/* Acquire brt->brt_lock as a reader (RW_READER). */
324 brt_rlock(brt_t
*brt
)
326 rw_enter(&brt
->brt_lock
, RW_READER
);
/* Acquire brt->brt_lock as a writer (RW_WRITER). */
330 brt_wlock(brt_t
*brt
)
332 rw_enter(&brt
->brt_lock
, RW_WRITER
);
/* Release brt->brt_lock, whichever mode it was taken in. */
336 brt_unlock(brt_t
*brt
)
338 rw_exit(&brt
->brt_lock
);
/*
 * Return the entry counter stored in brtvd->bv_entcount[idx].
 * idx must be below bv_size (asserted). When bv_need_byteswap is set, the
 * stored 16-bit value is byte-swapped with BSWAP_16() before it is returned;
 * otherwise it is returned as stored.
 */
342 brt_vdev_entcount_get(const brt_vdev_t
*brtvd
, uint64_t idx
)
345 ASSERT3U(idx
, <, brtvd
->bv_size
);
347 if (brtvd
->bv_need_byteswap
) {
348 return (BSWAP_16(brtvd
->bv_entcount
[idx
]));
350 return (brtvd
->bv_entcount
[idx
]);
/*
 * Store entcnt into brtvd->bv_entcount[idx].
 * idx must be below bv_size (asserted). When bv_need_byteswap is set, the
 * value is byte-swapped with BSWAP_16() before being stored, mirroring
 * brt_vdev_entcount_get(); otherwise it is stored unchanged.
 */
355 brt_vdev_entcount_set(brt_vdev_t
*brtvd
, uint64_t idx
, uint16_t entcnt
)
358 ASSERT3U(idx
, <, brtvd
->bv_size
);
360 if (brtvd
->bv_need_byteswap
) {
361 brtvd
->bv_entcount
[idx
] = BSWAP_16(entcnt
);
363 brtvd
->bv_entcount
[idx
] = entcnt
;
/*
 * Increment the entry counter at idx by one, going through the get/set
 * helpers so byte order is handled. idx must be below bv_size (asserted).
 */
368 brt_vdev_entcount_inc(brt_vdev_t
*brtvd
, uint64_t idx
)
372 ASSERT3U(idx
, <, brtvd
->bv_size
);
374 entcnt
= brt_vdev_entcount_get(brtvd
, idx
);
/* The value is stored as uint16_t, so it must not already be at the max. */
375 ASSERT(entcnt
< UINT16_MAX
);
377 brt_vdev_entcount_set(brtvd
, idx
, entcnt
+ 1);
/*
 * Decrement the entry counter at idx by one, going through the get/set
 * helpers so byte order is handled. idx must be below bv_size (asserted).
 *
 * NOTE(review): unlike brt_vdev_entcount_inc(), which asserts the counter is
 * below UINT16_MAX, no lower-bound check (entcnt > 0) is visible between the
 * read and the write below. Confirm that one exists, since decrementing a
 * zero counter would wrap when stored back as uint16_t.
 */
381 brt_vdev_entcount_dec(brt_vdev_t
*brtvd
, uint64_t idx
)
385 ASSERT3U(idx
, <, brtvd
->bv_size
);
387 entcnt
= brt_vdev_entcount_get(brtvd
, idx
);
390 brt_vdev_entcount_set(brtvd
, idx
, entcnt
- 1);
/*
 * Debug helper: log the state of every per-vdev BRT through zfs_dbgmsg().
 *
 * It first tests ZFS_DEBUG_BRT in zfs_flags and whether brt_nvdevs is zero
 * (logging "BRT empty" in that case).
 * NOTE(review): the bodies of those two checks are only partly visible here;
 * presumably both return early - confirm.
 *
 * For each vdev it logs the summary fields, then every non-zero entry
 * counter when bv_totalcount > 0, then a character rendering of the bitmap
 * when bv_entcount_dirty is set.
 */
395 brt_vdev_dump(brt_t
*brt
)
400 if ((zfs_flags
& ZFS_DEBUG_BRT
) == 0) {
404 if (brt
->brt_nvdevs
== 0) {
405 zfs_dbgmsg("BRT empty");
409 zfs_dbgmsg("BRT vdev dump:");
410 for (vdevid
= 0; vdevid
< brt
->brt_nvdevs
; vdevid
++) {
413 brtvd
= &brt
->brt_vdevs
[vdevid
];
/* Logs both the array index and the id recorded in the slot itself. */
414 zfs_dbgmsg(" vdevid=%llu/%llu meta_dirty=%d entcount_dirty=%d "
415 "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu\n",
416 (u_longlong_t
)vdevid
, (u_longlong_t
)brtvd
->bv_vdevid
,
417 brtvd
->bv_meta_dirty
, brtvd
->bv_entcount_dirty
,
418 (u_longlong_t
)brtvd
->bv_size
,
419 (u_longlong_t
)brtvd
->bv_totalcount
,
420 (u_longlong_t
)brtvd
->bv_nblocks
,
421 (size_t)BT_SIZEOFMAP(brtvd
->bv_nblocks
));
422 if (brtvd
->bv_totalcount
> 0) {
423 zfs_dbgmsg(" entcounts:");
424 for (idx
= 0; idx
< brtvd
->bv_size
; idx
++) {
425 if (brt_vdev_entcount_get(brtvd
, idx
) > 0) {
426 zfs_dbgmsg(" [%04llu] %hu",
428 brt_vdev_entcount_get(brtvd
, idx
));
432 if (brtvd
->bv_entcount_dirty
) {
/*
 * One character per bitmap bit ('x' = set, '.' = clear) plus one extra byte.
 * NOTE(review): the extra byte is presumably the NUL terminator needed by
 * the "%s" below; the store that writes it is not visible here.
 */
435 bitmap
= kmem_alloc(brtvd
->bv_nblocks
+ 1, KM_SLEEP
);
436 for (idx
= 0; idx
< brtvd
->bv_nblocks
; idx
++) {
438 BT_TEST(brtvd
->bv_bitmap
, idx
) ? 'x' : '.';
441 zfs_dbgmsg(" bitmap: %s", bitmap
);
442 kmem_free(bitmap
, brtvd
->bv_nblocks
+ 1);
/*
 * Look up the per-vdev BRT state for vdevid.
 * The caller must hold brt_lock in either mode (asserted). When vdevid is
 * below brt_nvdevs the result is the matching slot of brt->brt_vdevs[].
 * NOTE(review): the out-of-range branch and the return statement are not
 * visible here; brt_entry_prefetch() reads the result directly, so check
 * what is returned for an unknown vdevid.
 */
449 brt_vdev(brt_t
*brt
, uint64_t vdevid
)
453 ASSERT(RW_LOCK_HELD(&brt
->brt_lock
));
455 if (vdevid
< brt
->brt_nvdevs
) {
456 brtvd
= &brt
->brt_vdevs
[vdevid
];
/*
 * Create the on-disk (MOS) objects for one BRT vdev inside transaction tx:
 *   - a ZAP object for the entries, recorded in bv_mos_entries;
 *   - a DMU object with a brt_vdev_phys_t-sized bonus buffer, recorded in
 *     bv_mos_brtvdev;
 *   - a pool directory entry named BRT_OBJECT_VDEV_PREFIX followed by the
 *     vdev id, whose value is the bv_mos_brtvdev object number.
 * Finally the SPA_FEATURE_BLOCK_CLONING reference count is incremented.
 *
 * Preconditions (asserted): brt_lock is held for writing, neither MOS object
 * exists yet, and the in-memory bv_entcount[] and bv_bitmap are allocated
 * with non-zero bv_size and bv_nblocks.
 */
465 brt_vdev_create(brt_t
*brt
, brt_vdev_t
*brtvd
, dmu_tx_t
*tx
)
469 ASSERT(RW_WRITE_HELD(&brt
->brt_lock
));
470 ASSERT0(brtvd
->bv_mos_brtvdev
);
471 ASSERT0(brtvd
->bv_mos_entries
);
472 ASSERT(brtvd
->bv_entcount
!= NULL
);
473 ASSERT(brtvd
->bv_size
> 0);
474 ASSERT(brtvd
->bv_bitmap
!= NULL
);
475 ASSERT(brtvd
->bv_nblocks
> 0);
/* Entries ZAP: 64-bit hashed, uint64 keys, block shifts from the tunables. */
477 brtvd
->bv_mos_entries
= zap_create_flags(brt
->brt_mos
, 0,
478 ZAP_FLAG_HASH64
| ZAP_FLAG_UINT64_KEY
, DMU_OTN_ZAP_METADATA
,
479 brt_zap_leaf_blockshift
, brt_zap_indirect_blockshift
, DMU_OT_NONE
,
481 VERIFY(brtvd
->bv_mos_entries
!= 0);
482 BRT_DEBUG("MOS entries created, object=%llu",
483 (u_longlong_t
)brtvd
->bv_mos_entries
);
486 * We allocate DMU buffer to store the bv_entcount[] array.
487 * We will keep array size (bv_size) and cummulative count for all
488 * bv_entcount[]s (bv_totalcount) in the bonus buffer.
490 brtvd
->bv_mos_brtvdev
= dmu_object_alloc(brt
->brt_mos
,
491 DMU_OTN_UINT64_METADATA
, BRT_BLOCKSIZE
,
492 DMU_OTN_UINT64_METADATA
, sizeof (brt_vdev_phys_t
), tx
);
493 VERIFY(brtvd
->bv_mos_brtvdev
!= 0);
494 BRT_DEBUG("MOS BRT VDEV created, object=%llu",
495 (u_longlong_t
)brtvd
->bv_mos_brtvdev
);
/* Register the object in the pool directory under a per-vdev name. */
497 snprintf(name
, sizeof (name
), "%s%llu", BRT_OBJECT_VDEV_PREFIX
,
498 (u_longlong_t
)brtvd
->bv_vdevid
);
499 VERIFY0(zap_add(brt
->brt_mos
, DMU_POOL_DIRECTORY_OBJECT
, name
,
500 sizeof (uint64_t), 1, &brtvd
->bv_mos_brtvdev
, tx
));
501 BRT_DEBUG("Pool directory object created, object=%s", name
);
503 spa_feature_incr(brt
->brt_spa
, SPA_FEATURE_BLOCK_CLONING
, tx
);
/*
 * Allocate, or grow, the in-memory entry counters array and dirty bitmap of
 * one BRT vdev. The caller must hold brt_lock for writing (asserted).
 *
 * The new number of counters is vdev_get_min_asize() of the top-level vdev
 * divided by brt_rangesize, rounded up. The vdev is looked up with the
 * SCL_VDEV config lock held as reader.
 *
 * If the vdev is not yet initiated, the AVL tree of entries is created and
 * the previous arrays must be absent (asserted). Otherwise the old array and
 * bitmap are copied into the new allocations and then freed.
 * NOTE(review): the "else" separating those two cases is not visible here.
 *
 * Shrinking is not supported: the old size must not exceed the new one
 * (asserted).
 */
507 brt_vdev_realloc(brt_t
*brt
, brt_vdev_t
*brtvd
)
512 uint64_t nblocks
, size
;
514 ASSERT(RW_WRITE_HELD(&brt
->brt_lock
));
516 spa_config_enter(brt
->brt_spa
, SCL_VDEV
, FTAG
, RW_READER
);
517 vd
= vdev_lookup_top(brt
->brt_spa
, brtvd
->bv_vdevid
);
/* Ceiling division: number of brt_rangesize regions covering the vdev. */
518 size
= (vdev_get_min_asize(vd
) - 1) / brt
->brt_rangesize
+ 1;
519 spa_config_exit(brt
->brt_spa
, SCL_VDEV
, FTAG
);
/* Both allocations are zero-filled, so newly added regions start at 0. */
521 entcount
= vmem_zalloc(sizeof (entcount
[0]) * size
, KM_SLEEP
);
522 nblocks
= BRT_RANGESIZE_TO_NBLOCKS(size
);
523 bitmap
= kmem_zalloc(BT_SIZEOFMAP(nblocks
), KM_SLEEP
);
525 if (!brtvd
->bv_initiated
) {
526 ASSERT0(brtvd
->bv_size
);
527 ASSERT(brtvd
->bv_entcount
== NULL
);
528 ASSERT(brtvd
->bv_bitmap
== NULL
);
529 ASSERT0(brtvd
->bv_nblocks
);
531 avl_create(&brtvd
->bv_tree
, brt_entry_compare
,
532 sizeof (brt_entry_t
), offsetof(brt_entry_t
, bre_node
));
534 ASSERT(brtvd
->bv_size
> 0);
535 ASSERT(brtvd
->bv_entcount
!= NULL
);
536 ASSERT(brtvd
->bv_bitmap
!= NULL
);
537 ASSERT(brtvd
->bv_nblocks
> 0);
539 * TODO: Allow vdev shrinking. We only need to implement
540 * shrinking the on-disk BRT VDEV object.
541 * dmu_free_range(brt->brt_mos, brtvd->bv_mos_brtvdev, offset,
544 ASSERT3U(brtvd
->bv_size
, <=, size
);
/* Carry the existing counters and dirty bits over, then free the old ones. */
546 memcpy(entcount
, brtvd
->bv_entcount
,
547 sizeof (entcount
[0]) * MIN(size
, brtvd
->bv_size
));
548 memcpy(bitmap
, brtvd
->bv_bitmap
, MIN(BT_SIZEOFMAP(nblocks
),
549 BT_SIZEOFMAP(brtvd
->bv_nblocks
)));
550 vmem_free(brtvd
->bv_entcount
,
551 sizeof (entcount
[0]) * brtvd
->bv_size
);
552 kmem_free(brtvd
->bv_bitmap
, BT_SIZEOFMAP(brtvd
->bv_nblocks
));
555 brtvd
->bv_size
= size
;
556 brtvd
->bv_entcount
= entcount
;
557 brtvd
->bv_bitmap
= bitmap
;
558 brtvd
->bv_nblocks
= nblocks
;
559 if (!brtvd
->bv_initiated
) {
560 brtvd
->bv_need_byteswap
= FALSE
;
561 brtvd
->bv_initiated
= TRUE
;
562 BRT_DEBUG("BRT VDEV %llu initiated.",
563 (u_longlong_t
)brtvd
->bv_vdevid
);
/*
 * Load one BRT vdev from the MOS into memory.
 *
 * Steps visible here:
 *   1. Look up the pool directory entry named BRT_OBJECT_VDEV_PREFIX plus
 *      the vdev id to obtain bv_mos_brtvdev.
 *   2. Hold that object's bonus buffer and read it as brt_vdev_phys_t.
 *   3. Adopt bvp_rangesize as brt_rangesize if none is set yet; an assertion
 *      checks that the two agree.
 *   4. Allocate the in-memory arrays with brt_vdev_realloc().
 *   5. Read the on-disk counters into bv_entcount[], copy the remaining
 *      fields from the phys structure, and add this vdev's used/saved space
 *      to the pool-wide totals.
 *
 * NOTE(review): the handling of `error` after zap_lookup(), dmu_bonus_hold()
 * and dmu_read() is not visible in this view; confirm each failure is
 * handled before the results are used.
 */
568 brt_vdev_load(brt_t
*brt
, brt_vdev_t
*brtvd
)
572 brt_vdev_phys_t
*bvphys
;
575 snprintf(name
, sizeof (name
), "%s%llu", BRT_OBJECT_VDEV_PREFIX
,
576 (u_longlong_t
)brtvd
->bv_vdevid
);
577 error
= zap_lookup(brt
->brt_mos
, DMU_POOL_DIRECTORY_OBJECT
, name
,
578 sizeof (uint64_t), 1, &brtvd
->bv_mos_brtvdev
);
581 ASSERT(brtvd
->bv_mos_brtvdev
!= 0);
583 error
= dmu_bonus_hold(brt
->brt_mos
, brtvd
->bv_mos_brtvdev
, FTAG
, &db
);
588 bvphys
= db
->db_data
;
589 if (brt
->brt_rangesize
== 0) {
590 brt
->brt_rangesize
= bvphys
->bvp_rangesize
;
592 ASSERT3U(brt
->brt_rangesize
, ==, bvphys
->bvp_rangesize
);
595 ASSERT(!brtvd
->bv_initiated
);
596 brt_vdev_realloc(brt
, brtvd
);
598 /* TODO: We don't support VDEV shrinking. */
599 ASSERT3U(bvphys
->bvp_size
, <=, brtvd
->bv_size
);
602 * If VDEV grew, we will leave new bv_entcount[] entries zeroed out.
604 error
= dmu_read(brt
->brt_mos
, brtvd
->bv_mos_brtvdev
, 0,
605 MIN(brtvd
->bv_size
, bvphys
->bvp_size
) * sizeof (uint16_t),
606 brtvd
->bv_entcount
, DMU_READ_NO_PREFETCH
);
609 brtvd
->bv_mos_entries
= bvphys
->bvp_mos_entries
;
610 ASSERT(brtvd
->bv_mos_entries
!= 0);
/* Counters are kept in their on-disk byte order; remember if it differs. */
611 brtvd
->bv_need_byteswap
=
612 (bvphys
->bvp_byteorder
!= BRT_NATIVE_BYTEORDER
);
613 brtvd
->bv_totalcount
= bvphys
->bvp_totalcount
;
614 brtvd
->bv_usedspace
= bvphys
->bvp_usedspace
;
615 brtvd
->bv_savedspace
= bvphys
->bvp_savedspace
;
616 brt
->brt_usedspace
+= brtvd
->bv_usedspace
;
617 brt
->brt_savedspace
+= brtvd
->bv_savedspace
;
619 dmu_buf_rele(db
, FTAG
);
621 BRT_DEBUG("MOS BRT VDEV %s loaded: mos_brtvdev=%llu, mos_entries=%llu",
622 name
, (u_longlong_t
)brtvd
->bv_mos_brtvdev
,
623 (u_longlong_t
)brtvd
->bv_mos_entries
);
/*
 * Release the in-memory state of an initiated BRT vdev: free the counters
 * array and the bitmap, destroy the (asserted empty) AVL tree and clear
 * bv_initiated. The caller must hold brt_lock for writing (asserted).
 *
 * NOTE(review): only bv_nblocks is visibly reset to 0 here. A later
 * brt_vdev_realloc() on a non-initiated vdev asserts bv_size == 0, so
 * confirm bv_size is also reset.
 */
627 brt_vdev_dealloc(brt_t
*brt
, brt_vdev_t
*brtvd
)
630 ASSERT(RW_WRITE_HELD(&brt
->brt_lock
));
631 ASSERT(brtvd
->bv_initiated
);
633 vmem_free(brtvd
->bv_entcount
, sizeof (uint16_t) * brtvd
->bv_size
);
634 brtvd
->bv_entcount
= NULL
;
635 kmem_free(brtvd
->bv_bitmap
, BT_SIZEOFMAP(brtvd
->bv_nblocks
));
636 brtvd
->bv_bitmap
= NULL
;
637 ASSERT0(avl_numnodes(&brtvd
->bv_tree
));
638 avl_destroy(&brtvd
->bv_tree
);
641 brtvd
->bv_nblocks
= 0;
643 brtvd
->bv_initiated
= FALSE
;
644 BRT_DEBUG("BRT VDEV %llu deallocated.", (u_longlong_t
)brtvd
->bv_vdevid
);
/*
 * Remove a BRT vdev from disk and from memory, within transaction tx.
 * This is the inverse of brt_vdev_create(): destroy the entries ZAP, free
 * the BRT vdev DMU object, remove the pool directory entry, release the
 * in-memory state with brt_vdev_dealloc() and decrement the
 * SPA_FEATURE_BLOCK_CLONING reference count.
 *
 * The caller must hold brt_lock for writing and both MOS objects must exist
 * (asserted). The on-disk totalcount, usedspace and savedspace are asserted
 * to be zero before the object is freed.
 * NOTE(review): zap_count() fills `count`, but the check that consumes it
 * is not visible here; presumably the ZAP is required to be empty.
 */
648 brt_vdev_destroy(brt_t
*brt
, brt_vdev_t
*brtvd
, dmu_tx_t
*tx
)
653 brt_vdev_phys_t
*bvphys
;
655 ASSERT(RW_WRITE_HELD(&brt
->brt_lock
));
656 ASSERT(brtvd
->bv_mos_brtvdev
!= 0);
657 ASSERT(brtvd
->bv_mos_entries
!= 0);
659 VERIFY0(zap_count(brt
->brt_mos
, brtvd
->bv_mos_entries
, &count
));
661 VERIFY0(zap_destroy(brt
->brt_mos
, brtvd
->bv_mos_entries
, tx
));
662 BRT_DEBUG("MOS entries destroyed, object=%llu",
663 (u_longlong_t
)brtvd
->bv_mos_entries
);
664 brtvd
->bv_mos_entries
= 0;
666 VERIFY0(dmu_bonus_hold(brt
->brt_mos
, brtvd
->bv_mos_brtvdev
, FTAG
, &db
));
667 bvphys
= db
->db_data
;
668 ASSERT0(bvphys
->bvp_totalcount
);
669 ASSERT0(bvphys
->bvp_usedspace
);
670 ASSERT0(bvphys
->bvp_savedspace
);
671 dmu_buf_rele(db
, FTAG
);
673 VERIFY0(dmu_object_free(brt
->brt_mos
, brtvd
->bv_mos_brtvdev
, tx
));
674 BRT_DEBUG("MOS BRT VDEV destroyed, object=%llu",
675 (u_longlong_t
)brtvd
->bv_mos_brtvdev
);
676 brtvd
->bv_mos_brtvdev
= 0;
678 snprintf(name
, sizeof (name
), "%s%llu", BRT_OBJECT_VDEV_PREFIX
,
679 (u_longlong_t
)brtvd
->bv_vdevid
);
680 VERIFY0(zap_remove(brt
->brt_mos
, DMU_POOL_DIRECTORY_OBJECT
, name
, tx
));
681 BRT_DEBUG("Pool directory object removed, object=%s", name
);
683 brt_vdev_dealloc(brt
, brtvd
);
685 spa_feature_decr(brt
->brt_spa
, SPA_FEATURE_BLOCK_CLONING
, tx
);
/*
 * Grow brt->brt_vdevs[] to hold nvdevs slots.
 * The caller must hold brt_lock for writing and nvdevs must exceed the
 * current brt_nvdevs (both asserted).
 *
 * A new zero-filled array is allocated. Existing slots are copied over with
 * memcpy() and the old array is freed. Each added slot gets its bv_vdevid
 * set to its index and bv_initiated set to FALSE.
 * Because the array is replaced, brt_vdev_t pointers obtained before this
 * call refer to freed memory afterwards.
 */
689 brt_vdevs_expand(brt_t
*brt
, uint64_t nvdevs
)
691 brt_vdev_t
*brtvd
, *vdevs
;
694 ASSERT(RW_WRITE_HELD(&brt
->brt_lock
));
695 ASSERT3U(nvdevs
, >, brt
->brt_nvdevs
);
697 vdevs
= kmem_zalloc(sizeof (vdevs
[0]) * nvdevs
, KM_SLEEP
);
698 if (brt
->brt_nvdevs
> 0) {
699 ASSERT(brt
->brt_vdevs
!= NULL
);
701 memcpy(vdevs
, brt
->brt_vdevs
,
702 sizeof (brt_vdev_t
) * brt
->brt_nvdevs
);
703 kmem_free(brt
->brt_vdevs
,
704 sizeof (brt_vdev_t
) * brt
->brt_nvdevs
);
/* Initialise only the slots that were added. */
706 for (vdevid
= brt
->brt_nvdevs
; vdevid
< nvdevs
; vdevid
++) {
707 brtvd
= &vdevs
[vdevid
];
709 brtvd
->bv_vdevid
= vdevid
;
710 brtvd
->bv_initiated
= FALSE
;
713 BRT_DEBUG("BRT VDEVs expanded from %llu to %llu.",
714 (u_longlong_t
)brt
->brt_nvdevs
, (u_longlong_t
)nvdevs
);
716 brt
->brt_vdevs
= vdevs
;
717 brt
->brt_nvdevs
= nvdevs
;
/*
 * Check the entry counters array for the region containing bre->bre_offset.
 * The caller must hold brt_lock in either mode (asserted).
 *
 * The region index is bre_offset divided by brt_rangesize. When the counters
 * array exists and covers that index, the result is whether the region's
 * counter is non-zero.
 * NOTE(review): the return value for the other case (no array, or index
 * beyond bv_size) is not visible here.
 */
721 brt_vdev_lookup(brt_t
*brt
, brt_vdev_t
*brtvd
, const brt_entry_t
*bre
)
725 ASSERT(RW_LOCK_HELD(&brt
->brt_lock
));
727 idx
= bre
->bre_offset
/ brt
->brt_rangesize
;
728 if (brtvd
->bv_entcount
!= NULL
&& idx
< brtvd
->bv_size
) {
729 /* VDEV wasn't expanded. */
730 return (brt_vdev_entcount_get(brtvd
, idx
) > 0);
/*
 * Account for one added reference to the block described by bre.
 * The parameter list continues past the lines visible here; `dsize` used
 * below is one of the parameters.
 *
 * Saved space (pool-wide and per-vdev) always grows by dsize and the vdev
 * metadata is marked dirty. The remaining work - used space, bv_totalcount,
 * the region's entry counter and the dirty bitmap - follows the check
 * bre_refcount > 1.
 * NOTE(review): the body of that check is not visible; presumably it
 * returns, so the remaining work runs only for an entry's first reference.
 *
 * If the region index is beyond bv_size the vdev is assumed to have been
 * expanded, and the arrays are grown with brt_vdev_realloc().
 *
 * NOTE(review): this function asserts only RW_LOCK_HELD, yet it modifies
 * shared state and may call brt_vdev_realloc(), which asserts
 * RW_WRITE_HELD. Confirm all callers hold the write lock.
 */
737 brt_vdev_addref(brt_t
*brt
, brt_vdev_t
*brtvd
, const brt_entry_t
*bre
,
742 ASSERT(RW_LOCK_HELD(&brt
->brt_lock
));
743 ASSERT(brtvd
!= NULL
);
744 ASSERT(brtvd
->bv_entcount
!= NULL
);
746 brt
->brt_savedspace
+= dsize
;
747 brtvd
->bv_savedspace
+= dsize
;
748 brtvd
->bv_meta_dirty
= TRUE
;
750 if (bre
->bre_refcount
> 1) {
754 brt
->brt_usedspace
+= dsize
;
755 brtvd
->bv_usedspace
+= dsize
;
757 idx
= bre
->bre_offset
/ brt
->brt_rangesize
;
758 if (idx
>= brtvd
->bv_size
) {
759 /* VDEV has been expanded. */
760 brt_vdev_realloc(brt
, brtvd
);
763 ASSERT3U(idx
, <, brtvd
->bv_size
);
765 brtvd
->bv_totalcount
++;
766 brt_vdev_entcount_inc(brtvd
, idx
);
767 brtvd
->bv_entcount_dirty
= TRUE
;
/*
 * Convert the counter index into a dirty-bitmap bit index.
 * NOTE(review): confirm this divisor agrees with how bv_nblocks is derived
 * by BRT_RANGESIZE_TO_NBLOCKS() in brt_vdev_realloc().
 */
768 idx
= idx
/ BRT_BLOCKSIZE
/ 8;
769 BT_SET(brtvd
->bv_bitmap
, idx
);
/*
 * Account for one dropped reference to the block described by bre.
 * The parameter list continues past the lines visible here; `dsize` used
 * below is one of the parameters. The caller must hold brt_lock for writing
 * (asserted).
 *
 * Saved space (pool-wide and per-vdev) always shrinks by dsize and the vdev
 * metadata is marked dirty. The remaining work - used space, bv_totalcount,
 * the region's entry counter and the dirty bitmap - follows the check
 * bre_refcount > 0.
 * NOTE(review): the body of that check is not visible; presumably it
 * returns, so the remaining work runs only once the count reaches zero.
 */
777 brt_vdev_decref(brt_t
*brt
, brt_vdev_t
*brtvd
, const brt_entry_t
*bre
,
782 ASSERT(RW_WRITE_HELD(&brt
->brt_lock
));
783 ASSERT(brtvd
!= NULL
);
784 ASSERT(brtvd
->bv_entcount
!= NULL
);
786 brt
->brt_savedspace
-= dsize
;
787 brtvd
->bv_savedspace
-= dsize
;
788 brtvd
->bv_meta_dirty
= TRUE
;
790 if (bre
->bre_refcount
> 0) {
794 brt
->brt_usedspace
-= dsize
;
795 brtvd
->bv_usedspace
-= dsize
;
797 idx
= bre
->bre_offset
/ brt
->brt_rangesize
;
798 ASSERT3U(idx
, <, brtvd
->bv_size
);
800 ASSERT(brtvd
->bv_totalcount
> 0);
801 brtvd
->bv_totalcount
--;
802 brt_vdev_entcount_dec(brtvd
, idx
);
803 brtvd
->bv_entcount_dirty
= TRUE
;
/* Same counter-index to bitmap-bit conversion as in brt_vdev_addref(). */
804 idx
= idx
/ BRT_BLOCKSIZE
/ 8;
805 BT_SET(brtvd
->bv_bitmap
, idx
);
/*
 * Write one dirty BRT vdev to the MOS in syncing context.
 * Preconditions (asserted): bv_meta_dirty is set, the BRT vdev object
 * exists, and tx is a syncing transaction.
 *
 * If the counters are dirty, the whole bv_entcount[] array is written at
 * offset 0 of the object, then the dirty bitmap and bv_entcount_dirty are
 * cleared. After that the bonus buffer (brt_vdev_phys_t) is dirtied and
 * refreshed from the in-memory fields, and bv_meta_dirty is cleared.
 */
813 brt_vdev_sync(brt_t
*brt
, brt_vdev_t
*brtvd
, dmu_tx_t
*tx
)
816 brt_vdev_phys_t
*bvphys
;
818 ASSERT(brtvd
->bv_meta_dirty
);
819 ASSERT(brtvd
->bv_mos_brtvdev
!= 0);
820 ASSERT(dmu_tx_is_syncing(tx
));
822 VERIFY0(dmu_bonus_hold(brt
->brt_mos
, brtvd
->bv_mos_brtvdev
, FTAG
, &db
));
824 if (brtvd
->bv_entcount_dirty
) {
826 * TODO: Walk brtvd->bv_bitmap and write only the dirty blocks.
828 dmu_write(brt
->brt_mos
, brtvd
->bv_mos_brtvdev
, 0,
829 brtvd
->bv_size
* sizeof (brtvd
->bv_entcount
[0]),
830 brtvd
->bv_entcount
, tx
);
831 memset(brtvd
->bv_bitmap
, 0, BT_SIZEOFMAP(brtvd
->bv_nblocks
));
832 brtvd
->bv_entcount_dirty
= FALSE
;
835 dmu_buf_will_dirty(db
, tx
);
836 bvphys
= db
->db_data
;
837 bvphys
->bvp_mos_entries
= brtvd
->bv_mos_entries
;
838 bvphys
->bvp_size
= brtvd
->bv_size
;
/*
 * The counters were written exactly as held in memory, so the recorded
 * byte order is chosen from bv_need_byteswap.
 */
839 if (brtvd
->bv_need_byteswap
) {
840 bvphys
->bvp_byteorder
= BRT_NON_NATIVE_BYTEORDER
;
842 bvphys
->bvp_byteorder
= BRT_NATIVE_BYTEORDER
;
844 bvphys
->bvp_totalcount
= brtvd
->bv_totalcount
;
845 bvphys
->bvp_rangesize
= brt
->brt_rangesize
;
846 bvphys
->bvp_usedspace
= brtvd
->bv_usedspace
;
847 bvphys
->bvp_savedspace
= brtvd
->bv_savedspace
;
848 dmu_buf_rele(db
, FTAG
);
850 brtvd
->bv_meta_dirty
= FALSE
;
/*
 * Size brt->brt_vdevs[] to the number of children of the pool's root vdev,
 * then visit every slot (asserting that no counters array is allocated yet)
 * and call brt_vdev_load() on it. If brt_rangesize is still 0 afterwards it
 * defaults to BRT_RANGESIZE.
 * NOTE(review): how the `load` parameter gates the loop or the
 * brt_vdev_load() call is not visible here, nor is any locking done by this
 * function.
 */
854 brt_vdevs_alloc(brt_t
*brt
, boolean_t load
)
861 brt_vdevs_expand(brt
, brt
->brt_spa
->spa_root_vdev
->vdev_children
);
864 for (vdevid
= 0; vdevid
< brt
->brt_nvdevs
; vdevid
++) {
865 brtvd
= &brt
->brt_vdevs
[vdevid
];
866 ASSERT(brtvd
->bv_entcount
== NULL
);
868 brt_vdev_load(brt
, brtvd
);
872 if (brt
->brt_rangesize
== 0) {
873 brt
->brt_rangesize
= BRT_RANGESIZE
;
/*
 * Release the in-memory state of every initiated BRT vdev with
 * brt_vdev_dealloc(), then free the brt_vdevs[] array itself.
 * NOTE(review): brt_vdev_dealloc() asserts the write lock is held; the
 * locking around this loop is not visible here.
 */
880 brt_vdevs_free(brt_t
*brt
)
887 for (vdevid
= 0; vdevid
< brt
->brt_nvdevs
; vdevid
++) {
888 brtvd
= &brt
->brt_vdevs
[vdevid
];
889 if (brtvd
->bv_initiated
)
890 brt_vdev_dealloc(brt
, brtvd
);
892 kmem_free(brt
->brt_vdevs
, sizeof (brt_vdev_t
) * brt
->brt_nvdevs
);
/*
 * Initialise a BRT entry from a block pointer.
 * Only the first DVA (blk_dva[0]) is used: its offset becomes bre_offset and
 * its vdev id is returned through *vdevidp. bre_refcount starts at 0.
 */
898 brt_entry_fill(const blkptr_t
*bp
, brt_entry_t
*bre
, uint64_t *vdevidp
)
901 bre
->bre_offset
= DVA_GET_OFFSET(&bp
->blk_dva
[0]);
902 bre
->bre_refcount
= 0;
904 *vdevidp
= DVA_GET_VDEV(&bp
->blk_dva
[0]);
/*
 * Comparator for the per-vdev tree of BRT entries (see the avl_create()
 * call in brt_vdev_realloc()): orders entries by bre_offset using
 * TREE_CMP().
 */
908 brt_entry_compare(const void *x1
, const void *x2
)
910 const brt_entry_t
*bre1
= x1
;
911 const brt_entry_t
*bre2
= x2
;
913 return (TREE_CMP(bre1
->bre_offset
, bre2
->bre_offset
));
/*
 * Load the reference count for bre->bre_offset from the on-disk entries ZAP
 * into bre->bre_refcount.
 * The caller must hold brt_lock in either mode on entry (asserted).
 *
 * ENOENT is returned without touching the ZAP when brt_vdev_lookup() reports
 * no entries for the region, or when the vdev has no entries ZAP
 * (bv_mos_entries == 0).
 *
 * NOTE(review): the existing comment below speaks of reacquiring the BRT
 * lock, but the unlock/relock calls, the checks on `error`, and the final
 * return are not visible here. Also confirm that reading brtvd->bv_vdevid in
 * the BRT_DEBUG() call is safe, given that comment says brtvd may be
 * invalid by then.
 */
917 brt_entry_lookup(brt_t
*brt
, brt_vdev_t
*brtvd
, brt_entry_t
*bre
)
919 uint64_t mos_entries
;
920 uint64_t one
, physsize
;
923 ASSERT(RW_LOCK_HELD(&brt
->brt_lock
));
925 if (!brt_vdev_lookup(brt
, brtvd
, bre
))
926 return (SET_ERROR(ENOENT
));
929 * Remember mos_entries object number. After we reacquire the BRT lock,
930 * the brtvd pointer may be invalid.
932 mos_entries
= brtvd
->bv_mos_entries
;
933 if (mos_entries
== 0)
934 return (SET_ERROR(ENOENT
));
/* Query the stored value's shape; the assertions expect one refcount. */
938 error
= zap_length_uint64(brt
->brt_mos
, mos_entries
, &bre
->bre_offset
,
939 BRT_KEY_WORDS
, &one
, &physsize
);
941 ASSERT3U(one
, ==, 1);
942 ASSERT3U(physsize
, ==, sizeof (bre
->bre_refcount
));
944 error
= zap_lookup_uint64(brt
->brt_mos
, mos_entries
,
945 &bre
->bre_offset
, BRT_KEY_WORDS
, 1,
946 sizeof (bre
->bre_refcount
), &bre
->bre_refcount
);
947 BRT_DEBUG("ZAP lookup: object=%llu vdev=%llu offset=%llu "
948 "count=%llu error=%d", (u_longlong_t
)mos_entries
,
949 (u_longlong_t
)brtvd
->bv_vdevid
,
950 (u_longlong_t
)bre
->bre_offset
,
951 error
== 0 ? (u_longlong_t
)bre
->bre_refcount
: 0, error
);
960 brt_entry_prefetch(brt_t
*brt
, uint64_t vdevid
, brt_entry_t
*bre
)
963 uint64_t mos_entries
= 0;
966 brtvd
= brt_vdev(brt
, vdevid
);
968 mos_entries
= brtvd
->bv_mos_entries
;
971 if (mos_entries
== 0)
974 BRT_DEBUG("ZAP prefetch: object=%llu vdev=%llu offset=%llu",
975 (u_longlong_t
)mos_entries
, (u_longlong_t
)vdevid
,
976 (u_longlong_t
)bre
->bre_offset
);
977 (void) zap_prefetch_uint64(brt
->brt_mos
, mos_entries
,
978 (uint64_t *)&bre
->bre_offset
, BRT_KEY_WORDS
);
982 brt_entry_update(brt_t
*brt
, brt_vdev_t
*brtvd
, brt_entry_t
*bre
, dmu_tx_t
*tx
)
986 ASSERT(RW_LOCK_HELD(&brt
->brt_lock
));
987 ASSERT(brtvd
->bv_mos_entries
!= 0);
988 ASSERT(bre
->bre_refcount
> 0);
990 error
= zap_update_uint64(brt
->brt_mos
, brtvd
->bv_mos_entries
,
991 (uint64_t *)&bre
->bre_offset
, BRT_KEY_WORDS
, 1,
992 sizeof (bre
->bre_refcount
), &bre
->bre_refcount
, tx
);
993 BRT_DEBUG("ZAP update: object=%llu vdev=%llu offset=%llu count=%llu "
994 "error=%d", (u_longlong_t
)brtvd
->bv_mos_entries
,
995 (u_longlong_t
)brtvd
->bv_vdevid
, (u_longlong_t
)bre
->bre_offset
,
996 (u_longlong_t
)bre
->bre_refcount
, error
);
1002 brt_entry_remove(brt_t
*brt
, brt_vdev_t
*brtvd
, brt_entry_t
*bre
, dmu_tx_t
*tx
)
1006 ASSERT(RW_LOCK_HELD(&brt
->brt_lock
));
1007 ASSERT(brtvd
->bv_mos_entries
!= 0);
1008 ASSERT0(bre
->bre_refcount
);
1010 error
= zap_remove_uint64(brt
->brt_mos
, brtvd
->bv_mos_entries
,
1011 (uint64_t *)&bre
->bre_offset
, BRT_KEY_WORDS
, tx
);
1012 BRT_DEBUG("ZAP remove: object=%llu vdev=%llu offset=%llu count=%llu "
1013 "error=%d", (u_longlong_t
)brtvd
->bv_mos_entries
,
1014 (u_longlong_t
)brtvd
->bv_vdevid
, (u_longlong_t
)bre
->bre_offset
,
1015 (u_longlong_t
)bre
->bre_refcount
, error
);
1021 * Return TRUE if we _can_ have BRT entry for this bp. It might be false
1022 * positive, but gives us quick answer if we should look into BRT, which
1023 * may require reads and thus will be more expensive.
1026 brt_maybe_exists(spa_t
*spa
, const blkptr_t
*bp
)
1028 brt_t
*brt
= spa
->spa_brt
;
1030 brt_entry_t bre_search
;
1031 boolean_t mayexists
= FALSE
;
1034 brt_entry_fill(bp
, &bre_search
, &vdevid
);
1038 brtvd
= brt_vdev(brt
, vdevid
);
1039 if (brtvd
!= NULL
&& brtvd
->bv_initiated
) {
1040 if (!avl_is_empty(&brtvd
->bv_tree
) ||
1041 brt_vdev_lookup(brt
, brtvd
, &bre_search
)) {
1052 brt_get_dspace(spa_t
*spa
)
1054 brt_t
*brt
= spa
->spa_brt
;
1059 return (brt
->brt_savedspace
);
1063 brt_get_used(spa_t
*spa
)
1065 brt_t
*brt
= spa
->spa_brt
;
1070 return (brt
->brt_usedspace
);
1074 brt_get_saved(spa_t
*spa
)
1076 brt_t
*brt
= spa
->spa_brt
;
1081 return (brt
->brt_savedspace
);
1085 brt_get_ratio(spa_t
*spa
)
1087 brt_t
*brt
= spa
->spa_brt
;
1089 if (brt
->brt_usedspace
== 0)
1092 return ((brt
->brt_usedspace
+ brt
->brt_savedspace
) * 100 /
1093 brt
->brt_usedspace
);
1097 brt_kstats_update(kstat_t
*ksp
, int rw
)
1099 brt_stats_t
*bs
= ksp
->ks_data
;
1101 if (rw
== KSTAT_WRITE
)
1104 bs
->brt_addref_entry_in_memory
.value
.ui64
=
1105 wmsum_value(&brt_sums
.brt_addref_entry_in_memory
);
1106 bs
->brt_addref_entry_not_on_disk
.value
.ui64
=
1107 wmsum_value(&brt_sums
.brt_addref_entry_not_on_disk
);
1108 bs
->brt_addref_entry_on_disk
.value
.ui64
=
1109 wmsum_value(&brt_sums
.brt_addref_entry_on_disk
);
1110 bs
->brt_addref_entry_read_lost_race
.value
.ui64
=
1111 wmsum_value(&brt_sums
.brt_addref_entry_read_lost_race
);
1112 bs
->brt_decref_entry_in_memory
.value
.ui64
=
1113 wmsum_value(&brt_sums
.brt_decref_entry_in_memory
);
1114 bs
->brt_decref_entry_loaded_from_disk
.value
.ui64
=
1115 wmsum_value(&brt_sums
.brt_decref_entry_loaded_from_disk
);
1116 bs
->brt_decref_entry_not_in_memory
.value
.ui64
=
1117 wmsum_value(&brt_sums
.brt_decref_entry_not_in_memory
);
1118 bs
->brt_decref_entry_not_on_disk
.value
.ui64
=
1119 wmsum_value(&brt_sums
.brt_decref_entry_not_on_disk
);
1120 bs
->brt_decref_entry_read_lost_race
.value
.ui64
=
1121 wmsum_value(&brt_sums
.brt_decref_entry_read_lost_race
);
1122 bs
->brt_decref_entry_still_referenced
.value
.ui64
=
1123 wmsum_value(&brt_sums
.brt_decref_entry_still_referenced
);
1124 bs
->brt_decref_free_data_later
.value
.ui64
=
1125 wmsum_value(&brt_sums
.brt_decref_free_data_later
);
1126 bs
->brt_decref_free_data_now
.value
.ui64
=
1127 wmsum_value(&brt_sums
.brt_decref_free_data_now
);
1128 bs
->brt_decref_no_entry
.value
.ui64
=
1129 wmsum_value(&brt_sums
.brt_decref_no_entry
);
1138 wmsum_init(&brt_sums
.brt_addref_entry_in_memory
, 0);
1139 wmsum_init(&brt_sums
.brt_addref_entry_not_on_disk
, 0);
1140 wmsum_init(&brt_sums
.brt_addref_entry_on_disk
, 0);
1141 wmsum_init(&brt_sums
.brt_addref_entry_read_lost_race
, 0);
1142 wmsum_init(&brt_sums
.brt_decref_entry_in_memory
, 0);
1143 wmsum_init(&brt_sums
.brt_decref_entry_loaded_from_disk
, 0);
1144 wmsum_init(&brt_sums
.brt_decref_entry_not_in_memory
, 0);
1145 wmsum_init(&brt_sums
.brt_decref_entry_not_on_disk
, 0);
1146 wmsum_init(&brt_sums
.brt_decref_entry_read_lost_race
, 0);
1147 wmsum_init(&brt_sums
.brt_decref_entry_still_referenced
, 0);
1148 wmsum_init(&brt_sums
.brt_decref_free_data_later
, 0);
1149 wmsum_init(&brt_sums
.brt_decref_free_data_now
, 0);
1150 wmsum_init(&brt_sums
.brt_decref_no_entry
, 0);
1152 brt_ksp
= kstat_create("zfs", 0, "brtstats", "misc", KSTAT_TYPE_NAMED
,
1153 sizeof (brt_stats
) / sizeof (kstat_named_t
), KSTAT_FLAG_VIRTUAL
);
1154 if (brt_ksp
!= NULL
) {
1155 brt_ksp
->ks_data
= &brt_stats
;
1156 brt_ksp
->ks_update
= brt_kstats_update
;
1157 kstat_install(brt_ksp
);
1164 if (brt_ksp
!= NULL
) {
1165 kstat_delete(brt_ksp
);
1169 wmsum_fini(&brt_sums
.brt_addref_entry_in_memory
);
1170 wmsum_fini(&brt_sums
.brt_addref_entry_not_on_disk
);
1171 wmsum_fini(&brt_sums
.brt_addref_entry_on_disk
);
1172 wmsum_fini(&brt_sums
.brt_addref_entry_read_lost_race
);
1173 wmsum_fini(&brt_sums
.brt_decref_entry_in_memory
);
1174 wmsum_fini(&brt_sums
.brt_decref_entry_loaded_from_disk
);
1175 wmsum_fini(&brt_sums
.brt_decref_entry_not_in_memory
);
1176 wmsum_fini(&brt_sums
.brt_decref_entry_not_on_disk
);
1177 wmsum_fini(&brt_sums
.brt_decref_entry_read_lost_race
);
1178 wmsum_fini(&brt_sums
.brt_decref_entry_still_referenced
);
1179 wmsum_fini(&brt_sums
.brt_decref_free_data_later
);
1180 wmsum_fini(&brt_sums
.brt_decref_free_data_now
);
1181 wmsum_fini(&brt_sums
.brt_decref_no_entry
);
1187 brt_entry_cache
= kmem_cache_create("brt_entry_cache",
1188 sizeof (brt_entry_t
), 0, NULL
, NULL
, NULL
, NULL
, NULL
, 0);
1189 brt_pending_entry_cache
= kmem_cache_create("brt_pending_entry_cache",
1190 sizeof (brt_pending_entry_t
), 0, NULL
, NULL
, NULL
, NULL
, NULL
, 0);
1200 kmem_cache_destroy(brt_entry_cache
);
1201 kmem_cache_destroy(brt_pending_entry_cache
);
1204 static brt_entry_t
*
1205 brt_entry_alloc(const brt_entry_t
*bre_init
)
1209 bre
= kmem_cache_alloc(brt_entry_cache
, KM_SLEEP
);
1210 bre
->bre_offset
= bre_init
->bre_offset
;
1211 bre
->bre_refcount
= bre_init
->bre_refcount
;
1217 brt_entry_free(brt_entry_t
*bre
)
1220 kmem_cache_free(brt_entry_cache
, bre
);
1224 brt_entry_addref(brt_t
*brt
, const blkptr_t
*bp
)
1227 brt_entry_t
*bre
, *racebre
;
1228 brt_entry_t bre_search
;
1233 ASSERT(!RW_WRITE_HELD(&brt
->brt_lock
));
1235 brt_entry_fill(bp
, &bre_search
, &vdevid
);
1239 brtvd
= brt_vdev(brt
, vdevid
);
1240 if (brtvd
== NULL
) {
1241 ASSERT3U(vdevid
, >=, brt
->brt_nvdevs
);
1243 /* New VDEV was added. */
1244 brt_vdevs_expand(brt
, vdevid
+ 1);
1245 brtvd
= brt_vdev(brt
, vdevid
);
1247 ASSERT(brtvd
!= NULL
);
1248 if (!brtvd
->bv_initiated
)
1249 brt_vdev_realloc(brt
, brtvd
);
1251 bre
= avl_find(&brtvd
->bv_tree
, &bre_search
, NULL
);
1253 BRTSTAT_BUMP(brt_addref_entry_in_memory
);
1256 * brt_entry_lookup() may drop the BRT (read) lock and
1257 * reacquire it (write).
1259 error
= brt_entry_lookup(brt
, brtvd
, &bre_search
);
1260 /* bre_search now contains correct bre_refcount */
1261 ASSERT(error
== 0 || error
== ENOENT
);
1263 BRTSTAT_BUMP(brt_addref_entry_on_disk
);
1265 BRTSTAT_BUMP(brt_addref_entry_not_on_disk
);
1267 * When the BRT lock was dropped, brt_vdevs[] may have been
1268 * expanded and reallocated, we need to update brtvd's pointer.
1270 brtvd
= brt_vdev(brt
, vdevid
);
1271 ASSERT(brtvd
!= NULL
);
1273 racebre
= avl_find(&brtvd
->bv_tree
, &bre_search
, &where
);
1274 if (racebre
== NULL
) {
1275 bre
= brt_entry_alloc(&bre_search
);
1276 ASSERT(RW_WRITE_HELD(&brt
->brt_lock
));
1277 avl_insert(&brtvd
->bv_tree
, bre
, where
);
1278 brt
->brt_nentries
++;
1281 * The entry was added when the BRT lock was dropped in
1282 * brt_entry_lookup().
1284 BRTSTAT_BUMP(brt_addref_entry_read_lost_race
);
1288 bre
->bre_refcount
++;
1289 brt_vdev_addref(brt
, brtvd
, bre
, bp_get_dsize(brt
->brt_spa
, bp
));
1294 /* Return TRUE if block should be freed immediately. */
1296 brt_entry_decref(spa_t
*spa
, const blkptr_t
*bp
)
1298 brt_t
*brt
= spa
->spa_brt
;
1300 brt_entry_t
*bre
, *racebre
;
1301 brt_entry_t bre_search
;
1306 brt_entry_fill(bp
, &bre_search
, &vdevid
);
1310 brtvd
= brt_vdev(brt
, vdevid
);
1311 ASSERT(brtvd
!= NULL
);
1313 bre
= avl_find(&brtvd
->bv_tree
, &bre_search
, NULL
);
1315 BRTSTAT_BUMP(brt_decref_entry_in_memory
);
1318 BRTSTAT_BUMP(brt_decref_entry_not_in_memory
);
1322 * brt_entry_lookup() may drop the BRT lock and reacquire it.
1324 error
= brt_entry_lookup(brt
, brtvd
, &bre_search
);
1325 /* bre_search now contains correct bre_refcount */
1326 ASSERT(error
== 0 || error
== ENOENT
);
1328 * When the BRT lock was dropped, brt_vdevs[] may have been expanded
1329 * and reallocated, we need to update brtvd's pointer.
1331 brtvd
= brt_vdev(brt
, vdevid
);
1332 ASSERT(brtvd
!= NULL
);
1334 if (error
== ENOENT
) {
1335 BRTSTAT_BUMP(brt_decref_entry_not_on_disk
);
1340 racebre
= avl_find(&brtvd
->bv_tree
, &bre_search
, &where
);
1341 if (racebre
!= NULL
) {
1343 * The entry was added when the BRT lock was dropped in
1344 * brt_entry_lookup().
1346 BRTSTAT_BUMP(brt_decref_entry_read_lost_race
);
1351 BRTSTAT_BUMP(brt_decref_entry_loaded_from_disk
);
1352 bre
= brt_entry_alloc(&bre_search
);
1353 ASSERT(RW_WRITE_HELD(&brt
->brt_lock
));
1354 avl_insert(&brtvd
->bv_tree
, bre
, where
);
1355 brt
->brt_nentries
++;
1360 * This is a free of a regular (not cloned) block.
1363 BRTSTAT_BUMP(brt_decref_no_entry
);
1366 if (bre
->bre_refcount
== 0) {
1368 BRTSTAT_BUMP(brt_decref_free_data_now
);
1372 ASSERT(bre
->bre_refcount
> 0);
1373 bre
->bre_refcount
--;
1374 if (bre
->bre_refcount
== 0)
1375 BRTSTAT_BUMP(brt_decref_free_data_later
);
1377 BRTSTAT_BUMP(brt_decref_entry_still_referenced
);
1378 brt_vdev_decref(brt
, brtvd
, bre
, bp_get_dsize(brt
->brt_spa
, bp
));
1386 brt_entry_get_refcount(spa_t
*spa
, const blkptr_t
*bp
)
1388 brt_t
*brt
= spa
->spa_brt
;
1390 brt_entry_t bre_search
, *bre
;
1391 uint64_t vdevid
, refcnt
;
1394 brt_entry_fill(bp
, &bre_search
, &vdevid
);
1398 brtvd
= brt_vdev(brt
, vdevid
);
1399 ASSERT(brtvd
!= NULL
);
1401 bre
= avl_find(&brtvd
->bv_tree
, &bre_search
, NULL
);
1403 error
= brt_entry_lookup(brt
, brtvd
, &bre_search
);
1404 ASSERT(error
== 0 || error
== ENOENT
);
1405 if (error
== ENOENT
)
1408 refcnt
= bre_search
.bre_refcount
;
1410 refcnt
= bre
->bre_refcount
;
1417 brt_prefetch(brt_t
*brt
, const blkptr_t
*bp
)
1424 if (!zfs_brt_prefetch
)
1427 brt_entry_fill(bp
, &bre
, &vdevid
);
1429 brt_entry_prefetch(brt
, vdevid
, &bre
);
1433 brt_pending_entry_compare(const void *x1
, const void *x2
)
1435 const brt_pending_entry_t
*bpe1
= x1
, *bpe2
= x2
;
1436 const blkptr_t
*bp1
= &bpe1
->bpe_bp
, *bp2
= &bpe2
->bpe_bp
;
1439 cmp
= TREE_CMP(BP_PHYSICAL_BIRTH(bp1
), BP_PHYSICAL_BIRTH(bp2
));
1441 cmp
= TREE_CMP(DVA_GET_VDEV(&bp1
->blk_dva
[0]),
1442 DVA_GET_VDEV(&bp2
->blk_dva
[0]));
1444 cmp
= TREE_CMP(DVA_GET_OFFSET(&bp1
->blk_dva
[0]),
1445 DVA_GET_OFFSET(&bp2
->blk_dva
[0]));
1453 brt_pending_add(spa_t
*spa
, const blkptr_t
*bp
, dmu_tx_t
*tx
)
1456 avl_tree_t
*pending_tree
;
1457 kmutex_t
*pending_lock
;
1458 brt_pending_entry_t
*bpe
, *newbpe
;
1463 txg
= dmu_tx_get_txg(tx
);
1464 ASSERT3U(txg
, !=, 0);
1465 pending_tree
= &brt
->brt_pending_tree
[txg
& TXG_MASK
];
1466 pending_lock
= &brt
->brt_pending_lock
[txg
& TXG_MASK
];
1468 newbpe
= kmem_cache_alloc(brt_pending_entry_cache
, KM_SLEEP
);
1469 newbpe
->bpe_bp
= *bp
;
1470 newbpe
->bpe_count
= 1;
1472 mutex_enter(pending_lock
);
1474 bpe
= avl_find(pending_tree
, newbpe
, &where
);
1476 avl_insert(pending_tree
, newbpe
, where
);
1482 mutex_exit(pending_lock
);
1484 if (newbpe
!= NULL
) {
1485 ASSERT(bpe
!= NULL
);
1486 ASSERT(bpe
!= newbpe
);
1487 kmem_cache_free(brt_pending_entry_cache
, newbpe
);
1489 ASSERT(bpe
== NULL
);
1492 /* Prefetch BRT entry, as we will need it in the syncing context. */
1493 brt_prefetch(brt
, bp
);
1497 brt_pending_remove(spa_t
*spa
, const blkptr_t
*bp
, dmu_tx_t
*tx
)
1500 avl_tree_t
*pending_tree
;
1501 kmutex_t
*pending_lock
;
1502 brt_pending_entry_t
*bpe
, bpe_search
;
1506 txg
= dmu_tx_get_txg(tx
);
1507 ASSERT3U(txg
, !=, 0);
1508 pending_tree
= &brt
->brt_pending_tree
[txg
& TXG_MASK
];
1509 pending_lock
= &brt
->brt_pending_lock
[txg
& TXG_MASK
];
1511 bpe_search
.bpe_bp
= *bp
;
1513 mutex_enter(pending_lock
);
1515 bpe
= avl_find(pending_tree
, &bpe_search
, NULL
);
1516 /* I believe we should always find bpe when this function is called. */
1518 ASSERT(bpe
->bpe_count
> 0);
1521 if (bpe
->bpe_count
== 0) {
1522 avl_remove(pending_tree
, bpe
);
1523 kmem_cache_free(brt_pending_entry_cache
, bpe
);
1527 mutex_exit(pending_lock
);
1531 brt_pending_apply(spa_t
*spa
, uint64_t txg
)
1534 brt_pending_entry_t
*bpe
;
1535 avl_tree_t
*pending_tree
;
1536 kmutex_t
*pending_lock
;
1539 ASSERT3U(txg
, !=, 0);
1542 pending_tree
= &brt
->brt_pending_tree
[txg
& TXG_MASK
];
1543 pending_lock
= &brt
->brt_pending_lock
[txg
& TXG_MASK
];
1545 mutex_enter(pending_lock
);
1548 while ((bpe
= avl_destroy_nodes(pending_tree
, &c
)) != NULL
) {
1549 boolean_t added_to_ddt
;
1551 mutex_exit(pending_lock
);
1553 for (int i
= 0; i
< bpe
->bpe_count
; i
++) {
1555 * If the block has DEDUP bit set, it means that it
1556 * already exists in the DEDUP table, so we can just
1557 * use that instead of creating new entry in
1560 if (BP_GET_DEDUP(&bpe
->bpe_bp
)) {
1561 added_to_ddt
= ddt_addref(spa
, &bpe
->bpe_bp
);
1563 added_to_ddt
= B_FALSE
;
1566 brt_entry_addref(brt
, &bpe
->bpe_bp
);
1569 kmem_cache_free(brt_pending_entry_cache
, bpe
);
1570 mutex_enter(pending_lock
);
1573 mutex_exit(pending_lock
);
1577 brt_sync_entry(brt_t
*brt
, brt_vdev_t
*brtvd
, brt_entry_t
*bre
, dmu_tx_t
*tx
)
1580 ASSERT(RW_WRITE_HELD(&brt
->brt_lock
));
1581 ASSERT(brtvd
->bv_mos_entries
!= 0);
1583 if (bre
->bre_refcount
== 0) {
1586 error
= brt_entry_remove(brt
, brtvd
, bre
, tx
);
1587 ASSERT(error
== 0 || error
== ENOENT
);
1589 * If error == ENOENT then zfs_clone_range() was done from a
1590 * removed (but opened) file (open(), unlink()).
1592 ASSERT(brt_entry_lookup(brt
, brtvd
, bre
) == ENOENT
);
1594 VERIFY0(brt_entry_update(brt
, brtvd
, bre
, tx
));
1599 brt_sync_table(brt_t
*brt
, dmu_tx_t
*tx
)
1608 for (vdevid
= 0; vdevid
< brt
->brt_nvdevs
; vdevid
++) {
1609 brtvd
= &brt
->brt_vdevs
[vdevid
];
1611 if (!brtvd
->bv_initiated
)
1614 if (!brtvd
->bv_meta_dirty
) {
1615 ASSERT(!brtvd
->bv_entcount_dirty
);
1616 ASSERT0(avl_numnodes(&brtvd
->bv_tree
));
1620 ASSERT(!brtvd
->bv_entcount_dirty
||
1621 avl_numnodes(&brtvd
->bv_tree
) != 0);
1623 if (brtvd
->bv_mos_brtvdev
== 0)
1624 brt_vdev_create(brt
, brtvd
, tx
);
1627 while ((bre
= avl_destroy_nodes(&brtvd
->bv_tree
, &c
)) != NULL
) {
1628 brt_sync_entry(brt
, brtvd
, bre
, tx
);
1629 brt_entry_free(bre
);
1630 ASSERT(brt
->brt_nentries
> 0);
1631 brt
->brt_nentries
--;
1634 brt_vdev_sync(brt
, brtvd
, tx
);
1636 if (brtvd
->bv_totalcount
== 0)
1637 brt_vdev_destroy(brt
, brtvd
, tx
);
1640 ASSERT0(brt
->brt_nentries
);
1646 brt_sync(spa_t
*spa
, uint64_t txg
)
1651 ASSERT(spa_syncing_txg(spa
) == txg
);
1655 if (brt
->brt_nentries
== 0) {
1662 tx
= dmu_tx_create_assigned(spa
->spa_dsl_pool
, txg
);
1664 brt_sync_table(brt
, tx
);
1670 brt_table_alloc(brt_t
*brt
)
1673 for (int i
= 0; i
< TXG_SIZE
; i
++) {
1674 avl_create(&brt
->brt_pending_tree
[i
],
1675 brt_pending_entry_compare
,
1676 sizeof (brt_pending_entry_t
),
1677 offsetof(brt_pending_entry_t
, bpe_node
));
1678 mutex_init(&brt
->brt_pending_lock
[i
], NULL
, MUTEX_DEFAULT
,
1684 brt_table_free(brt_t
*brt
)
1687 for (int i
= 0; i
< TXG_SIZE
; i
++) {
1688 ASSERT(avl_is_empty(&brt
->brt_pending_tree
[i
]));
1690 avl_destroy(&brt
->brt_pending_tree
[i
]);
1691 mutex_destroy(&brt
->brt_pending_lock
[i
]);
1696 brt_alloc(spa_t
*spa
)
1700 ASSERT(spa
->spa_brt
== NULL
);
1702 brt
= kmem_zalloc(sizeof (*brt
), KM_SLEEP
);
1703 rw_init(&brt
->brt_lock
, NULL
, RW_DEFAULT
, NULL
);
1705 brt
->brt_rangesize
= 0;
1706 brt
->brt_nentries
= 0;
1707 brt
->brt_vdevs
= NULL
;
1708 brt
->brt_nvdevs
= 0;
1709 brt_table_alloc(brt
);
1715 brt_create(spa_t
*spa
)
1719 brt_vdevs_alloc(spa
->spa_brt
, B_FALSE
);
1723 brt_load(spa_t
*spa
)
1727 brt_vdevs_alloc(spa
->spa_brt
, B_TRUE
);
1733 brt_unload(spa_t
*spa
)
1735 brt_t
*brt
= spa
->spa_brt
;
1740 brt_vdevs_free(brt
);
1741 brt_table_free(brt
);
1742 rw_destroy(&brt
->brt_lock
);
1743 kmem_free(brt
, sizeof (*brt
));
1744 spa
->spa_brt
= NULL
;
1748 ZFS_MODULE_PARAM(zfs_brt
, zfs_brt_
, prefetch
, INT
, ZMOD_RW
,
1749 "Enable prefetching of BRT entries");
1750 #ifdef ZFS_BRT_DEBUG
1751 ZFS_MODULE_PARAM(zfs_brt
, zfs_brt_
, debug
, INT
, ZMOD_RW
, "BRT debug");