/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/brt.h>
#include <sys/ddt.h>
#include <sys/bitmap.h>
#include <sys/zap.h>
#include <sys/dmu_tx.h>
#include <sys/arc.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_scan.h>
#include <sys/vdev_impl.h>
#include <sys/kstat.h>
#include <sys/wmsum.h>

/*
 * Block Cloning design.
 *
 * Block Cloning allows one to manually clone a file (or a subset of its
 * blocks) into another (or the same) file by just creating additional
 * references to the data blocks without copying the data itself.
 * Those references are kept in the Block Reference Tables (BRTs).
 *
 * In many ways this is similar to the existing deduplication, but there are
 * some important differences:
 *
 * - Deduplication is automatic and Block Cloning is not - one has to use
 *   dedicated system call(s) to clone the given file/blocks.
 * - Deduplication keeps all data blocks in its table, even those referenced
 *   just once. Block Cloning creates an entry in its tables only when there
 *   are at least two references to the given data block. If the block was
 *   never explicitly cloned or the second-to-last reference was dropped,
 *   there will be neither space nor performance overhead.
 * - Deduplication needs data to work - one needs to pass real data to the
 *   write(2) syscall, so the hash can be calculated. Block Cloning doesn't
 *   require data, just block pointers to the data, so it is extremely fast,
 *   as we pay neither the cost of reading the data, nor the cost of writing
 *   the data - we operate exclusively on metadata.
 * - If the D (dedup) bit is not set in the block pointer, it means that
 *   the block is not in the dedup table (DDT) and we won't consult the DDT
 *   when we need to free the block. Block Cloning must be consulted on every
 *   free, because we cannot modify the source BP (e.g. by setting something
 *   similar to the D bit), thus we have no hint if the block is in the
 *   Block Reference Table (BRT), so we need to look into the BRT. There is
 *   an optimization in place that allows us to eliminate the majority of BRT
 *   lookups, which is described below in the "Minimizing free penalty"
 *   section.
 * - The BRT entry is much smaller than the DDT entry - for the BRT we only
 *   store a 64bit offset and a 64bit reference counter.
 * - Dedup keys are cryptographic hashes, so two blocks that are close to each
 *   other on disk are most likely in totally different parts of the DDT.
 *   The BRT entry keys are offsets into a single top-level VDEV, so data
 *   blocks from one file should have BRT entries close to each other.
 * - Scrub will only do a single pass over a block that is referenced multiple
 *   times in the DDT. Unfortunately this is not currently (if at all)
 *   possible with Block Cloning, and a block referenced multiple times will
 *   be scrubbed multiple times. The new, sorted scrub should be able to
 *   eliminate duplicated reads given enough memory.
 * - Deduplication requires a cryptographically strong hash as a checksum or
 *   additional data verification. Block Cloning works with any checksum
 *   algorithm or even with checksumming disabled.
 *
 * As mentioned above, the BRT entries are much smaller than the DDT entries.
 * To uniquely identify a block we just need its vdev id and offset. We also
 * need to maintain a reference counter. The vdev id will often repeat, as
 * there is a small number of top-level VDEVs and a large number of blocks
 * stored in each VDEV. We take advantage of that to reduce the BRT entry
 * size further by maintaining one BRT for each top-level VDEV, so we can
 * then have only the offset and counter as the BRT entry.
 *
 * Minimizing free penalty.
 *
 * Block Cloning allows creating additional references to any existing block.
 * When we free a block there is no hint in the block pointer whether the
 * block was cloned or not, so on each free we have to check if there is a
 * corresponding entry in the BRT or not. If there is, we need to decrease
 * the reference counter. Doing a BRT lookup on every free can potentially be
 * expensive by requiring additional I/Os if the BRT doesn't fit into memory.
 * This is the main problem with deduplication, so we've learned our lesson
 * and try not to repeat the same mistake here. How do we do that? We divide
 * each top-level VDEV into 16MB regions. For each region we maintain a
 * counter that is a sum of all the BRT entries that have offsets within the
 * region. This creates an entry count array of 16bit numbers for each
 * top-level VDEV. The entry count array is always kept in memory and updated
 * on disk in the same transaction group as the BRT updates to keep
 * everything in sync. We can keep the array in memory, because it is very
 * small. With 16MB regions and a 1TB VDEV the array requires only 128kB of
 * memory (we may decide to decrease the region size even further in the
 * future). Now, when we want to free a block, we first consult the array.
 * If the counter for the whole region is zero, there is no need to look for
 * the BRT entry, as there isn't one for sure. If the counter for the region
 * is greater than zero, only then will we do a BRT lookup and if an entry is
 * found we will decrease the reference counter in the BRT entry and in the
 * entry counters array.
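 *
 * A minimal sketch of that fast path on free, with simplified, hypothetical
 * names (the real logic lives in brt_maybe_exists() and brt_entry_decref()
 * below):
 *
 *	// index of the 16MB region the freed block falls into
 *	idx = DVA_GET_OFFSET(&bp->blk_dva[0]) / brt_rangesize;
 *	if (entcount[idx] == 0)
 *		return (B_TRUE);	// no BRT entry for sure; free now
 *	// only now pay for the (possibly on-disk) BRT lookup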
 *
 * The entry counters array is small, but can potentially be larger for very
 * large VDEVs or smaller regions. In this case we don't want to rewrite the
 * entire array on every change. We then divide the array into 32kB blocks
 * and keep a bitmap of dirty blocks within a transaction group. When we sync
 * the transaction group we update only the parts of the entry counters array
 * that were modified. Note: Keeping track of the dirty parts of the entry
 * counters array is implemented, but updating only parts of the array on
 * disk is not yet implemented - for now we will update the entire array if
 * there was any change.
 *
 * The implementation tries to be economical: if the BRT is not used, or is
 * no longer used, there will be no entries in the MOS and no additional
 * memory used (e.g. the entry counters array is only allocated if needed).
 *
 * Interaction between Deduplication and Block Cloning.
 *
 * If both functionalities are in use, we could end up with a block that is
 * referenced multiple times in both DDT and BRT. When we free one of the
 * references we couldn't tell where it belongs, so we would have to decide
 * which table takes precedence: do we first clear DDT references or BRT
 * references? To avoid this dilemma BRT cooperates with DDT - if a given
 * block is being cloned using BRT and the BP has the D (dedup) bit set, BRT
 * will look up the DDT entry instead and increase the counter there. No BRT
 * entry will be created for a block which has the D (dedup) bit set.
 * BRT may be more efficient for manual deduplication, but if the block is
 * already in the DDT, then creating an additional BRT entry would be less
 * efficient. This clever idea was proposed by Allan Jude.
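 *
 * A sketch of that cooperation, mirroring the logic in brt_pending_apply()
 * below (illustrative only):
 *
 *	if (BP_GET_DEDUP(bp)) {
 *		// the block already lives in the DDT; bump its count there
 *		added_to_ddt = ddt_addref(spa, bp);
 *	} else {
 *		added_to_ddt = B_FALSE;
 *	}
 *	if (!added_to_ddt)
 *		brt_entry_addref(brt, bp);	// otherwise use the BRT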
 *
 * Block Cloning across datasets.
 *
 * Block Cloning is not limited to cloning blocks within the same dataset.
 * It is possible (and very useful) to clone blocks between different
 * datasets. One use case is recovering files from snapshots. By cloning the
 * files into the dataset we need no additional storage. Without Block
 * Cloning we would need additional space for those files.
 * Another interesting use case is moving the files between datasets
 * (copying the file content to the new dataset and removing the source file).
 * In that case Block Cloning will only be used briefly, because the BRT
 * entries will be removed when the source is removed.
 * Note: currently it is not possible to clone blocks between encrypted
 * datasets, even if those datasets use the same encryption key (this
 * includes snapshots of encrypted datasets). Cloning blocks between datasets
 * that use the same keys should be possible and should be implemented in the
 * future.
 *
 * Block Cloning flow through ZFS layers.
 *
 * Note: Block Cloning can be used both for cloning file system blocks and
 * ZVOL blocks. As of this writing no interface is implemented that allows
 * for block cloning within a ZVOL.
 * FreeBSD and Linux provide the copy_file_range(2) system call and we will
 * use it for block cloning.
 *
 *	ssize_t
 *	copy_file_range(int infd, off_t *inoffp, int outfd, off_t *outoffp,
 *	    size_t len, unsigned int flags);
 *
 * Even though offsets and length represent bytes, they have to be
 * block-aligned or we will return the EXDEV error so the upper layer can
 * fall back to the generic mechanism that will just copy the data.
 * Using copy_file_range(2) will call the OS-independent zfs_clone_range()
 * function. This function was implemented based on zfs_write(), but instead
 * of writing the given data we first read block pointers using the new
 * dmu_read_l0_bps() function from the source file. Once we have BPs from the
 * source file we call the dmu_brt_clone() function on the destination file.
 * This function allocates BPs for us. We iterate over all source BPs. If the
 * given BP is a hole or an embedded block, we just copy the BP as-is. If it
 * points to real data we place this BP on a BRT pending list using the
 * brt_pending_add() function.
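 *
 * From userland the expected pattern is therefore roughly the following
 * (a hedged sketch; whether the kernel or the application performs the
 * fallback copy is OS-dependent):
 *
 *	ssize_t n = copy_file_range(infd, &inoff, outfd, &outoff, len, 0);
 *	if (n < 0 && errno == EXDEV) {
 *		// offsets/length not block-aligned, or cloning not possible;
 *		// fall back to a plain read(2)/write(2) copy loop
 *	}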
 *
 * We use this pending list to keep track of all BPs that got new references
 * within this transaction group.
 *
 * Some special cases to consider and how we address them:
 * - The block we want to clone may have been created within the same
 *   transaction group that we are trying to clone. Such a block has no BP
 *   allocated yet, so it cannot be immediately cloned. We return EXDEV.
 * - The block we want to clone may have been modified within the same
 *   transaction group. We return EXDEV.
 * - A block may be cloned multiple times during one transaction group
 *   (that's why the pending list is actually a tree and not an append-only
 *   list - this way we can figure out faster if this block is cloned for the
 *   first time in this txg or a consecutive time).
 * - A block may be cloned and freed within the same transaction group
 *   (see dbuf_undirty()).
 * - A block may be cloned and within the same transaction group the clone
 *   can be cloned again (see dmu_read_l0_bps()).
 * - A file might have been deleted, but the caller still has a file
 *   descriptor open to this file and clones it.
 *
 * When we free a block we have an additional step in the ZIO pipeline where
 * we call the zio_brt_free() function. We then call brt_entry_decref(),
 * which loads the corresponding BRT entry (if one exists) and decreases the
 * reference counter. If this is not the last reference we will stop the ZIO
 * pipeline here. If this is the last reference or the block is not in the
 * BRT, we continue the pipeline and free the block as usual.
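 *
 * In pseudo-code the free-side decision looks roughly like this (a sketch;
 * see zio_brt_free() and brt_entry_decref() for the real thing):
 *
 *	if (!brt_maybe_exists(spa, bp))
 *		// definitely not cloned; free the block normally
 *	else if (brt_entry_decref(spa, bp))
 *		// last (or no) reference; continue the pipeline and free
 *	else
 *		// still referenced; stop the ZIO pipeline, keep the data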
 *
 * At the beginning of spa_sync(), when there can be no more block cloning
 * but before issuing frees, we call brt_pending_apply(). This function
 * applies all the new clones to the BRT table - we load BRT entries and
 * update reference counters. To sync new BRT entries to disk, we use the
 * brt_sync() function. This function will sync all dirty
 * per-top-level-vdev BRTs, the entry counters arrays, etc.
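 *
 * Schematically, the resulting per-txg ordering inside spa_sync() is:
 *
 *	brt_pending_apply(spa, txg);	// apply this txg's clones first
 *	...				// issue frees; zio_brt_free() may
 *					// consult and decref the BRT
 *	brt_sync(spa, txg);		// write dirty BRTs out to the MOS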
 *
 * Block Cloning and ZIL.
 *
 * Every clone operation is divided into chunks (similar to write) and each
 * chunk is cloned in a separate transaction. The chunk size is determined by
 * how many BPs we can fit into a single ZIL entry.
 * Replaying a clone operation is different from a regular clone operation,
 * as when we log clone operations we cannot use the source object - it may
 * reside on a different dataset, so we log the BPs we want to clone.
 * The ZIL is replayed when we mount the given dataset, not when the pool is
 * imported. Taking this into account it is possible that the pool is
 * imported without mounting datasets and the source dataset is destroyed
 * before the destination dataset is mounted and its ZIL replayed.
 * To address this situation we leverage the zil_claim() mechanism where ZFS
 * will parse all the ZILs on pool import. When we come across TX_CLONE_RANGE
 * entries, we will bump reference counters for their BPs in the BRT and then
 * on mount and ZIL replay we will just attach BPs to the file without
 * bumping reference counters.
 * Note it is still possible that after zil_claim() we never mount the
 * destination, so we never replay its ZIL and we destroy it. This way we
 * would end up with leaked references in the BRT. We address that too, as
 * ZFS gives us a chance to clean this up on dataset destroy (see
 * zil_free_clone_range()).
 */

/*
 * BRT - Block Reference Table.
 */
#define	BRT_OBJECT_VDEV_PREFIX	"com.fudosecurity:brt:vdev:"

/*
 * We divide each VDEV into 16MB chunks. Each chunk is represented in memory
 * by a 16bit counter, thus a 1TB VDEV requires 128kB of memory:
 * (1TB / 16MB) * 2B. Each element in this array represents how many BRT
 * entries we have in this chunk of storage. We always load this entire array
 * into memory and update it as needed. By having it in memory we can quickly
 * tell (during zio_free()) if there are any BRT entries that we might need
 * to update.
 *
 * This value cannot be larger than 16MB, at least as long as we support
 * 512 byte block sizes. With 512 byte block size we can have exactly
 * 32768 blocks in 16MB. In 32MB we could have 65536 blocks, which is one too
 * many for a 16bit counter.
 */
#define	BRT_RANGESIZE	(16 * 1024 * 1024)
_Static_assert(BRT_RANGESIZE / SPA_MINBLOCKSIZE <= UINT16_MAX,
	"BRT_RANGESIZE is too large.");
/*
 * We don't want to update the whole structure every time. Maintain a bitmap
 * of dirty blocks within the regions, so that a single bit represents a
 * block size of entcounts. For example if we have a 1PB vdev then all
 * entcounts take 128MB of memory ((1PB / 16MB) * 2B). We can divide this
 * 128MB array of entcounts into 32kB disk blocks, as we don't want to update
 * the whole 128MB on disk when we have updated only a single entcount.
 * We maintain a bitmap where each 32kB disk block within the 128MB entcounts
 * array is represented by a single bit. This gives us 4096 bits. A set bit
 * in the bitmap means that we had a change in at least one of the 16384
 * entcounts that reside on a 32kB disk block (32kB / sizeof (uint16_t)).
 */
#define	BRT_BLOCKSIZE	(32 * 1024)
#define	BRT_RANGESIZE_TO_NBLOCKS(size)					\
	(((size) - 1) / BRT_BLOCKSIZE / sizeof (uint16_t) + 1)

#define	BRT_LITTLE_ENDIAN	0
#define	BRT_BIG_ENDIAN		1
#ifdef _ZFS_LITTLE_ENDIAN
#define	BRT_NATIVE_BYTEORDER		BRT_LITTLE_ENDIAN
#define	BRT_NON_NATIVE_BYTEORDER	BRT_BIG_ENDIAN
#else
#define	BRT_NATIVE_BYTEORDER		BRT_BIG_ENDIAN
#define	BRT_NON_NATIVE_BYTEORDER	BRT_LITTLE_ENDIAN
#endif

typedef struct brt_vdev_phys {
	uint64_t	bvp_mos_entries;
	uint64_t	bvp_size;
	uint64_t	bvp_byteorder;
	uint64_t	bvp_totalcount;
	uint64_t	bvp_rangesize;
	uint64_t	bvp_usedspace;
	uint64_t	bvp_savedspace;
} brt_vdev_phys_t;

typedef struct brt_vdev {
	/*
	 * VDEV id.
	 */
	uint64_t	bv_vdevid;
	/*
	 * Is the structure initiated?
	 * (bv_entcount and bv_bitmap are allocated?)
	 */
	boolean_t	bv_initiated;
	/*
	 * Object number in the MOS for the entcount array and brt_vdev_phys.
	 */
	uint64_t	bv_mos_brtvdev;
	/*
	 * Object number in the MOS for the entries table.
	 */
	uint64_t	bv_mos_entries;
	/*
	 * Entries to sync.
	 */
	avl_tree_t	bv_tree;
	/*
	 * Does the bv_entcount[] array need byte swapping?
	 */
	boolean_t	bv_need_byteswap;
	/*
	 * Number of entries in the bv_entcount[] array.
	 */
	uint64_t	bv_size;
	/*
	 * This is the array with BRT entry count per BRT_RANGESIZE.
	 */
	uint16_t	*bv_entcount;
	/*
	 * Sum of all bv_entcount[]s.
	 */
	uint64_t	bv_totalcount;
	/*
	 * Space on disk occupied by cloned blocks (without compression).
	 */
	uint64_t	bv_usedspace;
	/*
	 * How much additional space would be occupied without block cloning.
	 */
	uint64_t	bv_savedspace;
	/*
	 * brt_vdev_phys needs updating on disk.
	 */
	boolean_t	bv_meta_dirty;
	/*
	 * bv_entcount[] needs updating on disk.
	 */
	boolean_t	bv_entcount_dirty;
	/*
	 * bv_entcount[] can potentially be a bit too big to synchronize it
	 * all when we have changed only a few entcounts. The fields below
	 * allow us to track updates to the bv_entcount[] array since the
	 * last sync. A single bit in the bv_bitmap represents as many
	 * entcounts as can fit into a single BRT_BLOCKSIZE.
	 * For example we have 65536 entcounts in the bv_entcount array
	 * (so the whole array is 128kB). We updated bv_entcount[2] and
	 * bv_entcount[5]. In that case only the first bit in the bv_bitmap
	 * will be set and we will write only the first BRT_BLOCKSIZE out of
	 * 128kB.
	 */
	ulong_t		*bv_bitmap;
	uint64_t	bv_nblocks;
} brt_vdev_t;

/*
 * In-core brt
 */
typedef struct brt {
	krwlock_t	brt_lock;
	spa_t		*brt_spa;
#define	brt_mos		brt_spa->spa_meta_objset
	uint64_t	brt_rangesize;
	uint64_t	brt_usedspace;
	uint64_t	brt_savedspace;
	avl_tree_t	brt_pending_tree[TXG_SIZE];
	kmutex_t	brt_pending_lock[TXG_SIZE];
	/* Sum of all entries across all bv_trees. */
	uint64_t	brt_nentries;
	brt_vdev_t	*brt_vdevs;
	uint64_t	brt_nvdevs;
} brt_t;

/* Size of bre_offset / sizeof (uint64_t). */
#define	BRT_KEY_WORDS	(1)

/*
 * In-core brt entry.
 * On-disk we use bre_offset as the key and bre_refcount as the value.
 */
typedef struct brt_entry {
	uint64_t	bre_offset;
	uint64_t	bre_refcount;
	avl_node_t	bre_node;
} brt_entry_t;

typedef struct brt_pending_entry {
	blkptr_t	bpe_bp;
	int		bpe_count;
	avl_node_t	bpe_node;
} brt_pending_entry_t;

static kmem_cache_t *brt_entry_cache;
static kmem_cache_t *brt_pending_entry_cache;

/*
 * Enable/disable prefetching of BRT entries that we are going to modify.
 */
int zfs_brt_prefetch = 1;

#ifdef ZFS_DEBUG
#define	BRT_DEBUG(...)	do {						\
	if ((zfs_flags & ZFS_DEBUG_BRT) != 0) {				\
		__dprintf(B_TRUE, __FILE__, __func__, __LINE__, __VA_ARGS__); \
	}								\
} while (0)
#else
#define	BRT_DEBUG(...)	do { } while (0)
#endif

int brt_zap_leaf_blockshift = 12;
int brt_zap_indirect_blockshift = 12;

static kstat_t	*brt_ksp;

typedef struct brt_stats {
	kstat_named_t brt_addref_entry_in_memory;
	kstat_named_t brt_addref_entry_not_on_disk;
	kstat_named_t brt_addref_entry_on_disk;
	kstat_named_t brt_addref_entry_read_lost_race;
	kstat_named_t brt_decref_entry_in_memory;
	kstat_named_t brt_decref_entry_loaded_from_disk;
	kstat_named_t brt_decref_entry_not_in_memory;
	kstat_named_t brt_decref_entry_not_on_disk;
	kstat_named_t brt_decref_entry_read_lost_race;
	kstat_named_t brt_decref_entry_still_referenced;
	kstat_named_t brt_decref_free_data_later;
	kstat_named_t brt_decref_free_data_now;
	kstat_named_t brt_decref_no_entry;
} brt_stats_t;

static brt_stats_t brt_stats = {
	{ "addref_entry_in_memory",		KSTAT_DATA_UINT64 },
	{ "addref_entry_not_on_disk",		KSTAT_DATA_UINT64 },
	{ "addref_entry_on_disk",		KSTAT_DATA_UINT64 },
	{ "addref_entry_read_lost_race",	KSTAT_DATA_UINT64 },
	{ "decref_entry_in_memory",		KSTAT_DATA_UINT64 },
	{ "decref_entry_loaded_from_disk",	KSTAT_DATA_UINT64 },
	{ "decref_entry_not_in_memory",		KSTAT_DATA_UINT64 },
	{ "decref_entry_not_on_disk",		KSTAT_DATA_UINT64 },
	{ "decref_entry_read_lost_race",	KSTAT_DATA_UINT64 },
	{ "decref_entry_still_referenced",	KSTAT_DATA_UINT64 },
	{ "decref_free_data_later",		KSTAT_DATA_UINT64 },
	{ "decref_free_data_now",		KSTAT_DATA_UINT64 },
	{ "decref_no_entry",			KSTAT_DATA_UINT64 }
};

struct {
	wmsum_t brt_addref_entry_in_memory;
	wmsum_t brt_addref_entry_not_on_disk;
	wmsum_t brt_addref_entry_on_disk;
	wmsum_t brt_addref_entry_read_lost_race;
	wmsum_t brt_decref_entry_in_memory;
	wmsum_t brt_decref_entry_loaded_from_disk;
	wmsum_t brt_decref_entry_not_in_memory;
	wmsum_t brt_decref_entry_not_on_disk;
	wmsum_t brt_decref_entry_read_lost_race;
	wmsum_t brt_decref_entry_still_referenced;
	wmsum_t brt_decref_free_data_later;
	wmsum_t brt_decref_free_data_now;
	wmsum_t brt_decref_no_entry;
} brt_sums;

#define	BRTSTAT_BUMP(stat)	wmsum_add(&brt_sums.stat, 1)

static int brt_entry_compare(const void *x1, const void *x2);
static int brt_pending_entry_compare(const void *x1, const void *x2);

static void
brt_rlock(brt_t *brt)
{
	rw_enter(&brt->brt_lock, RW_READER);
}

static void
brt_wlock(brt_t *brt)
{
	rw_enter(&brt->brt_lock, RW_WRITER);
}

static void
brt_unlock(brt_t *brt)
{
	rw_exit(&brt->brt_lock);
}

static uint16_t
brt_vdev_entcount_get(const brt_vdev_t *brtvd, uint64_t idx)
{

	ASSERT3U(idx, <, brtvd->bv_size);

	if (brtvd->bv_need_byteswap) {
		return (BSWAP_16(brtvd->bv_entcount[idx]));
	} else {
		return (brtvd->bv_entcount[idx]);
	}
}

static void
brt_vdev_entcount_set(brt_vdev_t *brtvd, uint64_t idx, uint16_t entcnt)
{

	ASSERT3U(idx, <, brtvd->bv_size);

	if (brtvd->bv_need_byteswap) {
		brtvd->bv_entcount[idx] = BSWAP_16(entcnt);
	} else {
		brtvd->bv_entcount[idx] = entcnt;
	}
}

static void
brt_vdev_entcount_inc(brt_vdev_t *brtvd, uint64_t idx)
{
	uint16_t entcnt;

	ASSERT3U(idx, <, brtvd->bv_size);

	entcnt = brt_vdev_entcount_get(brtvd, idx);
	ASSERT(entcnt < UINT16_MAX);

	brt_vdev_entcount_set(brtvd, idx, entcnt + 1);
}

static void
brt_vdev_entcount_dec(brt_vdev_t *brtvd, uint64_t idx)
{
	uint16_t entcnt;

	ASSERT3U(idx, <, brtvd->bv_size);

	entcnt = brt_vdev_entcount_get(brtvd, idx);
	ASSERT(entcnt > 0);

	brt_vdev_entcount_set(brtvd, idx, entcnt - 1);
}

#ifdef ZFS_DEBUG
static void
brt_vdev_dump(brt_t *brt)
{
	brt_vdev_t *brtvd;
	uint64_t vdevid;

	if ((zfs_flags & ZFS_DEBUG_BRT) == 0) {
		return;
	}

	if (brt->brt_nvdevs == 0) {
		zfs_dbgmsg("BRT empty");
		return;
	}

	zfs_dbgmsg("BRT vdev dump:");
	for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
		uint64_t idx;

		brtvd = &brt->brt_vdevs[vdevid];
		zfs_dbgmsg("  vdevid=%llu/%llu meta_dirty=%d entcount_dirty=%d "
		    "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu\n",
		    (u_longlong_t)vdevid, (u_longlong_t)brtvd->bv_vdevid,
		    brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty,
		    (u_longlong_t)brtvd->bv_size,
		    (u_longlong_t)brtvd->bv_totalcount,
		    (u_longlong_t)brtvd->bv_nblocks,
		    (size_t)BT_SIZEOFMAP(brtvd->bv_nblocks));
		if (brtvd->bv_totalcount > 0) {
			zfs_dbgmsg("    entcounts:");
			for (idx = 0; idx < brtvd->bv_size; idx++) {
				if (brt_vdev_entcount_get(brtvd, idx) > 0) {
					zfs_dbgmsg("      [%04llu] %hu",
					    (u_longlong_t)idx,
					    brt_vdev_entcount_get(brtvd, idx));
				}
			}
		}
		if (brtvd->bv_entcount_dirty) {
			char *bitmap;

			bitmap = kmem_alloc(brtvd->bv_nblocks + 1, KM_SLEEP);
			for (idx = 0; idx < brtvd->bv_nblocks; idx++) {
				bitmap[idx] =
				    BT_TEST(brtvd->bv_bitmap, idx) ? 'x' : '.';
			}
			bitmap[idx] = '\0';
			zfs_dbgmsg("    bitmap: %s", bitmap);
			kmem_free(bitmap, brtvd->bv_nblocks + 1);
		}
	}
}
#endif

static brt_vdev_t *
brt_vdev(brt_t *brt, uint64_t vdevid)
{
	brt_vdev_t *brtvd;

	ASSERT(RW_LOCK_HELD(&brt->brt_lock));

	if (vdevid < brt->brt_nvdevs) {
		brtvd = &brt->brt_vdevs[vdevid];
	} else {
		brtvd = NULL;
	}

	return (brtvd);
}

static void
brt_vdev_create(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
{
	char name[64];

	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
	ASSERT0(brtvd->bv_mos_brtvdev);
	ASSERT0(brtvd->bv_mos_entries);
	ASSERT(brtvd->bv_entcount != NULL);
	ASSERT(brtvd->bv_size > 0);
	ASSERT(brtvd->bv_bitmap != NULL);
	ASSERT(brtvd->bv_nblocks > 0);

	brtvd->bv_mos_entries = zap_create_flags(brt->brt_mos, 0,
	    ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA,
	    brt_zap_leaf_blockshift, brt_zap_indirect_blockshift, DMU_OT_NONE,
	    0, tx);
	VERIFY(brtvd->bv_mos_entries != 0);
	BRT_DEBUG("MOS entries created, object=%llu",
	    (u_longlong_t)brtvd->bv_mos_entries);

	/*
	 * We allocate a DMU buffer to store the bv_entcount[] array.
	 * We will keep the array size (bv_size) and the cumulative count for
	 * all bv_entcount[]s (bv_totalcount) in the bonus buffer.
	 */
	brtvd->bv_mos_brtvdev = dmu_object_alloc(brt->brt_mos,
	    DMU_OTN_UINT64_METADATA, BRT_BLOCKSIZE,
	    DMU_OTN_UINT64_METADATA, sizeof (brt_vdev_phys_t), tx);
	VERIFY(brtvd->bv_mos_brtvdev != 0);
	BRT_DEBUG("MOS BRT VDEV created, object=%llu",
	    (u_longlong_t)brtvd->bv_mos_brtvdev);

	snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
	    (u_longlong_t)brtvd->bv_vdevid);
	VERIFY0(zap_add(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name,
	    sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev, tx));
	BRT_DEBUG("Pool directory object created, object=%s", name);

	spa_feature_incr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx);
}

static void
brt_vdev_realloc(brt_t *brt, brt_vdev_t *brtvd)
{
	vdev_t *vd;
	uint16_t *entcount;
	ulong_t *bitmap;
	uint64_t nblocks, size;

	ASSERT(RW_WRITE_HELD(&brt->brt_lock));

	spa_config_enter(brt->brt_spa, SCL_VDEV, FTAG, RW_READER);
	vd = vdev_lookup_top(brt->brt_spa, brtvd->bv_vdevid);
	size = (vdev_get_min_asize(vd) - 1) / brt->brt_rangesize + 1;
	spa_config_exit(brt->brt_spa, SCL_VDEV, FTAG);

	entcount = vmem_zalloc(sizeof (entcount[0]) * size, KM_SLEEP);
	nblocks = BRT_RANGESIZE_TO_NBLOCKS(size);
	bitmap = kmem_zalloc(BT_SIZEOFMAP(nblocks), KM_SLEEP);

	if (!brtvd->bv_initiated) {
		ASSERT0(brtvd->bv_size);
		ASSERT(brtvd->bv_entcount == NULL);
		ASSERT(brtvd->bv_bitmap == NULL);
		ASSERT0(brtvd->bv_nblocks);

		avl_create(&brtvd->bv_tree, brt_entry_compare,
		    sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node));
	} else {
		ASSERT(brtvd->bv_size > 0);
		ASSERT(brtvd->bv_entcount != NULL);
		ASSERT(brtvd->bv_bitmap != NULL);
		ASSERT(brtvd->bv_nblocks > 0);
		/*
		 * TODO: Allow vdev shrinking. We only need to implement
		 * shrinking the on-disk BRT VDEV object.
		 * dmu_free_range(brt->brt_mos, brtvd->bv_mos_brtvdev, offset,
		 *     size, tx);
		 */
		ASSERT3U(brtvd->bv_size, <=, size);

		memcpy(entcount, brtvd->bv_entcount,
		    sizeof (entcount[0]) * MIN(size, brtvd->bv_size));
		memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks),
		    BT_SIZEOFMAP(brtvd->bv_nblocks)));
		vmem_free(brtvd->bv_entcount,
		    sizeof (entcount[0]) * brtvd->bv_size);
		kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks));
	}

	brtvd->bv_size = size;
	brtvd->bv_entcount = entcount;
	brtvd->bv_bitmap = bitmap;
	brtvd->bv_nblocks = nblocks;
	if (!brtvd->bv_initiated) {
		brtvd->bv_need_byteswap = FALSE;
		brtvd->bv_initiated = TRUE;
		BRT_DEBUG("BRT VDEV %llu initiated.",
		    (u_longlong_t)brtvd->bv_vdevid);
	}
}

static void
brt_vdev_load(brt_t *brt, brt_vdev_t *brtvd)
{
	char name[64];
	dmu_buf_t *db;
	brt_vdev_phys_t *bvphys;
	int error;

	snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
	    (u_longlong_t)brtvd->bv_vdevid);
	error = zap_lookup(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name,
	    sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev);
	if (error != 0)
		return;
	ASSERT(brtvd->bv_mos_brtvdev != 0);

	error = dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db);
	ASSERT0(error);
	if (error != 0)
		return;

	bvphys = db->db_data;
	if (brt->brt_rangesize == 0) {
		brt->brt_rangesize = bvphys->bvp_rangesize;
	} else {
		ASSERT3U(brt->brt_rangesize, ==, bvphys->bvp_rangesize);
	}

	ASSERT(!brtvd->bv_initiated);
	brt_vdev_realloc(brt, brtvd);

	/* TODO: We don't support VDEV shrinking. */
	ASSERT3U(bvphys->bvp_size, <=, brtvd->bv_size);

	/*
	 * If the VDEV grew, we will leave new bv_entcount[] entries zeroed
	 * out.
	 */
	error = dmu_read(brt->brt_mos, brtvd->bv_mos_brtvdev, 0,
	    MIN(brtvd->bv_size, bvphys->bvp_size) * sizeof (uint16_t),
	    brtvd->bv_entcount, DMU_READ_NO_PREFETCH);
	ASSERT0(error);

	brtvd->bv_mos_entries = bvphys->bvp_mos_entries;
	ASSERT(brtvd->bv_mos_entries != 0);
	brtvd->bv_need_byteswap =
	    (bvphys->bvp_byteorder != BRT_NATIVE_BYTEORDER);
	brtvd->bv_totalcount = bvphys->bvp_totalcount;
	brtvd->bv_usedspace = bvphys->bvp_usedspace;
	brtvd->bv_savedspace = bvphys->bvp_savedspace;
	brt->brt_usedspace += brtvd->bv_usedspace;
	brt->brt_savedspace += brtvd->bv_savedspace;

	dmu_buf_rele(db, FTAG);

	BRT_DEBUG("MOS BRT VDEV %s loaded: mos_brtvdev=%llu, mos_entries=%llu",
	    name, (u_longlong_t)brtvd->bv_mos_brtvdev,
	    (u_longlong_t)brtvd->bv_mos_entries);
}

static void
brt_vdev_dealloc(brt_t *brt, brt_vdev_t *brtvd)
{

	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
	ASSERT(brtvd->bv_initiated);

	vmem_free(brtvd->bv_entcount, sizeof (uint16_t) * brtvd->bv_size);
	brtvd->bv_entcount = NULL;
	kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks));
	brtvd->bv_bitmap = NULL;
	ASSERT0(avl_numnodes(&brtvd->bv_tree));
	avl_destroy(&brtvd->bv_tree);

	brtvd->bv_size = 0;
	brtvd->bv_nblocks = 0;

	brtvd->bv_initiated = FALSE;
	BRT_DEBUG("BRT VDEV %llu deallocated.", (u_longlong_t)brtvd->bv_vdevid);
}

static void
brt_vdev_destroy(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
{
	char name[64];
	uint64_t count;
	dmu_buf_t *db;
	brt_vdev_phys_t *bvphys;

	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
	ASSERT(brtvd->bv_mos_brtvdev != 0);
	ASSERT(brtvd->bv_mos_entries != 0);

	VERIFY0(zap_count(brt->brt_mos, brtvd->bv_mos_entries, &count));
	VERIFY0(count);
	VERIFY0(zap_destroy(brt->brt_mos, brtvd->bv_mos_entries, tx));
	BRT_DEBUG("MOS entries destroyed, object=%llu",
	    (u_longlong_t)brtvd->bv_mos_entries);
	brtvd->bv_mos_entries = 0;

	VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db));
	bvphys = db->db_data;
	ASSERT0(bvphys->bvp_totalcount);
	ASSERT0(bvphys->bvp_usedspace);
	ASSERT0(bvphys->bvp_savedspace);
	dmu_buf_rele(db, FTAG);

	VERIFY0(dmu_object_free(brt->brt_mos, brtvd->bv_mos_brtvdev, tx));
	BRT_DEBUG("MOS BRT VDEV destroyed, object=%llu",
	    (u_longlong_t)brtvd->bv_mos_brtvdev);
	brtvd->bv_mos_brtvdev = 0;

	snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
	    (u_longlong_t)brtvd->bv_vdevid);
	VERIFY0(zap_remove(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, tx));
	BRT_DEBUG("Pool directory object removed, object=%s", name);

	brt_vdev_dealloc(brt, brtvd);

	spa_feature_decr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx);
}

static void
brt_vdevs_expand(brt_t *brt, uint64_t nvdevs)
{
	brt_vdev_t *brtvd, *vdevs;
	uint64_t vdevid;

	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
	ASSERT3U(nvdevs, >, brt->brt_nvdevs);

	vdevs = kmem_zalloc(sizeof (vdevs[0]) * nvdevs, KM_SLEEP);
	if (brt->brt_nvdevs > 0) {
		ASSERT(brt->brt_vdevs != NULL);

		memcpy(vdevs, brt->brt_vdevs,
		    sizeof (brt_vdev_t) * brt->brt_nvdevs);
		kmem_free(brt->brt_vdevs,
		    sizeof (brt_vdev_t) * brt->brt_nvdevs);
	}
	for (vdevid = brt->brt_nvdevs; vdevid < nvdevs; vdevid++) {
		brtvd = &vdevs[vdevid];

		brtvd->bv_vdevid = vdevid;
		brtvd->bv_initiated = FALSE;
	}

	BRT_DEBUG("BRT VDEVs expanded from %llu to %llu.",
	    (u_longlong_t)brt->brt_nvdevs, (u_longlong_t)nvdevs);

	brt->brt_vdevs = vdevs;
	brt->brt_nvdevs = nvdevs;
}

static boolean_t
brt_vdev_lookup(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre)
{
	uint64_t idx;

	ASSERT(RW_LOCK_HELD(&brt->brt_lock));

	idx = bre->bre_offset / brt->brt_rangesize;
	if (brtvd->bv_entcount != NULL && idx < brtvd->bv_size) {
		/* VDEV wasn't expanded. */
		return (brt_vdev_entcount_get(brtvd, idx) > 0);
	}

	return (FALSE);
}

static void
brt_vdev_addref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre,
    uint64_t dsize)
{
	uint64_t idx;

	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
	ASSERT(brtvd != NULL);
	ASSERT(brtvd->bv_entcount != NULL);

	brt->brt_savedspace += dsize;
	brtvd->bv_savedspace += dsize;
	brtvd->bv_meta_dirty = TRUE;

	if (bre->bre_refcount > 1) {
		return;
	}

	brt->brt_usedspace += dsize;
	brtvd->bv_usedspace += dsize;

	idx = bre->bre_offset / brt->brt_rangesize;
	if (idx >= brtvd->bv_size) {
		/* VDEV has been expanded. */
		brt_vdev_realloc(brt, brtvd);
	}

	ASSERT3U(idx, <, brtvd->bv_size);

	brtvd->bv_totalcount++;
	brt_vdev_entcount_inc(brtvd, idx);
	brtvd->bv_entcount_dirty = TRUE;
	idx = idx / BRT_BLOCKSIZE / 8;
	BT_SET(brtvd->bv_bitmap, idx);

#ifdef ZFS_DEBUG
	brt_vdev_dump(brt);
#endif
}

static void
brt_vdev_decref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre,
    uint64_t dsize)
{
	uint64_t idx;

	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
	ASSERT(brtvd != NULL);
	ASSERT(brtvd->bv_entcount != NULL);

	brt->brt_savedspace -= dsize;
	brtvd->bv_savedspace -= dsize;
	brtvd->bv_meta_dirty = TRUE;

	if (bre->bre_refcount > 0) {
		return;
	}

	brt->brt_usedspace -= dsize;
	brtvd->bv_usedspace -= dsize;

	idx = bre->bre_offset / brt->brt_rangesize;
	ASSERT3U(idx, <, brtvd->bv_size);

	ASSERT(brtvd->bv_totalcount > 0);
	brtvd->bv_totalcount--;
	brt_vdev_entcount_dec(brtvd, idx);
	brtvd->bv_entcount_dirty = TRUE;
	idx = idx / BRT_BLOCKSIZE / 8;
	BT_SET(brtvd->bv_bitmap, idx);

#ifdef ZFS_DEBUG
	brt_vdev_dump(brt);
#endif
}

static void
brt_vdev_sync(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
{
	dmu_buf_t *db;
	brt_vdev_phys_t *bvphys;

	ASSERT(brtvd->bv_meta_dirty);
	ASSERT(brtvd->bv_mos_brtvdev != 0);
	ASSERT(dmu_tx_is_syncing(tx));

	VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db));

	if (brtvd->bv_entcount_dirty) {
		/*
		 * TODO: Walk brtvd->bv_bitmap and write only the dirty blocks.
		 */
		dmu_write(brt->brt_mos, brtvd->bv_mos_brtvdev, 0,
		    brtvd->bv_size * sizeof (brtvd->bv_entcount[0]),
		    brtvd->bv_entcount, tx);
		memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(brtvd->bv_nblocks));
		brtvd->bv_entcount_dirty = FALSE;
	}

	dmu_buf_will_dirty(db, tx);
	bvphys = db->db_data;
	bvphys->bvp_mos_entries = brtvd->bv_mos_entries;
	bvphys->bvp_size = brtvd->bv_size;
	if (brtvd->bv_need_byteswap) {
		bvphys->bvp_byteorder = BRT_NON_NATIVE_BYTEORDER;
	} else {
		bvphys->bvp_byteorder = BRT_NATIVE_BYTEORDER;
	}
	bvphys->bvp_totalcount = brtvd->bv_totalcount;
	bvphys->bvp_rangesize = brt->brt_rangesize;
	bvphys->bvp_usedspace = brtvd->bv_usedspace;
	bvphys->bvp_savedspace = brtvd->bv_savedspace;
	dmu_buf_rele(db, FTAG);

	brtvd->bv_meta_dirty = FALSE;
}

static void
brt_vdevs_alloc(brt_t *brt, boolean_t load)
{
	brt_vdev_t *brtvd;
	uint64_t vdevid;

	brt_wlock(brt);

	brt_vdevs_expand(brt, brt->brt_spa->spa_root_vdev->vdev_children);

	if (load) {
		for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
			brtvd = &brt->brt_vdevs[vdevid];
			ASSERT(brtvd->bv_entcount == NULL);

			brt_vdev_load(brt, brtvd);
		}
	}

	if (brt->brt_rangesize == 0) {
		brt->brt_rangesize = BRT_RANGESIZE;
	}

	brt_unlock(brt);
}

static void
brt_vdevs_free(brt_t *brt)
{
	brt_vdev_t *brtvd;
	uint64_t vdevid;

	brt_wlock(brt);

	for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
		brtvd = &brt->brt_vdevs[vdevid];
		if (brtvd->bv_initiated)
			brt_vdev_dealloc(brt, brtvd);
	}
	kmem_free(brt->brt_vdevs, sizeof (brt_vdev_t) * brt->brt_nvdevs);

	brt_unlock(brt);
}

static void
brt_entry_fill(const blkptr_t *bp, brt_entry_t *bre, uint64_t *vdevidp)
{

	bre->bre_offset = DVA_GET_OFFSET(&bp->blk_dva[0]);
	bre->bre_refcount = 0;

	*vdevidp = DVA_GET_VDEV(&bp->blk_dva[0]);
}

static int
brt_entry_compare(const void *x1, const void *x2)
{
	const brt_entry_t *bre1 = x1;
	const brt_entry_t *bre2 = x2;

	return (TREE_CMP(bre1->bre_offset, bre2->bre_offset));
}

static int
brt_entry_lookup(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre)
{
	uint64_t mos_entries;
	uint64_t one, physsize;
	int error;

	ASSERT(RW_LOCK_HELD(&brt->brt_lock));

	if (!brt_vdev_lookup(brt, brtvd, bre))
		return (SET_ERROR(ENOENT));

	/*
	 * Remember the mos_entries object number. After we reacquire the BRT
	 * lock, the brtvd pointer may be invalid.
	 */
	mos_entries = brtvd->bv_mos_entries;
	if (mos_entries == 0)
		return (SET_ERROR(ENOENT));

	brt_unlock(brt);

	error = zap_length_uint64(brt->brt_mos, mos_entries, &bre->bre_offset,
	    BRT_KEY_WORDS, &one, &physsize);
	if (error == 0) {
		ASSERT3U(one, ==, 1);
		ASSERT3U(physsize, ==, sizeof (bre->bre_refcount));

		error = zap_lookup_uint64(brt->brt_mos, mos_entries,
		    &bre->bre_offset, BRT_KEY_WORDS, 1,
		    sizeof (bre->bre_refcount), &bre->bre_refcount);
		BRT_DEBUG("ZAP lookup: object=%llu vdev=%llu offset=%llu "
		    "count=%llu error=%d", (u_longlong_t)mos_entries,
		    (u_longlong_t)brtvd->bv_vdevid,
		    (u_longlong_t)bre->bre_offset,
		    error == 0 ? (u_longlong_t)bre->bre_refcount : 0, error);
	}

	brt_wlock(brt);

	return (error);
}

static void
brt_entry_prefetch(brt_t *brt, uint64_t vdevid, brt_entry_t *bre)
{
	brt_vdev_t *brtvd;
	uint64_t mos_entries = 0;

	brt_rlock(brt);
	brtvd = brt_vdev(brt, vdevid);
	if (brtvd != NULL)
		mos_entries = brtvd->bv_mos_entries;
	brt_unlock(brt);

	if (mos_entries == 0)
		return;

	BRT_DEBUG("ZAP prefetch: object=%llu vdev=%llu offset=%llu",
	    (u_longlong_t)mos_entries, (u_longlong_t)vdevid,
	    (u_longlong_t)bre->bre_offset);
	(void) zap_prefetch_uint64(brt->brt_mos, mos_entries,
	    (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS);
}

static int
brt_entry_update(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
{
	int error;

	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
	ASSERT(brtvd->bv_mos_entries != 0);
	ASSERT(bre->bre_refcount > 0);

	error = zap_update_uint64(brt->brt_mos, brtvd->bv_mos_entries,
	    (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, 1,
	    sizeof (bre->bre_refcount), &bre->bre_refcount, tx);
	BRT_DEBUG("ZAP update: object=%llu vdev=%llu offset=%llu count=%llu "
	    "error=%d", (u_longlong_t)brtvd->bv_mos_entries,
	    (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset,
	    (u_longlong_t)bre->bre_refcount, error);

	return (error);
}

static int
brt_entry_remove(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
{
	int error;

	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
	ASSERT(brtvd->bv_mos_entries != 0);
	ASSERT0(bre->bre_refcount);

	error = zap_remove_uint64(brt->brt_mos, brtvd->bv_mos_entries,
	    (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, tx);
	BRT_DEBUG("ZAP remove: object=%llu vdev=%llu offset=%llu count=%llu "
	    "error=%d", (u_longlong_t)brtvd->bv_mos_entries,
	    (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset,
	    (u_longlong_t)bre->bre_refcount, error);

	return (error);
}

/*
 * Return TRUE if we _can_ have a BRT entry for this bp. It might be a false
 * positive, but it gives us a quick answer if we should look into the BRT,
 * which may require reads and thus will be more expensive.
 */
boolean_t
brt_maybe_exists(spa_t *spa, const blkptr_t *bp)
{
	brt_t *brt = spa->spa_brt;
	brt_vdev_t *brtvd;
	brt_entry_t bre_search;
	boolean_t mayexists = FALSE;
	uint64_t vdevid;

	brt_entry_fill(bp, &bre_search, &vdevid);

	brt_rlock(brt);

	brtvd = brt_vdev(brt, vdevid);
	if (brtvd != NULL && brtvd->bv_initiated) {
		if (!avl_is_empty(&brtvd->bv_tree) ||
		    brt_vdev_lookup(brt, brtvd, &bre_search)) {
			mayexists = TRUE;
		}
	}

	brt_unlock(brt);

	return (mayexists);
}

uint64_t
brt_get_dspace(spa_t *spa)
{
	brt_t *brt = spa->spa_brt;

	if (brt == NULL)
		return (0);

	return (brt->brt_savedspace);
}

uint64_t
brt_get_used(spa_t *spa)
{
	brt_t *brt = spa->spa_brt;

	if (brt == NULL)
		return (0);

	return (brt->brt_usedspace);
}

uint64_t
brt_get_saved(spa_t *spa)
{
	brt_t *brt = spa->spa_brt;

	if (brt == NULL)
		return (0);

	return (brt->brt_savedspace);
}
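
/*
 * Ratio of saved-plus-used space to used space, in hundredths; e.g. 1GB
 * used by cloned blocks and 3GB saved gives (1 + 3) * 100 / 1 = 400,
 * i.e. 4.00x. With no cloned blocks the ratio is 100 (1.00x).
 */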
uint64_t
brt_get_ratio(spa_t *spa)
{
	brt_t *brt = spa->spa_brt;

	if (brt->brt_usedspace == 0)
		return (100);

	return ((brt->brt_usedspace + brt->brt_savedspace) * 100 /
	    brt->brt_usedspace);
}

static int
brt_kstats_update(kstat_t *ksp, int rw)
{
	brt_stats_t *bs = ksp->ks_data;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	bs->brt_addref_entry_in_memory.value.ui64 =
	    wmsum_value(&brt_sums.brt_addref_entry_in_memory);
	bs->brt_addref_entry_not_on_disk.value.ui64 =
	    wmsum_value(&brt_sums.brt_addref_entry_not_on_disk);
	bs->brt_addref_entry_on_disk.value.ui64 =
	    wmsum_value(&brt_sums.brt_addref_entry_on_disk);
	bs->brt_addref_entry_read_lost_race.value.ui64 =
	    wmsum_value(&brt_sums.brt_addref_entry_read_lost_race);
	bs->brt_decref_entry_in_memory.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_entry_in_memory);
	bs->brt_decref_entry_loaded_from_disk.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_entry_loaded_from_disk);
	bs->brt_decref_entry_not_in_memory.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_entry_not_in_memory);
	bs->brt_decref_entry_not_on_disk.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_entry_not_on_disk);
	bs->brt_decref_entry_read_lost_race.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_entry_read_lost_race);
	bs->brt_decref_entry_still_referenced.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_entry_still_referenced);
	bs->brt_decref_free_data_later.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_free_data_later);
	bs->brt_decref_free_data_now.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_free_data_now);
	bs->brt_decref_no_entry.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_no_entry);

	return (0);
}

static void
brt_stat_init(void)
{

	wmsum_init(&brt_sums.brt_addref_entry_in_memory, 0);
	wmsum_init(&brt_sums.brt_addref_entry_not_on_disk, 0);
	wmsum_init(&brt_sums.brt_addref_entry_on_disk, 0);
	wmsum_init(&brt_sums.brt_addref_entry_read_lost_race, 0);
	wmsum_init(&brt_sums.brt_decref_entry_in_memory, 0);
	wmsum_init(&brt_sums.brt_decref_entry_loaded_from_disk, 0);
	wmsum_init(&brt_sums.brt_decref_entry_not_in_memory, 0);
	wmsum_init(&brt_sums.brt_decref_entry_not_on_disk, 0);
	wmsum_init(&brt_sums.brt_decref_entry_read_lost_race, 0);
	wmsum_init(&brt_sums.brt_decref_entry_still_referenced, 0);
	wmsum_init(&brt_sums.brt_decref_free_data_later, 0);
	wmsum_init(&brt_sums.brt_decref_free_data_now, 0);
	wmsum_init(&brt_sums.brt_decref_no_entry, 0);

	brt_ksp = kstat_create("zfs", 0, "brtstats", "misc", KSTAT_TYPE_NAMED,
	    sizeof (brt_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
	if (brt_ksp != NULL) {
		brt_ksp->ks_data = &brt_stats;
		brt_ksp->ks_update = brt_kstats_update;
		kstat_install(brt_ksp);
	}
}

static void
brt_stat_fini(void)
{
	if (brt_ksp != NULL) {
		kstat_delete(brt_ksp);
		brt_ksp = NULL;
	}

	wmsum_fini(&brt_sums.brt_addref_entry_in_memory);
	wmsum_fini(&brt_sums.brt_addref_entry_not_on_disk);
	wmsum_fini(&brt_sums.brt_addref_entry_on_disk);
	wmsum_fini(&brt_sums.brt_addref_entry_read_lost_race);
	wmsum_fini(&brt_sums.brt_decref_entry_in_memory);
	wmsum_fini(&brt_sums.brt_decref_entry_loaded_from_disk);
	wmsum_fini(&brt_sums.brt_decref_entry_not_in_memory);
	wmsum_fini(&brt_sums.brt_decref_entry_not_on_disk);
	wmsum_fini(&brt_sums.brt_decref_entry_read_lost_race);
	wmsum_fini(&brt_sums.brt_decref_entry_still_referenced);
	wmsum_fini(&brt_sums.brt_decref_free_data_later);
	wmsum_fini(&brt_sums.brt_decref_free_data_now);
	wmsum_fini(&brt_sums.brt_decref_no_entry);
}

void
brt_init(void)
{
	brt_entry_cache = kmem_cache_create("brt_entry_cache",
	    sizeof (brt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	brt_pending_entry_cache = kmem_cache_create("brt_pending_entry_cache",
	    sizeof (brt_pending_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);

	brt_stat_init();
}

void
brt_fini(void)
{
	brt_stat_fini();

	kmem_cache_destroy(brt_entry_cache);
	kmem_cache_destroy(brt_pending_entry_cache);
}

static brt_entry_t *
brt_entry_alloc(const brt_entry_t *bre_init)
{
	brt_entry_t *bre;

	bre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP);
	bre->bre_offset = bre_init->bre_offset;
	bre->bre_refcount = bre_init->bre_refcount;

	return (bre);
}

static void
brt_entry_free(brt_entry_t *bre)
{

	kmem_cache_free(brt_entry_cache, bre);
}

static void
brt_entry_addref(brt_t *brt, const blkptr_t *bp)
{
	brt_vdev_t *brtvd;
	brt_entry_t *bre, *racebre;
	brt_entry_t bre_search;
	avl_index_t where;
	uint64_t vdevid;
	int error;

	ASSERT(!RW_WRITE_HELD(&brt->brt_lock));

	brt_entry_fill(bp, &bre_search, &vdevid);

	brt_wlock(brt);

	brtvd = brt_vdev(brt, vdevid);
	if (brtvd == NULL) {
		ASSERT3U(vdevid, >=, brt->brt_nvdevs);

		/* New VDEV was added. */
		brt_vdevs_expand(brt, vdevid + 1);
		brtvd = brt_vdev(brt, vdevid);
	}
	ASSERT(brtvd != NULL);
	if (!brtvd->bv_initiated)
		brt_vdev_realloc(brt, brtvd);

	bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
	if (bre != NULL) {
		BRTSTAT_BUMP(brt_addref_entry_in_memory);
	} else {
		/*
		 * brt_entry_lookup() may drop the BRT (read) lock and
		 * reacquire it (write).
		 */
		error = brt_entry_lookup(brt, brtvd, &bre_search);
		/* bre_search now contains correct bre_refcount */
		ASSERT(error == 0 || error == ENOENT);
		if (error == 0)
			BRTSTAT_BUMP(brt_addref_entry_on_disk);
		else
			BRTSTAT_BUMP(brt_addref_entry_not_on_disk);
		/*
		 * When the BRT lock was dropped, brt_vdevs[] may have been
		 * expanded and reallocated, we need to update brtvd's pointer.
		 */
		brtvd = brt_vdev(brt, vdevid);
		ASSERT(brtvd != NULL);

		racebre = avl_find(&brtvd->bv_tree, &bre_search, &where);
		if (racebre == NULL) {
			bre = brt_entry_alloc(&bre_search);
			ASSERT(RW_WRITE_HELD(&brt->brt_lock));
			avl_insert(&brtvd->bv_tree, bre, where);
			brt->brt_nentries++;
		} else {
			/*
			 * The entry was added when the BRT lock was dropped in
			 * brt_entry_lookup().
			 */
			BRTSTAT_BUMP(brt_addref_entry_read_lost_race);
			bre = racebre;
		}
	}
	bre->bre_refcount++;
	brt_vdev_addref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp));

	brt_unlock(brt);
}

/* Return TRUE if block should be freed immediately. */
boolean_t
brt_entry_decref(spa_t *spa, const blkptr_t *bp)
{
	brt_t *brt = spa->spa_brt;
	brt_vdev_t *brtvd;
	brt_entry_t *bre, *racebre;
	brt_entry_t bre_search;
	avl_index_t where;
	uint64_t vdevid;
	int error;

	brt_entry_fill(bp, &bre_search, &vdevid);

	brt_wlock(brt);

	brtvd = brt_vdev(brt, vdevid);
	ASSERT(brtvd != NULL);

	bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
	if (bre != NULL) {
		BRTSTAT_BUMP(brt_decref_entry_in_memory);
		goto out;
	} else {
		BRTSTAT_BUMP(brt_decref_entry_not_in_memory);
	}

	/*
	 * brt_entry_lookup() may drop the BRT lock and reacquire it.
	 */
	error = brt_entry_lookup(brt, brtvd, &bre_search);
	/* bre_search now contains correct bre_refcount */
	ASSERT(error == 0 || error == ENOENT);
	/*
	 * When the BRT lock was dropped, brt_vdevs[] may have been expanded
	 * and reallocated, we need to update brtvd's pointer.
	 */
	brtvd = brt_vdev(brt, vdevid);
	ASSERT(brtvd != NULL);

	if (error == ENOENT) {
		BRTSTAT_BUMP(brt_decref_entry_not_on_disk);
		bre = NULL;
		goto out;
	}

	racebre = avl_find(&brtvd->bv_tree, &bre_search, &where);
	if (racebre != NULL) {
		/*
		 * The entry was added when the BRT lock was dropped in
		 * brt_entry_lookup().
		 */
		BRTSTAT_BUMP(brt_decref_entry_read_lost_race);
		bre = racebre;
		goto out;
	}

	BRTSTAT_BUMP(brt_decref_entry_loaded_from_disk);
	bre = brt_entry_alloc(&bre_search);
	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
	avl_insert(&brtvd->bv_tree, bre, where);
	brt->brt_nentries++;

out:
	if (bre == NULL) {
		/*
		 * This is a free of a regular (not cloned) block.
		 */
		brt_unlock(brt);
		BRTSTAT_BUMP(brt_decref_no_entry);
		return (B_TRUE);
	}
	if (bre->bre_refcount == 0) {
		brt_unlock(brt);
		BRTSTAT_BUMP(brt_decref_free_data_now);
		return (B_TRUE);
	}

	ASSERT(bre->bre_refcount > 0);
	bre->bre_refcount--;
	if (bre->bre_refcount == 0)
		BRTSTAT_BUMP(brt_decref_free_data_later);
	else
		BRTSTAT_BUMP(brt_decref_entry_still_referenced);
	brt_vdev_decref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp));

	brt_unlock(brt);

	return (B_FALSE);
}

static void
brt_prefetch(brt_t *brt, const blkptr_t *bp)
{
	brt_entry_t bre;
	uint64_t vdevid;

	ASSERT(bp != NULL);

	if (!zfs_brt_prefetch)
		return;

	brt_entry_fill(bp, &bre, &vdevid);

	brt_entry_prefetch(brt, vdevid, &bre);
}

static int
brt_pending_entry_compare(const void *x1, const void *x2)
{
	const brt_pending_entry_t *bpe1 = x1, *bpe2 = x2;
	const blkptr_t *bp1 = &bpe1->bpe_bp, *bp2 = &bpe2->bpe_bp;
	int cmp;

	cmp = TREE_CMP(BP_PHYSICAL_BIRTH(bp1), BP_PHYSICAL_BIRTH(bp2));
	if (cmp == 0) {
		cmp = TREE_CMP(DVA_GET_VDEV(&bp1->blk_dva[0]),
		    DVA_GET_VDEV(&bp2->blk_dva[0]));
		if (cmp == 0) {
			cmp = TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]),
			    DVA_GET_OFFSET(&bp2->blk_dva[0]));
		}
	}

	return (cmp);
}

void
brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
{
	brt_t *brt;
	avl_tree_t *pending_tree;
	kmutex_t *pending_lock;
	brt_pending_entry_t *bpe, *newbpe;
	avl_index_t where;
	uint64_t txg;

	brt = spa->spa_brt;
	txg = dmu_tx_get_txg(tx);
	ASSERT3U(txg, !=, 0);
	pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
	pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];

	newbpe = kmem_cache_alloc(brt_pending_entry_cache, KM_SLEEP);
	newbpe->bpe_bp = *bp;
	newbpe->bpe_count = 1;

	mutex_enter(pending_lock);

	bpe = avl_find(pending_tree, newbpe, &where);
	if (bpe == NULL) {
		avl_insert(pending_tree, newbpe, where);
		newbpe = NULL;
	} else {
		bpe->bpe_count++;
	}

	mutex_exit(pending_lock);

	if (newbpe != NULL) {
		ASSERT(bpe != NULL);
		ASSERT(bpe != newbpe);
		kmem_cache_free(brt_pending_entry_cache, newbpe);
	} else {
		ASSERT(bpe == NULL);
	}

	/* Prefetch BRT entry, as we will need it in the syncing context. */
	brt_prefetch(brt, bp);
}

void
brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
{
	brt_t *brt;
	avl_tree_t *pending_tree;
	kmutex_t *pending_lock;
	brt_pending_entry_t *bpe, bpe_search;
	uint64_t txg;

	brt = spa->spa_brt;
	txg = dmu_tx_get_txg(tx);
	ASSERT3U(txg, !=, 0);
	pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
	pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];

	bpe_search.bpe_bp = *bp;

	mutex_enter(pending_lock);

	bpe = avl_find(pending_tree, &bpe_search, NULL);
	/* I believe we should always find bpe when this function is called. */
	if (bpe != NULL) {
		ASSERT(bpe->bpe_count > 0);

		bpe->bpe_count--;
		if (bpe->bpe_count == 0) {
			avl_remove(pending_tree, bpe);
			kmem_cache_free(brt_pending_entry_cache, bpe);
		}
	}

	mutex_exit(pending_lock);
}

void
brt_pending_apply(spa_t *spa, uint64_t txg)
{
	brt_t *brt;
	brt_pending_entry_t *bpe;
	avl_tree_t *pending_tree;
	kmutex_t *pending_lock;
	void *c;

	ASSERT3U(txg, !=, 0);

	brt = spa->spa_brt;
	pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
	pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];

	mutex_enter(pending_lock);

	c = NULL;
	while ((bpe = avl_destroy_nodes(pending_tree, &c)) != NULL) {
		boolean_t added_to_ddt;

		mutex_exit(pending_lock);

		for (int i = 0; i < bpe->bpe_count; i++) {
			/*
			 * If the block has the D (dedup) bit set, it means
			 * that it already exists in the DEDUP table, so we can
			 * just use that instead of creating a new entry in
			 * the BRT table.
			 */
			if (BP_GET_DEDUP(&bpe->bpe_bp)) {
				added_to_ddt = ddt_addref(spa, &bpe->bpe_bp);
			} else {
				added_to_ddt = B_FALSE;
			}
			if (!added_to_ddt)
				brt_entry_addref(brt, &bpe->bpe_bp);
		}

		kmem_cache_free(brt_pending_entry_cache, bpe);
		mutex_enter(pending_lock);
	}

	mutex_exit(pending_lock);
}

static void
brt_sync_entry(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
{

	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
	ASSERT(brtvd->bv_mos_entries != 0);

	if (bre->bre_refcount == 0) {
		int error;

		error = brt_entry_remove(brt, brtvd, bre, tx);
		ASSERT(error == 0 || error == ENOENT);
		/*
		 * If error == ENOENT then zfs_clone_range() was done from a
		 * removed (but opened) file (open(), unlink()).
		 */
		ASSERT(brt_entry_lookup(brt, brtvd, bre) == ENOENT);
	} else {
		VERIFY0(brt_entry_update(brt, brtvd, bre, tx));
	}
}

static void
brt_sync_table(brt_t *brt, dmu_tx_t *tx)
{
	brt_vdev_t *brtvd;
	brt_entry_t *bre;
	uint64_t vdevid;
	void *c;

	brt_wlock(brt);

	for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
		brtvd = &brt->brt_vdevs[vdevid];

		if (!brtvd->bv_initiated)
			continue;

		if (!brtvd->bv_meta_dirty) {
			ASSERT(!brtvd->bv_entcount_dirty);
			ASSERT0(avl_numnodes(&brtvd->bv_tree));
			continue;
		}

		ASSERT(!brtvd->bv_entcount_dirty ||
		    avl_numnodes(&brtvd->bv_tree) != 0);

		if (brtvd->bv_mos_brtvdev == 0)
			brt_vdev_create(brt, brtvd, tx);

		c = NULL;
		while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) {
			brt_sync_entry(brt, brtvd, bre, tx);
			brt_entry_free(bre);
			ASSERT(brt->brt_nentries > 0);
			brt->brt_nentries--;
		}

		brt_vdev_sync(brt, brtvd, tx);

		if (brtvd->bv_totalcount == 0)
			brt_vdev_destroy(brt, brtvd, tx);
	}

	ASSERT0(brt->brt_nentries);

	brt_unlock(brt);
}

void
brt_sync(spa_t *spa, uint64_t txg)
{
	dmu_tx_t *tx;
	brt_t *brt;

	ASSERT(spa_syncing_txg(spa) == txg);

	brt = spa->spa_brt;
	brt_rlock(brt);
	if (brt->brt_nentries == 0) {
		/* No changes. */
		brt_unlock(brt);
		return;
	}
	brt_unlock(brt);

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);

	brt_sync_table(brt, tx);

	dmu_tx_commit(tx);
}

static void
brt_table_alloc(brt_t *brt)
{

	for (int i = 0; i < TXG_SIZE; i++) {
		avl_create(&brt->brt_pending_tree[i],
		    brt_pending_entry_compare,
		    sizeof (brt_pending_entry_t),
		    offsetof(brt_pending_entry_t, bpe_node));
		mutex_init(&brt->brt_pending_lock[i], NULL, MUTEX_DEFAULT,
		    NULL);
	}
}

static void
brt_table_free(brt_t *brt)
{

	for (int i = 0; i < TXG_SIZE; i++) {
		ASSERT(avl_is_empty(&brt->brt_pending_tree[i]));

		avl_destroy(&brt->brt_pending_tree[i]);
		mutex_destroy(&brt->brt_pending_lock[i]);
	}
}

static void
brt_alloc(spa_t *spa)
{
	brt_t *brt;

	ASSERT(spa->spa_brt == NULL);

	brt = kmem_zalloc(sizeof (*brt), KM_SLEEP);
	rw_init(&brt->brt_lock, NULL, RW_DEFAULT, NULL);
	brt->brt_spa = spa;
	brt->brt_rangesize = 0;
	brt->brt_nentries = 0;
	brt->brt_vdevs = NULL;
	brt->brt_nvdevs = 0;
	brt_table_alloc(brt);

	spa->spa_brt = brt;
}

void
brt_create(spa_t *spa)
{

	brt_alloc(spa);
	brt_vdevs_alloc(spa->spa_brt, B_FALSE);
}

int
brt_load(spa_t *spa)
{

	brt_alloc(spa);
	brt_vdevs_alloc(spa->spa_brt, B_TRUE);

	return (0);
}

void
brt_unload(spa_t *spa)
{
	brt_t *brt = spa->spa_brt;

	if (brt == NULL)
		return;

	brt_vdevs_free(brt);
	brt_table_free(brt);
	rw_destroy(&brt->brt_lock);
	kmem_free(brt, sizeof (*brt));
	spa->spa_brt = NULL;
}

/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, prefetch, INT, ZMOD_RW,
	"Enable prefetching of BRT entries");
#ifdef ZFS_BRT_DEBUG
ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, debug, INT, ZMOD_RW, "BRT debug");
#endif
/* END CSTYLED */