/*
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 */

/*
 * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek
 */
#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/bitmap.h>
#include <sys/dmu_tx.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_scan.h>
#include <sys/vdev_impl.h>
#include <sys/kstat.h>
#include <sys/wmsum.h>
/*
 * Block Cloning design.
 *
 * Block Cloning allows a file (or a subset of its blocks) to be manually
 * cloned into another (or the same) file by just creating additional
 * references to the data blocks, without copying the data itself. Those
 * references are kept in the Block Reference Tables (BRTs).
 *
 * In many ways this is similar to the existing deduplication, but there are
 * some important differences:
 *
 * - Deduplication is automatic and Block Cloning is not - one has to use a
 *   dedicated system call(s) to clone the given file/blocks.
 * - Deduplication keeps all data blocks in its table, even those referenced
 *   just once. Block Cloning creates an entry in its tables only when there
 *   are at least two references to the given data block. If the block was
 *   never explicitly cloned or the second to last reference was dropped,
 *   there will be neither space nor performance overhead.
 * - Deduplication needs data to work - one needs to pass real data to the
 *   write(2) syscall, so a hash can be calculated. Block Cloning doesn't
 *   require data, just block pointers to the data, so it is extremely fast,
 *   as we pay neither the cost of reading the data, nor the cost of writing
 *   the data - we operate exclusively on metadata.
 * - If the D (dedup) bit is not set in the block pointer, it means that
 *   the block is not in the dedup table (DDT) and we won't consult the DDT
 *   when we need to free the block. Block Cloning must be consulted on every
 *   free, because we cannot modify the source BP (eg. by setting something
 *   similar to the D bit), thus we have no hint whether the block is in the
 *   Block Reference Table (BRT), so we need to look into the BRT. There is
 *   an optimization in place that allows us to eliminate the majority of BRT
 *   lookups, which is described below in the "Minimizing free penalty"
 *   section.
 * - The BRT entry is much smaller than the DDT entry - for BRT we only store
 *   a 64bit offset and a 64bit reference counter.
 * - Dedup keys are cryptographic hashes, so two blocks that are close to
 *   each other on disk are most likely in totally different parts of the
 *   DDT. The BRT entry keys are offsets into a single top-level VDEV, so
 *   data blocks from one file should have BRT entries close to each other.
 * - Scrub will only do a single pass over a block that is referenced
 *   multiple times in the DDT. Unfortunately this is not currently (if at
 *   all) possible with Block Cloning, so a block referenced multiple times
 *   will be scrubbed multiple times. The new, sorted scrub should be able
 *   to eliminate duplicated reads given enough memory.
 * - Deduplication requires a cryptographically strong hash as a checksum or
 *   additional data verification. Block Cloning works with any checksum
 *   algorithm or even with checksumming disabled.
 *
 * As mentioned above, the BRT entries are much smaller than the DDT entries.
 * To uniquely identify a block we just need its vdev id and offset. We also
 * need to maintain a reference counter. The vdev id will often repeat, as
 * there is a small number of top-level VDEVs and a large number of blocks
 * stored in each VDEV. We take advantage of that to reduce the BRT entry
 * size further by maintaining one BRT for each top-level VDEV, so we can
 * then have only the offset and the counter as the BRT entry.
 *
 * Minimizing free penalty.
 *
 * Block Cloning allows creating additional references to any existing
 * block. When we free a block there is no hint in the block pointer whether
 * the block was cloned or not, so on each free we have to check if there is
 * a corresponding entry in the BRT or not. If there is, we need to decrease
 * the reference counter. Doing a BRT lookup on every free can potentially
 * be expensive by requiring additional I/Os if the BRT doesn't fit into
 * memory. This is the main problem with deduplication, so we've learned our
 * lesson and try not to repeat the same mistake here. How do we do that? We
 * divide each top-level VDEV into 16MB regions. For each region we maintain
 * a counter that is a sum of all the BRT entries that have offsets within
 * the region. This creates an entries count array of 16bit numbers for each
 * top-level VDEV. The entries count array is always kept in memory and
 * updated on disk in the same transaction group as the BRT updates, to keep
 * everything in sync. We can keep the array in memory, because it is very
 * small. With 16MB regions and a 1TB VDEV the array requires only 128kB of
 * memory (we may decide to decrease the region size even further in the
 * future). Now, when we want to free a block, we first consult the array.
 * If the counter for the whole region is zero, there is no need to look for
 * the BRT entry, as there isn't one for sure. If the counter for the region
 * is greater than zero, only then we will do a BRT lookup and if an entry
 * is found we will decrease the reference counter in the BRT entry and in
 * the entry counters array.
 *
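 * An illustrative sketch of the fast path (this is what brt_vdev_lookup()
 * below implements, using names from this file):
 *
 *	idx = bre->bre_offset / brt->brt_rangesize;
 *	if (brt_vdev_entcount_get(brtvd, idx) == 0)
 *		return (FALSE);
 *
 * Only when the region counter is greater than zero do we pay for the
 * on-disk BRT (ZAP) lookup.
 *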
 * The entry counters array is small, but can potentially be larger for very
 * large VDEVs or smaller regions. In this case we don't want to rewrite the
 * entire array on every change. We then divide the array into 32kB blocks
 * and keep a bitmap of dirty blocks within a transaction group. When we
 * sync the transaction group we can update only the parts of the entry
 * counters array that were modified. Note: Keeping track of the dirty parts
 * of the entry counters array is implemented, but updating only parts of
 * the array on disk is not yet implemented - for now we will update the
 * entire array if there was any change.
 *
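 * For illustration, this is how brt_vdev_addref() below marks the part of
 * the entry counters array that covers a just-updated entcount as dirty:
 *
 *	brt_vdev_entcount_inc(brtvd, idx);
 *	brtvd->bv_entcount_dirty = TRUE;
 *	idx = idx / BRT_BLOCKSIZE / 8;
 *	BT_SET(brtvd->bv_bitmap, idx);
 *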
 * The implementation tries to be economical: if the BRT is not used, or is
 * no longer used, there will be no entries in the MOS and no additional
 * memory used (eg. the entry counters array is only allocated if needed).
 *
 * Interaction between Deduplication and Block Cloning.
 *
 * If both functionalities are in use, we could end up with a block that is
 * referenced multiple times in both DDT and BRT. When we free one of the
 * references we couldn't tell where it belongs, so we would have to decide
 * which table takes precedence: do we first clear DDT references or BRT
 * references? To avoid this dilemma BRT cooperates with DDT - if a given
 * block is being cloned using BRT and the BP has the D (dedup) bit set, BRT
 * will look up the DDT entry instead and increase the counter there. No BRT
 * entry will be created for a block which has the D (dedup) bit set.
 * BRT may be more efficient for manual deduplication, but if the block is
 * already in the DDT, then creating an additional BRT entry would be less
 * efficient. This clever idea was proposed by Allan Jude.
 *
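 * In code, the decision reduces to the check performed for every pending
 * entry by brt_pending_apply() below:
 *
 *	if (BP_GET_DEDUP(&bpe->bpe_bp))
 *		added_to_ddt = ddt_addref(spa, &bpe->bpe_bp);
 *	else
 *		added_to_ddt = B_FALSE;
 *	if (!added_to_ddt)
 *		brt_entry_addref(brt, &bpe->bpe_bp);
 *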
 * Block Cloning across datasets.
 *
 * Block Cloning is not limited to cloning blocks within the same dataset.
 * It is possible (and very useful) to clone blocks between different
 * datasets.
 * One use case is recovering files from snapshots. By cloning the files
 * into the dataset we need no additional storage. Without Block Cloning we
 * would need additional space for those files.
 * Another interesting use case is moving files between datasets (copying
 * the file content to the new dataset and removing the source file). In
 * that case Block Cloning will only be used briefly, because the BRT
 * entries will be removed when the source is removed.
 * Note: currently it is not possible to clone blocks between encrypted
 * datasets, even if those datasets use the same encryption key (this
 * includes snapshots of encrypted datasets). Cloning blocks between
 * datasets that use the same keys should be possible and should be
 * implemented in the future.
 *
 * Block Cloning flow through ZFS layers.
 *
 * Note: Block Cloning can be used both for cloning file system blocks and
 * ZVOL blocks. As of this writing no interface is implemented that allows
 * for block cloning within a ZVOL.
 * FreeBSD and Linux provide the copy_file_range(2) system call and we will
 * use it for block cloning.
 *
 *	ssize_t
 *	copy_file_range(int infd, off_t *inoffp, int outfd, off_t *outoffp,
 *	    size_t len, unsigned int flags);
 *
 * Even though offsets and length represent bytes, they have to be
 * block-aligned or we will return the EXDEV error so the upper layer can
 * fall back to the generic mechanism that will just copy the data.
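 * For illustration, a userland caller might clone a whole file with a
 * single call like this (a sketch, not a complete program; error handling
 * omitted and offsets assumed to be block-aligned):
 *
 *	off_t inoff = 0, outoff = 0;
 *	ssize_t n = copy_file_range(infd, &inoff, outfd, &outoff, len, 0);
 *
 * If the request is not block-aligned, the upper layer's generic fallback
 * copies the data instead, so the caller still succeeds, just without
 * cloning.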
 * Using copy_file_range(2) will call the OS-independent zfs_clone_range()
 * function. This function was implemented based on zfs_write(), but instead
 * of writing the given data we first read block pointers using the new
 * dmu_read_l0_bps() function from the source file. Once we have BPs from
 * the source file we call the dmu_brt_clone() function on the destination
 * file. This function allocates BPs for us. We iterate over all source BPs.
 * If the given BP is a hole or an embedded block, we just copy the BP
 * as-is. If it points to real data we place this BP on a BRT pending list
 * using the brt_pending_add() function.
 *
 * We use this pending list to keep track of all BPs that got new references
 * within this transaction group.
 *
 * Some special cases to consider and how we address them:
 * - The block we want to clone may have been created within the same
 *   transaction group that we are trying to clone. Such a block has no BP
 *   allocated yet, so it cannot be immediately cloned. We return EXDEV.
 * - The block we want to clone may have been modified within the same
 *   transaction group. We return EXDEV.
 * - A block may be cloned multiple times during one transaction group
 *   (that's why the pending list is actually a tree and not an append-only
 *   list - this way we can figure out faster if this block is cloned for
 *   the first time in this txg or a consecutive time).
 * - A block may be cloned and freed within the same transaction group
 *   (see dbuf_undirty()).
 * - A block may be cloned and within the same transaction group the clone
 *   can be cloned again (see dmu_read_l0_bps()).
 * - A file might have been deleted, but the caller still has a file
 *   descriptor open to this file and clones it.
 *
 * When we free a block we have an additional step in the ZIO pipeline where
 * we call the zio_brt_free() function. We then call the brt_entry_decref()
 * that loads the corresponding BRT entry (if one exists) and decreases the
 * reference counter. If this is not the last reference we will stop the ZIO
 * pipeline here. If this is the last reference or the block is not in the
 * BRT, we continue the pipeline and free the block as usual.
 *
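 * An illustrative sketch of that decision (brt_maybe_exists() and
 * brt_entry_decref() are defined in this file; free_block() stands in for
 * the rest of the free pipeline):
 *
 *	if (!brt_maybe_exists(spa, bp) || brt_entry_decref(spa, bp))
 *		free_block(bp);
 *
 * brt_maybe_exists() is the cheap in-memory filter described above;
 * brt_entry_decref() returns TRUE when the block should be freed
 * immediately (no entry, or the last reference was just dropped).
 *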
 * At the beginning of spa_sync(), when there can be no more block cloning
 * but before issuing frees, we call brt_pending_apply(). This function
 * applies all the new clones to the BRT table - we load BRT entries and
 * update reference counters. To sync new BRT entries to disk, we use the
 * brt_sync() function. This function will sync all dirty
 * per-top-level-vdev BRTs, the entry counters arrays, etc.
 *
 * Block Cloning and ZIL.
 *
 * Every clone operation is divided into chunks (similar to write) and each
 * chunk is cloned in a separate transaction. The chunk size is determined
 * by how many BPs we can fit into a single ZIL entry.
 * Replaying a clone operation is different from a regular clone operation,
 * as when we log clone operations we cannot use the source object - it may
 * reside on a different dataset, so we log the BPs we want to clone.
 * The ZIL is replayed when we mount the given dataset, not when the pool is
 * imported. Taking this into account it is possible that the pool is
 * imported without mounting datasets and the source dataset is destroyed
 * before the destination dataset is mounted and its ZIL replayed.
 * To address this situation we leverage the zil_claim() mechanism where ZFS
 * will parse all the ZILs on pool import. When we come across
 * TX_CLONE_RANGE entries, we will bump reference counters for their BPs in
 * the BRT. Then on mount and ZIL replay we will just attach BPs to the file
 * without bumping reference counters.
 * Note it is still possible that after zil_claim() we never mount the
 * destination, so we never replay its ZIL and just destroy it. That way we
 * would end up with leaked references in the BRT. We address that too, as
 * ZFS gives us a chance to clean this up on dataset destroy (see
 * zil_free_clone_range()).
 */
/*
 * BRT - Block Reference Table.
 */
#define	BRT_OBJECT_VDEV_PREFIX	"com.fudosecurity:brt:vdev:"
/*
 * We divide each VDEV into 16MB chunks. Each chunk is represented in memory
 * by a 16bit counter, thus a 1TB VDEV requires 128kB of memory:
 * (1TB / 16MB) * 2B. Each element in this array represents how many BRT
 * entries we have in this chunk of storage. We always load this entire
 * array into memory and update it as needed. By having it in memory we can
 * quickly tell (during zio_free()) if there are any BRT entries that we
 * might need to update.
 *
 * This value cannot be larger than 16MB, at least as long as we support
 * 512 byte block sizes. With 512 byte block size we can have exactly
 * 32768 blocks in 16MB. In 32MB we could have 65536 blocks, which is one
 * too many for a 16bit counter.
 */
#define	BRT_RANGESIZE	(16 * 1024 * 1024)
_Static_assert(BRT_RANGESIZE / SPA_MINBLOCKSIZE <= UINT16_MAX,
	"BRT_RANGESIZE is too large.");
/*
 * We don't want to update the whole structure every time. Maintain a bitmap
 * of dirty blocks within the regions, so that a single bit represents a
 * block size of entcounts. For example if we have a 1PB vdev then all
 * entcounts take 128MB of memory ((1PB / 16MB) * 2B). We can divide this
 * 128MB array of entcounts into 32kB disk blocks, as we don't want to
 * update the whole 128MB on disk when we have updated only a single
 * entcount.
 * We maintain a bitmap where each 32kB disk block within the 128MB
 * entcounts array is represented by a single bit. This gives us 4096 bits.
 * A set bit in the bitmap means that we had a change in at least one of the
 * 16384 entcounts that reside on a 32kB disk block
 * (32kB / sizeof (uint16_t)).
 */
#define	BRT_BLOCKSIZE	(32 * 1024)
#define	BRT_RANGESIZE_TO_NBLOCKS(size)	\
	(((size) - 1) / BRT_BLOCKSIZE / sizeof (uint16_t) + 1)
#define	BRT_LITTLE_ENDIAN	0
#define	BRT_BIG_ENDIAN		1
#ifdef _ZFS_LITTLE_ENDIAN
#define	BRT_NATIVE_BYTEORDER		BRT_LITTLE_ENDIAN
#define	BRT_NON_NATIVE_BYTEORDER	BRT_BIG_ENDIAN
#else
#define	BRT_NATIVE_BYTEORDER		BRT_BIG_ENDIAN
#define	BRT_NON_NATIVE_BYTEORDER	BRT_LITTLE_ENDIAN
#endif
typedef struct brt_vdev_phys {
	uint64_t	bvp_mos_entries;
	uint64_t	bvp_size;
	uint64_t	bvp_byteorder;
	uint64_t	bvp_totalcount;
	uint64_t	bvp_rangesize;
	uint64_t	bvp_usedspace;
	uint64_t	bvp_savedspace;
} brt_vdev_phys_t;
typedef struct brt_vdev {
	/*
	 * VDEV id.
	 */
	uint64_t	bv_vdevid;
	/*
	 * Is the structure initiated?
	 * (bv_entcount and bv_bitmap are allocated?)
	 */
	boolean_t	bv_initiated;
	/*
	 * Object number in the MOS for the entcount array and brt_vdev_phys.
	 */
	uint64_t	bv_mos_brtvdev;
	/*
	 * Object number in the MOS for the entries table.
	 */
	uint64_t	bv_mos_entries;
	/*
	 * Entries to sync.
	 */
	avl_tree_t	bv_tree;
	/*
	 * Does the bv_entcount[] array need byte swapping?
	 */
	boolean_t	bv_need_byteswap;
	/*
	 * Number of entries in the bv_entcount[] array.
	 */
	uint64_t	bv_size;
	/*
	 * This is the array with BRT entry count per BRT_RANGESIZE.
	 */
	uint16_t	*bv_entcount;
	/*
	 * Sum of all bv_entcount[]s.
	 */
	uint64_t	bv_totalcount;
	/*
	 * Space on disk occupied by cloned blocks (without compression).
	 */
	uint64_t	bv_usedspace;
	/*
	 * How much additional space would be occupied without block cloning.
	 */
	uint64_t	bv_savedspace;
	/*
	 * brt_vdev_phys needs updating on disk.
	 */
	boolean_t	bv_meta_dirty;
	/*
	 * bv_entcount[] needs updating on disk.
	 */
	boolean_t	bv_entcount_dirty;
	/*
	 * bv_entcount[] potentially can be a bit too big to synchronize it
	 * all when we just changed a few entcounts. The fields below allow
	 * us to track updates to the bv_entcount[] array since the last
	 * sync. A single bit in bv_bitmap represents as many entcounts as
	 * can fit into a single BRT_BLOCKSIZE.
	 * For example, if we have 65536 entcounts in the bv_entcount array
	 * (so the whole array is 128kB) and we updated bv_entcount[2] and
	 * bv_entcount[5], then only the first bit in bv_bitmap will be set
	 * and we will write only the first BRT_BLOCKSIZE out of 128kB.
	 */
	ulong_t		*bv_bitmap;
	uint64_t	bv_nblocks;
} brt_vdev_t;
typedef struct brt {
	krwlock_t	brt_lock;
	spa_t		*brt_spa;
#define	brt_mos		brt_spa->spa_meta_objset
	uint64_t	brt_rangesize;
	uint64_t	brt_usedspace;
	uint64_t	brt_savedspace;
	avl_tree_t	brt_pending_tree[TXG_SIZE];
	kmutex_t	brt_pending_lock[TXG_SIZE];
	/* Sum of all entries across all bv_trees. */
	uint64_t	brt_nentries;
	brt_vdev_t	*brt_vdevs;
	uint64_t	brt_nvdevs;
} brt_t;
/* Size of bre_offset / sizeof (uint64_t). */
#define	BRT_KEY_WORDS	(1)

/*
 * On-disk we use bre_offset as the key and bre_refcount as the value.
 */
typedef struct brt_entry {
	uint64_t	bre_offset;
	uint64_t	bre_refcount;
	avl_node_t	bre_node;
} brt_entry_t;

typedef struct brt_pending_entry {
	blkptr_t	bpe_bp;
	int		bpe_count;
	avl_node_t	bpe_node;
} brt_pending_entry_t;
static kmem_cache_t *brt_entry_cache;
static kmem_cache_t *brt_pending_entry_cache;

/*
 * Enable/disable prefetching of BRT entries that we are going to modify.
 */
int zfs_brt_prefetch = 1;
#ifdef ZFS_DEBUG
#define	BRT_DEBUG(...)	do {						\
	if ((zfs_flags & ZFS_DEBUG_BRT) != 0) {				\
		__dprintf(B_TRUE, __FILE__, __func__, __LINE__, __VA_ARGS__); \
	}								\
} while (0)
#else
#define	BRT_DEBUG(...)	do { } while (0)
#endif
int brt_zap_leaf_blockshift = 12;
int brt_zap_indirect_blockshift = 12;

static kstat_t *brt_ksp;
typedef struct brt_stats {
	kstat_named_t brt_addref_entry_in_memory;
	kstat_named_t brt_addref_entry_not_on_disk;
	kstat_named_t brt_addref_entry_on_disk;
	kstat_named_t brt_addref_entry_read_lost_race;
	kstat_named_t brt_decref_entry_in_memory;
	kstat_named_t brt_decref_entry_loaded_from_disk;
	kstat_named_t brt_decref_entry_not_in_memory;
	kstat_named_t brt_decref_entry_not_on_disk;
	kstat_named_t brt_decref_entry_read_lost_race;
	kstat_named_t brt_decref_entry_still_referenced;
	kstat_named_t brt_decref_free_data_later;
	kstat_named_t brt_decref_free_data_now;
	kstat_named_t brt_decref_no_entry;
} brt_stats_t;
static brt_stats_t brt_stats = {
	{ "addref_entry_in_memory",		KSTAT_DATA_UINT64 },
	{ "addref_entry_not_on_disk",		KSTAT_DATA_UINT64 },
	{ "addref_entry_on_disk",		KSTAT_DATA_UINT64 },
	{ "addref_entry_read_lost_race",	KSTAT_DATA_UINT64 },
	{ "decref_entry_in_memory",		KSTAT_DATA_UINT64 },
	{ "decref_entry_loaded_from_disk",	KSTAT_DATA_UINT64 },
	{ "decref_entry_not_in_memory",		KSTAT_DATA_UINT64 },
	{ "decref_entry_not_on_disk",		KSTAT_DATA_UINT64 },
	{ "decref_entry_read_lost_race",	KSTAT_DATA_UINT64 },
	{ "decref_entry_still_referenced",	KSTAT_DATA_UINT64 },
	{ "decref_free_data_later",		KSTAT_DATA_UINT64 },
	{ "decref_free_data_now",		KSTAT_DATA_UINT64 },
	{ "decref_no_entry",			KSTAT_DATA_UINT64 }
};
static struct {
	wmsum_t brt_addref_entry_in_memory;
	wmsum_t brt_addref_entry_not_on_disk;
	wmsum_t brt_addref_entry_on_disk;
	wmsum_t brt_addref_entry_read_lost_race;
	wmsum_t brt_decref_entry_in_memory;
	wmsum_t brt_decref_entry_loaded_from_disk;
	wmsum_t brt_decref_entry_not_in_memory;
	wmsum_t brt_decref_entry_not_on_disk;
	wmsum_t brt_decref_entry_read_lost_race;
	wmsum_t brt_decref_entry_still_referenced;
	wmsum_t brt_decref_free_data_later;
	wmsum_t brt_decref_free_data_now;
	wmsum_t brt_decref_no_entry;
} brt_sums;
#define	BRTSTAT_BUMP(stat)	wmsum_add(&brt_sums.stat, 1)

static int brt_entry_compare(const void *x1, const void *x2);
static int brt_pending_entry_compare(const void *x1, const void *x2);
static void
brt_rlock(brt_t *brt)
{
	rw_enter(&brt->brt_lock, RW_READER);
}

static void
brt_wlock(brt_t *brt)
{
	rw_enter(&brt->brt_lock, RW_WRITER);
}

static void
brt_unlock(brt_t *brt)
{
	rw_exit(&brt->brt_lock);
}
static uint16_t
brt_vdev_entcount_get(const brt_vdev_t *brtvd, uint64_t idx)
{
	ASSERT3U(idx, <, brtvd->bv_size);

	if (brtvd->bv_need_byteswap) {
		return (BSWAP_16(brtvd->bv_entcount[idx]));
	} else {
		return (brtvd->bv_entcount[idx]);
	}
}
static void
brt_vdev_entcount_set(brt_vdev_t *brtvd, uint64_t idx, uint16_t entcnt)
{
	ASSERT3U(idx, <, brtvd->bv_size);

	if (brtvd->bv_need_byteswap) {
		brtvd->bv_entcount[idx] = BSWAP_16(entcnt);
	} else {
		brtvd->bv_entcount[idx] = entcnt;
	}
}
static void
brt_vdev_entcount_inc(brt_vdev_t *brtvd, uint64_t idx)
{
	uint16_t entcnt;

	ASSERT3U(idx, <, brtvd->bv_size);

	entcnt = brt_vdev_entcount_get(brtvd, idx);
	ASSERT(entcnt < UINT16_MAX);

	brt_vdev_entcount_set(brtvd, idx, entcnt + 1);
}
static void
brt_vdev_entcount_dec(brt_vdev_t *brtvd, uint64_t idx)
{
	uint16_t entcnt;

	ASSERT3U(idx, <, brtvd->bv_size);

	entcnt = brt_vdev_entcount_get(brtvd, idx);
	ASSERT(entcnt > 0);

	brt_vdev_entcount_set(brtvd, idx, entcnt - 1);
}
static void
brt_vdev_dump(brt_t *brt)
{
	uint64_t vdevid;

	if ((zfs_flags & ZFS_DEBUG_BRT) == 0) {
		return;
	}

	if (brt->brt_nvdevs == 0) {
		zfs_dbgmsg("BRT empty");
		return;
	}

	zfs_dbgmsg("BRT vdev dump:");
	for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
		brt_vdev_t *brtvd;
		uint64_t idx;

		brtvd = &brt->brt_vdevs[vdevid];
		zfs_dbgmsg("  vdevid=%llu/%llu meta_dirty=%d entcount_dirty=%d "
		    "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu\n",
		    (u_longlong_t)vdevid, (u_longlong_t)brtvd->bv_vdevid,
		    brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty,
		    (u_longlong_t)brtvd->bv_size,
		    (u_longlong_t)brtvd->bv_totalcount,
		    (u_longlong_t)brtvd->bv_nblocks,
		    (size_t)BT_SIZEOFMAP(brtvd->bv_nblocks));
		if (brtvd->bv_totalcount > 0) {
			zfs_dbgmsg("    entcounts:");
			for (idx = 0; idx < brtvd->bv_size; idx++) {
				if (brt_vdev_entcount_get(brtvd, idx) > 0) {
					zfs_dbgmsg("      [%04llu] %hu",
					    (u_longlong_t)idx,
					    brt_vdev_entcount_get(brtvd, idx));
				}
			}
		}
		if (brtvd->bv_entcount_dirty) {
			char *bitmap;

			bitmap = kmem_alloc(brtvd->bv_nblocks + 1, KM_SLEEP);
			for (idx = 0; idx < brtvd->bv_nblocks; idx++) {
				bitmap[idx] =
				    BT_TEST(brtvd->bv_bitmap, idx) ? 'x' : '.';
			}
			bitmap[idx] = '\0';
			zfs_dbgmsg("  bitmap: %s", bitmap);
			kmem_free(bitmap, brtvd->bv_nblocks + 1);
		}
	}
}
static brt_vdev_t *
brt_vdev(brt_t *brt, uint64_t vdevid)
{
	brt_vdev_t *brtvd;

	ASSERT(RW_LOCK_HELD(&brt->brt_lock));

	if (vdevid < brt->brt_nvdevs) {
		brtvd = &brt->brt_vdevs[vdevid];
	} else {
		brtvd = NULL;
	}

	return (brtvd);
}
static void
brt_vdev_create(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
{
	char name[64];

	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
	ASSERT0(brtvd->bv_mos_brtvdev);
	ASSERT0(brtvd->bv_mos_entries);
	ASSERT(brtvd->bv_entcount != NULL);
	ASSERT(brtvd->bv_size > 0);
	ASSERT(brtvd->bv_bitmap != NULL);
	ASSERT(brtvd->bv_nblocks > 0);

	brtvd->bv_mos_entries = zap_create_flags(brt->brt_mos, 0,
	    ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA,
	    brt_zap_leaf_blockshift, brt_zap_indirect_blockshift, DMU_OT_NONE,
	    0, tx);
	VERIFY(brtvd->bv_mos_entries != 0);
	BRT_DEBUG("MOS entries created, object=%llu",
	    (u_longlong_t)brtvd->bv_mos_entries);

	/*
	 * We allocate a DMU buffer to store the bv_entcount[] array.
	 * We will keep the array size (bv_size) and the cumulative count for
	 * all bv_entcount[]s (bv_totalcount) in the bonus buffer.
	 */
	brtvd->bv_mos_brtvdev = dmu_object_alloc(brt->brt_mos,
	    DMU_OTN_UINT64_METADATA, BRT_BLOCKSIZE,
	    DMU_OTN_UINT64_METADATA, sizeof (brt_vdev_phys_t), tx);
	VERIFY(brtvd->bv_mos_brtvdev != 0);
	BRT_DEBUG("MOS BRT VDEV created, object=%llu",
	    (u_longlong_t)brtvd->bv_mos_brtvdev);

	snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
	    (u_longlong_t)brtvd->bv_vdevid);
	VERIFY0(zap_add(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name,
	    sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev, tx));
	BRT_DEBUG("Pool directory object created, object=%s", name);

	spa_feature_incr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx);
}
static void
brt_vdev_realloc(brt_t *brt, brt_vdev_t *brtvd)
{
	vdev_t *vd;
	uint16_t *entcount;
	ulong_t *bitmap;
	uint64_t nblocks, size;

	ASSERT(RW_WRITE_HELD(&brt->brt_lock));

	spa_config_enter(brt->brt_spa, SCL_VDEV, FTAG, RW_READER);
	vd = vdev_lookup_top(brt->brt_spa, brtvd->bv_vdevid);
	size = (vdev_get_min_asize(vd) - 1) / brt->brt_rangesize + 1;
	spa_config_exit(brt->brt_spa, SCL_VDEV, FTAG);

	entcount = vmem_zalloc(sizeof (entcount[0]) * size, KM_SLEEP);
	nblocks = BRT_RANGESIZE_TO_NBLOCKS(size);
	bitmap = kmem_zalloc(BT_SIZEOFMAP(nblocks), KM_SLEEP);

	if (!brtvd->bv_initiated) {
		ASSERT0(brtvd->bv_size);
		ASSERT(brtvd->bv_entcount == NULL);
		ASSERT(brtvd->bv_bitmap == NULL);
		ASSERT0(brtvd->bv_nblocks);

		avl_create(&brtvd->bv_tree, brt_entry_compare,
		    sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node));
	} else {
		ASSERT(brtvd->bv_size > 0);
		ASSERT(brtvd->bv_entcount != NULL);
		ASSERT(brtvd->bv_bitmap != NULL);
		ASSERT(brtvd->bv_nblocks > 0);
		/*
		 * TODO: Allow vdev shrinking. We only need to implement
		 * shrinking the on-disk BRT VDEV object.
		 * dmu_free_range(brt->brt_mos, brtvd->bv_mos_brtvdev, offset,
		 *     size, tx);
		 */
		ASSERT3U(brtvd->bv_size, <=, size);

		memcpy(entcount, brtvd->bv_entcount,
		    sizeof (entcount[0]) * MIN(size, brtvd->bv_size));
		memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks),
		    BT_SIZEOFMAP(brtvd->bv_nblocks)));
		vmem_free(brtvd->bv_entcount,
		    sizeof (entcount[0]) * brtvd->bv_size);
		kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks));
	}

	brtvd->bv_size = size;
	brtvd->bv_entcount = entcount;
	brtvd->bv_bitmap = bitmap;
	brtvd->bv_nblocks = nblocks;
	if (!brtvd->bv_initiated) {
		brtvd->bv_need_byteswap = FALSE;
		brtvd->bv_initiated = TRUE;
		BRT_DEBUG("BRT VDEV %llu initiated.",
		    (u_longlong_t)brtvd->bv_vdevid);
	}
}
static void
brt_vdev_load(brt_t *brt, brt_vdev_t *brtvd)
{
	char name[64];
	dmu_buf_t *db;
	brt_vdev_phys_t *bvphys;
	int error;

	snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
	    (u_longlong_t)brtvd->bv_vdevid);
	error = zap_lookup(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name,
	    sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev);
	if (error != 0)
		return;
	ASSERT(brtvd->bv_mos_brtvdev != 0);

	error = dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db);
	if (error != 0)
		return;

	bvphys = db->db_data;
	if (brt->brt_rangesize == 0) {
		brt->brt_rangesize = bvphys->bvp_rangesize;
	} else {
		ASSERT3U(brt->brt_rangesize, ==, bvphys->bvp_rangesize);
	}

	ASSERT(!brtvd->bv_initiated);
	brt_vdev_realloc(brt, brtvd);

	/* TODO: We don't support VDEV shrinking. */
	ASSERT3U(bvphys->bvp_size, <=, brtvd->bv_size);

	/*
	 * If the VDEV grew, we will leave new bv_entcount[] entries zeroed
	 * out.
	 */
	error = dmu_read(brt->brt_mos, brtvd->bv_mos_brtvdev, 0,
	    MIN(brtvd->bv_size, bvphys->bvp_size) * sizeof (uint16_t),
	    brtvd->bv_entcount, DMU_READ_NO_PREFETCH);
	ASSERT0(error);

	brtvd->bv_mos_entries = bvphys->bvp_mos_entries;
	ASSERT(brtvd->bv_mos_entries != 0);
	brtvd->bv_need_byteswap =
	    (bvphys->bvp_byteorder != BRT_NATIVE_BYTEORDER);
	brtvd->bv_totalcount = bvphys->bvp_totalcount;
	brtvd->bv_usedspace = bvphys->bvp_usedspace;
	brtvd->bv_savedspace = bvphys->bvp_savedspace;
	brt->brt_usedspace += brtvd->bv_usedspace;
	brt->brt_savedspace += brtvd->bv_savedspace;

	dmu_buf_rele(db, FTAG);

	BRT_DEBUG("MOS BRT VDEV %s loaded: mos_brtvdev=%llu, mos_entries=%llu",
	    name, (u_longlong_t)brtvd->bv_mos_brtvdev,
	    (u_longlong_t)brtvd->bv_mos_entries);
}
static void
brt_vdev_dealloc(brt_t *brt, brt_vdev_t *brtvd)
{
	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
	ASSERT(brtvd->bv_initiated);

	vmem_free(brtvd->bv_entcount, sizeof (uint16_t) * brtvd->bv_size);
	brtvd->bv_entcount = NULL;
	kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks));
	brtvd->bv_bitmap = NULL;
	ASSERT0(avl_numnodes(&brtvd->bv_tree));
	avl_destroy(&brtvd->bv_tree);

	brtvd->bv_size = 0;
	brtvd->bv_nblocks = 0;

	brtvd->bv_initiated = FALSE;
	BRT_DEBUG("BRT VDEV %llu deallocated.", (u_longlong_t)brtvd->bv_vdevid);
}
static void
brt_vdev_destroy(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
{
	char name[64];
	uint64_t count;
	dmu_buf_t *db;
	brt_vdev_phys_t *bvphys;

	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
	ASSERT(brtvd->bv_mos_brtvdev != 0);
	ASSERT(brtvd->bv_mos_entries != 0);

	VERIFY0(zap_count(brt->brt_mos, brtvd->bv_mos_entries, &count));
	VERIFY0(count);
	VERIFY0(zap_destroy(brt->brt_mos, brtvd->bv_mos_entries, tx));
	BRT_DEBUG("MOS entries destroyed, object=%llu",
	    (u_longlong_t)brtvd->bv_mos_entries);
	brtvd->bv_mos_entries = 0;

	VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db));
	bvphys = db->db_data;
	ASSERT0(bvphys->bvp_totalcount);
	ASSERT0(bvphys->bvp_usedspace);
	ASSERT0(bvphys->bvp_savedspace);
	dmu_buf_rele(db, FTAG);

	VERIFY0(dmu_object_free(brt->brt_mos, brtvd->bv_mos_brtvdev, tx));
	BRT_DEBUG("MOS BRT VDEV destroyed, object=%llu",
	    (u_longlong_t)brtvd->bv_mos_brtvdev);
	brtvd->bv_mos_brtvdev = 0;

	snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
	    (u_longlong_t)brtvd->bv_vdevid);
	VERIFY0(zap_remove(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, tx));
	BRT_DEBUG("Pool directory object removed, object=%s", name);

	brt_vdev_dealloc(brt, brtvd);

	spa_feature_decr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx);
}
static void
brt_vdevs_expand(brt_t *brt, uint64_t nvdevs)
{
	brt_vdev_t *brtvd, *vdevs;
	uint64_t vdevid;

	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
	ASSERT3U(nvdevs, >, brt->brt_nvdevs);

	vdevs = kmem_zalloc(sizeof (vdevs[0]) * nvdevs, KM_SLEEP);
	if (brt->brt_nvdevs > 0) {
		ASSERT(brt->brt_vdevs != NULL);

		memcpy(vdevs, brt->brt_vdevs,
		    sizeof (brt_vdev_t) * brt->brt_nvdevs);
		kmem_free(brt->brt_vdevs,
		    sizeof (brt_vdev_t) * brt->brt_nvdevs);
	}
	for (vdevid = brt->brt_nvdevs; vdevid < nvdevs; vdevid++) {
		brtvd = &vdevs[vdevid];

		brtvd->bv_vdevid = vdevid;
		brtvd->bv_initiated = FALSE;
	}

	BRT_DEBUG("BRT VDEVs expanded from %llu to %llu.",
	    (u_longlong_t)brt->brt_nvdevs, (u_longlong_t)nvdevs);

	brt->brt_vdevs = vdevs;
	brt->brt_nvdevs = nvdevs;
}
static boolean_t
brt_vdev_lookup(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre)
{
	uint64_t idx;

	ASSERT(RW_LOCK_HELD(&brt->brt_lock));

	idx = bre->bre_offset / brt->brt_rangesize;
	if (brtvd->bv_entcount != NULL && idx < brtvd->bv_size) {
		/* VDEV wasn't expanded. */
		return (brt_vdev_entcount_get(brtvd, idx) > 0);
	}

	return (FALSE);
}
static void
brt_vdev_addref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre,
    uint64_t dsize)
{
	uint64_t idx;

	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
	ASSERT(brtvd != NULL);
	ASSERT(brtvd->bv_entcount != NULL);

	brt->brt_savedspace += dsize;
	brtvd->bv_savedspace += dsize;
	brtvd->bv_meta_dirty = TRUE;

	if (bre->bre_refcount > 1) {
		return;
	}

	brt->brt_usedspace += dsize;
	brtvd->bv_usedspace += dsize;

	idx = bre->bre_offset / brt->brt_rangesize;
	if (idx >= brtvd->bv_size) {
		/* VDEV has been expanded. */
		brt_vdev_realloc(brt, brtvd);
	}

	ASSERT3U(idx, <, brtvd->bv_size);

	brtvd->bv_totalcount++;
	brt_vdev_entcount_inc(brtvd, idx);
	brtvd->bv_entcount_dirty = TRUE;
	idx = idx / BRT_BLOCKSIZE / 8;
	BT_SET(brtvd->bv_bitmap, idx);
}
static void
brt_vdev_decref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre,
    uint64_t dsize)
{
	uint64_t idx;

	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
	ASSERT(brtvd != NULL);
	ASSERT(brtvd->bv_entcount != NULL);

	brt->brt_savedspace -= dsize;
	brtvd->bv_savedspace -= dsize;
	brtvd->bv_meta_dirty = TRUE;

	if (bre->bre_refcount > 0) {
		return;
	}

	brt->brt_usedspace -= dsize;
	brtvd->bv_usedspace -= dsize;

	idx = bre->bre_offset / brt->brt_rangesize;
	ASSERT3U(idx, <, brtvd->bv_size);

	ASSERT(brtvd->bv_totalcount > 0);
	brtvd->bv_totalcount--;
	brt_vdev_entcount_dec(brtvd, idx);
	brtvd->bv_entcount_dirty = TRUE;
	idx = idx / BRT_BLOCKSIZE / 8;
	BT_SET(brtvd->bv_bitmap, idx);
}
static void
brt_vdev_sync(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
{
	dmu_buf_t *db;
	brt_vdev_phys_t *bvphys;

	ASSERT(brtvd->bv_meta_dirty);
	ASSERT(brtvd->bv_mos_brtvdev != 0);
	ASSERT(dmu_tx_is_syncing(tx));

	VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db));

	if (brtvd->bv_entcount_dirty) {
		/*
		 * TODO: Walk brtvd->bv_bitmap and write only the dirty blocks.
		 */
		dmu_write(brt->brt_mos, brtvd->bv_mos_brtvdev, 0,
		    brtvd->bv_size * sizeof (brtvd->bv_entcount[0]),
		    brtvd->bv_entcount, tx);
		memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(brtvd->bv_nblocks));
		brtvd->bv_entcount_dirty = FALSE;
	}

	dmu_buf_will_dirty(db, tx);
	bvphys = db->db_data;
	bvphys->bvp_mos_entries = brtvd->bv_mos_entries;
	bvphys->bvp_size = brtvd->bv_size;
	if (brtvd->bv_need_byteswap) {
		bvphys->bvp_byteorder = BRT_NON_NATIVE_BYTEORDER;
	} else {
		bvphys->bvp_byteorder = BRT_NATIVE_BYTEORDER;
	}
	bvphys->bvp_totalcount = brtvd->bv_totalcount;
	bvphys->bvp_rangesize = brt->brt_rangesize;
	bvphys->bvp_usedspace = brtvd->bv_usedspace;
	bvphys->bvp_savedspace = brtvd->bv_savedspace;
	dmu_buf_rele(db, FTAG);

	brtvd->bv_meta_dirty = FALSE;
}
static void
brt_vdevs_alloc(brt_t *brt, boolean_t load)
{
	brt_vdev_t *brtvd;
	uint64_t vdevid;

	brt_wlock(brt);

	brt_vdevs_expand(brt, brt->brt_spa->spa_root_vdev->vdev_children);

	if (load) {
		for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
			brtvd = &brt->brt_vdevs[vdevid];
			ASSERT(brtvd->bv_entcount == NULL);

			brt_vdev_load(brt, brtvd);
		}
	}

	if (brt->brt_rangesize == 0) {
		brt->brt_rangesize = BRT_RANGESIZE;
	}

	brt_unlock(brt);
}
static void
brt_vdevs_free(brt_t *brt)
{
	brt_vdev_t *brtvd;
	uint64_t vdevid;

	brt_wlock(brt);

	for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
		brtvd = &brt->brt_vdevs[vdevid];
		if (brtvd->bv_initiated)
			brt_vdev_dealloc(brt, brtvd);
	}
	kmem_free(brt->brt_vdevs, sizeof (brt_vdev_t) * brt->brt_nvdevs);

	brt_unlock(brt);
}
static void
brt_entry_fill(const blkptr_t *bp, brt_entry_t *bre, uint64_t *vdevidp)
{
	bre->bre_offset = DVA_GET_OFFSET(&bp->blk_dva[0]);
	bre->bre_refcount = 0;

	*vdevidp = DVA_GET_VDEV(&bp->blk_dva[0]);
}
static int
brt_entry_compare(const void *x1, const void *x2)
{
	const brt_entry_t *bre1 = x1;
	const brt_entry_t *bre2 = x2;

	return (TREE_CMP(bre1->bre_offset, bre2->bre_offset));
}
static int
brt_entry_lookup(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre)
{
	uint64_t mos_entries;
	uint64_t one, physsize;
	int error;

	ASSERT(RW_LOCK_HELD(&brt->brt_lock));

	if (!brt_vdev_lookup(brt, brtvd, bre))
		return (SET_ERROR(ENOENT));

	/*
	 * Remember the mos_entries object number. After we reacquire the
	 * BRT lock, the brtvd pointer may be invalid.
	 */
	mos_entries = brtvd->bv_mos_entries;
	if (mos_entries == 0)
		return (SET_ERROR(ENOENT));

	brt_unlock(brt);

	error = zap_length_uint64(brt->brt_mos, mos_entries, &bre->bre_offset,
	    BRT_KEY_WORDS, &one, &physsize);
	if (error == 0) {
		ASSERT3U(one, ==, 1);
		ASSERT3U(physsize, ==, sizeof (bre->bre_refcount));

		error = zap_lookup_uint64(brt->brt_mos, mos_entries,
		    &bre->bre_offset, BRT_KEY_WORDS, 1,
		    sizeof (bre->bre_refcount), &bre->bre_refcount);
		BRT_DEBUG("ZAP lookup: object=%llu vdev=%llu offset=%llu "
		    "count=%llu error=%d", (u_longlong_t)mos_entries,
		    (u_longlong_t)brtvd->bv_vdevid,
		    (u_longlong_t)bre->bre_offset,
		    error == 0 ? (u_longlong_t)bre->bre_refcount : 0, error);
	}

	brt_wlock(brt);

	return (error);
}
static void
brt_entry_prefetch(brt_t *brt, uint64_t vdevid, brt_entry_t *bre)
{
	brt_vdev_t *brtvd;
	uint64_t mos_entries = 0;

	brt_rlock(brt);
	brtvd = brt_vdev(brt, vdevid);
	if (brtvd != NULL)
		mos_entries = brtvd->bv_mos_entries;
	brt_unlock(brt);

	if (mos_entries == 0)
		return;

	BRT_DEBUG("ZAP prefetch: object=%llu vdev=%llu offset=%llu",
	    (u_longlong_t)mos_entries, (u_longlong_t)vdevid,
	    (u_longlong_t)bre->bre_offset);
	(void) zap_prefetch_uint64(brt->brt_mos, mos_entries,
	    (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS);
}
static int
brt_entry_update(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
{
	int error;

	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
	ASSERT(brtvd->bv_mos_entries != 0);
	ASSERT(bre->bre_refcount > 0);

	error = zap_update_uint64(brt->brt_mos, brtvd->bv_mos_entries,
	    (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, 1,
	    sizeof (bre->bre_refcount), &bre->bre_refcount, tx);
	BRT_DEBUG("ZAP update: object=%llu vdev=%llu offset=%llu count=%llu "
	    "error=%d", (u_longlong_t)brtvd->bv_mos_entries,
	    (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset,
	    (u_longlong_t)bre->bre_refcount, error);

	return (error);
}
*brt
, brt_vdev_t
*brtvd
, brt_entry_t
*bre
, dmu_tx_t
*tx
)
1168 ASSERT(RW_LOCK_HELD(&brt
->brt_lock
));
1169 ASSERT(brtvd
->bv_mos_entries
!= 0);
1170 ASSERT0(bre
->bre_refcount
);
1172 error
= zap_remove_uint64(brt
->brt_mos
, brtvd
->bv_mos_entries
,
1173 (uint64_t *)&bre
->bre_offset
, BRT_KEY_WORDS
, tx
);
1174 BRT_DEBUG("ZAP remove: object=%llu vdev=%llu offset=%llu count=%llu "
1175 "error=%d", (u_longlong_t
)brtvd
->bv_mos_entries
,
1176 (u_longlong_t
)brtvd
->bv_vdevid
, (u_longlong_t
)bre
->bre_offset
,
1177 (u_longlong_t
)bre
->bre_refcount
, error
);
/*
 * Return TRUE if we _can_ have a BRT entry for this bp. It might be a false
 * positive, but gives us a quick answer whether we should look into the
 * BRT, which may require reads and thus will be more expensive.
 */
boolean_t
brt_maybe_exists(spa_t *spa, const blkptr_t *bp)
{
	brt_t *brt = spa->spa_brt;
	brt_vdev_t *brtvd;
	brt_entry_t bre_search;
	boolean_t mayexists = FALSE;
	uint64_t vdevid;

	brt_entry_fill(bp, &bre_search, &vdevid);

	brt_rlock(brt);

	brtvd = brt_vdev(brt, vdevid);
	if (brtvd != NULL && brtvd->bv_initiated) {
		if (!avl_is_empty(&brtvd->bv_tree) ||
		    brt_vdev_lookup(brt, brtvd, &bre_search)) {
			mayexists = TRUE;
		}
	}

	brt_unlock(brt);

	return (mayexists);
}
*spa
)
1216 brt_t
*brt
= spa
->spa_brt
;
1221 return (brt
->brt_savedspace
);
1225 brt_get_used(spa_t
*spa
)
1227 brt_t
*brt
= spa
->spa_brt
;
1232 return (brt
->brt_usedspace
);
1236 brt_get_saved(spa_t
*spa
)
1238 brt_t
*brt
= spa
->spa_brt
;
1243 return (brt
->brt_savedspace
);
1247 brt_get_ratio(spa_t
*spa
)
1249 brt_t
*brt
= spa
->spa_brt
;
1251 if (brt
->brt_usedspace
== 0)
1254 return ((brt
->brt_usedspace
+ brt
->brt_savedspace
) * 100 /
1255 brt
->brt_usedspace
);
static int
brt_kstats_update(kstat_t *ksp, int rw)
{
	brt_stats_t *bs = ksp->ks_data;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	bs->brt_addref_entry_in_memory.value.ui64 =
	    wmsum_value(&brt_sums.brt_addref_entry_in_memory);
	bs->brt_addref_entry_not_on_disk.value.ui64 =
	    wmsum_value(&brt_sums.brt_addref_entry_not_on_disk);
	bs->brt_addref_entry_on_disk.value.ui64 =
	    wmsum_value(&brt_sums.brt_addref_entry_on_disk);
	bs->brt_addref_entry_read_lost_race.value.ui64 =
	    wmsum_value(&brt_sums.brt_addref_entry_read_lost_race);
	bs->brt_decref_entry_in_memory.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_entry_in_memory);
	bs->brt_decref_entry_loaded_from_disk.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_entry_loaded_from_disk);
	bs->brt_decref_entry_not_in_memory.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_entry_not_in_memory);
	bs->brt_decref_entry_not_on_disk.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_entry_not_on_disk);
	bs->brt_decref_entry_read_lost_race.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_entry_read_lost_race);
	bs->brt_decref_entry_still_referenced.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_entry_still_referenced);
	bs->brt_decref_free_data_later.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_free_data_later);
	bs->brt_decref_free_data_now.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_free_data_now);
	bs->brt_decref_no_entry.value.ui64 =
	    wmsum_value(&brt_sums.brt_decref_no_entry);

	return (0);
}
static void
brt_stat_init(void)
{
	wmsum_init(&brt_sums.brt_addref_entry_in_memory, 0);
	wmsum_init(&brt_sums.brt_addref_entry_not_on_disk, 0);
	wmsum_init(&brt_sums.brt_addref_entry_on_disk, 0);
	wmsum_init(&brt_sums.brt_addref_entry_read_lost_race, 0);
	wmsum_init(&brt_sums.brt_decref_entry_in_memory, 0);
	wmsum_init(&brt_sums.brt_decref_entry_loaded_from_disk, 0);
	wmsum_init(&brt_sums.brt_decref_entry_not_in_memory, 0);
	wmsum_init(&brt_sums.brt_decref_entry_not_on_disk, 0);
	wmsum_init(&brt_sums.brt_decref_entry_read_lost_race, 0);
	wmsum_init(&brt_sums.brt_decref_entry_still_referenced, 0);
	wmsum_init(&brt_sums.brt_decref_free_data_later, 0);
	wmsum_init(&brt_sums.brt_decref_free_data_now, 0);
	wmsum_init(&brt_sums.brt_decref_no_entry, 0);

	brt_ksp = kstat_create("zfs", 0, "brtstats", "misc", KSTAT_TYPE_NAMED,
	    sizeof (brt_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
	if (brt_ksp != NULL) {
		brt_ksp->ks_data = &brt_stats;
		brt_ksp->ks_update = brt_kstats_update;
		kstat_install(brt_ksp);
	}
}
!= NULL
) {
1327 kstat_delete(brt_ksp
);
1331 wmsum_fini(&brt_sums
.brt_addref_entry_in_memory
);
1332 wmsum_fini(&brt_sums
.brt_addref_entry_not_on_disk
);
1333 wmsum_fini(&brt_sums
.brt_addref_entry_on_disk
);
1334 wmsum_fini(&brt_sums
.brt_addref_entry_read_lost_race
);
1335 wmsum_fini(&brt_sums
.brt_decref_entry_in_memory
);
1336 wmsum_fini(&brt_sums
.brt_decref_entry_loaded_from_disk
);
1337 wmsum_fini(&brt_sums
.brt_decref_entry_not_in_memory
);
1338 wmsum_fini(&brt_sums
.brt_decref_entry_not_on_disk
);
1339 wmsum_fini(&brt_sums
.brt_decref_entry_read_lost_race
);
1340 wmsum_fini(&brt_sums
.brt_decref_entry_still_referenced
);
1341 wmsum_fini(&brt_sums
.brt_decref_free_data_later
);
1342 wmsum_fini(&brt_sums
.brt_decref_free_data_now
);
1343 wmsum_fini(&brt_sums
.brt_decref_no_entry
);
void
brt_init(void)
{
	brt_entry_cache = kmem_cache_create("brt_entry_cache",
	    sizeof (brt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	brt_pending_entry_cache = kmem_cache_create("brt_pending_entry_cache",
	    sizeof (brt_pending_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);

	brt_stat_init();
}

void
brt_fini(void)
{
	brt_stat_fini();

	kmem_cache_destroy(brt_entry_cache);
	kmem_cache_destroy(brt_pending_entry_cache);
}
static brt_entry_t *
brt_entry_alloc(const brt_entry_t *bre_init)
{
	brt_entry_t *bre;

	bre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP);
	bre->bre_offset = bre_init->bre_offset;
	bre->bre_refcount = bre_init->bre_refcount;

	return (bre);
}

static void
brt_entry_free(brt_entry_t *bre)
{
	kmem_cache_free(brt_entry_cache, bre);
}
static void
brt_entry_addref(brt_t *brt, const blkptr_t *bp)
{
	brt_vdev_t *brtvd;
	brt_entry_t *bre, *racebre;
	brt_entry_t bre_search;
	avl_index_t where;
	uint64_t vdevid;
	int error;

	ASSERT(!RW_WRITE_HELD(&brt->brt_lock));

	brt_entry_fill(bp, &bre_search, &vdevid);

	brt_wlock(brt);

	brtvd = brt_vdev(brt, vdevid);
	if (brtvd == NULL) {
		ASSERT3U(vdevid, >=, brt->brt_nvdevs);

		/* New VDEV was added. */
		brt_vdevs_expand(brt, vdevid + 1);
		brtvd = brt_vdev(brt, vdevid);
	}
	ASSERT(brtvd != NULL);
	if (!brtvd->bv_initiated)
		brt_vdev_realloc(brt, brtvd);

	bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
	if (bre != NULL) {
		BRTSTAT_BUMP(brt_addref_entry_in_memory);
	} else {
		/*
		 * brt_entry_lookup() may drop the BRT (read) lock and
		 * reacquire it (write).
		 */
		error = brt_entry_lookup(brt, brtvd, &bre_search);
		/* bre_search now contains the correct bre_refcount */
		ASSERT(error == 0 || error == ENOENT);
		if (error == 0)
			BRTSTAT_BUMP(brt_addref_entry_on_disk);
		else
			BRTSTAT_BUMP(brt_addref_entry_not_on_disk);
		/*
		 * When the BRT lock was dropped, brt_vdevs[] may have been
		 * expanded and reallocated, we need to update brtvd's
		 * pointer.
		 */
		brtvd = brt_vdev(brt, vdevid);
		ASSERT(brtvd != NULL);

		racebre = avl_find(&brtvd->bv_tree, &bre_search, &where);
		if (racebre == NULL) {
			bre = brt_entry_alloc(&bre_search);
			ASSERT(RW_WRITE_HELD(&brt->brt_lock));
			avl_insert(&brtvd->bv_tree, bre, where);
			brt->brt_nentries++;
		} else {
			/*
			 * The entry was added when the BRT lock was dropped
			 * in brt_entry_lookup().
			 */
			BRTSTAT_BUMP(brt_addref_entry_read_lost_race);
			bre = racebre;
		}
	}
	bre->bre_refcount++;
	brt_vdev_addref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp));

	brt_unlock(brt);
}
/* Return TRUE if block should be freed immediately. */
boolean_t
brt_entry_decref(spa_t *spa, const blkptr_t *bp)
{
	brt_t *brt = spa->spa_brt;
	brt_vdev_t *brtvd;
	brt_entry_t *bre, *racebre;
	brt_entry_t bre_search;
	avl_index_t where;
	uint64_t vdevid;
	int error;

	brt_entry_fill(bp, &bre_search, &vdevid);

	brt_wlock(brt);

	brtvd = brt_vdev(brt, vdevid);
	ASSERT(brtvd != NULL);

	bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
	if (bre != NULL) {
		BRTSTAT_BUMP(brt_decref_entry_in_memory);
		goto out;
	} else {
		BRTSTAT_BUMP(brt_decref_entry_not_in_memory);
	}

	/*
	 * brt_entry_lookup() may drop the BRT lock and reacquire it.
	 */
	error = brt_entry_lookup(brt, brtvd, &bre_search);
	/* bre_search now contains the correct bre_refcount */
	ASSERT(error == 0 || error == ENOENT);
	/*
	 * When the BRT lock was dropped, brt_vdevs[] may have been expanded
	 * and reallocated, we need to update brtvd's pointer.
	 */
	brtvd = brt_vdev(brt, vdevid);
	ASSERT(brtvd != NULL);

	if (error == ENOENT) {
		BRTSTAT_BUMP(brt_decref_entry_not_on_disk);
		bre = NULL;
		goto out;
	}

	racebre = avl_find(&brtvd->bv_tree, &bre_search, &where);
	if (racebre != NULL) {
		/*
		 * The entry was added when the BRT lock was dropped in
		 * brt_entry_lookup().
		 */
		BRTSTAT_BUMP(brt_decref_entry_read_lost_race);
		bre = racebre;
		goto out;
	}

	BRTSTAT_BUMP(brt_decref_entry_loaded_from_disk);
	bre = brt_entry_alloc(&bre_search);
	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
	avl_insert(&brtvd->bv_tree, bre, where);
	brt->brt_nentries++;

out:
	if (bre == NULL) {
		/*
		 * This is a free of a regular (not cloned) block.
		 */
		brt_unlock(brt);
		BRTSTAT_BUMP(brt_decref_no_entry);
		return (B_TRUE);
	}
	if (bre->bre_refcount == 0) {
		brt_unlock(brt);
		BRTSTAT_BUMP(brt_decref_free_data_now);
		return (B_TRUE);
	}

	ASSERT(bre->bre_refcount > 0);
	bre->bre_refcount--;
	if (bre->bre_refcount == 0)
		BRTSTAT_BUMP(brt_decref_free_data_later);
	else
		BRTSTAT_BUMP(brt_decref_entry_still_referenced);
	brt_vdev_decref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp));

	brt_unlock(brt);

	return (B_FALSE);
}
*brt
, const blkptr_t
*bp
)
1555 if (!zfs_brt_prefetch
)
1558 brt_entry_fill(bp
, &bre
, &vdevid
);
1560 brt_entry_prefetch(brt
, vdevid
, &bre
);
static int
brt_pending_entry_compare(const void *x1, const void *x2)
{
	const brt_pending_entry_t *bpe1 = x1, *bpe2 = x2;
	const blkptr_t *bp1 = &bpe1->bpe_bp, *bp2 = &bpe2->bpe_bp;
	int cmp;

	cmp = TREE_CMP(BP_PHYSICAL_BIRTH(bp1), BP_PHYSICAL_BIRTH(bp2));
	if (cmp == 0) {
		cmp = TREE_CMP(DVA_GET_VDEV(&bp1->blk_dva[0]),
		    DVA_GET_VDEV(&bp2->blk_dva[0]));
		if (cmp == 0) {
			cmp = TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]),
			    DVA_GET_OFFSET(&bp2->blk_dva[0]));
		}
	}

	return (cmp);
}
void
brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
{
	brt_t *brt;
	avl_tree_t *pending_tree;
	kmutex_t *pending_lock;
	brt_pending_entry_t *bpe, *newbpe;
	avl_index_t where;
	uint64_t txg;

	brt = spa->spa_brt;
	txg = dmu_tx_get_txg(tx);
	ASSERT3U(txg, !=, 0);
	pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
	pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];

	newbpe = kmem_cache_alloc(brt_pending_entry_cache, KM_SLEEP);
	newbpe->bpe_bp = *bp;
	newbpe->bpe_count = 1;

	mutex_enter(pending_lock);

	bpe = avl_find(pending_tree, newbpe, &where);
	if (bpe == NULL) {
		avl_insert(pending_tree, newbpe, where);
		newbpe = NULL;
	} else {
		bpe->bpe_count++;
	}

	mutex_exit(pending_lock);

	if (newbpe != NULL) {
		ASSERT(bpe != NULL);
		ASSERT(bpe != newbpe);
		kmem_cache_free(brt_pending_entry_cache, newbpe);
	} else {
		ASSERT(bpe == NULL);
	}

	/* Prefetch BRT entry, as we will need it in the syncing context. */
	brt_prefetch(brt, bp);
}
void
brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
{
	brt_t *brt;
	avl_tree_t *pending_tree;
	kmutex_t *pending_lock;
	brt_pending_entry_t *bpe, bpe_search;
	uint64_t txg;

	brt = spa->spa_brt;
	txg = dmu_tx_get_txg(tx);
	ASSERT3U(txg, !=, 0);
	pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
	pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];

	bpe_search.bpe_bp = *bp;

	mutex_enter(pending_lock);

	bpe = avl_find(pending_tree, &bpe_search, NULL);
	/* I believe we should always find bpe when this function is called. */
	if (bpe != NULL) {
		ASSERT(bpe->bpe_count > 0);

		bpe->bpe_count--;
		if (bpe->bpe_count == 0) {
			avl_remove(pending_tree, bpe);
			kmem_cache_free(brt_pending_entry_cache, bpe);
		}
	}

	mutex_exit(pending_lock);
}
void
brt_pending_apply(spa_t *spa, uint64_t txg)
{
	brt_t *brt;
	brt_pending_entry_t *bpe;
	avl_tree_t *pending_tree;
	kmutex_t *pending_lock;
	void *c;

	ASSERT3U(txg, !=, 0);

	brt = spa->spa_brt;
	pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
	pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];

	mutex_enter(pending_lock);

	c = NULL;
	while ((bpe = avl_destroy_nodes(pending_tree, &c)) != NULL) {
		boolean_t added_to_ddt;

		mutex_exit(pending_lock);

		for (int i = 0; i < bpe->bpe_count; i++) {
			/*
			 * If the block has the DEDUP bit set, it means that
			 * it already exists in the DEDUP table, so we can
			 * just use that instead of creating a new entry in
			 * the BRT table.
			 */
			if (BP_GET_DEDUP(&bpe->bpe_bp)) {
				added_to_ddt = ddt_addref(spa, &bpe->bpe_bp);
			} else {
				added_to_ddt = B_FALSE;
			}
			if (!added_to_ddt)
				brt_entry_addref(brt, &bpe->bpe_bp);
		}

		kmem_cache_free(brt_pending_entry_cache, bpe);
		mutex_enter(pending_lock);
	}

	mutex_exit(pending_lock);
}
static void
brt_sync_entry(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
{
	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
	ASSERT(brtvd->bv_mos_entries != 0);

	if (bre->bre_refcount == 0) {
		int error;

		error = brt_entry_remove(brt, brtvd, bre, tx);
		ASSERT(error == 0 || error == ENOENT);
		/*
		 * If error == ENOENT then zfs_clone_range() was done from a
		 * removed (but opened) file (open(), unlink()).
		 */
		ASSERT(brt_entry_lookup(brt, brtvd, bre) == ENOENT);
	} else {
		VERIFY0(brt_entry_update(brt, brtvd, bre, tx));
	}
}
static void
brt_sync_table(brt_t *brt, dmu_tx_t *tx)
{
	brt_vdev_t *brtvd;
	brt_entry_t *bre;
	uint64_t vdevid;
	void *c;

	brt_wlock(brt);

	for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
		brtvd = &brt->brt_vdevs[vdevid];

		if (!brtvd->bv_initiated)
			continue;

		if (!brtvd->bv_meta_dirty) {
			ASSERT(!brtvd->bv_entcount_dirty);
			ASSERT0(avl_numnodes(&brtvd->bv_tree));
			continue;
		}

		ASSERT(!brtvd->bv_entcount_dirty ||
		    avl_numnodes(&brtvd->bv_tree) != 0);

		if (brtvd->bv_mos_brtvdev == 0)
			brt_vdev_create(brt, brtvd, tx);

		c = NULL;
		while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) {
			brt_sync_entry(brt, brtvd, bre, tx);
			brt_entry_free(bre);
			ASSERT(brt->brt_nentries > 0);
			brt->brt_nentries--;
		}

		brt_vdev_sync(brt, brtvd, tx);

		if (brtvd->bv_totalcount == 0)
			brt_vdev_destroy(brt, brtvd, tx);
	}

	ASSERT0(brt->brt_nentries);

	brt_unlock(brt);
}
void
brt_sync(spa_t *spa, uint64_t txg)
{
	dmu_tx_t *tx;
	brt_t *brt;

	ASSERT(spa_syncing_txg(spa) == txg);

	brt = spa->spa_brt;
	brt_rlock(brt);
	if (brt->brt_nentries == 0) {
		/* No changes. */
		brt_unlock(brt);
		return;
	}
	brt_unlock(brt);

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);

	brt_sync_table(brt, tx);
	dmu_tx_commit(tx);
}
static void
brt_table_alloc(brt_t *brt)
{
	for (int i = 0; i < TXG_SIZE; i++) {
		avl_create(&brt->brt_pending_tree[i],
		    brt_pending_entry_compare,
		    sizeof (brt_pending_entry_t),
		    offsetof(brt_pending_entry_t, bpe_node));
		mutex_init(&brt->brt_pending_lock[i], NULL, MUTEX_DEFAULT,
		    NULL);
	}
}
static void
brt_table_free(brt_t *brt)
{
	for (int i = 0; i < TXG_SIZE; i++) {
		ASSERT(avl_is_empty(&brt->brt_pending_tree[i]));

		avl_destroy(&brt->brt_pending_tree[i]);
		mutex_destroy(&brt->brt_pending_lock[i]);
	}
}
static void
brt_alloc(spa_t *spa)
{
	brt_t *brt;

	ASSERT(spa->spa_brt == NULL);

	brt = kmem_zalloc(sizeof (*brt), KM_SLEEP);
	rw_init(&brt->brt_lock, NULL, RW_DEFAULT, NULL);
	brt->brt_spa = spa;
	brt->brt_rangesize = 0;
	brt->brt_nentries = 0;
	brt->brt_vdevs = NULL;
	brt->brt_nvdevs = 0;
	brt_table_alloc(brt);

	spa->spa_brt = brt;
}
void
brt_create(spa_t *spa)
{
	brt_alloc(spa);
	brt_vdevs_alloc(spa->spa_brt, B_FALSE);
}

void
brt_load(spa_t *spa)
{
	brt_alloc(spa);
	brt_vdevs_alloc(spa->spa_brt, B_TRUE);
}
void
brt_unload(spa_t *spa)
{
	brt_t *brt = spa->spa_brt;

	if (brt == NULL)
		return;

	brt_vdevs_free(brt);
	brt_table_free(brt);
	rw_destroy(&brt->brt_lock);
	kmem_free(brt, sizeof (*brt));
	spa->spa_brt = NULL;
}
ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, prefetch, INT, ZMOD_RW,
	"Enable prefetching of BRT entries");
#ifdef ZFS_BRT_DEBUG
ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, debug, INT, ZMOD_RW, "BRT debug");
#endif