Fix false assertion in dmu_tx_dirty_buf() on cloning
[zfs.git] / include / sys / brt_impl.h
blob168d81f17b72da2d3bd954f398bba3a2399b5bd5
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek
25 #ifndef _SYS_BRT_IMPL_H
26 #define _SYS_BRT_IMPL_H
28 #ifdef __cplusplus
29 extern "C" {
30 #endif
33 * BRT - Block Reference Table.
/*
 * String prefix used for per-vdev BRT object names.
 * NOTE(review): presumably a vdev id is appended to this prefix to form
 * the name under which the object is looked up -- confirm against users.
 */
35 #define BRT_OBJECT_VDEV_PREFIX "com.fudosecurity:brt:vdev:"
38 * We divide each VDEV into 16MB chunks. Each chunk is represented in memory
39 * by a 16bit counter, thus 1TB VDEV requires 128kB of memory: (1TB / 16MB) * 2B
40 * Each element in this array represents how many BRT entries we have in this
41 * chunk of storage. We always load this entire array into memory and update as
42 * needed. By having it in memory we can quickly tell (during zio_free()) if
43 * there are any BRT entries that we might need to update.
45 * This value cannot be larger than 16MB, at least as long as we support
46 * 512 byte block sizes. With 512 byte block size we can have exactly
47 * 32768 blocks in 16MB. In 32MB we could have 65536 blocks, which is one too
48 * many for a 16bit counter.
/*
 * Size in bytes (16MB) of the vdev chunk tracked by one 16-bit entry counter.
 * The static assertion below breaks the build if a chunk of this size could
 * hold more SPA_MINBLOCKSIZE-sized blocks than UINT16_MAX.
 */
50 #define BRT_RANGESIZE (16 * 1024 * 1024)
51 _Static_assert(BRT_RANGESIZE / SPA_MINBLOCKSIZE <= UINT16_MAX,
52 "BRT_RANGESIZE is too large.");
54 * We don't want to update the whole structure every time. Maintain bitmap
55 * of dirty blocks within the regions, so that a single bit represents a
56 * block size of entcounts. For example if we have a 1PB vdev then all
57 * entcounts take 128MB of memory ((1PB / 16MB) * 2B). We can divide this
58 * 128MB array of entcounts into 32kB disk blocks, as we don't want to update
59 * the whole 128MB on disk when we have updated only a single entcount.
60 * We maintain a bitmap where each 32kB disk block within 128MB entcounts array
61 * is represented by a single bit. This gives us 4096 bits. A set bit in the
62 * bitmap means that we had a change in at least one of the 16384 entcounts
63 * that reside on a 32kB disk block (32kB / sizeof (uint16_t)).
/*
 * BRT_BLOCKSIZE is the size in bytes (32kB) of one on-disk block of the
 * entcounts array.
 *
 * BRT_RANGESIZE_TO_NBLOCKS(size) evaluates to
 * ((size) - 1) / BRT_BLOCKSIZE / sizeof (uint16_t) + 1, i.e. one block per
 * BRT_BLOCKSIZE * sizeof (uint16_t) units of `size`, rounded up. The result
 * has the unsigned type of sizeof, and `size` must be at least 1 for the
 * subtraction to make sense.
 *
 * NOTE(review): if `size` is a number of entcounts, a 32kB block holds
 * BRT_BLOCKSIZE / sizeof (uint16_t) = 16384 of them, yet this expression
 * advances one block only every 65536. Confirm the intended unit of `size`
 * and how the bitmap index is derived at the call sites before relying on
 * this value.
 */
65 #define BRT_BLOCKSIZE (32 * 1024)
66 #define BRT_RANGESIZE_TO_NBLOCKS(size) \
67 (((size) - 1) / BRT_BLOCKSIZE / sizeof (uint16_t) + 1)
/*
 * Byte-order tags. BRT_NATIVE_BYTEORDER is the tag matching the byte order
 * this build targets (little endian when _ZFS_LITTLE_ENDIAN is defined, big
 * endian otherwise); BRT_NON_NATIVE_BYTEORDER is the opposite tag.
 * NOTE(review): presumably compared against bvp_byteorder to decide whether
 * the entcount array needs byte swapping -- confirm against users.
 */
69 #define BRT_LITTLE_ENDIAN 0
70 #define BRT_BIG_ENDIAN 1
71 #ifdef _ZFS_LITTLE_ENDIAN
72 #define BRT_NATIVE_BYTEORDER BRT_LITTLE_ENDIAN
73 #define BRT_NON_NATIVE_BYTEORDER BRT_BIG_ENDIAN
74 #else
75 #define BRT_NATIVE_BYTEORDER BRT_BIG_ENDIAN
76 #define BRT_NON_NATIVE_BYTEORDER BRT_LITTLE_ENDIAN
77 #endif
/*
 * Per-vdev BRT metadata record made of seven 64-bit fields. Do not reorder
 * or resize the fields.
 * NOTE(review): the field notes below are inferred from the similarly named
 * bv_* members of struct brt_vdev and from the macros in this header; confirm
 * each against the code that reads and writes this structure.
 */
79 typedef struct brt_vdev_phys {
/* Presumably the MOS object number of the entries table. */
80 uint64_t bvp_mos_entries;
/* Presumably the number of entries in the entcount array. */
81 uint64_t bvp_size;
/* Presumably BRT_LITTLE_ENDIAN or BRT_BIG_ENDIAN. */
82 uint64_t bvp_byteorder;
/* Presumably the sum of all entcounts. */
83 uint64_t bvp_totalcount;
/* Presumably the chunk size the entcounts were built with. */
84 uint64_t bvp_rangesize;
/* Presumably the space occupied by cloned blocks. */
85 uint64_t bvp_usedspace;
/* Presumably the extra space that would be used without cloning. */
86 uint64_t bvp_savedspace;
87 } brt_vdev_phys_t;
89 struct brt_vdev {
91 * Pending changes from open contexts.
93 kmutex_t bv_pending_lock;
94 avl_tree_t bv_pending_tree[TXG_SIZE];
96 * Protects bv_mos_*.
98 krwlock_t bv_mos_entries_lock ____cacheline_aligned;
100 * Protects all the fields starting from bv_initiated.
102 krwlock_t bv_lock ____cacheline_aligned;
104 * VDEV id.
106 uint64_t bv_vdevid ____cacheline_aligned;
108 * Object number in the MOS for the entcount array and brt_vdev_phys.
110 uint64_t bv_mos_brtvdev;
112 * Object number in the MOS and dnode for the entries table.
114 uint64_t bv_mos_entries;
115 dnode_t *bv_mos_entries_dnode;
117 * Is the structure initiated?
118 * (bv_entcount and bv_bitmap are allocated?)
120 boolean_t bv_initiated;
122 * Does the bv_entcount[] array need byte swapping?
124 boolean_t bv_need_byteswap;
126 * Number of entries in the bv_entcount[] array.
128 uint64_t bv_size;
130 * This is the array with BRT entry count per BRT_RANGESIZE.
132 uint16_t *bv_entcount;
134 * bv_entcount[] potentially can be a bit too big to synchronize it all
135 * when we just changed few entcounts. The fields below allow us to
136 * track updates to bv_entcount[] array since the last sync.
137 * A single bit in the bv_bitmap represents as many entcounts as can
138 * fit into a single BRT_BLOCKSIZE.
139 * For example we have 65536 entcounts in the bv_entcount array
140 * (so the whole array is 128kB). We updated bv_entcount[2] and
141 * bv_entcount[5]. In that case only the first bit in the bv_bitmap will
142 * be set and we will write only the first BRT_BLOCKSIZE out of 128kB.
144 ulong_t *bv_bitmap;
146 * bv_entcount[] needs updating on disk.
148 boolean_t bv_entcount_dirty;
150 * brt_vdev_phys needs updating on disk.
152 boolean_t bv_meta_dirty;
154 * Sum of all bv_entcount[]s.
156 uint64_t bv_totalcount;
158 * Space on disk occupied by cloned blocks (without compression).
160 uint64_t bv_usedspace;
162 * How much additional space would be occupied without block cloning.
164 uint64_t bv_savedspace;
166 * Entries to sync.
168 avl_tree_t bv_tree;
171 /* Size of offset / sizeof (uint64_t). */
172 #define BRT_KEY_WORDS (1)
/*
 * Offset of the first DVA (blk_dva[0]) in the entry's block pointer.
 * `bre` must be a pointer to a structure with a bre_bp member.
 */
174 #define BRE_OFFSET(bre) (DVA_GET_OFFSET(&(bre)->bre_bp.blk_dva[0]))
177 * In-core brt entry.
178 * On-disk we use ZAP with offset as the key and count as the value.
/*
 * One BRT entry as kept in memory.
 * NOTE(review): field notes marked "presumably" are inferred from names and
 * types only; confirm against the code that manipulates entries.
 */
180 typedef struct brt_entry {
/* AVL linkage; presumably for the per-vdev avl_tree_t trees. */
181 avl_node_t bre_node;
/* Block pointer of the entry; BRE_OFFSET() reads its first DVA. */
182 blkptr_t bre_bp;
/* Presumably the count stored as the on-disk value. */
183 uint64_t bre_count;
/* NOTE(review): meaning not evident from this header -- confirm. */
184 uint64_t bre_pcount;
185 } brt_entry_t;
187 #ifdef __cplusplus
189 #endif
191 #endif /* _SYS_BRT_IMPL_H */