4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2016 by Delphix. All rights reserved.
24 * Copyright (c) 2023, Klara Inc.
27 #ifndef _SYS_DDT_IMPL_H
28 #define _SYS_DDT_IMPL_H
31 #include <sys/bitops.h>
37 /* DDT version numbers */
38 #define DDT_VERSION_LEGACY (0)
39 #define DDT_VERSION_FDT (1)
41 /* Dummy version to signal that configure is still necessary */
/* Its value is UINT64_MAX, which differs from both version numbers above (0 and 1). */
42 #define DDT_VERSION_UNCONFIGURED (UINT64_MAX)
44 /* Names of interesting objects in the DDT root dir */
/*
 * NOTE(review): presumably the entry names under which the version and flags
 * values are kept in that directory; the code that reads them is not in this
 * header - confirm.
 */
45 #define DDT_DIR_VERSION "version"
46 #define DDT_DIR_FLAGS "flags"
48 /* Fill a lightweight entry from a live entry. */
49 #define DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, ddlwe) do { \
50 memset((ddlwe), 0, sizeof (*ddlwe)); \
51 (ddlwe)->ddlwe_key = (dde)->dde_key; \
52 (ddlwe)->ddlwe_type = (dde)->dde_type; \
53 (ddlwe)->ddlwe_class = (dde)->dde_class; \
54 memcpy(&(ddlwe)->ddlwe_phys, (dde)->dde_phys, DDT_PHYS_SIZE(ddt)); \
57 #define DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe) do { \
58 memset((ddlwe), 0, sizeof (*ddlwe)); \
59 (ddlwe)->ddlwe_key = (ddle)->ddle_key; \
60 (ddlwe)->ddlwe_type = (ddle)->ddle_type; \
61 (ddlwe)->ddlwe_class = (ddle)->ddle_class; \
62 memcpy(&(ddlwe)->ddlwe_phys, (ddle)->ddle_phys, DDT_PHYS_SIZE(ddt)); \
66 * An entry on the log tree. These are "frozen", and a record of what's in
67 * the on-disk log. They can't be used in place, but can be "loaded" back into
71 ddt_key_t ddle_key
; /* ddt_log_tree key */
72 avl_node_t ddle_node
; /* ddt_log_tree node */
74 ddt_type_t ddle_type
; /* storage type */
75 ddt_class_t ddle_class
; /* storage class */
77 /* extra allocation for flat/trad phys */
78 ddt_univ_phys_t ddle_phys
[];
81 /* On-disk log record types. */
83 DLR_INVALID
= 0, /* end of block marker */
84 DLR_ENTRY
= 1, /* an entry to add or replace in the log tree */
85 } ddt_log_record_type_t
;
87 /* On-disk log record header. */
90 * dlr_info is a packed u64, use the DLR_GET/DLR_SET macros below to
93 * bits 0-7: record type (ddt_log_record_type_t)
94 * bits 8-23: length of record header+payload
95 * bits 24-47: reserved, all zero
96 * bits 48-55: if type==DLR_ENTRY, storage type (ddt_type)
98 * bits 56-63: if type==DLR_ENTRY, storage class (ddt_class)
102 uint8_t dlr_payload
[];
/*
 * Accessors for the packed dlr_info word of a log record. The last two
 * arguments handed to BF64_GET/BF64_SET are the starting bit and the field
 * width in bits:
 *   TYPE:        start 0,  width 8
 *   RECLEN:      start 8,  width 16
 *   ENTRY_TYPE:  start 48, width 8
 *   ENTRY_CLASS: start 56, width 8
 * Each SET macro takes the new field value as 'v'.
 */
105 #define DLR_GET_TYPE(dlr) BF64_GET((dlr)->dlr_info, 0, 8)
106 #define DLR_SET_TYPE(dlr, v) BF64_SET((dlr)->dlr_info, 0, 8, v)
107 #define DLR_GET_RECLEN(dlr) BF64_GET((dlr)->dlr_info, 8, 16)
108 #define DLR_SET_RECLEN(dlr, v) BF64_SET((dlr)->dlr_info, 8, 16, v)
109 #define DLR_GET_ENTRY_TYPE(dlr) BF64_GET((dlr)->dlr_info, 48, 8)
110 #define DLR_SET_ENTRY_TYPE(dlr, v) BF64_SET((dlr)->dlr_info, 48, 8, v)
111 #define DLR_GET_ENTRY_CLASS(dlr) BF64_GET((dlr)->dlr_info, 56, 8)
112 #define DLR_SET_ENTRY_CLASS(dlr, v) BF64_SET((dlr)->dlr_info, 56, 8, v)
114 /* Payload for DLR_ENTRY. */
117 ddt_univ_phys_t dlre_phys
[];
118 } ddt_log_record_entry_t
;
120 /* Log flags (ddl_flags, dlh_flags) */
/* Single-bit masks: FLUSHING is bit 0 (value 1), CHECKPOINT is bit 1 (value 2). */
121 #define DDL_FLAG_FLUSHING (1 << 0) /* this log is being flushed */
122 #define DDL_FLAG_CHECKPOINT (1 << 1) /* header has a checkpoint */
124 /* On-disk log header, stored in the bonus buffer. */
127 * dlh_info is a packed u64, use the DLH_GET/DLH_SET macros below to
130 * bits 0-7: log version
131 * bits 8-15: log flags
132 * bits 16-63: reserved, all zero
136 uint64_t dlh_length
; /* log size in bytes */
137 uint64_t dlh_first_txg
; /* txg this log went active */
138 ddt_key_t dlh_checkpoint
; /* last checkpoint */
/*
 * Accessors for the packed dlh_info word of the log header. The last two
 * arguments handed to BF64_GET/BF64_SET are the starting bit and the field
 * width in bits: VERSION occupies bits 0-7 and FLAGS occupies bits 8-15.
 * Each SET macro takes the new field value as 'v'.
 */
141 #define DLH_GET_VERSION(dlh) BF64_GET((dlh)->dlh_info, 0, 8)
142 #define DLH_SET_VERSION(dlh, v) BF64_SET((dlh)->dlh_info, 0, 8, v)
143 #define DLH_GET_FLAGS(dlh) BF64_GET((dlh)->dlh_info, 8, 8)
144 #define DLH_SET_FLAGS(dlh, v) BF64_SET((dlh)->dlh_info, 8, 8, v)
146 /* DDT log update state */
148 dmu_tx_t
*dlu_tx
; /* tx the update is being applied to */
149 dnode_t
*dlu_dn
; /* log object dnode */
150 dmu_buf_t
**dlu_dbp
; /* array of block buffer pointers */
151 int dlu_ndbp
; /* number of block buffer pointers */
152 uint16_t dlu_reclen
; /* cached length of record */
153 uint64_t dlu_block
; /* block for next entry */
154 uint64_t dlu_offset
; /* offset for next entry */
158 * Ops vector to access a specific DDT object type.
161 char ddt_op_name
[32];
162 int (*ddt_op_create
)(objset_t
*os
, uint64_t *object
, dmu_tx_t
*tx
,
164 int (*ddt_op_destroy
)(objset_t
*os
, uint64_t object
, dmu_tx_t
*tx
);
165 int (*ddt_op_lookup
)(objset_t
*os
, uint64_t object
,
166 const ddt_key_t
*ddk
, void *phys
, size_t psize
);
167 int (*ddt_op_contains
)(objset_t
*os
, uint64_t object
,
168 const ddt_key_t
*ddk
);
169 void (*ddt_op_prefetch
)(objset_t
*os
, uint64_t object
,
170 const ddt_key_t
*ddk
);
171 void (*ddt_op_prefetch_all
)(objset_t
*os
, uint64_t object
);
172 int (*ddt_op_update
)(objset_t
*os
, uint64_t object
,
173 const ddt_key_t
*ddk
, const void *phys
, size_t psize
,
175 int (*ddt_op_remove
)(objset_t
*os
, uint64_t object
,
176 const ddt_key_t
*ddk
, dmu_tx_t
*tx
);
177 int (*ddt_op_walk
)(objset_t
*os
, uint64_t object
, uint64_t *walk
,
178 ddt_key_t
*ddk
, void *phys
, size_t psize
);
179 int (*ddt_op_count
)(objset_t
*os
, uint64_t object
, uint64_t *count
);
/*
 * A constant ops vector declared here and defined elsewhere.
 * NOTE(review): presumably the ZAP-backed implementation, judging by the
 * name - confirm against its definition.
 */
182 extern const ddt_ops_t ddt_zap_ops
;
/*
 * Log update entry points. All three take the table and a ddt_log_update_t;
 * ddt_log_begin() additionally takes an entry count and a tx, and
 * ddt_log_entry() additionally takes one lightweight entry. None of them
 * returns a value.
 * NOTE(review): presumably called in begin/entry/commit order to append
 * entries to the log under a single tx - confirm against the implementation.
 */
185 extern void ddt_log_begin(ddt_t
*ddt
, size_t nentries
, dmu_tx_t
*tx
,
186 ddt_log_update_t
*dlu
);
187 extern void ddt_log_entry(ddt_t
*ddt
, ddt_lightweight_entry_t
*dde
,
188 ddt_log_update_t
*dlu
);
189 extern void ddt_log_commit(ddt_t
*ddt
, ddt_log_update_t
*dlu
);
/*
 * Log lookups and removal; each returns a boolean_t.
 * ddt_log_take_first() and ddt_log_remove_key() operate on an explicit
 * ddt_log_t, while ddt_log_find_key() is given only the table and a key.
 * take_first and find_key also receive a non-const lightweight entry pointer.
 * NOTE(review): presumably B_TRUE means an entry was found (and, where a
 * ddlwe is passed, copied out into it) - confirm against the implementation.
 */
191 extern boolean_t
ddt_log_take_first(ddt_t
*ddt
, ddt_log_t
*ddl
,
192 ddt_lightweight_entry_t
*ddlwe
);
194 extern boolean_t
ddt_log_find_key(ddt_t
*ddt
, const ddt_key_t
*ddk
,
195 ddt_lightweight_entry_t
*ddlwe
);
196 extern boolean_t
ddt_log_remove_key(ddt_t
*ddt
, ddt_log_t
*ddl
,
197 const ddt_key_t
*ddk
);
199 extern void ddt_log_checkpoint(ddt_t
*ddt
, ddt_lightweight_entry_t
*ddlwe
,
/*
 * Whole-log operations; each takes the table and a tx. Only ddt_log_swap()
 * returns a value (boolean_t).
 * NOTE(review): the meaning of ddt_log_swap()'s result is not visible in this
 * header - confirm against the implementation before relying on it.
 */
201 extern void ddt_log_truncate(ddt_t
*ddt
, dmu_tx_t
*tx
);
203 extern boolean_t
ddt_log_swap(ddt_t
*ddt
, dmu_tx_t
*tx
);
205 extern void ddt_log_destroy(ddt_t
*ddt
, dmu_tx_t
*tx
);
/*
 * Per-table log setup/teardown (load, alloc, free take the table) and the
 * argument-less init/fini pair. Only ddt_log_load() returns a value (int).
 * NOTE(review): presumably ddt_log_load() returns 0 or an errno, and
 * init/fini are one-time module setup/teardown - confirm.
 */
207 extern int ddt_log_load(ddt_t
*ddt
);
208 extern void ddt_log_alloc(ddt_t
*ddt
);
209 extern void ddt_log_free(ddt_t
*ddt
);
211 extern void ddt_log_init(void);
212 extern void ddt_log_fini(void);
215 * These are only exposed so that zdb can access them. Try not to use them
216 * outside of the DDT implementation proper, and if you do, consider moving
221 * We use a histogram to convert a percentage request into a
222 * cutoff value where entries older than the cutoff get pruned.
224 * The histogram bins represent hours in power-of-two increments.
225 * 16 bins covers up to four years.
229 typedef struct ddt_age_histo
{
230 uint64_t dah_entries
;
231 uint64_t dah_age_histo
[HIST_BINS
];
/*
 * Takes a pool, a cutoff value and a pointer to an age histogram; returns
 * nothing.
 * NOTE(review): whether this call fills the histogram, prunes entries older
 * than the cutoff, or both is not visible in this header - confirm against
 * the definition.
 */
234 void ddt_prune_walk(spa_t
*spa
, uint64_t cutoff
, ddt_age_histo_t
*histogram
);
236 #if defined(_KERNEL) || !defined(ZFS_DEBUG)
237 #define ddt_dump_age_histogram(histo, cutoff) ((void)0)
240 ddt_dump_age_histogram(ddt_age_histo_t
*histogram
, uint64_t cutoff
)
242 if (histogram
->dah_entries
== 0)
245 (void) printf("DDT prune unique class age, %llu hour cutoff\n",
246 (u_longlong_t
)(gethrestime_sec() - cutoff
)/3600);
247 (void) printf("%5s %9s %4s\n", "age", "blocks", "amnt");
248 (void) printf("%5s %9s %4s\n", "-----", "---------", "----");
249 for (int i
= 0; i
< HIST_BINS
; i
++) {
250 (void) printf("%5d %9llu %4d%%\n", 1<<i
,
251 (u_longlong_t
)histogram
->dah_age_histo
[i
],
252 (int)((histogram
->dah_age_histo
[i
] * 100) /
253 histogram
->dah_entries
));
259 * Enough room to expand DMU_POOL_DDT format for all possible DDT
260 * checksum/class/type combinations.
/*
 * NOTE(review): the ddt_op_name array in the ops vector is also 32 bytes but
 * is sized with a literal; confirm whether the two are meant to stay equal.
 */
262 #define DDT_NAMELEN 32
/*
 * ddt_phys_total_refcnt() takes a const table and a const phys pointer and
 * returns a uint64_t. ddt_key_fill() takes a key to write and a const block
 * pointer, and returns nothing.
 * NOTE(review): presumably the former sums reference counts held in *ddp and
 * the latter derives *ddk from *bp - confirm against the definitions.
 */
264 extern uint64_t ddt_phys_total_refcnt(const ddt_t
*ddt
,
265 const ddt_univ_phys_t
*ddp
);
267 extern void ddt_key_fill(ddt_key_t
*ddk
, const blkptr_t
*bp
);
269 extern void ddt_object_name(ddt_t
*ddt
, ddt_type_t type
, ddt_class_t clazz
,
/*
 * Takes the table, an object type/class pair, a non-const uint64_t walk
 * pointer and a lightweight entry pointer; returns an int.
 * NOTE(review): presumably *walk is an iteration cursor advanced by each
 * call and the return is 0 or an errno - confirm against the definition.
 */
271 extern int ddt_object_walk(ddt_t
*ddt
, ddt_type_t type
, ddt_class_t clazz
,
272 uint64_t *walk
, ddt_lightweight_entry_t
*ddlwe
);
273 extern int ddt_object_count(ddt_t
*ddt
, ddt_type_t type
, ddt_class_t clazz
,
275 extern int ddt_object_info(ddt_t
*ddt
, ddt_type_t type
, ddt_class_t clazz
,
276 dmu_object_info_t
*);
282 #endif /* _SYS_DDT_IMPL_H */