4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2016 by Delphix. All rights reserved.
24 * Copyright (c) 2023, Klara Inc.
30 #include <sys/sysmacros.h>
31 #include <sys/types.h>
32 #include <sys/fs/zfs.h>
43 * DDT-wide feature flags. These are set in ddt_flags by ddt_configure().
/* Individual feature bits kept in a DDT's ddt_flags word. */
#define	DDT_FLAG_FLAT	(1 << 0)	/* single extensible phys */
#define	DDT_FLAG_LOG	(1 << 1)	/* dedup log (journal) */

/* Union of every feature bit defined above. */
#define	DDT_FLAG_MASK	(DDT_FLAG_FLAT | DDT_FLAG_LOG)
50 * DDT on-disk storage object types. Each one corresponds to specific
51 * implementation, see ddt_ops_t. The value itself is not stored on disk.
53 * When searching for an entry, object types will be searched in this order.
55 * Note that DDT_TYPES is used as the "no type" for new entries that have not
56 * yet been written to a storage object.
59 DDT_TYPE_ZAP
= 0, /* ZAP storage object, ddt_zap */
63 _Static_assert(DDT_TYPES
<= UINT8_MAX
,
64 "ddt_type_t must fit in a uint8_t");
66 /* New and updated entries recieve this type, see ddt_sync_entry() */
67 #define DDT_TYPE_DEFAULT (DDT_TYPE_ZAP)
70 * DDT storage classes. Each class has a separate storage object for each type.
71 * The value itself is not stored on disk.
73 * When searching for an entry, object classes will be searched in this order.
75 * Note that DDT_CLASSES is used as the "no class" for new entries that have not
76 * yet been written to a storage object.
79 DDT_CLASS_DITTO
= 0, /* entry has ditto blocks (obsolete) */
80 DDT_CLASS_DUPLICATE
, /* entry has multiple references */
81 DDT_CLASS_UNIQUE
, /* entry has a single reference */
85 _Static_assert(DDT_CLASSES
< UINT8_MAX
,
86 "ddt_class_t must fit in a uint8_t");
89 * The "key" part of an on-disk entry. This is the unique "name" for a block,
90 * that is, the parts of the block pointer that will always be the same for
94 zio_cksum_t ddk_cksum
; /* 256-bit block checksum */
96 * Encoded with logical & physical size, encryption, and compression,
98 * +-------+-------+-------+-------+-------+-------+-------+-------+
99 * | 0 | 0 | 0 |X| comp| PSIZE | LSIZE |
100 * +-------+-------+-------+-------+-------+-------+-------+-------+
106 * Macros for accessing parts of a ddt_key_t. These are similar to their BP_*
/*
 * Size fields of ddk_prop: LSIZE is the 16-bit field at bit 0 and PSIZE the
 * 16-bit field at bit 16. Both go through the BF64_*_SB accessors, passing
 * SPA_MINBLOCKSHIFT and 1 as the trailing arguments.
 */
#define	DDK_GET_LSIZE(ddk)	\
	BF64_GET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1)
#define	DDK_SET_LSIZE(ddk, x)	\
	BF64_SET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x)

#define	DDK_GET_PSIZE(ddk)	\
	BF64_GET_SB((ddk)->ddk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1)
#define	DDK_SET_PSIZE(ddk, x)	\
	BF64_SET_SB((ddk)->ddk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x)

/* Compression: 7-bit field at bit 32 of ddk_prop. */
#define	DDK_GET_COMPRESS(ddk)		BF64_GET((ddk)->ddk_prop, 32, 7)
#define	DDK_SET_COMPRESS(ddk, x)	BF64_SET((ddk)->ddk_prop, 32, 7, x)

/* Crypt: single bit at bit 39 of ddk_prop. */
#define	DDK_GET_CRYPT(ddk)		BF64_GET((ddk)->ddk_prop, 39, 1)
#define	DDK_SET_CRYPT(ddk, x)		BF64_SET((ddk)->ddk_prop, 39, 1, x)
126 * The "value" part for an on-disk entry. These are the "physical"
127 * characteristics of the stored block, such as its location on disk (DVAs),
128 * birth txg and ref count.
130 * The "traditional" entry has an array of four, one for each number of DVAs
131 * (copies= property) and another for additional "ditto" copies. Users of the
132 * traditional struct will specify the variant (index) of the one they want.
134 * The newer "flat" entry has only a single form that is specified using the
135 * DDT_PHYS_FLAT variant.
137 * Since the value size varies, use one of the size macros when interfacing
/* Element count of the traditional ddp_trad[] array. */
#define	DDT_PHYS_MAX	(4)
144 * Note - this can be used in a flexible array and allocated for
145 * a specific size (ddp_trad or ddp_flat). So be careful not to
146 * copy using "=" assignment but instead use ddt_phys_copy().
150 * Traditional physical payload value for DDT zap (256 bytes)
153 dva_t ddp_dva
[SPA_DVAS_PER_BP
];
155 uint64_t ddp_phys_birth
;
156 } ddp_trad
[DDT_PHYS_MAX
];
159 * Flat physical payload value for DDT zap (72 bytes)
162 dva_t ddp_dva
[SPA_DVAS_PER_BP
];
164 uint64_t ddp_phys_birth
; /* txg based from BP */
165 uint64_t ddp_class_start
; /* in realtime seconds */
170 * This enum denotes which variant of a ddt_univ_phys_t to target. For
171 * a traditional DDT entry, it represents the indexes into the ddp_trad
172 * array. Any consumer of a ddt_univ_phys_t needs to know which variant
175 * Note, we no longer generate new DDT_PHYS_DITTO-type blocks. However,
176 * we maintain the ability to free existing dedup-ditto blocks.
186 } ddt_phys_variant_t
;
/*
 * Resolve the variant to use for `p` on this DDT: DDT_PHYS_FLAT whenever the
 * DDT has DDT_FLAG_FLAT set, otherwise `p` itself. `p` is first asserted to
 * be below DDT_PHYS_NONE.
 */
#define	DDT_PHYS_VARIANT(ddt, p)	\
	(ASSERT((p) < DDT_PHYS_NONE),	\
	((ddt)->ddt_flags & DDT_FLAG_FLAT ? DDT_PHYS_FLAT : (p)))

/*
 * Byte sizes of the two ddt_univ_phys_t members. The null pointer is only
 * an operand of sizeof and is never dereferenced.
 */
#define	DDT_TRAD_PHYS_SIZE	sizeof (((ddt_univ_phys_t *)0)->ddp_trad)
#define	DDT_FLAT_PHYS_SIZE	sizeof (((ddt_univ_phys_t *)0)->ddp_flat)

/* Yield `flat` when the DDT has DDT_FLAG_FLAT set, `trad` otherwise. */
#define	_DDT_PHYS_SWITCH(ddt, flat, trad)	\
	(((ddt)->ddt_flags & DDT_FLAG_FLAT) ? (flat) : (trad))

/* Size of the phys payload form selected by this DDT's flags. */
#define	DDT_PHYS_SIZE(ddt)		_DDT_PHYS_SWITCH(ddt,	\
	DDT_FLAT_PHYS_SIZE, DDT_TRAD_PHYS_SIZE)

/* Flat: 1. Traditional: DDT_PHYS_MAX. */
#define	DDT_NPHYS(ddt)			_DDT_PHYS_SWITCH(ddt, 1, DDT_PHYS_MAX)

/* Flat: always 0. Traditional: `p` unchanged. */
#define	DDT_PHYS_FOR_COPIES(ddt, p)	_DDT_PHYS_SWITCH(ddt, 0, p)

/*
 * Flat: always 0. Traditional: nonzero only when `p` is 0. The argument is
 * parenthesized so that an expression such as `a & b` is compared as a whole
 * rather than being parsed as `a & (b == 0)`.
 */
#define	DDT_PHYS_IS_DITTO(ddt, p)	_DDT_PHYS_SWITCH(ddt, 0, ((p) == 0))
206 * A "live" entry, holding changes to an entry made this txg, and other data to
207 * support loading, updating and repairing the entry.
/* Bit values for the dde_flags state field */
#define	DDE_FLAG_LOADED		(1 << 0)	/* entry ready for use */
#define	DDE_FLAG_OVERQUOTA	(1 << 1)	/* entry unusable, no space */
#define	DDE_FLAG_LOGGED		(1 << 2)	/* loaded from log */
216 * Additional data to support entry update or repair. This is fixed size
217 * because it's relatively rarely used.
220 /* copy of data after a repair read, to be rewritten */
221 abd_t
*dde_repair_abd
;
223 /* original phys contents before update, for error handling */
224 ddt_univ_phys_t dde_orig_phys
;
226 /* in-flight update IOs */
227 zio_t
*dde_lead_zio
[DDT_PHYS_MAX
];
231 /* key must be first for ddt_key_compare */
232 ddt_key_t dde_key
; /* ddt_tree key */
233 avl_node_t dde_node
; /* ddt_tree_node */
235 /* storage type and class the entry was loaded from */
237 ddt_class_t dde_class
;
239 uint8_t dde_flags
; /* load state flags */
240 kcondvar_t dde_cv
; /* signaled when load completes */
241 uint64_t dde_waiters
; /* count of waiters on dde_cv */
243 ddt_entry_io_t
*dde_io
; /* IO support, when required */
245 ddt_univ_phys_t dde_phys
[]; /* flexible -- allocated size varies */
249 * A lightweight entry is for short-lived or transient uses, like iterating or
250 * inspecting, when you don't care where it came from.
254 ddt_type_t ddlwe_type
;
255 ddt_class_t ddlwe_class
;
256 ddt_univ_phys_t ddlwe_phys
;
257 } ddt_lightweight_entry_t
;
260 * In-core DDT log. A separate struct to make it easier to switch between the
261 * appending and flushing logs.
264 avl_tree_t ddl_tree
; /* logged entries */
265 uint32_t ddl_flags
; /* flags for this log */
266 uint64_t ddl_object
; /* log object id */
267 uint64_t ddl_length
; /* on-disk log size */
268 uint64_t ddl_first_txg
; /* txg log became active */
269 ddt_key_t ddl_checkpoint
; /* last checkpoint */
273 * In-core DDT object. This covers all entries and stats for the whole pool
274 * for a given checksum type.
277 kmutex_t ddt_lock
; /* protects changes to all fields */
279 avl_tree_t ddt_tree
; /* "live" (changed) entries this txg */
280 avl_tree_t ddt_log_tree
; /* logged entries */
282 avl_tree_t ddt_repair_tree
; /* entries being repaired */
284 ddt_log_t ddt_log
[2]; /* active/flushing logs */
285 ddt_log_t
*ddt_log_active
; /* pointers into ddt_log */
286 ddt_log_t
*ddt_log_flushing
; /* swapped when flush starts */
288 hrtime_t ddt_flush_start
; /* log flush start this txg */
289 uint32_t ddt_flush_pass
; /* log flush pass this txg */
291 int32_t ddt_flush_count
; /* entries flushed this txg */
292 int32_t ddt_flush_min
; /* min rem entries to flush */
293 int32_t ddt_log_ingest_rate
; /* rolling log ingest rate */
294 int32_t ddt_log_flush_rate
; /* rolling log flush rate */
295 int32_t ddt_log_flush_time_rate
; /* avg time spent flushing */
297 uint64_t ddt_flush_force_txg
; /* flush hard before this txg */
299 kstat_t
*ddt_ksp
; /* kstats context */
301 enum zio_checksum ddt_checksum
; /* checksum algorithm in use */
302 spa_t
*ddt_spa
; /* pool this ddt is on */
303 objset_t
*ddt_os
; /* ddt objset (always MOS) */
305 uint64_t ddt_dir_object
; /* MOS dir holding ddt objects */
306 uint64_t ddt_version
; /* DDT version */
307 uint64_t ddt_flags
; /* FDT option flags */
309 /* per-type/per-class entry store objects */
310 uint64_t ddt_object
[DDT_TYPES
][DDT_CLASSES
];
312 /* object ids for stored, logged and per-type/per-class stats */
313 uint64_t ddt_stat_object
;
314 ddt_object_t ddt_log_stats
;
315 ddt_object_t ddt_object_stats
[DDT_TYPES
][DDT_CLASSES
];
317 /* type/class stats by power-2-sized referenced blocks */
318 ddt_histogram_t ddt_histogram
[DDT_TYPES
][DDT_CLASSES
];
319 ddt_histogram_t ddt_histogram_cache
[DDT_TYPES
][DDT_CLASSES
];
321 /* log stats by power-2-sized referenced blocks */
322 ddt_histogram_t ddt_log_histogram
;
326 * In-core and on-disk bookmark for DDT walks. This is a cursor for ddt_walk(),
327 * and is stable across calls, even if the DDT is updated, the pool is
328 * restarted or loaded on another system, or OpenZFS is upgraded.
333 uint64_t ddb_checksum
;
337 extern void ddt_bp_fill(const ddt_univ_phys_t
*ddp
, ddt_phys_variant_t v
,
338 blkptr_t
*bp
, uint64_t txg
);
339 extern void ddt_bp_create(enum zio_checksum checksum
, const ddt_key_t
*ddk
,
340 const ddt_univ_phys_t
*ddp
, ddt_phys_variant_t v
, blkptr_t
*bp
);
342 extern void ddt_phys_extend(ddt_univ_phys_t
*ddp
, ddt_phys_variant_t v
,
344 extern void ddt_phys_copy(ddt_univ_phys_t
*dst
, const ddt_univ_phys_t
*src
,
345 ddt_phys_variant_t v
);
346 extern void ddt_phys_clear(ddt_univ_phys_t
*ddp
, ddt_phys_variant_t v
);
347 extern void ddt_phys_addref(ddt_univ_phys_t
*ddp
, ddt_phys_variant_t v
);
348 extern uint64_t ddt_phys_decref(ddt_univ_phys_t
*ddp
, ddt_phys_variant_t v
);
349 extern uint64_t ddt_phys_refcnt(const ddt_univ_phys_t
*ddp
,
350 ddt_phys_variant_t v
);
351 extern ddt_phys_variant_t
ddt_phys_select(const ddt_t
*ddt
,
352 const ddt_entry_t
*dde
, const blkptr_t
*bp
);
353 extern uint64_t ddt_phys_birth(const ddt_univ_phys_t
*ddp
,
354 ddt_phys_variant_t v
);
355 extern int ddt_phys_dva_count(const ddt_univ_phys_t
*ddp
, ddt_phys_variant_t v
,
356 boolean_t encrypted
);
358 extern void ddt_histogram_add_entry(ddt_t
*ddt
, ddt_histogram_t
*ddh
,
359 const ddt_lightweight_entry_t
*ddlwe
);
360 extern void ddt_histogram_sub_entry(ddt_t
*ddt
, ddt_histogram_t
*ddh
,
361 const ddt_lightweight_entry_t
*ddlwe
);
363 extern void ddt_histogram_add(ddt_histogram_t
*dst
, const ddt_histogram_t
*src
);
364 extern void ddt_histogram_total(ddt_stat_t
*dds
, const ddt_histogram_t
*ddh
);
365 extern boolean_t
ddt_histogram_empty(const ddt_histogram_t
*ddh
);
367 extern void ddt_get_dedup_object_stats(spa_t
*spa
, ddt_object_t
*ddo
);
368 extern uint64_t ddt_get_ddt_dsize(spa_t
*spa
);
369 extern void ddt_get_dedup_histogram(spa_t
*spa
, ddt_histogram_t
*ddh
);
370 extern void ddt_get_dedup_stats(spa_t
*spa
, ddt_stat_t
*dds_total
);
372 extern uint64_t ddt_get_dedup_dspace(spa_t
*spa
);
373 extern uint64_t ddt_get_pool_dedup_ratio(spa_t
*spa
);
374 extern int ddt_get_pool_dedup_cached(spa_t
*spa
, uint64_t *psize
);
376 extern ddt_t
*ddt_select(spa_t
*spa
, const blkptr_t
*bp
);
377 extern void ddt_enter(ddt_t
*ddt
);
378 extern void ddt_exit(ddt_t
*ddt
);
379 extern void ddt_init(void);
380 extern void ddt_fini(void);
381 extern ddt_entry_t
*ddt_lookup(ddt_t
*ddt
, const blkptr_t
*bp
);
382 extern void ddt_remove(ddt_t
*ddt
, ddt_entry_t
*dde
);
383 extern void ddt_prefetch(spa_t
*spa
, const blkptr_t
*bp
);
384 extern void ddt_prefetch_all(spa_t
*spa
);
386 extern boolean_t
ddt_class_contains(spa_t
*spa
, ddt_class_t max_class
,
389 extern void ddt_alloc_entry_io(ddt_entry_t
*dde
);
391 extern ddt_entry_t
*ddt_repair_start(ddt_t
*ddt
, const blkptr_t
*bp
);
392 extern void ddt_repair_done(ddt_t
*ddt
, ddt_entry_t
*dde
);
394 extern int ddt_key_compare(const void *x1
, const void *x2
);
396 extern void ddt_create(spa_t
*spa
);
397 extern int ddt_load(spa_t
*spa
);
398 extern void ddt_unload(spa_t
*spa
);
399 extern void ddt_sync(spa_t
*spa
, uint64_t txg
);
401 extern void ddt_walk_init(spa_t
*spa
, uint64_t txg
);
402 extern boolean_t
ddt_walk_ready(spa_t
*spa
);
403 extern int ddt_walk(spa_t
*spa
, ddt_bookmark_t
*ddb
,
404 ddt_lightweight_entry_t
*ddlwe
);
406 extern boolean_t
ddt_addref(spa_t
*spa
, const blkptr_t
*bp
);
408 extern int ddt_prune_unique_entries(spa_t
*spa
, zpool_ddt_prune_unit_t unit
,
415 #endif /* _SYS_DDT_H */