 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
 * Copyright (c) 2022 by Pawel Jakub Dawidek
 * Copyright (c) 2019, 2023, Klara Inc.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/ddt.h>
#include <sys/ddt_impl.h>
#include <sys/zap.h>
#include <sys/dmu_tx.h>
#include <sys/arc.h>
#include <sys/dsl_pool.h>
#include <sys/zio_checksum.h>
#include <sys/dsl_scan.h>
#include <sys/abd.h>
#include <sys/zfeature.h>

/*
 * # DDT: Deduplication tables
 *
 * The dedup subsystem provides block-level deduplication. When enabled, blocks
 * to be written will have the dedup (D) bit set, which causes them to be
 * tracked in a "dedup table", or DDT. If a block has been seen before (exists
 * in the DDT), then instead of being written, it will be made to reference the
 * existing on-disk data, and its refcount in the DDT will be bumped.
 *
 * ## Dedup tables and entries
 *
 * Conceptually, a DDT is a dictionary or map. Each entry has a "key"
 * (ddt_key_t) made up of a block's checksum and certain properties, and a
 * "value" (one or more ddt_phys_t) containing valid DVAs for the block's data,
 * birth time and refcount. Together these are enough to track references to a
 * specific block, to build a valid block pointer to reference that block (for
 * freeing, scrubbing, etc), and to fill a new block pointer with the missing
 * pieces to make it seem like it was written.
 *
 * There's a single DDT (ddt_t) for each checksum type, held in spa_ddt[].
 * Within each DDT, there can be multiple storage "types" (ddt_type_t, on-disk
 * object data formats, each with their own implementations) and "classes"
 * (ddt_class_t, instance of a storage type object, for entries with a specific
 * characteristic). An entry (key) will only ever exist on one of these objects
 * at any given time, but may be moved from one to another if its type or
 * class changes.
 *
 * The DDT is driven by the write IO pipeline (zio_ddt_write()). When a block
 * is to be written, before DVAs have been allocated, ddt_lookup() is called to
 * see if the block has been seen before. If it's not found, the write proceeds
 * as normal, and after it succeeds, a new entry is created. If it is found, we
 * fill the BP with the DVAs from the entry, increment the refcount and cause
 * the write IO to return immediately.
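 *
 * As an illustrative sketch only (the authoritative flow is zio_ddt_write();
 * the functions named here are real functions from this file, but the control
 * flow is simplified and error/quota handling is omitted):
 *
 *	ddt_enter(ddt);
 *	dde = ddt_lookup(ddt, bp);
 *	if (the entry already has DVAs) {
 *		ddt_bp_fill(dde->dde_phys, v, bp, txg);	// reuse them
 *		ddt_phys_addref(dde->dde_phys, v);	// bump the refcount
 *	} else {
 *		// write as normal, then ddt_phys_extend() the entry
 *	}
 *	ddt_exit(ddt);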
 *
 * Traditionally, each ddt_phys_t slot in the entry represents a separate dedup
 * block for the same content/checksum. The slot is selected based on the
 * zp_copies parameter the block is written with, that is, the number of DVAs
 * in the block. The "ditto" slot (DDT_PHYS_DITTO) was used for the
 * now-removed "dedupditto" feature. These are no longer written, and will be
 * freed if encountered on old pools.
 *
 * If the "fast_dedup" feature is enabled, new dedup tables will be created
 * with the "flat phys" option. In this mode, there is only one ddt_phys_t
 * slot. If a write is issued for an entry that exists, but has fewer DVAs,
 * then only as many new DVAs are allocated and written to make up the
 * shortfall. The existing entry is then extended (ddt_phys_extend()) with the
 * new DVAs.
 *
 * ## Lifetime of an entry
 *
 * A DDT can be enormous, and typically is not held in memory all at once.
 * Instead, the changes to an entry are tracked in memory, and written down to
 * disk at the end of each txg.
 *
 * A "live" in-memory entry (ddt_entry_t) is a node on the live tree
 * (ddt_tree). At the start of a txg, ddt_tree is empty. When an entry is
 * required for IO, ddt_lookup() is called. If an entry already exists on
 * ddt_tree, it is returned. Otherwise, a new one is created, and the
 * type/class objects for the DDT are searched for that key. If it's found, its
 * value is copied into the live entry. If not, an empty entry is created.
 *
 * The live entry will be modified during the txg, usually by modifying the
 * refcount, but sometimes by adding or updating DVAs. At the end of the txg
 * (during spa_sync()), type and class are recalculated for the entry (see
 * ddt_sync_entry()), and the entry is written to the appropriate storage
 * object and (if necessary) removed from an old one. ddt_tree is cleared and
 * the next txg can start.
 *
 * ## Dedup quota
 *
 * A maximum size for all DDTs on the pool can be set with the
 * dedup_table_quota property. This is determined in ddt_over_quota() and
 * enforced during ddt_lookup(). If the pool is at or over its quota limit,
 * ddt_lookup() will only return entries for existing blocks, as updates are
 * still possible. New entries will not be created; instead, ddt_lookup() will
 * return NULL. In response, the DDT write stage (zio_ddt_write()) will remove
 * the D bit on the block and reissue the IO as a regular write. The block will
 * not be deduplicated.
 *
 * Note that this is based on the on-disk size of the dedup store. Reclaiming
 * this space after deleting entries relies on the ZAP "shrinking" behaviour,
 * without which, no space would be recovered and the DDT would continue to be
 * considered "over quota". See zap_shrink_enabled.
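 *
 * For illustration (see zpoolprops(7) for the authoritative syntax and
 * values), the limit is set through the pool property that feeds
 * spa_dedup_table_quota, roughly:
 *
 *	zpool set dedup_table_quota=10G tank	(fixed size limit)
 *	zpool set dedup_table_quota=auto tank	(limit by dedup/special class)
 *	zpool set dedup_table_quota=none tank	(no limit; the default)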
 *
 * ## Dedup table pruning
 *
 * As a complement to the dedup quota feature, ddtprune allows removal of older
 * non-duplicate entries to make room for newer duplicate entries. The amount
 * to prune can be based on a target percentage of the unique entries, or based
 * on the age (i.e., prune unique entries older than N days).
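 *
 * A sketch of the user-facing side (zpool-ddtprune(8) has the real syntax):
 * pruning is requested per pool, either by age or by a percentage of the
 * unique entries, e.g.
 *
 *	zpool ddtprune -d 30 tank	(prune unique entries older than 30 days)
 *	zpool ddtprune -p 10 tank	(prune roughly 10% of the unique entries)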
 *
 * ## Dedup log
 *
 * Historically, all entries modified on a txg were written back to dedup
 * storage objects at the end of every txg. This could cause significant
 * overheads, as each entry only takes up a tiny portion of a ZAP leaf node,
 * and so required reading the whole node, updating the entry, and writing it
 * back. On busy pools, this could add serious IO and memory overheads.
 *
 * To address this, the dedup log was added. If the "fast_dedup" feature is
 * enabled, at the end of each txg, modified entries will be copied to an
 * in-memory "log" object (ddt_log_t), and appended to an on-disk log. If the
 * same block is requested again, the in-memory object will be checked first,
 * and if it's there, the entry is inflated back onto the live tree without
 * going to storage. The on-disk log is only read at pool import time, to
 * reload the in-memory log.
 *
 * Each txg, some amount of the in-memory log will be flushed out to a DDT
 * storage object (i.e. ZAP) as normal. OpenZFS will try hard to flush enough
 * to keep up with the rate of change on dedup entries, but not so much that it
 * would impact overall throughput, and not using too much memory. See the
 * zfs_dedup_log_* tuneables in zfs(4) for more details.
 *
 * ## Repair IO
 *
 * If a read on a dedup block fails, but there are other copies of the block in
 * the other ddt_phys_t slots, reads will be issued for those instead
 * (zio_ddt_read_start()). If one of those succeeds, the read is returned to
 * the caller, and a copy is stashed on the entry's dde_repair_abd.
 *
 * During the end-of-txg sync, any entries with a dde_repair_abd get a
 * "rewrite" write issued for the original block pointer, with the data read
 * from the alternate block. If the block is actually damaged, this will invoke
 * the pool's "self-healing" mechanism, and repair the block.
 *
 * If the "fast_dedup" feature is enabled, the "flat phys" option will be in
 * use, so there is only ever one ddt_phys_t slot. The repair process will
 * still happen in this case, though it is unlikely to succeed as there will
 * usually be no other equivalent blocks to fall back on (though there might
 * be, if this was an early version of a dedup'd block that has since been
 * extended to include multiple copies).
 *
 * Note that this repair mechanism is in addition to and separate from the
 * regular OpenZFS scrub and self-healing mechanisms.
 *
 * ## Scanning (scrub/resilver)
 *
 * If dedup is active, the scrub machinery will walk the dedup table first, and
 * scrub all blocks with refcnt > 1 first. After that it will move on to the
 * regular top-down scrub, and exclude the refcnt > 1 blocks when it sees them.
 * In this way, heavily deduplicated blocks are only scrubbed once. See the
 * commentary on dsl_scan_ddt() for more details.
 *
 * Walking the DDT is done via ddt_walk(). The current position is stored in a
 * ddt_bookmark_t, which represents a stable position in the storage object.
 * This bookmark is stored by the scan machinery, and must reference the same
 * position on the object even if the object changes, the pool is exported, or
 * OpenZFS is upgraded.
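 *
 * As a rough sketch (assuming the current ddt_walk() signature; dsl_scan.c is
 * the real consumer), a full walk of the stored entries looks like:
 *
 *	ddt_bookmark_t ddb = { 0 };
 *	ddt_lightweight_entry_t ddlwe;
 *	while (ddt_walk(spa, &ddb, &ddlwe) == 0) {
 *		// process the block described by ddlwe; ddb can be saved
 *		// and resumed from later
 *	}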
 *
 * If the "fast_dedup" feature is enabled and the table has a log, the scan
 * cannot begin until entries on the log are flushed, as the on-disk log has no
 * concept of a "stable position". Instead, the log flushing process will enter
 * a more aggressive mode, to flush out as much as is necessary as soon as
 * possible, in order to begin the scan as soon as possible.
 *
 * ## Interaction with block cloning
 *
 * If block cloning and dedup are both enabled on a pool, BRT will look for the
 * dedup bit on an incoming block pointer. If set, it will call into the DDT
 * (ddt_addref()) to add a reference to the block, instead of adding a
 * reference to the BRT. See brt_pending_apply().
 */

/*
 * These are the only checksums valid for dedup. They must match the list
 * from dedup_table in zfs_prop.c
 */
#define	DDT_CHECKSUM_VALID(c)	\
	(c == ZIO_CHECKSUM_SHA256 || c == ZIO_CHECKSUM_SHA512 || \
	c == ZIO_CHECKSUM_SKEIN || c == ZIO_CHECKSUM_EDONR || \
	c == ZIO_CHECKSUM_BLAKE3)
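
/*
 * Illustrative note: this macro is how the rest of the code gates access to
 * spa_ddt[]; for example, ddt_select() below effectively does
 *
 *	ASSERT(DDT_CHECKSUM_VALID(BP_GET_CHECKSUM(bp)));
 *	return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]);
 */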

static kmem_cache_t *ddt_cache;

static kmem_cache_t *ddt_entry_flat_cache;
static kmem_cache_t *ddt_entry_trad_cache;

#define	DDT_ENTRY_FLAT_SIZE	(sizeof (ddt_entry_t) + DDT_FLAT_PHYS_SIZE)
#define	DDT_ENTRY_TRAD_SIZE	(sizeof (ddt_entry_t) + DDT_TRAD_PHYS_SIZE)

#define	DDT_ENTRY_SIZE(ddt)	\
	_DDT_PHYS_SWITCH(ddt, DDT_ENTRY_FLAT_SIZE, DDT_ENTRY_TRAD_SIZE)

/*
 * Enable/disable prefetching of dedup-ed blocks which are going to be freed.
 */
int zfs_dedup_prefetch = 0;

/*
 * If the dedup class cannot satisfy a DDT allocation, treat as over quota
 * for this many TXGs.
 */
uint_t dedup_class_wait_txgs = 5;

/*
 * How many DDT prune entries to add to the DDT sync AVL tree.
 * Note these additional entries have a memory footprint of a
 * ddt_entry_t (216 bytes).
 */
static uint32_t zfs_ddt_prunes_per_txg = 50000;

/*
 * For testing, synthesize aged DDT entries
 * (in global scope for ztest)
 */
boolean_t ddt_prune_artificial_age = B_FALSE;
boolean_t ddt_dump_prune_histogram = B_FALSE;

/*
 * Don't do more than this many incremental flush passes per txg.
 */
uint_t zfs_dedup_log_flush_passes_max = 8;

/*
 * Minimum time to flush per txg.
 */
uint_t zfs_dedup_log_flush_min_time_ms = 1000;

/*
 * Minimum entries to flush per txg.
 */
uint_t zfs_dedup_log_flush_entries_min = 1000;

/*
 * Number of txgs to average flow rates across.
 */
uint_t zfs_dedup_log_flush_flow_rate_txgs = 10;

static const ddt_ops_t *const ddt_ops[DDT_TYPES] = {
	&ddt_zap_ops,
};

static const char *const ddt_class_name[DDT_CLASSES] = {
	"ditto",
	"duplicate",
	"unique",
};

/*
 * DDT feature flags automatically enabled for each on-disk version. Note that
 * versions >0 cannot exist on disk without SPA_FEATURE_FAST_DEDUP enabled.
 */
static const uint64_t ddt_version_flags[] = {
	[DDT_VERSION_LEGACY] = 0,
	[DDT_VERSION_FDT] = DDT_FLAG_FLAT | DDT_FLAG_LOG,
};
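
/*
 * For context (mirroring what ddt_configure() below does), a table's flags
 * are derived from its on-disk version through this array:
 *
 *	ddt->ddt_flags = ddt_version_flags[ddt->ddt_version];
 */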

typedef struct {
	/* total lookups and whether they returned new or existing entries */
	kstat_named_t dds_lookup;
	kstat_named_t dds_lookup_new;
	kstat_named_t dds_lookup_existing;

	/* entries found on live tree, and if we had to wait for load */
	kstat_named_t dds_lookup_live_hit;
	kstat_named_t dds_lookup_live_wait;
	kstat_named_t dds_lookup_live_miss;

	/* entries found on log trees */
	kstat_named_t dds_lookup_log_hit;
	kstat_named_t dds_lookup_log_active_hit;
	kstat_named_t dds_lookup_log_flushing_hit;
	kstat_named_t dds_lookup_log_miss;

	/* entries found on store objects */
	kstat_named_t dds_lookup_stored_hit;
	kstat_named_t dds_lookup_stored_miss;

	/* number of entries on log trees */
	kstat_named_t dds_log_active_entries;
	kstat_named_t dds_log_flushing_entries;

	/* avg updated/flushed entries per txg */
	kstat_named_t dds_log_ingest_rate;
	kstat_named_t dds_log_flush_rate;
	kstat_named_t dds_log_flush_time_rate;
} ddt_kstats_t;

static const ddt_kstats_t ddt_kstats_template = {
	{ "lookup",			KSTAT_DATA_UINT64 },
	{ "lookup_new",			KSTAT_DATA_UINT64 },
	{ "lookup_existing",		KSTAT_DATA_UINT64 },
	{ "lookup_live_hit",		KSTAT_DATA_UINT64 },
	{ "lookup_live_wait",		KSTAT_DATA_UINT64 },
	{ "lookup_live_miss",		KSTAT_DATA_UINT64 },
	{ "lookup_log_hit",		KSTAT_DATA_UINT64 },
	{ "lookup_log_active_hit",	KSTAT_DATA_UINT64 },
	{ "lookup_log_flushing_hit",	KSTAT_DATA_UINT64 },
	{ "lookup_log_miss",		KSTAT_DATA_UINT64 },
	{ "lookup_stored_hit",		KSTAT_DATA_UINT64 },
	{ "lookup_stored_miss",		KSTAT_DATA_UINT64 },
	{ "log_active_entries",		KSTAT_DATA_UINT64 },
	{ "log_flushing_entries",	KSTAT_DATA_UINT64 },
	{ "log_ingest_rate",		KSTAT_DATA_UINT32 },
	{ "log_flush_rate",		KSTAT_DATA_UINT32 },
	{ "log_flush_time_rate",	KSTAT_DATA_UINT32 },
};

#ifdef _KERNEL
#define	_DDT_KSTAT_STAT(ddt, stat) \
	&((ddt_kstats_t *)(ddt)->ddt_ksp->ks_data)->stat.value.ui64
#define	DDT_KSTAT_BUMP(ddt, stat) \
	do { atomic_inc_64(_DDT_KSTAT_STAT(ddt, stat)); } while (0)
#define	DDT_KSTAT_ADD(ddt, stat, val) \
	do { atomic_add_64(_DDT_KSTAT_STAT(ddt, stat), val); } while (0)
#define	DDT_KSTAT_SUB(ddt, stat, val) \
	do { atomic_sub_64(_DDT_KSTAT_STAT(ddt, stat), val); } while (0)
#define	DDT_KSTAT_SET(ddt, stat, val) \
	do { atomic_store_64(_DDT_KSTAT_STAT(ddt, stat), val); } while (0)
#define	DDT_KSTAT_ZERO(ddt, stat)	DDT_KSTAT_SET(ddt, stat, 0)
#else
#define	DDT_KSTAT_BUMP(ddt, stat)	do {} while (0)
#define	DDT_KSTAT_ADD(ddt, stat, val)	do {} while (0)
#define	DDT_KSTAT_SUB(ddt, stat, val)	do {} while (0)
#define	DDT_KSTAT_SET(ddt, stat, val)	do {} while (0)
#define	DDT_KSTAT_ZERO(ddt, stat)	do {} while (0)
#endif
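
/*
 * Note (path assumed from the kstat module/name used in
 * ddt_table_alloc_kstats() below): on Linux these counters typically surface
 * under /proc/spl/kstat/zfs/<pool>/ddt_stats_<checksum>.
 */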
365 ddt_object_create(ddt_t
*ddt
, ddt_type_t type
, ddt_class_t
class,
368 spa_t
*spa
= ddt
->ddt_spa
;
369 objset_t
*os
= ddt
->ddt_os
;
370 uint64_t *objectp
= &ddt
->ddt_object
[type
][class];
371 boolean_t prehash
= zio_checksum_table
[ddt
->ddt_checksum
].ci_flags
&
372 ZCHECKSUM_FLAG_DEDUP
;
373 char name
[DDT_NAMELEN
];
375 ASSERT3U(ddt
->ddt_dir_object
, >, 0);
377 ddt_object_name(ddt
, type
, class, name
);
379 ASSERT3U(*objectp
, ==, 0);
380 VERIFY0(ddt_ops
[type
]->ddt_op_create(os
, objectp
, tx
, prehash
));
381 ASSERT3U(*objectp
, !=, 0);
383 ASSERT3U(ddt
->ddt_version
, !=, DDT_VERSION_UNCONFIGURED
);
385 VERIFY0(zap_add(os
, ddt
->ddt_dir_object
, name
, sizeof (uint64_t), 1,
388 VERIFY0(zap_add(os
, spa
->spa_ddt_stat_object
, name
,
389 sizeof (uint64_t), sizeof (ddt_histogram_t
) / sizeof (uint64_t),
390 &ddt
->ddt_histogram
[type
][class], tx
));
394 ddt_object_destroy(ddt_t
*ddt
, ddt_type_t type
, ddt_class_t
class,
397 spa_t
*spa
= ddt
->ddt_spa
;
398 objset_t
*os
= ddt
->ddt_os
;
399 uint64_t *objectp
= &ddt
->ddt_object
[type
][class];
401 char name
[DDT_NAMELEN
];
403 ASSERT3U(ddt
->ddt_dir_object
, >, 0);
405 ddt_object_name(ddt
, type
, class, name
);
407 ASSERT3U(*objectp
, !=, 0);
408 ASSERT(ddt_histogram_empty(&ddt
->ddt_histogram
[type
][class]));
409 VERIFY0(ddt_object_count(ddt
, type
, class, &count
));
411 VERIFY0(zap_remove(os
, ddt
->ddt_dir_object
, name
, tx
));
412 VERIFY0(zap_remove(os
, spa
->spa_ddt_stat_object
, name
, tx
));
413 VERIFY0(ddt_ops
[type
]->ddt_op_destroy(os
, *objectp
, tx
));
414 memset(&ddt
->ddt_object_stats
[type
][class], 0, sizeof (ddt_object_t
));
420 ddt_object_load(ddt_t
*ddt
, ddt_type_t type
, ddt_class_t
class)
422 ddt_object_t
*ddo
= &ddt
->ddt_object_stats
[type
][class];
423 dmu_object_info_t doi
;
425 char name
[DDT_NAMELEN
];
428 if (ddt
->ddt_dir_object
== 0) {
430 * If we're configured but the containing dir doesn't exist
431 * yet, then this object can't possibly exist either.
433 ASSERT3U(ddt
->ddt_version
, !=, DDT_VERSION_UNCONFIGURED
);
434 return (SET_ERROR(ENOENT
));
437 ddt_object_name(ddt
, type
, class, name
);
439 error
= zap_lookup(ddt
->ddt_os
, ddt
->ddt_dir_object
, name
,
440 sizeof (uint64_t), 1, &ddt
->ddt_object
[type
][class]);
444 error
= zap_lookup(ddt
->ddt_os
, ddt
->ddt_spa
->spa_ddt_stat_object
, name
,
445 sizeof (uint64_t), sizeof (ddt_histogram_t
) / sizeof (uint64_t),
446 &ddt
->ddt_histogram
[type
][class]);
451 * Seed the cached statistics.
453 error
= ddt_object_info(ddt
, type
, class, &doi
);
457 error
= ddt_object_count(ddt
, type
, class, &count
);
461 ddo
->ddo_count
= count
;
462 ddo
->ddo_dspace
= doi
.doi_physical_blocks_512
<< 9;
463 ddo
->ddo_mspace
= doi
.doi_fill_count
* doi
.doi_data_block_size
;
469 ddt_object_sync(ddt_t
*ddt
, ddt_type_t type
, ddt_class_t
class,
472 ddt_object_t
*ddo
= &ddt
->ddt_object_stats
[type
][class];
473 dmu_object_info_t doi
;
475 char name
[DDT_NAMELEN
];
477 ddt_object_name(ddt
, type
, class, name
);
479 VERIFY0(zap_update(ddt
->ddt_os
, ddt
->ddt_spa
->spa_ddt_stat_object
, name
,
480 sizeof (uint64_t), sizeof (ddt_histogram_t
) / sizeof (uint64_t),
481 &ddt
->ddt_histogram
[type
][class], tx
));
484 * Cache DDT statistics; this is the only time they'll change.
486 VERIFY0(ddt_object_info(ddt
, type
, class, &doi
));
487 VERIFY0(ddt_object_count(ddt
, type
, class, &count
));
489 ddo
->ddo_count
= count
;
490 ddo
->ddo_dspace
= doi
.doi_physical_blocks_512
<< 9;
491 ddo
->ddo_mspace
= doi
.doi_fill_count
* doi
.doi_data_block_size
;
495 ddt_object_exists(ddt_t
*ddt
, ddt_type_t type
, ddt_class_t
class)
497 return (!!ddt
->ddt_object
[type
][class]);
501 ddt_object_lookup(ddt_t
*ddt
, ddt_type_t type
, ddt_class_t
class,
504 if (!ddt_object_exists(ddt
, type
, class))
505 return (SET_ERROR(ENOENT
));
507 return (ddt_ops
[type
]->ddt_op_lookup(ddt
->ddt_os
,
508 ddt
->ddt_object
[type
][class], &dde
->dde_key
,
509 dde
->dde_phys
, DDT_PHYS_SIZE(ddt
)));
513 ddt_object_contains(ddt_t
*ddt
, ddt_type_t type
, ddt_class_t
class,
514 const ddt_key_t
*ddk
)
516 if (!ddt_object_exists(ddt
, type
, class))
517 return (SET_ERROR(ENOENT
));
519 return (ddt_ops
[type
]->ddt_op_contains(ddt
->ddt_os
,
520 ddt
->ddt_object
[type
][class], ddk
));
524 ddt_object_prefetch(ddt_t
*ddt
, ddt_type_t type
, ddt_class_t
class,
525 const ddt_key_t
*ddk
)
527 if (!ddt_object_exists(ddt
, type
, class))
530 ddt_ops
[type
]->ddt_op_prefetch(ddt
->ddt_os
,
531 ddt
->ddt_object
[type
][class], ddk
);
535 ddt_object_prefetch_all(ddt_t
*ddt
, ddt_type_t type
, ddt_class_t
class)
537 if (!ddt_object_exists(ddt
, type
, class))
540 ddt_ops
[type
]->ddt_op_prefetch_all(ddt
->ddt_os
,
541 ddt
->ddt_object
[type
][class]);
545 ddt_object_update(ddt_t
*ddt
, ddt_type_t type
, ddt_class_t
class,
546 const ddt_lightweight_entry_t
*ddlwe
, dmu_tx_t
*tx
)
548 ASSERT(ddt_object_exists(ddt
, type
, class));
550 return (ddt_ops
[type
]->ddt_op_update(ddt
->ddt_os
,
551 ddt
->ddt_object
[type
][class], &ddlwe
->ddlwe_key
,
552 &ddlwe
->ddlwe_phys
, DDT_PHYS_SIZE(ddt
), tx
));
556 ddt_object_remove(ddt_t
*ddt
, ddt_type_t type
, ddt_class_t
class,
557 const ddt_key_t
*ddk
, dmu_tx_t
*tx
)
559 ASSERT(ddt_object_exists(ddt
, type
, class));
561 return (ddt_ops
[type
]->ddt_op_remove(ddt
->ddt_os
,
562 ddt
->ddt_object
[type
][class], ddk
, tx
));
566 ddt_object_walk(ddt_t
*ddt
, ddt_type_t type
, ddt_class_t
class,
567 uint64_t *walk
, ddt_lightweight_entry_t
*ddlwe
)
569 ASSERT(ddt_object_exists(ddt
, type
, class));
571 int error
= ddt_ops
[type
]->ddt_op_walk(ddt
->ddt_os
,
572 ddt
->ddt_object
[type
][class], walk
, &ddlwe
->ddlwe_key
,
573 &ddlwe
->ddlwe_phys
, DDT_PHYS_SIZE(ddt
));
575 ddlwe
->ddlwe_type
= type
;
576 ddlwe
->ddlwe_class
= class;
583 ddt_object_count(ddt_t
*ddt
, ddt_type_t type
, ddt_class_t
class,
586 ASSERT(ddt_object_exists(ddt
, type
, class));
588 return (ddt_ops
[type
]->ddt_op_count(ddt
->ddt_os
,
589 ddt
->ddt_object
[type
][class], count
));
593 ddt_object_info(ddt_t
*ddt
, ddt_type_t type
, ddt_class_t
class,
594 dmu_object_info_t
*doi
)
596 if (!ddt_object_exists(ddt
, type
, class))
597 return (SET_ERROR(ENOENT
));
599 return (dmu_object_info(ddt
->ddt_os
, ddt
->ddt_object
[type
][class],
604 ddt_object_name(ddt_t
*ddt
, ddt_type_t type
, ddt_class_t
class,
607 (void) snprintf(name
, DDT_NAMELEN
, DMU_POOL_DDT
,
608 zio_checksum_table
[ddt
->ddt_checksum
].ci_name
,
609 ddt_ops
[type
]->ddt_op_name
, ddt_class_name
[class]);
613 ddt_bp_fill(const ddt_univ_phys_t
*ddp
, ddt_phys_variant_t v
,
614 blkptr_t
*bp
, uint64_t txg
)
616 ASSERT3U(txg
, !=, 0);
617 ASSERT3U(v
, <, DDT_PHYS_NONE
);
621 if (v
== DDT_PHYS_FLAT
) {
622 phys_birth
= ddp
->ddp_flat
.ddp_phys_birth
;
623 dvap
= ddp
->ddp_flat
.ddp_dva
;
625 phys_birth
= ddp
->ddp_trad
[v
].ddp_phys_birth
;
626 dvap
= ddp
->ddp_trad
[v
].ddp_dva
;
629 for (int d
= 0; d
< SPA_DVAS_PER_BP
; d
++)
630 bp
->blk_dva
[d
] = dvap
[d
];
631 BP_SET_BIRTH(bp
, txg
, phys_birth
);
635 * The bp created via this function may be used for repairs and scrub, but it
636 * will be missing the salt / IV required to do a full decrypting read.
639 ddt_bp_create(enum zio_checksum checksum
, const ddt_key_t
*ddk
,
640 const ddt_univ_phys_t
*ddp
, ddt_phys_variant_t v
, blkptr_t
*bp
)
645 ddt_bp_fill(ddp
, v
, bp
, ddt_phys_birth(ddp
, v
));
647 bp
->blk_cksum
= ddk
->ddk_cksum
;
649 BP_SET_LSIZE(bp
, DDK_GET_LSIZE(ddk
));
650 BP_SET_PSIZE(bp
, DDK_GET_PSIZE(ddk
));
651 BP_SET_COMPRESS(bp
, DDK_GET_COMPRESS(ddk
));
652 BP_SET_CRYPT(bp
, DDK_GET_CRYPT(ddk
));
654 BP_SET_CHECKSUM(bp
, checksum
);
655 BP_SET_TYPE(bp
, DMU_OT_DEDUP
);
658 BP_SET_BYTEORDER(bp
, ZFS_HOST_BYTEORDER
);
662 ddt_key_fill(ddt_key_t
*ddk
, const blkptr_t
*bp
)
664 ddk
->ddk_cksum
= bp
->blk_cksum
;
667 ASSERT(BP_IS_ENCRYPTED(bp
) || !BP_USES_CRYPT(bp
));
669 DDK_SET_LSIZE(ddk
, BP_GET_LSIZE(bp
));
670 DDK_SET_PSIZE(ddk
, BP_GET_PSIZE(bp
));
671 DDK_SET_COMPRESS(ddk
, BP_GET_COMPRESS(bp
));
672 DDK_SET_CRYPT(ddk
, BP_USES_CRYPT(bp
));
676 ddt_phys_extend(ddt_univ_phys_t
*ddp
, ddt_phys_variant_t v
, const blkptr_t
*bp
)
678 ASSERT3U(v
, <, DDT_PHYS_NONE
);
679 int bp_ndvas
= BP_GET_NDVAS(bp
);
680 int ddp_max_dvas
= BP_IS_ENCRYPTED(bp
) ?
681 SPA_DVAS_PER_BP
- 1 : SPA_DVAS_PER_BP
;
682 dva_t
*dvas
= (v
== DDT_PHYS_FLAT
) ?
683 ddp
->ddp_flat
.ddp_dva
: ddp
->ddp_trad
[v
].ddp_dva
;
686 while (s
< bp_ndvas
&& d
< ddp_max_dvas
) {
687 if (DVA_IS_VALID(&dvas
[d
])) {
691 dvas
[d
] = bp
->blk_dva
[s
];
696 * If the caller offered us more DVAs than we can fit, something has
697 * gone wrong in their accounting. zio_ddt_write() should never ask for
700 ASSERT3U(s
, ==, bp_ndvas
);
702 if (BP_IS_ENCRYPTED(bp
))
703 dvas
[2] = bp
->blk_dva
[2];
705 if (ddt_phys_birth(ddp
, v
) == 0) {
706 if (v
== DDT_PHYS_FLAT
)
707 ddp
->ddp_flat
.ddp_phys_birth
= BP_GET_BIRTH(bp
);
709 ddp
->ddp_trad
[v
].ddp_phys_birth
= BP_GET_BIRTH(bp
);
714 ddt_phys_copy(ddt_univ_phys_t
*dst
, const ddt_univ_phys_t
*src
,
715 ddt_phys_variant_t v
)
717 ASSERT3U(v
, <, DDT_PHYS_NONE
);
719 if (v
== DDT_PHYS_FLAT
)
720 dst
->ddp_flat
= src
->ddp_flat
;
722 dst
->ddp_trad
[v
] = src
->ddp_trad
[v
];
726 ddt_phys_clear(ddt_univ_phys_t
*ddp
, ddt_phys_variant_t v
)
728 ASSERT3U(v
, <, DDT_PHYS_NONE
);
730 if (v
== DDT_PHYS_FLAT
)
731 memset(&ddp
->ddp_flat
, 0, DDT_FLAT_PHYS_SIZE
);
733 memset(&ddp
->ddp_trad
[v
], 0, DDT_TRAD_PHYS_SIZE
/ DDT_PHYS_MAX
);
737 ddt_class_start(void)
739 uint64_t start
= gethrestime_sec();
741 if (ddt_prune_artificial_age
) {
		 * debug aid -- simulate a wider distribution
		 * so we don't have to wait for an aged DDT
748 int percent
= random_in_range(100);
751 } else if (percent
> 75) {
754 start
-= random_in_range(range
);
761 ddt_phys_addref(ddt_univ_phys_t
*ddp
, ddt_phys_variant_t v
)
763 ASSERT3U(v
, <, DDT_PHYS_NONE
);
765 if (v
== DDT_PHYS_FLAT
)
766 ddp
->ddp_flat
.ddp_refcnt
++;
768 ddp
->ddp_trad
[v
].ddp_refcnt
++;
772 ddt_phys_decref(ddt_univ_phys_t
*ddp
, ddt_phys_variant_t v
)
774 ASSERT3U(v
, <, DDT_PHYS_NONE
);
778 if (v
== DDT_PHYS_FLAT
)
779 refcntp
= &ddp
->ddp_flat
.ddp_refcnt
;
781 refcntp
= &ddp
->ddp_trad
[v
].ddp_refcnt
;
783 ASSERT3U(*refcntp
, >, 0);
789 ddt_phys_free(ddt_t
*ddt
, ddt_key_t
*ddk
, ddt_univ_phys_t
*ddp
,
790 ddt_phys_variant_t v
, uint64_t txg
)
794 ddt_bp_create(ddt
->ddt_checksum
, ddk
, ddp
, v
, &blk
);
797 * We clear the dedup bit so that zio_free() will actually free the
798 * space, rather than just decrementing the refcount in the DDT.
800 BP_SET_DEDUP(&blk
, 0);
802 ddt_phys_clear(ddp
, v
);
803 zio_free(ddt
->ddt_spa
, txg
, &blk
);
807 ddt_phys_birth(const ddt_univ_phys_t
*ddp
, ddt_phys_variant_t v
)
809 ASSERT3U(v
, <, DDT_PHYS_NONE
);
811 if (v
== DDT_PHYS_FLAT
)
812 return (ddp
->ddp_flat
.ddp_phys_birth
);
814 return (ddp
->ddp_trad
[v
].ddp_phys_birth
);
818 ddt_phys_dva_count(const ddt_univ_phys_t
*ddp
, ddt_phys_variant_t v
,
821 ASSERT3U(v
, <, DDT_PHYS_NONE
);
823 const dva_t
*dvas
= (v
== DDT_PHYS_FLAT
) ?
824 ddp
->ddp_flat
.ddp_dva
: ddp
->ddp_trad
[v
].ddp_dva
;
826 return (DVA_IS_VALID(&dvas
[0]) +
827 DVA_IS_VALID(&dvas
[1]) +
828 DVA_IS_VALID(&dvas
[2]) * !encrypted
);
832 ddt_phys_select(const ddt_t
*ddt
, const ddt_entry_t
*dde
, const blkptr_t
*bp
)
835 return (DDT_PHYS_NONE
);
837 const ddt_univ_phys_t
*ddp
= dde
->dde_phys
;
839 if (ddt
->ddt_flags
& DDT_FLAG_FLAT
) {
840 if (DVA_EQUAL(BP_IDENTITY(bp
), &ddp
->ddp_flat
.ddp_dva
[0]) &&
841 BP_GET_BIRTH(bp
) == ddp
->ddp_flat
.ddp_phys_birth
) {
842 return (DDT_PHYS_FLAT
);
844 } else /* traditional phys */ {
845 for (int p
= 0; p
< DDT_PHYS_MAX
; p
++) {
846 if (DVA_EQUAL(BP_IDENTITY(bp
),
847 &ddp
->ddp_trad
[p
].ddp_dva
[0]) &&
849 ddp
->ddp_trad
[p
].ddp_phys_birth
) {
854 return (DDT_PHYS_NONE
);
858 ddt_phys_refcnt(const ddt_univ_phys_t
*ddp
, ddt_phys_variant_t v
)
860 ASSERT3U(v
, <, DDT_PHYS_NONE
);
862 if (v
== DDT_PHYS_FLAT
)
863 return (ddp
->ddp_flat
.ddp_refcnt
);
865 return (ddp
->ddp_trad
[v
].ddp_refcnt
);
869 ddt_phys_total_refcnt(const ddt_t
*ddt
, const ddt_univ_phys_t
*ddp
)
873 if (ddt
->ddt_flags
& DDT_FLAG_FLAT
)
874 refcnt
= ddp
->ddp_flat
.ddp_refcnt
;
876 for (int v
= DDT_PHYS_SINGLE
; v
<= DDT_PHYS_TRIPLE
; v
++)
877 refcnt
+= ddp
->ddp_trad
[v
].ddp_refcnt
;
ddt_t *
ddt_select(spa_t *spa, const blkptr_t *bp)
{
	ASSERT(DDT_CHECKSUM_VALID(BP_GET_CHECKSUM(bp)));
	return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]);
}

void
ddt_enter(ddt_t *ddt)
{
	mutex_enter(&ddt->ddt_lock);
}

void
ddt_exit(ddt_t *ddt)
{
	mutex_exit(&ddt->ddt_lock);
}
void
ddt_init(void)
{
	ddt_cache = kmem_cache_create("ddt_cache",
	    sizeof (ddt_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	ddt_entry_flat_cache = kmem_cache_create("ddt_entry_flat_cache",
	    DDT_ENTRY_FLAT_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
	ddt_entry_trad_cache = kmem_cache_create("ddt_entry_trad_cache",
	    DDT_ENTRY_TRAD_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
}

void
ddt_fini(void)
{
	kmem_cache_destroy(ddt_entry_trad_cache);
	kmem_cache_destroy(ddt_entry_flat_cache);
	kmem_cache_destroy(ddt_cache);
}
925 ddt_alloc(const ddt_t
*ddt
, const ddt_key_t
*ddk
)
929 if (ddt
->ddt_flags
& DDT_FLAG_FLAT
) {
930 dde
= kmem_cache_alloc(ddt_entry_flat_cache
, KM_SLEEP
);
931 memset(dde
, 0, DDT_ENTRY_FLAT_SIZE
);
933 dde
= kmem_cache_alloc(ddt_entry_trad_cache
, KM_SLEEP
);
934 memset(dde
, 0, DDT_ENTRY_TRAD_SIZE
);
937 cv_init(&dde
->dde_cv
, NULL
, CV_DEFAULT
, NULL
);
945 ddt_alloc_entry_io(ddt_entry_t
*dde
)
947 if (dde
->dde_io
!= NULL
)
950 dde
->dde_io
= kmem_zalloc(sizeof (ddt_entry_io_t
), KM_SLEEP
);
954 ddt_free(const ddt_t
*ddt
, ddt_entry_t
*dde
)
956 if (dde
->dde_io
!= NULL
) {
957 for (int p
= 0; p
< DDT_NPHYS(ddt
); p
++)
958 ASSERT3P(dde
->dde_io
->dde_lead_zio
[p
], ==, NULL
);
960 if (dde
->dde_io
->dde_repair_abd
!= NULL
)
961 abd_free(dde
->dde_io
->dde_repair_abd
);
963 kmem_free(dde
->dde_io
, sizeof (ddt_entry_io_t
));
966 cv_destroy(&dde
->dde_cv
);
967 kmem_cache_free(ddt
->ddt_flags
& DDT_FLAG_FLAT
?
968 ddt_entry_flat_cache
: ddt_entry_trad_cache
, dde
);
972 ddt_remove(ddt_t
*ddt
, ddt_entry_t
*dde
)
974 ASSERT(MUTEX_HELD(&ddt
->ddt_lock
));
976 /* Entry is still in the log, so charge the entry back to it */
977 if (dde
->dde_flags
& DDE_FLAG_LOGGED
) {
978 ddt_lightweight_entry_t ddlwe
;
979 DDT_ENTRY_TO_LIGHTWEIGHT(ddt
, dde
, &ddlwe
);
980 ddt_histogram_add_entry(ddt
, &ddt
->ddt_log_histogram
, &ddlwe
);
983 avl_remove(&ddt
->ddt_tree
, dde
);
static boolean_t
ddt_special_over_quota(spa_t *spa, metaslab_class_t *mc)
{
	if (mc != NULL && metaslab_class_get_space(mc) > 0) {
		/* Over quota if allocating outside of this special class */
		if (spa_syncing_txg(spa) <= spa->spa_dedup_class_full_txg +
		    dedup_class_wait_txgs) {
			/* Waiting for some deferred frees to be processed */
			return (B_TRUE);
		}

		/*
		 * We're considered over quota when we hit 85% full, or for
		 * larger drives, when there is less than 8GB free.
		 */
		uint64_t allocated = metaslab_class_get_alloc(mc);
		uint64_t capacity = metaslab_class_get_space(mc);
		uint64_t limit = MAX(capacity * 85 / 100,
		    (capacity > (1LL<<33)) ? capacity - (1LL<<33) : 0);

		return (allocated >= limit);
	}
	return (B_FALSE);
}
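
/*
 * Worked example of the automatic limit above (numbers for illustration
 * only): for a 1 TiB special class, 85% full is about 870 GiB while capacity
 * minus 8 GiB is 1016 GiB, so MAX() picks 1016 GiB and the class is treated
 * as over quota once allocations reach that point.
 */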
/*
 * Check if the DDT is over its quota. This can be due to a few conditions:
 * 1. 'dedup_table_quota' property is not 0 (none) and the dedup dsize
 *    exceeds this limit
 *
 * 2. 'dedup_table_quota' property is set to automatic and
 *    a. the dedup or special allocation class could not satisfy a DDT
 *       allocation in a recent transaction
 *    b. the dedup or special allocation class has exceeded its 85% limit
 */
static boolean_t
ddt_over_quota(spa_t *spa)
{
	if (spa->spa_dedup_table_quota == 0)
		return (B_FALSE);

	if (spa->spa_dedup_table_quota != UINT64_MAX)
		return (ddt_get_ddt_dsize(spa) > spa->spa_dedup_table_quota);

	/*
	 * For automatic quota, table size is limited by dedup or special class
	 */
	if (ddt_special_over_quota(spa, spa_dedup_class(spa)))
		return (B_TRUE);
	else if (spa_special_has_ddt(spa) &&
	    ddt_special_over_quota(spa, spa_special_class(spa)))
		return (B_TRUE);

	return (B_FALSE);
}
1044 ddt_prefetch_all(spa_t
*spa
)
	 * Load all DDT entries for each type/class combination. This is
	 * intended to perform a prefetch on all such blocks. For the same
	 * reason that ddt_prefetch isn't locked, this is also not locked.
1051 for (enum zio_checksum c
= 0; c
< ZIO_CHECKSUM_FUNCTIONS
; c
++) {
1052 ddt_t
*ddt
= spa
->spa_ddt
[c
];
1056 for (ddt_type_t type
= 0; type
< DDT_TYPES
; type
++) {
1057 for (ddt_class_t
class = 0; class < DDT_CLASSES
;
1059 ddt_object_prefetch_all(ddt
, type
, class);
1065 static int ddt_configure(ddt_t
*ddt
, boolean_t
new);
1068 * If the BP passed to ddt_lookup has valid DVAs, then we need to compare them
1069 * to the ones in the entry. If they're different, then the passed-in BP is
1070 * from a previous generation of this entry (ie was previously pruned) and we
1071 * have to act like the entry doesn't exist at all.
1073 * This should only happen during a lookup to free the block (zio_ddt_free()).
1075 * XXX this is similar in spirit to ddt_phys_select(), maybe can combine
1076 * -- robn, 2024-02-09
1079 ddt_entry_lookup_is_valid(ddt_t
*ddt
, const blkptr_t
*bp
, ddt_entry_t
*dde
)
1081 /* If the BP has no DVAs, then this entry is good */
1082 uint_t ndvas
= BP_GET_NDVAS(bp
);
1087 * Only checking the phys for the copies. For flat, there's only one;
1088 * for trad it'll be the one that has the matching set of DVAs.
1090 const dva_t
*dvas
= (ddt
->ddt_flags
& DDT_FLAG_FLAT
) ?
1091 dde
->dde_phys
->ddp_flat
.ddp_dva
:
1092 dde
->dde_phys
->ddp_trad
[ndvas
].ddp_dva
;
	 * Compare entry DVAs with the BP. They should all be there, but
	 * there's not really anything we can do if it's only partial anyway;
	 * that's an error somewhere else, maybe long ago.
1100 for (d
= 0; d
< ndvas
; d
++)
1101 if (!DVA_EQUAL(&dvas
[d
], &bp
->blk_dva
[d
]))
1103 ASSERT3U(d
, ==, ndvas
);
1109 ddt_lookup(ddt_t
*ddt
, const blkptr_t
*bp
)
1111 spa_t
*spa
= ddt
->ddt_spa
;
1119 ASSERT(MUTEX_HELD(&ddt
->ddt_lock
));
1121 if (ddt
->ddt_version
== DDT_VERSION_UNCONFIGURED
) {
1123 * This is the first use of this DDT since the pool was
1124 * created; finish getting it ready for use.
1126 VERIFY0(ddt_configure(ddt
, B_TRUE
));
1127 ASSERT3U(ddt
->ddt_version
, !=, DDT_VERSION_UNCONFIGURED
);
1130 DDT_KSTAT_BUMP(ddt
, dds_lookup
);
1132 ddt_key_fill(&search
, bp
);
1134 /* Find an existing live entry */
1135 dde
= avl_find(&ddt
->ddt_tree
, &search
, &where
);
1137 /* If we went over quota, act like we didn't find it */
1138 if (dde
->dde_flags
& DDE_FLAG_OVERQUOTA
)
1141 /* If it's already loaded, we can just return it. */
1142 DDT_KSTAT_BUMP(ddt
, dds_lookup_live_hit
);
1143 if (dde
->dde_flags
& DDE_FLAG_LOADED
) {
1144 if (ddt_entry_lookup_is_valid(ddt
, bp
, dde
))
1149 /* Someone else is loading it, wait for it. */
1151 DDT_KSTAT_BUMP(ddt
, dds_lookup_live_wait
);
1152 while (!(dde
->dde_flags
& DDE_FLAG_LOADED
))
1153 cv_wait(&dde
->dde_cv
, &ddt
->ddt_lock
);
1156 /* Loaded but over quota, forget we were ever here */
1157 if (dde
->dde_flags
& DDE_FLAG_OVERQUOTA
) {
1158 if (dde
->dde_waiters
== 0) {
1159 avl_remove(&ddt
->ddt_tree
, dde
);
1165 DDT_KSTAT_BUMP(ddt
, dds_lookup_existing
);
1167 /* Make sure the loaded entry matches the BP */
1168 if (ddt_entry_lookup_is_valid(ddt
, bp
, dde
))
1172 DDT_KSTAT_BUMP(ddt
, dds_lookup_live_miss
);
1174 /* Time to make a new entry. */
1175 dde
= ddt_alloc(ddt
, &search
);
1177 /* Record the time this class was created (used by ddt prune) */
1178 if (ddt
->ddt_flags
& DDT_FLAG_FLAT
)
1179 dde
->dde_phys
->ddp_flat
.ddp_class_start
= ddt_class_start();
1181 avl_insert(&ddt
->ddt_tree
, dde
, where
);
	/* If it's in the log tree, we can "load" it from there */
1184 if (ddt
->ddt_flags
& DDT_FLAG_LOG
) {
1185 ddt_lightweight_entry_t ddlwe
;
1187 if (ddt_log_find_key(ddt
, &search
, &ddlwe
)) {
1189 * See if we have the key first, and if so, set up
1192 dde
->dde_type
= ddlwe
.ddlwe_type
;
1193 dde
->dde_class
= ddlwe
.ddlwe_class
;
1194 memcpy(dde
->dde_phys
, &ddlwe
.ddlwe_phys
,
1195 DDT_PHYS_SIZE(ddt
));
1196 /* Whatever we found isn't valid for this BP, eject */
1197 if (!ddt_entry_lookup_is_valid(ddt
, bp
, dde
)) {
1198 avl_remove(&ddt
->ddt_tree
, dde
);
1203 /* Remove it and count it */
1204 if (ddt_log_remove_key(ddt
,
1205 ddt
->ddt_log_active
, &search
)) {
1206 DDT_KSTAT_BUMP(ddt
, dds_lookup_log_active_hit
);
1208 VERIFY(ddt_log_remove_key(ddt
,
1209 ddt
->ddt_log_flushing
, &search
));
1211 dds_lookup_log_flushing_hit
);
1214 dde
->dde_flags
= DDE_FLAG_LOADED
| DDE_FLAG_LOGGED
;
1216 DDT_KSTAT_BUMP(ddt
, dds_lookup_log_hit
);
1217 DDT_KSTAT_BUMP(ddt
, dds_lookup_existing
);
1222 DDT_KSTAT_BUMP(ddt
, dds_lookup_log_miss
);
1226 * ddt_tree is now stable, so unlock and let everyone else keep moving.
1227 * Anyone landing on this entry will find it without DDE_FLAG_LOADED,
1228 * and go to sleep waiting for it above.
1232 /* Search all store objects for the entry. */
1234 for (type
= 0; type
< DDT_TYPES
; type
++) {
1235 for (class = 0; class < DDT_CLASSES
; class++) {
1236 error
= ddt_object_lookup(ddt
, type
, class, dde
);
1237 if (error
!= ENOENT
) {
1242 if (error
!= ENOENT
)
1248 ASSERT(!(dde
->dde_flags
& DDE_FLAG_LOADED
));
1250 dde
->dde_type
= type
; /* will be DDT_TYPES if no entry found */
1251 dde
->dde_class
= class; /* will be DDT_CLASSES if no entry found */
1253 boolean_t valid
= B_TRUE
;
1255 if (dde
->dde_type
== DDT_TYPES
&&
1256 dde
->dde_class
== DDT_CLASSES
&&
1257 ddt_over_quota(spa
)) {
1258 /* Over quota. If no one is waiting, clean up right now. */
1259 if (dde
->dde_waiters
== 0) {
1260 avl_remove(&ddt
->ddt_tree
, dde
);
1265 /* Flag cleanup required */
1266 dde
->dde_flags
|= DDE_FLAG_OVERQUOTA
;
1267 } else if (error
== 0) {
		 * If what we loaded is no good for this BP and there's no one
		 * waiting for it, we can just remove it and get out. If it's
		 * no good but there are waiters, we have to leave it, because
		 * we don't know what they want. If it's not needed we'll end
		 * up taking an entry log/sync, but it can only happen if more
		 * than one previous version of this block is being deleted at
		 * the same time. This is extremely unlikely to happen and not
		 * worth the effort to deal with without taking an entry
1279 valid
= ddt_entry_lookup_is_valid(ddt
, bp
, dde
);
1280 if (!valid
&& dde
->dde_waiters
== 0) {
1281 avl_remove(&ddt
->ddt_tree
, dde
);
1286 DDT_KSTAT_BUMP(ddt
, dds_lookup_stored_hit
);
1287 DDT_KSTAT_BUMP(ddt
, dds_lookup_existing
);
		 * The histograms only track inactive (stored or logged)
		 * blocks. We've just put an entry onto the live list, so we
		 * need to remove its counts. When it's synced back, it'll be
		 * re-added to the histogram.
		 *
		 * We only do this when we successfully found it in the store.
		 * error == ENOENT means this is a new entry, and so it's
		 * already absent from the histograms.
1299 ddt_histogram_t
*ddh
=
1300 &ddt
->ddt_histogram
[dde
->dde_type
][dde
->dde_class
];
1302 ddt_lightweight_entry_t ddlwe
;
1303 DDT_ENTRY_TO_LIGHTWEIGHT(ddt
, dde
, &ddlwe
);
1304 ddt_histogram_sub_entry(ddt
, ddh
, &ddlwe
);
1306 DDT_KSTAT_BUMP(ddt
, dds_lookup_stored_miss
);
1307 DDT_KSTAT_BUMP(ddt
, dds_lookup_new
);
1310 /* Entry loaded, everyone can proceed now */
1311 dde
->dde_flags
|= DDE_FLAG_LOADED
;
1312 cv_broadcast(&dde
->dde_cv
);
1314 if ((dde
->dde_flags
& DDE_FLAG_OVERQUOTA
) || !valid
)
1321 ddt_prefetch(spa_t
*spa
, const blkptr_t
*bp
)
1326 if (!zfs_dedup_prefetch
|| bp
== NULL
|| !BP_GET_DEDUP(bp
))
1330 * We only remove the DDT once all tables are empty and only
1331 * prefetch dedup blocks when there are entries in the DDT.
1332 * Thus no locking is required as the DDT can't disappear on us.
1334 ddt
= ddt_select(spa
, bp
);
1335 ddt_key_fill(&ddk
, bp
);
1337 for (ddt_type_t type
= 0; type
< DDT_TYPES
; type
++) {
1338 for (ddt_class_t
class = 0; class < DDT_CLASSES
; class++) {
1339 ddt_object_prefetch(ddt
, type
, class, &ddk
);
/*
 * ddt_key_t comparison. Any struct wanting to make use of this function must
 * have the key as the first element. Casts it to N uint64_ts, and checks until
 * we find there's a difference. This is intended to match how ddt_zap.c drives
 * the ZAPs (first uint64_t as the key prehash), which will minimise the number
 * of ZAP blocks touched when flushing logged entries from an AVL walk. This is
 * not an invariant for this function though, should you wish to change it.
 */
static int
ddt_key_compare(const void *x1, const void *x2)
{
	const uint64_t *k1 = (const uint64_t *)x1;
	const uint64_t *k2 = (const uint64_t *)x2;

	int cmp;
	for (int i = 0; i < (sizeof (ddt_key_t) / sizeof (uint64_t)); i++)
		if (likely((cmp = TREE_CMP(k1[i], k2[i])) != 0))
			return (cmp);

	return (0);
}
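
/*
 * For context (this is how the comparator is wired up in ddt_table_alloc()
 * below): both the live tree and the repair tree are AVL trees keyed on
 * ddt_key_t through this function, e.g.
 *
 *	avl_create(&ddt->ddt_tree, ddt_key_compare,
 *	    sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
 */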
1366 /* Create the containing dir for this DDT and bump the feature count */
1368 ddt_create_dir(ddt_t
*ddt
, dmu_tx_t
*tx
)
1370 ASSERT3U(ddt
->ddt_dir_object
, ==, 0);
1371 ASSERT3U(ddt
->ddt_version
, ==, DDT_VERSION_FDT
);
1373 char name
[DDT_NAMELEN
];
1374 snprintf(name
, DDT_NAMELEN
, DMU_POOL_DDT_DIR
,
1375 zio_checksum_table
[ddt
->ddt_checksum
].ci_name
);
1377 ddt
->ddt_dir_object
= zap_create_link(ddt
->ddt_os
,
1378 DMU_OTN_ZAP_METADATA
, DMU_POOL_DIRECTORY_OBJECT
, name
, tx
);
1380 VERIFY0(zap_add(ddt
->ddt_os
, ddt
->ddt_dir_object
, DDT_DIR_VERSION
,
1381 sizeof (uint64_t), 1, &ddt
->ddt_version
, tx
));
1382 VERIFY0(zap_add(ddt
->ddt_os
, ddt
->ddt_dir_object
, DDT_DIR_FLAGS
,
1383 sizeof (uint64_t), 1, &ddt
->ddt_flags
, tx
));
1385 spa_feature_incr(ddt
->ddt_spa
, SPA_FEATURE_FAST_DEDUP
, tx
);
1388 /* Destroy the containing dir and deactivate the feature */
1390 ddt_destroy_dir(ddt_t
*ddt
, dmu_tx_t
*tx
)
1392 ASSERT3U(ddt
->ddt_dir_object
, !=, 0);
1393 ASSERT3U(ddt
->ddt_dir_object
, !=, DMU_POOL_DIRECTORY_OBJECT
);
1394 ASSERT3U(ddt
->ddt_version
, ==, DDT_VERSION_FDT
);
1396 char name
[DDT_NAMELEN
];
1397 snprintf(name
, DDT_NAMELEN
, DMU_POOL_DDT_DIR
,
1398 zio_checksum_table
[ddt
->ddt_checksum
].ci_name
);
1400 for (ddt_type_t type
= 0; type
< DDT_TYPES
; type
++) {
1401 for (ddt_class_t
class = 0; class < DDT_CLASSES
; class++) {
1402 ASSERT(!ddt_object_exists(ddt
, type
, class));
1406 ddt_log_destroy(ddt
, tx
);
1409 ASSERT0(zap_count(ddt
->ddt_os
, ddt
->ddt_dir_object
, &count
));
1410 ASSERT0(zap_contains(ddt
->ddt_os
, ddt
->ddt_dir_object
,
1412 ASSERT0(zap_contains(ddt
->ddt_os
, ddt
->ddt_dir_object
, DDT_DIR_FLAGS
));
1413 ASSERT3U(count
, ==, 2);
1415 VERIFY0(zap_remove(ddt
->ddt_os
, DMU_POOL_DIRECTORY_OBJECT
, name
, tx
));
1416 VERIFY0(zap_destroy(ddt
->ddt_os
, ddt
->ddt_dir_object
, tx
));
1418 ddt
->ddt_dir_object
= 0;
1420 spa_feature_decr(ddt
->ddt_spa
, SPA_FEATURE_FAST_DEDUP
, tx
);
/*
 * Determine version, flags and on-disk layout from what's already stored. If
 * there's nothing stored, then if new is false, returns ENOENT, and if true,
 * selects based on pool config.
 */
1429 ddt_configure(ddt_t
*ddt
, boolean_t
new)
1431 spa_t
*spa
= ddt
->ddt_spa
;
1432 char name
[DDT_NAMELEN
];
1435 ASSERT3U(spa_load_state(spa
), !=, SPA_LOAD_CREATE
);
1437 boolean_t fdt_enabled
=
1438 spa_feature_is_enabled(spa
, SPA_FEATURE_FAST_DEDUP
);
1439 boolean_t fdt_active
=
1440 spa_feature_is_active(spa
, SPA_FEATURE_FAST_DEDUP
);
	 * First, look for the global DDT stats object. If it's not there, then
	 * there has never been a DDT written before, and we know we're
	 * starting from scratch.
1447 error
= zap_lookup(spa
->spa_meta_objset
, DMU_POOL_DIRECTORY_OBJECT
,
1448 DMU_POOL_DDT_STATS
, sizeof (uint64_t), 1,
1449 &spa
->spa_ddt_stat_object
);
1451 if (error
!= ENOENT
)
1458 * Now look for a DDT directory. If it exists, then it has
1459 * everything we need.
1461 snprintf(name
, DDT_NAMELEN
, DMU_POOL_DDT_DIR
,
1462 zio_checksum_table
[ddt
->ddt_checksum
].ci_name
);
1464 error
= zap_lookup(spa
->spa_meta_objset
,
1465 DMU_POOL_DIRECTORY_OBJECT
, name
, sizeof (uint64_t), 1,
1466 &ddt
->ddt_dir_object
);
1468 ASSERT3U(spa
->spa_meta_objset
, ==, ddt
->ddt_os
);
1470 error
= zap_lookup(ddt
->ddt_os
, ddt
->ddt_dir_object
,
1471 DDT_DIR_VERSION
, sizeof (uint64_t), 1,
1476 error
= zap_lookup(ddt
->ddt_os
, ddt
->ddt_dir_object
,
1477 DDT_DIR_FLAGS
, sizeof (uint64_t), 1,
1482 if (ddt
->ddt_version
!= DDT_VERSION_FDT
) {
1483 zfs_dbgmsg("ddt_configure: spa=%s ddt_dir=%s "
1484 "unknown version %llu", spa_name(spa
),
1485 name
, (u_longlong_t
)ddt
->ddt_version
);
1486 return (SET_ERROR(EINVAL
));
1489 if ((ddt
->ddt_flags
& ~DDT_FLAG_MASK
) != 0) {
1490 zfs_dbgmsg("ddt_configure: spa=%s ddt_dir=%s "
1491 "version=%llu unknown flags %llx",
1492 spa_name(spa
), name
,
1493 (u_longlong_t
)ddt
->ddt_flags
,
1494 (u_longlong_t
)ddt
->ddt_version
);
1495 return (SET_ERROR(EINVAL
));
1500 if (error
!= ENOENT
)
1504 /* Any object in the root indicates a traditional setup. */
1505 for (ddt_type_t type
= 0; type
< DDT_TYPES
; type
++) {
1506 for (ddt_class_t
class = 0; class < DDT_CLASSES
; class++) {
1507 ddt_object_name(ddt
, type
, class, name
);
1509 error
= zap_lookup(spa
->spa_meta_objset
,
1510 DMU_POOL_DIRECTORY_OBJECT
, name
, sizeof (uint64_t),
1512 if (error
== ENOENT
)
1517 ddt
->ddt_version
= DDT_VERSION_LEGACY
;
1518 ddt
->ddt_flags
= ddt_version_flags
[ddt
->ddt_version
];
1519 ddt
->ddt_dir_object
= DMU_POOL_DIRECTORY_OBJECT
;
1527 return (SET_ERROR(ENOENT
));
1529 /* Nothing on disk, so set up for the best version we can */
1531 ddt
->ddt_version
= DDT_VERSION_FDT
;
1532 ddt
->ddt_flags
= ddt_version_flags
[ddt
->ddt_version
];
1533 ddt
->ddt_dir_object
= 0; /* create on first use */
1535 ddt
->ddt_version
= DDT_VERSION_LEGACY
;
1536 ddt
->ddt_flags
= ddt_version_flags
[ddt
->ddt_version
];
1537 ddt
->ddt_dir_object
= DMU_POOL_DIRECTORY_OBJECT
;
1544 ddt_table_alloc_kstats(ddt_t
*ddt
)
1546 char *mod
= kmem_asprintf("zfs/%s", spa_name(ddt
->ddt_spa
));
1547 char *name
= kmem_asprintf("ddt_stats_%s",
1548 zio_checksum_table
[ddt
->ddt_checksum
].ci_name
);
1550 ddt
->ddt_ksp
= kstat_create(mod
, 0, name
, "misc", KSTAT_TYPE_NAMED
,
1551 sizeof (ddt_kstats_t
) / sizeof (kstat_named_t
), KSTAT_FLAG_VIRTUAL
);
1552 if (ddt
->ddt_ksp
!= NULL
) {
1553 ddt_kstats_t
*dds
= kmem_alloc(sizeof (ddt_kstats_t
), KM_SLEEP
);
1554 memcpy(dds
, &ddt_kstats_template
, sizeof (ddt_kstats_t
));
1555 ddt
->ddt_ksp
->ks_data
= dds
;
1556 kstat_install(ddt
->ddt_ksp
);
1564 ddt_table_alloc(spa_t
*spa
, enum zio_checksum c
)
1568 ddt
= kmem_cache_alloc(ddt_cache
, KM_SLEEP
);
1569 memset(ddt
, 0, sizeof (ddt_t
));
1570 mutex_init(&ddt
->ddt_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
1571 avl_create(&ddt
->ddt_tree
, ddt_key_compare
,
1572 sizeof (ddt_entry_t
), offsetof(ddt_entry_t
, dde_node
));
1573 avl_create(&ddt
->ddt_repair_tree
, ddt_key_compare
,
1574 sizeof (ddt_entry_t
), offsetof(ddt_entry_t
, dde_node
));
1576 ddt
->ddt_checksum
= c
;
1578 ddt
->ddt_os
= spa
->spa_meta_objset
;
1579 ddt
->ddt_version
= DDT_VERSION_UNCONFIGURED
;
1582 ddt_table_alloc_kstats(ddt
);
1588 ddt_table_free(ddt_t
*ddt
)
1590 if (ddt
->ddt_ksp
!= NULL
) {
1591 kmem_free(ddt
->ddt_ksp
->ks_data
, sizeof (ddt_kstats_t
));
1592 ddt
->ddt_ksp
->ks_data
= NULL
;
1593 kstat_delete(ddt
->ddt_ksp
);
1597 ASSERT0(avl_numnodes(&ddt
->ddt_tree
));
1598 ASSERT0(avl_numnodes(&ddt
->ddt_repair_tree
));
1599 avl_destroy(&ddt
->ddt_tree
);
1600 avl_destroy(&ddt
->ddt_repair_tree
);
1601 mutex_destroy(&ddt
->ddt_lock
);
1602 kmem_cache_free(ddt_cache
, ddt
);
1606 ddt_create(spa_t
*spa
)
1608 spa
->spa_dedup_checksum
= ZIO_DEDUPCHECKSUM
;
1610 for (enum zio_checksum c
= 0; c
< ZIO_CHECKSUM_FUNCTIONS
; c
++) {
1611 if (DDT_CHECKSUM_VALID(c
))
1612 spa
->spa_ddt
[c
] = ddt_table_alloc(spa
, c
);
1617 ddt_load(spa_t
*spa
)
1623 error
= zap_lookup(spa
->spa_meta_objset
, DMU_POOL_DIRECTORY_OBJECT
,
1624 DMU_POOL_DDT_STATS
, sizeof (uint64_t), 1,
1625 &spa
->spa_ddt_stat_object
);
1627 return (error
== ENOENT
? 0 : error
);
1629 for (enum zio_checksum c
= 0; c
< ZIO_CHECKSUM_FUNCTIONS
; c
++) {
1630 if (!DDT_CHECKSUM_VALID(c
))
1633 ddt_t
*ddt
= spa
->spa_ddt
[c
];
1634 error
= ddt_configure(ddt
, B_FALSE
);
1635 if (error
== ENOENT
)
1640 for (ddt_type_t type
= 0; type
< DDT_TYPES
; type
++) {
1641 for (ddt_class_t
class = 0; class < DDT_CLASSES
;
1643 error
= ddt_object_load(ddt
, type
, class);
1644 if (error
!= 0 && error
!= ENOENT
)
1649 error
= ddt_log_load(ddt
);
1650 if (error
!= 0 && error
!= ENOENT
)
1653 DDT_KSTAT_SET(ddt
, dds_log_active_entries
,
1654 avl_numnodes(&ddt
->ddt_log_active
->ddl_tree
));
1655 DDT_KSTAT_SET(ddt
, dds_log_flushing_entries
,
1656 avl_numnodes(&ddt
->ddt_log_flushing
->ddl_tree
));
1659 * Seed the cached histograms.
1661 memcpy(&ddt
->ddt_histogram_cache
, ddt
->ddt_histogram
,
1662 sizeof (ddt
->ddt_histogram
));
1665 spa
->spa_dedup_dspace
= ~0ULL;
1666 spa
->spa_dedup_dsize
= ~0ULL;
1672 ddt_unload(spa_t
*spa
)
1674 for (enum zio_checksum c
= 0; c
< ZIO_CHECKSUM_FUNCTIONS
; c
++) {
1675 if (spa
->spa_ddt
[c
]) {
1676 ddt_table_free(spa
->spa_ddt
[c
]);
1677 spa
->spa_ddt
[c
] = NULL
;
1683 ddt_class_contains(spa_t
*spa
, ddt_class_t max_class
, const blkptr_t
*bp
)
1688 if (!BP_GET_DEDUP(bp
))
1691 if (max_class
== DDT_CLASS_UNIQUE
)
1694 ddt
= spa
->spa_ddt
[BP_GET_CHECKSUM(bp
)];
1696 ddt_key_fill(&ddk
, bp
);
1698 for (ddt_type_t type
= 0; type
< DDT_TYPES
; type
++) {
1699 for (ddt_class_t
class = 0; class <= max_class
; class++) {
1700 if (ddt_object_contains(ddt
, type
, class, &ddk
) == 0)
1709 ddt_repair_start(ddt_t
*ddt
, const blkptr_t
*bp
)
1714 ddt_key_fill(&ddk
, bp
);
1716 dde
= ddt_alloc(ddt
, &ddk
);
1717 ddt_alloc_entry_io(dde
);
1719 for (ddt_type_t type
= 0; type
< DDT_TYPES
; type
++) {
1720 for (ddt_class_t
class = 0; class < DDT_CLASSES
; class++) {
1722 * We can only do repair if there are multiple copies
1723 * of the block. For anything in the UNIQUE class,
1724 * there's definitely only one copy, so don't even try.
1726 if (class != DDT_CLASS_UNIQUE
&&
1727 ddt_object_lookup(ddt
, type
, class, dde
) == 0)
1732 memset(dde
->dde_phys
, 0, DDT_PHYS_SIZE(ddt
));
1738 ddt_repair_done(ddt_t
*ddt
, ddt_entry_t
*dde
)
1744 if (dde
->dde_io
->dde_repair_abd
!= NULL
&&
1745 spa_writeable(ddt
->ddt_spa
) &&
1746 avl_find(&ddt
->ddt_repair_tree
, dde
, &where
) == NULL
)
1747 avl_insert(&ddt
->ddt_repair_tree
, dde
, where
);
1755 ddt_repair_entry_done(zio_t
*zio
)
1757 ddt_t
*ddt
= ddt_select(zio
->io_spa
, zio
->io_bp
);
1758 ddt_entry_t
*rdde
= zio
->io_private
;
1760 ddt_free(ddt
, rdde
);
1764 ddt_repair_entry(ddt_t
*ddt
, ddt_entry_t
*dde
, ddt_entry_t
*rdde
, zio_t
*rio
)
1766 ddt_key_t
*ddk
= &dde
->dde_key
;
1767 ddt_key_t
*rddk
= &rdde
->dde_key
;
1771 zio
= zio_null(rio
, rio
->io_spa
, NULL
,
1772 ddt_repair_entry_done
, rdde
, rio
->io_flags
);
1774 for (int p
= 0; p
< DDT_NPHYS(ddt
); p
++) {
1775 ddt_univ_phys_t
*ddp
= dde
->dde_phys
;
1776 ddt_univ_phys_t
*rddp
= rdde
->dde_phys
;
1777 ddt_phys_variant_t v
= DDT_PHYS_VARIANT(ddt
, p
);
1778 uint64_t phys_birth
= ddt_phys_birth(ddp
, v
);
1779 const dva_t
*dvas
, *rdvas
;
1781 if (ddt
->ddt_flags
& DDT_FLAG_FLAT
) {
1782 dvas
= ddp
->ddp_flat
.ddp_dva
;
1783 rdvas
= rddp
->ddp_flat
.ddp_dva
;
1785 dvas
= ddp
->ddp_trad
[p
].ddp_dva
;
1786 rdvas
= rddp
->ddp_trad
[p
].ddp_dva
;
1789 if (phys_birth
== 0 ||
1790 phys_birth
!= ddt_phys_birth(rddp
, v
) ||
1791 memcmp(dvas
, rdvas
, sizeof (dva_t
) * SPA_DVAS_PER_BP
))
1794 ddt_bp_create(ddt
->ddt_checksum
, ddk
, ddp
, v
, &blk
);
1795 zio_nowait(zio_rewrite(zio
, zio
->io_spa
, 0, &blk
,
1796 rdde
->dde_io
->dde_repair_abd
, DDK_GET_PSIZE(rddk
),
1797 NULL
, NULL
, ZIO_PRIORITY_SYNC_WRITE
,
1798 ZIO_DDT_CHILD_FLAGS(zio
), NULL
));
1805 ddt_repair_table(ddt_t
*ddt
, zio_t
*rio
)
1807 spa_t
*spa
= ddt
->ddt_spa
;
1808 ddt_entry_t
*dde
, *rdde_next
, *rdde
;
1809 avl_tree_t
*t
= &ddt
->ddt_repair_tree
;
1812 if (spa_sync_pass(spa
) > 1)
1816 for (rdde
= avl_first(t
); rdde
!= NULL
; rdde
= rdde_next
) {
1817 rdde_next
= AVL_NEXT(t
, rdde
);
1818 avl_remove(&ddt
->ddt_repair_tree
, rdde
);
1820 ddt_bp_create(ddt
->ddt_checksum
, &rdde
->dde_key
, NULL
,
1821 DDT_PHYS_NONE
, &blk
);
1822 dde
= ddt_repair_start(ddt
, &blk
);
1823 ddt_repair_entry(ddt
, dde
, rdde
, rio
);
1824 ddt_repair_done(ddt
, dde
);
1831 ddt_sync_update_stats(ddt_t
*ddt
, dmu_tx_t
*tx
)
	 * Count all the entries stored for each type/class, and update the
	 * stats within (ddt_object_sync()). If there are no entries for the
	 * type/class, the whole object is removed. If all objects for the DDT
	 * are removed, its containing dir is removed, effectively resetting
	 * the entire DDT to an empty slate.
1841 for (ddt_type_t type
= 0; type
< DDT_TYPES
; type
++) {
1842 uint64_t add
, tcount
= 0;
1843 for (ddt_class_t
class = 0; class < DDT_CLASSES
; class++) {
1844 if (ddt_object_exists(ddt
, type
, class)) {
1845 ddt_object_sync(ddt
, type
, class, tx
);
1846 VERIFY0(ddt_object_count(ddt
, type
, class,
1851 for (ddt_class_t
class = 0; class < DDT_CLASSES
; class++) {
1852 if (tcount
== 0 && ddt_object_exists(ddt
, type
, class))
1853 ddt_object_destroy(ddt
, type
, class, tx
);
1858 if (ddt
->ddt_flags
& DDT_FLAG_LOG
) {
1859 /* Include logged entries in the total count */
1860 count
+= avl_numnodes(&ddt
->ddt_log_active
->ddl_tree
);
1861 count
+= avl_numnodes(&ddt
->ddt_log_flushing
->ddl_tree
);
1866 * No entries left on the DDT, so reset the version for next
1867 * time. This allows us to handle the feature being changed
1868 * since the DDT was originally created. New entries should get
1869 * whatever the feature currently demands.
1871 if (ddt
->ddt_version
== DDT_VERSION_FDT
)
1872 ddt_destroy_dir(ddt
, tx
);
1874 ddt
->ddt_version
= DDT_VERSION_UNCONFIGURED
;
1878 memcpy(&ddt
->ddt_histogram_cache
, ddt
->ddt_histogram
,
1879 sizeof (ddt
->ddt_histogram
));
1880 ddt
->ddt_spa
->spa_dedup_dspace
= ~0ULL;
1881 ddt
->ddt_spa
->spa_dedup_dsize
= ~0ULL;
1885 ddt_sync_scan_entry(ddt_t
*ddt
, ddt_lightweight_entry_t
*ddlwe
, dmu_tx_t
*tx
)
1887 dsl_pool_t
*dp
= ddt
->ddt_spa
->spa_dsl_pool
;
1890 * Compute the target class, so we can decide whether or not to inform
1891 * the scrub traversal (below). Note that we don't store this in the
1892 * entry, as it might change multiple times before finally being
1893 * committed (if we're logging). Instead, we recompute it in
1896 uint64_t refcnt
= ddt_phys_total_refcnt(ddt
, &ddlwe
->ddlwe_phys
);
1897 ddt_class_t nclass
=
1898 (refcnt
> 1) ? DDT_CLASS_DUPLICATE
: DDT_CLASS_UNIQUE
;
1901 * If the class changes, the order that we scan this bp changes. If it
1902 * decreases, we could miss it, so scan it right now. (This covers both
1903 * class changing while we are doing ddt_walk(), and when we are
1906 * We also do this when the refcnt goes to zero, because that change is
1907 * only in the log so far; the blocks on disk won't be freed until
1908 * the log is flushed, and the refcnt might increase before that. If it
1909 * does, then we could miss it in the same way.
1911 if (refcnt
== 0 || nclass
< ddlwe
->ddlwe_class
)
1912 dsl_scan_ddt_entry(dp
->dp_scan
, ddt
->ddt_checksum
, ddt
,
1917 ddt_sync_flush_entry(ddt_t
*ddt
, ddt_lightweight_entry_t
*ddlwe
,
1918 ddt_type_t otype
, ddt_class_t oclass
, dmu_tx_t
*tx
)
1920 ddt_key_t
*ddk
= &ddlwe
->ddlwe_key
;
1921 ddt_type_t ntype
= DDT_TYPE_DEFAULT
;
1922 uint64_t refcnt
= 0;
1925 * Compute the total refcnt. Along the way, issue frees for any DVAs
1926 * we no longer want.
1928 for (int p
= 0; p
< DDT_NPHYS(ddt
); p
++) {
1929 ddt_univ_phys_t
*ddp
= &ddlwe
->ddlwe_phys
;
1930 ddt_phys_variant_t v
= DDT_PHYS_VARIANT(ddt
, p
);
1931 uint64_t phys_refcnt
= ddt_phys_refcnt(ddp
, v
);
1933 if (ddt_phys_birth(ddp
, v
) == 0) {
1934 ASSERT0(phys_refcnt
);
1937 if (DDT_PHYS_IS_DITTO(ddt
, p
)) {
1939 * We don't want to keep any obsolete slots (eg ditto),
1940 * regardless of their refcount, but we don't want to
1941 * leak them either. So, free them.
1943 ddt_phys_free(ddt
, ddk
, ddp
, v
, tx
->tx_txg
);
1946 if (phys_refcnt
== 0)
1947 /* No remaining references, free it! */
1948 ddt_phys_free(ddt
, ddk
, ddp
, v
, tx
->tx_txg
);
1949 refcnt
+= phys_refcnt
;
1952 /* Select the best class for the entry. */
1953 ddt_class_t nclass
=
1954 (refcnt
> 1) ? DDT_CLASS_DUPLICATE
: DDT_CLASS_UNIQUE
;
1957 * If an existing entry changed type or class, or its refcount reached
1958 * zero, delete it from the DDT object
1960 if (otype
!= DDT_TYPES
&&
1961 (otype
!= ntype
|| oclass
!= nclass
|| refcnt
== 0)) {
1962 VERIFY0(ddt_object_remove(ddt
, otype
, oclass
, ddk
, tx
));
1963 ASSERT(ddt_object_contains(ddt
, otype
, oclass
, ddk
) == ENOENT
);
1967 * Add or update the entry
1970 ddt_histogram_t
*ddh
=
1971 &ddt
->ddt_histogram
[ntype
][nclass
];
1973 ddt_histogram_add_entry(ddt
, ddh
, ddlwe
);
1975 if (!ddt_object_exists(ddt
, ntype
, nclass
))
1976 ddt_object_create(ddt
, ntype
, nclass
, tx
);
1977 VERIFY0(ddt_object_update(ddt
, ntype
, nclass
, ddlwe
, tx
));
/* Calculate an exponential weighted moving average, lower limited to zero */
static inline int32_t
_ewma(int32_t val, int32_t prev, uint32_t weight)
{
	ASSERT3U(val, >=, 0);
	ASSERT3U(prev, >=, 0);
	const int32_t new =
	    MAX(0, prev + (val-prev) / (int32_t)MAX(weight, 1));
	ASSERT3U(new, >=, 0);
	return (new);
}
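
/*
 * For intuition (arithmetic only): with weight = 10, prev = 100 and
 * val = 200, the new average is 100 + (200 - 100) / 10 = 110, so each sample
 * moves the average roughly 1/weight of the way toward the new value, and the
 * MAX(0, ...) keeps it from going negative.
 */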
/* Returns true if done for this txg */
static boolean_t
ddt_sync_flush_log_incremental(ddt_t *ddt, dmu_tx_t *tx)
{
	if (ddt->ddt_flush_pass == 0) {
		if (spa_sync_pass(ddt->ddt_spa) == 1) {
			/* First run this txg, get set up */
			ddt->ddt_flush_start = gethrtime();
			ddt->ddt_flush_count = 0;

			/*
			 * How many entries we need to flush. We want to at
			 * least match the ingest rate.
			 */
			ddt->ddt_flush_min = MAX(
			    ddt->ddt_log_ingest_rate,
			    zfs_dedup_log_flush_entries_min);

			/*
			 * If we've been asked to flush everything in a hurry,
			 * try to dump as much as possible on this txg. In
			 * this case we're only limited by time, not amount.
			 */
			if (ddt->ddt_flush_force_txg > 0)
				ddt->ddt_flush_min =
				    MAX(ddt->ddt_flush_min, avl_numnodes(
				    &ddt->ddt_log_flushing->ddl_tree));
		} else {
			/* We already decided we're done for this txg */
			return (B_TRUE);
		}
	} else if (ddt->ddt_flush_pass == spa_sync_pass(ddt->ddt_spa)) {
		/*
		 * We already did some flushing on this pass, skip it. This
		 * happens when dsl_process_async_destroys() runs during a scan
		 * (on pass 1) and does an additional ddt_sync() to update
		 * freed blocks.
		 */
		return (B_FALSE);
	}

	if (spa_sync_pass(ddt->ddt_spa) >
	    MAX(zfs_dedup_log_flush_passes_max, 1)) {
		/* Too many passes this txg, defer until next. */
		ddt->ddt_flush_pass = 0;
		return (B_TRUE);
	}

	if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)) {
		/* Nothing to flush, done for this txg. */
		ddt->ddt_flush_pass = 0;
		return (B_TRUE);
	}

	uint64_t target_time = txg_sync_waiting(ddt->ddt_spa->spa_dsl_pool) ?
	    MIN(MSEC2NSEC(zfs_dedup_log_flush_min_time_ms),
	    SEC2NSEC(zfs_txg_timeout)) : SEC2NSEC(zfs_txg_timeout);

	uint64_t elapsed_time = gethrtime() - ddt->ddt_flush_start;

	if (elapsed_time >= target_time) {
		/* Too long since we started, done for this txg. */
		ddt->ddt_flush_pass = 0;
		return (B_TRUE);
	}

	ddt->ddt_flush_pass++;
	ASSERT3U(spa_sync_pass(ddt->ddt_spa), ==, ddt->ddt_flush_pass);

	/*
	 * Estimate how much time we'll need to flush the remaining entries
	 * based on how long it normally takes.
	 */
	uint32_t want_time;
	if (ddt->ddt_flush_pass == 1) {
		/* First pass, use the average time/entries */
		if (ddt->ddt_log_flush_rate == 0)
			/* Zero rate, just assume the whole time */
			want_time = target_time;
		else
			want_time = ddt->ddt_flush_min *
			    ddt->ddt_log_flush_time_rate /
			    ddt->ddt_log_flush_rate;
	} else {
		/* Later pass, calculate from this txg so far */
		want_time = ddt->ddt_flush_min *
		    elapsed_time / ddt->ddt_flush_count;
	}

	/* Figure out how much time we have left */
	uint32_t remain_time = target_time - elapsed_time;

	/* Smear the remaining entries over the remaining passes. */
	uint32_t nentries = ddt->ddt_flush_min /
	    (MAX(1, zfs_dedup_log_flush_passes_max) + 1 - ddt->ddt_flush_pass);
	if (want_time > remain_time) {
		/*
		 * We're behind; try to catch up a bit by doubling the amount
		 * this pass. If we're behind that means we're in a later
		 * pass and likely have most of the remaining time to
		 * ourselves. If we're in the last couple of passes, then
		 * doubling might just take us over the timeout, but probably
		 * not by much, and it stops us falling behind. If we're
		 * in the middle passes, there'll be more to do, but it
		 * might just help us catch up a bit and we'll recalculate on
		 * the next pass anyway.
		 */
		nentries = MIN(ddt->ddt_flush_min, nentries*2);
	}

	ddt_lightweight_entry_t ddlwe;
	uint32_t count = 0;
	while (ddt_log_take_first(ddt, ddt->ddt_log_flushing, &ddlwe)) {
		ddt_sync_flush_entry(ddt, &ddlwe,
		    ddlwe.ddlwe_type, ddlwe.ddlwe_class, tx);

		/* End this pass if we've synced as much as we need to. */
		if (++count >= nentries)
			break;
	}
	ddt->ddt_flush_count += count;
	ddt->ddt_flush_min -= count;

	if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)) {
		/* We emptied it, so truncate on-disk */
		DDT_KSTAT_ZERO(ddt, dds_log_flushing_entries);
		ddt_log_truncate(ddt, tx);
		/* No more passes needed this txg */
		ddt->ddt_flush_pass = 0;
	} else {
		/* More to do next time, save checkpoint */
		DDT_KSTAT_SUB(ddt, dds_log_flushing_entries, count);
		ddt_log_checkpoint(ddt, &ddlwe, tx);
	}

	ddt_sync_update_stats(ddt, tx);

	return (ddt->ddt_flush_pass == 0);
}
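/*
 * Illustrative pacing example for the pass logic above (arbitrary numbers):
 * with zfs_dedup_log_flush_passes_max = 8, ddt_flush_min = 1000 and this
 * being pass 1, the smear gives nentries = 1000 / (8 + 1 - 1) = 125 entries
 * for this pass. If the time estimate says we're behind, that is doubled to
 * 250, but never beyond ddt_flush_min.
 */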
static void
ddt_flush_force_update_txg(ddt_t *ddt, uint64_t txg)
{
	/*
	 * If we're not forcing flush, and not being asked to start, then
	 * there's nothing more to do.
	 */
	if (txg == 0) {
		/* Update requested, are we currently forcing flush? */
		if (ddt->ddt_flush_force_txg == 0)
			return;
		txg = ddt->ddt_flush_force_txg;
	}

	/*
	 * If either of the logs have unflushed entries before the wanted
	 * txg, set the force txg, otherwise clear it.
	 */
	if ((!avl_is_empty(&ddt->ddt_log_active->ddl_tree) &&
	    ddt->ddt_log_active->ddl_first_txg <= txg) ||
	    (!avl_is_empty(&ddt->ddt_log_flushing->ddl_tree) &&
	    ddt->ddt_log_flushing->ddl_first_txg <= txg)) {
		ddt->ddt_flush_force_txg = txg;
		return;
	}

	/*
	 * Nothing to flush behind the given txg, so we can clear the force
	 * flush state.
	 */
	ddt->ddt_flush_force_txg = 0;
}
static void
ddt_sync_flush_log(ddt_t *ddt, dmu_tx_t *tx)
{
	ASSERT(avl_is_empty(&ddt->ddt_tree));

	/* Don't do any flushing when the pool is ready to shut down */
	if (tx->tx_txg > spa_final_dirty_txg(ddt->ddt_spa))
		return;

	/* Try to flush some. */
	if (!ddt_sync_flush_log_incremental(ddt, tx))
		/* More to do next time */
		return;

	/* No more flushing this txg, so we can do end-of-txg housekeeping */
	if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree) &&
	    !avl_is_empty(&ddt->ddt_log_active->ddl_tree)) {
		/*
		 * No more to flush, and the active list has stuff, so
		 * try to swap the logs for next time.
		 */
		if (ddt_log_swap(ddt, tx)) {
			DDT_KSTAT_ZERO(ddt, dds_log_active_entries);
			DDT_KSTAT_SET(ddt, dds_log_flushing_entries,
			    avl_numnodes(&ddt->ddt_log_flushing->ddl_tree));
		}
	}

	/* If force flush is no longer necessary, turn it off. */
	ddt_flush_force_update_txg(ddt, 0);

	/*
	 * Update flush rate. This is an exponential weighted moving average of
	 * the number of entries flushed over recent txgs.
	 */
	ddt->ddt_log_flush_rate = _ewma(
	    ddt->ddt_flush_count, ddt->ddt_log_flush_rate,
	    zfs_dedup_log_flush_flow_rate_txgs);
	DDT_KSTAT_SET(ddt, dds_log_flush_rate, ddt->ddt_log_flush_rate);

	/*
	 * Update flush time rate. This is an exponential weighted moving
	 * average of the total time taken to flush over recent txgs.
	 */
	ddt->ddt_log_flush_time_rate = _ewma(
	    ddt->ddt_log_flush_time_rate,
	    ((int32_t)(NSEC2MSEC(gethrtime() - ddt->ddt_flush_start))),
	    zfs_dedup_log_flush_flow_rate_txgs);
	DDT_KSTAT_SET(ddt, dds_log_flush_time_rate,
	    ddt->ddt_log_flush_time_rate);
}
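/*
 * The two logs alternate roles: new entries always land in ddt_log_active
 * (appended by ddt_sync_table_log() below), while the flusher above drains
 * ddt_log_flushing. Once the flushing log is emptied it is truncated on
 * disk, and if the active log has entries the two are swapped so draining
 * can continue on a later txg.
 */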
static void
ddt_sync_table_log(ddt_t *ddt, dmu_tx_t *tx)
{
	uint64_t count = avl_numnodes(&ddt->ddt_tree);

	if (count > 0) {
		ddt_log_update_t dlu = {0};
		ddt_log_begin(ddt, count, tx, &dlu);

		ddt_entry_t *dde;
		void *cookie = NULL;
		ddt_lightweight_entry_t ddlwe;
		while ((dde =
		    avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) {
			ASSERT(dde->dde_flags & DDE_FLAG_LOADED);
			DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);
			ddt_log_entry(ddt, &ddlwe, &dlu);
			ddt_sync_scan_entry(ddt, &ddlwe, tx);
			ddt_free(ddt, dde);
		}

		ddt_log_commit(ddt, &dlu);

		DDT_KSTAT_SET(ddt, dds_log_active_entries,
		    avl_numnodes(&ddt->ddt_log_active->ddl_tree));

		/*
		 * Sync the stats for the store objects. Even though we haven't
		 * modified anything on those objects, they're no longer the
		 * source of truth for entries that are now in the log, and we
		 * need the on-disk counts to reflect that, otherwise we'll
		 * miscount later when importing.
		 */
		for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
			for (ddt_class_t class = 0;
			    class < DDT_CLASSES; class++) {
				if (ddt_object_exists(ddt, type, class))
					ddt_object_sync(ddt, type, class, tx);
			}
		}

		memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram,
		    sizeof (ddt->ddt_histogram));
		ddt->ddt_spa->spa_dedup_dspace = ~0ULL;
		ddt->ddt_spa->spa_dedup_dsize = ~0ULL;
	}

	if (spa_sync_pass(ddt->ddt_spa) == 1) {
		/*
		 * Update ingest rate. This is an exponential weighted moving
		 * average of the number of entries changed over recent txgs.
		 * The ramp-up cost shouldn't matter too much because the
		 * flusher will be trying to take at least the minimum anyway.
		 */
		ddt->ddt_log_ingest_rate = _ewma(
		    count, ddt->ddt_log_ingest_rate,
		    zfs_dedup_log_flush_flow_rate_txgs);
		DDT_KSTAT_SET(ddt, dds_log_ingest_rate,
		    ddt->ddt_log_ingest_rate);
	}
}
static void
ddt_sync_table_flush(ddt_t *ddt, dmu_tx_t *tx)
{
	if (avl_numnodes(&ddt->ddt_tree) == 0)
		return;

	ddt_entry_t *dde;
	void *cookie = NULL;
	while ((dde = avl_destroy_nodes(
	    &ddt->ddt_tree, &cookie)) != NULL) {
		ASSERT(dde->dde_flags & DDE_FLAG_LOADED);

		ddt_lightweight_entry_t ddlwe;
		DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);
		ddt_sync_flush_entry(ddt, &ddlwe,
		    dde->dde_type, dde->dde_class, tx);
		ddt_sync_scan_entry(ddt, &ddlwe, tx);
		ddt_free(ddt, dde);
	}

	memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram,
	    sizeof (ddt->ddt_histogram));
	ddt->ddt_spa->spa_dedup_dspace = ~0ULL;
	ddt->ddt_spa->spa_dedup_dsize = ~0ULL;
	ddt_sync_update_stats(ddt, tx);
}
static void
ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx)
{
	spa_t *spa = ddt->ddt_spa;

	if (ddt->ddt_version == UINT64_MAX)
		return;

	if (spa->spa_uberblock.ub_version < SPA_VERSION_DEDUP) {
		ASSERT0(avl_numnodes(&ddt->ddt_tree));
		return;
	}

	if (spa->spa_ddt_stat_object == 0) {
		spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os,
		    DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_DDT_STATS, tx);
	}

	if (ddt->ddt_version == DDT_VERSION_FDT && ddt->ddt_dir_object == 0)
		ddt_create_dir(ddt, tx);

	if (ddt->ddt_flags & DDT_FLAG_LOG)
		ddt_sync_table_log(ddt, tx);
	else
		ddt_sync_table_flush(ddt, tx);
}
void
ddt_sync(spa_t *spa, uint64_t txg)
{
	dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
	dmu_tx_t *tx;
	zio_t *rio;

	ASSERT3U(spa_syncing_txg(spa), ==, txg);

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);

	rio = zio_root(spa, NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SELF_HEAL);

	/*
	 * This function may cause an immediate scan of ddt blocks (see
	 * the comment above dsl_scan_ddt() for details). We set the
	 * scan's root zio here so that we can wait for any scan IOs in
	 * addition to the regular ddt IOs.
	 */
	ASSERT3P(scn->scn_zio_root, ==, NULL);
	scn->scn_zio_root = rio;

	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
		ddt_t *ddt = spa->spa_ddt[c];
		if (ddt == NULL)
			continue;
		ddt_sync_table(ddt, tx);
		if (ddt->ddt_flags & DDT_FLAG_LOG)
			ddt_sync_flush_log(ddt, tx);
		ddt_repair_table(ddt, rio);
	}

	(void) zio_wait(rio);
	scn->scn_zio_root = NULL;

	dmu_tx_commit(tx);
}
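/*
 * Note the per-checksum ordering in ddt_sync() above: ddt_sync_table() first
 * empties the in-memory ddt_tree (into the log, or straight into the store
 * objects), then ddt_sync_flush_log() retires a slice of the flushing log
 * into the store objects, and ddt_repair_table() issues any queued repair
 * writes against the root zio.
 */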
void
ddt_walk_init(spa_t *spa, uint64_t txg)
{
	if (txg == 0)
		txg = spa_syncing_txg(spa);

	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
		ddt_t *ddt = spa->spa_ddt[c];
		if (ddt == NULL || !(ddt->ddt_flags & DDT_FLAG_LOG))
			continue;

		ddt_enter(ddt);
		ddt_flush_force_update_txg(ddt, txg);
		ddt_exit(ddt);
	}
}
boolean_t
ddt_walk_ready(spa_t *spa)
{
	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
		ddt_t *ddt = spa->spa_ddt[c];
		if (ddt == NULL || !(ddt->ddt_flags & DDT_FLAG_LOG))
			continue;

		if (ddt->ddt_flush_force_txg > 0)
			return (B_FALSE);
	}

	return (B_TRUE);
}
static int
ddt_walk_impl(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe,
    uint64_t flags, boolean_t wait)
{
	do {
		do {
			do {
				ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum];
				if (ddt == NULL)
					continue;

				if (flags != 0 &&
				    (ddt->ddt_flags & flags) != flags)
					continue;

				if (wait && ddt->ddt_flush_force_txg > 0)
					return (EAGAIN);

				int error = ENOENT;
				if (ddt_object_exists(ddt, ddb->ddb_type,
				    ddb->ddb_class)) {
					error = ddt_object_walk(ddt,
					    ddb->ddb_type, ddb->ddb_class,
					    &ddb->ddb_cursor, ddlwe);
				}
				if (error == 0)
					return (0);
				if (error != ENOENT)
					return (error);
				ddb->ddb_cursor = 0;
			} while (++ddb->ddb_checksum < ZIO_CHECKSUM_FUNCTIONS);
			ddb->ddb_checksum = 0;
		} while (++ddb->ddb_type < DDT_TYPES);
		ddb->ddb_type = 0;
	} while (++ddb->ddb_class < DDT_CLASSES);

	return (SET_ERROR(ENOENT));
}
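/*
 * The bookmark above advances checksum fastest, then type, then class, so a
 * walk visits every (checksum, type) store object of one class before moving
 * to the next class; callers keep the ddt_bookmark_t across calls to resume
 * where they left off.
 */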
int
ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe)
{
	return (ddt_walk_impl(spa, ddb, ddlwe, 0, B_TRUE));
}
/*
 * This function is used by Block Cloning (brt.c) to increase reference
 * counter for the DDT entry if the block is already in DDT.
 *
 * Return false if the block, despite having the D bit set, is not present
 * in the DDT. This is possible when the DDT has been pruned by an admin
 * or by the DDT quota mechanism.
 */
boolean_t
ddt_addref(spa_t *spa, const blkptr_t *bp)
{
	ddt_t *ddt;
	ddt_entry_t *dde;
	boolean_t result;

	spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
	ddt = ddt_select(spa, bp);
	ddt_enter(ddt);

	dde = ddt_lookup(ddt, bp);

	/* Can be NULL if the entry for this block was pruned. */
	if (dde == NULL) {
		ddt_exit(ddt);
		spa_config_exit(spa, SCL_ZIO, FTAG);
		return (B_FALSE);
	}

	if ((dde->dde_type < DDT_TYPES) || (dde->dde_flags & DDE_FLAG_LOGGED)) {
		/*
		 * This entry was either synced to a store object (dde_type is
		 * real) or was logged. It must be properly on disk at this
		 * point, so we can just bump its refcount.
		 */
		int p = DDT_PHYS_FOR_COPIES(ddt, BP_GET_NDVAS(bp));
		ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);

		ddt_phys_addref(dde->dde_phys, v);
		result = B_TRUE;
	} else {
		/*
		 * If the block has the DEDUP flag set it still might not
		 * exist in the DEDUP table due to DDT pruning of entries
		 * where refcnt=1.
		 */
		ddt_remove(ddt, dde);
		result = B_FALSE;
	}

	ddt_exit(ddt);
	spa_config_exit(spa, SCL_ZIO, FTAG);

	return (result);
}
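/*
 * Sketch of the expected caller behaviour (see brt.c for the real thing): a
 * block being cloned with the D bit set is first offered to the DDT via
 * ddt_addref(); only if that returns B_FALSE (the entry was pruned or is
 * otherwise absent) does the cloning code fall back to tracking the
 * reference in the BRT itself.
 */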
typedef struct ddt_prune_entry {
	ddt_t		*dpe_ddt;
	ddt_key_t	dpe_key;
	list_node_t	dpe_node;
	ddt_univ_phys_t	dpe_phys[];
} ddt_prune_entry_t;

typedef struct ddt_prune_info {
	spa_t		*dpi_spa;
	uint64_t	dpi_txg_syncs;
	uint64_t	dpi_pruned;
	list_t		dpi_candidates;
} ddt_prune_info_t;
/*
 * Add prune candidates for ddt_sync during spa_sync
 */
static void
prune_candidates_sync(void *arg, dmu_tx_t *tx)
{
	(void) tx;
	ddt_prune_info_t *dpi = arg;
	ddt_prune_entry_t *dpe;

	spa_config_enter(dpi->dpi_spa, SCL_ZIO, FTAG, RW_READER);

	/* Process the prune candidates collected so far */
	while ((dpe = list_remove_head(&dpi->dpi_candidates)) != NULL) {
		blkptr_t blk;
		ddt_t *ddt = dpe->dpe_ddt;

		ddt_enter(ddt);

		/*
		 * If it's on the live list, then it was loaded for update
		 * this txg and is no longer stale; skip it.
		 */
		if (avl_find(&ddt->ddt_tree, &dpe->dpe_key, NULL)) {
			ddt_exit(ddt);
			kmem_free(dpe, sizeof (*dpe));
			continue;
		}

		ddt_bp_create(ddt->ddt_checksum, &dpe->dpe_key,
		    dpe->dpe_phys, DDT_PHYS_FLAT, &blk);

		ddt_entry_t *dde = ddt_lookup(ddt, &blk);
		if (dde != NULL && !(dde->dde_flags & DDE_FLAG_LOGGED)) {
			ASSERT(dde->dde_flags & DDE_FLAG_LOADED);
			/*
			 * Zero the physical, so we don't try to free DVAs
			 * at flush nor try to reuse this entry.
			 */
			ddt_phys_clear(dde->dde_phys, DDT_PHYS_FLAT);

			dpi->dpi_pruned++;
		}

		ddt_exit(ddt);
		kmem_free(dpe, sizeof (*dpe));
	}

	spa_config_exit(dpi->dpi_spa, SCL_ZIO, FTAG);
	dpi->dpi_txg_syncs++;
}
/*
 * Prune candidates are collected in open context and processed
 * in sync context as part of ddt_sync_table().
 */
static void
ddt_prune_entry(list_t *list, ddt_t *ddt, const ddt_key_t *ddk,
    const ddt_univ_phys_t *ddp)
{
	ASSERT(ddt->ddt_flags & DDT_FLAG_FLAT);

	size_t dpe_size = sizeof (ddt_prune_entry_t) + DDT_FLAT_PHYS_SIZE;
	ddt_prune_entry_t *dpe = kmem_alloc(dpe_size, KM_SLEEP);

	dpe->dpe_ddt = ddt;
	dpe->dpe_key = *ddk;
	memcpy(dpe->dpe_phys, ddp, DDT_FLAT_PHYS_SIZE);
	list_insert_head(list, dpe);
}
/*
 * Iterate over all the entries in the DDT unique class.
 * The walk will perform one of the following operations:
 *  (a) build a histogram that can be used when pruning
 *  (b) prune entries older than the cutoff
 *
 * Also called by zdb(8) to dump the age histogram
 */
void
ddt_prune_walk(spa_t *spa, uint64_t cutoff, ddt_age_histo_t *histogram)
{
	ddt_bookmark_t ddb = {
		.ddb_class = DDT_CLASS_UNIQUE,
		.ddb_type = 0,
		.ddb_checksum = 0,
		.ddb_cursor = 0
	};
	ddt_lightweight_entry_t ddlwe = {0};
	int error;
	int valid = 0;
	int candidates = 0;
	uint64_t now = gethrestime_sec();
	ddt_prune_info_t dpi;
	boolean_t pruning = (cutoff != 0);

	if (pruning) {
		dpi.dpi_txg_syncs = 0;
		dpi.dpi_pruned = 0;
		dpi.dpi_spa = spa;
		list_create(&dpi.dpi_candidates, sizeof (ddt_prune_entry_t),
		    offsetof(ddt_prune_entry_t, dpe_node));
	}

	if (histogram != NULL)
		memset(histogram, 0, sizeof (ddt_age_histo_t));

	while ((error =
	    ddt_walk_impl(spa, &ddb, &ddlwe, DDT_FLAG_FLAT, B_FALSE)) == 0) {
		ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];

		if (spa_shutting_down(spa) || issig())
			break;

		ASSERT(ddt->ddt_flags & DDT_FLAG_FLAT);
		ASSERT3U(ddlwe.ddlwe_phys.ddp_flat.ddp_refcnt, <=, 1);

		uint64_t class_start =
		    ddlwe.ddlwe_phys.ddp_flat.ddp_class_start;

		/*
		 * If this entry is on the log, then the stored entry is stale
		 * and we should skip it.
		 */
		if (ddt_log_find_key(ddt, &ddlwe.ddlwe_key, NULL))
			continue;

		/* prune older entries */
		if (pruning && class_start < cutoff) {
			if (candidates++ >= zfs_ddt_prunes_per_txg) {
				/* sync prune candidates in batches */
				VERIFY0(dsl_sync_task(spa_name(spa),
				    NULL, prune_candidates_sync,
				    &dpi, 0, ZFS_SPACE_CHECK_NONE));
				candidates = 0;
			}
			ddt_prune_entry(&dpi.dpi_candidates, ddt,
			    &ddlwe.ddlwe_key, &ddlwe.ddlwe_phys);
		}

		/* build a histogram */
		if (histogram != NULL) {
			uint64_t age = MAX(1, (now - class_start) / 3600);
			int bin = MIN(highbit64(age) - 1, HIST_BINS - 1);
			histogram->dah_entries++;
			histogram->dah_age_histo[bin]++;
		}

		valid++;
	}

	if (pruning && valid > 0) {
		if (!list_is_empty(&dpi.dpi_candidates)) {
			/* sync out final batch of prune candidates */
			VERIFY0(dsl_sync_task(spa_name(spa), NULL,
			    prune_candidates_sync, &dpi, 0,
			    ZFS_SPACE_CHECK_NONE));
		}
		list_destroy(&dpi.dpi_candidates);

		zfs_dbgmsg("pruned %llu entries (%d%%) across %llu txg syncs",
		    (u_longlong_t)dpi.dpi_pruned,
		    (int)((dpi.dpi_pruned * 100) / valid),
		    (u_longlong_t)dpi.dpi_txg_syncs);
	}
}
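/*
 * Worked example of the histogram binning above: ages are measured in hours
 * ((now - class_start) / 3600) and binned by highbit64(), so an entry
 * roughly 10 hours old has highbit64(10) = 4 and lands in bin 3; anything
 * older than 2^(HIST_BINS - 1) hours collapses into the last bin.
 */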
static uint64_t
ddt_total_entries(spa_t *spa)
{
	ddt_object_t ddo;
	ddt_get_dedup_object_stats(spa, &ddo);

	return (ddo.ddo_count);
}
int
ddt_prune_unique_entries(spa_t *spa, zpool_ddt_prune_unit_t unit,
    uint64_t amount)
{
	uint64_t cutoff = 0;
	uint64_t start_time = gethrtime();

	if (spa->spa_active_ddt_prune)
		return (SET_ERROR(EALREADY));
	if (ddt_total_entries(spa) == 0)
		return (0);

	spa->spa_active_ddt_prune = B_TRUE;

	zfs_dbgmsg("prune %llu %s", (u_longlong_t)amount,
	    unit == ZPOOL_DDT_PRUNE_PERCENTAGE ? "%" : "seconds old or older");

	if (unit == ZPOOL_DDT_PRUNE_PERCENTAGE) {
		ddt_age_histo_t histogram;
		uint64_t oldest = 0;

		/* Make a pass over DDT to build a histogram */
		ddt_prune_walk(spa, 0, &histogram);

		int target = (histogram.dah_entries * amount) / 100;

		/*
		 * Figure out our cutoff date
		 * (i.e., which bins to prune from)
		 */
		for (int i = HIST_BINS - 1; i >= 0 && target > 0; i--) {
			if (histogram.dah_age_histo[i] != 0) {
				/* less than this bucket remaining */
				if (target < histogram.dah_age_histo[i]) {
					oldest = MAX(1, (1<<i) * 3600);
					target = 0;
				} else {
					target -= histogram.dah_age_histo[i];
				}
			}
		}
		cutoff = gethrestime_sec() - oldest;

		if (ddt_dump_prune_histogram)
			ddt_dump_age_histogram(&histogram, cutoff);
	} else if (unit == ZPOOL_DDT_PRUNE_AGE) {
		cutoff = gethrestime_sec() - amount;
	}

	if (cutoff > 0 && !spa_shutting_down(spa) && !issig()) {
		/* Traverse DDT to prune entries older than our cutoff */
		ddt_prune_walk(spa, cutoff, NULL);
	}

	zfs_dbgmsg("%s: prune completed in %llu ms",
	    spa_name(spa), (u_longlong_t)NSEC2MSEC(gethrtime() - start_time));

	spa->spa_active_ddt_prune = B_FALSE;
	return (0);
}
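/*
 * Illustrative run of the percentage path above (arbitrary numbers): with
 * amount = 30 and 1000 entries counted by the histogram pass, target starts
 * at 300. Walking bins from the oldest down, whole bins are consumed until
 * fewer than a bin's worth remain; that bin's age (converted back to seconds
 * via (1 << i) * 3600) becomes "oldest", and the cutoff is "now - oldest".
 */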
ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, prefetch, INT, ZMOD_RW,
	"Enable prefetching dedup-ed blks");

ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_passes_max, UINT, ZMOD_RW,
	"Max number of incremental dedup log flush passes per transaction");

ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_min_time_ms, UINT, ZMOD_RW,
	"Min time to spend on incremental dedup log flush each transaction");

ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_entries_min, UINT, ZMOD_RW,
	"Min number of log entries to flush each transaction");

ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_flow_rate_txgs, UINT, ZMOD_RW,
	"Number of txgs to average flow rates across");