/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2023, Klara Inc.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/ddt.h>
#include <sys/dmu_tx.h>
#include <sys/dmu.h>
#include <sys/ddt_impl.h>
#include <sys/dnode.h>
#include <sys/dbuf.h>
#include <sys/zap.h>
#include <sys/zio_checksum.h>

/*
 * No more than this many txgs before swapping logs.
 */
uint_t zfs_dedup_log_txg_max = 8;

/*
 * Max memory for the log AVL trees. If zfs_dedup_log_mem_max is zero at module
 * load, it will be set to zfs_dedup_log_mem_max_percent% of total memory.
 */
uint64_t zfs_dedup_log_mem_max = 0;
uint_t zfs_dedup_log_mem_max_percent = 1;

static kmem_cache_t *ddt_log_entry_flat_cache;
static kmem_cache_t *ddt_log_entry_trad_cache;

#define	DDT_LOG_ENTRY_FLAT_SIZE	\
	(sizeof (ddt_log_entry_t) + DDT_FLAT_PHYS_SIZE)
#define	DDT_LOG_ENTRY_TRAD_SIZE	\
	(sizeof (ddt_log_entry_t) + DDT_TRAD_PHYS_SIZE)

#define	DDT_LOG_ENTRY_SIZE(ddt)	\
	_DDT_PHYS_SWITCH(ddt, DDT_LOG_ENTRY_FLAT_SIZE, DDT_LOG_ENTRY_TRAD_SIZE)

void
ddt_log_init(void)
{
	ddt_log_entry_flat_cache = kmem_cache_create("ddt_log_entry_flat_cache",
	    DDT_LOG_ENTRY_FLAT_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
	ddt_log_entry_trad_cache = kmem_cache_create("ddt_log_entry_trad_cache",
	    DDT_LOG_ENTRY_TRAD_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);

	/*
	 * Max memory for log AVL entries. At least 1M, because we need
	 * something (that's ~3800 entries per tree). They can say 100% if they
	 * want; it just means they're at the mercy of the txg flush limit.
	 */
	if (zfs_dedup_log_mem_max == 0) {
		zfs_dedup_log_mem_max_percent =
		    MIN(zfs_dedup_log_mem_max_percent, 100);
		zfs_dedup_log_mem_max = (physmem * PAGESIZE) *
		    zfs_dedup_log_mem_max_percent / 100;
	}
	zfs_dedup_log_mem_max = MAX(zfs_dedup_log_mem_max, 1*1024*1024);
}

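/*
 * Illustrative sizing (numbers assumed, not from this file): with 16 GiB
 * of physical memory and the default zfs_dedup_log_mem_max_percent of 1,
 * zfs_dedup_log_mem_max comes out to 16 GiB / 100, roughly 164 MiB. The
 * 1M floor only matters on systems with less than ~100 MiB of RAM at the
 * default percentage.
 */
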
void
ddt_log_fini(void)
{
	kmem_cache_destroy(ddt_log_entry_trad_cache);
	kmem_cache_destroy(ddt_log_entry_flat_cache);
}

static void
ddt_log_name(ddt_t *ddt, char *name, uint_t n)
{
	snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_LOG,
	    zio_checksum_table[ddt->ddt_checksum].ci_name, n);
}

static void
ddt_log_update_header(ddt_t *ddt, ddt_log_t *ddl, dmu_tx_t *tx)
{
	dmu_buf_t *db;
	VERIFY0(dmu_bonus_hold(ddt->ddt_os, ddl->ddl_object, FTAG, &db));
	dmu_buf_will_dirty(db, tx);

	ddt_log_header_t *hdr = (ddt_log_header_t *)db->db_data;
	DLH_SET_VERSION(hdr, 1);
	DLH_SET_FLAGS(hdr, ddl->ddl_flags);
	hdr->dlh_length = ddl->ddl_length;
	hdr->dlh_first_txg = ddl->ddl_first_txg;
	hdr->dlh_checkpoint = ddl->ddl_checkpoint;

	dmu_buf_rele(db, FTAG);
}

static void
ddt_log_create_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx)
{
	ASSERT3U(ddt->ddt_dir_object, >, 0);
	ASSERT3U(ddl->ddl_object, ==, 0);

	char name[DDT_NAMELEN];
	ddt_log_name(ddt, name, n);

	ddl->ddl_object = dmu_object_alloc(ddt->ddt_os,
	    DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE,
	    DMU_OTN_UINT64_METADATA, sizeof (ddt_log_header_t), tx);
	VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, name,
	    sizeof (uint64_t), 1, &ddl->ddl_object, tx));

	ddl->ddl_first_txg = tx->tx_txg;
	ddt_log_update_header(ddt, ddl, tx);
}

void
ddt_log_create(ddt_t *ddt, dmu_tx_t *tx)
{
	ddt_log_create_one(ddt, ddt->ddt_log_active, 0, tx);
	ddt_log_create_one(ddt, ddt->ddt_log_flushing, 1, tx);
}

static void
ddt_log_destroy_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx)
{
	ASSERT3U(ddt->ddt_dir_object, >, 0);

	if (ddl->ddl_object == 0)
		return;

	ASSERT0(ddl->ddl_length);

	char name[DDT_NAMELEN];
	ddt_log_name(ddt, name, n);

	VERIFY0(zap_remove(ddt->ddt_os, ddt->ddt_dir_object, name, tx));
	VERIFY0(dmu_object_free(ddt->ddt_os, ddl->ddl_object, tx));

	ddl->ddl_object = 0;
}

void
ddt_log_destroy(ddt_t *ddt, dmu_tx_t *tx)
{
	ddt_log_destroy_one(ddt, ddt->ddt_log_active, 0, tx);
	ddt_log_destroy_one(ddt, ddt->ddt_log_flushing, 1, tx);
}

static void
ddt_log_update_stats(ddt_t *ddt)
{
	/*
	 * Log object stats. We count the number of live entries in the log
	 * tree, even if there are more on disk, and even if the same entry
	 * is on both the append and flush trees, because that's closer to
	 * what the user expects to see. This does mean the on-disk size is
	 * not really correlated with the number of entries, but that's not
	 * a reasonable expectation anyway.
	 */
	dmu_object_info_t doi;
	uint64_t nblocks;
	dmu_object_info(ddt->ddt_os, ddt->ddt_log_active->ddl_object, &doi);
	nblocks = doi.doi_physical_blocks_512;
	dmu_object_info(ddt->ddt_os, ddt->ddt_log_flushing->ddl_object, &doi);
	nblocks += doi.doi_physical_blocks_512;

	ddt_object_t *ddo = &ddt->ddt_log_stats;
	ddo->ddo_count =
	    avl_numnodes(&ddt->ddt_log_active->ddl_tree) +
	    avl_numnodes(&ddt->ddt_log_flushing->ddl_tree);
	ddo->ddo_mspace = ddo->ddo_count * DDT_LOG_ENTRY_SIZE(ddt);
	ddo->ddo_dspace = nblocks << 9;	/* 512-byte units to bytes */
}

void
ddt_log_begin(ddt_t *ddt, size_t nentries, dmu_tx_t *tx, ddt_log_update_t *dlu)
{
	ASSERT3U(nentries, >, 0);
	ASSERT3P(dlu->dlu_dbp, ==, NULL);

	if (ddt->ddt_log_active->ddl_object == 0)
		ddt_log_create(ddt, tx);

	/*
	 * We want to store as many entries as we can in a block, but never
	 * split an entry across block boundaries.
	 */
	size_t reclen = P2ALIGN_TYPED(
	    sizeof (ddt_log_record_t) + sizeof (ddt_log_record_entry_t) +
	    DDT_PHYS_SIZE(ddt), sizeof (uint64_t), size_t);
	ASSERT3U(reclen, <=, UINT16_MAX);
	dlu->dlu_reclen = reclen;

	VERIFY0(dnode_hold(ddt->ddt_os, ddt->ddt_log_active->ddl_object, FTAG,
	    &dlu->dlu_dn));
	dnode_set_storage_type(dlu->dlu_dn, DMU_OT_DDT_ZAP);

	uint64_t nblocks = howmany(nentries,
	    dlu->dlu_dn->dn_datablksz / dlu->dlu_reclen);
	uint64_t offset = ddt->ddt_log_active->ddl_length;
	uint64_t length = nblocks * dlu->dlu_dn->dn_datablksz;

	VERIFY0(dmu_buf_hold_array_by_dnode(dlu->dlu_dn, offset, length,
	    B_FALSE, FTAG, &dlu->dlu_ndbp, &dlu->dlu_dbp,
	    DMU_READ_NO_PREFETCH));

	dlu->dlu_tx = tx;
	dlu->dlu_block = dlu->dlu_offset = 0;
}

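/*
 * Worked example for the record sizing above (struct sizes assumed for
 * illustration, not taken from the headers): if sizeof (ddt_log_record_t) +
 * sizeof (ddt_log_record_entry_t) + DDT_PHYS_SIZE(ddt) came to 184 bytes,
 * that is already a multiple of sizeof (uint64_t), so reclen stays 184.
 * A 128K (SPA_OLD_MAXBLOCKSIZE) data block then holds 131072 / 184 = 712
 * whole records, and an update of nentries entries needs
 * howmany(nentries, 712) blocks.
 */
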
static ddt_log_entry_t *
ddt_log_alloc_entry(ddt_t *ddt)
{
	ddt_log_entry_t *ddle;

	if (ddt->ddt_flags & DDT_FLAG_FLAT) {
		ddle = kmem_cache_alloc(ddt_log_entry_flat_cache, KM_SLEEP);
		memset(ddle, 0, DDT_LOG_ENTRY_FLAT_SIZE);
	} else {
		ddle = kmem_cache_alloc(ddt_log_entry_trad_cache, KM_SLEEP);
		memset(ddle, 0, DDT_LOG_ENTRY_TRAD_SIZE);
	}

	return (ddle);
}

static void
ddt_log_update_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe)
{
	/* Create the log tree entry from a live or stored entry */
	avl_index_t where;
	ddt_log_entry_t *ddle =
	    avl_find(&ddl->ddl_tree, &ddlwe->ddlwe_key, &where);
	if (ddle == NULL) {
		ddle = ddt_log_alloc_entry(ddt);
		ddle->ddle_key = ddlwe->ddlwe_key;
		avl_insert(&ddl->ddl_tree, ddle, where);
	}
	ddle->ddle_type = ddlwe->ddlwe_type;
	ddle->ddle_class = ddlwe->ddlwe_class;
	memcpy(ddle->ddle_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));
}

void
ddt_log_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, ddt_log_update_t *dlu)
{
	ASSERT3U(dlu->dlu_dbp, !=, NULL);

	ddt_log_update_entry(ddt, ddt->ddt_log_active, ddlwe);
	ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, ddlwe);

	ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp);
	dmu_buf_t *db = dlu->dlu_dbp[dlu->dlu_block];

	/*
	 * If this would take us past the end of the block, finish it and
	 * move to the next one.
	 */
	if (db->db_size < (dlu->dlu_offset + dlu->dlu_reclen)) {
		ASSERT3U(dlu->dlu_offset, >, 0);
		dmu_buf_fill_done(db, dlu->dlu_tx, B_FALSE);

		dlu->dlu_block++;
		dlu->dlu_offset = 0;
		ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp);
		db = dlu->dlu_dbp[dlu->dlu_block];
	}

	/*
	 * If this is the first time touching the block, inform the DMU that
	 * we will fill it, and zero it out.
	 */
	if (dlu->dlu_offset == 0) {
		dmu_buf_will_fill(db, dlu->dlu_tx, B_FALSE);
		memset(db->db_data, 0, db->db_size);
	}

	/* Create the log record directly in the buffer */
	ddt_log_record_t *dlr = (db->db_data + dlu->dlu_offset);
	DLR_SET_TYPE(dlr, DLR_ENTRY);
	DLR_SET_RECLEN(dlr, dlu->dlu_reclen);
	DLR_SET_ENTRY_TYPE(dlr, ddlwe->ddlwe_type);
	DLR_SET_ENTRY_CLASS(dlr, ddlwe->ddlwe_class);

	ddt_log_record_entry_t *dlre =
	    (ddt_log_record_entry_t *)&dlr->dlr_payload;
	dlre->dlre_key = ddlwe->ddlwe_key;
	memcpy(dlre->dlre_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));

	/* Advance offset for next record. */
	dlu->dlu_offset += dlu->dlu_reclen;
}

void
ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu)
{
	ASSERT3U(dlu->dlu_dbp, !=, NULL);
	ASSERT3U(dlu->dlu_block+1, ==, dlu->dlu_ndbp);
	ASSERT3U(dlu->dlu_offset, >, 0);

	/*
	 * Close out the last block. Whatever we haven't used will be zeroed,
	 * which matches DLR_INVALID, so we can detect this during load.
	 */
	dmu_buf_fill_done(dlu->dlu_dbp[dlu->dlu_block], dlu->dlu_tx, B_FALSE);

	dmu_buf_rele_array(dlu->dlu_dbp, dlu->dlu_ndbp, FTAG);

	ddt->ddt_log_active->ddl_length +=
	    dlu->dlu_ndbp * (uint64_t)dlu->dlu_dn->dn_datablksz;
	dnode_rele(dlu->dlu_dn, FTAG);

	ddt_log_update_header(ddt, ddt->ddt_log_active, dlu->dlu_tx);

	memset(dlu, 0, sizeof (ddt_log_update_t));

	ddt_log_update_stats(ddt);
}

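/*
 * A minimal sketch of the calling pattern these three functions assume
 * (hypothetical caller; in OpenZFS the calls are made from the DDT sync
 * path in ddt.c):
 *
 *	ddt_log_update_t dlu = {0};
 *	ddt_log_begin(ddt, nentries, tx, &dlu);
 *	for (each entry dirtied this txg)
 *		ddt_log_entry(ddt, &ddlwe, &dlu);
 *	ddt_log_commit(ddt, &dlu);
 */
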
boolean_t
ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe)
{
	ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree);
	if (ddle == NULL)
		return (B_FALSE);

	DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe);

	ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, ddlwe);

	avl_remove(&ddl->ddl_tree, ddle);
	kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
	    ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle);

	return (B_TRUE);
}

boolean_t
ddt_log_remove_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk)
{
	ddt_log_entry_t *ddle = avl_find(&ddl->ddl_tree, ddk, NULL);
	if (ddle == NULL)
		return (B_FALSE);

	ddt_lightweight_entry_t ddlwe;
	DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe);
	ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, &ddlwe);

	avl_remove(&ddl->ddl_tree, ddle);
	kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
	    ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle);

	return (B_TRUE);
}

boolean_t
ddt_log_find_key(ddt_t *ddt, const ddt_key_t *ddk,
    ddt_lightweight_entry_t *ddlwe)
{
	ddt_log_entry_t *ddle =
	    avl_find(&ddt->ddt_log_active->ddl_tree, ddk, NULL);
	if (!ddle)
		ddle = avl_find(&ddt->ddt_log_flushing->ddl_tree, ddk, NULL);
	if (!ddle)
		return (B_FALSE);
	if (ddlwe)
		DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe);
	return (B_TRUE);
}

void
ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)
{
	ddt_log_t *ddl = ddt->ddt_log_flushing;

	ASSERT3U(ddl->ddl_object, !=, 0);

#ifdef ZFS_DEBUG
	/*
	 * There should not be any entries on the log tree before the given
	 * checkpoint. Assert that this is the case.
	 */
	ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree);
	if (ddle != NULL)
		VERIFY3U(ddt_key_compare(&ddle->ddle_key, &ddlwe->ddlwe_key),
		    >, 0);
#endif

	ddl->ddl_flags |= DDL_FLAG_CHECKPOINT;
	ddl->ddl_checkpoint = ddlwe->ddlwe_key;
	ddt_log_update_header(ddt, ddl, tx);

	ddt_log_update_stats(ddt);
}

void
ddt_log_truncate(ddt_t *ddt, dmu_tx_t *tx)
{
	ddt_log_t *ddl = ddt->ddt_log_flushing;

	if (ddl->ddl_object == 0)
		return;

	ASSERT(avl_is_empty(&ddl->ddl_tree));

	/* Eject the entire object */
	dmu_free_range(ddt->ddt_os, ddl->ddl_object, 0, DMU_OBJECT_END, tx);

	ddl->ddl_length = 0;
	ddl->ddl_flags &= ~DDL_FLAG_CHECKPOINT;
	memset(&ddl->ddl_checkpoint, 0, sizeof (ddt_key_t));
	ddt_log_update_header(ddt, ddl, tx);

	ddt_log_update_stats(ddt);
}

boolean_t
ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx)
{
	/* Swap the logs. The old flushing one must be empty */
	VERIFY(avl_is_empty(&ddt->ddt_log_flushing->ddl_tree));

	/*
	 * If there are still blocks on the flushing log, truncate it first.
	 * This can happen if there were entries on the flushing log that were
	 * removed in memory via ddt_lookup(); their vestigial remains are
	 * still on disk.
	 */
	if (ddt->ddt_log_flushing->ddl_length > 0)
		ddt_log_truncate(ddt, tx);

	/*
	 * Swap policy. We swap the logs (and so begin flushing) when the
	 * active tree grows too large, or when we haven't swapped it in
	 * some amount of time, or if something has requested the logs be
	 * flushed ASAP (see ddt_walk_init()).
	 */

	/*
	 * The log tree is too large if the memory usage of its entries is over
	 * half of the memory limit. This effectively gives each log tree half
	 * the available memory.
	 */
	const boolean_t too_large =
	    (avl_numnodes(&ddt->ddt_log_active->ddl_tree) *
	    DDT_LOG_ENTRY_SIZE(ddt)) >= (zfs_dedup_log_mem_max >> 1);

	const boolean_t too_old =
	    tx->tx_txg >=
	    (ddt->ddt_log_active->ddl_first_txg +
	    MAX(1, zfs_dedup_log_txg_max));

	const boolean_t force =
	    ddt->ddt_log_active->ddl_first_txg <= ddt->ddt_flush_force_txg;

	if (!(too_large || too_old || force))
		return (B_FALSE);

	ddt_log_t *swap = ddt->ddt_log_active;
	ddt->ddt_log_active = ddt->ddt_log_flushing;
	ddt->ddt_log_flushing = swap;

	ASSERT(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING);
	ddt->ddt_log_active->ddl_flags &=
	    ~(DDL_FLAG_FLUSHING | DDL_FLAG_CHECKPOINT);

	ASSERT(!(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING));
	ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING;

	ddt->ddt_log_active->ddl_first_txg = tx->tx_txg;

	ddt_log_update_header(ddt, ddt->ddt_log_active, tx);
	ddt_log_update_header(ddt, ddt->ddt_log_flushing, tx);

	ddt_log_update_stats(ddt);

	return (B_TRUE);
}

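/*
 * Illustrative trigger arithmetic (numbers assumed, not from this file):
 * with zfs_dedup_log_mem_max at ~164 MiB, too_large fires once the active
 * tree's entries occupy half of that, ~82 MiB; at ~150 bytes per entry
 * that is on the order of 570K entries. too_old fires once
 * zfs_dedup_log_txg_max (default 8) txgs have passed since the active
 * log's first write.
 */
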
static void
ddt_log_load_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_log_record_t *dlr,
    const ddt_key_t *checkpoint)
{
	ASSERT3U(DLR_GET_TYPE(dlr), ==, DLR_ENTRY);

	ddt_log_record_entry_t *dlre =
	    (ddt_log_record_entry_t *)dlr->dlr_payload;
	if (checkpoint != NULL &&
	    ddt_key_compare(&dlre->dlre_key, checkpoint) <= 0) {
		/* Skip pre-checkpoint entries; they're already flushed. */
		return;
	}

	ddt_lightweight_entry_t ddlwe;
	ddlwe.ddlwe_type = DLR_GET_ENTRY_TYPE(dlr);
	ddlwe.ddlwe_class = DLR_GET_ENTRY_CLASS(dlr);

	ddlwe.ddlwe_key = dlre->dlre_key;
	memcpy(&ddlwe.ddlwe_phys, dlre->dlre_phys, DDT_PHYS_SIZE(ddt));

	ddt_log_update_entry(ddt, ddl, &ddlwe);
}

static void
ddt_log_empty(ddt_t *ddt, ddt_log_t *ddl)
{
	void *cookie = NULL;
	ddt_log_entry_t *ddle;
	IMPLY(ddt->ddt_version == UINT64_MAX, avl_is_empty(&ddl->ddl_tree));
	while ((ddle =
	    avl_destroy_nodes(&ddl->ddl_tree, &cookie)) != NULL) {
		kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
		    ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle);
	}
	ASSERT(avl_is_empty(&ddl->ddl_tree));
}

static int
ddt_log_load_one(ddt_t *ddt, uint_t n)
{
	ASSERT3U(n, <, 2);

	ddt_log_t *ddl = &ddt->ddt_log[n];

	char name[DDT_NAMELEN];
	ddt_log_name(ddt, name, n);

	uint64_t obj;
	int err = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, name,
	    sizeof (uint64_t), 1, &obj);
	if (err == ENOENT)
		return (0);
	if (err != 0)
		return (err);

	dnode_t *dn;
	err = dnode_hold(ddt->ddt_os, obj, FTAG, &dn);
	if (err != 0)
		return (err);

	ddt_log_header_t hdr;
	dmu_buf_t *db;
	err = dmu_bonus_hold_by_dnode(dn, FTAG, &db, DMU_READ_NO_PREFETCH);
	if (err != 0) {
		dnode_rele(dn, FTAG);
		return (err);
	}
	memcpy(&hdr, db->db_data, sizeof (ddt_log_header_t));
	dmu_buf_rele(db, FTAG);

	if (DLH_GET_VERSION(&hdr) != 1) {
		dnode_rele(dn, FTAG);
		zfs_dbgmsg("ddt_log_load: spa=%s ddt_log=%s "
		    "unknown version=%llu", spa_name(ddt->ddt_spa), name,
		    (u_longlong_t)DLH_GET_VERSION(&hdr));
		return (SET_ERROR(EINVAL));
	}

	ddt_key_t *checkpoint = NULL;
	if (DLH_GET_FLAGS(&hdr) & DDL_FLAG_CHECKPOINT) {
		/*
		 * If the log has a checkpoint, then we can ignore any entries
		 * that have already been flushed.
		 */
		ASSERT(DLH_GET_FLAGS(&hdr) & DDL_FLAG_FLUSHING);
		checkpoint = &hdr.dlh_checkpoint;
	}

	if (hdr.dlh_length > 0) {
		dmu_prefetch_by_dnode(dn, 0, 0, hdr.dlh_length,
		    ZIO_PRIORITY_SYNC_READ);

		for (uint64_t offset = 0; offset < hdr.dlh_length;
		    offset += dn->dn_datablksz) {
			err = dmu_buf_hold_by_dnode(dn, offset, FTAG, &db,
			    DMU_READ_PREFETCH);
			if (err != 0) {
				dnode_rele(dn, FTAG);
				ddt_log_empty(ddt, ddl);
				return (err);
			}

			uint64_t boffset = 0;
			while (boffset < db->db_size) {
				ddt_log_record_t *dlr =
				    (ddt_log_record_t *)(db->db_data + boffset);

				/* Partially-filled block, skip the rest */
				if (DLR_GET_TYPE(dlr) == DLR_INVALID)
					break;

				switch (DLR_GET_TYPE(dlr)) {
				case DLR_ENTRY:
					ddt_log_load_entry(ddt, ddl, dlr,
					    checkpoint);
					break;

				default:
					dmu_buf_rele(db, FTAG);
					dnode_rele(dn, FTAG);
					ddt_log_empty(ddt, ddl);
					return (SET_ERROR(EINVAL));
				}

				boffset += DLR_GET_RECLEN(dlr);
			}

			dmu_buf_rele(db, FTAG);
		}
	}

	dnode_rele(dn, FTAG);

	ddl->ddl_object = obj;
	ddl->ddl_flags = DLH_GET_FLAGS(&hdr);
	ddl->ddl_length = hdr.dlh_length;
	ddl->ddl_first_txg = hdr.dlh_first_txg;

	if (ddl->ddl_flags & DDL_FLAG_FLUSHING)
		ddt->ddt_log_flushing = ddl;
	else
		ddt->ddt_log_active = ddl;

	return (0);
}

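/*
 * On-disk layout implied by the load loop above: each data block is a
 * packed run of variable-length records, each led by a ddt_log_record_t
 * header carrying its type and reclen; the unused tail of the block is
 * zeroed, and a zeroed header reads back as DLR_INVALID, which is how a
 * partially-filled block is detected.
 *
 *	+---------------+---------------+--------------------------+
 *	| dlr | payload | dlr | payload | zeros (DLR_INVALID) ...  |
 *	+---------------+---------------+--------------------------+
 */
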
int
ddt_log_load(ddt_t *ddt)
{
	int err;

	if (spa_load_state(ddt->ddt_spa) == SPA_LOAD_TRYIMPORT) {
		/*
		 * The DDT is going to be freed again in a moment, so there's
		 * no point loading the log; it'll just slow down import.
		 */
		return (0);
	}

	ASSERT0(ddt->ddt_log[0].ddl_object);
	ASSERT0(ddt->ddt_log[1].ddl_object);
	if (ddt->ddt_dir_object == 0) {
		/*
		 * If we're configured but the containing dir doesn't exist
		 * yet, then the log object can't possibly exist either.
		 */
		ASSERT3U(ddt->ddt_version, !=, UINT64_MAX);
		return (SET_ERROR(ENOENT));
	}

	if ((err = ddt_log_load_one(ddt, 0)) != 0)
		return (err);
	if ((err = ddt_log_load_one(ddt, 1)) != 0)
		return (err);

	VERIFY3P(ddt->ddt_log_active, !=, ddt->ddt_log_flushing);
	VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING));
	VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_CHECKPOINT));
	VERIFY(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING);

	/*
	 * We have two finalisation tasks:
	 *
	 * - rebuild the histogram. We do this at the end rather than while
	 *   we're loading so we don't need to uncount and recount entries that
	 *   appear multiple times in the log.
	 *
	 * - remove entries from the flushing tree that are on both trees. This
	 *   happens when ddt_lookup() rehydrates an entry from the flushing
	 *   tree, as ddt_log_remove_key() removes the entry from the in-memory
	 *   tree but doesn't remove it from disk.
	 */

	/*
	 * We don't technically need a config lock here, since there shouldn't
	 * be pool config changes during DDT load. dva_get_dsize_sync() via
	 * ddt_stat_generate() is expecting it though, and it won't hurt
	 * anything, so we take it.
	 */
	spa_config_enter(ddt->ddt_spa, SCL_STATE, FTAG, RW_READER);

	avl_tree_t *al = &ddt->ddt_log_active->ddl_tree;
	avl_tree_t *fl = &ddt->ddt_log_flushing->ddl_tree;
	ddt_log_entry_t *ae = avl_first(al);
	ddt_log_entry_t *fe = avl_first(fl);
	while (ae != NULL || fe != NULL) {
		ddt_log_entry_t *ddle;
		if (ae == NULL) {
			/* active exhausted, take flushing */
			ddle = fe;
			fe = AVL_NEXT(fl, fe);
		} else if (fe == NULL) {
			/* flushing exhausted, take active */
			ddle = ae;
			ae = AVL_NEXT(al, ae);
		} else {
			/* compare active and flushing */
			int c = ddt_key_compare(&ae->ddle_key, &fe->ddle_key);
			if (c < 0) {
				/* active behind, take and advance */
				ddle = ae;
				ae = AVL_NEXT(al, ae);
			} else if (c > 0) {
				/* flushing behind, take and advance */
				ddle = fe;
				fe = AVL_NEXT(fl, fe);
			} else {
				/* match. remove from flushing, take active */
				ddle = fe;
				fe = AVL_NEXT(fl, fe);
				avl_remove(fl, ddle);

				ddle = ae;
				ae = AVL_NEXT(al, ae);
			}
		}

		ddt_lightweight_entry_t ddlwe;
		DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe);
		ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, &ddlwe);
	}

	spa_config_exit(ddt->ddt_spa, SCL_STATE, FTAG);

	ddt_log_update_stats(ddt);

	return (0);
}

void
ddt_log_alloc(ddt_t *ddt)
{
	ASSERT3P(ddt->ddt_log_active, ==, NULL);
	ASSERT3P(ddt->ddt_log_flushing, ==, NULL);

	avl_create(&ddt->ddt_log[0].ddl_tree, ddt_key_compare,
	    sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node));
	avl_create(&ddt->ddt_log[1].ddl_tree, ddt_key_compare,
	    sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node));
	ddt->ddt_log_active = &ddt->ddt_log[0];
	ddt->ddt_log_flushing = &ddt->ddt_log[1];
	ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING;
}

void
ddt_log_free(ddt_t *ddt)
{
	ddt_log_empty(ddt, &ddt->ddt_log[0]);
	ddt_log_empty(ddt, &ddt->ddt_log[1]);
	avl_destroy(&ddt->ddt_log[0].ddl_tree);
	avl_destroy(&ddt->ddt_log[1].ddl_tree);
}

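/*
 * Lifecycle summary of the interfaces above: ddt_log_init()/ddt_log_fini()
 * manage the kmem caches at module load/unload; ddt_log_alloc()/
 * ddt_log_free() set up and tear down the in-memory trees for one DDT;
 * ddt_log_load() rebuilds them from disk at import; ddt_log_begin()/
 * ddt_log_entry()/ddt_log_commit() append entries during sync; and
 * ddt_log_swap()/ddt_log_checkpoint()/ddt_log_truncate() drive the flush
 * cycle.
 */
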
ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_txg_max, UINT, ZMOD_RW,
	"Max transactions before starting to flush dedup logs");

ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max, U64, ZMOD_RD,
	"Max memory for dedup logs");

ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max_percent, UINT, ZMOD_RD,
	"Max memory for dedup logs, as % of total memory");