4 * This file and its contents are supplied under the terms of the
5 * Common Development and Distribution License ("CDDL"), version 1.0.
6 * You may only use this file in accordance with the terms of version
9 * A full copy of the text of the CDDL should have accompanied this
10 * source. A copy of the CDDL is also available via the Internet at
11 * http://www.illumos.org/license/CDDL.
17 * Copyright (c) 2015, 2017 by Delphix. All rights reserved.
20 #include <sys/dmu_tx.h>
21 #include <sys/dsl_pool.h>
23 #include <sys/vdev_impl.h>
24 #include <sys/vdev_indirect_mapping.h>
25 #include <sys/zfeature.h>
26 #include <sys/dmu_objset.h>
/*
 * Consistency checks for an open indirect mapping.
 *
 * Asserts that vim_object is nonzero and that vim_objset, vim_phys and
 * vim_dbuf are non-NULL.  Uses EQUIV to check that vimp_num_entries > 0
 * and vim_entries != NULL agree with each other.  When the mapping has
 * entries, asserts that vimp_max_offset is at least the source offset
 * plus the allocated size of the last entry.  When vim_havecounts is
 * set, asserts that vimp_counts_object is nonzero.
 *
 * Callers in this file wrap the call in ASSERT(), so a true value is
 * expected back.
 * NOTE(review): the return type, the return statement and the closing
 * braces are not visible in this listing - confirm against the full
 * source.
 */
30 vdev_indirect_mapping_verify(vdev_indirect_mapping_t
*vim
)
34 ASSERT(vim
->vim_object
!= 0);
35 ASSERT(vim
->vim_objset
!= NULL
);
36 ASSERT(vim
->vim_phys
!= NULL
);
37 ASSERT(vim
->vim_dbuf
!= NULL
);
39 EQUIV(vim
->vim_phys
->vimp_num_entries
> 0,
40 vim
->vim_entries
!= NULL
);
41 if (vim
->vim_phys
->vimp_num_entries
> 0) {
42 vdev_indirect_mapping_entry_phys_t
*last_entry __maybe_unused
=
43 &vim
->vim_entries
[vim
->vim_phys
->vimp_num_entries
- 1];
44 uint64_t offset __maybe_unused
=
45 DVA_MAPPING_GET_SRC_OFFSET(last_entry
);
46 uint64_t size __maybe_unused
=
47 DVA_GET_ASIZE(&last_entry
->vimep_dst
);
49 ASSERT3U(vim
->vim_phys
->vimp_max_offset
, >=, offset
+ size
);
51 if (vim
->vim_havecounts
) {
52 ASSERT(vim
->vim_phys
->vimp_counts_object
!= 0);
/*
 * Macro form of vdev_indirect_mapping_verify(): mentions vim only as the
 * operand of sizeof and always yields B_TRUE.
 * NOTE(review): presumably selected by a preprocessor conditional as the
 * alternative to the checking function; the conditional directives are
 * not visible in this listing - confirm.
 */
58 #define vdev_indirect_mapping_verify(vim) ((void) sizeof (vim), B_TRUE)
/*
 * Accessor: asserts that the mapping passes
 * vdev_indirect_mapping_verify() and returns vim_phys->vimp_num_entries.
 */
62 vdev_indirect_mapping_num_entries(vdev_indirect_mapping_t
*vim
)
64 ASSERT(vdev_indirect_mapping_verify(vim
));
66 return (vim
->vim_phys
->vimp_num_entries
);
/*
 * Accessor: asserts that the mapping passes
 * vdev_indirect_mapping_verify() and returns vim_phys->vimp_max_offset.
 */
70 vdev_indirect_mapping_max_offset(vdev_indirect_mapping_t
*vim
)
72 ASSERT(vdev_indirect_mapping_verify(vim
));
74 return (vim
->vim_phys
->vimp_max_offset
);
/*
 * Accessor: asserts that the mapping passes
 * vdev_indirect_mapping_verify() and returns vim_object, the object
 * number this mapping was opened on.
 */
78 vdev_indirect_mapping_object(vdev_indirect_mapping_t
*vim
)
80 ASSERT(vdev_indirect_mapping_verify(vim
));
82 return (vim
->vim_object
);
/*
 * Accessor: asserts that the mapping passes
 * vdev_indirect_mapping_verify() and returns
 * vim_phys->vimp_bytes_mapped.
 */
86 vdev_indirect_mapping_bytes_mapped(vdev_indirect_mapping_t
*vim
)
88 ASSERT(vdev_indirect_mapping_verify(vim
));
90 return (vim
->vim_phys
->vimp_bytes_mapped
);
94 * The length (in bytes) of the mapping object array in memory and
95 * (logically) on disk.
97 * Note that unlike most of our accessor functions,
98 * we don't assert that the struct is consistent; therefore it can be
99 * called while there may be concurrent changes, if we don't care about
100 * the value being immediately stale (e.g. from spa_removal_get_stats()).
/*
 * Returns vimp_num_entries multiplied by the size of one element of
 * vim_entries.  Only vim_phys is dereferenced (vim_entries appears solely
 * as a sizeof operand) and no verify assertion is made.
 */
103 vdev_indirect_mapping_size(vdev_indirect_mapping_t
*vim
)
105 return (vim
->vim_phys
->vimp_num_entries
* sizeof (*vim
->vim_entries
));
109 * Compare an offset with an indirect mapping entry; there are three
110 * possible scenarios:
112 * 1. The offset is "less than" the mapping entry; meaning the
113 * offset is less than the source offset of the mapping entry. In
114 * this case, there is no overlap between the offset and the
115 * mapping entry and -1 will be returned.
117 * 2. The offset is "greater than" the mapping entry; meaning the
118 * offset is greater than the mapping entry's source offset plus
119 * the entry's size. In this case, there is no overlap between
120 * the offset and the mapping entry and 1 will be returned.
122 * NOTE: If the offset is actually equal to the entry's offset
123 * plus size, this is considered to be "greater" than the entry,
124 * and this case applies (i.e. 1 will be returned). Thus, the
125 * entry's "range" can be considered to be inclusive at its
126 * start, but exclusive at its end: e.g. [src, src + size).
128 * 3. The last case to consider is if the offset actually falls
129 * within the mapping entry's range. If this is the case, the
130 * offset is considered to be "equal to" the mapping entry and
131 * 0 will be returned.
133 * NOTE: If the offset is equal to the entry's source offset,
134 * this case applies and 0 will be returned. If the offset is
135 * equal to the entry's source plus its size, this case does
136 * *not* apply (see "NOTE" above for scenario 2), and 1 will be returned.
/*
 * Comparison helper.  v_key is read as a pointer to a uint64_t offset;
 * v_array_elem supplies the mapping entry that the offset is compared
 * against.
 *
 * The first branch is taken when the key is below the source offset of
 * the entry.  The second branch is taken when the key is below the source
 * offset plus DVA_GET_ASIZE() of the destination DVA of the entry, i.e.
 * when the key lies inside the entry.
 *
 * NOTE(review): the initializer of array_elem, the value returned from
 * each branch and the final else branch are not visible in this listing.
 */
140 dva_mapping_overlap_compare(const void *v_key
, const void *v_array_elem
)
142 const uint64_t * const key
= v_key
;
143 const vdev_indirect_mapping_entry_phys_t
* const array_elem
=
145 uint64_t src_offset
= DVA_MAPPING_GET_SRC_OFFSET(array_elem
);
147 if (*key
< src_offset
) {
149 } else if (*key
< src_offset
+ DVA_GET_ASIZE(&array_elem
->vimep_dst
)) {
157 * Returns the mapping entry for the given offset.
159 * It's possible that the given offset will not be in the mapping table
160 * (i.e. no mapping entries contain this offset), in which case, the
161 * return value depends on the "next_if_missing" parameter.
163 * If the offset is not found in the table and "next_if_missing" is
164 * B_FALSE, then NULL will always be returned. The behavior is intended
165 * to allow consumers to get the entry corresponding to the offset
166 * parameter, iff the offset overlaps with an entry in the table.
168 * If the offset is not found in the table and "next_if_missing" is
169 * B_TRUE, then the entry nearest to the given offset will be returned,
170 * such that the entry's source offset is greater than the offset
171 * passed in (i.e. the "next" mapping entry in the table is returned, if
172 * the offset is missing from the table). If there are no entries whose
173 * source offset is greater than the passed in offset, NULL is returned.
/*
 * Shared lookup used by the two public entry_for_offset wrappers.
 *
 * Requires a mapping that verifies and has at least one entry (both are
 * asserted).  The loop probes vim_entries at the midpoint of the index
 * range base..last and compares offset against that entry with
 * dva_mapping_overlap_compare(); the probed entry is recorded in entry
 * on a match.
 *
 * If no entry was recorded and next_if_missing is set, an index is
 * chosen next to the last probed position.  When that index equals
 * vimp_num_entries the offset lies beyond every entry (asserted against
 * entry index - 1); otherwise the entry at index is returned after
 * asserting that offset sorts before it and after entry index - 1.
 *
 * NOTE(review): the declarations of base, mid, result and index, the
 * updates of base and last inside the loop, the computation of index and
 * several return statements are not visible in this listing.  In
 * particular, confirm how the loop terminates when the unsigned last
 * would have to move below index 0.
 */
175 static vdev_indirect_mapping_entry_phys_t
*
176 vdev_indirect_mapping_entry_for_offset_impl(vdev_indirect_mapping_t
*vim
,
177 uint64_t offset
, boolean_t next_if_missing
)
179 ASSERT(vdev_indirect_mapping_verify(vim
));
180 ASSERT(vim
->vim_phys
->vimp_num_entries
> 0);
182 vdev_indirect_mapping_entry_phys_t
*entry
= NULL
;
184 uint64_t last
= vim
->vim_phys
->vimp_num_entries
- 1;
188 * We don't define these inside of the while loop because we use
189 * their value in the case that offset isn't in the mapping.
194 while (last
>= base
) {
195 mid
= base
+ ((last
- base
) >> 1);
197 result
= dva_mapping_overlap_compare(&offset
,
198 &vim
->vim_entries
[mid
]);
201 entry
= &vim
->vim_entries
[mid
];
203 } else if (result
< 0) {
210 if (entry
== NULL
&& next_if_missing
) {
211 ASSERT3U(base
, ==, last
+ 1);
212 ASSERT(mid
== base
|| mid
== last
);
213 ASSERT3S(result
, !=, 0);
216 * The offset we're looking for isn't actually contained
217 * in the mapping table, thus we need to return the
218 * closest mapping entry that is greater than the
219 * offset. We reuse the result of the last comparison,
220 * comparing the mapping entry at index "mid" and the
221 * offset. The offset is guaranteed to lie between
222 * indices one less than "mid", and one greater than
223 * "mid"; we just need to determine if offset is greater
224 * than, or less than the mapping entry contained at
234 ASSERT3U(index
, <=, vim
->vim_phys
->vimp_num_entries
);
236 if (index
== vim
->vim_phys
->vimp_num_entries
) {
238 * If "index" is past the end of the entries
239 * array, then not only is the offset not in the
240 * mapping table, but it's actually greater than
241 * all entries in the table. In this case, we
242 * can't return a mapping entry greater than the
243 * offset (since none exist), so we return NULL.
246 ASSERT3S(dva_mapping_overlap_compare(&offset
,
247 &vim
->vim_entries
[index
- 1]), >, 0);
252 * Just to be safe, we verify the offset falls
253 * in between the mapping entries at index and
254 * one less than index. Since we know the offset
255 * doesn't overlap an entry, and we're supposed
256 * to return the entry just greater than the
257 * offset, both of the following tests must be
260 ASSERT3S(dva_mapping_overlap_compare(&offset
,
261 &vim
->vim_entries
[index
]), <, 0);
262 IMPLY(index
>= 1, dva_mapping_overlap_compare(&offset
,
263 &vim
->vim_entries
[index
- 1]) > 0);
265 return (&vim
->vim_entries
[index
]);
/*
 * Public wrapper: forwards vim and offset to
 * vdev_indirect_mapping_entry_for_offset_impl() and returns its result.
 * NOTE(review): the offset parameter declaration and the third argument
 * of the call are not visible in this listing; given the sibling
 * _or_next wrapper this one presumably passes B_FALSE - confirm.
 */
272 vdev_indirect_mapping_entry_phys_t
*
273 vdev_indirect_mapping_entry_for_offset(vdev_indirect_mapping_t
*vim
,
276 return (vdev_indirect_mapping_entry_for_offset_impl(vim
, offset
,
/*
 * Public wrapper: forwards vim and offset to
 * vdev_indirect_mapping_entry_for_offset_impl() and returns its result.
 * NOTE(review): the offset parameter declaration and the third argument
 * of the call are not visible in this listing; judging by the function
 * name it presumably passes B_TRUE for next_if_missing - confirm.
 */
280 vdev_indirect_mapping_entry_phys_t
*
281 vdev_indirect_mapping_entry_for_offset_or_next(vdev_indirect_mapping_t
*vim
,
284 return (vdev_indirect_mapping_entry_for_offset_impl(vim
, offset
,
/*
 * Tears down a mapping handle.
 *
 * After asserting that the mapping verifies: if it has entries, frees
 * the vim_entries array with vmem_free() using the size reported by
 * vdev_indirect_mapping_size() and clears the pointer.  Then releases
 * the hold on vim_dbuf using vim itself as the tag, clears vim_objset,
 * vim_dbuf and vim_phys, and finally frees vim with kmem_free().  The
 * handle must not be used after this call.
 */
289 vdev_indirect_mapping_close(vdev_indirect_mapping_t
*vim
)
291 ASSERT(vdev_indirect_mapping_verify(vim
));
293 if (vim
->vim_phys
->vimp_num_entries
> 0) {
294 uint64_t map_size
= vdev_indirect_mapping_size(vim
);
295 vmem_free(vim
->vim_entries
, map_size
);
296 vim
->vim_entries
= NULL
;
299 dmu_buf_rele(vim
->vim_dbuf
, vim
);
301 vim
->vim_objset
= NULL
;
303 vim
->vim_dbuf
= NULL
;
304 vim
->vim_phys
= NULL
;
306 kmem_free(vim
, sizeof (*vim
));
/*
 * Allocates a new mapping object in objset os as part of tx, which must
 * be a syncing transaction (asserted).
 *
 * The bonus buffer size starts at VDEV_INDIRECT_MAPPING_SIZE_V0 and is
 * raised to sizeof (vdev_indirect_mapping_phys_t) when
 * SPA_FEATURE_OBSOLETE_COUNTS is enabled on the pool.  The object is
 * allocated with dmu_object_alloc() using DMU_OTN_UINT64_METADATA and
 * SPA_OLD_MAXBLOCKSIZE.
 *
 * When the feature is enabled, the bonus buffer of the new object is
 * held and dirtied, a second object (DMU_OTN_UINT32_METADATA) is
 * allocated and stored in vimp_counts_object, the feature reference
 * count is incremented, and the bonus hold is released.
 *
 * NOTE(review): the return type, the declarations of object and dbuf,
 * the trailing arguments of both dmu_object_alloc() calls and the return
 * statement are not visible in this listing; presumably the new object
 * number is returned - confirm.
 */
310 vdev_indirect_mapping_alloc(objset_t
*os
, dmu_tx_t
*tx
)
313 ASSERT(dmu_tx_is_syncing(tx
));
314 uint64_t bonus_size
= VDEV_INDIRECT_MAPPING_SIZE_V0
;
316 if (spa_feature_is_enabled(os
->os_spa
, SPA_FEATURE_OBSOLETE_COUNTS
)) {
317 bonus_size
= sizeof (vdev_indirect_mapping_phys_t
);
320 object
= dmu_object_alloc(os
,
321 DMU_OTN_UINT64_METADATA
, SPA_OLD_MAXBLOCKSIZE
,
322 DMU_OTN_UINT64_METADATA
, bonus_size
,
325 if (spa_feature_is_enabled(os
->os_spa
, SPA_FEATURE_OBSOLETE_COUNTS
)) {
327 vdev_indirect_mapping_phys_t
*vimp
;
329 VERIFY0(dmu_bonus_hold(os
, object
, FTAG
, &dbuf
));
330 dmu_buf_will_dirty(dbuf
, tx
);
331 vimp
= dbuf
->db_data
;
332 vimp
->vimp_counts_object
= dmu_object_alloc(os
,
333 DMU_OTN_UINT32_METADATA
, SPA_OLD_MAXBLOCKSIZE
,
335 spa_feature_incr(os
->os_spa
, SPA_FEATURE_OBSOLETE_COUNTS
, tx
);
336 dmu_buf_rele(dbuf
, FTAG
);
/*
 * Opens the mapping stored in object mapping_object of objset os and
 * builds an in-memory handle for it.
 *
 * The handle is zero-allocated with kmem_zalloc(KM_SLEEP).  The bonus
 * buffer of the object is held with the handle itself as the tag and
 * vim_phys is pointed at its data.  vim_havecounts is set when the bonus
 * size reported by dmu_object_info() exceeds
 * VDEV_INDIRECT_MAPPING_SIZE_V0.  If the mapping has entries, an array
 * of vdev_indirect_mapping_size() bytes is allocated with vmem_alloc()
 * and filled by reading the object from offset 0.  Failures of the DMU
 * calls trip VERIFY0.
 *
 * NOTE(review): the final argument of dmu_bonus_hold() and the return
 * statement are not visible in this listing; presumably the handle is
 * returned and later released with vdev_indirect_mapping_close().
 */
343 vdev_indirect_mapping_t
*
344 vdev_indirect_mapping_open(objset_t
*os
, uint64_t mapping_object
)
346 vdev_indirect_mapping_t
*vim
= kmem_zalloc(sizeof (*vim
), KM_SLEEP
);
347 dmu_object_info_t doi
;
348 VERIFY0(dmu_object_info(os
, mapping_object
, &doi
));
350 vim
->vim_objset
= os
;
351 vim
->vim_object
= mapping_object
;
353 VERIFY0(dmu_bonus_hold(os
, vim
->vim_object
, vim
,
355 vim
->vim_phys
= vim
->vim_dbuf
->db_data
;
357 vim
->vim_havecounts
=
358 (doi
.doi_bonus_size
> VDEV_INDIRECT_MAPPING_SIZE_V0
);
360 if (vim
->vim_phys
->vimp_num_entries
> 0) {
361 uint64_t map_size
= vdev_indirect_mapping_size(vim
);
362 vim
->vim_entries
= vmem_alloc(map_size
, KM_SLEEP
);
363 VERIFY0(dmu_read(os
, vim
->vim_object
, 0, map_size
,
364 vim
->vim_entries
, DMU_READ_PREFETCH
));
367 ASSERT(vdev_indirect_mapping_verify(vim
));
/*
 * Destroys the on-disk mapping stored in object of objset os, in tx.
 *
 * The mapping is opened first so that vim_havecounts and
 * vimp_counts_object can be consulted: if counts are present, the counts
 * object is freed and the SPA_FEATURE_OBSOLETE_COUNTS reference count is
 * decremented.  The handle is then closed and the mapping object itself
 * is freed.  Failures of dmu_object_free() trip VERIFY0.
 */
373 vdev_indirect_mapping_free(objset_t
*os
, uint64_t object
, dmu_tx_t
*tx
)
375 vdev_indirect_mapping_t
*vim
= vdev_indirect_mapping_open(os
, object
);
376 if (vim
->vim_havecounts
) {
377 VERIFY0(dmu_object_free(os
, vim
->vim_phys
->vimp_counts_object
,
379 spa_feature_decr(os
->os_spa
, SPA_FEATURE_OBSOLETE_COUNTS
, tx
);
381 vdev_indirect_mapping_close(vim
);
383 VERIFY0(dmu_object_free(os
, object
, tx
));
387 * Append the list of vdev_indirect_mapping_entry_t's to the on-disk
388 * mapping object. Also remove the entries from the list and free them.
389 * This also implicitly extends the max_offset of the mapping (to the end
390 * of the last entry).
/*
 * Appends every entry on list to the mapping, in tx.
 *
 * Preconditions (asserted): the mapping verifies, tx is syncing, the
 * caller is in pool sync context, and list is not empty.
 *
 * Phase 1 - write out: the bonus buffer is dirtied, then entries are
 * taken off the head of the list in batches of at most
 * SPA_OLD_MAXBLOCKSIZE / sizeof (*mapbuf) and staged in mapbuf (and in
 * countbuf when vim_havecounts is set).  For each entry,
 * vimp_bytes_mapped grows by the allocated size of the entry and
 * vimp_max_offset becomes its source offset plus that size; the source
 * offset is asserted to be at or beyond the previous vimp_max_offset.
 * Each list node is freed once copied.  Every batch is written with
 * dmu_write() at byte offset vimp_num_entries * element size, after which
 * vimp_num_entries is advanced by the batch length.
 *
 * Phase 2 - refresh the in-memory array: a new vim_entries array of the
 * new total size is allocated, the old contents are copied over and the
 * old array is freed, and the newly written tail is read back from the
 * object starting at the old size.  A debug message summarizes the txg,
 * the number of entries written, the object and the new max offset.
 *
 * NOTE(review): several lines are not visible in this listing, including
 * the declarations of old_size, old_count and i, the early exit from the
 * batch loop when the list runs dry, the update of entries_written, and
 * whatever guards the memcpy/vmem_free of the old array when there were
 * no old entries - confirm against the full source.
 */
393 vdev_indirect_mapping_add_entries(vdev_indirect_mapping_t
*vim
,
394 list_t
*list
, dmu_tx_t
*tx
)
396 vdev_indirect_mapping_entry_phys_t
*mapbuf
;
398 uint32_t *countbuf
= NULL
;
399 vdev_indirect_mapping_entry_phys_t
*old_entries
;
401 uint64_t entries_written
= 0;
403 ASSERT(vdev_indirect_mapping_verify(vim
));
404 ASSERT(dmu_tx_is_syncing(tx
));
405 ASSERT(dsl_pool_sync_context(dmu_tx_pool(tx
)));
406 ASSERT(!list_is_empty(list
));
408 old_size
= vdev_indirect_mapping_size(vim
);
409 old_entries
= vim
->vim_entries
;
410 old_count
= vim
->vim_phys
->vimp_num_entries
;
412 dmu_buf_will_dirty(vim
->vim_dbuf
, tx
);
414 mapbuf
= vmem_alloc(SPA_OLD_MAXBLOCKSIZE
, KM_SLEEP
);
415 if (vim
->vim_havecounts
) {
416 countbuf
= vmem_alloc(SPA_OLD_MAXBLOCKSIZE
, KM_SLEEP
);
417 ASSERT(spa_feature_is_active(vim
->vim_objset
->os_spa
,
418 SPA_FEATURE_OBSOLETE_COUNTS
));
420 while (!list_is_empty(list
)) {
423 * Write entries from the list to the
424 * vdev_im_object in batches of size SPA_OLD_MAXBLOCKSIZE.
426 for (i
= 0; i
< SPA_OLD_MAXBLOCKSIZE
/ sizeof (*mapbuf
); i
++) {
427 vdev_indirect_mapping_entry_t
*entry
=
428 list_remove_head(list
);
433 DVA_GET_ASIZE(&entry
->vime_mapping
.vimep_dst
);
434 uint64_t src_offset
=
435 DVA_MAPPING_GET_SRC_OFFSET(&entry
->vime_mapping
);
438 * We shouldn't be adding an entry which is fully
441 ASSERT3U(entry
->vime_obsolete_count
, <, size
);
442 IMPLY(entry
->vime_obsolete_count
!= 0,
443 vim
->vim_havecounts
);
445 mapbuf
[i
] = entry
->vime_mapping
;
446 if (vim
->vim_havecounts
)
447 countbuf
[i
] = entry
->vime_obsolete_count
;
449 vim
->vim_phys
->vimp_bytes_mapped
+= size
;
450 ASSERT3U(src_offset
, >=,
451 vim
->vim_phys
->vimp_max_offset
);
452 vim
->vim_phys
->vimp_max_offset
= src_offset
+ size
;
456 vmem_free(entry
, sizeof (*entry
));
458 dmu_write(vim
->vim_objset
, vim
->vim_object
,
459 vim
->vim_phys
->vimp_num_entries
* sizeof (*mapbuf
),
460 i
* sizeof (*mapbuf
),
462 if (vim
->vim_havecounts
) {
463 dmu_write(vim
->vim_objset
,
464 vim
->vim_phys
->vimp_counts_object
,
465 vim
->vim_phys
->vimp_num_entries
*
467 i
* sizeof (*countbuf
), countbuf
, tx
);
469 vim
->vim_phys
->vimp_num_entries
+= i
;
471 vmem_free(mapbuf
, SPA_OLD_MAXBLOCKSIZE
);
472 if (vim
->vim_havecounts
)
473 vmem_free(countbuf
, SPA_OLD_MAXBLOCKSIZE
);
476 * Update the entry array to reflect the new entries. First, copy
477 * over any old entries then read back the new entries we just wrote.
479 uint64_t new_size
= vdev_indirect_mapping_size(vim
);
480 ASSERT3U(new_size
, >, old_size
);
481 ASSERT3U(new_size
- old_size
, ==,
482 entries_written
* sizeof (vdev_indirect_mapping_entry_phys_t
));
483 vim
->vim_entries
= vmem_alloc(new_size
, KM_SLEEP
);
485 memcpy(vim
->vim_entries
, old_entries
, old_size
);
486 vmem_free(old_entries
, old_size
);
488 VERIFY0(dmu_read(vim
->vim_objset
, vim
->vim_object
, old_size
,
489 new_size
- old_size
, &vim
->vim_entries
[old_count
],
492 zfs_dbgmsg("txg %llu: wrote %llu entries to "
493 "indirect mapping obj %llu; max offset=0x%llx",
494 (u_longlong_t
)dmu_tx_get_txg(tx
),
495 (u_longlong_t
)entries_written
,
496 (u_longlong_t
)vim
->vim_object
,
497 (u_longlong_t
)vim
->vim_phys
->vimp_max_offset
);
501 * Increment the relevant counts for the specified offset and length.
502 * The counts array must be obtained from
503 * vdev_indirect_mapping_load_obsolete_counts().
/*
 * Adds the range [offset, offset + length) to the per-entry obsolete
 * counters in counts, which is indexed like vim_entries.
 *
 * The entry containing offset is looked up with
 * vdev_indirect_mapping_entry_for_offset() and must exist (asserted);
 * its index is its distance from the start of vim_entries.  For an
 * entry, the portion of the range that falls inside it is
 * MIN(length, size - inner_offset), where inner_offset is the distance
 * of offset from the source offset of the entry.  That amount is added
 * to counts[index] (the counter is verified not to exceed the entry
 * size), and offset and length are advanced past it.
 *
 * NOTE(review): the declaration of index, the loop that repeats the
 * per-entry step while length remains, and the step to the following
 * entry are not visible in this listing - confirm.
 */
506 vdev_indirect_mapping_increment_obsolete_count(vdev_indirect_mapping_t
*vim
,
507 uint64_t offset
, uint64_t length
, uint32_t *counts
)
509 vdev_indirect_mapping_entry_phys_t
*mapping
;
512 mapping
= vdev_indirect_mapping_entry_for_offset(vim
, offset
);
515 ASSERT3P(mapping
, !=, NULL
);
517 index
= mapping
- vim
->vim_entries
;
520 ASSERT3U(index
, <, vdev_indirect_mapping_num_entries(vim
));
522 uint64_t size
= DVA_GET_ASIZE(&mapping
->vimep_dst
);
523 uint64_t inner_offset
= offset
-
524 DVA_MAPPING_GET_SRC_OFFSET(mapping
);
525 VERIFY3U(inner_offset
, <, size
);
526 uint64_t inner_size
= MIN(length
, size
- inner_offset
);
528 VERIFY3U(counts
[index
] + inner_size
, <=, size
);
529 counts
[index
] += inner_size
;
531 offset
+= inner_size
;
532 length
-= inner_size
;
/*
 * Argument bundle handed to load_obsolete_sm_callback() through its
 * void pointer parameter: the mapping to look offsets up in
 * (losma_vim) and the counts array to increment (losma_counts).
 */
538 typedef struct load_obsolete_space_map_arg
{
539 vdev_indirect_mapping_t
*losma_vim
;
540 uint32_t *losma_counts
;
541 } load_obsolete_space_map_arg_t
;
/*
 * Per-entry callback for space_map_iterate().  arg is a
 * load_obsolete_space_map_arg_t.  Asserts that the space map entry is of
 * type SM_ALLOC, then feeds its offset and run length to
 * vdev_indirect_mapping_increment_obsolete_count() for the mapping and
 * counts array carried in arg.
 * NOTE(review): the return type and return statement are not visible in
 * this listing.
 */
544 load_obsolete_sm_callback(space_map_entry_t
*sme
, void *arg
)
546 load_obsolete_space_map_arg_t
*losma
= arg
;
547 ASSERT3S(sme
->sme_type
, ==, SM_ALLOC
);
549 vdev_indirect_mapping_increment_obsolete_count(losma
->losma_vim
,
550 sme
->sme_offset
, sme
->sme_run
, losma
->losma_counts
);
556 * Modify the counts (increment them) based on the spacemap.
/*
 * Walks obsolete_space_sm over its full space_map_length() and, through
 * load_obsolete_sm_callback(), increments counts for every entry found.
 * The mapping and the counts array are passed to the callback in a
 * stack-allocated load_obsolete_space_map_arg_t.  A nonzero result from
 * space_map_iterate() trips VERIFY0.
 */
559 vdev_indirect_mapping_load_obsolete_spacemap(vdev_indirect_mapping_t
*vim
,
560 uint32_t *counts
, space_map_t
*obsolete_space_sm
)
562 load_obsolete_space_map_arg_t losma
;
563 losma
.losma_counts
= counts
;
564 losma
.losma_vim
= vim
;
565 VERIFY0(space_map_iterate(obsolete_space_sm
,
566 space_map_length(obsolete_space_sm
),
567 load_obsolete_sm_callback
, &losma
));
571 * Read the obsolete counts from disk, returning them in an array.
/*
 * Builds an array with one uint32_t per mapping entry
 * (vimp_num_entries * sizeof (uint32_t) bytes, from vmem_alloc with
 * KM_SLEEP).  When vim_havecounts is set the array is filled by reading
 * vimp_counts_object with dmu_read(); the memset() zero-fills it.
 *
 * NOTE(review): the return type, the offset and length arguments of
 * dmu_read(), the else keyword that presumably separates the read from
 * the memset, and the return statement are not visible in this listing.
 * Presumably the array is returned and must later be released with
 * vdev_indirect_mapping_free_obsolete_counts() - confirm.
 */
574 vdev_indirect_mapping_load_obsolete_counts(vdev_indirect_mapping_t
*vim
)
576 ASSERT(vdev_indirect_mapping_verify(vim
));
578 uint64_t counts_size
=
579 vim
->vim_phys
->vimp_num_entries
* sizeof (uint32_t);
580 uint32_t *counts
= vmem_alloc(counts_size
, KM_SLEEP
);
581 if (vim
->vim_havecounts
) {
582 VERIFY0(dmu_read(vim
->vim_objset
,
583 vim
->vim_phys
->vimp_counts_object
,
585 counts
, DMU_READ_PREFETCH
));
587 memset(counts
, 0, counts_size
);
/*
 * Releases a counts array with vmem_free().  The size passed to
 * vmem_free() is recomputed from the current vimp_num_entries, so the
 * entry count must be the same as when the array was allocated.
 * NOTE(review): the declaration of the counts parameter is not visible
 * in this listing.
 */
593 vdev_indirect_mapping_free_obsolete_counts(vdev_indirect_mapping_t
*vim
,
596 ASSERT(vdev_indirect_mapping_verify(vim
));
598 vmem_free(counts
, vim
->vim_phys
->vimp_num_entries
* sizeof (uint32_t));
/*
 * Symbol exports for the sixteen public vdev_indirect_mapping_*
 * functions defined in this file, listed in alphabetical order.
 * NOTE(review): any preprocessor guard around this list is not visible
 * in this listing.
 */
602 EXPORT_SYMBOL(vdev_indirect_mapping_add_entries
);
603 EXPORT_SYMBOL(vdev_indirect_mapping_alloc
);
604 EXPORT_SYMBOL(vdev_indirect_mapping_bytes_mapped
);
605 EXPORT_SYMBOL(vdev_indirect_mapping_close
);
606 EXPORT_SYMBOL(vdev_indirect_mapping_entry_for_offset
);
607 EXPORT_SYMBOL(vdev_indirect_mapping_entry_for_offset_or_next
);
608 EXPORT_SYMBOL(vdev_indirect_mapping_free
);
609 EXPORT_SYMBOL(vdev_indirect_mapping_free_obsolete_counts
);
610 EXPORT_SYMBOL(vdev_indirect_mapping_increment_obsolete_count
);
611 EXPORT_SYMBOL(vdev_indirect_mapping_load_obsolete_counts
);
612 EXPORT_SYMBOL(vdev_indirect_mapping_load_obsolete_spacemap
);
613 EXPORT_SYMBOL(vdev_indirect_mapping_max_offset
);
614 EXPORT_SYMBOL(vdev_indirect_mapping_num_entries
);
615 EXPORT_SYMBOL(vdev_indirect_mapping_object
);
616 EXPORT_SYMBOL(vdev_indirect_mapping_open
);
617 EXPORT_SYMBOL(vdev_indirect_mapping_size
);