Reduce dirty records memory usage
[zfs.git] / module / zfs / vdev_indirect_mapping.c
blobe92495f2dd34f5cd88f44ce49d0cdca577c398cd
1 /*
2 * CDDL HEADER START
4 * This file and its contents are supplied under the terms of the
5 * Common Development and Distribution License ("CDDL"), version 1.0.
6 * You may only use this file in accordance with the terms of version
7 * 1.0 of the CDDL.
9 * A full copy of the text of the CDDL should have accompanied this
10 * source. A copy of the CDDL is also available via the Internet at
11 * http://www.illumos.org/license/CDDL.
13 * CDDL HEADER END
17 * Copyright (c) 2015, 2017 by Delphix. All rights reserved.
20 #include <sys/dmu_tx.h>
21 #include <sys/dsl_pool.h>
22 #include <sys/spa.h>
23 #include <sys/vdev_impl.h>
24 #include <sys/vdev_indirect_mapping.h>
25 #include <sys/zfeature.h>
26 #include <sys/dmu_objset.h>
#ifdef ZFS_DEBUG
/*
 * Debug-only sanity check of an open mapping handle.  All checking is done
 * through assertions; the function always returns B_TRUE so that callers
 * can wrap the call itself in ASSERT().
 */
static boolean_t
vdev_indirect_mapping_verify(vdev_indirect_mapping_t *vim)
{
	ASSERT(vim != NULL);

	/* Every field set up when the handle is opened must be populated. */
	ASSERT(vim->vim_object != 0);
	ASSERT(vim->vim_objset != NULL);
	ASSERT(vim->vim_phys != NULL);
	ASSERT(vim->vim_dbuf != NULL);

	/* A non-zero entry count and a non-NULL entry array go together. */
	EQUIV(vim->vim_phys->vimp_num_entries > 0,
	    vim->vim_entries != NULL);

	const uint64_t nentries = vim->vim_phys->vimp_num_entries;
	if (nentries != 0) {
		/* The recorded max offset must cover the final entry. */
		vdev_indirect_mapping_entry_phys_t *last_entry __maybe_unused =
		    &vim->vim_entries[nentries - 1];
		uint64_t offset __maybe_unused =
		    DVA_MAPPING_GET_SRC_OFFSET(last_entry);
		uint64_t size __maybe_unused =
		    DVA_GET_ASIZE(&last_entry->vimep_dst);

		ASSERT3U(vim->vim_phys->vimp_max_offset, >=, offset + size);
	}

	/* Having counts requires a counts object to be recorded. */
	if (vim->vim_havecounts) {
		ASSERT(vim->vim_phys->vimp_counts_object != 0);
	}

	return (B_TRUE);
}
#else
/* Non-debug builds: reference the argument without evaluating it. */
#define	vdev_indirect_mapping_verify(vim) ((void) sizeof (vim), B_TRUE)
#endif
61 uint64_t
62 vdev_indirect_mapping_num_entries(vdev_indirect_mapping_t *vim)
64 ASSERT(vdev_indirect_mapping_verify(vim));
66 return (vim->vim_phys->vimp_num_entries);
69 uint64_t
70 vdev_indirect_mapping_max_offset(vdev_indirect_mapping_t *vim)
72 ASSERT(vdev_indirect_mapping_verify(vim));
74 return (vim->vim_phys->vimp_max_offset);
77 uint64_t
78 vdev_indirect_mapping_object(vdev_indirect_mapping_t *vim)
80 ASSERT(vdev_indirect_mapping_verify(vim));
82 return (vim->vim_object);
85 uint64_t
86 vdev_indirect_mapping_bytes_mapped(vdev_indirect_mapping_t *vim)
88 ASSERT(vdev_indirect_mapping_verify(vim));
90 return (vim->vim_phys->vimp_bytes_mapped);
94 * The length (in bytes) of the mapping object array in memory and
95 * (logically) on disk.
97 * Note that unlike most of our accessor functions,
98 * we don't assert that the struct is consistent; therefore it can be
99 * called while there may be concurrent changes, if we don't care about
100 * the value being immediately stale (e.g. from spa_removal_get_stats()).
102 uint64_t
103 vdev_indirect_mapping_size(vdev_indirect_mapping_t *vim)
105 return (vim->vim_phys->vimp_num_entries * sizeof (*vim->vim_entries));
109 * Compare an offset with an indirect mapping entry; there are three
110 * possible scenarios:
112 * 1. The offset is "less than" the mapping entry; meaning the
113 * offset is less than the source offset of the mapping entry. In
114 * this case, there is no overlap between the offset and the
115 * mapping entry and -1 will be returned.
117 * 2. The offset is "greater than" the mapping entry; meaning the
118 * offset is greater than the mapping entry's source offset plus
119 * the entry's size. In this case, there is no overlap between
120 * the offset and the mapping entry and 1 will be returned.
122 * NOTE: If the offset is actually equal to the entry's offset
123 * plus size, this is considered to be "greater" than the entry,
124 * and this case applies (i.e. 1 will be returned). Thus, the
125 * entry's "range" can be considered to be inclusive at its
126 * start, but exclusive at its end: e.g. [src, src + size).
128 * 3. The last case to consider is if the offset actually falls
129 * within the mapping entry's range. If this is the case, the
130 * offset is considered to be "equal to" the mapping entry and
131 * 0 will be returned.
133 * NOTE: If the offset is equal to the entry's source offset,
134 * this case applies and 0 will be returned. If the offset is
135 * equal to the entry's source plus its size, this case does
136 * *not* apply (see "NOTE" above for scenario 2), and 1 will be
137 * returned.
139 static int
140 dva_mapping_overlap_compare(const void *v_key, const void *v_array_elem)
142 const uint64_t * const key = v_key;
143 const vdev_indirect_mapping_entry_phys_t * const array_elem =
144 v_array_elem;
145 uint64_t src_offset = DVA_MAPPING_GET_SRC_OFFSET(array_elem);
147 if (*key < src_offset) {
148 return (-1);
149 } else if (*key < src_offset + DVA_GET_ASIZE(&array_elem->vimep_dst)) {
150 return (0);
151 } else {
152 return (1);
157 * Returns the mapping entry for the given offset.
159 * It's possible that the given offset will not be in the mapping table
160 * (i.e. no mapping entries contain this offset), in which case, the
161 * return value value depends on the "next_if_missing" parameter.
163 * If the offset is not found in the table and "next_if_missing" is
164 * B_FALSE, then NULL will always be returned. The behavior is intended
165 * to allow consumers to get the entry corresponding to the offset
166 * parameter, iff the offset overlaps with an entry in the table.
168 * If the offset is not found in the table and "next_if_missing" is
169 * B_TRUE, then the entry nearest to the given offset will be returned,
170 * such that the entry's source offset is greater than the offset
171 * passed in (i.e. the "next" mapping entry in the table is returned, if
172 * the offset is missing from the table). If there are no entries whose
173 * source offset is greater than the passed in offset, NULL is returned.
175 static vdev_indirect_mapping_entry_phys_t *
176 vdev_indirect_mapping_entry_for_offset_impl(vdev_indirect_mapping_t *vim,
177 uint64_t offset, boolean_t next_if_missing)
179 ASSERT(vdev_indirect_mapping_verify(vim));
180 ASSERT(vim->vim_phys->vimp_num_entries > 0);
182 vdev_indirect_mapping_entry_phys_t *entry = NULL;
184 uint64_t last = vim->vim_phys->vimp_num_entries - 1;
185 uint64_t base = 0;
188 * We don't define these inside of the while loop because we use
189 * their value in the case that offset isn't in the mapping.
191 uint64_t mid;
192 int result;
194 while (last >= base) {
195 mid = base + ((last - base) >> 1);
197 result = dva_mapping_overlap_compare(&offset,
198 &vim->vim_entries[mid]);
200 if (result == 0) {
201 entry = &vim->vim_entries[mid];
202 break;
203 } else if (result < 0) {
204 last = mid - 1;
205 } else {
206 base = mid + 1;
210 if (entry == NULL && next_if_missing) {
211 ASSERT3U(base, ==, last + 1);
212 ASSERT(mid == base || mid == last);
213 ASSERT3S(result, !=, 0);
216 * The offset we're looking for isn't actually contained
217 * in the mapping table, thus we need to return the
218 * closest mapping entry that is greater than the
219 * offset. We reuse the result of the last comparison,
220 * comparing the mapping entry at index "mid" and the
221 * offset. The offset is guaranteed to lie between
222 * indices one less than "mid", and one greater than
223 * "mid"; we just need to determine if offset is greater
224 * than, or less than the mapping entry contained at
225 * index "mid".
228 uint64_t index;
229 if (result < 0)
230 index = mid;
231 else
232 index = mid + 1;
234 ASSERT3U(index, <=, vim->vim_phys->vimp_num_entries);
236 if (index == vim->vim_phys->vimp_num_entries) {
238 * If "index" is past the end of the entries
239 * array, then not only is the offset not in the
240 * mapping table, but it's actually greater than
241 * all entries in the table. In this case, we
242 * can't return a mapping entry greater than the
243 * offset (since none exist), so we return NULL.
246 ASSERT3S(dva_mapping_overlap_compare(&offset,
247 &vim->vim_entries[index - 1]), >, 0);
249 return (NULL);
250 } else {
252 * Just to be safe, we verify the offset falls
253 * in between the mapping entries at index and
254 * one less than index. Since we know the offset
255 * doesn't overlap an entry, and we're supposed
256 * to return the entry just greater than the
257 * offset, both of the following tests must be
258 * true.
260 ASSERT3S(dva_mapping_overlap_compare(&offset,
261 &vim->vim_entries[index]), <, 0);
262 IMPLY(index >= 1, dva_mapping_overlap_compare(&offset,
263 &vim->vim_entries[index - 1]) > 0);
265 return (&vim->vim_entries[index]);
267 } else {
268 return (entry);
272 vdev_indirect_mapping_entry_phys_t *
273 vdev_indirect_mapping_entry_for_offset(vdev_indirect_mapping_t *vim,
274 uint64_t offset)
276 return (vdev_indirect_mapping_entry_for_offset_impl(vim, offset,
277 B_FALSE));
280 vdev_indirect_mapping_entry_phys_t *
281 vdev_indirect_mapping_entry_for_offset_or_next(vdev_indirect_mapping_t *vim,
282 uint64_t offset)
284 return (vdev_indirect_mapping_entry_for_offset_impl(vim, offset,
285 B_TRUE));
288 void
289 vdev_indirect_mapping_close(vdev_indirect_mapping_t *vim)
291 ASSERT(vdev_indirect_mapping_verify(vim));
293 if (vim->vim_phys->vimp_num_entries > 0) {
294 uint64_t map_size = vdev_indirect_mapping_size(vim);
295 vmem_free(vim->vim_entries, map_size);
296 vim->vim_entries = NULL;
299 dmu_buf_rele(vim->vim_dbuf, vim);
301 vim->vim_objset = NULL;
302 vim->vim_object = 0;
303 vim->vim_dbuf = NULL;
304 vim->vim_phys = NULL;
306 kmem_free(vim, sizeof (*vim));
309 uint64_t
310 vdev_indirect_mapping_alloc(objset_t *os, dmu_tx_t *tx)
312 uint64_t object;
313 ASSERT(dmu_tx_is_syncing(tx));
314 uint64_t bonus_size = VDEV_INDIRECT_MAPPING_SIZE_V0;
316 if (spa_feature_is_enabled(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
317 bonus_size = sizeof (vdev_indirect_mapping_phys_t);
320 object = dmu_object_alloc(os,
321 DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE,
322 DMU_OTN_UINT64_METADATA, bonus_size,
323 tx);
325 if (spa_feature_is_enabled(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
326 dmu_buf_t *dbuf;
327 vdev_indirect_mapping_phys_t *vimp;
329 VERIFY0(dmu_bonus_hold(os, object, FTAG, &dbuf));
330 dmu_buf_will_dirty(dbuf, tx);
331 vimp = dbuf->db_data;
332 vimp->vimp_counts_object = dmu_object_alloc(os,
333 DMU_OTN_UINT32_METADATA, SPA_OLD_MAXBLOCKSIZE,
334 DMU_OT_NONE, 0, tx);
335 spa_feature_incr(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
336 dmu_buf_rele(dbuf, FTAG);
339 return (object);
343 vdev_indirect_mapping_t *
344 vdev_indirect_mapping_open(objset_t *os, uint64_t mapping_object)
346 vdev_indirect_mapping_t *vim = kmem_zalloc(sizeof (*vim), KM_SLEEP);
347 dmu_object_info_t doi;
348 VERIFY0(dmu_object_info(os, mapping_object, &doi));
350 vim->vim_objset = os;
351 vim->vim_object = mapping_object;
353 VERIFY0(dmu_bonus_hold(os, vim->vim_object, vim,
354 &vim->vim_dbuf));
355 vim->vim_phys = vim->vim_dbuf->db_data;
357 vim->vim_havecounts =
358 (doi.doi_bonus_size > VDEV_INDIRECT_MAPPING_SIZE_V0);
360 if (vim->vim_phys->vimp_num_entries > 0) {
361 uint64_t map_size = vdev_indirect_mapping_size(vim);
362 vim->vim_entries = vmem_alloc(map_size, KM_SLEEP);
363 VERIFY0(dmu_read(os, vim->vim_object, 0, map_size,
364 vim->vim_entries, DMU_READ_PREFETCH));
367 ASSERT(vdev_indirect_mapping_verify(vim));
369 return (vim);
372 void
373 vdev_indirect_mapping_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
375 vdev_indirect_mapping_t *vim = vdev_indirect_mapping_open(os, object);
376 if (vim->vim_havecounts) {
377 VERIFY0(dmu_object_free(os, vim->vim_phys->vimp_counts_object,
378 tx));
379 spa_feature_decr(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
381 vdev_indirect_mapping_close(vim);
383 VERIFY0(dmu_object_free(os, object, tx));
387 * Append the list of vdev_indirect_mapping_entry_t's to the on-disk
388 * mapping object. Also remove the entries from the list and free them.
389 * This also implicitly extends the max_offset of the mapping (to the end
390 * of the last entry).
392 void
393 vdev_indirect_mapping_add_entries(vdev_indirect_mapping_t *vim,
394 list_t *list, dmu_tx_t *tx)
396 vdev_indirect_mapping_entry_phys_t *mapbuf;
397 uint64_t old_size;
398 uint32_t *countbuf = NULL;
399 vdev_indirect_mapping_entry_phys_t *old_entries;
400 uint64_t old_count;
401 uint64_t entries_written = 0;
403 ASSERT(vdev_indirect_mapping_verify(vim));
404 ASSERT(dmu_tx_is_syncing(tx));
405 ASSERT(dsl_pool_sync_context(dmu_tx_pool(tx)));
406 ASSERT(!list_is_empty(list));
408 old_size = vdev_indirect_mapping_size(vim);
409 old_entries = vim->vim_entries;
410 old_count = vim->vim_phys->vimp_num_entries;
412 dmu_buf_will_dirty(vim->vim_dbuf, tx);
414 mapbuf = vmem_alloc(SPA_OLD_MAXBLOCKSIZE, KM_SLEEP);
415 if (vim->vim_havecounts) {
416 countbuf = vmem_alloc(SPA_OLD_MAXBLOCKSIZE, KM_SLEEP);
417 ASSERT(spa_feature_is_active(vim->vim_objset->os_spa,
418 SPA_FEATURE_OBSOLETE_COUNTS));
420 while (!list_is_empty(list)) {
421 uint64_t i;
423 * Write entries from the list to the
424 * vdev_im_object in batches of size SPA_OLD_MAXBLOCKSIZE.
426 for (i = 0; i < SPA_OLD_MAXBLOCKSIZE / sizeof (*mapbuf); i++) {
427 vdev_indirect_mapping_entry_t *entry =
428 list_remove_head(list);
429 if (entry == NULL)
430 break;
432 uint64_t size =
433 DVA_GET_ASIZE(&entry->vime_mapping.vimep_dst);
434 uint64_t src_offset =
435 DVA_MAPPING_GET_SRC_OFFSET(&entry->vime_mapping);
438 * We shouldn't be adding an entry which is fully
439 * obsolete.
441 ASSERT3U(entry->vime_obsolete_count, <, size);
442 IMPLY(entry->vime_obsolete_count != 0,
443 vim->vim_havecounts);
445 mapbuf[i] = entry->vime_mapping;
446 if (vim->vim_havecounts)
447 countbuf[i] = entry->vime_obsolete_count;
449 vim->vim_phys->vimp_bytes_mapped += size;
450 ASSERT3U(src_offset, >=,
451 vim->vim_phys->vimp_max_offset);
452 vim->vim_phys->vimp_max_offset = src_offset + size;
454 entries_written++;
456 vmem_free(entry, sizeof (*entry));
458 dmu_write(vim->vim_objset, vim->vim_object,
459 vim->vim_phys->vimp_num_entries * sizeof (*mapbuf),
460 i * sizeof (*mapbuf),
461 mapbuf, tx);
462 if (vim->vim_havecounts) {
463 dmu_write(vim->vim_objset,
464 vim->vim_phys->vimp_counts_object,
465 vim->vim_phys->vimp_num_entries *
466 sizeof (*countbuf),
467 i * sizeof (*countbuf), countbuf, tx);
469 vim->vim_phys->vimp_num_entries += i;
471 vmem_free(mapbuf, SPA_OLD_MAXBLOCKSIZE);
472 if (vim->vim_havecounts)
473 vmem_free(countbuf, SPA_OLD_MAXBLOCKSIZE);
476 * Update the entry array to reflect the new entries. First, copy
477 * over any old entries then read back the new entries we just wrote.
479 uint64_t new_size = vdev_indirect_mapping_size(vim);
480 ASSERT3U(new_size, >, old_size);
481 ASSERT3U(new_size - old_size, ==,
482 entries_written * sizeof (vdev_indirect_mapping_entry_phys_t));
483 vim->vim_entries = vmem_alloc(new_size, KM_SLEEP);
484 if (old_size > 0) {
485 memcpy(vim->vim_entries, old_entries, old_size);
486 vmem_free(old_entries, old_size);
488 VERIFY0(dmu_read(vim->vim_objset, vim->vim_object, old_size,
489 new_size - old_size, &vim->vim_entries[old_count],
490 DMU_READ_PREFETCH));
492 zfs_dbgmsg("txg %llu: wrote %llu entries to "
493 "indirect mapping obj %llu; max offset=0x%llx",
494 (u_longlong_t)dmu_tx_get_txg(tx),
495 (u_longlong_t)entries_written,
496 (u_longlong_t)vim->vim_object,
497 (u_longlong_t)vim->vim_phys->vimp_max_offset);
501 * Increment the relevant counts for the specified offset and length.
502 * The counts array must be obtained from
503 * vdev_indirect_mapping_load_obsolete_counts().
505 void
506 vdev_indirect_mapping_increment_obsolete_count(vdev_indirect_mapping_t *vim,
507 uint64_t offset, uint64_t length, uint32_t *counts)
509 vdev_indirect_mapping_entry_phys_t *mapping;
510 uint64_t index;
512 mapping = vdev_indirect_mapping_entry_for_offset(vim, offset);
514 ASSERT(length > 0);
515 ASSERT3P(mapping, !=, NULL);
517 index = mapping - vim->vim_entries;
519 while (length > 0) {
520 ASSERT3U(index, <, vdev_indirect_mapping_num_entries(vim));
522 uint64_t size = DVA_GET_ASIZE(&mapping->vimep_dst);
523 uint64_t inner_offset = offset -
524 DVA_MAPPING_GET_SRC_OFFSET(mapping);
525 VERIFY3U(inner_offset, <, size);
526 uint64_t inner_size = MIN(length, size - inner_offset);
528 VERIFY3U(counts[index] + inner_size, <=, size);
529 counts[index] += inner_size;
531 offset += inner_size;
532 length -= inner_size;
533 mapping++;
534 index++;
/*
 * Argument bundle handed to load_obsolete_sm_callback() through
 * space_map_iterate().
 */
typedef struct load_obsolete_space_map_arg {
	/* Mapping whose entries the space map segments are matched against. */
	vdev_indirect_mapping_t	*losma_vim;
	/* Counts array that the callback increments. */
	uint32_t		*losma_counts;
} load_obsolete_space_map_arg_t;
543 static int
544 load_obsolete_sm_callback(space_map_entry_t *sme, void *arg)
546 load_obsolete_space_map_arg_t *losma = arg;
547 ASSERT3S(sme->sme_type, ==, SM_ALLOC);
549 vdev_indirect_mapping_increment_obsolete_count(losma->losma_vim,
550 sme->sme_offset, sme->sme_run, losma->losma_counts);
552 return (0);
556 * Modify the counts (increment them) based on the spacemap.
558 void
559 vdev_indirect_mapping_load_obsolete_spacemap(vdev_indirect_mapping_t *vim,
560 uint32_t *counts, space_map_t *obsolete_space_sm)
562 load_obsolete_space_map_arg_t losma;
563 losma.losma_counts = counts;
564 losma.losma_vim = vim;
565 VERIFY0(space_map_iterate(obsolete_space_sm,
566 space_map_length(obsolete_space_sm),
567 load_obsolete_sm_callback, &losma));
571 * Read the obsolete counts from disk, returning them in an array.
573 uint32_t *
574 vdev_indirect_mapping_load_obsolete_counts(vdev_indirect_mapping_t *vim)
576 ASSERT(vdev_indirect_mapping_verify(vim));
578 uint64_t counts_size =
579 vim->vim_phys->vimp_num_entries * sizeof (uint32_t);
580 uint32_t *counts = vmem_alloc(counts_size, KM_SLEEP);
581 if (vim->vim_havecounts) {
582 VERIFY0(dmu_read(vim->vim_objset,
583 vim->vim_phys->vimp_counts_object,
584 0, counts_size,
585 counts, DMU_READ_PREFETCH));
586 } else {
587 memset(counts, 0, counts_size);
589 return (counts);
592 extern void
593 vdev_indirect_mapping_free_obsolete_counts(vdev_indirect_mapping_t *vim,
594 uint32_t *counts)
596 ASSERT(vdev_indirect_mapping_verify(vim));
598 vmem_free(counts, vim->vim_phys->vimp_num_entries * sizeof (uint32_t));
/*
 * Export the public functions of this file; only compiled when _KERNEL is
 * defined.
 */
#if defined(_KERNEL)
EXPORT_SYMBOL(vdev_indirect_mapping_add_entries);
EXPORT_SYMBOL(vdev_indirect_mapping_alloc);
EXPORT_SYMBOL(vdev_indirect_mapping_bytes_mapped);
EXPORT_SYMBOL(vdev_indirect_mapping_close);
EXPORT_SYMBOL(vdev_indirect_mapping_entry_for_offset);
EXPORT_SYMBOL(vdev_indirect_mapping_entry_for_offset_or_next);
EXPORT_SYMBOL(vdev_indirect_mapping_free);
EXPORT_SYMBOL(vdev_indirect_mapping_free_obsolete_counts);
EXPORT_SYMBOL(vdev_indirect_mapping_increment_obsolete_count);
EXPORT_SYMBOL(vdev_indirect_mapping_load_obsolete_counts);
EXPORT_SYMBOL(vdev_indirect_mapping_load_obsolete_spacemap);
EXPORT_SYMBOL(vdev_indirect_mapping_max_offset);
EXPORT_SYMBOL(vdev_indirect_mapping_num_entries);
EXPORT_SYMBOL(vdev_indirect_mapping_object);
EXPORT_SYMBOL(vdev_indirect_mapping_open);
EXPORT_SYMBOL(vdev_indirect_mapping_size);
#endif