/*
 * module/zfs/btree.c
 * (web-viewer residue: blob f0a9222a430818e2ab99d0fb4a7a9bb841a54226;
 * unrelated page title was "FreeBSD: Fix a pair of bugs in zfs_fhtovp()")
 */
/*
 * CDDL HEADER START
 *
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2019 by Delphix. All rights reserved.
 */
19 #include <sys/btree.h>
20 #include <sys/bitops.h>
21 #include <sys/zfs_context.h>
23 kmem_cache_t *zfs_btree_leaf_cache;
26 * Control the extent of the verification that occurs when zfs_btree_verify is
27 * called. Primarily used for debugging when extending the btree logic and
28 * functionality. As the intensity is increased, new verification steps are
29 * added. These steps are cumulative; intensity = 3 includes the intensity = 1
30 * and intensity = 2 steps as well.
32 * Intensity 1: Verify that the tree's height is consistent throughout.
33 * Intensity 2: Verify that a core node's children's parent pointers point
34 * to the core node.
35 * Intensity 3: Verify that the total number of elements in the tree matches the
36 * sum of the number of elements in each node. Also verifies that each node's
37 * count obeys the invariants (less than or equal to maximum value, greater than
38 * or equal to half the maximum minus one).
39 * Intensity 4: Verify that each element compares less than the element
40 * immediately after it and greater than the one immediately before it using the
41 * comparator function. For core nodes, also checks that each element is greater
42 * than the last element in the first of the two nodes it separates, and less
43 * than the first element in the second of the two nodes.
44 * Intensity 5: Verifies, if ZFS_DEBUG is defined, that all unused memory inside
45 * of each node is poisoned appropriately. Note that poisoning always occurs if
46 * ZFS_DEBUG is set, so it is safe to set the intensity to 5 during normal
47 * operation.
49 * Intensity 4 and 5 are particularly expensive to perform; the previous levels
50 * are a few memory operations per node, while these levels require multiple
51 * operations per element. In addition, when creating large btrees, these
52 * operations are called at every step, resulting in extremely slow operation
53 * (while the asymptotic complexity of the other steps is the same, the
54 * importance of the constant factors cannot be denied).
56 uint_t zfs_btree_verify_intensity = 0;
59 * Convenience functions to silence warnings from memcpy/memmove's
60 * return values and change argument order to src, dest.
62 static void
63 bcpy(const void *src, void *dest, size_t size)
65 (void) memcpy(dest, src, size);
68 static void
69 bmov(const void *src, void *dest, size_t size)
71 (void) memmove(dest, src, size);
74 static boolean_t
75 zfs_btree_is_core(struct zfs_btree_hdr *hdr)
77 return (hdr->bth_first == -1);
80 #ifdef _ILP32
81 #define BTREE_POISON 0xabadb10c
82 #else
83 #define BTREE_POISON 0xabadb10cdeadbeef
84 #endif
86 static void
87 zfs_btree_poison_node(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
89 #ifdef ZFS_DEBUG
90 size_t size = tree->bt_elem_size;
91 if (zfs_btree_is_core(hdr)) {
92 zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
93 for (uint32_t i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS;
94 i++) {
95 node->btc_children[i] =
96 (zfs_btree_hdr_t *)BTREE_POISON;
98 (void) memset(node->btc_elems + hdr->bth_count * size, 0x0f,
99 (BTREE_CORE_ELEMS - hdr->bth_count) * size);
100 } else {
101 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
102 (void) memset(leaf->btl_elems, 0x0f, hdr->bth_first * size);
103 (void) memset(leaf->btl_elems +
104 (hdr->bth_first + hdr->bth_count) * size, 0x0f,
105 BTREE_LEAF_ESIZE -
106 (hdr->bth_first + hdr->bth_count) * size);
108 #endif
111 static inline void
112 zfs_btree_poison_node_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
113 uint32_t idx, uint32_t count)
115 #ifdef ZFS_DEBUG
116 size_t size = tree->bt_elem_size;
117 if (zfs_btree_is_core(hdr)) {
118 ASSERT3U(idx, >=, hdr->bth_count);
119 ASSERT3U(idx, <=, BTREE_CORE_ELEMS);
120 ASSERT3U(idx + count, <=, BTREE_CORE_ELEMS);
121 zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
122 for (uint32_t i = 1; i <= count; i++) {
123 node->btc_children[idx + i] =
124 (zfs_btree_hdr_t *)BTREE_POISON;
126 (void) memset(node->btc_elems + idx * size, 0x0f, count * size);
127 } else {
128 ASSERT3U(idx, <=, tree->bt_leaf_cap);
129 ASSERT3U(idx + count, <=, tree->bt_leaf_cap);
130 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
131 (void) memset(leaf->btl_elems +
132 (hdr->bth_first + idx) * size, 0x0f, count * size);
134 #endif
137 static inline void
138 zfs_btree_verify_poison_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
139 uint32_t idx)
141 #ifdef ZFS_DEBUG
142 size_t size = tree->bt_elem_size;
143 if (zfs_btree_is_core(hdr)) {
144 ASSERT3U(idx, <, BTREE_CORE_ELEMS);
145 zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
146 zfs_btree_hdr_t *cval = (zfs_btree_hdr_t *)BTREE_POISON;
147 VERIFY3P(node->btc_children[idx + 1], ==, cval);
148 for (size_t i = 0; i < size; i++)
149 VERIFY3U(node->btc_elems[idx * size + i], ==, 0x0f);
150 } else {
151 ASSERT3U(idx, <, tree->bt_leaf_cap);
152 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
153 if (idx >= tree->bt_leaf_cap - hdr->bth_first)
154 return;
155 for (size_t i = 0; i < size; i++) {
156 VERIFY3U(leaf->btl_elems[(hdr->bth_first + idx)
157 * size + i], ==, 0x0f);
160 #endif
163 void
164 zfs_btree_init(void)
166 zfs_btree_leaf_cache = kmem_cache_create("zfs_btree_leaf_cache",
167 BTREE_LEAF_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
170 void
171 zfs_btree_fini(void)
173 kmem_cache_destroy(zfs_btree_leaf_cache);
176 void
177 zfs_btree_create(zfs_btree_t *tree, int (*compar) (const void *, const void *),
178 size_t size)
180 ASSERT3U(size, <=, BTREE_LEAF_ESIZE / 2);
182 memset(tree, 0, sizeof (*tree));
183 tree->bt_compar = compar;
184 tree->bt_elem_size = size;
185 tree->bt_leaf_cap = P2ALIGN(BTREE_LEAF_ESIZE / size, 2);
186 tree->bt_height = -1;
187 tree->bt_bulk = NULL;
191 * Find value in the array of elements provided. Uses a simple binary search.
193 static void *
194 zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems,
195 const void *value, zfs_btree_index_t *where)
197 uint32_t max = nelems;
198 uint32_t min = 0;
199 while (max > min) {
200 uint32_t idx = (min + max) / 2;
201 uint8_t *cur = buf + idx * tree->bt_elem_size;
202 int comp = tree->bt_compar(cur, value);
203 if (comp < 0) {
204 min = idx + 1;
205 } else if (comp > 0) {
206 max = idx;
207 } else {
208 where->bti_offset = idx;
209 where->bti_before = B_FALSE;
210 return (cur);
214 where->bti_offset = max;
215 where->bti_before = B_TRUE;
216 return (NULL);
220 * Find the given value in the tree. where may be passed as null to use as a
221 * membership test or if the btree is being used as a map.
223 void *
224 zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where)
226 if (tree->bt_height == -1) {
227 if (where != NULL) {
228 where->bti_node = NULL;
229 where->bti_offset = 0;
231 ASSERT0(tree->bt_num_elems);
232 return (NULL);
236 * If we're in bulk-insert mode, we check the last spot in the tree
237 * and the last leaf in the tree before doing the normal search,
238 * because for most workloads the vast majority of finds in
239 * bulk-insert mode are to insert new elements.
241 zfs_btree_index_t idx;
242 size_t size = tree->bt_elem_size;
243 if (tree->bt_bulk != NULL) {
244 zfs_btree_leaf_t *last_leaf = tree->bt_bulk;
245 int comp = tree->bt_compar(last_leaf->btl_elems +
246 (last_leaf->btl_hdr.bth_first +
247 last_leaf->btl_hdr.bth_count - 1) * size, value);
248 if (comp < 0) {
250 * If what they're looking for is after the last
251 * element, it's not in the tree.
253 if (where != NULL) {
254 where->bti_node = (zfs_btree_hdr_t *)last_leaf;
255 where->bti_offset =
256 last_leaf->btl_hdr.bth_count;
257 where->bti_before = B_TRUE;
259 return (NULL);
260 } else if (comp == 0) {
261 if (where != NULL) {
262 where->bti_node = (zfs_btree_hdr_t *)last_leaf;
263 where->bti_offset =
264 last_leaf->btl_hdr.bth_count - 1;
265 where->bti_before = B_FALSE;
267 return (last_leaf->btl_elems +
268 (last_leaf->btl_hdr.bth_first +
269 last_leaf->btl_hdr.bth_count - 1) * size);
271 if (tree->bt_compar(last_leaf->btl_elems +
272 last_leaf->btl_hdr.bth_first * size, value) <= 0) {
274 * If what they're looking for is after the first
275 * element in the last leaf, it's in the last leaf or
276 * it's not in the tree.
278 void *d = zfs_btree_find_in_buf(tree,
279 last_leaf->btl_elems +
280 last_leaf->btl_hdr.bth_first * size,
281 last_leaf->btl_hdr.bth_count, value, &idx);
283 if (where != NULL) {
284 idx.bti_node = (zfs_btree_hdr_t *)last_leaf;
285 *where = idx;
287 return (d);
291 zfs_btree_core_t *node = NULL;
292 uint32_t child = 0;
293 uint64_t depth = 0;
296 * Iterate down the tree, finding which child the value should be in
297 * by comparing with the separators.
299 for (node = (zfs_btree_core_t *)tree->bt_root; depth < tree->bt_height;
300 node = (zfs_btree_core_t *)node->btc_children[child], depth++) {
301 ASSERT3P(node, !=, NULL);
302 void *d = zfs_btree_find_in_buf(tree, node->btc_elems,
303 node->btc_hdr.bth_count, value, &idx);
304 EQUIV(d != NULL, !idx.bti_before);
305 if (d != NULL) {
306 if (where != NULL) {
307 idx.bti_node = (zfs_btree_hdr_t *)node;
308 *where = idx;
310 return (d);
312 ASSERT(idx.bti_before);
313 child = idx.bti_offset;
317 * The value is in this leaf, or it would be if it were in the
318 * tree. Find its proper location and return it.
320 zfs_btree_leaf_t *leaf = (depth == 0 ?
321 (zfs_btree_leaf_t *)tree->bt_root : (zfs_btree_leaf_t *)node);
322 void *d = zfs_btree_find_in_buf(tree, leaf->btl_elems +
323 leaf->btl_hdr.bth_first * size,
324 leaf->btl_hdr.bth_count, value, &idx);
326 if (where != NULL) {
327 idx.bti_node = (zfs_btree_hdr_t *)leaf;
328 *where = idx;
331 return (d);
/*
 * To explain the following functions, it is useful to understand the four
 * kinds of shifts used in btree operation. First, a shift is a movement of
 * elements within a node. It is used to create gaps for inserting new
 * elements and children, or cover gaps created when things are removed. A
 * shift has two fundamental properties, each of which can be one of two
 * values, making four types of shifts. There is the direction of the shift
 * (left or right) and the shape of the shift (parallelogram or isoceles
 * trapezoid (shortened to trapezoid hereafter)). The shape distinction only
 * applies to shifts of core nodes.
 *
 * The names derive from the following imagining of the layout of a node:
 *
 *  Elements:     *   *   *   *   *   *   *   ...   *   *   *
 *  Children:   *   *   *   *   *   *   *   *   ...   *   *   *
 *
 * This layout follows from the fact that the elements act as separators
 * between pairs of children, and that children root subtrees "below" the
 * current node. A left and right shift are fairly self-explanatory; a left
 * shift moves things to the left, while a right shift moves things to the
 * right. A parallelogram shift is a shift with the same number of elements
 * and children being moved, while a trapezoid shift is a shift that moves one
 * more children than elements. An example follows:
 *
 * A parallelogram shift could contain the following:
 *      _______________
 *      \*   *   *   * \ *   *   *   ...   *   *   *
 *    * \ *   *   *   *\  *   *   *   ...   *   *   *
 *      ---------------
 * A trapezoid shift could contain the following:
 *          ___________
 *       * / *   *   * \ *   *   *   ...   *   *   *
 *      *  / *   *   *  \ *   *   *   ...   *   *   *
 *      ---------------
 *
 * Note that a parallelogram shift is always shaped like a "left-leaning"
 * parallelogram, where the starting index of the children being moved is
 * always one higher than the starting index of the elements being moved. No
 * "right-leaning" parallelogram shifts are needed (shifts where the starting
 * element index and starting child index being moved are the same) to achieve
 * any btree operations, so we ignore them.
 */
377 enum bt_shift_shape {
378 BSS_TRAPEZOID,
379 BSS_PARALLELOGRAM
382 enum bt_shift_direction {
383 BSD_LEFT,
384 BSD_RIGHT
388 * Shift elements and children in the provided core node by off spots. The
389 * first element moved is idx, and count elements are moved. The shape of the
390 * shift is determined by shape. The direction is determined by dir.
392 static inline void
393 bt_shift_core(zfs_btree_t *tree, zfs_btree_core_t *node, uint32_t idx,
394 uint32_t count, uint32_t off, enum bt_shift_shape shape,
395 enum bt_shift_direction dir)
397 size_t size = tree->bt_elem_size;
398 ASSERT(zfs_btree_is_core(&node->btc_hdr));
400 uint8_t *e_start = node->btc_elems + idx * size;
401 uint8_t *e_out = (dir == BSD_LEFT ? e_start - off * size :
402 e_start + off * size);
403 bmov(e_start, e_out, count * size);
405 zfs_btree_hdr_t **c_start = node->btc_children + idx +
406 (shape == BSS_TRAPEZOID ? 0 : 1);
407 zfs_btree_hdr_t **c_out = (dir == BSD_LEFT ? c_start - off :
408 c_start + off);
409 uint32_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0);
410 bmov(c_start, c_out, c_count * sizeof (*c_start));
414 * Shift elements and children in the provided core node left by one spot.
415 * The first element moved is idx, and count elements are moved. The
416 * shape of the shift is determined by trap; true if the shift is a trapezoid,
417 * false if it is a parallelogram.
419 static inline void
420 bt_shift_core_left(zfs_btree_t *tree, zfs_btree_core_t *node, uint32_t idx,
421 uint32_t count, enum bt_shift_shape shape)
423 bt_shift_core(tree, node, idx, count, 1, shape, BSD_LEFT);
427 * Shift elements and children in the provided core node right by one spot.
428 * Starts with elements[idx] and children[idx] and one more child than element.
430 static inline void
431 bt_shift_core_right(zfs_btree_t *tree, zfs_btree_core_t *node, uint32_t idx,
432 uint32_t count, enum bt_shift_shape shape)
434 bt_shift_core(tree, node, idx, count, 1, shape, BSD_RIGHT);
438 * Shift elements and children in the provided leaf node by off spots.
439 * The first element moved is idx, and count elements are moved. The direction
440 * is determined by left.
442 static inline void
443 bt_shift_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *node, uint32_t idx,
444 uint32_t count, uint32_t off, enum bt_shift_direction dir)
446 size_t size = tree->bt_elem_size;
447 zfs_btree_hdr_t *hdr = &node->btl_hdr;
448 ASSERT(!zfs_btree_is_core(hdr));
450 if (count == 0)
451 return;
452 uint8_t *start = node->btl_elems + (hdr->bth_first + idx) * size;
453 uint8_t *out = (dir == BSD_LEFT ? start - off * size :
454 start + off * size);
455 bmov(start, out, count * size);
459 * Grow leaf for n new elements before idx.
461 static void
462 bt_grow_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint32_t idx,
463 uint32_t n)
465 zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
466 ASSERT(!zfs_btree_is_core(hdr));
467 ASSERT3U(idx, <=, hdr->bth_count);
468 uint32_t capacity = tree->bt_leaf_cap;
469 ASSERT3U(hdr->bth_count + n, <=, capacity);
470 boolean_t cl = (hdr->bth_first >= n);
471 boolean_t cr = (hdr->bth_first + hdr->bth_count + n <= capacity);
473 if (cl && (!cr || idx <= hdr->bth_count / 2)) {
474 /* Grow left. */
475 hdr->bth_first -= n;
476 bt_shift_leaf(tree, leaf, n, idx, n, BSD_LEFT);
477 } else if (cr) {
478 /* Grow right. */
479 bt_shift_leaf(tree, leaf, idx, hdr->bth_count - idx, n,
480 BSD_RIGHT);
481 } else {
482 /* Grow both ways. */
483 uint32_t fn = hdr->bth_first -
484 (capacity - (hdr->bth_count + n)) / 2;
485 hdr->bth_first -= fn;
486 bt_shift_leaf(tree, leaf, fn, idx, fn, BSD_LEFT);
487 bt_shift_leaf(tree, leaf, fn + idx, hdr->bth_count - idx,
488 n - fn, BSD_RIGHT);
490 hdr->bth_count += n;
494 * Shrink leaf for count elements starting from idx.
496 static void
497 bt_shrink_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint32_t idx,
498 uint32_t n)
500 zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
501 ASSERT(!zfs_btree_is_core(hdr));
502 ASSERT3U(idx, <=, hdr->bth_count);
503 ASSERT3U(idx + n, <=, hdr->bth_count);
505 if (idx <= (hdr->bth_count - n) / 2) {
506 bt_shift_leaf(tree, leaf, 0, idx, n, BSD_RIGHT);
507 zfs_btree_poison_node_at(tree, hdr, 0, n);
508 hdr->bth_first += n;
509 } else {
510 bt_shift_leaf(tree, leaf, idx + n, hdr->bth_count - idx - n, n,
511 BSD_LEFT);
512 zfs_btree_poison_node_at(tree, hdr, hdr->bth_count - n, n);
514 hdr->bth_count -= n;
518 * Move children and elements from one core node to another. The shape
519 * parameter behaves the same as it does in the shift logic.
521 static inline void
522 bt_transfer_core(zfs_btree_t *tree, zfs_btree_core_t *source, uint32_t sidx,
523 uint32_t count, zfs_btree_core_t *dest, uint32_t didx,
524 enum bt_shift_shape shape)
526 size_t size = tree->bt_elem_size;
527 ASSERT(zfs_btree_is_core(&source->btc_hdr));
528 ASSERT(zfs_btree_is_core(&dest->btc_hdr));
530 bcpy(source->btc_elems + sidx * size, dest->btc_elems + didx * size,
531 count * size);
533 uint32_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0);
534 bcpy(source->btc_children + sidx + (shape == BSS_TRAPEZOID ? 0 : 1),
535 dest->btc_children + didx + (shape == BSS_TRAPEZOID ? 0 : 1),
536 c_count * sizeof (*source->btc_children));
539 static inline void
540 bt_transfer_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *source, uint32_t sidx,
541 uint32_t count, zfs_btree_leaf_t *dest, uint32_t didx)
543 size_t size = tree->bt_elem_size;
544 ASSERT(!zfs_btree_is_core(&source->btl_hdr));
545 ASSERT(!zfs_btree_is_core(&dest->btl_hdr));
547 bcpy(source->btl_elems + (source->btl_hdr.bth_first + sidx) * size,
548 dest->btl_elems + (dest->btl_hdr.bth_first + didx) * size,
549 count * size);
553 * Find the first element in the subtree rooted at hdr, return its value and
554 * put its location in where if non-null.
556 static void *
557 zfs_btree_first_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
558 zfs_btree_index_t *where)
560 zfs_btree_hdr_t *node;
562 for (node = hdr; zfs_btree_is_core(node);
563 node = ((zfs_btree_core_t *)node)->btc_children[0])
566 ASSERT(!zfs_btree_is_core(node));
567 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)node;
568 if (where != NULL) {
569 where->bti_node = node;
570 where->bti_offset = 0;
571 where->bti_before = B_FALSE;
573 return (&leaf->btl_elems[node->bth_first * tree->bt_elem_size]);
576 /* Insert an element and a child into a core node at the given offset. */
577 static void
578 zfs_btree_insert_core_impl(zfs_btree_t *tree, zfs_btree_core_t *parent,
579 uint32_t offset, zfs_btree_hdr_t *new_node, void *buf)
581 size_t size = tree->bt_elem_size;
582 zfs_btree_hdr_t *par_hdr = &parent->btc_hdr;
583 ASSERT3P(par_hdr, ==, new_node->bth_parent);
584 ASSERT3U(par_hdr->bth_count, <, BTREE_CORE_ELEMS);
586 if (zfs_btree_verify_intensity >= 5) {
587 zfs_btree_verify_poison_at(tree, par_hdr,
588 par_hdr->bth_count);
590 /* Shift existing elements and children */
591 uint32_t count = par_hdr->bth_count - offset;
592 bt_shift_core_right(tree, parent, offset, count,
593 BSS_PARALLELOGRAM);
595 /* Insert new values */
596 parent->btc_children[offset + 1] = new_node;
597 bcpy(buf, parent->btc_elems + offset * size, size);
598 par_hdr->bth_count++;
602 * Insert new_node into the parent of old_node directly after old_node, with
603 * buf as the dividing element between the two.
605 static void
606 zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node,
607 zfs_btree_hdr_t *new_node, void *buf)
609 ASSERT3P(old_node->bth_parent, ==, new_node->bth_parent);
610 size_t size = tree->bt_elem_size;
611 zfs_btree_core_t *parent = old_node->bth_parent;
614 * If this is the root node we were splitting, we create a new root
615 * and increase the height of the tree.
617 if (parent == NULL) {
618 ASSERT3P(old_node, ==, tree->bt_root);
619 tree->bt_num_nodes++;
620 zfs_btree_core_t *new_root =
621 kmem_alloc(sizeof (zfs_btree_core_t) + BTREE_CORE_ELEMS *
622 size, KM_SLEEP);
623 zfs_btree_hdr_t *new_root_hdr = &new_root->btc_hdr;
624 new_root_hdr->bth_parent = NULL;
625 new_root_hdr->bth_first = -1;
626 new_root_hdr->bth_count = 1;
628 old_node->bth_parent = new_node->bth_parent = new_root;
629 new_root->btc_children[0] = old_node;
630 new_root->btc_children[1] = new_node;
631 bcpy(buf, new_root->btc_elems, size);
633 tree->bt_height++;
634 tree->bt_root = new_root_hdr;
635 zfs_btree_poison_node(tree, new_root_hdr);
636 return;
640 * Since we have the new separator, binary search for where to put
641 * new_node.
643 zfs_btree_hdr_t *par_hdr = &parent->btc_hdr;
644 zfs_btree_index_t idx;
645 ASSERT(zfs_btree_is_core(par_hdr));
646 VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems,
647 par_hdr->bth_count, buf, &idx), ==, NULL);
648 ASSERT(idx.bti_before);
649 uint32_t offset = idx.bti_offset;
650 ASSERT3U(offset, <=, par_hdr->bth_count);
651 ASSERT3P(parent->btc_children[offset], ==, old_node);
654 * If the parent isn't full, shift things to accommodate our insertions
655 * and return.
657 if (par_hdr->bth_count != BTREE_CORE_ELEMS) {
658 zfs_btree_insert_core_impl(tree, parent, offset, new_node, buf);
659 return;
663 * We need to split this core node into two. Currently there are
664 * BTREE_CORE_ELEMS + 1 child nodes, and we are adding one for
665 * BTREE_CORE_ELEMS + 2. Some of the children will be part of the
666 * current node, and the others will be moved to the new core node.
667 * There are BTREE_CORE_ELEMS + 1 elements including the new one. One
668 * will be used as the new separator in our parent, and the others
669 * will be split among the two core nodes.
671 * Usually we will split the node in half evenly, with
672 * BTREE_CORE_ELEMS/2 elements in each node. If we're bulk loading, we
673 * instead move only about a quarter of the elements (and children) to
674 * the new node. Since the average state after a long time is a 3/4
675 * full node, shortcutting directly to that state improves efficiency.
677 * We do this in two stages: first we split into two nodes, and then we
678 * reuse our existing logic to insert the new element and child.
680 uint32_t move_count = MAX((BTREE_CORE_ELEMS / (tree->bt_bulk == NULL ?
681 2 : 4)) - 1, 2);
682 uint32_t keep_count = BTREE_CORE_ELEMS - move_count - 1;
683 ASSERT3U(BTREE_CORE_ELEMS - move_count, >=, 2);
684 tree->bt_num_nodes++;
685 zfs_btree_core_t *new_parent = kmem_alloc(sizeof (zfs_btree_core_t) +
686 BTREE_CORE_ELEMS * size, KM_SLEEP);
687 zfs_btree_hdr_t *new_par_hdr = &new_parent->btc_hdr;
688 new_par_hdr->bth_parent = par_hdr->bth_parent;
689 new_par_hdr->bth_first = -1;
690 new_par_hdr->bth_count = move_count;
691 zfs_btree_poison_node(tree, new_par_hdr);
693 par_hdr->bth_count = keep_count;
695 bt_transfer_core(tree, parent, keep_count + 1, move_count, new_parent,
696 0, BSS_TRAPEZOID);
698 /* Store the new separator in a buffer. */
699 uint8_t *tmp_buf = kmem_alloc(size, KM_SLEEP);
700 bcpy(parent->btc_elems + keep_count * size, tmp_buf,
701 size);
702 zfs_btree_poison_node(tree, par_hdr);
704 if (offset < keep_count) {
705 /* Insert the new node into the left half */
706 zfs_btree_insert_core_impl(tree, parent, offset, new_node,
707 buf);
710 * Move the new separator to the existing buffer.
712 bcpy(tmp_buf, buf, size);
713 } else if (offset > keep_count) {
714 /* Insert the new node into the right half */
715 new_node->bth_parent = new_parent;
716 zfs_btree_insert_core_impl(tree, new_parent,
717 offset - keep_count - 1, new_node, buf);
720 * Move the new separator to the existing buffer.
722 bcpy(tmp_buf, buf, size);
723 } else {
725 * Move the new separator into the right half, and replace it
726 * with buf. We also need to shift back the elements in the
727 * right half to accommodate new_node.
729 bt_shift_core_right(tree, new_parent, 0, move_count,
730 BSS_TRAPEZOID);
731 new_parent->btc_children[0] = new_node;
732 bcpy(tmp_buf, new_parent->btc_elems, size);
733 new_par_hdr->bth_count++;
735 kmem_free(tmp_buf, size);
736 zfs_btree_poison_node(tree, par_hdr);
738 for (uint32_t i = 0; i <= new_parent->btc_hdr.bth_count; i++)
739 new_parent->btc_children[i]->bth_parent = new_parent;
741 for (uint32_t i = 0; i <= parent->btc_hdr.bth_count; i++)
742 ASSERT3P(parent->btc_children[i]->bth_parent, ==, parent);
745 * Now that the node is split, we need to insert the new node into its
746 * parent. This may cause further splitting.
748 zfs_btree_insert_into_parent(tree, &parent->btc_hdr,
749 &new_parent->btc_hdr, buf);
752 /* Insert an element into a leaf node at the given offset. */
753 static void
754 zfs_btree_insert_leaf_impl(zfs_btree_t *tree, zfs_btree_leaf_t *leaf,
755 uint32_t idx, const void *value)
757 size_t size = tree->bt_elem_size;
758 zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
759 ASSERT3U(leaf->btl_hdr.bth_count, <, tree->bt_leaf_cap);
761 if (zfs_btree_verify_intensity >= 5) {
762 zfs_btree_verify_poison_at(tree, &leaf->btl_hdr,
763 leaf->btl_hdr.bth_count);
766 bt_grow_leaf(tree, leaf, idx, 1);
767 uint8_t *start = leaf->btl_elems + (hdr->bth_first + idx) * size;
768 bcpy(value, start, size);
771 static void
772 zfs_btree_verify_order_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr);
774 /* Helper function for inserting a new value into leaf at the given index. */
775 static void
776 zfs_btree_insert_into_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf,
777 const void *value, uint32_t idx)
779 size_t size = tree->bt_elem_size;
780 uint32_t capacity = tree->bt_leaf_cap;
783 * If the leaf isn't full, shift the elements after idx and insert
784 * value.
786 if (leaf->btl_hdr.bth_count != capacity) {
787 zfs_btree_insert_leaf_impl(tree, leaf, idx, value);
788 return;
792 * Otherwise, we split the leaf node into two nodes. If we're not bulk
793 * inserting, each is of size (capacity / 2). If we are bulk
794 * inserting, we move a quarter of the elements to the new node so
795 * inserts into the old node don't cause immediate splitting but the
796 * tree stays relatively dense. Since the average state after a long
797 * time is a 3/4 full node, shortcutting directly to that state
798 * improves efficiency. At the end of the bulk insertion process
799 * we'll need to go through and fix up any nodes (the last leaf and
800 * its ancestors, potentially) that are below the minimum.
802 * In either case, we're left with one extra element. The leftover
803 * element will become the new dividing element between the two nodes.
805 uint32_t move_count = MAX(capacity / (tree->bt_bulk ? 4 : 2), 1) - 1;
806 uint32_t keep_count = capacity - move_count - 1;
807 ASSERT3U(keep_count, >=, 1);
808 /* If we insert on left. move one more to keep leaves balanced. */
809 if (idx < keep_count) {
810 keep_count--;
811 move_count++;
813 tree->bt_num_nodes++;
814 zfs_btree_leaf_t *new_leaf = kmem_cache_alloc(zfs_btree_leaf_cache,
815 KM_SLEEP);
816 zfs_btree_hdr_t *new_hdr = &new_leaf->btl_hdr;
817 new_hdr->bth_parent = leaf->btl_hdr.bth_parent;
818 new_hdr->bth_first = (tree->bt_bulk ? 0 : capacity / 4) +
819 (idx >= keep_count && idx <= keep_count + move_count / 2);
820 new_hdr->bth_count = move_count;
821 zfs_btree_poison_node(tree, new_hdr);
823 if (tree->bt_bulk != NULL && leaf == tree->bt_bulk)
824 tree->bt_bulk = new_leaf;
826 /* Copy the back part to the new leaf. */
827 bt_transfer_leaf(tree, leaf, keep_count + 1, move_count, new_leaf, 0);
829 /* We store the new separator in a buffer we control for simplicity. */
830 uint8_t *buf = kmem_alloc(size, KM_SLEEP);
831 bcpy(leaf->btl_elems + (leaf->btl_hdr.bth_first + keep_count) * size,
832 buf, size);
834 bt_shrink_leaf(tree, leaf, keep_count, 1 + move_count);
836 if (idx < keep_count) {
837 /* Insert into the existing leaf. */
838 zfs_btree_insert_leaf_impl(tree, leaf, idx, value);
839 } else if (idx > keep_count) {
840 /* Insert into the new leaf. */
841 zfs_btree_insert_leaf_impl(tree, new_leaf, idx - keep_count -
842 1, value);
843 } else {
845 * Insert planned separator into the new leaf, and use
846 * the new value as the new separator.
848 zfs_btree_insert_leaf_impl(tree, new_leaf, 0, buf);
849 bcpy(value, buf, size);
853 * Now that the node is split, we need to insert the new node into its
854 * parent. This may cause further splitting, bur only of core nodes.
856 zfs_btree_insert_into_parent(tree, &leaf->btl_hdr, &new_leaf->btl_hdr,
857 buf);
858 kmem_free(buf, size);
861 static uint32_t
862 zfs_btree_find_parent_idx(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
864 void *buf;
865 if (zfs_btree_is_core(hdr)) {
866 buf = ((zfs_btree_core_t *)hdr)->btc_elems;
867 } else {
868 buf = ((zfs_btree_leaf_t *)hdr)->btl_elems +
869 hdr->bth_first * tree->bt_elem_size;
871 zfs_btree_index_t idx;
872 zfs_btree_core_t *parent = hdr->bth_parent;
873 VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems,
874 parent->btc_hdr.bth_count, buf, &idx), ==, NULL);
875 ASSERT(idx.bti_before);
876 ASSERT3U(idx.bti_offset, <=, parent->btc_hdr.bth_count);
877 ASSERT3P(parent->btc_children[idx.bti_offset], ==, hdr);
878 return (idx.bti_offset);
882 * Take the b-tree out of bulk insert mode. During bulk-insert mode, some
883 * nodes may violate the invariant that non-root nodes must be at least half
884 * full. All nodes violating this invariant should be the last node in their
885 * particular level. To correct the invariant, we take values from their left
886 * neighbor until they are half full. They must have a left neighbor at their
887 * level because the last node at a level is not the first node unless it's
888 * the root.
890 static void
891 zfs_btree_bulk_finish(zfs_btree_t *tree)
893 ASSERT3P(tree->bt_bulk, !=, NULL);
894 ASSERT3P(tree->bt_root, !=, NULL);
895 zfs_btree_leaf_t *leaf = tree->bt_bulk;
896 zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
897 zfs_btree_core_t *parent = hdr->bth_parent;
898 size_t size = tree->bt_elem_size;
899 uint32_t capacity = tree->bt_leaf_cap;
902 * The invariant doesn't apply to the root node, if that's the only
903 * node in the tree we're done.
905 if (parent == NULL) {
906 tree->bt_bulk = NULL;
907 return;
910 /* First, take elements to rebalance the leaf node. */
911 if (hdr->bth_count < capacity / 2) {
913 * First, find the left neighbor. The simplest way to do this
914 * is to call zfs_btree_prev twice; the first time finds some
915 * ancestor of this node, and the second time finds the left
916 * neighbor. The ancestor found is the lowest common ancestor
917 * of leaf and the neighbor.
919 zfs_btree_index_t idx = {
920 .bti_node = hdr,
921 .bti_offset = 0
923 VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL);
924 ASSERT(zfs_btree_is_core(idx.bti_node));
925 zfs_btree_core_t *common = (zfs_btree_core_t *)idx.bti_node;
926 uint32_t common_idx = idx.bti_offset;
928 VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL);
929 ASSERT(!zfs_btree_is_core(idx.bti_node));
930 zfs_btree_leaf_t *l_neighbor = (zfs_btree_leaf_t *)idx.bti_node;
931 zfs_btree_hdr_t *l_hdr = idx.bti_node;
932 uint32_t move_count = (capacity / 2) - hdr->bth_count;
933 ASSERT3U(l_neighbor->btl_hdr.bth_count - move_count, >=,
934 capacity / 2);
936 if (zfs_btree_verify_intensity >= 5) {
937 for (uint32_t i = 0; i < move_count; i++) {
938 zfs_btree_verify_poison_at(tree, hdr,
939 leaf->btl_hdr.bth_count + i);
943 /* First, shift elements in leaf back. */
944 bt_grow_leaf(tree, leaf, 0, move_count);
946 /* Next, move the separator from the common ancestor to leaf. */
947 uint8_t *separator = common->btc_elems + common_idx * size;
948 uint8_t *out = leaf->btl_elems +
949 (hdr->bth_first + move_count - 1) * size;
950 bcpy(separator, out, size);
953 * Now we move elements from the tail of the left neighbor to
954 * fill the remaining spots in leaf.
956 bt_transfer_leaf(tree, l_neighbor, l_hdr->bth_count -
957 (move_count - 1), move_count - 1, leaf, 0);
960 * Finally, move the new last element in the left neighbor to
961 * the separator.
963 bcpy(l_neighbor->btl_elems + (l_hdr->bth_first +
964 l_hdr->bth_count - move_count) * size, separator, size);
966 /* Adjust the node's counts, and we're done. */
967 bt_shrink_leaf(tree, l_neighbor, l_hdr->bth_count - move_count,
968 move_count);
970 ASSERT3U(l_hdr->bth_count, >=, capacity / 2);
971 ASSERT3U(hdr->bth_count, >=, capacity / 2);
975 * Now we have to rebalance any ancestors of leaf that may also
976 * violate the invariant.
978 capacity = BTREE_CORE_ELEMS;
979 while (parent->btc_hdr.bth_parent != NULL) {
980 zfs_btree_core_t *cur = parent;
981 zfs_btree_hdr_t *hdr = &cur->btc_hdr;
982 parent = hdr->bth_parent;
984 * If the invariant isn't violated, move on to the next
985 * ancestor.
987 if (hdr->bth_count >= capacity / 2)
988 continue;
991 * Because the smallest number of nodes we can move when
992 * splitting is 2, we never need to worry about not having a
993 * left sibling (a sibling is a neighbor with the same parent).
995 uint32_t parent_idx = zfs_btree_find_parent_idx(tree, hdr);
996 ASSERT3U(parent_idx, >, 0);
997 zfs_btree_core_t *l_neighbor =
998 (zfs_btree_core_t *)parent->btc_children[parent_idx - 1];
999 uint32_t move_count = (capacity / 2) - hdr->bth_count;
1000 ASSERT3U(l_neighbor->btc_hdr.bth_count - move_count, >=,
1001 capacity / 2);
1003 if (zfs_btree_verify_intensity >= 5) {
1004 for (uint32_t i = 0; i < move_count; i++) {
1005 zfs_btree_verify_poison_at(tree, hdr,
1006 hdr->bth_count + i);
1009 /* First, shift things in the right node back. */
1010 bt_shift_core(tree, cur, 0, hdr->bth_count, move_count,
1011 BSS_TRAPEZOID, BSD_RIGHT);
1013 /* Next, move the separator to the right node. */
1014 uint8_t *separator = parent->btc_elems + ((parent_idx - 1) *
1015 size);
1016 uint8_t *e_out = cur->btc_elems + ((move_count - 1) * size);
1017 bcpy(separator, e_out, size);
1020 * Now, move elements and children from the left node to the
1021 * right. We move one more child than elements.
1023 move_count--;
1024 uint32_t move_idx = l_neighbor->btc_hdr.bth_count - move_count;
1025 bt_transfer_core(tree, l_neighbor, move_idx, move_count, cur, 0,
1026 BSS_TRAPEZOID);
1029 * Finally, move the last element in the left node to the
1030 * separator's position.
1032 move_idx--;
1033 bcpy(l_neighbor->btc_elems + move_idx * size, separator, size);
1035 l_neighbor->btc_hdr.bth_count -= move_count + 1;
1036 hdr->bth_count += move_count + 1;
1038 ASSERT3U(l_neighbor->btc_hdr.bth_count, >=, capacity / 2);
1039 ASSERT3U(hdr->bth_count, >=, capacity / 2);
1041 zfs_btree_poison_node(tree, &l_neighbor->btc_hdr);
1043 for (uint32_t i = 0; i <= hdr->bth_count; i++)
1044 cur->btc_children[i]->bth_parent = cur;
1047 tree->bt_bulk = NULL;
1048 zfs_btree_verify(tree);
/*
 * Insert value into tree at the location specified by where.
 *
 * The caller must have found the insertion point via zfs_btree_find() (or
 * equivalent); 'value' is copied into the tree, so the caller retains
 * ownership of its buffer.
 */
void
zfs_btree_add_idx(zfs_btree_t *tree, const void *value,
    const zfs_btree_index_t *where)
{
	zfs_btree_index_t idx = {0};

	/* If we're not inserting in the last leaf, end bulk insert mode. */
	if (tree->bt_bulk != NULL) {
		if (where->bti_node != &tree->bt_bulk->btl_hdr) {
			/*
			 * Rebalancing in bulk_finish invalidates 'where', so
			 * re-find the insertion point afterwards.
			 */
			zfs_btree_bulk_finish(tree);
			VERIFY3P(zfs_btree_find(tree, value, &idx), ==, NULL);
			where = &idx;
		}
	}

	tree->bt_num_elems++;
	/*
	 * If this is the first element in the tree, create a leaf root node
	 * and add the value to it.
	 */
	if (where->bti_node == NULL) {
		ASSERT3U(tree->bt_num_elems, ==, 1);
		ASSERT3S(tree->bt_height, ==, -1);
		ASSERT3P(tree->bt_root, ==, NULL);
		ASSERT0(where->bti_offset);

		tree->bt_num_nodes++;
		zfs_btree_leaf_t *leaf = kmem_cache_alloc(zfs_btree_leaf_cache,
		    KM_SLEEP);
		tree->bt_root = &leaf->btl_hdr;
		tree->bt_height++;

		zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
		hdr->bth_parent = NULL;
		hdr->bth_first = 0;
		hdr->bth_count = 0;
		zfs_btree_poison_node(tree, hdr);

		zfs_btree_insert_into_leaf(tree, leaf, value, 0);
		/* A brand-new single-leaf tree starts in bulk insert mode. */
		tree->bt_bulk = leaf;
	} else if (!zfs_btree_is_core(where->bti_node)) {
		/*
		 * If we're inserting into a leaf, go directly to the helper
		 * function.
		 */
		zfs_btree_insert_into_leaf(tree,
		    (zfs_btree_leaf_t *)where->bti_node, value,
		    where->bti_offset);
	} else {
		/*
		 * If we're inserting into a core node, we can't just shift
		 * the existing element in that slot in the same node without
		 * breaking our ordering invariants. Instead we place the new
		 * value in the node at that spot and then insert the old
		 * separator into the first slot in the subtree to the right.
		 */
		zfs_btree_core_t *node = (zfs_btree_core_t *)where->bti_node;

		/*
		 * We can ignore bti_before, because either way the value
		 * should end up in bti_offset.
		 */
		uint32_t off = where->bti_offset;
		zfs_btree_hdr_t *subtree = node->btc_children[off + 1];
		size_t size = tree->bt_elem_size;
		/* Stash the old separator before overwriting it in place. */
		uint8_t *buf = kmem_alloc(size, KM_SLEEP);
		bcpy(node->btc_elems + off * size, buf, size);
		bcpy(value, node->btc_elems + off * size, size);

		/*
		 * Find the first slot in the subtree to the right, insert
		 * there.
		 */
		zfs_btree_index_t new_idx;
		VERIFY3P(zfs_btree_first_helper(tree, subtree, &new_idx), !=,
		    NULL);
		ASSERT0(new_idx.bti_offset);
		ASSERT(!zfs_btree_is_core(new_idx.bti_node));
		zfs_btree_insert_into_leaf(tree,
		    (zfs_btree_leaf_t *)new_idx.bti_node, buf, 0);
		kmem_free(buf, size);
	}
	zfs_btree_verify(tree);
}
1140 * Return the first element in the tree, and put its location in where if
1141 * non-null.
1143 void *
1144 zfs_btree_first(zfs_btree_t *tree, zfs_btree_index_t *where)
1146 if (tree->bt_height == -1) {
1147 ASSERT0(tree->bt_num_elems);
1148 return (NULL);
1150 return (zfs_btree_first_helper(tree, tree->bt_root, where));
1154 * Find the last element in the subtree rooted at hdr, return its value and
1155 * put its location in where if non-null.
1157 static void *
1158 zfs_btree_last_helper(zfs_btree_t *btree, zfs_btree_hdr_t *hdr,
1159 zfs_btree_index_t *where)
1161 zfs_btree_hdr_t *node;
1163 for (node = hdr; zfs_btree_is_core(node); node =
1164 ((zfs_btree_core_t *)node)->btc_children[node->bth_count])
1167 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)node;
1168 if (where != NULL) {
1169 where->bti_node = node;
1170 where->bti_offset = node->bth_count - 1;
1171 where->bti_before = B_FALSE;
1173 return (leaf->btl_elems + (node->bth_first + node->bth_count - 1) *
1174 btree->bt_elem_size);
1178 * Return the last element in the tree, and put its location in where if
1179 * non-null.
1181 void *
1182 zfs_btree_last(zfs_btree_t *tree, zfs_btree_index_t *where)
1184 if (tree->bt_height == -1) {
1185 ASSERT0(tree->bt_num_elems);
1186 return (NULL);
1188 return (zfs_btree_last_helper(tree, tree->bt_root, where));
/*
 * This function contains the logic to find the next node in the tree. A
 * helper function is used because there are multiple internal consumers of
 * this logic. The done_func is used by zfs_btree_destroy_nodes to clean up each
 * node after we've finished with it.
 */
static void *
zfs_btree_next_helper(zfs_btree_t *tree, const zfs_btree_index_t *idx,
    zfs_btree_index_t *out_idx,
    void (*done_func)(zfs_btree_t *, zfs_btree_hdr_t *))
{
	/* A NULL node means the caller's index refers to an empty tree. */
	if (idx->bti_node == NULL) {
		ASSERT3S(tree->bt_height, ==, -1);
		return (NULL);
	}

	uint32_t offset = idx->bti_offset;
	if (!zfs_btree_is_core(idx->bti_node)) {
		/*
		 * When finding the next element of an element in a leaf,
		 * there are two cases. If the element isn't the last one in
		 * the leaf, in which case we just return the next element in
		 * the leaf. Otherwise, we need to traverse up our parents
		 * until we find one where our ancestor isn't the last child
		 * of its parent. Once we do, the next element is the
		 * separator after our ancestor in its parent.
		 */
		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node;
		uint32_t new_off = offset + (idx->bti_before ? 0 : 1);
		if (leaf->btl_hdr.bth_count > new_off) {
			out_idx->bti_node = &leaf->btl_hdr;
			out_idx->bti_offset = new_off;
			out_idx->bti_before = B_FALSE;
			return (leaf->btl_elems + (leaf->btl_hdr.bth_first +
			    new_off) * tree->bt_elem_size);
		}

		/* Climb until some ancestor is not its parent's last child. */
		zfs_btree_hdr_t *prev = &leaf->btl_hdr;
		for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent;
		    node != NULL; node = node->btc_hdr.bth_parent) {
			zfs_btree_hdr_t *hdr = &node->btc_hdr;
			ASSERT(zfs_btree_is_core(hdr));
			uint32_t i = zfs_btree_find_parent_idx(tree, prev);
			/* We are done with 'prev'; let the caller reclaim it. */
			if (done_func != NULL)
				done_func(tree, prev);
			if (i == hdr->bth_count) {
				prev = hdr;
				continue;
			}
			out_idx->bti_node = hdr;
			out_idx->bti_offset = i;
			out_idx->bti_before = B_FALSE;
			return (node->btc_elems + i * tree->bt_elem_size);
		}
		if (done_func != NULL)
			done_func(tree, prev);
		/*
		 * We've traversed all the way up and been at the end of the
		 * node every time, so this was the last element in the tree.
		 */
		return (NULL);
	}

	/* If we were before an element in a core node, return that element. */
	ASSERT(zfs_btree_is_core(idx->bti_node));
	zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node;
	if (idx->bti_before) {
		out_idx->bti_before = B_FALSE;
		return (node->btc_elems + offset * tree->bt_elem_size);
	}

	/*
	 * The next element from one in a core node is the first element in
	 * the subtree just to the right of the separator.
	 */
	zfs_btree_hdr_t *child = node->btc_children[offset + 1];
	return (zfs_btree_first_helper(tree, child, out_idx));
}
1271 * Return the next valued node in the tree. The same address can be safely
1272 * passed for idx and out_idx.
1274 void *
1275 zfs_btree_next(zfs_btree_t *tree, const zfs_btree_index_t *idx,
1276 zfs_btree_index_t *out_idx)
1278 return (zfs_btree_next_helper(tree, idx, out_idx, NULL));
/*
 * Return the previous valued node in the tree. The same value can be safely
 * passed for idx and out_idx.
 */
void *
zfs_btree_prev(zfs_btree_t *tree, const zfs_btree_index_t *idx,
    zfs_btree_index_t *out_idx)
{
	/* A NULL node means the index refers to an empty tree. */
	if (idx->bti_node == NULL) {
		ASSERT3S(tree->bt_height, ==, -1);
		return (NULL);
	}

	uint32_t offset = idx->bti_offset;
	if (!zfs_btree_is_core(idx->bti_node)) {
		/*
		 * When finding the previous element of an element in a leaf,
		 * there are two cases. If the element isn't the first one in
		 * the leaf, in which case we just return the previous element
		 * in the leaf. Otherwise, we need to traverse up our parents
		 * until we find one where our previous ancestor isn't the
		 * first child. Once we do, the previous element is the
		 * separator after our previous ancestor.
		 */
		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node;
		if (offset != 0) {
			out_idx->bti_node = &leaf->btl_hdr;
			out_idx->bti_offset = offset - 1;
			out_idx->bti_before = B_FALSE;
			return (leaf->btl_elems + (leaf->btl_hdr.bth_first +
			    offset - 1) * tree->bt_elem_size);
		}

		/* Climb until some ancestor is not its parent's first child. */
		zfs_btree_hdr_t *prev = &leaf->btl_hdr;
		for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent;
		    node != NULL; node = node->btc_hdr.bth_parent) {
			zfs_btree_hdr_t *hdr = &node->btc_hdr;
			ASSERT(zfs_btree_is_core(hdr));
			uint32_t i = zfs_btree_find_parent_idx(tree, prev);
			if (i == 0) {
				prev = hdr;
				continue;
			}
			out_idx->bti_node = hdr;
			out_idx->bti_offset = i - 1;
			out_idx->bti_before = B_FALSE;
			return (node->btc_elems +
			    (i - 1) * tree->bt_elem_size);
		}
		/*
		 * We've traversed all the way up and been at the start of the
		 * node every time, so this was the first node in the tree.
		 */
		return (NULL);
	}

	/*
	 * The previous element from one in a core node is the last element in
	 * the subtree just to the left of the separator.
	 */
	ASSERT(zfs_btree_is_core(idx->bti_node));
	zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node;
	zfs_btree_hdr_t *child = node->btc_children[offset];
	return (zfs_btree_last_helper(tree, child, out_idx));
}
1346 * Get the value at the provided index in the tree.
1348 * Note that the value returned from this function can be mutated, but only
1349 * if it will not change the ordering of the element with respect to any other
1350 * elements that could be in the tree.
1352 void *
1353 zfs_btree_get(zfs_btree_t *tree, zfs_btree_index_t *idx)
1355 ASSERT(!idx->bti_before);
1356 size_t size = tree->bt_elem_size;
1357 if (!zfs_btree_is_core(idx->bti_node)) {
1358 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node;
1359 return (leaf->btl_elems + (leaf->btl_hdr.bth_first +
1360 idx->bti_offset) * size);
1362 zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node;
1363 return (node->btc_elems + idx->bti_offset * size);
1366 /* Add the given value to the tree. Must not already be in the tree. */
1367 void
1368 zfs_btree_add(zfs_btree_t *tree, const void *node)
1370 zfs_btree_index_t where = {0};
1371 VERIFY3P(zfs_btree_find(tree, node, &where), ==, NULL);
1372 zfs_btree_add_idx(tree, node, &where);
1375 /* Helper function to free a tree node. */
1376 static void
1377 zfs_btree_node_destroy(zfs_btree_t *tree, zfs_btree_hdr_t *node)
1379 tree->bt_num_nodes--;
1380 if (!zfs_btree_is_core(node)) {
1381 kmem_cache_free(zfs_btree_leaf_cache, node);
1382 } else {
1383 kmem_free(node, sizeof (zfs_btree_core_t) +
1384 BTREE_CORE_ELEMS * tree->bt_elem_size);
/*
 * Remove the rm_hdr and the separator to its left from the parent node. The
 * buffer that rm_hdr was stored in may already be freed, so its contents
 * cannot be accessed.
 */
static void
zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node,
    zfs_btree_hdr_t *rm_hdr)
{
	size_t size = tree->bt_elem_size;
	uint32_t min_count = (BTREE_CORE_ELEMS / 2) - 1;
	zfs_btree_hdr_t *hdr = &node->btc_hdr;
	/*
	 * If the node is the root node and rm_hdr is one of two children,
	 * promote the other child to the root.
	 */
	if (hdr->bth_parent == NULL && hdr->bth_count <= 1) {
		ASSERT3U(hdr->bth_count, ==, 1);
		ASSERT3P(tree->bt_root, ==, node);
		ASSERT3P(node->btc_children[1], ==, rm_hdr);
		tree->bt_root = node->btc_children[0];
		node->btc_children[0]->bth_parent = NULL;
		zfs_btree_node_destroy(tree, hdr);
		tree->bt_height--;
		return;
	}

	/* Locate rm_hdr's slot among our children (compare pointers only). */
	uint32_t idx;
	for (idx = 0; idx <= hdr->bth_count; idx++) {
		if (node->btc_children[idx] == rm_hdr)
			break;
	}
	ASSERT3U(idx, <=, hdr->bth_count);

	/*
	 * If the node is the root or it has more than the minimum number of
	 * children, just remove the child and separator, and return.
	 */
	if (hdr->bth_parent == NULL ||
	    hdr->bth_count > min_count) {
		/*
		 * Shift the element and children to the right of rm_hdr to
		 * the left by one spot.
		 */
		bt_shift_core_left(tree, node, idx, hdr->bth_count - idx,
		    BSS_PARALLELOGRAM);
		hdr->bth_count--;
		zfs_btree_poison_node_at(tree, hdr, hdr->bth_count, 1);
		return;
	}

	ASSERT3U(hdr->bth_count, ==, min_count);

	/*
	 * Now we try to take a node from a neighbor. We check left, then
	 * right. If the neighbor exists and has more than the minimum number
	 * of elements, we move the separator between us and them to our
	 * node, move their closest element (last for left, first for right)
	 * to the separator, and move their closest child to our node. Along
	 * the way we need to collapse the gap made by idx, and (for our right
	 * neighbor) the gap made by removing their first element and child.
	 *
	 * Note: this logic currently doesn't support taking from a neighbor
	 * that isn't a sibling (i.e. a neighbor with a different
	 * parent). This isn't critical functionality, but may be worth
	 * implementing in the future for completeness' sake.
	 */
	zfs_btree_core_t *parent = hdr->bth_parent;
	uint32_t parent_idx = zfs_btree_find_parent_idx(tree, hdr);

	zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL :
	    parent->btc_children[parent_idx - 1]);
	if (l_hdr != NULL && l_hdr->bth_count > min_count) {
		/* We can take a node from the left neighbor. */
		ASSERT(zfs_btree_is_core(l_hdr));
		zfs_btree_core_t *neighbor = (zfs_btree_core_t *)l_hdr;

		/*
		 * Start by shifting the elements and children in the current
		 * node to the right by one spot.
		 */
		bt_shift_core_right(tree, node, 0, idx - 1, BSS_TRAPEZOID);

		/*
		 * Move the separator between node and neighbor to the first
		 * element slot in the current node.
		 */
		uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
		    size;
		bcpy(separator, node->btc_elems, size);

		/* Move the last child of neighbor to our first child slot. */
		node->btc_children[0] =
		    neighbor->btc_children[l_hdr->bth_count];
		node->btc_children[0]->bth_parent = node;

		/* Move the last element of neighbor to the separator spot. */
		uint8_t *take_elem = neighbor->btc_elems +
		    (l_hdr->bth_count - 1) * size;
		bcpy(take_elem, separator, size);
		l_hdr->bth_count--;
		zfs_btree_poison_node_at(tree, l_hdr, l_hdr->bth_count, 1);
		return;
	}

	zfs_btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ?
	    NULL : parent->btc_children[parent_idx + 1]);
	if (r_hdr != NULL && r_hdr->bth_count > min_count) {
		/* We can take a node from the right neighbor. */
		ASSERT(zfs_btree_is_core(r_hdr));
		zfs_btree_core_t *neighbor = (zfs_btree_core_t *)r_hdr;

		/*
		 * Shift elements in node left by one spot to overwrite rm_hdr
		 * and the separator before it.
		 */
		bt_shift_core_left(tree, node, idx, hdr->bth_count - idx,
		    BSS_PARALLELOGRAM);

		/*
		 * Move the separator between node and neighbor to the last
		 * element spot in node.
		 */
		uint8_t *separator = parent->btc_elems + parent_idx * size;
		bcpy(separator, node->btc_elems + (hdr->bth_count - 1) * size,
		    size);

		/*
		 * Move the first child of neighbor to the last child spot in
		 * node.
		 */
		node->btc_children[hdr->bth_count] = neighbor->btc_children[0];
		node->btc_children[hdr->bth_count]->bth_parent = node;

		/* Move the first element of neighbor to the separator spot. */
		uint8_t *take_elem = neighbor->btc_elems;
		bcpy(take_elem, separator, size);
		r_hdr->bth_count--;

		/*
		 * Shift the elements and children of neighbor to cover the
		 * stolen elements.
		 */
		bt_shift_core_left(tree, neighbor, 1, r_hdr->bth_count,
		    BSS_TRAPEZOID);
		zfs_btree_poison_node_at(tree, r_hdr, r_hdr->bth_count, 1);
		return;
	}

	/*
	 * In this case, neither of our neighbors can spare an element, so we
	 * need to merge with one of them. We prefer the left one,
	 * arbitrarily. Move the separator into the leftmost merging node
	 * (which may be us or the left neighbor), and then move the right
	 * merging node's elements. Once that's done, we go back and delete
	 * the element we're removing. Finally, go into the parent and delete
	 * the right merging node and the separator. This may cause further
	 * merging.
	 */
	zfs_btree_hdr_t *new_rm_hdr, *keep_hdr;
	uint32_t new_idx = idx;
	if (l_hdr != NULL) {
		keep_hdr = l_hdr;
		new_rm_hdr = hdr;
		new_idx += keep_hdr->bth_count + 1;
	} else {
		ASSERT3P(r_hdr, !=, NULL);
		keep_hdr = hdr;
		new_rm_hdr = r_hdr;
		parent_idx++;
	}

	ASSERT(zfs_btree_is_core(keep_hdr));
	ASSERT(zfs_btree_is_core(new_rm_hdr));

	zfs_btree_core_t *keep = (zfs_btree_core_t *)keep_hdr;
	zfs_btree_core_t *rm = (zfs_btree_core_t *)new_rm_hdr;

	if (zfs_btree_verify_intensity >= 5) {
		for (uint32_t i = 0; i < new_rm_hdr->bth_count + 1; i++) {
			zfs_btree_verify_poison_at(tree, keep_hdr,
			    keep_hdr->bth_count + i);
		}
	}

	/* Move the separator into the left node. */
	uint8_t *e_out = keep->btc_elems + keep_hdr->bth_count * size;
	uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
	    size;
	bcpy(separator, e_out, size);
	keep_hdr->bth_count++;

	/* Move all our elements and children into the left node. */
	bt_transfer_core(tree, rm, 0, new_rm_hdr->bth_count, keep,
	    keep_hdr->bth_count, BSS_TRAPEZOID);

	uint32_t old_count = keep_hdr->bth_count;

	/* Update bookkeeping */
	keep_hdr->bth_count += new_rm_hdr->bth_count;
	ASSERT3U(keep_hdr->bth_count, ==, (min_count * 2) + 1);

	/*
	 * Shift the element and children to the right of rm_hdr to
	 * the left by one spot.
	 */
	ASSERT3P(keep->btc_children[new_idx], ==, rm_hdr);
	bt_shift_core_left(tree, keep, new_idx, keep_hdr->bth_count - new_idx,
	    BSS_PARALLELOGRAM);
	keep_hdr->bth_count--;

	/* Reparent all our children to point to the left node. */
	zfs_btree_hdr_t **new_start = keep->btc_children +
	    old_count - 1;
	for (uint32_t i = 0; i < new_rm_hdr->bth_count + 1; i++)
		new_start[i]->bth_parent = keep;
	for (uint32_t i = 0; i <= keep_hdr->bth_count; i++) {
		ASSERT3P(keep->btc_children[i]->bth_parent, ==, keep);
		ASSERT3P(keep->btc_children[i], !=, rm_hdr);
	}
	zfs_btree_poison_node_at(tree, keep_hdr, keep_hdr->bth_count, 1);

	/*
	 * The right merging node is now empty; recursively remove it from the
	 * parent. This recursion may cascade further up the tree.
	 */
	new_rm_hdr->bth_count = 0;
	zfs_btree_remove_from_node(tree, parent, new_rm_hdr);
	zfs_btree_node_destroy(tree, new_rm_hdr);
}
/* Remove the element at the specific location. */
void
zfs_btree_remove_idx(zfs_btree_t *tree, zfs_btree_index_t *where)
{
	size_t size = tree->bt_elem_size;
	zfs_btree_hdr_t *hdr = where->bti_node;
	uint32_t idx = where->bti_offset;

	ASSERT(!where->bti_before);
	if (tree->bt_bulk != NULL) {
		/*
		 * Leave bulk insert mode. Note that our index would be
		 * invalid after we correct the tree, so we copy the value
		 * we're planning to remove and find it again after
		 * bulk_finish.
		 */
		uint8_t *value = zfs_btree_get(tree, where);
		uint8_t *tmp = kmem_alloc(size, KM_SLEEP);
		bcpy(value, tmp, size);
		zfs_btree_bulk_finish(tree);
		VERIFY3P(zfs_btree_find(tree, tmp, where), !=, NULL);
		kmem_free(tmp, size);
		hdr = where->bti_node;
		idx = where->bti_offset;
	}

	tree->bt_num_elems--;
	/*
	 * If the element happens to be in a core node, we move a leaf node's
	 * element into its place and then remove the leaf node element. This
	 * makes the rebalance logic not need to be recursive both upwards and
	 * downwards.
	 */
	if (zfs_btree_is_core(hdr)) {
		zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
		zfs_btree_hdr_t *left_subtree = node->btc_children[idx];
		/* The in-order predecessor is the last element to our left. */
		void *new_value = zfs_btree_last_helper(tree, left_subtree,
		    where);
		ASSERT3P(new_value, !=, NULL);

		bcpy(new_value, node->btc_elems + idx * size, size);

		/* 'where' now points at the predecessor's leaf slot. */
		hdr = where->bti_node;
		idx = where->bti_offset;
		ASSERT(!where->bti_before);
	}

	/*
	 * First, we'll update the leaf's metadata. Then, we shift any
	 * elements after the idx to the left. After that, we rebalance if
	 * needed.
	 */
	ASSERT(!zfs_btree_is_core(hdr));
	zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
	ASSERT3U(hdr->bth_count, >, 0);

	uint32_t min_count = (tree->bt_leaf_cap / 2) - 1;

	/*
	 * If we're over the minimum size or this is the root, just overwrite
	 * the value and return.
	 */
	if (hdr->bth_count > min_count || hdr->bth_parent == NULL) {
		bt_shrink_leaf(tree, leaf, idx, 1);
		if (hdr->bth_parent == NULL) {
			ASSERT0(tree->bt_height);
			if (hdr->bth_count == 0) {
				/* The tree is now empty; free the root leaf. */
				tree->bt_root = NULL;
				tree->bt_height--;
				zfs_btree_node_destroy(tree, &leaf->btl_hdr);
			}
		}
		zfs_btree_verify(tree);
		return;
	}
	ASSERT3U(hdr->bth_count, ==, min_count);

	/*
	 * Now we try to take a node from a sibling. We check left, then
	 * right. If they exist and have more than the minimum number of
	 * elements, we move the separator between us and them to our node
	 * and move their closest element (last for left, first for right) to
	 * the separator. Along the way we need to collapse the gap made by
	 * idx, and (for our right neighbor) the gap made by removing their
	 * first element.
	 *
	 * Note: this logic currently doesn't support taking from a neighbor
	 * that isn't a sibling. This isn't critical functionality, but may be
	 * worth implementing in the future for completeness' sake.
	 */
	zfs_btree_core_t *parent = hdr->bth_parent;
	uint32_t parent_idx = zfs_btree_find_parent_idx(tree, hdr);

	zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL :
	    parent->btc_children[parent_idx - 1]);
	if (l_hdr != NULL && l_hdr->bth_count > min_count) {
		/* We can take a node from the left neighbor. */
		ASSERT(!zfs_btree_is_core(l_hdr));
		zfs_btree_leaf_t *neighbor = (zfs_btree_leaf_t *)l_hdr;

		/*
		 * Move our elements back by one spot to make room for the
		 * stolen element and overwrite the element being removed.
		 */
		bt_shift_leaf(tree, leaf, 0, idx, 1, BSD_RIGHT);

		/* Move the separator to our first spot. */
		uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
		    size;
		bcpy(separator, leaf->btl_elems + hdr->bth_first * size, size);

		/* Move our neighbor's last element to the separator. */
		uint8_t *take_elem = neighbor->btl_elems +
		    (l_hdr->bth_first + l_hdr->bth_count - 1) * size;
		bcpy(take_elem, separator, size);

		/* Delete our neighbor's last element. */
		bt_shrink_leaf(tree, neighbor, l_hdr->bth_count - 1, 1);
		zfs_btree_verify(tree);
		return;
	}

	zfs_btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ?
	    NULL : parent->btc_children[parent_idx + 1]);
	if (r_hdr != NULL && r_hdr->bth_count > min_count) {
		/* We can take a node from the right neighbor. */
		ASSERT(!zfs_btree_is_core(r_hdr));
		zfs_btree_leaf_t *neighbor = (zfs_btree_leaf_t *)r_hdr;

		/*
		 * Move our elements after the element being removed forwards
		 * by one spot to make room for the stolen element and
		 * overwrite the element being removed.
		 */
		bt_shift_leaf(tree, leaf, idx + 1, hdr->bth_count - idx - 1,
		    1, BSD_LEFT);

		/* Move the separator between us to our last spot. */
		uint8_t *separator = parent->btc_elems + parent_idx * size;
		bcpy(separator, leaf->btl_elems + (hdr->bth_first +
		    hdr->bth_count - 1) * size, size);

		/* Move our neighbor's first element to the separator. */
		uint8_t *take_elem = neighbor->btl_elems +
		    r_hdr->bth_first * size;
		bcpy(take_elem, separator, size);

		/* Delete our neighbor's first element. */
		bt_shrink_leaf(tree, neighbor, 0, 1);
		zfs_btree_verify(tree);
		return;
	}

	/*
	 * In this case, neither of our neighbors can spare an element, so we
	 * need to merge with one of them. We prefer the left one, arbitrarily.
	 * After remove we move the separator into the leftmost merging node
	 * (which may be us or the left neighbor), and then move the right
	 * merging node's elements. Once that's done, we go back and delete
	 * the element we're removing. Finally, go into the parent and delete
	 * the right merging node and the separator. This may cause further
	 * merging.
	 */
	zfs_btree_hdr_t *rm_hdr, *k_hdr;
	if (l_hdr != NULL) {
		k_hdr = l_hdr;
		rm_hdr = hdr;
	} else {
		ASSERT3P(r_hdr, !=, NULL);
		k_hdr = hdr;
		rm_hdr = r_hdr;
		parent_idx++;
	}
	ASSERT(!zfs_btree_is_core(k_hdr));
	ASSERT(!zfs_btree_is_core(rm_hdr));
	ASSERT3U(k_hdr->bth_count, ==, min_count);
	ASSERT3U(rm_hdr->bth_count, ==, min_count);
	zfs_btree_leaf_t *keep = (zfs_btree_leaf_t *)k_hdr;
	zfs_btree_leaf_t *rm = (zfs_btree_leaf_t *)rm_hdr;

	if (zfs_btree_verify_intensity >= 5) {
		for (uint32_t i = 0; i < rm_hdr->bth_count + 1; i++) {
			zfs_btree_verify_poison_at(tree, k_hdr,
			    k_hdr->bth_count + i);
		}
	}

	/*
	 * Remove the value from the node. It will go below the minimum,
	 * but we'll fix it in no time.
	 */
	bt_shrink_leaf(tree, leaf, idx, 1);

	/* Prepare space for elements to be moved from the right. */
	uint32_t k_count = k_hdr->bth_count;
	bt_grow_leaf(tree, keep, k_count, 1 + rm_hdr->bth_count);
	ASSERT3U(k_hdr->bth_count, ==, min_count * 2);

	/* Move the separator into the first open spot. */
	uint8_t *out = keep->btl_elems + (k_hdr->bth_first + k_count) * size;
	uint8_t *separator = parent->btc_elems + (parent_idx - 1) * size;
	bcpy(separator, out, size);

	/* Move our elements to the left neighbor. */
	bt_transfer_leaf(tree, rm, 0, rm_hdr->bth_count, keep, k_count + 1);

	/* Remove the emptied node from the parent. */
	zfs_btree_remove_from_node(tree, parent, rm_hdr);
	zfs_btree_node_destroy(tree, rm_hdr);
	zfs_btree_verify(tree);
}
1827 /* Remove the given value from the tree. */
1828 void
1829 zfs_btree_remove(zfs_btree_t *tree, const void *value)
1831 zfs_btree_index_t where = {0};
1832 VERIFY3P(zfs_btree_find(tree, value, &where), !=, NULL);
1833 zfs_btree_remove_idx(tree, &where);
/*
 * Return the number of elements in the tree.
 *
 * NOTE(review): despite the "numnodes" name, this returns bt_num_elems (the
 * element count), consistent with the comment above — confirm that callers
 * expect the element count rather than the node count (bt_num_nodes).
 */
ulong_t
zfs_btree_numnodes(zfs_btree_t *tree)
{
	return (tree->bt_num_elems);
}
/*
 * This function is used to visit all the elements in the tree before
 * destroying the tree. This allows the calling code to perform any cleanup it
 * needs to do. This is more efficient than just removing the first element
 * over and over, because it removes all rebalancing. Once the destroy_nodes()
 * function has been called, no other btree operations are valid until it
 * returns NULL, at which point the only valid operation is
 * zfs_btree_destroy().
 *
 * example:
 *
 *      zfs_btree_index_t *cookie = NULL;
 *      my_data_t *node;
 *
 *      while ((node = zfs_btree_destroy_nodes(tree, &cookie)) != NULL)
 *              free(node->ptr);
 *      zfs_btree_destroy(tree);
 */
void *
zfs_btree_destroy_nodes(zfs_btree_t *tree, zfs_btree_index_t **cookie)
{
	/* First call: allocate the cursor and return the first element. */
	if (*cookie == NULL) {
		if (tree->bt_height == -1)
			return (NULL);
		*cookie = kmem_alloc(sizeof (**cookie), KM_SLEEP);
		return (zfs_btree_first(tree, *cookie));
	}

	/*
	 * Advance the cursor; zfs_btree_node_destroy frees each node once the
	 * traversal is finished with it.
	 */
	void *rval = zfs_btree_next_helper(tree, *cookie, *cookie,
	    zfs_btree_node_destroy);
	if (rval == NULL) {
		/* Traversal complete: reset the tree to its empty state. */
		tree->bt_root = NULL;
		tree->bt_height = -1;
		tree->bt_num_elems = 0;
		kmem_free(*cookie, sizeof (**cookie));
		tree->bt_bulk = NULL;
	}
	return (rval);
}
1883 static void
1884 zfs_btree_clear_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
1886 if (zfs_btree_is_core(hdr)) {
1887 zfs_btree_core_t *btc = (zfs_btree_core_t *)hdr;
1888 for (uint32_t i = 0; i <= hdr->bth_count; i++)
1889 zfs_btree_clear_helper(tree, btc->btc_children[i]);
1892 zfs_btree_node_destroy(tree, hdr);
1895 void
1896 zfs_btree_clear(zfs_btree_t *tree)
1898 if (tree->bt_root == NULL) {
1899 ASSERT0(tree->bt_num_elems);
1900 return;
1903 zfs_btree_clear_helper(tree, tree->bt_root);
1904 tree->bt_num_elems = 0;
1905 tree->bt_root = NULL;
1906 tree->bt_num_nodes = 0;
1907 tree->bt_height = -1;
1908 tree->bt_bulk = NULL;
/*
 * Tear down an already-emptied tree. The caller must have removed or cleared
 * all elements first; this only verifies that nothing remains.
 */
void
zfs_btree_destroy(zfs_btree_t *tree)
{
	ASSERT0(tree->bt_num_elems);
	ASSERT3P(tree->bt_root, ==, NULL);
}
1918 /* Verify that every child of this node has the correct parent pointer. */
1919 static void
1920 zfs_btree_verify_pointers_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
1922 if (!zfs_btree_is_core(hdr))
1923 return;
1925 zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
1926 for (uint32_t i = 0; i <= hdr->bth_count; i++) {
1927 VERIFY3P(node->btc_children[i]->bth_parent, ==, hdr);
1928 zfs_btree_verify_pointers_helper(tree, node->btc_children[i]);
1932 /* Verify that every node has the correct parent pointer. */
1933 static void
1934 zfs_btree_verify_pointers(zfs_btree_t *tree)
1936 if (tree->bt_height == -1) {
1937 VERIFY3P(tree->bt_root, ==, NULL);
1938 return;
1940 VERIFY3P(tree->bt_root->bth_parent, ==, NULL);
1941 zfs_btree_verify_pointers_helper(tree, tree->bt_root);
1945 * Verify that all the current node and its children satisfy the count
1946 * invariants, and return the total count in the subtree rooted in this node.
1948 static uint64_t
1949 zfs_btree_verify_counts_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
1951 if (!zfs_btree_is_core(hdr)) {
1952 if (tree->bt_root != hdr && tree->bt_bulk &&
1953 hdr != &tree->bt_bulk->btl_hdr) {
1954 VERIFY3U(hdr->bth_count, >=, tree->bt_leaf_cap / 2 - 1);
1957 return (hdr->bth_count);
1958 } else {
1960 zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
1961 uint64_t ret = hdr->bth_count;
1962 if (tree->bt_root != hdr && tree->bt_bulk == NULL)
1963 VERIFY3P(hdr->bth_count, >=, BTREE_CORE_ELEMS / 2 - 1);
1964 for (uint32_t i = 0; i <= hdr->bth_count; i++) {
1965 ret += zfs_btree_verify_counts_helper(tree,
1966 node->btc_children[i]);
1969 return (ret);
1974 * Verify that all nodes satisfy the invariants and that the total number of
1975 * elements is correct.
1977 static void
1978 zfs_btree_verify_counts(zfs_btree_t *tree)
1980 EQUIV(tree->bt_num_elems == 0, tree->bt_height == -1);
1981 if (tree->bt_height == -1) {
1982 return;
1984 VERIFY3P(zfs_btree_verify_counts_helper(tree, tree->bt_root), ==,
1985 tree->bt_num_elems);
1989 * Check that the subtree rooted at this node has a uniform height. Returns
1990 * the number of nodes under this node, to help verify bt_num_nodes.
1992 static uint64_t
1993 zfs_btree_verify_height_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
1994 int64_t height)
1996 if (!zfs_btree_is_core(hdr)) {
1997 VERIFY0(height);
1998 return (1);
2001 zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
2002 uint64_t ret = 1;
2003 for (uint32_t i = 0; i <= hdr->bth_count; i++) {
2004 ret += zfs_btree_verify_height_helper(tree,
2005 node->btc_children[i], height - 1);
2007 return (ret);
2011 * Check that the tree rooted at this node has a uniform height, and that the
2012 * bt_height in the tree is correct.
2014 static void
2015 zfs_btree_verify_height(zfs_btree_t *tree)
2017 EQUIV(tree->bt_height == -1, tree->bt_root == NULL);
2018 if (tree->bt_height == -1) {
2019 return;
2022 VERIFY3U(zfs_btree_verify_height_helper(tree, tree->bt_root,
2023 tree->bt_height), ==, tree->bt_num_nodes);
/*
 * Check that the elements in this node are sorted, and that if this is a core
 * node, the separators are properly between the subtrees they separate and
 * that the children also satisfy this requirement.
 */
static void
zfs_btree_verify_order_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
{
	size_t size = tree->bt_elem_size;
	if (!zfs_btree_is_core(hdr)) {
		/*
		 * Leaf: each element must compare strictly less than the
		 * one after it.  Leaf elements start at offset bth_first.
		 */
		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
		for (uint32_t i = 1; i < hdr->bth_count; i++) {
			VERIFY3S(tree->bt_compar(leaf->btl_elems +
			    (hdr->bth_first + i - 1) * size,
			    leaf->btl_elems +
			    (hdr->bth_first + i) * size), ==, -1);
		}
		return;
	}

	zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
	/* Core node: the separator elements themselves must be ascending. */
	for (uint32_t i = 1; i < hdr->bth_count; i++) {
		VERIFY3S(tree->bt_compar(node->btc_elems + (i - 1) * size,
		    node->btc_elems + i * size), ==, -1);
	}
	/*
	 * Each separator must be greater than the last element of the child
	 * to its left and less than the first element of the child to its
	 * right.
	 */
	for (uint32_t i = 0; i < hdr->bth_count; i++) {
		uint8_t *left_child_last = NULL;
		zfs_btree_hdr_t *left_child_hdr = node->btc_children[i];
		if (zfs_btree_is_core(left_child_hdr)) {
			zfs_btree_core_t *left_child =
			    (zfs_btree_core_t *)left_child_hdr;
			left_child_last = left_child->btc_elems +
			    (left_child_hdr->bth_count - 1) * size;
		} else {
			/* Leaf elements are offset by bth_first. */
			zfs_btree_leaf_t *left_child =
			    (zfs_btree_leaf_t *)left_child_hdr;
			left_child_last = left_child->btl_elems +
			    (left_child_hdr->bth_first +
			    left_child_hdr->bth_count - 1) * size;
		}
		int comp = tree->bt_compar(node->btc_elems + i * size,
		    left_child_last);
		/* Separator must compare greater than the left child's max. */
		if (comp <= 0) {
			panic("btree: compar returned %d (expected 1) at "
			    "%px %d: compar(%px, %px)", comp, node, i,
			    node->btc_elems + i * size, left_child_last);
		}

		uint8_t *right_child_first = NULL;
		zfs_btree_hdr_t *right_child_hdr = node->btc_children[i + 1];
		if (zfs_btree_is_core(right_child_hdr)) {
			zfs_btree_core_t *right_child =
			    (zfs_btree_core_t *)right_child_hdr;
			right_child_first = right_child->btc_elems;
		} else {
			/* Leaf elements are offset by bth_first. */
			zfs_btree_leaf_t *right_child =
			    (zfs_btree_leaf_t *)right_child_hdr;
			right_child_first = right_child->btl_elems +
			    right_child_hdr->bth_first * size;
		}
		comp = tree->bt_compar(node->btc_elems + i * size,
		    right_child_first);
		/* Separator must compare less than the right child's min. */
		if (comp >= 0) {
			panic("btree: compar returned %d (expected -1) at "
			    "%px %d: compar(%px, %px)", comp, node, i,
			    node->btc_elems + i * size, right_child_first);
		}
	}
	/* Recurse into every child subtree. */
	for (uint32_t i = 0; i <= hdr->bth_count; i++)
		zfs_btree_verify_order_helper(tree, node->btc_children[i]);
}
2098 /* Check that all elements in the tree are in sorted order. */
2099 static void
2100 zfs_btree_verify_order(zfs_btree_t *tree)
2102 EQUIV(tree->bt_height == -1, tree->bt_root == NULL);
2103 if (tree->bt_height == -1) {
2104 return;
2107 zfs_btree_verify_order_helper(tree, tree->bt_root);
#ifdef ZFS_DEBUG
/* Check that all unused memory is poisoned correctly. */
static void
zfs_btree_verify_poison_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
{
	size_t size = tree->bt_elem_size;

	if (!zfs_btree_is_core(hdr)) {
		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
		/* Bytes before the first live element must hold the poison. */
		for (size_t b = 0; b < hdr->bth_first * size; b++)
			VERIFY3U(leaf->btl_elems[b], ==, 0x0f);
		/* ... and so must the bytes past the last live element. */
		for (size_t b = (hdr->bth_first + hdr->bth_count) * size;
		    b < BTREE_LEAF_ESIZE; b++)
			VERIFY3U(leaf->btl_elems[b], ==, 0x0f);
		return;
	}

	zfs_btree_core_t *core = (zfs_btree_core_t *)hdr;
	/* Unused element bytes hold the poison byte. */
	for (size_t b = hdr->bth_count * size;
	    b < BTREE_CORE_ELEMS * size; b++)
		VERIFY3U(core->btc_elems[b], ==, 0x0f);

	/* Unused child slots hold the poison pointer value. */
	for (uint32_t c = hdr->bth_count + 1; c <= BTREE_CORE_ELEMS; c++) {
		VERIFY3P(core->btc_children[c], ==,
		    (zfs_btree_hdr_t *)BTREE_POISON);
	}

	/* Recurse into every live child. */
	for (uint32_t c = 0; c <= hdr->bth_count; c++) {
		zfs_btree_verify_poison_helper(tree, core->btc_children[c]);
	}
}
#endif
2143 /* Check that unused memory in the tree is still poisoned. */
2144 static void
2145 zfs_btree_verify_poison(zfs_btree_t *tree)
2147 #ifdef ZFS_DEBUG
2148 if (tree->bt_height == -1)
2149 return;
2150 zfs_btree_verify_poison_helper(tree, tree->bt_root);
2151 #endif
2154 void
2155 zfs_btree_verify(zfs_btree_t *tree)
2157 if (zfs_btree_verify_intensity == 0)
2158 return;
2159 zfs_btree_verify_height(tree);
2160 if (zfs_btree_verify_intensity == 1)
2161 return;
2162 zfs_btree_verify_pointers(tree);
2163 if (zfs_btree_verify_intensity == 2)
2164 return;
2165 zfs_btree_verify_counts(tree);
2166 if (zfs_btree_verify_intensity == 3)
2167 return;
2168 zfs_btree_verify_order(tree);
2170 if (zfs_btree_verify_intensity == 4)
2171 return;
2172 zfs_btree_verify_poison(tree);
/* BEGIN CSTYLED */
/*
 * Tunable controlling how much work zfs_btree_verify() does; 0 disables it.
 * See the intensity-level descriptions at the top of this file.
 */
ZFS_MODULE_PARAM(zfs, zfs_, btree_verify_intensity, UINT, ZMOD_RW,
	"Enable btree verification. Levels above 4 require ZFS be built "
	"with debugging");
/* END CSTYLED */