// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2011 Red Hat, Inc.
 *
 * This file is released under the GPL.
 */

#include "dm-btree-internal.h"
#include "dm-transaction-manager.h"

#include <linux/export.h>
#include <linux/device-mapper.h>

#define DM_MSG_PREFIX "btree"

/*
 * Removing an entry from a btree
 * ==============================
 *
 * A very important constraint for our btree is that no node, except the
 * root, may have fewer than a certain number of entries.
 * (MIN_ENTRIES <= nr_entries <= MAX_ENTRIES).
 *
 * Ensuring this is complicated by the way we want to only ever hold the
 * locks on 2 nodes concurrently, and only change nodes in a top to bottom
 * fashion.
 *
 * Each node may have a left or right sibling.  When descending the spine,
 * if a node contains only MIN_ENTRIES then we try to increase this to at
 * least MIN_ENTRIES + 1.  We do this in the following ways:
 *
 * [A] No siblings => this can only happen if the node is the root, in which
 *     case we copy the child's contents over the root.
 *
 * [B] No left sibling
 *     ==> rebalance(node, right sibling)
 *
 * [C] No right sibling
 *     ==> rebalance(left sibling, node)
 *
 * [D] Both siblings, total_entries(left, node, right) <= DEL_THRESHOLD
 *     ==> delete node adding its contents to left and right
 *
 * [E] Both siblings, total_entries(left, node, right) > DEL_THRESHOLD
 *     ==> rebalance(left, node, right)
 *
 * After these operations it's possible that our original node no
 * longer contains the desired subtree.  For this reason this rebalancing
 * is performed on the children of the current node.  This also avoids
 * having a special case for the root.
 *
 * Once this rebalancing has occurred we can then step into the child node
 * for internal nodes.  Or delete the entry for leaf nodes.
 */
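
/*
 * Illustration of the cases above (hypothetical numbers): if the node
 * format gives max_entries = 126, merge_threshold() below returns 42.
 * While descending towards a key, a sparsely filled child is topped up
 * before we step into it: the leftmost child uses case [B] (a two-node
 * rebalance with its right sibling), the rightmost child uses case [C],
 * and a child with both siblings takes case [D] or [E] depending on how
 * many entries the three nodes hold between them.
 */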

/*
 * Some little utilities for moving node data around.
 */
static void node_shift(struct btree_node *n, int shift)
{
	uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);
	uint32_t value_size = le32_to_cpu(n->header.value_size);

	if (shift < 0) {
		shift = -shift;
		BUG_ON(shift > nr_entries);
		BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift));
		memmove(key_ptr(n, 0),
			key_ptr(n, shift),
			(nr_entries - shift) * sizeof(__le64));
		memmove(value_ptr(n, 0),
			value_ptr(n, shift),
			(nr_entries - shift) * value_size);
	} else {
		BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries));
		memmove(key_ptr(n, shift),
			key_ptr(n, 0),
			nr_entries * sizeof(__le64));
		memmove(value_ptr(n, shift),
			value_ptr(n, 0),
			nr_entries * value_size);
	}
}
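
/*
 * Sign conventions: node_shift() above moves entries within a single
 * node; a positive 'shift' slides every entry up by 'shift' slots,
 * opening a gap at the front, while a negative 'shift' discards the
 * first -shift entries and slides the remainder down to index 0.
 * node_copy() below moves entries between siblings; a positive 'shift'
 * copies the last 'shift' entries of 'left' into the front of 'right',
 * and a negative 'shift' appends the first -shift entries of 'right'
 * onto the end of 'left'.
 */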

static int node_copy(struct btree_node *left, struct btree_node *right, int shift)
{
	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
	uint32_t value_size = le32_to_cpu(left->header.value_size);

	if (value_size != le32_to_cpu(right->header.value_size)) {
		DMERR("mismatched value size");
		return -EILSEQ;
	}

	if (shift < 0) {
		shift = -shift;

		if (nr_left + shift > le32_to_cpu(left->header.max_entries)) {
			DMERR("node shift out of bounds");
			return -EINVAL;
		}

		memcpy(key_ptr(left, nr_left),
		       key_ptr(right, 0),
		       shift * sizeof(__le64));
		memcpy(value_ptr(left, nr_left),
		       value_ptr(right, 0),
		       shift * value_size);
	} else {
		if (shift > le32_to_cpu(right->header.max_entries)) {
			DMERR("node shift out of bounds");
			return -EINVAL;
		}

		memcpy(key_ptr(right, 0),
		       key_ptr(left, nr_left - shift),
		       shift * sizeof(__le64));
		memcpy(value_ptr(right, 0),
		       value_ptr(left, nr_left - shift),
		       shift * value_size);
	}

	return 0;
}

/*
 * Delete a specific entry from a leaf node.
 */
static void delete_at(struct btree_node *n, unsigned int index)
{
	unsigned int nr_entries = le32_to_cpu(n->header.nr_entries);
	unsigned int nr_to_copy = nr_entries - (index + 1);
	uint32_t value_size = le32_to_cpu(n->header.value_size);

	BUG_ON(index >= nr_entries);

	if (nr_to_copy) {
		memmove(key_ptr(n, index),
			key_ptr(n, index + 1),
			nr_to_copy * sizeof(__le64));

		memmove(value_ptr(n, index),
			value_ptr(n, index + 1),
			nr_to_copy * value_size);
	}

	n->header.nr_entries = cpu_to_le32(nr_entries - 1);
}

static unsigned int merge_threshold(struct btree_node *n)
{
	return le32_to_cpu(n->header.max_entries) / 3;
}
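
/*
 * For example (hypothetical geometry): a node format with
 * max_entries = 126 gives a merge threshold of 42, i.e. rebalancing aims
 * to keep every non-root node above roughly a third of its capacity.
 */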

struct child {
	unsigned int index;
	struct dm_block *block;
	struct btree_node *n;
};

static int init_child(struct dm_btree_info *info, struct dm_btree_value_type *vt,
		      struct btree_node *parent,
		      unsigned int index, struct child *result)
{
	int r, inc;
	dm_block_t root;

	result->index = index;
	root = value64(parent, index);

	r = dm_tm_shadow_block(info->tm, root, &btree_node_validator,
			       &result->block, &inc);
	if (r)
		return r;

	result->n = dm_block_data(result->block);

	if (inc)
		inc_children(info->tm, result->n, vt);

	*((__le64 *) value_ptr(parent, index)) =
		cpu_to_le64(dm_block_location(result->block));

	return 0;
}

static void exit_child(struct dm_btree_info *info, struct child *c)
{
	dm_tm_unlock(info->tm, c->block);
}
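
/*
 * init_child() takes a writeable (shadowed) copy of one child of 'parent'
 * and repoints the parent's value at the child's new location;
 * exit_child() simply drops the lock on that block.  The rebalance
 * helpers below bracket all their work with these two calls.
 */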

static int shift(struct btree_node *left, struct btree_node *right, int count)
{
	int r;
	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
	uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
	uint32_t max_entries = le32_to_cpu(left->header.max_entries);
	uint32_t r_max_entries = le32_to_cpu(right->header.max_entries);

	if (max_entries != r_max_entries) {
		DMERR("node max_entries mismatch");
		return -EILSEQ;
	}

	if (nr_left - count > max_entries) {
		DMERR("node shift out of bounds");
		return -EINVAL;
	}

	if (nr_right + count > max_entries) {
		DMERR("node shift out of bounds");
		return -EINVAL;
	}

	if (!count)
		return 0;

	if (count > 0) {
		node_shift(right, count);
		r = node_copy(left, right, count);
		if (r)
			return r;
	} else {
		r = node_copy(left, right, count);
		if (r)
			return r;
		node_shift(right, count);
	}

	left->header.nr_entries = cpu_to_le32(nr_left - count);
	right->header.nr_entries = cpu_to_le32(nr_right + count);

	return 0;
}
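
/*
 * Worked example (hypothetical counts): shift(left, right, 2) with
 * nr_left = 50 and nr_right = 40 first calls node_shift(right, 2) to open
 * two slots at the front of 'right', then node_copy(left, right, 2) to
 * move the last two entries of 'left' into them, leaving 48 and 42
 * entries respectively.  A negative count moves entries from 'right' to
 * 'left', copying before shifting.
 */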

static int __rebalance2(struct dm_btree_info *info, struct btree_node *parent,
			struct child *l, struct child *r)
{
	int ret;
	struct btree_node *left = l->n;
	struct btree_node *right = r->n;
	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
	uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
	/*
	 * Ensure the number of entries in each child will be greater
	 * than or equal to (max_entries / 3 + 1), so no matter which
	 * child is used for removal, the number will still be at
	 * least (max_entries / 3).
	 */
	unsigned int threshold = 2 * (merge_threshold(left) + 1);

	if (nr_left + nr_right < threshold) {
		/*
		 * Merge the two children into the left node.
		 */
		node_copy(left, right, -nr_right);
		left->header.nr_entries = cpu_to_le32(nr_left + nr_right);
		delete_at(parent, r->index);

		/*
		 * We need to decrement the right block, but not its
		 * children, since they're still referenced by left.
		 */
		dm_tm_dec(info->tm, dm_block_location(r->block));
	} else {
		/*
		 * Rebalance: even the entries out between the two children.
		 */
		unsigned int target_left = (nr_left + nr_right) / 2;

		ret = shift(left, right, nr_left - target_left);
		if (ret)
			return ret;
		*key_ptr(parent, r->index) = right->keys[0];
	}

	return 0;
}
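
/*
 * Illustration of the threshold above (hypothetical numbers): with
 * max_entries = 126, merge_threshold() is 42 and 'threshold' is
 * 2 * (42 + 1) = 86.  Children holding 40 and 44 entries (84 in total)
 * are merged into a single node of 84; children holding 42 and 80
 * entries (122 in total) are instead evened out to 61 each, so a
 * subsequent removal from either cannot take it below 42.
 */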

static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info,
		      struct dm_btree_value_type *vt, unsigned int left_index)
{
	int r;
	struct btree_node *parent;
	struct child left, right;

	parent = dm_block_data(shadow_current(s));

	r = init_child(info, vt, parent, left_index, &left);
	if (r)
		return r;

	r = init_child(info, vt, parent, left_index + 1, &right);
	if (r) {
		exit_child(info, &left);
		return r;
	}

	r = __rebalance2(info, parent, &left, &right);

	exit_child(info, &left);
	exit_child(info, &right);

	return r;
}

/*
 * We dump as many entries from center as possible into left, then the rest
 * in right, then rebalance2.  This wastes some cpu, but I want something
 * simple for now.
 */
static int delete_center_node(struct dm_btree_info *info, struct btree_node *parent,
			      struct child *l, struct child *c, struct child *r,
			      struct btree_node *left, struct btree_node *center, struct btree_node *right,
			      uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
{
	uint32_t max_entries = le32_to_cpu(left->header.max_entries);
	unsigned int shift = min(max_entries - nr_left, nr_center);

	if (nr_left + shift > max_entries) {
		DMERR("node shift out of bounds");
		return -EINVAL;
	}

	node_copy(left, center, -shift);
	left->header.nr_entries = cpu_to_le32(nr_left + shift);

	if (shift != nr_center) {
		shift = nr_center - shift;

		if ((nr_right + shift) > max_entries) {
			DMERR("node shift out of bounds");
			return -EINVAL;
		}

		node_shift(right, shift);
		node_copy(center, right, shift);
		right->header.nr_entries = cpu_to_le32(nr_right + shift);
	}
	*key_ptr(parent, r->index) = right->keys[0];

	delete_at(parent, c->index);
	r->index--;

	dm_tm_dec(info->tm, dm_block_location(c->block));
	return __rebalance2(info, parent, l, r);
}
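
/*
 * Example (hypothetical counts, max_entries = 126): with nr_left = 60,
 * nr_center = 44 and nr_right = 60, 'shift' is min(66, 44) = 44, so the
 * whole of 'center' fits into 'left' (now 104 entries), the center child
 * is deleted from the parent, and __rebalance2() evens out the remaining
 * 104 and 60 entries.
 */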

/*
 * Redistributes entries among 3 sibling nodes.
 */
static int redistribute3(struct dm_btree_info *info, struct btree_node *parent,
			 struct child *l, struct child *c, struct child *r,
			 struct btree_node *left, struct btree_node *center, struct btree_node *right,
			 uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
{
	int s, ret;
	uint32_t max_entries = le32_to_cpu(left->header.max_entries);
	unsigned int total = nr_left + nr_center + nr_right;
	unsigned int target_right = total / 3;
	unsigned int remainder = (target_right * 3) != total;
	unsigned int target_left = target_right + remainder;

	BUG_ON(target_left > max_entries);
	BUG_ON(target_right > max_entries);

	if (nr_left < nr_right) {
		s = nr_left - target_left;

		if (s < 0 && nr_center < -s) {
			/* not enough in central node */
			ret = shift(left, center, -nr_center);
			if (ret)
				return ret;

			s += nr_center;
			ret = shift(left, right, s);
			if (ret)
				return ret;

			nr_right += s;
		} else {
			ret = shift(left, center, s);
			if (ret)
				return ret;
		}

		ret = shift(center, right, target_right - nr_right);
		if (ret)
			return ret;
	} else {
		s = target_right - nr_right;
		if (s > 0 && nr_center < s) {
			/* not enough in central node */
			ret = shift(center, right, nr_center);
			if (ret)
				return ret;
			s -= nr_center;
			ret = shift(left, right, s);
			if (ret)
				return ret;
			nr_left -= s;
		} else {
			ret = shift(center, right, s);
			if (ret)
				return ret;
		}

		ret = shift(left, center, nr_left - target_left);
		if (ret)
			return ret;
	}

	*key_ptr(parent, c->index) = center->keys[0];
	*key_ptr(parent, r->index) = right->keys[0];

	return 0;
}
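
/*
 * Example of the target split (hypothetical counts): a total of 130
 * entries across the three nodes gives target_right = 43, remainder = 1
 * and target_left = 44, so for this total the nodes end up holding 44,
 * 43 and 43 entries.
 */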

static int __rebalance3(struct dm_btree_info *info, struct btree_node *parent,
			struct child *l, struct child *c, struct child *r)
{
	struct btree_node *left = l->n;
	struct btree_node *center = c->n;
	struct btree_node *right = r->n;

	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
	uint32_t nr_center = le32_to_cpu(center->header.nr_entries);
	uint32_t nr_right = le32_to_cpu(right->header.nr_entries);

	unsigned int threshold = merge_threshold(left) * 4 + 1;

	if ((left->header.max_entries != center->header.max_entries) ||
	    (center->header.max_entries != right->header.max_entries)) {
		DMERR("bad btree metadata, max_entries differ");
		return -EILSEQ;
	}

	if ((nr_left + nr_center + nr_right) < threshold) {
		return delete_center_node(info, parent, l, c, r, left, center, right,
					  nr_left, nr_center, nr_right);
	}

	return redistribute3(info, parent, l, c, r, left, center, right,
			     nr_left, nr_center, nr_right);
}

static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info,
		      struct dm_btree_value_type *vt, unsigned int left_index)
{
	int r;
	struct btree_node *parent = dm_block_data(shadow_current(s));
	struct child left, center, right;

	/*
	 * FIXME: fill out an array?
	 */
	r = init_child(info, vt, parent, left_index, &left);
	if (r)
		return r;

	r = init_child(info, vt, parent, left_index + 1, &center);
	if (r) {
		exit_child(info, &left);
		return r;
	}

	r = init_child(info, vt, parent, left_index + 2, &right);
	if (r) {
		exit_child(info, &left);
		exit_child(info, &center);
		return r;
	}

	r = __rebalance3(info, parent, &left, &center, &right);

	exit_child(info, &left);
	exit_child(info, &center);
	exit_child(info, &right);

	return r;
}

static int rebalance_children(struct shadow_spine *s,
			      struct dm_btree_info *info,
			      struct dm_btree_value_type *vt, uint64_t key)
{
	int i, r, has_left_sibling, has_right_sibling;
	struct btree_node *n;

	n = dm_block_data(shadow_current(s));

	if (le32_to_cpu(n->header.nr_entries) == 1) {
		struct dm_block *child;
		dm_block_t b = value64(n, 0);

		r = dm_tm_read_lock(info->tm, b, &btree_node_validator, &child);
		if (r)
			return r;

		memcpy(n, dm_block_data(child),
		       dm_bm_block_size(dm_tm_get_bm(info->tm)));

		dm_tm_dec(info->tm, dm_block_location(child));
		dm_tm_unlock(info->tm, child);
		return 0;
	}

	i = lower_bound(n, key);
	if (i < 0)
		return -ENODATA;

	has_left_sibling = i > 0;
	has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1);

	if (!has_left_sibling)
		r = rebalance2(s, info, vt, i);

	else if (!has_right_sibling)
		r = rebalance2(s, info, vt, i - 1);

	else
		r = rebalance3(s, info, vt, i - 1);

	return r;
}
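
/*
 * The sibling checks above map onto the cases described at the top of
 * this file: a parent collapsing into its only child is case [A], a
 * missing left or right sibling gives the two-node rebalance of cases
 * [B] and [C], and a node with both siblings takes the three-node path
 * of cases [D] and [E].
 */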

static int do_leaf(struct btree_node *n, uint64_t key, unsigned int *index)
{
	int i = lower_bound(n, key);

	if ((i < 0) ||
	    (i >= le32_to_cpu(n->header.nr_entries)) ||
	    (le64_to_cpu(n->keys[i]) != key))
		return -ENODATA;

	*index = i;

	return 0;
}

/*
 * Prepares for removal from one level of the hierarchy.  The caller must
 * call delete_at() to remove the entry at index.
 */
static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
		      struct dm_btree_value_type *vt, dm_block_t root,
		      uint64_t key, unsigned int *index)
{
	int i = *index, r;
	struct btree_node *n;

	for (;;) {
		r = shadow_step(s, root, vt);
		if (r < 0)
			break;

		/*
		 * We have to patch up the parent node, ugly, but I don't
		 * see a way to do this automatically as part of the spine
		 * op.
		 */
		if (shadow_has_parent(s)) {
			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));

			memcpy(value_ptr(dm_block_data(shadow_parent(s)), i),
			       &location, sizeof(__le64));
		}

		n = dm_block_data(shadow_current(s));

		if (le32_to_cpu(n->header.flags) & LEAF_NODE)
			return do_leaf(n, key, index);

		r = rebalance_children(s, info, vt, key);
		if (r)
			break;

		n = dm_block_data(shadow_current(s));
		if (le32_to_cpu(n->header.flags) & LEAF_NODE)
			return do_leaf(n, key, index);

		i = lower_bound(n, key);

		/*
		 * We know the key is present, or else
		 * rebalance_children would have returned
		 * -ENODATA
		 */
		root = value64(n, i);
	}

	return r;
}

int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, dm_block_t *new_root)
{
	unsigned int level, last_level = info->levels - 1;
	int index = 0, r = 0;
	struct shadow_spine spine;
	struct btree_node *n;
	struct dm_btree_value_type le64_vt;

	init_le64_type(info->tm, &le64_vt);
	init_shadow_spine(&spine, info);
	for (level = 0; level < info->levels; level++) {
		r = remove_raw(&spine, info,
			       (level == last_level ?
				&info->value_type : &le64_vt),
			       root, keys[level], (unsigned int *)&index);
		if (r < 0)
			break;

		n = dm_block_data(shadow_current(&spine));
		if (level != last_level) {
			root = value64(n, index);
			continue;
		}

		BUG_ON(index < 0 || index >= le32_to_cpu(n->header.nr_entries));

		if (info->value_type.dec)
			info->value_type.dec(info->value_type.context,
					     value_ptr(n, index), 1);

		delete_at(n, index);
	}

	if (!r)
		*new_root = shadow_root(&spine);
	exit_shadow_spine(&spine);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_remove);
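
/*
 * Illustrative call sequence (hypothetical variables, single-level tree):
 *
 *	uint64_t key = 1234;
 *	dm_block_t new_root;
 *
 *	r = dm_btree_remove(info, root, &key, &new_root);
 *	if (!r)
 *		root = new_root;
 *
 * For a multi-level tree the 'keys' argument must supply info->levels
 * keys, one per level.
 */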

/*----------------------------------------------------------------*/

static int remove_nearest(struct shadow_spine *s, struct dm_btree_info *info,
			  struct dm_btree_value_type *vt, dm_block_t root,
			  uint64_t key, int *index)
{
	int i = *index, r;
	struct btree_node *n;

	for (;;) {
		r = shadow_step(s, root, vt);
		if (r < 0)
			break;

		/*
		 * We have to patch up the parent node, ugly, but I don't
		 * see a way to do this automatically as part of the spine
		 * op.
		 */
		if (shadow_has_parent(s)) {
			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));

			memcpy(value_ptr(dm_block_data(shadow_parent(s)), i),
			       &location, sizeof(__le64));
		}

		n = dm_block_data(shadow_current(s));

		if (le32_to_cpu(n->header.flags) & LEAF_NODE) {
			*index = lower_bound(n, key);
			return 0;
		}

		r = rebalance_children(s, info, vt, key);
		if (r)
			break;

		n = dm_block_data(shadow_current(s));
		if (le32_to_cpu(n->header.flags) & LEAF_NODE) {
			*index = lower_bound(n, key);
			return 0;
		}

		i = lower_bound(n, key);

		/*
		 * We know the key is present, or else
		 * rebalance_children would have returned
		 * -ENODATA
		 */
		root = value64(n, i);
	}

	return r;
}

static int remove_one(struct dm_btree_info *info, dm_block_t root,
		      uint64_t *keys, uint64_t end_key,
		      dm_block_t *new_root, unsigned int *nr_removed)
{
	unsigned int level, last_level = info->levels - 1;
	int index = 0, r = 0;
	struct shadow_spine spine;
	struct btree_node *n;
	struct dm_btree_value_type le64_vt;
	uint64_t k;

	init_le64_type(info->tm, &le64_vt);
	init_shadow_spine(&spine, info);
	for (level = 0; level < last_level; level++) {
		r = remove_raw(&spine, info, &le64_vt,
			       root, keys[level], (unsigned int *) &index);
		if (r < 0)
			goto out;

		n = dm_block_data(shadow_current(&spine));
		root = value64(n, index);
	}

	r = remove_nearest(&spine, info, &info->value_type,
			   root, keys[last_level], &index);
	if (r < 0)
		goto out;

	n = dm_block_data(shadow_current(&spine));

	if (index < 0)
		index = 0;

	if (index >= le32_to_cpu(n->header.nr_entries)) {
		r = -ENODATA;
		goto out;
	}

	k = le64_to_cpu(n->keys[index]);
	if (k >= keys[last_level] && k < end_key) {
		if (info->value_type.dec)
			info->value_type.dec(info->value_type.context,
					     value_ptr(n, index), 1);

		delete_at(n, index);
		keys[last_level] = k + 1ull;
	} else
		r = -ENODATA;

out:
	*new_root = shadow_root(&spine);
	exit_shadow_spine(&spine);

	return r;
}

int dm_btree_remove_leaves(struct dm_btree_info *info, dm_block_t root,
			   uint64_t *first_key, uint64_t end_key,
			   dm_block_t *new_root, unsigned int *nr_removed)
{
	int r;

	*nr_removed = 0;
	do {
		r = remove_one(info, root, first_key, end_key, &root, nr_removed);
		if (!r)
			(*nr_removed)++;
	} while (!r);

	*new_root = root;
	return r == -ENODATA ? 0 : r;
}
EXPORT_SYMBOL_GPL(dm_btree_remove_leaves);
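
/*
 * Illustrative usage (hypothetical variables, info->levels == 1): to drop
 * every leaf whose key lies in [begin, end):
 *
 *	unsigned int nr_removed;
 *	uint64_t key = begin;
 *
 *	r = dm_btree_remove_leaves(info, root, &key, end, &root, &nr_removed);
 *
 * On return 'key' has been advanced past the last key removed and
 * nr_removed counts the deleted leaf entries.
 */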