2 * Copyright (C) 2011 Red Hat, Inc.
4 * This file is released under the GPL.
8 #include "dm-btree-internal.h"
9 #include "dm-transaction-manager.h"
11 #include <linux/export.h>
14 * Removing an entry from a btree
15 * ==============================
17 * A very important constraint for our btree is that no node, except the
18 * root, may have fewer than a certain number of entries.
19 * (MIN_ENTRIES <= nr_entries <= MAX_ENTRIES).
21 * Ensuring this is complicated by the way we want to only ever hold the
22 * locks on 2 nodes concurrently, and only change nodes in a top to bottom
25 * Each node may have a left or right sibling. When decending the spine,
26 * if a node contains only MIN_ENTRIES then we try and increase this to at
27 * least MIN_ENTRIES + 1. We do this in the following ways:
29 * [A] No siblings => this can only happen if the node is the root, in which
30 * case we copy the childs contents over the root.
33 * ==> rebalance(node, right sibling)
35 * [C] No right sibling
36 * ==> rebalance(left sibling, node)
38 * [D] Both siblings, total_entries(left, node, right) <= DEL_THRESHOLD
39 * ==> delete node adding it's contents to left and right
41 * [E] Both siblings, total_entries(left, node, right) > DEL_THRESHOLD
42 * ==> rebalance(left, node, right)
44 * After these operations it's possible that the our original node no
45 * longer contains the desired sub tree. For this reason this rebalancing
46 * is performed on the children of the current node. This also avoids
47 * having a special case for the root.
49 * Once this rebalancing has occurred we can then step into the child node
50 * for internal nodes. Or delete the entry for leaf nodes.
54 * Some little utilities for moving node data around.
56 static void node_shift(struct btree_node
*n
, int shift
)
58 uint32_t nr_entries
= le32_to_cpu(n
->header
.nr_entries
);
59 uint32_t value_size
= le32_to_cpu(n
->header
.value_size
);
63 BUG_ON(shift
> nr_entries
);
64 BUG_ON((void *) key_ptr(n
, shift
) >= value_ptr(n
, shift
));
65 memmove(key_ptr(n
, 0),
67 (nr_entries
- shift
) * sizeof(__le64
));
68 memmove(value_ptr(n
, 0),
70 (nr_entries
- shift
) * value_size
);
72 BUG_ON(nr_entries
+ shift
> le32_to_cpu(n
->header
.max_entries
));
73 memmove(key_ptr(n
, shift
),
75 nr_entries
* sizeof(__le64
));
76 memmove(value_ptr(n
, shift
),
78 nr_entries
* value_size
);
82 static void node_copy(struct btree_node
*left
, struct btree_node
*right
, int shift
)
84 uint32_t nr_left
= le32_to_cpu(left
->header
.nr_entries
);
85 uint32_t value_size
= le32_to_cpu(left
->header
.value_size
);
86 BUG_ON(value_size
!= le32_to_cpu(right
->header
.value_size
));
90 BUG_ON(nr_left
+ shift
> le32_to_cpu(left
->header
.max_entries
));
91 memcpy(key_ptr(left
, nr_left
),
93 shift
* sizeof(__le64
));
94 memcpy(value_ptr(left
, nr_left
),
98 BUG_ON(shift
> le32_to_cpu(right
->header
.max_entries
));
99 memcpy(key_ptr(right
, 0),
100 key_ptr(left
, nr_left
- shift
),
101 shift
* sizeof(__le64
));
102 memcpy(value_ptr(right
, 0),
103 value_ptr(left
, nr_left
- shift
),
109 * Delete a specific entry from a leaf node.
111 static void delete_at(struct btree_node
*n
, unsigned index
)
113 unsigned nr_entries
= le32_to_cpu(n
->header
.nr_entries
);
114 unsigned nr_to_copy
= nr_entries
- (index
+ 1);
115 uint32_t value_size
= le32_to_cpu(n
->header
.value_size
);
116 BUG_ON(index
>= nr_entries
);
119 memmove(key_ptr(n
, index
),
120 key_ptr(n
, index
+ 1),
121 nr_to_copy
* sizeof(__le64
));
123 memmove(value_ptr(n
, index
),
124 value_ptr(n
, index
+ 1),
125 nr_to_copy
* value_size
);
128 n
->header
.nr_entries
= cpu_to_le32(nr_entries
- 1);
131 static unsigned merge_threshold(struct btree_node
*n
)
133 return le32_to_cpu(n
->header
.max_entries
) / 3;
138 struct dm_block
*block
;
139 struct btree_node
*n
;
142 static int init_child(struct dm_btree_info
*info
, struct dm_btree_value_type
*vt
,
143 struct btree_node
*parent
,
144 unsigned index
, struct child
*result
)
149 result
->index
= index
;
150 root
= value64(parent
, index
);
152 r
= dm_tm_shadow_block(info
->tm
, root
, &btree_node_validator
,
153 &result
->block
, &inc
);
157 result
->n
= dm_block_data(result
->block
);
160 inc_children(info
->tm
, result
->n
, vt
);
162 *((__le64
*) value_ptr(parent
, index
)) =
163 cpu_to_le64(dm_block_location(result
->block
));
168 static void exit_child(struct dm_btree_info
*info
, struct child
*c
)
170 dm_tm_unlock(info
->tm
, c
->block
);
173 static void shift(struct btree_node
*left
, struct btree_node
*right
, int count
)
175 uint32_t nr_left
= le32_to_cpu(left
->header
.nr_entries
);
176 uint32_t nr_right
= le32_to_cpu(right
->header
.nr_entries
);
177 uint32_t max_entries
= le32_to_cpu(left
->header
.max_entries
);
178 uint32_t r_max_entries
= le32_to_cpu(right
->header
.max_entries
);
180 BUG_ON(max_entries
!= r_max_entries
);
181 BUG_ON(nr_left
- count
> max_entries
);
182 BUG_ON(nr_right
+ count
> max_entries
);
188 node_shift(right
, count
);
189 node_copy(left
, right
, count
);
191 node_copy(left
, right
, count
);
192 node_shift(right
, count
);
195 left
->header
.nr_entries
= cpu_to_le32(nr_left
- count
);
196 right
->header
.nr_entries
= cpu_to_le32(nr_right
+ count
);
199 static void __rebalance2(struct dm_btree_info
*info
, struct btree_node
*parent
,
200 struct child
*l
, struct child
*r
)
202 struct btree_node
*left
= l
->n
;
203 struct btree_node
*right
= r
->n
;
204 uint32_t nr_left
= le32_to_cpu(left
->header
.nr_entries
);
205 uint32_t nr_right
= le32_to_cpu(right
->header
.nr_entries
);
207 * Ensure the number of entries in each child will be greater
208 * than or equal to (max_entries / 3 + 1), so no matter which
209 * child is used for removal, the number will still be not
210 * less than (max_entries / 3).
212 unsigned int threshold
= 2 * (merge_threshold(left
) + 1);
214 if (nr_left
+ nr_right
< threshold
) {
218 node_copy(left
, right
, -nr_right
);
219 left
->header
.nr_entries
= cpu_to_le32(nr_left
+ nr_right
);
220 delete_at(parent
, r
->index
);
223 * We need to decrement the right block, but not it's
224 * children, since they're still referenced by left.
226 dm_tm_dec(info
->tm
, dm_block_location(r
->block
));
231 unsigned target_left
= (nr_left
+ nr_right
) / 2;
232 shift(left
, right
, nr_left
- target_left
);
233 *key_ptr(parent
, r
->index
) = right
->keys
[0];
237 static int rebalance2(struct shadow_spine
*s
, struct dm_btree_info
*info
,
238 struct dm_btree_value_type
*vt
, unsigned left_index
)
241 struct btree_node
*parent
;
242 struct child left
, right
;
244 parent
= dm_block_data(shadow_current(s
));
246 r
= init_child(info
, vt
, parent
, left_index
, &left
);
250 r
= init_child(info
, vt
, parent
, left_index
+ 1, &right
);
252 exit_child(info
, &left
);
256 __rebalance2(info
, parent
, &left
, &right
);
258 exit_child(info
, &left
);
259 exit_child(info
, &right
);
265 * We dump as many entries from center as possible into left, then the rest
266 * in right, then rebalance2. This wastes some cpu, but I want something
269 static void delete_center_node(struct dm_btree_info
*info
, struct btree_node
*parent
,
270 struct child
*l
, struct child
*c
, struct child
*r
,
271 struct btree_node
*left
, struct btree_node
*center
, struct btree_node
*right
,
272 uint32_t nr_left
, uint32_t nr_center
, uint32_t nr_right
)
274 uint32_t max_entries
= le32_to_cpu(left
->header
.max_entries
);
275 unsigned shift
= min(max_entries
- nr_left
, nr_center
);
277 BUG_ON(nr_left
+ shift
> max_entries
);
278 node_copy(left
, center
, -shift
);
279 left
->header
.nr_entries
= cpu_to_le32(nr_left
+ shift
);
281 if (shift
!= nr_center
) {
282 shift
= nr_center
- shift
;
283 BUG_ON((nr_right
+ shift
) > max_entries
);
284 node_shift(right
, shift
);
285 node_copy(center
, right
, shift
);
286 right
->header
.nr_entries
= cpu_to_le32(nr_right
+ shift
);
288 *key_ptr(parent
, r
->index
) = right
->keys
[0];
290 delete_at(parent
, c
->index
);
293 dm_tm_dec(info
->tm
, dm_block_location(c
->block
));
294 __rebalance2(info
, parent
, l
, r
);
298 * Redistributes entries among 3 sibling nodes.
300 static void redistribute3(struct dm_btree_info
*info
, struct btree_node
*parent
,
301 struct child
*l
, struct child
*c
, struct child
*r
,
302 struct btree_node
*left
, struct btree_node
*center
, struct btree_node
*right
,
303 uint32_t nr_left
, uint32_t nr_center
, uint32_t nr_right
)
306 uint32_t max_entries
= le32_to_cpu(left
->header
.max_entries
);
307 unsigned total
= nr_left
+ nr_center
+ nr_right
;
308 unsigned target_right
= total
/ 3;
309 unsigned remainder
= (target_right
* 3) != total
;
310 unsigned target_left
= target_right
+ remainder
;
312 BUG_ON(target_left
> max_entries
);
313 BUG_ON(target_right
> max_entries
);
315 if (nr_left
< nr_right
) {
316 s
= nr_left
- target_left
;
318 if (s
< 0 && nr_center
< -s
) {
319 /* not enough in central node */
320 shift(left
, center
, -nr_center
);
322 shift(left
, right
, s
);
325 shift(left
, center
, s
);
327 shift(center
, right
, target_right
- nr_right
);
330 s
= target_right
- nr_right
;
331 if (s
> 0 && nr_center
< s
) {
332 /* not enough in central node */
333 shift(center
, right
, nr_center
);
335 shift(left
, right
, s
);
338 shift(center
, right
, s
);
340 shift(left
, center
, nr_left
- target_left
);
343 *key_ptr(parent
, c
->index
) = center
->keys
[0];
344 *key_ptr(parent
, r
->index
) = right
->keys
[0];
347 static void __rebalance3(struct dm_btree_info
*info
, struct btree_node
*parent
,
348 struct child
*l
, struct child
*c
, struct child
*r
)
350 struct btree_node
*left
= l
->n
;
351 struct btree_node
*center
= c
->n
;
352 struct btree_node
*right
= r
->n
;
354 uint32_t nr_left
= le32_to_cpu(left
->header
.nr_entries
);
355 uint32_t nr_center
= le32_to_cpu(center
->header
.nr_entries
);
356 uint32_t nr_right
= le32_to_cpu(right
->header
.nr_entries
);
358 unsigned threshold
= merge_threshold(left
) * 4 + 1;
360 BUG_ON(left
->header
.max_entries
!= center
->header
.max_entries
);
361 BUG_ON(center
->header
.max_entries
!= right
->header
.max_entries
);
363 if ((nr_left
+ nr_center
+ nr_right
) < threshold
)
364 delete_center_node(info
, parent
, l
, c
, r
, left
, center
, right
,
365 nr_left
, nr_center
, nr_right
);
367 redistribute3(info
, parent
, l
, c
, r
, left
, center
, right
,
368 nr_left
, nr_center
, nr_right
);
371 static int rebalance3(struct shadow_spine
*s
, struct dm_btree_info
*info
,
372 struct dm_btree_value_type
*vt
, unsigned left_index
)
375 struct btree_node
*parent
= dm_block_data(shadow_current(s
));
376 struct child left
, center
, right
;
379 * FIXME: fill out an array?
381 r
= init_child(info
, vt
, parent
, left_index
, &left
);
385 r
= init_child(info
, vt
, parent
, left_index
+ 1, ¢er
);
387 exit_child(info
, &left
);
391 r
= init_child(info
, vt
, parent
, left_index
+ 2, &right
);
393 exit_child(info
, &left
);
394 exit_child(info
, ¢er
);
398 __rebalance3(info
, parent
, &left
, ¢er
, &right
);
400 exit_child(info
, &left
);
401 exit_child(info
, ¢er
);
402 exit_child(info
, &right
);
407 static int rebalance_children(struct shadow_spine
*s
,
408 struct dm_btree_info
*info
,
409 struct dm_btree_value_type
*vt
, uint64_t key
)
411 int i
, r
, has_left_sibling
, has_right_sibling
;
412 struct btree_node
*n
;
414 n
= dm_block_data(shadow_current(s
));
416 if (le32_to_cpu(n
->header
.nr_entries
) == 1) {
417 struct dm_block
*child
;
418 dm_block_t b
= value64(n
, 0);
420 r
= dm_tm_read_lock(info
->tm
, b
, &btree_node_validator
, &child
);
424 memcpy(n
, dm_block_data(child
),
425 dm_bm_block_size(dm_tm_get_bm(info
->tm
)));
426 dm_tm_unlock(info
->tm
, child
);
428 dm_tm_dec(info
->tm
, dm_block_location(child
));
432 i
= lower_bound(n
, key
);
436 has_left_sibling
= i
> 0;
437 has_right_sibling
= i
< (le32_to_cpu(n
->header
.nr_entries
) - 1);
439 if (!has_left_sibling
)
440 r
= rebalance2(s
, info
, vt
, i
);
442 else if (!has_right_sibling
)
443 r
= rebalance2(s
, info
, vt
, i
- 1);
446 r
= rebalance3(s
, info
, vt
, i
- 1);
451 static int do_leaf(struct btree_node
*n
, uint64_t key
, unsigned *index
)
453 int i
= lower_bound(n
, key
);
456 (i
>= le32_to_cpu(n
->header
.nr_entries
)) ||
457 (le64_to_cpu(n
->keys
[i
]) != key
))
466 * Prepares for removal from one level of the hierarchy. The caller must
467 * call delete_at() to remove the entry at index.
469 static int remove_raw(struct shadow_spine
*s
, struct dm_btree_info
*info
,
470 struct dm_btree_value_type
*vt
, dm_block_t root
,
471 uint64_t key
, unsigned *index
)
474 struct btree_node
*n
;
477 r
= shadow_step(s
, root
, vt
);
482 * We have to patch up the parent node, ugly, but I don't
483 * see a way to do this automatically as part of the spine
486 if (shadow_has_parent(s
)) {
487 __le64 location
= cpu_to_le64(dm_block_location(shadow_current(s
)));
488 memcpy(value_ptr(dm_block_data(shadow_parent(s
)), i
),
489 &location
, sizeof(__le64
));
492 n
= dm_block_data(shadow_current(s
));
494 if (le32_to_cpu(n
->header
.flags
) & LEAF_NODE
)
495 return do_leaf(n
, key
, index
);
497 r
= rebalance_children(s
, info
, vt
, key
);
501 n
= dm_block_data(shadow_current(s
));
502 if (le32_to_cpu(n
->header
.flags
) & LEAF_NODE
)
503 return do_leaf(n
, key
, index
);
505 i
= lower_bound(n
, key
);
508 * We know the key is present, or else
509 * rebalance_children would have returned
512 root
= value64(n
, i
);
518 int dm_btree_remove(struct dm_btree_info
*info
, dm_block_t root
,
519 uint64_t *keys
, dm_block_t
*new_root
)
521 unsigned level
, last_level
= info
->levels
- 1;
522 int index
= 0, r
= 0;
523 struct shadow_spine spine
;
524 struct btree_node
*n
;
525 struct dm_btree_value_type le64_vt
;
527 init_le64_type(info
->tm
, &le64_vt
);
528 init_shadow_spine(&spine
, info
);
529 for (level
= 0; level
< info
->levels
; level
++) {
530 r
= remove_raw(&spine
, info
,
531 (level
== last_level
?
532 &info
->value_type
: &le64_vt
),
533 root
, keys
[level
], (unsigned *)&index
);
537 n
= dm_block_data(shadow_current(&spine
));
538 if (level
!= last_level
) {
539 root
= value64(n
, index
);
543 BUG_ON(index
< 0 || index
>= le32_to_cpu(n
->header
.nr_entries
));
545 if (info
->value_type
.dec
)
546 info
->value_type
.dec(info
->value_type
.context
,
547 value_ptr(n
, index
));
552 *new_root
= shadow_root(&spine
);
553 exit_shadow_spine(&spine
);
557 EXPORT_SYMBOL_GPL(dm_btree_remove
);
559 /*----------------------------------------------------------------*/
561 static int remove_nearest(struct shadow_spine
*s
, struct dm_btree_info
*info
,
562 struct dm_btree_value_type
*vt
, dm_block_t root
,
563 uint64_t key
, int *index
)
566 struct btree_node
*n
;
569 r
= shadow_step(s
, root
, vt
);
574 * We have to patch up the parent node, ugly, but I don't
575 * see a way to do this automatically as part of the spine
578 if (shadow_has_parent(s
)) {
579 __le64 location
= cpu_to_le64(dm_block_location(shadow_current(s
)));
580 memcpy(value_ptr(dm_block_data(shadow_parent(s
)), i
),
581 &location
, sizeof(__le64
));
584 n
= dm_block_data(shadow_current(s
));
586 if (le32_to_cpu(n
->header
.flags
) & LEAF_NODE
) {
587 *index
= lower_bound(n
, key
);
591 r
= rebalance_children(s
, info
, vt
, key
);
595 n
= dm_block_data(shadow_current(s
));
596 if (le32_to_cpu(n
->header
.flags
) & LEAF_NODE
) {
597 *index
= lower_bound(n
, key
);
601 i
= lower_bound(n
, key
);
604 * We know the key is present, or else
605 * rebalance_children would have returned
608 root
= value64(n
, i
);
614 static int remove_one(struct dm_btree_info
*info
, dm_block_t root
,
615 uint64_t *keys
, uint64_t end_key
,
616 dm_block_t
*new_root
, unsigned *nr_removed
)
618 unsigned level
, last_level
= info
->levels
- 1;
619 int index
= 0, r
= 0;
620 struct shadow_spine spine
;
621 struct btree_node
*n
;
622 struct dm_btree_value_type le64_vt
;
625 init_le64_type(info
->tm
, &le64_vt
);
626 init_shadow_spine(&spine
, info
);
627 for (level
= 0; level
< last_level
; level
++) {
628 r
= remove_raw(&spine
, info
, &le64_vt
,
629 root
, keys
[level
], (unsigned *) &index
);
633 n
= dm_block_data(shadow_current(&spine
));
634 root
= value64(n
, index
);
637 r
= remove_nearest(&spine
, info
, &info
->value_type
,
638 root
, keys
[last_level
], &index
);
642 n
= dm_block_data(shadow_current(&spine
));
647 if (index
>= le32_to_cpu(n
->header
.nr_entries
)) {
652 k
= le64_to_cpu(n
->keys
[index
]);
653 if (k
>= keys
[last_level
] && k
< end_key
) {
654 if (info
->value_type
.dec
)
655 info
->value_type
.dec(info
->value_type
.context
,
656 value_ptr(n
, index
));
659 keys
[last_level
] = k
+ 1ull;
665 *new_root
= shadow_root(&spine
);
666 exit_shadow_spine(&spine
);
671 int dm_btree_remove_leaves(struct dm_btree_info
*info
, dm_block_t root
,
672 uint64_t *first_key
, uint64_t end_key
,
673 dm_block_t
*new_root
, unsigned *nr_removed
)
679 r
= remove_one(info
, root
, first_key
, end_key
, &root
, nr_removed
);
685 return r
== -ENODATA
? 0 : r
;
687 EXPORT_SYMBOL_GPL(dm_btree_remove_leaves
);