/*
 * Copyright (C) 2011 Red Hat, Inc. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include "dm-btree.h"
#include "dm-btree-internal.h"
#include "dm-transaction-manager.h"

#include <linux/export.h>
/*
 * Removing an entry from a btree
 * ==============================
 *
 * A very important constraint for our btree is that no node, except the
 * root, may have fewer than a certain number of entries.
 * (MIN_ENTRIES <= nr_entries <= MAX_ENTRIES).
 *
 * Ensuring this is complicated by the way we want to only ever hold the
 * locks on 2 nodes concurrently, and only change nodes in a top to bottom
 * fashion.
 *
 * Each node may have a left or right sibling.  When descending the spine,
 * if a node contains only MIN_ENTRIES then we try to increase this to at
 * least MIN_ENTRIES + 1.  We do this in the following ways:
 *
 * [A] No siblings => this can only happen if the node is the root, in which
 *     case we copy the child's contents over the root.
 *
 * [B] No left sibling
 *     ==> rebalance(node, right sibling)
 *
 * [C] No right sibling
 *     ==> rebalance(left sibling, node)
 *
 * [D] Both siblings, total_entries(left, node, right) <= DEL_THRESHOLD
 *     ==> delete node, adding its contents to left and right
 *
 * [E] Both siblings, total_entries(left, node, right) > DEL_THRESHOLD
 *     ==> rebalance(left, node, right)
 *
 * After these operations it's possible that our original node no
 * longer contains the desired sub tree.  For this reason this rebalancing
 * is performed on the children of the current node.  This also avoids
 * having a special case for the root.
 *
 * Once this rebalancing has occurred we can then step into the child node
 * for internal nodes, or delete the entry for leaf nodes.
 */

/*
 * Some little utilities for moving node data around.
 */
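/*
 * node_shift() moves entries within a single node: a positive shift opens
 * a gap of 'shift' entries at the front, a negative shift discards the
 * first entries and moves the remainder down.  node_copy() moves entries
 * between siblings: a negative shift appends the first entries of 'right'
 * onto 'left', a positive shift copies the tail of 'left' to the front of
 * 'right'.  Neither helper updates nr_entries; callers do that.
 */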
static void node_shift(struct node *n, int shift)
{
	uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);

	if (shift < 0) {
		shift = -shift;
		memmove(key_ptr(n, 0),
			key_ptr(n, shift),
			(nr_entries - shift) * sizeof(__le64));
		memmove(value_ptr(n, 0, sizeof(__le64)),
			value_ptr(n, shift, sizeof(__le64)),
			(nr_entries - shift) * sizeof(__le64));
	} else {
		memmove(key_ptr(n, shift),
			key_ptr(n, 0),
			nr_entries * sizeof(__le64));
		memmove(value_ptr(n, shift, sizeof(__le64)),
			value_ptr(n, 0, sizeof(__le64)),
			nr_entries * sizeof(__le64));
	}
}
static void node_copy(struct node *left, struct node *right, int shift)
{
	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);

	if (shift < 0) {
		shift = -shift;
		memcpy(key_ptr(left, nr_left),
		       key_ptr(right, 0),
		       shift * sizeof(__le64));
		memcpy(value_ptr(left, nr_left, sizeof(__le64)),
		       value_ptr(right, 0, sizeof(__le64)),
		       shift * sizeof(__le64));
	} else {
		memcpy(key_ptr(right, 0),
		       key_ptr(left, nr_left - shift),
		       shift * sizeof(__le64));
		memcpy(value_ptr(right, 0, sizeof(__le64)),
		       value_ptr(left, nr_left - shift, sizeof(__le64)),
		       shift * sizeof(__le64));
	}
}
/*
 * Delete a specific entry from a leaf node.
 */
static void delete_at(struct node *n, unsigned index, size_t value_size)
{
	unsigned nr_entries = le32_to_cpu(n->header.nr_entries);
	unsigned nr_to_copy = nr_entries - (index + 1);

	if (nr_to_copy) {
		memmove(key_ptr(n, index),
			key_ptr(n, index + 1),
			nr_to_copy * sizeof(__le64));

		memmove(value_ptr(n, index, value_size),
			value_ptr(n, index + 1, value_size),
			nr_to_copy * value_size);
	}

	n->header.nr_entries = cpu_to_le32(nr_entries - 1);
}
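/*
 * Children at or below del_threshold() entries are topped up or merged
 * by rebalance_children() below.
 */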
static unsigned del_threshold(struct node *n)
{
	return le32_to_cpu(n->header.max_entries) / 3;
}

static unsigned merge_threshold(struct node *n)
{
	/*
	 * The extra one is because we know we're potentially going to
	 * delete an entry.
	 */
	return 2 * (le32_to_cpu(n->header.max_entries) / 3) + 1;
}
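/*
 * A shadowed child node, together with its index within the parent.
 */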
struct child {
	unsigned index;
	struct dm_block *block;
	struct node *n;
};

static struct dm_btree_value_type le64_type = {
	.context = NULL,
	.size = sizeof(__le64),
	.inc = NULL,
	.dec = NULL,
	.equal = NULL
};
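/*
 * init_child() shadows one child of 'parent' so that it can be modified,
 * incrementing reference counts on the child's own children when the
 * shadow is a fresh copy.  exit_child() releases the block lock.
 */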
static int init_child(struct dm_btree_info *info, struct node *parent,
		      unsigned index, struct child *result)
{
	int r, inc;
	dm_block_t root;

	result->index = index;
	root = value64(parent, index);

	r = dm_tm_shadow_block(info->tm, root, &btree_node_validator,
			       &result->block, &inc);
	if (r)
		return r;

	result->n = dm_block_data(result->block);

	if (inc)
		inc_children(info->tm, result->n, &le64_type);

	return 0;
}
static int exit_child(struct dm_btree_info *info, struct child *c)
{
	return dm_tm_unlock(info->tm, c->block);
}
static void shift(struct node *left, struct node *right, int count)
{
	if (!count)
		return;

	if (count > 0) {
		node_shift(right, count);
		node_copy(left, right, count);
	} else {
		node_copy(left, right, count);
		node_shift(right, count);
	}

	left->header.nr_entries =
		cpu_to_le32(le32_to_cpu(left->header.nr_entries) - count);

	right->header.nr_entries =
		cpu_to_le32(le32_to_cpu(right->header.nr_entries) + count);
}
static void __rebalance2(struct dm_btree_info *info, struct node *parent,
			 struct child *l, struct child *r)
{
	struct node *left = l->n;
	struct node *right = r->n;
	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
	uint32_t nr_right = le32_to_cpu(right->header.nr_entries);

	if (nr_left + nr_right <= merge_threshold(left)) {
		/*
		 * Merge
		 */
		node_copy(left, right, -nr_right);
		left->header.nr_entries = cpu_to_le32(nr_left + nr_right);

		*((__le64 *) value_ptr(parent, l->index, sizeof(__le64))) =
			cpu_to_le64(dm_block_location(l->block));
		delete_at(parent, r->index, sizeof(__le64));

		/*
		 * We need to decrement the right block, but not its
		 * children, since they're still referenced by left.
		 */
		dm_tm_dec(info->tm, dm_block_location(r->block));
	} else {
		/*
		 * Rebalance.
		 */
		unsigned target_left = (nr_left + nr_right) / 2;

		shift(left, right, nr_left - target_left);
		*((__le64 *) value_ptr(parent, l->index, sizeof(__le64))) =
			cpu_to_le64(dm_block_location(l->block));
		*((__le64 *) value_ptr(parent, r->index, sizeof(__le64))) =
			cpu_to_le64(dm_block_location(r->block));
		*key_ptr(parent, r->index) = right->keys[0];
	}
}
static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info,
		      unsigned left_index)
{
	int r;
	struct node *parent;
	struct child left, right;

	parent = dm_block_data(shadow_current(s));

	r = init_child(info, parent, left_index, &left);
	if (r)
		return r;

	r = init_child(info, parent, left_index + 1, &right);
	if (r) {
		exit_child(info, &left);
		return r;
	}

	__rebalance2(info, parent, &left, &right);

	r = exit_child(info, &left);
	if (r) {
		exit_child(info, &right);
		return r;
	}

	r = exit_child(info, &right);
	if (r)
		return r;

	return 0;
}
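/*
 * Three-way version of __rebalance2(): either deletes the center node,
 * spilling its entries into left and right, or redistributes the entries
 * evenly across all three siblings.
 */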
static void __rebalance3(struct dm_btree_info *info, struct node *parent,
			 struct child *l, struct child *c, struct child *r)
{
	struct node *left = l->n;
	struct node *center = c->n;
	struct node *right = r->n;

	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
	uint32_t nr_center = le32_to_cpu(center->header.nr_entries);
	uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
	uint32_t max_entries = le32_to_cpu(left->header.max_entries);

	unsigned target;

	if (((nr_left + nr_center + nr_right) / 2) < merge_threshold(center)) {
		/*
		 * Delete center node:
		 *
		 * We dump as many entries from center as possible into
		 * left, then the rest into right, then rebalance2.  This
		 * wastes some cpu, but I want something simple atm.
		 */
		unsigned shift = min(max_entries - nr_left, nr_center);

		node_copy(left, center, -shift);
		left->header.nr_entries = cpu_to_le32(nr_left + shift);

		if (shift != nr_center) {
			shift = nr_center - shift;
			node_shift(right, shift);
			node_copy(center, right, shift);
			right->header.nr_entries = cpu_to_le32(nr_right + shift);
		}

		*((__le64 *) value_ptr(parent, l->index, sizeof(__le64))) =
			cpu_to_le64(dm_block_location(l->block));
		*((__le64 *) value_ptr(parent, r->index, sizeof(__le64))) =
			cpu_to_le64(dm_block_location(r->block));
		*key_ptr(parent, r->index) = right->keys[0];

		delete_at(parent, c->index, sizeof(__le64));
		r->index--;

		dm_tm_dec(info->tm, dm_block_location(c->block));
		__rebalance2(info, parent, l, r);

		return;
	}

	/*
	 * Rebalance
	 */
	target = (nr_left + nr_center + nr_right) / 3;
	BUG_ON(target == nr_center);

	/*
	 * Adjust the left node
	 */
	shift(left, center, nr_left - target);

	/*
	 * Adjust the right node
	 */
	shift(center, right, target - nr_right);

	*((__le64 *) value_ptr(parent, l->index, sizeof(__le64))) =
		cpu_to_le64(dm_block_location(l->block));
	*((__le64 *) value_ptr(parent, c->index, sizeof(__le64))) =
		cpu_to_le64(dm_block_location(c->block));
	*((__le64 *) value_ptr(parent, r->index, sizeof(__le64))) =
		cpu_to_le64(dm_block_location(r->block));

	*key_ptr(parent, c->index) = center->keys[0];
	*key_ptr(parent, r->index) = right->keys[0];
}
static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info,
		      unsigned left_index)
{
	int r;
	struct node *parent = dm_block_data(shadow_current(s));
	struct child left, center, right;

	/*
	 * FIXME: fill out an array?
	 */
	r = init_child(info, parent, left_index, &left);
	if (r)
		return r;

	r = init_child(info, parent, left_index + 1, &center);
	if (r) {
		exit_child(info, &left);
		return r;
	}

	r = init_child(info, parent, left_index + 2, &right);
	if (r) {
		exit_child(info, &left);
		exit_child(info, &center);
		return r;
	}

	__rebalance3(info, parent, &left, &center, &right);

	r = exit_child(info, &left);
	if (r) {
		exit_child(info, &center);
		exit_child(info, &right);
		return r;
	}

	r = exit_child(info, &center);
	if (r) {
		exit_child(info, &right);
		return r;
	}

	r = exit_child(info, &right);
	if (r)
		return r;

	return 0;
}
static int get_nr_entries(struct dm_transaction_manager *tm,
			  dm_block_t b, uint32_t *result)
{
	int r;
	struct dm_block *block;
	struct node *n;

	r = dm_tm_read_lock(tm, b, &btree_node_validator, &block);
	if (r)
		return r;

	n = dm_block_data(block);
	*result = le32_to_cpu(n->header.nr_entries);

	return dm_tm_unlock(tm, block);
}
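/*
 * Implements cases [A] to [E] from the comment at the top of this file:
 * if the current node has a single child, the child is copied over it
 * (case [A]); otherwise the child on the path to 'key' is topped up from
 * one or both of its siblings if it has too few entries.
 */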
static int rebalance_children(struct shadow_spine *s,
			      struct dm_btree_info *info, uint64_t key)
{
	int i, r, has_left_sibling, has_right_sibling;
	uint32_t child_entries;
	struct node *n;

	n = dm_block_data(shadow_current(s));

	if (le32_to_cpu(n->header.nr_entries) == 1) {
		struct dm_block *child;
		dm_block_t b = value64(n, 0);

		r = dm_tm_read_lock(info->tm, b, &btree_node_validator, &child);
		if (r)
			return r;

		memcpy(n, dm_block_data(child),
		       dm_bm_block_size(dm_tm_get_bm(info->tm)));
		r = dm_tm_unlock(info->tm, child);
		dm_tm_dec(info->tm, dm_block_location(child));

		return r;
	}

	i = lower_bound(n, key);
	if (i < 0)
		return -ENODATA;

	r = get_nr_entries(info->tm, value64(n, i), &child_entries);
	if (r)
		return r;

	if (child_entries > del_threshold(n))
		return 0;

	has_left_sibling = i > 0 ? 1 : 0;
	has_right_sibling =
		(i >= (le32_to_cpu(n->header.nr_entries) - 1)) ? 0 : 1;

	if (!has_left_sibling)
		r = rebalance2(s, info, i);

	else if (!has_right_sibling)
		r = rebalance2(s, info, i - 1);

	else
		r = rebalance3(s, info, i - 1);

	return r;
}
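/*
 * Looks up 'key' in a leaf, returning its position through *index, or
 * -ENODATA if the key is not present.
 */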
static int do_leaf(struct node *n, uint64_t key, unsigned *index)
{
	int i = lower_bound(n, key);

	if ((i < 0) ||
	    (i >= le32_to_cpu(n->header.nr_entries)) ||
	    (le64_to_cpu(n->keys[i]) != key))
		return -ENODATA;

	*index = i;

	return 0;
}
/*
 * Prepares for removal from one level of the hierarchy.  The caller must
 * actually call delete_at() to remove the entry at index.
 */
static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
		      struct dm_btree_value_type *vt, dm_block_t root,
		      uint64_t key, unsigned *index)
{
	int i = *index, inc, r;
	struct node *n;

	for (;;) {
		r = shadow_step(s, root, vt, &inc);
		if (r < 0)
			break;

		/*
		 * We have to patch up the parent node, ugly, but I don't
		 * see a way to do this automatically as part of the spine
		 * op.
		 */
		if (shadow_has_parent(s)) {
			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
			memcpy(value_ptr(dm_block_data(shadow_parent(s)), i, sizeof(uint64_t)),
			       &location, sizeof(__le64));
		}

		n = dm_block_data(shadow_current(s));
		if (inc)
			inc_children(info->tm, n, vt);

		if (le32_to_cpu(n->header.flags) & LEAF_NODE)
			return do_leaf(n, key, index);

		r = rebalance_children(s, info, key);
		if (r)
			break;

		n = dm_block_data(shadow_current(s));
		if (le32_to_cpu(n->header.flags) & LEAF_NODE)
			return do_leaf(n, key, index);

		i = lower_bound(n, key);

		/*
		 * We know the key is present, or else
		 * rebalance_children would have returned
		 * -ENODATA.
		 */
		root = value64(n, i);
	}

	return r;
}
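/*
 * Removes one key per level from the btree rooted at 'root', returning
 * the new root through *new_root.  Inner levels of a multi-level tree
 * are walked with le64_type; only the last level uses the caller's
 * value_type, whose dec method (if any) is called on the removed value.
 */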
int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, dm_block_t *new_root)
{
	unsigned level, last_level = info->levels - 1;
	int index = 0, r = 0;
	struct shadow_spine spine;
	struct node *n;

	init_shadow_spine(&spine, info);
	for (level = 0; level < info->levels; level++) {
		r = remove_raw(&spine, info,
			       (level == last_level ?
				&info->value_type : &le64_type),
			       root, keys[level], (unsigned *)&index);
		if (r < 0)
			break;

		n = dm_block_data(shadow_current(&spine));
		if (level != last_level) {
			root = value64(n, index);
			continue;
		}

		BUG_ON(index < 0 || index >= le32_to_cpu(n->header.nr_entries));

		if (info->value_type.dec)
			info->value_type.dec(info->value_type.context,
					     value_ptr(n, index, info->value_type.size));

		delete_at(n, index, info->value_type.size);
	}

	*new_root = shadow_root(&spine);
	exit_shadow_spine(&spine);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_remove);
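/*
 * Example usage (a minimal sketch; 'info', 'root' and 'block_to_free'
 * are hypothetical caller state, not part of this file):
 *
 *	dm_block_t new_root;
 *	uint64_t key = block_to_free;	// one key per btree level
 *	int r = dm_btree_remove(&info, root, &key, &new_root);
 *	if (!r)
 *		root = new_root;	// continue with the new tree
 */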