// SPDX-License-Identifier: GPL-2.0-only
/*
 * lib/btree.c - Simple In-memory B+Tree
 *
 * Copyright (c) 2007-2008 Joern Engel <joern@purestorage.com>
 * Bits and pieces stolen from Peter Zijlstra's code, which is
 * Copyright 2007, Red Hat Inc. Peter Zijlstra
 *
 * see http://programming.kicks-ass.net/kernel-patches/vma_lookup/btree.patch
 *
 * A relatively simple B+Tree implementation.  I have written it as a learning
 * exercise to understand how B+Trees work.  Turned out to be useful as well.
 *
 * B+Trees can be used similarly to Linux radix trees (which don't have
 * anything in common with textbook radix trees, beware).  A prerequisite for
 * them working well is that access to a random tree node is much faster than
 * a large number of operations within each node.
 *
 * Disks have fulfilled that prerequisite for a long time.  More recently DRAM
 * has gained similar properties, as memory access times, when measured in cpu
 * cycles, have increased.  Cacheline sizes have increased as well, which also
 * helps B+Trees.
 *
 * Compared to radix trees, B+Trees are more efficient when dealing with a
 * sparsely populated address space.  Between 25% and 50% of the memory is
 * occupied with valid pointers.  When densely populated, radix trees contain
 * ~98% pointers - hard to beat.  Very sparse radix trees contain only ~2%
 * pointers.
 *
 * This particular implementation stores pointers identified by a long value.
 * Storing NULL pointers is illegal; lookup will return NULL when no entry
 * was found.
 *
 * A trick was used that is not commonly found in textbooks: the lowest
 * values are to the right, not to the left.  All used slots within a node
 * are on the left, all unused slots contain NUL values.  Most operations
 * simply loop once over all slots and terminate on the first NUL.
 */
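/*
 * Illustrative layout sketch (editor's addition, hypothetical geometry with
 * keylen = 1 and no_pairs = 4): a node is a single array of longs, keys
 * first, values second.  Holding the keys 30, 20 and 10 it looks like:
 *
 *	long index:  0   1   2   3 |  4   5   6   7
 *	contents:   30  20  10   0 | v30 v20 v10  0
 *	            keys, highest  | matching values
 *	            first          |
 *
 * Lookups and most other operations scan left to right and stop at the
 * first zero slot.
 */
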
#include <linux/btree.h>
#include <linux/cache.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/module.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define NODESIZE MAX(L1_CACHE_BYTES, 128)

struct btree_geo {
	int keylen;
	int no_pairs;
	int no_longs;
};

struct btree_geo btree_geo32 = {
	.keylen = 1,
	.no_pairs = NODESIZE / sizeof(long) / 2,
	.no_longs = NODESIZE / sizeof(long) / 2,
};
EXPORT_SYMBOL_GPL(btree_geo32);

#define LONG_PER_U64 (64 / BITS_PER_LONG)
struct btree_geo btree_geo64 = {
	.keylen = LONG_PER_U64,
	.no_pairs = NODESIZE / sizeof(long) / (1 + LONG_PER_U64),
	.no_longs = LONG_PER_U64 * (NODESIZE / sizeof(long) / (1 + LONG_PER_U64)),
};
EXPORT_SYMBOL_GPL(btree_geo64);

struct btree_geo btree_geo128 = {
	.keylen = 2 * LONG_PER_U64,
	.no_pairs = NODESIZE / sizeof(long) / (1 + 2 * LONG_PER_U64),
	.no_longs = 2 * LONG_PER_U64 *
		    (NODESIZE / sizeof(long) / (1 + 2 * LONG_PER_U64)),
};
EXPORT_SYMBOL_GPL(btree_geo128);
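
/*
 * Worked example (editor's addition): on a 64-bit machine with 64-byte
 * cachelines, NODESIZE = 128 bytes = 16 longs and LONG_PER_U64 = 1, so:
 *
 *	btree_geo32:  keylen = 1, no_pairs = 16 / 2 = 8,
 *	              longs 0..7 hold keys, longs 8..15 hold values;
 *	btree_geo64:  keylen = 1, no_pairs = 16 / (1 + 1) = 8;
 *	btree_geo128: keylen = 2, no_pairs = 16 / (1 + 2) = 5,
 *	              no_longs = 10, one long per node is unused.
 */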

#define MAX_KEYLEN	(2 * LONG_PER_U64)

static struct kmem_cache *btree_cachep;

void *btree_alloc(gfp_t gfp_mask, void *pool_data)
{
	return kmem_cache_alloc(btree_cachep, gfp_mask);
}
EXPORT_SYMBOL_GPL(btree_alloc);

void btree_free(void *element, void *pool_data)
{
	kmem_cache_free(btree_cachep, element);
}
EXPORT_SYMBOL_GPL(btree_free);

static unsigned long *btree_node_alloc(struct btree_head *head, gfp_t gfp)
{
	unsigned long *node;

	node = mempool_alloc(head->mempool, gfp);
	if (likely(node))
		memset(node, 0, NODESIZE);
	return node;
}

static int longcmp(const unsigned long *l1, const unsigned long *l2, size_t n)
{
	size_t i;

	for (i = 0; i < n; i++) {
		if (l1[i] < l2[i])
			return -1;
		if (l1[i] > l2[i])
			return 1;
	}
	return 0;
}

static unsigned long *longcpy(unsigned long *dest, const unsigned long *src,
		size_t n)
{
	size_t i;

	for (i = 0; i < n; i++)
		dest[i] = src[i];
	return dest;
}

static unsigned long *longset(unsigned long *s, unsigned long c, size_t n)
{
	size_t i;

	for (i = 0; i < n; i++)
		s[i] = c;
	return s;
}

/* subtract 1 from a multi-long key, borrowing between the longs */
static void dec_key(struct btree_geo *geo, unsigned long *key)
{
	unsigned long val;
	int i;

	for (i = geo->keylen - 1; i >= 0; i--) {
		val = key[i];
		key[i] = val - 1;
		if (val)
			break;
	}
}

static unsigned long *bkey(struct btree_geo *geo, unsigned long *node, int n)
{
	return &node[n * geo->keylen];
}

static void *bval(struct btree_geo *geo, unsigned long *node, int n)
{
	return (void *)node[geo->no_longs + n];
}

static void setkey(struct btree_geo *geo, unsigned long *node, int n,
		   unsigned long *key)
{
	longcpy(bkey(geo, node, n), key, geo->keylen);
}

static void setval(struct btree_geo *geo, unsigned long *node, int n,
		   void *val)
{
	node[geo->no_longs + n] = (unsigned long) val;
}

static void clearpair(struct btree_geo *geo, unsigned long *node, int n)
{
	longset(bkey(geo, node, n), 0, geo->keylen);
	node[geo->no_longs + n] = 0;
}

static inline void __btree_init(struct btree_head *head)
{
	head->node = NULL;
	head->height = 0;
}

void btree_init_mempool(struct btree_head *head, mempool_t *mempool)
{
	__btree_init(head);
	head->mempool = mempool;
}
EXPORT_SYMBOL_GPL(btree_init_mempool);

int btree_init(struct btree_head *head)
{
	__btree_init(head);
	head->mempool = mempool_create(0, btree_alloc, btree_free, NULL);
	if (!head->mempool)
		return -ENOMEM;
	return 0;
}
EXPORT_SYMBOL_GPL(btree_init);

void btree_destroy(struct btree_head *head)
{
	mempool_free(head->node, head->mempool);
	mempool_destroy(head->mempool);
	head->mempool = NULL;
}
EXPORT_SYMBOL_GPL(btree_destroy);
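
/*
 * Usage sketch (editor's addition; my_head is a hypothetical caller's tree):
 *
 *	static struct btree_head my_head;
 *
 *	int setup(void)
 *	{
 *		return btree_init(&my_head);	// creates a private mempool
 *	}
 *
 *	void teardown(void)
 *	{
 *		btree_destroy(&my_head);	// frees root node and mempool
 *	}
 *
 * btree_init_mempool() can be used instead when several trees should share
 * one caller-provided mempool.
 */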

void *btree_last(struct btree_head *head, struct btree_geo *geo,
		 unsigned long *key)
{
	int height = head->height;
	unsigned long *node = head->node;

	if (height == 0)
		return NULL;

	for ( ; height > 1; height--)
		node = bval(geo, node, 0);

	longcpy(key, bkey(geo, node, 0), geo->keylen);
	return bval(geo, node, 0);
}
EXPORT_SYMBOL_GPL(btree_last);

static int keycmp(struct btree_geo *geo, unsigned long *node, int pos,
		  unsigned long *key)
{
	return longcmp(bkey(geo, node, pos), key, geo->keylen);
}

static int keyzero(struct btree_geo *geo, unsigned long *key)
{
	int i;

	for (i = 0; i < geo->keylen; i++)
		if (key[i])
			return 0;

	return 1;
}

static void *btree_lookup_node(struct btree_head *head, struct btree_geo *geo,
		unsigned long *key)
{
	int i, height = head->height;
	unsigned long *node = head->node;

	if (height == 0)
		return NULL;

	for ( ; height > 1; height--) {
		for (i = 0; i < geo->no_pairs; i++)
			if (keycmp(geo, node, i, key) <= 0)
				break;
		if (i == geo->no_pairs)
			return NULL;
		node = bval(geo, node, i);
		if (!node)
			return NULL;
	}
	return node;
}

void *btree_lookup(struct btree_head *head, struct btree_geo *geo,
		unsigned long *key)
{
	int i;
	unsigned long *node;

	node = btree_lookup_node(head, geo, key);
	if (!node)
		return NULL;

	for (i = 0; i < geo->no_pairs; i++)
		if (keycmp(geo, node, i, key) == 0)
			return bval(geo, node, i);
	return NULL;
}
EXPORT_SYMBOL_GPL(btree_lookup);
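
/*
 * Example (editor's addition, hypothetical caller): keys are passed as
 * pointers to geo->keylen longs.  With btree_geo32 (keylen == 1):
 *
 *	unsigned long key = 42;
 *	struct my_item *item;
 *
 *	item = btree_lookup(&my_head, &btree_geo32, &key);
 *	if (!item)
 *		return -ENOENT;	// nothing stored under key 42
 */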

int btree_update(struct btree_head *head, struct btree_geo *geo,
		 unsigned long *key, void *val)
{
	int i;
	unsigned long *node;

	node = btree_lookup_node(head, geo, key);
	if (!node)
		return -ENOENT;

	for (i = 0; i < geo->no_pairs; i++)
		if (keycmp(geo, node, i, key) == 0) {
			setval(geo, node, i, val);
			return 0;
		}
	return -ENOENT;
}
EXPORT_SYMBOL_GPL(btree_update);

/*
 * Usually this function is quite similar to normal lookup.  But the key of
 * a parent node may be smaller than the smallest key of all its siblings.
 * In such a case we cannot just return NULL, as we have only proven that no
 * key smaller than __key, but larger than this parent key, exists.
 * So we set __key to the parent key and retry.  We have to use the smallest
 * such parent key, which is the last parent key we encountered.
 */
void *btree_get_prev(struct btree_head *head, struct btree_geo *geo,
		     unsigned long *__key)
{
	int i, height;
	unsigned long *node, *oldnode;
	unsigned long *retry_key = NULL, key[MAX_KEYLEN];

	if (keyzero(geo, __key))
		return NULL;

	if (head->height == 0)
		return NULL;
	longcpy(key, __key, geo->keylen);
retry:
	dec_key(geo, key);

	node = head->node;
	for (height = head->height; height > 1; height--) {
		for (i = 0; i < geo->no_pairs; i++)
			if (keycmp(geo, node, i, key) <= 0)
				break;
		if (i == geo->no_pairs)
			goto miss;
		oldnode = node;
		node = bval(geo, node, i);
		if (!node)
			goto miss;
		retry_key = bkey(geo, oldnode, i);
	}

	if (!node)
		goto miss;

	for (i = 0; i < geo->no_pairs; i++) {
		if (keycmp(geo, node, i, key) <= 0) {
			if (bval(geo, node, i)) {
				longcpy(__key, bkey(geo, node, i), geo->keylen);
				return bval(geo, node, i);
			} else
				goto miss;
		}
	}
miss:
	if (retry_key) {
		longcpy(key, retry_key, geo->keylen);
		retry_key = NULL;
		goto retry;
	}
	return NULL;
}
EXPORT_SYMBOL_GPL(btree_get_prev);
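
/*
 * Iteration sketch (editor's addition): because *__key is rewritten to the
 * key actually found, btree_get_prev() can walk a tree in descending key
 * order when seeded with btree_last():
 *
 *	unsigned long key[MAX_KEYLEN];
 *	void *val;
 *
 *	for (val = btree_last(&my_head, &btree_geo32, key);
 *	     val;
 *	     val = btree_get_prev(&my_head, &btree_geo32, key))
 *		process(val);	// process() is a made-up consumer
 */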

static int getpos(struct btree_geo *geo, unsigned long *node,
		unsigned long *key)
{
	int i;

	for (i = 0; i < geo->no_pairs; i++) {
		if (keycmp(geo, node, i, key) <= 0)
			break;
	}
	return i;
}

static int getfill(struct btree_geo *geo, unsigned long *node, int start)
{
	int i;

	for (i = start; i < geo->no_pairs; i++)
		if (!bval(geo, node, i))
			break;
	return i;
}

/*
 * locate the correct leaf node in the btree
 */
static unsigned long *find_level(struct btree_head *head, struct btree_geo *geo,
		unsigned long *key, int level)
{
	unsigned long *node = head->node;
	int i, height;

	for (height = head->height; height > level; height--) {
		for (i = 0; i < geo->no_pairs; i++)
			if (keycmp(geo, node, i, key) <= 0)
				break;

		if ((i == geo->no_pairs) || !bval(geo, node, i)) {
			/* right-most key is too large, update it */
			/* FIXME: If the right-most key on higher levels is
			 * always zero, this wouldn't be necessary. */
			i--;
			setkey(geo, node, i, key);
		}
		BUG_ON(i < 0);
		node = bval(geo, node, i);
	}
	BUG_ON(!node);
	return node;
}

static int btree_grow(struct btree_head *head, struct btree_geo *geo,
		      gfp_t gfp)
{
	unsigned long *node;
	int fill;

	node = btree_node_alloc(head, gfp);
	if (!node)
		return -ENOMEM;
	if (head->node) {
		fill = getfill(geo, head->node, 0);
		setkey(geo, node, 0, bkey(geo, head->node, fill - 1));
		setval(geo, node, 0, head->node);
	}
	head->node = node;
	head->height++;
	return 0;
}

static void btree_shrink(struct btree_head *head, struct btree_geo *geo)
{
	unsigned long *node;
	int fill;

	if (head->height <= 1)
		return;

	node = head->node;
	fill = getfill(geo, node, 0);
	BUG_ON(fill > 1);
	head->node = bval(geo, node, 0);
	head->height--;
	mempool_free(node, head->mempool);
}

static int btree_insert_level(struct btree_head *head, struct btree_geo *geo,
			      unsigned long *key, void *val, int level,
			      gfp_t gfp)
{
	unsigned long *node;
	int i, pos, fill, err;

	BUG_ON(!val);
	if (head->height < level) {
		err = btree_grow(head, geo, gfp);
		if (err)
			return err;
	}

retry:
	node = find_level(head, geo, key, level);
	pos = getpos(geo, node, key);
	fill = getfill(geo, node, pos);
	/* two identical keys are not allowed */
	BUG_ON(pos < fill && keycmp(geo, node, pos, key) == 0);

	if (fill == geo->no_pairs) {
		/* need to split node */
		unsigned long *new;

		new = btree_node_alloc(head, gfp);
		if (!new)
			return -ENOMEM;
		err = btree_insert_level(head, geo,
				bkey(geo, node, fill / 2 - 1),
				new, level + 1, gfp);
		if (err) {
			mempool_free(new, head->mempool);
			return err;
		}
		for (i = 0; i < fill / 2; i++) {
			setkey(geo, new, i, bkey(geo, node, i));
			setval(geo, new, i, bval(geo, node, i));
			setkey(geo, node, i, bkey(geo, node, i + fill / 2));
			setval(geo, node, i, bval(geo, node, i + fill / 2));
			clearpair(geo, node, i + fill / 2);
		}
		if (fill & 1) {
			setkey(geo, node, i, bkey(geo, node, fill - 1));
			setval(geo, node, i, bval(geo, node, fill - 1));
			clearpair(geo, node, fill - 1);
		}
		goto retry;
	}
	BUG_ON(fill >= geo->no_pairs);

	/* shift and insert */
	for (i = fill; i > pos; i--) {
		setkey(geo, node, i, bkey(geo, node, i - 1));
		setval(geo, node, i, bval(geo, node, i - 1));
	}
	setkey(geo, node, pos, key);
	setval(geo, node, pos, val);

	return 0;
}

int btree_insert(struct btree_head *head, struct btree_geo *geo,
		unsigned long *key, void *val, gfp_t gfp)
{
	BUG_ON(!val);
	return btree_insert_level(head, geo, key, val, 1, gfp);
}
EXPORT_SYMBOL_GPL(btree_insert);
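
/*
 * Example (editor's addition, hypothetical caller): duplicate keys trigger
 * the BUG_ON() in btree_insert_level() and NULL values are rejected, so
 * callers insert only new, non-NULL entries and use btree_update() to
 * replace existing ones:
 *
 *	unsigned long key = 42;
 *	int err;
 *
 *	err = btree_insert(&my_head, &btree_geo32, &key, item, GFP_KERNEL);
 *	if (err)	// -ENOMEM if a node allocation failed
 *		return err;
 */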

static void *btree_remove_level(struct btree_head *head, struct btree_geo *geo,
		unsigned long *key, int level);
static void merge(struct btree_head *head, struct btree_geo *geo, int level,
		unsigned long *left, int lfill,
		unsigned long *right, int rfill,
		unsigned long *parent, int lpos)
{
	int i;

	for (i = 0; i < rfill; i++) {
		/* Move all keys to the left */
		setkey(geo, left, lfill + i, bkey(geo, right, i));
		setval(geo, left, lfill + i, bval(geo, right, i));
	}
	/* Exchange left and right child in parent */
	setval(geo, parent, lpos, right);
	setval(geo, parent, lpos + 1, left);
	/* Remove left (formerly right) child from parent */
	btree_remove_level(head, geo, bkey(geo, parent, lpos), level + 1);
	mempool_free(right, head->mempool);
}

static void rebalance(struct btree_head *head, struct btree_geo *geo,
		unsigned long *key, int level, unsigned long *child, int fill)
{
	unsigned long *parent, *left = NULL, *right = NULL;
	int i, no_left, no_right;

	if (fill == 0) {
		/* Because we don't steal entries from a neighbour, this case
		 * can happen.  Parent node contains a single child, this
		 * node, so merging with a sibling never happens.
		 */
		btree_remove_level(head, geo, key, level + 1);
		mempool_free(child, head->mempool);
		return;
	}

	parent = find_level(head, geo, key, level + 1);
	i = getpos(geo, parent, key);
	BUG_ON(bval(geo, parent, i) != child);

	if (i > 0) {
		left = bval(geo, parent, i - 1);
		no_left = getfill(geo, left, 0);
		if (fill + no_left <= geo->no_pairs) {
			merge(head, geo, level,
					left, no_left,
					child, fill,
					parent, i - 1);
			return;
		}
	}
	if (i + 1 < getfill(geo, parent, i)) {
		right = bval(geo, parent, i + 1);
		no_right = getfill(geo, right, 0);
		if (fill + no_right <= geo->no_pairs) {
			merge(head, geo, level,
					child, fill,
					right, no_right,
					parent, i);
			return;
		}
	}
	/*
	 * We could also try to steal one entry from the left or right
	 * neighbor.  By not doing so we changed the invariant from
	 * "all nodes are at least half full" to "no two neighboring
	 * nodes can be merged".  Which means that the average fill of
	 * all nodes is still half or better.
	 */
}
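
/*
 * Worked note (editor's addition): with no_pairs == 8, two neighbouring
 * nodes both survive rebalancing only if their combined fill exceeds 8,
 * i.e. they average more than 4 entries each, hence "half or better".
 */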

static void *btree_remove_level(struct btree_head *head, struct btree_geo *geo,
		unsigned long *key, int level)
{
	unsigned long *node;
	int i, pos, fill;
	void *ret;

	if (level > head->height) {
		/* we recursed all the way up */
		head->height = 0;
		head->node = NULL;
		return NULL;
	}

	node = find_level(head, geo, key, level);
	pos = getpos(geo, node, key);
	fill = getfill(geo, node, pos);
	if ((level == 1) && (keycmp(geo, node, pos, key) != 0))
		return NULL;
	ret = bval(geo, node, pos);

	/* remove and shift */
	for (i = pos; i < fill - 1; i++) {
		setkey(geo, node, i, bkey(geo, node, i + 1));
		setval(geo, node, i, bval(geo, node, i + 1));
	}
	clearpair(geo, node, fill - 1);

	if (fill - 1 < geo->no_pairs / 2) {
		if (level < head->height)
			rebalance(head, geo, key, level, node, fill - 1);
		else if (fill - 1 == 1)
			btree_shrink(head, geo);
	}

	return ret;
}

void *btree_remove(struct btree_head *head, struct btree_geo *geo,
		unsigned long *key)
{
	if (head->height == 0)
		return NULL;

	return btree_remove_level(head, geo, key, 1);
}
EXPORT_SYMBOL_GPL(btree_remove);

int btree_merge(struct btree_head *target, struct btree_head *victim,
		struct btree_geo *geo, gfp_t gfp)
{
	unsigned long key[MAX_KEYLEN];
	unsigned long dup[MAX_KEYLEN];
	void *val;
	int err;

	BUG_ON(target == victim);

	if (!(target->node)) {
		/* target is empty, just copy fields over */
		target->node = victim->node;
		target->height = victim->height;
		__btree_init(victim);
		return 0;
	}

	/* TODO: This needs some optimizations.  Currently we do three tree
	 * walks to remove a single object from the victim.
	 */
	for (;;) {
		if (!btree_last(victim, geo, key))
			break;
		val = btree_lookup(victim, geo, key);
		err = btree_insert(target, geo, key, val, gfp);
		if (err)
			return err;
		/* We must make a copy of the key, as the original will get
		 * mangled inside btree_remove. */
		longcpy(dup, key, geo->keylen);
		btree_remove(victim, geo, dup);
	}
	return 0;
}
EXPORT_SYMBOL_GPL(btree_merge);
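
/*
 * Example (editor's addition): drain every entry of a victim tree into a
 * target sharing the same geometry; on success the victim is left empty
 * but still initialized:
 *
 *	err = btree_merge(&target_head, &victim_head, &btree_geo32, GFP_KERNEL);
 */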

static size_t __btree_for_each(struct btree_head *head, struct btree_geo *geo,
			       unsigned long *node, unsigned long opaque,
			       void (*func)(void *elem, unsigned long opaque,
					    unsigned long *key, size_t index,
					    void *func2),
			       void *func2, int reap, int height, size_t count)
{
	int i;
	unsigned long *child;

	for (i = 0; i < geo->no_pairs; i++) {
		child = bval(geo, node, i);
		if (!child)
			break;
		if (height > 1)
			count = __btree_for_each(head, geo, child, opaque,
					func, func2, reap, height - 1, count);
		else
			func(child, opaque, bkey(geo, node, i), count++,
					func2);
	}
	if (reap)
		mempool_free(node, head->mempool);
	return count;
}

static void empty(void *elem, unsigned long opaque, unsigned long *key,
		  size_t index, void *func2)
{
}

void visitorl(void *elem, unsigned long opaque, unsigned long *key,
	      size_t index, void *__func)
{
	visitorl_t func = __func;

	func(elem, opaque, *key, index);
}
EXPORT_SYMBOL_GPL(visitorl);

void visitor32(void *elem, unsigned long opaque, unsigned long *__key,
	       size_t index, void *__func)
{
	visitor32_t func = __func;
	u32 *key = (void *)__key;

	func(elem, opaque, *key, index);
}
EXPORT_SYMBOL_GPL(visitor32);

void visitor64(void *elem, unsigned long opaque, unsigned long *__key,
	       size_t index, void *__func)
{
	visitor64_t func = __func;
	u64 *key = (void *)__key;

	func(elem, opaque, *key, index);
}
EXPORT_SYMBOL_GPL(visitor64);

void visitor128(void *elem, unsigned long opaque, unsigned long *__key,
		size_t index, void *__func)
{
	visitor128_t func = __func;
	u64 *key = (void *)__key;

	func(elem, opaque, key[0], key[1], index);
}
EXPORT_SYMBOL_GPL(visitor128);

size_t btree_visitor(struct btree_head *head, struct btree_geo *geo,
		     unsigned long opaque,
		     void (*func)(void *elem, unsigned long opaque,
				  unsigned long *key,
				  size_t index, void *func2),
		     void *func2)
{
	size_t count = 0;

	if (!func2)
		func = empty;
	if (head->node)
		count = __btree_for_each(head, geo, head->node, opaque, func,
				func2, 0, head->height, 0);
	return count;
}
EXPORT_SYMBOL_GPL(btree_visitor);
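
/*
 * Example (editor's addition; print_entry is hypothetical): the visitor*
 * adapters above bridge to typed callbacks, so a caller passes e.g.
 * visitor64 as @func and its own visitor64_t function as @func2:
 *
 *	static void print_entry(void *elem, unsigned long opaque,
 *				u64 key, size_t index)
 *	{
 *		pr_info("entry %zu: key %llu\n", index, key);
 *	}
 *
 *	count = btree_visitor(&my_head, &btree_geo64, 0, visitor64, print_entry);
 */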

size_t btree_grim_visitor(struct btree_head *head, struct btree_geo *geo,
			  unsigned long opaque,
			  void (*func)(void *elem, unsigned long opaque,
				       unsigned long *key,
				       size_t index, void *func2),
			  void *func2)
{
	size_t count = 0;

	if (!func2)
		func = empty;
	if (head->node)
		count = __btree_for_each(head, geo, head->node, opaque, func,
				func2, 1, head->height, 0);
	__btree_init(head);
	return count;
}
EXPORT_SYMBOL_GPL(btree_grim_visitor);
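
/*
 * Example (editor's addition; free_entry is hypothetical): empty a tree and
 * release every element in a single pass; afterwards the head is
 * reinitialized and all nodes are back in the mempool:
 *
 *	static void free_entry(void *elem, unsigned long opaque,
 *			       u64 key, size_t index)
 *	{
 *		kfree(elem);
 *	}
 *
 *	btree_grim_visitor(&my_head, &btree_geo64, 0, visitor64, free_entry);
 */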

static int __init btree_module_init(void)
{
	btree_cachep = kmem_cache_create("btree_node", NODESIZE, 0,
			SLAB_HWCACHE_ALIGN, NULL);
	return 0;
}

static void __exit btree_module_exit(void)
{
	kmem_cache_destroy(btree_cachep);
}

/* If core code starts using btree, initialization should happen even earlier */
module_init(btree_module_init);
module_exit(btree_module_exit);

MODULE_AUTHOR("Joern Engel <joern@logfs.org>");
MODULE_AUTHOR("Johannes Berg <johannes@sipsolutions.net>");
MODULE_LICENSE("GPL");