// SPDX-License-Identifier: GPL-2.0-only
/*
 * Longest prefix match list implementation
 *
 * Copyright (c) 2016,2017 Daniel Mack
 * Copyright (c) 2016 David Herrmann
 */

#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <net/ipv6.h>
#include <uapi/linux/btf.h>
#include <linux/btf_ids.h>
#include <linux/bpf_mem_alloc.h>
/* Intermediate node */
#define LPM_TREE_NODE_FLAG_IM BIT(0)

struct lpm_trie_node {
	struct lpm_trie_node __rcu	*child[2];
	u32				prefixlen;
	u32				flags;
	u8				data[];
};

struct lpm_trie {
	struct bpf_map			map;
	struct lpm_trie_node __rcu	*root;
	struct bpf_mem_alloc		ma;
	size_t				n_entries;
	size_t				max_prefixlen;
	size_t				data_size;
	raw_spinlock_t			lock;
};
/* This trie implements a longest prefix match algorithm that can be used to
 * match IP addresses to a stored set of ranges.
 *
 * Data stored in @data of struct bpf_lpm_key and struct lpm_trie_node is
 * interpreted as big endian, so data[0] stores the most significant byte.
 *
 * Match ranges are internally stored in instances of struct lpm_trie_node
 * which each contain their prefix length as well as two pointers that may
 * lead to more nodes containing more specific matches. Each node also stores
 * a value that is defined by and returned to userspace via the update_elem
 * and lookup functions.
 *
 * For instance, let's start with a trie that was created with a prefix length
 * of 32, so it can be used for IPv4 addresses, and one single element that
 * matches 192.168.0.0/16. The data array would hence contain
 * [0xc0, 0xa8, 0x00, 0x00] in big-endian notation. This documentation will
 * stick to IP-address notation for readability though.
 *
 * As the trie is empty initially, the new node (1) will be placed as root
 * node, denoted as (R) in the example below. As there are no other nodes,
 * both child pointers are %NULL.
 *
 *              +----------------+
 *              |       (1)  (R) |
 *              | 192.168.0.0/16 |
 *              |    value: 1    |
 *              |   [0]    [1]   |
 *              +----------------+
 *
 * Next, let's add a new node (2) matching 192.168.0.0/24. As there is already
 * a node with the same data and a smaller prefix (ie, a less specific one),
 * node (2) will become a child of (1). The child index depends on the next bit
 * that is outside of what (1) matches, and that bit is 0, so (2) will be
 * attached to (1) as child[0]:
 *
 *              +----------------+
 *              |       (1)  (R) |
 *              | 192.168.0.0/16 |
 *              |    value: 1    |
 *              |   [0]    [1]   |
 *              +----------------+
 *                   |
 *    +----------------+
 *    |       (2)      |
 *    | 192.168.0.0/24 |
 *    |    value: 2    |
 *    |   [0]    [1]   |
 *    +----------------+
 *
 * The child[1] slot of (1) could be filled with another node which has bit #17
 * (the next bit after the ones that (1) matches on) set to 1. For instance,
 * 192.168.128.0/24:
 *
 *              +----------------+
 *              |       (1)  (R) |
 *              | 192.168.0.0/16 |
 *              |    value: 1    |
 *              |   [0]    [1]   |
 *              +----------------+
 *                   |      |
 *    +----------------+  +------------------+
 *    |       (2)      |  |        (3)       |
 *    | 192.168.0.0/24 |  | 192.168.128.0/24 |
 *    |    value: 2    |  |     value: 3     |
 *    |   [0]    [1]   |  |   [0]    [1]     |
 *    +----------------+  +------------------+
 *
 * Let's add another node (4) to the game for 192.168.1.0/24. In order to place
 * it, node (1) is looked at first, and because of the semantics laid out
 * above (bit #17 of (4) is 0), it would normally be attached to (1) as
 * child[0]. However, that slot is already allocated, so a new node is needed
 * in between. That node does not have a value attached to it and it will
 * never be returned to users as result of a lookup. It is only there to
 * differentiate the traversal further. It will get a prefix as wide as
 * necessary to distinguish its two children:
 *
 *              +----------------+
 *              |       (1)  (R) |
 *              | 192.168.0.0/16 |
 *              |    value: 1    |
 *              |   [0]    [1]   |
 *              +----------------+
 *                   |      |
 *    +----------------+  +------------------+
 *    |   (4)  (I)     |  |        (3)       |
 *    | 192.168.0.0/23 |  | 192.168.128.0/24 |
 *    |   value: ---   |  |     value: 3     |
 *    |   [0]    [1]   |  |   [0]    [1]     |
 *    +----------------+  +------------------+
 *         |      |
 *  +----------------+  +----------------+
 *  |       (2)      |  |       (5)      |
 *  | 192.168.0.0/24 |  | 192.168.1.0/24 |
 *  |    value: 2    |  |    value: 5    |
 *  |   [0]    [1]   |  |   [0]    [1]   |
 *  +----------------+  +----------------+
 *
 * 192.168.1.1/32 would be a child of (5) etc.
 *
 * An intermediate node will be turned into a 'real' node on demand. In the
 * example above, (4) would be re-used if 192.168.0.0/23 is added to the trie.
 *
 * A fully populated trie would have a height of 32 nodes, as the trie was
 * created with a prefix length of 32.
 *
 * The lookup starts at the root node. If the current node matches and if there
 * is a child that can be used to become more specific, the trie is traversed
 * downwards. The last node in the traversal that is a non-intermediate one is
 * returned to the caller.
 */
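/* Illustrative userspace usage (a minimal sketch, assuming libbpf and an IPv4
 * trie created with key_size == 8 and value_size == 8; map_fd is a
 * hypothetical file descriptor for such a map):
 *
 *	struct {
 *		__u32 prefixlen;
 *		__u8  data[4];
 *	} key = { .prefixlen = 16, .data = { 192, 168, 0, 0 } };
 *	__u64 value = 1;
 *
 *	bpf_map_update_elem(map_fd, &key, &value, BPF_ANY);
 *
 *	key.prefixlen = 32;
 *	memcpy(key.data, (__u8[]){ 192, 168, 1, 5 }, 4);
 *	if (!bpf_map_lookup_elem(map_fd, &key, &value))
 *		;	// value now holds the longest-prefix match
 */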
static inline int extract_bit(const u8 *data, size_t index)
{
	return !!(data[index / 8] & (1 << (7 - (index % 8))));
}
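/* For example, with data = { 0xc0, 0xa8, 0x00, 0x00 } (192.168.0.0),
 * extract_bit(data, 0) is 1 (0xc0 = 0b11000000) and extract_bit(data, 9)
 * is 0 (bit #9 is the second-most-significant bit of data[1] = 0xa8 =
 * 0b10101000).
 */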
/**
 * __longest_prefix_match() - determine the longest prefix
 * @trie: The trie to get internal sizes from
 * @node: The node to operate on
 * @key: The key to compare to @node
 *
 * Determine the longest prefix of @node that matches the bits in @key.
 */
static __always_inline
size_t __longest_prefix_match(const struct lpm_trie *trie,
			      const struct lpm_trie_node *node,
			      const struct bpf_lpm_trie_key_u8 *key)
{
	u32 limit = min(node->prefixlen, key->prefixlen);
	u32 prefixlen = 0, i = 0;

	BUILD_BUG_ON(offsetof(struct lpm_trie_node, data) % sizeof(u32));
	BUILD_BUG_ON(offsetof(struct bpf_lpm_trie_key_u8, data) % sizeof(u32));
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && defined(CONFIG_64BIT)

	/* data_size >= 16 has very small probability.
	 * We do not use a loop for optimal code generation.
	 */
	if (trie->data_size >= 8) {
		u64 diff = be64_to_cpu(*(__be64 *)node->data ^
				       *(__be64 *)key->data);

		prefixlen = 64 - fls64(diff);
		if (prefixlen >= limit)
			return limit;
		if (diff)
			return prefixlen;

		i = 8;
	}
#endif
	while (trie->data_size >= i + 4) {
		u32 diff = be32_to_cpu(*(__be32 *)&node->data[i] ^
				       *(__be32 *)&key->data[i]);

		prefixlen += 32 - fls(diff);
		if (prefixlen >= limit)
			return limit;
		if (diff)
			return prefixlen;

		i += 4;
	}
	if (trie->data_size >= i + 2) {
		u16 diff = be16_to_cpu(*(__be16 *)&node->data[i] ^
				       *(__be16 *)&key->data[i]);

		prefixlen += 16 - fls(diff);
		if (prefixlen >= limit)
			return limit;
		if (diff)
			return prefixlen;

		i += 2;
	}
	if (trie->data_size >= i + 1) {
		prefixlen += 8 - fls(node->data[i] ^ key->data[i]);

		if (prefixlen >= limit)
			return limit;
	}

	return prefixlen;
}
static size_t longest_prefix_match(const struct lpm_trie *trie,
				   const struct lpm_trie_node *node,
				   const struct bpf_lpm_trie_key_u8 *key)
{
	return __longest_prefix_match(trie, node, key);
}
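/* For example, for an IPv4 trie (data_size == 4), comparing a node holding
 * 192.168.0.0/24 against a key of 192.168.1.5/32 takes the 32-bit path once:
 * the big-endian XOR is 0x00000105, fls() of which is 9, so the common prefix
 * is 32 - 9 = 23 bits; 23 < min(24, 32), hence 23 is returned.
 */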
/* Called from syscall or from eBPF program */
static void *trie_lookup_elem(struct bpf_map *map, void *_key)
{
	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
	struct lpm_trie_node *node, *found = NULL;
	struct bpf_lpm_trie_key_u8 *key = _key;

	if (key->prefixlen > trie->max_prefixlen)
		return NULL;
	/* Start walking the trie from the root node ... */

	for (node = rcu_dereference_check(trie->root, rcu_read_lock_bh_held());
	     node;) {
		unsigned int next_bit;
		size_t matchlen;
		/* Determine the longest prefix of @node that matches @key.
		 * If it's the maximum possible prefix for this trie, we have
		 * an exact match and can return it directly.
		 */
		matchlen = __longest_prefix_match(trie, node, key);
		if (matchlen == trie->max_prefixlen) {
			found = node;
			break;
		}
		/* If the number of bits that match is smaller than the prefix
		 * length of @node, bail out and return the node we have seen
		 * last in the traversal (ie, the parent).
		 */
		if (matchlen < node->prefixlen)
			break;
		/* Consider this node as return candidate unless it is an
		 * artificially added intermediate one.
		 */
		if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
			found = node;
		/* If the node match is fully satisfied, let's see if we can
		 * become more specific. Determine the next bit in the key and
		 * traverse down.
		 */
		next_bit = extract_bit(key->data, node->prefixlen);
		node = rcu_dereference_check(node->child[next_bit],
					     rcu_read_lock_bh_held());
	}

	if (!found)
		return NULL;

	return found->data + trie->data_size;
}
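/* For the documentation example at the top of this file, a lookup of
 * 192.168.1.5/32 walks (1) -> (4) -> (5): (1) matches and is recorded as the
 * candidate, the intermediate node (4) matches but is skipped as a candidate,
 * and (5) matches with its full prefix, so the value 5 is returned.
 */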
static struct lpm_trie_node *lpm_trie_node_alloc(struct lpm_trie *trie,
						 const void *value,
						 bool disable_migration)
{
	struct lpm_trie_node *node;

	if (disable_migration)
		migrate_disable();
	node = bpf_mem_cache_alloc(&trie->ma);
	if (disable_migration)
		migrate_enable();

	if (!node)
		return NULL;

	node->flags = 0;
	if (value)
		memcpy(node->data + trie->data_size, value,
		       trie->map.value_size);

	return node;
}
static int trie_check_add_elem(struct lpm_trie *trie, u64 flags)
{
	if (flags == BPF_EXIST)
		return -ENOENT;
	if (trie->n_entries == trie->map.max_entries)
		return -ENOSPC;
	trie->n_entries++;
	return 0;
}
/* Called from syscall or from eBPF program */
static long trie_update_elem(struct bpf_map *map,
			     void *_key, void *value, u64 flags)
{
	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
	struct lpm_trie_node *node, *im_node, *new_node;
	struct lpm_trie_node *free_node = NULL;
	struct lpm_trie_node __rcu **slot;
	struct bpf_lpm_trie_key_u8 *key = _key;
	unsigned long irq_flags;
	unsigned int next_bit;
	size_t matchlen = 0;
	int ret = 0;
	if (unlikely(flags > BPF_EXIST))
		return -EINVAL;

	if (key->prefixlen > trie->max_prefixlen)
		return -EINVAL;
	/* Allocate and fill a new node. Need to disable migration before
	 * invoking bpf_mem_cache_alloc().
	 */
	new_node = lpm_trie_node_alloc(trie, value, true);
	if (!new_node)
		return -ENOMEM;

	raw_spin_lock_irqsave(&trie->lock, irq_flags);
	new_node->prefixlen = key->prefixlen;
	RCU_INIT_POINTER(new_node->child[0], NULL);
	RCU_INIT_POINTER(new_node->child[1], NULL);
	memcpy(new_node->data, key->data, trie->data_size);
	/* Now find a slot to attach the new node. To do that, walk the tree
	 * from the root and match as many bits as possible for each node until
	 * we either find an empty slot or a slot that needs to be replaced by
	 * an intermediate node.
	 */
	slot = &trie->root;

	while ((node = rcu_dereference_protected(*slot,
					lockdep_is_held(&trie->lock)))) {
		matchlen = longest_prefix_match(trie, node, key);

		if (node->prefixlen != matchlen ||
		    node->prefixlen == key->prefixlen)
			break;

		next_bit = extract_bit(key->data, node->prefixlen);
		slot = &node->child[next_bit];
	}
	/* If the slot is empty (a free child pointer or an empty root),
	 * simply assign the @new_node to that slot and be done.
	 */
	if (!node) {
		ret = trie_check_add_elem(trie, flags);
		if (ret)
			goto out;

		rcu_assign_pointer(*slot, new_node);
		goto out;
	}
	/* If the slot we picked already exists, replace it with @new_node
	 * which already has the correct data array set.
	 */
	if (node->prefixlen == matchlen) {
		if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) {
			if (flags == BPF_NOEXIST) {
				ret = -EEXIST;
				goto out;
			}
		} else {
			ret = trie_check_add_elem(trie, flags);
			if (ret)
				goto out;
		}

		new_node->child[0] = node->child[0];
		new_node->child[1] = node->child[1];

		rcu_assign_pointer(*slot, new_node);
		free_node = node;

		goto out;
	}
	ret = trie_check_add_elem(trie, flags);
	if (ret)
		goto out;
	/* If the new node matches the prefix completely, it must be inserted
	 * as an ancestor. Simply insert it between @node and *@slot.
	 */
	if (matchlen == key->prefixlen) {
		next_bit = extract_bit(node->data, matchlen);
		rcu_assign_pointer(new_node->child[next_bit], node);
		rcu_assign_pointer(*slot, new_node);
		goto out;
	}
	/* migration is disabled within the locked scope */
	im_node = lpm_trie_node_alloc(trie, NULL, false);
	if (!im_node) {
		ret = -ENOMEM;
		goto out;
	}

	im_node->prefixlen = matchlen;
	im_node->flags |= LPM_TREE_NODE_FLAG_IM;
	memcpy(im_node->data, node->data, trie->data_size);
	/* Now determine which child to install in which slot */
	if (extract_bit(key->data, matchlen)) {
		rcu_assign_pointer(im_node->child[0], node);
		rcu_assign_pointer(im_node->child[1], new_node);
	} else {
		rcu_assign_pointer(im_node->child[0], new_node);
		rcu_assign_pointer(im_node->child[1], node);
	}
	/* Finally, assign the intermediate node to the determined slot */
	rcu_assign_pointer(*slot, im_node);

out:
	raw_spin_unlock_irqrestore(&trie->lock, irq_flags);

	if (ret)
		bpf_mem_cache_free(&trie->ma, new_node);
	bpf_mem_cache_free_rcu(&trie->ma, free_node);

	return ret;
}
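/* Note on update flags, as handled above and in trie_check_add_elem():
 * BPF_ANY creates or replaces, BPF_NOEXIST fails with -EEXIST if a
 * non-intermediate node with the same prefix already exists, and BPF_EXIST
 * fails with -ENOENT whenever the update would have to create a new entry.
 */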
/* Called from syscall or from eBPF program */
static long trie_delete_elem(struct bpf_map *map, void *_key)
{
	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
	struct lpm_trie_node *free_node = NULL, *free_parent = NULL;
	struct bpf_lpm_trie_key_u8 *key = _key;
	struct lpm_trie_node __rcu **trim, **trim2;
	struct lpm_trie_node *node, *parent;
	unsigned long irq_flags;
	unsigned int next_bit;
	size_t matchlen = 0;
	int ret = 0;
	if (key->prefixlen > trie->max_prefixlen)
		return -EINVAL;

	raw_spin_lock_irqsave(&trie->lock, irq_flags);
	/* Walk the tree looking for an exact key/length match and keeping
	 * track of the path we traverse. We will need to know the node
	 * we wish to delete, and the slot that points to the node we want
	 * to delete. We may also need to know the node's parent and the
	 * slot that contains it.
	 */
	trim = &trie->root;
	trim2 = trim;
	parent = NULL;
	while ((node = rcu_dereference_protected(
		       *trim, lockdep_is_held(&trie->lock)))) {
		matchlen = longest_prefix_match(trie, node, key);

		if (node->prefixlen != matchlen ||
		    node->prefixlen == key->prefixlen)
			break;

		parent = node;
		trim2 = trim;
		next_bit = extract_bit(key->data, node->prefixlen);
		trim = &node->child[next_bit];
	}
	if (!node || node->prefixlen != key->prefixlen ||
	    node->prefixlen != matchlen ||
	    (node->flags & LPM_TREE_NODE_FLAG_IM)) {
		ret = -ENOENT;
		goto out;
	}

	trie->n_entries--;
	/* If the node we are removing has two children, simply mark it
	 * as intermediate and we are done.
	 */
	if (rcu_access_pointer(node->child[0]) &&
	    rcu_access_pointer(node->child[1])) {
		node->flags |= LPM_TREE_NODE_FLAG_IM;
		goto out;
	}
	/* If the parent of the node we are about to delete is an intermediate
	 * node, and the deleted node doesn't have any children, we can delete
	 * the intermediate parent as well and promote its other child
	 * up the tree. Doing this maintains the invariant that all
	 * intermediate nodes have exactly 2 children and that there are no
	 * unnecessary intermediate nodes in the tree.
	 */
	if (parent && (parent->flags & LPM_TREE_NODE_FLAG_IM) &&
	    !node->child[0] && !node->child[1]) {
		if (node == rcu_access_pointer(parent->child[0]))
			rcu_assign_pointer(
				*trim2, rcu_access_pointer(parent->child[1]));
		else
			rcu_assign_pointer(
				*trim2, rcu_access_pointer(parent->child[0]));
		free_parent = parent;
		free_node = node;
		goto out;
	}
	/* The node we are removing has either zero or one child. If there
	 * is a child, move it into the removed node's slot then delete
	 * the node. Otherwise just clear the slot and delete the node.
	 */
	if (node->child[0])
		rcu_assign_pointer(*trim, rcu_access_pointer(node->child[0]));
	else if (node->child[1])
		rcu_assign_pointer(*trim, rcu_access_pointer(node->child[1]));
	else
		RCU_INIT_POINTER(*trim, NULL);
	free_node = node;

out:
	raw_spin_unlock_irqrestore(&trie->lock, irq_flags);

	bpf_mem_cache_free_rcu(&trie->ma, free_parent);
	bpf_mem_cache_free_rcu(&trie->ma, free_node);

	return ret;
}
#define LPM_DATA_SIZE_MAX	256
#define LPM_DATA_SIZE_MIN	1

#define LPM_VAL_SIZE_MAX	(KMALLOC_MAX_SIZE - LPM_DATA_SIZE_MAX - \
				 sizeof(struct lpm_trie_node))
#define LPM_VAL_SIZE_MIN	1

#define LPM_KEY_SIZE(X)		(sizeof(struct bpf_lpm_trie_key_u8) + (X))
#define LPM_KEY_SIZE_MAX	LPM_KEY_SIZE(LPM_DATA_SIZE_MAX)
#define LPM_KEY_SIZE_MIN	LPM_KEY_SIZE(LPM_DATA_SIZE_MIN)
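/* For instance, an IPv4 trie is created with attr->key_size ==
 * LPM_KEY_SIZE(4), i.e. a 4-byte prefixlen header plus a 4-byte data array,
 * and an IPv6 trie with LPM_KEY_SIZE(16).
 */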
#define LPM_CREATE_FLAG_MASK	(BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE |	\
				 BPF_F_ACCESS_MASK)
static struct bpf_map *trie_alloc(union bpf_attr *attr)
{
	struct lpm_trie *trie;
	size_t leaf_size;
	int err;
	/* check sanity of attributes */
	if (attr->max_entries == 0 ||
	    !(attr->map_flags & BPF_F_NO_PREALLOC) ||
	    attr->map_flags & ~LPM_CREATE_FLAG_MASK ||
	    !bpf_map_flags_access_ok(attr->map_flags) ||
	    attr->key_size < LPM_KEY_SIZE_MIN ||
	    attr->key_size > LPM_KEY_SIZE_MAX ||
	    attr->value_size < LPM_VAL_SIZE_MIN ||
	    attr->value_size > LPM_VAL_SIZE_MAX)
		return ERR_PTR(-EINVAL);
	trie = bpf_map_area_alloc(sizeof(*trie), NUMA_NO_NODE);
	if (!trie)
		return ERR_PTR(-ENOMEM);
	/* copy mandatory map attributes */
	bpf_map_init_from_attr(&trie->map, attr);
	trie->data_size = attr->key_size -
			  offsetof(struct bpf_lpm_trie_key_u8, data);
	trie->max_prefixlen = trie->data_size * 8;

	raw_spin_lock_init(&trie->lock);
	/* Allocate intermediate and leaf nodes from the same allocator */
	leaf_size = sizeof(struct lpm_trie_node) + trie->data_size +
		    trie->map.value_size;
	err = bpf_mem_alloc_init(&trie->ma, leaf_size, false);
	if (err)
		goto free_out;
	return &trie->map;

free_out:
	bpf_map_area_free(trie);
	return ERR_PTR(err);
}
static void trie_free(struct bpf_map *map)
{
	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
	struct lpm_trie_node __rcu **slot;
	struct lpm_trie_node *node;
	/* Always start at the root and walk down to a node that has no
	 * children. Then free that node, nullify its reference in the parent
	 * and start over.
	 */
	for (;;) {
		slot = &trie->root;

		for (;;) {
			node = rcu_dereference_protected(*slot, 1);
			if (!node)
				goto out;
			if (rcu_access_pointer(node->child[0])) {
				slot = &node->child[0];
				continue;
			}

			if (rcu_access_pointer(node->child[1])) {
				slot = &node->child[1];
				continue;
			}
			/* No bpf program may access the map, so freeing the
			 * node without waiting for the extra RCU GP.
			 */
			bpf_mem_cache_raw_free(node);
			RCU_INIT_POINTER(*slot, NULL);
			break;
		}
	}

out:
	bpf_mem_alloc_destroy(&trie->ma);
	bpf_map_area_free(trie);
}
static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
{
	struct lpm_trie_node *node, *next_node = NULL, *parent, *search_root;
	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
	struct bpf_lpm_trie_key_u8 *key = _key, *next_key = _next_key;
	struct lpm_trie_node **node_stack = NULL;
	int err = 0, stack_ptr = -1;
	unsigned int next_bit;
	size_t matchlen;
	/* The get_next_key follows postorder. For the 4 node example in
	 * the top of this file, the trie_get_next_key() returns the following
	 * one after another:
	 *   192.168.0.0/24
	 *   192.168.1.0/24
	 *   192.168.128.0/24
	 *   192.168.0.0/16
	 *
	 * The idea is to return more specific keys before less specific ones.
	 */

	/* Empty trie */
	search_root = rcu_dereference(trie->root);
	if (!search_root)
		return -ENOENT;
	/* For invalid key, find the leftmost node in the trie */
	if (!key || key->prefixlen > trie->max_prefixlen)
		goto find_leftmost;
	node_stack = kmalloc_array(trie->max_prefixlen + 1,
				   sizeof(struct lpm_trie_node *),
				   GFP_ATOMIC | __GFP_NOWARN);
	if (!node_stack)
		return -ENOMEM;
	/* Try to find the exact node for the given key */
	for (node = search_root; node;) {
		node_stack[++stack_ptr] = node;
		matchlen = longest_prefix_match(trie, node, key);
		if (node->prefixlen != matchlen ||
		    node->prefixlen == key->prefixlen)
			break;

		next_bit = extract_bit(key->data, node->prefixlen);
		node = rcu_dereference(node->child[next_bit]);
	}
	if (!node || node->prefixlen != matchlen ||
	    (node->flags & LPM_TREE_NODE_FLAG_IM))
		goto find_leftmost;
	/* The node with the exactly-matching key has been found,
	 * find the first node in postorder after the matched node.
	 */
	node = node_stack[stack_ptr];
	while (stack_ptr > 0) {
		parent = node_stack[stack_ptr - 1];
		if (rcu_dereference(parent->child[0]) == node) {
			search_root = rcu_dereference(parent->child[1]);
			if (search_root)
				goto find_leftmost;
		}
		if (!(parent->flags & LPM_TREE_NODE_FLAG_IM)) {
			next_node = parent;
			goto do_copy;
		}

		node = parent;
		stack_ptr--;
	}
	/* did not find anything */
	err = -ENOENT;
	goto free_stack;
find_leftmost:
	/* Find the leftmost non-intermediate node, all intermediate nodes
	 * have exactly two children, so this function will never return NULL.
	 */
	for (node = search_root; node;) {
		if (node->flags & LPM_TREE_NODE_FLAG_IM) {
			node = rcu_dereference(node->child[0]);
		} else {
			next_node = node;
			node = rcu_dereference(node->child[0]);
			if (!node)
				node = rcu_dereference(next_node->child[1]);
		}
	}
do_copy:
	next_key->prefixlen = next_node->prefixlen;
	memcpy((void *)next_key + offsetof(struct bpf_lpm_trie_key_u8, data),
	       next_node->data, trie->data_size);
free_stack:
	kfree(node_stack);
	return err;
}
static int trie_check_btf(const struct bpf_map *map,
			  const struct btf *btf,
			  const struct btf_type *key_type,
			  const struct btf_type *value_type)
{
	/* Keys must have struct bpf_lpm_trie_key_u8 embedded. */
	return BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT ?
	       -EINVAL : 0;
}
static u64 trie_mem_usage(const struct bpf_map *map)
{
	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
	u64 elem_size;

	elem_size = sizeof(struct lpm_trie_node) + trie->data_size +
		    trie->map.value_size;
	return elem_size * READ_ONCE(trie->n_entries);
}
BTF_ID_LIST_SINGLE(trie_map_btf_ids, struct, lpm_trie)
const struct bpf_map_ops trie_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc = trie_alloc,
	.map_free = trie_free,
	.map_get_next_key = trie_get_next_key,
	.map_lookup_elem = trie_lookup_elem,
	.map_update_elem = trie_update_elem,
	.map_delete_elem = trie_delete_elem,
	.map_lookup_batch = generic_map_lookup_batch,
	.map_update_batch = generic_map_update_batch,
	.map_delete_batch = generic_map_delete_batch,
	.map_check_btf = trie_check_btf,
	.map_mem_usage = trie_mem_usage,
	.map_btf_id = &trie_map_btf_ids[0],
};