/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2020 Alexander V. Chernikov
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/rmlock.h>
#include <sys/malloc.h>
#include <sys/socket.h>
#include <sys/sysctl.h>

#include <netinet/in.h>

#include <net/route.h>
#include <net/route/nhop.h>
#include <net/route/route_ctl.h>
#include <net/route/route_var.h>
#include <net/route/fib_algo.h>

/*
 * Binary search lookup algo.
 *
 * Compiles the route table into a sorted array.
 * Used with a small number of routes (< 16).
 * As the array is immutable, it is rebuilt on each rtable change.
 */
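
/*
 * Worked example (an illustrative sketch, assuming the usual
 * longest-prefix-match semantics): the prefixes
 *  0.0.0.0/0    -> nh1
 *  10.0.0.0/24  -> nh2
 *  10.0.0.1/32  -> nh3
 * compile into disjoint intervals keyed by their first address:
 *  0.0.0.0   -> nh1
 *  10.0.0.0  -> nh2
 *  10.0.0.1  -> nh3
 *  10.0.0.2  -> nh2   (remainder of 10.0.0.0/24 past the /32)
 *  10.0.1.0  -> nh1   (back to the default route)
 * A lookup then binary-searches for the greatest key <= the address.
 */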

struct bsearch4_record {
	uint32_t		addr4;
	uint32_t		mask4;
	struct nhop_object	*nh;
};

struct bsearch4_data {
	struct fib_data		*fd;
	uint32_t		alloc_items;
	uint32_t		num_items;
	void			*mem;
	struct bsearch4_record	*rr;
	struct bsearch4_record	br[0];
};

/*
 * Main IPv4 address lookup function.
 *
 * Finds the array record with the maximum index that is <= the provided key.
 * Assumes 0.0.0.0/0 always exists (possibly with a NULL nhop).
 */
static struct nhop_object *
bsearch4_lookup(void *algo_data, const struct flm_lookup_key key, uint32_t scopeid)
{
	const struct bsearch4_data *bd = (const struct bsearch4_data *)algo_data;
	const struct bsearch4_record *br;
	uint32_t addr4 = ntohl(key.addr4.s_addr);

	int start = 0;
	int end = bd->num_items;

	int i = (start + end) / 2;
	while (start + 1 < end) {
		i = (start + end) / 2;
		br = &bd->br[i];
		if (addr4 < br->addr4) {
			/* key < average, reduce right boundary */
			end = i;
			continue;
		} else if (addr4 > br->addr4) {
			/* key > average, increase left boundary */
			start = i;
			continue;
		} else {
			/* direct match */
			return (br->nh);
		}
	}
	/* start + 1 == end */
	return (bd->br[start].nh);
}
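
/*
 * Note: the loop above maintains the invariant
 *  br[start].addr4 <= addr4 < br[end].addr4
 * (with end == num_items acting as +infinity), so on termination with
 * start + 1 == end, br[start] holds the greatest key <= addr4. In the
 * worked example above, looking up 10.0.0.200 lands on the
 * 10.0.0.2 -> nh2 record.
 */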

/*
 * Preference function.
 * Assume ideal for < 10 prefixes (a typical single-interface setup has 5),
 * then gradually degrade.
 * Assume 30 prefixes is at least 60 records, so a lookup will require
 * 8 probes, which is even worse than radix.
 */
static uint8_t
bsearch4_get_pref(const struct rib_rtable_info *rinfo)
{

	if (rinfo->num_prefixes < 10)
		return (253);
	else if (rinfo->num_prefixes < 30)
		return (255 - rinfo->num_prefixes * 8);
	else
		return (1);
}
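
/*
 * Worked example (values assume the constants above): with 20 prefixes
 * the middle branch yields 255 - 20 * 8 = 95, so the preference drops
 * quickly once the table outgrows the < 10 sweet spot.
 */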

static enum flm_op_result
bsearch4_init(uint32_t fibnum, struct fib_data *fd, void *_old_data, void **_data)
{
	struct bsearch4_data *bd;
	struct rib_rtable_info rinfo;
	uint32_t count;
	size_t sz;
	void *mem;

	fib_get_rtable_info(fib_get_rh(fd), &rinfo);
	count = rinfo.num_prefixes * 11 / 10 + 64;

	sz = sizeof(struct bsearch4_data) + sizeof(struct bsearch4_record) * count;
	/* add cache line sz to ease alignment */
	sz += CACHE_LINE_SIZE;
	mem = malloc(sz, M_RTABLE, M_NOWAIT | M_ZERO);
	if (mem == NULL)
		return (FLM_REBUILD);
	/* Align datapath-usable structure to cache line boundary */
	bd = (struct bsearch4_data *)roundup2((uintptr_t)mem, CACHE_LINE_SIZE);
	bd->mem = mem;
	bd->alloc_items = count;
	bd->fd = fd;

	*_data = bd;

	/*
	 * Allocate temporary array to store all rtable data.
	 * This step is required to provide the proper prefix iteration order.
	 */
	bd->rr = mallocarray(count, sizeof(struct bsearch4_record), M_TEMP,
	    M_NOWAIT | M_ZERO);
	if (bd->rr == NULL)
		return (FLM_REBUILD);

	return (FLM_SUCCESS);
}
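
/*
 * Note on the sizing above: count reserves ~10% headroom plus a fixed
 * 64-entry pad over the current prefix count, so routes added while the
 * rtable dump is in progress do not immediately overflow the
 * preallocated space (the callbacks return FLM_REBUILD once it does).
 */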

static void
bsearch4_destroy(void *_data)
{
	struct bsearch4_data *bd = (struct bsearch4_data *)_data;

	if (bd->rr != NULL)
		free(bd->rr, M_TEMP);
	free(bd->mem, M_RTABLE);
}

/*
 * Callback storing converted rtable prefixes in the temporary array.
 * Addresses are converted to host order.
 */
static enum flm_op_result
bsearch4_add_route_cb(struct rtentry *rt, void *_data)
{
	struct bsearch4_data *bd = (struct bsearch4_data *)_data;
	struct bsearch4_record *rr;
	struct in_addr addr4, mask4;
	uint32_t scopeid;

	if (bd->num_items >= bd->alloc_items)
		return (FLM_REBUILD);

	rr = &bd->rr[bd->num_items++];
	rt_get_inet_prefix_pmask(rt, &addr4, &mask4, &scopeid);
	rr->addr4 = ntohl(addr4.s_addr);
	rr->mask4 = ntohl(mask4.s_addr);
	rr->nh = rt_get_raw_nhop(rt);

	return (FLM_SUCCESS);
}

/*
 * Prefix comparison function.
 * 10.0.0.0/24 < 10.0.0.0/25 <- less specific wins
 * 10.0.0.0/25 < 10.0.0.1/32 <- bigger base wins
 */
static int
rr_cmp(const void *_rec1, const void *_rec2)
{
	const struct bsearch4_record *rec1, *rec2;

	rec1 = _rec1;
	rec2 = _rec2;
	if (rec1->addr4 < rec2->addr4)
		return (-1);
	else if (rec1->addr4 > rec2->addr4)
		return (1);

	/*
	 * A wider prefix has the lesser mask value;
	 * we want less specific entries to come first, e.g. <
	 */
	if (rec1->mask4 < rec2->mask4)
		return (-1);
	else if (rec1->mask4 > rec2->mask4)
		return (1);
	return (0);
}
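
/*
 * Worked example: under rr_cmp(), host-order sorting yields
 *  0.0.0.0/0 < 10.0.0.0/24 < 10.0.0.0/25 < 10.0.0.1/32
 * i.e. every covering prefix precedes the more-specific prefixes it
 * contains, which is exactly the iteration order the build stack below
 * relies on.
 */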

struct bsearch4_array {
	uint32_t		alloc_items;
	uint32_t		num_items;
	struct bsearch4_record	*arr;
};

static bool
add_array_entry(struct bsearch4_array *ba, struct bsearch4_record *br_new)
{

	if (ba->num_items < ba->alloc_items) {
		ba->arr[ba->num_items++] = *br_new;
		return (true);
	}
	return (false);
}

static struct bsearch4_record *
get_last_entry(struct bsearch4_array *ba)
{

	return (&ba->arr[ba->num_items - 1]);
}

/*
 * Pops the top stack entry and, if it covers more address space than the
 * last record in the lookup array, appends the uncovered remainder.
 *
 * Example:
 *  stack: 10.0.1.0/24,nh=3 array: 10.0.1.0/25,nh=4 -> ++10.0.1.128/24,nh=3
 */
static bool
pop_stack_entry(struct bsearch4_array *dst_array, struct bsearch4_array *stack)
{
	uint32_t last_stack_addr, last_array_addr;

	struct bsearch4_record *br_prev = get_last_entry(dst_array);
	struct bsearch4_record *pstack = get_last_entry(stack);

	/* Regardless of the result, pop stack entry */
	stack->num_items--;

	/* Prefix last address for the last entry in lookup array */
	last_array_addr = (br_prev->addr4 | ~br_prev->mask4);
	/* Prefix last address for the stack record entry */
	last_stack_addr = (pstack->addr4 | ~pstack->mask4);

	if (last_stack_addr > last_array_addr) {
		/*
		 * Stack record covers more address space than
		 * the last entry in the lookup array.
		 * Add the remaining parts of the stack record to
		 * the lookup array.
		 */
		struct bsearch4_record br_new = {
			.addr4 = last_array_addr + 1,
			.mask4 = pstack->mask4,
			.nh = pstack->nh,
		};
		return (add_array_entry(dst_array, &br_new));
	}

	return (true);
}
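
/*
 * Walk-through of the example above: the /25 array entry ends at
 * 10.0.1.127 while the popped /24 stack entry ends at 10.0.1.255.
 * Since 10.0.1.255 > 10.0.1.127, the remainder starting at 10.0.1.128
 * is appended with the stack record's nexthop (nh=3), keeping the
 * interval array gapless.
 */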

/*
 * Updates the resulting array @dst_array with a rib entry @rib_entry.
 */
static bool
bsearch4_process_record(struct bsearch4_array *dst_array,
    struct bsearch4_array *stack, struct bsearch4_record *rib_entry)
{

	/*
	 * Maintain invariant: the current rib_entry is always contained
	 * in the top stack entry.
	 * Note we always have 0.0.0.0/0.
	 */
	while (stack->num_items > 0) {
		struct bsearch4_record *pst = get_last_entry(stack);

		/*
		 * Check if we need to pop the stack.
		 * Rely on the ordering - larger prefixes come up first.
		 * Pop any entry that doesn't contain the current prefix.
		 */
		if (pst->addr4 == (rib_entry->addr4 & pst->mask4))
			break;

		if (!pop_stack_entry(dst_array, stack))
			return (false);
	}

	if (dst_array->num_items > 0) {

		/*
		 * Check if there is a gap between the previous entry and the
		 * current entry. The code above guarantees that both previous
		 * and current entries are contained in the top stack entry.
		 *
		 * Example: last: 10.0.0.1(/32,nh=3) cur: 10.0.0.3(/32,nh=4),
		 *  stack: 10.0.0.0/24,nh=2.
		 * Cover the gap between previous and current by adding the
		 *  stack nexthop.
		 */
		struct bsearch4_record *br_tmp = get_last_entry(dst_array);
		uint32_t last_declared_addr = br_tmp->addr4 | ~br_tmp->mask4;
		if (last_declared_addr < rib_entry->addr4 - 1) {
			/* Cover the hole */
			struct bsearch4_record *pst = get_last_entry(stack);
			struct bsearch4_record new_entry = {
				.addr4 = last_declared_addr + 1,
				.mask4 = pst->mask4,
				.nh = pst->nh,
			};
			if (!add_array_entry(dst_array, &new_entry))
				return (false);
		}

		/*
		 * Special case: adding a more specific prefix at the start of
		 * the previous interval:
		 * 10.0.0.0(/24,nh=3), 10.0.0.0(/25,nh=4)
		 * Alter the last record, setting the new nexthop and mask.
		 */
		if (br_tmp->addr4 == rib_entry->addr4) {
			*br_tmp = *rib_entry;
			add_array_entry(stack, rib_entry);
			return (true);
		}
	}

	if (!add_array_entry(dst_array, rib_entry))
		return (false);
	add_array_entry(stack, rib_entry);

	return (true);
}
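
/*
 * Walk-through of the gap example above: the array ends at 10.0.0.1
 * (the previous /32), while the current entry starts at 10.0.0.3.
 * The hole at 10.0.0.2 is first covered by an interval inheriting nh=2
 * from the 10.0.0.0/24 stack entry; only then is 10.0.0.3 -> nh4
 * appended.
 */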

static enum flm_op_result
bsearch4_build_array(struct bsearch4_array *dst_array, struct bsearch4_array *src_array)
{

	/*
	 * During iteration, we keep track of all prefixes in the rtable
	 * we currently match, by maintaining a stack. As there can be only
	 * 32 prefixes for a single address, pre-allocate a stack of size 32.
	 */
	struct bsearch4_array stack = {
		.alloc_items = 32,
		.arr = mallocarray(32, sizeof(struct bsearch4_record), M_TEMP,
		    M_NOWAIT | M_ZERO),
	};
	if (stack.arr == NULL)
		return (FLM_REBUILD);

	for (int i = 0; i < src_array->num_items; i++) {
		struct bsearch4_record *rib_entry = &src_array->arr[i];

		if (!bsearch4_process_record(dst_array, &stack, rib_entry)) {
			free(stack.arr, M_TEMP);
			return (FLM_REBUILD);
		}
	}

	/*
	 * We know that the last record is contained in the top stack entry.
	 * Flush the remaining stack entries, covering the rest of the
	 * address space.
	 */
	while (stack.num_items > 0) {
		if (!pop_stack_entry(dst_array, &stack)) {
			free(stack.arr, M_TEMP);
			return (FLM_REBUILD);
		}
	}
	free(stack.arr, M_TEMP);

	return (FLM_SUCCESS);
}
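
/*
 * Summary: the build is a single left-to-right sweep over the sorted
 * prefixes. The stack holds the chain of covering prefixes for the
 * current address, and every pop emits whatever part of a covering
 * prefix is not shadowed by more-specific routes, yielding the gapless
 * sorted interval array that bsearch4_lookup() binary-searches.
 */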

static enum flm_op_result
bsearch4_build(struct bsearch4_data *bd)
{
	enum flm_op_result ret;

	struct bsearch4_array prefixes_array = {
		.alloc_items = bd->alloc_items,
		.num_items = bd->num_items,
		.arr = bd->rr,
	};

	/* Add the default route if it does not exist */
	bool default_found = false;
	for (int i = 0; i < prefixes_array.num_items; i++) {
		if (prefixes_array.arr[i].mask4 == 0) {
			default_found = true;
			break;
		}
	}
	if (!default_found) {
		/* Add default route with NULL nhop */
		struct bsearch4_record default_entry = {};
		if (!add_array_entry(&prefixes_array, &default_entry))
			return (FLM_REBUILD);
	}

	qsort(prefixes_array.arr, prefixes_array.num_items,
	    sizeof(struct bsearch4_record), rr_cmp);

	struct bsearch4_array dst_array = {
		.alloc_items = bd->alloc_items,
		.arr = bd->br,
	};

	ret = bsearch4_build_array(&dst_array, &prefixes_array);
	bd->num_items = dst_array.num_items;

	free(bd->rr, M_TEMP);
	bd->rr = NULL;

	return (ret);
}

static enum flm_op_result
bsearch4_end_dump(void *_data, struct fib_dp *dp)
{
	struct bsearch4_data *bd = (struct bsearch4_data *)_data;
	enum flm_op_result ret;

	ret = bsearch4_build(bd);
	if (ret == FLM_SUCCESS) {
		dp->f = bsearch4_lookup;
		dp->arg = bd;
	}

	return (ret);
}

static enum flm_op_result
bsearch4_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc,
    void *_data)
{

	return (FLM_REBUILD);
}

struct fib_lookup_module flm_bsearch4 = {
	.flm_name = "bsearch4",
	.flm_family = AF_INET,
	.flm_init_cb = bsearch4_init,
	.flm_destroy_cb = bsearch4_destroy,
	.flm_dump_rib_item_cb = bsearch4_add_route_cb,
	.flm_dump_end_cb = bsearch4_end_dump,
	.flm_change_rib_item_cb = bsearch4_change_cb,
	.flm_get_pref = bsearch4_get_pref,
};

/*
 * Lockless radix lookup algo.
 *
 * Compiles an immutable radix from the current routing table.
 * Used with a small number of routes (< 1000).
 * As the datastructure is immutable, it gets rebuilt on each rtable change.
 *
 * Lookups are slightly faster as shorter lookup keys are used
 * (4 bytes instead of 8 in the stock radix).
 */

#define	KEY_LEN_INET	(offsetof(struct sockaddr_in, sin_addr) + sizeof(in_addr_t))
#define	OFF_LEN_INET	(8 * offsetof(struct sockaddr_in, sin_addr))
struct radix4_addr_entry {
	struct radix_node	rn[2];
	struct sockaddr_in	addr;
	struct nhop_object	*nhop;
};
#define	LRADIX4_ITEM_SZ	roundup2(sizeof(struct radix4_addr_entry), 64)
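
/*
 * Note: with the standard sockaddr_in layout, sin_addr lives at offset 4,
 * so KEY_LEN_INET evaluates to 8 (a 4-byte header plus the 4-byte
 * address) and OFF_LEN_INET makes the radix skip straight to the address
 * bits, giving the shorter lookup keys mentioned in the comment above.
 */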

struct lradix4_data {
	struct radix_node_head	*rnh;
	struct fib_data		*fd;
	void			*mem;
	char			*rt_base;
	uint32_t		alloc_items;
	uint32_t		num_items;
};

static struct nhop_object *
lradix4_lookup(void *algo_data, const struct flm_lookup_key key, uint32_t scopeid)
{
	struct radix_node_head *rnh = (struct radix_node_head *)algo_data;
	struct radix4_addr_entry *ent;
	struct sockaddr_in addr4 = {
		.sin_len = KEY_LEN_INET,
		.sin_addr = key.addr4,
	};

	ent = (struct radix4_addr_entry *)(rn_match(&addr4, &rnh->rh));
	if (ent != NULL)
		return (ent->nhop);
	return (NULL);
}

/*
 * Preference function.
 * Assume close-to-ideal for < 10 routes (though worse than bsearch), then
 * gradually degrade until 1000 routes are reached.
 */
static uint8_t
lradix4_get_pref(const struct rib_rtable_info *rinfo)
{

	if (rinfo->num_prefixes < 10)
		return (250);
	else if (rinfo->num_prefixes < 1000)
		return (254 - rinfo->num_prefixes / 4);
	else
		return (1);
}
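
/*
 * Worked example (values assume the constants above): at 100 routes the
 * middle branch gives 254 - 100 / 4 = 229, so among the algorithms in
 * this file the lockless radix stays preferred long after bsearch4 has
 * degraded, until the table approaches the 1000-route mark.
 */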

static enum flm_op_result
lradix4_init(uint32_t fibnum, struct fib_data *fd, void *_old_data, void **_data)
{
	struct lradix4_data *lr;
	struct rib_rtable_info rinfo;
	uint32_t count;
	size_t sz;

	lr = malloc(sizeof(struct lradix4_data), M_RTABLE, M_NOWAIT | M_ZERO);
	if (lr == NULL || !rn_inithead((void **)&lr->rnh, OFF_LEN_INET))
		return (FLM_REBUILD);
	fib_get_rtable_info(fib_get_rh(fd), &rinfo);

	count = rinfo.num_prefixes * 11 / 10;
	sz = count * LRADIX4_ITEM_SZ + CACHE_LINE_SIZE;
	lr->mem = malloc(sz, M_RTABLE, M_NOWAIT | M_ZERO);
	if (lr->mem == NULL)
		return (FLM_REBUILD);
	/* Align all rtentries to a cacheline boundary */
	lr->rt_base = (char *)roundup2((uintptr_t)lr->mem, CACHE_LINE_SIZE);
	lr->alloc_items = count;
	lr->fd = fd;

	*_data = lr;

	return (FLM_SUCCESS);
}

static void
lradix4_destroy(void *_data)
{
	struct lradix4_data *lr = (struct lradix4_data *)_data;

	if (lr->rnh != NULL)
		rn_detachhead((void **)&lr->rnh);
	if (lr->mem != NULL)
		free(lr->mem, M_RTABLE);
	free(lr, M_RTABLE);
}

static enum flm_op_result
lradix4_add_route_cb(struct rtentry *rt, void *_data)
{
	struct lradix4_data *lr = (struct lradix4_data *)_data;
	struct radix4_addr_entry *ae;
	struct sockaddr_in mask;
	struct sockaddr *rt_mask;
	struct radix_node *rn;
	struct in_addr addr4, mask4;
	uint32_t scopeid;

	if (lr->num_items >= lr->alloc_items)
		return (FLM_REBUILD);

	ae = (struct radix4_addr_entry *)(lr->rt_base + lr->num_items * LRADIX4_ITEM_SZ);
	lr->num_items++;

	ae->nhop = rt_get_raw_nhop(rt);

	rt_get_inet_prefix_pmask(rt, &addr4, &mask4, &scopeid);
	ae->addr.sin_len = KEY_LEN_INET;
	ae->addr.sin_addr = addr4;

	if (mask4.s_addr != INADDR_BROADCAST) {
		bzero(&mask, sizeof(mask));
		mask.sin_len = KEY_LEN_INET;
		mask.sin_addr = mask4;
		rt_mask = (struct sockaddr *)&mask;
	} else
		rt_mask = NULL;

	rn = lr->rnh->rnh_addaddr((struct sockaddr *)&ae->addr, rt_mask,
	    &lr->rnh->rh, ae->rn);
	if (rn == NULL)
		return (FLM_REBUILD);

	return (FLM_SUCCESS);
}
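
/*
 * Note: a full /32 mask (INADDR_BROADCAST) is passed to rnh_addaddr()
 * as a NULL mask, so host routes are inserted as radix host entries
 * rather than prefix entries, keeping the tree smaller and the match
 * path shorter.
 */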

static enum flm_op_result
lradix4_end_dump(void *_data, struct fib_dp *dp)
{
	struct lradix4_data *lr = (struct lradix4_data *)_data;

	dp->f = lradix4_lookup;
	dp->arg = lr->rnh;

	return (FLM_SUCCESS);
}

static enum flm_op_result
lradix4_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc,
    void *_data)
{

	return (FLM_REBUILD);
}

struct fib_lookup_module flm_radix4_lockless = {
	.flm_name = "radix4_lockless",
	.flm_family = AF_INET,
	.flm_init_cb = lradix4_init,
	.flm_destroy_cb = lradix4_destroy,
	.flm_dump_rib_item_cb = lradix4_add_route_cb,
	.flm_dump_end_cb = lradix4_end_dump,
	.flm_change_rib_item_cb = lradix4_change_cb,
	.flm_get_pref = lradix4_get_pref,
};

/*
 * Fallback lookup algorithm.
 * This is a simple wrapper around the system radix.
 */

struct radix4_data {
	struct fib_data *fd;
	struct rib_head *rh;
};

static struct nhop_object *
radix4_lookup(void *algo_data, const struct flm_lookup_key key, uint32_t scopeid)
{
	RIB_RLOCK_TRACKER;
	struct rib_head *rh = (struct rib_head *)algo_data;
	struct radix_node *rn;
	struct nhop_object *nh;

	/* Prepare lookup key */
	struct sockaddr_in sin4 = {
		.sin_family = AF_INET,
		.sin_len = sizeof(struct sockaddr_in),
		.sin_addr = key.addr4,
	};

	nh = NULL;
	RIB_RLOCK(rh);
	rn = rn_match((void *)&sin4, &rh->head);
	if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0))
		nh = (RNTORT(rn))->rt_nhop;
	RIB_RUNLOCK(rh);

	return (nh);
}
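
/*
 * Note: unlike the two immutable datastructures above, this fallback
 * walks the live rtable radix, so every lookup takes the rib read lock;
 * its low, flat preference value below reflects that cost.
 */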

static uint8_t
radix4_get_pref(const struct rib_rtable_info *rinfo)
{

	return (50);
}

static enum flm_op_result
radix4_init(uint32_t fibnum, struct fib_data *fd, void *_old_data, void **_data)
{
	struct radix4_data *r4;

	r4 = malloc(sizeof(struct radix4_data), M_RTABLE, M_NOWAIT | M_ZERO);
	if (r4 == NULL)
		return (FLM_REBUILD);
	r4->fd = fd;
	r4->rh = fib_get_rh(fd);

	*_data = r4;

	return (FLM_SUCCESS);
}

static void
radix4_destroy(void *_data)
{

	free(_data, M_RTABLE);
}

static enum flm_op_result
radix4_add_route_cb(struct rtentry *rt, void *_data)
{

	return (FLM_SUCCESS);
}

static enum flm_op_result
radix4_end_dump(void *_data, struct fib_dp *dp)
{
	struct radix4_data *r4 = (struct radix4_data *)_data;

	dp->f = radix4_lookup;
	dp->arg = r4->rh;

	return (FLM_SUCCESS);
}

static enum flm_op_result
radix4_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc,
    void *_data)
{

	return (FLM_SUCCESS);
}

struct fib_lookup_module flm_radix4 = {
	.flm_name = "radix4",
	.flm_family = AF_INET,
	.flm_init_cb = radix4_init,
	.flm_destroy_cb = radix4_destroy,
	.flm_dump_rib_item_cb = radix4_add_route_cb,
	.flm_dump_end_cb = radix4_end_dump,
	.flm_change_rib_item_cb = radix4_change_cb,
	.flm_get_pref = radix4_get_pref,
};

static void
fib4_algo_init(void)
{

	fib_module_register(&flm_bsearch4);
	fib_module_register(&flm_radix4_lockless);
	fib_module_register(&flm_radix4);
}
SYSINIT(fib4_algo_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, fib4_algo_init, NULL);