2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
11 * Agricultural Sciences.
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
15 * This work is based on the LPC-trie which is originally descibed in:
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
25 * Version: $Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
28 * Code from fib_hash has been reused which includes the following header:
31 * INET An implementation of the TCP/IP protocol suite for the LINUX
32 * operating system. INET is implemented using the BSD Socket
33 * interface as the means of communication with the user level.
35 * IPv4 FIB: lookup engine and maintenance routines.
38 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
40 * This program is free software; you can redistribute it and/or
41 * modify it under the terms of the GNU General Public License
42 * as published by the Free Software Foundation; either version
43 * 2 of the License, or (at your option) any later version.
46 #define VERSION "0.323"
48 #include <linux/config.h>
49 #include <asm/uaccess.h>
50 #include <asm/system.h>
51 #include <asm/bitops.h>
52 #include <linux/types.h>
53 #include <linux/kernel.h>
54 #include <linux/sched.h>
56 #include <linux/string.h>
57 #include <linux/socket.h>
58 #include <linux/sockios.h>
59 #include <linux/errno.h>
61 #include <linux/inet.h>
62 #include <linux/netdevice.h>
63 #include <linux/if_arp.h>
64 #include <linux/proc_fs.h>
65 #include <linux/skbuff.h>
66 #include <linux/netlink.h>
67 #include <linux/init.h>
68 #include <linux/list.h>
70 #include <net/protocol.h>
71 #include <net/route.h>
74 #include <net/ip_fib.h>
75 #include "fib_lookup.h"
77 #undef CONFIG_IP_FIB_TRIE_STATS
78 #define MAX_CHILDS 16384
80 #define EXTRACT(p, n, str) ((str)<<(p)>>(32-(n)))
81 #define KEYLENGTH (8*sizeof(t_key))
82 #define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l))
83 #define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset))
85 static DEFINE_RWLOCK(fib_lock
);
87 typedef unsigned int t_key
;
91 #define NODE_TYPE_MASK 0x1UL
92 #define NODE_PARENT(_node) \
93 ((struct tnode *)((_node)->_parent & ~NODE_TYPE_MASK))
94 #define NODE_SET_PARENT(_node, _ptr) \
95 ((_node)->_parent = (((unsigned long)(_ptr)) | \
96 ((_node)->_parent & NODE_TYPE_MASK)))
97 #define NODE_INIT_PARENT(_node, _type) \
98 ((_node)->_parent = (_type))
99 #define NODE_TYPE(_node) \
100 ((_node)->_parent & NODE_TYPE_MASK)
102 #define IS_TNODE(n) (!(n->_parent & T_LEAF))
103 #define IS_LEAF(n) (n->_parent & T_LEAF)
107 unsigned long _parent
;
112 unsigned long _parent
;
113 struct hlist_head list
;
117 struct hlist_node hlist
;
119 struct list_head falh
;
124 unsigned long _parent
;
125 unsigned short pos
:5; /* 2log(KEYLENGTH) bits needed */
126 unsigned short bits
:5; /* 2log(KEYLENGTH) bits needed */
127 unsigned short full_children
; /* KEYLENGTH bits needed */
128 unsigned short empty_children
; /* KEYLENGTH bits needed */
129 struct node
*child
[0];
132 #ifdef CONFIG_IP_FIB_TRIE_STATS
133 struct trie_use_stats
{
135 unsigned int backtrack
;
136 unsigned int semantic_match_passed
;
137 unsigned int semantic_match_miss
;
138 unsigned int null_node_hit
;
143 unsigned int totdepth
;
144 unsigned int maxdepth
;
147 unsigned int nullpointers
;
148 unsigned int nodesizes
[MAX_CHILDS
];
153 #ifdef CONFIG_IP_FIB_TRIE_STATS
154 struct trie_use_stats stats
;
157 unsigned int revision
;
160 static int trie_debug
= 0;
162 static int tnode_full(struct tnode
*tn
, struct node
*n
);
163 static void put_child(struct trie
*t
, struct tnode
*tn
, int i
, struct node
*n
);
164 static void tnode_put_child_reorg(struct tnode
*tn
, int i
, struct node
*n
, int wasfull
);
165 static int tnode_child_length(struct tnode
*tn
);
166 static struct node
*resize(struct trie
*t
, struct tnode
*tn
);
167 static struct tnode
*inflate(struct trie
*t
, struct tnode
*tn
);
168 static struct tnode
*halve(struct trie
*t
, struct tnode
*tn
);
169 static void tnode_free(struct tnode
*tn
);
170 static void trie_dump_seq(struct seq_file
*seq
, struct trie
*t
);
171 extern struct fib_alias
*fib_find_alias(struct list_head
*fah
, u8 tos
, u32 prio
);
172 extern int fib_detect_death(struct fib_info
*fi
, int order
,
173 struct fib_info
**last_resort
, int *last_idx
, int *dflt
);
175 extern void rtmsg_fib(int event
, u32 key
, struct fib_alias
*fa
, int z
, int tb_id
,
176 struct nlmsghdr
*n
, struct netlink_skb_parms
*req
);
178 static kmem_cache_t
*fn_alias_kmem
;
179 static struct trie
*trie_local
= NULL
, *trie_main
= NULL
;
181 static void trie_bug(char *err
)
183 printk("Trie Bug: %s\n", err
);
187 static inline struct node
*tnode_get_child(struct tnode
*tn
, int i
)
189 if (i
>= 1<<tn
->bits
)
190 trie_bug("tnode_get_child");
195 static inline int tnode_child_length(struct tnode
*tn
)
201 _________________________________________________________________
202 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
203 ----------------------------------------------------------------
204 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
206 _________________________________________________________________
207 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
208 -----------------------------------------------------------------
209 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
218 static inline t_key
tkey_extract_bits(t_key a
, int offset
, int bits
)
220 if (offset
< KEYLENGTH
)
221 return ((t_key
)(a
<< offset
)) >> (KEYLENGTH
- bits
);
226 static inline int tkey_equals(t_key a
, t_key b
)
231 static inline int tkey_sub_equals(t_key a
, int offset
, int bits
, t_key b
)
233 if (bits
== 0 || offset
>= KEYLENGTH
)
235 bits
= bits
> KEYLENGTH
? KEYLENGTH
: bits
;
236 return ((a
^ b
) << offset
) >> (KEYLENGTH
- bits
) == 0;
239 static inline int tkey_mismatch(t_key a
, int offset
, t_key b
)
246 while((diff
<< i
) >> (KEYLENGTH
-1) == 0)
251 /* Candiate for fib_semantics */
253 static void fn_free_alias(struct fib_alias
*fa
)
255 fib_release_info(fa
->fa_info
);
256 kmem_cache_free(fn_alias_kmem
, fa
);
260 To understand this stuff, an understanding of keys and all their bits is
261 necessary. Every node in the trie has a key associated with it, but not
262 all of the bits in that key are significant.
264 Consider a node 'n' and its parent 'tp'.
266 If n is a leaf, every bit in its key is significant. Its presence is
267 necessitaded by path compression, since during a tree traversal (when
268 searching for a leaf - unless we are doing an insertion) we will completely
269 ignore all skipped bits we encounter. Thus we need to verify, at the end of
270 a potentially successful search, that we have indeed been walking the
273 Note that we can never "miss" the correct key in the tree if present by
274 following the wrong path. Path compression ensures that segments of the key
275 that are the same for all keys with a given prefix are skipped, but the
276 skipped part *is* identical for each node in the subtrie below the skipped
277 bit! trie_insert() in this implementation takes care of that - note the
278 call to tkey_sub_equals() in trie_insert().
280 if n is an internal node - a 'tnode' here, the various parts of its key
281 have many different meanings.
284 _________________________________________________________________
285 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
286 -----------------------------------------------------------------
287 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
289 _________________________________________________________________
290 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
291 -----------------------------------------------------------------
292 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
299 First, let's just ignore the bits that come before the parent tp, that is
300 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
301 not use them for anything.
303 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
304 index into the parent's child array. That is, they will be used to find
305 'n' among tp's children.
307 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
310 All the bits we have seen so far are significant to the node n. The rest
311 of the bits are really not needed or indeed known in n->key.
313 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
314 n's child array, and will of course be different for each child.
316 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
321 static void check_tnode(struct tnode
*tn
)
323 if(tn
&& tn
->pos
+tn
->bits
> 32) {
324 printk("TNODE ERROR tn=%p, pos=%d, bits=%d\n", tn
, tn
->pos
, tn
->bits
);
328 static int halve_threshold
= 25;
329 static int inflate_threshold
= 50;
331 static struct leaf
*leaf_new(void)
333 struct leaf
*l
= kmalloc(sizeof(struct leaf
), GFP_KERNEL
);
335 NODE_INIT_PARENT(l
, T_LEAF
);
336 INIT_HLIST_HEAD(&l
->list
);
341 static struct leaf_info
*leaf_info_new(int plen
)
343 struct leaf_info
*li
= kmalloc(sizeof(struct leaf_info
), GFP_KERNEL
);
345 INIT_LIST_HEAD(&li
->falh
);
349 static inline void free_leaf(struct leaf
*l
)
354 static inline void free_leaf_info(struct leaf_info
*li
)
359 static struct tnode
* tnode_new(t_key key
, int pos
, int bits
)
361 int nchildren
= 1<<bits
;
362 int sz
= sizeof(struct tnode
) + nchildren
* sizeof(struct node
*);
363 struct tnode
*tn
= kmalloc(sz
, GFP_KERNEL
);
367 NODE_INIT_PARENT(tn
, T_TNODE
);
371 tn
->full_children
= 0;
372 tn
->empty_children
= 1<<bits
;
375 printk("AT %p s=%u %u\n", tn
, (unsigned int) sizeof(struct tnode
),
376 (unsigned int) (sizeof(struct node
) * 1<<bits
));
380 static void tnode_free(struct tnode
*tn
)
383 trie_bug("tnode_free\n");
386 free_leaf((struct leaf
*)tn
);
388 printk("FL %p \n", tn
);
390 else if(IS_TNODE(tn
)) {
393 printk("FT %p \n", tn
);
396 trie_bug("tnode_free\n");
401 * Check whether a tnode 'n' is "full", i.e. it is an internal node
402 * and no bits are skipped. See discussion in dyntree paper p. 6
405 static inline int tnode_full(struct tnode
*tn
, struct node
*n
)
407 if(n
== NULL
|| IS_LEAF(n
))
410 return ((struct tnode
*) n
)->pos
== tn
->pos
+ tn
->bits
;
413 static inline void put_child(struct trie
*t
, struct tnode
*tn
, int i
, struct node
*n
)
415 tnode_put_child_reorg(tn
, i
, n
, -1);
419 * Add a child at position i overwriting the old value.
420 * Update the value of full_children and empty_children.
423 static void tnode_put_child_reorg(struct tnode
*tn
, int i
, struct node
*n
, int wasfull
)
428 if(i
>= 1<<tn
->bits
) {
429 printk("bits=%d, i=%d\n", tn
->bits
, i
);
430 trie_bug("tnode_put_child_reorg bits");
432 write_lock_bh(&fib_lock
);
435 /* update emptyChildren */
436 if (n
== NULL
&& chi
!= NULL
)
437 tn
->empty_children
++;
438 else if (n
!= NULL
&& chi
== NULL
)
439 tn
->empty_children
--;
441 /* update fullChildren */
443 wasfull
= tnode_full(tn
, chi
);
445 isfull
= tnode_full(tn
, n
);
446 if (wasfull
&& !isfull
)
449 else if (!wasfull
&& isfull
)
452 NODE_SET_PARENT(n
, tn
);
455 write_unlock_bh(&fib_lock
);
458 static struct node
*resize(struct trie
*t
, struct tnode
*tn
)
466 printk("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
467 tn
, inflate_threshold
, halve_threshold
);
470 if (tn
->empty_children
== tnode_child_length(tn
)) {
475 if (tn
->empty_children
== tnode_child_length(tn
) - 1)
476 for (i
= 0; i
< tnode_child_length(tn
); i
++) {
478 write_lock_bh(&fib_lock
);
479 if (tn
->child
[i
] != NULL
) {
481 /* compress one level */
482 struct node
*n
= tn
->child
[i
];
484 NODE_INIT_PARENT(n
, NODE_TYPE(n
));
486 write_unlock_bh(&fib_lock
);
490 write_unlock_bh(&fib_lock
);
493 * Double as long as the resulting node has a number of
494 * nonempty nodes that are above the threshold.
498 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
499 * the Helsinki University of Technology and Matti Tikkanen of Nokia
500 * Telecommunications, page 6:
501 * "A node is doubled if the ratio of non-empty children to all
502 * children in the *doubled* node is at least 'high'."
504 * 'high' in this instance is the variable 'inflate_threshold'. It
505 * is expressed as a percentage, so we multiply it with
506 * tnode_child_length() and instead of multiplying by 2 (since the
507 * child array will be doubled by inflate()) and multiplying
508 * the left-hand side by 100 (to handle the percentage thing) we
509 * multiply the left-hand side by 50.
511 * The left-hand side may look a bit weird: tnode_child_length(tn)
512 * - tn->empty_children is of course the number of non-null children
513 * in the current node. tn->full_children is the number of "full"
514 * children, that is non-null tnodes with a skip value of 0.
515 * All of those will be doubled in the resulting inflated tnode, so
516 * we just count them one extra time here.
518 * A clearer way to write this would be:
520 * to_be_doubled = tn->full_children;
521 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
524 * new_child_length = tnode_child_length(tn) * 2;
526 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
528 * if (new_fill_factor >= inflate_threshold)
530 * ...and so on, tho it would mess up the while() loop.
533 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
537 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
538 * inflate_threshold * new_child_length
540 * expand not_to_be_doubled and to_be_doubled, and shorten:
541 * 100 * (tnode_child_length(tn) - tn->empty_children +
542 * tn->full_children ) >= inflate_threshold * new_child_length
544 * expand new_child_length:
545 * 100 * (tnode_child_length(tn) - tn->empty_children +
546 * tn->full_children ) >=
547 * inflate_threshold * tnode_child_length(tn) * 2
550 * 50 * (tn->full_children + tnode_child_length(tn) -
551 * tn->empty_children ) >= inflate_threshold *
552 * tnode_child_length(tn)
558 while ((tn
->full_children
> 0 &&
559 50 * (tn
->full_children
+ tnode_child_length(tn
) - tn
->empty_children
) >=
560 inflate_threshold
* tnode_child_length(tn
))) {
568 * Halve as long as the number of empty children in this
569 * node is above threshold.
571 while (tn
->bits
> 1 &&
572 100 * (tnode_child_length(tn
) - tn
->empty_children
) <
573 halve_threshold
* tnode_child_length(tn
))
577 /* Only one child remains */
579 if (tn
->empty_children
== tnode_child_length(tn
) - 1)
580 for (i
= 0; i
< tnode_child_length(tn
); i
++) {
582 write_lock_bh(&fib_lock
);
583 if (tn
->child
[i
] != NULL
) {
584 /* compress one level */
585 struct node
*n
= tn
->child
[i
];
588 NODE_INIT_PARENT(n
, NODE_TYPE(n
));
590 write_unlock_bh(&fib_lock
);
594 write_unlock_bh(&fib_lock
);
597 return (struct node
*) tn
;
600 static struct tnode
*inflate(struct trie
*t
, struct tnode
*tn
)
603 struct tnode
*oldtnode
= tn
;
604 int olen
= tnode_child_length(tn
);
608 printk("In inflate\n");
610 tn
= tnode_new(oldtnode
->key
, oldtnode
->pos
, oldtnode
->bits
+ 1);
613 trie_bug("tnode_new failed");
615 for(i
= 0; i
< olen
; i
++) {
616 struct node
*node
= tnode_get_child(oldtnode
, i
);
622 /* A leaf or an internal node with skipped bits */
624 if(IS_LEAF(node
) || ((struct tnode
*) node
)->pos
>
625 tn
->pos
+ tn
->bits
- 1) {
626 if(tkey_extract_bits(node
->key
, tn
->pos
+ tn
->bits
- 1,
628 put_child(t
, tn
, 2*i
, node
);
630 put_child(t
, tn
, 2*i
+1, node
);
634 /* An internal node with two children */
635 inode
= (struct tnode
*) node
;
637 if (inode
->bits
== 1) {
638 put_child(t
, tn
, 2*i
, inode
->child
[0]);
639 put_child(t
, tn
, 2*i
+1, inode
->child
[1]);
644 /* An internal node with more than two children */
646 struct tnode
*left
, *right
;
649 /* We will replace this node 'inode' with two new
650 * ones, 'left' and 'right', each with half of the
651 * original children. The two new nodes will have
652 * a position one bit further down the key and this
653 * means that the "significant" part of their keys
654 * (see the discussion near the top of this file)
655 * will differ by one bit, which will be "0" in
656 * left's key and "1" in right's key. Since we are
657 * moving the key position by one step, the bit that
658 * we are moving away from - the bit at position
659 * (inode->pos) - is the one that will differ between
660 * left and right. So... we synthesize that bit in the
662 * The mask 'm' below will be a single "one" bit at
663 * the position (inode->pos)
666 t_key m
= TKEY_GET_MASK(inode
->pos
, 1);
668 /* Use the old key, but set the new significant
671 left
= tnode_new(inode
->key
&(~m
), inode
->pos
+ 1,
675 trie_bug("tnode_new failed");
678 /* Use the old key, but set the new significant
681 right
= tnode_new(inode
->key
|m
, inode
->pos
+ 1,
685 trie_bug("tnode_new failed");
687 size
= tnode_child_length(left
);
688 for(j
= 0; j
< size
; j
++) {
689 put_child(t
, left
, j
, inode
->child
[j
]);
690 put_child(t
, right
, j
, inode
->child
[j
+ size
]);
692 put_child(t
, tn
, 2*i
, resize(t
, left
));
693 put_child(t
, tn
, 2*i
+1, resize(t
, right
));
698 tnode_free(oldtnode
);
702 static struct tnode
*halve(struct trie
*t
, struct tnode
*tn
)
704 struct tnode
*oldtnode
= tn
;
705 struct node
*left
, *right
;
707 int olen
= tnode_child_length(tn
);
709 if(trie_debug
) printk("In halve\n");
711 tn
=tnode_new(oldtnode
->key
, oldtnode
->pos
, oldtnode
->bits
- 1);
714 trie_bug("tnode_new failed");
716 for(i
= 0; i
< olen
; i
+= 2) {
717 left
= tnode_get_child(oldtnode
, i
);
718 right
= tnode_get_child(oldtnode
, i
+1);
720 /* At least one of the children is empty */
722 if (right
== NULL
) /* Both are empty */
724 put_child(t
, tn
, i
/2, right
);
725 } else if (right
== NULL
)
726 put_child(t
, tn
, i
/2, left
);
728 /* Two nonempty children */
730 struct tnode
*newBinNode
=
731 tnode_new(left
->key
, tn
->pos
+ tn
->bits
, 1);
734 trie_bug("tnode_new failed");
736 put_child(t
, newBinNode
, 0, left
);
737 put_child(t
, newBinNode
, 1, right
);
738 put_child(t
, tn
, i
/2, resize(t
, newBinNode
));
741 tnode_free(oldtnode
);
745 static void *trie_init(struct trie
*t
)
751 #ifdef CONFIG_IP_FIB_TRIE_STATS
752 memset(&t
->stats
, 0, sizeof(struct trie_use_stats
));
758 static struct leaf_info
*find_leaf_info(struct hlist_head
*head
, int plen
)
760 struct hlist_node
*node
;
761 struct leaf_info
*li
;
763 hlist_for_each_entry(li
, node
, head
, hlist
) {
765 if ( li
->plen
== plen
)
771 static inline struct list_head
* get_fa_head(struct leaf
*l
, int plen
)
773 struct list_head
*fa_head
=NULL
;
774 struct leaf_info
*li
= find_leaf_info(&l
->list
, plen
);
782 static void insert_leaf_info(struct hlist_head
*head
, struct leaf_info
*new)
784 struct leaf_info
*li
=NULL
, *last
=NULL
;
785 struct hlist_node
*node
, *tmp
;
787 write_lock_bh(&fib_lock
);
789 if(hlist_empty(head
))
790 hlist_add_head(&new->hlist
, head
);
792 hlist_for_each_entry_safe(li
, node
, tmp
, head
, hlist
) {
794 if (new->plen
> li
->plen
)
800 hlist_add_after(&last
->hlist
, &new->hlist
);
802 hlist_add_before(&new->hlist
, &li
->hlist
);
804 write_unlock_bh(&fib_lock
);
808 fib_find_node(struct trie
*t
, u32 key
)
817 while (n
!= NULL
&& NODE_TYPE(n
) == T_TNODE
) {
818 tn
= (struct tnode
*) n
;
822 if(tkey_sub_equals(tn
->key
, pos
, tn
->pos
-pos
, key
)) {
823 pos
=tn
->pos
+ tn
->bits
;
824 n
= tnode_get_child(tn
, tkey_extract_bits(key
, tn
->pos
, tn
->bits
));
829 /* Case we have found a leaf. Compare prefixes */
831 if (n
!= NULL
&& IS_LEAF(n
) && tkey_equals(key
, n
->key
)) {
832 struct leaf
*l
= (struct leaf
*) n
;
838 static struct node
*trie_rebalance(struct trie
*t
, struct tnode
*tn
)
843 struct tnode
*tp
= NULL
;
851 while (tn
!= NULL
&& NODE_PARENT(tn
) != NULL
) {
854 printk("Rebalance tn=%p \n", tn
);
855 if(tn
) printk("tn->parent=%p \n", NODE_PARENT(tn
));
857 printk("Rebalance tp=%p \n", tp
);
858 if(tp
) printk("tp->parent=%p \n", NODE_PARENT(tp
));
864 tp
= NODE_PARENT(tn
);
865 cindex
= tkey_extract_bits(key
, tp
->pos
, tp
->bits
);
866 wasfull
= tnode_full(tp
, tnode_get_child(tp
, cindex
));
867 tn
= (struct tnode
*) resize (t
, (struct tnode
*)tn
);
868 tnode_put_child_reorg((struct tnode
*)tp
, cindex
,(struct node
*)tn
, wasfull
);
873 tn
= NODE_PARENT(tn
);
875 /* Handle last (top) tnode */
877 tn
= (struct tnode
*) resize(t
, (struct tnode
*)tn
);
879 return (struct node
*) tn
;
882 static struct list_head
*
883 fib_insert_node(struct trie
*t
, u32 key
, int plen
)
886 struct tnode
*tp
= NULL
, *tn
= NULL
;
890 struct list_head
*fa_head
=NULL
;
891 struct leaf_info
*li
;
897 /* If we point to NULL, stop. Either the tree is empty and we should
898 * just put a new leaf in if, or we have reached an empty child slot,
899 * and we should just put our new leaf in that.
900 * If we point to a T_TNODE, check if it matches our key. Note that
901 * a T_TNODE might be skipping any number of bits - its 'pos' need
902 * not be the parent's 'pos'+'bits'!
904 * If it does match the current key, get pos/bits from it, extract
905 * the index from our key, push the T_TNODE and walk the tree.
907 * If it doesn't, we have to replace it with a new T_TNODE.
909 * If we point to a T_LEAF, it might or might not have the same key
910 * as we do. If it does, just change the value, update the T_LEAF's
911 * value, and return it.
912 * If it doesn't, we need to replace it with a T_TNODE.
915 while (n
!= NULL
&& NODE_TYPE(n
) == T_TNODE
) {
916 tn
= (struct tnode
*) n
;
920 if(tkey_sub_equals(tn
->key
, pos
, tn
->pos
-pos
, key
)) {
922 pos
=tn
->pos
+ tn
->bits
;
923 n
= tnode_get_child(tn
, tkey_extract_bits(key
, tn
->pos
, tn
->bits
));
925 if(n
&& NODE_PARENT(n
) != tn
) {
926 printk("BUG tn=%p, n->parent=%p\n", tn
, NODE_PARENT(n
));
935 * n ----> NULL, LEAF or TNODE
937 * tp is n's (parent) ----> NULL or TNODE
940 if(tp
&& IS_LEAF(tp
))
945 /* Case 1: n is a leaf. Compare prefixes */
947 if (n
!= NULL
&& IS_LEAF(n
) && tkey_equals(key
, n
->key
)) {
948 struct leaf
*l
= ( struct leaf
*) n
;
950 li
= leaf_info_new(plen
);
956 insert_leaf_info(&l
->list
, li
);
966 li
= leaf_info_new(plen
);
972 insert_leaf_info(&l
->list
, li
);
974 /* Case 2: n is NULL, and will just insert a new leaf */
975 if (t
->trie
&& n
== NULL
) {
977 NODE_SET_PARENT(l
, tp
);
983 cindex
= tkey_extract_bits(key
, tp
->pos
, tp
->bits
);
984 put_child(t
, (struct tnode
*)tp
, cindex
, (struct node
*)l
);
987 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
990 * Add a new tnode here
991 * first tnode need some special handling
995 pos
=tp
->pos
+tp
->bits
;
999 newpos
= tkey_mismatch(key
, pos
, n
->key
);
1000 tn
= tnode_new(n
->key
, newpos
, 1);
1004 tn
= tnode_new(key
, newpos
, 1); /* First tnode */
1007 trie_bug("tnode_pfx_new failed");
1009 NODE_SET_PARENT(tn
, tp
);
1011 missbit
=tkey_extract_bits(key
, newpos
, 1);
1012 put_child(t
, tn
, missbit
, (struct node
*)l
);
1013 put_child(t
, tn
, 1-missbit
, n
);
1016 cindex
= tkey_extract_bits(key
, tp
->pos
, tp
->bits
);
1017 put_child(t
, (struct tnode
*)tp
, cindex
, (struct node
*)tn
);
1020 t
->trie
= (struct node
*) tn
; /* First tnode */
1024 if(tp
&& tp
->pos
+tp
->bits
> 32) {
1025 printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1026 tp
, tp
->pos
, tp
->bits
, key
, plen
);
1028 /* Rebalance the trie */
1029 t
->trie
= trie_rebalance(t
, tp
);
1035 fn_trie_insert(struct fib_table
*tb
, struct rtmsg
*r
, struct kern_rta
*rta
,
1036 struct nlmsghdr
*nlhdr
, struct netlink_skb_parms
*req
)
1038 struct trie
*t
= (struct trie
*) tb
->tb_data
;
1039 struct fib_alias
*fa
, *new_fa
;
1040 struct list_head
*fa_head
=NULL
;
1041 struct fib_info
*fi
;
1042 int plen
= r
->rtm_dst_len
;
1043 int type
= r
->rtm_type
;
1044 u8 tos
= r
->rtm_tos
;
1054 memcpy(&key
, rta
->rta_dst
, 4);
1059 printk("Insert table=%d %08x/%d\n", tb
->tb_id
, key
, plen
);
1061 mask
= ntohl( inet_make_mask(plen
) );
1068 if ((fi
= fib_create_info(r
, rta
, nlhdr
, &err
)) == NULL
)
1071 l
= fib_find_node(t
, key
);
1075 fa_head
= get_fa_head(l
, plen
);
1076 fa
= fib_find_alias(fa_head
, tos
, fi
->fib_priority
);
1079 /* Now fa, if non-NULL, points to the first fib alias
1080 * with the same keys [prefix,tos,priority], if such key already
1081 * exists or to the node before which we will insert new one.
1083 * If fa is NULL, we will need to allocate a new one and
1084 * insert to the head of f.
1086 * If f is NULL, no fib node matched the destination key
1087 * and we need to allocate a new one of those as well.
1091 fa
->fa_info
->fib_priority
== fi
->fib_priority
) {
1092 struct fib_alias
*fa_orig
;
1095 if (nlhdr
->nlmsg_flags
& NLM_F_EXCL
)
1098 if (nlhdr
->nlmsg_flags
& NLM_F_REPLACE
) {
1099 struct fib_info
*fi_drop
;
1102 write_lock_bh(&fib_lock
);
1104 fi_drop
= fa
->fa_info
;
1107 fa
->fa_scope
= r
->rtm_scope
;
1108 state
= fa
->fa_state
;
1109 fa
->fa_state
&= ~FA_S_ACCESSED
;
1111 write_unlock_bh(&fib_lock
);
1113 fib_release_info(fi_drop
);
1114 if (state
& FA_S_ACCESSED
)
1119 /* Error if we find a perfect match which
1120 * uses the same scope, type, and nexthop
1124 list_for_each_entry(fa
, fa_orig
->fa_list
.prev
, fa_list
) {
1125 if (fa
->fa_tos
!= tos
)
1127 if (fa
->fa_info
->fib_priority
!= fi
->fib_priority
)
1129 if (fa
->fa_type
== type
&&
1130 fa
->fa_scope
== r
->rtm_scope
&&
1131 fa
->fa_info
== fi
) {
1135 if (!(nlhdr
->nlmsg_flags
& NLM_F_APPEND
))
1139 if (!(nlhdr
->nlmsg_flags
&NLM_F_CREATE
))
1143 new_fa
= kmem_cache_alloc(fn_alias_kmem
, SLAB_KERNEL
);
1147 new_fa
->fa_info
= fi
;
1148 new_fa
->fa_tos
= tos
;
1149 new_fa
->fa_type
= type
;
1150 new_fa
->fa_scope
= r
->rtm_scope
;
1151 new_fa
->fa_state
= 0;
1156 * Insert new entry to the list.
1160 fa_head
= fib_insert_node(t
, key
, plen
);
1162 write_lock_bh(&fib_lock
);
1164 list_add_tail(&new_fa
->fa_list
,
1165 (fa
? &fa
->fa_list
: fa_head
));
1167 write_unlock_bh(&fib_lock
);
1170 rtmsg_fib(RTM_NEWROUTE
, htonl(key
), new_fa
, plen
, tb
->tb_id
, nlhdr
, req
);
1174 fib_release_info(fi
);
1179 static inline int check_leaf(struct trie
*t
, struct leaf
*l
, t_key key
, int *plen
, const struct flowi
*flp
,
1180 struct fib_result
*res
, int *err
)
1184 struct leaf_info
*li
;
1185 struct hlist_head
*hhead
= &l
->list
;
1186 struct hlist_node
*node
;
1188 hlist_for_each_entry(li
, node
, hhead
, hlist
) {
1191 mask
= ntohl(inet_make_mask(i
));
1192 if (l
->key
!= (key
& mask
))
1195 if (((*err
) = fib_semantic_match(&li
->falh
, flp
, res
, l
->key
, mask
, i
)) == 0) {
1197 #ifdef CONFIG_IP_FIB_TRIE_STATS
1198 t
->stats
.semantic_match_passed
++;
1202 #ifdef CONFIG_IP_FIB_TRIE_STATS
1203 t
->stats
.semantic_match_miss
++;
1210 fn_trie_lookup(struct fib_table
*tb
, const struct flowi
*flp
, struct fib_result
*res
)
1212 struct trie
*t
= (struct trie
*) tb
->tb_data
;
1217 t_key key
=ntohl(flp
->fl4_dst
);
1220 int current_prefix_length
= KEYLENGTH
;
1223 read_lock(&fib_lock
);
1227 #ifdef CONFIG_IP_FIB_TRIE_STATS
1233 if( check_leaf(t
, (struct leaf
*)n
, key
, &plen
, flp
, res
, &ret
) )
1237 pn
= (struct tnode
*) n
;
1246 cindex
= tkey_extract_bits(MASK_PFX(key
, current_prefix_length
), pos
, bits
);
1248 n
= tnode_get_child(pn
, cindex
);
1251 #ifdef CONFIG_IP_FIB_TRIE_STATS
1252 t
->stats
.null_node_hit
++;
1260 struct tnode
*cn
= (struct tnode
*)n
;
1261 t_key node_prefix
, key_prefix
, pref_mismatch
;
1265 * It's a tnode, and we can do some extra checks here if we
1266 * like, to avoid descending into a dead-end branch.
1267 * This tnode is in the parent's child array at index
1268 * key[p_pos..p_pos+p_bits] but potentially with some bits
1269 * chopped off, so in reality the index may be just a
1270 * subprefix, padded with zero at the end.
1271 * We can also take a look at any skipped bits in this
1272 * tnode - everything up to p_pos is supposed to be ok,
1273 * and the non-chopped bits of the index (se previous
1274 * paragraph) are also guaranteed ok, but the rest is
1275 * considered unknown.
1277 * The skipped bits are key[pos+bits..cn->pos].
1280 /* If current_prefix_length < pos+bits, we are already doing
1281 * actual prefix matching, which means everything from
1282 * pos+(bits-chopped_off) onward must be zero along some
1283 * branch of this subtree - otherwise there is *no* valid
1284 * prefix present. Here we can only check the skipped
1285 * bits. Remember, since we have already indexed into the
1286 * parent's child array, we know that the bits we chopped of
1290 /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
1292 if (current_prefix_length
< pos
+bits
) {
1293 if (tkey_extract_bits(cn
->key
, current_prefix_length
,
1294 cn
->pos
- current_prefix_length
) != 0 ||
1300 * If chopped_off=0, the index is fully validated and we
1301 * only need to look at the skipped bits for this, the new,
1302 * tnode. What we actually want to do is to find out if
1303 * these skipped bits match our key perfectly, or if we will
1304 * have to count on finding a matching prefix further down,
1305 * because if we do, we would like to have some way of
1306 * verifying the existence of such a prefix at this point.
1309 /* The only thing we can do at this point is to verify that
1310 * any such matching prefix can indeed be a prefix to our
1311 * key, and if the bits in the node we are inspecting that
1312 * do not match our key are not ZERO, this cannot be true.
1313 * Thus, find out where there is a mismatch (before cn->pos)
1314 * and verify that all the mismatching bits are zero in the
1318 /* Note: We aren't very concerned about the piece of the key
1319 * that precede pn->pos+pn->bits, since these have already been
1320 * checked. The bits after cn->pos aren't checked since these are
1321 * by definition "unknown" at this point. Thus, what we want to
1322 * see is if we are about to enter the "prefix matching" state,
1323 * and in that case verify that the skipped bits that will prevail
1324 * throughout this subtree are zero, as they have to be if we are
1325 * to find a matching prefix.
1328 node_prefix
= MASK_PFX(cn
->key
, cn
->pos
);
1329 key_prefix
= MASK_PFX(key
, cn
->pos
);
1330 pref_mismatch
= key_prefix
^node_prefix
;
1333 /* In short: If skipped bits in this node do not match the search
1334 * key, enter the "prefix matching" state.directly.
1336 if (pref_mismatch
) {
1337 while (!(pref_mismatch
& (1<<(KEYLENGTH
-1)))) {
1339 pref_mismatch
= pref_mismatch
<<1;
1341 key_prefix
= tkey_extract_bits(cn
->key
, mp
, cn
->pos
-mp
);
1343 if (key_prefix
!= 0)
1346 if (current_prefix_length
>= cn
->pos
)
1347 current_prefix_length
=mp
;
1350 pn
= (struct tnode
*)n
; /* Descend */
1355 if( check_leaf(t
, (struct leaf
*)n
, key
, &plen
, flp
, res
, &ret
))
1361 /* As zero don't change the child key (cindex) */
1362 while ((chopped_off
<= pn
->bits
) && !(cindex
& (1<<(chopped_off
-1)))) {
1366 /* Decrease current_... with bits chopped off */
1367 if (current_prefix_length
> pn
->pos
+ pn
->bits
- chopped_off
)
1368 current_prefix_length
= pn
->pos
+ pn
->bits
- chopped_off
;
1371 * Either we do the actual chop off according or if we have
1372 * chopped off all bits in this tnode walk up to our parent.
1375 if(chopped_off
<= pn
->bits
)
1376 cindex
&= ~(1 << (chopped_off
-1));
1378 if( NODE_PARENT(pn
) == NULL
)
1381 /* Get Child's index */
1382 cindex
= tkey_extract_bits(pn
->key
, NODE_PARENT(pn
)->pos
, NODE_PARENT(pn
)->bits
);
1383 pn
= NODE_PARENT(pn
);
1386 #ifdef CONFIG_IP_FIB_TRIE_STATS
1387 t
->stats
.backtrack
++;
1395 read_unlock(&fib_lock
);
1399 static int trie_leaf_remove(struct trie
*t
, t_key key
)
1402 struct tnode
*tp
= NULL
;
1403 struct node
*n
= t
->trie
;
1407 printk("entering trie_leaf_remove(%p)\n", n
);
1409 /* Note that in the case skipped bits, those bits are *not* checked!
1410 * When we finish this, we will have NULL or a T_LEAF, and the
1411 * T_LEAF may or may not match our key.
1414 while (n
!= NULL
&& IS_TNODE(n
)) {
1415 struct tnode
*tn
= (struct tnode
*) n
;
1417 n
= tnode_get_child(tn
,tkey_extract_bits(key
, tn
->pos
, tn
->bits
));
1419 if(n
&& NODE_PARENT(n
) != tn
) {
1420 printk("BUG tn=%p, n->parent=%p\n", tn
, NODE_PARENT(n
));
1424 l
= (struct leaf
*) n
;
1426 if(!n
|| !tkey_equals(l
->key
, key
))
1431 * Remove the leaf and rebalance the tree
1437 tp
= NODE_PARENT(n
);
1438 tnode_free((struct tnode
*) n
);
1441 cindex
= tkey_extract_bits(key
, tp
->pos
, tp
->bits
);
1442 put_child(t
, (struct tnode
*)tp
, cindex
, NULL
);
1443 t
->trie
= trie_rebalance(t
, tp
);
1452 fn_trie_delete(struct fib_table
*tb
, struct rtmsg
*r
, struct kern_rta
*rta
,
1453 struct nlmsghdr
*nlhdr
, struct netlink_skb_parms
*req
)
1455 struct trie
*t
= (struct trie
*) tb
->tb_data
;
1457 int plen
= r
->rtm_dst_len
;
1458 u8 tos
= r
->rtm_tos
;
1459 struct fib_alias
*fa
, *fa_to_delete
;
1460 struct list_head
*fa_head
;
1468 memcpy(&key
, rta
->rta_dst
, 4);
1471 mask
= ntohl( inet_make_mask(plen
) );
1477 l
= fib_find_node(t
, key
);
1482 fa_head
= get_fa_head(l
, plen
);
1483 fa
= fib_find_alias(fa_head
, tos
, 0);
1489 printk("Deleting %08x/%d tos=%d t=%p\n", key
, plen
, tos
, t
);
1491 fa_to_delete
= NULL
;
1492 fa_head
= fa
->fa_list
.prev
;
1493 list_for_each_entry(fa
, fa_head
, fa_list
) {
1494 struct fib_info
*fi
= fa
->fa_info
;
1496 if (fa
->fa_tos
!= tos
)
1499 if ((!r
->rtm_type
||
1500 fa
->fa_type
== r
->rtm_type
) &&
1501 (r
->rtm_scope
== RT_SCOPE_NOWHERE
||
1502 fa
->fa_scope
== r
->rtm_scope
) &&
1503 (!r
->rtm_protocol
||
1504 fi
->fib_protocol
== r
->rtm_protocol
) &&
1505 fib_nh_match(r
, nlhdr
, rta
, fi
) == 0) {
1513 struct leaf_info
*li
;
1516 rtmsg_fib(RTM_DELROUTE
, htonl(key
), fa
, plen
, tb
->tb_id
, nlhdr
, req
);
1518 l
= fib_find_node(t
, key
);
1519 li
= find_leaf_info(&l
->list
, plen
);
1521 write_lock_bh(&fib_lock
);
1523 list_del(&fa
->fa_list
);
1525 if(list_empty(fa_head
)) {
1526 hlist_del(&li
->hlist
);
1529 write_unlock_bh(&fib_lock
);
1534 if(hlist_empty(&l
->list
))
1535 trie_leaf_remove(t
, key
);
1537 if (fa
->fa_state
& FA_S_ACCESSED
)
1546 static int trie_flush_list(struct trie
*t
, struct list_head
*head
)
1548 struct fib_alias
*fa
, *fa_node
;
1551 list_for_each_entry_safe(fa
, fa_node
, head
, fa_list
) {
1552 struct fib_info
*fi
= fa
->fa_info
;
1554 if (fi
&& (fi
->fib_flags
&RTNH_F_DEAD
)) {
1556 write_lock_bh(&fib_lock
);
1557 list_del(&fa
->fa_list
);
1558 write_unlock_bh(&fib_lock
);
1567 static int trie_flush_leaf(struct trie
*t
, struct leaf
*l
)
1570 struct hlist_head
*lih
= &l
->list
;
1571 struct hlist_node
*node
, *tmp
;
1572 struct leaf_info
*li
= NULL
;
1574 hlist_for_each_entry_safe(li
, node
, tmp
, lih
, hlist
) {
1576 found
+= trie_flush_list(t
, &li
->falh
);
1578 if (list_empty(&li
->falh
)) {
1580 write_lock_bh(&fib_lock
);
1581 hlist_del(&li
->hlist
);
1582 write_unlock_bh(&fib_lock
);
1590 static struct leaf
*nextleaf(struct trie
*t
, struct leaf
*thisleaf
)
1592 struct node
*c
= (struct node
*) thisleaf
;
1600 if (IS_LEAF(t
->trie
)) /* trie w. just a leaf */
1601 return (struct leaf
*) t
->trie
;
1603 p
= (struct tnode
*) t
->trie
; /* Start */
1606 p
= (struct tnode
*) NODE_PARENT(c
);
1610 /* Find the next child of the parent */
1612 pos
= 1 + tkey_extract_bits(c
->key
, p
->pos
, p
->bits
);
1616 last
= 1 << p
->bits
;
1617 for(idx
= pos
; idx
< last
; idx
++) {
1618 if( p
->child
[idx
]) {
1620 /* Decend if tnode */
1622 while (IS_TNODE(p
->child
[idx
])) {
1623 p
= (struct tnode
*) p
->child
[idx
];
1626 /* Rightmost non-NULL branch */
1627 if( p
&& IS_TNODE(p
) )
1628 while ( p
->child
[idx
] == NULL
&& idx
< (1 << p
->bits
) ) idx
++;
1630 /* Done with this tnode? */
1631 if( idx
>= (1 << p
->bits
) || p
->child
[idx
] == NULL
)
1634 return (struct leaf
*) p
->child
[idx
];
1638 /* No more children go up one step */
1639 c
= (struct node
*) p
;
1640 p
= (struct tnode
*) NODE_PARENT(p
);
1642 return NULL
; /* Ready. Root of trie */
1645 static int fn_trie_flush(struct fib_table
*tb
)
1647 struct trie
*t
= (struct trie
*) tb
->tb_data
;
1648 struct leaf
*ll
= NULL
, *l
= NULL
;
1653 for (h
=0; (l
= nextleaf(t
, l
)) != NULL
; h
++) {
1654 found
+= trie_flush_leaf(t
, l
);
1656 if (ll
&& hlist_empty(&ll
->list
))
1657 trie_leaf_remove(t
, ll
->key
);
1661 if (ll
&& hlist_empty(&ll
->list
))
1662 trie_leaf_remove(t
, ll
->key
);
1665 printk("trie_flush found=%d\n", found
);
1669 static int trie_last_dflt
=-1;
1672 fn_trie_select_default(struct fib_table
*tb
, const struct flowi
*flp
, struct fib_result
*res
)
1674 struct trie
*t
= (struct trie
*) tb
->tb_data
;
1675 int order
, last_idx
;
1676 struct fib_info
*fi
= NULL
;
1677 struct fib_info
*last_resort
;
1678 struct fib_alias
*fa
= NULL
;
1679 struct list_head
*fa_head
;
1686 read_lock(&fib_lock
);
1688 l
= fib_find_node(t
, 0);
1692 fa_head
= get_fa_head(l
, 0);
1696 if (list_empty(fa_head
))
1699 list_for_each_entry(fa
, fa_head
, fa_list
) {
1700 struct fib_info
*next_fi
= fa
->fa_info
;
1702 if (fa
->fa_scope
!= res
->scope
||
1703 fa
->fa_type
!= RTN_UNICAST
)
1706 if (next_fi
->fib_priority
> res
->fi
->fib_priority
)
1708 if (!next_fi
->fib_nh
[0].nh_gw
||
1709 next_fi
->fib_nh
[0].nh_scope
!= RT_SCOPE_LINK
)
1711 fa
->fa_state
|= FA_S_ACCESSED
;
1714 if (next_fi
!= res
->fi
)
1716 } else if (!fib_detect_death(fi
, order
, &last_resort
,
1717 &last_idx
, &trie_last_dflt
)) {
1719 fib_info_put(res
->fi
);
1721 atomic_inc(&fi
->fib_clntref
);
1722 trie_last_dflt
= order
;
1728 if (order
<= 0 || fi
== NULL
) {
1729 trie_last_dflt
= -1;
1733 if (!fib_detect_death(fi
, order
, &last_resort
, &last_idx
, &trie_last_dflt
)) {
1735 fib_info_put(res
->fi
);
1737 atomic_inc(&fi
->fib_clntref
);
1738 trie_last_dflt
= order
;
1741 if (last_idx
>= 0) {
1743 fib_info_put(res
->fi
);
1744 res
->fi
= last_resort
;
1746 atomic_inc(&last_resort
->fib_clntref
);
1748 trie_last_dflt
= last_idx
;
1750 read_unlock(&fib_lock
);
1753 static int fn_trie_dump_fa(t_key key
, int plen
, struct list_head
*fah
, struct fib_table
*tb
,
1754 struct sk_buff
*skb
, struct netlink_callback
*cb
)
1757 struct fib_alias
*fa
;
1759 u32 xkey
=htonl(key
);
1764 list_for_each_entry(fa
, fah
, fa_list
) {
1769 if (fa
->fa_info
->fib_nh
== NULL
) {
1770 printk("Trie error _fib_nh=NULL in fa[%d] k=%08x plen=%d\n", i
, key
, plen
);
1774 if (fa
->fa_info
== NULL
) {
1775 printk("Trie error fa_info=NULL in fa[%d] k=%08x plen=%d\n", i
, key
, plen
);
1780 if (fib_dump_info(skb
, NETLINK_CB(cb
->skb
).pid
,
1789 fa
->fa_info
, 0) < 0) {
1799 static int fn_trie_dump_plen(struct trie
*t
, int plen
, struct fib_table
*tb
, struct sk_buff
*skb
,
1800 struct netlink_callback
*cb
)
1803 struct list_head
*fa_head
;
1804 struct leaf
*l
= NULL
;
1807 for (h
=0; (l
= nextleaf(t
, l
)) != NULL
; h
++) {
1812 memset(&cb
->args
[3], 0,
1813 sizeof(cb
->args
) - 3*sizeof(cb
->args
[0]));
1815 fa_head
= get_fa_head(l
, plen
);
1820 if(list_empty(fa_head
))
1823 if (fn_trie_dump_fa(l
->key
, plen
, fa_head
, tb
, skb
, cb
)<0) {
1832 static int fn_trie_dump(struct fib_table
*tb
, struct sk_buff
*skb
, struct netlink_callback
*cb
)
1835 struct trie
*t
= (struct trie
*) tb
->tb_data
;
1839 read_lock(&fib_lock
);
1840 for (m
=0; m
<=32; m
++) {
1845 memset(&cb
->args
[2], 0,
1846 sizeof(cb
->args
) - 2*sizeof(cb
->args
[0]));
1848 if (fn_trie_dump_plen(t
, 32-m
, tb
, skb
, cb
)<0) {
1853 read_unlock(&fib_lock
);
1857 read_unlock(&fib_lock
);
1861 /* Fix more generic FIB names for init later */
1863 #ifdef CONFIG_IP_MULTIPLE_TABLES
1864 struct fib_table
* fib_hash_init(int id
)
1866 struct fib_table
* __init
fib_hash_init(int id
)
1869 struct fib_table
*tb
;
1872 if (fn_alias_kmem
== NULL
)
1873 fn_alias_kmem
= kmem_cache_create("ip_fib_alias",
1874 sizeof(struct fib_alias
),
1875 0, SLAB_HWCACHE_ALIGN
,
1878 tb
= kmalloc(sizeof(struct fib_table
) + sizeof(struct trie
),
1884 tb
->tb_lookup
= fn_trie_lookup
;
1885 tb
->tb_insert
= fn_trie_insert
;
1886 tb
->tb_delete
= fn_trie_delete
;
1887 tb
->tb_flush
= fn_trie_flush
;
1888 tb
->tb_select_default
= fn_trie_select_default
;
1889 tb
->tb_dump
= fn_trie_dump
;
1890 memset(tb
->tb_data
, 0, sizeof(struct trie
));
1892 t
= (struct trie
*) tb
->tb_data
;
1896 if (id
== RT_TABLE_LOCAL
)
1898 else if (id
== RT_TABLE_MAIN
)
1901 if (id
== RT_TABLE_LOCAL
)
1902 printk("IPv4 FIB: Using LC-trie version %s\n", VERSION
);
1907 /* Trie dump functions */
1909 static void putspace_seq(struct seq_file
*seq
, int n
)
1911 while (n
--) seq_printf(seq
, " ");
1914 static void printbin_seq(struct seq_file
*seq
, unsigned int v
, int bits
)
1917 seq_printf(seq
, "%s", (v
& (1<<bits
))?"1":"0");
1920 static void printnode_seq(struct seq_file
*seq
, int indent
, struct node
*n
,
1921 int pend
, int cindex
, int bits
)
1923 putspace_seq(seq
, indent
);
1925 seq_printf(seq
, "|");
1927 seq_printf(seq
, "+");
1929 seq_printf(seq
, "%d/", cindex
);
1930 printbin_seq(seq
, cindex
, bits
);
1931 seq_printf(seq
, ": ");
1934 seq_printf(seq
, "<root>: ");
1935 seq_printf(seq
, "%s:%p ", IS_LEAF(n
)?"Leaf":"Internal node", n
);
1938 seq_printf(seq
, "key=%d.%d.%d.%d\n",
1939 n
->key
>> 24, (n
->key
>> 16) % 256, (n
->key
>> 8) % 256, n
->key
% 256);
1941 int plen
=((struct tnode
*)n
)->pos
;
1942 t_key prf
=MASK_PFX(n
->key
, plen
);
1943 seq_printf(seq
, "key=%d.%d.%d.%d/%d\n",
1944 prf
>> 24, (prf
>> 16) % 256, (prf
>> 8) % 256, prf
% 256, plen
);
1947 struct leaf
*l
=(struct leaf
*)n
;
1948 struct fib_alias
*fa
;
1950 for (i
=32; i
>=0; i
--)
1951 if(find_leaf_info(&l
->list
, i
)) {
1953 struct list_head
*fa_head
= get_fa_head(l
, i
);
1958 if(list_empty(fa_head
))
1961 putspace_seq(seq
, indent
+2);
1962 seq_printf(seq
, "{/%d...dumping}\n", i
);
1965 list_for_each_entry(fa
, fa_head
, fa_list
) {
1966 putspace_seq(seq
, indent
+2);
1967 if (fa
->fa_info
->fib_nh
== NULL
) {
1968 seq_printf(seq
, "Error _fib_nh=NULL\n");
1971 if (fa
->fa_info
== NULL
) {
1972 seq_printf(seq
, "Error fa_info=NULL\n");
1976 seq_printf(seq
, "{type=%d scope=%d TOS=%d}\n",
1983 else if (IS_TNODE(n
)) {
1984 struct tnode
*tn
=(struct tnode
*)n
;
1985 putspace_seq(seq
, indent
); seq_printf(seq
, "| ");
1986 seq_printf(seq
, "{key prefix=%08x/", tn
->key
&TKEY_GET_MASK(0, tn
->pos
));
1987 printbin_seq(seq
, tkey_extract_bits(tn
->key
, 0, tn
->pos
), tn
->pos
);
1988 seq_printf(seq
, "}\n");
1989 putspace_seq(seq
, indent
); seq_printf(seq
, "| ");
1990 seq_printf(seq
, "{pos=%d", tn
->pos
);
1991 seq_printf(seq
, " (skip=%d bits)", tn
->pos
- pend
);
1992 seq_printf(seq
, " bits=%d (%u children)}\n", tn
->bits
, (1 << tn
->bits
));
1993 putspace_seq(seq
, indent
); seq_printf(seq
, "| ");
1994 seq_printf(seq
, "{empty=%d full=%d}\n", tn
->empty_children
, tn
->full_children
);
1998 static void trie_dump_seq(struct seq_file
*seq
, struct trie
*t
)
2000 struct node
*n
=t
->trie
;
2006 read_lock(&fib_lock
);
2008 seq_printf(seq
, "------ trie_dump of t=%p ------\n", t
);
2010 printnode_seq(seq
, indent
, n
, pend
, cindex
, 0);
2012 struct tnode
*tn
=(struct tnode
*)n
;
2013 pend
= tn
->pos
+tn
->bits
;
2014 putspace_seq(seq
, indent
); seq_printf(seq
, "\\--\n");
2018 while (tn
&& cindex
< (1 << tn
->bits
)) {
2019 if (tn
->child
[cindex
]) {
2023 printnode_seq(seq
, indent
, tn
->child
[cindex
], pend
, cindex
, tn
->bits
);
2024 if (IS_LEAF(tn
->child
[cindex
])) {
2030 * New tnode. Decend one level
2034 n
=tn
->child
[cindex
];
2035 tn
=(struct tnode
*)n
;
2036 pend
=tn
->pos
+tn
->bits
;
2037 putspace_seq(seq
, indent
); seq_printf(seq
, "\\--\n");
2046 * Test if we are done
2049 while (cindex
>= (1 << tn
->bits
)) {
2052 * Move upwards and test for root
2053 * pop off all traversed nodes
2056 if (NODE_PARENT(tn
) == NULL
) {
2062 cindex
= tkey_extract_bits(tn
->key
, NODE_PARENT(tn
)->pos
, NODE_PARENT(tn
)->bits
);
2063 tn
= NODE_PARENT(tn
);
2065 n
=(struct node
*)tn
;
2066 pend
=tn
->pos
+tn
->bits
;
2075 else seq_printf(seq
, "------ trie is empty\n");
2077 read_unlock(&fib_lock
);
2080 static struct trie_stat
*trie_stat_new(void)
2082 struct trie_stat
*s
= kmalloc(sizeof(struct trie_stat
), GFP_KERNEL
);
2090 s
->nullpointers
= 0;
2092 for(i
=0; i
< MAX_CHILDS
; i
++)
2093 s
->nodesizes
[i
] = 0;
2098 static struct trie_stat
*trie_collect_stats(struct trie
*t
)
2100 struct node
*n
=t
->trie
;
2101 struct trie_stat
*s
= trie_stat_new();
2107 read_lock(&fib_lock
);
2112 struct tnode
*tn
= (struct tnode
*)n
;
2113 pend
=tn
->pos
+tn
->bits
;
2115 s
->nodesizes
[tn
->bits
]++;
2118 while (tn
&& cindex
< (1 << tn
->bits
)) {
2119 if (tn
->child
[cindex
]) {
2122 if (IS_LEAF(tn
->child
[cindex
])) {
2126 if (depth
> s
->maxdepth
)
2127 s
->maxdepth
= depth
;
2128 s
->totdepth
+= depth
;
2134 * New tnode. Decend one level
2138 s
->nodesizes
[tn
->bits
]++;
2141 n
= tn
->child
[cindex
];
2142 tn
= (struct tnode
*)n
;
2143 pend
= tn
->pos
+tn
->bits
;
2155 * Test if we are done
2158 while (cindex
>= (1 << tn
->bits
)) {
2161 * Move upwards and test for root
2162 * pop off all traversed nodes
2166 if (NODE_PARENT(tn
) == NULL
) {
2172 cindex
= tkey_extract_bits(tn
->key
, NODE_PARENT(tn
)->pos
, NODE_PARENT(tn
)->bits
);
2173 tn
= NODE_PARENT(tn
);
2175 n
= (struct node
*)tn
;
2176 pend
=tn
->pos
+tn
->bits
;
2187 read_unlock(&fib_lock
);
2191 #ifdef CONFIG_PROC_FS
2193 static struct fib_alias
*fib_triestat_get_first(struct seq_file
*seq
)
2198 static struct fib_alias
*fib_triestat_get_next(struct seq_file
*seq
)
2203 static void *fib_triestat_seq_start(struct seq_file
*seq
, loff_t
*pos
)
2207 if (ip_fib_main_table
)
2208 v
= *pos
? fib_triestat_get_next(seq
) : SEQ_START_TOKEN
;
2212 static void *fib_triestat_seq_next(struct seq_file
*seq
, void *v
, loff_t
*pos
)
2215 return v
== SEQ_START_TOKEN
? fib_triestat_get_first(seq
) : fib_triestat_get_next(seq
);
2218 static void fib_triestat_seq_stop(struct seq_file
*seq
, void *v
)
2224 * This outputs /proc/net/fib_triestats
2226 * It always works in backward compatibility mode.
2227 * The format of the file is not supposed to be changed.
2230 static void collect_and_show(struct trie
*t
, struct seq_file
*seq
)
2232 int bytes
= 0; /* How many bytes are used, a ref is 4 bytes */
2233 int i
, max
, pointers
;
2234 struct trie_stat
*stat
;
2237 stat
= trie_collect_stats(t
);
2240 seq_printf(seq
, "trie=%p\n", t
);
2244 avdepth
=stat
->totdepth
*100 / stat
->leaves
;
2247 seq_printf(seq
, "Aver depth: %d.%02d\n", avdepth
/ 100, avdepth
% 100 );
2248 seq_printf(seq
, "Max depth: %4d\n", stat
->maxdepth
);
2250 seq_printf(seq
, "Leaves: %d\n", stat
->leaves
);
2251 bytes
+= sizeof(struct leaf
) * stat
->leaves
;
2252 seq_printf(seq
, "Internal nodes: %d\n", stat
->tnodes
);
2253 bytes
+= sizeof(struct tnode
) * stat
->tnodes
;
2257 while (max
>= 0 && stat
->nodesizes
[max
] == 0)
2261 for (i
= 1; i
<= max
; i
++)
2262 if (stat
->nodesizes
[i
] != 0) {
2263 seq_printf(seq
, " %d: %d", i
, stat
->nodesizes
[i
]);
2264 pointers
+= (1<<i
) * stat
->nodesizes
[i
];
2266 seq_printf(seq
, "\n");
2267 seq_printf(seq
, "Pointers: %d\n", pointers
);
2268 bytes
+= sizeof(struct node
*) * pointers
;
2269 seq_printf(seq
, "Null ptrs: %d\n", stat
->nullpointers
);
2270 seq_printf(seq
, "Total size: %d kB\n", bytes
/ 1024);
2275 #ifdef CONFIG_IP_FIB_TRIE_STATS
2276 seq_printf(seq
, "Counters:\n---------\n");
2277 seq_printf(seq
,"gets = %d\n", t
->stats
.gets
);
2278 seq_printf(seq
,"backtracks = %d\n", t
->stats
.backtrack
);
2279 seq_printf(seq
,"semantic match passed = %d\n", t
->stats
.semantic_match_passed
);
2280 seq_printf(seq
,"semantic match miss = %d\n", t
->stats
.semantic_match_miss
);
2281 seq_printf(seq
,"null node hit= %d\n", t
->stats
.null_node_hit
);
2283 memset(&(t
->stats
), 0, sizeof(t
->stats
));
2285 #endif /* CONFIG_IP_FIB_TRIE_STATS */
2288 static int fib_triestat_seq_show(struct seq_file
*seq
, void *v
)
2292 if (v
== SEQ_START_TOKEN
) {
2293 seq_printf(seq
, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
2294 sizeof(struct leaf
), sizeof(struct tnode
));
2296 collect_and_show(trie_local
, seq
);
2299 collect_and_show(trie_main
, seq
);
2302 snprintf(bf
, sizeof(bf
),
2303 "*\t%08X\t%08X", 200, 400);
2305 seq_printf(seq
, "%-127s\n", bf
);
2310 static struct seq_operations fib_triestat_seq_ops
= {
2311 .start
= fib_triestat_seq_start
,
2312 .next
= fib_triestat_seq_next
,
2313 .stop
= fib_triestat_seq_stop
,
2314 .show
= fib_triestat_seq_show
,
2317 static int fib_triestat_seq_open(struct inode
*inode
, struct file
*file
)
2319 struct seq_file
*seq
;
2322 rc
= seq_open(file
, &fib_triestat_seq_ops
);
2326 seq
= file
->private_data
;
2333 static struct file_operations fib_triestat_seq_fops
= {
2334 .owner
= THIS_MODULE
,
2335 .open
= fib_triestat_seq_open
,
2337 .llseek
= seq_lseek
,
2338 .release
= seq_release_private
,
2341 int __init
fib_stat_proc_init(void)
2343 if (!proc_net_fops_create("fib_triestat", S_IRUGO
, &fib_triestat_seq_fops
))
2348 void __init
fib_stat_proc_exit(void)
2350 proc_net_remove("fib_triestat");
2353 static struct fib_alias
*fib_trie_get_first(struct seq_file
*seq
)
2358 static struct fib_alias
*fib_trie_get_next(struct seq_file
*seq
)
2363 static void *fib_trie_seq_start(struct seq_file
*seq
, loff_t
*pos
)
2367 if (ip_fib_main_table
)
2368 v
= *pos
? fib_trie_get_next(seq
) : SEQ_START_TOKEN
;
2372 static void *fib_trie_seq_next(struct seq_file
*seq
, void *v
, loff_t
*pos
)
2375 return v
== SEQ_START_TOKEN
? fib_trie_get_first(seq
) : fib_trie_get_next(seq
);
2378 static void fib_trie_seq_stop(struct seq_file
*seq
, void *v
)
2384 * This outputs /proc/net/fib_trie.
2386 * It always works in backward compatibility mode.
2387 * The format of the file is not supposed to be changed.
2390 static int fib_trie_seq_show(struct seq_file
*seq
, void *v
)
2394 if (v
== SEQ_START_TOKEN
) {
2396 trie_dump_seq(seq
, trie_local
);
2399 trie_dump_seq(seq
, trie_main
);
2403 snprintf(bf
, sizeof(bf
),
2404 "*\t%08X\t%08X", 200, 400);
2405 seq_printf(seq
, "%-127s\n", bf
);
2411 static struct seq_operations fib_trie_seq_ops
= {
2412 .start
= fib_trie_seq_start
,
2413 .next
= fib_trie_seq_next
,
2414 .stop
= fib_trie_seq_stop
,
2415 .show
= fib_trie_seq_show
,
2418 static int fib_trie_seq_open(struct inode
*inode
, struct file
*file
)
2420 struct seq_file
*seq
;
2423 rc
= seq_open(file
, &fib_trie_seq_ops
);
2427 seq
= file
->private_data
;
2434 static struct file_operations fib_trie_seq_fops
= {
2435 .owner
= THIS_MODULE
,
2436 .open
= fib_trie_seq_open
,
2438 .llseek
= seq_lseek
,
2439 .release
= seq_release_private
,
2442 int __init
fib_proc_init(void)
2444 if (!proc_net_fops_create("fib_trie", S_IRUGO
, &fib_trie_seq_fops
))
2449 void __init
fib_proc_exit(void)
2451 proc_net_remove("fib_trie");
2454 #endif /* CONFIG_PROC_FS */