2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
11 * Agricultural Sciences.
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
15 * This work is based on the LPC-trie which is originally descibed in:
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
25 * Version: $Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
28 * Code from fib_hash has been reused which includes the following header:
31 * INET An implementation of the TCP/IP protocol suite for the LINUX
32 * operating system. INET is implemented using the BSD Socket
33 * interface as the means of communication with the user level.
35 * IPv4 FIB: lookup engine and maintenance routines.
38 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
40 * This program is free software; you can redistribute it and/or
41 * modify it under the terms of the GNU General Public License
42 * as published by the Free Software Foundation; either version
43 * 2 of the License, or (at your option) any later version.
46 #define VERSION "0.325"
48 #include <linux/config.h>
49 #include <asm/uaccess.h>
50 #include <asm/system.h>
51 #include <asm/bitops.h>
52 #include <linux/types.h>
53 #include <linux/kernel.h>
54 #include <linux/sched.h>
56 #include <linux/string.h>
57 #include <linux/socket.h>
58 #include <linux/sockios.h>
59 #include <linux/errno.h>
61 #include <linux/inet.h>
62 #include <linux/netdevice.h>
63 #include <linux/if_arp.h>
64 #include <linux/proc_fs.h>
65 #include <linux/skbuff.h>
66 #include <linux/netlink.h>
67 #include <linux/init.h>
68 #include <linux/list.h>
70 #include <net/protocol.h>
71 #include <net/route.h>
74 #include <net/ip_fib.h>
75 #include "fib_lookup.h"
77 #undef CONFIG_IP_FIB_TRIE_STATS
78 #define MAX_CHILDS 16384
80 #define EXTRACT(p, n, str) ((str)<<(p)>>(32-(n)))
81 #define KEYLENGTH (8*sizeof(t_key))
82 #define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l))
83 #define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset))
85 static DEFINE_RWLOCK(fib_lock
);
87 typedef unsigned int t_key
;
91 #define NODE_TYPE_MASK 0x1UL
92 #define NODE_PARENT(_node) \
93 ((struct tnode *)((_node)->_parent & ~NODE_TYPE_MASK))
94 #define NODE_SET_PARENT(_node, _ptr) \
95 ((_node)->_parent = (((unsigned long)(_ptr)) | \
96 ((_node)->_parent & NODE_TYPE_MASK)))
97 #define NODE_INIT_PARENT(_node, _type) \
98 ((_node)->_parent = (_type))
99 #define NODE_TYPE(_node) \
100 ((_node)->_parent & NODE_TYPE_MASK)
102 #define IS_TNODE(n) (!(n->_parent & T_LEAF))
103 #define IS_LEAF(n) (n->_parent & T_LEAF)
107 unsigned long _parent
;
112 unsigned long _parent
;
113 struct hlist_head list
;
117 struct hlist_node hlist
;
119 struct list_head falh
;
124 unsigned long _parent
;
125 unsigned short pos
:5; /* 2log(KEYLENGTH) bits needed */
126 unsigned short bits
:5; /* 2log(KEYLENGTH) bits needed */
127 unsigned short full_children
; /* KEYLENGTH bits needed */
128 unsigned short empty_children
; /* KEYLENGTH bits needed */
129 struct node
*child
[0];
132 #ifdef CONFIG_IP_FIB_TRIE_STATS
133 struct trie_use_stats
{
135 unsigned int backtrack
;
136 unsigned int semantic_match_passed
;
137 unsigned int semantic_match_miss
;
138 unsigned int null_node_hit
;
139 unsigned int resize_node_skipped
;
144 unsigned int totdepth
;
145 unsigned int maxdepth
;
148 unsigned int nullpointers
;
149 unsigned int nodesizes
[MAX_CHILDS
];
154 #ifdef CONFIG_IP_FIB_TRIE_STATS
155 struct trie_use_stats stats
;
158 unsigned int revision
;
161 static int trie_debug
= 0;
163 static int tnode_full(struct tnode
*tn
, struct node
*n
);
164 static void put_child(struct trie
*t
, struct tnode
*tn
, int i
, struct node
*n
);
165 static void tnode_put_child_reorg(struct tnode
*tn
, int i
, struct node
*n
, int wasfull
);
166 static int tnode_child_length(struct tnode
*tn
);
167 static struct node
*resize(struct trie
*t
, struct tnode
*tn
);
168 static struct tnode
*inflate(struct trie
*t
, struct tnode
*tn
, int *err
);
169 static struct tnode
*halve(struct trie
*t
, struct tnode
*tn
, int *err
);
170 static void tnode_free(struct tnode
*tn
);
171 static void trie_dump_seq(struct seq_file
*seq
, struct trie
*t
);
172 extern struct fib_alias
*fib_find_alias(struct list_head
*fah
, u8 tos
, u32 prio
);
173 extern int fib_detect_death(struct fib_info
*fi
, int order
,
174 struct fib_info
**last_resort
, int *last_idx
, int *dflt
);
176 extern void rtmsg_fib(int event
, u32 key
, struct fib_alias
*fa
, int z
, int tb_id
,
177 struct nlmsghdr
*n
, struct netlink_skb_parms
*req
);
179 static kmem_cache_t
*fn_alias_kmem
;
180 static struct trie
*trie_local
= NULL
, *trie_main
= NULL
;
182 static void trie_bug(char *err
)
184 printk("Trie Bug: %s\n", err
);
188 static inline struct node
*tnode_get_child(struct tnode
*tn
, int i
)
190 if (i
>= 1<<tn
->bits
)
191 trie_bug("tnode_get_child");
196 static inline int tnode_child_length(struct tnode
*tn
)
202 _________________________________________________________________
203 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
204 ----------------------------------------------------------------
205 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
207 _________________________________________________________________
208 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
209 -----------------------------------------------------------------
210 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
219 static inline t_key
tkey_extract_bits(t_key a
, int offset
, int bits
)
221 if (offset
< KEYLENGTH
)
222 return ((t_key
)(a
<< offset
)) >> (KEYLENGTH
- bits
);
227 static inline int tkey_equals(t_key a
, t_key b
)
232 static inline int tkey_sub_equals(t_key a
, int offset
, int bits
, t_key b
)
234 if (bits
== 0 || offset
>= KEYLENGTH
)
236 bits
= bits
> KEYLENGTH
? KEYLENGTH
: bits
;
237 return ((a
^ b
) << offset
) >> (KEYLENGTH
- bits
) == 0;
240 static inline int tkey_mismatch(t_key a
, int offset
, t_key b
)
247 while((diff
<< i
) >> (KEYLENGTH
-1) == 0)
252 /* Candiate for fib_semantics */
254 static void fn_free_alias(struct fib_alias
*fa
)
256 fib_release_info(fa
->fa_info
);
257 kmem_cache_free(fn_alias_kmem
, fa
);
261 To understand this stuff, an understanding of keys and all their bits is
262 necessary. Every node in the trie has a key associated with it, but not
263 all of the bits in that key are significant.
265 Consider a node 'n' and its parent 'tp'.
267 If n is a leaf, every bit in its key is significant. Its presence is
268 necessitaded by path compression, since during a tree traversal (when
269 searching for a leaf - unless we are doing an insertion) we will completely
270 ignore all skipped bits we encounter. Thus we need to verify, at the end of
271 a potentially successful search, that we have indeed been walking the
274 Note that we can never "miss" the correct key in the tree if present by
275 following the wrong path. Path compression ensures that segments of the key
276 that are the same for all keys with a given prefix are skipped, but the
277 skipped part *is* identical for each node in the subtrie below the skipped
278 bit! trie_insert() in this implementation takes care of that - note the
279 call to tkey_sub_equals() in trie_insert().
281 if n is an internal node - a 'tnode' here, the various parts of its key
282 have many different meanings.
285 _________________________________________________________________
286 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
287 -----------------------------------------------------------------
288 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
290 _________________________________________________________________
291 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
292 -----------------------------------------------------------------
293 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
300 First, let's just ignore the bits that come before the parent tp, that is
301 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
302 not use them for anything.
304 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
305 index into the parent's child array. That is, they will be used to find
306 'n' among tp's children.
308 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
311 All the bits we have seen so far are significant to the node n. The rest
312 of the bits are really not needed or indeed known in n->key.
314 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
315 n's child array, and will of course be different for each child.
317 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
322 static void check_tnode(struct tnode
*tn
)
324 if(tn
&& tn
->pos
+tn
->bits
> 32) {
325 printk("TNODE ERROR tn=%p, pos=%d, bits=%d\n", tn
, tn
->pos
, tn
->bits
);
329 static int halve_threshold
= 25;
330 static int inflate_threshold
= 50;
332 static struct leaf
*leaf_new(void)
334 struct leaf
*l
= kmalloc(sizeof(struct leaf
), GFP_KERNEL
);
336 NODE_INIT_PARENT(l
, T_LEAF
);
337 INIT_HLIST_HEAD(&l
->list
);
342 static struct leaf_info
*leaf_info_new(int plen
)
344 struct leaf_info
*li
= kmalloc(sizeof(struct leaf_info
), GFP_KERNEL
);
347 INIT_LIST_HEAD(&li
->falh
);
352 static inline void free_leaf(struct leaf
*l
)
357 static inline void free_leaf_info(struct leaf_info
*li
)
362 static struct tnode
*tnode_alloc(unsigned int size
)
364 if (size
<= PAGE_SIZE
) {
365 return kmalloc(size
, GFP_KERNEL
);
367 return (struct tnode
*)
368 __get_free_pages(GFP_KERNEL
, get_order(size
));
372 static void __tnode_free(struct tnode
*tn
)
374 unsigned int size
= sizeof(struct tnode
) +
375 (1<<tn
->bits
) * sizeof(struct node
*);
377 if (size
<= PAGE_SIZE
)
380 free_pages((unsigned long)tn
, get_order(size
));
383 static struct tnode
* tnode_new(t_key key
, int pos
, int bits
)
385 int nchildren
= 1<<bits
;
386 int sz
= sizeof(struct tnode
) + nchildren
* sizeof(struct node
*);
387 struct tnode
*tn
= tnode_alloc(sz
);
391 NODE_INIT_PARENT(tn
, T_TNODE
);
395 tn
->full_children
= 0;
396 tn
->empty_children
= 1<<bits
;
399 printk("AT %p s=%u %u\n", tn
, (unsigned int) sizeof(struct tnode
),
400 (unsigned int) (sizeof(struct node
) * 1<<bits
));
404 static void tnode_free(struct tnode
*tn
)
407 trie_bug("tnode_free\n");
410 free_leaf((struct leaf
*)tn
);
412 printk("FL %p \n", tn
);
414 else if(IS_TNODE(tn
)) {
417 printk("FT %p \n", tn
);
420 trie_bug("tnode_free\n");
425 * Check whether a tnode 'n' is "full", i.e. it is an internal node
426 * and no bits are skipped. See discussion in dyntree paper p. 6
429 static inline int tnode_full(struct tnode
*tn
, struct node
*n
)
431 if(n
== NULL
|| IS_LEAF(n
))
434 return ((struct tnode
*) n
)->pos
== tn
->pos
+ tn
->bits
;
437 static inline void put_child(struct trie
*t
, struct tnode
*tn
, int i
, struct node
*n
)
439 tnode_put_child_reorg(tn
, i
, n
, -1);
443 * Add a child at position i overwriting the old value.
444 * Update the value of full_children and empty_children.
447 static void tnode_put_child_reorg(struct tnode
*tn
, int i
, struct node
*n
, int wasfull
)
452 if(i
>= 1<<tn
->bits
) {
453 printk("bits=%d, i=%d\n", tn
->bits
, i
);
454 trie_bug("tnode_put_child_reorg bits");
456 write_lock_bh(&fib_lock
);
459 /* update emptyChildren */
460 if (n
== NULL
&& chi
!= NULL
)
461 tn
->empty_children
++;
462 else if (n
!= NULL
&& chi
== NULL
)
463 tn
->empty_children
--;
465 /* update fullChildren */
467 wasfull
= tnode_full(tn
, chi
);
469 isfull
= tnode_full(tn
, n
);
470 if (wasfull
&& !isfull
)
473 else if (!wasfull
&& isfull
)
476 NODE_SET_PARENT(n
, tn
);
479 write_unlock_bh(&fib_lock
);
482 static struct node
*resize(struct trie
*t
, struct tnode
*tn
)
491 printk("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
492 tn
, inflate_threshold
, halve_threshold
);
495 if (tn
->empty_children
== tnode_child_length(tn
)) {
500 if (tn
->empty_children
== tnode_child_length(tn
) - 1)
501 for (i
= 0; i
< tnode_child_length(tn
); i
++) {
503 write_lock_bh(&fib_lock
);
504 if (tn
->child
[i
] != NULL
) {
506 /* compress one level */
507 struct node
*n
= tn
->child
[i
];
509 NODE_INIT_PARENT(n
, NODE_TYPE(n
));
511 write_unlock_bh(&fib_lock
);
515 write_unlock_bh(&fib_lock
);
518 * Double as long as the resulting node has a number of
519 * nonempty nodes that are above the threshold.
523 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
524 * the Helsinki University of Technology and Matti Tikkanen of Nokia
525 * Telecommunications, page 6:
526 * "A node is doubled if the ratio of non-empty children to all
527 * children in the *doubled* node is at least 'high'."
529 * 'high' in this instance is the variable 'inflate_threshold'. It
530 * is expressed as a percentage, so we multiply it with
531 * tnode_child_length() and instead of multiplying by 2 (since the
532 * child array will be doubled by inflate()) and multiplying
533 * the left-hand side by 100 (to handle the percentage thing) we
534 * multiply the left-hand side by 50.
536 * The left-hand side may look a bit weird: tnode_child_length(tn)
537 * - tn->empty_children is of course the number of non-null children
538 * in the current node. tn->full_children is the number of "full"
539 * children, that is non-null tnodes with a skip value of 0.
540 * All of those will be doubled in the resulting inflated tnode, so
541 * we just count them one extra time here.
543 * A clearer way to write this would be:
545 * to_be_doubled = tn->full_children;
546 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
549 * new_child_length = tnode_child_length(tn) * 2;
551 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
553 * if (new_fill_factor >= inflate_threshold)
555 * ...and so on, tho it would mess up the while() loop.
558 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
562 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
563 * inflate_threshold * new_child_length
565 * expand not_to_be_doubled and to_be_doubled, and shorten:
566 * 100 * (tnode_child_length(tn) - tn->empty_children +
567 * tn->full_children ) >= inflate_threshold * new_child_length
569 * expand new_child_length:
570 * 100 * (tnode_child_length(tn) - tn->empty_children +
571 * tn->full_children ) >=
572 * inflate_threshold * tnode_child_length(tn) * 2
575 * 50 * (tn->full_children + tnode_child_length(tn) -
576 * tn->empty_children ) >= inflate_threshold *
577 * tnode_child_length(tn)
584 while ((tn
->full_children
> 0 &&
585 50 * (tn
->full_children
+ tnode_child_length(tn
) - tn
->empty_children
) >=
586 inflate_threshold
* tnode_child_length(tn
))) {
588 tn
= inflate(t
, tn
, &err
);
591 #ifdef CONFIG_IP_FIB_TRIE_STATS
592 t
->stats
.resize_node_skipped
++;
601 * Halve as long as the number of empty children in this
602 * node is above threshold.
606 while (tn
->bits
> 1 &&
607 100 * (tnode_child_length(tn
) - tn
->empty_children
) <
608 halve_threshold
* tnode_child_length(tn
)) {
610 tn
= halve(t
, tn
, &err
);
613 #ifdef CONFIG_IP_FIB_TRIE_STATS
614 t
->stats
.resize_node_skipped
++;
621 /* Only one child remains */
623 if (tn
->empty_children
== tnode_child_length(tn
) - 1)
624 for (i
= 0; i
< tnode_child_length(tn
); i
++) {
626 write_lock_bh(&fib_lock
);
627 if (tn
->child
[i
] != NULL
) {
628 /* compress one level */
629 struct node
*n
= tn
->child
[i
];
632 NODE_INIT_PARENT(n
, NODE_TYPE(n
));
634 write_unlock_bh(&fib_lock
);
638 write_unlock_bh(&fib_lock
);
641 return (struct node
*) tn
;
644 static struct tnode
*inflate(struct trie
*t
, struct tnode
*tn
, int *err
)
647 struct tnode
*oldtnode
= tn
;
648 int olen
= tnode_child_length(tn
);
652 printk("In inflate\n");
654 tn
= tnode_new(oldtnode
->key
, oldtnode
->pos
, oldtnode
->bits
+ 1);
662 * Preallocate and store tnodes before the actual work so we
663 * don't get into an inconsistent state if memory allocation
664 * fails. In case of failure we return the oldnode and inflate
665 * of tnode is ignored.
668 for(i
= 0; i
< olen
; i
++) {
669 struct tnode
*inode
= (struct tnode
*) tnode_get_child(oldtnode
, i
);
673 inode
->pos
== oldtnode
->pos
+ oldtnode
->bits
&&
675 struct tnode
*left
, *right
;
677 t_key m
= TKEY_GET_MASK(inode
->pos
, 1);
679 left
= tnode_new(inode
->key
&(~m
), inode
->pos
+ 1,
687 right
= tnode_new(inode
->key
|m
, inode
->pos
+ 1,
695 put_child(t
, tn
, 2*i
, (struct node
*) left
);
696 put_child(t
, tn
, 2*i
+1, (struct node
*) right
);
701 int size
= tnode_child_length(tn
);
704 for(j
= 0; j
< size
; j
++)
706 tnode_free((struct tnode
*)tn
->child
[j
]);
714 for(i
= 0; i
< olen
; i
++) {
715 struct node
*node
= tnode_get_child(oldtnode
, i
);
721 /* A leaf or an internal node with skipped bits */
723 if(IS_LEAF(node
) || ((struct tnode
*) node
)->pos
>
724 tn
->pos
+ tn
->bits
- 1) {
725 if(tkey_extract_bits(node
->key
, oldtnode
->pos
+ oldtnode
->bits
,
727 put_child(t
, tn
, 2*i
, node
);
729 put_child(t
, tn
, 2*i
+1, node
);
733 /* An internal node with two children */
734 inode
= (struct tnode
*) node
;
736 if (inode
->bits
== 1) {
737 put_child(t
, tn
, 2*i
, inode
->child
[0]);
738 put_child(t
, tn
, 2*i
+1, inode
->child
[1]);
743 /* An internal node with more than two children */
745 struct tnode
*left
, *right
;
748 /* We will replace this node 'inode' with two new
749 * ones, 'left' and 'right', each with half of the
750 * original children. The two new nodes will have
751 * a position one bit further down the key and this
752 * means that the "significant" part of their keys
753 * (see the discussion near the top of this file)
754 * will differ by one bit, which will be "0" in
755 * left's key and "1" in right's key. Since we are
756 * moving the key position by one step, the bit that
757 * we are moving away from - the bit at position
758 * (inode->pos) - is the one that will differ between
759 * left and right. So... we synthesize that bit in the
761 * The mask 'm' below will be a single "one" bit at
762 * the position (inode->pos)
765 /* Use the old key, but set the new significant
769 left
= (struct tnode
*) tnode_get_child(tn
, 2*i
);
770 put_child(t
, tn
, 2*i
, NULL
);
775 right
= (struct tnode
*) tnode_get_child(tn
, 2*i
+1);
776 put_child(t
, tn
, 2*i
+1, NULL
);
781 size
= tnode_child_length(left
);
782 for(j
= 0; j
< size
; j
++) {
783 put_child(t
, left
, j
, inode
->child
[j
]);
784 put_child(t
, right
, j
, inode
->child
[j
+ size
]);
786 put_child(t
, tn
, 2*i
, resize(t
, left
));
787 put_child(t
, tn
, 2*i
+1, resize(t
, right
));
792 tnode_free(oldtnode
);
796 static struct tnode
*halve(struct trie
*t
, struct tnode
*tn
, int *err
)
798 struct tnode
*oldtnode
= tn
;
799 struct node
*left
, *right
;
801 int olen
= tnode_child_length(tn
);
803 if(trie_debug
) printk("In halve\n");
805 tn
=tnode_new(oldtnode
->key
, oldtnode
->pos
, oldtnode
->bits
- 1);
813 * Preallocate and store tnodes before the actual work so we
814 * don't get into an inconsistent state if memory allocation
815 * fails. In case of failure we return the oldnode and halve
816 * of tnode is ignored.
819 for(i
= 0; i
< olen
; i
+= 2) {
820 left
= tnode_get_child(oldtnode
, i
);
821 right
= tnode_get_child(oldtnode
, i
+1);
823 /* Two nonempty children */
825 struct tnode
*newBinNode
=
826 tnode_new(left
->key
, tn
->pos
+ tn
->bits
, 1);
832 put_child(t
, tn
, i
/2, (struct node
*)newBinNode
);
837 int size
= tnode_child_length(tn
);
840 for(j
= 0; j
< size
; j
++)
842 tnode_free((struct tnode
*)tn
->child
[j
]);
850 for(i
= 0; i
< olen
; i
+= 2) {
851 left
= tnode_get_child(oldtnode
, i
);
852 right
= tnode_get_child(oldtnode
, i
+1);
854 /* At least one of the children is empty */
856 if (right
== NULL
) /* Both are empty */
858 put_child(t
, tn
, i
/2, right
);
859 } else if (right
== NULL
)
860 put_child(t
, tn
, i
/2, left
);
862 /* Two nonempty children */
864 struct tnode
*newBinNode
=
865 (struct tnode
*) tnode_get_child(tn
, i
/2);
866 put_child(t
, tn
, i
/2, NULL
);
871 put_child(t
, newBinNode
, 0, left
);
872 put_child(t
, newBinNode
, 1, right
);
873 put_child(t
, tn
, i
/2, resize(t
, newBinNode
));
876 tnode_free(oldtnode
);
880 static void *trie_init(struct trie
*t
)
886 #ifdef CONFIG_IP_FIB_TRIE_STATS
887 memset(&t
->stats
, 0, sizeof(struct trie_use_stats
));
893 static struct leaf_info
*find_leaf_info(struct hlist_head
*head
, int plen
)
895 struct hlist_node
*node
;
896 struct leaf_info
*li
;
898 hlist_for_each_entry(li
, node
, head
, hlist
) {
900 if ( li
->plen
== plen
)
906 static inline struct list_head
* get_fa_head(struct leaf
*l
, int plen
)
908 struct list_head
*fa_head
=NULL
;
909 struct leaf_info
*li
= find_leaf_info(&l
->list
, plen
);
917 static void insert_leaf_info(struct hlist_head
*head
, struct leaf_info
*new)
919 struct leaf_info
*li
=NULL
, *last
=NULL
;
920 struct hlist_node
*node
, *tmp
;
922 write_lock_bh(&fib_lock
);
924 if(hlist_empty(head
))
925 hlist_add_head(&new->hlist
, head
);
927 hlist_for_each_entry_safe(li
, node
, tmp
, head
, hlist
) {
929 if (new->plen
> li
->plen
)
935 hlist_add_after(&last
->hlist
, &new->hlist
);
937 hlist_add_before(&new->hlist
, &li
->hlist
);
939 write_unlock_bh(&fib_lock
);
943 fib_find_node(struct trie
*t
, u32 key
)
952 while (n
!= NULL
&& NODE_TYPE(n
) == T_TNODE
) {
953 tn
= (struct tnode
*) n
;
957 if(tkey_sub_equals(tn
->key
, pos
, tn
->pos
-pos
, key
)) {
958 pos
=tn
->pos
+ tn
->bits
;
959 n
= tnode_get_child(tn
, tkey_extract_bits(key
, tn
->pos
, tn
->bits
));
964 /* Case we have found a leaf. Compare prefixes */
966 if (n
!= NULL
&& IS_LEAF(n
) && tkey_equals(key
, n
->key
)) {
967 struct leaf
*l
= (struct leaf
*) n
;
973 static struct node
*trie_rebalance(struct trie
*t
, struct tnode
*tn
)
978 struct tnode
*tp
= NULL
;
986 while (tn
!= NULL
&& NODE_PARENT(tn
) != NULL
) {
989 printk("Rebalance tn=%p \n", tn
);
990 if(tn
) printk("tn->parent=%p \n", NODE_PARENT(tn
));
992 printk("Rebalance tp=%p \n", tp
);
993 if(tp
) printk("tp->parent=%p \n", NODE_PARENT(tp
));
999 tp
= NODE_PARENT(tn
);
1000 cindex
= tkey_extract_bits(key
, tp
->pos
, tp
->bits
);
1001 wasfull
= tnode_full(tp
, tnode_get_child(tp
, cindex
));
1002 tn
= (struct tnode
*) resize (t
, (struct tnode
*)tn
);
1003 tnode_put_child_reorg((struct tnode
*)tp
, cindex
,(struct node
*)tn
, wasfull
);
1005 if(!NODE_PARENT(tn
))
1008 tn
= NODE_PARENT(tn
);
1010 /* Handle last (top) tnode */
1012 tn
= (struct tnode
*) resize(t
, (struct tnode
*)tn
);
1014 return (struct node
*) tn
;
1017 static struct list_head
*
1018 fib_insert_node(struct trie
*t
, int *err
, u32 key
, int plen
)
1021 struct tnode
*tp
= NULL
, *tn
= NULL
;
1025 struct list_head
*fa_head
=NULL
;
1026 struct leaf_info
*li
;
1032 /* If we point to NULL, stop. Either the tree is empty and we should
1033 * just put a new leaf in if, or we have reached an empty child slot,
1034 * and we should just put our new leaf in that.
1035 * If we point to a T_TNODE, check if it matches our key. Note that
1036 * a T_TNODE might be skipping any number of bits - its 'pos' need
1037 * not be the parent's 'pos'+'bits'!
1039 * If it does match the current key, get pos/bits from it, extract
1040 * the index from our key, push the T_TNODE and walk the tree.
1042 * If it doesn't, we have to replace it with a new T_TNODE.
1044 * If we point to a T_LEAF, it might or might not have the same key
1045 * as we do. If it does, just change the value, update the T_LEAF's
1046 * value, and return it.
1047 * If it doesn't, we need to replace it with a T_TNODE.
1050 while (n
!= NULL
&& NODE_TYPE(n
) == T_TNODE
) {
1051 tn
= (struct tnode
*) n
;
1055 if(tkey_sub_equals(tn
->key
, pos
, tn
->pos
-pos
, key
)) {
1057 pos
=tn
->pos
+ tn
->bits
;
1058 n
= tnode_get_child(tn
, tkey_extract_bits(key
, tn
->pos
, tn
->bits
));
1060 if(n
&& NODE_PARENT(n
) != tn
) {
1061 printk("BUG tn=%p, n->parent=%p\n", tn
, NODE_PARENT(n
));
1070 * n ----> NULL, LEAF or TNODE
1072 * tp is n's (parent) ----> NULL or TNODE
1075 if(tp
&& IS_LEAF(tp
))
1079 /* Case 1: n is a leaf. Compare prefixes */
1081 if (n
!= NULL
&& IS_LEAF(n
) && tkey_equals(key
, n
->key
)) {
1082 struct leaf
*l
= ( struct leaf
*) n
;
1084 li
= leaf_info_new(plen
);
1091 fa_head
= &li
->falh
;
1092 insert_leaf_info(&l
->list
, li
);
1104 li
= leaf_info_new(plen
);
1107 tnode_free((struct tnode
*) l
);
1112 fa_head
= &li
->falh
;
1113 insert_leaf_info(&l
->list
, li
);
1115 /* Case 2: n is NULL, and will just insert a new leaf */
1116 if (t
->trie
&& n
== NULL
) {
1118 NODE_SET_PARENT(l
, tp
);
1124 cindex
= tkey_extract_bits(key
, tp
->pos
, tp
->bits
);
1125 put_child(t
, (struct tnode
*)tp
, cindex
, (struct node
*)l
);
1128 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1131 * Add a new tnode here
1132 * first tnode need some special handling
1136 pos
=tp
->pos
+tp
->bits
;
1140 newpos
= tkey_mismatch(key
, pos
, n
->key
);
1141 tn
= tnode_new(n
->key
, newpos
, 1);
1145 tn
= tnode_new(key
, newpos
, 1); /* First tnode */
1150 tnode_free((struct tnode
*) l
);
1155 NODE_SET_PARENT(tn
, tp
);
1157 missbit
=tkey_extract_bits(key
, newpos
, 1);
1158 put_child(t
, tn
, missbit
, (struct node
*)l
);
1159 put_child(t
, tn
, 1-missbit
, n
);
1162 cindex
= tkey_extract_bits(key
, tp
->pos
, tp
->bits
);
1163 put_child(t
, (struct tnode
*)tp
, cindex
, (struct node
*)tn
);
1166 t
->trie
= (struct node
*) tn
; /* First tnode */
1170 if(tp
&& tp
->pos
+tp
->bits
> 32) {
1171 printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1172 tp
, tp
->pos
, tp
->bits
, key
, plen
);
1174 /* Rebalance the trie */
1175 t
->trie
= trie_rebalance(t
, tp
);
1183 fn_trie_insert(struct fib_table
*tb
, struct rtmsg
*r
, struct kern_rta
*rta
,
1184 struct nlmsghdr
*nlhdr
, struct netlink_skb_parms
*req
)
1186 struct trie
*t
= (struct trie
*) tb
->tb_data
;
1187 struct fib_alias
*fa
, *new_fa
;
1188 struct list_head
*fa_head
=NULL
;
1189 struct fib_info
*fi
;
1190 int plen
= r
->rtm_dst_len
;
1191 int type
= r
->rtm_type
;
1192 u8 tos
= r
->rtm_tos
;
1202 memcpy(&key
, rta
->rta_dst
, 4);
1207 printk("Insert table=%d %08x/%d\n", tb
->tb_id
, key
, plen
);
1209 mask
= ntohl( inet_make_mask(plen
) );
1216 if ((fi
= fib_create_info(r
, rta
, nlhdr
, &err
)) == NULL
)
1219 l
= fib_find_node(t
, key
);
1223 fa_head
= get_fa_head(l
, plen
);
1224 fa
= fib_find_alias(fa_head
, tos
, fi
->fib_priority
);
1227 /* Now fa, if non-NULL, points to the first fib alias
1228 * with the same keys [prefix,tos,priority], if such key already
1229 * exists or to the node before which we will insert new one.
1231 * If fa is NULL, we will need to allocate a new one and
1232 * insert to the head of f.
1234 * If f is NULL, no fib node matched the destination key
1235 * and we need to allocate a new one of those as well.
1239 fa
->fa_info
->fib_priority
== fi
->fib_priority
) {
1240 struct fib_alias
*fa_orig
;
1243 if (nlhdr
->nlmsg_flags
& NLM_F_EXCL
)
1246 if (nlhdr
->nlmsg_flags
& NLM_F_REPLACE
) {
1247 struct fib_info
*fi_drop
;
1250 write_lock_bh(&fib_lock
);
1252 fi_drop
= fa
->fa_info
;
1255 fa
->fa_scope
= r
->rtm_scope
;
1256 state
= fa
->fa_state
;
1257 fa
->fa_state
&= ~FA_S_ACCESSED
;
1259 write_unlock_bh(&fib_lock
);
1261 fib_release_info(fi_drop
);
1262 if (state
& FA_S_ACCESSED
)
1267 /* Error if we find a perfect match which
1268 * uses the same scope, type, and nexthop
1272 list_for_each_entry(fa
, fa_orig
->fa_list
.prev
, fa_list
) {
1273 if (fa
->fa_tos
!= tos
)
1275 if (fa
->fa_info
->fib_priority
!= fi
->fib_priority
)
1277 if (fa
->fa_type
== type
&&
1278 fa
->fa_scope
== r
->rtm_scope
&&
1279 fa
->fa_info
== fi
) {
1283 if (!(nlhdr
->nlmsg_flags
& NLM_F_APPEND
))
1287 if (!(nlhdr
->nlmsg_flags
&NLM_F_CREATE
))
1291 new_fa
= kmem_cache_alloc(fn_alias_kmem
, SLAB_KERNEL
);
1295 new_fa
->fa_info
= fi
;
1296 new_fa
->fa_tos
= tos
;
1297 new_fa
->fa_type
= type
;
1298 new_fa
->fa_scope
= r
->rtm_scope
;
1299 new_fa
->fa_state
= 0;
1304 * Insert new entry to the list.
1308 fa_head
= fib_insert_node(t
, &err
, key
, plen
);
1311 goto out_free_new_fa
;
1314 write_lock_bh(&fib_lock
);
1316 list_add_tail(&new_fa
->fa_list
,
1317 (fa
? &fa
->fa_list
: fa_head
));
1319 write_unlock_bh(&fib_lock
);
1322 rtmsg_fib(RTM_NEWROUTE
, htonl(key
), new_fa
, plen
, tb
->tb_id
, nlhdr
, req
);
1327 kmem_cache_free(fn_alias_kmem
, new_fa
);
1329 fib_release_info(fi
);
1334 static inline int check_leaf(struct trie
*t
, struct leaf
*l
, t_key key
, int *plen
, const struct flowi
*flp
,
1335 struct fib_result
*res
, int *err
)
1339 struct leaf_info
*li
;
1340 struct hlist_head
*hhead
= &l
->list
;
1341 struct hlist_node
*node
;
1343 hlist_for_each_entry(li
, node
, hhead
, hlist
) {
1346 mask
= ntohl(inet_make_mask(i
));
1347 if (l
->key
!= (key
& mask
))
1350 if (((*err
) = fib_semantic_match(&li
->falh
, flp
, res
, l
->key
, mask
, i
)) == 0) {
1352 #ifdef CONFIG_IP_FIB_TRIE_STATS
1353 t
->stats
.semantic_match_passed
++;
1357 #ifdef CONFIG_IP_FIB_TRIE_STATS
1358 t
->stats
.semantic_match_miss
++;
1365 fn_trie_lookup(struct fib_table
*tb
, const struct flowi
*flp
, struct fib_result
*res
)
1367 struct trie
*t
= (struct trie
*) tb
->tb_data
;
1372 t_key key
=ntohl(flp
->fl4_dst
);
1375 int current_prefix_length
= KEYLENGTH
;
1378 read_lock(&fib_lock
);
1382 #ifdef CONFIG_IP_FIB_TRIE_STATS
1388 if( check_leaf(t
, (struct leaf
*)n
, key
, &plen
, flp
, res
, &ret
) )
1392 pn
= (struct tnode
*) n
;
1401 cindex
= tkey_extract_bits(MASK_PFX(key
, current_prefix_length
), pos
, bits
);
1403 n
= tnode_get_child(pn
, cindex
);
1406 #ifdef CONFIG_IP_FIB_TRIE_STATS
1407 t
->stats
.null_node_hit
++;
1415 struct tnode
*cn
= (struct tnode
*)n
;
1416 t_key node_prefix
, key_prefix
, pref_mismatch
;
1420 * It's a tnode, and we can do some extra checks here if we
1421 * like, to avoid descending into a dead-end branch.
1422 * This tnode is in the parent's child array at index
1423 * key[p_pos..p_pos+p_bits] but potentially with some bits
1424 * chopped off, so in reality the index may be just a
1425 * subprefix, padded with zero at the end.
1426 * We can also take a look at any skipped bits in this
1427 * tnode - everything up to p_pos is supposed to be ok,
1428 * and the non-chopped bits of the index (se previous
1429 * paragraph) are also guaranteed ok, but the rest is
1430 * considered unknown.
1432 * The skipped bits are key[pos+bits..cn->pos].
1435 /* If current_prefix_length < pos+bits, we are already doing
1436 * actual prefix matching, which means everything from
1437 * pos+(bits-chopped_off) onward must be zero along some
1438 * branch of this subtree - otherwise there is *no* valid
1439 * prefix present. Here we can only check the skipped
1440 * bits. Remember, since we have already indexed into the
1441 * parent's child array, we know that the bits we chopped of
1445 /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
1447 if (current_prefix_length
< pos
+bits
) {
1448 if (tkey_extract_bits(cn
->key
, current_prefix_length
,
1449 cn
->pos
- current_prefix_length
) != 0 ||
1455 * If chopped_off=0, the index is fully validated and we
1456 * only need to look at the skipped bits for this, the new,
1457 * tnode. What we actually want to do is to find out if
1458 * these skipped bits match our key perfectly, or if we will
1459 * have to count on finding a matching prefix further down,
1460 * because if we do, we would like to have some way of
1461 * verifying the existence of such a prefix at this point.
1464 /* The only thing we can do at this point is to verify that
1465 * any such matching prefix can indeed be a prefix to our
1466 * key, and if the bits in the node we are inspecting that
1467 * do not match our key are not ZERO, this cannot be true.
1468 * Thus, find out where there is a mismatch (before cn->pos)
1469 * and verify that all the mismatching bits are zero in the
1473 /* Note: We aren't very concerned about the piece of the key
1474 * that precede pn->pos+pn->bits, since these have already been
1475 * checked. The bits after cn->pos aren't checked since these are
1476 * by definition "unknown" at this point. Thus, what we want to
1477 * see is if we are about to enter the "prefix matching" state,
1478 * and in that case verify that the skipped bits that will prevail
1479 * throughout this subtree are zero, as they have to be if we are
1480 * to find a matching prefix.
1483 node_prefix
= MASK_PFX(cn
->key
, cn
->pos
);
1484 key_prefix
= MASK_PFX(key
, cn
->pos
);
1485 pref_mismatch
= key_prefix
^node_prefix
;
1488 /* In short: If skipped bits in this node do not match the search
1489 * key, enter the "prefix matching" state.directly.
1491 if (pref_mismatch
) {
1492 while (!(pref_mismatch
& (1<<(KEYLENGTH
-1)))) {
1494 pref_mismatch
= pref_mismatch
<<1;
1496 key_prefix
= tkey_extract_bits(cn
->key
, mp
, cn
->pos
-mp
);
1498 if (key_prefix
!= 0)
1501 if (current_prefix_length
>= cn
->pos
)
1502 current_prefix_length
=mp
;
1505 pn
= (struct tnode
*)n
; /* Descend */
1510 if( check_leaf(t
, (struct leaf
*)n
, key
, &plen
, flp
, res
, &ret
))
1516 /* As zero don't change the child key (cindex) */
1517 while ((chopped_off
<= pn
->bits
) && !(cindex
& (1<<(chopped_off
-1)))) {
1521 /* Decrease current_... with bits chopped off */
1522 if (current_prefix_length
> pn
->pos
+ pn
->bits
- chopped_off
)
1523 current_prefix_length
= pn
->pos
+ pn
->bits
- chopped_off
;
1526 * Either we do the actual chop off according or if we have
1527 * chopped off all bits in this tnode walk up to our parent.
1530 if(chopped_off
<= pn
->bits
)
1531 cindex
&= ~(1 << (chopped_off
-1));
1533 if( NODE_PARENT(pn
) == NULL
)
1536 /* Get Child's index */
1537 cindex
= tkey_extract_bits(pn
->key
, NODE_PARENT(pn
)->pos
, NODE_PARENT(pn
)->bits
);
1538 pn
= NODE_PARENT(pn
);
1541 #ifdef CONFIG_IP_FIB_TRIE_STATS
1542 t
->stats
.backtrack
++;
1550 read_unlock(&fib_lock
);
1554 static int trie_leaf_remove(struct trie
*t
, t_key key
)
1557 struct tnode
*tp
= NULL
;
1558 struct node
*n
= t
->trie
;
1562 printk("entering trie_leaf_remove(%p)\n", n
);
1564 /* Note that in the case skipped bits, those bits are *not* checked!
1565 * When we finish this, we will have NULL or a T_LEAF, and the
1566 * T_LEAF may or may not match our key.
1569 while (n
!= NULL
&& IS_TNODE(n
)) {
1570 struct tnode
*tn
= (struct tnode
*) n
;
1572 n
= tnode_get_child(tn
,tkey_extract_bits(key
, tn
->pos
, tn
->bits
));
1574 if(n
&& NODE_PARENT(n
) != tn
) {
1575 printk("BUG tn=%p, n->parent=%p\n", tn
, NODE_PARENT(n
));
1579 l
= (struct leaf
*) n
;
1581 if(!n
|| !tkey_equals(l
->key
, key
))
1586 * Remove the leaf and rebalance the tree
1592 tp
= NODE_PARENT(n
);
1593 tnode_free((struct tnode
*) n
);
1596 cindex
= tkey_extract_bits(key
, tp
->pos
, tp
->bits
);
1597 put_child(t
, (struct tnode
*)tp
, cindex
, NULL
);
1598 t
->trie
= trie_rebalance(t
, tp
);
1607 fn_trie_delete(struct fib_table
*tb
, struct rtmsg
*r
, struct kern_rta
*rta
,
1608 struct nlmsghdr
*nlhdr
, struct netlink_skb_parms
*req
)
1610 struct trie
*t
= (struct trie
*) tb
->tb_data
;
1612 int plen
= r
->rtm_dst_len
;
1613 u8 tos
= r
->rtm_tos
;
1614 struct fib_alias
*fa
, *fa_to_delete
;
1615 struct list_head
*fa_head
;
1623 memcpy(&key
, rta
->rta_dst
, 4);
1626 mask
= ntohl( inet_make_mask(plen
) );
1632 l
= fib_find_node(t
, key
);
1637 fa_head
= get_fa_head(l
, plen
);
1638 fa
= fib_find_alias(fa_head
, tos
, 0);
1644 printk("Deleting %08x/%d tos=%d t=%p\n", key
, plen
, tos
, t
);
1646 fa_to_delete
= NULL
;
1647 fa_head
= fa
->fa_list
.prev
;
1648 list_for_each_entry(fa
, fa_head
, fa_list
) {
1649 struct fib_info
*fi
= fa
->fa_info
;
1651 if (fa
->fa_tos
!= tos
)
1654 if ((!r
->rtm_type
||
1655 fa
->fa_type
== r
->rtm_type
) &&
1656 (r
->rtm_scope
== RT_SCOPE_NOWHERE
||
1657 fa
->fa_scope
== r
->rtm_scope
) &&
1658 (!r
->rtm_protocol
||
1659 fi
->fib_protocol
== r
->rtm_protocol
) &&
1660 fib_nh_match(r
, nlhdr
, rta
, fi
) == 0) {
1668 struct leaf_info
*li
;
1671 rtmsg_fib(RTM_DELROUTE
, htonl(key
), fa
, plen
, tb
->tb_id
, nlhdr
, req
);
1673 l
= fib_find_node(t
, key
);
1674 li
= find_leaf_info(&l
->list
, plen
);
1676 write_lock_bh(&fib_lock
);
1678 list_del(&fa
->fa_list
);
1680 if(list_empty(fa_head
)) {
1681 hlist_del(&li
->hlist
);
1684 write_unlock_bh(&fib_lock
);
1689 if(hlist_empty(&l
->list
))
1690 trie_leaf_remove(t
, key
);
1692 if (fa
->fa_state
& FA_S_ACCESSED
)
1701 static int trie_flush_list(struct trie
*t
, struct list_head
*head
)
1703 struct fib_alias
*fa
, *fa_node
;
1706 list_for_each_entry_safe(fa
, fa_node
, head
, fa_list
) {
1707 struct fib_info
*fi
= fa
->fa_info
;
1709 if (fi
&& (fi
->fib_flags
&RTNH_F_DEAD
)) {
1711 write_lock_bh(&fib_lock
);
1712 list_del(&fa
->fa_list
);
1713 write_unlock_bh(&fib_lock
);
1722 static int trie_flush_leaf(struct trie
*t
, struct leaf
*l
)
1725 struct hlist_head
*lih
= &l
->list
;
1726 struct hlist_node
*node
, *tmp
;
1727 struct leaf_info
*li
= NULL
;
1729 hlist_for_each_entry_safe(li
, node
, tmp
, lih
, hlist
) {
1731 found
+= trie_flush_list(t
, &li
->falh
);
1733 if (list_empty(&li
->falh
)) {
1735 write_lock_bh(&fib_lock
);
1736 hlist_del(&li
->hlist
);
1737 write_unlock_bh(&fib_lock
);
1745 static struct leaf
*nextleaf(struct trie
*t
, struct leaf
*thisleaf
)
1747 struct node
*c
= (struct node
*) thisleaf
;
1755 if (IS_LEAF(t
->trie
)) /* trie w. just a leaf */
1756 return (struct leaf
*) t
->trie
;
1758 p
= (struct tnode
*) t
->trie
; /* Start */
1761 p
= (struct tnode
*) NODE_PARENT(c
);
1765 /* Find the next child of the parent */
1767 pos
= 1 + tkey_extract_bits(c
->key
, p
->pos
, p
->bits
);
1771 last
= 1 << p
->bits
;
1772 for(idx
= pos
; idx
< last
; idx
++) {
1773 if( p
->child
[idx
]) {
1775 /* Decend if tnode */
1777 while (IS_TNODE(p
->child
[idx
])) {
1778 p
= (struct tnode
*) p
->child
[idx
];
1781 /* Rightmost non-NULL branch */
1782 if( p
&& IS_TNODE(p
) )
1783 while ( p
->child
[idx
] == NULL
&& idx
< (1 << p
->bits
) ) idx
++;
1785 /* Done with this tnode? */
1786 if( idx
>= (1 << p
->bits
) || p
->child
[idx
] == NULL
)
1789 return (struct leaf
*) p
->child
[idx
];
1793 /* No more children go up one step */
1794 c
= (struct node
*) p
;
1795 p
= (struct tnode
*) NODE_PARENT(p
);
1797 return NULL
; /* Ready. Root of trie */
1800 static int fn_trie_flush(struct fib_table
*tb
)
1802 struct trie
*t
= (struct trie
*) tb
->tb_data
;
1803 struct leaf
*ll
= NULL
, *l
= NULL
;
1808 for (h
=0; (l
= nextleaf(t
, l
)) != NULL
; h
++) {
1809 found
+= trie_flush_leaf(t
, l
);
1811 if (ll
&& hlist_empty(&ll
->list
))
1812 trie_leaf_remove(t
, ll
->key
);
1816 if (ll
&& hlist_empty(&ll
->list
))
1817 trie_leaf_remove(t
, ll
->key
);
1820 printk("trie_flush found=%d\n", found
);
1824 static int trie_last_dflt
=-1;
1827 fn_trie_select_default(struct fib_table
*tb
, const struct flowi
*flp
, struct fib_result
*res
)
1829 struct trie
*t
= (struct trie
*) tb
->tb_data
;
1830 int order
, last_idx
;
1831 struct fib_info
*fi
= NULL
;
1832 struct fib_info
*last_resort
;
1833 struct fib_alias
*fa
= NULL
;
1834 struct list_head
*fa_head
;
1841 read_lock(&fib_lock
);
1843 l
= fib_find_node(t
, 0);
1847 fa_head
= get_fa_head(l
, 0);
1851 if (list_empty(fa_head
))
1854 list_for_each_entry(fa
, fa_head
, fa_list
) {
1855 struct fib_info
*next_fi
= fa
->fa_info
;
1857 if (fa
->fa_scope
!= res
->scope
||
1858 fa
->fa_type
!= RTN_UNICAST
)
1861 if (next_fi
->fib_priority
> res
->fi
->fib_priority
)
1863 if (!next_fi
->fib_nh
[0].nh_gw
||
1864 next_fi
->fib_nh
[0].nh_scope
!= RT_SCOPE_LINK
)
1866 fa
->fa_state
|= FA_S_ACCESSED
;
1869 if (next_fi
!= res
->fi
)
1871 } else if (!fib_detect_death(fi
, order
, &last_resort
,
1872 &last_idx
, &trie_last_dflt
)) {
1874 fib_info_put(res
->fi
);
1876 atomic_inc(&fi
->fib_clntref
);
1877 trie_last_dflt
= order
;
1883 if (order
<= 0 || fi
== NULL
) {
1884 trie_last_dflt
= -1;
1888 if (!fib_detect_death(fi
, order
, &last_resort
, &last_idx
, &trie_last_dflt
)) {
1890 fib_info_put(res
->fi
);
1892 atomic_inc(&fi
->fib_clntref
);
1893 trie_last_dflt
= order
;
1896 if (last_idx
>= 0) {
1898 fib_info_put(res
->fi
);
1899 res
->fi
= last_resort
;
1901 atomic_inc(&last_resort
->fib_clntref
);
1903 trie_last_dflt
= last_idx
;
1905 read_unlock(&fib_lock
);
1908 static int fn_trie_dump_fa(t_key key
, int plen
, struct list_head
*fah
, struct fib_table
*tb
,
1909 struct sk_buff
*skb
, struct netlink_callback
*cb
)
1912 struct fib_alias
*fa
;
1914 u32 xkey
=htonl(key
);
1919 list_for_each_entry(fa
, fah
, fa_list
) {
1924 if (fa
->fa_info
->fib_nh
== NULL
) {
1925 printk("Trie error _fib_nh=NULL in fa[%d] k=%08x plen=%d\n", i
, key
, plen
);
1929 if (fa
->fa_info
== NULL
) {
1930 printk("Trie error fa_info=NULL in fa[%d] k=%08x plen=%d\n", i
, key
, plen
);
1935 if (fib_dump_info(skb
, NETLINK_CB(cb
->skb
).pid
,
1944 fa
->fa_info
, 0) < 0) {
1954 static int fn_trie_dump_plen(struct trie
*t
, int plen
, struct fib_table
*tb
, struct sk_buff
*skb
,
1955 struct netlink_callback
*cb
)
1958 struct list_head
*fa_head
;
1959 struct leaf
*l
= NULL
;
1962 for (h
=0; (l
= nextleaf(t
, l
)) != NULL
; h
++) {
1967 memset(&cb
->args
[3], 0,
1968 sizeof(cb
->args
) - 3*sizeof(cb
->args
[0]));
1970 fa_head
= get_fa_head(l
, plen
);
1975 if(list_empty(fa_head
))
1978 if (fn_trie_dump_fa(l
->key
, plen
, fa_head
, tb
, skb
, cb
)<0) {
1987 static int fn_trie_dump(struct fib_table
*tb
, struct sk_buff
*skb
, struct netlink_callback
*cb
)
1990 struct trie
*t
= (struct trie
*) tb
->tb_data
;
1994 read_lock(&fib_lock
);
1995 for (m
=0; m
<=32; m
++) {
2000 memset(&cb
->args
[2], 0,
2001 sizeof(cb
->args
) - 2*sizeof(cb
->args
[0]));
2003 if (fn_trie_dump_plen(t
, 32-m
, tb
, skb
, cb
)<0) {
2008 read_unlock(&fib_lock
);
2012 read_unlock(&fib_lock
);
2016 /* Fix more generic FIB names for init later */
2018 #ifdef CONFIG_IP_MULTIPLE_TABLES
2019 struct fib_table
* fib_hash_init(int id
)
2021 struct fib_table
* __init
fib_hash_init(int id
)
2024 struct fib_table
*tb
;
2027 if (fn_alias_kmem
== NULL
)
2028 fn_alias_kmem
= kmem_cache_create("ip_fib_alias",
2029 sizeof(struct fib_alias
),
2030 0, SLAB_HWCACHE_ALIGN
,
2033 tb
= kmalloc(sizeof(struct fib_table
) + sizeof(struct trie
),
2039 tb
->tb_lookup
= fn_trie_lookup
;
2040 tb
->tb_insert
= fn_trie_insert
;
2041 tb
->tb_delete
= fn_trie_delete
;
2042 tb
->tb_flush
= fn_trie_flush
;
2043 tb
->tb_select_default
= fn_trie_select_default
;
2044 tb
->tb_dump
= fn_trie_dump
;
2045 memset(tb
->tb_data
, 0, sizeof(struct trie
));
2047 t
= (struct trie
*) tb
->tb_data
;
2051 if (id
== RT_TABLE_LOCAL
)
2053 else if (id
== RT_TABLE_MAIN
)
2056 if (id
== RT_TABLE_LOCAL
)
2057 printk("IPv4 FIB: Using LC-trie version %s\n", VERSION
);
2062 /* Trie dump functions */
2064 static void putspace_seq(struct seq_file
*seq
, int n
)
2066 while (n
--) seq_printf(seq
, " ");
2069 static void printbin_seq(struct seq_file
*seq
, unsigned int v
, int bits
)
2072 seq_printf(seq
, "%s", (v
& (1<<bits
))?"1":"0");
2075 static void printnode_seq(struct seq_file
*seq
, int indent
, struct node
*n
,
2076 int pend
, int cindex
, int bits
)
2078 putspace_seq(seq
, indent
);
2080 seq_printf(seq
, "|");
2082 seq_printf(seq
, "+");
2084 seq_printf(seq
, "%d/", cindex
);
2085 printbin_seq(seq
, cindex
, bits
);
2086 seq_printf(seq
, ": ");
2089 seq_printf(seq
, "<root>: ");
2090 seq_printf(seq
, "%s:%p ", IS_LEAF(n
)?"Leaf":"Internal node", n
);
2093 seq_printf(seq
, "key=%d.%d.%d.%d\n",
2094 n
->key
>> 24, (n
->key
>> 16) % 256, (n
->key
>> 8) % 256, n
->key
% 256);
2096 int plen
=((struct tnode
*)n
)->pos
;
2097 t_key prf
=MASK_PFX(n
->key
, plen
);
2098 seq_printf(seq
, "key=%d.%d.%d.%d/%d\n",
2099 prf
>> 24, (prf
>> 16) % 256, (prf
>> 8) % 256, prf
% 256, plen
);
2102 struct leaf
*l
=(struct leaf
*)n
;
2103 struct fib_alias
*fa
;
2105 for (i
=32; i
>=0; i
--)
2106 if(find_leaf_info(&l
->list
, i
)) {
2108 struct list_head
*fa_head
= get_fa_head(l
, i
);
2113 if(list_empty(fa_head
))
2116 putspace_seq(seq
, indent
+2);
2117 seq_printf(seq
, "{/%d...dumping}\n", i
);
2120 list_for_each_entry(fa
, fa_head
, fa_list
) {
2121 putspace_seq(seq
, indent
+2);
2122 if (fa
->fa_info
->fib_nh
== NULL
) {
2123 seq_printf(seq
, "Error _fib_nh=NULL\n");
2126 if (fa
->fa_info
== NULL
) {
2127 seq_printf(seq
, "Error fa_info=NULL\n");
2131 seq_printf(seq
, "{type=%d scope=%d TOS=%d}\n",
2138 else if (IS_TNODE(n
)) {
2139 struct tnode
*tn
=(struct tnode
*)n
;
2140 putspace_seq(seq
, indent
); seq_printf(seq
, "| ");
2141 seq_printf(seq
, "{key prefix=%08x/", tn
->key
&TKEY_GET_MASK(0, tn
->pos
));
2142 printbin_seq(seq
, tkey_extract_bits(tn
->key
, 0, tn
->pos
), tn
->pos
);
2143 seq_printf(seq
, "}\n");
2144 putspace_seq(seq
, indent
); seq_printf(seq
, "| ");
2145 seq_printf(seq
, "{pos=%d", tn
->pos
);
2146 seq_printf(seq
, " (skip=%d bits)", tn
->pos
- pend
);
2147 seq_printf(seq
, " bits=%d (%u children)}\n", tn
->bits
, (1 << tn
->bits
));
2148 putspace_seq(seq
, indent
); seq_printf(seq
, "| ");
2149 seq_printf(seq
, "{empty=%d full=%d}\n", tn
->empty_children
, tn
->full_children
);
2153 static void trie_dump_seq(struct seq_file
*seq
, struct trie
*t
)
2155 struct node
*n
=t
->trie
;
2161 read_lock(&fib_lock
);
2163 seq_printf(seq
, "------ trie_dump of t=%p ------\n", t
);
2165 printnode_seq(seq
, indent
, n
, pend
, cindex
, 0);
2167 struct tnode
*tn
=(struct tnode
*)n
;
2168 pend
= tn
->pos
+tn
->bits
;
2169 putspace_seq(seq
, indent
); seq_printf(seq
, "\\--\n");
2173 while (tn
&& cindex
< (1 << tn
->bits
)) {
2174 if (tn
->child
[cindex
]) {
2178 printnode_seq(seq
, indent
, tn
->child
[cindex
], pend
, cindex
, tn
->bits
);
2179 if (IS_LEAF(tn
->child
[cindex
])) {
2185 * New tnode. Decend one level
2189 n
=tn
->child
[cindex
];
2190 tn
=(struct tnode
*)n
;
2191 pend
=tn
->pos
+tn
->bits
;
2192 putspace_seq(seq
, indent
); seq_printf(seq
, "\\--\n");
2201 * Test if we are done
2204 while (cindex
>= (1 << tn
->bits
)) {
2207 * Move upwards and test for root
2208 * pop off all traversed nodes
2211 if (NODE_PARENT(tn
) == NULL
) {
2217 cindex
= tkey_extract_bits(tn
->key
, NODE_PARENT(tn
)->pos
, NODE_PARENT(tn
)->bits
);
2218 tn
= NODE_PARENT(tn
);
2220 n
=(struct node
*)tn
;
2221 pend
=tn
->pos
+tn
->bits
;
2230 else seq_printf(seq
, "------ trie is empty\n");
2232 read_unlock(&fib_lock
);
2235 static struct trie_stat
*trie_stat_new(void)
2237 struct trie_stat
*s
= kmalloc(sizeof(struct trie_stat
), GFP_KERNEL
);
2245 s
->nullpointers
= 0;
2247 for(i
=0; i
< MAX_CHILDS
; i
++)
2248 s
->nodesizes
[i
] = 0;
2253 static struct trie_stat
*trie_collect_stats(struct trie
*t
)
2255 struct node
*n
=t
->trie
;
2256 struct trie_stat
*s
= trie_stat_new();
2262 read_lock(&fib_lock
);
2267 struct tnode
*tn
= (struct tnode
*)n
;
2268 pend
=tn
->pos
+tn
->bits
;
2270 s
->nodesizes
[tn
->bits
]++;
2273 while (tn
&& cindex
< (1 << tn
->bits
)) {
2274 if (tn
->child
[cindex
]) {
2277 if (IS_LEAF(tn
->child
[cindex
])) {
2281 if (depth
> s
->maxdepth
)
2282 s
->maxdepth
= depth
;
2283 s
->totdepth
+= depth
;
2289 * New tnode. Decend one level
2293 s
->nodesizes
[tn
->bits
]++;
2296 n
= tn
->child
[cindex
];
2297 tn
= (struct tnode
*)n
;
2298 pend
= tn
->pos
+tn
->bits
;
2310 * Test if we are done
2313 while (cindex
>= (1 << tn
->bits
)) {
2316 * Move upwards and test for root
2317 * pop off all traversed nodes
2321 if (NODE_PARENT(tn
) == NULL
) {
2327 cindex
= tkey_extract_bits(tn
->key
, NODE_PARENT(tn
)->pos
, NODE_PARENT(tn
)->bits
);
2328 tn
= NODE_PARENT(tn
);
2330 n
= (struct node
*)tn
;
2331 pend
=tn
->pos
+tn
->bits
;
2342 read_unlock(&fib_lock
);
2346 #ifdef CONFIG_PROC_FS
2348 static struct fib_alias
*fib_triestat_get_first(struct seq_file
*seq
)
2353 static struct fib_alias
*fib_triestat_get_next(struct seq_file
*seq
)
2358 static void *fib_triestat_seq_start(struct seq_file
*seq
, loff_t
*pos
)
2362 if (ip_fib_main_table
)
2363 v
= *pos
? fib_triestat_get_next(seq
) : SEQ_START_TOKEN
;
2367 static void *fib_triestat_seq_next(struct seq_file
*seq
, void *v
, loff_t
*pos
)
2370 return v
== SEQ_START_TOKEN
? fib_triestat_get_first(seq
) : fib_triestat_get_next(seq
);
2373 static void fib_triestat_seq_stop(struct seq_file
*seq
, void *v
)
2379 * This outputs /proc/net/fib_triestats
2381 * It always works in backward compatibility mode.
2382 * The format of the file is not supposed to be changed.
2385 static void collect_and_show(struct trie
*t
, struct seq_file
*seq
)
2387 int bytes
= 0; /* How many bytes are used, a ref is 4 bytes */
2388 int i
, max
, pointers
;
2389 struct trie_stat
*stat
;
2392 stat
= trie_collect_stats(t
);
2395 seq_printf(seq
, "trie=%p\n", t
);
2399 avdepth
=stat
->totdepth
*100 / stat
->leaves
;
2402 seq_printf(seq
, "Aver depth: %d.%02d\n", avdepth
/ 100, avdepth
% 100 );
2403 seq_printf(seq
, "Max depth: %4d\n", stat
->maxdepth
);
2405 seq_printf(seq
, "Leaves: %d\n", stat
->leaves
);
2406 bytes
+= sizeof(struct leaf
) * stat
->leaves
;
2407 seq_printf(seq
, "Internal nodes: %d\n", stat
->tnodes
);
2408 bytes
+= sizeof(struct tnode
) * stat
->tnodes
;
2412 while (max
>= 0 && stat
->nodesizes
[max
] == 0)
2416 for (i
= 1; i
<= max
; i
++)
2417 if (stat
->nodesizes
[i
] != 0) {
2418 seq_printf(seq
, " %d: %d", i
, stat
->nodesizes
[i
]);
2419 pointers
+= (1<<i
) * stat
->nodesizes
[i
];
2421 seq_printf(seq
, "\n");
2422 seq_printf(seq
, "Pointers: %d\n", pointers
);
2423 bytes
+= sizeof(struct node
*) * pointers
;
2424 seq_printf(seq
, "Null ptrs: %d\n", stat
->nullpointers
);
2425 seq_printf(seq
, "Total size: %d kB\n", bytes
/ 1024);
2430 #ifdef CONFIG_IP_FIB_TRIE_STATS
2431 seq_printf(seq
, "Counters:\n---------\n");
2432 seq_printf(seq
,"gets = %d\n", t
->stats
.gets
);
2433 seq_printf(seq
,"backtracks = %d\n", t
->stats
.backtrack
);
2434 seq_printf(seq
,"semantic match passed = %d\n", t
->stats
.semantic_match_passed
);
2435 seq_printf(seq
,"semantic match miss = %d\n", t
->stats
.semantic_match_miss
);
2436 seq_printf(seq
,"null node hit= %d\n", t
->stats
.null_node_hit
);
2437 seq_printf(seq
,"skipped node resize = %d\n", t
->stats
.resize_node_skipped
);
2439 memset(&(t
->stats
), 0, sizeof(t
->stats
));
2441 #endif /* CONFIG_IP_FIB_TRIE_STATS */
2444 static int fib_triestat_seq_show(struct seq_file
*seq
, void *v
)
2448 if (v
== SEQ_START_TOKEN
) {
2449 seq_printf(seq
, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
2450 sizeof(struct leaf
), sizeof(struct tnode
));
2452 collect_and_show(trie_local
, seq
);
2455 collect_and_show(trie_main
, seq
);
2458 snprintf(bf
, sizeof(bf
),
2459 "*\t%08X\t%08X", 200, 400);
2461 seq_printf(seq
, "%-127s\n", bf
);
2466 static struct seq_operations fib_triestat_seq_ops
= {
2467 .start
= fib_triestat_seq_start
,
2468 .next
= fib_triestat_seq_next
,
2469 .stop
= fib_triestat_seq_stop
,
2470 .show
= fib_triestat_seq_show
,
2473 static int fib_triestat_seq_open(struct inode
*inode
, struct file
*file
)
2475 struct seq_file
*seq
;
2478 rc
= seq_open(file
, &fib_triestat_seq_ops
);
2482 seq
= file
->private_data
;
2489 static struct file_operations fib_triestat_seq_fops
= {
2490 .owner
= THIS_MODULE
,
2491 .open
= fib_triestat_seq_open
,
2493 .llseek
= seq_lseek
,
2494 .release
= seq_release_private
,
2497 int __init
fib_stat_proc_init(void)
2499 if (!proc_net_fops_create("fib_triestat", S_IRUGO
, &fib_triestat_seq_fops
))
2504 void __init
fib_stat_proc_exit(void)
2506 proc_net_remove("fib_triestat");
2509 static struct fib_alias
*fib_trie_get_first(struct seq_file
*seq
)
2514 static struct fib_alias
*fib_trie_get_next(struct seq_file
*seq
)
2519 static void *fib_trie_seq_start(struct seq_file
*seq
, loff_t
*pos
)
2523 if (ip_fib_main_table
)
2524 v
= *pos
? fib_trie_get_next(seq
) : SEQ_START_TOKEN
;
2528 static void *fib_trie_seq_next(struct seq_file
*seq
, void *v
, loff_t
*pos
)
2531 return v
== SEQ_START_TOKEN
? fib_trie_get_first(seq
) : fib_trie_get_next(seq
);
2534 static void fib_trie_seq_stop(struct seq_file
*seq
, void *v
)
2540 * This outputs /proc/net/fib_trie.
2542 * It always works in backward compatibility mode.
2543 * The format of the file is not supposed to be changed.
2546 static int fib_trie_seq_show(struct seq_file
*seq
, void *v
)
2550 if (v
== SEQ_START_TOKEN
) {
2552 trie_dump_seq(seq
, trie_local
);
2555 trie_dump_seq(seq
, trie_main
);
2559 snprintf(bf
, sizeof(bf
),
2560 "*\t%08X\t%08X", 200, 400);
2561 seq_printf(seq
, "%-127s\n", bf
);
2567 static struct seq_operations fib_trie_seq_ops
= {
2568 .start
= fib_trie_seq_start
,
2569 .next
= fib_trie_seq_next
,
2570 .stop
= fib_trie_seq_stop
,
2571 .show
= fib_trie_seq_show
,
2574 static int fib_trie_seq_open(struct inode
*inode
, struct file
*file
)
2576 struct seq_file
*seq
;
2579 rc
= seq_open(file
, &fib_trie_seq_ops
);
2583 seq
= file
->private_data
;
2590 static struct file_operations fib_trie_seq_fops
= {
2591 .owner
= THIS_MODULE
,
2592 .open
= fib_trie_seq_open
,
2594 .llseek
= seq_lseek
,
2595 .release
= seq_release_private
,
2598 int __init
fib_proc_init(void)
2600 if (!proc_net_fops_create("fib_trie", S_IRUGO
, &fib_trie_seq_fops
))
2605 void __init
fib_proc_exit(void)
2607 proc_net_remove("fib_trie");
2610 #endif /* CONFIG_PROC_FS */