[NET]: Add netlink connector.
[linux-2.6/verdex.git] / net / ipv4 / fib_trie.c
blob1b63b4824164390daa515a4baf1c1e8b6f8a9b8f
1 /*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
11 * Agricultural Sciences.
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
15 * This work is based on the LPC-trie which is originally described in:
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
25 * Version: $Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
28 * Code from fib_hash has been reused which includes the following header:
31 * INET An implementation of the TCP/IP protocol suite for the LINUX
32 * operating system. INET is implemented using the BSD Socket
33 * interface as the means of communication with the user level.
35 * IPv4 FIB: lookup engine and maintenance routines.
38 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
40 * This program is free software; you can redistribute it and/or
41 * modify it under the terms of the GNU General Public License
42 * as published by the Free Software Foundation; either version
43 * 2 of the License, or (at your option) any later version.
46 #define VERSION "0.403"
48 #include <linux/config.h>
49 #include <asm/uaccess.h>
50 #include <asm/system.h>
51 #include <asm/bitops.h>
52 #include <linux/types.h>
53 #include <linux/kernel.h>
54 #include <linux/sched.h>
55 #include <linux/mm.h>
56 #include <linux/string.h>
57 #include <linux/socket.h>
58 #include <linux/sockios.h>
59 #include <linux/errno.h>
60 #include <linux/in.h>
61 #include <linux/inet.h>
62 #include <linux/netdevice.h>
63 #include <linux/if_arp.h>
64 #include <linux/proc_fs.h>
65 #include <linux/rcupdate.h>
66 #include <linux/skbuff.h>
67 #include <linux/netlink.h>
68 #include <linux/init.h>
69 #include <linux/list.h>
70 #include <net/ip.h>
71 #include <net/protocol.h>
72 #include <net/route.h>
73 #include <net/tcp.h>
74 #include <net/sock.h>
75 #include <net/ip_fib.h>
76 #include "fib_lookup.h"
78 #undef CONFIG_IP_FIB_TRIE_STATS
79 #define MAX_CHILDS 16384
/* Key type used throughout the trie (an IPv4 address as an unsigned int). */
typedef unsigned int t_key;

/* Number of bits in a key. */
#define KEYLENGTH (8*sizeof(t_key))

/*
 * MASK_PFX(k, l): keep the l most significant bits of k and clear the rest.
 * l == 0 is special-cased because shifting by the full key width is
 * undefined.
 *
 * TKEY_GET_MASK(offset, bits): a run of 'bits' one-bits whose most
 * significant bit sits 'offset' positions below the top of the key.
 *
 * Every macro argument is parenthesized so that expression arguments
 * (e.g. "pos + 1") expand with the intended precedence.
 */
#define MASK_PFX(k, l) (((l)==0)?0:((k) >> (KEYLENGTH-(l))) << (KEYLENGTH-(l)))
#define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - (bits)) >> (offset)))
/*
 * The low bit of a node's 'parent' word encodes the node type; the
 * remaining bits hold the address of the parent tnode.
 */
#define T_TNODE 0
#define T_LEAF  1
#define NODE_TYPE_MASK	0x1UL

/* Parent pointer of 'node' with the type bit stripped. */
#define NODE_PARENT(node) \
	((struct tnode *)rcu_dereference(((node)->parent & ~NODE_TYPE_MASK)))

/* T_TNODE or T_LEAF. */
#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)

/* Publish a new parent pointer while preserving the node's type bit. */
#define NODE_SET_PARENT(node, ptr)		\
	rcu_assign_pointer((node)->parent,	\
			   ((unsigned long)(ptr)) | NODE_TYPE(node))

/*
 * Type predicates.  The argument is parenthesized so that expressions
 * such as "&x" or a cast expand correctly.
 */
#define IS_TNODE(n) (!((n)->parent & T_LEAF))
#define IS_LEAF(n) ((n)->parent & T_LEAF)
102 struct node {
103 t_key key;
104 unsigned long parent;
107 struct leaf {
108 t_key key;
109 unsigned long parent;
110 struct hlist_head list;
111 struct rcu_head rcu;
114 struct leaf_info {
115 struct hlist_node hlist;
116 struct rcu_head rcu;
117 int plen;
118 struct list_head falh;
121 struct tnode {
122 t_key key;
123 unsigned long parent;
124 unsigned short pos:5; /* 2log(KEYLENGTH) bits needed */
125 unsigned short bits:5; /* 2log(KEYLENGTH) bits needed */
126 unsigned short full_children; /* KEYLENGTH bits needed */
127 unsigned short empty_children; /* KEYLENGTH bits needed */
128 struct rcu_head rcu;
129 struct node *child[0];
132 #ifdef CONFIG_IP_FIB_TRIE_STATS
133 struct trie_use_stats {
134 unsigned int gets;
135 unsigned int backtrack;
136 unsigned int semantic_match_passed;
137 unsigned int semantic_match_miss;
138 unsigned int null_node_hit;
139 unsigned int resize_node_skipped;
141 #endif
143 struct trie_stat {
144 unsigned int totdepth;
145 unsigned int maxdepth;
146 unsigned int tnodes;
147 unsigned int leaves;
148 unsigned int nullpointers;
149 unsigned int nodesizes[MAX_CHILDS];
152 struct trie {
153 struct node *trie;
154 #ifdef CONFIG_IP_FIB_TRIE_STATS
155 struct trie_use_stats stats;
156 #endif
157 int size;
158 unsigned int revision;
161 static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
162 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
163 static struct node *resize(struct trie *t, struct tnode *tn);
164 static struct tnode *inflate(struct trie *t, struct tnode *tn);
165 static struct tnode *halve(struct trie *t, struct tnode *tn);
166 static void tnode_free(struct tnode *tn);
168 static kmem_cache_t *fn_alias_kmem __read_mostly;
169 static struct trie *trie_local = NULL, *trie_main = NULL;
172 /* rcu_read_lock needs to be hold by caller from readside */
174 static inline struct node *tnode_get_child(struct tnode *tn, int i)
176 BUG_ON(i >= 1 << tn->bits);
178 return rcu_dereference(tn->child[i]);
181 static inline int tnode_child_length(const struct tnode *tn)
183 return 1 << tn->bits;
186 static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
188 if (offset < KEYLENGTH)
189 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
190 else
191 return 0;
194 static inline int tkey_equals(t_key a, t_key b)
196 return a == b;
199 static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
201 if (bits == 0 || offset >= KEYLENGTH)
202 return 1;
203 bits = bits > KEYLENGTH ? KEYLENGTH : bits;
204 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
207 static inline int tkey_mismatch(t_key a, int offset, t_key b)
209 t_key diff = a ^ b;
210 int i = offset;
212 if (!diff)
213 return 0;
214 while ((diff << i) >> (KEYLENGTH-1) == 0)
215 i++;
216 return i;
220 To understand this stuff, an understanding of keys and all their bits is
221 necessary. Every node in the trie has a key associated with it, but not
222 all of the bits in that key are significant.
224 Consider a node 'n' and its parent 'tp'.
226 If n is a leaf, every bit in its key is significant. Its presence is
227 necessitated by path compression, since during a tree traversal (when
228 searching for a leaf - unless we are doing an insertion) we will completely
229 ignore all skipped bits we encounter. Thus we need to verify, at the end of
230 a potentially successful search, that we have indeed been walking the
231 correct key path.
233 Note that we can never "miss" the correct key in the tree if present by
234 following the wrong path. Path compression ensures that segments of the key
235 that are the same for all keys with a given prefix are skipped, but the
236 skipped part *is* identical for each node in the subtrie below the skipped
237 bit! trie_insert() in this implementation takes care of that - note the
238 call to tkey_sub_equals() in trie_insert().
240 if n is an internal node - a 'tnode' here, the various parts of its key
241 have many different meanings.
243 Example:
244 _________________________________________________________________
245 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
246 -----------------------------------------------------------------
247 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
249 _________________________________________________________________
250 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
251 -----------------------------------------------------------------
252 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
254 tp->pos = 7
255 tp->bits = 3
256 n->pos = 15
257 n->bits = 4
259 First, let's just ignore the bits that come before the parent tp, that is
260 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
261 not use them for anything.
263 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
264 index into the parent's child array. That is, they will be used to find
265 'n' among tp's children.
267 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
268 for the node n.
270 All the bits we have seen so far are significant to the node n. The rest
271 of the bits are really not needed or indeed known in n->key.
273 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
274 n's child array, and will of course be different for each child.
277 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
278 at this point.
282 static inline void check_tnode(const struct tnode *tn)
284 WARN_ON(tn && tn->pos+tn->bits > 32);
/*
 * Fill percentages used by resize(): a tnode is halved while its fill is
 * below halve_threshold, and inflated while the doubled node would reach
 * inflate_threshold.
 */
static int halve_threshold = 25;
static int inflate_threshold = 50;
291 static void __alias_free_mem(struct rcu_head *head)
293 struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
294 kmem_cache_free(fn_alias_kmem, fa);
297 static inline void alias_free_mem_rcu(struct fib_alias *fa)
299 call_rcu(&fa->rcu, __alias_free_mem);
302 static void __leaf_free_rcu(struct rcu_head *head)
304 kfree(container_of(head, struct leaf, rcu));
307 static inline void free_leaf(struct leaf *leaf)
309 call_rcu(&leaf->rcu, __leaf_free_rcu);
312 static void __leaf_info_free_rcu(struct rcu_head *head)
314 kfree(container_of(head, struct leaf_info, rcu));
317 static inline void free_leaf_info(struct leaf_info *leaf)
319 call_rcu(&leaf->rcu, __leaf_info_free_rcu);
322 static struct tnode *tnode_alloc(unsigned int size)
324 struct page *pages;
326 if (size <= PAGE_SIZE)
327 return kcalloc(size, 1, GFP_KERNEL);
329 pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size));
330 if (!pages)
331 return NULL;
333 return page_address(pages);
336 static void __tnode_free_rcu(struct rcu_head *head)
338 struct tnode *tn = container_of(head, struct tnode, rcu);
339 unsigned int size = sizeof(struct tnode) +
340 (1 << tn->bits) * sizeof(struct node *);
342 if (size <= PAGE_SIZE)
343 kfree(tn);
344 else
345 free_pages((unsigned long)tn, get_order(size));
348 static inline void tnode_free(struct tnode *tn)
350 call_rcu(&tn->rcu, __tnode_free_rcu);
353 static struct leaf *leaf_new(void)
355 struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL);
356 if (l) {
357 l->parent = T_LEAF;
358 INIT_HLIST_HEAD(&l->list);
360 return l;
363 static struct leaf_info *leaf_info_new(int plen)
365 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
366 if (li) {
367 li->plen = plen;
368 INIT_LIST_HEAD(&li->falh);
370 return li;
373 static struct tnode* tnode_new(t_key key, int pos, int bits)
375 int nchildren = 1<<bits;
376 int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
377 struct tnode *tn = tnode_alloc(sz);
379 if (tn) {
380 memset(tn, 0, sz);
381 tn->parent = T_TNODE;
382 tn->pos = pos;
383 tn->bits = bits;
384 tn->key = key;
385 tn->full_children = 0;
386 tn->empty_children = 1<<bits;
389 pr_debug("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
390 (unsigned int) (sizeof(struct node) * 1<<bits));
391 return tn;
395 * Check whether a tnode 'n' is "full", i.e. it is an internal node
396 * and no bits are skipped. See discussion in dyntree paper p. 6
399 static inline int tnode_full(const struct tnode *tn, const struct node *n)
401 if (n == NULL || IS_LEAF(n))
402 return 0;
404 return ((struct tnode *) n)->pos == tn->pos + tn->bits;
/*
 * Store 'n' in slot 'i' of 'tn'.  Passing -1 as 'wasfull' makes
 * tnode_put_child_reorg() work out the old child's fullness itself.
 * 't' is not used.
 */
static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
{
	const int wasfull_unknown = -1;

	tnode_put_child_reorg(tn, i, n, wasfull_unknown);
}
413 * Add a child at position i overwriting the old value.
414 * Update the value of full_children and empty_children.
417 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
419 struct node *chi = tn->child[i];
420 int isfull;
422 BUG_ON(i >= 1<<tn->bits);
425 /* update emptyChildren */
426 if (n == NULL && chi != NULL)
427 tn->empty_children++;
428 else if (n != NULL && chi == NULL)
429 tn->empty_children--;
431 /* update fullChildren */
432 if (wasfull == -1)
433 wasfull = tnode_full(tn, chi);
435 isfull = tnode_full(tn, n);
436 if (wasfull && !isfull)
437 tn->full_children--;
438 else if (!wasfull && isfull)
439 tn->full_children++;
441 if (n)
442 NODE_SET_PARENT(n, tn);
444 rcu_assign_pointer(tn->child[i], n);
447 static struct node *resize(struct trie *t, struct tnode *tn)
449 int i;
450 int err = 0;
451 struct tnode *old_tn;
453 if (!tn)
454 return NULL;
456 pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
457 tn, inflate_threshold, halve_threshold);
459 /* No children */
460 if (tn->empty_children == tnode_child_length(tn)) {
461 tnode_free(tn);
462 return NULL;
464 /* One child */
465 if (tn->empty_children == tnode_child_length(tn) - 1)
466 for (i = 0; i < tnode_child_length(tn); i++) {
467 struct node *n;
469 n = tn->child[i];
470 if (!n)
471 continue;
473 /* compress one level */
474 NODE_SET_PARENT(n, NULL);
475 tnode_free(tn);
476 return n;
479 * Double as long as the resulting node has a number of
480 * nonempty nodes that are above the threshold.
484 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
485 * the Helsinki University of Technology and Matti Tikkanen of Nokia
486 * Telecommunications, page 6:
487 * "A node is doubled if the ratio of non-empty children to all
488 * children in the *doubled* node is at least 'high'."
490 * 'high' in this instance is the variable 'inflate_threshold'. It
491 * is expressed as a percentage, so we multiply it with
492 * tnode_child_length() and instead of multiplying by 2 (since the
493 * child array will be doubled by inflate()) and multiplying
494 * the left-hand side by 100 (to handle the percentage thing) we
495 * multiply the left-hand side by 50.
497 * The left-hand side may look a bit weird: tnode_child_length(tn)
498 * - tn->empty_children is of course the number of non-null children
499 * in the current node. tn->full_children is the number of "full"
500 * children, that is non-null tnodes with a skip value of 0.
501 * All of those will be doubled in the resulting inflated tnode, so
502 * we just count them one extra time here.
504 * A clearer way to write this would be:
506 * to_be_doubled = tn->full_children;
507 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
508 * tn->full_children;
510 * new_child_length = tnode_child_length(tn) * 2;
512 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
513 * new_child_length;
514 * if (new_fill_factor >= inflate_threshold)
516 * ...and so on, tho it would mess up the while () loop.
518 * anyway,
519 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
520 * inflate_threshold
522 * avoid a division:
523 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
524 * inflate_threshold * new_child_length
526 * expand not_to_be_doubled and to_be_doubled, and shorten:
527 * 100 * (tnode_child_length(tn) - tn->empty_children +
528 * tn->full_children) >= inflate_threshold * new_child_length
530 * expand new_child_length:
531 * 100 * (tnode_child_length(tn) - tn->empty_children +
532 * tn->full_children) >=
533 * inflate_threshold * tnode_child_length(tn) * 2
535 * shorten again:
536 * 50 * (tn->full_children + tnode_child_length(tn) -
537 * tn->empty_children) >= inflate_threshold *
538 * tnode_child_length(tn)
542 check_tnode(tn);
544 err = 0;
545 while ((tn->full_children > 0 &&
546 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
547 inflate_threshold * tnode_child_length(tn))) {
549 old_tn = tn;
550 tn = inflate(t, tn);
551 if (IS_ERR(tn)) {
552 tn = old_tn;
553 #ifdef CONFIG_IP_FIB_TRIE_STATS
554 t->stats.resize_node_skipped++;
555 #endif
556 break;
560 check_tnode(tn);
563 * Halve as long as the number of empty children in this
564 * node is above threshold.
567 err = 0;
568 while (tn->bits > 1 &&
569 100 * (tnode_child_length(tn) - tn->empty_children) <
570 halve_threshold * tnode_child_length(tn)) {
572 old_tn = tn;
573 tn = halve(t, tn);
574 if (IS_ERR(tn)) {
575 tn = old_tn;
576 #ifdef CONFIG_IP_FIB_TRIE_STATS
577 t->stats.resize_node_skipped++;
578 #endif
579 break;
584 /* Only one child remains */
585 if (tn->empty_children == tnode_child_length(tn) - 1)
586 for (i = 0; i < tnode_child_length(tn); i++) {
587 struct node *n;
589 n = tn->child[i];
590 if (!n)
591 continue;
593 /* compress one level */
595 NODE_SET_PARENT(n, NULL);
596 tnode_free(tn);
597 return n;
600 return (struct node *) tn;
603 static struct tnode *inflate(struct trie *t, struct tnode *tn)
605 struct tnode *inode;
606 struct tnode *oldtnode = tn;
607 int olen = tnode_child_length(tn);
608 int i;
610 pr_debug("In inflate\n");
612 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
614 if (!tn)
615 return ERR_PTR(-ENOMEM);
618 * Preallocate and store tnodes before the actual work so we
619 * don't get into an inconsistent state if memory allocation
620 * fails. In case of failure we return the oldnode and inflate
621 * of tnode is ignored.
624 for (i = 0; i < olen; i++) {
625 struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
627 if (inode &&
628 IS_TNODE(inode) &&
629 inode->pos == oldtnode->pos + oldtnode->bits &&
630 inode->bits > 1) {
631 struct tnode *left, *right;
632 t_key m = TKEY_GET_MASK(inode->pos, 1);
634 left = tnode_new(inode->key&(~m), inode->pos + 1,
635 inode->bits - 1);
636 if (!left)
637 goto nomem;
639 right = tnode_new(inode->key|m, inode->pos + 1,
640 inode->bits - 1);
642 if (!right) {
643 tnode_free(left);
644 goto nomem;
647 put_child(t, tn, 2*i, (struct node *) left);
648 put_child(t, tn, 2*i+1, (struct node *) right);
652 for (i = 0; i < olen; i++) {
653 struct node *node = tnode_get_child(oldtnode, i);
654 struct tnode *left, *right;
655 int size, j;
657 /* An empty child */
658 if (node == NULL)
659 continue;
661 /* A leaf or an internal node with skipped bits */
663 if (IS_LEAF(node) || ((struct tnode *) node)->pos >
664 tn->pos + tn->bits - 1) {
665 if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
666 1) == 0)
667 put_child(t, tn, 2*i, node);
668 else
669 put_child(t, tn, 2*i+1, node);
670 continue;
673 /* An internal node with two children */
674 inode = (struct tnode *) node;
676 if (inode->bits == 1) {
677 put_child(t, tn, 2*i, inode->child[0]);
678 put_child(t, tn, 2*i+1, inode->child[1]);
680 tnode_free(inode);
681 continue;
684 /* An internal node with more than two children */
686 /* We will replace this node 'inode' with two new
687 * ones, 'left' and 'right', each with half of the
688 * original children. The two new nodes will have
689 * a position one bit further down the key and this
690 * means that the "significant" part of their keys
691 * (see the discussion near the top of this file)
692 * will differ by one bit, which will be "0" in
693 * left's key and "1" in right's key. Since we are
694 * moving the key position by one step, the bit that
695 * we are moving away from - the bit at position
696 * (inode->pos) - is the one that will differ between
697 * left and right. So... we synthesize that bit in the
698 * two new keys.
699 * The mask 'm' below will be a single "one" bit at
700 * the position (inode->pos)
703 /* Use the old key, but set the new significant
704 * bit to zero.
707 left = (struct tnode *) tnode_get_child(tn, 2*i);
708 put_child(t, tn, 2*i, NULL);
710 BUG_ON(!left);
712 right = (struct tnode *) tnode_get_child(tn, 2*i+1);
713 put_child(t, tn, 2*i+1, NULL);
715 BUG_ON(!right);
717 size = tnode_child_length(left);
718 for (j = 0; j < size; j++) {
719 put_child(t, left, j, inode->child[j]);
720 put_child(t, right, j, inode->child[j + size]);
722 put_child(t, tn, 2*i, resize(t, left));
723 put_child(t, tn, 2*i+1, resize(t, right));
725 tnode_free(inode);
727 tnode_free(oldtnode);
728 return tn;
729 nomem:
731 int size = tnode_child_length(tn);
732 int j;
734 for (j = 0; j < size; j++)
735 if (tn->child[j])
736 tnode_free((struct tnode *)tn->child[j]);
738 tnode_free(tn);
740 return ERR_PTR(-ENOMEM);
744 static struct tnode *halve(struct trie *t, struct tnode *tn)
746 struct tnode *oldtnode = tn;
747 struct node *left, *right;
748 int i;
749 int olen = tnode_child_length(tn);
751 pr_debug("In halve\n");
753 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
755 if (!tn)
756 return ERR_PTR(-ENOMEM);
759 * Preallocate and store tnodes before the actual work so we
760 * don't get into an inconsistent state if memory allocation
761 * fails. In case of failure we return the oldnode and halve
762 * of tnode is ignored.
765 for (i = 0; i < olen; i += 2) {
766 left = tnode_get_child(oldtnode, i);
767 right = tnode_get_child(oldtnode, i+1);
769 /* Two nonempty children */
770 if (left && right) {
771 struct tnode *newn;
773 newn = tnode_new(left->key, tn->pos + tn->bits, 1);
775 if (!newn)
776 goto nomem;
778 put_child(t, tn, i/2, (struct node *)newn);
783 for (i = 0; i < olen; i += 2) {
784 struct tnode *newBinNode;
786 left = tnode_get_child(oldtnode, i);
787 right = tnode_get_child(oldtnode, i+1);
789 /* At least one of the children is empty */
790 if (left == NULL) {
791 if (right == NULL) /* Both are empty */
792 continue;
793 put_child(t, tn, i/2, right);
794 continue;
797 if (right == NULL) {
798 put_child(t, tn, i/2, left);
799 continue;
802 /* Two nonempty children */
803 newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
804 put_child(t, tn, i/2, NULL);
805 put_child(t, newBinNode, 0, left);
806 put_child(t, newBinNode, 1, right);
807 put_child(t, tn, i/2, resize(t, newBinNode));
809 tnode_free(oldtnode);
810 return tn;
811 nomem:
813 int size = tnode_child_length(tn);
814 int j;
816 for (j = 0; j < size; j++)
817 if (tn->child[j])
818 tnode_free((struct tnode *)tn->child[j]);
820 tnode_free(tn);
822 return ERR_PTR(-ENOMEM);
826 static void trie_init(struct trie *t)
828 if (!t)
829 return;
831 t->size = 0;
832 rcu_assign_pointer(t->trie, NULL);
833 t->revision = 0;
834 #ifdef CONFIG_IP_FIB_TRIE_STATS
835 memset(&t->stats, 0, sizeof(struct trie_use_stats));
836 #endif
839 /* readside most use rcu_read_lock currently dump routines
840 via get_fa_head and dump */
842 static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen)
844 struct hlist_node *node;
845 struct leaf_info *li;
847 hlist_for_each_entry_rcu(li, node, head, hlist)
848 if (li->plen == plen)
849 return li;
851 return NULL;
854 static inline struct list_head * get_fa_head(struct leaf *l, int plen)
856 struct leaf_info *li = find_leaf_info(&l->list, plen);
858 if (!li)
859 return NULL;
861 return &li->falh;
864 static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
866 struct leaf_info *li = NULL, *last = NULL;
867 struct hlist_node *node;
869 if (hlist_empty(head)) {
870 hlist_add_head_rcu(&new->hlist, head);
871 } else {
872 hlist_for_each_entry(li, node, head, hlist) {
873 if (new->plen > li->plen)
874 break;
876 last = li;
878 if (last)
879 hlist_add_after_rcu(&last->hlist, &new->hlist);
880 else
881 hlist_add_before_rcu(&new->hlist, &li->hlist);
885 /* rcu_read_lock needs to be hold by caller from readside */
887 static struct leaf *
888 fib_find_node(struct trie *t, u32 key)
890 int pos;
891 struct tnode *tn;
892 struct node *n;
894 pos = 0;
895 n = rcu_dereference(t->trie);
897 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
898 tn = (struct tnode *) n;
900 check_tnode(tn);
902 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
903 pos = tn->pos + tn->bits;
904 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
905 } else
906 break;
908 /* Case we have found a leaf. Compare prefixes */
910 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
911 return (struct leaf *)n;
913 return NULL;
916 static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
918 int wasfull;
919 t_key cindex, key;
920 struct tnode *tp = NULL;
922 key = tn->key;
924 while (tn != NULL && NODE_PARENT(tn) != NULL) {
926 tp = NODE_PARENT(tn);
927 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
928 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
929 tn = (struct tnode *) resize (t, (struct tnode *)tn);
930 tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
932 if (!NODE_PARENT(tn))
933 break;
935 tn = NODE_PARENT(tn);
937 /* Handle last (top) tnode */
938 if (IS_TNODE(tn))
939 tn = (struct tnode*) resize(t, (struct tnode *)tn);
941 return (struct node*) tn;
944 /* only used from updater-side */
946 static struct list_head *
947 fib_insert_node(struct trie *t, int *err, u32 key, int plen)
949 int pos, newpos;
950 struct tnode *tp = NULL, *tn = NULL;
951 struct node *n;
952 struct leaf *l;
953 int missbit;
954 struct list_head *fa_head = NULL;
955 struct leaf_info *li;
956 t_key cindex;
958 pos = 0;
959 n = t->trie;
961 /* If we point to NULL, stop. Either the tree is empty and we should
962 * just put a new leaf in if, or we have reached an empty child slot,
963 * and we should just put our new leaf in that.
964 * If we point to a T_TNODE, check if it matches our key. Note that
965 * a T_TNODE might be skipping any number of bits - its 'pos' need
966 * not be the parent's 'pos'+'bits'!
968 * If it does match the current key, get pos/bits from it, extract
969 * the index from our key, push the T_TNODE and walk the tree.
971 * If it doesn't, we have to replace it with a new T_TNODE.
973 * If we point to a T_LEAF, it might or might not have the same key
974 * as we do. If it does, just change the value, update the T_LEAF's
975 * value, and return it.
976 * If it doesn't, we need to replace it with a T_TNODE.
979 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
980 tn = (struct tnode *) n;
982 check_tnode(tn);
984 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
985 tp = tn;
986 pos = tn->pos + tn->bits;
987 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
989 BUG_ON(n && NODE_PARENT(n) != tn);
990 } else
991 break;
995 * n ----> NULL, LEAF or TNODE
997 * tp is n's (parent) ----> NULL or TNODE
1000 BUG_ON(tp && IS_LEAF(tp));
1002 /* Case 1: n is a leaf. Compare prefixes */
1004 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1005 struct leaf *l = (struct leaf *) n;
1007 li = leaf_info_new(plen);
1009 if (!li) {
1010 *err = -ENOMEM;
1011 goto err;
1014 fa_head = &li->falh;
1015 insert_leaf_info(&l->list, li);
1016 goto done;
1018 t->size++;
1019 l = leaf_new();
1021 if (!l) {
1022 *err = -ENOMEM;
1023 goto err;
1026 l->key = key;
1027 li = leaf_info_new(plen);
1029 if (!li) {
1030 tnode_free((struct tnode *) l);
1031 *err = -ENOMEM;
1032 goto err;
1035 fa_head = &li->falh;
1036 insert_leaf_info(&l->list, li);
1038 if (t->trie && n == NULL) {
1039 /* Case 2: n is NULL, and will just insert a new leaf */
1041 NODE_SET_PARENT(l, tp);
1043 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1044 put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1045 } else {
1046 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1048 * Add a new tnode here
1049 * first tnode need some special handling
1052 if (tp)
1053 pos = tp->pos+tp->bits;
1054 else
1055 pos = 0;
1057 if (n) {
1058 newpos = tkey_mismatch(key, pos, n->key);
1059 tn = tnode_new(n->key, newpos, 1);
1060 } else {
1061 newpos = 0;
1062 tn = tnode_new(key, newpos, 1); /* First tnode */
1065 if (!tn) {
1066 free_leaf_info(li);
1067 tnode_free((struct tnode *) l);
1068 *err = -ENOMEM;
1069 goto err;
1072 NODE_SET_PARENT(tn, tp);
1074 missbit = tkey_extract_bits(key, newpos, 1);
1075 put_child(t, tn, missbit, (struct node *)l);
1076 put_child(t, tn, 1-missbit, n);
1078 if (tp) {
1079 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1080 put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
1081 } else {
1082 rcu_assign_pointer(t->trie, (struct node *)tn); /* First tnode */
1083 tp = tn;
1087 if (tp && tp->pos + tp->bits > 32)
1088 printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1089 tp, tp->pos, tp->bits, key, plen);
1091 /* Rebalance the trie */
1093 rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1094 done:
1095 t->revision++;
1096 err:
1097 return fa_head;
1100 static int
1101 fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1102 struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1104 struct trie *t = (struct trie *) tb->tb_data;
1105 struct fib_alias *fa, *new_fa;
1106 struct list_head *fa_head = NULL;
1107 struct fib_info *fi;
1108 int plen = r->rtm_dst_len;
1109 int type = r->rtm_type;
1110 u8 tos = r->rtm_tos;
1111 u32 key, mask;
1112 int err;
1113 struct leaf *l;
1115 if (plen > 32)
1116 return -EINVAL;
1118 key = 0;
1119 if (rta->rta_dst)
1120 memcpy(&key, rta->rta_dst, 4);
1122 key = ntohl(key);
1124 pr_debug("Insert table=%d %08x/%d\n", tb->tb_id, key, plen);
1126 mask = ntohl(inet_make_mask(plen));
1128 if (key & ~mask)
1129 return -EINVAL;
1131 key = key & mask;
1133 fi = fib_create_info(r, rta, nlhdr, &err);
1135 if (!fi)
1136 goto err;
1138 l = fib_find_node(t, key);
1139 fa = NULL;
1141 if (l) {
1142 fa_head = get_fa_head(l, plen);
1143 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1146 /* Now fa, if non-NULL, points to the first fib alias
1147 * with the same keys [prefix,tos,priority], if such key already
1148 * exists or to the node before which we will insert new one.
1150 * If fa is NULL, we will need to allocate a new one and
1151 * insert to the head of f.
1153 * If f is NULL, no fib node matched the destination key
1154 * and we need to allocate a new one of those as well.
1157 if (fa && fa->fa_info->fib_priority == fi->fib_priority) {
1158 struct fib_alias *fa_orig;
1160 err = -EEXIST;
1161 if (nlhdr->nlmsg_flags & NLM_F_EXCL)
1162 goto out;
1164 if (nlhdr->nlmsg_flags & NLM_F_REPLACE) {
1165 struct fib_info *fi_drop;
1166 u8 state;
1168 err = -ENOBUFS;
1169 new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
1170 if (new_fa == NULL)
1171 goto out;
1173 fi_drop = fa->fa_info;
1174 new_fa->fa_tos = fa->fa_tos;
1175 new_fa->fa_info = fi;
1176 new_fa->fa_type = type;
1177 new_fa->fa_scope = r->rtm_scope;
1178 state = fa->fa_state;
1179 new_fa->fa_state &= ~FA_S_ACCESSED;
1181 list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1182 alias_free_mem_rcu(fa);
1184 fib_release_info(fi_drop);
1185 if (state & FA_S_ACCESSED)
1186 rt_cache_flush(-1);
1188 goto succeeded;
1190 /* Error if we find a perfect match which
1191 * uses the same scope, type, and nexthop
1192 * information.
1194 fa_orig = fa;
1195 list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
1196 if (fa->fa_tos != tos)
1197 break;
1198 if (fa->fa_info->fib_priority != fi->fib_priority)
1199 break;
1200 if (fa->fa_type == type &&
1201 fa->fa_scope == r->rtm_scope &&
1202 fa->fa_info == fi) {
1203 goto out;
1206 if (!(nlhdr->nlmsg_flags & NLM_F_APPEND))
1207 fa = fa_orig;
1209 err = -ENOENT;
1210 if (!(nlhdr->nlmsg_flags & NLM_F_CREATE))
1211 goto out;
1213 err = -ENOBUFS;
1214 new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
1215 if (new_fa == NULL)
1216 goto out;
1218 new_fa->fa_info = fi;
1219 new_fa->fa_tos = tos;
1220 new_fa->fa_type = type;
1221 new_fa->fa_scope = r->rtm_scope;
1222 new_fa->fa_state = 0;
1224 * Insert new entry to the list.
1227 if (!fa_head) {
1228 fa_head = fib_insert_node(t, &err, key, plen);
1229 err = 0;
1230 if (err)
1231 goto out_free_new_fa;
1234 list_add_tail_rcu(&new_fa->fa_list,
1235 (fa ? &fa->fa_list : fa_head));
1237 rt_cache_flush(-1);
1238 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req);
1239 succeeded:
1240 return 0;
1242 out_free_new_fa:
1243 kmem_cache_free(fn_alias_kmem, new_fa);
1244 out:
1245 fib_release_info(fi);
1246 err:
1247 return err;
1251 /* should be clalled with rcu_read_lock */
1252 static inline int check_leaf(struct trie *t, struct leaf *l,
1253 t_key key, int *plen, const struct flowi *flp,
1254 struct fib_result *res)
1256 int err, i;
1257 t_key mask;
1258 struct leaf_info *li;
1259 struct hlist_head *hhead = &l->list;
1260 struct hlist_node *node;
1262 hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1263 i = li->plen;
1264 mask = ntohl(inet_make_mask(i));
1265 if (l->key != (key & mask))
1266 continue;
1268 if ((err = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) <= 0) {
1269 *plen = i;
1270 #ifdef CONFIG_IP_FIB_TRIE_STATS
1271 t->stats.semantic_match_passed++;
1272 #endif
1273 return err;
1275 #ifdef CONFIG_IP_FIB_TRIE_STATS
1276 t->stats.semantic_match_miss++;
1277 #endif
1279 return 1;
1282 static int
1283 fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1285 struct trie *t = (struct trie *) tb->tb_data;
1286 int plen, ret = 0;
1287 struct node *n;
1288 struct tnode *pn;
1289 int pos, bits;
1290 t_key key = ntohl(flp->fl4_dst);
1291 int chopped_off;
1292 t_key cindex = 0;
1293 int current_prefix_length = KEYLENGTH;
1294 struct tnode *cn;
1295 t_key node_prefix, key_prefix, pref_mismatch;
1296 int mp;
1298 rcu_read_lock();
1300 n = rcu_dereference(t->trie);
1301 if (!n)
1302 goto failed;
1304 #ifdef CONFIG_IP_FIB_TRIE_STATS
1305 t->stats.gets++;
1306 #endif
1308 /* Just a leaf? */
1309 if (IS_LEAF(n)) {
1310 if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1311 goto found;
1312 goto failed;
1314 pn = (struct tnode *) n;
1315 chopped_off = 0;
1317 while (pn) {
1318 pos = pn->pos;
1319 bits = pn->bits;
1321 if (!chopped_off)
1322 cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits);
1324 n = tnode_get_child(pn, cindex);
1326 if (n == NULL) {
1327 #ifdef CONFIG_IP_FIB_TRIE_STATS
1328 t->stats.null_node_hit++;
1329 #endif
1330 goto backtrace;
1333 if (IS_LEAF(n)) {
1334 if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1335 goto found;
1336 else
1337 goto backtrace;
1340 #define HL_OPTIMIZE
1341 #ifdef HL_OPTIMIZE
1342 cn = (struct tnode *)n;
1345 * It's a tnode, and we can do some extra checks here if we
1346 * like, to avoid descending into a dead-end branch.
1347 * This tnode is in the parent's child array at index
1348 * key[p_pos..p_pos+p_bits] but potentially with some bits
1349 * chopped off, so in reality the index may be just a
1350 * subprefix, padded with zero at the end.
1351 * We can also take a look at any skipped bits in this
1352 * tnode - everything up to p_pos is supposed to be ok,
1353 * and the non-chopped bits of the index (se previous
1354 * paragraph) are also guaranteed ok, but the rest is
1355 * considered unknown.
1357 * The skipped bits are key[pos+bits..cn->pos].
1360 /* If current_prefix_length < pos+bits, we are already doing
1361 * actual prefix matching, which means everything from
1362 * pos+(bits-chopped_off) onward must be zero along some
1363 * branch of this subtree - otherwise there is *no* valid
1364 * prefix present. Here we can only check the skipped
1365 * bits. Remember, since we have already indexed into the
1366 * parent's child array, we know that the bits we chopped of
1367 * *are* zero.
1370 /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
1372 if (current_prefix_length < pos+bits) {
1373 if (tkey_extract_bits(cn->key, current_prefix_length,
1374 cn->pos - current_prefix_length) != 0 ||
1375 !(cn->child[0]))
1376 goto backtrace;
1380 * If chopped_off=0, the index is fully validated and we
1381 * only need to look at the skipped bits for this, the new,
1382 * tnode. What we actually want to do is to find out if
1383 * these skipped bits match our key perfectly, or if we will
1384 * have to count on finding a matching prefix further down,
1385 * because if we do, we would like to have some way of
1386 * verifying the existence of such a prefix at this point.
1389 /* The only thing we can do at this point is to verify that
1390 * any such matching prefix can indeed be a prefix to our
1391 * key, and if the bits in the node we are inspecting that
1392 * do not match our key are not ZERO, this cannot be true.
1393 * Thus, find out where there is a mismatch (before cn->pos)
1394 * and verify that all the mismatching bits are zero in the
1395 * new tnode's key.
1398 /* Note: We aren't very concerned about the piece of the key
1399 * that precede pn->pos+pn->bits, since these have already been
1400 * checked. The bits after cn->pos aren't checked since these are
1401 * by definition "unknown" at this point. Thus, what we want to
1402 * see is if we are about to enter the "prefix matching" state,
1403 * and in that case verify that the skipped bits that will prevail
1404 * throughout this subtree are zero, as they have to be if we are
1405 * to find a matching prefix.
1408 node_prefix = MASK_PFX(cn->key, cn->pos);
1409 key_prefix = MASK_PFX(key, cn->pos);
1410 pref_mismatch = key_prefix^node_prefix;
1411 mp = 0;
1413 /* In short: If skipped bits in this node do not match the search
1414 * key, enter the "prefix matching" state.directly.
1416 if (pref_mismatch) {
1417 while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1418 mp++;
1419 pref_mismatch = pref_mismatch <<1;
1421 key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1423 if (key_prefix != 0)
1424 goto backtrace;
1426 if (current_prefix_length >= cn->pos)
1427 current_prefix_length = mp;
1429 #endif
1430 pn = (struct tnode *)n; /* Descend */
1431 chopped_off = 0;
1432 continue;
1434 backtrace:
1435 chopped_off++;
1437 /* As zero don't change the child key (cindex) */
1438 while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1))))
1439 chopped_off++;
1441 /* Decrease current_... with bits chopped off */
1442 if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1443 current_prefix_length = pn->pos + pn->bits - chopped_off;
1446 * Either we do the actual chop off according or if we have
1447 * chopped off all bits in this tnode walk up to our parent.
1450 if (chopped_off <= pn->bits) {
1451 cindex &= ~(1 << (chopped_off-1));
1452 } else {
1453 if (NODE_PARENT(pn) == NULL)
1454 goto failed;
1456 /* Get Child's index */
1457 cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits);
1458 pn = NODE_PARENT(pn);
1459 chopped_off = 0;
1461 #ifdef CONFIG_IP_FIB_TRIE_STATS
1462 t->stats.backtrack++;
1463 #endif
1464 goto backtrace;
1467 failed:
1468 ret = 1;
1469 found:
1470 rcu_read_unlock();
1471 return ret;
1474 /* only called from updater side */
1475 static int trie_leaf_remove(struct trie *t, t_key key)
1477 t_key cindex;
1478 struct tnode *tp = NULL;
1479 struct node *n = t->trie;
1480 struct leaf *l;
1482 pr_debug("entering trie_leaf_remove(%p)\n", n);
1484 /* Note that in the case skipped bits, those bits are *not* checked!
1485 * When we finish this, we will have NULL or a T_LEAF, and the
1486 * T_LEAF may or may not match our key.
1489 while (n != NULL && IS_TNODE(n)) {
1490 struct tnode *tn = (struct tnode *) n;
1491 check_tnode(tn);
1492 n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));
1494 BUG_ON(n && NODE_PARENT(n) != tn);
1496 l = (struct leaf *) n;
1498 if (!n || !tkey_equals(l->key, key))
1499 return 0;
1502 * Key found.
1503 * Remove the leaf and rebalance the tree
1506 t->revision++;
1507 t->size--;
1509 preempt_disable();
1510 tp = NODE_PARENT(n);
1511 tnode_free((struct tnode *) n);
1513 if (tp) {
1514 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1515 put_child(t, (struct tnode *)tp, cindex, NULL);
1516 rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1517 } else
1518 rcu_assign_pointer(t->trie, NULL);
1519 preempt_enable();
1521 return 1;
1524 static int
1525 fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1526 struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1528 struct trie *t = (struct trie *) tb->tb_data;
1529 u32 key, mask;
1530 int plen = r->rtm_dst_len;
1531 u8 tos = r->rtm_tos;
1532 struct fib_alias *fa, *fa_to_delete;
1533 struct list_head *fa_head;
1534 struct leaf *l;
1535 struct leaf_info *li;
1538 if (plen > 32)
1539 return -EINVAL;
1541 key = 0;
1542 if (rta->rta_dst)
1543 memcpy(&key, rta->rta_dst, 4);
1545 key = ntohl(key);
1546 mask = ntohl(inet_make_mask(plen));
1548 if (key & ~mask)
1549 return -EINVAL;
1551 key = key & mask;
1552 l = fib_find_node(t, key);
1554 if (!l)
1555 return -ESRCH;
1557 fa_head = get_fa_head(l, plen);
1558 fa = fib_find_alias(fa_head, tos, 0);
1560 if (!fa)
1561 return -ESRCH;
1563 pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1565 fa_to_delete = NULL;
1566 fa_head = fa->fa_list.prev;
1568 list_for_each_entry(fa, fa_head, fa_list) {
1569 struct fib_info *fi = fa->fa_info;
1571 if (fa->fa_tos != tos)
1572 break;
1574 if ((!r->rtm_type ||
1575 fa->fa_type == r->rtm_type) &&
1576 (r->rtm_scope == RT_SCOPE_NOWHERE ||
1577 fa->fa_scope == r->rtm_scope) &&
1578 (!r->rtm_protocol ||
1579 fi->fib_protocol == r->rtm_protocol) &&
1580 fib_nh_match(r, nlhdr, rta, fi) == 0) {
1581 fa_to_delete = fa;
1582 break;
1586 if (!fa_to_delete)
1587 return -ESRCH;
1589 fa = fa_to_delete;
1590 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req);
1592 l = fib_find_node(t, key);
1593 li = find_leaf_info(&l->list, plen);
1595 list_del_rcu(&fa->fa_list);
1597 if (list_empty(fa_head)) {
1598 hlist_del_rcu(&li->hlist);
1599 free_leaf_info(li);
1602 if (hlist_empty(&l->list))
1603 trie_leaf_remove(t, key);
1605 if (fa->fa_state & FA_S_ACCESSED)
1606 rt_cache_flush(-1);
1608 fib_release_info(fa->fa_info);
1609 alias_free_mem_rcu(fa);
1610 return 0;
1613 static int trie_flush_list(struct trie *t, struct list_head *head)
1615 struct fib_alias *fa, *fa_node;
1616 int found = 0;
1618 list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1619 struct fib_info *fi = fa->fa_info;
1621 if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1622 list_del_rcu(&fa->fa_list);
1623 fib_release_info(fa->fa_info);
1624 alias_free_mem_rcu(fa);
1625 found++;
1628 return found;
1631 static int trie_flush_leaf(struct trie *t, struct leaf *l)
1633 int found = 0;
1634 struct hlist_head *lih = &l->list;
1635 struct hlist_node *node, *tmp;
1636 struct leaf_info *li = NULL;
1638 hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1639 found += trie_flush_list(t, &li->falh);
1641 if (list_empty(&li->falh)) {
1642 hlist_del_rcu(&li->hlist);
1643 free_leaf_info(li);
1646 return found;
1649 /* rcu_read_lock needs to be hold by caller from readside */
1651 static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1653 struct node *c = (struct node *) thisleaf;
1654 struct tnode *p;
1655 int idx;
1656 struct node *trie = rcu_dereference(t->trie);
1658 if (c == NULL) {
1659 if (trie == NULL)
1660 return NULL;
1662 if (IS_LEAF(trie)) /* trie w. just a leaf */
1663 return (struct leaf *) trie;
1665 p = (struct tnode*) trie; /* Start */
1666 } else
1667 p = (struct tnode *) NODE_PARENT(c);
1669 while (p) {
1670 int pos, last;
1672 /* Find the next child of the parent */
1673 if (c)
1674 pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
1675 else
1676 pos = 0;
1678 last = 1 << p->bits;
1679 for (idx = pos; idx < last ; idx++) {
1680 c = rcu_dereference(p->child[idx]);
1682 if (!c)
1683 continue;
1685 /* Decend if tnode */
1686 while (IS_TNODE(c)) {
1687 p = (struct tnode *) c;
1688 idx = 0;
1690 /* Rightmost non-NULL branch */
1691 if (p && IS_TNODE(p))
1692 while (!(c = rcu_dereference(p->child[idx]))
1693 && idx < (1<<p->bits)) idx++;
1695 /* Done with this tnode? */
1696 if (idx >= (1 << p->bits) || !c)
1697 goto up;
1699 return (struct leaf *) c;
1702 /* No more children go up one step */
1703 c = (struct node *) p;
1704 p = (struct tnode *) NODE_PARENT(p);
1706 return NULL; /* Ready. Root of trie */
1709 static int fn_trie_flush(struct fib_table *tb)
1711 struct trie *t = (struct trie *) tb->tb_data;
1712 struct leaf *ll = NULL, *l = NULL;
1713 int found = 0, h;
1715 t->revision++;
1717 rcu_read_lock();
1718 for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1719 found += trie_flush_leaf(t, l);
1721 if (ll && hlist_empty(&ll->list))
1722 trie_leaf_remove(t, ll->key);
1723 ll = l;
1725 rcu_read_unlock();
1727 if (ll && hlist_empty(&ll->list))
1728 trie_leaf_remove(t, ll->key);
1730 pr_debug("trie_flush found=%d\n", found);
1731 return found;
1734 static int trie_last_dflt = -1;
1736 static void
1737 fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1739 struct trie *t = (struct trie *) tb->tb_data;
1740 int order, last_idx;
1741 struct fib_info *fi = NULL;
1742 struct fib_info *last_resort;
1743 struct fib_alias *fa = NULL;
1744 struct list_head *fa_head;
1745 struct leaf *l;
1747 last_idx = -1;
1748 last_resort = NULL;
1749 order = -1;
1751 rcu_read_lock();
1753 l = fib_find_node(t, 0);
1754 if (!l)
1755 goto out;
1757 fa_head = get_fa_head(l, 0);
1758 if (!fa_head)
1759 goto out;
1761 if (list_empty(fa_head))
1762 goto out;
1764 list_for_each_entry_rcu(fa, fa_head, fa_list) {
1765 struct fib_info *next_fi = fa->fa_info;
1767 if (fa->fa_scope != res->scope ||
1768 fa->fa_type != RTN_UNICAST)
1769 continue;
1771 if (next_fi->fib_priority > res->fi->fib_priority)
1772 break;
1773 if (!next_fi->fib_nh[0].nh_gw ||
1774 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1775 continue;
1776 fa->fa_state |= FA_S_ACCESSED;
1778 if (fi == NULL) {
1779 if (next_fi != res->fi)
1780 break;
1781 } else if (!fib_detect_death(fi, order, &last_resort,
1782 &last_idx, &trie_last_dflt)) {
1783 if (res->fi)
1784 fib_info_put(res->fi);
1785 res->fi = fi;
1786 atomic_inc(&fi->fib_clntref);
1787 trie_last_dflt = order;
1788 goto out;
1790 fi = next_fi;
1791 order++;
1793 if (order <= 0 || fi == NULL) {
1794 trie_last_dflt = -1;
1795 goto out;
1798 if (!fib_detect_death(fi, order, &last_resort, &last_idx, &trie_last_dflt)) {
1799 if (res->fi)
1800 fib_info_put(res->fi);
1801 res->fi = fi;
1802 atomic_inc(&fi->fib_clntref);
1803 trie_last_dflt = order;
1804 goto out;
1806 if (last_idx >= 0) {
1807 if (res->fi)
1808 fib_info_put(res->fi);
1809 res->fi = last_resort;
1810 if (last_resort)
1811 atomic_inc(&last_resort->fib_clntref);
1813 trie_last_dflt = last_idx;
1814 out:;
1815 rcu_read_unlock();
1818 static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
1819 struct sk_buff *skb, struct netlink_callback *cb)
1821 int i, s_i;
1822 struct fib_alias *fa;
1824 u32 xkey = htonl(key);
1826 s_i = cb->args[3];
1827 i = 0;
1829 /* rcu_read_lock is hold by caller */
1831 list_for_each_entry_rcu(fa, fah, fa_list) {
1832 if (i < s_i) {
1833 i++;
1834 continue;
1836 if (fa->fa_info->fib_nh == NULL) {
1837 printk("Trie error _fib_nh=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
1838 i++;
1839 continue;
1841 if (fa->fa_info == NULL) {
1842 printk("Trie error fa_info=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
1843 i++;
1844 continue;
1847 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1848 cb->nlh->nlmsg_seq,
1849 RTM_NEWROUTE,
1850 tb->tb_id,
1851 fa->fa_type,
1852 fa->fa_scope,
1853 &xkey,
1854 plen,
1855 fa->fa_tos,
1856 fa->fa_info, 0) < 0) {
1857 cb->args[3] = i;
1858 return -1;
1860 i++;
1862 cb->args[3] = i;
1863 return skb->len;
1866 static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
1867 struct netlink_callback *cb)
1869 int h, s_h;
1870 struct list_head *fa_head;
1871 struct leaf *l = NULL;
1873 s_h = cb->args[2];
1875 for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1876 if (h < s_h)
1877 continue;
1878 if (h > s_h)
1879 memset(&cb->args[3], 0,
1880 sizeof(cb->args) - 3*sizeof(cb->args[0]));
1882 fa_head = get_fa_head(l, plen);
1884 if (!fa_head)
1885 continue;
1887 if (list_empty(fa_head))
1888 continue;
1890 if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
1891 cb->args[2] = h;
1892 return -1;
1895 cb->args[2] = h;
1896 return skb->len;
1899 static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
1901 int m, s_m;
1902 struct trie *t = (struct trie *) tb->tb_data;
1904 s_m = cb->args[1];
1906 rcu_read_lock();
1907 for (m = 0; m <= 32; m++) {
1908 if (m < s_m)
1909 continue;
1910 if (m > s_m)
1911 memset(&cb->args[2], 0,
1912 sizeof(cb->args) - 2*sizeof(cb->args[0]));
1914 if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
1915 cb->args[1] = m;
1916 goto out;
1919 rcu_read_unlock();
1920 cb->args[1] = m;
1921 return skb->len;
1922 out:
1923 rcu_read_unlock();
1924 return -1;
1927 /* Fix more generic FIB names for init later */
1929 #ifdef CONFIG_IP_MULTIPLE_TABLES
1930 struct fib_table * fib_hash_init(int id)
1931 #else
1932 struct fib_table * __init fib_hash_init(int id)
1933 #endif
1935 struct fib_table *tb;
1936 struct trie *t;
1938 if (fn_alias_kmem == NULL)
1939 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1940 sizeof(struct fib_alias),
1941 0, SLAB_HWCACHE_ALIGN,
1942 NULL, NULL);
1944 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1945 GFP_KERNEL);
1946 if (tb == NULL)
1947 return NULL;
1949 tb->tb_id = id;
1950 tb->tb_lookup = fn_trie_lookup;
1951 tb->tb_insert = fn_trie_insert;
1952 tb->tb_delete = fn_trie_delete;
1953 tb->tb_flush = fn_trie_flush;
1954 tb->tb_select_default = fn_trie_select_default;
1955 tb->tb_dump = fn_trie_dump;
1956 memset(tb->tb_data, 0, sizeof(struct trie));
1958 t = (struct trie *) tb->tb_data;
1960 trie_init(t);
1962 if (id == RT_TABLE_LOCAL)
1963 trie_local = t;
1964 else if (id == RT_TABLE_MAIN)
1965 trie_main = t;
1967 if (id == RT_TABLE_LOCAL)
1968 printk("IPv4 FIB: Using LC-trie version %s\n", VERSION);
1970 return tb;
1973 #ifdef CONFIG_PROC_FS
/*
 * Depth first trie walk iterator.
 * Records where fib_trie_get_next() should continue.
 */
struct fib_trie_iter {
	struct tnode *tnode;	/* tnode whose children are being scanned */
	struct trie *trie;	/* trie being walked */
	unsigned int index;	/* next child index to look at in tnode */
	unsigned int depth;	/* current depth below the root tnode */
};
1982 static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
1984 struct tnode *tn = iter->tnode;
1985 unsigned cindex = iter->index;
1986 struct tnode *p;
1988 pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
1989 iter->tnode, iter->index, iter->depth);
1990 rescan:
1991 while (cindex < (1<<tn->bits)) {
1992 struct node *n = tnode_get_child(tn, cindex);
1994 if (n) {
1995 if (IS_LEAF(n)) {
1996 iter->tnode = tn;
1997 iter->index = cindex + 1;
1998 } else {
1999 /* push down one level */
2000 iter->tnode = (struct tnode *) n;
2001 iter->index = 0;
2002 ++iter->depth;
2004 return n;
2007 ++cindex;
2010 /* Current node exhausted, pop back up */
2011 p = NODE_PARENT(tn);
2012 if (p) {
2013 cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2014 tn = p;
2015 --iter->depth;
2016 goto rescan;
2019 /* got root? */
2020 return NULL;
2023 static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2024 struct trie *t)
2026 struct node *n = rcu_dereference(t->trie);
2028 if (n && IS_TNODE(n)) {
2029 iter->tnode = (struct tnode *) n;
2030 iter->trie = t;
2031 iter->index = 0;
2032 iter->depth = 0;
2033 return n;
2035 return NULL;
2038 static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2040 struct node *n;
2041 struct fib_trie_iter iter;
2043 memset(s, 0, sizeof(*s));
2045 rcu_read_lock();
2046 for (n = fib_trie_get_first(&iter, t); n;
2047 n = fib_trie_get_next(&iter)) {
2048 if (IS_LEAF(n)) {
2049 s->leaves++;
2050 s->totdepth += iter.depth;
2051 if (iter.depth > s->maxdepth)
2052 s->maxdepth = iter.depth;
2053 } else {
2054 const struct tnode *tn = (const struct tnode *) n;
2055 int i;
2057 s->tnodes++;
2058 s->nodesizes[tn->bits]++;
2059 for (i = 0; i < (1<<tn->bits); i++)
2060 if (!tn->child[i])
2061 s->nullpointers++;
2064 rcu_read_unlock();
2068 * This outputs /proc/net/fib_triestats
2070 static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2072 unsigned i, max, pointers, bytes, avdepth;
2074 if (stat->leaves)
2075 avdepth = stat->totdepth*100 / stat->leaves;
2076 else
2077 avdepth = 0;
2079 seq_printf(seq, "\tAver depth: %d.%02d\n", avdepth / 100, avdepth % 100 );
2080 seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
2082 seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
2084 bytes = sizeof(struct leaf) * stat->leaves;
2085 seq_printf(seq, "\tInternal nodes: %d\n\t", stat->tnodes);
2086 bytes += sizeof(struct tnode) * stat->tnodes;
2088 max = MAX_CHILDS-1;
2089 while (max >= 0 && stat->nodesizes[max] == 0)
2090 max--;
2092 pointers = 0;
2093 for (i = 1; i <= max; i++)
2094 if (stat->nodesizes[i] != 0) {
2095 seq_printf(seq, " %d: %d", i, stat->nodesizes[i]);
2096 pointers += (1<<i) * stat->nodesizes[i];
2098 seq_putc(seq, '\n');
2099 seq_printf(seq, "\tPointers: %d\n", pointers);
2101 bytes += sizeof(struct node *) * pointers;
2102 seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers);
2103 seq_printf(seq, "Total size: %d kB\n", (bytes + 1023) / 1024);
2105 #ifdef CONFIG_IP_FIB_TRIE_STATS
2106 seq_printf(seq, "Counters:\n---------\n");
2107 seq_printf(seq,"gets = %d\n", t->stats.gets);
2108 seq_printf(seq,"backtracks = %d\n", t->stats.backtrack);
2109 seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
2110 seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
2111 seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
2112 seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped);
2113 #ifdef CLEAR_STATS
2114 memset(&(t->stats), 0, sizeof(t->stats));
2115 #endif
2116 #endif /* CONFIG_IP_FIB_TRIE_STATS */
2119 static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2121 struct trie_stat *stat;
2123 stat = kmalloc(sizeof(*stat), GFP_KERNEL);
2124 if (!stat)
2125 return -ENOMEM;
2127 seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
2128 sizeof(struct leaf), sizeof(struct tnode));
2130 if (trie_local) {
2131 seq_printf(seq, "Local:\n");
2132 trie_collect_stats(trie_local, stat);
2133 trie_show_stats(seq, stat);
2136 if (trie_main) {
2137 seq_printf(seq, "Main:\n");
2138 trie_collect_stats(trie_main, stat);
2139 trie_show_stats(seq, stat);
2141 kfree(stat);
2143 return 0;
2146 static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2148 return single_open(file, fib_triestat_seq_show, NULL);
2151 static struct file_operations fib_triestat_fops = {
2152 .owner = THIS_MODULE,
2153 .open = fib_triestat_seq_open,
2154 .read = seq_read,
2155 .llseek = seq_lseek,
2156 .release = single_release,
2159 static struct node *fib_trie_get_idx(struct fib_trie_iter *iter,
2160 loff_t pos)
2162 loff_t idx = 0;
2163 struct node *n;
2165 for (n = fib_trie_get_first(iter, trie_local);
2166 n; ++idx, n = fib_trie_get_next(iter)) {
2167 if (pos == idx)
2168 return n;
2171 for (n = fib_trie_get_first(iter, trie_main);
2172 n; ++idx, n = fib_trie_get_next(iter)) {
2173 if (pos == idx)
2174 return n;
2176 return NULL;
2179 static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2181 rcu_read_lock();
2182 if (*pos == 0)
2183 return SEQ_START_TOKEN;
2184 return fib_trie_get_idx(seq->private, *pos - 1);
2187 static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2189 struct fib_trie_iter *iter = seq->private;
2190 void *l = v;
2192 ++*pos;
2193 if (v == SEQ_START_TOKEN)
2194 return fib_trie_get_idx(iter, 0);
2196 v = fib_trie_get_next(iter);
2197 BUG_ON(v == l);
2198 if (v)
2199 return v;
2201 /* continue scan in next trie */
2202 if (iter->trie == trie_local)
2203 return fib_trie_get_first(iter, trie_main);
2205 return NULL;
/* seq_file stop: drop the RCU read lock taken in fib_trie_seq_start(). */
static void fib_trie_seq_stop(struct seq_file *seq, void *v)
{
	rcu_read_unlock();
}
/* Emit @n indentation units to @seq; nothing is written for n <= 0. */
static void seq_indent(struct seq_file *seq, int n)
{
	for (; n > 0; n--)
		seq_puts(seq, " ");
}
2218 static inline const char *rtn_scope(enum rt_scope_t s)
2220 static char buf[32];
2222 switch(s) {
2223 case RT_SCOPE_UNIVERSE: return "universe";
2224 case RT_SCOPE_SITE: return "site";
2225 case RT_SCOPE_LINK: return "link";
2226 case RT_SCOPE_HOST: return "host";
2227 case RT_SCOPE_NOWHERE: return "nowhere";
2228 default:
2229 snprintf(buf, sizeof(buf), "scope=%d", s);
2230 return buf;
2234 static const char *rtn_type_names[__RTN_MAX] = {
2235 [RTN_UNSPEC] = "UNSPEC",
2236 [RTN_UNICAST] = "UNICAST",
2237 [RTN_LOCAL] = "LOCAL",
2238 [RTN_BROADCAST] = "BROADCAST",
2239 [RTN_ANYCAST] = "ANYCAST",
2240 [RTN_MULTICAST] = "MULTICAST",
2241 [RTN_BLACKHOLE] = "BLACKHOLE",
2242 [RTN_UNREACHABLE] = "UNREACHABLE",
2243 [RTN_PROHIBIT] = "PROHIBIT",
2244 [RTN_THROW] = "THROW",
2245 [RTN_NAT] = "NAT",
2246 [RTN_XRESOLVE] = "XRESOLVE",
2249 static inline const char *rtn_type(unsigned t)
2251 static char buf[32];
2253 if (t < __RTN_MAX && rtn_type_names[t])
2254 return rtn_type_names[t];
2255 snprintf(buf, sizeof(buf), "type %d", t);
2256 return buf;
2259 /* Pretty print the trie */
2260 static int fib_trie_seq_show(struct seq_file *seq, void *v)
2262 const struct fib_trie_iter *iter = seq->private;
2263 struct node *n = v;
2265 if (v == SEQ_START_TOKEN)
2266 return 0;
2268 if (IS_TNODE(n)) {
2269 struct tnode *tn = (struct tnode *) n;
2270 t_key prf = ntohl(MASK_PFX(tn->key, tn->pos));
2272 if (!NODE_PARENT(n)) {
2273 if (iter->trie == trie_local)
2274 seq_puts(seq, "<local>:\n");
2275 else
2276 seq_puts(seq, "<main>:\n");
2277 } else {
2278 seq_indent(seq, iter->depth-1);
2279 seq_printf(seq, " +-- %d.%d.%d.%d/%d\n",
2280 NIPQUAD(prf), tn->pos);
2282 } else {
2283 struct leaf *l = (struct leaf *) n;
2284 int i;
2285 u32 val = ntohl(l->key);
2287 seq_indent(seq, iter->depth);
2288 seq_printf(seq, " |-- %d.%d.%d.%d\n", NIPQUAD(val));
2289 for (i = 32; i >= 0; i--) {
2290 struct leaf_info *li = find_leaf_info(&l->list, i);
2291 if (li) {
2292 struct fib_alias *fa;
2293 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2294 seq_indent(seq, iter->depth+1);
2295 seq_printf(seq, " /%d %s %s", i,
2296 rtn_scope(fa->fa_scope),
2297 rtn_type(fa->fa_type));
2298 if (fa->fa_tos)
2299 seq_printf(seq, "tos =%d\n",
2300 fa->fa_tos);
2301 seq_putc(seq, '\n');
2307 return 0;
2310 static struct seq_operations fib_trie_seq_ops = {
2311 .start = fib_trie_seq_start,
2312 .next = fib_trie_seq_next,
2313 .stop = fib_trie_seq_stop,
2314 .show = fib_trie_seq_show,
2317 static int fib_trie_seq_open(struct inode *inode, struct file *file)
2319 struct seq_file *seq;
2320 int rc = -ENOMEM;
2321 struct fib_trie_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
2323 if (!s)
2324 goto out;
2326 rc = seq_open(file, &fib_trie_seq_ops);
2327 if (rc)
2328 goto out_kfree;
2330 seq = file->private_data;
2331 seq->private = s;
2332 memset(s, 0, sizeof(*s));
2333 out:
2334 return rc;
2335 out_kfree:
2336 kfree(s);
2337 goto out;
2340 static struct file_operations fib_trie_fops = {
2341 .owner = THIS_MODULE,
2342 .open = fib_trie_seq_open,
2343 .read = seq_read,
2344 .llseek = seq_lseek,
2345 .release = seq_release_private,
2348 static unsigned fib_flag_trans(int type, u32 mask, const struct fib_info *fi)
2350 static unsigned type2flags[RTN_MAX + 1] = {
2351 [7] = RTF_REJECT, [8] = RTF_REJECT,
2353 unsigned flags = type2flags[type];
2355 if (fi && fi->fib_nh->nh_gw)
2356 flags |= RTF_GATEWAY;
2357 if (mask == 0xFFFFFFFF)
2358 flags |= RTF_HOST;
2359 flags |= RTF_UP;
2360 return flags;
2364 * This outputs /proc/net/route.
2365 * The format of the file is not supposed to be changed
2366 * and needs to be same as fib_hash output to avoid breaking
2367 * legacy utilities
2369 static int fib_route_seq_show(struct seq_file *seq, void *v)
2371 struct leaf *l = v;
2372 int i;
2373 char bf[128];
2375 if (v == SEQ_START_TOKEN) {
2376 seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2377 "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2378 "\tWindow\tIRTT");
2379 return 0;
2382 if (IS_TNODE(l))
2383 return 0;
2385 for (i=32; i>=0; i--) {
2386 struct leaf_info *li = find_leaf_info(&l->list, i);
2387 struct fib_alias *fa;
2388 u32 mask, prefix;
2390 if (!li)
2391 continue;
2393 mask = inet_make_mask(li->plen);
2394 prefix = htonl(l->key);
2396 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2397 const struct fib_info *fi = rcu_dereference(fa->fa_info);
2398 unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
2400 if (fa->fa_type == RTN_BROADCAST
2401 || fa->fa_type == RTN_MULTICAST)
2402 continue;
2404 if (fi)
2405 snprintf(bf, sizeof(bf),
2406 "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2407 fi->fib_dev ? fi->fib_dev->name : "*",
2408 prefix,
2409 fi->fib_nh->nh_gw, flags, 0, 0,
2410 fi->fib_priority,
2411 mask,
2412 (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
2413 fi->fib_window,
2414 fi->fib_rtt >> 3);
2415 else
2416 snprintf(bf, sizeof(bf),
2417 "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2418 prefix, 0, flags, 0, 0, 0,
2419 mask, 0, 0, 0);
2421 seq_printf(seq, "%-127s\n", bf);
2425 return 0;
2428 static struct seq_operations fib_route_seq_ops = {
2429 .start = fib_trie_seq_start,
2430 .next = fib_trie_seq_next,
2431 .stop = fib_trie_seq_stop,
2432 .show = fib_route_seq_show,
2435 static int fib_route_seq_open(struct inode *inode, struct file *file)
2437 struct seq_file *seq;
2438 int rc = -ENOMEM;
2439 struct fib_trie_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
2441 if (!s)
2442 goto out;
2444 rc = seq_open(file, &fib_route_seq_ops);
2445 if (rc)
2446 goto out_kfree;
2448 seq = file->private_data;
2449 seq->private = s;
2450 memset(s, 0, sizeof(*s));
2451 out:
2452 return rc;
2453 out_kfree:
2454 kfree(s);
2455 goto out;
2458 static struct file_operations fib_route_fops = {
2459 .owner = THIS_MODULE,
2460 .open = fib_route_seq_open,
2461 .read = seq_read,
2462 .llseek = seq_lseek,
2463 .release = seq_release_private,
2466 int __init fib_proc_init(void)
2468 if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_fops))
2469 goto out1;
2471 if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_fops))
2472 goto out2;
2474 if (!proc_net_fops_create("route", S_IRUGO, &fib_route_fops))
2475 goto out3;
2477 return 0;
2479 out3:
2480 proc_net_remove("fib_triestat");
2481 out2:
2482 proc_net_remove("fib_trie");
2483 out1:
2484 return -ENOMEM;
2487 void __init fib_proc_exit(void)
2489 proc_net_remove("fib_trie");
2490 proc_net_remove("fib_triestat");
2491 proc_net_remove("route");
2494 #endif /* CONFIG_PROC_FS */