net/ipv4/fib_trie.c

   1 /*
   2  *   This program is free software; you can redistribute it and/or
   3  *   modify it under the terms of the GNU General Public License
   4  *   as published by the Free Software Foundation; either version
   5  *   2 of the License, or (at your option) any later version.
   6  *
   7  *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
   8  *     & Swedish University of Agricultural Sciences.
   9  *
  10  *   Jens Laas <jens.laas@data.slu.se> Swedish University of
  11  *     Agricultural Sciences.
  12  *
  13  *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
  14  *
  15  * This work is based on the LPC-trie which is originally descibed in:
  16  *
  17  * An experimental study of compression methods for dynamic tries
  18  * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
  19  * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
  20  *
  21  *
  22  * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
  23  * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
  24  *
  25  * Version:     $Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
  26  *
  27  *
  28  * Code from fib_hash has been reused which includes the following header:
  29  *
  30  *
  31  * INET         An implementation of the TCP/IP protocol suite for the LINUX
  32  *              operating system.  INET is implemented using the  BSD Socket
  33  *              interface as the means of communication with the user level.
  34  *
  35  *              IPv4 FIB: lookup engine and maintenance routines.
  36  *
  37  *
  38  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  39  *
  40  *              This program is free software; you can redistribute it and/or
  41  *              modify it under the terms of the GNU General Public License
  42  *              as published by the Free Software Foundation; either version
  43  *              2 of the License, or (at your option) any later version.
  44  */
  45
  46 #define VERSION "0.323"
  47
  48 #include <linux/config.h>
  49 #include <asm/uaccess.h>
  50 #include <asm/system.h>
  51 #include <asm/bitops.h>
  52 #include <linux/types.h>
  53 #include <linux/kernel.h>
  54 #include <linux/sched.h>
  55 #include <linux/mm.h>
  56 #include <linux/string.h>
  57 #include <linux/socket.h>
  58 #include <linux/sockios.h>
  59 #include <linux/errno.h>
  60 #include <linux/in.h>
  61 #include <linux/inet.h>
  62 #include <linux/netdevice.h>
  63 #include <linux/if_arp.h>
  64 #include <linux/proc_fs.h>
  65 #include <linux/skbuff.h>
  66 #include <linux/netlink.h>
  67 #include <linux/init.h>
  68 #include <linux/list.h>
  69 #include <net/ip.h>
  70 #include <net/protocol.h>
  71 #include <net/route.h>
  72 #include <net/tcp.h>
  73 #include <net/sock.h>
  74 #include <net/ip_fib.h>
  75 #include "fib_lookup.h"
  76
  77 #undef CONFIG_IP_FIB_TRIE_STATS
  78 #define MAX_CHILDS 16384
  79
  80 #define EXTRACT(p, n, str) ((str)<<(p)>>(32-(n)))
  81 #define KEYLENGTH (8*sizeof(t_key))
  82 #define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l))
  83 #define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset))
  84
  85 static DEFINE_RWLOCK(fib_lock);
  86
  87 typedef unsigned int t_key;
  88
  89 #define T_TNODE 0
  90 #define T_LEAF  1
  91 #define NODE_TYPE_MASK  0x1UL
  92 #define NODE_PARENT(_node) \
  93 ((struct tnode *)((_node)->_parent & ~NODE_TYPE_MASK))
  94 #define NODE_SET_PARENT(_node, _ptr) \
  95 ((_node)->_parent = (((unsigned long)(_ptr)) | \
  96                      ((_node)->_parent & NODE_TYPE_MASK)))
  97 #define NODE_INIT_PARENT(_node, _type) \
  98 ((_node)->_parent = (_type))
  99 #define NODE_TYPE(_node) \
 100 ((_node)->_parent & NODE_TYPE_MASK)
 101
 102 #define IS_TNODE(n) (!(n->_parent & T_LEAF))
 103 #define IS_LEAF(n) (n->_parent & T_LEAF)
 104
 105 struct node {
 106         t_key key;
 107         unsigned long _parent;
 108 };
 109
 110 struct leaf {
 111         t_key key;
 112         unsigned long _parent;
 113         struct hlist_head list;
 114 };
 115
 116 struct leaf_info {
 117         struct hlist_node hlist;
 118         int plen;
 119         struct list_head falh;
 120 };
 121
 122 struct tnode {
 123         t_key key;
 124         unsigned long _parent;
 125         unsigned short pos:5;        /* 2log(KEYLENGTH) bits needed */
 126         unsigned short bits:5;       /* 2log(KEYLENGTH) bits needed */
 127         unsigned short full_children;  /* KEYLENGTH bits needed */
 128         unsigned short empty_children; /* KEYLENGTH bits needed */
 129         struct node *child[0];
 130 };
 131
 132 #ifdef CONFIG_IP_FIB_TRIE_STATS
 133 struct trie_use_stats {
 134         unsigned int gets;
 135         unsigned int backtrack;
 136         unsigned int semantic_match_passed;
 137         unsigned int semantic_match_miss;
 138         unsigned int null_node_hit;
 139 };
 140 #endif
 141
 142 struct trie_stat {
 143         unsigned int totdepth;
 144         unsigned int maxdepth;
 145         unsigned int tnodes;
 146         unsigned int leaves;
 147         unsigned int nullpointers;
 148         unsigned int nodesizes[MAX_CHILDS];
 149 };
 150
 151 struct trie {
 152         struct node *trie;
 153 #ifdef CONFIG_IP_FIB_TRIE_STATS
 154         struct trie_use_stats stats;
 155 #endif
 156         int size;
 157         unsigned int revision;
 158 };
 159
 160 static int trie_debug = 0;
 161
 162 static int tnode_full(struct tnode *tn, struct node *n);
 163 static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
 164 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
 165 static int tnode_child_length(struct tnode *tn);
 166 static struct node *resize(struct trie *t, struct tnode *tn);
 167 static struct tnode *inflate(struct trie *t, struct tnode *tn);
 168 static struct tnode *halve(struct trie *t, struct tnode *tn);
 169 static void tnode_free(struct tnode *tn);
 170 static void trie_dump_seq(struct seq_file *seq, struct trie *t);
 171 extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio);
 172 extern int fib_detect_death(struct fib_info *fi, int order,
 173                             struct fib_info **last_resort, int *last_idx, int *dflt);
 174
 175 extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa, int z, int tb_id,
 176                struct nlmsghdr *n, struct netlink_skb_parms *req);
 177
 178 static kmem_cache_t *fn_alias_kmem;
 179 static struct trie *trie_local = NULL, *trie_main = NULL;
 180
 181 static void trie_bug(char *err)
 182 {
 183         printk("Trie Bug: %s\n", err);
 184         BUG();
 185 }
 186
 187 static inline struct node *tnode_get_child(struct tnode *tn, int i)
 188 {
 189         if (i >=  1<<tn->bits)
 190                 trie_bug("tnode_get_child");
 191
 192         return tn->child[i];
 193 }
 194
 195 static inline int tnode_child_length(struct tnode *tn)
 196 {
 197         return 1<<tn->bits;
 198 }
 199
 200 /*
 201   _________________________________________________________________
 202   | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
 203   ----------------------------------------------------------------
 204     0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
 205
 206   _________________________________________________________________
 207   | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
 208   -----------------------------------------------------------------
 209    16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
 210
 211   tp->pos = 7
 212   tp->bits = 3
 213   n->pos = 15
 214   n->bits=4
 215   KEYLENGTH=32
 216 */
 217
 218 static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
 219 {
 220         if (offset < KEYLENGTH)
 221                 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
 222         else
 223                 return 0;
 224 }
 225
 226 static inline int tkey_equals(t_key a, t_key b)
 227 {
 228   return a == b;
 229 }
 230
 231 static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
 232 {
 233      if (bits == 0 || offset >= KEYLENGTH)
 234             return 1;
 235         bits = bits > KEYLENGTH ? KEYLENGTH : bits;
 236         return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
 237 }
 238
 239 static inline int tkey_mismatch(t_key a, int offset, t_key b)
 240 {
 241         t_key diff = a ^ b;
 242         int i = offset;
 243
 244         if(!diff)
 245           return 0;
 246         while((diff << i) >> (KEYLENGTH-1) == 0)
 247                 i++;
 248         return i;
 249 }
 250
 251 /* Candiate for fib_semantics */
 252
 253 static void fn_free_alias(struct fib_alias *fa)
 254 {
 255         fib_release_info(fa->fa_info);
 256         kmem_cache_free(fn_alias_kmem, fa);
 257 }
 258
 259 /*
 260   To understand this stuff, an understanding of keys and all their bits is
 261   necessary. Every node in the trie has a key associated with it, but not
 262   all of the bits in that key are significant.
 263
 264   Consider a node 'n' and its parent 'tp'.
 265
 266   If n is a leaf, every bit in its key is significant. Its presence is
 267   necessitaded by path compression, since during a tree traversal (when
 268   searching for a leaf - unless we are doing an insertion) we will completely
 269   ignore all skipped bits we encounter. Thus we need to verify, at the end of
 270   a potentially successful search, that we have indeed been walking the
 271   correct key path.
 272
 273   Note that we can never "miss" the correct key in the tree if present by
 274   following the wrong path. Path compression ensures that segments of the key
 275   that are the same for all keys with a given prefix are skipped, but the
 276   skipped part *is* identical for each node in the subtrie below the skipped
 277   bit! trie_insert() in this implementation takes care of that - note the
 278   call to tkey_sub_equals() in trie_insert().
 279
 280   if n is an internal node - a 'tnode' here, the various parts of its key
 281   have many different meanings.
 282
 283   Example:
 284   _________________________________________________________________
 285   | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
 286   -----------------------------------------------------------------
 287     0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
 288
 289   _________________________________________________________________
 290   | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
 291   -----------------------------------------------------------------
 292    16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
 293
 294   tp->pos = 7
 295   tp->bits = 3
 296   n->pos = 15
 297   n->bits=4
 298
 299   First, let's just ignore the bits that come before the parent tp, that is
 300   the bits from 0 to (tp->pos-1). They are *known* but at this point we do
 301   not use them for anything.
 302
 303   The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
 304   index into the parent's child array. That is, they will be used to find
 305   'n' among tp's children.
 306
 307   The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
 308   for the node n.
 309
 310   All the bits we have seen so far are significant to the node n. The rest
 311   of the bits are really not needed or indeed known in n->key.
 312
 313   The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
 314   n's child array, and will of course be different for each child.
 315
 316   The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
 317   at this point.
 318
 319 */
 320
 321 static void check_tnode(struct tnode *tn)
 322 {
 323         if(tn && tn->pos+tn->bits > 32) {
 324                 printk("TNODE ERROR tn=%p, pos=%d, bits=%d\n", tn, tn->pos, tn->bits);
 325         }
 326 }
 327
 328 static int halve_threshold = 25;
 329 static int inflate_threshold = 50;
 330
 331 static struct leaf *leaf_new(void)
 332 {
 333         struct leaf *l = kmalloc(sizeof(struct leaf),  GFP_KERNEL);
 334         if(l) {
 335                 NODE_INIT_PARENT(l, T_LEAF);
 336                 INIT_HLIST_HEAD(&l->list);
 337         }
 338         return l;
 339 }
 340
 341 static struct leaf_info *leaf_info_new(int plen)
 342 {
 343         struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
 344         li->plen = plen;
 345         INIT_LIST_HEAD(&li->falh);
 346         return li;
 347 }
 348
 349 static inline void free_leaf(struct leaf *l)
 350 {
 351         kfree(l);
 352 }
 353
 354 static inline void free_leaf_info(struct leaf_info *li)
 355 {
 356         kfree(li);
 357 }
 358
 359 static struct tnode* tnode_new(t_key key, int pos, int bits)
 360 {
 361         int nchildren = 1<<bits;
 362         int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
 363         struct tnode *tn = kmalloc(sz,  GFP_KERNEL);
 364
 365         if(tn)  {
 366                 memset(tn, 0, sz);
 367                 NODE_INIT_PARENT(tn, T_TNODE);
 368                 tn->pos = pos;
 369                 tn->bits = bits;
 370                 tn->key = key;
 371                 tn->full_children = 0;
 372                 tn->empty_children = 1<<bits;
 373         }
 374         if(trie_debug > 0)
 375                 printk("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
 376                        (unsigned int) (sizeof(struct node) * 1<<bits));
 377         return tn;
 378 }
 379
 380 static void tnode_free(struct tnode *tn)
 381 {
 382         if(!tn) {
 383                 trie_bug("tnode_free\n");
 384         }
 385         if(IS_LEAF(tn)) {
 386                 free_leaf((struct leaf *)tn);
 387                 if(trie_debug > 0 )
 388                         printk("FL %p \n", tn);
 389         }
 390         else if(IS_TNODE(tn)) {
 391                 kfree(tn);
 392                 if(trie_debug > 0 )
 393                         printk("FT %p \n", tn);
 394         }
 395         else {
 396                 trie_bug("tnode_free\n");
 397         }
 398 }
 399
 400 /*
 401  * Check whether a tnode 'n' is "full", i.e. it is an internal node
 402  * and no bits are skipped. See discussion in dyntree paper p. 6
 403  */
 404
 405 static inline int tnode_full(struct tnode *tn, struct node *n)
 406 {
 407         if(n == NULL || IS_LEAF(n))
 408                 return 0;
 409
 410         return ((struct tnode *) n)->pos == tn->pos + tn->bits;
 411 }
 412
 413 static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
 414 {
 415         tnode_put_child_reorg(tn, i, n, -1);
 416 }
 417
 418  /*
 419   * Add a child at position i overwriting the old value.
 420   * Update the value of full_children and empty_children.
 421   */
 422
 423 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
 424 {
 425         struct node *chi;
 426         int isfull;
 427
 428         if(i >=  1<<tn->bits) {
 429                 printk("bits=%d, i=%d\n", tn->bits, i);
 430                 trie_bug("tnode_put_child_reorg bits");
 431         }
 432         write_lock_bh(&fib_lock);
 433         chi = tn->child[i];
 434
 435         /* update emptyChildren */
 436         if (n == NULL && chi != NULL)
 437                 tn->empty_children++;
 438         else if (n != NULL && chi == NULL)
 439                 tn->empty_children--;
 440
 441         /* update fullChildren */
 442         if (wasfull == -1)
 443                 wasfull = tnode_full(tn, chi);
 444
 445         isfull = tnode_full(tn, n);
 446         if (wasfull && !isfull)
 447                 tn->full_children--;
 448
 449         else if (!wasfull && isfull)
 450                 tn->full_children++;
 451         if(n)
 452                 NODE_SET_PARENT(n, tn);
 453
 454         tn->child[i] = n;
 455         write_unlock_bh(&fib_lock);
 456 }
 457
 458 static struct node *resize(struct trie *t, struct tnode *tn)
 459 {
 460         int i;
 461
 462         if (!tn)
 463                 return NULL;
 464
 465         if(trie_debug)
 466                 printk("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
 467                       tn, inflate_threshold, halve_threshold);
 468
 469         /* No children */
 470         if (tn->empty_children == tnode_child_length(tn)) {
 471                 tnode_free(tn);
 472                 return NULL;
 473         }
 474         /* One child */
 475         if (tn->empty_children == tnode_child_length(tn) - 1)
 476                 for (i = 0; i < tnode_child_length(tn); i++) {
 477
 478                         write_lock_bh(&fib_lock);
 479                         if (tn->child[i] != NULL) {
 480
 481                                 /* compress one level */
 482                                 struct node *n = tn->child[i];
 483                                 if(n)
 484                                         NODE_INIT_PARENT(n, NODE_TYPE(n));
 485
 486                                 write_unlock_bh(&fib_lock);
 487                                 tnode_free(tn);
 488                                 return n;
 489                         }
 490                         write_unlock_bh(&fib_lock);
 491                 }
 492         /*
 493          * Double as long as the resulting node has a number of
 494          * nonempty nodes that are above the threshold.
 495          */
 496
 497         /*
 498          * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
 499          * the Helsinki University of Technology and Matti Tikkanen of Nokia
 500          * Telecommunications, page 6:
 501          * "A node is doubled if the ratio of non-empty children to all
 502          * children in the *doubled* node is at least 'high'."
 503          *
 504          * 'high' in this instance is the variable 'inflate_threshold'. It
 505          * is expressed as a percentage, so we multiply it with
 506          * tnode_child_length() and instead of multiplying by 2 (since the
 507          * child array will be doubled by inflate()) and multiplying
 508          * the left-hand side by 100 (to handle the percentage thing) we
 509          * multiply the left-hand side by 50.
 510          *
 511          * The left-hand side may look a bit weird: tnode_child_length(tn)
 512          * - tn->empty_children is of course the number of non-null children
 513          * in the current node. tn->full_children is the number of "full"
 514          * children, that is non-null tnodes with a skip value of 0.
 515          * All of those will be doubled in the resulting inflated tnode, so
 516          * we just count them one extra time here.
 517          *
 518          * A clearer way to write this would be:
 519          *
 520          * to_be_doubled = tn->full_children;
 521          * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
 522          *     tn->full_children;
 523          *
 524          * new_child_length = tnode_child_length(tn) * 2;
 525          *
 526          * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
 527          *      new_child_length;
 528          * if (new_fill_factor >= inflate_threshold)
 529          *
 530          * ...and so on, tho it would mess up the while() loop.
 531          *
 532          * anyway,
 533          * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
 534          *      inflate_threshold
 535          *
 536          * avoid a division:
 537          * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
 538          *      inflate_threshold * new_child_length
 539          *
 540          * expand not_to_be_doubled and to_be_doubled, and shorten:
 541          * 100 * (tnode_child_length(tn) - tn->empty_children +
 542          *    tn->full_children ) >= inflate_threshold * new_child_length
 543          *
 544          * expand new_child_length:
 545          * 100 * (tnode_child_length(tn) - tn->empty_children +
 546          *    tn->full_children ) >=
 547          *      inflate_threshold * tnode_child_length(tn) * 2
 548          *
 549          * shorten again:
 550          * 50 * (tn->full_children + tnode_child_length(tn) -
 551          *    tn->empty_children ) >= inflate_threshold *
 552          *    tnode_child_length(tn)
 553          *
 554          */
 555
 556         check_tnode(tn);
 557
 558         while ((tn->full_children > 0 &&
 559                50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
 560                                 inflate_threshold * tnode_child_length(tn))) {
 561
 562                 tn = inflate(t, tn);
 563         }
 564
 565         check_tnode(tn);
 566
 567         /*
 568          * Halve as long as the number of empty children in this
 569          * node is above threshold.
 570          */
 571         while (tn->bits > 1 &&
 572                100 * (tnode_child_length(tn) - tn->empty_children) <
 573                halve_threshold * tnode_child_length(tn))
 574
 575                 tn = halve(t, tn);
 576
 577         /* Only one child remains */
 578
 579         if (tn->empty_children == tnode_child_length(tn) - 1)
 580                 for (i = 0; i < tnode_child_length(tn); i++) {
 581
 582                         write_lock_bh(&fib_lock);
 583                         if (tn->child[i] != NULL) {
 584                                 /* compress one level */
 585                                 struct node *n = tn->child[i];
 586
 587                                 if(n)
 588                                         NODE_INIT_PARENT(n, NODE_TYPE(n));
 589
 590                                 write_unlock_bh(&fib_lock);
 591                                 tnode_free(tn);
 592                                 return n;
 593                         }
 594                         write_unlock_bh(&fib_lock);
 595                 }
 596
 597         return (struct node *) tn;
 598 }
 599
 600 static struct tnode *inflate(struct trie *t, struct tnode *tn)
 601 {
 602         struct tnode *inode;
 603         struct tnode *oldtnode = tn;
 604         int olen = tnode_child_length(tn);
 605         int i;
 606
 607         if(trie_debug)
 608                 printk("In inflate\n");
 609
 610         tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
 611
 612         if (!tn)
 613                 trie_bug("tnode_new failed");
 614
 615         for(i = 0; i < olen; i++) {
 616                 struct node *node = tnode_get_child(oldtnode, i);
 617
 618                 /* An empty child */
 619                 if (node == NULL)
 620                         continue;
 621
 622                 /* A leaf or an internal node with skipped bits */
 623
 624                 if(IS_LEAF(node) || ((struct tnode *) node)->pos >
 625                    tn->pos + tn->bits - 1) {
 626                         if(tkey_extract_bits(node->key, tn->pos + tn->bits - 1,
 627                                              1) == 0)
 628                                 put_child(t, tn, 2*i, node);
 629                         else
 630                                 put_child(t, tn, 2*i+1, node);
 631                         continue;
 632                 }
 633
 634                 /* An internal node with two children */
 635                 inode = (struct tnode *) node;
 636
 637                 if (inode->bits == 1) {
 638                         put_child(t, tn, 2*i, inode->child[0]);
 639                         put_child(t, tn, 2*i+1, inode->child[1]);
 640
 641                         tnode_free(inode);
 642                 }
 643
 644                         /* An internal node with more than two children */
 645                 else {
 646                         struct tnode *left, *right;
 647                         int size, j;
 648
 649                         /* We will replace this node 'inode' with two new
 650                          * ones, 'left' and 'right', each with half of the
 651                          * original children. The two new nodes will have
 652                          * a position one bit further down the key and this
 653                          * means that the "significant" part of their keys
 654                          * (see the discussion near the top of this file)
 655                          * will differ by one bit, which will be "0" in
 656                          * left's key and "1" in right's key. Since we are
 657                          * moving the key position by one step, the bit that
 658                          * we are moving away from - the bit at position
 659                          * (inode->pos) - is the one that will differ between
 660                          * left and right. So... we synthesize that bit in the
 661                          * two  new keys.
 662                          * The mask 'm' below will be a single "one" bit at
 663                          * the position (inode->pos)
 664                          */
 665
 666                         t_key m = TKEY_GET_MASK(inode->pos, 1);
 667
 668                         /* Use the old key, but set the new significant
 669                          *   bit to zero.
 670                          */
 671                         left = tnode_new(inode->key&(~m), inode->pos + 1,
 672                                          inode->bits - 1);
 673
 674                         if(!left)
 675                                 trie_bug("tnode_new failed");
 676
 677
 678                         /* Use the old key, but set the new significant
 679                          * bit to one.
 680                          */
 681                         right = tnode_new(inode->key|m, inode->pos + 1,
 682                                           inode->bits - 1);
 683
 684                         if(!right)
 685                                 trie_bug("tnode_new failed");
 686
 687                         size = tnode_child_length(left);
 688                         for(j = 0; j < size; j++) {
 689                                 put_child(t, left, j, inode->child[j]);
 690                                 put_child(t, right, j, inode->child[j + size]);
 691                         }
 692                         put_child(t, tn, 2*i, resize(t, left));
 693                         put_child(t, tn, 2*i+1, resize(t, right));
 694
 695                         tnode_free(inode);
 696                 }
 697         }
 698         tnode_free(oldtnode);
 699         return tn;
 700 }
 701
 702 static struct tnode *halve(struct trie *t, struct tnode *tn)
 703 {
 704         struct tnode *oldtnode = tn;
 705         struct node *left, *right;
 706         int i;
 707         int olen = tnode_child_length(tn);
 708
 709         if(trie_debug) printk("In halve\n");
 710
 711         tn=tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
 712
 713         if(!tn)
 714                 trie_bug("tnode_new failed");
 715
 716         for(i = 0; i < olen; i += 2) {
 717                 left = tnode_get_child(oldtnode, i);
 718                 right = tnode_get_child(oldtnode, i+1);
 719
 720                 /* At least one of the children is empty */
 721                 if (left == NULL) {
 722                         if (right == NULL)    /* Both are empty */
 723                                 continue;
 724                         put_child(t, tn, i/2, right);
 725                 } else if (right == NULL)
 726                         put_child(t, tn, i/2, left);
 727
 728                 /* Two nonempty children */
 729                 else {
 730                         struct tnode *newBinNode =
 731                                 tnode_new(left->key, tn->pos + tn->bits, 1);
 732
 733                         if(!newBinNode)
 734                                 trie_bug("tnode_new failed");
 735
 736                         put_child(t, newBinNode, 0, left);
 737                         put_child(t, newBinNode, 1, right);
 738                         put_child(t, tn, i/2, resize(t, newBinNode));
 739                 }
 740         }
 741         tnode_free(oldtnode);
 742         return tn;
 743 }
 744
 745 static void *trie_init(struct trie *t)
 746 {
 747         if(t) {
 748                 t->size = 0;
 749                 t->trie = NULL;
 750                 t->revision = 0;
 751 #ifdef CONFIG_IP_FIB_TRIE_STATS
 752                 memset(&t->stats, 0, sizeof(struct trie_use_stats));
 753 #endif
 754         }
 755         return t;
 756 }
 757
 758 static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen)
 759 {
 760         struct hlist_node *node;
 761         struct leaf_info *li;
 762
 763         hlist_for_each_entry(li, node, head, hlist) {
 764
 765                 if ( li->plen == plen )
 766                         return li;
 767         }
 768         return NULL;
 769 }
 770
 771 static inline struct list_head * get_fa_head(struct leaf *l, int plen)
 772 {
 773         struct list_head *fa_head=NULL;
 774         struct leaf_info *li = find_leaf_info(&l->list, plen);
 775
 776         if(li)
 777                 fa_head = &li->falh;
 778
 779         return fa_head;
 780 }
 781
 782 static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
 783 {
 784         struct leaf_info *li=NULL, *last=NULL;
 785         struct hlist_node *node, *tmp;
 786
 787         write_lock_bh(&fib_lock);
 788
 789         if(hlist_empty(head))
 790                 hlist_add_head(&new->hlist, head);
 791         else {
 792                 hlist_for_each_entry_safe(li, node, tmp, head, hlist) {
 793
 794                         if (new->plen > li->plen)
 795                                 break;
 796
 797                         last = li;
 798                 }
 799                 if(last)
 800                         hlist_add_after(&last->hlist, &new->hlist);
 801                 else
 802                         hlist_add_before(&new->hlist, &li->hlist);
 803         }
 804         write_unlock_bh(&fib_lock);
 805 }
 806
 807 static struct leaf *
 808 fib_find_node(struct trie *t, u32 key)
 809 {
 810         int pos;
 811         struct tnode *tn;
 812         struct node *n;
 813
 814         pos = 0;
 815         n=t->trie;
 816
 817         while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
 818                 tn = (struct tnode *) n;
 819
 820                 check_tnode(tn);
 821
 822                 if(tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
 823                         pos=tn->pos + tn->bits;
 824                         n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
 825                 }
 826                 else
 827                         break;
 828         }
 829         /* Case we have found a leaf. Compare prefixes */
 830
 831         if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
 832                 struct leaf *l = (struct leaf *) n;
 833                 return l;
 834         }
 835         return NULL;
 836 }
 837
 838 static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
 839 {
 840         int i = 0;
 841         int wasfull;
 842         t_key cindex, key;
 843         struct tnode *tp = NULL;
 844
 845         if(!tn)
 846                 BUG();
 847
 848         key = tn->key;
 849         i = 0;
 850
 851         while (tn != NULL && NODE_PARENT(tn) != NULL) {
 852
 853                 if( i > 10 ) {
 854                         printk("Rebalance tn=%p \n", tn);
 855                         if(tn)          printk("tn->parent=%p \n", NODE_PARENT(tn));
 856
 857                         printk("Rebalance tp=%p \n", tp);
 858                         if(tp)          printk("tp->parent=%p \n", NODE_PARENT(tp));
 859                 }
 860
 861                 if( i > 12 ) BUG();
 862                 i++;
 863
 864                 tp = NODE_PARENT(tn);
 865                 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
 866                 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
 867                 tn = (struct tnode *) resize (t, (struct tnode *)tn);
 868                 tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
 869
 870                 if(!NODE_PARENT(tn))
 871                         break;
 872
 873                 tn = NODE_PARENT(tn);
 874         }
 875         /* Handle last (top) tnode */
 876         if (IS_TNODE(tn))
 877                 tn = (struct tnode*) resize(t, (struct tnode *)tn);
 878
 879         return (struct node*) tn;
 880 }
 881
 882 static struct list_head *
 883 fib_insert_node(struct trie *t, u32 key, int plen)
 884 {
 885         int pos, newpos;
 886         struct tnode *tp = NULL, *tn = NULL;
 887         struct node *n;
 888         struct leaf *l;
 889         int missbit;
 890         struct list_head *fa_head=NULL;
 891         struct leaf_info *li;
 892         t_key cindex;
 893
 894         pos = 0;
 895         n=t->trie;
 896
 897         /* If we point to NULL, stop. Either the tree is empty and we should
 898          * just put a new leaf in if, or we have reached an empty child slot,
 899          * and we should just put our new leaf in that.
 900          * If we point to a T_TNODE, check if it matches our key. Note that
 901          * a T_TNODE might be skipping any number of bits - its 'pos' need
 902          * not be the parent's 'pos'+'bits'!
 903          *
 904          * If it does match the current key, get pos/bits from it, extract
 905          * the index from our key, push the T_TNODE and walk the tree.
 906          *
 907          * If it doesn't, we have to replace it with a new T_TNODE.
 908          *
 909          * If we point to a T_LEAF, it might or might not have the same key
 910          * as we do. If it does, just change the value, update the T_LEAF's
 911          * value, and return it.
 912          * If it doesn't, we need to replace it with a T_TNODE.
 913          */
 914
 915         while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
 916                 tn = (struct tnode *) n;
 917
 918                 check_tnode(tn);
 919
 920                 if(tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
 921                         tp = tn;
 922                         pos=tn->pos + tn->bits;
 923                         n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
 924
 925                         if(n && NODE_PARENT(n) != tn) {
 926                                 printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
 927                                 BUG();
 928                         }
 929                 }
 930                 else
 931                         break;
 932         }
 933
 934         /*
 935          * n  ----> NULL, LEAF or TNODE
 936          *
 937          * tp is n's (parent) ----> NULL or TNODE
 938          */
 939
 940         if(tp && IS_LEAF(tp))
 941                 BUG();
 942
 943         t->revision++;
 944
 945         /* Case 1: n is a leaf. Compare prefixes */
 946
 947         if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
 948                 struct leaf *l = ( struct leaf *)  n;
 949
 950                 li = leaf_info_new(plen);
 951
 952                 if(! li)
 953                         BUG();
 954
 955                 fa_head = &li->falh;
 956                 insert_leaf_info(&l->list, li);
 957                 goto done;
 958         }
 959         t->size++;
 960         l = leaf_new();
 961
 962         if(! l)
 963                 BUG();
 964
 965         l->key = key;
 966         li = leaf_info_new(plen);
 967
 968         if(! li)
 969                 BUG();
 970
 971         fa_head = &li->falh;
 972         insert_leaf_info(&l->list, li);
 973
 974         /* Case 2: n is NULL, and will just insert a new leaf */
 975         if (t->trie && n == NULL) {
 976
 977                 NODE_SET_PARENT(l, tp);
 978
 979                 if (!tp)
 980                         BUG();
 981
 982                 else {
 983                         cindex = tkey_extract_bits(key, tp->pos, tp->bits);
 984                         put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
 985                 }
 986         }
 987         /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
 988         else {
 989                 /*
 990                  *  Add a new tnode here
 991                  *  first tnode need some special handling
 992                  */
 993
 994                 if (tp)
 995                         pos=tp->pos+tp->bits;
 996                 else
 997                         pos=0;
 998                 if(n) {
 999                         newpos = tkey_mismatch(key, pos, n->key);
1000                         tn = tnode_new(n->key, newpos, 1);
1001                 }
1002                 else {
1003                         newpos = 0;
1004                         tn = tnode_new(key, newpos, 1); /* First tnode */
1005                 }
1006                 if(!tn)
1007                         trie_bug("tnode_pfx_new failed");
1008
1009                 NODE_SET_PARENT(tn, tp);
1010
1011                 missbit=tkey_extract_bits(key, newpos, 1);
1012                 put_child(t, tn, missbit, (struct node *)l);
1013                 put_child(t, tn, 1-missbit, n);
1014
1015                 if(tp) {
1016                         cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1017                         put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
1018                 }
1019                 else {
1020                         t->trie = (struct node*) tn; /* First tnode */
1021                         tp = tn;
1022                 }
1023         }
1024         if(tp && tp->pos+tp->bits > 32) {
1025                 printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1026                        tp, tp->pos, tp->bits, key, plen);
1027         }
1028         /* Rebalance the trie */
1029         t->trie = trie_rebalance(t, tp);
1030 done:;
1031         return fa_head;
1032 }
1033
1034 static int
1035 fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1036                struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1037 {
1038         struct trie *t = (struct trie *) tb->tb_data;
1039         struct fib_alias *fa, *new_fa;
1040         struct list_head *fa_head=NULL;
1041         struct fib_info *fi;
1042         int plen = r->rtm_dst_len;
1043         int type = r->rtm_type;
1044         u8 tos = r->rtm_tos;
1045         u32 key, mask;
1046         int err;
1047         struct leaf *l;
1048
1049         if (plen > 32)
1050                 return -EINVAL;
1051
1052         key = 0;
1053         if (rta->rta_dst)
1054                 memcpy(&key, rta->rta_dst, 4);
1055
1056         key = ntohl(key);
1057
1058         if(trie_debug)
1059                 printk("Insert table=%d %08x/%d\n", tb->tb_id, key, plen);
1060
1061         mask =  ntohl( inet_make_mask(plen) );
1062
1063         if(key & ~mask)
1064                 return -EINVAL;
1065
1066         key = key & mask;
1067
1068         if  ((fi = fib_create_info(r, rta, nlhdr, &err)) == NULL)
1069                 goto err;
1070
1071         l = fib_find_node(t, key);
1072         fa = NULL;
1073
1074         if(l) {
1075                 fa_head = get_fa_head(l, plen);
1076                 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1077         }
1078
1079         /* Now fa, if non-NULL, points to the first fib alias
1080          * with the same keys [prefix,tos,priority], if such key already
1081          * exists or to the node before which we will insert new one.
1082          *
1083          * If fa is NULL, we will need to allocate a new one and
1084          * insert to the head of f.
1085          *
1086          * If f is NULL, no fib node matched the destination key
1087          * and we need to allocate a new one of those as well.
1088          */
1089
1090         if (fa &&
1091             fa->fa_info->fib_priority == fi->fib_priority) {
1092                 struct fib_alias *fa_orig;
1093
1094                 err = -EEXIST;
1095                 if (nlhdr->nlmsg_flags & NLM_F_EXCL)
1096                         goto out;
1097
1098                 if (nlhdr->nlmsg_flags & NLM_F_REPLACE) {
1099                         struct fib_info *fi_drop;
1100                         u8 state;
1101
1102                         write_lock_bh(&fib_lock);
1103
1104                         fi_drop = fa->fa_info;
1105                         fa->fa_info = fi;
1106                         fa->fa_type = type;
1107                         fa->fa_scope = r->rtm_scope;
1108                         state = fa->fa_state;
1109                         fa->fa_state &= ~FA_S_ACCESSED;
1110
1111                         write_unlock_bh(&fib_lock);
1112
1113                         fib_release_info(fi_drop);
1114                         if (state & FA_S_ACCESSED)
1115                           rt_cache_flush(-1);
1116
1117                             goto succeeded;
1118                 }
1119                 /* Error if we find a perfect match which
1120                  * uses the same scope, type, and nexthop
1121                  * information.
1122                  */
1123                 fa_orig = fa;
1124                 list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
1125                         if (fa->fa_tos != tos)
1126                                 break;
1127                         if (fa->fa_info->fib_priority != fi->fib_priority)
1128                                 break;
1129                         if (fa->fa_type == type &&
1130                             fa->fa_scope == r->rtm_scope &&
1131                             fa->fa_info == fi) {
1132                                 goto out;
1133                         }
1134                 }
1135                 if (!(nlhdr->nlmsg_flags & NLM_F_APPEND))
1136                         fa = fa_orig;
1137         }
1138         err = -ENOENT;
1139         if (!(nlhdr->nlmsg_flags&NLM_F_CREATE))
1140                 goto out;
1141
1142         err = -ENOBUFS;
1143         new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
1144         if (new_fa == NULL)
1145                 goto out;
1146
1147         new_fa->fa_info = fi;
1148         new_fa->fa_tos = tos;
1149         new_fa->fa_type = type;
1150         new_fa->fa_scope = r->rtm_scope;
1151         new_fa->fa_state = 0;
1152 #if 0
1153         new_fa->dst  = NULL;
1154 #endif
1155         /*
1156          * Insert new entry to the list.
1157          */
1158
1159         if(!fa_head)
1160                 fa_head = fib_insert_node(t, key, plen);
1161
1162         write_lock_bh(&fib_lock);
1163
1164         list_add_tail(&new_fa->fa_list,
1165                  (fa ? &fa->fa_list : fa_head));
1166
1167         write_unlock_bh(&fib_lock);
1168
1169         rt_cache_flush(-1);
1170         rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req);
1171 succeeded:
1172         return 0;
1173 out:
1174         fib_release_info(fi);
1175 err:;
1176         return err;
1177 }
1178
1179 static inline int check_leaf(struct trie *t, struct leaf *l,  t_key key, int *plen, const struct flowi *flp,
1180                              struct fib_result *res, int *err)
1181 {
1182         int i;
1183         t_key mask;
1184         struct leaf_info *li;
1185         struct hlist_head *hhead = &l->list;
1186         struct hlist_node *node;
1187
1188         hlist_for_each_entry(li, node, hhead, hlist) {
1189
1190                 i = li->plen;
1191                 mask = ntohl(inet_make_mask(i));
1192                 if (l->key != (key & mask))
1193                         continue;
1194
1195                 if (((*err) = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) == 0) {
1196                         *plen = i;
1197 #ifdef CONFIG_IP_FIB_TRIE_STATS
1198                         t->stats.semantic_match_passed++;
1199 #endif
1200                         return 1;
1201                 }
1202 #ifdef CONFIG_IP_FIB_TRIE_STATS
1203                 t->stats.semantic_match_miss++;
1204 #endif
1205         }
1206         return 0;
1207 }
1208
1209 static int
1210 fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1211 {
1212         struct trie *t = (struct trie *) tb->tb_data;
1213         int plen, ret = 0;
1214         struct node *n;
1215         struct tnode *pn;
1216         int pos, bits;
1217         t_key key=ntohl(flp->fl4_dst);
1218         int chopped_off;
1219         t_key cindex = 0;
1220         int current_prefix_length = KEYLENGTH;
1221         n = t->trie;
1222
1223         read_lock(&fib_lock);
1224         if(!n)
1225                 goto failed;
1226
1227 #ifdef CONFIG_IP_FIB_TRIE_STATS
1228         t->stats.gets++;
1229 #endif
1230
1231         /* Just a leaf? */
1232         if (IS_LEAF(n)) {
1233                 if( check_leaf(t, (struct leaf *)n, key, &plen, flp, res, &ret) )
1234                         goto found;
1235                 goto failed;
1236         }
1237         pn = (struct tnode *) n;
1238         chopped_off = 0;
1239
1240         while (pn) {
1241
1242                 pos = pn->pos;
1243                 bits = pn->bits;
1244
1245                 if(!chopped_off)
1246                         cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits);
1247
1248                 n = tnode_get_child(pn, cindex);
1249
1250                 if (n == NULL) {
1251 #ifdef CONFIG_IP_FIB_TRIE_STATS
1252                         t->stats.null_node_hit++;
1253 #endif
1254                         goto backtrace;
1255                 }
1256
1257                 if (IS_TNODE(n)) {
1258 #define HL_OPTIMIZE
1259 #ifdef HL_OPTIMIZE
1260                         struct tnode *cn = (struct tnode *)n;
1261                         t_key node_prefix, key_prefix, pref_mismatch;
1262                         int mp;
1263
1264                         /*
1265                          * It's a tnode, and we can do some extra checks here if we
1266                          * like, to avoid descending into a dead-end branch.
1267                          * This tnode is in the parent's child array at index
1268                          * key[p_pos..p_pos+p_bits] but potentially with some bits
1269                          * chopped off, so in reality the index may be just a
1270                          * subprefix, padded with zero at the end.
1271                          * We can also take a look at any skipped bits in this
1272                          * tnode - everything up to p_pos is supposed to be ok,
1273                          * and the non-chopped bits of the index (se previous
1274                          * paragraph) are also guaranteed ok, but the rest is
1275                          * considered unknown.
1276                          *
1277                          * The skipped bits are key[pos+bits..cn->pos].
1278                          */
1279
1280                         /* If current_prefix_length < pos+bits, we are already doing
1281                          * actual prefix  matching, which means everything from
1282                          * pos+(bits-chopped_off) onward must be zero along some
1283                          * branch of this subtree - otherwise there is *no* valid
1284                          * prefix present. Here we can only check the skipped
1285                          * bits. Remember, since we have already indexed into the
1286                          * parent's child array, we know that the bits we chopped of
1287                          * *are* zero.
1288                          */
1289
1290                         /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
1291
1292                         if (current_prefix_length < pos+bits) {
1293                                 if (tkey_extract_bits(cn->key, current_prefix_length,
1294                                                       cn->pos - current_prefix_length) != 0 ||
1295                                     !(cn->child[0]))
1296                                         goto backtrace;
1297                         }
1298
1299                         /*
1300                          * If chopped_off=0, the index is fully validated and we
1301                          * only need to look at the skipped bits for this, the new,
1302                          * tnode. What we actually want to do is to find out if
1303                          * these skipped bits match our key perfectly, or if we will
1304                          * have to count on finding a matching prefix further down,
1305                          * because if we do, we would like to have some way of
1306                          * verifying the existence of such a prefix at this point.
1307                          */
1308
1309                         /* The only thing we can do at this point is to verify that
1310                          * any such matching prefix can indeed be a prefix to our
1311                          * key, and if the bits in the node we are inspecting that
1312                          * do not match our key are not ZERO, this cannot be true.
1313                          * Thus, find out where there is a mismatch (before cn->pos)
1314                          * and verify that all the mismatching bits are zero in the
1315                          * new tnode's key.
1316                          */
1317
1318                         /* Note: We aren't very concerned about the piece of the key
1319                          * that precede pn->pos+pn->bits, since these have already been
1320                          * checked. The bits after cn->pos aren't checked since these are
1321                          * by definition "unknown" at this point. Thus, what we want to
1322                          * see is if we are about to enter the "prefix matching" state,
1323                          * and in that case verify that the skipped bits that will prevail
1324                          * throughout this subtree are zero, as they have to be if we are
1325                          * to find a matching prefix.
1326                          */
1327
1328                         node_prefix = MASK_PFX(cn->key, cn->pos);
1329                         key_prefix =  MASK_PFX(key, cn->pos);
1330                         pref_mismatch = key_prefix^node_prefix;
1331                         mp = 0;
1332
1333                         /* In short: If skipped bits in this node do not match the search
1334                          * key, enter the "prefix matching" state.directly.
1335                          */
1336                         if (pref_mismatch) {
1337                                 while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1338                                         mp++;
1339                                         pref_mismatch = pref_mismatch <<1;
1340                                 }
1341                                 key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1342
1343                                 if (key_prefix != 0)
1344                                         goto backtrace;
1345
1346                                 if (current_prefix_length >= cn->pos)
1347                                         current_prefix_length=mp;
1348                        }
1349 #endif
1350                        pn = (struct tnode *)n; /* Descend */
1351                        chopped_off = 0;
1352                        continue;
1353                 }
1354                 if (IS_LEAF(n)) {
1355                         if( check_leaf(t, (struct leaf *)n, key, &plen, flp, res, &ret))
1356                                 goto found;
1357                }
1358 backtrace:
1359                 chopped_off++;
1360
1361                 /* As zero don't change the child key (cindex) */
1362                 while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1)))) {
1363                         chopped_off++;
1364                 }
1365
1366                 /* Decrease current_... with bits chopped off */
1367                 if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1368                         current_prefix_length = pn->pos + pn->bits - chopped_off;
1369
1370                 /*
1371                  * Either we do the actual chop off according or if we have
1372                  * chopped off all bits in this tnode walk up to our parent.
1373                  */
1374
1375                 if(chopped_off <= pn->bits)
1376                         cindex &= ~(1 << (chopped_off-1));
1377                 else {
1378                         if( NODE_PARENT(pn) == NULL)
1379                                 goto failed;
1380
1381                         /* Get Child's index */
1382                         cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits);
1383                         pn = NODE_PARENT(pn);
1384                         chopped_off = 0;
1385
1386 #ifdef CONFIG_IP_FIB_TRIE_STATS
1387                         t->stats.backtrack++;
1388 #endif
1389                         goto backtrace;
1390                 }
1391         }
1392 failed:
1393         ret =  1;
1394 found:
1395         read_unlock(&fib_lock);
1396         return ret;
1397 }
1398
1399 static int trie_leaf_remove(struct trie *t, t_key key)
1400 {
1401         t_key cindex;
1402         struct tnode *tp = NULL;
1403         struct node *n = t->trie;
1404         struct leaf *l;
1405
1406         if(trie_debug)
1407                 printk("entering trie_leaf_remove(%p)\n", n);
1408
1409         /* Note that in the case skipped bits, those bits are *not* checked!
1410          * When we finish this, we will have NULL or a T_LEAF, and the
1411          * T_LEAF may or may not match our key.
1412          */
1413
1414         while (n != NULL && IS_TNODE(n)) {
1415                 struct tnode *tn = (struct tnode *) n;
1416                 check_tnode(tn);
1417                 n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));
1418
1419                         if(n && NODE_PARENT(n) != tn) {
1420                                 printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
1421                                 BUG();
1422                         }
1423         }
1424         l = (struct leaf *) n;
1425
1426         if(!n || !tkey_equals(l->key, key))
1427                 return 0;
1428
1429         /*
1430          * Key found.
1431          * Remove the leaf and rebalance the tree
1432          */
1433
1434         t->revision++;
1435         t->size--;
1436
1437         tp = NODE_PARENT(n);
1438         tnode_free((struct tnode *) n);
1439
1440         if(tp) {
1441                 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1442                 put_child(t, (struct tnode *)tp, cindex, NULL);
1443                 t->trie = trie_rebalance(t, tp);
1444         }
1445         else
1446                 t->trie = NULL;
1447
1448         return 1;
1449 }
1450
1451 static int
1452 fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1453                struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1454 {
1455         struct trie *t = (struct trie *) tb->tb_data;
1456         u32 key, mask;
1457         int plen = r->rtm_dst_len;
1458         u8 tos = r->rtm_tos;
1459         struct fib_alias *fa, *fa_to_delete;
1460         struct list_head *fa_head;
1461         struct leaf *l;
1462
1463         if (plen > 32)
1464                 return -EINVAL;
1465
1466         key = 0;
1467         if (rta->rta_dst)
1468                 memcpy(&key, rta->rta_dst, 4);
1469
1470         key = ntohl(key);
1471         mask =  ntohl( inet_make_mask(plen) );
1472
1473         if(key & ~mask)
1474                 return -EINVAL;
1475
1476         key = key & mask;
1477         l = fib_find_node(t, key);
1478
1479         if(!l)
1480                 return -ESRCH;
1481
1482         fa_head = get_fa_head(l, plen);
1483         fa = fib_find_alias(fa_head, tos, 0);
1484
1485         if (!fa)
1486                 return -ESRCH;
1487
1488         if (trie_debug)
1489                 printk("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1490
1491         fa_to_delete = NULL;
1492         fa_head = fa->fa_list.prev;
1493         list_for_each_entry(fa, fa_head, fa_list) {
1494                 struct fib_info *fi = fa->fa_info;
1495
1496                 if (fa->fa_tos != tos)
1497                         break;
1498
1499                 if ((!r->rtm_type ||
1500                      fa->fa_type == r->rtm_type) &&
1501                     (r->rtm_scope == RT_SCOPE_NOWHERE ||
1502                      fa->fa_scope == r->rtm_scope) &&
1503                     (!r->rtm_protocol ||
1504                      fi->fib_protocol == r->rtm_protocol) &&
1505                     fib_nh_match(r, nlhdr, rta, fi) == 0) {
1506                         fa_to_delete = fa;
1507                         break;
1508                 }
1509         }
1510
1511         if (fa_to_delete) {
1512                 int kill_li = 0;
1513                 struct leaf_info *li;
1514
1515                 fa = fa_to_delete;
1516                 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req);
1517
1518                 l = fib_find_node(t, key);
1519                 li = find_leaf_info(&l->list, plen);
1520
1521                 write_lock_bh(&fib_lock);
1522
1523                 list_del(&fa->fa_list);
1524
1525                 if(list_empty(fa_head)) {
1526                         hlist_del(&li->hlist);
1527                         kill_li = 1;
1528                 }
1529                 write_unlock_bh(&fib_lock);
1530
1531                 if(kill_li)
1532                         free_leaf_info(li);
1533
1534                 if(hlist_empty(&l->list))
1535                         trie_leaf_remove(t, key);
1536
1537                 if (fa->fa_state & FA_S_ACCESSED)
1538                         rt_cache_flush(-1);
1539
1540                 fn_free_alias(fa);
1541                 return 0;
1542         }
1543         return -ESRCH;
1544 }
1545
1546 static int trie_flush_list(struct trie *t, struct list_head *head)
1547 {
1548         struct fib_alias *fa, *fa_node;
1549         int found = 0;
1550
1551         list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1552                 struct fib_info *fi = fa->fa_info;
1553
1554                 if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
1555
1556                         write_lock_bh(&fib_lock);
1557                         list_del(&fa->fa_list);
1558                         write_unlock_bh(&fib_lock);
1559
1560                         fn_free_alias(fa);
1561                         found++;
1562                 }
1563         }
1564         return found;
1565 }
1566
1567 static int trie_flush_leaf(struct trie *t, struct leaf *l)
1568 {
1569         int found = 0;
1570         struct hlist_head *lih = &l->list;
1571         struct hlist_node *node, *tmp;
1572         struct leaf_info *li = NULL;
1573
1574         hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1575
1576                 found += trie_flush_list(t, &li->falh);
1577
1578                 if (list_empty(&li->falh)) {
1579
1580                         write_lock_bh(&fib_lock);
1581                         hlist_del(&li->hlist);
1582                         write_unlock_bh(&fib_lock);
1583
1584                         free_leaf_info(li);
1585                 }
1586         }
1587         return found;
1588 }
1589
1590 static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1591 {
1592         struct node *c = (struct node *) thisleaf;
1593         struct tnode *p;
1594         int idx;
1595
1596         if(c == NULL) {
1597                 if(t->trie == NULL)
1598                         return NULL;
1599
1600                 if (IS_LEAF(t->trie))          /* trie w. just a leaf */
1601                         return (struct leaf *) t->trie;
1602
1603                 p = (struct tnode*) t->trie;  /* Start */
1604         }
1605         else
1606                 p = (struct tnode *) NODE_PARENT(c);
1607         while (p) {
1608                 int pos, last;
1609
1610                 /*  Find the next child of the parent */
1611                 if(c)
1612                         pos  = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
1613                 else
1614                         pos = 0;
1615
1616                 last = 1 << p->bits;
1617                 for(idx = pos; idx < last ; idx++) {
1618                         if( p->child[idx]) {
1619
1620                                 /* Decend if tnode */
1621
1622                                 while (IS_TNODE(p->child[idx])) {
1623                                         p = (struct tnode*) p->child[idx];
1624                                         idx = 0;
1625
1626                                         /* Rightmost non-NULL branch */
1627                                         if( p && IS_TNODE(p) )
1628                                                 while ( p->child[idx] == NULL && idx < (1 << p->bits) ) idx++;
1629
1630                                         /* Done with this tnode? */
1631                                         if( idx >= (1 << p->bits) || p->child[idx] == NULL )
1632                                                 goto up;
1633                                 }
1634                                 return (struct leaf*) p->child[idx];
1635                         }
1636                 }
1637 up:
1638                 /* No more children go up one step  */
1639                 c = (struct node*) p;
1640                 p = (struct tnode *) NODE_PARENT(p);
1641         }
1642         return NULL; /* Ready. Root of trie */
1643 }
1644
1645 static int fn_trie_flush(struct fib_table *tb)
1646 {
1647         struct trie *t = (struct trie *) tb->tb_data;
1648         struct leaf *ll = NULL, *l = NULL;
1649         int found = 0, h;
1650
1651         t->revision++;
1652
1653         for (h=0; (l = nextleaf(t, l)) != NULL; h++) {
1654                 found += trie_flush_leaf(t, l);
1655
1656                 if (ll && hlist_empty(&ll->list))
1657                         trie_leaf_remove(t, ll->key);
1658                 ll = l;
1659         }
1660
1661         if (ll && hlist_empty(&ll->list))
1662                 trie_leaf_remove(t, ll->key);
1663
1664         if(trie_debug)
1665                 printk("trie_flush found=%d\n", found);
1666         return found;
1667 }
1668
1669 static int trie_last_dflt=-1;
1670
1671 static void
1672 fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1673 {
1674         struct trie *t = (struct trie *) tb->tb_data;
1675         int order, last_idx;
1676         struct fib_info *fi = NULL;
1677         struct fib_info *last_resort;
1678         struct fib_alias *fa = NULL;
1679         struct list_head *fa_head;
1680         struct leaf *l;
1681
1682         last_idx = -1;
1683         last_resort = NULL;
1684         order = -1;
1685
1686         read_lock(&fib_lock);
1687
1688         l = fib_find_node(t, 0);
1689         if(!l)
1690                 goto out;
1691
1692         fa_head = get_fa_head(l, 0);
1693         if(!fa_head)
1694                 goto out;
1695
1696         if (list_empty(fa_head))
1697                 goto out;
1698
1699         list_for_each_entry(fa, fa_head, fa_list) {
1700                 struct fib_info *next_fi = fa->fa_info;
1701
1702                 if (fa->fa_scope != res->scope ||
1703                     fa->fa_type != RTN_UNICAST)
1704                         continue;
1705
1706                 if (next_fi->fib_priority > res->fi->fib_priority)
1707                         break;
1708                 if (!next_fi->fib_nh[0].nh_gw ||
1709                     next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1710                         continue;
1711                 fa->fa_state |= FA_S_ACCESSED;
1712
1713                 if (fi == NULL) {
1714                         if (next_fi != res->fi)
1715                                 break;
1716                 } else if (!fib_detect_death(fi, order, &last_resort,
1717                                              &last_idx, &trie_last_dflt)) {
1718                         if (res->fi)
1719                                 fib_info_put(res->fi);
1720                         res->fi = fi;
1721                         atomic_inc(&fi->fib_clntref);
1722                         trie_last_dflt = order;
1723                         goto out;
1724                 }
1725                 fi = next_fi;
1726                 order++;
1727         }
1728         if (order <= 0 || fi == NULL) {
1729                 trie_last_dflt = -1;
1730                 goto out;
1731         }
1732
1733         if (!fib_detect_death(fi, order, &last_resort, &last_idx, &trie_last_dflt)) {
1734                 if (res->fi)
1735                         fib_info_put(res->fi);
1736                 res->fi = fi;
1737                 atomic_inc(&fi->fib_clntref);
1738                 trie_last_dflt = order;
1739                 goto out;
1740         }
1741         if (last_idx >= 0) {
1742                 if (res->fi)
1743                         fib_info_put(res->fi);
1744                 res->fi = last_resort;
1745                 if (last_resort)
1746                         atomic_inc(&last_resort->fib_clntref);
1747         }
1748         trie_last_dflt = last_idx;
1749  out:;
1750         read_unlock(&fib_lock);
1751 }
1752
1753 static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
1754                            struct sk_buff *skb, struct netlink_callback *cb)
1755 {
1756         int i, s_i;
1757         struct fib_alias *fa;
1758
1759         u32 xkey=htonl(key);
1760
1761         s_i=cb->args[3];
1762         i = 0;
1763
1764         list_for_each_entry(fa, fah, fa_list) {
1765                 if (i < s_i) {
1766                         i++;
1767                         continue;
1768                 }
1769                 if (fa->fa_info->fib_nh == NULL) {
1770                         printk("Trie error _fib_nh=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
1771                         i++;
1772                         continue;
1773                 }
1774                 if (fa->fa_info == NULL) {
1775                         printk("Trie error fa_info=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
1776                         i++;
1777                         continue;
1778                 }
1779
1780                 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1781                                   cb->nlh->nlmsg_seq,
1782                                   RTM_NEWROUTE,
1783                                   tb->tb_id,
1784                                   fa->fa_type,
1785                                   fa->fa_scope,
1786                                   &xkey,
1787                                   plen,
1788                                   fa->fa_tos,
1789                                   fa->fa_info, 0) < 0) {
1790                         cb->args[3] = i;
1791                         return -1;
1792                         }
1793                 i++;
1794         }
1795         cb->args[3]=i;
1796         return skb->len;
1797 }
1798
1799 static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
1800                              struct netlink_callback *cb)
1801 {
1802         int h, s_h;
1803         struct list_head *fa_head;
1804         struct leaf *l = NULL;
1805         s_h=cb->args[2];
1806
1807         for (h=0; (l = nextleaf(t, l)) != NULL; h++) {
1808
1809                 if (h < s_h)
1810                         continue;
1811                 if (h > s_h)
1812                         memset(&cb->args[3], 0,
1813                                sizeof(cb->args) - 3*sizeof(cb->args[0]));
1814
1815                 fa_head = get_fa_head(l, plen);
1816
1817                 if(!fa_head)
1818                         continue;
1819
1820                 if(list_empty(fa_head))
1821                         continue;
1822
1823                 if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
1824                         cb->args[2]=h;
1825                         return -1;
1826                 }
1827         }
1828         cb->args[2]=h;
1829         return skb->len;
1830 }
1831
1832 static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
1833 {
1834         int m, s_m;
1835         struct trie *t = (struct trie *) tb->tb_data;
1836
1837         s_m = cb->args[1];
1838
1839         read_lock(&fib_lock);
1840         for (m=0; m<=32; m++) {
1841
1842                 if (m < s_m)
1843                         continue;
1844                 if (m > s_m)
1845                         memset(&cb->args[2], 0,
1846                                sizeof(cb->args) - 2*sizeof(cb->args[0]));
1847
1848                 if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
1849                         cb->args[1] = m;
1850                         goto out;
1851                 }
1852         }
1853         read_unlock(&fib_lock);
1854         cb->args[1] = m;
1855         return skb->len;
1856  out:
1857         read_unlock(&fib_lock);
1858         return -1;
1859 }
1860
1861 /* Fix more generic FIB names for init later */
1862
1863 #ifdef CONFIG_IP_MULTIPLE_TABLES
1864 struct fib_table * fib_hash_init(int id)
1865 #else
1866 struct fib_table * __init fib_hash_init(int id)
1867 #endif
1868 {
1869         struct fib_table *tb;
1870         struct trie *t;
1871
1872         if (fn_alias_kmem == NULL)
1873                 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1874                                                   sizeof(struct fib_alias),
1875                                                   0, SLAB_HWCACHE_ALIGN,
1876                                                   NULL, NULL);
1877
1878         tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1879                      GFP_KERNEL);
1880         if (tb == NULL)
1881                 return NULL;
1882
1883         tb->tb_id = id;
1884         tb->tb_lookup = fn_trie_lookup;
1885         tb->tb_insert = fn_trie_insert;
1886         tb->tb_delete = fn_trie_delete;
1887         tb->tb_flush = fn_trie_flush;
1888         tb->tb_select_default = fn_trie_select_default;
1889         tb->tb_dump = fn_trie_dump;
1890         memset(tb->tb_data, 0, sizeof(struct trie));
1891
1892         t = (struct trie *) tb->tb_data;
1893
1894         trie_init(t);
1895
1896         if (id == RT_TABLE_LOCAL)
1897                 trie_local=t;
1898           else if (id == RT_TABLE_MAIN)
1899                 trie_main=t;
1900
1901         if (id == RT_TABLE_LOCAL)
1902                 printk("IPv4 FIB: Using LC-trie version %s\n", VERSION);
1903
1904         return tb;
1905 }
1906
1907 /* Trie dump functions */
1908
1909 static void putspace_seq(struct seq_file *seq, int n)
1910 {
1911         while (n--) seq_printf(seq, " ");
1912 }
1913
1914 static void printbin_seq(struct seq_file *seq, unsigned int v, int bits)
1915 {
1916         while (bits--)
1917                 seq_printf(seq, "%s", (v & (1<<bits))?"1":"0");
1918 }
1919
1920 static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
1921                    int pend, int cindex, int bits)
1922 {
1923         putspace_seq(seq, indent);
1924         if (IS_LEAF(n))
1925                 seq_printf(seq, "|");
1926         else
1927                 seq_printf(seq, "+");
1928         if (bits) {
1929                 seq_printf(seq, "%d/", cindex);
1930                 printbin_seq(seq, cindex, bits);
1931                 seq_printf(seq, ": ");
1932         }
1933         else
1934                 seq_printf(seq, "<root>: ");
1935         seq_printf(seq, "%s:%p ", IS_LEAF(n)?"Leaf":"Internal node", n);
1936
1937         if (IS_LEAF(n))
1938                 seq_printf(seq, "key=%d.%d.%d.%d\n",
1939                            n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256);
1940         else {
1941                 int plen=((struct tnode *)n)->pos;
1942                 t_key prf=MASK_PFX(n->key, plen);
1943                 seq_printf(seq, "key=%d.%d.%d.%d/%d\n",
1944                            prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen);
1945         }
1946         if (IS_LEAF(n)) {
1947                 struct leaf *l=(struct leaf *)n;
1948                 struct fib_alias *fa;
1949                 int i;
1950                 for (i=32; i>=0; i--)
1951                   if(find_leaf_info(&l->list, i)) {
1952
1953                                 struct list_head *fa_head = get_fa_head(l, i);
1954
1955                                 if(!fa_head)
1956                                         continue;
1957
1958                                 if(list_empty(fa_head))
1959                                         continue;
1960
1961                                 putspace_seq(seq, indent+2);
1962                                 seq_printf(seq, "{/%d...dumping}\n", i);
1963
1964
1965                                 list_for_each_entry(fa, fa_head, fa_list) {
1966                                         putspace_seq(seq, indent+2);
1967                                         if (fa->fa_info->fib_nh == NULL) {
1968                                                 seq_printf(seq, "Error _fib_nh=NULL\n");
1969                                                 continue;
1970                                         }
1971                                         if (fa->fa_info == NULL) {
1972                                                 seq_printf(seq, "Error fa_info=NULL\n");
1973                                                 continue;
1974                                         }
1975
1976                                         seq_printf(seq, "{type=%d scope=%d TOS=%d}\n",
1977                                               fa->fa_type,
1978                                               fa->fa_scope,
1979                                               fa->fa_tos);
1980                                 }
1981                         }
1982         }
1983         else if (IS_TNODE(n)) {
1984                 struct tnode *tn=(struct tnode *)n;
1985                 putspace_seq(seq, indent); seq_printf(seq, "|    ");
1986                 seq_printf(seq, "{key prefix=%08x/", tn->key&TKEY_GET_MASK(0, tn->pos));
1987                 printbin_seq(seq, tkey_extract_bits(tn->key, 0, tn->pos), tn->pos);
1988                 seq_printf(seq, "}\n");
1989                 putspace_seq(seq, indent); seq_printf(seq, "|    ");
1990                 seq_printf(seq, "{pos=%d", tn->pos);
1991                 seq_printf(seq, " (skip=%d bits)", tn->pos - pend);
1992                 seq_printf(seq, " bits=%d (%u children)}\n", tn->bits, (1 << tn->bits));
1993                 putspace_seq(seq, indent); seq_printf(seq, "|    ");
1994                 seq_printf(seq, "{empty=%d full=%d}\n", tn->empty_children, tn->full_children);
1995         }
1996 }
1997
1998 static void trie_dump_seq(struct seq_file *seq, struct trie *t)
1999 {
2000         struct node *n=t->trie;
2001         int cindex=0;
2002         int indent=1;
2003         int pend=0;
2004         int depth = 0;
2005
2006         read_lock(&fib_lock);
2007
2008         seq_printf(seq, "------ trie_dump of t=%p ------\n", t);
2009         if (n) {
2010                 printnode_seq(seq, indent, n, pend, cindex, 0);
2011                 if (IS_TNODE(n)) {
2012                         struct tnode *tn=(struct tnode *)n;
2013                         pend = tn->pos+tn->bits;
2014                         putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
2015                         indent += 3;
2016                         depth++;
2017
2018                         while (tn && cindex < (1 << tn->bits)) {
2019                                 if (tn->child[cindex]) {
2020
2021                                         /* Got a child */
2022
2023                                         printnode_seq(seq, indent, tn->child[cindex], pend, cindex, tn->bits);
2024                                         if (IS_LEAF(tn->child[cindex])) {
2025                                                 cindex++;
2026
2027                                         }
2028                                         else {
2029                                                 /*
2030                                                  * New tnode. Decend one level
2031                                                  */
2032
2033                                                 depth++;
2034                                                 n=tn->child[cindex];
2035                                                 tn=(struct tnode *)n;
2036                                                 pend=tn->pos+tn->bits;
2037                                                 putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
2038                                                 indent+=3;
2039                                                 cindex=0;
2040                                         }
2041                                 }
2042                                 else
2043                                         cindex++;
2044
2045                                 /*
2046                                  * Test if we are done
2047                                  */
2048
2049                                 while (cindex >= (1 << tn->bits)) {
2050
2051                                         /*
2052                                          * Move upwards and test for root
2053                                          * pop off all traversed  nodes
2054                                          */
2055
2056                                         if (NODE_PARENT(tn) == NULL) {
2057                                                 tn = NULL;
2058                                                 n = NULL;
2059                                                 break;
2060                                         }
2061                                         else {
2062                                                 cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2063                                                 tn = NODE_PARENT(tn);
2064                                                 cindex++;
2065                                                 n=(struct node *)tn;
2066                                                 pend=tn->pos+tn->bits;
2067                                                 indent-=3;
2068                                                 depth--;
2069                                         }
2070                                 }
2071                         }
2072                 }
2073                 else n = NULL;
2074         }
2075         else seq_printf(seq, "------ trie is empty\n");
2076
2077         read_unlock(&fib_lock);
2078 }
2079
2080 static struct trie_stat *trie_stat_new(void)
2081 {
2082         struct trie_stat *s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL);
2083         int i;
2084
2085         if(s) {
2086                 s->totdepth = 0;
2087                 s->maxdepth = 0;
2088                 s->tnodes = 0;
2089                 s->leaves = 0;
2090                 s->nullpointers = 0;
2091
2092                 for(i=0; i< MAX_CHILDS; i++)
2093                         s->nodesizes[i] = 0;
2094         }
2095         return s;
2096 }
2097
2098 static struct trie_stat *trie_collect_stats(struct trie *t)
2099 {
2100         struct node *n=t->trie;
2101         struct trie_stat *s = trie_stat_new();
2102         int cindex = 0;
2103         int indent = 1;
2104         int pend = 0;
2105         int depth = 0;
2106
2107         read_lock(&fib_lock);
2108
2109         if (s) {
2110                 if (n) {
2111                         if (IS_TNODE(n)) {
2112                                 struct tnode *tn = (struct tnode *)n;
2113                                 pend=tn->pos+tn->bits;
2114                                 indent += 3;
2115                                 s->nodesizes[tn->bits]++;
2116                                 depth++;
2117
2118                                 while (tn && cindex < (1 << tn->bits)) {
2119                                         if (tn->child[cindex]) {
2120                                                 /* Got a child */
2121
2122                                                 if (IS_LEAF(tn->child[cindex])) {
2123                                                         cindex++;
2124
2125                                                         /* stats */
2126                                                         if (depth > s->maxdepth)
2127                                                                 s->maxdepth = depth;
2128                                                         s->totdepth += depth;
2129                                                         s->leaves++;
2130                                                 }
2131
2132                                                 else {
2133                                                         /*
2134                                                          * New tnode. Decend one level
2135                                                          */
2136
2137                                                         s->tnodes++;
2138                                                         s->nodesizes[tn->bits]++;
2139                                                         depth++;
2140
2141                                                         n = tn->child[cindex];
2142                                                         tn = (struct tnode *)n;
2143                                                         pend = tn->pos+tn->bits;
2144
2145                                                         indent += 3;
2146                                                         cindex = 0;
2147                                                 }
2148                                         }
2149                                         else {
2150                                                 cindex++;
2151                                                 s->nullpointers++;
2152                                         }
2153
2154                                         /*
2155                                          * Test if we are done
2156                                          */
2157
2158                                         while (cindex >= (1 << tn->bits)) {
2159
2160                                                 /*
2161                                                  * Move upwards and test for root
2162                                                  * pop off all traversed  nodes
2163                                                  */
2164
2165
2166                                                 if (NODE_PARENT(tn) == NULL) {
2167                                                         tn = NULL;
2168                                                         n = NULL;
2169                                                         break;
2170                                                 }
2171                                                 else {
2172                                                         cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2173                                                         tn = NODE_PARENT(tn);
2174                                                         cindex++;
2175                                                         n = (struct node *)tn;
2176                                                         pend=tn->pos+tn->bits;
2177                                                         indent -= 3;
2178                                                         depth--;
2179                                                 }
2180                                         }
2181                                 }
2182                         }
2183                         else n = NULL;
2184                 }
2185         }
2186
2187         read_unlock(&fib_lock);
2188         return s;
2189 }
2190
2191 #ifdef CONFIG_PROC_FS
2192
2193 static struct fib_alias *fib_triestat_get_first(struct seq_file *seq)
2194 {
2195         return NULL;
2196 }
2197
2198 static struct fib_alias *fib_triestat_get_next(struct seq_file *seq)
2199 {
2200         return NULL;
2201 }
2202
2203 static void *fib_triestat_seq_start(struct seq_file *seq, loff_t *pos)
2204 {
2205         void *v = NULL;
2206
2207         if (ip_fib_main_table)
2208                 v = *pos ? fib_triestat_get_next(seq) : SEQ_START_TOKEN;
2209         return v;
2210 }
2211
2212 static void *fib_triestat_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2213 {
2214         ++*pos;
2215         return v == SEQ_START_TOKEN ? fib_triestat_get_first(seq) : fib_triestat_get_next(seq);
2216 }
2217
2218 static void fib_triestat_seq_stop(struct seq_file *seq, void *v)
2219 {
2220
2221 }
2222
2223 /*
2224  *      This outputs /proc/net/fib_triestats
2225  *
2226  *      It always works in backward compatibility mode.
2227  *      The format of the file is not supposed to be changed.
2228  */
2229
2230 static void collect_and_show(struct trie *t, struct seq_file *seq)
2231 {
2232         int bytes = 0; /* How many bytes are used, a ref is 4 bytes */
2233         int i, max, pointers;
2234         struct trie_stat *stat;
2235         int avdepth;
2236
2237         stat = trie_collect_stats(t);
2238
2239         bytes=0;
2240         seq_printf(seq, "trie=%p\n", t);
2241
2242         if (stat) {
2243                 if (stat->leaves)
2244                         avdepth=stat->totdepth*100 / stat->leaves;
2245                 else
2246                         avdepth=0;
2247                 seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100 );
2248                 seq_printf(seq, "Max depth: %4d\n", stat->maxdepth);
2249
2250                 seq_printf(seq, "Leaves: %d\n", stat->leaves);
2251                 bytes += sizeof(struct leaf) * stat->leaves;
2252                 seq_printf(seq, "Internal nodes: %d\n", stat->tnodes);
2253                 bytes += sizeof(struct tnode) * stat->tnodes;
2254
2255                 max = MAX_CHILDS-1;
2256
2257                 while (max >= 0 && stat->nodesizes[max] == 0)
2258                         max--;
2259                 pointers = 0;
2260
2261                 for (i = 1; i <= max; i++)
2262                         if (stat->nodesizes[i] != 0) {
2263                                 seq_printf(seq, "  %d: %d",  i, stat->nodesizes[i]);
2264                                 pointers += (1<<i) * stat->nodesizes[i];
2265                         }
2266                 seq_printf(seq, "\n");
2267                 seq_printf(seq, "Pointers: %d\n", pointers);
2268                 bytes += sizeof(struct node *) * pointers;
2269                 seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers);
2270                 seq_printf(seq, "Total size: %d  kB\n", bytes / 1024);
2271
2272                 kfree(stat);
2273         }
2274
2275 #ifdef CONFIG_IP_FIB_TRIE_STATS
2276         seq_printf(seq, "Counters:\n---------\n");
2277         seq_printf(seq,"gets = %d\n", t->stats.gets);
2278         seq_printf(seq,"backtracks = %d\n", t->stats.backtrack);
2279         seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
2280         seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
2281         seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
2282 #ifdef CLEAR_STATS
2283         memset(&(t->stats), 0, sizeof(t->stats));
2284 #endif
2285 #endif /*  CONFIG_IP_FIB_TRIE_STATS */
2286 }
2287
2288 static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2289 {
2290         char bf[128];
2291
2292         if (v == SEQ_START_TOKEN) {
2293                 seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
2294                            sizeof(struct leaf), sizeof(struct tnode));
2295                 if (trie_local)
2296                         collect_and_show(trie_local, seq);
2297
2298                 if (trie_main)
2299                         collect_and_show(trie_main, seq);
2300         }
2301         else {
2302                 snprintf(bf, sizeof(bf),
2303                          "*\t%08X\t%08X", 200, 400);
2304
2305                 seq_printf(seq, "%-127s\n", bf);
2306         }
2307         return 0;
2308 }
2309
2310 static struct seq_operations fib_triestat_seq_ops = {
2311         .start  = fib_triestat_seq_start,
2312         .next   = fib_triestat_seq_next,
2313         .stop   = fib_triestat_seq_stop,
2314         .show   = fib_triestat_seq_show,
2315 };
2316
2317 static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2318 {
2319         struct seq_file *seq;
2320         int rc = -ENOMEM;
2321
2322         rc = seq_open(file, &fib_triestat_seq_ops);
2323         if (rc)
2324                 goto out_kfree;
2325
2326         seq          = file->private_data;
2327 out:
2328         return rc;
2329 out_kfree:
2330         goto out;
2331 }
2332
2333 static struct file_operations fib_triestat_seq_fops = {
2334         .owner          = THIS_MODULE,
2335         .open           = fib_triestat_seq_open,
2336         .read           = seq_read,
2337         .llseek         = seq_lseek,
2338         .release        = seq_release_private,
2339 };
2340
2341 int __init fib_stat_proc_init(void)
2342 {
2343         if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_seq_fops))
2344                 return -ENOMEM;
2345         return 0;
2346 }
2347
2348 void __init fib_stat_proc_exit(void)
2349 {
2350         proc_net_remove("fib_triestat");
2351 }
2352
2353 static struct fib_alias *fib_trie_get_first(struct seq_file *seq)
2354 {
2355         return NULL;
2356 }
2357
2358 static struct fib_alias *fib_trie_get_next(struct seq_file *seq)
2359 {
2360         return NULL;
2361 }
2362
2363 static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2364 {
2365         void *v = NULL;
2366
2367         if (ip_fib_main_table)
2368                 v = *pos ? fib_trie_get_next(seq) : SEQ_START_TOKEN;
2369         return v;
2370 }
2371
2372 static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2373 {
2374         ++*pos;
2375         return v == SEQ_START_TOKEN ? fib_trie_get_first(seq) : fib_trie_get_next(seq);
2376 }
2377
2378 static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2379 {
2380
2381 }
2382
2383 /*
2384  *      This outputs /proc/net/fib_trie.
2385  *
2386  *      It always works in backward compatibility mode.
2387  *      The format of the file is not supposed to be changed.
2388  */
2389
2390 static int fib_trie_seq_show(struct seq_file *seq, void *v)
2391 {
2392         char bf[128];
2393
2394         if (v == SEQ_START_TOKEN) {
2395                 if (trie_local)
2396                         trie_dump_seq(seq, trie_local);
2397
2398                 if (trie_main)
2399                         trie_dump_seq(seq, trie_main);
2400         }
2401
2402         else {
2403                 snprintf(bf, sizeof(bf),
2404                          "*\t%08X\t%08X", 200, 400);
2405                 seq_printf(seq, "%-127s\n", bf);
2406         }
2407
2408         return 0;
2409 }
2410
2411 static struct seq_operations fib_trie_seq_ops = {
2412         .start  = fib_trie_seq_start,
2413         .next   = fib_trie_seq_next,
2414         .stop   = fib_trie_seq_stop,
2415         .show   = fib_trie_seq_show,
2416 };
2417
2418 static int fib_trie_seq_open(struct inode *inode, struct file *file)
2419 {
2420         struct seq_file *seq;
2421         int rc = -ENOMEM;
2422
2423         rc = seq_open(file, &fib_trie_seq_ops);
2424         if (rc)
2425                 goto out_kfree;
2426
2427         seq          = file->private_data;
2428 out:
2429         return rc;
2430 out_kfree:
2431         goto out;
2432 }
2433
2434 static struct file_operations fib_trie_seq_fops = {
2435         .owner          = THIS_MODULE,
2436         .open           = fib_trie_seq_open,
2437         .read           = seq_read,
2438         .llseek         = seq_lseek,
2439         .release        = seq_release_private,
2440 };
2441
2442 int __init fib_proc_init(void)
2443 {
2444         if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_seq_fops))
2445                 return -ENOMEM;
2446         return 0;
2447 }
2448
2449 void __init fib_proc_exit(void)
2450 {
2451         proc_net_remove("fib_trie");
2452 }
2453
2454 #endif /* CONFIG_PROC_FS */