net/ipv4/fib_trie.c

   1 /*
   2  *   This program is free software; you can redistribute it and/or
   3  *   modify it under the terms of the GNU General Public License
   4  *   as published by the Free Software Foundation; either version
   5  *   2 of the License, or (at your option) any later version.
   6  *
   7  *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
   8  *     & Swedish University of Agricultural Sciences.
   9  *
  10  *   Jens Laas <jens.laas@data.slu.se> Swedish University of
  11  *     Agricultural Sciences.
  12  *
  13  *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
  14  *
  15  * This work is based on the LPC-trie which is originally descibed in:
  16  *
  17  * An experimental study of compression methods for dynamic tries
  18  * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
  19  * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
  20  *
  21  *
  22  * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
  23  * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
  24  *
  25  * Version:     $Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
  26  *
  27  *
  28  * Code from fib_hash has been reused which includes the following header:
  29  *
  30  *
  31  * INET         An implementation of the TCP/IP protocol suite for the LINUX
  32  *              operating system.  INET is implemented using the  BSD Socket
  33  *              interface as the means of communication with the user level.
  34  *
  35  *              IPv4 FIB: lookup engine and maintenance routines.
  36  *
  37  *
  38  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  39  *
  40  *              This program is free software; you can redistribute it and/or
  41  *              modify it under the terms of the GNU General Public License
  42  *              as published by the Free Software Foundation; either version
  43  *              2 of the License, or (at your option) any later version.
  44  */
  45
  46 #define VERSION "0.325"
  47
  48 #include <linux/config.h>
  49 #include <asm/uaccess.h>
  50 #include <asm/system.h>
  51 #include <asm/bitops.h>
  52 #include <linux/types.h>
  53 #include <linux/kernel.h>
  54 #include <linux/sched.h>
  55 #include <linux/mm.h>
  56 #include <linux/string.h>
  57 #include <linux/socket.h>
  58 #include <linux/sockios.h>
  59 #include <linux/errno.h>
  60 #include <linux/in.h>
  61 #include <linux/inet.h>
  62 #include <linux/netdevice.h>
  63 #include <linux/if_arp.h>
  64 #include <linux/proc_fs.h>
  65 #include <linux/skbuff.h>
  66 #include <linux/netlink.h>
  67 #include <linux/init.h>
  68 #include <linux/list.h>
  69 #include <net/ip.h>
  70 #include <net/protocol.h>
  71 #include <net/route.h>
  72 #include <net/tcp.h>
  73 #include <net/sock.h>
  74 #include <net/ip_fib.h>
  75 #include "fib_lookup.h"
  76
  77 #undef CONFIG_IP_FIB_TRIE_STATS
  78 #define MAX_CHILDS 16384
  79
  80 #define KEYLENGTH (8*sizeof(t_key))
  81 #define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l))
  82 #define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset))
  83
  84 static DEFINE_RWLOCK(fib_lock);
  85
  86 typedef unsigned int t_key;
  87
  88 #define T_TNODE 0
  89 #define T_LEAF  1
  90 #define NODE_TYPE_MASK  0x1UL
  91 #define NODE_PARENT(node) \
  92         ((struct tnode *)((node)->parent & ~NODE_TYPE_MASK))
  93 #define NODE_SET_PARENT(node, ptr) \
  94         ((node)->parent = (((unsigned long)(ptr)) | \
  95                      ((node)->parent & NODE_TYPE_MASK)))
  96 #define NODE_INIT_PARENT(node, type) \
  97         ((node)->parent = (type))
  98 #define NODE_TYPE(node) \
  99         ((node)->parent & NODE_TYPE_MASK)
 100
 101 #define IS_TNODE(n) (!(n->parent & T_LEAF))
 102 #define IS_LEAF(n) (n->parent & T_LEAF)
 103
 104 struct node {
 105         t_key key;
 106         unsigned long parent;
 107 };
 108
 109 struct leaf {
 110         t_key key;
 111         unsigned long parent;
 112         struct hlist_head list;
 113 };
 114
 115 struct leaf_info {
 116         struct hlist_node hlist;
 117         int plen;
 118         struct list_head falh;
 119 };
 120
 121 struct tnode {
 122         t_key key;
 123         unsigned long parent;
 124         unsigned short pos:5;           /* 2log(KEYLENGTH) bits needed */
 125         unsigned short bits:5;          /* 2log(KEYLENGTH) bits needed */
 126         unsigned short full_children;   /* KEYLENGTH bits needed */
 127         unsigned short empty_children;  /* KEYLENGTH bits needed */
 128         struct node *child[0];
 129 };
 130
 131 #ifdef CONFIG_IP_FIB_TRIE_STATS
 132 struct trie_use_stats {
 133         unsigned int gets;
 134         unsigned int backtrack;
 135         unsigned int semantic_match_passed;
 136         unsigned int semantic_match_miss;
 137         unsigned int null_node_hit;
 138         unsigned int resize_node_skipped;
 139 };
 140 #endif
 141
 142 struct trie_stat {
 143         unsigned int totdepth;
 144         unsigned int maxdepth;
 145         unsigned int tnodes;
 146         unsigned int leaves;
 147         unsigned int nullpointers;
 148         unsigned int nodesizes[MAX_CHILDS];
 149 };
 150
 151 struct trie {
 152         struct node *trie;
 153 #ifdef CONFIG_IP_FIB_TRIE_STATS
 154         struct trie_use_stats stats;
 155 #endif
 156         int size;
 157         unsigned int revision;
 158 };
 159
 160 static int trie_debug = 0;
 161
 162 #define DBG(x...) do { if (trie_debug) printk(x); } while (0)
 163
 164 static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
 165 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
 166 static struct node *resize(struct trie *t, struct tnode *tn);
 167 static struct tnode *inflate(struct trie *t, struct tnode *tn);
 168 static struct tnode *halve(struct trie *t, struct tnode *tn);
 169 static void tnode_free(struct tnode *tn);
 170 static void trie_dump_seq(struct seq_file *seq, struct trie *t);
 171 extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio);
 172 extern int fib_detect_death(struct fib_info *fi, int order,
 173                             struct fib_info **last_resort, int *last_idx, int *dflt);
 174
 175 extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa, int z, int tb_id,
 176                       struct nlmsghdr *n, struct netlink_skb_parms *req);
 177
 178 static kmem_cache_t *fn_alias_kmem;
 179 static struct trie *trie_local = NULL, *trie_main = NULL;
 180
 181 static inline struct node *tnode_get_child(struct tnode *tn, int i)
 182 {
 183         BUG_ON(i >= 1 << tn->bits);
 184
 185         return tn->child[i];
 186 }
 187
 188 static inline int tnode_child_length(const struct tnode *tn)
 189 {
 190         return 1 << tn->bits;
 191 }
 192
 193 static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
 194 {
 195         if (offset < KEYLENGTH)
 196                 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
 197         else
 198                 return 0;
 199 }
 200
 201 static inline int tkey_equals(t_key a, t_key b)
 202 {
 203         return a == b;
 204 }
 205
 206 static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
 207 {
 208         if (bits == 0 || offset >= KEYLENGTH)
 209                 return 1;
 210         bits = bits > KEYLENGTH ? KEYLENGTH : bits;
 211         return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
 212 }
 213
 214 static inline int tkey_mismatch(t_key a, int offset, t_key b)
 215 {
 216         t_key diff = a ^ b;
 217         int i = offset;
 218
 219         if (!diff)
 220                 return 0;
 221         while ((diff << i) >> (KEYLENGTH-1) == 0)
 222                 i++;
 223         return i;
 224 }
 225
 226 /* Candidate for fib_semantics */
 227
 228 static void fn_free_alias(struct fib_alias *fa)
 229 {
 230         fib_release_info(fa->fa_info);
 231         kmem_cache_free(fn_alias_kmem, fa);
 232 }
 233
 234 /*
 235   To understand this stuff, an understanding of keys and all their bits is
 236   necessary. Every node in the trie has a key associated with it, but not
 237   all of the bits in that key are significant.
 238
 239   Consider a node 'n' and its parent 'tp'.
 240
 241   If n is a leaf, every bit in its key is significant. Its presence is
 242   necessitaded by path compression, since during a tree traversal (when
 243   searching for a leaf - unless we are doing an insertion) we will completely
 244   ignore all skipped bits we encounter. Thus we need to verify, at the end of
 245   a potentially successful search, that we have indeed been walking the
 246   correct key path.
 247
 248   Note that we can never "miss" the correct key in the tree if present by
 249   following the wrong path. Path compression ensures that segments of the key
 250   that are the same for all keys with a given prefix are skipped, but the
 251   skipped part *is* identical for each node in the subtrie below the skipped
 252   bit! trie_insert() in this implementation takes care of that - note the
 253   call to tkey_sub_equals() in trie_insert().
 254
 255   if n is an internal node - a 'tnode' here, the various parts of its key
 256   have many different meanings.
 257
 258   Example:
 259   _________________________________________________________________
 260   | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
 261   -----------------------------------------------------------------
 262     0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
 263
 264   _________________________________________________________________
 265   | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
 266   -----------------------------------------------------------------
 267    16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
 268
 269   tp->pos = 7
 270   tp->bits = 3
 271   n->pos = 15
 272   n->bits = 4
 273
 274   First, let's just ignore the bits that come before the parent tp, that is
 275   the bits from 0 to (tp->pos-1). They are *known* but at this point we do
 276   not use them for anything.
 277
 278   The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
 279   index into the parent's child array. That is, they will be used to find
 280   'n' among tp's children.
 281
 282   The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
 283   for the node n.
 284
 285   All the bits we have seen so far are significant to the node n. The rest
 286   of the bits are really not needed or indeed known in n->key.
 287
 288   The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
 289   n's child array, and will of course be different for each child.
 290
 291
 292   The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
 293   at this point.
 294
 295 */
 296
 297 static void check_tnode(struct tnode *tn)
 298 {
 299         if (tn && tn->pos+tn->bits > 32) {
 300                 printk("TNODE ERROR tn=%p, pos=%d, bits=%d\n", tn, tn->pos, tn->bits);
 301         }
 302 }
 303
 304 static int halve_threshold = 25;
 305 static int inflate_threshold = 50;
 306
 307 static struct leaf *leaf_new(void)
 308 {
 309         struct leaf *l = kmalloc(sizeof(struct leaf),  GFP_KERNEL);
 310         if (l) {
 311                 NODE_INIT_PARENT(l, T_LEAF);
 312                 INIT_HLIST_HEAD(&l->list);
 313         }
 314         return l;
 315 }
 316
 317 static struct leaf_info *leaf_info_new(int plen)
 318 {
 319         struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
 320
 321         if (!li)
 322                 return NULL;
 323
 324         li->plen = plen;
 325         INIT_LIST_HEAD(&li->falh);
 326
 327         return li;
 328 }
 329
 330 static inline void free_leaf(struct leaf *l)
 331 {
 332         kfree(l);
 333 }
 334
 335 static inline void free_leaf_info(struct leaf_info *li)
 336 {
 337         kfree(li);
 338 }
 339
 340 static struct tnode *tnode_alloc(unsigned int size)
 341 {
 342         if (size <= PAGE_SIZE) {
 343                 return kmalloc(size, GFP_KERNEL);
 344         } else {
 345                 return (struct tnode *)
 346                         __get_free_pages(GFP_KERNEL, get_order(size));
 347         }
 348 }
 349
 350 static void __tnode_free(struct tnode *tn)
 351 {
 352         unsigned int size = sizeof(struct tnode) +
 353                                 (1 << tn->bits) * sizeof(struct node *);
 354
 355         if (size <= PAGE_SIZE)
 356                 kfree(tn);
 357         else
 358                 free_pages((unsigned long)tn, get_order(size));
 359 }
 360
 361 static struct tnode* tnode_new(t_key key, int pos, int bits)
 362 {
 363         int nchildren = 1<<bits;
 364         int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
 365         struct tnode *tn = tnode_alloc(sz);
 366
 367         if (tn) {
 368                 memset(tn, 0, sz);
 369                 NODE_INIT_PARENT(tn, T_TNODE);
 370                 tn->pos = pos;
 371                 tn->bits = bits;
 372                 tn->key = key;
 373                 tn->full_children = 0;
 374                 tn->empty_children = 1<<bits;
 375         }
 376
 377         DBG("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
 378                (unsigned int) (sizeof(struct node) * 1<<bits));
 379         return tn;
 380 }
 381
 382 static void tnode_free(struct tnode *tn)
 383 {
 384         BUG_ON(!tn);
 385
 386         if (IS_LEAF(tn)) {
 387                 free_leaf((struct leaf *)tn);
 388                 DBG("FL %p \n", tn);
 389         } else {
 390                 __tnode_free(tn);
 391                 DBG("FT %p \n", tn);
 392         }
 393 }
 394
 395 /*
 396  * Check whether a tnode 'n' is "full", i.e. it is an internal node
 397  * and no bits are skipped. See discussion in dyntree paper p. 6
 398  */
 399
 400 static inline int tnode_full(const struct tnode *tn, const struct node *n)
 401 {
 402         if (n == NULL || IS_LEAF(n))
 403                 return 0;
 404
 405         return ((struct tnode *) n)->pos == tn->pos + tn->bits;
 406 }
 407
 408 static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
 409 {
 410         tnode_put_child_reorg(tn, i, n, -1);
 411 }
 412
 413  /*
 414   * Add a child at position i overwriting the old value.
 415   * Update the value of full_children and empty_children.
 416   */
 417
 418 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
 419 {
 420         struct node *chi;
 421         int isfull;
 422
 423         if (i >= 1<<tn->bits) {
 424                 printk("bits=%d, i=%d\n", tn->bits, i);
 425                 BUG();
 426         }
 427         write_lock_bh(&fib_lock);
 428         chi = tn->child[i];
 429
 430         /* update emptyChildren */
 431         if (n == NULL && chi != NULL)
 432                 tn->empty_children++;
 433         else if (n != NULL && chi == NULL)
 434                 tn->empty_children--;
 435
 436         /* update fullChildren */
 437         if (wasfull == -1)
 438                 wasfull = tnode_full(tn, chi);
 439
 440         isfull = tnode_full(tn, n);
 441         if (wasfull && !isfull)
 442                 tn->full_children--;
 443         else if (!wasfull && isfull)
 444                 tn->full_children++;
 445
 446         if (n)
 447                 NODE_SET_PARENT(n, tn);
 448
 449         tn->child[i] = n;
 450         write_unlock_bh(&fib_lock);
 451 }
 452
 453 static struct node *resize(struct trie *t, struct tnode *tn)
 454 {
 455         int i;
 456         int err = 0;
 457         struct tnode *old_tn;
 458
 459         if (!tn)
 460                 return NULL;
 461
 462         DBG("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
 463               tn, inflate_threshold, halve_threshold);
 464
 465         /* No children */
 466         if (tn->empty_children == tnode_child_length(tn)) {
 467                 tnode_free(tn);
 468                 return NULL;
 469         }
 470         /* One child */
 471         if (tn->empty_children == tnode_child_length(tn) - 1)
 472                 for (i = 0; i < tnode_child_length(tn); i++) {
 473                         struct node *n;
 474
 475                         write_lock_bh(&fib_lock);
 476                         n = tn->child[i];
 477                         if (!n) {
 478                                 write_unlock_bh(&fib_lock);
 479                                 continue;
 480                         }
 481
 482                         /* compress one level */
 483                         NODE_INIT_PARENT(n, NODE_TYPE(n));
 484
 485                         write_unlock_bh(&fib_lock);
 486                         tnode_free(tn);
 487                         return n;
 488                 }
 489         /*
 490          * Double as long as the resulting node has a number of
 491          * nonempty nodes that are above the threshold.
 492          */
 493
 494         /*
 495          * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
 496          * the Helsinki University of Technology and Matti Tikkanen of Nokia
 497          * Telecommunications, page 6:
 498          * "A node is doubled if the ratio of non-empty children to all
 499          * children in the *doubled* node is at least 'high'."
 500          *
 501          * 'high' in this instance is the variable 'inflate_threshold'. It
 502          * is expressed as a percentage, so we multiply it with
 503          * tnode_child_length() and instead of multiplying by 2 (since the
 504          * child array will be doubled by inflate()) and multiplying
 505          * the left-hand side by 100 (to handle the percentage thing) we
 506          * multiply the left-hand side by 50.
 507          *
 508          * The left-hand side may look a bit weird: tnode_child_length(tn)
 509          * - tn->empty_children is of course the number of non-null children
 510          * in the current node. tn->full_children is the number of "full"
 511          * children, that is non-null tnodes with a skip value of 0.
 512          * All of those will be doubled in the resulting inflated tnode, so
 513          * we just count them one extra time here.
 514          *
 515          * A clearer way to write this would be:
 516          *
 517          * to_be_doubled = tn->full_children;
 518          * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
 519          *     tn->full_children;
 520          *
 521          * new_child_length = tnode_child_length(tn) * 2;
 522          *
 523          * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
 524          *      new_child_length;
 525          * if (new_fill_factor >= inflate_threshold)
 526          *
 527          * ...and so on, tho it would mess up the while () loop.
 528          *
 529          * anyway,
 530          * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
 531          *      inflate_threshold
 532          *
 533          * avoid a division:
 534          * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
 535          *      inflate_threshold * new_child_length
 536          *
 537          * expand not_to_be_doubled and to_be_doubled, and shorten:
 538          * 100 * (tnode_child_length(tn) - tn->empty_children +
 539          *    tn->full_children) >= inflate_threshold * new_child_length
 540          *
 541          * expand new_child_length:
 542          * 100 * (tnode_child_length(tn) - tn->empty_children +
 543          *    tn->full_children) >=
 544          *      inflate_threshold * tnode_child_length(tn) * 2
 545          *
 546          * shorten again:
 547          * 50 * (tn->full_children + tnode_child_length(tn) -
 548          *    tn->empty_children) >= inflate_threshold *
 549          *    tnode_child_length(tn)
 550          *
 551          */
 552
 553         check_tnode(tn);
 554
 555         err = 0;
 556         while ((tn->full_children > 0 &&
 557                50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
 558                                 inflate_threshold * tnode_child_length(tn))) {
 559
 560                 old_tn = tn;
 561                 tn = inflate(t, tn);
 562                 if (IS_ERR(tn)) {
 563                         tn = old_tn;
 564 #ifdef CONFIG_IP_FIB_TRIE_STATS
 565                         t->stats.resize_node_skipped++;
 566 #endif
 567                         break;
 568                 }
 569         }
 570
 571         check_tnode(tn);
 572
 573         /*
 574          * Halve as long as the number of empty children in this
 575          * node is above threshold.
 576          */
 577
 578         err = 0;
 579         while (tn->bits > 1 &&
 580                100 * (tnode_child_length(tn) - tn->empty_children) <
 581                halve_threshold * tnode_child_length(tn)) {
 582
 583                 old_tn = tn;
 584                 tn = halve(t, tn);
 585                 if (IS_ERR(tn)) {
 586                         tn = old_tn;
 587 #ifdef CONFIG_IP_FIB_TRIE_STATS
 588                         t->stats.resize_node_skipped++;
 589 #endif
 590                         break;
 591                 }
 592         }
 593
 594
 595         /* Only one child remains */
 596
 597         if (tn->empty_children == tnode_child_length(tn) - 1)
 598                 for (i = 0; i < tnode_child_length(tn); i++) {
 599                         struct node *n;
 600
 601                         write_lock_bh(&fib_lock);
 602
 603                         n = tn->child[i];
 604                         if (!n) {
 605                                 write_unlock_bh(&fib_lock);
 606                                 continue;
 607                         }
 608
 609                         /* compress one level */
 610
 611                         NODE_INIT_PARENT(n, NODE_TYPE(n));
 612
 613                         write_unlock_bh(&fib_lock);
 614                         tnode_free(tn);
 615                         return n;
 616                 }
 617
 618         return (struct node *) tn;
 619 }
 620
 621 static struct tnode *inflate(struct trie *t, struct tnode *tn)
 622 {
 623         struct tnode *inode;
 624         struct tnode *oldtnode = tn;
 625         int olen = tnode_child_length(tn);
 626         int i;
 627
 628         DBG("In inflate\n");
 629
 630         tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
 631
 632         if (!tn)
 633                 return ERR_PTR(-ENOMEM);
 634
 635         /*
 636          * Preallocate and store tnodes before the actual work so we
 637          * don't get into an inconsistent state if memory allocation
 638          * fails. In case of failure we return the oldnode and  inflate
 639          * of tnode is ignored.
 640          */
 641
 642         for (i = 0; i < olen; i++) {
 643                 struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
 644
 645                 if (inode &&
 646                     IS_TNODE(inode) &&
 647                     inode->pos == oldtnode->pos + oldtnode->bits &&
 648                     inode->bits > 1) {
 649                         struct tnode *left, *right;
 650                         t_key m = TKEY_GET_MASK(inode->pos, 1);
 651
 652                         left = tnode_new(inode->key&(~m), inode->pos + 1,
 653                                          inode->bits - 1);
 654                         if (!left)
 655                                 goto nomem;
 656
 657                         right = tnode_new(inode->key|m, inode->pos + 1,
 658                                           inode->bits - 1);
 659
 660                         if (!right) {
 661                                 tnode_free(left);
 662                                 goto nomem;
 663                         }
 664
 665                         put_child(t, tn, 2*i, (struct node *) left);
 666                         put_child(t, tn, 2*i+1, (struct node *) right);
 667                 }
 668         }
 669
 670         for (i = 0; i < olen; i++) {
 671                 struct node *node = tnode_get_child(oldtnode, i);
 672                 struct tnode *left, *right;
 673                 int size, j;
 674
 675                 /* An empty child */
 676                 if (node == NULL)
 677                         continue;
 678
 679                 /* A leaf or an internal node with skipped bits */
 680
 681                 if (IS_LEAF(node) || ((struct tnode *) node)->pos >
 682                    tn->pos + tn->bits - 1) {
 683                         if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
 684                                              1) == 0)
 685                                 put_child(t, tn, 2*i, node);
 686                         else
 687                                 put_child(t, tn, 2*i+1, node);
 688                         continue;
 689                 }
 690
 691                 /* An internal node with two children */
 692                 inode = (struct tnode *) node;
 693
 694                 if (inode->bits == 1) {
 695                         put_child(t, tn, 2*i, inode->child[0]);
 696                         put_child(t, tn, 2*i+1, inode->child[1]);
 697
 698                         tnode_free(inode);
 699                         continue;
 700                 }
 701
 702                 /* An internal node with more than two children */
 703
 704                 /* We will replace this node 'inode' with two new
 705                  * ones, 'left' and 'right', each with half of the
 706                  * original children. The two new nodes will have
 707                  * a position one bit further down the key and this
 708                  * means that the "significant" part of their keys
 709                  * (see the discussion near the top of this file)
 710                  * will differ by one bit, which will be "0" in
 711                  * left's key and "1" in right's key. Since we are
 712                  * moving the key position by one step, the bit that
 713                  * we are moving away from - the bit at position
 714                  * (inode->pos) - is the one that will differ between
 715                  * left and right. So... we synthesize that bit in the
 716                  * two  new keys.
 717                  * The mask 'm' below will be a single "one" bit at
 718                  * the position (inode->pos)
 719                  */
 720
 721                 /* Use the old key, but set the new significant
 722                  *   bit to zero.
 723                  */
 724
 725                 left = (struct tnode *) tnode_get_child(tn, 2*i);
 726                 put_child(t, tn, 2*i, NULL);
 727
 728                 BUG_ON(!left);
 729
 730                 right = (struct tnode *) tnode_get_child(tn, 2*i+1);
 731                 put_child(t, tn, 2*i+1, NULL);
 732
 733                 BUG_ON(!right);
 734
 735                 size = tnode_child_length(left);
 736                 for (j = 0; j < size; j++) {
 737                         put_child(t, left, j, inode->child[j]);
 738                         put_child(t, right, j, inode->child[j + size]);
 739                 }
 740                 put_child(t, tn, 2*i, resize(t, left));
 741                 put_child(t, tn, 2*i+1, resize(t, right));
 742
 743                 tnode_free(inode);
 744         }
 745         tnode_free(oldtnode);
 746         return tn;
 747 nomem:
 748         {
 749                 int size = tnode_child_length(tn);
 750                 int j;
 751
 752                 for(j = 0; j < size; j++)
 753                         if (tn->child[j])
 754                                 tnode_free((struct tnode *)tn->child[j]);
 755
 756                 tnode_free(tn);
 757
 758                 return ERR_PTR(-ENOMEM);
 759         }
 760 }
 761
 762 static struct tnode *halve(struct trie *t, struct tnode *tn)
 763 {
 764         struct tnode *oldtnode = tn;
 765         struct node *left, *right;
 766         int i;
 767         int olen = tnode_child_length(tn);
 768
 769         DBG("In halve\n");
 770
 771         tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
 772
 773         if (!tn)
 774                 return ERR_PTR(-ENOMEM);
 775
 776         /*
 777          * Preallocate and store tnodes before the actual work so we
 778          * don't get into an inconsistent state if memory allocation
 779          * fails. In case of failure we return the oldnode and halve
 780          * of tnode is ignored.
 781          */
 782
 783         for (i = 0; i < olen; i += 2) {
 784                 left = tnode_get_child(oldtnode, i);
 785                 right = tnode_get_child(oldtnode, i+1);
 786
 787                 /* Two nonempty children */
 788                 if (left && right)  {
 789                         struct tnode *newn;
 790
 791                         newn = tnode_new(left->key, tn->pos + tn->bits, 1);
 792
 793                         if (!newn)
 794                                 goto nomem;
 795
 796                         put_child(t, tn, i/2, (struct node *)newn);
 797                 }
 798
 799         }
 800
 801         for (i = 0; i < olen; i += 2) {
 802                 struct tnode *newBinNode;
 803
 804                 left = tnode_get_child(oldtnode, i);
 805                 right = tnode_get_child(oldtnode, i+1);
 806
 807                 /* At least one of the children is empty */
 808                 if (left == NULL) {
 809                         if (right == NULL)    /* Both are empty */
 810                                 continue;
 811                         put_child(t, tn, i/2, right);
 812                         continue;
 813                 }
 814
 815                 if (right == NULL) {
 816                         put_child(t, tn, i/2, left);
 817                         continue;
 818                 }
 819
 820                 /* Two nonempty children */
 821                 newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
 822                 put_child(t, tn, i/2, NULL);
 823
 824                 BUG_ON(!newBinNode);
 825
 826                 put_child(t, newBinNode, 0, left);
 827                 put_child(t, newBinNode, 1, right);
 828                 put_child(t, tn, i/2, resize(t, newBinNode));
 829         }
 830         tnode_free(oldtnode);
 831         return tn;
 832 nomem:
 833         {
 834                 int size = tnode_child_length(tn);
 835                 int j;
 836
 837                 for(j = 0; j < size; j++)
 838                         if (tn->child[j])
 839                                 tnode_free((struct tnode *)tn->child[j]);
 840
 841                 tnode_free(tn);
 842
 843                 return ERR_PTR(-ENOMEM);
 844         }
 845 }
 846
 847 static void trie_init(struct trie *t)
 848 {
 849         if (!t)
 850                 return;
 851
 852         t->size = 0;
 853         t->trie = NULL;
 854         t->revision = 0;
 855 #ifdef CONFIG_IP_FIB_TRIE_STATS
 856         memset(&t->stats, 0, sizeof(struct trie_use_stats));
 857 #endif
 858 }
 859
 860 static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen)
 861 {
 862         struct hlist_node *node;
 863         struct leaf_info *li;
 864
 865         hlist_for_each_entry(li, node, head, hlist)
 866                 if (li->plen == plen)
 867                         return li;
 868
 869         return NULL;
 870 }
 871
 872 static inline struct list_head * get_fa_head(struct leaf *l, int plen)
 873 {
 874         struct leaf_info *li = find_leaf_info(&l->list, plen);
 875
 876         if (!li)
 877                 return NULL;
 878
 879         return &li->falh;
 880 }
 881
 882 static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
 883 {
 884         struct leaf_info *li = NULL, *last = NULL;
 885         struct hlist_node *node;
 886
 887         write_lock_bh(&fib_lock);
 888
 889         if (hlist_empty(head)) {
 890                 hlist_add_head(&new->hlist, head);
 891         } else {
 892                 hlist_for_each_entry(li, node, head, hlist) {
 893                         if (new->plen > li->plen)
 894                                 break;
 895
 896                         last = li;
 897                 }
 898                 if (last)
 899                         hlist_add_after(&last->hlist, &new->hlist);
 900                 else
 901                         hlist_add_before(&new->hlist, &li->hlist);
 902         }
 903         write_unlock_bh(&fib_lock);
 904 }
 905
 906 static struct leaf *
 907 fib_find_node(struct trie *t, u32 key)
 908 {
 909         int pos;
 910         struct tnode *tn;
 911         struct node *n;
 912
 913         pos = 0;
 914         n = t->trie;
 915
 916         while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
 917                 tn = (struct tnode *) n;
 918
 919                 check_tnode(tn);
 920
 921                 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
 922                         pos = tn->pos + tn->bits;
 923                         n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
 924                 } else
 925                         break;
 926         }
 927         /* Case we have found a leaf. Compare prefixes */
 928
 929         if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
 930                 return (struct leaf *)n;
 931
 932         return NULL;
 933 }
 934
 935 static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
 936 {
 937         int i;
 938         int wasfull;
 939         t_key cindex, key;
 940         struct tnode *tp = NULL;
 941
 942         BUG_ON(!tn);
 943
 944         key = tn->key;
 945         i = 0;
 946
 947         while (tn != NULL && NODE_PARENT(tn) != NULL) {
 948                 if (i > 10) {
 949                         printk("Rebalance tn=%p \n", tn);
 950                         if (tn)
 951                                 printk("tn->parent=%p \n", NODE_PARENT(tn));
 952
 953                         printk("Rebalance tp=%p \n", tp);
 954                         if (tp)
 955                                 printk("tp->parent=%p \n", NODE_PARENT(tp));
 956                 }
 957
 958                 BUG_ON(i > 12); /* Why is this a bug? -ojn */
 959                 i++;
 960
 961                 tp = NODE_PARENT(tn);
 962                 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
 963                 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
 964                 tn = (struct tnode *) resize (t, (struct tnode *)tn);
 965                 tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
 966
 967                 if (!NODE_PARENT(tn))
 968                         break;
 969
 970                 tn = NODE_PARENT(tn);
 971         }
 972         /* Handle last (top) tnode */
 973         if (IS_TNODE(tn))
 974                 tn = (struct tnode*) resize(t, (struct tnode *)tn);
 975
 976         return (struct node*) tn;
 977 }
 978
 979 static  struct list_head *
 980 fib_insert_node(struct trie *t, int *err, u32 key, int plen)
 981 {
 982         int pos, newpos;
 983         struct tnode *tp = NULL, *tn = NULL;
 984         struct node *n;
 985         struct leaf *l;
 986         int missbit;
 987         struct list_head *fa_head = NULL;
 988         struct leaf_info *li;
 989         t_key cindex;
 990
 991         pos = 0;
 992         n = t->trie;
 993
 994         /* If we point to NULL, stop. Either the tree is empty and we should
 995          * just put a new leaf in if, or we have reached an empty child slot,
 996          * and we should just put our new leaf in that.
 997          * If we point to a T_TNODE, check if it matches our key. Note that
 998          * a T_TNODE might be skipping any number of bits - its 'pos' need
 999          * not be the parent's 'pos'+'bits'!
1000          *
1001          * If it does match the current key, get pos/bits from it, extract
1002          * the index from our key, push the T_TNODE and walk the tree.
1003          *
1004          * If it doesn't, we have to replace it with a new T_TNODE.
1005          *
1006          * If we point to a T_LEAF, it might or might not have the same key
1007          * as we do. If it does, just change the value, update the T_LEAF's
1008          * value, and return it.
1009          * If it doesn't, we need to replace it with a T_TNODE.
1010          */
1011
1012         while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
1013                 tn = (struct tnode *) n;
1014
1015                 check_tnode(tn);
1016
1017                 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1018                         tp = tn;
1019                         pos = tn->pos + tn->bits;
1020                         n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
1021
1022                         if (n && NODE_PARENT(n) != tn) {
1023                                 printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
1024                                 BUG();
1025                         }
1026                 } else
1027                         break;
1028         }
1029
1030         /*
1031          * n  ----> NULL, LEAF or TNODE
1032          *
1033          * tp is n's (parent) ----> NULL or TNODE
1034          */
1035
1036         BUG_ON(tp && IS_LEAF(tp));
1037
1038         /* Case 1: n is a leaf. Compare prefixes */
1039
1040         if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1041                 struct leaf *l = (struct leaf *) n;
1042
1043                 li = leaf_info_new(plen);
1044
1045                 if (!li) {
1046                         *err = -ENOMEM;
1047                         goto err;
1048                 }
1049
1050                 fa_head = &li->falh;
1051                 insert_leaf_info(&l->list, li);
1052                 goto done;
1053         }
1054         t->size++;
1055         l = leaf_new();
1056
1057         if (!l) {
1058                 *err = -ENOMEM;
1059                 goto err;
1060         }
1061
1062         l->key = key;
1063         li = leaf_info_new(plen);
1064
1065         if (!li) {
1066                 tnode_free((struct tnode *) l);
1067                 *err = -ENOMEM;
1068                 goto err;
1069         }
1070
1071         fa_head = &li->falh;
1072         insert_leaf_info(&l->list, li);
1073
1074         if (t->trie && n == NULL) {
1075                 /* Case 2: n is NULL, and will just insert a new leaf */
1076
1077                 NODE_SET_PARENT(l, tp);
1078
1079                 BUG_ON(!tp);
1080
1081                 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1082                 put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1083         } else {
1084                 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1085                 /*
1086                  *  Add a new tnode here
1087                  *  first tnode need some special handling
1088                  */
1089
1090                 if (tp)
1091                         pos = tp->pos+tp->bits;
1092                 else
1093                         pos = 0;
1094
1095                 if (n) {
1096                         newpos = tkey_mismatch(key, pos, n->key);
1097                         tn = tnode_new(n->key, newpos, 1);
1098                 } else {
1099                         newpos = 0;
1100                         tn = tnode_new(key, newpos, 1); /* First tnode */
1101                 }
1102
1103                 if (!tn) {
1104                         free_leaf_info(li);
1105                         tnode_free((struct tnode *) l);
1106                         *err = -ENOMEM;
1107                         goto err;
1108                 }
1109
1110                 NODE_SET_PARENT(tn, tp);
1111
1112                 missbit = tkey_extract_bits(key, newpos, 1);
1113                 put_child(t, tn, missbit, (struct node *)l);
1114                 put_child(t, tn, 1-missbit, n);
1115
1116                 if (tp) {
1117                         cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1118                         put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
1119                 } else {
1120                         t->trie = (struct node*) tn; /* First tnode */
1121                         tp = tn;
1122                 }
1123         }
1124
1125         if (tp && tp->pos + tp->bits > 32)
1126                 printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1127                        tp, tp->pos, tp->bits, key, plen);
1128
1129         /* Rebalance the trie */
1130         t->trie = trie_rebalance(t, tp);
1131 done:
1132         t->revision++;
1133 err:
1134         return fa_head;
1135 }
1136
1137 static int
1138 fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1139                struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1140 {
1141         struct trie *t = (struct trie *) tb->tb_data;
1142         struct fib_alias *fa, *new_fa;
1143         struct list_head *fa_head = NULL;
1144         struct fib_info *fi;
1145         int plen = r->rtm_dst_len;
1146         int type = r->rtm_type;
1147         u8 tos = r->rtm_tos;
1148         u32 key, mask;
1149         int err;
1150         struct leaf *l;
1151
1152         if (plen > 32)
1153                 return -EINVAL;
1154
1155         key = 0;
1156         if (rta->rta_dst)
1157                 memcpy(&key, rta->rta_dst, 4);
1158
1159         key = ntohl(key);
1160
1161         DBG("Insert table=%d %08x/%d\n", tb->tb_id, key, plen);
1162
1163         mask = ntohl(inet_make_mask(plen));
1164
1165         if (key & ~mask)
1166                 return -EINVAL;
1167
1168         key = key & mask;
1169
1170         fi = fib_create_info(r, rta, nlhdr, &err);
1171
1172         if (!fi)
1173                 goto err;
1174
1175         l = fib_find_node(t, key);
1176         fa = NULL;
1177
1178         if (l) {
1179                 fa_head = get_fa_head(l, plen);
1180                 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1181         }
1182
1183         /* Now fa, if non-NULL, points to the first fib alias
1184          * with the same keys [prefix,tos,priority], if such key already
1185          * exists or to the node before which we will insert new one.
1186          *
1187          * If fa is NULL, we will need to allocate a new one and
1188          * insert to the head of f.
1189          *
1190          * If f is NULL, no fib node matched the destination key
1191          * and we need to allocate a new one of those as well.
1192          */
1193
1194         if (fa && fa->fa_info->fib_priority == fi->fib_priority) {
1195                 struct fib_alias *fa_orig;
1196
1197                 err = -EEXIST;
1198                 if (nlhdr->nlmsg_flags & NLM_F_EXCL)
1199                         goto out;
1200
1201                 if (nlhdr->nlmsg_flags & NLM_F_REPLACE) {
1202                         struct fib_info *fi_drop;
1203                         u8 state;
1204
1205                         write_lock_bh(&fib_lock);
1206
1207                         fi_drop = fa->fa_info;
1208                         fa->fa_info = fi;
1209                         fa->fa_type = type;
1210                         fa->fa_scope = r->rtm_scope;
1211                         state = fa->fa_state;
1212                         fa->fa_state &= ~FA_S_ACCESSED;
1213
1214                         write_unlock_bh(&fib_lock);
1215
1216                         fib_release_info(fi_drop);
1217                         if (state & FA_S_ACCESSED)
1218                                 rt_cache_flush(-1);
1219
1220                         goto succeeded;
1221                 }
1222                 /* Error if we find a perfect match which
1223                  * uses the same scope, type, and nexthop
1224                  * information.
1225                  */
1226                 fa_orig = fa;
1227                 list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
1228                         if (fa->fa_tos != tos)
1229                                 break;
1230                         if (fa->fa_info->fib_priority != fi->fib_priority)
1231                                 break;
1232                         if (fa->fa_type == type &&
1233                             fa->fa_scope == r->rtm_scope &&
1234                             fa->fa_info == fi) {
1235                                 goto out;
1236                         }
1237                 }
1238                 if (!(nlhdr->nlmsg_flags & NLM_F_APPEND))
1239                         fa = fa_orig;
1240         }
1241         err = -ENOENT;
1242         if (!(nlhdr->nlmsg_flags & NLM_F_CREATE))
1243                 goto out;
1244
1245         err = -ENOBUFS;
1246         new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
1247         if (new_fa == NULL)
1248                 goto out;
1249
1250         new_fa->fa_info = fi;
1251         new_fa->fa_tos = tos;
1252         new_fa->fa_type = type;
1253         new_fa->fa_scope = r->rtm_scope;
1254         new_fa->fa_state = 0;
1255         /*
1256          * Insert new entry to the list.
1257          */
1258
1259         if (!fa_head) {
1260                 fa_head = fib_insert_node(t, &err, key, plen);
1261                 err = 0;
1262                 if (err)
1263                         goto out_free_new_fa;
1264         }
1265
1266         write_lock_bh(&fib_lock);
1267
1268         list_add_tail(&new_fa->fa_list, (fa ? &fa->fa_list : fa_head));
1269
1270         write_unlock_bh(&fib_lock);
1271
1272         rt_cache_flush(-1);
1273         rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req);
1274 succeeded:
1275         return 0;
1276
1277 out_free_new_fa:
1278         kmem_cache_free(fn_alias_kmem, new_fa);
1279 out:
1280         fib_release_info(fi);
1281 err:
1282         return err;
1283 }
1284
1285 static inline int check_leaf(struct trie *t, struct leaf *l,  t_key key, int *plen, const struct flowi *flp,
1286                              struct fib_result *res)
1287 {
1288         int err, i;
1289         t_key mask;
1290         struct leaf_info *li;
1291         struct hlist_head *hhead = &l->list;
1292         struct hlist_node *node;
1293
1294         hlist_for_each_entry(li, node, hhead, hlist) {
1295                 i = li->plen;
1296                 mask = ntohl(inet_make_mask(i));
1297                 if (l->key != (key & mask))
1298                         continue;
1299
1300                 if ((err = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) <= 0) {
1301                         *plen = i;
1302 #ifdef CONFIG_IP_FIB_TRIE_STATS
1303                         t->stats.semantic_match_passed++;
1304 #endif
1305                         return err;
1306                 }
1307 #ifdef CONFIG_IP_FIB_TRIE_STATS
1308                 t->stats.semantic_match_miss++;
1309 #endif
1310         }
1311         return 1;
1312 }
1313
1314 static int
1315 fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1316 {
1317         struct trie *t = (struct trie *) tb->tb_data;
1318         int plen, ret = 0;
1319         struct node *n;
1320         struct tnode *pn;
1321         int pos, bits;
1322         t_key key = ntohl(flp->fl4_dst);
1323         int chopped_off;
1324         t_key cindex = 0;
1325         int current_prefix_length = KEYLENGTH;
1326         struct tnode *cn;
1327         t_key node_prefix, key_prefix, pref_mismatch;
1328         int mp;
1329
1330         n = t->trie;
1331
1332         read_lock(&fib_lock);
1333
1334         if (!n)
1335                 goto failed;
1336
1337 #ifdef CONFIG_IP_FIB_TRIE_STATS
1338         t->stats.gets++;
1339 #endif
1340
1341         /* Just a leaf? */
1342         if (IS_LEAF(n)) {
1343                 if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1344                         goto found;
1345                 goto failed;
1346         }
1347         pn = (struct tnode *) n;
1348         chopped_off = 0;
1349
1350         while (pn) {
1351                 pos = pn->pos;
1352                 bits = pn->bits;
1353
1354                 if (!chopped_off)
1355                         cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits);
1356
1357                 n = tnode_get_child(pn, cindex);
1358
1359                 if (n == NULL) {
1360 #ifdef CONFIG_IP_FIB_TRIE_STATS
1361                         t->stats.null_node_hit++;
1362 #endif
1363                         goto backtrace;
1364                 }
1365
1366                 if (IS_LEAF(n)) {
1367                         if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1368                                 goto found;
1369                         else
1370                                 goto backtrace;
1371                 }
1372
1373 #define HL_OPTIMIZE
1374 #ifdef HL_OPTIMIZE
1375                 cn = (struct tnode *)n;
1376
1377                 /*
1378                  * It's a tnode, and we can do some extra checks here if we
1379                  * like, to avoid descending into a dead-end branch.
1380                  * This tnode is in the parent's child array at index
1381                  * key[p_pos..p_pos+p_bits] but potentially with some bits
1382                  * chopped off, so in reality the index may be just a
1383                  * subprefix, padded with zero at the end.
1384                  * We can also take a look at any skipped bits in this
1385                  * tnode - everything up to p_pos is supposed to be ok,
1386                  * and the non-chopped bits of the index (se previous
1387                  * paragraph) are also guaranteed ok, but the rest is
1388                  * considered unknown.
1389                  *
1390                  * The skipped bits are key[pos+bits..cn->pos].
1391                  */
1392
1393                 /* If current_prefix_length < pos+bits, we are already doing
1394                  * actual prefix  matching, which means everything from
1395                  * pos+(bits-chopped_off) onward must be zero along some
1396                  * branch of this subtree - otherwise there is *no* valid
1397                  * prefix present. Here we can only check the skipped
1398                  * bits. Remember, since we have already indexed into the
1399                  * parent's child array, we know that the bits we chopped of
1400                  * *are* zero.
1401                  */
1402
1403                 /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
1404
1405                 if (current_prefix_length < pos+bits) {
1406                         if (tkey_extract_bits(cn->key, current_prefix_length,
1407                                                 cn->pos - current_prefix_length) != 0 ||
1408                             !(cn->child[0]))
1409                                 goto backtrace;
1410                 }
1411
1412                 /*
1413                  * If chopped_off=0, the index is fully validated and we
1414                  * only need to look at the skipped bits for this, the new,
1415                  * tnode. What we actually want to do is to find out if
1416                  * these skipped bits match our key perfectly, or if we will
1417                  * have to count on finding a matching prefix further down,
1418                  * because if we do, we would like to have some way of
1419                  * verifying the existence of such a prefix at this point.
1420                  */
1421
1422                 /* The only thing we can do at this point is to verify that
1423                  * any such matching prefix can indeed be a prefix to our
1424                  * key, and if the bits in the node we are inspecting that
1425                  * do not match our key are not ZERO, this cannot be true.
1426                  * Thus, find out where there is a mismatch (before cn->pos)
1427                  * and verify that all the mismatching bits are zero in the
1428                  * new tnode's key.
1429                  */
1430
1431                 /* Note: We aren't very concerned about the piece of the key
1432                  * that precede pn->pos+pn->bits, since these have already been
1433                  * checked. The bits after cn->pos aren't checked since these are
1434                  * by definition "unknown" at this point. Thus, what we want to
1435                  * see is if we are about to enter the "prefix matching" state,
1436                  * and in that case verify that the skipped bits that will prevail
1437                  * throughout this subtree are zero, as they have to be if we are
1438                  * to find a matching prefix.
1439                  */
1440
1441                 node_prefix = MASK_PFX(cn->key, cn->pos);
1442                 key_prefix = MASK_PFX(key, cn->pos);
1443                 pref_mismatch = key_prefix^node_prefix;
1444                 mp = 0;
1445
1446                 /* In short: If skipped bits in this node do not match the search
1447                  * key, enter the "prefix matching" state.directly.
1448                  */
1449                 if (pref_mismatch) {
1450                         while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1451                                 mp++;
1452                                 pref_mismatch = pref_mismatch <<1;
1453                         }
1454                         key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1455
1456                         if (key_prefix != 0)
1457                                 goto backtrace;
1458
1459                         if (current_prefix_length >= cn->pos)
1460                                 current_prefix_length = mp;
1461                 }
1462 #endif
1463                 pn = (struct tnode *)n; /* Descend */
1464                 chopped_off = 0;
1465                 continue;
1466
1467 backtrace:
1468                 chopped_off++;
1469
1470                 /* As zero don't change the child key (cindex) */
1471                 while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1))))
1472                         chopped_off++;
1473
1474                 /* Decrease current_... with bits chopped off */
1475                 if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1476                         current_prefix_length = pn->pos + pn->bits - chopped_off;
1477
1478                 /*
1479                  * Either we do the actual chop off according or if we have
1480                  * chopped off all bits in this tnode walk up to our parent.
1481                  */
1482
1483                 if (chopped_off <= pn->bits) {
1484                         cindex &= ~(1 << (chopped_off-1));
1485                 } else {
1486                         if (NODE_PARENT(pn) == NULL)
1487                                 goto failed;
1488
1489                         /* Get Child's index */
1490                         cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits);
1491                         pn = NODE_PARENT(pn);
1492                         chopped_off = 0;
1493
1494 #ifdef CONFIG_IP_FIB_TRIE_STATS
1495                         t->stats.backtrack++;
1496 #endif
1497                         goto backtrace;
1498                 }
1499         }
1500 failed:
1501         ret = 1;
1502 found:
1503         read_unlock(&fib_lock);
1504         return ret;
1505 }
1506
1507 static int trie_leaf_remove(struct trie *t, t_key key)
1508 {
1509         t_key cindex;
1510         struct tnode *tp = NULL;
1511         struct node *n = t->trie;
1512         struct leaf *l;
1513
1514         DBG("entering trie_leaf_remove(%p)\n", n);
1515
1516         /* Note that in the case skipped bits, those bits are *not* checked!
1517          * When we finish this, we will have NULL or a T_LEAF, and the
1518          * T_LEAF may or may not match our key.
1519          */
1520
1521         while (n != NULL && IS_TNODE(n)) {
1522                 struct tnode *tn = (struct tnode *) n;
1523                 check_tnode(tn);
1524                 n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));
1525
1526                 if (n && NODE_PARENT(n) != tn) {
1527                         printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
1528                         BUG();
1529                 }
1530         }
1531         l = (struct leaf *) n;
1532
1533         if (!n || !tkey_equals(l->key, key))
1534                 return 0;
1535
1536         /*
1537          * Key found.
1538          * Remove the leaf and rebalance the tree
1539          */
1540
1541         t->revision++;
1542         t->size--;
1543
1544         tp = NODE_PARENT(n);
1545         tnode_free((struct tnode *) n);
1546
1547         if (tp) {
1548                 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1549                 put_child(t, (struct tnode *)tp, cindex, NULL);
1550                 t->trie = trie_rebalance(t, tp);
1551         } else
1552                 t->trie = NULL;
1553
1554         return 1;
1555 }
1556
1557 static int
1558 fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1559                 struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1560 {
1561         struct trie *t = (struct trie *) tb->tb_data;
1562         u32 key, mask;
1563         int plen = r->rtm_dst_len;
1564         u8 tos = r->rtm_tos;
1565         struct fib_alias *fa, *fa_to_delete;
1566         struct list_head *fa_head;
1567         struct leaf *l;
1568         int kill_li = 0;
1569         struct leaf_info *li;
1570
1571
1572         if (plen > 32)
1573                 return -EINVAL;
1574
1575         key = 0;
1576         if (rta->rta_dst)
1577                 memcpy(&key, rta->rta_dst, 4);
1578
1579         key = ntohl(key);
1580         mask = ntohl(inet_make_mask(plen));
1581
1582         if (key & ~mask)
1583                 return -EINVAL;
1584
1585         key = key & mask;
1586         l = fib_find_node(t, key);
1587
1588         if (!l)
1589                 return -ESRCH;
1590
1591         fa_head = get_fa_head(l, plen);
1592         fa = fib_find_alias(fa_head, tos, 0);
1593
1594         if (!fa)
1595                 return -ESRCH;
1596
1597         DBG("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1598
1599         fa_to_delete = NULL;
1600         fa_head = fa->fa_list.prev;
1601         list_for_each_entry(fa, fa_head, fa_list) {
1602                 struct fib_info *fi = fa->fa_info;
1603
1604                 if (fa->fa_tos != tos)
1605                         break;
1606
1607                 if ((!r->rtm_type ||
1608                      fa->fa_type == r->rtm_type) &&
1609                     (r->rtm_scope == RT_SCOPE_NOWHERE ||
1610                      fa->fa_scope == r->rtm_scope) &&
1611                     (!r->rtm_protocol ||
1612                      fi->fib_protocol == r->rtm_protocol) &&
1613                     fib_nh_match(r, nlhdr, rta, fi) == 0) {
1614                         fa_to_delete = fa;
1615                         break;
1616                 }
1617         }
1618
1619         if (!fa_to_delete)
1620                 return -ESRCH;
1621
1622         fa = fa_to_delete;
1623         rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req);
1624
1625         l = fib_find_node(t, key);
1626         li = find_leaf_info(&l->list, plen);
1627
1628         write_lock_bh(&fib_lock);
1629
1630         list_del(&fa->fa_list);
1631
1632         if (list_empty(fa_head)) {
1633                 hlist_del(&li->hlist);
1634                 kill_li = 1;
1635         }
1636         write_unlock_bh(&fib_lock);
1637
1638         if (kill_li)
1639                 free_leaf_info(li);
1640
1641         if (hlist_empty(&l->list))
1642                 trie_leaf_remove(t, key);
1643
1644         if (fa->fa_state & FA_S_ACCESSED)
1645                 rt_cache_flush(-1);
1646
1647         fn_free_alias(fa);
1648         return 0;
1649 }
1650
1651 static int trie_flush_list(struct trie *t, struct list_head *head)
1652 {
1653         struct fib_alias *fa, *fa_node;
1654         int found = 0;
1655
1656         list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1657                 struct fib_info *fi = fa->fa_info;
1658
1659                 if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
1660                         write_lock_bh(&fib_lock);
1661                         list_del(&fa->fa_list);
1662                         write_unlock_bh(&fib_lock);
1663
1664                         fn_free_alias(fa);
1665                         found++;
1666                 }
1667         }
1668         return found;
1669 }
1670
1671 static int trie_flush_leaf(struct trie *t, struct leaf *l)
1672 {
1673         int found = 0;
1674         struct hlist_head *lih = &l->list;
1675         struct hlist_node *node, *tmp;
1676         struct leaf_info *li = NULL;
1677
1678         hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1679                 found += trie_flush_list(t, &li->falh);
1680
1681                 if (list_empty(&li->falh)) {
1682                         write_lock_bh(&fib_lock);
1683                         hlist_del(&li->hlist);
1684                         write_unlock_bh(&fib_lock);
1685
1686                         free_leaf_info(li);
1687                 }
1688         }
1689         return found;
1690 }
1691
1692 static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1693 {
1694         struct node *c = (struct node *) thisleaf;
1695         struct tnode *p;
1696         int idx;
1697
1698         if (c == NULL) {
1699                 if (t->trie == NULL)
1700                         return NULL;
1701
1702                 if (IS_LEAF(t->trie))          /* trie w. just a leaf */
1703                         return (struct leaf *) t->trie;
1704
1705                 p = (struct tnode*) t->trie;  /* Start */
1706         } else
1707                 p = (struct tnode *) NODE_PARENT(c);
1708
1709         while (p) {
1710                 int pos, last;
1711
1712                 /*  Find the next child of the parent */
1713                 if (c)
1714                         pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
1715                 else
1716                         pos = 0;
1717
1718                 last = 1 << p->bits;
1719                 for (idx = pos; idx < last ; idx++) {
1720                         if (!p->child[idx])
1721                                 continue;
1722
1723                         /* Decend if tnode */
1724                         while (IS_TNODE(p->child[idx])) {
1725                                 p = (struct tnode*) p->child[idx];
1726                                 idx = 0;
1727
1728                                 /* Rightmost non-NULL branch */
1729                                 if (p && IS_TNODE(p))
1730                                         while (p->child[idx] == NULL && idx < (1 << p->bits)) idx++;
1731
1732                                 /* Done with this tnode? */
1733                                 if (idx >= (1 << p->bits) || p->child[idx] == NULL)
1734                                         goto up;
1735                         }
1736                         return (struct leaf*) p->child[idx];
1737                 }
1738 up:
1739                 /* No more children go up one step  */
1740                 c = (struct node *) p;
1741                 p = (struct tnode *) NODE_PARENT(p);
1742         }
1743         return NULL; /* Ready. Root of trie */
1744 }
1745
1746 static int fn_trie_flush(struct fib_table *tb)
1747 {
1748         struct trie *t = (struct trie *) tb->tb_data;
1749         struct leaf *ll = NULL, *l = NULL;
1750         int found = 0, h;
1751
1752         t->revision++;
1753
1754         for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1755                 found += trie_flush_leaf(t, l);
1756
1757                 if (ll && hlist_empty(&ll->list))
1758                         trie_leaf_remove(t, ll->key);
1759                 ll = l;
1760         }
1761
1762         if (ll && hlist_empty(&ll->list))
1763                 trie_leaf_remove(t, ll->key);
1764
1765         DBG("trie_flush found=%d\n", found);
1766         return found;
1767 }
1768
1769 static int trie_last_dflt = -1;
1770
1771 static void
1772 fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1773 {
1774         struct trie *t = (struct trie *) tb->tb_data;
1775         int order, last_idx;
1776         struct fib_info *fi = NULL;
1777         struct fib_info *last_resort;
1778         struct fib_alias *fa = NULL;
1779         struct list_head *fa_head;
1780         struct leaf *l;
1781
1782         last_idx = -1;
1783         last_resort = NULL;
1784         order = -1;
1785
1786         read_lock(&fib_lock);
1787
1788         l = fib_find_node(t, 0);
1789         if (!l)
1790                 goto out;
1791
1792         fa_head = get_fa_head(l, 0);
1793         if (!fa_head)
1794                 goto out;
1795
1796         if (list_empty(fa_head))
1797                 goto out;
1798
1799         list_for_each_entry(fa, fa_head, fa_list) {
1800                 struct fib_info *next_fi = fa->fa_info;
1801
1802                 if (fa->fa_scope != res->scope ||
1803                     fa->fa_type != RTN_UNICAST)
1804                         continue;
1805
1806                 if (next_fi->fib_priority > res->fi->fib_priority)
1807                         break;
1808                 if (!next_fi->fib_nh[0].nh_gw ||
1809                     next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1810                         continue;
1811                 fa->fa_state |= FA_S_ACCESSED;
1812
1813                 if (fi == NULL) {
1814                         if (next_fi != res->fi)
1815                                 break;
1816                 } else if (!fib_detect_death(fi, order, &last_resort,
1817                                              &last_idx, &trie_last_dflt)) {
1818                         if (res->fi)
1819                                 fib_info_put(res->fi);
1820                         res->fi = fi;
1821                         atomic_inc(&fi->fib_clntref);
1822                         trie_last_dflt = order;
1823                         goto out;
1824                 }
1825                 fi = next_fi;
1826                 order++;
1827         }
1828         if (order <= 0 || fi == NULL) {
1829                 trie_last_dflt = -1;
1830                 goto out;
1831         }
1832
1833         if (!fib_detect_death(fi, order, &last_resort, &last_idx, &trie_last_dflt)) {
1834                 if (res->fi)
1835                         fib_info_put(res->fi);
1836                 res->fi = fi;
1837                 atomic_inc(&fi->fib_clntref);
1838                 trie_last_dflt = order;
1839                 goto out;
1840         }
1841         if (last_idx >= 0) {
1842                 if (res->fi)
1843                         fib_info_put(res->fi);
1844                 res->fi = last_resort;
1845                 if (last_resort)
1846                         atomic_inc(&last_resort->fib_clntref);
1847         }
1848         trie_last_dflt = last_idx;
1849  out:;
1850         read_unlock(&fib_lock);
1851 }
1852
1853 static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
1854                            struct sk_buff *skb, struct netlink_callback *cb)
1855 {
1856         int i, s_i;
1857         struct fib_alias *fa;
1858
1859         u32 xkey = htonl(key);
1860
1861         s_i = cb->args[3];
1862         i = 0;
1863
1864         list_for_each_entry(fa, fah, fa_list) {
1865                 if (i < s_i) {
1866                         i++;
1867                         continue;
1868                 }
1869                 if (fa->fa_info->fib_nh == NULL) {
1870                         printk("Trie error _fib_nh=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
1871                         i++;
1872                         continue;
1873                 }
1874                 if (fa->fa_info == NULL) {
1875                         printk("Trie error fa_info=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
1876                         i++;
1877                         continue;
1878                 }
1879
1880                 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1881                                   cb->nlh->nlmsg_seq,
1882                                   RTM_NEWROUTE,
1883                                   tb->tb_id,
1884                                   fa->fa_type,
1885                                   fa->fa_scope,
1886                                   &xkey,
1887                                   plen,
1888                                   fa->fa_tos,
1889                                   fa->fa_info, 0) < 0) {
1890                         cb->args[3] = i;
1891                         return -1;
1892                 }
1893                 i++;
1894         }
1895         cb->args[3] = i;
1896         return skb->len;
1897 }
1898
1899 static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
1900                              struct netlink_callback *cb)
1901 {
1902         int h, s_h;
1903         struct list_head *fa_head;
1904         struct leaf *l = NULL;
1905
1906         s_h = cb->args[2];
1907
1908         for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1909                 if (h < s_h)
1910                         continue;
1911                 if (h > s_h)
1912                         memset(&cb->args[3], 0,
1913                                sizeof(cb->args) - 3*sizeof(cb->args[0]));
1914
1915                 fa_head = get_fa_head(l, plen);
1916
1917                 if (!fa_head)
1918                         continue;
1919
1920                 if (list_empty(fa_head))
1921                         continue;
1922
1923                 if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
1924                         cb->args[2] = h;
1925                         return -1;
1926                 }
1927         }
1928         cb->args[2] = h;
1929         return skb->len;
1930 }
1931
1932 static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
1933 {
1934         int m, s_m;
1935         struct trie *t = (struct trie *) tb->tb_data;
1936
1937         s_m = cb->args[1];
1938
1939         read_lock(&fib_lock);
1940         for (m = 0; m <= 32; m++) {
1941                 if (m < s_m)
1942                         continue;
1943                 if (m > s_m)
1944                         memset(&cb->args[2], 0,
1945                                 sizeof(cb->args) - 2*sizeof(cb->args[0]));
1946
1947                 if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
1948                         cb->args[1] = m;
1949                         goto out;
1950                 }
1951         }
1952         read_unlock(&fib_lock);
1953         cb->args[1] = m;
1954         return skb->len;
1955 out:
1956         read_unlock(&fib_lock);
1957         return -1;
1958 }
1959
1960 /* Fix more generic FIB names for init later */
1961
1962 #ifdef CONFIG_IP_MULTIPLE_TABLES
1963 struct fib_table * fib_hash_init(int id)
1964 #else
1965 struct fib_table * __init fib_hash_init(int id)
1966 #endif
1967 {
1968         struct fib_table *tb;
1969         struct trie *t;
1970
1971         if (fn_alias_kmem == NULL)
1972                 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1973                                                   sizeof(struct fib_alias),
1974                                                   0, SLAB_HWCACHE_ALIGN,
1975                                                   NULL, NULL);
1976
1977         tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1978                      GFP_KERNEL);
1979         if (tb == NULL)
1980                 return NULL;
1981
1982         tb->tb_id = id;
1983         tb->tb_lookup = fn_trie_lookup;
1984         tb->tb_insert = fn_trie_insert;
1985         tb->tb_delete = fn_trie_delete;
1986         tb->tb_flush = fn_trie_flush;
1987         tb->tb_select_default = fn_trie_select_default;
1988         tb->tb_dump = fn_trie_dump;
1989         memset(tb->tb_data, 0, sizeof(struct trie));
1990
1991         t = (struct trie *) tb->tb_data;
1992
1993         trie_init(t);
1994
1995         if (id == RT_TABLE_LOCAL)
1996                 trie_local = t;
1997         else if (id == RT_TABLE_MAIN)
1998                 trie_main = t;
1999
2000         if (id == RT_TABLE_LOCAL)
2001                 printk("IPv4 FIB: Using LC-trie version %s\n", VERSION);
2002
2003         return tb;
2004 }
2005
2006 /* Trie dump functions */
2007
2008 static void putspace_seq(struct seq_file *seq, int n)
2009 {
2010         while (n--)
2011                 seq_printf(seq, " ");
2012 }
2013
2014 static void printbin_seq(struct seq_file *seq, unsigned int v, int bits)
2015 {
2016         while (bits--)
2017                 seq_printf(seq, "%s", (v & (1<<bits))?"1":"0");
2018 }
2019
2020 static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
2021                    int pend, int cindex, int bits)
2022 {
2023         putspace_seq(seq, indent);
2024         if (IS_LEAF(n))
2025                 seq_printf(seq, "|");
2026         else
2027                 seq_printf(seq, "+");
2028         if (bits) {
2029                 seq_printf(seq, "%d/", cindex);
2030                 printbin_seq(seq, cindex, bits);
2031                 seq_printf(seq, ": ");
2032         } else
2033                 seq_printf(seq, "<root>: ");
2034         seq_printf(seq, "%s:%p ", IS_LEAF(n)?"Leaf":"Internal node", n);
2035
2036         if (IS_LEAF(n)) {
2037                 struct leaf *l = (struct leaf *)n;
2038                 struct fib_alias *fa;
2039                 int i;
2040
2041                 seq_printf(seq, "key=%d.%d.%d.%d\n",
2042                            n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256);
2043
2044                 for (i = 32; i >= 0; i--)
2045                         if (find_leaf_info(&l->list, i)) {
2046                                 struct list_head *fa_head = get_fa_head(l, i);
2047
2048                                 if (!fa_head)
2049                                         continue;
2050
2051                                 if (list_empty(fa_head))
2052                                         continue;
2053
2054                                 putspace_seq(seq, indent+2);
2055                                 seq_printf(seq, "{/%d...dumping}\n", i);
2056
2057                                 list_for_each_entry(fa, fa_head, fa_list) {
2058                                         putspace_seq(seq, indent+2);
2059                                         if (fa->fa_info == NULL) {
2060                                                 seq_printf(seq, "Error fa_info=NULL\n");
2061                                                 continue;
2062                                         }
2063                                         if (fa->fa_info->fib_nh == NULL) {
2064                                                 seq_printf(seq, "Error _fib_nh=NULL\n");
2065                                                 continue;
2066                                         }
2067
2068                                         seq_printf(seq, "{type=%d scope=%d TOS=%d}\n",
2069                                               fa->fa_type,
2070                                               fa->fa_scope,
2071                                               fa->fa_tos);
2072                                 }
2073                         }
2074         } else {
2075                 struct tnode *tn = (struct tnode *)n;
2076                 int plen = ((struct tnode *)n)->pos;
2077                 t_key prf = MASK_PFX(n->key, plen);
2078
2079                 seq_printf(seq, "key=%d.%d.%d.%d/%d\n",
2080                            prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen);
2081
2082                 putspace_seq(seq, indent); seq_printf(seq, "|    ");
2083                 seq_printf(seq, "{key prefix=%08x/", tn->key & TKEY_GET_MASK(0, tn->pos));
2084                 printbin_seq(seq, tkey_extract_bits(tn->key, 0, tn->pos), tn->pos);
2085                 seq_printf(seq, "}\n");
2086                 putspace_seq(seq, indent); seq_printf(seq, "|    ");
2087                 seq_printf(seq, "{pos=%d", tn->pos);
2088                 seq_printf(seq, " (skip=%d bits)", tn->pos - pend);
2089                 seq_printf(seq, " bits=%d (%u children)}\n", tn->bits, (1 << tn->bits));
2090                 putspace_seq(seq, indent); seq_printf(seq, "|    ");
2091                 seq_printf(seq, "{empty=%d full=%d}\n", tn->empty_children, tn->full_children);
2092         }
2093 }
2094
2095 static void trie_dump_seq(struct seq_file *seq, struct trie *t)
2096 {
2097         struct node *n = t->trie;
2098         int cindex = 0;
2099         int indent = 1;
2100         int pend = 0;
2101         int depth = 0;
2102         struct tnode *tn;
2103
2104         read_lock(&fib_lock);
2105
2106         seq_printf(seq, "------ trie_dump of t=%p ------\n", t);
2107
2108         if (!n) {
2109                 seq_printf(seq, "------ trie is empty\n");
2110
2111                 read_unlock(&fib_lock);
2112                 return;
2113         }
2114
2115         printnode_seq(seq, indent, n, pend, cindex, 0);
2116
2117         if (!IS_TNODE(n)) {
2118                 read_unlock(&fib_lock);
2119                 return;
2120         }
2121
2122         tn = (struct tnode *)n;
2123         pend = tn->pos+tn->bits;
2124         putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
2125         indent += 3;
2126         depth++;
2127
2128         while (tn && cindex < (1 << tn->bits)) {
2129                 if (tn->child[cindex]) {
2130                         /* Got a child */
2131
2132                         printnode_seq(seq, indent, tn->child[cindex], pend, cindex, tn->bits);
2133                         if (IS_LEAF(tn->child[cindex])) {
2134                                 cindex++;
2135                         } else {
2136                                 /*
2137                                  * New tnode. Decend one level
2138                                  */
2139
2140                                 depth++;
2141                                 tn = (struct tnode *)tn->child[cindex];
2142                                 pend = tn->pos + tn->bits;
2143                                 putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
2144                                 indent += 3;
2145                                 cindex = 0;
2146                         }
2147                 } else
2148                         cindex++;
2149
2150                 /*
2151                  * Test if we are done
2152                  */
2153
2154                 while (cindex >= (1 << tn->bits)) {
2155                         /*
2156                          * Move upwards and test for root
2157                          * pop off all traversed  nodes
2158                          */
2159
2160                         if (NODE_PARENT(tn) == NULL) {
2161                                 tn = NULL;
2162                                 break;
2163                         }
2164
2165                         cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2166                         cindex++;
2167                         tn = NODE_PARENT(tn);
2168                         pend = tn->pos + tn->bits;
2169                         indent -= 3;
2170                         depth--;
2171                 }
2172         }
2173
2174         read_unlock(&fib_lock);
2175 }
2176
2177 static struct trie_stat *trie_stat_new(void)
2178 {
2179         struct trie_stat *s;
2180         int i;
2181
2182         s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL);
2183         if (!s)
2184                 return NULL;
2185
2186         s->totdepth = 0;
2187         s->maxdepth = 0;
2188         s->tnodes = 0;
2189         s->leaves = 0;
2190         s->nullpointers = 0;
2191
2192         for (i = 0; i < MAX_CHILDS; i++)
2193                 s->nodesizes[i] = 0;
2194
2195         return s;
2196 }
2197
2198 static struct trie_stat *trie_collect_stats(struct trie *t)
2199 {
2200         struct node *n = t->trie;
2201         struct trie_stat *s = trie_stat_new();
2202         int cindex = 0;
2203         int pend = 0;
2204         int depth = 0;
2205
2206         if (!s)
2207                 return NULL;
2208         if (!n)
2209                 return s;
2210
2211         read_lock(&fib_lock);
2212
2213         if (IS_TNODE(n)) {
2214                 struct tnode *tn = (struct tnode *)n;
2215                 pend = tn->pos+tn->bits;
2216                 s->nodesizes[tn->bits]++;
2217                 depth++;
2218
2219                 while (tn && cindex < (1 << tn->bits)) {
2220                         if (tn->child[cindex]) {
2221                                 /* Got a child */
2222
2223                                 if (IS_LEAF(tn->child[cindex])) {
2224                                         cindex++;
2225
2226                                         /* stats */
2227                                         if (depth > s->maxdepth)
2228                                                 s->maxdepth = depth;
2229                                         s->totdepth += depth;
2230                                         s->leaves++;
2231                                 } else {
2232                                         /*
2233                                          * New tnode. Decend one level
2234                                          */
2235
2236                                         s->tnodes++;
2237                                         s->nodesizes[tn->bits]++;
2238                                         depth++;
2239
2240                                         n = tn->child[cindex];
2241                                         tn = (struct tnode *)n;
2242                                         pend = tn->pos+tn->bits;
2243
2244                                         cindex = 0;
2245                                 }
2246                         } else {
2247                                 cindex++;
2248                                 s->nullpointers++;
2249                         }
2250
2251                         /*
2252                          * Test if we are done
2253                          */
2254
2255                         while (cindex >= (1 << tn->bits)) {
2256                                 /*
2257                                  * Move upwards and test for root
2258                                  * pop off all traversed  nodes
2259                                  */
2260
2261                                 if (NODE_PARENT(tn) == NULL) {
2262                                         tn = NULL;
2263                                         n = NULL;
2264                                         break;
2265                                 }
2266
2267                                 cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2268                                 tn = NODE_PARENT(tn);
2269                                 cindex++;
2270                                 n = (struct node *)tn;
2271                                 pend = tn->pos+tn->bits;
2272                                 depth--;
2273                         }
2274                 }
2275         }
2276
2277         read_unlock(&fib_lock);
2278         return s;
2279 }
2280
2281 #ifdef CONFIG_PROC_FS
2282
2283 static struct fib_alias *fib_triestat_get_first(struct seq_file *seq)
2284 {
2285         return NULL;
2286 }
2287
2288 static struct fib_alias *fib_triestat_get_next(struct seq_file *seq)
2289 {
2290         return NULL;
2291 }
2292
2293 static void *fib_triestat_seq_start(struct seq_file *seq, loff_t *pos)
2294 {
2295         if (!ip_fib_main_table)
2296                 return NULL;
2297
2298         if (*pos)
2299                 return fib_triestat_get_next(seq);
2300         else
2301                 return SEQ_START_TOKEN;
2302 }
2303
2304 static void *fib_triestat_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2305 {
2306         ++*pos;
2307         if (v == SEQ_START_TOKEN)
2308                 return fib_triestat_get_first(seq);
2309         else
2310                 return fib_triestat_get_next(seq);
2311 }
2312
2313 static void fib_triestat_seq_stop(struct seq_file *seq, void *v)
2314 {
2315
2316 }
2317
2318 /*
2319  *      This outputs /proc/net/fib_triestats
2320  *
2321  *      It always works in backward compatibility mode.
2322  *      The format of the file is not supposed to be changed.
2323  */
2324
2325 static void collect_and_show(struct trie *t, struct seq_file *seq)
2326 {
2327         int bytes = 0; /* How many bytes are used, a ref is 4 bytes */
2328         int i, max, pointers;
2329         struct trie_stat *stat;
2330         int avdepth;
2331
2332         stat = trie_collect_stats(t);
2333
2334         bytes = 0;
2335         seq_printf(seq, "trie=%p\n", t);
2336
2337         if (stat) {
2338                 if (stat->leaves)
2339                         avdepth = stat->totdepth*100 / stat->leaves;
2340                 else
2341                         avdepth = 0;
2342                 seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100);
2343                 seq_printf(seq, "Max depth: %4d\n", stat->maxdepth);
2344
2345                 seq_printf(seq, "Leaves: %d\n", stat->leaves);
2346                 bytes += sizeof(struct leaf) * stat->leaves;
2347                 seq_printf(seq, "Internal nodes: %d\n", stat->tnodes);
2348                 bytes += sizeof(struct tnode) * stat->tnodes;
2349
2350                 max = MAX_CHILDS-1;
2351
2352                 while (max >= 0 && stat->nodesizes[max] == 0)
2353                         max--;
2354                 pointers = 0;
2355
2356                 for (i = 1; i <= max; i++)
2357                         if (stat->nodesizes[i] != 0) {
2358                                 seq_printf(seq, "  %d: %d",  i, stat->nodesizes[i]);
2359                                 pointers += (1<<i) * stat->nodesizes[i];
2360                         }
2361                 seq_printf(seq, "\n");
2362                 seq_printf(seq, "Pointers: %d\n", pointers);
2363                 bytes += sizeof(struct node *) * pointers;
2364                 seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers);
2365                 seq_printf(seq, "Total size: %d  kB\n", bytes / 1024);
2366
2367                 kfree(stat);
2368         }
2369
2370 #ifdef CONFIG_IP_FIB_TRIE_STATS
2371         seq_printf(seq, "Counters:\n---------\n");
2372         seq_printf(seq,"gets = %d\n", t->stats.gets);
2373         seq_printf(seq,"backtracks = %d\n", t->stats.backtrack);
2374         seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
2375         seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
2376         seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
2377         seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped);
2378 #ifdef CLEAR_STATS
2379         memset(&(t->stats), 0, sizeof(t->stats));
2380 #endif
2381 #endif /*  CONFIG_IP_FIB_TRIE_STATS */
2382 }
2383
2384 static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2385 {
2386         char bf[128];
2387
2388         if (v == SEQ_START_TOKEN) {
2389                 seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
2390                            sizeof(struct leaf), sizeof(struct tnode));
2391                 if (trie_local)
2392                         collect_and_show(trie_local, seq);
2393
2394                 if (trie_main)
2395                         collect_and_show(trie_main, seq);
2396         } else {
2397                 snprintf(bf, sizeof(bf), "*\t%08X\t%08X", 200, 400);
2398
2399                 seq_printf(seq, "%-127s\n", bf);
2400         }
2401         return 0;
2402 }
2403
2404 static struct seq_operations fib_triestat_seq_ops = {
2405         .start = fib_triestat_seq_start,
2406         .next  = fib_triestat_seq_next,
2407         .stop  = fib_triestat_seq_stop,
2408         .show  = fib_triestat_seq_show,
2409 };
2410
2411 static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2412 {
2413         struct seq_file *seq;
2414         int rc = -ENOMEM;
2415
2416         rc = seq_open(file, &fib_triestat_seq_ops);
2417         if (rc)
2418                 goto out_kfree;
2419
2420         seq = file->private_data;
2421 out:
2422         return rc;
2423 out_kfree:
2424         goto out;
2425 }
2426
2427 static struct file_operations fib_triestat_seq_fops = {
2428         .owner  = THIS_MODULE,
2429         .open   = fib_triestat_seq_open,
2430         .read   = seq_read,
2431         .llseek = seq_lseek,
2432         .release = seq_release_private,
2433 };
2434
2435 int __init fib_stat_proc_init(void)
2436 {
2437         if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_seq_fops))
2438                 return -ENOMEM;
2439         return 0;
2440 }
2441
2442 void __init fib_stat_proc_exit(void)
2443 {
2444         proc_net_remove("fib_triestat");
2445 }
2446
2447 static struct fib_alias *fib_trie_get_first(struct seq_file *seq)
2448 {
2449         return NULL;
2450 }
2451
2452 static struct fib_alias *fib_trie_get_next(struct seq_file *seq)
2453 {
2454         return NULL;
2455 }
2456
2457 static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2458 {
2459         if (!ip_fib_main_table)
2460                 return NULL;
2461
2462         if (*pos)
2463                 return fib_trie_get_next(seq);
2464         else
2465                 return SEQ_START_TOKEN;
2466 }
2467
2468 static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2469 {
2470         ++*pos;
2471         if (v == SEQ_START_TOKEN)
2472                 return fib_trie_get_first(seq);
2473         else
2474                 return fib_trie_get_next(seq);
2475
2476 }
2477
2478 static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2479 {
2480 }
2481
2482 /*
2483  *      This outputs /proc/net/fib_trie.
2484  *
2485  *      It always works in backward compatibility mode.
2486  *      The format of the file is not supposed to be changed.
2487  */
2488
2489 static int fib_trie_seq_show(struct seq_file *seq, void *v)
2490 {
2491         char bf[128];
2492
2493         if (v == SEQ_START_TOKEN) {
2494                 if (trie_local)
2495                         trie_dump_seq(seq, trie_local);
2496
2497                 if (trie_main)
2498                         trie_dump_seq(seq, trie_main);
2499         } else {
2500                 snprintf(bf, sizeof(bf),
2501                          "*\t%08X\t%08X", 200, 400);
2502                 seq_printf(seq, "%-127s\n", bf);
2503         }
2504
2505         return 0;
2506 }
2507
2508 static struct seq_operations fib_trie_seq_ops = {
2509         .start = fib_trie_seq_start,
2510         .next  = fib_trie_seq_next,
2511         .stop  = fib_trie_seq_stop,
2512         .show  = fib_trie_seq_show,
2513 };
2514
2515 static int fib_trie_seq_open(struct inode *inode, struct file *file)
2516 {
2517         struct seq_file *seq;
2518         int rc = -ENOMEM;
2519
2520         rc = seq_open(file, &fib_trie_seq_ops);
2521         if (rc)
2522                 goto out_kfree;
2523
2524         seq = file->private_data;
2525 out:
2526         return rc;
2527 out_kfree:
2528         goto out;
2529 }
2530
2531 static struct file_operations fib_trie_seq_fops = {
2532         .owner  = THIS_MODULE,
2533         .open   = fib_trie_seq_open,
2534         .read   = seq_read,
2535         .llseek = seq_lseek,
2536         .release= seq_release_private,
2537 };
2538
2539 int __init fib_proc_init(void)
2540 {
2541         if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_seq_fops))
2542                 return -ENOMEM;
2543         return 0;
2544 }
2545
2546 void __init fib_proc_exit(void)
2547 {
2548         proc_net_remove("fib_trie");
2549 }
2550
2551 #endif /* CONFIG_PROC_FS */