1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * ROUTE - implementation of the IP router.
8 * Version: $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Alan Cox, <gw4pts@gw4pts.ampr.org>
13 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
16 * Fixes:
17 * Alan Cox : Verify area fixes.
18 * Alan Cox : cli() protects routing changes
19 * Rui Oliveira : ICMP routing table updates
20 * (rco@di.uminho.pt) Routing table insertion and update
21 * Linus Torvalds : Rewrote bits to be sensible
22 * Alan Cox : Added BSD route gw semantics
23 * Alan Cox : Super /proc >4K
24 * Alan Cox : MTU in route table
25 * Alan Cox : MSS actually. Also added the window
26 * clamper.
27 * Sam Lantinga : Fixed route matching in rt_del()
28 * Alan Cox : Routing cache support.
29 * Alan Cox : Removed compatibility cruft.
30 * Alan Cox : RTF_REJECT support.
31 * Alan Cox : TCP irtt support.
32 * Jonathan Naylor : Added Metric support.
33 * Miquel van Smoorenburg : BSD API fixes.
34 * Miquel van Smoorenburg : Metrics.
35 * Alan Cox : Use __u32 properly
36 * Alan Cox : Aligned routing errors more closely with BSD
37 * our system is still very different.
38 * Alan Cox : Faster /proc handling
39 * Alexey Kuznetsov : Massive rework to support tree based routing,
40 * routing caches and better behaviour.
42 * Olaf Erb : irtt wasn't being copied right.
43 * Bjorn Ekwall : Kerneld route support.
44 * Alan Cox : Multicast fixed (I hope)
45 * Pavel Krauz : Limited broadcast fixed
46 * Mike McLagan : Routing by source
47 * Alexey Kuznetsov : End of old history. Split to fib.c and
48 * route.c and rewritten from scratch.
49 * Andi Kleen : Load-limit warning messages.
50 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
51 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
52 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
53 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
54 * Marc Boucher : routing by fwmark
55 * Robert Olsson : Added rt_cache statistics
56 * Arnaldo C. Melo : Convert proc stuff to seq_file
57 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
58 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
59 * Ilia Sotnikov : Removed TOS from hash calculations
61 * This program is free software; you can redistribute it and/or
62 * modify it under the terms of the GNU General Public License
63 * as published by the Free Software Foundation; either version
64 * 2 of the License, or (at your option) any later version.
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <asm/system.h>
70 #include <linux/bitops.h>
71 #include <linux/types.h>
72 #include <linux/kernel.h>
73 #include <linux/mm.h>
74 #include <linux/bootmem.h>
75 #include <linux/string.h>
76 #include <linux/socket.h>
77 #include <linux/sockios.h>
78 #include <linux/errno.h>
79 #include <linux/in.h>
80 #include <linux/inet.h>
81 #include <linux/netdevice.h>
82 #include <linux/proc_fs.h>
83 #include <linux/init.h>
84 #include <linux/workqueue.h>
85 #include <linux/skbuff.h>
86 #include <linux/inetdevice.h>
87 #include <linux/igmp.h>
88 #include <linux/pkt_sched.h>
89 #include <linux/mroute.h>
90 #include <linux/netfilter_ipv4.h>
91 #include <linux/random.h>
92 #include <linux/jhash.h>
93 #include <linux/rcupdate.h>
94 #include <linux/times.h>
95 #include <net/dst.h>
96 #include <net/net_namespace.h>
97 #include <net/protocol.h>
98 #include <net/ip.h>
99 #include <net/route.h>
100 #include <net/inetpeer.h>
101 #include <net/sock.h>
102 #include <net/ip_fib.h>
103 #include <net/arp.h>
104 #include <net/tcp.h>
105 #include <net/icmp.h>
106 #include <net/xfrm.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #endif
113 #define RT_FL_TOS(oldflp) \
114 ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
116 #define IP_MAX_MTU 0xFFF0
118 #define RT_GC_TIMEOUT (300*HZ)
120 static int ip_rt_max_size;
121 static int ip_rt_gc_timeout = RT_GC_TIMEOUT;
122 static int ip_rt_gc_interval = 60 * HZ;
123 static int ip_rt_gc_min_interval = HZ / 2;
124 static int ip_rt_redirect_number = 9;
125 static int ip_rt_redirect_load = HZ / 50;
126 static int ip_rt_redirect_silence = ((HZ / 50) << (9 + 1));
127 static int ip_rt_error_cost = HZ;
128 static int ip_rt_error_burst = 5 * HZ;
129 static int ip_rt_gc_elasticity = 8;
130 static int ip_rt_mtu_expires = 10 * 60 * HZ;
131 static int ip_rt_min_pmtu = 512 + 20 + 20;
132 static int ip_rt_min_advmss = 256;
133 static int ip_rt_secret_interval = 10 * 60 * HZ;
135 #define RTprint(a...) printk(KERN_DEBUG a)
137 static void rt_worker_func(struct work_struct *work);
138 static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
139 static struct timer_list rt_secret_timer;
142 * Interface to generic destination cache.
145 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
146 static void ipv4_dst_destroy(struct dst_entry *dst);
147 static void ipv4_dst_ifdown(struct dst_entry *dst,
148 struct net_device *dev, int how);
149 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
150 static void ipv4_link_failure(struct sk_buff *skb);
151 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
152 static int rt_garbage_collect(struct dst_ops *ops);
155 static struct dst_ops ipv4_dst_ops = {
156 .family = AF_INET,
157 .protocol = __constant_htons(ETH_P_IP),
158 .gc = rt_garbage_collect,
159 .check = ipv4_dst_check,
160 .destroy = ipv4_dst_destroy,
161 .ifdown = ipv4_dst_ifdown,
162 .negative_advice = ipv4_negative_advice,
163 .link_failure = ipv4_link_failure,
164 .update_pmtu = ip_rt_update_pmtu,
165 .local_out = ip_local_out,
166 .entry_size = sizeof(struct rtable),
167 .entries = ATOMIC_INIT(0),
170 #define ECN_OR_COST(class) TC_PRIO_##class
172 const __u8 ip_tos2prio[16] = {
173 TC_PRIO_BESTEFFORT,
174 ECN_OR_COST(FILLER),
175 TC_PRIO_BESTEFFORT,
176 ECN_OR_COST(BESTEFFORT),
177 TC_PRIO_BULK,
178 ECN_OR_COST(BULK),
179 TC_PRIO_BULK,
180 ECN_OR_COST(BULK),
181 TC_PRIO_INTERACTIVE,
182 ECN_OR_COST(INTERACTIVE),
183 TC_PRIO_INTERACTIVE,
184 ECN_OR_COST(INTERACTIVE),
185 TC_PRIO_INTERACTIVE_BULK,
186 ECN_OR_COST(INTERACTIVE_BULK),
187 TC_PRIO_INTERACTIVE_BULK,
188 ECN_OR_COST(INTERACTIVE_BULK)
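/*
 * Worked example of how this table is used (the index is the four TOS bits,
 * IPTOS_TOS(tos) >> 1, as computed by rt_tos2priority() in <net/route.h>):
 * TOS 0x10 (IPTOS_LOWDELAY) gives index 8 and thus TC_PRIO_INTERACTIVE,
 * while TOS 0x08 (IPTOS_THROUGHPUT) gives index 4 and thus TC_PRIO_BULK.
 */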
193 * Route cache.
196 /* The locking scheme is rather straightforward:
198 * 1) Read-Copy Update protects the buckets of the central route hash.
199 * 2) Only writers remove entries, and they hold the lock
200 * as they look at rtable reference counts.
201 * 3) Only readers acquire references to rtable entries,
202 * they do so with atomic increments and with the
203 * lock held.
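/*
 * In practice this means: lookups (e.g. ip_route_input()) only take
 * rcu_read_lock(), walk the chain with rcu_dereference() and grab a
 * reference with dst_use() on a hit; removals (e.g. rt_del()) take
 * spin_lock_bh(rt_hash_lock_addr(hash)), unlink the entry and hand it to
 * rt_free(), which defers the actual release via call_rcu_bh() until all
 * current readers have left their RCU read-side sections.
 */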
206 struct rt_hash_bucket {
207 struct rtable *chain;
209 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
210 defined(CONFIG_PROVE_LOCKING)
212 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
213 * The size of this table is a power of two and depends on the number of CPUs.
214 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
216 #ifdef CONFIG_LOCKDEP
217 # define RT_HASH_LOCK_SZ 256
218 #else
219 # if NR_CPUS >= 32
220 # define RT_HASH_LOCK_SZ 4096
221 # elif NR_CPUS >= 16
222 # define RT_HASH_LOCK_SZ 2048
223 # elif NR_CPUS >= 8
224 # define RT_HASH_LOCK_SZ 1024
225 # elif NR_CPUS >= 4
226 # define RT_HASH_LOCK_SZ 512
227 # else
228 # define RT_HASH_LOCK_SZ 256
229 # endif
230 #endif
232 static spinlock_t *rt_hash_locks;
233 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
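/*
 * Example: with RT_HASH_LOCK_SZ == 2048, hash buckets 904, 2952 and 5000
 * all map to rt_hash_locks[904], so a bounded number of locks covers an
 * arbitrarily large hash table at the cost of occasional contention
 * between unrelated buckets.
 */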
235 static __init void rt_hash_lock_init(void)
237 int i;
239 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
240 GFP_KERNEL);
241 if (!rt_hash_locks)
242 panic("IP: failed to allocate rt_hash_locks\n");
244 for (i = 0; i < RT_HASH_LOCK_SZ; i++)
245 spin_lock_init(&rt_hash_locks[i]);
247 #else
248 # define rt_hash_lock_addr(slot) NULL
250 static inline void rt_hash_lock_init(void)
253 #endif
255 static struct rt_hash_bucket *rt_hash_table;
256 static unsigned rt_hash_mask;
257 static unsigned int rt_hash_log;
258 static atomic_t rt_genid;
260 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
261 #define RT_CACHE_STAT_INC(field) \
262 (__raw_get_cpu_var(rt_cache_stat).field++)
264 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
266 return jhash_2words(daddr, saddr, atomic_read(&rt_genid))
267 & rt_hash_mask;
270 #define rt_hash(daddr, saddr, idx) \
271 rt_hash_code((__force u32)(__be32)(daddr),\
272 (__force u32)(__be32)(saddr) ^ ((idx) << 5))
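/*
 * Expanded, rt_hash(daddr, saddr, idx) is
 *	jhash_2words(daddr, saddr ^ (idx << 5), atomic_read(&rt_genid)) & rt_hash_mask
 * so the interface index (iif for input routes, oif for output routes) and
 * the current generation id both take part in bucket selection.
 */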
274 #ifdef CONFIG_PROC_FS
275 struct rt_cache_iter_state {
276 int bucket;
277 int genid;
280 static struct rtable *rt_cache_get_first(struct rt_cache_iter_state *st)
282 struct rtable *r = NULL;
284 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
285 rcu_read_lock_bh();
286 r = rcu_dereference(rt_hash_table[st->bucket].chain);
287 while (r) {
288 if (r->rt_genid == st->genid)
289 return r;
290 r = rcu_dereference(r->u.dst.rt_next);
292 rcu_read_unlock_bh();
294 return r;
297 static struct rtable *rt_cache_get_next(struct rt_cache_iter_state *st, struct rtable *r)
299 r = r->u.dst.rt_next;
300 while (!r) {
301 rcu_read_unlock_bh();
302 if (--st->bucket < 0)
303 break;
304 rcu_read_lock_bh();
305 r = rt_hash_table[st->bucket].chain;
307 return rcu_dereference(r);
310 static struct rtable *rt_cache_get_idx(struct rt_cache_iter_state *st, loff_t pos)
312 struct rtable *r = rt_cache_get_first(st);
314 if (r)
315 while (pos && (r = rt_cache_get_next(st, r))) {
316 if (r->rt_genid != st->genid)
317 continue;
318 --pos;
320 return pos ? NULL : r;
323 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
325 struct rt_cache_iter_state *st = seq->private;
327 if (*pos)
328 return rt_cache_get_idx(st, *pos - 1);
329 st->genid = atomic_read(&rt_genid);
330 return SEQ_START_TOKEN;
333 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
335 struct rtable *r;
336 struct rt_cache_iter_state *st = seq->private;
338 if (v == SEQ_START_TOKEN)
339 r = rt_cache_get_first(st);
340 else
341 r = rt_cache_get_next(st, v);
342 ++*pos;
343 return r;
346 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
348 if (v && v != SEQ_START_TOKEN)
349 rcu_read_unlock_bh();
352 static int rt_cache_seq_show(struct seq_file *seq, void *v)
354 if (v == SEQ_START_TOKEN)
355 seq_printf(seq, "%-127s\n",
356 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
357 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
358 "HHUptod\tSpecDst");
359 else {
360 struct rtable *r = v;
361 char temp[256];
363 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
364 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
365 r->u.dst.dev ? r->u.dst.dev->name : "*",
366 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
367 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
368 r->u.dst.__use, 0, (unsigned long)r->rt_src,
369 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
370 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
371 dst_metric(&r->u.dst, RTAX_WINDOW),
372 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
373 dst_metric(&r->u.dst, RTAX_RTTVAR)),
374 r->fl.fl4_tos,
375 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
376 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
377 dev_queue_xmit) : 0,
378 r->rt_spec_dst);
379 seq_printf(seq, "%-127s\n", temp);
381 return 0;
384 static const struct seq_operations rt_cache_seq_ops = {
385 .start = rt_cache_seq_start,
386 .next = rt_cache_seq_next,
387 .stop = rt_cache_seq_stop,
388 .show = rt_cache_seq_show,
391 static int rt_cache_seq_open(struct inode *inode, struct file *file)
393 return seq_open_private(file, &rt_cache_seq_ops,
394 sizeof(struct rt_cache_iter_state));
397 static const struct file_operations rt_cache_seq_fops = {
398 .owner = THIS_MODULE,
399 .open = rt_cache_seq_open,
400 .read = seq_read,
401 .llseek = seq_lseek,
402 .release = seq_release_private,
406 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
408 int cpu;
410 if (*pos == 0)
411 return SEQ_START_TOKEN;
413 for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
414 if (!cpu_possible(cpu))
415 continue;
416 *pos = cpu+1;
417 return &per_cpu(rt_cache_stat, cpu);
419 return NULL;
422 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
424 int cpu;
426 for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
427 if (!cpu_possible(cpu))
428 continue;
429 *pos = cpu+1;
430 return &per_cpu(rt_cache_stat, cpu);
432 return NULL;
436 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
441 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
443 struct rt_cache_stat *st = v;
445 if (v == SEQ_START_TOKEN) {
446 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
447 return 0;
450 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
451 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
452 atomic_read(&ipv4_dst_ops.entries),
453 st->in_hit,
454 st->in_slow_tot,
455 st->in_slow_mc,
456 st->in_no_route,
457 st->in_brd,
458 st->in_martian_dst,
459 st->in_martian_src,
461 st->out_hit,
462 st->out_slow_tot,
463 st->out_slow_mc,
465 st->gc_total,
466 st->gc_ignored,
467 st->gc_goal_miss,
468 st->gc_dst_overflow,
469 st->in_hlist_search,
470 st->out_hlist_search
472 return 0;
475 static const struct seq_operations rt_cpu_seq_ops = {
476 .start = rt_cpu_seq_start,
477 .next = rt_cpu_seq_next,
478 .stop = rt_cpu_seq_stop,
479 .show = rt_cpu_seq_show,
483 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
485 return seq_open(file, &rt_cpu_seq_ops);
488 static const struct file_operations rt_cpu_seq_fops = {
489 .owner = THIS_MODULE,
490 .open = rt_cpu_seq_open,
491 .read = seq_read,
492 .llseek = seq_lseek,
493 .release = seq_release,
496 #ifdef CONFIG_NET_CLS_ROUTE
497 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
498 int length, int *eof, void *data)
500 unsigned int i;
502 if ((offset & 3) || (length & 3))
503 return -EIO;
505 if (offset >= sizeof(struct ip_rt_acct) * 256) {
506 *eof = 1;
507 return 0;
510 if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
511 length = sizeof(struct ip_rt_acct) * 256 - offset;
512 *eof = 1;
515 offset /= sizeof(u32);
517 if (length > 0) {
518 u32 *dst = (u32 *) buffer;
520 *start = buffer;
521 memset(dst, 0, length);
523 for_each_possible_cpu(i) {
524 unsigned int j;
525 u32 *src;
527 src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
528 for (j = 0; j < length/4; j++)
529 dst[j] += src[j];
532 return length;
534 #endif
536 static __init int ip_rt_proc_init(struct net *net)
538 struct proc_dir_entry *pde;
540 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
541 &rt_cache_seq_fops);
542 if (!pde)
543 goto err1;
548 pde = proc_create("rt_cache", S_IRUGO,
549 net->proc_net_stat, &rt_cpu_seq_fops);
551 if (!pde)
552 goto err2;
559 #ifdef CONFIG_NET_CLS_ROUTE
560 pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
561 ip_rt_acct_read, NULL);
562 if (!pde)
563 goto err3;
564 #endif
565 return 0;
567 #ifdef CONFIG_NET_CLS_ROUTE
568 err3:
569 remove_proc_entry("rt_cache", net->proc_net_stat);
570 #endif
571 err2:
572 remove_proc_entry("rt_cache", net->proc_net);
573 err1:
574 return -ENOMEM;
576 #else
577 static inline int ip_rt_proc_init(struct net *net)
579 return 0;
581 #endif /* CONFIG_PROC_FS */
583 static __inline__ void rt_free(struct rtable *rt)
585 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
588 static __inline__ void rt_drop(struct rtable *rt)
590 ip_rt_put(rt);
591 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
594 static __inline__ int rt_fast_clean(struct rtable *rth)
596 /* Kill broadcast/multicast entries very aggressively if they
597 collide in the hash table with more useful entries */
598 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
599 rth->fl.iif && rth->u.dst.rt_next;
602 static __inline__ int rt_valuable(struct rtable *rth)
604 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
605 rth->u.dst.expires;
608 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
610 unsigned long age;
611 int ret = 0;
613 if (atomic_read(&rth->u.dst.__refcnt))
614 goto out;
616 ret = 1;
617 if (rth->u.dst.expires &&
618 time_after_eq(jiffies, rth->u.dst.expires))
619 goto out;
621 age = jiffies - rth->u.dst.lastuse;
622 ret = 0;
623 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
624 (age <= tmo2 && rt_valuable(rth)))
625 goto out;
626 ret = 1;
627 out: return ret;
630 /* Bits of score are:
631 * 31: very valuable
632 * 30: not quite useless
633 * 29..0: usage counter
635 static inline u32 rt_score(struct rtable *rt)
637 u32 score = jiffies - rt->u.dst.lastuse;
639 score = ~score & ~(3<<30);
641 if (rt_valuable(rt))
642 score |= (1<<31);
644 if (!rt->fl.iif ||
645 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
646 score |= (1<<30);
648 return score;
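/*
 * Example: an unreferenced output route (fl.iif == 0) that was last used
 * 10 jiffies ago and is neither redirected/notified nor carrying an expiry
 * scores (~10 & ~(3 << 30)) | (1 << 30): bit 31 clear, bit 30 set, and the
 * low bits shrink as the entry ages.  rt_intern_hash() below evicts the
 * lowest-scoring unreferenced entry once a chain outgrows
 * ip_rt_gc_elasticity.
 */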
651 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
653 return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
654 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
655 (fl1->mark ^ fl2->mark) |
656 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
657 *(u16 *)&fl2->nl_u.ip4_u.tos) |
658 (fl1->oif ^ fl2->oif) |
659 (fl1->iif ^ fl2->iif)) == 0;
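/*
 * compare_keys() XORs the corresponding fields of both flow keys and ORs
 * the results together, so one final comparison against zero checks
 * daddr, saddr, mark, the tos/scope pair (read as a single u16), oif and
 * iif all at once.
 */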
662 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
664 return rt1->u.dst.dev->nd_net == rt2->u.dst.dev->nd_net;
668 * Perform a full scan of the hash table and free all entries.
669 * Can be called from a softirq or from process context.
670 * In the latter case, we want to reschedule if necessary.
672 static void rt_do_flush(int process_context)
674 unsigned int i;
675 struct rtable *rth, *next;
677 for (i = 0; i <= rt_hash_mask; i++) {
678 if (process_context && need_resched())
679 cond_resched();
680 rth = rt_hash_table[i].chain;
681 if (!rth)
682 continue;
684 spin_lock_bh(rt_hash_lock_addr(i));
685 rth = rt_hash_table[i].chain;
686 rt_hash_table[i].chain = NULL;
687 spin_unlock_bh(rt_hash_lock_addr(i));
689 for (; rth; rth = next) {
690 next = rth->u.dst.rt_next;
691 rt_free(rth);
696 static void rt_check_expire(void)
698 static unsigned int rover;
699 unsigned int i = rover, goal;
700 struct rtable *rth, **rthp;
701 u64 mult;
703 mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
704 if (ip_rt_gc_timeout > 1)
705 do_div(mult, ip_rt_gc_timeout);
706 goal = (unsigned int)mult;
707 if (goal > rt_hash_mask)
708 goal = rt_hash_mask + 1;
709 for (; goal > 0; goal--) {
710 unsigned long tmo = ip_rt_gc_timeout;
712 i = (i + 1) & rt_hash_mask;
713 rthp = &rt_hash_table[i].chain;
715 if (need_resched())
716 cond_resched();
718 if (*rthp == NULL)
719 continue;
720 spin_lock_bh(rt_hash_lock_addr(i));
721 while ((rth = *rthp) != NULL) {
722 if (rth->rt_genid != atomic_read(&rt_genid)) {
723 *rthp = rth->u.dst.rt_next;
724 rt_free(rth);
725 continue;
727 if (rth->u.dst.expires) {
728 /* Entry is expired even if it is in use */
729 if (time_before_eq(jiffies, rth->u.dst.expires)) {
730 tmo >>= 1;
731 rthp = &rth->u.dst.rt_next;
732 continue;
734 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
735 tmo >>= 1;
736 rthp = &rth->u.dst.rt_next;
737 continue;
740 /* Cleanup aged off entries. */
741 *rthp = rth->u.dst.rt_next;
742 rt_free(rth);
744 spin_unlock_bh(rt_hash_lock_addr(i));
746 rover = i;
750 * rt_worker_func() is run in process context.
751 * We call rt_check_expire() to scan part of the hash table.
753 static void rt_worker_func(struct work_struct *work)
755 rt_check_expire();
756 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
760 * Perturbation of rt_genid by a small quantity [1..256].
761 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
762 * many times (2^24) without reusing a recent rt_genid.
763 * The Jenkins hash is strong enough that little changes of rt_genid are OK.
765 static void rt_cache_invalidate(void)
767 unsigned char shuffle;
769 get_random_bytes(&shuffle, sizeof(shuffle));
770 atomic_add(shuffle + 1U, &rt_genid);
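/*
 * Each invalidation adds a value in [1..256], so the 32-bit counter cannot
 * return to a recently used generation id in fewer than 2^32 / 256 = 2^24
 * calls, which is what the comment above relies on.
 */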
774 * delay < 0 : invalidate cache (fast : entries will be deleted later)
775 * delay >= 0 : invalidate & flush cache (can be long)
777 void rt_cache_flush(int delay)
779 rt_cache_invalidate();
780 if (delay >= 0)
781 rt_do_flush(!in_softirq());
785 * We change rt_genid and let gc do the cleanup
787 static void rt_secret_rebuild(unsigned long dummy)
789 rt_cache_invalidate();
790 mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
794 Short description of GC goals.
796 We want to build an algorithm which keeps the routing cache
797 at some equilibrium point, where the number of aged-off entries
798 is kept approximately equal to the number of newly generated ones.
800 The current expiration strength is the variable "expire".
801 We try to adjust it dynamically, so that when the network
802 is idle "expire" is large enough to keep plenty of warm entries,
803 and when load increases it shrinks to limit the cache size.
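/*
 * Concretely (see rt_garbage_collect() below): "expire" grows by
 * ip_rt_gc_min_interval after each pass that meets its goal, is capped at
 * ip_rt_gc_timeout, and is halved every time the goal is missed, so an
 * idle cache keeps warm entries for a long time while a loaded one is
 * trimmed aggressively.
 */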
806 static int rt_garbage_collect(struct dst_ops *ops)
808 static unsigned long expire = RT_GC_TIMEOUT;
809 static unsigned long last_gc;
810 static int rover;
811 static int equilibrium;
812 struct rtable *rth, **rthp;
813 unsigned long now = jiffies;
814 int goal;
817 * Garbage collection is pretty expensive,
818 * do not make it too frequently.
821 RT_CACHE_STAT_INC(gc_total);
823 if (now - last_gc < ip_rt_gc_min_interval &&
824 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
825 RT_CACHE_STAT_INC(gc_ignored);
826 goto out;
829 /* Calculate the number of entries which we want to expire now. */
830 goal = atomic_read(&ipv4_dst_ops.entries) -
831 (ip_rt_gc_elasticity << rt_hash_log);
832 if (goal <= 0) {
833 if (equilibrium < ipv4_dst_ops.gc_thresh)
834 equilibrium = ipv4_dst_ops.gc_thresh;
835 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
836 if (goal > 0) {
837 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
838 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
840 } else {
841 /* We are in a dangerous area. Try to reduce the cache really
842 * aggressively.
844 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
845 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
848 if (now - last_gc >= ip_rt_gc_min_interval)
849 last_gc = now;
851 if (goal <= 0) {
852 equilibrium += goal;
853 goto work_done;
856 do {
857 int i, k;
859 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
860 unsigned long tmo = expire;
862 k = (k + 1) & rt_hash_mask;
863 rthp = &rt_hash_table[k].chain;
864 spin_lock_bh(rt_hash_lock_addr(k));
865 while ((rth = *rthp) != NULL) {
866 if (rth->rt_genid == atomic_read(&rt_genid) &&
867 !rt_may_expire(rth, tmo, expire)) {
868 tmo >>= 1;
869 rthp = &rth->u.dst.rt_next;
870 continue;
872 *rthp = rth->u.dst.rt_next;
873 rt_free(rth);
874 goal--;
876 spin_unlock_bh(rt_hash_lock_addr(k));
877 if (goal <= 0)
878 break;
880 rover = k;
882 if (goal <= 0)
883 goto work_done;
885 /* The goal was not achieved. We stop the process if:
887 - expire was reduced to zero; otherwise expire is halved.
888 - the table is not full.
889 - we are called from interrupt context.
890 - the jiffies check is just a fallback/debug loop breaker;
891 we will not spin here for a long time in any case.
894 RT_CACHE_STAT_INC(gc_goal_miss);
896 if (expire == 0)
897 break;
899 expire >>= 1;
900 #if RT_CACHE_DEBUG >= 2
901 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
902 atomic_read(&ipv4_dst_ops.entries), goal, i);
903 #endif
905 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
906 goto out;
907 } while (!in_softirq() && time_before_eq(jiffies, now));
909 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
910 goto out;
911 if (net_ratelimit())
912 printk(KERN_WARNING "dst cache overflow\n");
913 RT_CACHE_STAT_INC(gc_dst_overflow);
914 return 1;
916 work_done:
917 expire += ip_rt_gc_min_interval;
918 if (expire > ip_rt_gc_timeout ||
919 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
920 expire = ip_rt_gc_timeout;
921 #if RT_CACHE_DEBUG >= 2
922 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
923 atomic_read(&ipv4_dst_ops.entries), goal, rover);
924 #endif
925 out: return 0;
928 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
930 struct rtable *rth, **rthp;
931 unsigned long now;
932 struct rtable *cand, **candp;
933 u32 min_score;
934 int chain_length;
935 int attempts = !in_softirq();
937 restart:
938 chain_length = 0;
939 min_score = ~(u32)0;
940 cand = NULL;
941 candp = NULL;
942 now = jiffies;
944 rthp = &rt_hash_table[hash].chain;
946 spin_lock_bh(rt_hash_lock_addr(hash));
947 while ((rth = *rthp) != NULL) {
948 if (rth->rt_genid != atomic_read(&rt_genid)) {
949 *rthp = rth->u.dst.rt_next;
950 rt_free(rth);
951 continue;
953 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
954 /* Put it first */
955 *rthp = rth->u.dst.rt_next;
957 * Since lookup is lockfree, the deletion
958 * must be visible to another weakly ordered CPU before
959 * the insertion at the start of the hash chain.
961 rcu_assign_pointer(rth->u.dst.rt_next,
962 rt_hash_table[hash].chain);
964 * Since lookup is lockfree, the update writes
965 * must be ordered for consistency on SMP.
967 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
969 dst_use(&rth->u.dst, now);
970 spin_unlock_bh(rt_hash_lock_addr(hash));
972 rt_drop(rt);
973 *rp = rth;
974 return 0;
977 if (!atomic_read(&rth->u.dst.__refcnt)) {
978 u32 score = rt_score(rth);
980 if (score <= min_score) {
981 cand = rth;
982 candp = rthp;
983 min_score = score;
987 chain_length++;
989 rthp = &rth->u.dst.rt_next;
992 if (cand) {
993 /* ip_rt_gc_elasticity used to be the average chain length;
994 * when exceeded, gc becomes really aggressive.
996 * The second limit is less certain. At the moment it allows
997 * only 2 entries per bucket. We will see.
999 if (chain_length > ip_rt_gc_elasticity) {
1000 *candp = cand->u.dst.rt_next;
1001 rt_free(cand);
1005 /* Try to bind the route to an ARP neighbour only if it is an output
1006 route or on the unicast forwarding path.
1008 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1009 int err = arp_bind_neighbour(&rt->u.dst);
1010 if (err) {
1011 spin_unlock_bh(rt_hash_lock_addr(hash));
1013 if (err != -ENOBUFS) {
1014 rt_drop(rt);
1015 return err;
1018 /* The neighbour tables are full and nothing
1019 can be released. Try to shrink the route cache;
1020 it most likely holds some neighbour records.
1022 if (attempts-- > 0) {
1023 int saved_elasticity = ip_rt_gc_elasticity;
1024 int saved_int = ip_rt_gc_min_interval;
1025 ip_rt_gc_elasticity = 1;
1026 ip_rt_gc_min_interval = 0;
1027 rt_garbage_collect(&ipv4_dst_ops);
1028 ip_rt_gc_min_interval = saved_int;
1029 ip_rt_gc_elasticity = saved_elasticity;
1030 goto restart;
1033 if (net_ratelimit())
1034 printk(KERN_WARNING "Neighbour table overflow.\n");
1035 rt_drop(rt);
1036 return -ENOBUFS;
1040 rt->u.dst.rt_next = rt_hash_table[hash].chain;
1041 #if RT_CACHE_DEBUG >= 2
1042 if (rt->u.dst.rt_next) {
1043 struct rtable *trt;
1044 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1045 NIPQUAD(rt->rt_dst));
1046 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1047 printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1048 printk("\n");
1050 #endif
1051 rt_hash_table[hash].chain = rt;
1052 spin_unlock_bh(rt_hash_lock_addr(hash));
1053 *rp = rt;
1054 return 0;
1057 void rt_bind_peer(struct rtable *rt, int create)
1059 static DEFINE_SPINLOCK(rt_peer_lock);
1060 struct inet_peer *peer;
1062 peer = inet_getpeer(rt->rt_dst, create);
1064 spin_lock_bh(&rt_peer_lock);
1065 if (rt->peer == NULL) {
1066 rt->peer = peer;
1067 peer = NULL;
1069 spin_unlock_bh(&rt_peer_lock);
1070 if (peer)
1071 inet_putpeer(peer);
1075 * Peer allocation may fail only in serious out-of-memory conditions. However,
1076 * we can still generate some output.
1077 * Random ID selection looks a bit dangerous because we have no chance of
1078 * selecting an ID that stays unique for a reasonable period of time.
1079 * But a broken packet identifier may be better than no packet at all.
1081 static void ip_select_fb_ident(struct iphdr *iph)
1083 static DEFINE_SPINLOCK(ip_fb_id_lock);
1084 static u32 ip_fallback_id;
1085 u32 salt;
1087 spin_lock_bh(&ip_fb_id_lock);
1088 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1089 iph->id = htons(salt & 0xFFFF);
1090 ip_fallback_id = salt;
1091 spin_unlock_bh(&ip_fb_id_lock);
1094 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1096 struct rtable *rt = (struct rtable *) dst;
1098 if (rt) {
1099 if (rt->peer == NULL)
1100 rt_bind_peer(rt, 1);
1102 /* If a peer is attached to the destination, it is never detached,
1103 so we do not need to grab a lock to dereference it.
1105 if (rt->peer) {
1106 iph->id = htons(inet_getid(rt->peer, more));
1107 return;
1109 } else
1110 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1111 __builtin_return_address(0));
1113 ip_select_fb_ident(iph);
1116 static void rt_del(unsigned hash, struct rtable *rt)
1118 struct rtable **rthp, *aux;
1120 rthp = &rt_hash_table[hash].chain;
1121 spin_lock_bh(rt_hash_lock_addr(hash));
1122 ip_rt_put(rt);
1123 while ((aux = *rthp) != NULL) {
1124 if (aux == rt || (aux->rt_genid != atomic_read(&rt_genid))) {
1125 *rthp = aux->u.dst.rt_next;
1126 rt_free(aux);
1127 continue;
1129 rthp = &aux->u.dst.rt_next;
1131 spin_unlock_bh(rt_hash_lock_addr(hash));
1134 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1135 __be32 saddr, struct net_device *dev)
1137 int i, k;
1138 struct in_device *in_dev = in_dev_get(dev);
1139 struct rtable *rth, **rthp;
1140 __be32 skeys[2] = { saddr, 0 };
1141 int ikeys[2] = { dev->ifindex, 0 };
1142 struct netevent_redirect netevent;
1144 if (!in_dev)
1145 return;
1147 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1148 || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
1149 || ipv4_is_zeronet(new_gw))
1150 goto reject_redirect;
1152 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1153 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1154 goto reject_redirect;
1155 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1156 goto reject_redirect;
1157 } else {
1158 if (inet_addr_type(&init_net, new_gw) != RTN_UNICAST)
1159 goto reject_redirect;
1162 for (i = 0; i < 2; i++) {
1163 for (k = 0; k < 2; k++) {
1164 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1166 rthp=&rt_hash_table[hash].chain;
1168 rcu_read_lock();
1169 while ((rth = rcu_dereference(*rthp)) != NULL) {
1170 struct rtable *rt;
1172 if (rth->fl.fl4_dst != daddr ||
1173 rth->fl.fl4_src != skeys[i] ||
1174 rth->fl.oif != ikeys[k] ||
1175 rth->fl.iif != 0 ||
1176 rth->rt_genid != atomic_read(&rt_genid)) {
1177 rthp = &rth->u.dst.rt_next;
1178 continue;
1181 if (rth->rt_dst != daddr ||
1182 rth->rt_src != saddr ||
1183 rth->u.dst.error ||
1184 rth->rt_gateway != old_gw ||
1185 rth->u.dst.dev != dev)
1186 break;
1188 dst_hold(&rth->u.dst);
1189 rcu_read_unlock();
1191 rt = dst_alloc(&ipv4_dst_ops);
1192 if (rt == NULL) {
1193 ip_rt_put(rth);
1194 in_dev_put(in_dev);
1195 return;
1198 /* Copy all the information. */
1199 *rt = *rth;
1200 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1201 rt->u.dst.__use = 1;
1202 atomic_set(&rt->u.dst.__refcnt, 1);
1203 rt->u.dst.child = NULL;
1204 if (rt->u.dst.dev)
1205 dev_hold(rt->u.dst.dev);
1206 if (rt->idev)
1207 in_dev_hold(rt->idev);
1208 rt->u.dst.obsolete = 0;
1209 rt->u.dst.lastuse = jiffies;
1210 rt->u.dst.path = &rt->u.dst;
1211 rt->u.dst.neighbour = NULL;
1212 rt->u.dst.hh = NULL;
1213 rt->u.dst.xfrm = NULL;
1214 rt->rt_genid = atomic_read(&rt_genid);
1215 rt->rt_flags |= RTCF_REDIRECTED;
1217 /* Gateway is different ... */
1218 rt->rt_gateway = new_gw;
1220 /* Redirect received -> path was valid */
1221 dst_confirm(&rth->u.dst);
1223 if (rt->peer)
1224 atomic_inc(&rt->peer->refcnt);
1226 if (arp_bind_neighbour(&rt->u.dst) ||
1227 !(rt->u.dst.neighbour->nud_state &
1228 NUD_VALID)) {
1229 if (rt->u.dst.neighbour)
1230 neigh_event_send(rt->u.dst.neighbour, NULL);
1231 ip_rt_put(rth);
1232 rt_drop(rt);
1233 goto do_next;
1236 netevent.old = &rth->u.dst;
1237 netevent.new = &rt->u.dst;
1238 call_netevent_notifiers(NETEVENT_REDIRECT,
1239 &netevent);
1241 rt_del(hash, rth);
1242 if (!rt_intern_hash(hash, rt, &rt))
1243 ip_rt_put(rt);
1244 goto do_next;
1246 rcu_read_unlock();
1247 do_next:
1251 in_dev_put(in_dev);
1252 return;
1254 reject_redirect:
1255 #ifdef CONFIG_IP_ROUTE_VERBOSE
1256 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1257 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1258 "%u.%u.%u.%u ignored.\n"
1259 " Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1260 NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1261 NIPQUAD(saddr), NIPQUAD(daddr));
1262 #endif
1263 in_dev_put(in_dev);
1266 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1268 struct rtable *rt = (struct rtable*)dst;
1269 struct dst_entry *ret = dst;
1271 if (rt) {
1272 if (dst->obsolete) {
1273 ip_rt_put(rt);
1274 ret = NULL;
1275 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1276 rt->u.dst.expires) {
1277 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1278 rt->fl.oif);
1279 #if RT_CACHE_DEBUG >= 1
1280 printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
1281 "%u.%u.%u.%u/%02x dropped\n",
1282 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1283 #endif
1284 rt_del(hash, rt);
1285 ret = NULL;
1288 return ret;
1292 * Algorithm:
1293 * 1. The first ip_rt_redirect_number redirects are sent
1294 * with exponential backoff, then we stop sending them at all,
1295 * assuming that the host ignores our redirects.
1296 * 2. If we did not see packets requiring redirects
1297 * during ip_rt_redirect_silence, we assume that the host
1298 * forgot the redirected route and we start sending redirects again.
1300 * This algorithm is much cheaper and more intelligent than dumb load limiting
1301 * in icmp.c.
1303 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1304 * and "frag. need" (breaks PMTU discovery) in icmp.c.
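/*
 * With the defaults above (ip_rt_redirect_number = 9, ip_rt_redirect_load
 * = HZ/50, ip_rt_redirect_silence = (HZ/50) << 10) and HZ = 1000 taken
 * purely for illustration: successive redirects are spaced 40ms, 80ms, ...
 * up to about 5.1s apart, at most 9 are sent, and sending resumes only
 * after roughly 20.5 seconds without traffic that would need a redirect.
 */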
1307 void ip_rt_send_redirect(struct sk_buff *skb)
1309 struct rtable *rt = (struct rtable*)skb->dst;
1310 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1312 if (!in_dev)
1313 return;
1315 if (!IN_DEV_TX_REDIRECTS(in_dev))
1316 goto out;
1318 /* No redirected packets during ip_rt_redirect_silence;
1319 * reset the algorithm.
1321 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1322 rt->u.dst.rate_tokens = 0;
1324 /* Too many ignored redirects; do not send anything.
1325 * Set u.dst.rate_last to the last seen redirected packet.
1327 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1328 rt->u.dst.rate_last = jiffies;
1329 goto out;
1332 /* Check for load limit; set rate_last to the latest sent
1333 * redirect.
1335 if (rt->u.dst.rate_tokens == 0 ||
1336 time_after(jiffies,
1337 (rt->u.dst.rate_last +
1338 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1339 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1340 rt->u.dst.rate_last = jiffies;
1341 ++rt->u.dst.rate_tokens;
1342 #ifdef CONFIG_IP_ROUTE_VERBOSE
1343 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1344 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1345 net_ratelimit())
1346 printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1347 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1348 NIPQUAD(rt->rt_src), rt->rt_iif,
1349 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1350 #endif
1352 out:
1353 in_dev_put(in_dev);
1356 static int ip_error(struct sk_buff *skb)
1358 struct rtable *rt = (struct rtable*)skb->dst;
1359 unsigned long now;
1360 int code;
1362 switch (rt->u.dst.error) {
1363 case EINVAL:
1364 default:
1365 goto out;
1366 case EHOSTUNREACH:
1367 code = ICMP_HOST_UNREACH;
1368 break;
1369 case ENETUNREACH:
1370 code = ICMP_NET_UNREACH;
1371 IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
1372 break;
1373 case EACCES:
1374 code = ICMP_PKT_FILTERED;
1375 break;
1378 now = jiffies;
1379 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1380 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1381 rt->u.dst.rate_tokens = ip_rt_error_burst;
1382 rt->u.dst.rate_last = now;
1383 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1384 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1385 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1388 out: kfree_skb(skb);
1389 return 0;
1393 * The last two values are not from the RFC but
1394 * are needed for AMPRnet AX.25 paths.
1397 static const unsigned short mtu_plateau[] =
1398 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1400 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1402 int i;
1404 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1405 if (old_mtu > mtu_plateau[i])
1406 return mtu_plateau[i];
1407 return 68;
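/*
 * Example: a Fragmentation Needed message with no next-hop MTU
 * (new_mtu == 0, as ancient BSD stacks send) for a 1500 byte packet makes
 * ip_rt_frag_needed() call guess_mtu(1500), which returns 1492, the
 * largest plateau strictly below the old packet size.
 */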
1410 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1411 unsigned short new_mtu)
1413 int i;
1414 unsigned short old_mtu = ntohs(iph->tot_len);
1415 struct rtable *rth;
1416 __be32 skeys[2] = { iph->saddr, 0, };
1417 __be32 daddr = iph->daddr;
1418 unsigned short est_mtu = 0;
1420 if (ipv4_config.no_pmtu_disc)
1421 return 0;
1423 for (i = 0; i < 2; i++) {
1424 unsigned hash = rt_hash(daddr, skeys[i], 0);
1426 rcu_read_lock();
1427 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1428 rth = rcu_dereference(rth->u.dst.rt_next)) {
1429 if (rth->fl.fl4_dst == daddr &&
1430 rth->fl.fl4_src == skeys[i] &&
1431 rth->rt_dst == daddr &&
1432 rth->rt_src == iph->saddr &&
1433 rth->fl.iif == 0 &&
1434 !(dst_metric_locked(&rth->u.dst, RTAX_MTU)) &&
1435 rth->u.dst.dev->nd_net == net &&
1436 rth->rt_genid == atomic_read(&rt_genid)) {
1437 unsigned short mtu = new_mtu;
1439 if (new_mtu < 68 || new_mtu >= old_mtu) {
1441 /* BSD 4.2 compatibility hack :-( */
1442 if (mtu == 0 &&
1443 old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1444 old_mtu >= 68 + (iph->ihl << 2))
1445 old_mtu -= iph->ihl << 2;
1447 mtu = guess_mtu(old_mtu);
1449 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1450 if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1451 dst_confirm(&rth->u.dst);
1452 if (mtu < ip_rt_min_pmtu) {
1453 mtu = ip_rt_min_pmtu;
1454 rth->u.dst.metrics[RTAX_LOCK-1] |=
1455 (1 << RTAX_MTU);
1457 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1458 dst_set_expires(&rth->u.dst,
1459 ip_rt_mtu_expires);
1461 est_mtu = mtu;
1465 rcu_read_unlock();
1467 return est_mtu ? : new_mtu;
1470 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1472 if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1473 !(dst_metric_locked(dst, RTAX_MTU))) {
1474 if (mtu < ip_rt_min_pmtu) {
1475 mtu = ip_rt_min_pmtu;
1476 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1478 dst->metrics[RTAX_MTU-1] = mtu;
1479 dst_set_expires(dst, ip_rt_mtu_expires);
1480 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1484 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1486 return NULL;
1489 static void ipv4_dst_destroy(struct dst_entry *dst)
1491 struct rtable *rt = (struct rtable *) dst;
1492 struct inet_peer *peer = rt->peer;
1493 struct in_device *idev = rt->idev;
1495 if (peer) {
1496 rt->peer = NULL;
1497 inet_putpeer(peer);
1500 if (idev) {
1501 rt->idev = NULL;
1502 in_dev_put(idev);
1506 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1507 int how)
1509 struct rtable *rt = (struct rtable *) dst;
1510 struct in_device *idev = rt->idev;
1511 if (dev != dev->nd_net->loopback_dev && idev && idev->dev == dev) {
1512 struct in_device *loopback_idev =
1513 in_dev_get(dev->nd_net->loopback_dev);
1514 if (loopback_idev) {
1515 rt->idev = loopback_idev;
1516 in_dev_put(idev);
1521 static void ipv4_link_failure(struct sk_buff *skb)
1523 struct rtable *rt;
1525 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1527 rt = (struct rtable *) skb->dst;
1528 if (rt)
1529 dst_set_expires(&rt->u.dst, 0);
1532 static int ip_rt_bug(struct sk_buff *skb)
1534 printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1535 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1536 skb->dev ? skb->dev->name : "?");
1537 kfree_skb(skb);
1538 return 0;
1542 We do not cache the source address of the outgoing interface,
1543 because it is used only by the IP RR, TS and SRR options,
1544 so it is out of the fast path.
1546 BTW remember: "addr" is allowed to be unaligned
1547 in IP options!
1550 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1552 __be32 src;
1553 struct fib_result res;
1555 if (rt->fl.iif == 0)
1556 src = rt->rt_src;
1557 else if (fib_lookup(rt->u.dst.dev->nd_net, &rt->fl, &res) == 0) {
1558 src = FIB_RES_PREFSRC(res);
1559 fib_res_put(&res);
1560 } else
1561 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1562 RT_SCOPE_UNIVERSE);
1563 memcpy(addr, &src, 4);
1566 #ifdef CONFIG_NET_CLS_ROUTE
1567 static void set_class_tag(struct rtable *rt, u32 tag)
1569 if (!(rt->u.dst.tclassid & 0xFFFF))
1570 rt->u.dst.tclassid |= tag & 0xFFFF;
1571 if (!(rt->u.dst.tclassid & 0xFFFF0000))
1572 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1574 #endif
1576 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1578 struct fib_info *fi = res->fi;
1580 if (fi) {
1581 if (FIB_RES_GW(*res) &&
1582 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1583 rt->rt_gateway = FIB_RES_GW(*res);
1584 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1585 sizeof(rt->u.dst.metrics));
1586 if (fi->fib_mtu == 0) {
1587 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1588 if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1589 rt->rt_gateway != rt->rt_dst &&
1590 rt->u.dst.dev->mtu > 576)
1591 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1593 #ifdef CONFIG_NET_CLS_ROUTE
1594 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1595 #endif
1596 } else
1597 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1599 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1600 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1601 if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1602 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1603 if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1604 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1605 ip_rt_min_advmss);
1606 if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1607 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1609 #ifdef CONFIG_NET_CLS_ROUTE
1610 #ifdef CONFIG_IP_MULTIPLE_TABLES
1611 set_class_tag(rt, fib_rules_tclass(res));
1612 #endif
1613 set_class_tag(rt, itag);
1614 #endif
1615 rt->rt_type = res->type;
1618 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1619 u8 tos, struct net_device *dev, int our)
1621 unsigned hash;
1622 struct rtable *rth;
1623 __be32 spec_dst;
1624 struct in_device *in_dev = in_dev_get(dev);
1625 u32 itag = 0;
1627 /* Primary sanity checks. */
1629 if (in_dev == NULL)
1630 return -EINVAL;
1632 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1633 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1634 goto e_inval;
1636 if (ipv4_is_zeronet(saddr)) {
1637 if (!ipv4_is_local_multicast(daddr))
1638 goto e_inval;
1639 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1640 } else if (fib_validate_source(saddr, 0, tos, 0,
1641 dev, &spec_dst, &itag) < 0)
1642 goto e_inval;
1644 rth = dst_alloc(&ipv4_dst_ops);
1645 if (!rth)
1646 goto e_nobufs;
1648 rth->u.dst.output= ip_rt_bug;
1650 atomic_set(&rth->u.dst.__refcnt, 1);
1651 rth->u.dst.flags= DST_HOST;
1652 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1653 rth->u.dst.flags |= DST_NOPOLICY;
1654 rth->fl.fl4_dst = daddr;
1655 rth->rt_dst = daddr;
1656 rth->fl.fl4_tos = tos;
1657 rth->fl.mark = skb->mark;
1658 rth->fl.fl4_src = saddr;
1659 rth->rt_src = saddr;
1660 #ifdef CONFIG_NET_CLS_ROUTE
1661 rth->u.dst.tclassid = itag;
1662 #endif
1663 rth->rt_iif =
1664 rth->fl.iif = dev->ifindex;
1665 rth->u.dst.dev = init_net.loopback_dev;
1666 dev_hold(rth->u.dst.dev);
1667 rth->idev = in_dev_get(rth->u.dst.dev);
1668 rth->fl.oif = 0;
1669 rth->rt_gateway = daddr;
1670 rth->rt_spec_dst= spec_dst;
1671 rth->rt_genid = atomic_read(&rt_genid);
1672 rth->rt_flags = RTCF_MULTICAST;
1673 rth->rt_type = RTN_MULTICAST;
1674 if (our) {
1675 rth->u.dst.input= ip_local_deliver;
1676 rth->rt_flags |= RTCF_LOCAL;
1679 #ifdef CONFIG_IP_MROUTE
1680 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1681 rth->u.dst.input = ip_mr_input;
1682 #endif
1683 RT_CACHE_STAT_INC(in_slow_mc);
1685 in_dev_put(in_dev);
1686 hash = rt_hash(daddr, saddr, dev->ifindex);
1687 return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1689 e_nobufs:
1690 in_dev_put(in_dev);
1691 return -ENOBUFS;
1693 e_inval:
1694 in_dev_put(in_dev);
1695 return -EINVAL;
1699 static void ip_handle_martian_source(struct net_device *dev,
1700 struct in_device *in_dev,
1701 struct sk_buff *skb,
1702 __be32 daddr,
1703 __be32 saddr)
1705 RT_CACHE_STAT_INC(in_martian_src);
1706 #ifdef CONFIG_IP_ROUTE_VERBOSE
1707 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1709 * RFC1812 recommendation: if the source is martian,
1710 * the only hint is the MAC header.
1712 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1713 "%u.%u.%u.%u, on dev %s\n",
1714 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1715 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1716 int i;
1717 const unsigned char *p = skb_mac_header(skb);
1718 printk(KERN_WARNING "ll header: ");
1719 for (i = 0; i < dev->hard_header_len; i++, p++) {
1720 printk("%02x", *p);
1721 if (i < (dev->hard_header_len - 1))
1722 printk(":");
1724 printk("\n");
1727 #endif
1730 static inline int __mkroute_input(struct sk_buff *skb,
1731 struct fib_result* res,
1732 struct in_device *in_dev,
1733 __be32 daddr, __be32 saddr, u32 tos,
1734 struct rtable **result)
1737 struct rtable *rth;
1738 int err;
1739 struct in_device *out_dev;
1740 unsigned flags = 0;
1741 __be32 spec_dst;
1742 u32 itag;
1744 /* get a working reference to the output device */
1745 out_dev = in_dev_get(FIB_RES_DEV(*res));
1746 if (out_dev == NULL) {
1747 if (net_ratelimit())
1748 printk(KERN_CRIT "Bug in ip_route_input" \
1749 "_slow(). Please, report\n");
1750 return -EINVAL;
1754 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1755 in_dev->dev, &spec_dst, &itag);
1756 if (err < 0) {
1757 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1758 saddr);
1760 err = -EINVAL;
1761 goto cleanup;
1764 if (err)
1765 flags |= RTCF_DIRECTSRC;
1767 if (out_dev == in_dev && err && !(flags & RTCF_MASQ) &&
1768 (IN_DEV_SHARED_MEDIA(out_dev) ||
1769 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1770 flags |= RTCF_DOREDIRECT;
1772 if (skb->protocol != htons(ETH_P_IP)) {
1773 /* Not IP (i.e. ARP). Do not create a route if it is
1774 * invalid for proxy ARP. DNAT routes are always valid.
1776 if (out_dev == in_dev) {
1777 err = -EINVAL;
1778 goto cleanup;
1783 rth = dst_alloc(&ipv4_dst_ops);
1784 if (!rth) {
1785 err = -ENOBUFS;
1786 goto cleanup;
1789 atomic_set(&rth->u.dst.__refcnt, 1);
1790 rth->u.dst.flags= DST_HOST;
1791 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1792 rth->u.dst.flags |= DST_NOPOLICY;
1793 if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1794 rth->u.dst.flags |= DST_NOXFRM;
1795 rth->fl.fl4_dst = daddr;
1796 rth->rt_dst = daddr;
1797 rth->fl.fl4_tos = tos;
1798 rth->fl.mark = skb->mark;
1799 rth->fl.fl4_src = saddr;
1800 rth->rt_src = saddr;
1801 rth->rt_gateway = daddr;
1802 rth->rt_iif =
1803 rth->fl.iif = in_dev->dev->ifindex;
1804 rth->u.dst.dev = (out_dev)->dev;
1805 dev_hold(rth->u.dst.dev);
1806 rth->idev = in_dev_get(rth->u.dst.dev);
1807 rth->fl.oif = 0;
1808 rth->rt_spec_dst= spec_dst;
1810 rth->u.dst.input = ip_forward;
1811 rth->u.dst.output = ip_output;
1812 rth->rt_genid = atomic_read(&rt_genid);
1814 rt_set_nexthop(rth, res, itag);
1816 rth->rt_flags = flags;
1818 *result = rth;
1819 err = 0;
1820 cleanup:
1821 /* release the working reference to the output device */
1822 in_dev_put(out_dev);
1823 return err;
1826 static inline int ip_mkroute_input(struct sk_buff *skb,
1827 struct fib_result* res,
1828 const struct flowi *fl,
1829 struct in_device *in_dev,
1830 __be32 daddr, __be32 saddr, u32 tos)
1832 struct rtable* rth = NULL;
1833 int err;
1834 unsigned hash;
1836 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1837 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1838 fib_select_multipath(fl, res);
1839 #endif
1841 /* create a routing cache entry */
1842 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1843 if (err)
1844 return err;
1846 /* put it into the cache */
1847 hash = rt_hash(daddr, saddr, fl->iif);
1848 return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1852 * NOTE. We drop all packets that have a local source
1853 * address, because every properly looped-back packet
1854 * must already have the correct destination attached by the output routine.
1856 * This approach solves two big problems:
1857 * 1. Non-simplex devices are handled properly.
1858 * 2. IP spoofing attempts are filtered with a 100% guarantee.
1861 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1862 u8 tos, struct net_device *dev)
1864 struct fib_result res;
1865 struct in_device *in_dev = in_dev_get(dev);
1866 struct flowi fl = { .nl_u = { .ip4_u =
1867 { .daddr = daddr,
1868 .saddr = saddr,
1869 .tos = tos,
1870 .scope = RT_SCOPE_UNIVERSE,
1871 } },
1872 .mark = skb->mark,
1873 .iif = dev->ifindex };
1874 unsigned flags = 0;
1875 u32 itag = 0;
1876 struct rtable * rth;
1877 unsigned hash;
1878 __be32 spec_dst;
1879 int err = -EINVAL;
1880 int free_res = 0;
1881 struct net * net = dev->nd_net;
1883 /* IP on this device is disabled. */
1885 if (!in_dev)
1886 goto out;
1888 /* Check for the weirdest martians, which cannot be detected
1889 by fib_lookup.
1892 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1893 ipv4_is_loopback(saddr))
1894 goto martian_source;
1896 if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1897 goto brd_input;
1899 /* Accept zero addresses only for limited broadcast;
1900 * I do not even know whether to fix it or not. Waiting for complaints :-)
1902 if (ipv4_is_zeronet(saddr))
1903 goto martian_source;
1905 if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
1906 ipv4_is_loopback(daddr))
1907 goto martian_destination;
1910 * Now we are ready to route the packet.
1912 if ((err = fib_lookup(net, &fl, &res)) != 0) {
1913 if (!IN_DEV_FORWARD(in_dev))
1914 goto e_hostunreach;
1915 goto no_route;
1917 free_res = 1;
1919 RT_CACHE_STAT_INC(in_slow_tot);
1921 if (res.type == RTN_BROADCAST)
1922 goto brd_input;
1924 if (res.type == RTN_LOCAL) {
1925 int result;
1926 result = fib_validate_source(saddr, daddr, tos,
1927 net->loopback_dev->ifindex,
1928 dev, &spec_dst, &itag);
1929 if (result < 0)
1930 goto martian_source;
1931 if (result)
1932 flags |= RTCF_DIRECTSRC;
1933 spec_dst = daddr;
1934 goto local_input;
1937 if (!IN_DEV_FORWARD(in_dev))
1938 goto e_hostunreach;
1939 if (res.type != RTN_UNICAST)
1940 goto martian_destination;
1942 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1943 done:
1944 in_dev_put(in_dev);
1945 if (free_res)
1946 fib_res_put(&res);
1947 out: return err;
1949 brd_input:
1950 if (skb->protocol != htons(ETH_P_IP))
1951 goto e_inval;
1953 if (ipv4_is_zeronet(saddr))
1954 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1955 else {
1956 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1957 &itag);
1958 if (err < 0)
1959 goto martian_source;
1960 if (err)
1961 flags |= RTCF_DIRECTSRC;
1963 flags |= RTCF_BROADCAST;
1964 res.type = RTN_BROADCAST;
1965 RT_CACHE_STAT_INC(in_brd);
1967 local_input:
1968 rth = dst_alloc(&ipv4_dst_ops);
1969 if (!rth)
1970 goto e_nobufs;
1972 rth->u.dst.output= ip_rt_bug;
1973 rth->rt_genid = atomic_read(&rt_genid);
1975 atomic_set(&rth->u.dst.__refcnt, 1);
1976 rth->u.dst.flags= DST_HOST;
1977 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1978 rth->u.dst.flags |= DST_NOPOLICY;
1979 rth->fl.fl4_dst = daddr;
1980 rth->rt_dst = daddr;
1981 rth->fl.fl4_tos = tos;
1982 rth->fl.mark = skb->mark;
1983 rth->fl.fl4_src = saddr;
1984 rth->rt_src = saddr;
1985 #ifdef CONFIG_NET_CLS_ROUTE
1986 rth->u.dst.tclassid = itag;
1987 #endif
1988 rth->rt_iif =
1989 rth->fl.iif = dev->ifindex;
1990 rth->u.dst.dev = net->loopback_dev;
1991 dev_hold(rth->u.dst.dev);
1992 rth->idev = in_dev_get(rth->u.dst.dev);
1993 rth->rt_gateway = daddr;
1994 rth->rt_spec_dst= spec_dst;
1995 rth->u.dst.input= ip_local_deliver;
1996 rth->rt_flags = flags|RTCF_LOCAL;
1997 if (res.type == RTN_UNREACHABLE) {
1998 rth->u.dst.input= ip_error;
1999 rth->u.dst.error= -err;
2000 rth->rt_flags &= ~RTCF_LOCAL;
2002 rth->rt_type = res.type;
2003 hash = rt_hash(daddr, saddr, fl.iif);
2004 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2005 goto done;
2007 no_route:
2008 RT_CACHE_STAT_INC(in_no_route);
2009 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2010 res.type = RTN_UNREACHABLE;
2011 if (err == -ESRCH)
2012 err = -ENETUNREACH;
2013 goto local_input;
2016 * Do not cache martian addresses: they should be logged (RFC1812)
2018 martian_destination:
2019 RT_CACHE_STAT_INC(in_martian_dst);
2020 #ifdef CONFIG_IP_ROUTE_VERBOSE
2021 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2022 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2023 "%u.%u.%u.%u, dev %s\n",
2024 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2025 #endif
2027 e_hostunreach:
2028 err = -EHOSTUNREACH;
2029 goto done;
2031 e_inval:
2032 err = -EINVAL;
2033 goto done;
2035 e_nobufs:
2036 err = -ENOBUFS;
2037 goto done;
2039 martian_source:
2040 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2041 goto e_inval;
2044 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2045 u8 tos, struct net_device *dev)
2047 struct rtable * rth;
2048 unsigned hash;
2049 int iif = dev->ifindex;
2050 struct net *net;
2052 net = dev->nd_net;
2053 tos &= IPTOS_RT_MASK;
2054 hash = rt_hash(daddr, saddr, iif);
2056 rcu_read_lock();
2057 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2058 rth = rcu_dereference(rth->u.dst.rt_next)) {
2059 if (rth->fl.fl4_dst == daddr &&
2060 rth->fl.fl4_src == saddr &&
2061 rth->fl.iif == iif &&
2062 rth->fl.oif == 0 &&
2063 rth->fl.mark == skb->mark &&
2064 rth->fl.fl4_tos == tos &&
2065 rth->u.dst.dev->nd_net == net &&
2066 rth->rt_genid == atomic_read(&rt_genid)) {
2067 dst_use(&rth->u.dst, jiffies);
2068 RT_CACHE_STAT_INC(in_hit);
2069 rcu_read_unlock();
2070 skb->dst = (struct dst_entry*)rth;
2071 return 0;
2073 RT_CACHE_STAT_INC(in_hlist_search);
2075 rcu_read_unlock();
2077 /* Multicast recognition logic was moved from the route cache to here.
2078 The problem was that too many Ethernet cards have broken/missing
2079 hardware multicast filters :-( As a result a host on a multicast
2080 network acquires a lot of useless route cache entries, e.g. for
2081 SDR messages from all over the world. Now we try to get rid of them.
2082 Really, provided the software IP multicast filter is organized
2083 reasonably (at least, hashed), it does not result in a slowdown
2084 compared with route cache reject entries.
2085 Note that multicast routers are not affected, because a
2086 route cache entry is created eventually.
2088 if (ipv4_is_multicast(daddr)) {
2089 struct in_device *in_dev;
2091 rcu_read_lock();
2092 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2093 int our = ip_check_mc(in_dev, daddr, saddr,
2094 ip_hdr(skb)->protocol);
2095 if (our
2096 #ifdef CONFIG_IP_MROUTE
2097 || (!ipv4_is_local_multicast(daddr) &&
2098 IN_DEV_MFORWARD(in_dev))
2099 #endif
2101 rcu_read_unlock();
2102 return ip_route_input_mc(skb, daddr, saddr,
2103 tos, dev, our);
2106 rcu_read_unlock();
2107 return -EINVAL;
2109 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2112 static inline int __mkroute_output(struct rtable **result,
2113 struct fib_result* res,
2114 const struct flowi *fl,
2115 const struct flowi *oldflp,
2116 struct net_device *dev_out,
2117 unsigned flags)
2119 struct rtable *rth;
2120 struct in_device *in_dev;
2121 u32 tos = RT_FL_TOS(oldflp);
2122 int err = 0;
2124 if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2125 return -EINVAL;
2127 if (fl->fl4_dst == htonl(0xFFFFFFFF))
2128 res->type = RTN_BROADCAST;
2129 else if (ipv4_is_multicast(fl->fl4_dst))
2130 res->type = RTN_MULTICAST;
2131 else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2132 return -EINVAL;
2134 if (dev_out->flags & IFF_LOOPBACK)
2135 flags |= RTCF_LOCAL;
2137 /* get a working reference to the inet device */
2138 in_dev = in_dev_get(dev_out);
2139 if (!in_dev)
2140 return -EINVAL;
2142 if (res->type == RTN_BROADCAST) {
2143 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2144 if (res->fi) {
2145 fib_info_put(res->fi);
2146 res->fi = NULL;
2148 } else if (res->type == RTN_MULTICAST) {
2149 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2150 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2151 oldflp->proto))
2152 flags &= ~RTCF_LOCAL;
2153 /* If a multicast route does not exist, use the
2154 default one, but do not use a gateway in this case.
2155 Yes, it is a hack. */
2157 if (res->fi && res->prefixlen < 4) {
2158 fib_info_put(res->fi);
2159 res->fi = NULL;
2164 rth = dst_alloc(&ipv4_dst_ops);
2165 if (!rth) {
2166 err = -ENOBUFS;
2167 goto cleanup;
2170 atomic_set(&rth->u.dst.__refcnt, 1);
2171 rth->u.dst.flags= DST_HOST;
2172 if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2173 rth->u.dst.flags |= DST_NOXFRM;
2174 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2175 rth->u.dst.flags |= DST_NOPOLICY;
2177 rth->fl.fl4_dst = oldflp->fl4_dst;
2178 rth->fl.fl4_tos = tos;
2179 rth->fl.fl4_src = oldflp->fl4_src;
2180 rth->fl.oif = oldflp->oif;
2181 rth->fl.mark = oldflp->mark;
2182 rth->rt_dst = fl->fl4_dst;
2183 rth->rt_src = fl->fl4_src;
2184 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
2185 /* get references to the devices that are to be held by the routing
2186 cache entry */
2187 rth->u.dst.dev = dev_out;
2188 dev_hold(dev_out);
2189 rth->idev = in_dev_get(dev_out);
2190 rth->rt_gateway = fl->fl4_dst;
2191 rth->rt_spec_dst= fl->fl4_src;
2193 rth->u.dst.output=ip_output;
2194 rth->rt_genid = atomic_read(&rt_genid);
2196 RT_CACHE_STAT_INC(out_slow_tot);
2198 if (flags & RTCF_LOCAL) {
2199 rth->u.dst.input = ip_local_deliver;
2200 rth->rt_spec_dst = fl->fl4_dst;
2202 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2203 rth->rt_spec_dst = fl->fl4_src;
2204 if (flags & RTCF_LOCAL &&
2205 !(dev_out->flags & IFF_LOOPBACK)) {
2206 rth->u.dst.output = ip_mc_output;
2207 RT_CACHE_STAT_INC(out_slow_mc);
2209 #ifdef CONFIG_IP_MROUTE
2210 if (res->type == RTN_MULTICAST) {
2211 if (IN_DEV_MFORWARD(in_dev) &&
2212 !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2213 rth->u.dst.input = ip_mr_input;
2214 rth->u.dst.output = ip_mc_output;
2217 #endif
2220 rt_set_nexthop(rth, res, 0);
2222 rth->rt_flags = flags;
2224 *result = rth;
2225 cleanup:
2226 /* release the working reference to the inet device */
2227 in_dev_put(in_dev);
2229 return err;
2232 static inline int ip_mkroute_output(struct rtable **rp,
2233 struct fib_result* res,
2234 const struct flowi *fl,
2235 const struct flowi *oldflp,
2236 struct net_device *dev_out,
2237 unsigned flags)
2239 struct rtable *rth = NULL;
2240 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2241 unsigned hash;
2242 if (err == 0) {
2243 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2244 err = rt_intern_hash(hash, rth, rp);
2247 return err;
2251 /* Major route resolver routine. */
2254 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2255 const struct flowi *oldflp)
2257 u32 tos = RT_FL_TOS(oldflp);
2258 struct flowi fl = { .nl_u = { .ip4_u =
2259 { .daddr = oldflp->fl4_dst,
2260 .saddr = oldflp->fl4_src,
2261 .tos = tos & IPTOS_RT_MASK,
2262 .scope = ((tos & RTO_ONLINK) ?
2263 RT_SCOPE_LINK :
2264 RT_SCOPE_UNIVERSE),
2265 } },
2266 .mark = oldflp->mark,
2267 .iif = net->loopback_dev->ifindex,
2268 .oif = oldflp->oif };
2269 struct fib_result res;
2270 unsigned flags = 0;
2271 struct net_device *dev_out = NULL;
2272 int free_res = 0;
2273 int err;
2276 res.fi = NULL;
2277 #ifdef CONFIG_IP_MULTIPLE_TABLES
2278 res.r = NULL;
2279 #endif
2281 if (oldflp->fl4_src) {
2282 err = -EINVAL;
2283 if (ipv4_is_multicast(oldflp->fl4_src) ||
2284 ipv4_is_lbcast(oldflp->fl4_src) ||
2285 ipv4_is_zeronet(oldflp->fl4_src))
2286 goto out;
2288 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2289 dev_out = ip_dev_find(net, oldflp->fl4_src);
2290 if (dev_out == NULL)
2291 goto out;
2293 /* I removed the check for oif == dev_out->oif here.
2294 It was wrong for two reasons:
2295 1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2296 is assigned to multiple interfaces.
2297 2. Moreover, we are allowed to send packets with a saddr
2298 belonging to another iface. --ANK */
2301 if (oldflp->oif == 0
2302 && (ipv4_is_multicast(oldflp->fl4_dst) ||
2303 oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2304 /* Special hack: the user can direct multicasts
2305 and limited broadcasts via the necessary interface
2306 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2307 This hack is not just for fun, it allows
2308 vic, vat and friends to work.
2309 They bind a socket to loopback, set the ttl to zero
2310 and expect that it will work.
2311 From the viewpoint of the routing cache they are broken,
2312 because we are not allowed to build a multicast path
2313 with a loopback source addr (the routing cache
2314 cannot know that the ttl is zero, so the packet
2315 will not leave this host and the route is valid).
2316 Luckily, this hack is a good workaround. */
2319 fl.oif = dev_out->ifindex;
2320 goto make_route;
2322 if (dev_out)
2323 dev_put(dev_out);
2324 dev_out = NULL;
2328 if (oldflp->oif) {
2329 dev_out = dev_get_by_index(net, oldflp->oif);
2330 err = -ENODEV;
2331 if (dev_out == NULL)
2332 goto out;
2334 /* RACE: Check return value of inet_select_addr instead. */
2335 if (__in_dev_get_rtnl(dev_out) == NULL) {
2336 dev_put(dev_out);
2337 goto out; /* Wrong error code */
2340 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2341 oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2342 if (!fl.fl4_src)
2343 fl.fl4_src = inet_select_addr(dev_out, 0,
2344 RT_SCOPE_LINK);
2345 goto make_route;
2347 if (!fl.fl4_src) {
2348 if (ipv4_is_multicast(oldflp->fl4_dst))
2349 fl.fl4_src = inet_select_addr(dev_out, 0,
2350 fl.fl4_scope);
2351 else if (!oldflp->fl4_dst)
2352 fl.fl4_src = inet_select_addr(dev_out, 0,
2353 RT_SCOPE_HOST);
2357 if (!fl.fl4_dst) {
2358 fl.fl4_dst = fl.fl4_src;
2359 if (!fl.fl4_dst)
2360 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2361 if (dev_out)
2362 dev_put(dev_out);
2363 dev_out = net->loopback_dev;
2364 dev_hold(dev_out);
2365 fl.oif = net->loopback_dev->ifindex;
2366 res.type = RTN_LOCAL;
2367 flags |= RTCF_LOCAL;
2368 goto make_route;
2371 if (fib_lookup(net, &fl, &res)) {
2372 res.fi = NULL;
2373 if (oldflp->oif) {
2374 /* Apparently, the routing tables are wrong. Assume
2375 that the destination is on-link.
2377 WHY? DW.
2378 Because we are allowed to send to an iface
2379 even if it has NO routes and NO assigned
2380 addresses. When oif is specified, the routing
2381 tables are looked up with only one purpose:
2382 to catch whether the destination is gatewayed rather than
2383 direct. Moreover, if MSG_DONTROUTE is set,
2384 we send the packet, ignoring both routing tables
2385 and ifaddr state. --ANK
2388 We could do the same even when oif is unknown,
2389 as IPv6 likely does, but we do not. */
2392 if (fl.fl4_src == 0)
2393 fl.fl4_src = inet_select_addr(dev_out, 0,
2394 RT_SCOPE_LINK);
2395 res.type = RTN_UNICAST;
2396 goto make_route;
2398 if (dev_out)
2399 dev_put(dev_out);
2400 err = -ENETUNREACH;
2401 goto out;
2403 free_res = 1;
2405 if (res.type == RTN_LOCAL) {
2406 if (!fl.fl4_src)
2407 fl.fl4_src = fl.fl4_dst;
2408 if (dev_out)
2409 dev_put(dev_out);
2410 dev_out = net->loopback_dev;
2411 dev_hold(dev_out);
2412 fl.oif = dev_out->ifindex;
2413 if (res.fi)
2414 fib_info_put(res.fi);
2415 res.fi = NULL;
2416 flags |= RTCF_LOCAL;
2417 goto make_route;
2420 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2421 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2422 fib_select_multipath(&fl, &res);
2423 else
2424 #endif
2425 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2426 fib_select_default(net, &fl, &res);
2428 if (!fl.fl4_src)
2429 fl.fl4_src = FIB_RES_PREFSRC(res);
2431 if (dev_out)
2432 dev_put(dev_out);
2433 dev_out = FIB_RES_DEV(res);
2434 dev_hold(dev_out);
2435 fl.oif = dev_out->ifindex;
2438 make_route:
2439 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2442 if (free_res)
2443 fib_res_put(&res);
2444 if (dev_out)
2445 dev_put(dev_out);
2446 out: return err;
2449 int __ip_route_output_key(struct net *net, struct rtable **rp,
2450 const struct flowi *flp)
2452 unsigned hash;
2453 struct rtable *rth;
2455 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2457 rcu_read_lock_bh();
2458 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2459 rth = rcu_dereference(rth->u.dst.rt_next)) {
2460 if (rth->fl.fl4_dst == flp->fl4_dst &&
2461 rth->fl.fl4_src == flp->fl4_src &&
2462 rth->fl.iif == 0 &&
2463 rth->fl.oif == flp->oif &&
2464 rth->fl.mark == flp->mark &&
2465 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2466 (IPTOS_RT_MASK | RTO_ONLINK)) &&
2467 rth->u.dst.dev->nd_net == net &&
2468 rth->rt_genid == atomic_read(&rt_genid)) {
2469 dst_use(&rth->u.dst, jiffies);
2470 RT_CACHE_STAT_INC(out_hit);
2471 rcu_read_unlock_bh();
2472 *rp = rth;
2473 return 0;
2475 RT_CACHE_STAT_INC(out_hlist_search);
2477 rcu_read_unlock_bh();
2479 return ip_route_output_slow(net, rp, flp);
2482 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2484 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2488 static struct dst_ops ipv4_dst_blackhole_ops = {
2489 .family = AF_INET,
2490 .protocol = __constant_htons(ETH_P_IP),
2491 .destroy = ipv4_dst_destroy,
2492 .check = ipv4_dst_check,
2493 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2494 .entry_size = sizeof(struct rtable),
2495 .entries = ATOMIC_INIT(0),
2499 static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
2501 struct rtable *ort = *rp;
2502 struct rtable *rt = (struct rtable *)
2503 dst_alloc(&ipv4_dst_blackhole_ops);
2505 if (rt) {
2506 struct dst_entry *new = &rt->u.dst;
2508 atomic_set(&new->__refcnt, 1);
2509 new->__use = 1;
2510 new->input = dst_discard;
2511 new->output = dst_discard;
2512 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2514 new->dev = ort->u.dst.dev;
2515 if (new->dev)
2516 dev_hold(new->dev);
2518 rt->fl = ort->fl;
2520 rt->idev = ort->idev;
2521 if (rt->idev)
2522 in_dev_hold(rt->idev);
2523 rt->rt_genid = atomic_read(&rt_genid);
2524 rt->rt_flags = ort->rt_flags;
2525 rt->rt_type = ort->rt_type;
2526 rt->rt_dst = ort->rt_dst;
2527 rt->rt_src = ort->rt_src;
2528 rt->rt_iif = ort->rt_iif;
2529 rt->rt_gateway = ort->rt_gateway;
2530 rt->rt_spec_dst = ort->rt_spec_dst;
2531 rt->peer = ort->peer;
2532 if (rt->peer)
2533 atomic_inc(&rt->peer->refcnt);
2535 dst_free(new);
2538 dst_release(&(*rp)->u.dst);
2539 *rp = rt;
2540 return (rt ? 0 : -ENOMEM);
2543 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2544 struct sock *sk, int flags)
2546 int err;
2548 if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2549 return err;
2551 if (flp->proto) {
2552 if (!flp->fl4_src)
2553 flp->fl4_src = (*rp)->rt_src;
2554 if (!flp->fl4_dst)
2555 flp->fl4_dst = (*rp)->rt_dst;
2556 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk,
2557 flags ? XFRM_LOOKUP_WAIT : 0);
2558 if (err == -EREMOTE)
2559 err = ipv4_dst_blackhole(rp, flp, sk);
2561 return err;
2564 return 0;
2567 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2569 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2571 return ip_route_output_flow(net, rp, flp, NULL, 0);
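/* Usage sketch (illustrative only): a caller that just needs an output
   route fills a flow key and lets the resolver choose the source address
   and device; dst_ip and tos below are example variables, not names from
   this file:

	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst_ip,
						 .saddr = 0,
						 .tos   = RT_TOS(tos) } },
			    .oif = 0 };
	struct rtable *rt;
	int err = ip_route_output_key(&init_net, &rt, &fl);

	if (err)
		return err;
	// rt->rt_src holds the chosen source, rt->u.dst.dev the output device
	ip_rt_put(rt);
 */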
2574 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2575 int nowait, unsigned int flags)
2577 struct rtable *rt = (struct rtable*)skb->dst;
2578 struct rtmsg *r;
2579 struct nlmsghdr *nlh;
2580 long expires;
2581 u32 id = 0, ts = 0, tsage = 0, error;
2583 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2584 if (nlh == NULL)
2585 return -EMSGSIZE;
2587 r = nlmsg_data(nlh);
2588 r->rtm_family = AF_INET;
2589 r->rtm_dst_len = 32;
2590 r->rtm_src_len = 0;
2591 r->rtm_tos = rt->fl.fl4_tos;
2592 r->rtm_table = RT_TABLE_MAIN;
2593 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2594 r->rtm_type = rt->rt_type;
2595 r->rtm_scope = RT_SCOPE_UNIVERSE;
2596 r->rtm_protocol = RTPROT_UNSPEC;
2597 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2598 if (rt->rt_flags & RTCF_NOTIFY)
2599 r->rtm_flags |= RTM_F_NOTIFY;
2601 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2603 if (rt->fl.fl4_src) {
2604 r->rtm_src_len = 32;
2605 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2607 if (rt->u.dst.dev)
2608 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2609 #ifdef CONFIG_NET_CLS_ROUTE
2610 if (rt->u.dst.tclassid)
2611 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2612 #endif
2613 if (rt->fl.iif)
2614 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2615 else if (rt->rt_src != rt->fl.fl4_src)
2616 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2618 if (rt->rt_dst != rt->rt_gateway)
2619 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2621 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2622 goto nla_put_failure;
2624 error = rt->u.dst.error;
2625 expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2626 if (rt->peer) {
2627 id = rt->peer->ip_id_count;
2628 if (rt->peer->tcp_ts_stamp) {
2629 ts = rt->peer->tcp_ts;
2630 tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2634 if (rt->fl.iif) {
2635 #ifdef CONFIG_IP_MROUTE
2636 __be32 dst = rt->rt_dst;
2638 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2639 IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) {
2640 int err = ipmr_get_route(skb, r, nowait);
2641 if (err <= 0) {
2642 if (!nowait) {
2643 if (err == 0)
2644 return 0;
2645 goto nla_put_failure;
2646 } else {
2647 if (err == -EMSGSIZE)
2648 goto nla_put_failure;
2649 error = err;
2652 } else
2653 #endif
2654 NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2657 if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2658 expires, error) < 0)
2659 goto nla_put_failure;
2661 return nlmsg_end(skb, nlh);
2663 nla_put_failure:
2664 nlmsg_cancel(skb, nlh);
2665 return -EMSGSIZE;
2668 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2670 struct net *net = in_skb->sk->sk_net;
2671 struct rtmsg *rtm;
2672 struct nlattr *tb[RTA_MAX+1];
2673 struct rtable *rt = NULL;
2674 __be32 dst = 0;
2675 __be32 src = 0;
2676 u32 iif;
2677 int err;
2678 struct sk_buff *skb;
2680 if (net != &init_net)
2681 return -EINVAL;
2683 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2684 if (err < 0)
2685 goto errout;
2687 rtm = nlmsg_data(nlh);
2689 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2690 if (skb == NULL) {
2691 err = -ENOBUFS;
2692 goto errout;
2695 /* Reserve room for dummy headers; this skb can pass
2696 through a good chunk of the routing engine. */
2698 skb_reset_mac_header(skb);
2699 skb_reset_network_header(skb);
2701 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2702 ip_hdr(skb)->protocol = IPPROTO_ICMP;
2703 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2705 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2706 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2707 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2709 if (iif) {
2710 struct net_device *dev;
2712 dev = __dev_get_by_index(&init_net, iif);
2713 if (dev == NULL) {
2714 err = -ENODEV;
2715 goto errout_free;
2718 skb->protocol = htons(ETH_P_IP);
2719 skb->dev = dev;
2720 local_bh_disable();
2721 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2722 local_bh_enable();
2724 rt = (struct rtable*) skb->dst;
2725 if (err == 0 && rt->u.dst.error)
2726 err = -rt->u.dst.error;
2727 } else {
2728 struct flowi fl = {
2729 .nl_u = {
2730 .ip4_u = {
2731 .daddr = dst,
2732 .saddr = src,
2733 .tos = rtm->rtm_tos,
2736 .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2738 err = ip_route_output_key(&init_net, &rt, &fl);
2741 if (err)
2742 goto errout_free;
2744 skb->dst = &rt->u.dst;
2745 if (rtm->rtm_flags & RTM_F_NOTIFY)
2746 rt->rt_flags |= RTCF_NOTIFY;
2748 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2749 RTM_NEWROUTE, 0, 0);
2750 if (err <= 0)
2751 goto errout_free;
2753 err = rtnl_unicast(skb, &init_net, NETLINK_CB(in_skb).pid);
2754 errout:
2755 return err;
2757 errout_free:
2758 kfree_skb(skb);
2759 goto errout;
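/* Note: this handler answers RTM_GETROUTE requests, e.g. the query sent
   by the user space command

	ip route get <address>

   and reports the resolved route back through rt_fill_info() above. */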
2762 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2764 struct rtable *rt;
2765 int h, s_h;
2766 int idx, s_idx;
2768 s_h = cb->args[0];
2769 if (s_h < 0)
2770 s_h = 0;
2771 s_idx = idx = cb->args[1];
2772 for (h = s_h; h <= rt_hash_mask; h++) {
2773 rcu_read_lock_bh();
2774 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2775 rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2776 if (idx < s_idx)
2777 continue;
2778 if (rt->rt_genid != atomic_read(&rt_genid))
2779 continue;
2780 skb->dst = dst_clone(&rt->u.dst);
2781 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2782 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2783 1, NLM_F_MULTI) <= 0) {
2784 dst_release(xchg(&skb->dst, NULL));
2785 rcu_read_unlock_bh();
2786 goto done;
2788 dst_release(xchg(&skb->dst, NULL));
2790 rcu_read_unlock_bh();
2791 s_idx = 0;
2794 done:
2795 cb->args[0] = h;
2796 cb->args[1] = idx;
2797 return skb->len;
2800 void ip_rt_multicast_event(struct in_device *in_dev)
2802 rt_cache_flush(0);
2805 #ifdef CONFIG_SYSCTL
2806 static int flush_delay;
2808 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2809 struct file *filp, void __user *buffer,
2810 size_t *lenp, loff_t *ppos)
2812 if (write) {
2813 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2814 rt_cache_flush(flush_delay);
2815 return 0;
2818 return -EINVAL;
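/* Usage sketch (illustrative only): the "flush" entry below is write-only
   (mode 0200); writing a delay value to it ends up in rt_cache_flush(),
   e.g. from user space:

	echo 0 > /proc/sys/net/ipv4/route/flush
 */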
2821 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2822 int __user *name,
2823 int nlen,
2824 void __user *oldval,
2825 size_t __user *oldlenp,
2826 void __user *newval,
2827 size_t newlen)
2829 int delay;
2830 if (newlen != sizeof(int))
2831 return -EINVAL;
2832 if (get_user(delay, (int __user *)newval))
2833 return -EFAULT;
2834 rt_cache_flush(delay);
2835 return 0;
2838 ctl_table ipv4_route_table[] = {
2840 .ctl_name = NET_IPV4_ROUTE_FLUSH,
2841 .procname = "flush",
2842 .data = &flush_delay,
2843 .maxlen = sizeof(int),
2844 .mode = 0200,
2845 .proc_handler = &ipv4_sysctl_rtcache_flush,
2846 .strategy = &ipv4_sysctl_rtcache_flush_strategy,
2849 .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
2850 .procname = "gc_thresh",
2851 .data = &ipv4_dst_ops.gc_thresh,
2852 .maxlen = sizeof(int),
2853 .mode = 0644,
2854 .proc_handler = &proc_dointvec,
2857 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
2858 .procname = "max_size",
2859 .data = &ip_rt_max_size,
2860 .maxlen = sizeof(int),
2861 .mode = 0644,
2862 .proc_handler = &proc_dointvec,
2865 /* Deprecated. Use gc_min_interval_ms */
2867 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2868 .procname = "gc_min_interval",
2869 .data = &ip_rt_gc_min_interval,
2870 .maxlen = sizeof(int),
2871 .mode = 0644,
2872 .proc_handler = &proc_dointvec_jiffies,
2873 .strategy = &sysctl_jiffies,
2876 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2877 .procname = "gc_min_interval_ms",
2878 .data = &ip_rt_gc_min_interval,
2879 .maxlen = sizeof(int),
2880 .mode = 0644,
2881 .proc_handler = &proc_dointvec_ms_jiffies,
2882 .strategy = &sysctl_ms_jiffies,
2885 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
2886 .procname = "gc_timeout",
2887 .data = &ip_rt_gc_timeout,
2888 .maxlen = sizeof(int),
2889 .mode = 0644,
2890 .proc_handler = &proc_dointvec_jiffies,
2891 .strategy = &sysctl_jiffies,
2894 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
2895 .procname = "gc_interval",
2896 .data = &ip_rt_gc_interval,
2897 .maxlen = sizeof(int),
2898 .mode = 0644,
2899 .proc_handler = &proc_dointvec_jiffies,
2900 .strategy = &sysctl_jiffies,
2903 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
2904 .procname = "redirect_load",
2905 .data = &ip_rt_redirect_load,
2906 .maxlen = sizeof(int),
2907 .mode = 0644,
2908 .proc_handler = &proc_dointvec,
2911 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2912 .procname = "redirect_number",
2913 .data = &ip_rt_redirect_number,
2914 .maxlen = sizeof(int),
2915 .mode = 0644,
2916 .proc_handler = &proc_dointvec,
2919 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2920 .procname = "redirect_silence",
2921 .data = &ip_rt_redirect_silence,
2922 .maxlen = sizeof(int),
2923 .mode = 0644,
2924 .proc_handler = &proc_dointvec,
2927 .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
2928 .procname = "error_cost",
2929 .data = &ip_rt_error_cost,
2930 .maxlen = sizeof(int),
2931 .mode = 0644,
2932 .proc_handler = &proc_dointvec,
2935 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
2936 .procname = "error_burst",
2937 .data = &ip_rt_error_burst,
2938 .maxlen = sizeof(int),
2939 .mode = 0644,
2940 .proc_handler = &proc_dointvec,
2943 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
2944 .procname = "gc_elasticity",
2945 .data = &ip_rt_gc_elasticity,
2946 .maxlen = sizeof(int),
2947 .mode = 0644,
2948 .proc_handler = &proc_dointvec,
2951 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
2952 .procname = "mtu_expires",
2953 .data = &ip_rt_mtu_expires,
2954 .maxlen = sizeof(int),
2955 .mode = 0644,
2956 .proc_handler = &proc_dointvec_jiffies,
2957 .strategy = &sysctl_jiffies,
2960 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
2961 .procname = "min_pmtu",
2962 .data = &ip_rt_min_pmtu,
2963 .maxlen = sizeof(int),
2964 .mode = 0644,
2965 .proc_handler = &proc_dointvec,
2968 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
2969 .procname = "min_adv_mss",
2970 .data = &ip_rt_min_advmss,
2971 .maxlen = sizeof(int),
2972 .mode = 0644,
2973 .proc_handler = &proc_dointvec,
2976 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
2977 .procname = "secret_interval",
2978 .data = &ip_rt_secret_interval,
2979 .maxlen = sizeof(int),
2980 .mode = 0644,
2981 .proc_handler = &proc_dointvec_jiffies,
2982 .strategy = &sysctl_jiffies,
2984 { .ctl_name = 0 }
2986 #endif
2988 #ifdef CONFIG_NET_CLS_ROUTE
2989 struct ip_rt_acct *ip_rt_acct __read_mostly;
2990 #endif /* CONFIG_NET_CLS_ROUTE */
2992 static __initdata unsigned long rhash_entries;
2993 static int __init set_rhash_entries(char *str)
2995 if (!str)
2996 return 0;
2997 rhash_entries = simple_strtoul(str, &str, 0);
2998 return 1;
3000 __setup("rhash_entries=", set_rhash_entries);
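/* Usage sketch (illustrative only): the route cache hash size can be
   overridden at boot via the corresponding kernel command line option,
   e.g.:

	rhash_entries=65536
 */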
3002 int __init ip_rt_init(void)
3004 int rc = 0;
3006 atomic_set(&rt_genid, (int) ((num_physpages ^ (num_physpages>>8)) ^
3007 (jiffies ^ (jiffies >> 7))));
3009 #ifdef CONFIG_NET_CLS_ROUTE
3010 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
3011 if (!ip_rt_acct)
3012 panic("IP: failed to allocate ip_rt_acct\n");
3013 #endif
3015 ipv4_dst_ops.kmem_cachep =
3016 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3017 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3019 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3021 rt_hash_table = (struct rt_hash_bucket *)
3022 alloc_large_system_hash("IP route cache",
3023 sizeof(struct rt_hash_bucket),
3024 rhash_entries,
3025 (num_physpages >= 128 * 1024) ?
3026 15 : 17,
3028 &rt_hash_log,
3029 &rt_hash_mask,
3031 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3032 rt_hash_lock_init();
3034 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3035 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3037 devinet_init();
3038 ip_fib_init();
3040 setup_timer(&rt_secret_timer, rt_secret_rebuild, 0);
3042 /* All the timers started at system startup tend
3043 to synchronize. Perturb them a bit. */
3045 schedule_delayed_work(&expires_work,
3046 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3048 rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3049 ip_rt_secret_interval;
3050 add_timer(&rt_secret_timer);
3052 if (ip_rt_proc_init(&init_net))
3053 printk(KERN_ERR "Unable to create route proc files\n");
3054 #ifdef CONFIG_XFRM
3055 xfrm_init();
3056 xfrm4_init();
3057 #endif
3058 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3060 return rc;
3063 EXPORT_SYMBOL(__ip_select_ident);
3064 EXPORT_SYMBOL(ip_route_input);
3065 EXPORT_SYMBOL(ip_route_output_key);