net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #include <linux/module.h>
  66 #include <asm/uaccess.h>
  67 #include <asm/system.h>
  68 #include <linux/bitops.h>
  69 #include <linux/types.h>
  70 #include <linux/kernel.h>
  71 #include <linux/mm.h>
  72 #include <linux/bootmem.h>
  73 #include <linux/string.h>
  74 #include <linux/socket.h>
  75 #include <linux/sockios.h>
  76 #include <linux/errno.h>
  77 #include <linux/in.h>
  78 #include <linux/inet.h>
  79 #include <linux/netdevice.h>
  80 #include <linux/proc_fs.h>
  81 #include <linux/init.h>
  82 #include <linux/workqueue.h>
  83 #include <linux/skbuff.h>
  84 #include <linux/inetdevice.h>
  85 #include <linux/igmp.h>
  86 #include <linux/pkt_sched.h>
  87 #include <linux/mroute.h>
  88 #include <linux/netfilter_ipv4.h>
  89 #include <linux/random.h>
  90 #include <linux/jhash.h>
  91 #include <linux/rcupdate.h>
  92 #include <linux/times.h>
  93 #include <linux/slab.h>
  94 #include <net/dst.h>
  95 #include <net/net_namespace.h>
  96 #include <net/protocol.h>
  97 #include <net/ip.h>
  98 #include <net/route.h>
  99 #include <net/inetpeer.h>
 100 #include <net/sock.h>
 101 #include <net/ip_fib.h>
 102 #include <net/arp.h>
 103 #include <net/tcp.h>
 104 #include <net/icmp.h>
 105 #include <net/xfrm.h>
 106 #include <net/netevent.h>
 107 #include <net/rtnetlink.h>
 108 #ifdef CONFIG_SYSCTL
 109 #include <linux/sysctl.h>
 110 #endif
 111 #include <net/atmclip.h>
 112 #include <net/secure_seq.h>
 113
 114 #define RT_FL_TOS(oldflp4) \
 115     ((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
 116
 117 #define IP_MAX_MTU      0xFFF0
 118
 119 #define RT_GC_TIMEOUT (300*HZ)
 120
 121 static int ip_rt_max_size;
 122 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 123 static int ip_rt_gc_interval __read_mostly      = 60 * HZ;
 124 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
 125 static int ip_rt_redirect_number __read_mostly  = 9;
 126 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 127 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 128 static int ip_rt_error_cost __read_mostly       = HZ;
 129 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 130 static int ip_rt_gc_elasticity __read_mostly    = 8;
 131 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 132 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 133 static int ip_rt_min_advmss __read_mostly       = 256;
 134 static int rt_chain_length_max __read_mostly    = 20;
 135
 136 /*
 137  *      Interface to generic destination cache.
 138  */
 139
 140 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 141 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
 142 static unsigned int      ipv4_default_mtu(const struct dst_entry *dst);
 143 static void              ipv4_dst_destroy(struct dst_entry *dst);
 144 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 145 static void              ipv4_link_failure(struct sk_buff *skb);
 146 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 147 static int rt_garbage_collect(struct dst_ops *ops);
 148
 149 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 150                             int how)
 151 {
 152 }
 153
 154 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 155 {
 156         struct rtable *rt = (struct rtable *) dst;
 157         struct inet_peer *peer;
 158         u32 *p = NULL;
 159
 160         if (!rt->peer)
 161                 rt_bind_peer(rt, rt->rt_dst, 1);
 162
 163         peer = rt->peer;
 164         if (peer) {
 165                 u32 *old_p = __DST_METRICS_PTR(old);
 166                 unsigned long prev, new;
 167
 168                 p = peer->metrics;
 169                 if (inet_metrics_new(peer))
 170                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
 171
 172                 new = (unsigned long) p;
 173                 prev = cmpxchg(&dst->_metrics, old, new);
 174
 175                 if (prev != old) {
 176                         p = __DST_METRICS_PTR(prev);
 177                         if (prev & DST_METRICS_READ_ONLY)
 178                                 p = NULL;
 179                 } else {
 180                         if (rt->fi) {
 181                                 fib_info_put(rt->fi);
 182                                 rt->fi = NULL;
 183                         }
 184                 }
 185         }
 186         return p;
 187 }
 188
 189 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
 190
 191 static struct dst_ops ipv4_dst_ops = {
 192         .family =               AF_INET,
 193         .protocol =             cpu_to_be16(ETH_P_IP),
 194         .gc =                   rt_garbage_collect,
 195         .check =                ipv4_dst_check,
 196         .default_advmss =       ipv4_default_advmss,
 197         .default_mtu =          ipv4_default_mtu,
 198         .cow_metrics =          ipv4_cow_metrics,
 199         .destroy =              ipv4_dst_destroy,
 200         .ifdown =               ipv4_dst_ifdown,
 201         .negative_advice =      ipv4_negative_advice,
 202         .link_failure =         ipv4_link_failure,
 203         .update_pmtu =          ip_rt_update_pmtu,
 204         .local_out =            __ip_local_out,
 205         .neigh_lookup =         ipv4_neigh_lookup,
 206 };
 207
 208 #define ECN_OR_COST(class)      TC_PRIO_##class
 209
 210 const __u8 ip_tos2prio[16] = {
 211         TC_PRIO_BESTEFFORT,
 212         ECN_OR_COST(BESTEFFORT),
 213         TC_PRIO_BESTEFFORT,
 214         ECN_OR_COST(BESTEFFORT),
 215         TC_PRIO_BULK,
 216         ECN_OR_COST(BULK),
 217         TC_PRIO_BULK,
 218         ECN_OR_COST(BULK),
 219         TC_PRIO_INTERACTIVE,
 220         ECN_OR_COST(INTERACTIVE),
 221         TC_PRIO_INTERACTIVE,
 222         ECN_OR_COST(INTERACTIVE),
 223         TC_PRIO_INTERACTIVE_BULK,
 224         ECN_OR_COST(INTERACTIVE_BULK),
 225         TC_PRIO_INTERACTIVE_BULK,
 226         ECN_OR_COST(INTERACTIVE_BULK)
 227 };
 228
 229
 230 /*
 231  * Route cache.
 232  */
 233
 234 /* The locking scheme is rather straight forward:
 235  *
 236  * 1) Read-Copy Update protects the buckets of the central route hash.
 237  * 2) Only writers remove entries, and they hold the lock
 238  *    as they look at rtable reference counts.
 239  * 3) Only readers acquire references to rtable entries,
 240  *    they do so with atomic increments and with the
 241  *    lock held.
 242  */
 243
 244 struct rt_hash_bucket {
 245         struct rtable __rcu     *chain;
 246 };
 247
 248 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
 249         defined(CONFIG_PROVE_LOCKING)
 250 /*
 251  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 252  * The size of this table is a power of two and depends on the number of CPUS.
 253  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 254  */
 255 #ifdef CONFIG_LOCKDEP
 256 # define RT_HASH_LOCK_SZ        256
 257 #else
 258 # if NR_CPUS >= 32
 259 #  define RT_HASH_LOCK_SZ       4096
 260 # elif NR_CPUS >= 16
 261 #  define RT_HASH_LOCK_SZ       2048
 262 # elif NR_CPUS >= 8
 263 #  define RT_HASH_LOCK_SZ       1024
 264 # elif NR_CPUS >= 4
 265 #  define RT_HASH_LOCK_SZ       512
 266 # else
 267 #  define RT_HASH_LOCK_SZ       256
 268 # endif
 269 #endif
 270
 271 static spinlock_t       *rt_hash_locks;
 272 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
 273
 274 static __init void rt_hash_lock_init(void)
 275 {
 276         int i;
 277
 278         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
 279                         GFP_KERNEL);
 280         if (!rt_hash_locks)
 281                 panic("IP: failed to allocate rt_hash_locks\n");
 282
 283         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
 284                 spin_lock_init(&rt_hash_locks[i]);
 285 }
 286 #else
 287 # define rt_hash_lock_addr(slot) NULL
 288
 289 static inline void rt_hash_lock_init(void)
 290 {
 291 }
 292 #endif
 293
 294 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
 295 static unsigned                 rt_hash_mask __read_mostly;
 296 static unsigned int             rt_hash_log  __read_mostly;
 297
 298 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 299 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
 300
 301 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
 302                                    int genid)
 303 {
 304         return jhash_3words((__force u32)daddr, (__force u32)saddr,
 305                             idx, genid)
 306                 & rt_hash_mask;
 307 }
 308
 309 static inline int rt_genid(struct net *net)
 310 {
 311         return atomic_read(&net->ipv4.rt_genid);
 312 }
 313
 314 #ifdef CONFIG_PROC_FS
 315 struct rt_cache_iter_state {
 316         struct seq_net_private p;
 317         int bucket;
 318         int genid;
 319 };
 320
 321 static struct rtable *rt_cache_get_first(struct seq_file *seq)
 322 {
 323         struct rt_cache_iter_state *st = seq->private;
 324         struct rtable *r = NULL;
 325
 326         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 327                 if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
 328                         continue;
 329                 rcu_read_lock_bh();
 330                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 331                 while (r) {
 332                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
 333                             r->rt_genid == st->genid)
 334                                 return r;
 335                         r = rcu_dereference_bh(r->dst.rt_next);
 336                 }
 337                 rcu_read_unlock_bh();
 338         }
 339         return r;
 340 }
 341
 342 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
 343                                           struct rtable *r)
 344 {
 345         struct rt_cache_iter_state *st = seq->private;
 346
 347         r = rcu_dereference_bh(r->dst.rt_next);
 348         while (!r) {
 349                 rcu_read_unlock_bh();
 350                 do {
 351                         if (--st->bucket < 0)
 352                                 return NULL;
 353                 } while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
 354                 rcu_read_lock_bh();
 355                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 356         }
 357         return r;
 358 }
 359
 360 static struct rtable *rt_cache_get_next(struct seq_file *seq,
 361                                         struct rtable *r)
 362 {
 363         struct rt_cache_iter_state *st = seq->private;
 364         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
 365                 if (dev_net(r->dst.dev) != seq_file_net(seq))
 366                         continue;
 367                 if (r->rt_genid == st->genid)
 368                         break;
 369         }
 370         return r;
 371 }
 372
 373 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 374 {
 375         struct rtable *r = rt_cache_get_first(seq);
 376
 377         if (r)
 378                 while (pos && (r = rt_cache_get_next(seq, r)))
 379                         --pos;
 380         return pos ? NULL : r;
 381 }
 382
 383 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 384 {
 385         struct rt_cache_iter_state *st = seq->private;
 386         if (*pos)
 387                 return rt_cache_get_idx(seq, *pos - 1);
 388         st->genid = rt_genid(seq_file_net(seq));
 389         return SEQ_START_TOKEN;
 390 }
 391
 392 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 393 {
 394         struct rtable *r;
 395
 396         if (v == SEQ_START_TOKEN)
 397                 r = rt_cache_get_first(seq);
 398         else
 399                 r = rt_cache_get_next(seq, v);
 400         ++*pos;
 401         return r;
 402 }
 403
 404 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 405 {
 406         if (v && v != SEQ_START_TOKEN)
 407                 rcu_read_unlock_bh();
 408 }
 409
 410 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 411 {
 412         if (v == SEQ_START_TOKEN)
 413                 seq_printf(seq, "%-127s\n",
 414                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 415                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 416                            "HHUptod\tSpecDst");
 417         else {
 418                 struct rtable *r = v;
 419                 struct neighbour *n;
 420                 int len;
 421
 422                 n = dst_get_neighbour(&r->dst);
 423                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
 424                               "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
 425                         r->dst.dev ? r->dst.dev->name : "*",
 426                         (__force u32)r->rt_dst,
 427                         (__force u32)r->rt_gateway,
 428                         r->rt_flags, atomic_read(&r->dst.__refcnt),
 429                         r->dst.__use, 0, (__force u32)r->rt_src,
 430                         dst_metric_advmss(&r->dst) + 40,
 431                         dst_metric(&r->dst, RTAX_WINDOW),
 432                         (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
 433                               dst_metric(&r->dst, RTAX_RTTVAR)),
 434                         r->rt_key_tos,
 435                         -1,
 436                         (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0,
 437                         r->rt_spec_dst, &len);
 438
 439                 seq_printf(seq, "%*s\n", 127 - len, "");
 440         }
 441         return 0;
 442 }
 443
 444 static const struct seq_operations rt_cache_seq_ops = {
 445         .start  = rt_cache_seq_start,
 446         .next   = rt_cache_seq_next,
 447         .stop   = rt_cache_seq_stop,
 448         .show   = rt_cache_seq_show,
 449 };
 450
 451 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 452 {
 453         return seq_open_net(inode, file, &rt_cache_seq_ops,
 454                         sizeof(struct rt_cache_iter_state));
 455 }
 456
 457 static const struct file_operations rt_cache_seq_fops = {
 458         .owner   = THIS_MODULE,
 459         .open    = rt_cache_seq_open,
 460         .read    = seq_read,
 461         .llseek  = seq_lseek,
 462         .release = seq_release_net,
 463 };
 464
 465
 466 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 467 {
 468         int cpu;
 469
 470         if (*pos == 0)
 471                 return SEQ_START_TOKEN;
 472
 473         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 474                 if (!cpu_possible(cpu))
 475                         continue;
 476                 *pos = cpu+1;
 477                 return &per_cpu(rt_cache_stat, cpu);
 478         }
 479         return NULL;
 480 }
 481
 482 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 483 {
 484         int cpu;
 485
 486         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 487                 if (!cpu_possible(cpu))
 488                         continue;
 489                 *pos = cpu+1;
 490                 return &per_cpu(rt_cache_stat, cpu);
 491         }
 492         return NULL;
 493
 494 }
 495
 496 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 497 {
 498
 499 }
 500
 501 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 502 {
 503         struct rt_cache_stat *st = v;
 504
 505         if (v == SEQ_START_TOKEN) {
 506                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 507                 return 0;
 508         }
 509
 510         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 511                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 512                    dst_entries_get_slow(&ipv4_dst_ops),
 513                    st->in_hit,
 514                    st->in_slow_tot,
 515                    st->in_slow_mc,
 516                    st->in_no_route,
 517                    st->in_brd,
 518                    st->in_martian_dst,
 519                    st->in_martian_src,
 520
 521                    st->out_hit,
 522                    st->out_slow_tot,
 523                    st->out_slow_mc,
 524
 525                    st->gc_total,
 526                    st->gc_ignored,
 527                    st->gc_goal_miss,
 528                    st->gc_dst_overflow,
 529                    st->in_hlist_search,
 530                    st->out_hlist_search
 531                 );
 532         return 0;
 533 }
 534
 535 static const struct seq_operations rt_cpu_seq_ops = {
 536         .start  = rt_cpu_seq_start,
 537         .next   = rt_cpu_seq_next,
 538         .stop   = rt_cpu_seq_stop,
 539         .show   = rt_cpu_seq_show,
 540 };
 541
 542
 543 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 544 {
 545         return seq_open(file, &rt_cpu_seq_ops);
 546 }
 547
 548 static const struct file_operations rt_cpu_seq_fops = {
 549         .owner   = THIS_MODULE,
 550         .open    = rt_cpu_seq_open,
 551         .read    = seq_read,
 552         .llseek  = seq_lseek,
 553         .release = seq_release,
 554 };
 555
 556 #ifdef CONFIG_IP_ROUTE_CLASSID
 557 static int rt_acct_proc_show(struct seq_file *m, void *v)
 558 {
 559         struct ip_rt_acct *dst, *src;
 560         unsigned int i, j;
 561
 562         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 563         if (!dst)
 564                 return -ENOMEM;
 565
 566         for_each_possible_cpu(i) {
 567                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 568                 for (j = 0; j < 256; j++) {
 569                         dst[j].o_bytes   += src[j].o_bytes;
 570                         dst[j].o_packets += src[j].o_packets;
 571                         dst[j].i_bytes   += src[j].i_bytes;
 572                         dst[j].i_packets += src[j].i_packets;
 573                 }
 574         }
 575
 576         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 577         kfree(dst);
 578         return 0;
 579 }
 580
 581 static int rt_acct_proc_open(struct inode *inode, struct file *file)
 582 {
 583         return single_open(file, rt_acct_proc_show, NULL);
 584 }
 585
 586 static const struct file_operations rt_acct_proc_fops = {
 587         .owner          = THIS_MODULE,
 588         .open           = rt_acct_proc_open,
 589         .read           = seq_read,
 590         .llseek         = seq_lseek,
 591         .release        = single_release,
 592 };
 593 #endif
 594
 595 static int __net_init ip_rt_do_proc_init(struct net *net)
 596 {
 597         struct proc_dir_entry *pde;
 598
 599         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
 600                         &rt_cache_seq_fops);
 601         if (!pde)
 602                 goto err1;
 603
 604         pde = proc_create("rt_cache", S_IRUGO,
 605                           net->proc_net_stat, &rt_cpu_seq_fops);
 606         if (!pde)
 607                 goto err2;
 608
 609 #ifdef CONFIG_IP_ROUTE_CLASSID
 610         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 611         if (!pde)
 612                 goto err3;
 613 #endif
 614         return 0;
 615
 616 #ifdef CONFIG_IP_ROUTE_CLASSID
 617 err3:
 618         remove_proc_entry("rt_cache", net->proc_net_stat);
 619 #endif
 620 err2:
 621         remove_proc_entry("rt_cache", net->proc_net);
 622 err1:
 623         return -ENOMEM;
 624 }
 625
 626 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 627 {
 628         remove_proc_entry("rt_cache", net->proc_net_stat);
 629         remove_proc_entry("rt_cache", net->proc_net);
 630 #ifdef CONFIG_IP_ROUTE_CLASSID
 631         remove_proc_entry("rt_acct", net->proc_net);
 632 #endif
 633 }
 634
 635 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 636         .init = ip_rt_do_proc_init,
 637         .exit = ip_rt_do_proc_exit,
 638 };
 639
 640 static int __init ip_rt_proc_init(void)
 641 {
 642         return register_pernet_subsys(&ip_rt_proc_ops);
 643 }
 644
 645 #else
 646 static inline int ip_rt_proc_init(void)
 647 {
 648         return 0;
 649 }
 650 #endif /* CONFIG_PROC_FS */
 651
 652 static inline void rt_free(struct rtable *rt)
 653 {
 654         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 655 }
 656
 657 static inline void rt_drop(struct rtable *rt)
 658 {
 659         ip_rt_put(rt);
 660         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 661 }
 662
 663 static inline int rt_fast_clean(struct rtable *rth)
 664 {
 665         /* Kill broadcast/multicast entries very aggresively, if they
 666            collide in hash table with more useful entries */
 667         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 668                 rt_is_input_route(rth) && rth->dst.rt_next;
 669 }
 670
 671 static inline int rt_valuable(struct rtable *rth)
 672 {
 673         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 674                 (rth->peer && rth->peer->pmtu_expires);
 675 }
 676
 677 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 678 {
 679         unsigned long age;
 680         int ret = 0;
 681
 682         if (atomic_read(&rth->dst.__refcnt))
 683                 goto out;
 684
 685         age = jiffies - rth->dst.lastuse;
 686         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 687             (age <= tmo2 && rt_valuable(rth)))
 688                 goto out;
 689         ret = 1;
 690 out:    return ret;
 691 }
 692
 693 /* Bits of score are:
 694  * 31: very valuable
 695  * 30: not quite useless
 696  * 29..0: usage counter
 697  */
 698 static inline u32 rt_score(struct rtable *rt)
 699 {
 700         u32 score = jiffies - rt->dst.lastuse;
 701
 702         score = ~score & ~(3<<30);
 703
 704         if (rt_valuable(rt))
 705                 score |= (1<<31);
 706
 707         if (rt_is_output_route(rt) ||
 708             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 709                 score |= (1<<30);
 710
 711         return score;
 712 }
 713
 714 static inline bool rt_caching(const struct net *net)
 715 {
 716         return net->ipv4.current_rt_cache_rebuild_count <=
 717                 net->ipv4.sysctl_rt_cache_rebuild_count;
 718 }
 719
 720 static inline bool compare_hash_inputs(const struct rtable *rt1,
 721                                        const struct rtable *rt2)
 722 {
 723         return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
 724                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
 725                 (rt1->rt_iif ^ rt2->rt_iif)) == 0);
 726 }
 727
 728 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
 729 {
 730         return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
 731                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
 732                 (rt1->rt_mark ^ rt2->rt_mark) |
 733                 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
 734                 (rt1->rt_oif ^ rt2->rt_oif) |
 735                 (rt1->rt_iif ^ rt2->rt_iif)) == 0;
 736 }
 737
 738 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
 739 {
 740         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
 741 }
 742
 743 static inline int rt_is_expired(struct rtable *rth)
 744 {
 745         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
 746 }
 747
  748 /*
  749  * Perform a full scan of the hash table and free all entries.
  750  * Can be called by a softirq or a process.
  751  * In the latter case, we want to reschedule if necessary.
  752  */
 753 static void rt_do_flush(struct net *net, int process_context)
 754 {
 755         unsigned int i;
 756         struct rtable *rth, *next;
 757
 758         for (i = 0; i <= rt_hash_mask; i++) {
 759                 struct rtable __rcu **pprev;
 760                 struct rtable *list;
 761
 762                 if (process_context && need_resched())
 763                         cond_resched();
 764                 rth = rcu_dereference_raw(rt_hash_table[i].chain);
 765                 if (!rth)
 766                         continue;
 767
 768                 spin_lock_bh(rt_hash_lock_addr(i));
 769
 770                 list = NULL;
 771                 pprev = &rt_hash_table[i].chain;
 772                 rth = rcu_dereference_protected(*pprev,
 773                         lockdep_is_held(rt_hash_lock_addr(i)));
 774
 775                 while (rth) {
 776                         next = rcu_dereference_protected(rth->dst.rt_next,
 777                                 lockdep_is_held(rt_hash_lock_addr(i)));
 778
 779                         if (!net ||
 780                             net_eq(dev_net(rth->dst.dev), net)) {
 781                                 rcu_assign_pointer(*pprev, next);
 782                                 rcu_assign_pointer(rth->dst.rt_next, list);
 783                                 list = rth;
 784                         } else {
 785                                 pprev = &rth->dst.rt_next;
 786                         }
 787                         rth = next;
 788                 }
 789
 790                 spin_unlock_bh(rt_hash_lock_addr(i));
 791
 792                 for (; list; list = next) {
 793                         next = rcu_dereference_protected(list->dst.rt_next, 1);
 794                         rt_free(list);
 795                 }
 796         }
 797 }
 798
  799 /*
  800  * While freeing expired entries, we compute the average chain length
  801  * and standard deviation, using fixed-point arithmetic.
  802  * This gives us an estimate for rt_chain_length_max:
  803  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
  804  * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
  805  */
 806
 807 #define FRACT_BITS 3
 808 #define ONE (1UL << FRACT_BITS)
 809
 810 /*
 811  * Given a hash chain and an item in this hash chain,
 812  * find if a previous entry has the same hash_inputs
 813  * (but differs on tos, mark or oif)
 814  * Returns 0 if an alias is found.
 815  * Returns ONE if rth has no alias before itself.
 816  */
 817 static int has_noalias(const struct rtable *head, const struct rtable *rth)
 818 {
 819         const struct rtable *aux = head;
 820
 821         while (aux != rth) {
 822                 if (compare_hash_inputs(aux, rth))
 823                         return 0;
 824                 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
 825         }
 826         return ONE;
 827 }
 828
  829 /*
  830  * Perturbation of rt_genid by a small quantity [1..256]
  831  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
  832  * many times (2^24) without repeating a recent rt_genid.
  833  * Jenkins hash is strong enough that little changes of rt_genid are OK.
  834  */
 835 static void rt_cache_invalidate(struct net *net)
 836 {
 837         unsigned char shuffle;
 838
 839         get_random_bytes(&shuffle, sizeof(shuffle));
 840         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
 841 }
 842
 843 /*
 844  * delay < 0  : invalidate cache (fast : entries will be deleted later)
 845  * delay >= 0 : invalidate & flush cache (can be long)
 846  */
 847 void rt_cache_flush(struct net *net, int delay)
 848 {
 849         rt_cache_invalidate(net);
 850         if (delay >= 0)
 851                 rt_do_flush(net, !in_softirq());
 852 }
 853
 854 /* Flush previous cache invalidated entries from the cache */
 855 void rt_cache_flush_batch(struct net *net)
 856 {
 857         rt_do_flush(net, !in_softirq());
 858 }
 859
 860 static void rt_emergency_hash_rebuild(struct net *net)
 861 {
 862         if (net_ratelimit())
 863                 printk(KERN_WARNING "Route hash chain too long!\n");
 864         rt_cache_invalidate(net);
 865 }
 866
 867 /*
 868    Short description of GC goals.
 869
 870    We want to build algorithm, which will keep routing cache
 871    at some equilibrium point, when number of aged off entries
 872    is kept approximately equal to newly generated ones.
 873
 874    Current expiration strength is variable "expire".
 875    We try to adjust it dynamically, so that if networking
 876    is idle expires is large enough to keep enough of warm entries,
 877    and when load increases it reduces to limit cache size.
 878  */
 879
/*
 * rt_garbage_collect - expire entries from the IPv4 route cache.
 * @ops: not referenced in the body; all counters and thresholds are taken
 *       from the global ipv4_dst_ops.
 *
 * Returns 0 in the normal case. Returns 1, after logging a rate-limited
 * "dst cache overflow" warning and counting gc_dst_overflow, when the pass
 * gave up while the cache still holds at least ip_rt_max_size entries.
 *
 * expire, last_gc, rover and equilibrium are function-local statics, so
 * their values are carried over from one invocation to the next.
 */
 880 static int rt_garbage_collect(struct dst_ops *ops)
 881 {
 882         static unsigned long expire = RT_GC_TIMEOUT;
 883         static unsigned long last_gc;
 884         static int rover;
 885         static int equilibrium;
 886         struct rtable *rth;
 887         struct rtable __rcu **rthp;
 888         unsigned long now = jiffies;
 889         int goal;
 890         int entries = dst_entries_get_fast(&ipv4_dst_ops);
 891
 892         /*
 893          * Garbage collection is pretty expensive,
 894          * do not make it too frequently.
 895          */
 896
 897         RT_CACHE_STAT_INC(gc_total);
 898
             /*
              * Skip this run only if the previous one was less than
              * ip_rt_gc_min_interval ago AND the (fast) entry count is still
              * below ip_rt_max_size.
              */
 899         if (now - last_gc < ip_rt_gc_min_interval &&
 900             entries < ip_rt_max_size) {
 901                 RT_CACHE_STAT_INC(gc_ignored);
 902                 goto out;
 903         }
 904
 905         entries = dst_entries_get_slow(&ipv4_dst_ops);
 906         /* Calculate number of entries, which we want to expire now. */
 907         goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
 908         if (goal <= 0) {
                     /*
                      * Not above elasticity * table size: aim for "equilibrium"
                      * instead. It is never kept below gc_thresh and, when the
                      * cache is above it, is raised by half the excess (at most
                      * rt_hash_mask + 1) before the goal is recomputed.
                      */
 909                 if (equilibrium < ipv4_dst_ops.gc_thresh)
 910                         equilibrium = ipv4_dst_ops.gc_thresh;
 911                 goal = entries - equilibrium;
 912                 if (goal > 0) {
 913                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 914                         goal = entries - equilibrium;
 915                 }
 916         } else {
 917                 /* We are in dangerous area. Try to reduce cache really
 918                  * aggressively.
 919                  */
 920                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 921                 equilibrium = entries - goal;
 922         }
 923
 924         if (now - last_gc >= ip_rt_gc_min_interval)
 925                 last_gc = now;
 926
             /* Nothing to expire: fold the non-positive goal into equilibrium. */
 927         if (goal <= 0) {
 928                 equilibrium += goal;
 929                 goto work_done;
 930         }
 931
 932         do {
 933                 int i, k;
 934
                     /*
                      * Visit up to rt_hash_mask + 1 buckets, starting right after
                      * the bucket where the previous scan stopped (rover).
                      */
 935                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
 936                         unsigned long tmo = expire;
 937
 938                         k = (k + 1) & rt_hash_mask;
 939                         rthp = &rt_hash_table[k].chain;
 940                         spin_lock_bh(rt_hash_lock_addr(k));
 941                         while ((rth = rcu_dereference_protected(*rthp,
 942                                         lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
                                     /*
                                      * Entry is kept: halve tmo so that every
                                      * following entry of this chain is tested
                                      * against a shorter first timeout.
                                      */
 943                                 if (!rt_is_expired(rth) &&
 944                                         !rt_may_expire(rth, tmo, expire)) {
 945                                         tmo >>= 1;
 946                                         rthp = &rth->dst.rt_next;
 947                                         continue;
 948                                 }
                                     /* Unlink under the bucket lock, then free. */
 949                                 *rthp = rth->dst.rt_next;
 950                                 rt_free(rth);
 951                                 goal--;
 952                         }
 953                         spin_unlock_bh(rt_hash_lock_addr(k));
 954                         if (goal <= 0)
 955                                 break;
 956                 }
 957                 rover = k;
 958
 959                 if (goal <= 0)
 960                         goto work_done;
 961
 962                 /* Goal is not achieved. We stop process if:
 963
 964                    - if expire reduced to zero. Otherwise, expire is halfed.
 965                    - if table is not full.
 966                    - if we are called from interrupt.
 967                    - jiffies check is just fallback/debug loop breaker.
 968                      We will not spin here for long time in any case.
 969                  */
 970
 971                 RT_CACHE_STAT_INC(gc_goal_miss);
 972
 973                 if (expire == 0)
 974                         break;
 975
 976                 expire >>= 1;
 977
 978                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
 979                         goto out;
 980         } while (!in_softirq() && time_before_eq(jiffies, now));
 981
             /*
              * Gave up without reaching the goal: report overflow only if both
              * the fast and the slow entry counts are at least ip_rt_max_size.
              */
 982         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
 983                 goto out;
 984         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
 985                 goto out;
 986         if (net_ratelimit())
 987                 printk(KERN_WARNING "dst cache overflow\n");
 988         RT_CACHE_STAT_INC(gc_dst_overflow);
 989         return 1;
 990
 991 work_done:
             /*
              * Goal reached: lengthen expire by ip_rt_gc_min_interval, and reset
              * it to ip_rt_gc_timeout when it would exceed that value or when
              * the entry count is below gc_thresh.
              */
 992         expire += ip_rt_gc_min_interval;
 993         if (expire > ip_rt_gc_timeout ||
 994             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
 995             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
 996                 expire = ip_rt_gc_timeout;
 997 out:    return 0;
 998 }
 999
1000 /*
1001  * Returns number of entries in a hash chain that have different hash_inputs
1002  */
1003 static int slow_chain_length(const struct rtable *head)
1004 {
1005         int length = 0;
1006         const struct rtable *rth = head;
1007
1008         while (rth) {
1009                 length += has_noalias(head, rth);
1010                 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1011         }
1012         return length >> FRACT_BITS;
1013 }
1014
1015 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
1016 {
1017         struct neigh_table *tbl = &arp_tbl;
1018         static const __be32 inaddr_any = 0;
1019         struct net_device *dev = dst->dev;
1020         const __be32 *pkey = daddr;
1021         struct neighbour *n;
1022
1023 #if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
1024         if (dev->type == ARPHRD_ATM)
1025                 tbl = clip_tbl_hook;
1026 #endif
1027         if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1028                 pkey = &inaddr_any;
1029
1030         n = __ipv4_neigh_lookup(tbl, dev, *(__force u32 *)pkey);
1031         if (n)
1032                 return n;
1033         return neigh_create(tbl, pkey, dev);
1034 }
1035
1036 static int rt_bind_neighbour(struct rtable *rt)
1037 {
1038         struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1039         if (IS_ERR(n))
1040                 return PTR_ERR(n);
1041         dst_set_neighbour(&rt->dst, n);
1042
1043         return 0;
1044 }
1045
/*
 * rt_intern_hash - insert route @rt into bucket @hash of the route cache.
 * @hash:    index into rt_hash_table; recomputed after an emergency rebuild
 * @rt:      the new route to insert
 * @skb:     if non-NULL, the returned route is attached with skb_dst_set()
 * @ifindex: only used to recompute @hash after an emergency rebuild
 *
 * Returns the route to use: an already cached entry whose keys and netns
 * match (@rt is dropped in that case), or @rt itself. Returns an ERR_PTR()
 * when binding the neighbour fails; @rt has been released by then.
 */
1046 static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1047                                      struct sk_buff *skb, int ifindex)
1048 {
1049         struct rtable   *rth, *cand;
1050         struct rtable __rcu **rthp, **candp;
1051         unsigned long   now;
1052         u32             min_score;
1053         int             chain_length;
             /* One GC-and-retry is allowed, and only outside softirq. */
1054         int attempts = !in_softirq();
1055
1056 restart:
1057         chain_length = 0;
1058         min_score = ~(u32)0;
1059         cand = NULL;
1060         candp = NULL;
1061         now = jiffies;
1062
1063         if (!rt_caching(dev_net(rt->dst.dev))) {
1064                 /*
1065                  * If we're not caching, just tell the caller we
1066                  * were successful and don't touch the route.  The
1067                  * caller holds the sole reference to the cache entry, and
1068                  * it will be released when the caller is done with it.
1069                  * If we drop it here, the callers have no way to resolve routes
1070                  * when we're not caching.  Instead, just point *rp at rt, so
1071                  * the caller gets a single use out of the route
1072                  * Note that we do rt_free on this new route entry, so that
1073                  * once its refcount hits zero, we are still able to reap it
1074                  * (Thanks Alexey)
1075                  * Note: To avoid expensive rcu stuff for this uncached dst,
1076                  * we set DST_NOCACHE so that dst_release() can free dst without
1077                  * waiting a grace period.
1078                  */
1079
1080                 rt->dst.flags |= DST_NOCACHE;
1081                 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1082                         int err = rt_bind_neighbour(rt);
1083                         if (err) {
1084                                 if (net_ratelimit())
1085                                         printk(KERN_WARNING
1086                                             "Neighbour table failure & not caching routes.\n");
1087                                 ip_rt_put(rt);
1088                                 return ERR_PTR(err);
1089                         }
1090                 }
1091
1092                 goto skip_hashing;
1093         }
1094
1095         rthp = &rt_hash_table[hash].chain;
1096
             /*
              * Walk the chain under the bucket lock. Expired entries are
              * unlinked on the way, an entry matching rt ends the search, and
              * otherwise the unreferenced entry with the lowest rt_score() is
              * remembered as eviction candidate while the chain is counted.
              */
1097         spin_lock_bh(rt_hash_lock_addr(hash));
1098         while ((rth = rcu_dereference_protected(*rthp,
1099                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1100                 if (rt_is_expired(rth)) {
1101                         *rthp = rth->dst.rt_next;
1102                         rt_free(rth);
1103                         continue;
1104                 }
1105                 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1106                         /* Put it first */
1107                         *rthp = rth->dst.rt_next;
1108                         /*
1109                          * Since lookup is lockfree, the deletion
1110                          * must be visible to another weakly ordered CPU before
1111                          * the insertion at the start of the hash chain.
1112                          */
1113                         rcu_assign_pointer(rth->dst.rt_next,
1114                                            rt_hash_table[hash].chain);
1115                         /*
1116                          * Since lookup is lockfree, the update writes
1117                          * must be ordered for consistency on SMP.
1118                          */
1119                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1120
1121                         dst_use(&rth->dst, now);
1122                         spin_unlock_bh(rt_hash_lock_addr(hash));
1123
                             /* The cached entry wins; the new route is dropped. */
1124                         rt_drop(rt);
1125                         if (skb)
1126                                 skb_dst_set(skb, &rth->dst);
1127                         return rth;
1128                 }
1129
1130                 if (!atomic_read(&rth->dst.__refcnt)) {
1131                         u32 score = rt_score(rth);
1132
                             /* "<=": on equal scores the later entry is chosen. */
1133                         if (score <= min_score) {
1134                                 cand = rth;
1135                                 candp = rthp;
1136                                 min_score = score;
1137                         }
1138                 }
1139
1140                 chain_length++;
1141
1142                 rthp = &rth->dst.rt_next;
1143         }
1144
1145         if (cand) {
1146                 /* ip_rt_gc_elasticity used to be average length of chain
1147                  * length, when exceeded gc becomes really aggressive.
1148                  *
1149                  * The second limit is less certain. At the moment it allows
1150                  * only 2 entries per bucket. We will see.
1151                  */
1152                 if (chain_length > ip_rt_gc_elasticity) {
1153                         *candp = cand->dst.rt_next;
1154                         rt_free(cand);
1155                 }
1156         } else {
                     /*
                      * No unreferenced entry to evict. If the chain is too long
                      * even when counting only entries with distinct hash
                      * inputs, invalidate the cache, recompute the hash with
                      * the new genid and start over.
                      */
1157                 if (chain_length > rt_chain_length_max &&
1158                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1159                         struct net *net = dev_net(rt->dst.dev);
1160                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1161                         if (!rt_caching(net)) {
1162                                 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1163                                         rt->dst.dev->name, num);
1164                         }
1165                         rt_emergency_hash_rebuild(net);
1166                         spin_unlock_bh(rt_hash_lock_addr(hash));
1167
1168                         hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1169                                         ifindex, rt_genid(net));
1170                         goto restart;
1171                 }
1172         }
1173
1174         /* Try to bind route to arp only if it is output
1175            route or unicast forwarding path.
1176          */
1177         if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1178                 int err = rt_bind_neighbour(rt);
1179                 if (err) {
                             /* Every exit below runs without the bucket lock. */
1180                         spin_unlock_bh(rt_hash_lock_addr(hash));
1181
1182                         if (err != -ENOBUFS) {
1183                                 rt_drop(rt);
1184                                 return ERR_PTR(err);
1185                         }
1186
1187                         /* Neighbour tables are full and nothing
1188                            can be released. Try to shrink route cache,
1189                            it is most likely it holds some neighbour records.
1190                          */
1191                         if (attempts-- > 0) {
                                     /*
                                      * Run the GC with elasticity 1 and no minimum
                                      * interval, then restore both settings and
                                      * retry the insertion.
                                      */
1192                                 int saved_elasticity = ip_rt_gc_elasticity;
1193                                 int saved_int = ip_rt_gc_min_interval;
1194                                 ip_rt_gc_elasticity     = 1;
1195                                 ip_rt_gc_min_interval   = 0;
1196                                 rt_garbage_collect(&ipv4_dst_ops);
1197                                 ip_rt_gc_min_interval   = saved_int;
1198                                 ip_rt_gc_elasticity     = saved_elasticity;
1199                                 goto restart;
1200                         }
1201
1202                         if (net_ratelimit())
1203                                 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1204                         rt_drop(rt);
1205                         return ERR_PTR(-ENOBUFS);
1206                 }
1207         }
1208
             /* Link rt in front of the current chain head. */
1209         rt->dst.rt_next = rt_hash_table[hash].chain;
1210
1211         /*
1212          * Since lookup is lockfree, we must make sure
1213          * previous writes to rt are committed to memory
1214          * before making rt visible to other CPUS.
1215          */
1216         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1217
1218         spin_unlock_bh(rt_hash_lock_addr(hash));
1219
1220 skip_hashing:
1221         if (skb)
1222                 skb_dst_set(skb, &rt->dst);
1223         return rt;
1224 }
1225
1226 static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1227
1228 static u32 rt_peer_genid(void)
1229 {
1230         return atomic_read(&__rt_peer_genid);
1231 }
1232
1233 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1234 {
1235         struct inet_peer *peer;
1236
1237         peer = inet_getpeer_v4(daddr, create);
1238
1239         if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1240                 inet_putpeer(peer);
1241         else
1242                 rt->rt_peer_genid = rt_peer_genid();
1243 }
1244
1245 /*
1246  * Peer allocation may fail only in serious out-of-memory conditions.  However
1247  * we still can generate some output.
1248  * Random ID selection looks a bit dangerous because we have no chances to
1249  * select ID being unique in a reasonable period of time.
1250  * But broken packet identifier may be better than no packet at all.
1251  */
1252 static void ip_select_fb_ident(struct iphdr *iph)
1253 {
1254         static DEFINE_SPINLOCK(ip_fb_id_lock);
1255         static u32 ip_fallback_id;
1256         u32 salt;
1257
1258         spin_lock_bh(&ip_fb_id_lock);
1259         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1260         iph->id = htons(salt & 0xFFFF);
1261         ip_fallback_id = salt;
1262         spin_unlock_bh(&ip_fb_id_lock);
1263 }
1264
1265 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1266 {
1267         struct rtable *rt = (struct rtable *) dst;
1268
1269         if (rt) {
1270                 if (rt->peer == NULL)
1271                         rt_bind_peer(rt, rt->rt_dst, 1);
1272
1273                 /* If peer is attached to destination, it is never detached,
1274                    so that we need not to grab a lock to dereference it.
1275                  */
1276                 if (rt->peer) {
1277                         iph->id = htons(inet_getid(rt->peer, more));
1278                         return;
1279                 }
1280         } else
1281                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1282                        __builtin_return_address(0));
1283
1284         ip_select_fb_ident(iph);
1285 }
1286 EXPORT_SYMBOL(__ip_select_ident);
1287
1288 static void rt_del(unsigned hash, struct rtable *rt)
1289 {
1290         struct rtable __rcu **rthp;
1291         struct rtable *aux;
1292
1293         rthp = &rt_hash_table[hash].chain;
1294         spin_lock_bh(rt_hash_lock_addr(hash));
1295         ip_rt_put(rt);
1296         while ((aux = rcu_dereference_protected(*rthp,
1297                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1298                 if (aux == rt || rt_is_expired(aux)) {
1299                         *rthp = aux->dst.rt_next;
1300                         rt_free(aux);
1301                         continue;
1302                 }
1303                 rthp = &aux->dst.rt_next;
1304         }
1305         spin_unlock_bh(rt_hash_lock_addr(hash));
1306 }
1307
1308 /* called in rcu_read_lock() section */
1309 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1310                     __be32 saddr, struct net_device *dev)
1311 {
1312         struct in_device *in_dev = __in_dev_get_rcu(dev);
1313         struct inet_peer *peer;
1314         struct net *net;
1315
1316         if (!in_dev)
1317                 return;
1318
1319         net = dev_net(dev);
1320         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1321             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1322             ipv4_is_zeronet(new_gw))
1323                 goto reject_redirect;
1324
1325         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1326                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1327                         goto reject_redirect;
1328                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1329                         goto reject_redirect;
1330         } else {
1331                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1332                         goto reject_redirect;
1333         }
1334
1335         peer = inet_getpeer_v4(daddr, 1);
1336         if (peer) {
1337                 peer->redirect_learned.a4 = new_gw;
1338
1339                 inet_putpeer(peer);
1340
1341                 atomic_inc(&__rt_peer_genid);
1342         }
1343         return;
1344
1345 reject_redirect:
1346 #ifdef CONFIG_IP_ROUTE_VERBOSE
1347         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1348                 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1349                         "  Advised path = %pI4 -> %pI4\n",
1350                        &old_gw, dev->name, &new_gw,
1351                        &saddr, &daddr);
1352 #endif
1353         ;
1354 }
1355
1356 static bool peer_pmtu_expired(struct inet_peer *peer)
1357 {
1358         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1359
1360         return orig &&
1361                time_after_eq(jiffies, orig) &&
1362                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1363 }
1364
1365 static bool peer_pmtu_cleaned(struct inet_peer *peer)
1366 {
1367         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1368
1369         return orig &&
1370                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1371 }
1372
1373 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1374 {
1375         struct rtable *rt = (struct rtable *)dst;
1376         struct dst_entry *ret = dst;
1377
1378         if (rt) {
1379                 if (dst->obsolete > 0) {
1380                         ip_rt_put(rt);
1381                         ret = NULL;
1382                 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1383                         unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1384                                                 rt->rt_oif,
1385                                                 rt_genid(dev_net(dst->dev)));
1386                         rt_del(hash, rt);
1387                         ret = NULL;
1388                 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1389                         dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1390                 }
1391         }
1392         return ret;
1393 }
1394
1395 /*
1396  * Algorithm:
1397  *      1. The first ip_rt_redirect_number redirects are sent
1398  *         with exponential backoff, then we stop sending them at all,
1399  *         assuming that the host ignores our redirects.
1400  *      2. If we did not see packets requiring redirects
1401  *         during ip_rt_redirect_silence, we assume that the host
1402  *         forgot redirected route and start to send redirects again.
1403  *
1404  * This algorithm is much cheaper and more intelligent than dumb load limiting
1405  * in icmp.c.
1406  *
1407  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1408  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1409  */
1410
1411 void ip_rt_send_redirect(struct sk_buff *skb)
1412 {
1413         struct rtable *rt = skb_rtable(skb);
1414         struct in_device *in_dev;
1415         struct inet_peer *peer;
1416         int log_martians;
1417
1418         rcu_read_lock();
1419         in_dev = __in_dev_get_rcu(rt->dst.dev);
1420         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1421                 rcu_read_unlock();
1422                 return;
1423         }
1424         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1425         rcu_read_unlock();
1426
1427         if (!rt->peer)
1428                 rt_bind_peer(rt, rt->rt_dst, 1);
1429         peer = rt->peer;
1430         if (!peer) {
1431                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1432                 return;
1433         }
1434
1435         /* No redirected packets during ip_rt_redirect_silence;
1436          * reset the algorithm.
1437          */
1438         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1439                 peer->rate_tokens = 0;
1440
1441         /* Too many ignored redirects; do not send anything
1442          * set dst.rate_last to the last seen redirected packet.
1443          */
1444         if (peer->rate_tokens >= ip_rt_redirect_number) {
1445                 peer->rate_last = jiffies;
1446                 return;
1447         }
1448
1449         /* Check for load limit; set rate_last to the latest sent
1450          * redirect.
1451          */
1452         if (peer->rate_tokens == 0 ||
1453             time_after(jiffies,
1454                        (peer->rate_last +
1455                         (ip_rt_redirect_load << peer->rate_tokens)))) {
1456                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1457                 peer->rate_last = jiffies;
1458                 ++peer->rate_tokens;
1459 #ifdef CONFIG_IP_ROUTE_VERBOSE
1460                 if (log_martians &&
1461                     peer->rate_tokens == ip_rt_redirect_number &&
1462                     net_ratelimit())
1463                         printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1464                                &ip_hdr(skb)->saddr, rt->rt_iif,
1465                                 &rt->rt_dst, &rt->rt_gateway);
1466 #endif
1467         }
1468 }
1469
1470 static int ip_error(struct sk_buff *skb)
1471 {
1472         struct rtable *rt = skb_rtable(skb);
1473         struct inet_peer *peer;
1474         unsigned long now;
1475         bool send;
1476         int code;
1477
1478         switch (rt->dst.error) {
1479         case EINVAL:
1480         default:
1481                 goto out;
1482         case EHOSTUNREACH:
1483                 code = ICMP_HOST_UNREACH;
1484                 break;
1485         case ENETUNREACH:
1486                 code = ICMP_NET_UNREACH;
1487                 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1488                                 IPSTATS_MIB_INNOROUTES);
1489                 break;
1490         case EACCES:
1491                 code = ICMP_PKT_FILTERED;
1492                 break;
1493         }
1494
1495         if (!rt->peer)
1496                 rt_bind_peer(rt, rt->rt_dst, 1);
1497         peer = rt->peer;
1498
1499         send = true;
1500         if (peer) {
1501                 now = jiffies;
1502                 peer->rate_tokens += now - peer->rate_last;
1503                 if (peer->rate_tokens > ip_rt_error_burst)
1504                         peer->rate_tokens = ip_rt_error_burst;
1505                 peer->rate_last = now;
1506                 if (peer->rate_tokens >= ip_rt_error_cost)
1507                         peer->rate_tokens -= ip_rt_error_cost;
1508                 else
1509                         send = false;
1510         }
1511         if (send)
1512                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1513
1514 out:    kfree_skb(skb);
1515         return 0;
1516 }
1517
/*
 *	MTU plateau values, in descending order.
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] = {
	32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128
};

/*
 * Return the largest plateau value strictly below @old_mtu,
 * or 68 when @old_mtu is not above the smallest plateau.
 */
static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	const unsigned short *plateau = mtu_plateau;
	const unsigned short *end = mtu_plateau + ARRAY_SIZE(mtu_plateau);

	while (plateau < end) {
		if (*plateau < old_mtu)
			return *plateau;
		plateau++;
	}
	return 68;
}
1535
1536 unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1537                                  unsigned short new_mtu,
1538                                  struct net_device *dev)
1539 {
1540         unsigned short old_mtu = ntohs(iph->tot_len);
1541         unsigned short est_mtu = 0;
1542         struct inet_peer *peer;
1543
1544         peer = inet_getpeer_v4(iph->daddr, 1);
1545         if (peer) {
1546                 unsigned short mtu = new_mtu;
1547
1548                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1549                         /* BSD 4.2 derived systems incorrectly adjust
1550                          * tot_len by the IP header length, and report
1551                          * a zero MTU in the ICMP message.
1552                          */
1553                         if (mtu == 0 &&
1554                             old_mtu >= 68 + (iph->ihl << 2))
1555                                 old_mtu -= iph->ihl << 2;
1556                         mtu = guess_mtu(old_mtu);
1557                 }
1558
1559                 if (mtu < ip_rt_min_pmtu)
1560                         mtu = ip_rt_min_pmtu;
1561                 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1562                         unsigned long pmtu_expires;
1563
1564                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1565                         if (!pmtu_expires)
1566                                 pmtu_expires = 1UL;
1567
1568                         est_mtu = mtu;
1569                         peer->pmtu_learned = mtu;
1570                         peer->pmtu_expires = pmtu_expires;
1571                 }
1572
1573                 inet_putpeer(peer);
1574
1575                 atomic_inc(&__rt_peer_genid);
1576         }
1577         return est_mtu ? : new_mtu;
1578 }
1579
1580 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1581 {
1582         unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1583
1584         if (!expires)
1585                 return;
1586         if (time_before(jiffies, expires)) {
1587                 u32 orig_dst_mtu = dst_mtu(dst);
1588                 if (peer->pmtu_learned < orig_dst_mtu) {
1589                         if (!peer->pmtu_orig)
1590                                 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1591                         dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1592                 }
1593         } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1594                 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1595 }
1596
1597 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1598 {
1599         struct rtable *rt = (struct rtable *) dst;
1600         struct inet_peer *peer;
1601
1602         dst_confirm(dst);
1603
1604         if (!rt->peer)
1605                 rt_bind_peer(rt, rt->rt_dst, 1);
1606         peer = rt->peer;
1607         if (peer) {
1608                 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1609
1610                 if (mtu < ip_rt_min_pmtu)
1611                         mtu = ip_rt_min_pmtu;
1612                 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1613
1614                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1615                         if (!pmtu_expires)
1616                                 pmtu_expires = 1UL;
1617
1618                         peer->pmtu_learned = mtu;
1619                         peer->pmtu_expires = pmtu_expires;
1620
1621                         atomic_inc(&__rt_peer_genid);
1622                         rt->rt_peer_genid = rt_peer_genid();
1623                 }
1624                 check_peer_pmtu(dst, peer);
1625         }
1626 }
1627
1628 static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1629 {
1630         struct rtable *rt = (struct rtable *) dst;
1631         __be32 orig_gw = rt->rt_gateway;
1632         struct neighbour *n, *old_n;
1633
1634         dst_confirm(&rt->dst);
1635
1636         rt->rt_gateway = peer->redirect_learned.a4;
1637
1638         n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1639         if (IS_ERR(n))
1640                 return PTR_ERR(n);
1641         old_n = xchg(&rt->dst._neighbour, n);
1642         if (old_n)
1643                 neigh_release(old_n);
1644         if (!n || !(n->nud_state & NUD_VALID)) {
1645                 if (n)
1646                         neigh_event_send(n, NULL);
1647                 rt->rt_gateway = orig_gw;
1648                 return -EAGAIN;
1649         } else {
1650                 rt->rt_flags |= RTCF_REDIRECTED;
1651                 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1652         }
1653         return 0;
1654 }
1655
1656 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1657 {
1658         struct rtable *rt = (struct rtable *) dst;
1659
1660         if (rt_is_expired(rt))
1661                 return NULL;
1662         if (rt->rt_peer_genid != rt_peer_genid()) {
1663                 struct inet_peer *peer;
1664
1665                 if (!rt->peer)
1666                         rt_bind_peer(rt, rt->rt_dst, 0);
1667
1668                 peer = rt->peer;
1669                 if (peer) {
1670                         check_peer_pmtu(dst, peer);
1671
1672                         if (peer->redirect_learned.a4 &&
1673                             peer->redirect_learned.a4 != rt->rt_gateway) {
1674                                 if (check_peer_redir(dst, peer))
1675                                         return NULL;
1676                         }
1677                 }
1678
1679                 rt->rt_peer_genid = rt_peer_genid();
1680         }
1681         return dst;
1682 }
1683
1684 static void ipv4_dst_destroy(struct dst_entry *dst)
1685 {
1686         struct rtable *rt = (struct rtable *) dst;
1687         struct inet_peer *peer = rt->peer;
1688
1689         if (rt->fi) {
1690                 fib_info_put(rt->fi);
1691                 rt->fi = NULL;
1692         }
1693         if (peer) {
1694                 rt->peer = NULL;
1695                 inet_putpeer(peer);
1696         }
1697 }
1698
1699
1700 static void ipv4_link_failure(struct sk_buff *skb)
1701 {
1702         struct rtable *rt;
1703
1704         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1705
1706         rt = skb_rtable(skb);
1707         if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1708                 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1709 }
1710
1711 static int ip_rt_bug(struct sk_buff *skb)
1712 {
1713         printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1714                 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1715                 skb->dev ? skb->dev->name : "?");
1716         kfree_skb(skb);
1717         WARN_ON(1);
1718         return 0;
1719 }
1720
1721 /*
1722    We do not cache source address of outgoing interface,
1723    because it is used only by IP RR, TS and SRR options,
1724    so that it out of fast path.
1725
1726    BTW remember: "addr" is allowed to be not aligned
1727    in IP options!
1728  */
1729
1730 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1731 {
1732         __be32 src;
1733
1734         if (rt_is_output_route(rt))
1735                 src = ip_hdr(skb)->saddr;
1736         else {
1737                 struct fib_result res;
1738                 struct flowi4 fl4;
1739                 struct iphdr *iph;
1740
1741                 iph = ip_hdr(skb);
1742
1743                 memset(&fl4, 0, sizeof(fl4));
1744                 fl4.daddr = iph->daddr;
1745                 fl4.saddr = iph->saddr;
1746                 fl4.flowi4_tos = RT_TOS(iph->tos);
1747                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1748                 fl4.flowi4_iif = skb->dev->ifindex;
1749                 fl4.flowi4_mark = skb->mark;
1750
1751                 rcu_read_lock();
1752                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1753                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1754                 else
1755                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1756                                         RT_SCOPE_UNIVERSE);
1757                 rcu_read_unlock();
1758         }
1759         memcpy(addr, &src, 4);
1760 }
1761
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Fill in the halves of rt->dst.tclassid that are still zero from @tag.
 * The low and high 16-bit halves are handled independently; a half that
 * already holds a non-zero value is left untouched.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
        u32 fill = 0;

        if (!(rt->dst.tclassid & 0xFFFF))
                fill |= tag & 0xFFFF;
        if (!(rt->dst.tclassid & 0xFFFF0000))
                fill |= tag & 0xFFFF0000;

        rt->dst.tclassid |= fill;
}
#endif
1771
1772 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1773 {
1774         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1775
1776         if (advmss == 0) {
1777                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1778                                ip_rt_min_advmss);
1779                 if (advmss > 65535 - 40)
1780                         advmss = 65535 - 40;
1781         }
1782         return advmss;
1783 }
1784
1785 static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
1786 {
1787         unsigned int mtu = dst->dev->mtu;
1788
1789         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1790                 const struct rtable *rt = (const struct rtable *) dst;
1791
1792                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1793                         mtu = 576;
1794         }
1795
1796         if (mtu > IP_MAX_MTU)
1797                 mtu = IP_MAX_MTU;
1798
1799         return mtu;
1800 }
1801
1802 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1803                             struct fib_info *fi)
1804 {
1805         struct inet_peer *peer;
1806         int create = 0;
1807
1808         /* If a peer entry exists for this destination, we must hook
1809          * it up in order to get at cached metrics.
1810          */
1811         if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1812                 create = 1;
1813
1814         rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1815         if (peer) {
1816                 rt->rt_peer_genid = rt_peer_genid();
1817                 if (inet_metrics_new(peer))
1818                         memcpy(peer->metrics, fi->fib_metrics,
1819                                sizeof(u32) * RTAX_MAX);
1820                 dst_init_metrics(&rt->dst, peer->metrics, false);
1821
1822                 check_peer_pmtu(&rt->dst, peer);
1823                 if (peer->redirect_learned.a4 &&
1824                     peer->redirect_learned.a4 != rt->rt_gateway) {
1825                         rt->rt_gateway = peer->redirect_learned.a4;
1826                         rt->rt_flags |= RTCF_REDIRECTED;
1827                 }
1828         } else {
1829                 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1830                         rt->fi = fi;
1831                         atomic_inc(&fi->fib_clntref);
1832                 }
1833                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1834         }
1835 }
1836
1837 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1838                            const struct fib_result *res,
1839                            struct fib_info *fi, u16 type, u32 itag)
1840 {
1841         struct dst_entry *dst = &rt->dst;
1842
1843         if (fi) {
1844                 if (FIB_RES_GW(*res) &&
1845                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1846                         rt->rt_gateway = FIB_RES_GW(*res);
1847                 rt_init_metrics(rt, fl4, fi);
1848 #ifdef CONFIG_IP_ROUTE_CLASSID
1849                 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1850 #endif
1851         }
1852
1853         if (dst_mtu(dst) > IP_MAX_MTU)
1854                 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1855         if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1856                 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1857
1858 #ifdef CONFIG_IP_ROUTE_CLASSID
1859 #ifdef CONFIG_IP_MULTIPLE_TABLES
1860         set_class_tag(rt, fib_rules_tclass(res));
1861 #endif
1862         set_class_tag(rt, itag);
1863 #endif
1864 }
1865
1866 static struct rtable *rt_dst_alloc(struct net_device *dev,
1867                                    bool nopolicy, bool noxfrm)
1868 {
1869         return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1870                          DST_HOST |
1871                          (nopolicy ? DST_NOPOLICY : 0) |
1872                          (noxfrm ? DST_NOXFRM : 0));
1873 }
1874
1875 /* called in rcu_read_lock() section */
1876 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1877                                 u8 tos, struct net_device *dev, int our)
1878 {
1879         unsigned int hash;
1880         struct rtable *rth;
1881         __be32 spec_dst;
1882         struct in_device *in_dev = __in_dev_get_rcu(dev);
1883         u32 itag = 0;
1884         int err;
1885
1886         /* Primary sanity checks. */
1887
1888         if (in_dev == NULL)
1889                 return -EINVAL;
1890
1891         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1892             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1893                 goto e_inval;
1894
1895         if (ipv4_is_zeronet(saddr)) {
1896                 if (!ipv4_is_local_multicast(daddr))
1897                         goto e_inval;
1898                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1899         } else {
1900                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
1901                                           &itag);
1902                 if (err < 0)
1903                         goto e_err;
1904         }
1905         rth = rt_dst_alloc(init_net.loopback_dev,
1906                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1907         if (!rth)
1908                 goto e_nobufs;
1909
1910 #ifdef CONFIG_IP_ROUTE_CLASSID
1911         rth->dst.tclassid = itag;
1912 #endif
1913         rth->dst.output = ip_rt_bug;
1914
1915         rth->rt_key_dst = daddr;
1916         rth->rt_key_src = saddr;
1917         rth->rt_genid   = rt_genid(dev_net(dev));
1918         rth->rt_flags   = RTCF_MULTICAST;
1919         rth->rt_type    = RTN_MULTICAST;
1920         rth->rt_key_tos = tos;
1921         rth->rt_dst     = daddr;
1922         rth->rt_src     = saddr;
1923         rth->rt_route_iif = dev->ifindex;
1924         rth->rt_iif     = dev->ifindex;
1925         rth->rt_oif     = 0;
1926         rth->rt_mark    = skb->mark;
1927         rth->rt_gateway = daddr;
1928         rth->rt_spec_dst= spec_dst;
1929         rth->rt_peer_genid = 0;
1930         rth->peer = NULL;
1931         rth->fi = NULL;
1932         if (our) {
1933                 rth->dst.input= ip_local_deliver;
1934                 rth->rt_flags |= RTCF_LOCAL;
1935         }
1936
1937 #ifdef CONFIG_IP_MROUTE
1938         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1939                 rth->dst.input = ip_mr_input;
1940 #endif
1941         RT_CACHE_STAT_INC(in_slow_mc);
1942
1943         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1944         rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
1945         return IS_ERR(rth) ? PTR_ERR(rth) : 0;
1946
1947 e_nobufs:
1948         return -ENOBUFS;
1949 e_inval:
1950         return -EINVAL;
1951 e_err:
1952         return err;
1953 }
1954
1955
1956 static void ip_handle_martian_source(struct net_device *dev,
1957                                      struct in_device *in_dev,
1958                                      struct sk_buff *skb,
1959                                      __be32 daddr,
1960                                      __be32 saddr)
1961 {
1962         RT_CACHE_STAT_INC(in_martian_src);
1963 #ifdef CONFIG_IP_ROUTE_VERBOSE
1964         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1965                 /*
1966                  *      RFC1812 recommendation, if source is martian,
1967                  *      the only hint is MAC header.
1968                  */
1969                 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1970                         &daddr, &saddr, dev->name);
1971                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1972                         int i;
1973                         const unsigned char *p = skb_mac_header(skb);
1974                         printk(KERN_WARNING "ll header: ");
1975                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1976                                 printk("%02x", *p);
1977                                 if (i < (dev->hard_header_len - 1))
1978                                         printk(":");
1979                         }
1980                         printk("\n");
1981                 }
1982         }
1983 #endif
1984 }
1985
1986 /* called in rcu_read_lock() section */
1987 static int __mkroute_input(struct sk_buff *skb,
1988                            const struct fib_result *res,
1989                            struct in_device *in_dev,
1990                            __be32 daddr, __be32 saddr, u32 tos,
1991                            struct rtable **result)
1992 {
1993         struct rtable *rth;
1994         int err;
1995         struct in_device *out_dev;
1996         unsigned int flags = 0;
1997         __be32 spec_dst;
1998         u32 itag;
1999
2000         /* get a working reference to the output device */
2001         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2002         if (out_dev == NULL) {
2003                 if (net_ratelimit())
2004                         printk(KERN_CRIT "Bug in ip_route_input" \
2005                                "_slow(). Please, report\n");
2006                 return -EINVAL;
2007         }
2008
2009
2010         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2011                                   in_dev->dev, &spec_dst, &itag);
2012         if (err < 0) {
2013                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2014                                          saddr);
2015
2016                 goto cleanup;
2017         }
2018
2019         if (err)
2020                 flags |= RTCF_DIRECTSRC;
2021
2022         if (out_dev == in_dev && err &&
2023             (IN_DEV_SHARED_MEDIA(out_dev) ||
2024              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2025                 flags |= RTCF_DOREDIRECT;
2026
2027         if (skb->protocol != htons(ETH_P_IP)) {
2028                 /* Not IP (i.e. ARP). Do not create route, if it is
2029                  * invalid for proxy arp. DNAT routes are always valid.
2030                  *
2031                  * Proxy arp feature have been extended to allow, ARP
2032                  * replies back to the same interface, to support
2033                  * Private VLAN switch technologies. See arp.c.
2034                  */
2035                 if (out_dev == in_dev &&
2036                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2037                         err = -EINVAL;
2038                         goto cleanup;
2039                 }
2040         }
2041
2042         rth = rt_dst_alloc(out_dev->dev,
2043                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2044                            IN_DEV_CONF_GET(out_dev, NOXFRM));
2045         if (!rth) {
2046                 err = -ENOBUFS;
2047                 goto cleanup;
2048         }
2049
2050         rth->rt_key_dst = daddr;
2051         rth->rt_key_src = saddr;
2052         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2053         rth->rt_flags = flags;
2054         rth->rt_type = res->type;
2055         rth->rt_key_tos = tos;
2056         rth->rt_dst     = daddr;
2057         rth->rt_src     = saddr;
2058         rth->rt_route_iif = in_dev->dev->ifindex;
2059         rth->rt_iif     = in_dev->dev->ifindex;
2060         rth->rt_oif     = 0;
2061         rth->rt_mark    = skb->mark;
2062         rth->rt_gateway = daddr;
2063         rth->rt_spec_dst= spec_dst;
2064         rth->rt_peer_genid = 0;
2065         rth->peer = NULL;
2066         rth->fi = NULL;
2067
2068         rth->dst.input = ip_forward;
2069         rth->dst.output = ip_output;
2070
2071         rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2072
2073         *result = rth;
2074         err = 0;
2075  cleanup:
2076         return err;
2077 }
2078
2079 static int ip_mkroute_input(struct sk_buff *skb,
2080                             struct fib_result *res,
2081                             const struct flowi4 *fl4,
2082                             struct in_device *in_dev,
2083                             __be32 daddr, __be32 saddr, u32 tos)
2084 {
2085         struct rtable* rth = NULL;
2086         int err;
2087         unsigned hash;
2088
2089 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2090         if (res->fi && res->fi->fib_nhs > 1)
2091                 fib_select_multipath(res);
2092 #endif
2093
2094         /* create a routing cache entry */
2095         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2096         if (err)
2097                 return err;
2098
2099         /* put it into the cache */
2100         hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2101                        rt_genid(dev_net(rth->dst.dev)));
2102         rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2103         if (IS_ERR(rth))
2104                 return PTR_ERR(rth);
2105         return 0;
2106 }
2107
2108 /*
2109  *      NOTE. We drop all the packets that has local source
2110  *      addresses, because every properly looped back packet
2111  *      must have correct destination already attached by output routine.
2112  *
2113  *      Such approach solves two big problems:
2114  *      1. Not simplex devices are handled properly.
2115  *      2. IP spoofing attempts are filtered with 100% of guarantee.
2116  *      called with rcu_read_lock()
2117  */
2118
2119 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2120                                u8 tos, struct net_device *dev)
2121 {
2122         struct fib_result res;
2123         struct in_device *in_dev = __in_dev_get_rcu(dev);
2124         struct flowi4   fl4;
2125         unsigned        flags = 0;
2126         u32             itag = 0;
2127         struct rtable * rth;
2128         unsigned        hash;
2129         __be32          spec_dst;
2130         int             err = -EINVAL;
2131         struct net    * net = dev_net(dev);
2132
2133         /* IP on this device is disabled. */
2134
2135         if (!in_dev)
2136                 goto out;
2137
2138         /* Check for the most weird martians, which can be not detected
2139            by fib_lookup.
2140          */
2141
2142         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2143             ipv4_is_loopback(saddr))
2144                 goto martian_source;
2145
2146         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2147                 goto brd_input;
2148
2149         /* Accept zero addresses only to limited broadcast;
2150          * I even do not know to fix it or not. Waiting for complains :-)
2151          */
2152         if (ipv4_is_zeronet(saddr))
2153                 goto martian_source;
2154
2155         if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2156                 goto martian_destination;
2157
2158         /*
2159          *      Now we are ready to route packet.
2160          */
2161         fl4.flowi4_oif = 0;
2162         fl4.flowi4_iif = dev->ifindex;
2163         fl4.flowi4_mark = skb->mark;
2164         fl4.flowi4_tos = tos;
2165         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2166         fl4.daddr = daddr;
2167         fl4.saddr = saddr;
2168         err = fib_lookup(net, &fl4, &res);
2169         if (err != 0) {
2170                 if (!IN_DEV_FORWARD(in_dev))
2171                         goto e_hostunreach;
2172                 goto no_route;
2173         }
2174
2175         RT_CACHE_STAT_INC(in_slow_tot);
2176
2177         if (res.type == RTN_BROADCAST)
2178                 goto brd_input;
2179
2180         if (res.type == RTN_LOCAL) {
2181                 err = fib_validate_source(skb, saddr, daddr, tos,
2182                                           net->loopback_dev->ifindex,
2183                                           dev, &spec_dst, &itag);
2184                 if (err < 0)
2185                         goto martian_source_keep_err;
2186                 if (err)
2187                         flags |= RTCF_DIRECTSRC;
2188                 spec_dst = daddr;
2189                 goto local_input;
2190         }
2191
2192         if (!IN_DEV_FORWARD(in_dev))
2193                 goto e_hostunreach;
2194         if (res.type != RTN_UNICAST)
2195                 goto martian_destination;
2196
2197         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2198 out:    return err;
2199
2200 brd_input:
2201         if (skb->protocol != htons(ETH_P_IP))
2202                 goto e_inval;
2203
2204         if (ipv4_is_zeronet(saddr))
2205                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2206         else {
2207                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2208                                           &itag);
2209                 if (err < 0)
2210                         goto martian_source_keep_err;
2211                 if (err)
2212                         flags |= RTCF_DIRECTSRC;
2213         }
2214         flags |= RTCF_BROADCAST;
2215         res.type = RTN_BROADCAST;
2216         RT_CACHE_STAT_INC(in_brd);
2217
2218 local_input:
2219         rth = rt_dst_alloc(net->loopback_dev,
2220                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2221         if (!rth)
2222                 goto e_nobufs;
2223
2224         rth->dst.input= ip_local_deliver;
2225         rth->dst.output= ip_rt_bug;
2226 #ifdef CONFIG_IP_ROUTE_CLASSID
2227         rth->dst.tclassid = itag;
2228 #endif
2229
2230         rth->rt_key_dst = daddr;
2231         rth->rt_key_src = saddr;
2232         rth->rt_genid = rt_genid(net);
2233         rth->rt_flags   = flags|RTCF_LOCAL;
2234         rth->rt_type    = res.type;
2235         rth->rt_key_tos = tos;
2236         rth->rt_dst     = daddr;
2237         rth->rt_src     = saddr;
2238 #ifdef CONFIG_IP_ROUTE_CLASSID
2239         rth->dst.tclassid = itag;
2240 #endif
2241         rth->rt_route_iif = dev->ifindex;
2242         rth->rt_iif     = dev->ifindex;
2243         rth->rt_oif     = 0;
2244         rth->rt_mark    = skb->mark;
2245         rth->rt_gateway = daddr;
2246         rth->rt_spec_dst= spec_dst;
2247         rth->rt_peer_genid = 0;
2248         rth->peer = NULL;
2249         rth->fi = NULL;
2250         if (res.type == RTN_UNREACHABLE) {
2251                 rth->dst.input= ip_error;
2252                 rth->dst.error= -err;
2253                 rth->rt_flags   &= ~RTCF_LOCAL;
2254         }
2255         hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2256         rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2257         err = 0;
2258         if (IS_ERR(rth))
2259                 err = PTR_ERR(rth);
2260         goto out;
2261
2262 no_route:
2263         RT_CACHE_STAT_INC(in_no_route);
2264         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2265         res.type = RTN_UNREACHABLE;
2266         if (err == -ESRCH)
2267                 err = -ENETUNREACH;
2268         goto local_input;
2269
2270         /*
2271          *      Do not cache martian addresses: they should be logged (RFC1812)
2272          */
2273 martian_destination:
2274         RT_CACHE_STAT_INC(in_martian_dst);
2275 #ifdef CONFIG_IP_ROUTE_VERBOSE
2276         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2277                 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2278                         &daddr, &saddr, dev->name);
2279 #endif
2280
2281 e_hostunreach:
2282         err = -EHOSTUNREACH;
2283         goto out;
2284
2285 e_inval:
2286         err = -EINVAL;
2287         goto out;
2288
2289 e_nobufs:
2290         err = -ENOBUFS;
2291         goto out;
2292
2293 martian_source:
2294         err = -EINVAL;
2295 martian_source_keep_err:
2296         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2297         goto out;
2298 }
2299
2300 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2301                            u8 tos, struct net_device *dev, bool noref)
2302 {
2303         struct rtable * rth;
2304         unsigned        hash;
2305         int iif = dev->ifindex;
2306         struct net *net;
2307         int res;
2308
2309         net = dev_net(dev);
2310
2311         rcu_read_lock();
2312
2313         if (!rt_caching(net))
2314                 goto skip_cache;
2315
2316         tos &= IPTOS_RT_MASK;
2317         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2318
2319         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2320              rth = rcu_dereference(rth->dst.rt_next)) {
2321                 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2322                      ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2323                      (rth->rt_iif ^ iif) |
2324                      rth->rt_oif |
2325                      (rth->rt_key_tos ^ tos)) == 0 &&
2326                     rth->rt_mark == skb->mark &&
2327                     net_eq(dev_net(rth->dst.dev), net) &&
2328                     !rt_is_expired(rth)) {
2329                         if (noref) {
2330                                 dst_use_noref(&rth->dst, jiffies);
2331                                 skb_dst_set_noref(skb, &rth->dst);
2332                         } else {
2333                                 dst_use(&rth->dst, jiffies);
2334                                 skb_dst_set(skb, &rth->dst);
2335                         }
2336                         RT_CACHE_STAT_INC(in_hit);
2337                         rcu_read_unlock();
2338                         return 0;
2339                 }
2340                 RT_CACHE_STAT_INC(in_hlist_search);
2341         }
2342
2343 skip_cache:
2344         /* Multicast recognition logic is moved from route cache to here.
2345            The problem was that too many Ethernet cards have broken/missing
2346            hardware multicast filters :-( As result the host on multicasting
2347            network acquires a lot of useless route cache entries, sort of
2348            SDR messages from all the world. Now we try to get rid of them.
2349            Really, provided software IP multicast filter is organized
2350            reasonably (at least, hashed), it does not result in a slowdown
2351            comparing with route cache reject entries.
2352            Note, that multicast routers are not affected, because
2353            route cache entry is created eventually.
2354          */
2355         if (ipv4_is_multicast(daddr)) {
2356                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2357
2358                 if (in_dev) {
2359                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2360                                                   ip_hdr(skb)->protocol);
2361                         if (our
2362 #ifdef CONFIG_IP_MROUTE
2363                                 ||
2364                             (!ipv4_is_local_multicast(daddr) &&
2365                              IN_DEV_MFORWARD(in_dev))
2366 #endif
2367                            ) {
2368                                 int res = ip_route_input_mc(skb, daddr, saddr,
2369                                                             tos, dev, our);
2370                                 rcu_read_unlock();
2371                                 return res;
2372                         }
2373                 }
2374                 rcu_read_unlock();
2375                 return -EINVAL;
2376         }
2377         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2378         rcu_read_unlock();
2379         return res;
2380 }
2381 EXPORT_SYMBOL(ip_route_input_common);
2382
2383 /* called with rcu_read_lock() */
2384 static struct rtable *__mkroute_output(const struct fib_result *res,
2385                                        const struct flowi4 *fl4,
2386                                        __be32 orig_daddr, __be32 orig_saddr,
2387                                        int orig_oif, struct net_device *dev_out,
2388                                        unsigned int flags)
2389 {
2390         struct fib_info *fi = res->fi;
2391         u32 tos = RT_FL_TOS(fl4);
2392         struct in_device *in_dev;
2393         u16 type = res->type;
2394         struct rtable *rth;
2395
2396         if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2397                 return ERR_PTR(-EINVAL);
2398
2399         if (ipv4_is_lbcast(fl4->daddr))
2400                 type = RTN_BROADCAST;
2401         else if (ipv4_is_multicast(fl4->daddr))
2402                 type = RTN_MULTICAST;
2403         else if (ipv4_is_zeronet(fl4->daddr))
2404                 return ERR_PTR(-EINVAL);
2405
2406         if (dev_out->flags & IFF_LOOPBACK)
2407                 flags |= RTCF_LOCAL;
2408
2409         in_dev = __in_dev_get_rcu(dev_out);
2410         if (!in_dev)
2411                 return ERR_PTR(-EINVAL);
2412
2413         if (type == RTN_BROADCAST) {
2414                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2415                 fi = NULL;
2416         } else if (type == RTN_MULTICAST) {
2417                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2418                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2419                                      fl4->flowi4_proto))
2420                         flags &= ~RTCF_LOCAL;
2421                 /* If multicast route do not exist use
2422                  * default one, but do not gateway in this case.
2423                  * Yes, it is hack.
2424                  */
2425                 if (fi && res->prefixlen < 4)
2426                         fi = NULL;
2427         }
2428
2429         rth = rt_dst_alloc(dev_out,
2430                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2431                            IN_DEV_CONF_GET(in_dev, NOXFRM));
2432         if (!rth)
2433                 return ERR_PTR(-ENOBUFS);
2434
2435         rth->dst.output = ip_output;
2436
2437         rth->rt_key_dst = orig_daddr;
2438         rth->rt_key_src = orig_saddr;
2439         rth->rt_genid = rt_genid(dev_net(dev_out));
2440         rth->rt_flags   = flags;
2441         rth->rt_type    = type;
2442         rth->rt_key_tos = tos;
2443         rth->rt_dst     = fl4->daddr;
2444         rth->rt_src     = fl4->saddr;
2445         rth->rt_route_iif = 0;
2446         rth->rt_iif     = orig_oif ? : dev_out->ifindex;
2447         rth->rt_oif     = orig_oif;
2448         rth->rt_mark    = fl4->flowi4_mark;
2449         rth->rt_gateway = fl4->daddr;
2450         rth->rt_spec_dst= fl4->saddr;
2451         rth->rt_peer_genid = 0;
2452         rth->peer = NULL;
2453         rth->fi = NULL;
2454
2455         RT_CACHE_STAT_INC(out_slow_tot);
2456
2457         if (flags & RTCF_LOCAL) {
2458                 rth->dst.input = ip_local_deliver;
2459                 rth->rt_spec_dst = fl4->daddr;
2460         }
2461         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2462                 rth->rt_spec_dst = fl4->saddr;
2463                 if (flags & RTCF_LOCAL &&
2464                     !(dev_out->flags & IFF_LOOPBACK)) {
2465                         rth->dst.output = ip_mc_output;
2466                         RT_CACHE_STAT_INC(out_slow_mc);
2467                 }
2468 #ifdef CONFIG_IP_MROUTE
2469                 if (type == RTN_MULTICAST) {
2470                         if (IN_DEV_MFORWARD(in_dev) &&
2471                             !ipv4_is_local_multicast(fl4->daddr)) {
2472                                 rth->dst.input = ip_mr_input;
2473                                 rth->dst.output = ip_mc_output;
2474                         }
2475                 }
2476 #endif
2477         }
2478
2479         rt_set_nexthop(rth, fl4, res, fi, type, 0);
2480
2481         return rth;
2482 }
2483
2484 /*
2485  * Major route resolver routine.
2486  * called with rcu_read_lock();
2487  */
2488
2489 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2490 {
2491         struct net_device *dev_out = NULL;
2492         u32 tos = RT_FL_TOS(fl4);
2493         unsigned int flags = 0;
2494         struct fib_result res;
2495         struct rtable *rth;
2496         __be32 orig_daddr;
2497         __be32 orig_saddr;
2498         int orig_oif;
2499
2500         res.fi          = NULL;
2501 #ifdef CONFIG_IP_MULTIPLE_TABLES
2502         res.r           = NULL;
2503 #endif
2504
2505         orig_daddr = fl4->daddr;
2506         orig_saddr = fl4->saddr;
2507         orig_oif = fl4->flowi4_oif;
2508
2509         fl4->flowi4_iif = net->loopback_dev->ifindex;
2510         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2511         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2512                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2513
2514         rcu_read_lock();
2515         if (fl4->saddr) {
2516                 rth = ERR_PTR(-EINVAL);
2517                 if (ipv4_is_multicast(fl4->saddr) ||
2518                     ipv4_is_lbcast(fl4->saddr) ||
2519                     ipv4_is_zeronet(fl4->saddr))
2520                         goto out;
2521
2522                 /* I removed check for oif == dev_out->oif here.
2523                    It was wrong for two reasons:
2524                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2525                       is assigned to multiple interfaces.
2526                    2. Moreover, we are allowed to send packets with saddr
2527                       of another iface. --ANK
2528                  */
2529
2530                 if (fl4->flowi4_oif == 0 &&
2531                     (ipv4_is_multicast(fl4->daddr) ||
2532                      ipv4_is_lbcast(fl4->daddr))) {
2533                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2534                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2535                         if (dev_out == NULL)
2536                                 goto out;
2537
2538                         /* Special hack: user can direct multicasts
2539                            and limited broadcast via necessary interface
2540                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2541                            This hack is not just for fun, it allows
2542                            vic,vat and friends to work.
2543                            They bind socket to loopback, set ttl to zero
2544                            and expect that it will work.
2545                            From the viewpoint of routing cache they are broken,
2546                            because we are not allowed to build multicast path
2547                            with loopback source addr (look, routing cache
2548                            cannot know, that ttl is zero, so that packet
2549                            will not leave this host and route is valid).
2550                            Luckily, this hack is good workaround.
2551                          */
2552
2553                         fl4->flowi4_oif = dev_out->ifindex;
2554                         goto make_route;
2555                 }
2556
2557                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2558                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2559                         if (!__ip_dev_find(net, fl4->saddr, false))
2560                                 goto out;
2561                 }
2562         }
2563
2564
2565         if (fl4->flowi4_oif) {
2566                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2567                 rth = ERR_PTR(-ENODEV);
2568                 if (dev_out == NULL)
2569                         goto out;
2570
2571                 /* RACE: Check return value of inet_select_addr instead. */
2572                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2573                         rth = ERR_PTR(-ENETUNREACH);
2574                         goto out;
2575                 }
2576                 if (ipv4_is_local_multicast(fl4->daddr) ||
2577                     ipv4_is_lbcast(fl4->daddr)) {
2578                         if (!fl4->saddr)
2579                                 fl4->saddr = inet_select_addr(dev_out, 0,
2580                                                               RT_SCOPE_LINK);
2581                         goto make_route;
2582                 }
2583                 if (fl4->saddr) {
2584                         if (ipv4_is_multicast(fl4->daddr))
2585                                 fl4->saddr = inet_select_addr(dev_out, 0,
2586                                                               fl4->flowi4_scope);
2587                         else if (!fl4->daddr)
2588                                 fl4->saddr = inet_select_addr(dev_out, 0,
2589                                                               RT_SCOPE_HOST);
2590                 }
2591         }
2592
2593         if (!fl4->daddr) {
2594                 fl4->daddr = fl4->saddr;
2595                 if (!fl4->daddr)
2596                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2597                 dev_out = net->loopback_dev;
2598                 fl4->flowi4_oif = net->loopback_dev->ifindex;
2599                 res.type = RTN_LOCAL;
2600                 flags |= RTCF_LOCAL;
2601                 goto make_route;
2602         }
2603
2604         if (fib_lookup(net, fl4, &res)) {
2605                 res.fi = NULL;
2606                 if (fl4->flowi4_oif) {
2607                         /* Apparently, routing tables are wrong. Assume,
2608                            that the destination is on link.
2609
2610                            WHY? DW.
2611                            Because we are allowed to send to iface
2612                            even if it has NO routes and NO assigned
2613                            addresses. When oif is specified, routing
2614                            tables are looked up with only one purpose:
2615                            to catch if destination is gatewayed, rather than
2616                            direct. Moreover, if MSG_DONTROUTE is set,
2617                            we send packet, ignoring both routing tables
2618                            and ifaddr state. --ANK
2619
2620
2621                            We could make it even if oif is unknown,
2622                            likely IPv6, but we do not.
2623                          */
2624
2625                         if (fl4->saddr == 0)
2626                                 fl4->saddr = inet_select_addr(dev_out, 0,
2627                                                               RT_SCOPE_LINK);
2628                         res.type = RTN_UNICAST;
2629                         goto make_route;
2630                 }
2631                 rth = ERR_PTR(-ENETUNREACH);
2632                 goto out;
2633         }
2634
2635         if (res.type == RTN_LOCAL) {
2636                 if (!fl4->saddr) {
2637                         if (res.fi->fib_prefsrc)
2638                                 fl4->saddr = res.fi->fib_prefsrc;
2639                         else
2640                                 fl4->saddr = fl4->daddr;
2641                 }
2642                 dev_out = net->loopback_dev;
2643                 fl4->flowi4_oif = dev_out->ifindex;
2644                 res.fi = NULL;
2645                 flags |= RTCF_LOCAL;
2646                 goto make_route;
2647         }
2648
2649 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2650         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2651                 fib_select_multipath(&res);
2652         else
2653 #endif
2654         if (!res.prefixlen &&
2655             res.table->tb_num_default > 1 &&
2656             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2657                 fib_select_default(&res);
2658
2659         if (!fl4->saddr)
2660                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2661
2662         dev_out = FIB_RES_DEV(res);
2663         fl4->flowi4_oif = dev_out->ifindex;
2664
2665
2666 make_route:
2667         rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2668                                dev_out, flags);
2669         if (!IS_ERR(rth)) {
2670                 unsigned int hash;
2671
2672                 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2673                                rt_genid(dev_net(dev_out)));
2674                 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2675         }
2676
2677 out:
2678         rcu_read_unlock();
2679         return rth;
2680 }
2681
2682 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2683 {
2684         struct rtable *rth;
2685         unsigned int hash;
2686
2687         if (!rt_caching(net))
2688                 goto slow_output;
2689
2690         hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2691
2692         rcu_read_lock_bh();
2693         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2694                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2695                 if (rth->rt_key_dst == flp4->daddr &&
2696                     rth->rt_key_src == flp4->saddr &&
2697                     rt_is_output_route(rth) &&
2698                     rth->rt_oif == flp4->flowi4_oif &&
2699                     rth->rt_mark == flp4->flowi4_mark &&
2700                     !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2701                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2702                     net_eq(dev_net(rth->dst.dev), net) &&
2703                     !rt_is_expired(rth)) {
2704                         dst_use(&rth->dst, jiffies);
2705                         RT_CACHE_STAT_INC(out_hit);
2706                         rcu_read_unlock_bh();
2707                         if (!flp4->saddr)
2708                                 flp4->saddr = rth->rt_src;
2709                         if (!flp4->daddr)
2710                                 flp4->daddr = rth->rt_dst;
2711                         return rth;
2712                 }
2713                 RT_CACHE_STAT_INC(out_hlist_search);
2714         }
2715         rcu_read_unlock_bh();
2716
2717 slow_output:
2718         return ip_route_output_slow(net, flp4);
2719 }
2720 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2721
/* .check hook of ipv4_dst_blackhole_ops: unconditionally returns NULL. */
2722 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2723 {
2724         return NULL;
2725 }
2726
/* .default_mtu hook of ipv4_dst_blackhole_ops: always reports 0. */
2727 static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst)
2728 {
2729         return 0;
2730 }
2731
/* .update_pmtu hook of ipv4_dst_blackhole_ops: intentionally does nothing. */
2732 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2733 {
2734 }
2735
/* .cow_metrics hook of ipv4_dst_blackhole_ops: always returns NULL. */
2736 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2737                                           unsigned long old)
2738 {
2739         return NULL;
2740 }
2741
/*
 * dst_ops used for the entries created by ipv4_blackhole_route().
 * check/default_mtu/update_pmtu/cow_metrics are the stub hooks defined
 * above; destroy, default_advmss and neigh_lookup reuse the regular IPv4
 * route handlers.
 */
2742 static struct dst_ops ipv4_dst_blackhole_ops = {
2743         .family                 =       AF_INET,
2744         .protocol               =       cpu_to_be16(ETH_P_IP),
2745         .destroy                =       ipv4_dst_destroy,
2746         .check                  =       ipv4_blackhole_dst_check,
2747         .default_mtu            =       ipv4_blackhole_default_mtu,
2748         .default_advmss         =       ipv4_default_advmss,
2749         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2750         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2751         .neigh_lookup           =       ipv4_neigh_lookup,
2752 };
2753
2754 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2755 {
2756         struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2757         struct rtable *ort = (struct rtable *) dst_orig;
2758
2759         if (rt) {
2760                 struct dst_entry *new = &rt->dst;
2761
2762                 new->__use = 1;
2763                 new->input = dst_discard;
2764                 new->output = dst_discard;
2765                 dst_copy_metrics(new, &ort->dst);
2766
2767                 new->dev = ort->dst.dev;
2768                 if (new->dev)
2769                         dev_hold(new->dev);
2770
2771                 rt->rt_key_dst = ort->rt_key_dst;
2772                 rt->rt_key_src = ort->rt_key_src;
2773                 rt->rt_key_tos = ort->rt_key_tos;
2774                 rt->rt_route_iif = ort->rt_route_iif;
2775                 rt->rt_iif = ort->rt_iif;
2776                 rt->rt_oif = ort->rt_oif;
2777                 rt->rt_mark = ort->rt_mark;
2778
2779                 rt->rt_genid = rt_genid(net);
2780                 rt->rt_flags = ort->rt_flags;
2781                 rt->rt_type = ort->rt_type;
2782                 rt->rt_dst = ort->rt_dst;
2783                 rt->rt_src = ort->rt_src;
2784                 rt->rt_gateway = ort->rt_gateway;
2785                 rt->rt_spec_dst = ort->rt_spec_dst;
2786                 rt->peer = ort->peer;
2787                 if (rt->peer)
2788                         atomic_inc(&rt->peer->refcnt);
2789                 rt->fi = ort->fi;
2790                 if (rt->fi)
2791                         atomic_inc(&rt->fi->fib_clntref);
2792
2793                 dst_free(new);
2794         }
2795
2796         dst_release(dst_orig);
2797
2798         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2799 }
2800
2801 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2802                                     struct sock *sk)
2803 {
2804         struct rtable *rt = __ip_route_output_key(net, flp4);
2805
2806         if (IS_ERR(rt))
2807                 return rt;
2808
2809         if (flp4->flowi4_proto)
2810                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2811                                                    flowi4_to_flowi(flp4),
2812                                                    sk, 0);
2813
2814         return rt;
2815 }
2816 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2817
2818 static int rt_fill_info(struct net *net,
2819                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2820                         int nowait, unsigned int flags)
2821 {
2822         struct rtable *rt = skb_rtable(skb);
2823         struct rtmsg *r;
2824         struct nlmsghdr *nlh;
2825         long expires = 0;
2826         const struct inet_peer *peer = rt->peer;
2827         u32 id = 0, ts = 0, tsage = 0, error;
2828
2829         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2830         if (nlh == NULL)
2831                 return -EMSGSIZE;
2832
2833         r = nlmsg_data(nlh);
2834         r->rtm_family    = AF_INET;
2835         r->rtm_dst_len  = 32;
2836         r->rtm_src_len  = 0;
2837         r->rtm_tos      = rt->rt_key_tos;
2838         r->rtm_table    = RT_TABLE_MAIN;
2839         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2840         r->rtm_type     = rt->rt_type;
2841         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2842         r->rtm_protocol = RTPROT_UNSPEC;
2843         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2844         if (rt->rt_flags & RTCF_NOTIFY)
2845                 r->rtm_flags |= RTM_F_NOTIFY;
2846
2847         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2848
2849         if (rt->rt_key_src) {
2850                 r->rtm_src_len = 32;
2851                 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
2852         }
2853         if (rt->dst.dev)
2854                 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2855 #ifdef CONFIG_IP_ROUTE_CLASSID
2856         if (rt->dst.tclassid)
2857                 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2858 #endif
2859         if (rt_is_input_route(rt))
2860                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2861         else if (rt->rt_src != rt->rt_key_src)
2862                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2863
2864         if (rt->rt_dst != rt->rt_gateway)
2865                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2866
2867         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2868                 goto nla_put_failure;
2869
2870         if (rt->rt_mark)
2871                 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
2872
2873         error = rt->dst.error;
2874         if (peer) {
2875                 inet_peer_refcheck(rt->peer);
2876                 id = atomic_read(&peer->ip_id_count) & 0xffff;
2877                 if (peer->tcp_ts_stamp) {
2878                         ts = peer->tcp_ts;
2879                         tsage = get_seconds() - peer->tcp_ts_stamp;
2880                 }
2881                 expires = ACCESS_ONCE(peer->pmtu_expires);
2882                 if (expires)
2883                         expires -= jiffies;
2884         }
2885
2886         if (rt_is_input_route(rt)) {
2887 #ifdef CONFIG_IP_MROUTE
2888                 __be32 dst = rt->rt_dst;
2889
2890                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2891                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2892                         int err = ipmr_get_route(net, skb,
2893                                                  rt->rt_src, rt->rt_dst,
2894                                                  r, nowait);
2895                         if (err <= 0) {
2896                                 if (!nowait) {
2897                                         if (err == 0)
2898                                                 return 0;
2899                                         goto nla_put_failure;
2900                                 } else {
2901                                         if (err == -EMSGSIZE)
2902                                                 goto nla_put_failure;
2903                                         error = err;
2904                                 }
2905                         }
2906                 } else
2907 #endif
2908                         NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
2909         }
2910
2911         if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
2912                                expires, error) < 0)
2913                 goto nla_put_failure;
2914
2915         return nlmsg_end(skb, nlh);
2916
2917 nla_put_failure:
2918         nlmsg_cancel(skb, nlh);
2919         return -EMSGSIZE;
2920 }
2921
2922 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2923 {
2924         struct net *net = sock_net(in_skb->sk);
2925         struct rtmsg *rtm;
2926         struct nlattr *tb[RTA_MAX+1];
2927         struct rtable *rt = NULL;
2928         __be32 dst = 0;
2929         __be32 src = 0;
2930         u32 iif;
2931         int err;
2932         int mark;
2933         struct sk_buff *skb;
2934
2935         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2936         if (err < 0)
2937                 goto errout;
2938
2939         rtm = nlmsg_data(nlh);
2940
2941         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2942         if (skb == NULL) {
2943                 err = -ENOBUFS;
2944                 goto errout;
2945         }
2946
2947         /* Reserve room for dummy headers, this skb can pass
2948            through good chunk of routing engine.
2949          */
2950         skb_reset_mac_header(skb);
2951         skb_reset_network_header(skb);
2952
2953         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2954         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2955         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2956
2957         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2958         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2959         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2960         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2961
2962         if (iif) {
2963                 struct net_device *dev;
2964
2965                 dev = __dev_get_by_index(net, iif);
2966                 if (dev == NULL) {
2967                         err = -ENODEV;
2968                         goto errout_free;
2969                 }
2970
2971                 skb->protocol   = htons(ETH_P_IP);
2972                 skb->dev        = dev;
2973                 skb->mark       = mark;
2974                 local_bh_disable();
2975                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2976                 local_bh_enable();
2977
2978                 rt = skb_rtable(skb);
2979                 if (err == 0 && rt->dst.error)
2980                         err = -rt->dst.error;
2981         } else {
2982                 struct flowi4 fl4 = {
2983                         .daddr = dst,
2984                         .saddr = src,
2985                         .flowi4_tos = rtm->rtm_tos,
2986                         .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2987                         .flowi4_mark = mark,
2988                 };
2989                 rt = ip_route_output_key(net, &fl4);
2990
2991                 err = 0;
2992                 if (IS_ERR(rt))
2993                         err = PTR_ERR(rt);
2994         }
2995
2996         if (err)
2997                 goto errout_free;
2998
2999         skb_dst_set(skb, &rt->dst);
3000         if (rtm->rtm_flags & RTM_F_NOTIFY)
3001                 rt->rt_flags |= RTCF_NOTIFY;
3002
3003         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3004                            RTM_NEWROUTE, 0, 0);
3005         if (err <= 0)
3006                 goto errout_free;
3007
3008         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3009 errout:
3010         return err;
3011
3012 errout_free:
3013         kfree_skb(skb);
3014         goto errout;
3015 }
3016
3017 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3018 {
3019         struct rtable *rt;
3020         int h, s_h;
3021         int idx, s_idx;
3022         struct net *net;
3023
3024         net = sock_net(skb->sk);
3025
3026         s_h = cb->args[0];
3027         if (s_h < 0)
3028                 s_h = 0;
3029         s_idx = idx = cb->args[1];
3030         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3031                 if (!rt_hash_table[h].chain)
3032                         continue;
3033                 rcu_read_lock_bh();
3034                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3035                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3036                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3037                                 continue;
3038                         if (rt_is_expired(rt))
3039                                 continue;
3040                         skb_dst_set_noref(skb, &rt->dst);
3041                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3042                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3043                                          1, NLM_F_MULTI) <= 0) {
3044                                 skb_dst_drop(skb);
3045                                 rcu_read_unlock_bh();
3046                                 goto done;
3047                         }
3048                         skb_dst_drop(skb);
3049                 }
3050                 rcu_read_unlock_bh();
3051         }
3052
3053 done:
3054         cb->args[0] = h;
3055         cb->args[1] = idx;
3056         return skb->len;
3057 }
3058
3059 void ip_rt_multicast_event(struct in_device *in_dev)
3060 {
3061         rt_cache_flush(dev_net(in_dev->dev), 0);
3062 }
3063
3064 #ifdef CONFIG_SYSCTL
3065 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3066                                         void __user *buffer,
3067                                         size_t *lenp, loff_t *ppos)
3068 {
3069         if (write) {
3070                 int flush_delay;
3071                 ctl_table ctl;
3072                 struct net *net;
3073
3074                 memcpy(&ctl, __ctl, sizeof(ctl));
3075                 ctl.data = &flush_delay;
3076                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3077
3078                 net = (struct net *)__ctl->extra1;
3079                 rt_cache_flush(net, flush_delay);
3080                 return 0;
3081         }
3082
3083         return -EINVAL;
3084 }
3085
3086 static ctl_table ipv4_route_table[] = {
3087         {
3088                 .procname       = "gc_thresh",
3089                 .data           = &ipv4_dst_ops.gc_thresh,
3090                 .maxlen         = sizeof(int),
3091                 .mode           = 0644,
3092                 .proc_handler   = proc_dointvec,
3093         },
3094         {
3095                 .procname       = "max_size",
3096                 .data           = &ip_rt_max_size,
3097                 .maxlen         = sizeof(int),
3098                 .mode           = 0644,
3099                 .proc_handler   = proc_dointvec,
3100         },
3101         {
3102                 /*  Deprecated. Use gc_min_interval_ms */
3103
3104                 .procname       = "gc_min_interval",
3105                 .data           = &ip_rt_gc_min_interval,
3106                 .maxlen         = sizeof(int),
3107                 .mode           = 0644,
3108                 .proc_handler   = proc_dointvec_jiffies,
3109         },
3110         {
3111                 .procname       = "gc_min_interval_ms",
3112                 .data           = &ip_rt_gc_min_interval,
3113                 .maxlen         = sizeof(int),
3114                 .mode           = 0644,
3115                 .proc_handler   = proc_dointvec_ms_jiffies,
3116         },
3117         {
3118                 .procname       = "gc_timeout",
3119                 .data           = &ip_rt_gc_timeout,
3120                 .maxlen         = sizeof(int),
3121                 .mode           = 0644,
3122                 .proc_handler   = proc_dointvec_jiffies,
3123         },
3124         {
3125                 .procname       = "gc_interval",
3126                 .data           = &ip_rt_gc_interval,
3127                 .maxlen         = sizeof(int),
3128                 .mode           = 0644,
3129                 .proc_handler   = proc_dointvec_jiffies,
3130         },
3131         {
3132                 .procname       = "redirect_load",
3133                 .data           = &ip_rt_redirect_load,
3134                 .maxlen         = sizeof(int),
3135                 .mode           = 0644,
3136                 .proc_handler   = proc_dointvec,
3137         },
3138         {
3139                 .procname       = "redirect_number",
3140                 .data           = &ip_rt_redirect_number,
3141                 .maxlen         = sizeof(int),
3142                 .mode           = 0644,
3143                 .proc_handler   = proc_dointvec,
3144         },
3145         {
3146                 .procname       = "redirect_silence",
3147                 .data           = &ip_rt_redirect_silence,
3148                 .maxlen         = sizeof(int),
3149                 .mode           = 0644,
3150                 .proc_handler   = proc_dointvec,
3151         },
3152         {
3153                 .procname       = "error_cost",
3154                 .data           = &ip_rt_error_cost,
3155                 .maxlen         = sizeof(int),
3156                 .mode           = 0644,
3157                 .proc_handler   = proc_dointvec,
3158         },
3159         {
3160                 .procname       = "error_burst",
3161                 .data           = &ip_rt_error_burst,
3162                 .maxlen         = sizeof(int),
3163                 .mode           = 0644,
3164                 .proc_handler   = proc_dointvec,
3165         },
3166         {
3167                 .procname       = "gc_elasticity",
3168                 .data           = &ip_rt_gc_elasticity,
3169                 .maxlen         = sizeof(int),
3170                 .mode           = 0644,
3171                 .proc_handler   = proc_dointvec,
3172         },
3173         {
3174                 .procname       = "mtu_expires",
3175                 .data           = &ip_rt_mtu_expires,
3176                 .maxlen         = sizeof(int),
3177                 .mode           = 0644,
3178                 .proc_handler   = proc_dointvec_jiffies,
3179         },
3180         {
3181                 .procname       = "min_pmtu",
3182                 .data           = &ip_rt_min_pmtu,
3183                 .maxlen         = sizeof(int),
3184                 .mode           = 0644,
3185                 .proc_handler   = proc_dointvec,
3186         },
3187         {
3188                 .procname       = "min_adv_mss",
3189                 .data           = &ip_rt_min_advmss,
3190                 .maxlen         = sizeof(int),
3191                 .mode           = 0644,
3192                 .proc_handler   = proc_dointvec,
3193         },
3194         { }
3195 };
3196
/* Zero-filled one-entry table, used as the child of the "neigh" directory in ipv4_skeleton. */
3197 static struct ctl_table empty[1];
3198
/*
 * Directory skeleton: a "route" directory holding ipv4_route_table and a
 * "neigh" directory backed by the empty table, both mode 0555.
 */
3199 static struct ctl_table ipv4_skeleton[] =
3200 {
3201         { .procname = "route",
3202           .mode = 0555, .child = ipv4_route_table},
3203         { .procname = "neigh",
3204           .mode = 0555, .child = empty},
3205         { }
3206 };
3207
/* sysctl path "net" / "ipv4". */
3208 static __net_initdata struct ctl_path ipv4_path[] = {
3209         { .procname = "net", },
3210         { .procname = "ipv4", },
3211         { },
3212 };
3213
/*
 * Template for the per-namespace "flush" entry (write-only, mode 0200),
 * handled by ipv4_sysctl_rtcache_flush().  sysctl_route_net_init() uses it
 * directly for init_net and duplicates it for other namespaces, storing
 * the namespace in extra1.
 */
3214 static struct ctl_table ipv4_route_flush_table[] = {
3215         {
3216                 .procname       = "flush",
3217                 .maxlen         = sizeof(int),
3218                 .mode           = 0200,
3219                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3220         },
3221         { },
3222 };
3223
/* sysctl path "net" / "ipv4" / "route", under which the flush table is registered. */
3224 static __net_initdata struct ctl_path ipv4_route_path[] = {
3225         { .procname = "net", },
3226         { .procname = "ipv4", },
3227         { .procname = "route", },
3228         { },
3229 };
3230
3231 static __net_init int sysctl_route_net_init(struct net *net)
3232 {
3233         struct ctl_table *tbl;
3234
3235         tbl = ipv4_route_flush_table;
3236         if (!net_eq(net, &init_net)) {
3237                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3238                 if (tbl == NULL)
3239                         goto err_dup;
3240         }
3241         tbl[0].extra1 = net;
3242
3243         net->ipv4.route_hdr =
3244                 register_net_sysctl_table(net, ipv4_route_path, tbl);
3245         if (net->ipv4.route_hdr == NULL)
3246                 goto err_reg;
3247         return 0;
3248
3249 err_reg:
3250         if (tbl != ipv4_route_flush_table)
3251                 kfree(tbl);
3252 err_dup:
3253         return -ENOMEM;
3254 }
3255
3256 static __net_exit void sysctl_route_net_exit(struct net *net)
3257 {
3258         struct ctl_table *tbl;
3259
3260         tbl = net->ipv4.route_hdr->ctl_table_arg;
3261         unregister_net_sysctl_table(net->ipv4.route_hdr);
3262         BUG_ON(tbl == ipv4_route_flush_table);
3263         kfree(tbl);
3264 }
3265
/* Per-namespace hooks that register and unregister the "flush" sysctl. */
3266 static __net_initdata struct pernet_operations sysctl_route_ops = {
3267         .init = sysctl_route_net_init,
3268         .exit = sysctl_route_net_exit,
3269 };
3270 #endif
3271
3272 static __net_init int rt_genid_init(struct net *net)
3273 {
3274         get_random_bytes(&net->ipv4.rt_genid,
3275                          sizeof(net->ipv4.rt_genid));
3276         get_random_bytes(&net->ipv4.dev_addr_genid,
3277                          sizeof(net->ipv4.dev_addr_genid));
3278         return 0;
3279 }
3280
/* Per-namespace hook that seeds the generation ids; no exit handler is set. */
3281 static __net_initdata struct pernet_operations rt_genid_ops = {
3282         .init = rt_genid_init,
3283 };
3284
3285
3286 #ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-CPU accounting area; ip_rt_init() allocates 256 struct ip_rt_acct slots for it. */
3287 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3288 #endif /* CONFIG_IP_ROUTE_CLASSID */
3289
3290 static __initdata unsigned long rhash_entries;
3291 static int __init set_rhash_entries(char *str)
3292 {
3293         if (!str)
3294                 return 0;
3295         rhash_entries = simple_strtoul(str, &str, 0);
3296         return 1;
3297 }
3298 __setup("rhash_entries=", set_rhash_entries);
3299
3300 int __init ip_rt_init(void)
3301 {
3302         int rc = 0;
3303
3304 #ifdef CONFIG_IP_ROUTE_CLASSID
3305         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3306         if (!ip_rt_acct)
3307                 panic("IP: failed to allocate ip_rt_acct\n");
3308 #endif
3309
3310         ipv4_dst_ops.kmem_cachep =
3311                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3312                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3313
3314         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3315
3316         if (dst_entries_init(&ipv4_dst_ops) < 0)
3317                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3318
3319         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3320                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3321
3322         rt_hash_table = (struct rt_hash_bucket *)
3323                 alloc_large_system_hash("IP route cache",
3324                                         sizeof(struct rt_hash_bucket),
3325                                         rhash_entries,
3326                                         (totalram_pages >= 128 * 1024) ?
3327                                         15 : 17,
3328                                         0,
3329                                         &rt_hash_log,
3330                                         &rt_hash_mask,
3331                                         rhash_entries ? 0 : 512 * 1024);
3332         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3333         rt_hash_lock_init();
3334
3335         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3336         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3337
3338         devinet_init();
3339         ip_fib_init();
3340
3341         if (ip_rt_proc_init())
3342                 printk(KERN_ERR "Unable to create route proc files\n");
3343 #ifdef CONFIG_XFRM
3344         xfrm_init();
3345         xfrm4_init(ip_rt_max_size);
3346 #endif
3347         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3348
3349 #ifdef CONFIG_SYSCTL
3350         register_pernet_subsys(&sysctl_route_ops);
3351 #endif
3352         register_pernet_subsys(&rt_genid_ops);
3353         return rc;
3354 }
3355
3356 #ifdef CONFIG_SYSCTL
/*
 * Register the static ipv4 sysctl skeleton (ipv4_skeleton under ipv4_path).
 * The return value of register_sysctl_paths() is not kept.
 *
 * This lives in its own function only because of the ipv4 init order; once
 * that order is cleaned up, this workaround can be removed.
 */
void __init ip_static_sysctl_init(void)
{
        register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
3365 #endif