net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #include <linux/module.h>
  66 #include <asm/uaccess.h>
  67 #include <asm/system.h>
  68 #include <linux/bitops.h>
  69 #include <linux/types.h>
  70 #include <linux/kernel.h>
  71 #include <linux/mm.h>
  72 #include <linux/bootmem.h>
  73 #include <linux/string.h>
  74 #include <linux/socket.h>
  75 #include <linux/sockios.h>
  76 #include <linux/errno.h>
  77 #include <linux/in.h>
  78 #include <linux/inet.h>
  79 #include <linux/netdevice.h>
  80 #include <linux/proc_fs.h>
  81 #include <linux/init.h>
  82 #include <linux/workqueue.h>
  83 #include <linux/skbuff.h>
  84 #include <linux/inetdevice.h>
  85 #include <linux/igmp.h>
  86 #include <linux/pkt_sched.h>
  87 #include <linux/mroute.h>
  88 #include <linux/netfilter_ipv4.h>
  89 #include <linux/random.h>
  90 #include <linux/jhash.h>
  91 #include <linux/rcupdate.h>
  92 #include <linux/times.h>
  93 #include <linux/slab.h>
  94 #include <linux/prefetch.h>
  95 #include <net/dst.h>
  96 #include <net/net_namespace.h>
  97 #include <net/protocol.h>
  98 #include <net/ip.h>
  99 #include <net/route.h>
 100 #include <net/inetpeer.h>
 101 #include <net/sock.h>
 102 #include <net/ip_fib.h>
 103 #include <net/arp.h>
 104 #include <net/tcp.h>
 105 #include <net/icmp.h>
 106 #include <net/xfrm.h>
 107 #include <net/netevent.h>
 108 #include <net/rtnetlink.h>
 109 #ifdef CONFIG_SYSCTL
 110 #include <linux/sysctl.h>
 111 #endif
 112 #include <net/secure_seq.h>
 113
 114 #define RT_FL_TOS(oldflp4) \
 115     ((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
 116
 117 #define IP_MAX_MTU      0xFFF0
 118
 119 #define RT_GC_TIMEOUT (300*HZ)
 120
 121 static int ip_rt_max_size;
 122 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 123 static int ip_rt_gc_interval __read_mostly      = 60 * HZ;
 124 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
 125 static int ip_rt_redirect_number __read_mostly  = 9;
 126 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 127 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 128 static int ip_rt_error_cost __read_mostly       = HZ;
 129 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 130 static int ip_rt_gc_elasticity __read_mostly    = 8;
 131 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 132 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 133 static int ip_rt_min_advmss __read_mostly       = 256;
 134 static int rt_chain_length_max __read_mostly    = 20;
 135
 136 static struct delayed_work expires_work;
 137 static unsigned long expires_ljiffies;
 138
 139 /*
 140  *      Interface to generic destination cache.
 141  */
 142
 143 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 144 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
 145 static unsigned int      ipv4_default_mtu(const struct dst_entry *dst);
 146 static void              ipv4_dst_destroy(struct dst_entry *dst);
 147 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 148 static void              ipv4_link_failure(struct sk_buff *skb);
 149 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 150 static int rt_garbage_collect(struct dst_ops *ops);
 151
 152 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 153                             int how)
 154 {
 155 }
 156
 157 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 158 {
 159         struct rtable *rt = (struct rtable *) dst;
 160         struct inet_peer *peer;
 161         u32 *p = NULL;
 162
 163         if (!rt->peer)
 164                 rt_bind_peer(rt, rt->rt_dst, 1);
 165
 166         peer = rt->peer;
 167         if (peer) {
 168                 u32 *old_p = __DST_METRICS_PTR(old);
 169                 unsigned long prev, new;
 170
 171                 p = peer->metrics;
 172                 if (inet_metrics_new(peer))
 173                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
 174
 175                 new = (unsigned long) p;
 176                 prev = cmpxchg(&dst->_metrics, old, new);
 177
 178                 if (prev != old) {
 179                         p = __DST_METRICS_PTR(prev);
 180                         if (prev & DST_METRICS_READ_ONLY)
 181                                 p = NULL;
 182                 } else {
 183                         if (rt->fi) {
 184                                 fib_info_put(rt->fi);
 185                                 rt->fi = NULL;
 186                         }
 187                 }
 188         }
 189         return p;
 190 }
 191
 192 static struct dst_ops ipv4_dst_ops = {
 193         .family =               AF_INET,
 194         .protocol =             cpu_to_be16(ETH_P_IP),
 195         .gc =                   rt_garbage_collect,
 196         .check =                ipv4_dst_check,
 197         .default_advmss =       ipv4_default_advmss,
 198         .default_mtu =          ipv4_default_mtu,
 199         .cow_metrics =          ipv4_cow_metrics,
 200         .destroy =              ipv4_dst_destroy,
 201         .ifdown =               ipv4_dst_ifdown,
 202         .negative_advice =      ipv4_negative_advice,
 203         .link_failure =         ipv4_link_failure,
 204         .update_pmtu =          ip_rt_update_pmtu,
 205         .local_out =            __ip_local_out,
 206 };
 207
 208 #define ECN_OR_COST(class)      TC_PRIO_##class
 209
 210 const __u8 ip_tos2prio[16] = {
 211         TC_PRIO_BESTEFFORT,
 212         ECN_OR_COST(BESTEFFORT),
 213         TC_PRIO_BESTEFFORT,
 214         ECN_OR_COST(BESTEFFORT),
 215         TC_PRIO_BULK,
 216         ECN_OR_COST(BULK),
 217         TC_PRIO_BULK,
 218         ECN_OR_COST(BULK),
 219         TC_PRIO_INTERACTIVE,
 220         ECN_OR_COST(INTERACTIVE),
 221         TC_PRIO_INTERACTIVE,
 222         ECN_OR_COST(INTERACTIVE),
 223         TC_PRIO_INTERACTIVE_BULK,
 224         ECN_OR_COST(INTERACTIVE_BULK),
 225         TC_PRIO_INTERACTIVE_BULK,
 226         ECN_OR_COST(INTERACTIVE_BULK)
 227 };
 228
 229
 230 /*
 231  * Route cache.
 232  */
 233
 234 /* The locking scheme is rather straight forward:
 235  *
 236  * 1) Read-Copy Update protects the buckets of the central route hash.
 237  * 2) Only writers remove entries, and they hold the lock
 238  *    as they look at rtable reference counts.
 239  * 3) Only readers acquire references to rtable entries,
 240  *    they do so with atomic increments and with the
 241  *    lock held.
 242  */
 243
 244 struct rt_hash_bucket {
 245         struct rtable __rcu     *chain;
 246 };
 247
 248 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
 249         defined(CONFIG_PROVE_LOCKING)
 250 /*
 251  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 252  * The size of this table is a power of two and depends on the number of CPUS.
 253  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 254  */
 255 #ifdef CONFIG_LOCKDEP
 256 # define RT_HASH_LOCK_SZ        256
 257 #else
 258 # if NR_CPUS >= 32
 259 #  define RT_HASH_LOCK_SZ       4096
 260 # elif NR_CPUS >= 16
 261 #  define RT_HASH_LOCK_SZ       2048
 262 # elif NR_CPUS >= 8
 263 #  define RT_HASH_LOCK_SZ       1024
 264 # elif NR_CPUS >= 4
 265 #  define RT_HASH_LOCK_SZ       512
 266 # else
 267 #  define RT_HASH_LOCK_SZ       256
 268 # endif
 269 #endif
 270
 271 static spinlock_t       *rt_hash_locks;
 272 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
 273
 274 static __init void rt_hash_lock_init(void)
 275 {
 276         int i;
 277
 278         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
 279                         GFP_KERNEL);
 280         if (!rt_hash_locks)
 281                 panic("IP: failed to allocate rt_hash_locks\n");
 282
 283         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
 284                 spin_lock_init(&rt_hash_locks[i]);
 285 }
 286 #else
 287 # define rt_hash_lock_addr(slot) NULL
 288
 289 static inline void rt_hash_lock_init(void)
 290 {
 291 }
 292 #endif
 293
 294 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
 295 static unsigned                 rt_hash_mask __read_mostly;
 296 static unsigned int             rt_hash_log  __read_mostly;
 297
 298 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 299 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
 300
 301 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
 302                                    int genid)
 303 {
 304         return jhash_3words((__force u32)daddr, (__force u32)saddr,
 305                             idx, genid)
 306                 & rt_hash_mask;
 307 }
 308
 309 static inline int rt_genid(struct net *net)
 310 {
 311         return atomic_read(&net->ipv4.rt_genid);
 312 }
 313
 314 #ifdef CONFIG_PROC_FS
 315 struct rt_cache_iter_state {
 316         struct seq_net_private p;
 317         int bucket;
 318         int genid;
 319 };
 320
 321 static struct rtable *rt_cache_get_first(struct seq_file *seq)
 322 {
 323         struct rt_cache_iter_state *st = seq->private;
 324         struct rtable *r = NULL;
 325
 326         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 327                 if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
 328                         continue;
 329                 rcu_read_lock_bh();
 330                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 331                 while (r) {
 332                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
 333                             r->rt_genid == st->genid)
 334                                 return r;
 335                         r = rcu_dereference_bh(r->dst.rt_next);
 336                 }
 337                 rcu_read_unlock_bh();
 338         }
 339         return r;
 340 }
 341
 342 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
 343                                           struct rtable *r)
 344 {
 345         struct rt_cache_iter_state *st = seq->private;
 346
 347         r = rcu_dereference_bh(r->dst.rt_next);
 348         while (!r) {
 349                 rcu_read_unlock_bh();
 350                 do {
 351                         if (--st->bucket < 0)
 352                                 return NULL;
 353                 } while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
 354                 rcu_read_lock_bh();
 355                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 356         }
 357         return r;
 358 }
 359
 360 static struct rtable *rt_cache_get_next(struct seq_file *seq,
 361                                         struct rtable *r)
 362 {
 363         struct rt_cache_iter_state *st = seq->private;
 364         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
 365                 if (dev_net(r->dst.dev) != seq_file_net(seq))
 366                         continue;
 367                 if (r->rt_genid == st->genid)
 368                         break;
 369         }
 370         return r;
 371 }
 372
 373 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 374 {
 375         struct rtable *r = rt_cache_get_first(seq);
 376
 377         if (r)
 378                 while (pos && (r = rt_cache_get_next(seq, r)))
 379                         --pos;
 380         return pos ? NULL : r;
 381 }
 382
 383 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 384 {
 385         struct rt_cache_iter_state *st = seq->private;
 386         if (*pos)
 387                 return rt_cache_get_idx(seq, *pos - 1);
 388         st->genid = rt_genid(seq_file_net(seq));
 389         return SEQ_START_TOKEN;
 390 }
 391
 392 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 393 {
 394         struct rtable *r;
 395
 396         if (v == SEQ_START_TOKEN)
 397                 r = rt_cache_get_first(seq);
 398         else
 399                 r = rt_cache_get_next(seq, v);
 400         ++*pos;
 401         return r;
 402 }
 403
 404 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 405 {
 406         if (v && v != SEQ_START_TOKEN)
 407                 rcu_read_unlock_bh();
 408 }
 409
 410 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 411 {
 412         if (v == SEQ_START_TOKEN)
 413                 seq_printf(seq, "%-127s\n",
 414                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 415                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 416                            "HHUptod\tSpecDst");
 417         else {
 418                 struct rtable *r = v;
 419                 struct neighbour *n;
 420                 int len, HHUptod;
 421
 422                 rcu_read_lock();
 423                 n = dst_get_neighbour(&r->dst);
 424                 HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
 425                 rcu_read_unlock();
 426
 427                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
 428                               "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
 429                         r->dst.dev ? r->dst.dev->name : "*",
 430                         (__force u32)r->rt_dst,
 431                         (__force u32)r->rt_gateway,
 432                         r->rt_flags, atomic_read(&r->dst.__refcnt),
 433                         r->dst.__use, 0, (__force u32)r->rt_src,
 434                         dst_metric_advmss(&r->dst) + 40,
 435                         dst_metric(&r->dst, RTAX_WINDOW),
 436                         (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
 437                               dst_metric(&r->dst, RTAX_RTTVAR)),
 438                         r->rt_key_tos,
 439                         r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
 440                         HHUptod,
 441                         r->rt_spec_dst, &len);
 442
 443                 seq_printf(seq, "%*s\n", 127 - len, "");
 444         }
 445         return 0;
 446 }
 447
 448 static const struct seq_operations rt_cache_seq_ops = {
 449         .start  = rt_cache_seq_start,
 450         .next   = rt_cache_seq_next,
 451         .stop   = rt_cache_seq_stop,
 452         .show   = rt_cache_seq_show,
 453 };
 454
 455 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 456 {
 457         return seq_open_net(inode, file, &rt_cache_seq_ops,
 458                         sizeof(struct rt_cache_iter_state));
 459 }
 460
 461 static const struct file_operations rt_cache_seq_fops = {
 462         .owner   = THIS_MODULE,
 463         .open    = rt_cache_seq_open,
 464         .read    = seq_read,
 465         .llseek  = seq_lseek,
 466         .release = seq_release_net,
 467 };
 468
 469
 470 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 471 {
 472         int cpu;
 473
 474         if (*pos == 0)
 475                 return SEQ_START_TOKEN;
 476
 477         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 478                 if (!cpu_possible(cpu))
 479                         continue;
 480                 *pos = cpu+1;
 481                 return &per_cpu(rt_cache_stat, cpu);
 482         }
 483         return NULL;
 484 }
 485
 486 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 487 {
 488         int cpu;
 489
 490         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 491                 if (!cpu_possible(cpu))
 492                         continue;
 493                 *pos = cpu+1;
 494                 return &per_cpu(rt_cache_stat, cpu);
 495         }
 496         return NULL;
 497
 498 }
 499
 500 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 501 {
 502
 503 }
 504
 505 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 506 {
 507         struct rt_cache_stat *st = v;
 508
 509         if (v == SEQ_START_TOKEN) {
 510                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 511                 return 0;
 512         }
 513
 514         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 515                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 516                    dst_entries_get_slow(&ipv4_dst_ops),
 517                    st->in_hit,
 518                    st->in_slow_tot,
 519                    st->in_slow_mc,
 520                    st->in_no_route,
 521                    st->in_brd,
 522                    st->in_martian_dst,
 523                    st->in_martian_src,
 524
 525                    st->out_hit,
 526                    st->out_slow_tot,
 527                    st->out_slow_mc,
 528
 529                    st->gc_total,
 530                    st->gc_ignored,
 531                    st->gc_goal_miss,
 532                    st->gc_dst_overflow,
 533                    st->in_hlist_search,
 534                    st->out_hlist_search
 535                 );
 536         return 0;
 537 }
 538
 539 static const struct seq_operations rt_cpu_seq_ops = {
 540         .start  = rt_cpu_seq_start,
 541         .next   = rt_cpu_seq_next,
 542         .stop   = rt_cpu_seq_stop,
 543         .show   = rt_cpu_seq_show,
 544 };
 545
 546
 547 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 548 {
 549         return seq_open(file, &rt_cpu_seq_ops);
 550 }
 551
 552 static const struct file_operations rt_cpu_seq_fops = {
 553         .owner   = THIS_MODULE,
 554         .open    = rt_cpu_seq_open,
 555         .read    = seq_read,
 556         .llseek  = seq_lseek,
 557         .release = seq_release,
 558 };
 559
 560 #ifdef CONFIG_IP_ROUTE_CLASSID
 561 static int rt_acct_proc_show(struct seq_file *m, void *v)
 562 {
 563         struct ip_rt_acct *dst, *src;
 564         unsigned int i, j;
 565
 566         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 567         if (!dst)
 568                 return -ENOMEM;
 569
 570         for_each_possible_cpu(i) {
 571                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 572                 for (j = 0; j < 256; j++) {
 573                         dst[j].o_bytes   += src[j].o_bytes;
 574                         dst[j].o_packets += src[j].o_packets;
 575                         dst[j].i_bytes   += src[j].i_bytes;
 576                         dst[j].i_packets += src[j].i_packets;
 577                 }
 578         }
 579
 580         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 581         kfree(dst);
 582         return 0;
 583 }
 584
 585 static int rt_acct_proc_open(struct inode *inode, struct file *file)
 586 {
 587         return single_open(file, rt_acct_proc_show, NULL);
 588 }
 589
 590 static const struct file_operations rt_acct_proc_fops = {
 591         .owner          = THIS_MODULE,
 592         .open           = rt_acct_proc_open,
 593         .read           = seq_read,
 594         .llseek         = seq_lseek,
 595         .release        = single_release,
 596 };
 597 #endif
 598
 599 static int __net_init ip_rt_do_proc_init(struct net *net)
 600 {
 601         struct proc_dir_entry *pde;
 602
 603         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
 604                         &rt_cache_seq_fops);
 605         if (!pde)
 606                 goto err1;
 607
 608         pde = proc_create("rt_cache", S_IRUGO,
 609                           net->proc_net_stat, &rt_cpu_seq_fops);
 610         if (!pde)
 611                 goto err2;
 612
 613 #ifdef CONFIG_IP_ROUTE_CLASSID
 614         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 615         if (!pde)
 616                 goto err3;
 617 #endif
 618         return 0;
 619
 620 #ifdef CONFIG_IP_ROUTE_CLASSID
 621 err3:
 622         remove_proc_entry("rt_cache", net->proc_net_stat);
 623 #endif
 624 err2:
 625         remove_proc_entry("rt_cache", net->proc_net);
 626 err1:
 627         return -ENOMEM;
 628 }
 629
 630 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 631 {
 632         remove_proc_entry("rt_cache", net->proc_net_stat);
 633         remove_proc_entry("rt_cache", net->proc_net);
 634 #ifdef CONFIG_IP_ROUTE_CLASSID
 635         remove_proc_entry("rt_acct", net->proc_net);
 636 #endif
 637 }
 638
 639 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 640         .init = ip_rt_do_proc_init,
 641         .exit = ip_rt_do_proc_exit,
 642 };
 643
 644 static int __init ip_rt_proc_init(void)
 645 {
 646         return register_pernet_subsys(&ip_rt_proc_ops);
 647 }
 648
 649 #else
 650 static inline int ip_rt_proc_init(void)
 651 {
 652         return 0;
 653 }
 654 #endif /* CONFIG_PROC_FS */
 655
 656 static inline void rt_free(struct rtable *rt)
 657 {
 658         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 659 }
 660
 661 static inline void rt_drop(struct rtable *rt)
 662 {
 663         ip_rt_put(rt);
 664         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 665 }
 666
 667 static inline int rt_fast_clean(struct rtable *rth)
 668 {
 669         /* Kill broadcast/multicast entries very aggresively, if they
 670            collide in hash table with more useful entries */
 671         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 672                 rt_is_input_route(rth) && rth->dst.rt_next;
 673 }
 674
 675 static inline int rt_valuable(struct rtable *rth)
 676 {
 677         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 678                 (rth->peer && rth->peer->pmtu_expires);
 679 }
 680
 681 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 682 {
 683         unsigned long age;
 684         int ret = 0;
 685
 686         if (atomic_read(&rth->dst.__refcnt))
 687                 goto out;
 688
 689         age = jiffies - rth->dst.lastuse;
 690         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 691             (age <= tmo2 && rt_valuable(rth)))
 692                 goto out;
 693         ret = 1;
 694 out:    return ret;
 695 }
 696
 697 /* Bits of score are:
 698  * 31: very valuable
 699  * 30: not quite useless
 700  * 29..0: usage counter
 701  */
 702 static inline u32 rt_score(struct rtable *rt)
 703 {
 704         u32 score = jiffies - rt->dst.lastuse;
 705
 706         score = ~score & ~(3<<30);
 707
 708         if (rt_valuable(rt))
 709                 score |= (1<<31);
 710
 711         if (rt_is_output_route(rt) ||
 712             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 713                 score |= (1<<30);
 714
 715         return score;
 716 }
 717
 718 static inline bool rt_caching(const struct net *net)
 719 {
 720         return net->ipv4.current_rt_cache_rebuild_count <=
 721                 net->ipv4.sysctl_rt_cache_rebuild_count;
 722 }
 723
 724 static inline bool compare_hash_inputs(const struct rtable *rt1,
 725                                        const struct rtable *rt2)
 726 {
 727         return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
 728                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
 729                 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
 730 }
 731
 732 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
 733 {
 734         return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
 735                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
 736                 (rt1->rt_mark ^ rt2->rt_mark) |
 737                 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
 738                 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
 739                 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
 740 }
 741
 742 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
 743 {
 744         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
 745 }
 746
 747 static inline int rt_is_expired(struct rtable *rth)
 748 {
 749         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
 750 }
 751
 752 /*
 753  * Perform a full scan of hash table and free all entries.
 754  * Can be called by a softirq or a process.
 755  * In the later case, we want to be reschedule if necessary
 756  */
 757 static void rt_do_flush(struct net *net, int process_context)
 758 {
 759         unsigned int i;
 760         struct rtable *rth, *next;
 761
 762         for (i = 0; i <= rt_hash_mask; i++) {
 763                 struct rtable __rcu **pprev;
 764                 struct rtable *list;
 765
 766                 if (process_context && need_resched())
 767                         cond_resched();
 768                 rth = rcu_dereference_raw(rt_hash_table[i].chain);
 769                 if (!rth)
 770                         continue;
 771
 772                 spin_lock_bh(rt_hash_lock_addr(i));
 773
 774                 list = NULL;
 775                 pprev = &rt_hash_table[i].chain;
 776                 rth = rcu_dereference_protected(*pprev,
 777                         lockdep_is_held(rt_hash_lock_addr(i)));
 778
 779                 while (rth) {
 780                         next = rcu_dereference_protected(rth->dst.rt_next,
 781                                 lockdep_is_held(rt_hash_lock_addr(i)));
 782
 783                         if (!net ||
 784                             net_eq(dev_net(rth->dst.dev), net)) {
 785                                 rcu_assign_pointer(*pprev, next);
 786                                 rcu_assign_pointer(rth->dst.rt_next, list);
 787                                 list = rth;
 788                         } else {
 789                                 pprev = &rth->dst.rt_next;
 790                         }
 791                         rth = next;
 792                 }
 793
 794                 spin_unlock_bh(rt_hash_lock_addr(i));
 795
 796                 for (; list; list = next) {
 797                         next = rcu_dereference_protected(list->dst.rt_next, 1);
 798                         rt_free(list);
 799                 }
 800         }
 801 }
 802
 803 /*
 804  * While freeing expired entries, we compute average chain length
 805  * and standard deviation, using fixed-point arithmetic.
 806  * This to have an estimation of rt_chain_length_max
 807  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
 808  * We use 3 bits for frational part, and 29 (or 61) for magnitude.
 809  */
 810
 811 #define FRACT_BITS 3
 812 #define ONE (1UL << FRACT_BITS)
 813
 814 /*
 815  * Given a hash chain and an item in this hash chain,
 816  * find if a previous entry has the same hash_inputs
 817  * (but differs on tos, mark or oif)
 818  * Returns 0 if an alias is found.
 819  * Returns ONE if rth has no alias before itself.
 820  */
 821 static int has_noalias(const struct rtable *head, const struct rtable *rth)
 822 {
 823         const struct rtable *aux = head;
 824
 825         while (aux != rth) {
 826                 if (compare_hash_inputs(aux, rth))
 827                         return 0;
 828                 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
 829         }
 830         return ONE;
 831 }
 832
 833 static void rt_check_expire(void)
 834 {
 835         static unsigned int rover;
 836         unsigned int i = rover, goal;
 837         struct rtable *rth;
 838         struct rtable __rcu **rthp;
 839         unsigned long samples = 0;
 840         unsigned long sum = 0, sum2 = 0;
 841         unsigned long delta;
 842         u64 mult;
 843
 844         delta = jiffies - expires_ljiffies;
 845         expires_ljiffies = jiffies;
 846         mult = ((u64)delta) << rt_hash_log;
 847         if (ip_rt_gc_timeout > 1)
 848                 do_div(mult, ip_rt_gc_timeout);
 849         goal = (unsigned int)mult;
 850         if (goal > rt_hash_mask)
 851                 goal = rt_hash_mask + 1;
 852         for (; goal > 0; goal--) {
 853                 unsigned long tmo = ip_rt_gc_timeout;
 854                 unsigned long length;
 855
 856                 i = (i + 1) & rt_hash_mask;
 857                 rthp = &rt_hash_table[i].chain;
 858
 859                 if (need_resched())
 860                         cond_resched();
 861
 862                 samples++;
 863
 864                 if (rcu_dereference_raw(*rthp) == NULL)
 865                         continue;
 866                 length = 0;
 867                 spin_lock_bh(rt_hash_lock_addr(i));
 868                 while ((rth = rcu_dereference_protected(*rthp,
 869                                         lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
 870                         prefetch(rth->dst.rt_next);
 871                         if (rt_is_expired(rth)) {
 872                                 *rthp = rth->dst.rt_next;
 873                                 rt_free(rth);
 874                                 continue;
 875                         }
 876                         if (rth->dst.expires) {
 877                                 /* Entry is expired even if it is in use */
 878                                 if (time_before_eq(jiffies, rth->dst.expires)) {
 879 nofree:
 880                                         tmo >>= 1;
 881                                         rthp = &rth->dst.rt_next;
 882                                         /*
 883                                          * We only count entries on
 884                                          * a chain with equal hash inputs once
 885                                          * so that entries for different QOS
 886                                          * levels, and other non-hash input
 887                                          * attributes don't unfairly skew
 888                                          * the length computation
 889                                          */
 890                                         length += has_noalias(rt_hash_table[i].chain, rth);
 891                                         continue;
 892                                 }
 893                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
 894                                 goto nofree;
 895
 896                         /* Cleanup aged off entries. */
 897                         *rthp = rth->dst.rt_next;
 898                         rt_free(rth);
 899                 }
 900                 spin_unlock_bh(rt_hash_lock_addr(i));
 901                 sum += length;
 902                 sum2 += length*length;
 903         }
 904         if (samples) {
 905                 unsigned long avg = sum / samples;
 906                 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
 907                 rt_chain_length_max = max_t(unsigned long,
 908                                         ip_rt_gc_elasticity,
 909                                         (avg + 4*sd) >> FRACT_BITS);
 910         }
 911         rover = i;
 912 }
 913
 914 /*
 915  * rt_worker_func() is run in process context.
 916  * we call rt_check_expire() to scan part of the hash table
 917  */
 918 static void rt_worker_func(struct work_struct *work)
 919 {
 920         rt_check_expire();
 921         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
 922 }
 923
 924 /*
 925  * Perturbation of rt_genid by a small quantity [1..256]
 926  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
 927  * many times (2^24) without giving recent rt_genid.
 928  * Jenkins hash is strong enough that litle changes of rt_genid are OK.
 929  */
 930 static void rt_cache_invalidate(struct net *net)
 931 {
 932         unsigned char shuffle;
 933
 934         get_random_bytes(&shuffle, sizeof(shuffle));
 935         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
 936 }
 937
 938 /*
 939  * delay < 0  : invalidate cache (fast : entries will be deleted later)
 940  * delay >= 0 : invalidate & flush cache (can be long)
 941  */
 942 void rt_cache_flush(struct net *net, int delay)
 943 {
 944         rt_cache_invalidate(net);
 945         if (delay >= 0)
 946                 rt_do_flush(net, !in_softirq());
 947 }
 948
 949 /* Flush previous cache invalidated entries from the cache */
 950 void rt_cache_flush_batch(struct net *net)
 951 {
 952         rt_do_flush(net, !in_softirq());
 953 }
 954
 955 static void rt_emergency_hash_rebuild(struct net *net)
 956 {
 957         if (net_ratelimit())
 958                 printk(KERN_WARNING "Route hash chain too long!\n");
 959         rt_cache_invalidate(net);
 960 }
 961
 962 /*
 963    Short description of GC goals.
 964
 965    We want to build algorithm, which will keep routing cache
 966    at some equilibrium point, when number of aged off entries
 967    is kept approximately equal to newly generated ones.
 968
 969    Current expiration strength is variable "expire".
 970    We try to adjust it dynamically, so that if networking
 971    is idle expires is large enough to keep enough of warm entries,
 972    and when load increases it reduces to limit cache size.
 973  */
 974
 975 static int rt_garbage_collect(struct dst_ops *ops)
 976 {
 977         static unsigned long expire = RT_GC_TIMEOUT;
 978         static unsigned long last_gc;
 979         static int rover;
 980         static int equilibrium;
 981         struct rtable *rth;
 982         struct rtable __rcu **rthp;
 983         unsigned long now = jiffies;
 984         int goal;
 985         int entries = dst_entries_get_fast(&ipv4_dst_ops);
 986
 987         /*
 988          * Garbage collection is pretty expensive,
 989          * do not make it too frequently.
 990          */
 991
 992         RT_CACHE_STAT_INC(gc_total);
 993
 994         if (now - last_gc < ip_rt_gc_min_interval &&
 995             entries < ip_rt_max_size) {
 996                 RT_CACHE_STAT_INC(gc_ignored);
 997                 goto out;
 998         }
 999
1000         entries = dst_entries_get_slow(&ipv4_dst_ops);
1001         /* Calculate number of entries, which we want to expire now. */
1002         goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
1003         if (goal <= 0) {
1004                 if (equilibrium < ipv4_dst_ops.gc_thresh)
1005                         equilibrium = ipv4_dst_ops.gc_thresh;
1006                 goal = entries - equilibrium;
1007                 if (goal > 0) {
1008                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1009                         goal = entries - equilibrium;
1010                 }
1011         } else {
1012                 /* We are in dangerous area. Try to reduce cache really
1013                  * aggressively.
1014                  */
1015                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1016                 equilibrium = entries - goal;
1017         }
1018
1019         if (now - last_gc >= ip_rt_gc_min_interval)
1020                 last_gc = now;
1021
1022         if (goal <= 0) {
1023                 equilibrium += goal;
1024                 goto work_done;
1025         }
1026
1027         do {
1028                 int i, k;
1029
1030                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1031                         unsigned long tmo = expire;
1032
1033                         k = (k + 1) & rt_hash_mask;
1034                         rthp = &rt_hash_table[k].chain;
1035                         spin_lock_bh(rt_hash_lock_addr(k));
1036                         while ((rth = rcu_dereference_protected(*rthp,
1037                                         lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
1038                                 if (!rt_is_expired(rth) &&
1039                                         !rt_may_expire(rth, tmo, expire)) {
1040                                         tmo >>= 1;
1041                                         rthp = &rth->dst.rt_next;
1042                                         continue;
1043                                 }
1044                                 *rthp = rth->dst.rt_next;
1045                                 rt_free(rth);
1046                                 goal--;
1047                         }
1048                         spin_unlock_bh(rt_hash_lock_addr(k));
1049                         if (goal <= 0)
1050                                 break;
1051                 }
1052                 rover = k;
1053
1054                 if (goal <= 0)
1055                         goto work_done;
1056
1057                 /* Goal is not achieved. We stop process if:
1058
1059                    - if expire reduced to zero. Otherwise, expire is halfed.
1060                    - if table is not full.
1061                    - if we are called from interrupt.
1062                    - jiffies check is just fallback/debug loop breaker.
1063                      We will not spin here for long time in any case.
1064                  */
1065
1066                 RT_CACHE_STAT_INC(gc_goal_miss);
1067
1068                 if (expire == 0)
1069                         break;
1070
1071                 expire >>= 1;
1072
1073                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1074                         goto out;
1075         } while (!in_softirq() && time_before_eq(jiffies, now));
1076
1077         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1078                 goto out;
1079         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1080                 goto out;
1081         if (net_ratelimit())
1082                 printk(KERN_WARNING "dst cache overflow\n");
1083         RT_CACHE_STAT_INC(gc_dst_overflow);
1084         return 1;
1085
1086 work_done:
1087         expire += ip_rt_gc_min_interval;
1088         if (expire > ip_rt_gc_timeout ||
1089             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1090             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1091                 expire = ip_rt_gc_timeout;
1092 out:    return 0;
1093 }
1094
1095 /*
1096  * Returns number of entries in a hash chain that have different hash_inputs
1097  */
1098 static int slow_chain_length(const struct rtable *head)
1099 {
1100         int length = 0;
1101         const struct rtable *rth = head;
1102
1103         while (rth) {
1104                 length += has_noalias(head, rth);
1105                 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1106         }
1107         return length >> FRACT_BITS;
1108 }
1109
1110 static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1111                                      struct sk_buff *skb, int ifindex)
1112 {
1113         struct rtable   *rth, *cand;
1114         struct rtable __rcu **rthp, **candp;
1115         unsigned long   now;
1116         u32             min_score;
1117         int             chain_length;
1118         int attempts = !in_softirq();
1119
1120 restart:
1121         chain_length = 0;
1122         min_score = ~(u32)0;
1123         cand = NULL;
1124         candp = NULL;
1125         now = jiffies;
1126
1127         if (!rt_caching(dev_net(rt->dst.dev))) {
1128                 /*
1129                  * If we're not caching, just tell the caller we
1130                  * were successful and don't touch the route.  The
1131                  * caller hold the sole reference to the cache entry, and
1132                  * it will be released when the caller is done with it.
1133                  * If we drop it here, the callers have no way to resolve routes
1134                  * when we're not caching.  Instead, just point *rp at rt, so
1135                  * the caller gets a single use out of the route
1136                  * Note that we do rt_free on this new route entry, so that
1137                  * once its refcount hits zero, we are still able to reap it
1138                  * (Thanks Alexey)
1139                  * Note: To avoid expensive rcu stuff for this uncached dst,
1140                  * we set DST_NOCACHE so that dst_release() can free dst without
1141                  * waiting a grace period.
1142                  */
1143
1144                 rt->dst.flags |= DST_NOCACHE;
1145                 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1146                         int err = arp_bind_neighbour(&rt->dst);
1147                         if (err) {
1148                                 if (net_ratelimit())
1149                                         printk(KERN_WARNING
1150                                             "Neighbour table failure & not caching routes.\n");
1151                                 ip_rt_put(rt);
1152                                 return ERR_PTR(err);
1153                         }
1154                 }
1155
1156                 goto skip_hashing;
1157         }
1158
1159         rthp = &rt_hash_table[hash].chain;
1160
1161         spin_lock_bh(rt_hash_lock_addr(hash));
1162         while ((rth = rcu_dereference_protected(*rthp,
1163                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1164                 if (rt_is_expired(rth)) {
1165                         *rthp = rth->dst.rt_next;
1166                         rt_free(rth);
1167                         continue;
1168                 }
1169                 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1170                         /* Put it first */
1171                         *rthp = rth->dst.rt_next;
1172                         /*
1173                          * Since lookup is lockfree, the deletion
1174                          * must be visible to another weakly ordered CPU before
1175                          * the insertion at the start of the hash chain.
1176                          */
1177                         rcu_assign_pointer(rth->dst.rt_next,
1178                                            rt_hash_table[hash].chain);
1179                         /*
1180                          * Since lookup is lockfree, the update writes
1181                          * must be ordered for consistency on SMP.
1182                          */
1183                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1184
1185                         dst_use(&rth->dst, now);
1186                         spin_unlock_bh(rt_hash_lock_addr(hash));
1187
1188                         rt_drop(rt);
1189                         if (skb)
1190                                 skb_dst_set(skb, &rth->dst);
1191                         return rth;
1192                 }
1193
1194                 if (!atomic_read(&rth->dst.__refcnt)) {
1195                         u32 score = rt_score(rth);
1196
1197                         if (score <= min_score) {
1198                                 cand = rth;
1199                                 candp = rthp;
1200                                 min_score = score;
1201                         }
1202                 }
1203
1204                 chain_length++;
1205
1206                 rthp = &rth->dst.rt_next;
1207         }
1208
1209         if (cand) {
1210                 /* ip_rt_gc_elasticity used to be average length of chain
1211                  * length, when exceeded gc becomes really aggressive.
1212                  *
1213                  * The second limit is less certain. At the moment it allows
1214                  * only 2 entries per bucket. We will see.
1215                  */
1216                 if (chain_length > ip_rt_gc_elasticity) {
1217                         *candp = cand->dst.rt_next;
1218                         rt_free(cand);
1219                 }
1220         } else {
1221                 if (chain_length > rt_chain_length_max &&
1222                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1223                         struct net *net = dev_net(rt->dst.dev);
1224                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1225                         if (!rt_caching(net)) {
1226                                 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1227                                         rt->dst.dev->name, num);
1228                         }
1229                         rt_emergency_hash_rebuild(net);
1230                         spin_unlock_bh(rt_hash_lock_addr(hash));
1231
1232                         hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1233                                         ifindex, rt_genid(net));
1234                         goto restart;
1235                 }
1236         }
1237
1238         /* Try to bind route to arp only if it is output
1239            route or unicast forwarding path.
1240          */
1241         if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1242                 int err = arp_bind_neighbour(&rt->dst);
1243                 if (err) {
1244                         spin_unlock_bh(rt_hash_lock_addr(hash));
1245
1246                         if (err != -ENOBUFS) {
1247                                 rt_drop(rt);
1248                                 return ERR_PTR(err);
1249                         }
1250
1251                         /* Neighbour tables are full and nothing
1252                            can be released. Try to shrink route cache,
1253                            it is most likely it holds some neighbour records.
1254                          */
1255                         if (attempts-- > 0) {
1256                                 int saved_elasticity = ip_rt_gc_elasticity;
1257                                 int saved_int = ip_rt_gc_min_interval;
1258                                 ip_rt_gc_elasticity     = 1;
1259                                 ip_rt_gc_min_interval   = 0;
1260                                 rt_garbage_collect(&ipv4_dst_ops);
1261                                 ip_rt_gc_min_interval   = saved_int;
1262                                 ip_rt_gc_elasticity     = saved_elasticity;
1263                                 goto restart;
1264                         }
1265
1266                         if (net_ratelimit())
1267                                 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1268                         rt_drop(rt);
1269                         return ERR_PTR(-ENOBUFS);
1270                 }
1271         }
1272
1273         rt->dst.rt_next = rt_hash_table[hash].chain;
1274
1275         /*
1276          * Since lookup is lockfree, we must make sure
1277          * previous writes to rt are committed to memory
1278          * before making rt visible to other CPUS.
1279          */
1280         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1281
1282         spin_unlock_bh(rt_hash_lock_addr(hash));
1283
1284 skip_hashing:
1285         if (skb)
1286                 skb_dst_set(skb, &rt->dst);
1287         return rt;
1288 }
1289
1290 static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1291
1292 static u32 rt_peer_genid(void)
1293 {
1294         return atomic_read(&__rt_peer_genid);
1295 }
1296
1297 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1298 {
1299         struct inet_peer *peer;
1300
1301         peer = inet_getpeer_v4(daddr, create);
1302
1303         if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1304                 inet_putpeer(peer);
1305         else
1306                 rt->rt_peer_genid = rt_peer_genid();
1307 }
1308
1309 /*
1310  * Peer allocation may fail only in serious out-of-memory conditions.  However
1311  * we still can generate some output.
1312  * Random ID selection looks a bit dangerous because we have no chances to
1313  * select ID being unique in a reasonable period of time.
1314  * But broken packet identifier may be better than no packet at all.
1315  */
1316 static void ip_select_fb_ident(struct iphdr *iph)
1317 {
1318         static DEFINE_SPINLOCK(ip_fb_id_lock);
1319         static u32 ip_fallback_id;
1320         u32 salt;
1321
1322         spin_lock_bh(&ip_fb_id_lock);
1323         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1324         iph->id = htons(salt & 0xFFFF);
1325         ip_fallback_id = salt;
1326         spin_unlock_bh(&ip_fb_id_lock);
1327 }
1328
1329 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1330 {
1331         struct rtable *rt = (struct rtable *) dst;
1332
1333         if (rt) {
1334                 if (rt->peer == NULL)
1335                         rt_bind_peer(rt, rt->rt_dst, 1);
1336
1337                 /* If peer is attached to destination, it is never detached,
1338                    so that we need not to grab a lock to dereference it.
1339                  */
1340                 if (rt->peer) {
1341                         iph->id = htons(inet_getid(rt->peer, more));
1342                         return;
1343                 }
1344         } else
1345                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1346                        __builtin_return_address(0));
1347
1348         ip_select_fb_ident(iph);
1349 }
1350 EXPORT_SYMBOL(__ip_select_ident);
1351
1352 static void rt_del(unsigned hash, struct rtable *rt)
1353 {
1354         struct rtable __rcu **rthp;
1355         struct rtable *aux;
1356
1357         rthp = &rt_hash_table[hash].chain;
1358         spin_lock_bh(rt_hash_lock_addr(hash));
1359         ip_rt_put(rt);
1360         while ((aux = rcu_dereference_protected(*rthp,
1361                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1362                 if (aux == rt || rt_is_expired(aux)) {
1363                         *rthp = aux->dst.rt_next;
1364                         rt_free(aux);
1365                         continue;
1366                 }
1367                 rthp = &aux->dst.rt_next;
1368         }
1369         spin_unlock_bh(rt_hash_lock_addr(hash));
1370 }
1371
1372 static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1373 {
1374         struct rtable *rt = (struct rtable *) dst;
1375         __be32 orig_gw = rt->rt_gateway;
1376         struct neighbour *n, *old_n;
1377
1378         dst_confirm(&rt->dst);
1379
1380         rt->rt_gateway = peer->redirect_learned.a4;
1381         n = __arp_bind_neighbour(&rt->dst, rt->rt_gateway);
1382         if (IS_ERR(n))
1383                 return PTR_ERR(n);
1384         old_n = xchg(&rt->dst._neighbour, n);
1385         if (old_n)
1386                 neigh_release(old_n);
1387         if (!n || !(n->nud_state & NUD_VALID)) {
1388                 if (n)
1389                         neigh_event_send(n, NULL);
1390                 rt->rt_gateway = orig_gw;
1391                 return -EAGAIN;
1392         } else {
1393                 rt->rt_flags |= RTCF_REDIRECTED;
1394                 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1395         }
1396         return 0;
1397 }
1398
1399 /* called in rcu_read_lock() section */
1400 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1401                     __be32 saddr, struct net_device *dev)
1402 {
1403         int s, i;
1404         struct in_device *in_dev = __in_dev_get_rcu(dev);
1405         __be32 skeys[2] = { saddr, 0 };
1406         int    ikeys[2] = { dev->ifindex, 0 };
1407         struct inet_peer *peer;
1408         struct net *net;
1409
1410         if (!in_dev)
1411                 return;
1412
1413         net = dev_net(dev);
1414         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1415             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1416             ipv4_is_zeronet(new_gw))
1417                 goto reject_redirect;
1418
1419         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1420                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1421                         goto reject_redirect;
1422                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1423                         goto reject_redirect;
1424         } else {
1425                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1426                         goto reject_redirect;
1427         }
1428
1429         for (s = 0; s < 2; s++) {
1430                 for (i = 0; i < 2; i++) {
1431                         unsigned int hash;
1432                         struct rtable __rcu **rthp;
1433                         struct rtable *rt;
1434
1435                         hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
1436
1437                         rthp = &rt_hash_table[hash].chain;
1438
1439                         while ((rt = rcu_dereference(*rthp)) != NULL) {
1440                                 rthp = &rt->dst.rt_next;
1441
1442                                 if (rt->rt_key_dst != daddr ||
1443                                     rt->rt_key_src != skeys[s] ||
1444                                     rt->rt_oif != ikeys[i] ||
1445                                     rt_is_input_route(rt) ||
1446                                     rt_is_expired(rt) ||
1447                                     !net_eq(dev_net(rt->dst.dev), net) ||
1448                                     rt->dst.error ||
1449                                     rt->dst.dev != dev ||
1450                                     rt->rt_gateway != old_gw)
1451                                         continue;
1452
1453                                 if (!rt->peer)
1454                                         rt_bind_peer(rt, rt->rt_dst, 1);
1455
1456                                 peer = rt->peer;
1457                                 if (peer) {
1458                                         if (peer->redirect_learned.a4 != new_gw) {
1459                                                 peer->redirect_learned.a4 = new_gw;
1460                                                 atomic_inc(&__rt_peer_genid);
1461                                         }
1462                                         check_peer_redir(&rt->dst, peer);
1463                                 }
1464                         }
1465                 }
1466         }
1467         return;
1468
1469 reject_redirect:
1470 #ifdef CONFIG_IP_ROUTE_VERBOSE
1471         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1472                 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1473                         "  Advised path = %pI4 -> %pI4\n",
1474                        &old_gw, dev->name, &new_gw,
1475                        &saddr, &daddr);
1476 #endif
1477         ;
1478 }
1479
1480 static bool peer_pmtu_expired(struct inet_peer *peer)
1481 {
1482         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1483
1484         return orig &&
1485                time_after_eq(jiffies, orig) &&
1486                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1487 }
1488
1489 static bool peer_pmtu_cleaned(struct inet_peer *peer)
1490 {
1491         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1492
1493         return orig &&
1494                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1495 }
1496
1497 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1498 {
1499         struct rtable *rt = (struct rtable *)dst;
1500         struct dst_entry *ret = dst;
1501
1502         if (rt) {
1503                 if (dst->obsolete > 0) {
1504                         ip_rt_put(rt);
1505                         ret = NULL;
1506                 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1507                         unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1508                                                 rt->rt_oif,
1509                                                 rt_genid(dev_net(dst->dev)));
1510                         rt_del(hash, rt);
1511                         ret = NULL;
1512                 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1513                         dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1514                 }
1515         }
1516         return ret;
1517 }
1518
1519 /*
1520  * Algorithm:
1521  *      1. The first ip_rt_redirect_number redirects are sent
1522  *         with exponential backoff, then we stop sending them at all,
1523  *         assuming that the host ignores our redirects.
1524  *      2. If we did not see packets requiring redirects
1525  *         during ip_rt_redirect_silence, we assume that the host
1526  *         forgot redirected route and start to send redirects again.
1527  *
1528  * This algorithm is much cheaper and more intelligent than dumb load limiting
1529  * in icmp.c.
1530  *
1531  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1532  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1533  */
1534
1535 void ip_rt_send_redirect(struct sk_buff *skb)
1536 {
1537         struct rtable *rt = skb_rtable(skb);
1538         struct in_device *in_dev;
1539         struct inet_peer *peer;
1540         int log_martians;
1541
1542         rcu_read_lock();
1543         in_dev = __in_dev_get_rcu(rt->dst.dev);
1544         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1545                 rcu_read_unlock();
1546                 return;
1547         }
1548         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1549         rcu_read_unlock();
1550
1551         if (!rt->peer)
1552                 rt_bind_peer(rt, rt->rt_dst, 1);
1553         peer = rt->peer;
1554         if (!peer) {
1555                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1556                 return;
1557         }
1558
1559         /* No redirected packets during ip_rt_redirect_silence;
1560          * reset the algorithm.
1561          */
1562         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1563                 peer->rate_tokens = 0;
1564
1565         /* Too many ignored redirects; do not send anything
1566          * set dst.rate_last to the last seen redirected packet.
1567          */
1568         if (peer->rate_tokens >= ip_rt_redirect_number) {
1569                 peer->rate_last = jiffies;
1570                 return;
1571         }
1572
1573         /* Check for load limit; set rate_last to the latest sent
1574          * redirect.
1575          */
1576         if (peer->rate_tokens == 0 ||
1577             time_after(jiffies,
1578                        (peer->rate_last +
1579                         (ip_rt_redirect_load << peer->rate_tokens)))) {
1580                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1581                 peer->rate_last = jiffies;
1582                 ++peer->rate_tokens;
1583 #ifdef CONFIG_IP_ROUTE_VERBOSE
1584                 if (log_martians &&
1585                     peer->rate_tokens == ip_rt_redirect_number &&
1586                     net_ratelimit())
1587                         printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1588                                &ip_hdr(skb)->saddr, rt->rt_iif,
1589                                 &rt->rt_dst, &rt->rt_gateway);
1590 #endif
1591         }
1592 }
1593
1594 static int ip_error(struct sk_buff *skb)
1595 {
1596         struct rtable *rt = skb_rtable(skb);
1597         struct inet_peer *peer;
1598         unsigned long now;
1599         bool send;
1600         int code;
1601
1602         switch (rt->dst.error) {
1603                 case EINVAL:
1604                 default:
1605                         goto out;
1606                 case EHOSTUNREACH:
1607                         code = ICMP_HOST_UNREACH;
1608                         break;
1609                 case ENETUNREACH:
1610                         code = ICMP_NET_UNREACH;
1611                         IP_INC_STATS_BH(dev_net(rt->dst.dev),
1612                                         IPSTATS_MIB_INNOROUTES);
1613                         break;
1614                 case EACCES:
1615                         code = ICMP_PKT_FILTERED;
1616                         break;
1617         }
1618
1619         if (!rt->peer)
1620                 rt_bind_peer(rt, rt->rt_dst, 1);
1621         peer = rt->peer;
1622
1623         send = true;
1624         if (peer) {
1625                 now = jiffies;
1626                 peer->rate_tokens += now - peer->rate_last;
1627                 if (peer->rate_tokens > ip_rt_error_burst)
1628                         peer->rate_tokens = ip_rt_error_burst;
1629                 peer->rate_last = now;
1630                 if (peer->rate_tokens >= ip_rt_error_cost)
1631                         peer->rate_tokens -= ip_rt_error_cost;
1632                 else
1633                         send = false;
1634         }
1635         if (send)
1636                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1637
1638 out:    kfree_skb(skb);
1639         return 0;
1640 }
1641
1642 /*
1643  *      The last two values are not from the RFC but
1644  *      are needed for AMPRnet AX.25 paths.
1645  */
1646
1647 static const unsigned short mtu_plateau[] =
1648 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1649
1650 static inline unsigned short guess_mtu(unsigned short old_mtu)
1651 {
1652         int i;
1653
1654         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1655                 if (old_mtu > mtu_plateau[i])
1656                         return mtu_plateau[i];
1657         return 68;
1658 }
1659
1660 unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1661                                  unsigned short new_mtu,
1662                                  struct net_device *dev)
1663 {
1664         unsigned short old_mtu = ntohs(iph->tot_len);
1665         unsigned short est_mtu = 0;
1666         struct inet_peer *peer;
1667
1668         peer = inet_getpeer_v4(iph->daddr, 1);
1669         if (peer) {
1670                 unsigned short mtu = new_mtu;
1671
1672                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1673                         /* BSD 4.2 derived systems incorrectly adjust
1674                          * tot_len by the IP header length, and report
1675                          * a zero MTU in the ICMP message.
1676                          */
1677                         if (mtu == 0 &&
1678                             old_mtu >= 68 + (iph->ihl << 2))
1679                                 old_mtu -= iph->ihl << 2;
1680                         mtu = guess_mtu(old_mtu);
1681                 }
1682
1683                 if (mtu < ip_rt_min_pmtu)
1684                         mtu = ip_rt_min_pmtu;
1685                 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1686                         unsigned long pmtu_expires;
1687
1688                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1689                         if (!pmtu_expires)
1690                                 pmtu_expires = 1UL;
1691
1692                         est_mtu = mtu;
1693                         peer->pmtu_learned = mtu;
1694                         peer->pmtu_expires = pmtu_expires;
1695                 }
1696
1697                 inet_putpeer(peer);
1698
1699                 atomic_inc(&__rt_peer_genid);
1700         }
1701         return est_mtu ? : new_mtu;
1702 }
1703
1704 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1705 {
1706         unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1707
1708         if (!expires)
1709                 return;
1710         if (time_before(jiffies, expires)) {
1711                 u32 orig_dst_mtu = dst_mtu(dst);
1712                 if (peer->pmtu_learned < orig_dst_mtu) {
1713                         if (!peer->pmtu_orig)
1714                                 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1715                         dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1716                 }
1717         } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1718                 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1719 }
1720
1721 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1722 {
1723         struct rtable *rt = (struct rtable *) dst;
1724         struct inet_peer *peer;
1725
1726         dst_confirm(dst);
1727
1728         if (!rt->peer)
1729                 rt_bind_peer(rt, rt->rt_dst, 1);
1730         peer = rt->peer;
1731         if (peer) {
1732                 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1733
1734                 if (mtu < ip_rt_min_pmtu)
1735                         mtu = ip_rt_min_pmtu;
1736                 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1737
1738                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1739                         if (!pmtu_expires)
1740                                 pmtu_expires = 1UL;
1741
1742                         peer->pmtu_learned = mtu;
1743                         peer->pmtu_expires = pmtu_expires;
1744
1745                         atomic_inc(&__rt_peer_genid);
1746                         rt->rt_peer_genid = rt_peer_genid();
1747                 }
1748                 check_peer_pmtu(dst, peer);
1749         }
1750 }
1751
1752 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1753 {
1754         struct rtable *rt = (struct rtable *) dst;
1755
1756         if (rt_is_expired(rt))
1757                 return NULL;
1758         if (rt->rt_peer_genid != rt_peer_genid()) {
1759                 struct inet_peer *peer;
1760
1761                 if (!rt->peer)
1762                         rt_bind_peer(rt, rt->rt_dst, 0);
1763
1764                 peer = rt->peer;
1765                 if (peer) {
1766                         check_peer_pmtu(dst, peer);
1767
1768                         if (peer->redirect_learned.a4 &&
1769                             peer->redirect_learned.a4 != rt->rt_gateway) {
1770                                 if (check_peer_redir(dst, peer))
1771                                         return NULL;
1772                         }
1773                 }
1774
1775                 rt->rt_peer_genid = rt_peer_genid();
1776         }
1777         return dst;
1778 }
1779
1780 static void ipv4_dst_destroy(struct dst_entry *dst)
1781 {
1782         struct rtable *rt = (struct rtable *) dst;
1783         struct inet_peer *peer = rt->peer;
1784
1785         if (rt->fi) {
1786                 fib_info_put(rt->fi);
1787                 rt->fi = NULL;
1788         }
1789         if (peer) {
1790                 rt->peer = NULL;
1791                 inet_putpeer(peer);
1792         }
1793 }
1794
1795
1796 static void ipv4_link_failure(struct sk_buff *skb)
1797 {
1798         struct rtable *rt;
1799
1800         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1801
1802         rt = skb_rtable(skb);
1803         if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1804                 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1805 }
1806
1807 static int ip_rt_bug(struct sk_buff *skb)
1808 {
1809         printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1810                 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1811                 skb->dev ? skb->dev->name : "?");
1812         kfree_skb(skb);
1813         WARN_ON(1);
1814         return 0;
1815 }
1816
1817 /*
1818    We do not cache source address of outgoing interface,
1819    because it is used only by IP RR, TS and SRR options,
1820    so that it out of fast path.
1821
1822    BTW remember: "addr" is allowed to be not aligned
1823    in IP options!
1824  */
1825
1826 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1827 {
1828         __be32 src;
1829
1830         if (rt_is_output_route(rt))
1831                 src = ip_hdr(skb)->saddr;
1832         else {
1833                 struct fib_result res;
1834                 struct flowi4 fl4;
1835                 struct iphdr *iph;
1836
1837                 iph = ip_hdr(skb);
1838
1839                 memset(&fl4, 0, sizeof(fl4));
1840                 fl4.daddr = iph->daddr;
1841                 fl4.saddr = iph->saddr;
1842                 fl4.flowi4_tos = RT_TOS(iph->tos);
1843                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1844                 fl4.flowi4_iif = skb->dev->ifindex;
1845                 fl4.flowi4_mark = skb->mark;
1846
1847                 rcu_read_lock();
1848                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1849                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1850                 else
1851                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1852                                         RT_SCOPE_UNIVERSE);
1853                 rcu_read_unlock();
1854         }
1855         memcpy(addr, &src, 4);
1856 }
1857
1858 #ifdef CONFIG_IP_ROUTE_CLASSID
1859 static void set_class_tag(struct rtable *rt, u32 tag)
1860 {
1861         if (!(rt->dst.tclassid & 0xFFFF))
1862                 rt->dst.tclassid |= tag & 0xFFFF;
1863         if (!(rt->dst.tclassid & 0xFFFF0000))
1864                 rt->dst.tclassid |= tag & 0xFFFF0000;
1865 }
1866 #endif
1867
1868 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1869 {
1870         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1871
1872         if (advmss == 0) {
1873                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1874                                ip_rt_min_advmss);
1875                 if (advmss > 65535 - 40)
1876                         advmss = 65535 - 40;
1877         }
1878         return advmss;
1879 }
1880
1881 static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
1882 {
1883         unsigned int mtu = dst->dev->mtu;
1884
1885         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1886                 const struct rtable *rt = (const struct rtable *) dst;
1887
1888                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1889                         mtu = 576;
1890         }
1891
1892         if (mtu > IP_MAX_MTU)
1893                 mtu = IP_MAX_MTU;
1894
1895         return mtu;
1896 }
1897
1898 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1899                             struct fib_info *fi)
1900 {
1901         struct inet_peer *peer;
1902         int create = 0;
1903
1904         /* If a peer entry exists for this destination, we must hook
1905          * it up in order to get at cached metrics.
1906          */
1907         if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1908                 create = 1;
1909
1910         rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1911         if (peer) {
1912                 rt->rt_peer_genid = rt_peer_genid();
1913                 if (inet_metrics_new(peer))
1914                         memcpy(peer->metrics, fi->fib_metrics,
1915                                sizeof(u32) * RTAX_MAX);
1916                 dst_init_metrics(&rt->dst, peer->metrics, false);
1917
1918                 check_peer_pmtu(&rt->dst, peer);
1919                 if (peer->redirect_learned.a4 &&
1920                     peer->redirect_learned.a4 != rt->rt_gateway) {
1921                         rt->rt_gateway = peer->redirect_learned.a4;
1922                         rt->rt_flags |= RTCF_REDIRECTED;
1923                 }
1924         } else {
1925                 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1926                         rt->fi = fi;
1927                         atomic_inc(&fi->fib_clntref);
1928                 }
1929                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1930         }
1931 }
1932
1933 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1934                            const struct fib_result *res,
1935                            struct fib_info *fi, u16 type, u32 itag)
1936 {
1937         struct dst_entry *dst = &rt->dst;
1938
1939         if (fi) {
1940                 if (FIB_RES_GW(*res) &&
1941                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1942                         rt->rt_gateway = FIB_RES_GW(*res);
1943                 rt_init_metrics(rt, fl4, fi);
1944 #ifdef CONFIG_IP_ROUTE_CLASSID
1945                 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1946 #endif
1947         }
1948
1949         if (dst_mtu(dst) > IP_MAX_MTU)
1950                 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1951         if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1952                 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1953
1954 #ifdef CONFIG_IP_ROUTE_CLASSID
1955 #ifdef CONFIG_IP_MULTIPLE_TABLES
1956         set_class_tag(rt, fib_rules_tclass(res));
1957 #endif
1958         set_class_tag(rt, itag);
1959 #endif
1960 }
1961
1962 static struct rtable *rt_dst_alloc(struct net_device *dev,
1963                                    bool nopolicy, bool noxfrm)
1964 {
1965         return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1966                          DST_HOST |
1967                          (nopolicy ? DST_NOPOLICY : 0) |
1968                          (noxfrm ? DST_NOXFRM : 0));
1969 }
1970
1971 /* called in rcu_read_lock() section */
1972 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1973                                 u8 tos, struct net_device *dev, int our)
1974 {
1975         unsigned int hash;
1976         struct rtable *rth;
1977         __be32 spec_dst;
1978         struct in_device *in_dev = __in_dev_get_rcu(dev);
1979         u32 itag = 0;
1980         int err;
1981
1982         /* Primary sanity checks. */
1983
1984         if (in_dev == NULL)
1985                 return -EINVAL;
1986
1987         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1988             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1989                 goto e_inval;
1990
1991         if (ipv4_is_zeronet(saddr)) {
1992                 if (!ipv4_is_local_multicast(daddr))
1993                         goto e_inval;
1994                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1995         } else {
1996                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
1997                                           &itag);
1998                 if (err < 0)
1999                         goto e_err;
2000         }
2001         rth = rt_dst_alloc(init_net.loopback_dev,
2002                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2003         if (!rth)
2004                 goto e_nobufs;
2005
2006 #ifdef CONFIG_IP_ROUTE_CLASSID
2007         rth->dst.tclassid = itag;
2008 #endif
2009         rth->dst.output = ip_rt_bug;
2010
2011         rth->rt_key_dst = daddr;
2012         rth->rt_key_src = saddr;
2013         rth->rt_genid   = rt_genid(dev_net(dev));
2014         rth->rt_flags   = RTCF_MULTICAST;
2015         rth->rt_type    = RTN_MULTICAST;
2016         rth->rt_key_tos = tos;
2017         rth->rt_dst     = daddr;
2018         rth->rt_src     = saddr;
2019         rth->rt_route_iif = dev->ifindex;
2020         rth->rt_iif     = dev->ifindex;
2021         rth->rt_oif     = 0;
2022         rth->rt_mark    = skb->mark;
2023         rth->rt_gateway = daddr;
2024         rth->rt_spec_dst= spec_dst;
2025         rth->rt_peer_genid = 0;
2026         rth->peer = NULL;
2027         rth->fi = NULL;
2028         if (our) {
2029                 rth->dst.input= ip_local_deliver;
2030                 rth->rt_flags |= RTCF_LOCAL;
2031         }
2032
2033 #ifdef CONFIG_IP_MROUTE
2034         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
2035                 rth->dst.input = ip_mr_input;
2036 #endif
2037         RT_CACHE_STAT_INC(in_slow_mc);
2038
2039         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
2040         rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
2041         return IS_ERR(rth) ? PTR_ERR(rth) : 0;
2042
2043 e_nobufs:
2044         return -ENOBUFS;
2045 e_inval:
2046         return -EINVAL;
2047 e_err:
2048         return err;
2049 }
2050
2051
2052 static void ip_handle_martian_source(struct net_device *dev,
2053                                      struct in_device *in_dev,
2054                                      struct sk_buff *skb,
2055                                      __be32 daddr,
2056                                      __be32 saddr)
2057 {
2058         RT_CACHE_STAT_INC(in_martian_src);
2059 #ifdef CONFIG_IP_ROUTE_VERBOSE
2060         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2061                 /*
2062                  *      RFC1812 recommendation, if source is martian,
2063                  *      the only hint is MAC header.
2064                  */
2065                 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
2066                         &daddr, &saddr, dev->name);
2067                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2068                         int i;
2069                         const unsigned char *p = skb_mac_header(skb);
2070                         printk(KERN_WARNING "ll header: ");
2071                         for (i = 0; i < dev->hard_header_len; i++, p++) {
2072                                 printk("%02x", *p);
2073                                 if (i < (dev->hard_header_len - 1))
2074                                         printk(":");
2075                         }
2076                         printk("\n");
2077                 }
2078         }
2079 #endif
2080 }
2081
2082 /* called in rcu_read_lock() section */
2083 static int __mkroute_input(struct sk_buff *skb,
2084                            const struct fib_result *res,
2085                            struct in_device *in_dev,
2086                            __be32 daddr, __be32 saddr, u32 tos,
2087                            struct rtable **result)
2088 {
2089         struct rtable *rth;
2090         int err;
2091         struct in_device *out_dev;
2092         unsigned int flags = 0;
2093         __be32 spec_dst;
2094         u32 itag;
2095
2096         /* get a working reference to the output device */
2097         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2098         if (out_dev == NULL) {
2099                 if (net_ratelimit())
2100                         printk(KERN_CRIT "Bug in ip_route_input" \
2101                                "_slow(). Please, report\n");
2102                 return -EINVAL;
2103         }
2104
2105
2106         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2107                                   in_dev->dev, &spec_dst, &itag);
2108         if (err < 0) {
2109                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2110                                          saddr);
2111
2112                 goto cleanup;
2113         }
2114
2115         if (err)
2116                 flags |= RTCF_DIRECTSRC;
2117
2118         if (out_dev == in_dev && err &&
2119             (IN_DEV_SHARED_MEDIA(out_dev) ||
2120              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2121                 flags |= RTCF_DOREDIRECT;
2122
2123         if (skb->protocol != htons(ETH_P_IP)) {
2124                 /* Not IP (i.e. ARP). Do not create route, if it is
2125                  * invalid for proxy arp. DNAT routes are always valid.
2126                  *
2127                  * Proxy arp feature have been extended to allow, ARP
2128                  * replies back to the same interface, to support
2129                  * Private VLAN switch technologies. See arp.c.
2130                  */
2131                 if (out_dev == in_dev &&
2132                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2133                         err = -EINVAL;
2134                         goto cleanup;
2135                 }
2136         }
2137
2138         rth = rt_dst_alloc(out_dev->dev,
2139                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2140                            IN_DEV_CONF_GET(out_dev, NOXFRM));
2141         if (!rth) {
2142                 err = -ENOBUFS;
2143                 goto cleanup;
2144         }
2145
2146         rth->rt_key_dst = daddr;
2147         rth->rt_key_src = saddr;
2148         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2149         rth->rt_flags = flags;
2150         rth->rt_type = res->type;
2151         rth->rt_key_tos = tos;
2152         rth->rt_dst     = daddr;
2153         rth->rt_src     = saddr;
2154         rth->rt_route_iif = in_dev->dev->ifindex;
2155         rth->rt_iif     = in_dev->dev->ifindex;
2156         rth->rt_oif     = 0;
2157         rth->rt_mark    = skb->mark;
2158         rth->rt_gateway = daddr;
2159         rth->rt_spec_dst= spec_dst;
2160         rth->rt_peer_genid = 0;
2161         rth->peer = NULL;
2162         rth->fi = NULL;
2163
2164         rth->dst.input = ip_forward;
2165         rth->dst.output = ip_output;
2166
2167         rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2168
2169         *result = rth;
2170         err = 0;
2171  cleanup:
2172         return err;
2173 }
2174
2175 static int ip_mkroute_input(struct sk_buff *skb,
2176                             struct fib_result *res,
2177                             const struct flowi4 *fl4,
2178                             struct in_device *in_dev,
2179                             __be32 daddr, __be32 saddr, u32 tos)
2180 {
2181         struct rtable* rth = NULL;
2182         int err;
2183         unsigned hash;
2184
2185 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2186         if (res->fi && res->fi->fib_nhs > 1)
2187                 fib_select_multipath(res);
2188 #endif
2189
2190         /* create a routing cache entry */
2191         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2192         if (err)
2193                 return err;
2194
2195         /* put it into the cache */
2196         hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2197                        rt_genid(dev_net(rth->dst.dev)));
2198         rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2199         if (IS_ERR(rth))
2200                 return PTR_ERR(rth);
2201         return 0;
2202 }
2203
2204 /*
2205  *      NOTE. We drop all the packets that has local source
2206  *      addresses, because every properly looped back packet
2207  *      must have correct destination already attached by output routine.
2208  *
2209  *      Such approach solves two big problems:
2210  *      1. Not simplex devices are handled properly.
2211  *      2. IP spoofing attempts are filtered with 100% of guarantee.
2212  *      called with rcu_read_lock()
2213  */
2214
2215 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2216                                u8 tos, struct net_device *dev)
2217 {
2218         struct fib_result res;
2219         struct in_device *in_dev = __in_dev_get_rcu(dev);
2220         struct flowi4   fl4;
2221         unsigned        flags = 0;
2222         u32             itag = 0;
2223         struct rtable * rth;
2224         unsigned        hash;
2225         __be32          spec_dst;
2226         int             err = -EINVAL;
2227         struct net    * net = dev_net(dev);
2228
2229         /* IP on this device is disabled. */
2230
2231         if (!in_dev)
2232                 goto out;
2233
2234         /* Check for the most weird martians, which can be not detected
2235            by fib_lookup.
2236          */
2237
2238         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2239             ipv4_is_loopback(saddr))
2240                 goto martian_source;
2241
2242         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2243                 goto brd_input;
2244
2245         /* Accept zero addresses only to limited broadcast;
2246          * I even do not know to fix it or not. Waiting for complains :-)
2247          */
2248         if (ipv4_is_zeronet(saddr))
2249                 goto martian_source;
2250
2251         if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2252                 goto martian_destination;
2253
2254         /*
2255          *      Now we are ready to route packet.
2256          */
2257         fl4.flowi4_oif = 0;
2258         fl4.flowi4_iif = dev->ifindex;
2259         fl4.flowi4_mark = skb->mark;
2260         fl4.flowi4_tos = tos;
2261         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2262         fl4.daddr = daddr;
2263         fl4.saddr = saddr;
2264         err = fib_lookup(net, &fl4, &res);
2265         if (err != 0) {
2266                 if (!IN_DEV_FORWARD(in_dev))
2267                         goto e_hostunreach;
2268                 goto no_route;
2269         }
2270
2271         RT_CACHE_STAT_INC(in_slow_tot);
2272
2273         if (res.type == RTN_BROADCAST)
2274                 goto brd_input;
2275
2276         if (res.type == RTN_LOCAL) {
2277                 err = fib_validate_source(skb, saddr, daddr, tos,
2278                                           net->loopback_dev->ifindex,
2279                                           dev, &spec_dst, &itag);
2280                 if (err < 0)
2281                         goto martian_source_keep_err;
2282                 if (err)
2283                         flags |= RTCF_DIRECTSRC;
2284                 spec_dst = daddr;
2285                 goto local_input;
2286         }
2287
2288         if (!IN_DEV_FORWARD(in_dev))
2289                 goto e_hostunreach;
2290         if (res.type != RTN_UNICAST)
2291                 goto martian_destination;
2292
2293         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2294 out:    return err;
2295
2296 brd_input:
2297         if (skb->protocol != htons(ETH_P_IP))
2298                 goto e_inval;
2299
2300         if (ipv4_is_zeronet(saddr))
2301                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2302         else {
2303                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2304                                           &itag);
2305                 if (err < 0)
2306                         goto martian_source_keep_err;
2307                 if (err)
2308                         flags |= RTCF_DIRECTSRC;
2309         }
2310         flags |= RTCF_BROADCAST;
2311         res.type = RTN_BROADCAST;
2312         RT_CACHE_STAT_INC(in_brd);
2313
2314 local_input:
2315         rth = rt_dst_alloc(net->loopback_dev,
2316                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2317         if (!rth)
2318                 goto e_nobufs;
2319
2320         rth->dst.input= ip_local_deliver;
2321         rth->dst.output= ip_rt_bug;
2322 #ifdef CONFIG_IP_ROUTE_CLASSID
2323         rth->dst.tclassid = itag;
2324 #endif
2325
2326         rth->rt_key_dst = daddr;
2327         rth->rt_key_src = saddr;
2328         rth->rt_genid = rt_genid(net);
2329         rth->rt_flags   = flags|RTCF_LOCAL;
2330         rth->rt_type    = res.type;
2331         rth->rt_key_tos = tos;
2332         rth->rt_dst     = daddr;
2333         rth->rt_src     = saddr;
2334 #ifdef CONFIG_IP_ROUTE_CLASSID
2335         rth->dst.tclassid = itag;
2336 #endif
2337         rth->rt_route_iif = dev->ifindex;
2338         rth->rt_iif     = dev->ifindex;
2339         rth->rt_oif     = 0;
2340         rth->rt_mark    = skb->mark;
2341         rth->rt_gateway = daddr;
2342         rth->rt_spec_dst= spec_dst;
2343         rth->rt_peer_genid = 0;
2344         rth->peer = NULL;
2345         rth->fi = NULL;
2346         if (res.type == RTN_UNREACHABLE) {
2347                 rth->dst.input= ip_error;
2348                 rth->dst.error= -err;
2349                 rth->rt_flags   &= ~RTCF_LOCAL;
2350         }
2351         hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2352         rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2353         err = 0;
2354         if (IS_ERR(rth))
2355                 err = PTR_ERR(rth);
2356         goto out;
2357
2358 no_route:
2359         RT_CACHE_STAT_INC(in_no_route);
2360         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2361         res.type = RTN_UNREACHABLE;
2362         if (err == -ESRCH)
2363                 err = -ENETUNREACH;
2364         goto local_input;
2365
2366         /*
2367          *      Do not cache martian addresses: they should be logged (RFC1812)
2368          */
2369 martian_destination:
2370         RT_CACHE_STAT_INC(in_martian_dst);
2371 #ifdef CONFIG_IP_ROUTE_VERBOSE
2372         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2373                 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2374                         &daddr, &saddr, dev->name);
2375 #endif
2376
2377 e_hostunreach:
2378         err = -EHOSTUNREACH;
2379         goto out;
2380
2381 e_inval:
2382         err = -EINVAL;
2383         goto out;
2384
2385 e_nobufs:
2386         err = -ENOBUFS;
2387         goto out;
2388
2389 martian_source:
2390         err = -EINVAL;
2391 martian_source_keep_err:
2392         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2393         goto out;
2394 }
2395
2396 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2397                            u8 tos, struct net_device *dev, bool noref)
2398 {
2399         struct rtable * rth;
2400         unsigned        hash;
2401         int iif = dev->ifindex;
2402         struct net *net;
2403         int res;
2404
2405         net = dev_net(dev);
2406
2407         rcu_read_lock();
2408
2409         if (!rt_caching(net))
2410                 goto skip_cache;
2411
2412         tos &= IPTOS_RT_MASK;
2413         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2414
2415         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2416              rth = rcu_dereference(rth->dst.rt_next)) {
2417                 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2418                      ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2419                      (rth->rt_route_iif ^ iif) |
2420                      (rth->rt_key_tos ^ tos)) == 0 &&
2421                     rth->rt_mark == skb->mark &&
2422                     net_eq(dev_net(rth->dst.dev), net) &&
2423                     !rt_is_expired(rth)) {
2424                         if (noref) {
2425                                 dst_use_noref(&rth->dst, jiffies);
2426                                 skb_dst_set_noref(skb, &rth->dst);
2427                         } else {
2428                                 dst_use(&rth->dst, jiffies);
2429                                 skb_dst_set(skb, &rth->dst);
2430                         }
2431                         RT_CACHE_STAT_INC(in_hit);
2432                         rcu_read_unlock();
2433                         return 0;
2434                 }
2435                 RT_CACHE_STAT_INC(in_hlist_search);
2436         }
2437
2438 skip_cache:
2439         /* Multicast recognition logic is moved from route cache to here.
2440            The problem was that too many Ethernet cards have broken/missing
2441            hardware multicast filters :-( As result the host on multicasting
2442            network acquires a lot of useless route cache entries, sort of
2443            SDR messages from all the world. Now we try to get rid of them.
2444            Really, provided software IP multicast filter is organized
2445            reasonably (at least, hashed), it does not result in a slowdown
2446            comparing with route cache reject entries.
2447            Note, that multicast routers are not affected, because
2448            route cache entry is created eventually.
2449          */
2450         if (ipv4_is_multicast(daddr)) {
2451                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2452
2453                 if (in_dev) {
2454                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2455                                                   ip_hdr(skb)->protocol);
2456                         if (our
2457 #ifdef CONFIG_IP_MROUTE
2458                                 ||
2459                             (!ipv4_is_local_multicast(daddr) &&
2460                              IN_DEV_MFORWARD(in_dev))
2461 #endif
2462                            ) {
2463                                 int res = ip_route_input_mc(skb, daddr, saddr,
2464                                                             tos, dev, our);
2465                                 rcu_read_unlock();
2466                                 return res;
2467                         }
2468                 }
2469                 rcu_read_unlock();
2470                 return -EINVAL;
2471         }
2472         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2473         rcu_read_unlock();
2474         return res;
2475 }
2476 EXPORT_SYMBOL(ip_route_input_common);
2477
2478 /* called with rcu_read_lock() */
2479 static struct rtable *__mkroute_output(const struct fib_result *res,
2480                                        const struct flowi4 *fl4,
2481                                        __be32 orig_daddr, __be32 orig_saddr,
2482                                        int orig_oif, struct net_device *dev_out,
2483                                        unsigned int flags)
2484 {
2485         struct fib_info *fi = res->fi;
2486         u32 tos = RT_FL_TOS(fl4);
2487         struct in_device *in_dev;
2488         u16 type = res->type;
2489         struct rtable *rth;
2490
2491         if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2492                 return ERR_PTR(-EINVAL);
2493
2494         if (ipv4_is_lbcast(fl4->daddr))
2495                 type = RTN_BROADCAST;
2496         else if (ipv4_is_multicast(fl4->daddr))
2497                 type = RTN_MULTICAST;
2498         else if (ipv4_is_zeronet(fl4->daddr))
2499                 return ERR_PTR(-EINVAL);
2500
2501         if (dev_out->flags & IFF_LOOPBACK)
2502                 flags |= RTCF_LOCAL;
2503
2504         in_dev = __in_dev_get_rcu(dev_out);
2505         if (!in_dev)
2506                 return ERR_PTR(-EINVAL);
2507
2508         if (type == RTN_BROADCAST) {
2509                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2510                 fi = NULL;
2511         } else if (type == RTN_MULTICAST) {
2512                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2513                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2514                                      fl4->flowi4_proto))
2515                         flags &= ~RTCF_LOCAL;
2516                 /* If multicast route do not exist use
2517                  * default one, but do not gateway in this case.
2518                  * Yes, it is hack.
2519                  */
2520                 if (fi && res->prefixlen < 4)
2521                         fi = NULL;
2522         }
2523
2524         rth = rt_dst_alloc(dev_out,
2525                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2526                            IN_DEV_CONF_GET(in_dev, NOXFRM));
2527         if (!rth)
2528                 return ERR_PTR(-ENOBUFS);
2529
2530         rth->dst.output = ip_output;
2531
2532         rth->rt_key_dst = orig_daddr;
2533         rth->rt_key_src = orig_saddr;
2534         rth->rt_genid = rt_genid(dev_net(dev_out));
2535         rth->rt_flags   = flags;
2536         rth->rt_type    = type;
2537         rth->rt_key_tos = tos;
2538         rth->rt_dst     = fl4->daddr;
2539         rth->rt_src     = fl4->saddr;
2540         rth->rt_route_iif = 0;
2541         rth->rt_iif     = orig_oif ? : dev_out->ifindex;
2542         rth->rt_oif     = orig_oif;
2543         rth->rt_mark    = fl4->flowi4_mark;
2544         rth->rt_gateway = fl4->daddr;
2545         rth->rt_spec_dst= fl4->saddr;
2546         rth->rt_peer_genid = 0;
2547         rth->peer = NULL;
2548         rth->fi = NULL;
2549
2550         RT_CACHE_STAT_INC(out_slow_tot);
2551
2552         if (flags & RTCF_LOCAL) {
2553                 rth->dst.input = ip_local_deliver;
2554                 rth->rt_spec_dst = fl4->daddr;
2555         }
2556         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2557                 rth->rt_spec_dst = fl4->saddr;
2558                 if (flags & RTCF_LOCAL &&
2559                     !(dev_out->flags & IFF_LOOPBACK)) {
2560                         rth->dst.output = ip_mc_output;
2561                         RT_CACHE_STAT_INC(out_slow_mc);
2562                 }
2563 #ifdef CONFIG_IP_MROUTE
2564                 if (type == RTN_MULTICAST) {
2565                         if (IN_DEV_MFORWARD(in_dev) &&
2566                             !ipv4_is_local_multicast(fl4->daddr)) {
2567                                 rth->dst.input = ip_mr_input;
2568                                 rth->dst.output = ip_mc_output;
2569                         }
2570                 }
2571 #endif
2572         }
2573
2574         rt_set_nexthop(rth, fl4, res, fi, type, 0);
2575
2576         return rth;
2577 }
2578
2579 /*
2580  * Major route resolver routine.
2581  * called with rcu_read_lock();
2582  */
2583
2584 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2585 {
2586         struct net_device *dev_out = NULL;
2587         u32 tos = RT_FL_TOS(fl4);
2588         unsigned int flags = 0;
2589         struct fib_result res;
2590         struct rtable *rth;
2591         __be32 orig_daddr;
2592         __be32 orig_saddr;
2593         int orig_oif;
2594
2595         res.fi          = NULL;
2596 #ifdef CONFIG_IP_MULTIPLE_TABLES
2597         res.r           = NULL;
2598 #endif
2599
2600         orig_daddr = fl4->daddr;
2601         orig_saddr = fl4->saddr;
2602         orig_oif = fl4->flowi4_oif;
2603
2604         fl4->flowi4_iif = net->loopback_dev->ifindex;
2605         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2606         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2607                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2608
2609         rcu_read_lock();
2610         if (fl4->saddr) {
2611                 rth = ERR_PTR(-EINVAL);
2612                 if (ipv4_is_multicast(fl4->saddr) ||
2613                     ipv4_is_lbcast(fl4->saddr) ||
2614                     ipv4_is_zeronet(fl4->saddr))
2615                         goto out;
2616
2617                 /* I removed check for oif == dev_out->oif here.
2618                    It was wrong for two reasons:
2619                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2620                       is assigned to multiple interfaces.
2621                    2. Moreover, we are allowed to send packets with saddr
2622                       of another iface. --ANK
2623                  */
2624
2625                 if (fl4->flowi4_oif == 0 &&
2626                     (ipv4_is_multicast(fl4->daddr) ||
2627                      ipv4_is_lbcast(fl4->daddr))) {
2628                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2629                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2630                         if (dev_out == NULL)
2631                                 goto out;
2632
2633                         /* Special hack: user can direct multicasts
2634                            and limited broadcast via necessary interface
2635                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2636                            This hack is not just for fun, it allows
2637                            vic,vat and friends to work.
2638                            They bind socket to loopback, set ttl to zero
2639                            and expect that it will work.
2640                            From the viewpoint of routing cache they are broken,
2641                            because we are not allowed to build multicast path
2642                            with loopback source addr (look, routing cache
2643                            cannot know, that ttl is zero, so that packet
2644                            will not leave this host and route is valid).
2645                            Luckily, this hack is good workaround.
2646                          */
2647
2648                         fl4->flowi4_oif = dev_out->ifindex;
2649                         goto make_route;
2650                 }
2651
2652                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2653                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2654                         if (!__ip_dev_find(net, fl4->saddr, false))
2655                                 goto out;
2656                 }
2657         }
2658
2659
2660         if (fl4->flowi4_oif) {
2661                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2662                 rth = ERR_PTR(-ENODEV);
2663                 if (dev_out == NULL)
2664                         goto out;
2665
2666                 /* RACE: Check return value of inet_select_addr instead. */
2667                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2668                         rth = ERR_PTR(-ENETUNREACH);
2669                         goto out;
2670                 }
2671                 if (ipv4_is_local_multicast(fl4->daddr) ||
2672                     ipv4_is_lbcast(fl4->daddr)) {
2673                         if (!fl4->saddr)
2674                                 fl4->saddr = inet_select_addr(dev_out, 0,
2675                                                               RT_SCOPE_LINK);
2676                         goto make_route;
2677                 }
2678                 if (fl4->saddr) {
2679                         if (ipv4_is_multicast(fl4->daddr))
2680                                 fl4->saddr = inet_select_addr(dev_out, 0,
2681                                                               fl4->flowi4_scope);
2682                         else if (!fl4->daddr)
2683                                 fl4->saddr = inet_select_addr(dev_out, 0,
2684                                                               RT_SCOPE_HOST);
2685                 }
2686         }
2687
2688         if (!fl4->daddr) {
2689                 fl4->daddr = fl4->saddr;
2690                 if (!fl4->daddr)
2691                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2692                 dev_out = net->loopback_dev;
2693                 fl4->flowi4_oif = net->loopback_dev->ifindex;
2694                 res.type = RTN_LOCAL;
2695                 flags |= RTCF_LOCAL;
2696                 goto make_route;
2697         }
2698
2699         if (fib_lookup(net, fl4, &res)) {
2700                 res.fi = NULL;
2701                 if (fl4->flowi4_oif) {
2702                         /* Apparently, routing tables are wrong. Assume,
2703                            that the destination is on link.
2704
2705                            WHY? DW.
2706                            Because we are allowed to send to iface
2707                            even if it has NO routes and NO assigned
2708                            addresses. When oif is specified, routing
2709                            tables are looked up with only one purpose:
2710                            to catch if destination is gatewayed, rather than
2711                            direct. Moreover, if MSG_DONTROUTE is set,
2712                            we send packet, ignoring both routing tables
2713                            and ifaddr state. --ANK
2714
2715
2716                            We could make it even if oif is unknown,
2717                            likely IPv6, but we do not.
2718                          */
2719
2720                         if (fl4->saddr == 0)
2721                                 fl4->saddr = inet_select_addr(dev_out, 0,
2722                                                               RT_SCOPE_LINK);
2723                         res.type = RTN_UNICAST;
2724                         goto make_route;
2725                 }
2726                 rth = ERR_PTR(-ENETUNREACH);
2727                 goto out;
2728         }
2729
2730         if (res.type == RTN_LOCAL) {
2731                 if (!fl4->saddr) {
2732                         if (res.fi->fib_prefsrc)
2733                                 fl4->saddr = res.fi->fib_prefsrc;
2734                         else
2735                                 fl4->saddr = fl4->daddr;
2736                 }
2737                 dev_out = net->loopback_dev;
2738                 fl4->flowi4_oif = dev_out->ifindex;
2739                 res.fi = NULL;
2740                 flags |= RTCF_LOCAL;
2741                 goto make_route;
2742         }
2743
2744 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2745         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2746                 fib_select_multipath(&res);
2747         else
2748 #endif
2749         if (!res.prefixlen &&
2750             res.table->tb_num_default > 1 &&
2751             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2752                 fib_select_default(&res);
2753
2754         if (!fl4->saddr)
2755                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2756
2757         dev_out = FIB_RES_DEV(res);
2758         fl4->flowi4_oif = dev_out->ifindex;
2759
2760
2761 make_route:
2762         rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2763                                dev_out, flags);
2764         if (!IS_ERR(rth)) {
2765                 unsigned int hash;
2766
2767                 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2768                                rt_genid(dev_net(dev_out)));
2769                 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2770         }
2771
2772 out:
2773         rcu_read_unlock();
2774         return rth;
2775 }
2776
2777 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2778 {
2779         struct rtable *rth;
2780         unsigned int hash;
2781
2782         if (!rt_caching(net))
2783                 goto slow_output;
2784
2785         hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2786
2787         rcu_read_lock_bh();
2788         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2789                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2790                 if (rth->rt_key_dst == flp4->daddr &&
2791                     rth->rt_key_src == flp4->saddr &&
2792                     rt_is_output_route(rth) &&
2793                     rth->rt_oif == flp4->flowi4_oif &&
2794                     rth->rt_mark == flp4->flowi4_mark &&
2795                     !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2796                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2797                     net_eq(dev_net(rth->dst.dev), net) &&
2798                     !rt_is_expired(rth)) {
2799                         dst_use(&rth->dst, jiffies);
2800                         RT_CACHE_STAT_INC(out_hit);
2801                         rcu_read_unlock_bh();
2802                         if (!flp4->saddr)
2803                                 flp4->saddr = rth->rt_src;
2804                         if (!flp4->daddr)
2805                                 flp4->daddr = rth->rt_dst;
2806                         return rth;
2807                 }
2808                 RT_CACHE_STAT_INC(out_hlist_search);
2809         }
2810         rcu_read_unlock_bh();
2811
2812 slow_output:
2813         return ip_route_output_slow(net, flp4);
2814 }
2815 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2816
2817 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2818 {
2819         return NULL;
2820 }
2821
2822 static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst)
2823 {
2824         return 0;
2825 }
2826
2827 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2828 {
2829 }
2830
2831 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2832                                           unsigned long old)
2833 {
2834         return NULL;
2835 }
2836
2837 static struct dst_ops ipv4_dst_blackhole_ops = {
2838         .family                 =       AF_INET,
2839         .protocol               =       cpu_to_be16(ETH_P_IP),
2840         .destroy                =       ipv4_dst_destroy,
2841         .check                  =       ipv4_blackhole_dst_check,
2842         .default_mtu            =       ipv4_blackhole_default_mtu,
2843         .default_advmss         =       ipv4_default_advmss,
2844         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2845         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2846 };
2847
2848 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2849 {
2850         struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2851         struct rtable *ort = (struct rtable *) dst_orig;
2852
2853         if (rt) {
2854                 struct dst_entry *new = &rt->dst;
2855
2856                 new->__use = 1;
2857                 new->input = dst_discard;
2858                 new->output = dst_discard;
2859                 dst_copy_metrics(new, &ort->dst);
2860
2861                 new->dev = ort->dst.dev;
2862                 if (new->dev)
2863                         dev_hold(new->dev);
2864
2865                 rt->rt_key_dst = ort->rt_key_dst;
2866                 rt->rt_key_src = ort->rt_key_src;
2867                 rt->rt_key_tos = ort->rt_key_tos;
2868                 rt->rt_route_iif = ort->rt_route_iif;
2869                 rt->rt_iif = ort->rt_iif;
2870                 rt->rt_oif = ort->rt_oif;
2871                 rt->rt_mark = ort->rt_mark;
2872
2873                 rt->rt_genid = rt_genid(net);
2874                 rt->rt_flags = ort->rt_flags;
2875                 rt->rt_type = ort->rt_type;
2876                 rt->rt_dst = ort->rt_dst;
2877                 rt->rt_src = ort->rt_src;
2878                 rt->rt_gateway = ort->rt_gateway;
2879                 rt->rt_spec_dst = ort->rt_spec_dst;
2880                 rt->peer = ort->peer;
2881                 if (rt->peer)
2882                         atomic_inc(&rt->peer->refcnt);
2883                 rt->fi = ort->fi;
2884                 if (rt->fi)
2885                         atomic_inc(&rt->fi->fib_clntref);
2886
2887                 dst_free(new);
2888         }
2889
2890         dst_release(dst_orig);
2891
2892         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2893 }
2894
2895 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2896                                     struct sock *sk)
2897 {
2898         struct rtable *rt = __ip_route_output_key(net, flp4);
2899
2900         if (IS_ERR(rt))
2901                 return rt;
2902
2903         if (flp4->flowi4_proto)
2904                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2905                                                    flowi4_to_flowi(flp4),
2906                                                    sk, 0);
2907
2908         return rt;
2909 }
2910 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2911
2912 static int rt_fill_info(struct net *net,
2913                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2914                         int nowait, unsigned int flags)
2915 {
2916         struct rtable *rt = skb_rtable(skb);
2917         struct rtmsg *r;
2918         struct nlmsghdr *nlh;
2919         long expires = 0;
2920         const struct inet_peer *peer = rt->peer;
2921         u32 id = 0, ts = 0, tsage = 0, error;
2922
2923         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2924         if (nlh == NULL)
2925                 return -EMSGSIZE;
2926
2927         r = nlmsg_data(nlh);
2928         r->rtm_family    = AF_INET;
2929         r->rtm_dst_len  = 32;
2930         r->rtm_src_len  = 0;
2931         r->rtm_tos      = rt->rt_key_tos;
2932         r->rtm_table    = RT_TABLE_MAIN;
2933         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2934         r->rtm_type     = rt->rt_type;
2935         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2936         r->rtm_protocol = RTPROT_UNSPEC;
2937         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2938         if (rt->rt_flags & RTCF_NOTIFY)
2939                 r->rtm_flags |= RTM_F_NOTIFY;
2940
2941         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2942
2943         if (rt->rt_key_src) {
2944                 r->rtm_src_len = 32;
2945                 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
2946         }
2947         if (rt->dst.dev)
2948                 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2949 #ifdef CONFIG_IP_ROUTE_CLASSID
2950         if (rt->dst.tclassid)
2951                 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2952 #endif
2953         if (rt_is_input_route(rt))
2954                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2955         else if (rt->rt_src != rt->rt_key_src)
2956                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2957
2958         if (rt->rt_dst != rt->rt_gateway)
2959                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2960
2961         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2962                 goto nla_put_failure;
2963
2964         if (rt->rt_mark)
2965                 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
2966
2967         error = rt->dst.error;
2968         if (peer) {
2969                 inet_peer_refcheck(rt->peer);
2970                 id = atomic_read(&peer->ip_id_count) & 0xffff;
2971                 if (peer->tcp_ts_stamp) {
2972                         ts = peer->tcp_ts;
2973                         tsage = get_seconds() - peer->tcp_ts_stamp;
2974                 }
2975                 expires = ACCESS_ONCE(peer->pmtu_expires);
2976                 if (expires)
2977                         expires -= jiffies;
2978         }
2979
2980         if (rt_is_input_route(rt)) {
2981 #ifdef CONFIG_IP_MROUTE
2982                 __be32 dst = rt->rt_dst;
2983
2984                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2985                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2986                         int err = ipmr_get_route(net, skb,
2987                                                  rt->rt_src, rt->rt_dst,
2988                                                  r, nowait);
2989                         if (err <= 0) {
2990                                 if (!nowait) {
2991                                         if (err == 0)
2992                                                 return 0;
2993                                         goto nla_put_failure;
2994                                 } else {
2995                                         if (err == -EMSGSIZE)
2996                                                 goto nla_put_failure;
2997                                         error = err;
2998                                 }
2999                         }
3000                 } else
3001 #endif
3002                         NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
3003         }
3004
3005         if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
3006                                expires, error) < 0)
3007                 goto nla_put_failure;
3008
3009         return nlmsg_end(skb, nlh);
3010
3011 nla_put_failure:
3012         nlmsg_cancel(skb, nlh);
3013         return -EMSGSIZE;
3014 }
3015
3016 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
3017 {
3018         struct net *net = sock_net(in_skb->sk);
3019         struct rtmsg *rtm;
3020         struct nlattr *tb[RTA_MAX+1];
3021         struct rtable *rt = NULL;
3022         __be32 dst = 0;
3023         __be32 src = 0;
3024         u32 iif;
3025         int err;
3026         int mark;
3027         struct sk_buff *skb;
3028
3029         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
3030         if (err < 0)
3031                 goto errout;
3032
3033         rtm = nlmsg_data(nlh);
3034
3035         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3036         if (skb == NULL) {
3037                 err = -ENOBUFS;
3038                 goto errout;
3039         }
3040
3041         /* Reserve room for dummy headers, this skb can pass
3042            through good chunk of routing engine.
3043          */
3044         skb_reset_mac_header(skb);
3045         skb_reset_network_header(skb);
3046
3047         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
3048         ip_hdr(skb)->protocol = IPPROTO_ICMP;
3049         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3050
3051         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3052         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3053         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3054         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3055
3056         if (iif) {
3057                 struct net_device *dev;
3058
3059                 dev = __dev_get_by_index(net, iif);
3060                 if (dev == NULL) {
3061                         err = -ENODEV;
3062                         goto errout_free;
3063                 }
3064
3065                 skb->protocol   = htons(ETH_P_IP);
3066                 skb->dev        = dev;
3067                 skb->mark       = mark;
3068                 local_bh_disable();
3069                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3070                 local_bh_enable();
3071
3072                 rt = skb_rtable(skb);
3073                 if (err == 0 && rt->dst.error)
3074                         err = -rt->dst.error;
3075         } else {
3076                 struct flowi4 fl4 = {
3077                         .daddr = dst,
3078                         .saddr = src,
3079                         .flowi4_tos = rtm->rtm_tos,
3080                         .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3081                         .flowi4_mark = mark,
3082                 };
3083                 rt = ip_route_output_key(net, &fl4);
3084
3085                 err = 0;
3086                 if (IS_ERR(rt))
3087                         err = PTR_ERR(rt);
3088         }
3089
3090         if (err)
3091                 goto errout_free;
3092
3093         skb_dst_set(skb, &rt->dst);
3094         if (rtm->rtm_flags & RTM_F_NOTIFY)
3095                 rt->rt_flags |= RTCF_NOTIFY;
3096
3097         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3098                            RTM_NEWROUTE, 0, 0);
3099         if (err <= 0)
3100                 goto errout_free;
3101
3102         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3103 errout:
3104         return err;
3105
3106 errout_free:
3107         kfree_skb(skb);
3108         goto errout;
3109 }
3110
3111 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3112 {
3113         struct rtable *rt;
3114         int h, s_h;
3115         int idx, s_idx;
3116         struct net *net;
3117
3118         net = sock_net(skb->sk);
3119
3120         s_h = cb->args[0];
3121         if (s_h < 0)
3122                 s_h = 0;
3123         s_idx = idx = cb->args[1];
3124         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3125                 if (!rt_hash_table[h].chain)
3126                         continue;
3127                 rcu_read_lock_bh();
3128                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3129                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3130                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3131                                 continue;
3132                         if (rt_is_expired(rt))
3133                                 continue;
3134                         skb_dst_set_noref(skb, &rt->dst);
3135                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3136                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3137                                          1, NLM_F_MULTI) <= 0) {
3138                                 skb_dst_drop(skb);
3139                                 rcu_read_unlock_bh();
3140                                 goto done;
3141                         }
3142                         skb_dst_drop(skb);
3143                 }
3144                 rcu_read_unlock_bh();
3145         }
3146
3147 done:
3148         cb->args[0] = h;
3149         cb->args[1] = idx;
3150         return skb->len;
3151 }
3152
3153 void ip_rt_multicast_event(struct in_device *in_dev)
3154 {
3155         rt_cache_flush(dev_net(in_dev->dev), 0);
3156 }
3157
3158 #ifdef CONFIG_SYSCTL
3159 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3160                                         void __user *buffer,
3161                                         size_t *lenp, loff_t *ppos)
3162 {
3163         if (write) {
3164                 int flush_delay;
3165                 ctl_table ctl;
3166                 struct net *net;
3167
3168                 memcpy(&ctl, __ctl, sizeof(ctl));
3169                 ctl.data = &flush_delay;
3170                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3171
3172                 net = (struct net *)__ctl->extra1;
3173                 rt_cache_flush(net, flush_delay);
3174                 return 0;
3175         }
3176
3177         return -EINVAL;
3178 }
3179
3180 static ctl_table ipv4_route_table[] = {
3181         {
3182                 .procname       = "gc_thresh",
3183                 .data           = &ipv4_dst_ops.gc_thresh,
3184                 .maxlen         = sizeof(int),
3185                 .mode           = 0644,
3186                 .proc_handler   = proc_dointvec,
3187         },
3188         {
3189                 .procname       = "max_size",
3190                 .data           = &ip_rt_max_size,
3191                 .maxlen         = sizeof(int),
3192                 .mode           = 0644,
3193                 .proc_handler   = proc_dointvec,
3194         },
3195         {
3196                 /*  Deprecated. Use gc_min_interval_ms */
3197
3198                 .procname       = "gc_min_interval",
3199                 .data           = &ip_rt_gc_min_interval,
3200                 .maxlen         = sizeof(int),
3201                 .mode           = 0644,
3202                 .proc_handler   = proc_dointvec_jiffies,
3203         },
3204         {
3205                 .procname       = "gc_min_interval_ms",
3206                 .data           = &ip_rt_gc_min_interval,
3207                 .maxlen         = sizeof(int),
3208                 .mode           = 0644,
3209                 .proc_handler   = proc_dointvec_ms_jiffies,
3210         },
3211         {
3212                 .procname       = "gc_timeout",
3213                 .data           = &ip_rt_gc_timeout,
3214                 .maxlen         = sizeof(int),
3215                 .mode           = 0644,
3216                 .proc_handler   = proc_dointvec_jiffies,
3217         },
3218         {
3219                 .procname       = "gc_interval",
3220                 .data           = &ip_rt_gc_interval,
3221                 .maxlen         = sizeof(int),
3222                 .mode           = 0644,
3223                 .proc_handler   = proc_dointvec_jiffies,
3224         },
3225         {
3226                 .procname       = "gc_interval",
3227                 .data           = &ip_rt_gc_interval,
3228                 .maxlen         = sizeof(int),
3229                 .mode           = 0644,
3230                 .proc_handler   = proc_dointvec_jiffies,
3231         },
3232         {
3233                 .procname       = "redirect_load",
3234                 .data           = &ip_rt_redirect_load,
3235                 .maxlen         = sizeof(int),
3236                 .mode           = 0644,
3237                 .proc_handler   = proc_dointvec,
3238         },
3239         {
3240                 .procname       = "redirect_number",
3241                 .data           = &ip_rt_redirect_number,
3242                 .maxlen         = sizeof(int),
3243                 .mode           = 0644,
3244                 .proc_handler   = proc_dointvec,
3245         },
3246         {
3247                 .procname       = "redirect_silence",
3248                 .data           = &ip_rt_redirect_silence,
3249                 .maxlen         = sizeof(int),
3250                 .mode           = 0644,
3251                 .proc_handler   = proc_dointvec,
3252         },
3253         {
3254                 .procname       = "error_cost",
3255                 .data           = &ip_rt_error_cost,
3256                 .maxlen         = sizeof(int),
3257                 .mode           = 0644,
3258                 .proc_handler   = proc_dointvec,
3259         },
3260         {
3261                 .procname       = "error_burst",
3262                 .data           = &ip_rt_error_burst,
3263                 .maxlen         = sizeof(int),
3264                 .mode           = 0644,
3265                 .proc_handler   = proc_dointvec,
3266         },
3267         {
3268                 .procname       = "gc_elasticity",
3269                 .data           = &ip_rt_gc_elasticity,
3270                 .maxlen         = sizeof(int),
3271                 .mode           = 0644,
3272                 .proc_handler   = proc_dointvec,
3273         },
3274         {
3275                 .procname       = "mtu_expires",
3276                 .data           = &ip_rt_mtu_expires,
3277                 .maxlen         = sizeof(int),
3278                 .mode           = 0644,
3279                 .proc_handler   = proc_dointvec_jiffies,
3280         },
3281         {
3282                 .procname       = "min_pmtu",
3283                 .data           = &ip_rt_min_pmtu,
3284                 .maxlen         = sizeof(int),
3285                 .mode           = 0644,
3286                 .proc_handler   = proc_dointvec,
3287         },
3288         {
3289                 .procname       = "min_adv_mss",
3290                 .data           = &ip_rt_min_advmss,
3291                 .maxlen         = sizeof(int),
3292                 .mode           = 0644,
3293                 .proc_handler   = proc_dointvec,
3294         },
3295         { }
3296 };
3297
3298 static struct ctl_table empty[1];
3299
3300 static struct ctl_table ipv4_skeleton[] =
3301 {
3302         { .procname = "route",
3303           .mode = 0555, .child = ipv4_route_table},
3304         { .procname = "neigh",
3305           .mode = 0555, .child = empty},
3306         { }
3307 };
3308
3309 static __net_initdata struct ctl_path ipv4_path[] = {
3310         { .procname = "net", },
3311         { .procname = "ipv4", },
3312         { },
3313 };
3314
3315 static struct ctl_table ipv4_route_flush_table[] = {
3316         {
3317                 .procname       = "flush",
3318                 .maxlen         = sizeof(int),
3319                 .mode           = 0200,
3320                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3321         },
3322         { },
3323 };
3324
3325 static __net_initdata struct ctl_path ipv4_route_path[] = {
3326         { .procname = "net", },
3327         { .procname = "ipv4", },
3328         { .procname = "route", },
3329         { },
3330 };
3331
3332 static __net_init int sysctl_route_net_init(struct net *net)
3333 {
3334         struct ctl_table *tbl;
3335
3336         tbl = ipv4_route_flush_table;
3337         if (!net_eq(net, &init_net)) {
3338                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3339                 if (tbl == NULL)
3340                         goto err_dup;
3341         }
3342         tbl[0].extra1 = net;
3343
3344         net->ipv4.route_hdr =
3345                 register_net_sysctl_table(net, ipv4_route_path, tbl);
3346         if (net->ipv4.route_hdr == NULL)
3347                 goto err_reg;
3348         return 0;
3349
3350 err_reg:
3351         if (tbl != ipv4_route_flush_table)
3352                 kfree(tbl);
3353 err_dup:
3354         return -ENOMEM;
3355 }
3356
3357 static __net_exit void sysctl_route_net_exit(struct net *net)
3358 {
3359         struct ctl_table *tbl;
3360
3361         tbl = net->ipv4.route_hdr->ctl_table_arg;
3362         unregister_net_sysctl_table(net->ipv4.route_hdr);
3363         BUG_ON(tbl == ipv4_route_flush_table);
3364         kfree(tbl);
3365 }
3366
3367 static __net_initdata struct pernet_operations sysctl_route_ops = {
3368         .init = sysctl_route_net_init,
3369         .exit = sysctl_route_net_exit,
3370 };
3371 #endif
3372
3373 static __net_init int rt_genid_init(struct net *net)
3374 {
3375         get_random_bytes(&net->ipv4.rt_genid,
3376                          sizeof(net->ipv4.rt_genid));
3377         get_random_bytes(&net->ipv4.dev_addr_genid,
3378                          sizeof(net->ipv4.dev_addr_genid));
3379         return 0;
3380 }
3381
3382 static __net_initdata struct pernet_operations rt_genid_ops = {
3383         .init = rt_genid_init,
3384 };
3385
3386
3387 #ifdef CONFIG_IP_ROUTE_CLASSID
3388 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3389 #endif /* CONFIG_IP_ROUTE_CLASSID */
3390
3391 static __initdata unsigned long rhash_entries;
3392 static int __init set_rhash_entries(char *str)
3393 {
3394         if (!str)
3395                 return 0;
3396         rhash_entries = simple_strtoul(str, &str, 0);
3397         return 1;
3398 }
3399 __setup("rhash_entries=", set_rhash_entries);
3400
3401 int __init ip_rt_init(void)
3402 {
3403         int rc = 0;
3404
3405 #ifdef CONFIG_IP_ROUTE_CLASSID
3406         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3407         if (!ip_rt_acct)
3408                 panic("IP: failed to allocate ip_rt_acct\n");
3409 #endif
3410
3411         ipv4_dst_ops.kmem_cachep =
3412                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3413                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3414
3415         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3416
3417         if (dst_entries_init(&ipv4_dst_ops) < 0)
3418                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3419
3420         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3421                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3422
3423         rt_hash_table = (struct rt_hash_bucket *)
3424                 alloc_large_system_hash("IP route cache",
3425                                         sizeof(struct rt_hash_bucket),
3426                                         rhash_entries,
3427                                         (totalram_pages >= 128 * 1024) ?
3428                                         15 : 17,
3429                                         0,
3430                                         &rt_hash_log,
3431                                         &rt_hash_mask,
3432                                         rhash_entries ? 0 : 512 * 1024);
3433         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3434         rt_hash_lock_init();
3435
3436         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3437         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3438
3439         devinet_init();
3440         ip_fib_init();
3441
3442         INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3443         expires_ljiffies = jiffies;
3444         schedule_delayed_work(&expires_work,
3445                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3446
3447         if (ip_rt_proc_init())
3448                 printk(KERN_ERR "Unable to create route proc files\n");
3449 #ifdef CONFIG_XFRM
3450         xfrm_init();
3451         xfrm4_init(ip_rt_max_size);
3452 #endif
3453         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3454
3455 #ifdef CONFIG_SYSCTL
3456         register_pernet_subsys(&sysctl_route_ops);
3457 #endif
3458         register_pernet_subsys(&rt_genid_ops);
3459         return rc;
3460 }
3461
3462 #ifdef CONFIG_SYSCTL
3463 /*
3464  * We really need to sanitize the damn ipv4 init order, then all
3465  * this nonsense will go away.
3466  */
3467 void __init ip_static_sysctl_init(void)
3468 {
3469         register_sysctl_paths(ipv4_path, ipv4_skeleton);
3470 }
3471 #endif