net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #define pr_fmt(fmt) "IPv4: " fmt
  66
  67 #include <linux/module.h>
  68 #include <linux/uaccess.h>
  69 #include <linux/bitops.h>
  70 #include <linux/types.h>
  71 #include <linux/kernel.h>
  72 #include <linux/mm.h>
  73 #include <linux/string.h>
  74 #include <linux/socket.h>
  75 #include <linux/sockios.h>
  76 #include <linux/errno.h>
  77 #include <linux/in.h>
  78 #include <linux/inet.h>
  79 #include <linux/netdevice.h>
  80 #include <linux/proc_fs.h>
  81 #include <linux/init.h>
  82 #include <linux/skbuff.h>
  83 #include <linux/inetdevice.h>
  84 #include <linux/igmp.h>
  85 #include <linux/pkt_sched.h>
  86 #include <linux/mroute.h>
  87 #include <linux/netfilter_ipv4.h>
  88 #include <linux/random.h>
  89 #include <linux/rcupdate.h>
  90 #include <linux/times.h>
  91 #include <linux/slab.h>
  92 #include <linux/jhash.h>
  93 #include <net/dst.h>
  94 #include <net/dst_metadata.h>
  95 #include <net/net_namespace.h>
  96 #include <net/protocol.h>
  97 #include <net/ip.h>
  98 #include <net/route.h>
  99 #include <net/inetpeer.h>
 100 #include <net/sock.h>
 101 #include <net/ip_fib.h>
 102 #include <net/arp.h>
 103 #include <net/tcp.h>
 104 #include <net/icmp.h>
 105 #include <net/xfrm.h>
 106 #include <net/lwtunnel.h>
 107 #include <net/netevent.h>
 108 #include <net/rtnetlink.h>
 109 #ifdef CONFIG_SYSCTL
 110 #include <linux/sysctl.h>
 111 #include <linux/kmemleak.h>
 112 #endif
 113 #include <net/secure_seq.h>
 114 #include <net/ip_tunnels.h>
 115 #include <net/l3mdev.h>
 116
 117 #include "fib_lookup.h"
 118
 119 #define RT_FL_TOS(oldflp4) \
 120         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 121
 122 #define RT_GC_TIMEOUT (300*HZ)
 123
 124 static int ip_rt_max_size;
 125 static int ip_rt_redirect_number __read_mostly  = 9;
 126 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 127 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 128 static int ip_rt_error_cost __read_mostly       = HZ;
 129 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 130 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 131 static u32 ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 132 static int ip_rt_min_advmss __read_mostly       = 256;
 133
 134 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 135
 136 static int ip_min_valid_pmtu __read_mostly      = IPV4_MIN_MTU;
 137
 138 /*
 139  *      Interface to generic destination cache.
 140  */
 141
 142 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 143 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
 144 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
 145 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 146 static void              ipv4_link_failure(struct sk_buff *skb);
 147 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 148                                            struct sk_buff *skb, u32 mtu);
 149 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 150                                         struct sk_buff *skb);
 151 static void             ipv4_dst_destroy(struct dst_entry *dst);
 152
 153 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 154 {
 155         WARN_ON(1);
 156         return NULL;
 157 }
 158
 159 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 160                                            struct sk_buff *skb,
 161                                            const void *daddr);
 162 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
 163
 164 static struct dst_ops ipv4_dst_ops = {
 165         .family =               AF_INET,
 166         .check =                ipv4_dst_check,
 167         .default_advmss =       ipv4_default_advmss,
 168         .mtu =                  ipv4_mtu,
 169         .cow_metrics =          ipv4_cow_metrics,
 170         .destroy =              ipv4_dst_destroy,
 171         .negative_advice =      ipv4_negative_advice,
 172         .link_failure =         ipv4_link_failure,
 173         .update_pmtu =          ip_rt_update_pmtu,
 174         .redirect =             ip_do_redirect,
 175         .local_out =            __ip_local_out,
 176         .neigh_lookup =         ipv4_neigh_lookup,
 177         .confirm_neigh =        ipv4_confirm_neigh,
 178 };
 179
 180 #define ECN_OR_COST(class)      TC_PRIO_##class
 181
 182 const __u8 ip_tos2prio[16] = {
 183         TC_PRIO_BESTEFFORT,
 184         ECN_OR_COST(BESTEFFORT),
 185         TC_PRIO_BESTEFFORT,
 186         ECN_OR_COST(BESTEFFORT),
 187         TC_PRIO_BULK,
 188         ECN_OR_COST(BULK),
 189         TC_PRIO_BULK,
 190         ECN_OR_COST(BULK),
 191         TC_PRIO_INTERACTIVE,
 192         ECN_OR_COST(INTERACTIVE),
 193         TC_PRIO_INTERACTIVE,
 194         ECN_OR_COST(INTERACTIVE),
 195         TC_PRIO_INTERACTIVE_BULK,
 196         ECN_OR_COST(INTERACTIVE_BULK),
 197         TC_PRIO_INTERACTIVE_BULK,
 198         ECN_OR_COST(INTERACTIVE_BULK)
 199 };
 200 EXPORT_SYMBOL(ip_tos2prio);
 201
 202 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 203 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
 204
 205 #ifdef CONFIG_PROC_FS
 206 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 207 {
 208         if (*pos)
 209                 return NULL;
 210         return SEQ_START_TOKEN;
 211 }
 212
 213 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 214 {
 215         ++*pos;
 216         return NULL;
 217 }
 218
 219 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 220 {
 221 }
 222
 223 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 224 {
 225         if (v == SEQ_START_TOKEN)
 226                 seq_printf(seq, "%-127s\n",
 227                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 228                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 229                            "HHUptod\tSpecDst");
 230         return 0;
 231 }
 232
 233 static const struct seq_operations rt_cache_seq_ops = {
 234         .start  = rt_cache_seq_start,
 235         .next   = rt_cache_seq_next,
 236         .stop   = rt_cache_seq_stop,
 237         .show   = rt_cache_seq_show,
 238 };
 239
 240 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 241 {
 242         return seq_open(file, &rt_cache_seq_ops);
 243 }
 244
 245 static const struct file_operations rt_cache_seq_fops = {
 246         .owner   = THIS_MODULE,
 247         .open    = rt_cache_seq_open,
 248         .read    = seq_read,
 249         .llseek  = seq_lseek,
 250         .release = seq_release,
 251 };
 252
 253
 254 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 255 {
 256         int cpu;
 257
 258         if (*pos == 0)
 259                 return SEQ_START_TOKEN;
 260
 261         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 262                 if (!cpu_possible(cpu))
 263                         continue;
 264                 *pos = cpu+1;
 265                 return &per_cpu(rt_cache_stat, cpu);
 266         }
 267         return NULL;
 268 }
 269
 270 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 271 {
 272         int cpu;
 273
 274         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 275                 if (!cpu_possible(cpu))
 276                         continue;
 277                 *pos = cpu+1;
 278                 return &per_cpu(rt_cache_stat, cpu);
 279         }
 280         return NULL;
 281
 282 }
 283
 284 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 285 {
 286
 287 }
 288
 289 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 290 {
 291         struct rt_cache_stat *st = v;
 292
 293         if (v == SEQ_START_TOKEN) {
 294                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 295                 return 0;
 296         }
 297
 298         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 299                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 300                    dst_entries_get_slow(&ipv4_dst_ops),
 301                    0, /* st->in_hit */
 302                    st->in_slow_tot,
 303                    st->in_slow_mc,
 304                    st->in_no_route,
 305                    st->in_brd,
 306                    st->in_martian_dst,
 307                    st->in_martian_src,
 308
 309                    0, /* st->out_hit */
 310                    st->out_slow_tot,
 311                    st->out_slow_mc,
 312
 313                    0, /* st->gc_total */
 314                    0, /* st->gc_ignored */
 315                    0, /* st->gc_goal_miss */
 316                    0, /* st->gc_dst_overflow */
 317                    0, /* st->in_hlist_search */
 318                    0  /* st->out_hlist_search */
 319                 );
 320         return 0;
 321 }
 322
 323 static const struct seq_operations rt_cpu_seq_ops = {
 324         .start  = rt_cpu_seq_start,
 325         .next   = rt_cpu_seq_next,
 326         .stop   = rt_cpu_seq_stop,
 327         .show   = rt_cpu_seq_show,
 328 };
 329
 330
 331 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 332 {
 333         return seq_open(file, &rt_cpu_seq_ops);
 334 }
 335
 336 static const struct file_operations rt_cpu_seq_fops = {
 337         .owner   = THIS_MODULE,
 338         .open    = rt_cpu_seq_open,
 339         .read    = seq_read,
 340         .llseek  = seq_lseek,
 341         .release = seq_release,
 342 };
 343
 344 #ifdef CONFIG_IP_ROUTE_CLASSID
 345 static int rt_acct_proc_show(struct seq_file *m, void *v)
 346 {
 347         struct ip_rt_acct *dst, *src;
 348         unsigned int i, j;
 349
 350         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 351         if (!dst)
 352                 return -ENOMEM;
 353
 354         for_each_possible_cpu(i) {
 355                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 356                 for (j = 0; j < 256; j++) {
 357                         dst[j].o_bytes   += src[j].o_bytes;
 358                         dst[j].o_packets += src[j].o_packets;
 359                         dst[j].i_bytes   += src[j].i_bytes;
 360                         dst[j].i_packets += src[j].i_packets;
 361                 }
 362         }
 363
 364         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 365         kfree(dst);
 366         return 0;
 367 }
 368
 369 static int rt_acct_proc_open(struct inode *inode, struct file *file)
 370 {
 371         return single_open(file, rt_acct_proc_show, NULL);
 372 }
 373
 374 static const struct file_operations rt_acct_proc_fops = {
 375         .owner          = THIS_MODULE,
 376         .open           = rt_acct_proc_open,
 377         .read           = seq_read,
 378         .llseek         = seq_lseek,
 379         .release        = single_release,
 380 };
 381 #endif
 382
 383 static int __net_init ip_rt_do_proc_init(struct net *net)
 384 {
 385         struct proc_dir_entry *pde;
 386
 387         pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
 388                           &rt_cache_seq_fops);
 389         if (!pde)
 390                 goto err1;
 391
 392         pde = proc_create("rt_cache", S_IRUGO,
 393                           net->proc_net_stat, &rt_cpu_seq_fops);
 394         if (!pde)
 395                 goto err2;
 396
 397 #ifdef CONFIG_IP_ROUTE_CLASSID
 398         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 399         if (!pde)
 400                 goto err3;
 401 #endif
 402         return 0;
 403
 404 #ifdef CONFIG_IP_ROUTE_CLASSID
 405 err3:
 406         remove_proc_entry("rt_cache", net->proc_net_stat);
 407 #endif
 408 err2:
 409         remove_proc_entry("rt_cache", net->proc_net);
 410 err1:
 411         return -ENOMEM;
 412 }
 413
 414 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 415 {
 416         remove_proc_entry("rt_cache", net->proc_net_stat);
 417         remove_proc_entry("rt_cache", net->proc_net);
 418 #ifdef CONFIG_IP_ROUTE_CLASSID
 419         remove_proc_entry("rt_acct", net->proc_net);
 420 #endif
 421 }
 422
 423 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 424         .init = ip_rt_do_proc_init,
 425         .exit = ip_rt_do_proc_exit,
 426 };
 427
 428 static int __init ip_rt_proc_init(void)
 429 {
 430         return register_pernet_subsys(&ip_rt_proc_ops);
 431 }
 432
 433 #else
 434 static inline int ip_rt_proc_init(void)
 435 {
 436         return 0;
 437 }
 438 #endif /* CONFIG_PROC_FS */
 439
 440 static inline bool rt_is_expired(const struct rtable *rth)
 441 {
 442         return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
 443 }
 444
 445 void rt_cache_flush(struct net *net)
 446 {
 447         rt_genid_bump_ipv4(net);
 448 }
 449
 450 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 451                                            struct sk_buff *skb,
 452                                            const void *daddr)
 453 {
 454         struct net_device *dev = dst->dev;
 455         const __be32 *pkey = daddr;
 456         const struct rtable *rt;
 457         struct neighbour *n;
 458
 459         rt = (const struct rtable *) dst;
 460         if (rt->rt_gateway)
 461                 pkey = (const __be32 *) &rt->rt_gateway;
 462         else if (skb)
 463                 pkey = &ip_hdr(skb)->daddr;
 464
 465         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
 466         if (n)
 467                 return n;
 468         return neigh_create(&arp_tbl, pkey, dev);
 469 }
 470
 471 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
 472 {
 473         struct net_device *dev = dst->dev;
 474         const __be32 *pkey = daddr;
 475         const struct rtable *rt;
 476
 477         rt = (const struct rtable *)dst;
 478         if (rt->rt_gateway)
 479                 pkey = (const __be32 *)&rt->rt_gateway;
 480         else if (!daddr ||
 481                  (rt->rt_flags &
 482                   (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
 483                 return;
 484
 485         __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
 486 }
 487
 488 #define IP_IDENTS_SZ 2048u
 489
 490 static atomic_t *ip_idents __read_mostly;
 491 static u32 *ip_tstamps __read_mostly;
 492
 493 /* In order to protect privacy, we add a perturbation to identifiers
 494  * if one generator is seldom used. This makes hard for an attacker
 495  * to infer how many packets were sent between two points in time.
 496  */
 497 u32 ip_idents_reserve(u32 hash, int segs)
 498 {
 499         u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
 500         atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
 501         u32 old = ACCESS_ONCE(*p_tstamp);
 502         u32 now = (u32)jiffies;
 503         u32 new, delta = 0;
 504
 505         if (old != now && cmpxchg(p_tstamp, old, now) == old)
 506                 delta = prandom_u32_max(now - old);
 507
 508         /* Do not use atomic_add_return() as it makes UBSAN unhappy */
 509         do {
 510                 old = (u32)atomic_read(p_id);
 511                 new = old + delta + segs;
 512         } while (atomic_cmpxchg(p_id, old, new) != old);
 513
 514         return new - segs;
 515 }
 516 EXPORT_SYMBOL(ip_idents_reserve);
 517
 518 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
 519 {
 520         static u32 ip_idents_hashrnd __read_mostly;
 521         u32 hash, id;
 522
 523         net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
 524
 525         hash = jhash_3words((__force u32)iph->daddr,
 526                             (__force u32)iph->saddr,
 527                             iph->protocol ^ net_hash_mix(net),
 528                             ip_idents_hashrnd);
 529         id = ip_idents_reserve(hash, segs);
 530         iph->id = htons(id);
 531 }
 532 EXPORT_SYMBOL(__ip_select_ident);
 533
 534 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
 535                              const struct sock *sk,
 536                              const struct iphdr *iph,
 537                              int oif, u8 tos,
 538                              u8 prot, u32 mark, int flow_flags)
 539 {
 540         if (sk) {
 541                 const struct inet_sock *inet = inet_sk(sk);
 542
 543                 oif = sk->sk_bound_dev_if;
 544                 mark = sk->sk_mark;
 545                 tos = RT_CONN_FLAGS(sk);
 546                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
 547         }
 548         flowi4_init_output(fl4, oif, mark, tos,
 549                            RT_SCOPE_UNIVERSE, prot,
 550                            flow_flags,
 551                            iph->daddr, iph->saddr, 0, 0,
 552                            sock_net_uid(net, sk));
 553 }
 554
 555 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
 556                                const struct sock *sk)
 557 {
 558         const struct net *net = dev_net(skb->dev);
 559         const struct iphdr *iph = ip_hdr(skb);
 560         int oif = skb->dev->ifindex;
 561         u8 tos = RT_TOS(iph->tos);
 562         u8 prot = iph->protocol;
 563         u32 mark = skb->mark;
 564
 565         __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
 566 }
 567
 568 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
 569 {
 570         const struct inet_sock *inet = inet_sk(sk);
 571         const struct ip_options_rcu *inet_opt;
 572         __be32 daddr = inet->inet_daddr;
 573
 574         rcu_read_lock();
 575         inet_opt = rcu_dereference(inet->inet_opt);
 576         if (inet_opt && inet_opt->opt.srr)
 577                 daddr = inet_opt->opt.faddr;
 578         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 579                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 580                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
 581                            inet_sk_flowi_flags(sk),
 582                            daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
 583         rcu_read_unlock();
 584 }
 585
 586 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 587                                  const struct sk_buff *skb)
 588 {
 589         if (skb)
 590                 build_skb_flow_key(fl4, skb, sk);
 591         else
 592                 build_sk_flow_key(fl4, sk);
 593 }
 594
 595 static DEFINE_SPINLOCK(fnhe_lock);
 596
 597 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
 598 {
 599         struct rtable *rt;
 600
 601         rt = rcu_dereference(fnhe->fnhe_rth_input);
 602         if (rt) {
 603                 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
 604                 dst_dev_put(&rt->dst);
 605                 dst_release(&rt->dst);
 606         }
 607         rt = rcu_dereference(fnhe->fnhe_rth_output);
 608         if (rt) {
 609                 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
 610                 dst_dev_put(&rt->dst);
 611                 dst_release(&rt->dst);
 612         }
 613 }
 614
 615 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
 616 {
 617         struct fib_nh_exception *fnhe, *oldest;
 618
 619         oldest = rcu_dereference(hash->chain);
 620         for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
 621              fnhe = rcu_dereference(fnhe->fnhe_next)) {
 622                 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
 623                         oldest = fnhe;
 624         }
 625         fnhe_flush_routes(oldest);
 626         return oldest;
 627 }
 628
 629 static inline u32 fnhe_hashfun(__be32 daddr)
 630 {
 631         static u32 fnhe_hashrnd __read_mostly;
 632         u32 hval;
 633
 634         net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
 635         hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
 636         return hash_32(hval, FNHE_HASH_SHIFT);
 637 }
 638
 639 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
 640 {
 641         rt->rt_pmtu = fnhe->fnhe_pmtu;
 642         rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
 643         rt->dst.expires = fnhe->fnhe_expires;
 644
 645         if (fnhe->fnhe_gw) {
 646                 rt->rt_flags |= RTCF_REDIRECTED;
 647                 rt->rt_gateway = fnhe->fnhe_gw;
 648                 rt->rt_uses_gateway = 1;
 649         }
 650 }
 651
 652 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 653                                   u32 pmtu, bool lock, unsigned long expires)
 654 {
 655         struct fnhe_hash_bucket *hash;
 656         struct fib_nh_exception *fnhe;
 657         struct rtable *rt;
 658         u32 genid, hval;
 659         unsigned int i;
 660         int depth;
 661
 662         genid = fnhe_genid(dev_net(nh->nh_dev));
 663         hval = fnhe_hashfun(daddr);
 664
 665         spin_lock_bh(&fnhe_lock);
 666
 667         hash = rcu_dereference(nh->nh_exceptions);
 668         if (!hash) {
 669                 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
 670                 if (!hash)
 671                         goto out_unlock;
 672                 rcu_assign_pointer(nh->nh_exceptions, hash);
 673         }
 674
 675         hash += hval;
 676
 677         depth = 0;
 678         for (fnhe = rcu_dereference(hash->chain); fnhe;
 679              fnhe = rcu_dereference(fnhe->fnhe_next)) {
 680                 if (fnhe->fnhe_daddr == daddr)
 681                         break;
 682                 depth++;
 683         }
 684
 685         if (fnhe) {
 686                 if (fnhe->fnhe_genid != genid)
 687                         fnhe->fnhe_genid = genid;
 688                 if (gw)
 689                         fnhe->fnhe_gw = gw;
 690                 if (pmtu) {
 691                         fnhe->fnhe_pmtu = pmtu;
 692                         fnhe->fnhe_mtu_locked = lock;
 693                 }
 694                 fnhe->fnhe_expires = max(1UL, expires);
 695                 /* Update all cached dsts too */
 696                 rt = rcu_dereference(fnhe->fnhe_rth_input);
 697                 if (rt)
 698                         fill_route_from_fnhe(rt, fnhe);
 699                 rt = rcu_dereference(fnhe->fnhe_rth_output);
 700                 if (rt)
 701                         fill_route_from_fnhe(rt, fnhe);
 702         } else {
 703                 if (depth > FNHE_RECLAIM_DEPTH)
 704                         fnhe = fnhe_oldest(hash);
 705                 else {
 706                         fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
 707                         if (!fnhe)
 708                                 goto out_unlock;
 709
 710                         fnhe->fnhe_next = hash->chain;
 711                         rcu_assign_pointer(hash->chain, fnhe);
 712                 }
 713                 fnhe->fnhe_genid = genid;
 714                 fnhe->fnhe_daddr = daddr;
 715                 fnhe->fnhe_gw = gw;
 716                 fnhe->fnhe_pmtu = pmtu;
 717                 fnhe->fnhe_mtu_locked = lock;
 718                 fnhe->fnhe_expires = max(1UL, expires);
 719
 720                 /* Exception created; mark the cached routes for the nexthop
 721                  * stale, so anyone caching it rechecks if this exception
 722                  * applies to them.
 723                  */
 724                 rt = rcu_dereference(nh->nh_rth_input);
 725                 if (rt)
 726                         rt->dst.obsolete = DST_OBSOLETE_KILL;
 727
 728                 for_each_possible_cpu(i) {
 729                         struct rtable __rcu **prt;
 730                         prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
 731                         rt = rcu_dereference(*prt);
 732                         if (rt)
 733                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
 734                 }
 735         }
 736
 737         fnhe->fnhe_stamp = jiffies;
 738
 739 out_unlock:
 740         spin_unlock_bh(&fnhe_lock);
 741 }
 742
 743 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
 744                              bool kill_route)
 745 {
 746         __be32 new_gw = icmp_hdr(skb)->un.gateway;
 747         __be32 old_gw = ip_hdr(skb)->saddr;
 748         struct net_device *dev = skb->dev;
 749         struct in_device *in_dev;
 750         struct fib_result res;
 751         struct neighbour *n;
 752         struct net *net;
 753
 754         switch (icmp_hdr(skb)->code & 7) {
 755         case ICMP_REDIR_NET:
 756         case ICMP_REDIR_NETTOS:
 757         case ICMP_REDIR_HOST:
 758         case ICMP_REDIR_HOSTTOS:
 759                 break;
 760
 761         default:
 762                 return;
 763         }
 764
 765         if (rt->rt_gateway != old_gw)
 766                 return;
 767
 768         in_dev = __in_dev_get_rcu(dev);
 769         if (!in_dev)
 770                 return;
 771
 772         net = dev_net(dev);
 773         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
 774             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
 775             ipv4_is_zeronet(new_gw))
 776                 goto reject_redirect;
 777
 778         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 779                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 780                         goto reject_redirect;
 781                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
 782                         goto reject_redirect;
 783         } else {
 784                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
 785                         goto reject_redirect;
 786         }
 787
 788         n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
 789         if (!n)
 790                 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
 791         if (!IS_ERR(n)) {
 792                 if (!(n->nud_state & NUD_VALID)) {
 793                         neigh_event_send(n, NULL);
 794                 } else {
 795                         if (fib_lookup(net, fl4, &res, 0) == 0) {
 796                                 struct fib_nh *nh = &FIB_RES_NH(res);
 797
 798                                 update_or_create_fnhe(nh, fl4->daddr, new_gw,
 799                                                 0, false,
 800                                                 jiffies + ip_rt_gc_timeout);
 801                         }
 802                         if (kill_route)
 803                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
 804                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 805                 }
 806                 neigh_release(n);
 807         }
 808         return;
 809
 810 reject_redirect:
 811 #ifdef CONFIG_IP_ROUTE_VERBOSE
 812         if (IN_DEV_LOG_MARTIANS(in_dev)) {
 813                 const struct iphdr *iph = (const struct iphdr *) skb->data;
 814                 __be32 daddr = iph->daddr;
 815                 __be32 saddr = iph->saddr;
 816
 817                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
 818                                      "  Advised path = %pI4 -> %pI4\n",
 819                                      &old_gw, dev->name, &new_gw,
 820                                      &saddr, &daddr);
 821         }
 822 #endif
 823         ;
 824 }
 825
 826 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 827 {
 828         struct rtable *rt;
 829         struct flowi4 fl4;
 830         const struct iphdr *iph = (const struct iphdr *) skb->data;
 831         struct net *net = dev_net(skb->dev);
 832         int oif = skb->dev->ifindex;
 833         u8 tos = RT_TOS(iph->tos);
 834         u8 prot = iph->protocol;
 835         u32 mark = skb->mark;
 836
 837         rt = (struct rtable *) dst;
 838
 839         __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
 840         __ip_do_redirect(rt, skb, &fl4, true);
 841 }
 842
 843 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 844 {
 845         struct rtable *rt = (struct rtable *)dst;
 846         struct dst_entry *ret = dst;
 847
 848         if (rt) {
 849                 if (dst->obsolete > 0) {
 850                         ip_rt_put(rt);
 851                         ret = NULL;
 852                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 853                            rt->dst.expires) {
 854                         ip_rt_put(rt);
 855                         ret = NULL;
 856                 }
 857         }
 858         return ret;
 859 }
 860
 861 /*
 862  * Algorithm:
 863  *      1. The first ip_rt_redirect_number redirects are sent
 864  *         with exponential backoff, then we stop sending them at all,
 865  *         assuming that the host ignores our redirects.
 866  *      2. If we did not see packets requiring redirects
 867  *         during ip_rt_redirect_silence, we assume that the host
 868  *         forgot redirected route and start to send redirects again.
 869  *
 870  * This algorithm is much cheaper and more intelligent than dumb load limiting
 871  * in icmp.c.
 872  *
 873  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 874  * and "frag. need" (breaks PMTU discovery) in icmp.c.
 875  */
 876
 877 void ip_rt_send_redirect(struct sk_buff *skb)
 878 {
 879         struct rtable *rt = skb_rtable(skb);
 880         struct in_device *in_dev;
 881         struct inet_peer *peer;
 882         struct net *net;
 883         int log_martians;
 884         int vif;
 885
 886         rcu_read_lock();
 887         in_dev = __in_dev_get_rcu(rt->dst.dev);
 888         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 889                 rcu_read_unlock();
 890                 return;
 891         }
 892         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 893         vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
 894         rcu_read_unlock();
 895
 896         net = dev_net(rt->dst.dev);
 897         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
 898         if (!peer) {
 899                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
 900                           rt_nexthop(rt, ip_hdr(skb)->daddr));
 901                 return;
 902         }
 903
 904         /* No redirected packets during ip_rt_redirect_silence;
 905          * reset the algorithm.
 906          */
 907         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
 908                 peer->rate_tokens = 0;
 909                 peer->n_redirects = 0;
 910         }
 911
 912         /* Too many ignored redirects; do not send anything
 913          * set dst.rate_last to the last seen redirected packet.
 914          */
 915         if (peer->n_redirects >= ip_rt_redirect_number) {
 916                 peer->rate_last = jiffies;
 917                 goto out_put_peer;
 918         }
 919
 920         /* Check for load limit; set rate_last to the latest sent
 921          * redirect.
 922          */
 923         if (peer->rate_tokens == 0 ||
 924             time_after(jiffies,
 925                        (peer->rate_last +
 926                         (ip_rt_redirect_load << peer->rate_tokens)))) {
 927                 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
 928
 929                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
 930                 peer->rate_last = jiffies;
 931                 ++peer->rate_tokens;
 932                 ++peer->n_redirects;
 933 #ifdef CONFIG_IP_ROUTE_VERBOSE
 934                 if (log_martians &&
 935                     peer->rate_tokens == ip_rt_redirect_number)
 936                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
 937                                              &ip_hdr(skb)->saddr, inet_iif(skb),
 938                                              &ip_hdr(skb)->daddr, &gw);
 939 #endif
 940         }
 941 out_put_peer:
 942         inet_putpeer(peer);
 943 }
 944
 945 static int ip_error(struct sk_buff *skb)
 946 {
 947         struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
 948         struct rtable *rt = skb_rtable(skb);
 949         struct inet_peer *peer;
 950         unsigned long now;
 951         struct net *net;
 952         bool send;
 953         int code;
 954
 955         /* IP on this device is disabled. */
 956         if (!in_dev)
 957                 goto out;
 958
 959         net = dev_net(rt->dst.dev);
 960         if (!IN_DEV_FORWARD(in_dev)) {
 961                 switch (rt->dst.error) {
 962                 case EHOSTUNREACH:
 963                         __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
 964                         break;
 965
 966                 case ENETUNREACH:
 967                         __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 968                         break;
 969                 }
 970                 goto out;
 971         }
 972
 973         switch (rt->dst.error) {
 974         case EINVAL:
 975         default:
 976                 goto out;
 977         case EHOSTUNREACH:
 978                 code = ICMP_HOST_UNREACH;
 979                 break;
 980         case ENETUNREACH:
 981                 code = ICMP_NET_UNREACH;
 982                 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 983                 break;
 984         case EACCES:
 985                 code = ICMP_PKT_FILTERED;
 986                 break;
 987         }
 988
 989         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
 990                                l3mdev_master_ifindex(skb->dev), 1);
 991
 992         send = true;
 993         if (peer) {
 994                 now = jiffies;
 995                 peer->rate_tokens += now - peer->rate_last;
 996                 if (peer->rate_tokens > ip_rt_error_burst)
 997                         peer->rate_tokens = ip_rt_error_burst;
 998                 peer->rate_last = now;
 999                 if (peer->rate_tokens >= ip_rt_error_cost)
1000                         peer->rate_tokens -= ip_rt_error_cost;
1001                 else
1002                         send = false;
1003                 inet_putpeer(peer);
1004         }
1005         if (send)
1006                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1007
1008 out:    kfree_skb(skb);
1009         return 0;
1010 }
1011
1012 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1013 {
1014         struct dst_entry *dst = &rt->dst;
1015         struct fib_result res;
1016         bool lock = false;
1017
1018         if (ip_mtu_locked(dst))
1019                 return;
1020
1021         if (ipv4_mtu(dst) < mtu)
1022                 return;
1023
1024         if (mtu < ip_rt_min_pmtu) {
1025                 lock = true;
1026                 mtu = ip_rt_min_pmtu;
1027         }
1028
1029         if (rt->rt_pmtu == mtu &&
1030             time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1031                 return;
1032
1033         rcu_read_lock();
1034         if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
1035                 struct fib_nh *nh = &FIB_RES_NH(res);
1036
1037                 update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
1038                                       jiffies + ip_rt_mtu_expires);
1039         }
1040         rcu_read_unlock();
1041 }
1042
1043 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1044                               struct sk_buff *skb, u32 mtu)
1045 {
1046         struct rtable *rt = (struct rtable *) dst;
1047         struct flowi4 fl4;
1048
1049         ip_rt_build_flow_key(&fl4, sk, skb);
1050         __ip_rt_update_pmtu(rt, &fl4, mtu);
1051 }
1052
1053 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1054                       int oif, u32 mark, u8 protocol, int flow_flags)
1055 {
1056         const struct iphdr *iph = (const struct iphdr *) skb->data;
1057         struct flowi4 fl4;
1058         struct rtable *rt;
1059
1060         if (!mark)
1061                 mark = IP4_REPLY_MARK(net, skb->mark);
1062
1063         __build_flow_key(net, &fl4, NULL, iph, oif,
1064                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1065         rt = __ip_route_output_key(net, &fl4);
1066         if (!IS_ERR(rt)) {
1067                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1068                 ip_rt_put(rt);
1069         }
1070 }
1071 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1072
1073 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1074 {
1075         const struct iphdr *iph = (const struct iphdr *) skb->data;
1076         struct flowi4 fl4;
1077         struct rtable *rt;
1078
1079         __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1080
1081         if (!fl4.flowi4_mark)
1082                 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1083
1084         rt = __ip_route_output_key(sock_net(sk), &fl4);
1085         if (!IS_ERR(rt)) {
1086                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1087                 ip_rt_put(rt);
1088         }
1089 }
1090
1091 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1092 {
1093         const struct iphdr *iph = (const struct iphdr *) skb->data;
1094         struct flowi4 fl4;
1095         struct rtable *rt;
1096         struct dst_entry *odst = NULL;
1097         bool new = false;
1098         struct net *net = sock_net(sk);
1099
1100         bh_lock_sock(sk);
1101
1102         if (!ip_sk_accept_pmtu(sk))
1103                 goto out;
1104
1105         odst = sk_dst_get(sk);
1106
1107         if (sock_owned_by_user(sk) || !odst) {
1108                 __ipv4_sk_update_pmtu(skb, sk, mtu);
1109                 goto out;
1110         }
1111
1112         __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1113
1114         rt = (struct rtable *)odst;
1115         if (odst->obsolete && !odst->ops->check(odst, 0)) {
1116                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1117                 if (IS_ERR(rt))
1118                         goto out;
1119
1120                 new = true;
1121         }
1122
1123         __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
1124
1125         if (!dst_check(&rt->dst, 0)) {
1126                 if (new)
1127                         dst_release(&rt->dst);
1128
1129                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1130                 if (IS_ERR(rt))
1131                         goto out;
1132
1133                 new = true;
1134         }
1135
1136         if (new)
1137                 sk_dst_set(sk, &rt->dst);
1138
1139 out:
1140         bh_unlock_sock(sk);
1141         dst_release(odst);
1142 }
1143 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1144
1145 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1146                    int oif, u32 mark, u8 protocol, int flow_flags)
1147 {
1148         const struct iphdr *iph = (const struct iphdr *) skb->data;
1149         struct flowi4 fl4;
1150         struct rtable *rt;
1151
1152         __build_flow_key(net, &fl4, NULL, iph, oif,
1153                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1154         rt = __ip_route_output_key(net, &fl4);
1155         if (!IS_ERR(rt)) {
1156                 __ip_do_redirect(rt, skb, &fl4, false);
1157                 ip_rt_put(rt);
1158         }
1159 }
1160 EXPORT_SYMBOL_GPL(ipv4_redirect);
1161
1162 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1163 {
1164         const struct iphdr *iph = (const struct iphdr *) skb->data;
1165         struct flowi4 fl4;
1166         struct rtable *rt;
1167         struct net *net = sock_net(sk);
1168
1169         __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1170         rt = __ip_route_output_key(net, &fl4);
1171         if (!IS_ERR(rt)) {
1172                 __ip_do_redirect(rt, skb, &fl4, false);
1173                 ip_rt_put(rt);
1174         }
1175 }
1176 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1177
1178 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1179 {
1180         struct rtable *rt = (struct rtable *) dst;
1181
1182         /* All IPV4 dsts are created with ->obsolete set to the value
1183          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1184          * into this function always.
1185          *
1186          * When a PMTU/redirect information update invalidates a route,
1187          * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1188          * DST_OBSOLETE_DEAD by dst_free().
1189          */
1190         if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1191                 return NULL;
1192         return dst;
1193 }
1194
1195 static void ipv4_send_dest_unreach(struct sk_buff *skb)
1196 {
1197         struct ip_options opt;
1198         int res;
1199
1200         /* Recompile ip options since IPCB may not be valid anymore.
1201          * Also check we have a reasonable ipv4 header.
1202          */
1203         if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1204             ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1205                 return;
1206
1207         memset(&opt, 0, sizeof(opt));
1208         if (ip_hdr(skb)->ihl > 5) {
1209                 if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1210                         return;
1211                 opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1212
1213                 rcu_read_lock();
1214                 res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
1215                 rcu_read_unlock();
1216
1217                 if (res)
1218                         return;
1219         }
1220         __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1221 }
1222
1223 static void ipv4_link_failure(struct sk_buff *skb)
1224 {
1225         struct rtable *rt;
1226
1227         ipv4_send_dest_unreach(skb);
1228
1229         rt = skb_rtable(skb);
1230         if (rt)
1231                 dst_set_expires(&rt->dst, 0);
1232 }
1233
1234 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1235 {
1236         pr_debug("%s: %pI4 -> %pI4, %s\n",
1237                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1238                  skb->dev ? skb->dev->name : "?");
1239         kfree_skb(skb);
1240         WARN_ON(1);
1241         return 0;
1242 }
1243
/*
 * We do not cache the source address of the outgoing interface,
 * because it is used only by the IP RR, TS and SRR options,
 * so it is out of the fast path.
 *
 * BTW remember: "addr" is allowed to be unaligned
 * in IP options!
 */
1252
1253 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1254 {
1255         __be32 src;
1256
1257         if (rt_is_output_route(rt))
1258                 src = ip_hdr(skb)->saddr;
1259         else {
1260                 struct fib_result res;
1261                 struct flowi4 fl4;
1262                 struct iphdr *iph;
1263
1264                 iph = ip_hdr(skb);
1265
1266                 memset(&fl4, 0, sizeof(fl4));
1267                 fl4.daddr = iph->daddr;
1268                 fl4.saddr = iph->saddr;
1269                 fl4.flowi4_tos = RT_TOS(iph->tos);
1270                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1271                 fl4.flowi4_iif = skb->dev->ifindex;
1272                 fl4.flowi4_mark = skb->mark;
1273
1274                 rcu_read_lock();
1275                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1276                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1277                 else
1278                         src = inet_select_addr(rt->dst.dev,
1279                                                rt_nexthop(rt, iph->daddr),
1280                                                RT_SCOPE_UNIVERSE);
1281                 rcu_read_unlock();
1282         }
1283         memcpy(addr, &src, 4);
1284 }
1285
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Merge @tag into the route's tclassid.  The low and high 16-bit halves are
 * handled independently, and a half is only filled in from @tag when it is
 * currently zero; already set halves are never overwritten.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
        const u32 low_half = 0xFFFF;
        const u32 high_half = 0xFFFF0000;

        if ((rt->dst.tclassid & low_half) == 0)
                rt->dst.tclassid |= tag & low_half;
        if ((rt->dst.tclassid & high_half) == 0)
                rt->dst.tclassid |= tag & high_half;
}
#endif
1295
1296 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1297 {
1298         unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1299         unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1300                                     ip_rt_min_advmss);
1301
1302         return min(advmss, IPV4_MAX_PMTU - header_size);
1303 }
1304
1305 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1306 {
1307         const struct rtable *rt = (const struct rtable *) dst;
1308         unsigned int mtu = rt->rt_pmtu;
1309
1310         if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1311                 mtu = dst_metric_raw(dst, RTAX_MTU);
1312
1313         if (mtu)
1314                 return mtu;
1315
1316         mtu = READ_ONCE(dst->dev->mtu);
1317
1318         if (unlikely(ip_mtu_locked(dst))) {
1319                 if (rt->rt_uses_gateway && mtu > 576)
1320                         mtu = 576;
1321         }
1322
1323         mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1324
1325         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1326 }
1327
1328 static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1329 {
1330         struct fnhe_hash_bucket *hash;
1331         struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1332         u32 hval = fnhe_hashfun(daddr);
1333
1334         spin_lock_bh(&fnhe_lock);
1335
1336         hash = rcu_dereference_protected(nh->nh_exceptions,
1337                                          lockdep_is_held(&fnhe_lock));
1338         hash += hval;
1339
1340         fnhe_p = &hash->chain;
1341         fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1342         while (fnhe) {
1343                 if (fnhe->fnhe_daddr == daddr) {
1344                         rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1345                                 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1346                         /* set fnhe_daddr to 0 to ensure it won't bind with
1347                          * new dsts in rt_bind_exception().
1348                          */
1349                         fnhe->fnhe_daddr = 0;
1350                         fnhe_flush_routes(fnhe);
1351                         kfree_rcu(fnhe, rcu);
1352                         break;
1353                 }
1354                 fnhe_p = &fnhe->fnhe_next;
1355                 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1356                                                  lockdep_is_held(&fnhe_lock));
1357         }
1358
1359         spin_unlock_bh(&fnhe_lock);
1360 }
1361
1362 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1363 {
1364         struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1365         struct fib_nh_exception *fnhe;
1366         u32 hval;
1367
1368         if (!hash)
1369                 return NULL;
1370
1371         hval = fnhe_hashfun(daddr);
1372
1373         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1374              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1375                 if (fnhe->fnhe_daddr == daddr) {
1376                         if (fnhe->fnhe_expires &&
1377                             time_after(jiffies, fnhe->fnhe_expires)) {
1378                                 ip_del_fnhe(nh, daddr);
1379                                 break;
1380                         }
1381                         return fnhe;
1382                 }
1383         }
1384         return NULL;
1385 }
1386
1387 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1388                               __be32 daddr, const bool do_cache)
1389 {
1390         bool ret = false;
1391
1392         spin_lock_bh(&fnhe_lock);
1393
1394         if (daddr == fnhe->fnhe_daddr) {
1395                 struct rtable __rcu **porig;
1396                 struct rtable *orig;
1397                 int genid = fnhe_genid(dev_net(rt->dst.dev));
1398
1399                 if (rt_is_input_route(rt))
1400                         porig = &fnhe->fnhe_rth_input;
1401                 else
1402                         porig = &fnhe->fnhe_rth_output;
1403                 orig = rcu_dereference(*porig);
1404
1405                 if (fnhe->fnhe_genid != genid) {
1406                         fnhe->fnhe_genid = genid;
1407                         fnhe->fnhe_gw = 0;
1408                         fnhe->fnhe_pmtu = 0;
1409                         fnhe->fnhe_expires = 0;
1410                         fnhe_flush_routes(fnhe);
1411                         orig = NULL;
1412                 }
1413                 fill_route_from_fnhe(rt, fnhe);
1414                 if (!rt->rt_gateway)
1415                         rt->rt_gateway = daddr;
1416
1417                 if (do_cache) {
1418                         dst_hold(&rt->dst);
1419                         rcu_assign_pointer(*porig, rt);
1420                         if (orig) {
1421                                 dst_dev_put(&orig->dst);
1422                                 dst_release(&orig->dst);
1423                         }
1424                         ret = true;
1425                 }
1426
1427                 fnhe->fnhe_stamp = jiffies;
1428         }
1429         spin_unlock_bh(&fnhe_lock);
1430
1431         return ret;
1432 }
1433
1434 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1435 {
1436         struct rtable *orig, *prev, **p;
1437         bool ret = true;
1438
1439         if (rt_is_input_route(rt)) {
1440                 p = (struct rtable **)&nh->nh_rth_input;
1441         } else {
1442                 p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1443         }
1444         orig = *p;
1445
1446         /* hold dst before doing cmpxchg() to avoid race condition
1447          * on this dst
1448          */
1449         dst_hold(&rt->dst);
1450         prev = cmpxchg(p, orig, rt);
1451         if (prev == orig) {
1452                 if (orig) {
1453                         dst_dev_put(&orig->dst);
1454                         dst_release(&orig->dst);
1455                 }
1456         } else {
1457                 dst_release(&rt->dst);
1458                 ret = false;
1459         }
1460
1461         return ret;
1462 }
1463
1464 struct uncached_list {
1465         spinlock_t              lock;
1466         struct list_head        head;
1467 };
1468
1469 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1470
1471 static void rt_add_uncached_list(struct rtable *rt)
1472 {
1473         struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1474
1475         rt->rt_uncached_list = ul;
1476
1477         spin_lock_bh(&ul->lock);
1478         list_add_tail(&rt->rt_uncached, &ul->head);
1479         spin_unlock_bh(&ul->lock);
1480 }
1481
1482 static void ipv4_dst_destroy(struct dst_entry *dst)
1483 {
1484         struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
1485         struct rtable *rt = (struct rtable *) dst;
1486
1487         if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
1488                 kfree(p);
1489
1490         if (!list_empty(&rt->rt_uncached)) {
1491                 struct uncached_list *ul = rt->rt_uncached_list;
1492
1493                 spin_lock_bh(&ul->lock);
1494                 list_del(&rt->rt_uncached);
1495                 spin_unlock_bh(&ul->lock);
1496         }
1497 }
1498
1499 void rt_flush_dev(struct net_device *dev)
1500 {
1501         struct net *net = dev_net(dev);
1502         struct rtable *rt;
1503         int cpu;
1504
1505         for_each_possible_cpu(cpu) {
1506                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1507
1508                 spin_lock_bh(&ul->lock);
1509                 list_for_each_entry(rt, &ul->head, rt_uncached) {
1510                         if (rt->dst.dev != dev)
1511                                 continue;
1512                         rt->dst.dev = net->loopback_dev;
1513                         dev_hold(rt->dst.dev);
1514                         dev_put(dev);
1515                 }
1516                 spin_unlock_bh(&ul->lock);
1517         }
1518 }
1519
1520 static bool rt_cache_valid(const struct rtable *rt)
1521 {
1522         return  rt &&
1523                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1524                 !rt_is_expired(rt);
1525 }
1526
1527 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1528                            const struct fib_result *res,
1529                            struct fib_nh_exception *fnhe,
1530                            struct fib_info *fi, u16 type, u32 itag,
1531                            const bool do_cache)
1532 {
1533         bool cached = false;
1534
1535         if (fi) {
1536                 struct fib_nh *nh = &FIB_RES_NH(*res);
1537
1538                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1539                         rt->rt_gateway = nh->nh_gw;
1540                         rt->rt_uses_gateway = 1;
1541                 }
1542                 dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
1543                 if (fi->fib_metrics != &dst_default_metrics) {
1544                         rt->dst._metrics |= DST_METRICS_REFCOUNTED;
1545                         refcount_inc(&fi->fib_metrics->refcnt);
1546                 }
1547 #ifdef CONFIG_IP_ROUTE_CLASSID
1548                 rt->dst.tclassid = nh->nh_tclassid;
1549 #endif
1550                 rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1551                 if (unlikely(fnhe))
1552                         cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1553                 else if (do_cache)
1554                         cached = rt_cache_route(nh, rt);
1555                 if (unlikely(!cached)) {
1556                         /* Routes we intend to cache in nexthop exception or
1557                          * FIB nexthop have the DST_NOCACHE bit clear.
1558                          * However, if we are unsuccessful at storing this
1559                          * route into the cache we really need to set it.
1560                          */
1561                         if (!rt->rt_gateway)
1562                                 rt->rt_gateway = daddr;
1563                         rt_add_uncached_list(rt);
1564                 }
1565         } else
1566                 rt_add_uncached_list(rt);
1567
1568 #ifdef CONFIG_IP_ROUTE_CLASSID
1569 #ifdef CONFIG_IP_MULTIPLE_TABLES
1570         set_class_tag(rt, res->tclassid);
1571 #endif
1572         set_class_tag(rt, itag);
1573 #endif
1574 }
1575
1576 struct rtable *rt_dst_alloc(struct net_device *dev,
1577                             unsigned int flags, u16 type,
1578                             bool nopolicy, bool noxfrm, bool will_cache)
1579 {
1580         struct rtable *rt;
1581
1582         rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1583                        (will_cache ? 0 : DST_HOST) |
1584                        (nopolicy ? DST_NOPOLICY : 0) |
1585                        (noxfrm ? DST_NOXFRM : 0));
1586
1587         if (rt) {
1588                 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1589                 rt->rt_flags = flags;
1590                 rt->rt_type = type;
1591                 rt->rt_is_input = 0;
1592                 rt->rt_iif = 0;
1593                 rt->rt_pmtu = 0;
1594                 rt->rt_mtu_locked = 0;
1595                 rt->rt_gateway = 0;
1596                 rt->rt_uses_gateway = 0;
1597                 rt->rt_table_id = 0;
1598                 INIT_LIST_HEAD(&rt->rt_uncached);
1599
1600                 rt->dst.output = ip_output;
1601                 if (flags & RTCF_LOCAL)
1602                         rt->dst.input = ip_local_deliver;
1603         }
1604
1605         return rt;
1606 }
1607 EXPORT_SYMBOL(rt_dst_alloc);
1608
1609 /* called in rcu_read_lock() section */
1610 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1611                           u8 tos, struct net_device *dev,
1612                           struct in_device *in_dev, u32 *itag)
1613 {
1614         int err;
1615
1616         /* Primary sanity checks. */
1617         if (!in_dev)
1618                 return -EINVAL;
1619
1620         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1621             skb->protocol != htons(ETH_P_IP))
1622                 return -EINVAL;
1623
1624         if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1625                 return -EINVAL;
1626
1627         if (ipv4_is_zeronet(saddr)) {
1628                 if (!ipv4_is_local_multicast(daddr))
1629                         return -EINVAL;
1630         } else {
1631                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1632                                           in_dev, itag);
1633                 if (err < 0)
1634                         return err;
1635         }
1636         return 0;
1637 }
1638
1639 /* called in rcu_read_lock() section */
1640 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1641                              u8 tos, struct net_device *dev, int our)
1642 {
1643         struct in_device *in_dev = __in_dev_get_rcu(dev);
1644         unsigned int flags = RTCF_MULTICAST;
1645         struct rtable *rth;
1646         u32 itag = 0;
1647         int err;
1648
1649         err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1650         if (err)
1651                 return err;
1652
1653         if (our)
1654                 flags |= RTCF_LOCAL;
1655
1656         rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1657                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1658         if (!rth)
1659                 return -ENOBUFS;
1660
1661 #ifdef CONFIG_IP_ROUTE_CLASSID
1662         rth->dst.tclassid = itag;
1663 #endif
1664         rth->dst.output = ip_rt_bug;
1665         rth->rt_is_input= 1;
1666
1667 #ifdef CONFIG_IP_MROUTE
1668         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1669                 rth->dst.input = ip_mr_input;
1670 #endif
1671         RT_CACHE_STAT_INC(in_slow_mc);
1672
1673         skb_dst_set(skb, &rth->dst);
1674         return 0;
1675 }
1676
1677
     /*
      * ip_handle_martian_source - account for and optionally log a packet whose
      * source address was rejected.
      * @dev:    device the packet arrived on (name and header length are logged)
      * @in_dev: in_device consulted for the log_martians setting
      * @skb:    offending packet; its MAC header is dumped when available
      * @daddr:  destination address of the packet
      * @saddr:  source address of the packet
      *
      * The in_martian_src counter is always incremented.  Logging only exists
      * when CONFIG_IP_ROUTE_VERBOSE is set; without it @in_dev, @skb, @daddr
      * and @saddr are unused.
      */
1678 static void ip_handle_martian_source(struct net_device *dev,
1679                                      struct in_device *in_dev,
1680                                      struct sk_buff *skb,
1681                                      __be32 daddr,
1682                                      __be32 saddr)
1683 {
1684         RT_CACHE_STAT_INC(in_martian_src);
1685 #ifdef CONFIG_IP_ROUTE_VERBOSE
             /* Log only if the device asks for it and the global rate limit allows. */
1686         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1687                 /*
1688                  *      RFC1812 recommendation, if source is martian,
1689                  *      the only hint is MAC header.
1690                  */
                     /*
                      * NOTE(review): the format prints @daddr after "martian source"
                      * and @saddr after "from"; confirm this ordering is intended.
                      */
1691                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1692                         &daddr, &saddr, dev->name);
                     /* Dump the link-layer header only if the device has one and it was recorded. */
1693                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1694                         print_hex_dump(KERN_WARNING, "ll header: ",
1695                                        DUMP_PREFIX_OFFSET, 16, 1,
1696                                        skb_mac_header(skb),
1697                                        dev->hard_header_len, true);
1698                 }
1699         }
1700 #endif
1701 }
1702
     /*
      * set_lwt_redirect - hook lightweight-tunnel handlers into a route.
      * @rth: route whose dst.lwtstate decides which handlers are replaced
      *
      * For each direction that the lwtstate asks to redirect, the handler
      * currently installed in the dst is saved in the lwtstate (orig_output /
      * orig_input) and replaced by lwtunnel_output / lwtunnel_input.  The
      * saved value is whatever the dst holds at call time, so the callers in
      * this file invoke it only after rt_set_nexthop() and after assigning
      * their own dst handlers.
      */
1703 static void set_lwt_redirect(struct rtable *rth)
1704 {
1705         if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
1706                 rth->dst.lwtstate->orig_output = rth->dst.output;
1707                 rth->dst.output = lwtunnel_output;
1708         }
1709
1710         if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1711                 rth->dst.lwtstate->orig_input = rth->dst.input;
1712                 rth->dst.input = lwtunnel_input;
1713         }
1714 }
1715
1716 /* called in rcu_read_lock() section */
     /*
      * __mkroute_input - attach a forwarding input route to @skb.
      * @skb:    packet being routed
      * @res:    FIB lookup result describing the selected nexthop
      * @in_dev: in_device of the receiving interface
      * @daddr:  destination address; also the key for nexthop exceptions
      * @saddr:  source address, checked by fib_validate_source()
      * @tos:    TOS value forwarded to source validation
      *
      * Reuses a still-valid cached input route from the nexthop (or from a
      * matching nexthop exception) when caching is allowed; otherwise
      * allocates a new rtable whose input handler is ip_forward.
      *
      * Returns 0 on success, -EINVAL when the output device has no in_device
      * or when a non-IP packet would leave on the interface it arrived on
      * without proxy_arp_pvlan, the negative result of fib_validate_source(),
      * or -ENOBUFS on allocation failure.
      */
1717 static int __mkroute_input(struct sk_buff *skb,
1718                            const struct fib_result *res,
1719                            struct in_device *in_dev,
1720                            __be32 daddr, __be32 saddr, u32 tos)
1721 {
1722         struct fib_nh_exception *fnhe;
1723         struct rtable *rth;
1724         int err;
1725         struct in_device *out_dev;
1726         bool do_cache;
1727         u32 itag = 0;
1728
1729         /* get a working reference to the output device */
1730         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1731         if (!out_dev) {
1732                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1733                 return -EINVAL;
1734         }
1735
1736         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1737                                   in_dev->dev, in_dev, &itag);
1738         if (err < 0) {
1739                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1740                                          saddr);
1741
1742                 goto cleanup;
1743         }
1744
             /* Cache only when the lookup produced a fib_info and validation set no tag. */
1745         do_cache = res->fi && !itag;
             /*
              * err is now the non-negative result of fib_validate_source().  A
              * non-zero value, for an IP packet that would leave on the interface
              * it arrived on, flags the skb with IPSKB_DOREDIRECT provided the
              * device allows redirects and is shared media or the gateway is
              * on-link for @saddr (presumably consumed later to emit an ICMP
              * redirect - confirm in the forwarding path).
              */
1746         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1747             skb->protocol == htons(ETH_P_IP) &&
1748             (IN_DEV_SHARED_MEDIA(out_dev) ||
1749              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1750                 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1751
1752         if (skb->protocol != htons(ETH_P_IP)) {
1753                 /* Not IP (i.e. ARP). Do not create route, if it is
1754                  * invalid for proxy arp. DNAT routes are always valid.
1755                  *
1756                  * Proxy arp feature have been extended to allow, ARP
1757                  * replies back to the same interface, to support
1758                  * Private VLAN switch technologies. See arp.c.
1759                  */
1760                 if (out_dev == in_dev &&
1761                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1762                         err = -EINVAL;
1763                         goto cleanup;
1764                 }
1765         }
1766
             /*
              * The exception is looked up even when caching is off because it is
              * also handed to rt_set_nexthop() below.
              */
1767         fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1768         if (do_cache) {
                     /* An exception's cached route takes precedence over the nexthop's. */
1769                 if (fnhe)
1770                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1771                 else
1772                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1773                 if (rt_cache_valid(rth)) {
1774                         skb_dst_set_noref(skb, &rth->dst);
1775                         goto out;
1776                 }
1777         }
1778
1779         rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1780                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1781                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1782         if (!rth) {
1783                 err = -ENOBUFS;
1784                 goto cleanup;
1785         }
1786
1787         rth->rt_is_input = 1;
1788         if (res->table)
1789                 rth->rt_table_id = res->table->tb_id;
1790         RT_CACHE_STAT_INC(in_slow_tot);
1791
1792         rth->dst.input = ip_forward;
1793
             /* set_lwt_redirect() must see the final handlers, so it runs last. */
1794         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1795                        do_cache);
1796         set_lwt_redirect(rth);
1797         skb_dst_set(skb, &rth->dst);
1798 out:
             /* err may still hold a positive fib_validate_source() result; report success. */
1799         err = 0;
1800  cleanup:
1801         return err;
1802 }
1803
1804 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1805 /* To make ICMP packets follow the right flow, the multipath hash is
1806  * calculated from the inner IP addresses.
1807  */
     /*
      * ip_multipath_l3_keys - fill the IPv4 address keys used for L3 hashing.
      * @skb:       packet whose IP header is read through ip_hdr()
      * @hash_keys: flow keys; only addrs.v4addrs.src/dst are written here
      *
      * The outer source/destination addresses are stored first.  They are
      * replaced by the addresses of the IP header embedded in the ICMP
      * payload only when the packet is ICMP, has a zero fragment offset, and
      * its type is one of DEST_UNREACH, REDIRECT, TIME_EXCEEDED or
      * PARAMETERPROB.  If either header cannot be obtained through
      * skb_header_pointer(), the outer addresses are kept.
      */
1808 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1809                                  struct flow_keys *hash_keys)
1810 {
1811         const struct iphdr *outer_iph = ip_hdr(skb);
1812         const struct iphdr *inner_iph;
1813         const struct icmphdr *icmph;
1814         struct iphdr _inner_iph;
1815         struct icmphdr _icmph;
1816
1817         hash_keys->addrs.v4addrs.src = outer_iph->saddr;
1818         hash_keys->addrs.v4addrs.dst = outer_iph->daddr;
1819         if (likely(outer_iph->protocol != IPPROTO_ICMP))
1820                 return;
1821
             /* A non-zero fragment offset means the ICMP header is not in this packet. */
1822         if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1823                 return;
1824
             /* The ICMP header starts right after the outer header (ihl is in 32-bit words). */
1825         icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1826                                    &_icmph);
1827         if (!icmph)
1828                 return;
1829
1830         if (icmph->type != ICMP_DEST_UNREACH &&
1831             icmph->type != ICMP_REDIRECT &&
1832             icmph->type != ICMP_TIME_EXCEEDED &&
1833             icmph->type != ICMP_PARAMETERPROB)
1834                 return;
1835
             /* The embedded IP header follows the ICMP header. */
1836         inner_iph = skb_header_pointer(skb,
1837                                        outer_iph->ihl * 4 + sizeof(_icmph),
1838                                        sizeof(_inner_iph), &_inner_iph);
1839         if (!inner_iph)
1840                 return;
1841         hash_keys->addrs.v4addrs.src = inner_iph->saddr;
1842         hash_keys->addrs.v4addrs.dst = inner_iph->daddr;
1843 }
1844
1845 /* if skb is set it will be used and fl4 can be NULL */
     /*
      * fib_multipath_hash - compute the multipath selection hash for a flow.
      * @fi:  fib_info whose namespace supplies the hash policy sysctl
      * @fl4: flow description, read only when @skb is NULL
      * @skb: packet to hash, or NULL
      *
      * Policy 0 hashes source and destination addresses only (for an skb the
      * addresses come from ip_multipath_l3_keys()).  Policy 1 additionally
      * hashes ports and IP protocol; for an skb that already carries an L4
      * hash, that hash is reused instead of dissecting the packet.
      *
      * Returns the hash shifted right by one bit.
      *
      * NOTE(review): the switch has no default case, so hash_keys would be
      * read uninitialised for any other policy value - presumably the sysctl
      * is range-limited to 0..1; confirm against its handler.
      */
1846 int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
1847                        const struct sk_buff *skb)
1848 {
1849         struct net *net = fi->fib_net;
1850         struct flow_keys hash_keys;
1851         u32 mhash;
1852
1853         switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1854         case 0:
                     /* Layer 3: addresses only. */
1855                 memset(&hash_keys, 0, sizeof(hash_keys));
1856                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1857                 if (skb) {
1858                         ip_multipath_l3_keys(skb, &hash_keys);
1859                 } else {
1860                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1861                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1862                 }
1863                 break;
1864         case 1:
                     /* Layer 4: addresses, ports and protocol. */
1865                 /* skb is currently provided only when forwarding */
1866                 if (skb) {
1867                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1868                         struct flow_keys keys;
1869
1870                         /* short-circuit if we already have L4 hash present */
1871                         if (skb->l4_hash)
1872                                 return skb_get_hash_raw(skb) >> 1;
1873                         memset(&hash_keys, 0, sizeof(hash_keys));
1874                         skb_flow_dissect_flow_keys(skb, &keys, flag);
1875
                             /* Copy only the fields that take part in the hash. */
1876                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1877                         hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1878                         hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1879                         hash_keys.ports.src = keys.ports.src;
1880                         hash_keys.ports.dst = keys.ports.dst;
1881                         hash_keys.basic.ip_proto = keys.basic.ip_proto;
1882                 } else {
1883                         memset(&hash_keys, 0, sizeof(hash_keys));
1884                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1885                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1886                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1887                         hash_keys.ports.src = fl4->fl4_sport;
1888                         hash_keys.ports.dst = fl4->fl4_dport;
1889                         hash_keys.basic.ip_proto = fl4->flowi4_proto;
1890                 }
1891                 break;
1892         }
1893         mhash = flow_hash_from_keys(&hash_keys);
1894
             /* Same one-bit shift as the L4-hash shortcut above. */
1895         return mhash >> 1;
1896 }
1897 EXPORT_SYMBOL_GPL(fib_multipath_hash);
1898 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
1899
     /*
      * ip_mkroute_input - choose a nexthop if needed, then build the input route.
      * @skb:    packet being routed
      * @res:    FIB lookup result; handed to fib_select_multipath() when the
      *          fib_info has more than one nexthop (fib_nhs > 1)
      * @in_dev: in_device of the receiving interface
      * @daddr:  destination address
      * @saddr:  source address
      * @tos:    TOS value
      *
      * Without CONFIG_IP_ROUTE_MULTIPATH this is a plain call to
      * __mkroute_input().  Returns whatever __mkroute_input() returns.
      */
1900 static int ip_mkroute_input(struct sk_buff *skb,
1901                             struct fib_result *res,
1902                             struct in_device *in_dev,
1903                             __be32 daddr, __be32 saddr, u32 tos)
1904 {
1905 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1906         if (res->fi && res->fi->fib_nhs > 1) {
                     /* fl4 is NULL: the hash is derived from the packet itself. */
1907                 int h = fib_multipath_hash(res->fi, NULL, skb);
1908
1909                 fib_select_multipath(res, h);
1910         }
1911 #endif
1912
1913         /* create a routing cache entry */
1914         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1915 }
1916
1917 /*
1918  *      NOTE. We drop all the packets that has local source
1919  *      addresses, because every properly looped back packet
1920  *      must have correct destination already attached by output routine.
1921  *
1922  *      Such approach solves two big problems:
1923  *      1. Not simplex devices are handled properly.
1924  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1925  *      called with rcu_read_lock()
1926  */
1927
     /*
      * ip_route_input_slow - resolve the input route for a non-multicast packet.
      * @skb:   packet being routed; any dst it carries is dropped first
      * @daddr: destination address
      * @saddr: source address
      * @tos:   TOS value used for the FIB lookup and source validation
      * @dev:   device the packet arrived on
      * @res:   FIB result; reset here and filled by fib_lookup()
      *
      * Control flow is label driven:
      *   - unicast forwarding goes through ip_mkroute_input();
      *   - brd_input validates a broadcast and falls into local_input;
      *   - local_input builds (or reuses) a locally delivered route;
      *   - no_route jumps to local_input with RTN_UNREACHABLE, so an ip_error
      *     route is attached and 0 is returned;
      *   - the martian and e_* labels only set the error and leave.
      *
      * Returns 0 when a dst was attached to @skb, -EINVAL for a missing
      * in_device, martian addresses or a non-IP broadcast, -ENOBUFS on
      * allocation failure, a negative fib_validate_source() result, or the
      * result of ip_mkroute_input().
      */
1928 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1929                                u8 tos, struct net_device *dev,
1930                                struct fib_result *res)
1931 {
1932         struct in_device *in_dev = __in_dev_get_rcu(dev);
1933         struct ip_tunnel_info *tun_info;
1934         struct flowi4   fl4;
1935         unsigned int    flags = 0;
1936         u32             itag = 0;
1937         struct rtable   *rth;
1938         int             err = -EINVAL;
1939         struct net    *net = dev_net(dev);
1940         bool do_cache;
1941
1942         /* IP on this device is disabled. */
1943
1944         if (!in_dev)
1945                 goto out;
1946
1947         /* Check for the most weird martians, which can be not detected
1948            by fib_lookup.
1949          */
1950
             /*
              * Seed the flow's tunnel id from tunnel metadata that is not marked
              * IP_TUNNEL_INFO_TX.  This is read before skb_dst_drop() (presumably
              * because the metadata hangs off the skb's dst - confirm).
              */
1951         tun_info = skb_tunnel_info(skb);
1952         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1953                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1954         else
1955                 fl4.flowi4_tun_key.tun_id = 0;
1956         skb_dst_drop(skb);
1957
1958         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1959                 goto martian_source;
1960
             /* Clear the result: brd_input/local_input may run without a lookup. */
1961         res->fi = NULL;
1962         res->table = NULL;
1963         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1964                 goto brd_input;
1965
1966         /* Accept zero addresses only to limited broadcast;
1967          * I even do not know to fix it or not. Waiting for complains :-)
1968          */
1969         if (ipv4_is_zeronet(saddr))
1970                 goto martian_source;
1971
1972         if (ipv4_is_zeronet(daddr))
1973                 goto martian_destination;
1974
1975         /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1976          * and call it once if daddr or/and saddr are loopback addresses
1977          */
1978         if (ipv4_is_loopback(daddr)) {
1979                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1980                         goto martian_destination;
1981         } else if (ipv4_is_loopback(saddr)) {
1982                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1983                         goto martian_source;
1984         }
1985
1986         /*
1987          *      Now we are ready to route packet.
1988          */
1989         fl4.flowi4_oif = 0;
1990         fl4.flowi4_iif = dev->ifindex;
1991         fl4.flowi4_mark = skb->mark;
1992         fl4.flowi4_tos = tos;
1993         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1994         fl4.flowi4_flags = 0;
1995         fl4.daddr = daddr;
1996         fl4.saddr = saddr;
1997         fl4.flowi4_uid = sock_net_uid(net, NULL);
1998         err = fib_lookup(net, &fl4, res, 0);
1999         if (err != 0) {
                     /*
                      * Keep the lookup error unless forwarding is off; the value is
                      * negated into dst.error on the RTN_UNREACHABLE path below.
                      */
2000                 if (!IN_DEV_FORWARD(in_dev))
2001                         err = -EHOSTUNREACH;
2002                 goto no_route;
2003         }
2004
2005         if (res->type == RTN_BROADCAST)
2006                 goto brd_input;
2007
2008         if (res->type == RTN_LOCAL) {
2009                 err = fib_validate_source(skb, saddr, daddr, tos,
2010                                           0, dev, in_dev, &itag);
2011                 if (err < 0)
2012                         goto martian_source;
2013                 goto local_input;
2014         }
2015
             /* Everything below is forwarding, which must be enabled on the device. */
2016         if (!IN_DEV_FORWARD(in_dev)) {
2017                 err = -EHOSTUNREACH;
2018                 goto no_route;
2019         }
2020         if (res->type != RTN_UNICAST)
2021                 goto martian_destination;
2022
2023         err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2024 out:    return err;
2025
2026 brd_input:
             /* Only IP packets may take the broadcast path. */
2027         if (skb->protocol != htons(ETH_P_IP))
2028                 goto e_inval;
2029
             /* A zeronet source skips validation; anything else must validate. */
2030         if (!ipv4_is_zeronet(saddr)) {
2031                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2032                                           in_dev, &itag);
2033                 if (err < 0)
2034                         goto martian_source;
2035         }
2036         flags |= RTCF_BROADCAST;
2037         res->type = RTN_BROADCAST;
2038         RT_CACHE_STAT_INC(in_brd);
2039
2040 local_input:
             /*
              * Shared tail for local, broadcast and unreachable results.  The
              * nexthop's cached input route is usable only when there is a
              * fib_info and source validation set no tag.
              */
2041         do_cache = false;
2042         if (res->fi) {
2043                 if (!itag) {
2044                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
2045                         if (rt_cache_valid(rth)) {
2046                                 skb_dst_set_noref(skb, &rth->dst);
2047                                 err = 0;
2048                                 goto out;
2049                         }
2050                         do_cache = true;
2051                 }
2052         }
2053
             /* Allocate on the L3 master device if @dev has one, else on loopback. */
2054         rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2055                            flags | RTCF_LOCAL, res->type,
2056                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2057         if (!rth)
2058                 goto e_nobufs;
2059
2060         rth->dst.output= ip_rt_bug;
2061 #ifdef CONFIG_IP_ROUTE_CLASSID
2062         rth->dst.tclassid = itag;
2063 #endif
2064         rth->rt_is_input = 1;
2065         if (res->table)
2066                 rth->rt_table_id = res->table->tb_id;
2067
2068         RT_CACHE_STAT_INC(in_slow_tot);
             /*
              * Reached through no_route: err still holds the negative lookup
              * error, stored here as a positive dst.error for ip_error.
              */
2069         if (res->type == RTN_UNREACHABLE) {
2070                 rth->dst.input= ip_error;
2071                 rth->dst.error= -err;
2072                 rth->rt_flags   &= ~RTCF_LOCAL;
2073         }
2074
2075         if (do_cache) {
2076                 struct fib_nh *nh = &FIB_RES_NH(*res);
2077
                     /* Same redirect pattern as set_lwt_redirect(), input side only. */
2078                 rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2079                 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2080                         WARN_ON(rth->dst.input == lwtunnel_input);
2081                         rth->dst.lwtstate->orig_input = rth->dst.input;
2082                         rth->dst.input = lwtunnel_input;
2083                 }
2084
                     /* If the nexthop cache does not take the route, track it as uncached. */
2085                 if (unlikely(!rt_cache_route(nh, rth)))
2086                         rt_add_uncached_list(rth);
2087         }
2088         skb_dst_set(skb, &rth->dst);
2089         err = 0;
2090         goto out;
2091
2092 no_route:
             /*
              * Lookup failed or forwarding is disabled: clear the result and let
              * local_input attach an ip_error route for RTN_UNREACHABLE.
              */
2093         RT_CACHE_STAT_INC(in_no_route);
2094         res->type = RTN_UNREACHABLE;
2095         res->fi = NULL;
2096         res->table = NULL;
2097         goto local_input;
2098
2099         /*
2100          *      Do not cache martian addresses: they should be logged (RFC1812)
2101          */
2102 martian_destination:
2103         RT_CACHE_STAT_INC(in_martian_dst);
2104 #ifdef CONFIG_IP_ROUTE_VERBOSE
2105         if (IN_DEV_LOG_MARTIANS(in_dev))
2106                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2107                                      &daddr, &saddr, dev->name);
2108 #endif
2109
             /* martian_destination falls through to here. */
2110 e_inval:
2111         err = -EINVAL;
2112         goto out;
2113
2114 e_nobufs:
2115         err = -ENOBUFS;
2116         goto out;
2117
2118 martian_source:
             /*
              * err is left as set by the failing check: the initial -EINVAL or a
              * negative fib_validate_source() result.
              */
2119         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2120         goto out;
2121 }
2122
     /*
      * ip_route_input_noref - resolve the input route for a received packet.
      * @skb:   packet being routed
      * @daddr: destination address
      * @saddr: source address
      * @tos:   TOS value; masked with IPTOS_RT_MASK before use
      * @dev:   device the packet arrived on
      *
      * Wraps ip_route_input_rcu() in its own rcu_read_lock() section.  The
      * fib_result is a local scratch variable and is not returned to the
      * caller.  Returns whatever ip_route_input_rcu() returns.
      */
2123 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2124                          u8 tos, struct net_device *dev)
2125 {
2126         struct fib_result res;
2127         int err;
2128
2129         tos &= IPTOS_RT_MASK;
2130         rcu_read_lock();
2131         err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2132         rcu_read_unlock();
2133
2134         return err;
2135 }
2136 EXPORT_SYMBOL(ip_route_input_noref);
2137
2138 /* called with rcu_read_lock held */
     /*
      * ip_route_input_rcu - dispatch input routing by destination class.
      * @skb:   packet being routed
      * @daddr: destination address
      * @saddr: source address
      * @tos:   TOS value
      * @dev:   device the packet arrived on
      * @res:   FIB result handed to ip_route_input_slow(); not touched on the
      *         multicast path
      *
      * Multicast destinations are handled here through ip_route_input_mc();
      * everything else goes to ip_route_input_slow().
      *
      * On the multicast path, returns -EINVAL when @dev has no in_device, or
      * when ip_check_mc_rcu() reports no match and (with CONFIG_IP_MROUTE)
      * multicast forwarding does not apply; otherwise returns the result of
      * ip_route_input_mc().
      */
2139 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2140                        u8 tos, struct net_device *dev, struct fib_result *res)
2141 {
2142         /* Multicast recognition logic is moved from route cache to here.
2143            The problem was that too many Ethernet cards have broken/missing
2144            hardware multicast filters :-( As result the host on multicasting
2145            network acquires a lot of useless route cache entries, sort of
2146            SDR messages from all the world. Now we try to get rid of them.
2147            Really, provided software IP multicast filter is organized
2148            reasonably (at least, hashed), it does not result in a slowdown
2149            comparing with route cache reject entries.
2150            Note, that multicast routers are not affected, because
2151            route cache entry is created eventually.
2152          */
2153         if (ipv4_is_multicast(daddr)) {
2154                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2155                 int our = 0;
2156                 int err = -EINVAL;
2157
2158                 if (!in_dev)
2159                         return err;
2160                 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2161                                       ip_hdr(skb)->protocol);
2162
2163                 /* check l3 master if no match yet */
2164                 if (!our && netif_is_l3_slave(dev)) {
2165                         struct in_device *l3_in_dev;
2166
                             /*
                              * NOTE(review): the second check uses skb->dev rather
                              * than @dev - presumably skb->dev is the L3 master
                              * here; confirm against the callers.
                              */
2167                         l3_in_dev = __in_dev_get_rcu(skb->dev);
2168                         if (l3_in_dev)
2169                                 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2170                                                       ip_hdr(skb)->protocol);
2171                 }
2172
                     /*
                      * Build a route if the group matched, or (CONFIG_IP_MROUTE) if
                      * the group is not local multicast and the device forwards
                      * multicast.  Otherwise err stays -EINVAL.
                      */
2173                 if (our
2174 #ifdef CONFIG_IP_MROUTE
2175                         ||
2176                     (!ipv4_is_local_multicast(daddr) &&
2177                      IN_DEV_MFORWARD(in_dev))
2178 #endif
2179                    ) {
2180                         err = ip_route_input_mc(skb, daddr, saddr,
2181                                                 tos, dev, our);
2182                 }
2183                 return err;
2184         }
2185
2186         return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2187 }
2188
2189 /* called with rcu_read_lock() */
     /*
      * __mkroute_output - return a cached output route or build a new one.
      * @res:      FIB lookup result (fib_info, type, table, prefix length)
      * @fl4:      flow being routed; saddr, daddr, proto and flags are read
      * @orig_oif: output interface index originally requested by the caller;
      *            stored in the new route's rt_iif
      * @dev_out:  device the route will use
      * @flags:    initial RTCF_* flags; more are added from the route type
      *
      * Returns a route pointer, or ERR_PTR(-EINVAL) when @dev_out has no
      * in_device, the source is a disallowed loopback address or the
      * destination is in the zero network, or ERR_PTR(-ENOBUFS) when
      * allocation fails.  A route taken from the cache is returned only after
      * dst_hold_safe() succeeded on it.
      */
2190 static struct rtable *__mkroute_output(const struct fib_result *res,
2191                                        const struct flowi4 *fl4, int orig_oif,
2192                                        struct net_device *dev_out,
2193                                        unsigned int flags)
2194 {
2195         struct fib_info *fi = res->fi;
2196         struct fib_nh_exception *fnhe;
2197         struct in_device *in_dev;
2198         u16 type = res->type;
2199         struct rtable *rth;
2200         bool do_cache;
2201
2202         in_dev = __in_dev_get_rcu(dev_out);
2203         if (!in_dev)
2204                 return ERR_PTR(-EINVAL);
2205
             /*
              * Unless route_localnet is set on the output device, a loopback
              * source is accepted only on a loopback or L3 master device.
              */
2206         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2207                 if (ipv4_is_loopback(fl4->saddr) &&
2208                     !(dev_out->flags & IFF_LOOPBACK) &&
2209                     !netif_is_l3_master(dev_out))
2210                         return ERR_PTR(-EINVAL);
2211
             /* The destination address class overrides the type from the lookup. */
2212         if (ipv4_is_lbcast(fl4->daddr))
2213                 type = RTN_BROADCAST;
2214         else if (ipv4_is_multicast(fl4->daddr))
2215                 type = RTN_MULTICAST;
2216         else if (ipv4_is_zeronet(fl4->daddr))
2217                 return ERR_PTR(-EINVAL);
2218
2219         if (dev_out->flags & IFF_LOOPBACK)
2220                 flags |= RTCF_LOCAL;
2221
2222         do_cache = true;
2223         if (type == RTN_BROADCAST) {
                     /* Broadcast routes ignore the fib_info, hence are never cached. */
2224                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2225                 fi = NULL;
2226         } else if (type == RTN_MULTICAST) {
                     /*
                      * RTCF_LOCAL is kept only if ip_check_mc_rcu() matches on the
                      * output device; such routes are not cached.
                      */
2227                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2228                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2229                                      fl4->flowi4_proto))
2230                         flags &= ~RTCF_LOCAL;
2231                 else
2232                         do_cache = false;
2233                 /* If multicast route do not exist use
2234                  * default one, but do not gateway in this case.
2235                  * Yes, it is hack.
2236                  */
2237                 if (fi && res->prefixlen < 4)
2238                         fi = NULL;
2239         } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2240                    (orig_oif != dev_out->ifindex)) {
2241                 /* For local routes that require a particular output interface
2242                  * we do not want to cache the result.  Caching the result
2243                  * causes incorrect behaviour when there are multiple source
2244                  * addresses on the interface, the end result being that if the
2245                  * intended recipient is waiting on that interface for the
2246                  * packet he won't receive it because it will be delivered on
2247                  * the loopback interface and the IP_PKTINFO ipi_ifindex will
2248                  * be set to the loopback interface as well.
2249                  */
2250                 do_cache = false;
2251         }
2252
2253         fnhe = NULL;
             /* Caching needs a fib_info to hang the route on. */
2254         do_cache &= fi != NULL;
2255         if (fi) {
2256                 struct rtable __rcu **prth;
2257                 struct fib_nh *nh = &FIB_RES_NH(*res);
2258
                     /*
                      * The exception is looked up even when not caching, because
                      * rt_set_nexthop() below receives it.
                      */
2259                 fnhe = find_exception(nh, fl4->daddr);
2260                 if (!do_cache)
2261                         goto add;
2262                 if (fnhe) {
2263                         prth = &fnhe->fnhe_rth_output;
2264                 } else {
                             /*
                              * With FLOWI_FLAG_KNOWN_NH, the per-CPU nexthop cache
                              * is used only for a link-scope gateway nexthop.
                              */
2265                         if (unlikely(fl4->flowi4_flags &
2266                                      FLOWI_FLAG_KNOWN_NH &&
2267                                      !(nh->nh_gw &&
2268                                        nh->nh_scope == RT_SCOPE_LINK))) {
2269                                 do_cache = false;
2270                                 goto add;
2271                         }
2272                         prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2273                 }
2274                 rth = rcu_dereference(*prth);
2275                 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2276                         return rth;
2277         }
2278
2279 add:
2280         rth = rt_dst_alloc(dev_out, flags, type,
2281                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2282                            IN_DEV_CONF_GET(in_dev, NOXFRM),
2283                            do_cache);
2284         if (!rth)
2285                 return ERR_PTR(-ENOBUFS);
2286
2287         rth->rt_iif = orig_oif;
2288         if (res->table)
2289                 rth->rt_table_id = res->table->tb_id;
2290
2291         RT_CACHE_STAT_INC(out_slow_tot);
2292
2293         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
                     /* Locally delivered broadcast/multicast on a real device uses ip_mc_output. */
2294                 if (flags & RTCF_LOCAL &&
2295                     !(dev_out->flags & IFF_LOOPBACK)) {
2296                         rth->dst.output = ip_mc_output;
2297                         RT_CACHE_STAT_INC(out_slow_mc);
2298                 }
2299 #ifdef CONFIG_IP_MROUTE
2300                 if (type == RTN_MULTICAST) {
2301                         if (IN_DEV_MFORWARD(in_dev) &&
2302                             !ipv4_is_local_multicast(fl4->daddr)) {
2303                                 rth->dst.input = ip_mr_input;
2304                                 rth->dst.output = ip_mc_output;
2305                         }
2306                 }
2307 #endif
2308         }
2309
             /* set_lwt_redirect() snapshots the handlers chosen above, so it runs last. */
2310         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2311         set_lwt_redirect(rth);
2312
2313         return rth;
2314 }
2315
2316 /*
2317  * Major route resolver routine.
2318  */
2319
     /*
      * ip_route_output_key_hash - resolve an output route for a flow.
      * @net: network namespace to route in
      * @fl4: flow description; flowi4_iif, flowi4_tos and flowi4_scope are
      *       overwritten here before the lookup
      * @skb: optional packet, passed through to the RCU variant
      *
      * Starts from an empty fib_result (RTN_UNSPEC, no fib_info, no table) and
      * calls ip_route_output_key_hash_rcu() inside its own rcu_read_lock()
      * section.  Returns whatever that function returns.
      */
2320 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2321                                         const struct sk_buff *skb)
2322 {
             /* Read the TOS before flowi4_tos is rewritten below. */
2323         __u8 tos = RT_FL_TOS(fl4);
2324         struct fib_result res = {
2325                 .type           = RTN_UNSPEC,
2326                 .fi             = NULL,
2327                 .table          = NULL,
2328                 .tclassid       = 0,
2329         };
2330         struct rtable *rth;
2331
2332         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2333         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
             /* RTO_ONLINK in the TOS restricts the lookup to link scope. */
2334         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2335                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2336
2337         rcu_read_lock();
2338         rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2339         rcu_read_unlock();
2340
2341         return rth;
2342 }
2343 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2344
2345 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2346                                             struct fib_result *res,
2347                                             const struct sk_buff *skb)
2348 {
2349         struct net_device *dev_out = NULL;
2350         int orig_oif = fl4->flowi4_oif;
2351         unsigned int flags = 0;
2352         struct rtable *rth;
2353         int err = -ENETUNREACH;
2354
2355         if (fl4->saddr) {
2356                 rth = ERR_PTR(-EINVAL);
2357                 if (ipv4_is_multicast(fl4->saddr) ||
2358                     ipv4_is_lbcast(fl4->saddr) ||
2359                     ipv4_is_zeronet(fl4->saddr))
2360                         goto out;
2361
2362                 /* I removed check for oif == dev_out->oif here.
2363                    It was wrong for two reasons:
2364                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2365                       is assigned to multiple interfaces.
2366                    2. Moreover, we are allowed to send packets with saddr
2367                       of another iface. --ANK
2368                  */
2369
2370                 if (fl4->flowi4_oif == 0 &&
2371                     (ipv4_is_multicast(fl4->daddr) ||
2372                      ipv4_is_lbcast(fl4->daddr))) {
2373                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2374                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2375                         if (!dev_out)
2376                                 goto out;
2377
2378                         /* Special hack: user can direct multicasts
2379                            and limited broadcast via necessary interface
2380                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2381                            This hack is not just for fun, it allows
2382                            vic,vat and friends to work.
2383                            They bind socket to loopback, set ttl to zero
2384                            and expect that it will work.
2385                            From the viewpoint of routing cache they are broken,
2386                            because we are not allowed to build multicast path
2387                            with loopback source addr (look, routing cache
2388                            cannot know, that ttl is zero, so that packet
2389                            will not leave this host and route is valid).
2390                            Luckily, this hack is good workaround.
2391                          */
2392
2393                         fl4->flowi4_oif = dev_out->ifindex;
2394                         goto make_route;
2395                 }
2396
2397                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2398                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2399                         if (!__ip_dev_find(net, fl4->saddr, false))
2400                                 goto out;
2401                 }
2402         }
2403
2404
2405         if (fl4->flowi4_oif) {
2406                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2407                 rth = ERR_PTR(-ENODEV);
2408                 if (!dev_out)
2409                         goto out;
2410
2411                 /* RACE: Check return value of inet_select_addr instead. */
2412                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2413                         rth = ERR_PTR(-ENETUNREACH);
2414                         goto out;
2415                 }
2416                 if (ipv4_is_local_multicast(fl4->daddr) ||
2417                     ipv4_is_lbcast(fl4->daddr) ||
2418                     fl4->flowi4_proto == IPPROTO_IGMP) {
2419                         if (!fl4->saddr)
2420                                 fl4->saddr = inet_select_addr(dev_out, 0,
2421                                                               RT_SCOPE_LINK);
2422                         goto make_route;
2423                 }
2424                 if (!fl4->saddr) {
2425                         if (ipv4_is_multicast(fl4->daddr))
2426                                 fl4->saddr = inet_select_addr(dev_out, 0,
2427                                                               fl4->flowi4_scope);
2428                         else if (!fl4->daddr)
2429                                 fl4->saddr = inet_select_addr(dev_out, 0,
2430                                                               RT_SCOPE_HOST);
2431                 }
2432         }
2433
2434         if (!fl4->daddr) {
2435                 fl4->daddr = fl4->saddr;
2436                 if (!fl4->daddr)
2437                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2438                 dev_out = net->loopback_dev;
2439                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2440                 res->type = RTN_LOCAL;
2441                 flags |= RTCF_LOCAL;
2442                 goto make_route;
2443         }
2444
2445         err = fib_lookup(net, fl4, res, 0);
2446         if (err) {
2447                 res->fi = NULL;
2448                 res->table = NULL;
2449                 if (fl4->flowi4_oif &&
2450                     (ipv4_is_multicast(fl4->daddr) ||
2451                     !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2452                         /* Apparently, routing tables are wrong. Assume,
2453                            that the destination is on link.
2454
2455                            WHY? DW.
2456                            Because we are allowed to send to iface
2457                            even if it has NO routes and NO assigned
2458                            addresses. When oif is specified, routing
2459                            tables are looked up with only one purpose:
2460                            to catch if destination is gatewayed, rather than
2461                            direct. Moreover, if MSG_DONTROUTE is set,
2462                            we send packet, ignoring both routing tables
2463                            and ifaddr state. --ANK
2464
2465
2466                            We could make it even if oif is unknown,
2467                            likely IPv6, but we do not.
2468                          */
2469
2470                         if (fl4->saddr == 0)
2471                                 fl4->saddr = inet_select_addr(dev_out, 0,
2472                                                               RT_SCOPE_LINK);
2473                         res->type = RTN_UNICAST;
2474                         goto make_route;
2475                 }
2476                 rth = ERR_PTR(err);
2477                 goto out;
2478         }
2479
2480         if (res->type == RTN_LOCAL) {
2481                 if (!fl4->saddr) {
2482                         if (res->fi->fib_prefsrc)
2483                                 fl4->saddr = res->fi->fib_prefsrc;
2484                         else
2485                                 fl4->saddr = fl4->daddr;
2486                 }
2487
2488                 /* L3 master device is the loopback for that domain */
2489                 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2490                         net->loopback_dev;
2491
2492                 /* make sure orig_oif points to fib result device even
2493                  * though packet rx/tx happens over loopback or l3mdev
2494                  */
2495                 orig_oif = FIB_RES_OIF(*res);
2496
2497                 fl4->flowi4_oif = dev_out->ifindex;
2498                 flags |= RTCF_LOCAL;
2499                 goto make_route;
2500         }
2501
2502         fib_select_path(net, res, fl4, skb);
2503
2504         dev_out = FIB_RES_DEV(*res);
2505         fl4->flowi4_oif = dev_out->ifindex;
2506
2507
2508 make_route:
2509         rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2510
2511 out:
2512         return rth;
2513 }
2514
2515 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2516 {
2517         return NULL;
2518 }
2519
2520 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2521 {
2522         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2523
2524         return mtu ? : dst->dev->mtu;
2525 }
2526
2527 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2528                                           struct sk_buff *skb, u32 mtu)
2529 {
2530 }
2531
/* ->redirect() hook of ipv4_dst_blackhole_ops: intentionally a no-op. */
static void ipv4_rt_blackhole_redirect(struct dst_entry *dst,
				       struct sock *sk,
				       struct sk_buff *skb)
{
}
2536
2537 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2538                                           unsigned long old)
2539 {
2540         return NULL;
2541 }
2542
2543 static struct dst_ops ipv4_dst_blackhole_ops = {
2544         .family                 =       AF_INET,
2545         .check                  =       ipv4_blackhole_dst_check,
2546         .mtu                    =       ipv4_blackhole_mtu,
2547         .default_advmss         =       ipv4_default_advmss,
2548         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2549         .redirect               =       ipv4_rt_blackhole_redirect,
2550         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2551         .neigh_lookup           =       ipv4_neigh_lookup,
2552 };
2553
2554 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2555 {
2556         struct rtable *ort = (struct rtable *) dst_orig;
2557         struct rtable *rt;
2558
2559         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2560         if (rt) {
2561                 struct dst_entry *new = &rt->dst;
2562
2563                 new->__use = 1;
2564                 new->input = dst_discard;
2565                 new->output = dst_discard_out;
2566
2567                 new->dev = net->loopback_dev;
2568                 if (new->dev)
2569                         dev_hold(new->dev);
2570
2571                 rt->rt_is_input = ort->rt_is_input;
2572                 rt->rt_iif = ort->rt_iif;
2573                 rt->rt_pmtu = ort->rt_pmtu;
2574                 rt->rt_mtu_locked = ort->rt_mtu_locked;
2575
2576                 rt->rt_genid = rt_genid_ipv4(net);
2577                 rt->rt_flags = ort->rt_flags;
2578                 rt->rt_type = ort->rt_type;
2579                 rt->rt_gateway = ort->rt_gateway;
2580                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2581
2582                 INIT_LIST_HEAD(&rt->rt_uncached);
2583         }
2584
2585         dst_release(dst_orig);
2586
2587         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2588 }
2589
2590 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2591                                     const struct sock *sk)
2592 {
2593         struct rtable *rt = __ip_route_output_key(net, flp4);
2594
2595         if (IS_ERR(rt))
2596                 return rt;
2597
2598         if (flp4->flowi4_proto)
2599                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2600                                                         flowi4_to_flowi(flp4),
2601                                                         sk, 0);
2602
2603         return rt;
2604 }
2605 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2606
2607 /* called with rcu_read_lock held */
2608 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
2609                         struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2610                         u32 seq)
2611 {
2612         struct rtable *rt = skb_rtable(skb);
2613         struct rtmsg *r;
2614         struct nlmsghdr *nlh;
2615         unsigned long expires = 0;
2616         u32 error;
2617         u32 metrics[RTAX_MAX];
2618
2619         nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
2620         if (!nlh)
2621                 return -EMSGSIZE;
2622
2623         r = nlmsg_data(nlh);
2624         r->rtm_family    = AF_INET;
2625         r->rtm_dst_len  = 32;
2626         r->rtm_src_len  = 0;
2627         r->rtm_tos      = fl4->flowi4_tos;
2628         r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2629         if (nla_put_u32(skb, RTA_TABLE, table_id))
2630                 goto nla_put_failure;
2631         r->rtm_type     = rt->rt_type;
2632         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2633         r->rtm_protocol = RTPROT_UNSPEC;
2634         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2635         if (rt->rt_flags & RTCF_NOTIFY)
2636                 r->rtm_flags |= RTM_F_NOTIFY;
2637         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2638                 r->rtm_flags |= RTCF_DOREDIRECT;
2639
2640         if (nla_put_in_addr(skb, RTA_DST, dst))
2641                 goto nla_put_failure;
2642         if (src) {
2643                 r->rtm_src_len = 32;
2644                 if (nla_put_in_addr(skb, RTA_SRC, src))
2645                         goto nla_put_failure;
2646         }
2647         if (rt->dst.dev &&
2648             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2649                 goto nla_put_failure;
2650 #ifdef CONFIG_IP_ROUTE_CLASSID
2651         if (rt->dst.tclassid &&
2652             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2653                 goto nla_put_failure;
2654 #endif
2655         if (!rt_is_input_route(rt) &&
2656             fl4->saddr != src) {
2657                 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2658                         goto nla_put_failure;
2659         }
2660         if (rt->rt_uses_gateway &&
2661             nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2662                 goto nla_put_failure;
2663
2664         expires = rt->dst.expires;
2665         if (expires) {
2666                 unsigned long now = jiffies;
2667
2668                 if (time_before(now, expires))
2669                         expires -= now;
2670                 else
2671                         expires = 0;
2672         }
2673
2674         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2675         if (rt->rt_pmtu && expires)
2676                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2677         if (rt->rt_mtu_locked && expires)
2678                 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2679         if (rtnetlink_put_metrics(skb, metrics) < 0)
2680                 goto nla_put_failure;
2681
2682         if (fl4->flowi4_mark &&
2683             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2684                 goto nla_put_failure;
2685
2686         if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2687             nla_put_u32(skb, RTA_UID,
2688                         from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2689                 goto nla_put_failure;
2690
2691         error = rt->dst.error;
2692
2693         if (rt_is_input_route(rt)) {
2694 #ifdef CONFIG_IP_MROUTE
2695                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2696                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2697                         int err = ipmr_get_route(net, skb,
2698                                                  fl4->saddr, fl4->daddr,
2699                                                  r, portid);
2700
2701                         if (err <= 0) {
2702                                 if (err == 0)
2703                                         return 0;
2704                                 goto nla_put_failure;
2705                         }
2706                 } else
2707 #endif
2708                         if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2709                                 goto nla_put_failure;
2710         }
2711
2712         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2713                 goto nla_put_failure;
2714
2715         nlmsg_end(skb, nlh);
2716         return 0;
2717
2718 nla_put_failure:
2719         nlmsg_cancel(skb, nlh);
2720         return -EMSGSIZE;
2721 }
2722
2723 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2724                              struct netlink_ext_ack *extack)
2725 {
2726         struct net *net = sock_net(in_skb->sk);
2727         struct rtmsg *rtm;
2728         struct nlattr *tb[RTA_MAX+1];
2729         struct fib_result res = {};
2730         struct rtable *rt = NULL;
2731         struct flowi4 fl4;
2732         __be32 dst = 0;
2733         __be32 src = 0;
2734         u32 iif;
2735         int err;
2736         int mark;
2737         struct sk_buff *skb;
2738         u32 table_id = RT_TABLE_MAIN;
2739         kuid_t uid;
2740
2741         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
2742                           extack);
2743         if (err < 0)
2744                 goto errout;
2745
2746         rtm = nlmsg_data(nlh);
2747
2748         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2749         if (!skb) {
2750                 err = -ENOBUFS;
2751                 goto errout;
2752         }
2753
2754         /* Reserve room for dummy headers, this skb can pass
2755            through good chunk of routing engine.
2756          */
2757         skb_reset_mac_header(skb);
2758         skb_reset_network_header(skb);
2759
2760         src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2761         dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2762         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2763         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2764         if (tb[RTA_UID])
2765                 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2766         else
2767                 uid = (iif ? INVALID_UID : current_uid());
2768
2769         /* Bugfix: need to give ip_route_input enough of an IP header to
2770          * not gag.
2771          */
2772         ip_hdr(skb)->protocol = IPPROTO_UDP;
2773         ip_hdr(skb)->saddr = src;
2774         ip_hdr(skb)->daddr = dst;
2775
2776         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2777
2778         memset(&fl4, 0, sizeof(fl4));
2779         fl4.daddr = dst;
2780         fl4.saddr = src;
2781         fl4.flowi4_tos = rtm->rtm_tos;
2782         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2783         fl4.flowi4_mark = mark;
2784         fl4.flowi4_uid = uid;
2785
2786         rcu_read_lock();
2787
2788         if (iif) {
2789                 struct net_device *dev;
2790
2791                 dev = dev_get_by_index_rcu(net, iif);
2792                 if (!dev) {
2793                         err = -ENODEV;
2794                         goto errout_free;
2795                 }
2796
2797                 skb->protocol   = htons(ETH_P_IP);
2798                 skb->dev        = dev;
2799                 skb->mark       = mark;
2800                 err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
2801                                          dev, &res);
2802
2803                 rt = skb_rtable(skb);
2804                 if (err == 0 && rt->dst.error)
2805                         err = -rt->dst.error;
2806         } else {
2807                 fl4.flowi4_iif = LOOPBACK_IFINDEX;
2808                 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
2809                 err = 0;
2810                 if (IS_ERR(rt))
2811                         err = PTR_ERR(rt);
2812                 else
2813                         skb_dst_set(skb, &rt->dst);
2814         }
2815
2816         if (err)
2817                 goto errout_free;
2818
2819         if (rtm->rtm_flags & RTM_F_NOTIFY)
2820                 rt->rt_flags |= RTCF_NOTIFY;
2821
2822         if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2823                 table_id = rt->rt_table_id;
2824
2825         if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
2826                 if (!res.fi) {
2827                         err = fib_props[res.type].error;
2828                         if (!err)
2829                                 err = -EHOSTUNREACH;
2830                         goto errout_free;
2831                 }
2832                 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
2833                                     nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
2834                                     rt->rt_type, res.prefix, res.prefixlen,
2835                                     fl4.flowi4_tos, res.fi, 0);
2836         } else {
2837                 err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2838                                    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
2839         }
2840         if (err < 0)
2841                 goto errout_free;
2842
2843         rcu_read_unlock();
2844
2845         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2846 errout:
2847         return err;
2848
2849 errout_free:
2850         rcu_read_unlock();
2851         kfree_skb(skb);
2852         goto errout;
2853 }
2854
2855 void ip_rt_multicast_event(struct in_device *in_dev)
2856 {
2857         rt_cache_flush(dev_net(in_dev->dev));
2858 }
2859
2860 #ifdef CONFIG_SYSCTL
2861 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2862 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2863 static int ip_rt_gc_elasticity __read_mostly    = 8;
2864
2865 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2866                                         void __user *buffer,
2867                                         size_t *lenp, loff_t *ppos)
2868 {
2869         struct net *net = (struct net *)__ctl->extra1;
2870
2871         if (write) {
2872                 rt_cache_flush(net);
2873                 fnhe_genid_bump(net);
2874                 return 0;
2875         }
2876
2877         return -EINVAL;
2878 }
2879
2880 static struct ctl_table ipv4_route_table[] = {
2881         {
2882                 .procname       = "gc_thresh",
2883                 .data           = &ipv4_dst_ops.gc_thresh,
2884                 .maxlen         = sizeof(int),
2885                 .mode           = 0644,
2886                 .proc_handler   = proc_dointvec,
2887         },
2888         {
2889                 .procname       = "max_size",
2890                 .data           = &ip_rt_max_size,
2891                 .maxlen         = sizeof(int),
2892                 .mode           = 0644,
2893                 .proc_handler   = proc_dointvec,
2894         },
2895         {
2896                 /*  Deprecated. Use gc_min_interval_ms */
2897
2898                 .procname       = "gc_min_interval",
2899                 .data           = &ip_rt_gc_min_interval,
2900                 .maxlen         = sizeof(int),
2901                 .mode           = 0644,
2902                 .proc_handler   = proc_dointvec_jiffies,
2903         },
2904         {
2905                 .procname       = "gc_min_interval_ms",
2906                 .data           = &ip_rt_gc_min_interval,
2907                 .maxlen         = sizeof(int),
2908                 .mode           = 0644,
2909                 .proc_handler   = proc_dointvec_ms_jiffies,
2910         },
2911         {
2912                 .procname       = "gc_timeout",
2913                 .data           = &ip_rt_gc_timeout,
2914                 .maxlen         = sizeof(int),
2915                 .mode           = 0644,
2916                 .proc_handler   = proc_dointvec_jiffies,
2917         },
2918         {
2919                 .procname       = "gc_interval",
2920                 .data           = &ip_rt_gc_interval,
2921                 .maxlen         = sizeof(int),
2922                 .mode           = 0644,
2923                 .proc_handler   = proc_dointvec_jiffies,
2924         },
2925         {
2926                 .procname       = "redirect_load",
2927                 .data           = &ip_rt_redirect_load,
2928                 .maxlen         = sizeof(int),
2929                 .mode           = 0644,
2930                 .proc_handler   = proc_dointvec,
2931         },
2932         {
2933                 .procname       = "redirect_number",
2934                 .data           = &ip_rt_redirect_number,
2935                 .maxlen         = sizeof(int),
2936                 .mode           = 0644,
2937                 .proc_handler   = proc_dointvec,
2938         },
2939         {
2940                 .procname       = "redirect_silence",
2941                 .data           = &ip_rt_redirect_silence,
2942                 .maxlen         = sizeof(int),
2943                 .mode           = 0644,
2944                 .proc_handler   = proc_dointvec,
2945         },
2946         {
2947                 .procname       = "error_cost",
2948                 .data           = &ip_rt_error_cost,
2949                 .maxlen         = sizeof(int),
2950                 .mode           = 0644,
2951                 .proc_handler   = proc_dointvec,
2952         },
2953         {
2954                 .procname       = "error_burst",
2955                 .data           = &ip_rt_error_burst,
2956                 .maxlen         = sizeof(int),
2957                 .mode           = 0644,
2958                 .proc_handler   = proc_dointvec,
2959         },
2960         {
2961                 .procname       = "gc_elasticity",
2962                 .data           = &ip_rt_gc_elasticity,
2963                 .maxlen         = sizeof(int),
2964                 .mode           = 0644,
2965                 .proc_handler   = proc_dointvec,
2966         },
2967         {
2968                 .procname       = "mtu_expires",
2969                 .data           = &ip_rt_mtu_expires,
2970                 .maxlen         = sizeof(int),
2971                 .mode           = 0644,
2972                 .proc_handler   = proc_dointvec_jiffies,
2973         },
2974         {
2975                 .procname       = "min_pmtu",
2976                 .data           = &ip_rt_min_pmtu,
2977                 .maxlen         = sizeof(int),
2978                 .mode           = 0644,
2979                 .proc_handler   = proc_dointvec_minmax,
2980                 .extra1         = &ip_min_valid_pmtu,
2981         },
2982         {
2983                 .procname       = "min_adv_mss",
2984                 .data           = &ip_rt_min_advmss,
2985                 .maxlen         = sizeof(int),
2986                 .mode           = 0644,
2987                 .proc_handler   = proc_dointvec,
2988         },
2989         { }
2990 };
2991
2992 static struct ctl_table ipv4_route_flush_table[] = {
2993         {
2994                 .procname       = "flush",
2995                 .maxlen         = sizeof(int),
2996                 .mode           = 0200,
2997                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2998         },
2999         { },
3000 };
3001
3002 static __net_init int sysctl_route_net_init(struct net *net)
3003 {
3004         struct ctl_table *tbl;
3005
3006         tbl = ipv4_route_flush_table;
3007         if (!net_eq(net, &init_net)) {
3008                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3009                 if (!tbl)
3010                         goto err_dup;
3011
3012                 /* Don't export sysctls to unprivileged users */
3013                 if (net->user_ns != &init_user_ns)
3014                         tbl[0].procname = NULL;
3015         }
3016         tbl[0].extra1 = net;
3017
3018         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3019         if (!net->ipv4.route_hdr)
3020                 goto err_reg;
3021         return 0;
3022
3023 err_reg:
3024         if (tbl != ipv4_route_flush_table)
3025                 kfree(tbl);
3026 err_dup:
3027         return -ENOMEM;
3028 }
3029
3030 static __net_exit void sysctl_route_net_exit(struct net *net)
3031 {
3032         struct ctl_table *tbl;
3033
3034         tbl = net->ipv4.route_hdr->ctl_table_arg;
3035         unregister_net_sysctl_table(net->ipv4.route_hdr);
3036         BUG_ON(tbl == ipv4_route_flush_table);
3037         kfree(tbl);
3038 }
3039
3040 static __net_initdata struct pernet_operations sysctl_route_ops = {
3041         .init = sysctl_route_net_init,
3042         .exit = sysctl_route_net_exit,
3043 };
3044 #endif
3045
3046 static __net_init int rt_genid_init(struct net *net)
3047 {
3048         atomic_set(&net->ipv4.rt_genid, 0);
3049         atomic_set(&net->fnhe_genid, 0);
3050         atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3051         return 0;
3052 }
3053
3054 static __net_initdata struct pernet_operations rt_genid_ops = {
3055         .init = rt_genid_init,
3056 };
3057
3058 static int __net_init ipv4_inetpeer_init(struct net *net)
3059 {
3060         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3061
3062         if (!bp)
3063                 return -ENOMEM;
3064         inet_peer_base_init(bp);
3065         net->ipv4.peers = bp;
3066         return 0;
3067 }
3068
3069 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3070 {
3071         struct inet_peer_base *bp = net->ipv4.peers;
3072
3073         net->ipv4.peers = NULL;
3074         inetpeer_invalidate_tree(bp);
3075         kfree(bp);
3076 }
3077
3078 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3079         .init   =       ipv4_inetpeer_init,
3080         .exit   =       ipv4_inetpeer_exit,
3081 };
3082
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-cpu accounting area, allocated (256 entries) in ip_rt_init(). */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
3086
3087 int __init ip_rt_init(void)
3088 {
3089         int rc = 0;
3090         int cpu;
3091
3092         ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
3093         if (!ip_idents)
3094                 panic("IP: failed to allocate ip_idents\n");
3095
3096         prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3097
3098         ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3099         if (!ip_tstamps)
3100                 panic("IP: failed to allocate ip_tstamps\n");
3101
3102         for_each_possible_cpu(cpu) {
3103                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3104
3105                 INIT_LIST_HEAD(&ul->head);
3106                 spin_lock_init(&ul->lock);
3107         }
3108 #ifdef CONFIG_IP_ROUTE_CLASSID
3109         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3110         if (!ip_rt_acct)
3111                 panic("IP: failed to allocate ip_rt_acct\n");
3112 #endif
3113
3114         ipv4_dst_ops.kmem_cachep =
3115                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3116                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3117
3118         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3119
3120         if (dst_entries_init(&ipv4_dst_ops) < 0)
3121                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3122
3123         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3124                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3125
3126         ipv4_dst_ops.gc_thresh = ~0;
3127         ip_rt_max_size = INT_MAX;
3128
3129         devinet_init();
3130         ip_fib_init();
3131
3132         if (ip_rt_proc_init())
3133                 pr_err("Unable to create route proc files\n");
3134 #ifdef CONFIG_XFRM
3135         xfrm_init();
3136         xfrm4_init();
3137 #endif
3138         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3139                       RTNL_FLAG_DOIT_UNLOCKED);
3140
3141 #ifdef CONFIG_SYSCTL
3142         register_pernet_subsys(&sysctl_route_ops);
3143 #endif
3144         register_pernet_subsys(&rt_genid_ops);
3145         register_pernet_subsys(&ipv4_inetpeer_ops);
3146         return rc;
3147 }
3148
#ifdef CONFIG_SYSCTL
/*
 * Register the static ipv4_route_table tunables under "net/ipv4/route"
 * for init_net.
 *
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif