net/ipv4/route.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   4  *              operating system.  INET is implemented using the  BSD Socket
   5  *              interface as the means of communication with the user level.
   6  *
   7  *              ROUTE - implementation of the IP router.
   8  *
   9  * Authors:     Ross Biro
  10  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  11  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  12  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  13  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  14  *
  15  * Fixes:
  16  *              Alan Cox        :       Verify area fixes.
  17  *              Alan Cox        :       cli() protects routing changes
  18  *              Rui Oliveira    :       ICMP routing table updates
  19  *              (rco@di.uminho.pt)      Routing table insertion and update
  20  *              Linus Torvalds  :       Rewrote bits to be sensible
  21  *              Alan Cox        :       Added BSD route gw semantics
  22  *              Alan Cox        :       Super /proc >4K
  23  *              Alan Cox        :       MTU in route table
  24  *              Alan Cox        :       MSS actually. Also added the window
  25  *                                      clamper.
  26  *              Sam Lantinga    :       Fixed route matching in rt_del()
  27  *              Alan Cox        :       Routing cache support.
  28  *              Alan Cox        :       Removed compatibility cruft.
  29  *              Alan Cox        :       RTF_REJECT support.
  30  *              Alan Cox        :       TCP irtt support.
  31  *              Jonathan Naylor :       Added Metric support.
  32  *      Miquel van Smoorenburg  :       BSD API fixes.
  33  *      Miquel van Smoorenburg  :       Metrics.
  34  *              Alan Cox        :       Use __u32 properly
  35  *              Alan Cox        :       Aligned routing errors more closely with BSD
  36  *                                      our system is still very different.
  37  *              Alan Cox        :       Faster /proc handling
  38  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  39  *                                      routing caches and better behaviour.
  40  *
  41  *              Olaf Erb        :       irtt wasn't being copied right.
  42  *              Bjorn Ekwall    :       Kerneld route support.
  43  *              Alan Cox        :       Multicast fixed (I hope)
  44  *              Pavel Krauz     :       Limited broadcast fixed
  45  *              Mike McLagan    :       Routing by source
  46  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  47  *                                      route.c and rewritten from scratch.
  48  *              Andi Kleen      :       Load-limit warning messages.
  49  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  50  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  51  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  52  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  53  *              Marc Boucher    :       routing by fwmark
  54  *      Robert Olsson           :       Added rt_cache statistics
  55  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  56  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  57  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  58  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  59  */
  60
  61 #define pr_fmt(fmt) "IPv4: " fmt
  62
  63 #include <linux/module.h>
  64 #include <linux/uaccess.h>
  65 #include <linux/bitops.h>
  66 #include <linux/types.h>
  67 #include <linux/kernel.h>
  68 #include <linux/mm.h>
  69 #include <linux/string.h>
  70 #include <linux/socket.h>
  71 #include <linux/sockios.h>
  72 #include <linux/errno.h>
  73 #include <linux/in.h>
  74 #include <linux/inet.h>
  75 #include <linux/netdevice.h>
  76 #include <linux/proc_fs.h>
  77 #include <linux/init.h>
  78 #include <linux/skbuff.h>
  79 #include <linux/inetdevice.h>
  80 #include <linux/igmp.h>
  81 #include <linux/pkt_sched.h>
  82 #include <linux/mroute.h>
  83 #include <linux/netfilter_ipv4.h>
  84 #include <linux/random.h>
  85 #include <linux/rcupdate.h>
  86 #include <linux/times.h>
  87 #include <linux/slab.h>
  88 #include <linux/jhash.h>
  89 #include <net/dst.h>
  90 #include <net/dst_metadata.h>
  91 #include <net/net_namespace.h>
  92 #include <net/protocol.h>
  93 #include <net/ip.h>
  94 #include <net/route.h>
  95 #include <net/inetpeer.h>
  96 #include <net/sock.h>
  97 #include <net/ip_fib.h>
  98 #include <net/arp.h>
  99 #include <net/tcp.h>
 100 #include <net/icmp.h>
 101 #include <net/xfrm.h>
 102 #include <net/lwtunnel.h>
 103 #include <net/netevent.h>
 104 #include <net/rtnetlink.h>
 105 #ifdef CONFIG_SYSCTL
 106 #include <linux/sysctl.h>
 107 #endif
 108 #include <net/secure_seq.h>
 109 #include <net/ip_tunnels.h>
 110 #include <net/l3mdev.h>
 111
 112 #include "fib_lookup.h"
 113
 114 #define RT_FL_TOS(oldflp4) \
 115         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 116
 117 #define RT_GC_TIMEOUT (300*HZ)
 118
 119 static int ip_rt_max_size;
 120 static int ip_rt_redirect_number __read_mostly  = 9;
 121 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 122 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 123 static int ip_rt_error_cost __read_mostly       = HZ;
 124 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 125 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 126 static u32 ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 127 static int ip_rt_min_advmss __read_mostly       = 256;
 128
 129 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 130
 131 /*
 132  *      Interface to generic destination cache.
 133  */
 134
 135 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 136 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
 137 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
 138 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 139 static void              ipv4_link_failure(struct sk_buff *skb);
 140 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 141                                            struct sk_buff *skb, u32 mtu);
 142 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 143                                         struct sk_buff *skb);
 144 static void             ipv4_dst_destroy(struct dst_entry *dst);
 145
 146 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 147 {
 148         WARN_ON(1);
 149         return NULL;
 150 }
 151
 152 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 153                                            struct sk_buff *skb,
 154                                            const void *daddr);
 155 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
 156
 157 static struct dst_ops ipv4_dst_ops = {
 158         .family =               AF_INET,
 159         .check =                ipv4_dst_check,
 160         .default_advmss =       ipv4_default_advmss,
 161         .mtu =                  ipv4_mtu,
 162         .cow_metrics =          ipv4_cow_metrics,
 163         .destroy =              ipv4_dst_destroy,
 164         .negative_advice =      ipv4_negative_advice,
 165         .link_failure =         ipv4_link_failure,
 166         .update_pmtu =          ip_rt_update_pmtu,
 167         .redirect =             ip_do_redirect,
 168         .local_out =            __ip_local_out,
 169         .neigh_lookup =         ipv4_neigh_lookup,
 170         .confirm_neigh =        ipv4_confirm_neigh,
 171 };
 172
 173 #define ECN_OR_COST(class)      TC_PRIO_##class
 174
 175 const __u8 ip_tos2prio[16] = {
 176         TC_PRIO_BESTEFFORT,
 177         ECN_OR_COST(BESTEFFORT),
 178         TC_PRIO_BESTEFFORT,
 179         ECN_OR_COST(BESTEFFORT),
 180         TC_PRIO_BULK,
 181         ECN_OR_COST(BULK),
 182         TC_PRIO_BULK,
 183         ECN_OR_COST(BULK),
 184         TC_PRIO_INTERACTIVE,
 185         ECN_OR_COST(INTERACTIVE),
 186         TC_PRIO_INTERACTIVE,
 187         ECN_OR_COST(INTERACTIVE),
 188         TC_PRIO_INTERACTIVE_BULK,
 189         ECN_OR_COST(INTERACTIVE_BULK),
 190         TC_PRIO_INTERACTIVE_BULK,
 191         ECN_OR_COST(INTERACTIVE_BULK)
 192 };
 193 EXPORT_SYMBOL(ip_tos2prio);
 194
 195 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 196 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
 197
 198 #ifdef CONFIG_PROC_FS
 199 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 200 {
 201         if (*pos)
 202                 return NULL;
 203         return SEQ_START_TOKEN;
 204 }
 205
 206 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 207 {
 208         ++*pos;
 209         return NULL;
 210 }
 211
 212 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 213 {
 214 }
 215
 216 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 217 {
 218         if (v == SEQ_START_TOKEN)
 219                 seq_printf(seq, "%-127s\n",
 220                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 221                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 222                            "HHUptod\tSpecDst");
 223         return 0;
 224 }
 225
 226 static const struct seq_operations rt_cache_seq_ops = {
 227         .start  = rt_cache_seq_start,
 228         .next   = rt_cache_seq_next,
 229         .stop   = rt_cache_seq_stop,
 230         .show   = rt_cache_seq_show,
 231 };
 232
 233 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 234 {
 235         return seq_open(file, &rt_cache_seq_ops);
 236 }
 237
 238 static const struct file_operations rt_cache_seq_fops = {
 239         .open    = rt_cache_seq_open,
 240         .read    = seq_read,
 241         .llseek  = seq_lseek,
 242         .release = seq_release,
 243 };
 244
 245
 246 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 247 {
 248         int cpu;
 249
 250         if (*pos == 0)
 251                 return SEQ_START_TOKEN;
 252
 253         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 254                 if (!cpu_possible(cpu))
 255                         continue;
 256                 *pos = cpu+1;
 257                 return &per_cpu(rt_cache_stat, cpu);
 258         }
 259         return NULL;
 260 }
 261
 262 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 263 {
 264         int cpu;
 265
 266         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 267                 if (!cpu_possible(cpu))
 268                         continue;
 269                 *pos = cpu+1;
 270                 return &per_cpu(rt_cache_stat, cpu);
 271         }
 272         return NULL;
 273
 274 }
 275
 276 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 277 {
 278
 279 }
 280
 281 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 282 {
 283         struct rt_cache_stat *st = v;
 284
 285         if (v == SEQ_START_TOKEN) {
 286                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 287                 return 0;
 288         }
 289
 290         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 291                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 292                    dst_entries_get_slow(&ipv4_dst_ops),
 293                    0, /* st->in_hit */
 294                    st->in_slow_tot,
 295                    st->in_slow_mc,
 296                    st->in_no_route,
 297                    st->in_brd,
 298                    st->in_martian_dst,
 299                    st->in_martian_src,
 300
 301                    0, /* st->out_hit */
 302                    st->out_slow_tot,
 303                    st->out_slow_mc,
 304
 305                    0, /* st->gc_total */
 306                    0, /* st->gc_ignored */
 307                    0, /* st->gc_goal_miss */
 308                    0, /* st->gc_dst_overflow */
 309                    0, /* st->in_hlist_search */
 310                    0  /* st->out_hlist_search */
 311                 );
 312         return 0;
 313 }
 314
 315 static const struct seq_operations rt_cpu_seq_ops = {
 316         .start  = rt_cpu_seq_start,
 317         .next   = rt_cpu_seq_next,
 318         .stop   = rt_cpu_seq_stop,
 319         .show   = rt_cpu_seq_show,
 320 };
 321
 322
 323 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 324 {
 325         return seq_open(file, &rt_cpu_seq_ops);
 326 }
 327
 328 static const struct file_operations rt_cpu_seq_fops = {
 329         .open    = rt_cpu_seq_open,
 330         .read    = seq_read,
 331         .llseek  = seq_lseek,
 332         .release = seq_release,
 333 };
 334
 335 #ifdef CONFIG_IP_ROUTE_CLASSID
 336 static int rt_acct_proc_show(struct seq_file *m, void *v)
 337 {
 338         struct ip_rt_acct *dst, *src;
 339         unsigned int i, j;
 340
 341         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 342         if (!dst)
 343                 return -ENOMEM;
 344
 345         for_each_possible_cpu(i) {
 346                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 347                 for (j = 0; j < 256; j++) {
 348                         dst[j].o_bytes   += src[j].o_bytes;
 349                         dst[j].o_packets += src[j].o_packets;
 350                         dst[j].i_bytes   += src[j].i_bytes;
 351                         dst[j].i_packets += src[j].i_packets;
 352                 }
 353         }
 354
 355         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 356         kfree(dst);
 357         return 0;
 358 }
 359 #endif
 360
 361 static int __net_init ip_rt_do_proc_init(struct net *net)
 362 {
 363         struct proc_dir_entry *pde;
 364
 365         pde = proc_create("rt_cache", 0444, net->proc_net,
 366                           &rt_cache_seq_fops);
 367         if (!pde)
 368                 goto err1;
 369
 370         pde = proc_create("rt_cache", 0444,
 371                           net->proc_net_stat, &rt_cpu_seq_fops);
 372         if (!pde)
 373                 goto err2;
 374
 375 #ifdef CONFIG_IP_ROUTE_CLASSID
 376         pde = proc_create_single("rt_acct", 0, net->proc_net,
 377                         rt_acct_proc_show);
 378         if (!pde)
 379                 goto err3;
 380 #endif
 381         return 0;
 382
 383 #ifdef CONFIG_IP_ROUTE_CLASSID
 384 err3:
 385         remove_proc_entry("rt_cache", net->proc_net_stat);
 386 #endif
 387 err2:
 388         remove_proc_entry("rt_cache", net->proc_net);
 389 err1:
 390         return -ENOMEM;
 391 }
 392
 393 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 394 {
 395         remove_proc_entry("rt_cache", net->proc_net_stat);
 396         remove_proc_entry("rt_cache", net->proc_net);
 397 #ifdef CONFIG_IP_ROUTE_CLASSID
 398         remove_proc_entry("rt_acct", net->proc_net);
 399 #endif
 400 }
 401
 402 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 403         .init = ip_rt_do_proc_init,
 404         .exit = ip_rt_do_proc_exit,
 405 };
 406
 407 static int __init ip_rt_proc_init(void)
 408 {
 409         return register_pernet_subsys(&ip_rt_proc_ops);
 410 }
 411
 412 #else
 413 static inline int ip_rt_proc_init(void)
 414 {
 415         return 0;
 416 }
 417 #endif /* CONFIG_PROC_FS */
 418
 419 static inline bool rt_is_expired(const struct rtable *rth)
 420 {
 421         return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
 422 }
 423
 424 void rt_cache_flush(struct net *net)
 425 {
 426         rt_genid_bump_ipv4(net);
 427 }
 428
 429 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 430                                            struct sk_buff *skb,
 431                                            const void *daddr)
 432 {
 433         const struct rtable *rt = container_of(dst, struct rtable, dst);
 434         struct net_device *dev = dst->dev;
 435         struct neighbour *n;
 436
 437         rcu_read_lock_bh();
 438
 439         if (likely(rt->rt_gw_family == AF_INET)) {
 440                 n = ip_neigh_gw4(dev, rt->rt_gw4);
 441         } else if (rt->rt_gw_family == AF_INET6) {
 442                 n = ip_neigh_gw6(dev, &rt->rt_gw6);
 443         } else {
 444                 __be32 pkey;
 445
 446                 pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
 447                 n = ip_neigh_gw4(dev, pkey);
 448         }
 449
 450         if (n && !refcount_inc_not_zero(&n->refcnt))
 451                 n = NULL;
 452
 453         rcu_read_unlock_bh();
 454
 455         return n;
 456 }
 457
 458 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
 459 {
 460         const struct rtable *rt = container_of(dst, struct rtable, dst);
 461         struct net_device *dev = dst->dev;
 462         const __be32 *pkey = daddr;
 463
 464         if (rt->rt_gw_family == AF_INET) {
 465                 pkey = (const __be32 *)&rt->rt_gw4;
 466         } else if (rt->rt_gw_family == AF_INET6) {
 467                 return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
 468         } else if (!daddr ||
 469                  (rt->rt_flags &
 470                   (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
 471                 return;
 472         }
 473         __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
 474 }
 475
 476 #define IP_IDENTS_SZ 2048u
 477
 478 static atomic_t *ip_idents __read_mostly;
 479 static u32 *ip_tstamps __read_mostly;
 480
 481 /* In order to protect privacy, we add a perturbation to identifiers
 482  * if one generator is seldom used. This makes hard for an attacker
 483  * to infer how many packets were sent between two points in time.
 484  */
 485 u32 ip_idents_reserve(u32 hash, int segs)
 486 {
 487         u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
 488         atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
 489         u32 old = READ_ONCE(*p_tstamp);
 490         u32 now = (u32)jiffies;
 491         u32 new, delta = 0;
 492
 493         if (old != now && cmpxchg(p_tstamp, old, now) == old)
 494                 delta = prandom_u32_max(now - old);
 495
 496         /* Do not use atomic_add_return() as it makes UBSAN unhappy */
 497         do {
 498                 old = (u32)atomic_read(p_id);
 499                 new = old + delta + segs;
 500         } while (atomic_cmpxchg(p_id, old, new) != old);
 501
 502         return new - segs;
 503 }
 504 EXPORT_SYMBOL(ip_idents_reserve);
 505
 506 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
 507 {
 508         u32 hash, id;
 509
 510         /* Note the following code is not safe, but this is okay. */
 511         if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
 512                 get_random_bytes(&net->ipv4.ip_id_key,
 513                                  sizeof(net->ipv4.ip_id_key));
 514
 515         hash = siphash_3u32((__force u32)iph->daddr,
 516                             (__force u32)iph->saddr,
 517                             iph->protocol,
 518                             &net->ipv4.ip_id_key);
 519         id = ip_idents_reserve(hash, segs);
 520         iph->id = htons(id);
 521 }
 522 EXPORT_SYMBOL(__ip_select_ident);
 523
 524 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
 525                              const struct sock *sk,
 526                              const struct iphdr *iph,
 527                              int oif, u8 tos,
 528                              u8 prot, u32 mark, int flow_flags)
 529 {
 530         if (sk) {
 531                 const struct inet_sock *inet = inet_sk(sk);
 532
 533                 oif = sk->sk_bound_dev_if;
 534                 mark = sk->sk_mark;
 535                 tos = RT_CONN_FLAGS(sk);
 536                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
 537         }
 538         flowi4_init_output(fl4, oif, mark, tos,
 539                            RT_SCOPE_UNIVERSE, prot,
 540                            flow_flags,
 541                            iph->daddr, iph->saddr, 0, 0,
 542                            sock_net_uid(net, sk));
 543 }
 544
 545 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
 546                                const struct sock *sk)
 547 {
 548         const struct net *net = dev_net(skb->dev);
 549         const struct iphdr *iph = ip_hdr(skb);
 550         int oif = skb->dev->ifindex;
 551         u8 tos = RT_TOS(iph->tos);
 552         u8 prot = iph->protocol;
 553         u32 mark = skb->mark;
 554
 555         __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
 556 }
 557
 558 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
 559 {
 560         const struct inet_sock *inet = inet_sk(sk);
 561         const struct ip_options_rcu *inet_opt;
 562         __be32 daddr = inet->inet_daddr;
 563
 564         rcu_read_lock();
 565         inet_opt = rcu_dereference(inet->inet_opt);
 566         if (inet_opt && inet_opt->opt.srr)
 567                 daddr = inet_opt->opt.faddr;
 568         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 569                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 570                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
 571                            inet_sk_flowi_flags(sk),
 572                            daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
 573         rcu_read_unlock();
 574 }
 575
 576 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 577                                  const struct sk_buff *skb)
 578 {
 579         if (skb)
 580                 build_skb_flow_key(fl4, skb, sk);
 581         else
 582                 build_sk_flow_key(fl4, sk);
 583 }
 584
 585 static DEFINE_SPINLOCK(fnhe_lock);
 586
 587 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
 588 {
 589         struct rtable *rt;
 590
 591         rt = rcu_dereference(fnhe->fnhe_rth_input);
 592         if (rt) {
 593                 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
 594                 dst_dev_put(&rt->dst);
 595                 dst_release(&rt->dst);
 596         }
 597         rt = rcu_dereference(fnhe->fnhe_rth_output);
 598         if (rt) {
 599                 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
 600                 dst_dev_put(&rt->dst);
 601                 dst_release(&rt->dst);
 602         }
 603 }
 604
 605 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
 606 {
 607         struct fib_nh_exception *fnhe, *oldest;
 608
 609         oldest = rcu_dereference(hash->chain);
 610         for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
 611              fnhe = rcu_dereference(fnhe->fnhe_next)) {
 612                 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
 613                         oldest = fnhe;
 614         }
 615         fnhe_flush_routes(oldest);
 616         return oldest;
 617 }
 618
 619 static inline u32 fnhe_hashfun(__be32 daddr)
 620 {
 621         static u32 fnhe_hashrnd __read_mostly;
 622         u32 hval;
 623
 624         net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
 625         hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
 626         return hash_32(hval, FNHE_HASH_SHIFT);
 627 }
 628
 629 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
 630 {
 631         rt->rt_pmtu = fnhe->fnhe_pmtu;
 632         rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
 633         rt->dst.expires = fnhe->fnhe_expires;
 634
 635         if (fnhe->fnhe_gw) {
 636                 rt->rt_flags |= RTCF_REDIRECTED;
 637                 rt->rt_gw_family = AF_INET;
 638                 rt->rt_gw4 = fnhe->fnhe_gw;
 639         }
 640 }
 641
 642 static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
 643                                   __be32 gw, u32 pmtu, bool lock,
 644                                   unsigned long expires)
 645 {
 646         struct fnhe_hash_bucket *hash;
 647         struct fib_nh_exception *fnhe;
 648         struct rtable *rt;
 649         u32 genid, hval;
 650         unsigned int i;
 651         int depth;
 652
 653         genid = fnhe_genid(dev_net(nhc->nhc_dev));
 654         hval = fnhe_hashfun(daddr);
 655
 656         spin_lock_bh(&fnhe_lock);
 657
 658         hash = rcu_dereference(nhc->nhc_exceptions);
 659         if (!hash) {
 660                 hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
 661                 if (!hash)
 662                         goto out_unlock;
 663                 rcu_assign_pointer(nhc->nhc_exceptions, hash);
 664         }
 665
 666         hash += hval;
 667
 668         depth = 0;
 669         for (fnhe = rcu_dereference(hash->chain); fnhe;
 670              fnhe = rcu_dereference(fnhe->fnhe_next)) {
 671                 if (fnhe->fnhe_daddr == daddr)
 672                         break;
 673                 depth++;
 674         }
 675
 676         if (fnhe) {
 677                 if (fnhe->fnhe_genid != genid)
 678                         fnhe->fnhe_genid = genid;
 679                 if (gw)
 680                         fnhe->fnhe_gw = gw;
 681                 if (pmtu) {
 682                         fnhe->fnhe_pmtu = pmtu;
 683                         fnhe->fnhe_mtu_locked = lock;
 684                 }
 685                 fnhe->fnhe_expires = max(1UL, expires);
 686                 /* Update all cached dsts too */
 687                 rt = rcu_dereference(fnhe->fnhe_rth_input);
 688                 if (rt)
 689                         fill_route_from_fnhe(rt, fnhe);
 690                 rt = rcu_dereference(fnhe->fnhe_rth_output);
 691                 if (rt)
 692                         fill_route_from_fnhe(rt, fnhe);
 693         } else {
 694                 if (depth > FNHE_RECLAIM_DEPTH)
 695                         fnhe = fnhe_oldest(hash);
 696                 else {
 697                         fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
 698                         if (!fnhe)
 699                                 goto out_unlock;
 700
 701                         fnhe->fnhe_next = hash->chain;
 702                         rcu_assign_pointer(hash->chain, fnhe);
 703                 }
 704                 fnhe->fnhe_genid = genid;
 705                 fnhe->fnhe_daddr = daddr;
 706                 fnhe->fnhe_gw = gw;
 707                 fnhe->fnhe_pmtu = pmtu;
 708                 fnhe->fnhe_mtu_locked = lock;
 709                 fnhe->fnhe_expires = max(1UL, expires);
 710
 711                 /* Exception created; mark the cached routes for the nexthop
 712                  * stale, so anyone caching it rechecks if this exception
 713                  * applies to them.
 714                  */
 715                 rt = rcu_dereference(nhc->nhc_rth_input);
 716                 if (rt)
 717                         rt->dst.obsolete = DST_OBSOLETE_KILL;
 718
 719                 for_each_possible_cpu(i) {
 720                         struct rtable __rcu **prt;
 721                         prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
 722                         rt = rcu_dereference(*prt);
 723                         if (rt)
 724                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
 725                 }
 726         }
 727
 728         fnhe->fnhe_stamp = jiffies;
 729
 730 out_unlock:
 731         spin_unlock_bh(&fnhe_lock);
 732 }
 733
 734 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
 735                              bool kill_route)
 736 {
 737         __be32 new_gw = icmp_hdr(skb)->un.gateway;
 738         __be32 old_gw = ip_hdr(skb)->saddr;
 739         struct net_device *dev = skb->dev;
 740         struct in_device *in_dev;
 741         struct fib_result res;
 742         struct neighbour *n;
 743         struct net *net;
 744
 745         switch (icmp_hdr(skb)->code & 7) {
 746         case ICMP_REDIR_NET:
 747         case ICMP_REDIR_NETTOS:
 748         case ICMP_REDIR_HOST:
 749         case ICMP_REDIR_HOSTTOS:
 750                 break;
 751
 752         default:
 753                 return;
 754         }
 755
 756         if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
 757                 return;
 758
 759         in_dev = __in_dev_get_rcu(dev);
 760         if (!in_dev)
 761                 return;
 762
 763         net = dev_net(dev);
 764         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
 765             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
 766             ipv4_is_zeronet(new_gw))
 767                 goto reject_redirect;
 768
 769         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 770                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 771                         goto reject_redirect;
 772                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
 773                         goto reject_redirect;
 774         } else {
 775                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
 776                         goto reject_redirect;
 777         }
 778
 779         n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
 780         if (!n)
 781                 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
 782         if (!IS_ERR(n)) {
 783                 if (!(n->nud_state & NUD_VALID)) {
 784                         neigh_event_send(n, NULL);
 785                 } else {
 786                         if (fib_lookup(net, fl4, &res, 0) == 0) {
 787                                 struct fib_nh_common *nhc = FIB_RES_NHC(res);
 788
 789                                 update_or_create_fnhe(nhc, fl4->daddr, new_gw,
 790                                                 0, false,
 791                                                 jiffies + ip_rt_gc_timeout);
 792                         }
 793                         if (kill_route)
 794                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
 795                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 796                 }
 797                 neigh_release(n);
 798         }
 799         return;
 800
 801 reject_redirect:
 802 #ifdef CONFIG_IP_ROUTE_VERBOSE
 803         if (IN_DEV_LOG_MARTIANS(in_dev)) {
 804                 const struct iphdr *iph = (const struct iphdr *) skb->data;
 805                 __be32 daddr = iph->daddr;
 806                 __be32 saddr = iph->saddr;
 807
 808                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
 809                                      "  Advised path = %pI4 -> %pI4\n",
 810                                      &old_gw, dev->name, &new_gw,
 811                                      &saddr, &daddr);
 812         }
 813 #endif
 814         ;
 815 }
 816
 817 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 818 {
 819         struct rtable *rt;
 820         struct flowi4 fl4;
 821         const struct iphdr *iph = (const struct iphdr *) skb->data;
 822         struct net *net = dev_net(skb->dev);
 823         int oif = skb->dev->ifindex;
 824         u8 tos = RT_TOS(iph->tos);
 825         u8 prot = iph->protocol;
 826         u32 mark = skb->mark;
 827
 828         rt = (struct rtable *) dst;
 829
 830         __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
 831         __ip_do_redirect(rt, skb, &fl4, true);
 832 }
 833
 834 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 835 {
 836         struct rtable *rt = (struct rtable *)dst;
 837         struct dst_entry *ret = dst;
 838
 839         if (rt) {
 840                 if (dst->obsolete > 0) {
 841                         ip_rt_put(rt);
 842                         ret = NULL;
 843                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 844                            rt->dst.expires) {
 845                         ip_rt_put(rt);
 846                         ret = NULL;
 847                 }
 848         }
 849         return ret;
 850 }
 851
 852 /*
 853  * Algorithm:
 854  *      1. The first ip_rt_redirect_number redirects are sent
 855  *         with exponential backoff, then we stop sending them at all,
 856  *         assuming that the host ignores our redirects.
 857  *      2. If we did not see packets requiring redirects
 858  *         during ip_rt_redirect_silence, we assume that the host
 859  *         forgot redirected route and start to send redirects again.
 860  *
 861  * This algorithm is much cheaper and more intelligent than dumb load limiting
 862  * in icmp.c.
 863  *
 864  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 865  * and "frag. need" (breaks PMTU discovery) in icmp.c.
 866  */
 867
 868 void ip_rt_send_redirect(struct sk_buff *skb)
 869 {
 870         struct rtable *rt = skb_rtable(skb);
 871         struct in_device *in_dev;
 872         struct inet_peer *peer;
 873         struct net *net;
 874         int log_martians;
 875         int vif;
 876
 877         rcu_read_lock();
 878         in_dev = __in_dev_get_rcu(rt->dst.dev);
 879         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 880                 rcu_read_unlock();
 881                 return;
 882         }
 883         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 884         vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
 885         rcu_read_unlock();
 886
 887         net = dev_net(rt->dst.dev);
 888         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
 889         if (!peer) {
 890                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
 891                           rt_nexthop(rt, ip_hdr(skb)->daddr));
 892                 return;
 893         }
 894
 895         /* No redirected packets during ip_rt_redirect_silence;
 896          * reset the algorithm.
 897          */
 898         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
 899                 peer->rate_tokens = 0;
 900                 peer->n_redirects = 0;
 901         }
 902
 903         /* Too many ignored redirects; do not send anything
 904          * set dst.rate_last to the last seen redirected packet.
 905          */
 906         if (peer->n_redirects >= ip_rt_redirect_number) {
 907                 peer->rate_last = jiffies;
 908                 goto out_put_peer;
 909         }
 910
 911         /* Check for load limit; set rate_last to the latest sent
 912          * redirect.
 913          */
 914         if (peer->rate_tokens == 0 ||
 915             time_after(jiffies,
 916                        (peer->rate_last +
 917                         (ip_rt_redirect_load << peer->rate_tokens)))) {
 918                 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
 919
 920                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
 921                 peer->rate_last = jiffies;
 922                 ++peer->rate_tokens;
 923                 ++peer->n_redirects;
 924 #ifdef CONFIG_IP_ROUTE_VERBOSE
 925                 if (log_martians &&
 926                     peer->rate_tokens == ip_rt_redirect_number)
 927                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
 928                                              &ip_hdr(skb)->saddr, inet_iif(skb),
 929                                              &ip_hdr(skb)->daddr, &gw);
 930 #endif
 931         }
 932 out_put_peer:
 933         inet_putpeer(peer);
 934 }
 935
 936 static int ip_error(struct sk_buff *skb)
 937 {
 938         struct rtable *rt = skb_rtable(skb);
 939         struct net_device *dev = skb->dev;
 940         struct in_device *in_dev;
 941         struct inet_peer *peer;
 942         unsigned long now;
 943         struct net *net;
 944         bool send;
 945         int code;
 946
 947         if (netif_is_l3_master(skb->dev)) {
 948                 dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
 949                 if (!dev)
 950                         goto out;
 951         }
 952
 953         in_dev = __in_dev_get_rcu(dev);
 954
 955         /* IP on this device is disabled. */
 956         if (!in_dev)
 957                 goto out;
 958
 959         net = dev_net(rt->dst.dev);
 960         if (!IN_DEV_FORWARD(in_dev)) {
 961                 switch (rt->dst.error) {
 962                 case EHOSTUNREACH:
 963                         __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
 964                         break;
 965
 966                 case ENETUNREACH:
 967                         __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 968                         break;
 969                 }
 970                 goto out;
 971         }
 972
 973         switch (rt->dst.error) {
 974         case EINVAL:
 975         default:
 976                 goto out;
 977         case EHOSTUNREACH:
 978                 code = ICMP_HOST_UNREACH;
 979                 break;
 980         case ENETUNREACH:
 981                 code = ICMP_NET_UNREACH;
 982                 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 983                 break;
 984         case EACCES:
 985                 code = ICMP_PKT_FILTERED;
 986                 break;
 987         }
 988
 989         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
 990                                l3mdev_master_ifindex(skb->dev), 1);
 991
 992         send = true;
 993         if (peer) {
 994                 now = jiffies;
 995                 peer->rate_tokens += now - peer->rate_last;
 996                 if (peer->rate_tokens > ip_rt_error_burst)
 997                         peer->rate_tokens = ip_rt_error_burst;
 998                 peer->rate_last = now;
 999                 if (peer->rate_tokens >= ip_rt_error_cost)
1000                         peer->rate_tokens -= ip_rt_error_cost;
1001                 else
1002                         send = false;
1003                 inet_putpeer(peer);
1004         }
1005         if (send)
1006                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1007
1008 out:    kfree_skb(skb);
1009         return 0;
1010 }
1011
1012 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1013 {
1014         struct dst_entry *dst = &rt->dst;
1015         u32 old_mtu = ipv4_mtu(dst);
1016         struct fib_result res;
1017         bool lock = false;
1018
1019         if (ip_mtu_locked(dst))
1020                 return;
1021
1022         if (old_mtu < mtu)
1023                 return;
1024
1025         if (mtu < ip_rt_min_pmtu) {
1026                 lock = true;
1027                 mtu = min(old_mtu, ip_rt_min_pmtu);
1028         }
1029
1030         if (rt->rt_pmtu == mtu && !lock &&
1031             time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1032                 return;
1033
1034         rcu_read_lock();
1035         if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
1036                 struct fib_nh_common *nhc = FIB_RES_NHC(res);
1037
1038                 update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
1039                                       jiffies + ip_rt_mtu_expires);
1040         }
1041         rcu_read_unlock();
1042 }
1043
1044 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1045                               struct sk_buff *skb, u32 mtu)
1046 {
1047         struct rtable *rt = (struct rtable *) dst;
1048         struct flowi4 fl4;
1049
1050         ip_rt_build_flow_key(&fl4, sk, skb);
1051         __ip_rt_update_pmtu(rt, &fl4, mtu);
1052 }
1053
1054 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1055                       int oif, u8 protocol)
1056 {
1057         const struct iphdr *iph = (const struct iphdr *) skb->data;
1058         struct flowi4 fl4;
1059         struct rtable *rt;
1060         u32 mark = IP4_REPLY_MARK(net, skb->mark);
1061
1062         __build_flow_key(net, &fl4, NULL, iph, oif,
1063                          RT_TOS(iph->tos), protocol, mark, 0);
1064         rt = __ip_route_output_key(net, &fl4);
1065         if (!IS_ERR(rt)) {
1066                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1067                 ip_rt_put(rt);
1068         }
1069 }
1070 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1071
1072 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1073 {
1074         const struct iphdr *iph = (const struct iphdr *) skb->data;
1075         struct flowi4 fl4;
1076         struct rtable *rt;
1077
1078         __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1079
1080         if (!fl4.flowi4_mark)
1081                 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1082
1083         rt = __ip_route_output_key(sock_net(sk), &fl4);
1084         if (!IS_ERR(rt)) {
1085                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1086                 ip_rt_put(rt);
1087         }
1088 }
1089
1090 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1091 {
1092         const struct iphdr *iph = (const struct iphdr *) skb->data;
1093         struct flowi4 fl4;
1094         struct rtable *rt;
1095         struct dst_entry *odst = NULL;
1096         bool new = false;
1097         struct net *net = sock_net(sk);
1098
1099         bh_lock_sock(sk);
1100
1101         if (!ip_sk_accept_pmtu(sk))
1102                 goto out;
1103
1104         odst = sk_dst_get(sk);
1105
1106         if (sock_owned_by_user(sk) || !odst) {
1107                 __ipv4_sk_update_pmtu(skb, sk, mtu);
1108                 goto out;
1109         }
1110
1111         __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1112
1113         rt = (struct rtable *)odst;
1114         if (odst->obsolete && !odst->ops->check(odst, 0)) {
1115                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1116                 if (IS_ERR(rt))
1117                         goto out;
1118
1119                 new = true;
1120         }
1121
1122         __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);
1123
1124         if (!dst_check(&rt->dst, 0)) {
1125                 if (new)
1126                         dst_release(&rt->dst);
1127
1128                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1129                 if (IS_ERR(rt))
1130                         goto out;
1131
1132                 new = true;
1133         }
1134
1135         if (new)
1136                 sk_dst_set(sk, &rt->dst);
1137
1138 out:
1139         bh_unlock_sock(sk);
1140         dst_release(odst);
1141 }
1142 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1143
1144 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1145                    int oif, u8 protocol)
1146 {
1147         const struct iphdr *iph = (const struct iphdr *) skb->data;
1148         struct flowi4 fl4;
1149         struct rtable *rt;
1150
1151         __build_flow_key(net, &fl4, NULL, iph, oif,
1152                          RT_TOS(iph->tos), protocol, 0, 0);
1153         rt = __ip_route_output_key(net, &fl4);
1154         if (!IS_ERR(rt)) {
1155                 __ip_do_redirect(rt, skb, &fl4, false);
1156                 ip_rt_put(rt);
1157         }
1158 }
1159 EXPORT_SYMBOL_GPL(ipv4_redirect);
1160
1161 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1162 {
1163         const struct iphdr *iph = (const struct iphdr *) skb->data;
1164         struct flowi4 fl4;
1165         struct rtable *rt;
1166         struct net *net = sock_net(sk);
1167
1168         __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1169         rt = __ip_route_output_key(net, &fl4);
1170         if (!IS_ERR(rt)) {
1171                 __ip_do_redirect(rt, skb, &fl4, false);
1172                 ip_rt_put(rt);
1173         }
1174 }
1175 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1176
1177 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1178 {
1179         struct rtable *rt = (struct rtable *) dst;
1180
1181         /* All IPV4 dsts are created with ->obsolete set to the value
1182          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1183          * into this function always.
1184          *
1185          * When a PMTU/redirect information update invalidates a route,
1186          * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1187          * DST_OBSOLETE_DEAD.
1188          */
1189         if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1190                 return NULL;
1191         return dst;
1192 }
1193
1194 static void ipv4_send_dest_unreach(struct sk_buff *skb)
1195 {
1196         struct ip_options opt;
1197         int res;
1198
1199         /* Recompile ip options since IPCB may not be valid anymore.
1200          * Also check we have a reasonable ipv4 header.
1201          */
1202         if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1203             ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1204                 return;
1205
1206         memset(&opt, 0, sizeof(opt));
1207         if (ip_hdr(skb)->ihl > 5) {
1208                 if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1209                         return;
1210                 opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1211
1212                 rcu_read_lock();
1213                 res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
1214                 rcu_read_unlock();
1215
1216                 if (res)
1217                         return;
1218         }
1219         __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1220 }
1221
1222 static void ipv4_link_failure(struct sk_buff *skb)
1223 {
1224         struct rtable *rt;
1225
1226         ipv4_send_dest_unreach(skb);
1227
1228         rt = skb_rtable(skb);
1229         if (rt)
1230                 dst_set_expires(&rt->dst, 0);
1231 }
1232
1233 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1234 {
1235         pr_debug("%s: %pI4 -> %pI4, %s\n",
1236                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1237                  skb->dev ? skb->dev->name : "?");
1238         kfree_skb(skb);
1239         WARN_ON(1);
1240         return 0;
1241 }
1242
1243 /*
1244    We do not cache source address of outgoing interface,
1245    because it is used only by IP RR, TS and SRR options,
1246    so that it out of fast path.
1247
1248    BTW remember: "addr" is allowed to be not aligned
1249    in IP options!
1250  */
1251
1252 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1253 {
1254         __be32 src;
1255
1256         if (rt_is_output_route(rt))
1257                 src = ip_hdr(skb)->saddr;
1258         else {
1259                 struct fib_result res;
1260                 struct iphdr *iph = ip_hdr(skb);
1261                 struct flowi4 fl4 = {
1262                         .daddr = iph->daddr,
1263                         .saddr = iph->saddr,
1264                         .flowi4_tos = RT_TOS(iph->tos),
1265                         .flowi4_oif = rt->dst.dev->ifindex,
1266                         .flowi4_iif = skb->dev->ifindex,
1267                         .flowi4_mark = skb->mark,
1268                 };
1269
1270                 rcu_read_lock();
1271                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1272                         src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
1273                 else
1274                         src = inet_select_addr(rt->dst.dev,
1275                                                rt_nexthop(rt, iph->daddr),
1276                                                RT_SCOPE_UNIVERSE);
1277                 rcu_read_unlock();
1278         }
1279         memcpy(addr, &src, 4);
1280 }
1281
1282 #ifdef CONFIG_IP_ROUTE_CLASSID
1283 static void set_class_tag(struct rtable *rt, u32 tag)
1284 {
1285         if (!(rt->dst.tclassid & 0xFFFF))
1286                 rt->dst.tclassid |= tag & 0xFFFF;
1287         if (!(rt->dst.tclassid & 0xFFFF0000))
1288                 rt->dst.tclassid |= tag & 0xFFFF0000;
1289 }
1290 #endif
1291
1292 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1293 {
1294         unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1295         unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1296                                     ip_rt_min_advmss);
1297
1298         return min(advmss, IPV4_MAX_PMTU - header_size);
1299 }
1300
1301 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1302 {
1303         const struct rtable *rt = (const struct rtable *) dst;
1304         unsigned int mtu = rt->rt_pmtu;
1305
1306         if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1307                 mtu = dst_metric_raw(dst, RTAX_MTU);
1308
1309         if (mtu)
1310                 return mtu;
1311
1312         mtu = READ_ONCE(dst->dev->mtu);
1313
1314         if (unlikely(ip_mtu_locked(dst))) {
1315                 if (rt->rt_gw_family && mtu > 576)
1316                         mtu = 576;
1317         }
1318
1319         mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1320
1321         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1322 }
1323
1324 static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
1325 {
1326         struct fnhe_hash_bucket *hash;
1327         struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1328         u32 hval = fnhe_hashfun(daddr);
1329
1330         spin_lock_bh(&fnhe_lock);
1331
1332         hash = rcu_dereference_protected(nhc->nhc_exceptions,
1333                                          lockdep_is_held(&fnhe_lock));
1334         hash += hval;
1335
1336         fnhe_p = &hash->chain;
1337         fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1338         while (fnhe) {
1339                 if (fnhe->fnhe_daddr == daddr) {
1340                         rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1341                                 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1342                         /* set fnhe_daddr to 0 to ensure it won't bind with
1343                          * new dsts in rt_bind_exception().
1344                          */
1345                         fnhe->fnhe_daddr = 0;
1346                         fnhe_flush_routes(fnhe);
1347                         kfree_rcu(fnhe, rcu);
1348                         break;
1349                 }
1350                 fnhe_p = &fnhe->fnhe_next;
1351                 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1352                                                  lockdep_is_held(&fnhe_lock));
1353         }
1354
1355         spin_unlock_bh(&fnhe_lock);
1356 }
1357
1358 static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
1359                                                __be32 daddr)
1360 {
1361         struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
1362         struct fib_nh_exception *fnhe;
1363         u32 hval;
1364
1365         if (!hash)
1366                 return NULL;
1367
1368         hval = fnhe_hashfun(daddr);
1369
1370         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1371              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1372                 if (fnhe->fnhe_daddr == daddr) {
1373                         if (fnhe->fnhe_expires &&
1374                             time_after(jiffies, fnhe->fnhe_expires)) {
1375                                 ip_del_fnhe(nhc, daddr);
1376                                 break;
1377                         }
1378                         return fnhe;
1379                 }
1380         }
1381         return NULL;
1382 }
1383
1384 /* MTU selection:
1385  * 1. mtu on route is locked - use it
1386  * 2. mtu from nexthop exception
1387  * 3. mtu from egress device
1388  */
1389
1390 u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1391 {
1392         struct fib_nh_common *nhc = res->nhc;
1393         struct net_device *dev = nhc->nhc_dev;
1394         struct fib_info *fi = res->fi;
1395         u32 mtu = 0;
1396
1397         if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
1398             fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1399                 mtu = fi->fib_mtu;
1400
1401         if (likely(!mtu)) {
1402                 struct fib_nh_exception *fnhe;
1403
1404                 fnhe = find_exception(nhc, daddr);
1405                 if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1406                         mtu = fnhe->fnhe_pmtu;
1407         }
1408
1409         if (likely(!mtu))
1410                 mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1411
1412         return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
1413 }
1414
1415 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1416                               __be32 daddr, const bool do_cache)
1417 {
1418         bool ret = false;
1419
1420         spin_lock_bh(&fnhe_lock);
1421
1422         if (daddr == fnhe->fnhe_daddr) {
1423                 struct rtable __rcu **porig;
1424                 struct rtable *orig;
1425                 int genid = fnhe_genid(dev_net(rt->dst.dev));
1426
1427                 if (rt_is_input_route(rt))
1428                         porig = &fnhe->fnhe_rth_input;
1429                 else
1430                         porig = &fnhe->fnhe_rth_output;
1431                 orig = rcu_dereference(*porig);
1432
1433                 if (fnhe->fnhe_genid != genid) {
1434                         fnhe->fnhe_genid = genid;
1435                         fnhe->fnhe_gw = 0;
1436                         fnhe->fnhe_pmtu = 0;
1437                         fnhe->fnhe_expires = 0;
1438                         fnhe->fnhe_mtu_locked = false;
1439                         fnhe_flush_routes(fnhe);
1440                         orig = NULL;
1441                 }
1442                 fill_route_from_fnhe(rt, fnhe);
1443                 if (!rt->rt_gw4) {
1444                         rt->rt_gw4 = daddr;
1445                         rt->rt_gw_family = AF_INET;
1446                 }
1447
1448                 if (do_cache) {
1449                         dst_hold(&rt->dst);
1450                         rcu_assign_pointer(*porig, rt);
1451                         if (orig) {
1452                                 dst_dev_put(&orig->dst);
1453                                 dst_release(&orig->dst);
1454                         }
1455                         ret = true;
1456                 }
1457
1458                 fnhe->fnhe_stamp = jiffies;
1459         }
1460         spin_unlock_bh(&fnhe_lock);
1461
1462         return ret;
1463 }
1464
1465 static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
1466 {
1467         struct rtable *orig, *prev, **p;
1468         bool ret = true;
1469
1470         if (rt_is_input_route(rt)) {
1471                 p = (struct rtable **)&nhc->nhc_rth_input;
1472         } else {
1473                 p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
1474         }
1475         orig = *p;
1476
1477         /* hold dst before doing cmpxchg() to avoid race condition
1478          * on this dst
1479          */
1480         dst_hold(&rt->dst);
1481         prev = cmpxchg(p, orig, rt);
1482         if (prev == orig) {
1483                 if (orig) {
1484                         dst_dev_put(&orig->dst);
1485                         dst_release(&orig->dst);
1486                 }
1487         } else {
1488                 dst_release(&rt->dst);
1489                 ret = false;
1490         }
1491
1492         return ret;
1493 }
1494
1495 struct uncached_list {
1496         spinlock_t              lock;
1497         struct list_head        head;
1498 };
1499
1500 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1501
1502 void rt_add_uncached_list(struct rtable *rt)
1503 {
1504         struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1505
1506         rt->rt_uncached_list = ul;
1507
1508         spin_lock_bh(&ul->lock);
1509         list_add_tail(&rt->rt_uncached, &ul->head);
1510         spin_unlock_bh(&ul->lock);
1511 }
1512
1513 void rt_del_uncached_list(struct rtable *rt)
1514 {
1515         if (!list_empty(&rt->rt_uncached)) {
1516                 struct uncached_list *ul = rt->rt_uncached_list;
1517
1518                 spin_lock_bh(&ul->lock);
1519                 list_del(&rt->rt_uncached);
1520                 spin_unlock_bh(&ul->lock);
1521         }
1522 }
1523
1524 static void ipv4_dst_destroy(struct dst_entry *dst)
1525 {
1526         struct rtable *rt = (struct rtable *)dst;
1527
1528         ip_dst_metrics_put(dst);
1529         rt_del_uncached_list(rt);
1530 }
1531
1532 void rt_flush_dev(struct net_device *dev)
1533 {
1534         struct net *net = dev_net(dev);
1535         struct rtable *rt;
1536         int cpu;
1537
1538         for_each_possible_cpu(cpu) {
1539                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1540
1541                 spin_lock_bh(&ul->lock);
1542                 list_for_each_entry(rt, &ul->head, rt_uncached) {
1543                         if (rt->dst.dev != dev)
1544                                 continue;
1545                         rt->dst.dev = net->loopback_dev;
1546                         dev_hold(rt->dst.dev);
1547                         dev_put(dev);
1548                 }
1549                 spin_unlock_bh(&ul->lock);
1550         }
1551 }
1552
1553 static bool rt_cache_valid(const struct rtable *rt)
1554 {
1555         return  rt &&
1556                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1557                 !rt_is_expired(rt);
1558 }
1559
1560 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1561                            const struct fib_result *res,
1562                            struct fib_nh_exception *fnhe,
1563                            struct fib_info *fi, u16 type, u32 itag,
1564                            const bool do_cache)
1565 {
1566         bool cached = false;
1567
1568         if (fi) {
1569                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1570
1571                 if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1572                         rt->rt_gw_family = nhc->nhc_gw_family;
1573                         /* only INET and INET6 are supported */
1574                         if (likely(nhc->nhc_gw_family == AF_INET))
1575                                 rt->rt_gw4 = nhc->nhc_gw.ipv4;
1576                         else
1577                                 rt->rt_gw6 = nhc->nhc_gw.ipv6;
1578                 }
1579
1580                 ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1581
1582 #ifdef CONFIG_IP_ROUTE_CLASSID
1583                 {
1584                         struct fib_nh *nh;
1585
1586                         nh = container_of(nhc, struct fib_nh, nh_common);
1587                         rt->dst.tclassid = nh->nh_tclassid;
1588                 }
1589 #endif
1590                 rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1591                 if (unlikely(fnhe))
1592                         cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1593                 else if (do_cache)
1594                         cached = rt_cache_route(nhc, rt);
1595                 if (unlikely(!cached)) {
1596                         /* Routes we intend to cache in nexthop exception or
1597                          * FIB nexthop have the DST_NOCACHE bit clear.
1598                          * However, if we are unsuccessful at storing this
1599                          * route into the cache we really need to set it.
1600                          */
1601                         if (!rt->rt_gw4) {
1602                                 rt->rt_gw_family = AF_INET;
1603                                 rt->rt_gw4 = daddr;
1604                         }
1605                         rt_add_uncached_list(rt);
1606                 }
1607         } else
1608                 rt_add_uncached_list(rt);
1609
1610 #ifdef CONFIG_IP_ROUTE_CLASSID
1611 #ifdef CONFIG_IP_MULTIPLE_TABLES
1612         set_class_tag(rt, res->tclassid);
1613 #endif
1614         set_class_tag(rt, itag);
1615 #endif
1616 }
1617
1618 struct rtable *rt_dst_alloc(struct net_device *dev,
1619                             unsigned int flags, u16 type,
1620                             bool nopolicy, bool noxfrm, bool will_cache)
1621 {
1622         struct rtable *rt;
1623
1624         rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1625                        (will_cache ? 0 : DST_HOST) |
1626                        (nopolicy ? DST_NOPOLICY : 0) |
1627                        (noxfrm ? DST_NOXFRM : 0));
1628
1629         if (rt) {
1630                 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1631                 rt->rt_flags = flags;
1632                 rt->rt_type = type;
1633                 rt->rt_is_input = 0;
1634                 rt->rt_iif = 0;
1635                 rt->rt_pmtu = 0;
1636                 rt->rt_mtu_locked = 0;
1637                 rt->rt_gw_family = 0;
1638                 rt->rt_gw4 = 0;
1639                 INIT_LIST_HEAD(&rt->rt_uncached);
1640
1641                 rt->dst.output = ip_output;
1642                 if (flags & RTCF_LOCAL)
1643                         rt->dst.input = ip_local_deliver;
1644         }
1645
1646         return rt;
1647 }
1648 EXPORT_SYMBOL(rt_dst_alloc);
1649
1650 struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1651 {
1652         struct rtable *new_rt;
1653
1654         new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1655                            rt->dst.flags);
1656
1657         if (new_rt) {
1658                 new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1659                 new_rt->rt_flags = rt->rt_flags;
1660                 new_rt->rt_type = rt->rt_type;
1661                 new_rt->rt_is_input = rt->rt_is_input;
1662                 new_rt->rt_iif = rt->rt_iif;
1663                 new_rt->rt_pmtu = rt->rt_pmtu;
1664                 new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1665                 new_rt->rt_gw_family = rt->rt_gw_family;
1666                 if (rt->rt_gw_family == AF_INET)
1667                         new_rt->rt_gw4 = rt->rt_gw4;
1668                 else if (rt->rt_gw_family == AF_INET6)
1669                         new_rt->rt_gw6 = rt->rt_gw6;
1670                 INIT_LIST_HEAD(&new_rt->rt_uncached);
1671
1672                 new_rt->dst.flags |= DST_HOST;
1673                 new_rt->dst.input = rt->dst.input;
1674                 new_rt->dst.output = rt->dst.output;
1675                 new_rt->dst.error = rt->dst.error;
1676                 new_rt->dst.lastuse = jiffies;
1677                 new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1678         }
1679         return new_rt;
1680 }
1681 EXPORT_SYMBOL(rt_dst_clone);
1682
1683 /* called in rcu_read_lock() section */
1684 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1685                           u8 tos, struct net_device *dev,
1686                           struct in_device *in_dev, u32 *itag)
1687 {
1688         int err;
1689
1690         /* Primary sanity checks. */
1691         if (!in_dev)
1692                 return -EINVAL;
1693
1694         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1695             skb->protocol != htons(ETH_P_IP))
1696                 return -EINVAL;
1697
1698         if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1699                 return -EINVAL;
1700
1701         if (ipv4_is_zeronet(saddr)) {
1702                 if (!ipv4_is_local_multicast(daddr) &&
1703                     ip_hdr(skb)->protocol != IPPROTO_IGMP)
1704                         return -EINVAL;
1705         } else {
1706                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1707                                           in_dev, itag);
1708                 if (err < 0)
1709                         return err;
1710         }
1711         return 0;
1712 }
1713
1714 /* called in rcu_read_lock() section */
1715 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1716                              u8 tos, struct net_device *dev, int our)
1717 {
1718         struct in_device *in_dev = __in_dev_get_rcu(dev);
1719         unsigned int flags = RTCF_MULTICAST;
1720         struct rtable *rth;
1721         u32 itag = 0;
1722         int err;
1723
1724         err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1725         if (err)
1726                 return err;
1727
1728         if (our)
1729                 flags |= RTCF_LOCAL;
1730
1731         rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1732                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1733         if (!rth)
1734                 return -ENOBUFS;
1735
1736 #ifdef CONFIG_IP_ROUTE_CLASSID
1737         rth->dst.tclassid = itag;
1738 #endif
1739         rth->dst.output = ip_rt_bug;
1740         rth->rt_is_input= 1;
1741
1742 #ifdef CONFIG_IP_MROUTE
1743         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1744                 rth->dst.input = ip_mr_input;
1745 #endif
1746         RT_CACHE_STAT_INC(in_slow_mc);
1747
1748         skb_dst_set(skb, &rth->dst);
1749         return 0;
1750 }
1751
1752
1753 static void ip_handle_martian_source(struct net_device *dev,
1754                                      struct in_device *in_dev,
1755                                      struct sk_buff *skb,
1756                                      __be32 daddr,
1757                                      __be32 saddr)
1758 {
1759         RT_CACHE_STAT_INC(in_martian_src);
1760 #ifdef CONFIG_IP_ROUTE_VERBOSE
1761         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1762                 /*
1763                  *      RFC1812 recommendation, if source is martian,
1764                  *      the only hint is MAC header.
1765                  */
1766                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1767                         &daddr, &saddr, dev->name);
1768                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1769                         print_hex_dump(KERN_WARNING, "ll header: ",
1770                                        DUMP_PREFIX_OFFSET, 16, 1,
1771                                        skb_mac_header(skb),
1772                                        dev->hard_header_len, false);
1773                 }
1774         }
1775 #endif
1776 }
1777
1778 /* called in rcu_read_lock() section */
1779 static int __mkroute_input(struct sk_buff *skb,
1780                            const struct fib_result *res,
1781                            struct in_device *in_dev,
1782                            __be32 daddr, __be32 saddr, u32 tos)
1783 {
1784         struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1785         struct net_device *dev = nhc->nhc_dev;
1786         struct fib_nh_exception *fnhe;
1787         struct rtable *rth;
1788         int err;
1789         struct in_device *out_dev;
1790         bool do_cache;
1791         u32 itag = 0;
1792
1793         /* get a working reference to the output device */
1794         out_dev = __in_dev_get_rcu(dev);
1795         if (!out_dev) {
1796                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1797                 return -EINVAL;
1798         }
1799
1800         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1801                                   in_dev->dev, in_dev, &itag);
1802         if (err < 0) {
1803                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1804                                          saddr);
1805
1806                 goto cleanup;
1807         }
1808
1809         do_cache = res->fi && !itag;
1810         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1811             skb->protocol == htons(ETH_P_IP)) {
1812                 __be32 gw;
1813
1814                 gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1815                 if (IN_DEV_SHARED_MEDIA(out_dev) ||
1816                     inet_addr_onlink(out_dev, saddr, gw))
1817                         IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1818         }
1819
1820         if (skb->protocol != htons(ETH_P_IP)) {
1821                 /* Not IP (i.e. ARP). Do not create route, if it is
1822                  * invalid for proxy arp. DNAT routes are always valid.
1823                  *
1824                  * Proxy arp feature have been extended to allow, ARP
1825                  * replies back to the same interface, to support
1826                  * Private VLAN switch technologies. See arp.c.
1827                  */
1828                 if (out_dev == in_dev &&
1829                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1830                         err = -EINVAL;
1831                         goto cleanup;
1832                 }
1833         }
1834
1835         fnhe = find_exception(nhc, daddr);
1836         if (do_cache) {
1837                 if (fnhe)
1838                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1839                 else
1840                         rth = rcu_dereference(nhc->nhc_rth_input);
1841                 if (rt_cache_valid(rth)) {
1842                         skb_dst_set_noref(skb, &rth->dst);
1843                         goto out;
1844                 }
1845         }
1846
1847         rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1848                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1849                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1850         if (!rth) {
1851                 err = -ENOBUFS;
1852                 goto cleanup;
1853         }
1854
1855         rth->rt_is_input = 1;
1856         RT_CACHE_STAT_INC(in_slow_tot);
1857
1858         rth->dst.input = ip_forward;
1859
1860         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1861                        do_cache);
1862         lwtunnel_set_redirect(&rth->dst);
1863         skb_dst_set(skb, &rth->dst);
1864 out:
1865         err = 0;
1866  cleanup:
1867         return err;
1868 }
1869
1870 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1871 /* To make ICMP packets follow the right flow, the multipath hash is
1872  * calculated from the inner IP addresses.
1873  */
1874 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1875                                  struct flow_keys *hash_keys)
1876 {
1877         const struct iphdr *outer_iph = ip_hdr(skb);
1878         const struct iphdr *key_iph = outer_iph;
1879         const struct iphdr *inner_iph;
1880         const struct icmphdr *icmph;
1881         struct iphdr _inner_iph;
1882         struct icmphdr _icmph;
1883
1884         if (likely(outer_iph->protocol != IPPROTO_ICMP))
1885                 goto out;
1886
1887         if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1888                 goto out;
1889
1890         icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1891                                    &_icmph);
1892         if (!icmph)
1893                 goto out;
1894
1895         if (icmph->type != ICMP_DEST_UNREACH &&
1896             icmph->type != ICMP_REDIRECT &&
1897             icmph->type != ICMP_TIME_EXCEEDED &&
1898             icmph->type != ICMP_PARAMETERPROB)
1899                 goto out;
1900
1901         inner_iph = skb_header_pointer(skb,
1902                                        outer_iph->ihl * 4 + sizeof(_icmph),
1903                                        sizeof(_inner_iph), &_inner_iph);
1904         if (!inner_iph)
1905                 goto out;
1906
1907         key_iph = inner_iph;
1908 out:
1909         hash_keys->addrs.v4addrs.src = key_iph->saddr;
1910         hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1911 }
1912
1913 /* if skb is set it will be used and fl4 can be NULL */
1914 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1915                        const struct sk_buff *skb, struct flow_keys *flkeys)
1916 {
1917         u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1918         struct flow_keys hash_keys;
1919         u32 mhash;
1920
1921         switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1922         case 0:
1923                 memset(&hash_keys, 0, sizeof(hash_keys));
1924                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1925                 if (skb) {
1926                         ip_multipath_l3_keys(skb, &hash_keys);
1927                 } else {
1928                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1929                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1930                 }
1931                 break;
1932         case 1:
1933                 /* skb is currently provided only when forwarding */
1934                 if (skb) {
1935                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1936                         struct flow_keys keys;
1937
1938                         /* short-circuit if we already have L4 hash present */
1939                         if (skb->l4_hash)
1940                                 return skb_get_hash_raw(skb) >> 1;
1941
1942                         memset(&hash_keys, 0, sizeof(hash_keys));
1943
1944                         if (!flkeys) {
1945                                 skb_flow_dissect_flow_keys(skb, &keys, flag);
1946                                 flkeys = &keys;
1947                         }
1948
1949                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1950                         hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1951                         hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1952                         hash_keys.ports.src = flkeys->ports.src;
1953                         hash_keys.ports.dst = flkeys->ports.dst;
1954                         hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1955                 } else {
1956                         memset(&hash_keys, 0, sizeof(hash_keys));
1957                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1958                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1959                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1960                         hash_keys.ports.src = fl4->fl4_sport;
1961                         hash_keys.ports.dst = fl4->fl4_dport;
1962                         hash_keys.basic.ip_proto = fl4->flowi4_proto;
1963                 }
1964                 break;
1965         }
1966         mhash = flow_hash_from_keys(&hash_keys);
1967
1968         if (multipath_hash)
1969                 mhash = jhash_2words(mhash, multipath_hash, 0);
1970
1971         return mhash >> 1;
1972 }
1973 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
1974
1975 static int ip_mkroute_input(struct sk_buff *skb,
1976                             struct fib_result *res,
1977                             struct in_device *in_dev,
1978                             __be32 daddr, __be32 saddr, u32 tos,
1979                             struct flow_keys *hkeys)
1980 {
1981 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1982         if (res->fi && res->fi->fib_nhs > 1) {
1983                 int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
1984
1985                 fib_select_multipath(res, h);
1986         }
1987 #endif
1988
1989         /* create a routing cache entry */
1990         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1991 }
1992
1993 /*
1994  *      NOTE. We drop all the packets that has local source
1995  *      addresses, because every properly looped back packet
1996  *      must have correct destination already attached by output routine.
1997  *
1998  *      Such approach solves two big problems:
1999  *      1. Not simplex devices are handled properly.
2000  *      2. IP spoofing attempts are filtered with 100% of guarantee.
2001  *      called with rcu_read_lock()
2002  */
2003
2004 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2005                                u8 tos, struct net_device *dev,
2006                                struct fib_result *res)
2007 {
2008         struct in_device *in_dev = __in_dev_get_rcu(dev);
2009         struct flow_keys *flkeys = NULL, _flkeys;
2010         struct net    *net = dev_net(dev);
2011         struct ip_tunnel_info *tun_info;
2012         int             err = -EINVAL;
2013         unsigned int    flags = 0;
2014         u32             itag = 0;
2015         struct rtable   *rth;
2016         struct flowi4   fl4;
2017         bool do_cache = true;
2018
2019         /* IP on this device is disabled. */
2020
2021         if (!in_dev)
2022                 goto out;
2023
2024         /* Check for the most weird martians, which can be not detected
2025            by fib_lookup.
2026          */
2027
2028         tun_info = skb_tunnel_info(skb);
2029         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2030                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2031         else
2032                 fl4.flowi4_tun_key.tun_id = 0;
2033         skb_dst_drop(skb);
2034
2035         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2036                 goto martian_source;
2037
2038         res->fi = NULL;
2039         res->table = NULL;
2040         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2041                 goto brd_input;
2042
2043         /* Accept zero addresses only to limited broadcast;
2044          * I even do not know to fix it or not. Waiting for complains :-)
2045          */
2046         if (ipv4_is_zeronet(saddr))
2047                 goto martian_source;
2048
2049         if (ipv4_is_zeronet(daddr))
2050                 goto martian_destination;
2051
2052         /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
2053          * and call it once if daddr or/and saddr are loopback addresses
2054          */
2055         if (ipv4_is_loopback(daddr)) {
2056                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2057                         goto martian_destination;
2058         } else if (ipv4_is_loopback(saddr)) {
2059                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2060                         goto martian_source;
2061         }
2062
2063         /*
2064          *      Now we are ready to route packet.
2065          */
2066         fl4.flowi4_oif = 0;
2067         fl4.flowi4_iif = dev->ifindex;
2068         fl4.flowi4_mark = skb->mark;
2069         fl4.flowi4_tos = tos;
2070         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2071         fl4.flowi4_flags = 0;
2072         fl4.daddr = daddr;
2073         fl4.saddr = saddr;
2074         fl4.flowi4_uid = sock_net_uid(net, NULL);
2075
2076         if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2077                 flkeys = &_flkeys;
2078         } else {
2079                 fl4.flowi4_proto = 0;
2080                 fl4.fl4_sport = 0;
2081                 fl4.fl4_dport = 0;
2082         }
2083
2084         err = fib_lookup(net, &fl4, res, 0);
2085         if (err != 0) {
2086                 if (!IN_DEV_FORWARD(in_dev))
2087                         err = -EHOSTUNREACH;
2088                 goto no_route;
2089         }
2090
2091         if (res->type == RTN_BROADCAST) {
2092                 if (IN_DEV_BFORWARD(in_dev))
2093                         goto make_route;
2094                 /* not do cache if bc_forwarding is enabled */
2095                 if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2096                         do_cache = false;
2097                 goto brd_input;
2098         }
2099
2100         if (res->type == RTN_LOCAL) {
2101                 err = fib_validate_source(skb, saddr, daddr, tos,
2102                                           0, dev, in_dev, &itag);
2103                 if (err < 0)
2104                         goto martian_source;
2105                 goto local_input;
2106         }
2107
2108         if (!IN_DEV_FORWARD(in_dev)) {
2109                 err = -EHOSTUNREACH;
2110                 goto no_route;
2111         }
2112         if (res->type != RTN_UNICAST)
2113                 goto martian_destination;
2114
2115 make_route:
2116         err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2117 out:    return err;
2118
2119 brd_input:
2120         if (skb->protocol != htons(ETH_P_IP))
2121                 goto e_inval;
2122
2123         if (!ipv4_is_zeronet(saddr)) {
2124                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2125                                           in_dev, &itag);
2126                 if (err < 0)
2127                         goto martian_source;
2128         }
2129         flags |= RTCF_BROADCAST;
2130         res->type = RTN_BROADCAST;
2131         RT_CACHE_STAT_INC(in_brd);
2132
2133 local_input:
2134         do_cache &= res->fi && !itag;
2135         if (do_cache) {
2136                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2137
2138                 rth = rcu_dereference(nhc->nhc_rth_input);
2139                 if (rt_cache_valid(rth)) {
2140                         skb_dst_set_noref(skb, &rth->dst);
2141                         err = 0;
2142                         goto out;
2143                 }
2144         }
2145
2146         rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2147                            flags | RTCF_LOCAL, res->type,
2148                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2149         if (!rth)
2150                 goto e_nobufs;
2151
2152         rth->dst.output= ip_rt_bug;
2153 #ifdef CONFIG_IP_ROUTE_CLASSID
2154         rth->dst.tclassid = itag;
2155 #endif
2156         rth->rt_is_input = 1;
2157
2158         RT_CACHE_STAT_INC(in_slow_tot);
2159         if (res->type == RTN_UNREACHABLE) {
2160                 rth->dst.input= ip_error;
2161                 rth->dst.error= -err;
2162                 rth->rt_flags   &= ~RTCF_LOCAL;
2163         }
2164
2165         if (do_cache) {
2166                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2167
2168                 rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2169                 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2170                         WARN_ON(rth->dst.input == lwtunnel_input);
2171                         rth->dst.lwtstate->orig_input = rth->dst.input;
2172                         rth->dst.input = lwtunnel_input;
2173                 }
2174
2175                 if (unlikely(!rt_cache_route(nhc, rth)))
2176                         rt_add_uncached_list(rth);
2177         }
2178         skb_dst_set(skb, &rth->dst);
2179         err = 0;
2180         goto out;
2181
2182 no_route:
2183         RT_CACHE_STAT_INC(in_no_route);
2184         res->type = RTN_UNREACHABLE;
2185         res->fi = NULL;
2186         res->table = NULL;
2187         goto local_input;
2188
2189         /*
2190          *      Do not cache martian addresses: they should be logged (RFC1812)
2191          */
2192 martian_destination:
2193         RT_CACHE_STAT_INC(in_martian_dst);
2194 #ifdef CONFIG_IP_ROUTE_VERBOSE
2195         if (IN_DEV_LOG_MARTIANS(in_dev))
2196                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2197                                      &daddr, &saddr, dev->name);
2198 #endif
2199
2200 e_inval:
2201         err = -EINVAL;
2202         goto out;
2203
2204 e_nobufs:
2205         err = -ENOBUFS;
2206         goto out;
2207
2208 martian_source:
2209         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2210         goto out;
2211 }
2212
2213 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2214                          u8 tos, struct net_device *dev)
2215 {
2216         struct fib_result res;
2217         int err;
2218
2219         tos &= IPTOS_RT_MASK;
2220         rcu_read_lock();
2221         err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2222         rcu_read_unlock();
2223
2224         return err;
2225 }
2226 EXPORT_SYMBOL(ip_route_input_noref);
2227
2228 /* called with rcu_read_lock held */
2229 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2230                        u8 tos, struct net_device *dev, struct fib_result *res)
2231 {
2232         /* Multicast recognition logic is moved from route cache to here.
2233            The problem was that too many Ethernet cards have broken/missing
2234            hardware multicast filters :-( As result the host on multicasting
2235            network acquires a lot of useless route cache entries, sort of
2236            SDR messages from all the world. Now we try to get rid of them.
2237            Really, provided software IP multicast filter is organized
2238            reasonably (at least, hashed), it does not result in a slowdown
2239            comparing with route cache reject entries.
2240            Note, that multicast routers are not affected, because
2241            route cache entry is created eventually.
2242          */
2243         if (ipv4_is_multicast(daddr)) {
2244                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2245                 int our = 0;
2246                 int err = -EINVAL;
2247
2248                 if (!in_dev)
2249                         return err;
2250                 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2251                                       ip_hdr(skb)->protocol);
2252
2253                 /* check l3 master if no match yet */
2254                 if (!our && netif_is_l3_slave(dev)) {
2255                         struct in_device *l3_in_dev;
2256
2257                         l3_in_dev = __in_dev_get_rcu(skb->dev);
2258                         if (l3_in_dev)
2259                                 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2260                                                       ip_hdr(skb)->protocol);
2261                 }
2262
2263                 if (our
2264 #ifdef CONFIG_IP_MROUTE
2265                         ||
2266                     (!ipv4_is_local_multicast(daddr) &&
2267                      IN_DEV_MFORWARD(in_dev))
2268 #endif
2269                    ) {
2270                         err = ip_route_input_mc(skb, daddr, saddr,
2271                                                 tos, dev, our);
2272                 }
2273                 return err;
2274         }
2275
2276         return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2277 }
2278
2279 /* called with rcu_read_lock() */
2280 static struct rtable *__mkroute_output(const struct fib_result *res,
2281                                        const struct flowi4 *fl4, int orig_oif,
2282                                        struct net_device *dev_out,
2283                                        unsigned int flags)
2284 {
2285         struct fib_info *fi = res->fi;
2286         struct fib_nh_exception *fnhe;
2287         struct in_device *in_dev;
2288         u16 type = res->type;
2289         struct rtable *rth;
2290         bool do_cache;
2291
2292         in_dev = __in_dev_get_rcu(dev_out);
2293         if (!in_dev)
2294                 return ERR_PTR(-EINVAL);
2295
2296         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2297                 if (ipv4_is_loopback(fl4->saddr) &&
2298                     !(dev_out->flags & IFF_LOOPBACK) &&
2299                     !netif_is_l3_master(dev_out))
2300                         return ERR_PTR(-EINVAL);
2301
2302         if (ipv4_is_lbcast(fl4->daddr))
2303                 type = RTN_BROADCAST;
2304         else if (ipv4_is_multicast(fl4->daddr))
2305                 type = RTN_MULTICAST;
2306         else if (ipv4_is_zeronet(fl4->daddr))
2307                 return ERR_PTR(-EINVAL);
2308
2309         if (dev_out->flags & IFF_LOOPBACK)
2310                 flags |= RTCF_LOCAL;
2311
2312         do_cache = true;
2313         if (type == RTN_BROADCAST) {
2314                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2315                 fi = NULL;
2316         } else if (type == RTN_MULTICAST) {
2317                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2318                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2319                                      fl4->flowi4_proto))
2320                         flags &= ~RTCF_LOCAL;
2321                 else
2322                         do_cache = false;
2323                 /* If multicast route do not exist use
2324                  * default one, but do not gateway in this case.
2325                  * Yes, it is hack.
2326                  */
2327                 if (fi && res->prefixlen < 4)
2328                         fi = NULL;
2329         } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2330                    (orig_oif != dev_out->ifindex)) {
2331                 /* For local routes that require a particular output interface
2332                  * we do not want to cache the result.  Caching the result
2333                  * causes incorrect behaviour when there are multiple source
2334                  * addresses on the interface, the end result being that if the
2335                  * intended recipient is waiting on that interface for the
2336                  * packet he won't receive it because it will be delivered on
2337                  * the loopback interface and the IP_PKTINFO ipi_ifindex will
2338                  * be set to the loopback interface as well.
2339                  */
2340                 do_cache = false;
2341         }
2342
2343         fnhe = NULL;
2344         do_cache &= fi != NULL;
2345         if (fi) {
2346                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2347                 struct rtable __rcu **prth;
2348
2349                 fnhe = find_exception(nhc, fl4->daddr);
2350                 if (!do_cache)
2351                         goto add;
2352                 if (fnhe) {
2353                         prth = &fnhe->fnhe_rth_output;
2354                 } else {
2355                         if (unlikely(fl4->flowi4_flags &
2356                                      FLOWI_FLAG_KNOWN_NH &&
2357                                      !(nhc->nhc_gw_family &&
2358                                        nhc->nhc_scope == RT_SCOPE_LINK))) {
2359                                 do_cache = false;
2360                                 goto add;
2361                         }
2362                         prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2363                 }
2364                 rth = rcu_dereference(*prth);
2365                 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2366                         return rth;
2367         }
2368
2369 add:
2370         rth = rt_dst_alloc(dev_out, flags, type,
2371                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2372                            IN_DEV_CONF_GET(in_dev, NOXFRM),
2373                            do_cache);
2374         if (!rth)
2375                 return ERR_PTR(-ENOBUFS);
2376
2377         rth->rt_iif = orig_oif;
2378
2379         RT_CACHE_STAT_INC(out_slow_tot);
2380
2381         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2382                 if (flags & RTCF_LOCAL &&
2383                     !(dev_out->flags & IFF_LOOPBACK)) {
2384                         rth->dst.output = ip_mc_output;
2385                         RT_CACHE_STAT_INC(out_slow_mc);
2386                 }
2387 #ifdef CONFIG_IP_MROUTE
2388                 if (type == RTN_MULTICAST) {
2389                         if (IN_DEV_MFORWARD(in_dev) &&
2390                             !ipv4_is_local_multicast(fl4->daddr)) {
2391                                 rth->dst.input = ip_mr_input;
2392                                 rth->dst.output = ip_mc_output;
2393                         }
2394                 }
2395 #endif
2396         }
2397
2398         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2399         lwtunnel_set_redirect(&rth->dst);
2400
2401         return rth;
2402 }
2403
2404 /*
2405  * Major route resolver routine.
2406  */
2407
2408 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2409                                         const struct sk_buff *skb)
2410 {
2411         __u8 tos = RT_FL_TOS(fl4);
2412         struct fib_result res = {
2413                 .type           = RTN_UNSPEC,
2414                 .fi             = NULL,
2415                 .table          = NULL,
2416                 .tclassid       = 0,
2417         };
2418         struct rtable *rth;
2419
2420         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2421         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2422         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2423                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2424
2425         rcu_read_lock();
2426         rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2427         rcu_read_unlock();
2428
2429         return rth;
2430 }
2431 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2432
2433 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2434                                             struct fib_result *res,
2435                                             const struct sk_buff *skb)
2436 {
2437         struct net_device *dev_out = NULL;
2438         int orig_oif = fl4->flowi4_oif;
2439         unsigned int flags = 0;
2440         struct rtable *rth;
2441         int err = -ENETUNREACH;
2442
2443         if (fl4->saddr) {
2444                 rth = ERR_PTR(-EINVAL);
2445                 if (ipv4_is_multicast(fl4->saddr) ||
2446                     ipv4_is_lbcast(fl4->saddr) ||
2447                     ipv4_is_zeronet(fl4->saddr))
2448                         goto out;
2449
2450                 /* I removed check for oif == dev_out->oif here.
2451                    It was wrong for two reasons:
2452                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2453                       is assigned to multiple interfaces.
2454                    2. Moreover, we are allowed to send packets with saddr
2455                       of another iface. --ANK
2456                  */
2457
2458                 if (fl4->flowi4_oif == 0 &&
2459                     (ipv4_is_multicast(fl4->daddr) ||
2460                      ipv4_is_lbcast(fl4->daddr))) {
2461                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2462                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2463                         if (!dev_out)
2464                                 goto out;
2465
2466                         /* Special hack: user can direct multicasts
2467                            and limited broadcast via necessary interface
2468                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2469                            This hack is not just for fun, it allows
2470                            vic,vat and friends to work.
2471                            They bind socket to loopback, set ttl to zero
2472                            and expect that it will work.
2473                            From the viewpoint of routing cache they are broken,
2474                            because we are not allowed to build multicast path
2475                            with loopback source addr (look, routing cache
2476                            cannot know, that ttl is zero, so that packet
2477                            will not leave this host and route is valid).
2478                            Luckily, this hack is good workaround.
2479                          */
2480
2481                         fl4->flowi4_oif = dev_out->ifindex;
2482                         goto make_route;
2483                 }
2484
2485                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2486                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2487                         if (!__ip_dev_find(net, fl4->saddr, false))
2488                                 goto out;
2489                 }
2490         }
2491
2492
2493         if (fl4->flowi4_oif) {
2494                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2495                 rth = ERR_PTR(-ENODEV);
2496                 if (!dev_out)
2497                         goto out;
2498
2499                 /* RACE: Check return value of inet_select_addr instead. */
2500                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2501                         rth = ERR_PTR(-ENETUNREACH);
2502                         goto out;
2503                 }
2504                 if (ipv4_is_local_multicast(fl4->daddr) ||
2505                     ipv4_is_lbcast(fl4->daddr) ||
2506                     fl4->flowi4_proto == IPPROTO_IGMP) {
2507                         if (!fl4->saddr)
2508                                 fl4->saddr = inet_select_addr(dev_out, 0,
2509                                                               RT_SCOPE_LINK);
2510                         goto make_route;
2511                 }
2512                 if (!fl4->saddr) {
2513                         if (ipv4_is_multicast(fl4->daddr))
2514                                 fl4->saddr = inet_select_addr(dev_out, 0,
2515                                                               fl4->flowi4_scope);
2516                         else if (!fl4->daddr)
2517                                 fl4->saddr = inet_select_addr(dev_out, 0,
2518                                                               RT_SCOPE_HOST);
2519                 }
2520         }
2521
2522         if (!fl4->daddr) {
2523                 fl4->daddr = fl4->saddr;
2524                 if (!fl4->daddr)
2525                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2526                 dev_out = net->loopback_dev;
2527                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2528                 res->type = RTN_LOCAL;
2529                 flags |= RTCF_LOCAL;
2530                 goto make_route;
2531         }
2532
2533         err = fib_lookup(net, fl4, res, 0);
2534         if (err) {
2535                 res->fi = NULL;
2536                 res->table = NULL;
2537                 if (fl4->flowi4_oif &&
2538                     (ipv4_is_multicast(fl4->daddr) ||
2539                     !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2540                         /* Apparently, routing tables are wrong. Assume,
2541                            that the destination is on link.
2542
2543                            WHY? DW.
2544                            Because we are allowed to send to iface
2545                            even if it has NO routes and NO assigned
2546                            addresses. When oif is specified, routing
2547                            tables are looked up with only one purpose:
2548                            to catch if destination is gatewayed, rather than
2549                            direct. Moreover, if MSG_DONTROUTE is set,
2550                            we send packet, ignoring both routing tables
2551                            and ifaddr state. --ANK
2552
2553
2554                            We could make it even if oif is unknown,
2555                            likely IPv6, but we do not.
2556                          */
2557
2558                         if (fl4->saddr == 0)
2559                                 fl4->saddr = inet_select_addr(dev_out, 0,
2560                                                               RT_SCOPE_LINK);
2561                         res->type = RTN_UNICAST;
2562                         goto make_route;
2563                 }
2564                 rth = ERR_PTR(err);
2565                 goto out;
2566         }
2567
2568         if (res->type == RTN_LOCAL) {
2569                 if (!fl4->saddr) {
2570                         if (res->fi->fib_prefsrc)
2571                                 fl4->saddr = res->fi->fib_prefsrc;
2572                         else
2573                                 fl4->saddr = fl4->daddr;
2574                 }
2575
2576                 /* L3 master device is the loopback for that domain */
2577                 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2578                         net->loopback_dev;
2579
2580                 /* make sure orig_oif points to fib result device even
2581                  * though packet rx/tx happens over loopback or l3mdev
2582                  */
2583                 orig_oif = FIB_RES_OIF(*res);
2584
2585                 fl4->flowi4_oif = dev_out->ifindex;
2586                 flags |= RTCF_LOCAL;
2587                 goto make_route;
2588         }
2589
2590         fib_select_path(net, res, fl4, skb);
2591
2592         dev_out = FIB_RES_DEV(*res);
2593         fl4->flowi4_oif = dev_out->ifindex;
2594
2595
2596 make_route:
2597         rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2598
2599 out:
2600         return rth;
2601 }
2602
2603 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2604 {
2605         return NULL;
2606 }
2607
2608 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2609 {
2610         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2611
2612         return mtu ? : dst->dev->mtu;
2613 }
2614
2615 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2616                                           struct sk_buff *skb, u32 mtu)
2617 {
2618 }
2619
2620 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2621                                        struct sk_buff *skb)
2622 {
2623 }
2624
2625 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2626                                           unsigned long old)
2627 {
2628         return NULL;
2629 }
2630
2631 static struct dst_ops ipv4_dst_blackhole_ops = {
2632         .family                 =       AF_INET,
2633         .check                  =       ipv4_blackhole_dst_check,
2634         .mtu                    =       ipv4_blackhole_mtu,
2635         .default_advmss         =       ipv4_default_advmss,
2636         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2637         .redirect               =       ipv4_rt_blackhole_redirect,
2638         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2639         .neigh_lookup           =       ipv4_neigh_lookup,
2640 };
2641
2642 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2643 {
2644         struct rtable *ort = (struct rtable *) dst_orig;
2645         struct rtable *rt;
2646
2647         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2648         if (rt) {
2649                 struct dst_entry *new = &rt->dst;
2650
2651                 new->__use = 1;
2652                 new->input = dst_discard;
2653                 new->output = dst_discard_out;
2654
2655                 new->dev = net->loopback_dev;
2656                 if (new->dev)
2657                         dev_hold(new->dev);
2658
2659                 rt->rt_is_input = ort->rt_is_input;
2660                 rt->rt_iif = ort->rt_iif;
2661                 rt->rt_pmtu = ort->rt_pmtu;
2662                 rt->rt_mtu_locked = ort->rt_mtu_locked;
2663
2664                 rt->rt_genid = rt_genid_ipv4(net);
2665                 rt->rt_flags = ort->rt_flags;
2666                 rt->rt_type = ort->rt_type;
2667                 rt->rt_gw_family = ort->rt_gw_family;
2668                 if (rt->rt_gw_family == AF_INET)
2669                         rt->rt_gw4 = ort->rt_gw4;
2670                 else if (rt->rt_gw_family == AF_INET6)
2671                         rt->rt_gw6 = ort->rt_gw6;
2672
2673                 INIT_LIST_HEAD(&rt->rt_uncached);
2674         }
2675
2676         dst_release(dst_orig);
2677
2678         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2679 }
2680
2681 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2682                                     const struct sock *sk)
2683 {
2684         struct rtable *rt = __ip_route_output_key(net, flp4);
2685
2686         if (IS_ERR(rt))
2687                 return rt;
2688
2689         if (flp4->flowi4_proto)
2690                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2691                                                         flowi4_to_flowi(flp4),
2692                                                         sk, 0);
2693
2694         return rt;
2695 }
2696 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2697
2698 /* called with rcu_read_lock held */
2699 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2700                         struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2701                         struct sk_buff *skb, u32 portid, u32 seq)
2702 {
2703         struct rtmsg *r;
2704         struct nlmsghdr *nlh;
2705         unsigned long expires = 0;
2706         u32 error;
2707         u32 metrics[RTAX_MAX];
2708
2709         nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
2710         if (!nlh)
2711                 return -EMSGSIZE;
2712
2713         r = nlmsg_data(nlh);
2714         r->rtm_family    = AF_INET;
2715         r->rtm_dst_len  = 32;
2716         r->rtm_src_len  = 0;
2717         r->rtm_tos      = fl4->flowi4_tos;
2718         r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2719         if (nla_put_u32(skb, RTA_TABLE, table_id))
2720                 goto nla_put_failure;
2721         r->rtm_type     = rt->rt_type;
2722         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2723         r->rtm_protocol = RTPROT_UNSPEC;
2724         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2725         if (rt->rt_flags & RTCF_NOTIFY)
2726                 r->rtm_flags |= RTM_F_NOTIFY;
2727         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2728                 r->rtm_flags |= RTCF_DOREDIRECT;
2729
2730         if (nla_put_in_addr(skb, RTA_DST, dst))
2731                 goto nla_put_failure;
2732         if (src) {
2733                 r->rtm_src_len = 32;
2734                 if (nla_put_in_addr(skb, RTA_SRC, src))
2735                         goto nla_put_failure;
2736         }
2737         if (rt->dst.dev &&
2738             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2739                 goto nla_put_failure;
2740 #ifdef CONFIG_IP_ROUTE_CLASSID
2741         if (rt->dst.tclassid &&
2742             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2743                 goto nla_put_failure;
2744 #endif
2745         if (!rt_is_input_route(rt) &&
2746             fl4->saddr != src) {
2747                 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2748                         goto nla_put_failure;
2749         }
2750         if (rt->rt_gw_family == AF_INET &&
2751             nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2752                 goto nla_put_failure;
2753         } else if (rt->rt_gw_family == AF_INET6) {
2754                 int alen = sizeof(struct in6_addr);
2755                 struct nlattr *nla;
2756                 struct rtvia *via;
2757
2758                 nla = nla_reserve(skb, RTA_VIA, alen + 2);
2759                 if (!nla)
2760                         goto nla_put_failure;
2761
2762                 via = nla_data(nla);
2763                 via->rtvia_family = AF_INET6;
2764                 memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2765         }
2766
2767         expires = rt->dst.expires;
2768         if (expires) {
2769                 unsigned long now = jiffies;
2770
2771                 if (time_before(now, expires))
2772                         expires -= now;
2773                 else
2774                         expires = 0;
2775         }
2776
2777         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2778         if (rt->rt_pmtu && expires)
2779                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2780         if (rt->rt_mtu_locked && expires)
2781                 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2782         if (rtnetlink_put_metrics(skb, metrics) < 0)
2783                 goto nla_put_failure;
2784
2785         if (fl4->flowi4_mark &&
2786             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2787                 goto nla_put_failure;
2788
2789         if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2790             nla_put_u32(skb, RTA_UID,
2791                         from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2792                 goto nla_put_failure;
2793
2794         error = rt->dst.error;
2795
2796         if (rt_is_input_route(rt)) {
2797 #ifdef CONFIG_IP_MROUTE
2798                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2799                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2800                         int err = ipmr_get_route(net, skb,
2801                                                  fl4->saddr, fl4->daddr,
2802                                                  r, portid);
2803
2804                         if (err <= 0) {
2805                                 if (err == 0)
2806                                         return 0;
2807                                 goto nla_put_failure;
2808                         }
2809                 } else
2810 #endif
2811                         if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2812                                 goto nla_put_failure;
2813         }
2814
2815         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2816                 goto nla_put_failure;
2817
2818         nlmsg_end(skb, nlh);
2819         return 0;
2820
2821 nla_put_failure:
2822         nlmsg_cancel(skb, nlh);
2823         return -EMSGSIZE;
2824 }
2825
2826 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
2827                                                    u8 ip_proto, __be16 sport,
2828                                                    __be16 dport)
2829 {
2830         struct sk_buff *skb;
2831         struct iphdr *iph;
2832
2833         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2834         if (!skb)
2835                 return NULL;
2836
2837         /* Reserve room for dummy headers, this skb can pass
2838          * through good chunk of routing engine.
2839          */
2840         skb_reset_mac_header(skb);
2841         skb_reset_network_header(skb);
2842         skb->protocol = htons(ETH_P_IP);
2843         iph = skb_put(skb, sizeof(struct iphdr));
2844         iph->protocol = ip_proto;
2845         iph->saddr = src;
2846         iph->daddr = dst;
2847         iph->version = 0x4;
2848         iph->frag_off = 0;
2849         iph->ihl = 0x5;
2850         skb_set_transport_header(skb, skb->len);
2851
2852         switch (iph->protocol) {
2853         case IPPROTO_UDP: {
2854                 struct udphdr *udph;
2855
2856                 udph = skb_put_zero(skb, sizeof(struct udphdr));
2857                 udph->source = sport;
2858                 udph->dest = dport;
2859                 udph->len = sizeof(struct udphdr);
2860                 udph->check = 0;
2861                 break;
2862         }
2863         case IPPROTO_TCP: {
2864                 struct tcphdr *tcph;
2865
2866                 tcph = skb_put_zero(skb, sizeof(struct tcphdr));
2867                 tcph->source    = sport;
2868                 tcph->dest      = dport;
2869                 tcph->doff      = sizeof(struct tcphdr) / 4;
2870                 tcph->rst = 1;
2871                 tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
2872                                             src, dst, 0);
2873                 break;
2874         }
2875         case IPPROTO_ICMP: {
2876                 struct icmphdr *icmph;
2877
2878                 icmph = skb_put_zero(skb, sizeof(struct icmphdr));
2879                 icmph->type = ICMP_ECHO;
2880                 icmph->code = 0;
2881         }
2882         }
2883
2884         return skb;
2885 }
2886
2887 static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
2888                                        const struct nlmsghdr *nlh,
2889                                        struct nlattr **tb,
2890                                        struct netlink_ext_ack *extack)
2891 {
2892         struct rtmsg *rtm;
2893         int i, err;
2894
2895         if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
2896                 NL_SET_ERR_MSG(extack,
2897                                "ipv4: Invalid header for route get request");
2898                 return -EINVAL;
2899         }
2900
2901         if (!netlink_strict_get_check(skb))
2902                 return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
2903                                               rtm_ipv4_policy, extack);
2904
2905         rtm = nlmsg_data(nlh);
2906         if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
2907             (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
2908             rtm->rtm_table || rtm->rtm_protocol ||
2909             rtm->rtm_scope || rtm->rtm_type) {
2910                 NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
2911                 return -EINVAL;
2912         }
2913
2914         if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
2915                                RTM_F_LOOKUP_TABLE |
2916                                RTM_F_FIB_MATCH)) {
2917                 NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
2918                 return -EINVAL;
2919         }
2920
2921         err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
2922                                             rtm_ipv4_policy, extack);
2923         if (err)
2924                 return err;
2925
2926         if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
2927             (tb[RTA_DST] && !rtm->rtm_dst_len)) {
2928                 NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
2929                 return -EINVAL;
2930         }
2931
2932         for (i = 0; i <= RTA_MAX; i++) {
2933                 if (!tb[i])
2934                         continue;
2935
2936                 switch (i) {
2937                 case RTA_IIF:
2938                 case RTA_OIF:
2939                 case RTA_SRC:
2940                 case RTA_DST:
2941                 case RTA_IP_PROTO:
2942                 case RTA_SPORT:
2943                 case RTA_DPORT:
2944                 case RTA_MARK:
2945                 case RTA_UID:
2946                         break;
2947                 default:
2948                         NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
2949                         return -EINVAL;
2950                 }
2951         }
2952
2953         return 0;
2954 }
2955
2956 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2957                              struct netlink_ext_ack *extack)
2958 {
2959         struct net *net = sock_net(in_skb->sk);
2960         struct nlattr *tb[RTA_MAX+1];
2961         u32 table_id = RT_TABLE_MAIN;
2962         __be16 sport = 0, dport = 0;
2963         struct fib_result res = {};
2964         u8 ip_proto = IPPROTO_UDP;
2965         struct rtable *rt = NULL;
2966         struct sk_buff *skb;
2967         struct rtmsg *rtm;
2968         struct flowi4 fl4 = {};
2969         __be32 dst = 0;
2970         __be32 src = 0;
2971         kuid_t uid;
2972         u32 iif;
2973         int err;
2974         int mark;
2975
2976         err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
2977         if (err < 0)
2978                 return err;
2979
2980         rtm = nlmsg_data(nlh);
2981         src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2982         dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2983         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2984         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2985         if (tb[RTA_UID])
2986                 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2987         else
2988                 uid = (iif ? INVALID_UID : current_uid());
2989
2990         if (tb[RTA_IP_PROTO]) {
2991                 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
2992                                                   &ip_proto, AF_INET, extack);
2993                 if (err)
2994                         return err;
2995         }
2996
2997         if (tb[RTA_SPORT])
2998                 sport = nla_get_be16(tb[RTA_SPORT]);
2999
3000         if (tb[RTA_DPORT])
3001                 dport = nla_get_be16(tb[RTA_DPORT]);
3002
3003         skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
3004         if (!skb)
3005                 return -ENOBUFS;
3006
3007         fl4.daddr = dst;
3008         fl4.saddr = src;
3009         fl4.flowi4_tos = rtm->rtm_tos;
3010         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
3011         fl4.flowi4_mark = mark;
3012         fl4.flowi4_uid = uid;
3013         if (sport)
3014                 fl4.fl4_sport = sport;
3015         if (dport)
3016                 fl4.fl4_dport = dport;
3017         fl4.flowi4_proto = ip_proto;
3018
3019         rcu_read_lock();
3020
3021         if (iif) {
3022                 struct net_device *dev;
3023
3024                 dev = dev_get_by_index_rcu(net, iif);
3025                 if (!dev) {
3026                         err = -ENODEV;
3027                         goto errout_rcu;
3028                 }
3029
3030                 fl4.flowi4_iif = iif; /* for rt_fill_info */
3031                 skb->dev        = dev;
3032                 skb->mark       = mark;
3033                 err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
3034                                          dev, &res);
3035
3036                 rt = skb_rtable(skb);
3037                 if (err == 0 && rt->dst.error)
3038                         err = -rt->dst.error;
3039         } else {
3040                 fl4.flowi4_iif = LOOPBACK_IFINDEX;
3041                 skb->dev = net->loopback_dev;
3042                 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
3043                 err = 0;
3044                 if (IS_ERR(rt))
3045                         err = PTR_ERR(rt);
3046                 else
3047                         skb_dst_set(skb, &rt->dst);
3048         }
3049
3050         if (err)
3051                 goto errout_rcu;
3052
3053         if (rtm->rtm_flags & RTM_F_NOTIFY)
3054                 rt->rt_flags |= RTCF_NOTIFY;
3055
3056         if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
3057                 table_id = res.table ? res.table->tb_id : 0;
3058
3059         /* reset skb for netlink reply msg */
3060         skb_trim(skb, 0);
3061         skb_reset_network_header(skb);
3062         skb_reset_transport_header(skb);
3063         skb_reset_mac_header(skb);
3064
3065         if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3066                 if (!res.fi) {
3067                         err = fib_props[res.type].error;
3068                         if (!err)
3069                                 err = -EHOSTUNREACH;
3070                         goto errout_rcu;
3071                 }
3072                 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
3073                                     nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
3074                                     rt->rt_type, res.prefix, res.prefixlen,
3075                                     fl4.flowi4_tos, res.fi, 0);
3076         } else {
3077                 err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
3078                                    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
3079         }
3080         if (err < 0)
3081                 goto errout_rcu;
3082
3083         rcu_read_unlock();
3084
3085         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3086
3087 errout_free:
3088         return err;
3089 errout_rcu:
3090         rcu_read_unlock();
3091         kfree_skb(skb);
3092         goto errout_free;
3093 }
3094
3095 void ip_rt_multicast_event(struct in_device *in_dev)
3096 {
3097         rt_cache_flush(dev_net(in_dev->dev));
3098 }
3099
3100 #ifdef CONFIG_SYSCTL
3101 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
3102 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
3103 static int ip_rt_gc_elasticity __read_mostly    = 8;
3104 static int ip_min_valid_pmtu __read_mostly      = IPV4_MIN_MTU;
3105
3106 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
3107                                         void __user *buffer,
3108                                         size_t *lenp, loff_t *ppos)
3109 {
3110         struct net *net = (struct net *)__ctl->extra1;
3111
3112         if (write) {
3113                 rt_cache_flush(net);
3114                 fnhe_genid_bump(net);
3115                 return 0;
3116         }
3117
3118         return -EINVAL;
3119 }
3120
3121 static struct ctl_table ipv4_route_table[] = {
3122         {
3123                 .procname       = "gc_thresh",
3124                 .data           = &ipv4_dst_ops.gc_thresh,
3125                 .maxlen         = sizeof(int),
3126                 .mode           = 0644,
3127                 .proc_handler   = proc_dointvec,
3128         },
3129         {
3130                 .procname       = "max_size",
3131                 .data           = &ip_rt_max_size,
3132                 .maxlen         = sizeof(int),
3133                 .mode           = 0644,
3134                 .proc_handler   = proc_dointvec,
3135         },
3136         {
3137                 /*  Deprecated. Use gc_min_interval_ms */
3138
3139                 .procname       = "gc_min_interval",
3140                 .data           = &ip_rt_gc_min_interval,
3141                 .maxlen         = sizeof(int),
3142                 .mode           = 0644,
3143                 .proc_handler   = proc_dointvec_jiffies,
3144         },
3145         {
3146                 .procname       = "gc_min_interval_ms",
3147                 .data           = &ip_rt_gc_min_interval,
3148                 .maxlen         = sizeof(int),
3149                 .mode           = 0644,
3150                 .proc_handler   = proc_dointvec_ms_jiffies,
3151         },
3152         {
3153                 .procname       = "gc_timeout",
3154                 .data           = &ip_rt_gc_timeout,
3155                 .maxlen         = sizeof(int),
3156                 .mode           = 0644,
3157                 .proc_handler   = proc_dointvec_jiffies,
3158         },
3159         {
3160                 .procname       = "gc_interval",
3161                 .data           = &ip_rt_gc_interval,
3162                 .maxlen         = sizeof(int),
3163                 .mode           = 0644,
3164                 .proc_handler   = proc_dointvec_jiffies,
3165         },
3166         {
3167                 .procname       = "redirect_load",
3168                 .data           = &ip_rt_redirect_load,
3169                 .maxlen         = sizeof(int),
3170                 .mode           = 0644,
3171                 .proc_handler   = proc_dointvec,
3172         },
3173         {
3174                 .procname       = "redirect_number",
3175                 .data           = &ip_rt_redirect_number,
3176                 .maxlen         = sizeof(int),
3177                 .mode           = 0644,
3178                 .proc_handler   = proc_dointvec,
3179         },
3180         {
3181                 .procname       = "redirect_silence",
3182                 .data           = &ip_rt_redirect_silence,
3183                 .maxlen         = sizeof(int),
3184                 .mode           = 0644,
3185                 .proc_handler   = proc_dointvec,
3186         },
3187         {
3188                 .procname       = "error_cost",
3189                 .data           = &ip_rt_error_cost,
3190                 .maxlen         = sizeof(int),
3191                 .mode           = 0644,
3192                 .proc_handler   = proc_dointvec,
3193         },
3194         {
3195                 .procname       = "error_burst",
3196                 .data           = &ip_rt_error_burst,
3197                 .maxlen         = sizeof(int),
3198                 .mode           = 0644,
3199                 .proc_handler   = proc_dointvec,
3200         },
3201         {
3202                 .procname       = "gc_elasticity",
3203                 .data           = &ip_rt_gc_elasticity,
3204                 .maxlen         = sizeof(int),
3205                 .mode           = 0644,
3206                 .proc_handler   = proc_dointvec,
3207         },
3208         {
3209                 .procname       = "mtu_expires",
3210                 .data           = &ip_rt_mtu_expires,
3211                 .maxlen         = sizeof(int),
3212                 .mode           = 0644,
3213                 .proc_handler   = proc_dointvec_jiffies,
3214         },
3215         {
3216                 .procname       = "min_pmtu",
3217                 .data           = &ip_rt_min_pmtu,
3218                 .maxlen         = sizeof(int),
3219                 .mode           = 0644,
3220                 .proc_handler   = proc_dointvec_minmax,
3221                 .extra1         = &ip_min_valid_pmtu,
3222         },
3223         {
3224                 .procname       = "min_adv_mss",
3225                 .data           = &ip_rt_min_advmss,
3226                 .maxlen         = sizeof(int),
3227                 .mode           = 0644,
3228                 .proc_handler   = proc_dointvec,
3229         },
3230         { }
3231 };
3232
3233 static struct ctl_table ipv4_route_flush_table[] = {
3234         {
3235                 .procname       = "flush",
3236                 .maxlen         = sizeof(int),
3237                 .mode           = 0200,
3238                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3239         },
3240         { },
3241 };
3242
3243 static __net_init int sysctl_route_net_init(struct net *net)
3244 {
3245         struct ctl_table *tbl;
3246
3247         tbl = ipv4_route_flush_table;
3248         if (!net_eq(net, &init_net)) {
3249                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3250                 if (!tbl)
3251                         goto err_dup;
3252
3253                 /* Don't export sysctls to unprivileged users */
3254                 if (net->user_ns != &init_user_ns)
3255                         tbl[0].procname = NULL;
3256         }
3257         tbl[0].extra1 = net;
3258
3259         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3260         if (!net->ipv4.route_hdr)
3261                 goto err_reg;
3262         return 0;
3263
3264 err_reg:
3265         if (tbl != ipv4_route_flush_table)
3266                 kfree(tbl);
3267 err_dup:
3268         return -ENOMEM;
3269 }
3270
3271 static __net_exit void sysctl_route_net_exit(struct net *net)
3272 {
3273         struct ctl_table *tbl;
3274
3275         tbl = net->ipv4.route_hdr->ctl_table_arg;
3276         unregister_net_sysctl_table(net->ipv4.route_hdr);
3277         BUG_ON(tbl == ipv4_route_flush_table);
3278         kfree(tbl);
3279 }
3280
3281 static __net_initdata struct pernet_operations sysctl_route_ops = {
3282         .init = sysctl_route_net_init,
3283         .exit = sysctl_route_net_exit,
3284 };
3285 #endif
3286
3287 static __net_init int rt_genid_init(struct net *net)
3288 {
3289         atomic_set(&net->ipv4.rt_genid, 0);
3290         atomic_set(&net->fnhe_genid, 0);
3291         atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3292         return 0;
3293 }
3294
3295 static __net_initdata struct pernet_operations rt_genid_ops = {
3296         .init = rt_genid_init,
3297 };
3298
3299 static int __net_init ipv4_inetpeer_init(struct net *net)
3300 {
3301         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3302
3303         if (!bp)
3304                 return -ENOMEM;
3305         inet_peer_base_init(bp);
3306         net->ipv4.peers = bp;
3307         return 0;
3308 }
3309
3310 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3311 {
3312         struct inet_peer_base *bp = net->ipv4.peers;
3313
3314         net->ipv4.peers = NULL;
3315         inetpeer_invalidate_tree(bp);
3316         kfree(bp);
3317 }
3318
3319 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3320         .init   =       ipv4_inetpeer_init,
3321         .exit   =       ipv4_inetpeer_exit,
3322 };
3323
3324 #ifdef CONFIG_IP_ROUTE_CLASSID
3325 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3326 #endif /* CONFIG_IP_ROUTE_CLASSID */
3327
3328 int __init ip_rt_init(void)
3329 {
3330         int cpu;
3331
3332         ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents),
3333                                   GFP_KERNEL);
3334         if (!ip_idents)
3335                 panic("IP: failed to allocate ip_idents\n");
3336
3337         prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3338
3339         ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3340         if (!ip_tstamps)
3341                 panic("IP: failed to allocate ip_tstamps\n");
3342
3343         for_each_possible_cpu(cpu) {
3344                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3345
3346                 INIT_LIST_HEAD(&ul->head);
3347                 spin_lock_init(&ul->lock);
3348         }
3349 #ifdef CONFIG_IP_ROUTE_CLASSID
3350         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3351         if (!ip_rt_acct)
3352                 panic("IP: failed to allocate ip_rt_acct\n");
3353 #endif
3354
3355         ipv4_dst_ops.kmem_cachep =
3356                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3357                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3358
3359         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3360
3361         if (dst_entries_init(&ipv4_dst_ops) < 0)
3362                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3363
3364         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3365                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3366
3367         ipv4_dst_ops.gc_thresh = ~0;
3368         ip_rt_max_size = INT_MAX;
3369
3370         devinet_init();
3371         ip_fib_init();
3372
3373         if (ip_rt_proc_init())
3374                 pr_err("Unable to create route proc files\n");
3375 #ifdef CONFIG_XFRM
3376         xfrm_init();
3377         xfrm4_init();
3378 #endif
3379         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3380                       RTNL_FLAG_DOIT_UNLOCKED);
3381
3382 #ifdef CONFIG_SYSCTL
3383         register_pernet_subsys(&sysctl_route_ops);
3384 #endif
3385         register_pernet_subsys(&rt_genid_ops);
3386         register_pernet_subsys(&ipv4_inetpeer_ops);
3387         return 0;
3388 }
3389
3390 #ifdef CONFIG_SYSCTL
3391 /*
3392  * We really need to sanitize the damn ipv4 init order, then all
3393  * this nonsense will go away.
3394  */
3395 void __init ip_static_sysctl_init(void)
3396 {
3397         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3398 }
3399 #endif