net/ipv4/route.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
7 * ROUTE - implementation of the IP router.
9 * Authors: Ross Biro
10 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
11 * Alan Cox, <gw4pts@gw4pts.ampr.org>
12 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
13 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15 * Fixes:
16 * Alan Cox : Verify area fixes.
17 * Alan Cox : cli() protects routing changes
18 * Rui Oliveira : ICMP routing table updates
19 * (rco@di.uminho.pt) Routing table insertion and update
20 * Linus Torvalds : Rewrote bits to be sensible
21 * Alan Cox : Added BSD route gw semantics
22 * Alan Cox : Super /proc >4K
23 * Alan Cox : MTU in route table
24 * Alan Cox : MSS actually. Also added the window
25 * clamper.
26 * Sam Lantinga : Fixed route matching in rt_del()
27 * Alan Cox : Routing cache support.
28 * Alan Cox : Removed compatibility cruft.
29 * Alan Cox : RTF_REJECT support.
30 * Alan Cox : TCP irtt support.
31 * Jonathan Naylor : Added Metric support.
32 * Miquel van Smoorenburg : BSD API fixes.
33 * Miquel van Smoorenburg : Metrics.
34 * Alan Cox : Use __u32 properly
35 * Alan Cox : Aligned routing errors more closely with BSD
36 * our system is still very different.
37 * Alan Cox : Faster /proc handling
38 * Alexey Kuznetsov : Massive rework to support tree based routing,
39 * routing caches and better behaviour.
41 * Olaf Erb : irtt wasn't being copied right.
42 * Bjorn Ekwall : Kerneld route support.
43 * Alan Cox : Multicast fixed (I hope)
44 * Pavel Krauz : Limited broadcast fixed
45 * Mike McLagan : Routing by source
46 * Alexey Kuznetsov : End of old history. Split to fib.c and
47 * route.c and rewritten from scratch.
48 * Andi Kleen : Load-limit warning messages.
49 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
50 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
51 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
52 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
53 * Marc Boucher : routing by fwmark
54 * Robert Olsson : Added rt_cache statistics
55 * Arnaldo C. Melo : Convert proc stuff to seq_file
56 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
57 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
58 * Ilia Sotnikov : Removed TOS from hash calculations
61 #define pr_fmt(fmt) "IPv4: " fmt
63 #include <linux/module.h>
64 #include <linux/uaccess.h>
65 #include <linux/bitops.h>
66 #include <linux/types.h>
67 #include <linux/kernel.h>
68 #include <linux/mm.h>
69 #include <linux/string.h>
70 #include <linux/socket.h>
71 #include <linux/sockios.h>
72 #include <linux/errno.h>
73 #include <linux/in.h>
74 #include <linux/inet.h>
75 #include <linux/netdevice.h>
76 #include <linux/proc_fs.h>
77 #include <linux/init.h>
78 #include <linux/skbuff.h>
79 #include <linux/inetdevice.h>
80 #include <linux/igmp.h>
81 #include <linux/pkt_sched.h>
82 #include <linux/mroute.h>
83 #include <linux/netfilter_ipv4.h>
84 #include <linux/random.h>
85 #include <linux/rcupdate.h>
86 #include <linux/times.h>
87 #include <linux/slab.h>
88 #include <linux/jhash.h>
89 #include <net/dst.h>
90 #include <net/dst_metadata.h>
91 #include <net/net_namespace.h>
92 #include <net/protocol.h>
93 #include <net/ip.h>
94 #include <net/route.h>
95 #include <net/inetpeer.h>
96 #include <net/sock.h>
97 #include <net/ip_fib.h>
98 #include <net/arp.h>
99 #include <net/tcp.h>
100 #include <net/icmp.h>
101 #include <net/xfrm.h>
102 #include <net/lwtunnel.h>
103 #include <net/netevent.h>
104 #include <net/rtnetlink.h>
105 #ifdef CONFIG_SYSCTL
106 #include <linux/sysctl.h>
107 #endif
108 #include <net/secure_seq.h>
109 #include <net/ip_tunnels.h>
110 #include <net/l3mdev.h>
112 #include "fib_lookup.h"
114 #define RT_FL_TOS(oldflp4) \
115 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
117 #define RT_GC_TIMEOUT (300*HZ)
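/* Tunables for ICMP redirect rate limiting, ICMP error rate limiting and
 * learned-PMTU handling; most of them are writable through the
 * net.ipv4.route.* sysctls.
 */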
119 static int ip_rt_max_size;
120 static int ip_rt_redirect_number __read_mostly = 9;
121 static int ip_rt_redirect_load __read_mostly = HZ / 50;
122 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
123 static int ip_rt_error_cost __read_mostly = HZ;
124 static int ip_rt_error_burst __read_mostly = 5 * HZ;
125 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
126 static u32 ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
127 static int ip_rt_min_advmss __read_mostly = 256;
129 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
132 * Interface to generic destination cache.
135 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
136 static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
137 static unsigned int ipv4_mtu(const struct dst_entry *dst);
138 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
139 static void ipv4_link_failure(struct sk_buff *skb);
140 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
141 struct sk_buff *skb, u32 mtu);
142 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
143 struct sk_buff *skb);
144 static void ipv4_dst_destroy(struct dst_entry *dst);
146 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
148 WARN_ON(1);
149 return NULL;
152 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
153 struct sk_buff *skb,
154 const void *daddr);
155 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
157 static struct dst_ops ipv4_dst_ops = {
158 .family = AF_INET,
159 .check = ipv4_dst_check,
160 .default_advmss = ipv4_default_advmss,
161 .mtu = ipv4_mtu,
162 .cow_metrics = ipv4_cow_metrics,
163 .destroy = ipv4_dst_destroy,
164 .negative_advice = ipv4_negative_advice,
165 .link_failure = ipv4_link_failure,
166 .update_pmtu = ip_rt_update_pmtu,
167 .redirect = ip_do_redirect,
168 .local_out = __ip_local_out,
169 .neigh_lookup = ipv4_neigh_lookup,
170 .confirm_neigh = ipv4_confirm_neigh,
173 #define ECN_OR_COST(class) TC_PRIO_##class
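/* ip_tos2prio[] maps the legacy 4-bit IPv4 TOS field (indexed via
 * rt_tos2priority()) to an internal skb queueing priority (TC_PRIO_*).
 */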
175 const __u8 ip_tos2prio[16] = {
176 TC_PRIO_BESTEFFORT,
177 ECN_OR_COST(BESTEFFORT),
178 TC_PRIO_BESTEFFORT,
179 ECN_OR_COST(BESTEFFORT),
180 TC_PRIO_BULK,
181 ECN_OR_COST(BULK),
182 TC_PRIO_BULK,
183 ECN_OR_COST(BULK),
184 TC_PRIO_INTERACTIVE,
185 ECN_OR_COST(INTERACTIVE),
186 TC_PRIO_INTERACTIVE,
187 ECN_OR_COST(INTERACTIVE),
188 TC_PRIO_INTERACTIVE_BULK,
189 ECN_OR_COST(INTERACTIVE_BULK),
190 TC_PRIO_INTERACTIVE_BULK,
191 ECN_OR_COST(INTERACTIVE_BULK)
193 EXPORT_SYMBOL(ip_tos2prio);
195 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
196 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
198 #ifdef CONFIG_PROC_FS
199 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
201 if (*pos)
202 return NULL;
203 return SEQ_START_TOKEN;
206 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
208 ++*pos;
209 return NULL;
212 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
216 static int rt_cache_seq_show(struct seq_file *seq, void *v)
218 if (v == SEQ_START_TOKEN)
219 seq_printf(seq, "%-127s\n",
220 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
221 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
222 "HHUptod\tSpecDst");
223 return 0;
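/* Since the IPv4 routing cache was removed, /proc/net/rt_cache is kept only
 * for backwards compatibility: the seq_file above emits the header line and
 * no entries.
 */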
226 static const struct seq_operations rt_cache_seq_ops = {
227 .start = rt_cache_seq_start,
228 .next = rt_cache_seq_next,
229 .stop = rt_cache_seq_stop,
230 .show = rt_cache_seq_show,
233 static int rt_cache_seq_open(struct inode *inode, struct file *file)
235 return seq_open(file, &rt_cache_seq_ops);
238 static const struct file_operations rt_cache_seq_fops = {
239 .open = rt_cache_seq_open,
240 .read = seq_read,
241 .llseek = seq_lseek,
242 .release = seq_release,
246 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
248 int cpu;
250 if (*pos == 0)
251 return SEQ_START_TOKEN;
253 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
254 if (!cpu_possible(cpu))
255 continue;
256 *pos = cpu+1;
257 return &per_cpu(rt_cache_stat, cpu);
259 return NULL;
262 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
264 int cpu;
266 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
267 if (!cpu_possible(cpu))
268 continue;
269 *pos = cpu+1;
270 return &per_cpu(rt_cache_stat, cpu);
272 return NULL;
276 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
281 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
283 struct rt_cache_stat *st = v;
285 if (v == SEQ_START_TOKEN) {
286 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
287 return 0;
290 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
291 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
292 dst_entries_get_slow(&ipv4_dst_ops),
293 0, /* st->in_hit */
294 st->in_slow_tot,
295 st->in_slow_mc,
296 st->in_no_route,
297 st->in_brd,
298 st->in_martian_dst,
299 st->in_martian_src,
301 0, /* st->out_hit */
302 st->out_slow_tot,
303 st->out_slow_mc,
305 0, /* st->gc_total */
306 0, /* st->gc_ignored */
307 0, /* st->gc_goal_miss */
308 0, /* st->gc_dst_overflow */
309 0, /* st->in_hlist_search */
310 0 /* st->out_hlist_search */
312 return 0;
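/* /proc/net/stat/rt_cache prints one rt_cache_stat line per possible CPU;
 * the columns that described the removed routing cache (in_hit, out_hit,
 * gc_* and *_hlist_search) are hard-coded to zero.
 */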
315 static const struct seq_operations rt_cpu_seq_ops = {
316 .start = rt_cpu_seq_start,
317 .next = rt_cpu_seq_next,
318 .stop = rt_cpu_seq_stop,
319 .show = rt_cpu_seq_show,
323 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
325 return seq_open(file, &rt_cpu_seq_ops);
328 static const struct file_operations rt_cpu_seq_fops = {
329 .open = rt_cpu_seq_open,
330 .read = seq_read,
331 .llseek = seq_lseek,
332 .release = seq_release,
335 #ifdef CONFIG_IP_ROUTE_CLASSID
336 static int rt_acct_proc_show(struct seq_file *m, void *v)
338 struct ip_rt_acct *dst, *src;
339 unsigned int i, j;
341 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
342 if (!dst)
343 return -ENOMEM;
345 for_each_possible_cpu(i) {
346 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
347 for (j = 0; j < 256; j++) {
348 dst[j].o_bytes += src[j].o_bytes;
349 dst[j].o_packets += src[j].o_packets;
350 dst[j].i_bytes += src[j].i_bytes;
351 dst[j].i_packets += src[j].i_packets;
355 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
356 kfree(dst);
357 return 0;
359 #endif
361 static int __net_init ip_rt_do_proc_init(struct net *net)
363 struct proc_dir_entry *pde;
365 pde = proc_create("rt_cache", 0444, net->proc_net,
366 &rt_cache_seq_fops);
367 if (!pde)
368 goto err1;
370 pde = proc_create("rt_cache", 0444,
371 net->proc_net_stat, &rt_cpu_seq_fops);
372 if (!pde)
373 goto err2;
375 #ifdef CONFIG_IP_ROUTE_CLASSID
376 pde = proc_create_single("rt_acct", 0, net->proc_net,
377 rt_acct_proc_show);
378 if (!pde)
379 goto err3;
380 #endif
381 return 0;
383 #ifdef CONFIG_IP_ROUTE_CLASSID
384 err3:
385 remove_proc_entry("rt_cache", net->proc_net_stat);
386 #endif
387 err2:
388 remove_proc_entry("rt_cache", net->proc_net);
389 err1:
390 return -ENOMEM;
393 static void __net_exit ip_rt_do_proc_exit(struct net *net)
395 remove_proc_entry("rt_cache", net->proc_net_stat);
396 remove_proc_entry("rt_cache", net->proc_net);
397 #ifdef CONFIG_IP_ROUTE_CLASSID
398 remove_proc_entry("rt_acct", net->proc_net);
399 #endif
402 static struct pernet_operations ip_rt_proc_ops __net_initdata = {
403 .init = ip_rt_do_proc_init,
404 .exit = ip_rt_do_proc_exit,
407 static int __init ip_rt_proc_init(void)
409 return register_pernet_subsys(&ip_rt_proc_ops);
412 #else
413 static inline int ip_rt_proc_init(void)
415 return 0;
417 #endif /* CONFIG_PROC_FS */
419 static inline bool rt_is_expired(const struct rtable *rth)
421 return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
424 void rt_cache_flush(struct net *net)
426 rt_genid_bump_ipv4(net);
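/* A flush simply bumps the per-namespace generation id; every dst whose
 * rt_genid no longer matches is then treated as stale by rt_is_expired().
 */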
429 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
430 struct sk_buff *skb,
431 const void *daddr)
433 const struct rtable *rt = container_of(dst, struct rtable, dst);
434 struct net_device *dev = dst->dev;
435 struct neighbour *n;
437 rcu_read_lock_bh();
439 if (likely(rt->rt_gw_family == AF_INET)) {
440 n = ip_neigh_gw4(dev, rt->rt_gw4);
441 } else if (rt->rt_gw_family == AF_INET6) {
442 n = ip_neigh_gw6(dev, &rt->rt_gw6);
443 } else {
444 __be32 pkey;
446 pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
447 n = ip_neigh_gw4(dev, pkey);
450 if (n && !refcount_inc_not_zero(&n->refcnt))
451 n = NULL;
453 rcu_read_unlock_bh();
455 return n;
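/* Neighbour lookup above prefers the route's cached IPv4/IPv6 gateway and
 * falls back to the packet's (or caller-supplied) destination address for
 * directly connected hosts; a reference is taken on the returned entry.
 */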
458 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
460 const struct rtable *rt = container_of(dst, struct rtable, dst);
461 struct net_device *dev = dst->dev;
462 const __be32 *pkey = daddr;
464 if (rt->rt_gw_family == AF_INET) {
465 pkey = (const __be32 *)&rt->rt_gw4;
466 } else if (rt->rt_gw_family == AF_INET6) {
467 return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
468 } else if (!daddr ||
469 (rt->rt_flags &
470 (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
471 return;
473 __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
476 #define IP_IDENTS_SZ 2048u
478 static atomic_t *ip_idents __read_mostly;
479 static u32 *ip_tstamps __read_mostly;
481 /* In order to protect privacy, we add a perturbation to identifiers
482 * if one generator is seldom used. This makes it hard for an attacker
483 * to infer how many packets were sent between two points in time.
485 u32 ip_idents_reserve(u32 hash, int segs)
487 u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
488 atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
489 u32 old = READ_ONCE(*p_tstamp);
490 u32 now = (u32)jiffies;
491 u32 new, delta = 0;
493 if (old != now && cmpxchg(p_tstamp, old, now) == old)
494 delta = prandom_u32_max(now - old);
496 /* Do not use atomic_add_return() as it makes UBSAN unhappy */
497 do {
498 old = (u32)atomic_read(p_id);
499 new = old + delta + segs;
500 } while (atomic_cmpxchg(p_id, old, new) != old);
502 return new - segs;
504 EXPORT_SYMBOL(ip_idents_reserve);
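/* ip_idents_reserve() selects one of IP_IDENTS_SZ counters by hash, advances
 * it atomically by the number of segments and, when the bucket has been idle,
 * adds a random offset so an observer cannot count packets sent in between.
 */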
506 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
508 u32 hash, id;
510 /* Note the following code is not safe, but this is okay. */
511 if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
512 get_random_bytes(&net->ipv4.ip_id_key,
513 sizeof(net->ipv4.ip_id_key));
515 hash = siphash_3u32((__force u32)iph->daddr,
516 (__force u32)iph->saddr,
517 iph->protocol,
518 &net->ipv4.ip_id_key);
519 id = ip_idents_reserve(hash, segs);
520 iph->id = htons(id);
522 EXPORT_SYMBOL(__ip_select_ident);
524 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
525 const struct sock *sk,
526 const struct iphdr *iph,
527 int oif, u8 tos,
528 u8 prot, u32 mark, int flow_flags)
530 if (sk) {
531 const struct inet_sock *inet = inet_sk(sk);
533 oif = sk->sk_bound_dev_if;
534 mark = sk->sk_mark;
535 tos = RT_CONN_FLAGS(sk);
536 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
538 flowi4_init_output(fl4, oif, mark, tos,
539 RT_SCOPE_UNIVERSE, prot,
540 flow_flags,
541 iph->daddr, iph->saddr, 0, 0,
542 sock_net_uid(net, sk));
545 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
546 const struct sock *sk)
548 const struct net *net = dev_net(skb->dev);
549 const struct iphdr *iph = ip_hdr(skb);
550 int oif = skb->dev->ifindex;
551 u8 tos = RT_TOS(iph->tos);
552 u8 prot = iph->protocol;
553 u32 mark = skb->mark;
555 __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
558 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
560 const struct inet_sock *inet = inet_sk(sk);
561 const struct ip_options_rcu *inet_opt;
562 __be32 daddr = inet->inet_daddr;
564 rcu_read_lock();
565 inet_opt = rcu_dereference(inet->inet_opt);
566 if (inet_opt && inet_opt->opt.srr)
567 daddr = inet_opt->opt.faddr;
568 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
569 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
570 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
571 inet_sk_flowi_flags(sk),
572 daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
573 rcu_read_unlock();
576 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
577 const struct sk_buff *skb)
579 if (skb)
580 build_skb_flow_key(fl4, skb, sk);
581 else
582 build_sk_flow_key(fl4, sk);
585 static DEFINE_SPINLOCK(fnhe_lock);
587 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
589 struct rtable *rt;
591 rt = rcu_dereference(fnhe->fnhe_rth_input);
592 if (rt) {
593 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
594 dst_dev_put(&rt->dst);
595 dst_release(&rt->dst);
597 rt = rcu_dereference(fnhe->fnhe_rth_output);
598 if (rt) {
599 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
600 dst_dev_put(&rt->dst);
601 dst_release(&rt->dst);
605 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
607 struct fib_nh_exception *fnhe, *oldest;
609 oldest = rcu_dereference(hash->chain);
610 for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
611 fnhe = rcu_dereference(fnhe->fnhe_next)) {
612 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
613 oldest = fnhe;
615 fnhe_flush_routes(oldest);
616 return oldest;
619 static inline u32 fnhe_hashfun(__be32 daddr)
621 static u32 fnhe_hashrnd __read_mostly;
622 u32 hval;
624 net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
625 hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
626 return hash_32(hval, FNHE_HASH_SHIFT);
629 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
631 rt->rt_pmtu = fnhe->fnhe_pmtu;
632 rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
633 rt->dst.expires = fnhe->fnhe_expires;
635 if (fnhe->fnhe_gw) {
636 rt->rt_flags |= RTCF_REDIRECTED;
637 rt->rt_gw_family = AF_INET;
638 rt->rt_gw4 = fnhe->fnhe_gw;
642 static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
643 __be32 gw, u32 pmtu, bool lock,
644 unsigned long expires)
646 struct fnhe_hash_bucket *hash;
647 struct fib_nh_exception *fnhe;
648 struct rtable *rt;
649 u32 genid, hval;
650 unsigned int i;
651 int depth;
653 genid = fnhe_genid(dev_net(nhc->nhc_dev));
654 hval = fnhe_hashfun(daddr);
656 spin_lock_bh(&fnhe_lock);
658 hash = rcu_dereference(nhc->nhc_exceptions);
659 if (!hash) {
660 hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
661 if (!hash)
662 goto out_unlock;
663 rcu_assign_pointer(nhc->nhc_exceptions, hash);
666 hash += hval;
668 depth = 0;
669 for (fnhe = rcu_dereference(hash->chain); fnhe;
670 fnhe = rcu_dereference(fnhe->fnhe_next)) {
671 if (fnhe->fnhe_daddr == daddr)
672 break;
673 depth++;
676 if (fnhe) {
677 if (fnhe->fnhe_genid != genid)
678 fnhe->fnhe_genid = genid;
679 if (gw)
680 fnhe->fnhe_gw = gw;
681 if (pmtu) {
682 fnhe->fnhe_pmtu = pmtu;
683 fnhe->fnhe_mtu_locked = lock;
685 fnhe->fnhe_expires = max(1UL, expires);
686 /* Update all cached dsts too */
687 rt = rcu_dereference(fnhe->fnhe_rth_input);
688 if (rt)
689 fill_route_from_fnhe(rt, fnhe);
690 rt = rcu_dereference(fnhe->fnhe_rth_output);
691 if (rt)
692 fill_route_from_fnhe(rt, fnhe);
693 } else {
694 if (depth > FNHE_RECLAIM_DEPTH)
695 fnhe = fnhe_oldest(hash);
696 else {
697 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
698 if (!fnhe)
699 goto out_unlock;
701 fnhe->fnhe_next = hash->chain;
702 rcu_assign_pointer(hash->chain, fnhe);
704 fnhe->fnhe_genid = genid;
705 fnhe->fnhe_daddr = daddr;
706 fnhe->fnhe_gw = gw;
707 fnhe->fnhe_pmtu = pmtu;
708 fnhe->fnhe_mtu_locked = lock;
709 fnhe->fnhe_expires = max(1UL, expires);
711 /* Exception created; mark the cached routes for the nexthop
712 * stale, so that anyone caching them rechecks whether this
713 * exception applies.
715 rt = rcu_dereference(nhc->nhc_rth_input);
716 if (rt)
717 rt->dst.obsolete = DST_OBSOLETE_KILL;
719 for_each_possible_cpu(i) {
720 struct rtable __rcu **prt;
721 prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
722 rt = rcu_dereference(*prt);
723 if (rt)
724 rt->dst.obsolete = DST_OBSOLETE_KILL;
728 fnhe->fnhe_stamp = jiffies;
730 out_unlock:
731 spin_unlock_bh(&fnhe_lock);
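/* update_or_create_fnhe() records state learned from ICMP (redirect gateway,
 * PMTU) as a per-destination nexthop exception under fnhe_lock, recycling the
 * oldest entry once a chain exceeds FNHE_RECLAIM_DEPTH; for a new exception
 * the nexthop's cached routes are marked DST_OBSOLETE_KILL so later lookups
 * recheck it.
 */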
734 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
735 bool kill_route)
737 __be32 new_gw = icmp_hdr(skb)->un.gateway;
738 __be32 old_gw = ip_hdr(skb)->saddr;
739 struct net_device *dev = skb->dev;
740 struct in_device *in_dev;
741 struct fib_result res;
742 struct neighbour *n;
743 struct net *net;
745 switch (icmp_hdr(skb)->code & 7) {
746 case ICMP_REDIR_NET:
747 case ICMP_REDIR_NETTOS:
748 case ICMP_REDIR_HOST:
749 case ICMP_REDIR_HOSTTOS:
750 break;
752 default:
753 return;
756 if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
757 return;
759 in_dev = __in_dev_get_rcu(dev);
760 if (!in_dev)
761 return;
763 net = dev_net(dev);
764 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
765 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
766 ipv4_is_zeronet(new_gw))
767 goto reject_redirect;
769 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
770 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
771 goto reject_redirect;
772 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
773 goto reject_redirect;
774 } else {
775 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
776 goto reject_redirect;
779 n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
780 if (!n)
781 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
782 if (!IS_ERR(n)) {
783 if (!(n->nud_state & NUD_VALID)) {
784 neigh_event_send(n, NULL);
785 } else {
786 if (fib_lookup(net, fl4, &res, 0) == 0) {
787 struct fib_nh_common *nhc = FIB_RES_NHC(res);
789 update_or_create_fnhe(nhc, fl4->daddr, new_gw,
790 0, false,
791 jiffies + ip_rt_gc_timeout);
793 if (kill_route)
794 rt->dst.obsolete = DST_OBSOLETE_KILL;
795 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
797 neigh_release(n);
799 return;
801 reject_redirect:
802 #ifdef CONFIG_IP_ROUTE_VERBOSE
803 if (IN_DEV_LOG_MARTIANS(in_dev)) {
804 const struct iphdr *iph = (const struct iphdr *) skb->data;
805 __be32 daddr = iph->daddr;
806 __be32 saddr = iph->saddr;
808 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
809 " Advised path = %pI4 -> %pI4\n",
810 &old_gw, dev->name, &new_gw,
811 &saddr, &daddr);
813 #endif
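/* __ip_do_redirect() accepts an ICMP redirect only when it comes from the
 * gateway currently in use, the interface allows redirects and the new
 * gateway is a sensible on-link unicast address; an accepted redirect is
 * stored as a nexthop exception instead of modifying the FIB.
 */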
817 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
819 struct rtable *rt;
820 struct flowi4 fl4;
821 const struct iphdr *iph = (const struct iphdr *) skb->data;
822 struct net *net = dev_net(skb->dev);
823 int oif = skb->dev->ifindex;
824 u8 tos = RT_TOS(iph->tos);
825 u8 prot = iph->protocol;
826 u32 mark = skb->mark;
828 rt = (struct rtable *) dst;
830 __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
831 __ip_do_redirect(rt, skb, &fl4, true);
834 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
836 struct rtable *rt = (struct rtable *)dst;
837 struct dst_entry *ret = dst;
839 if (rt) {
840 if (dst->obsolete > 0) {
841 ip_rt_put(rt);
842 ret = NULL;
843 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
844 rt->dst.expires) {
845 ip_rt_put(rt);
846 ret = NULL;
849 return ret;
853 * Algorithm:
854 * 1. The first ip_rt_redirect_number redirects are sent
855 * with exponential backoff, then we stop sending them at all,
856 * assuming that the host ignores our redirects.
857 * 2. If we did not see packets requiring redirects
858 * during ip_rt_redirect_silence, we assume that the host
859 * forgot the redirected route and start sending redirects again.
861 * This algorithm is much cheaper and more intelligent than dumb load limiting
862 * in icmp.c.
864 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
865 * and "frag. need" (breaks PMTU discovery) in icmp.c.
868 void ip_rt_send_redirect(struct sk_buff *skb)
870 struct rtable *rt = skb_rtable(skb);
871 struct in_device *in_dev;
872 struct inet_peer *peer;
873 struct net *net;
874 int log_martians;
875 int vif;
877 rcu_read_lock();
878 in_dev = __in_dev_get_rcu(rt->dst.dev);
879 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
880 rcu_read_unlock();
881 return;
883 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
884 vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
885 rcu_read_unlock();
887 net = dev_net(rt->dst.dev);
888 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
889 if (!peer) {
890 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
891 rt_nexthop(rt, ip_hdr(skb)->daddr));
892 return;
895 /* No redirected packets during ip_rt_redirect_silence;
896 * reset the algorithm.
898 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
899 peer->rate_tokens = 0;
900 peer->n_redirects = 0;
903 /* Too many ignored redirects; do not send anything,
904 * just set peer->rate_last to the last seen redirected packet.
906 if (peer->n_redirects >= ip_rt_redirect_number) {
907 peer->rate_last = jiffies;
908 goto out_put_peer;
911 /* Check for load limit; set rate_last to the latest sent
912 * redirect.
914 if (peer->rate_tokens == 0 ||
915 time_after(jiffies,
916 (peer->rate_last +
917 (ip_rt_redirect_load << peer->rate_tokens)))) {
918 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
920 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
921 peer->rate_last = jiffies;
922 ++peer->rate_tokens;
923 ++peer->n_redirects;
924 #ifdef CONFIG_IP_ROUTE_VERBOSE
925 if (log_martians &&
926 peer->rate_tokens == ip_rt_redirect_number)
927 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
928 &ip_hdr(skb)->saddr, inet_iif(skb),
929 &ip_hdr(skb)->daddr, &gw);
930 #endif
932 out_put_peer:
933 inet_putpeer(peer);
936 static int ip_error(struct sk_buff *skb)
938 struct rtable *rt = skb_rtable(skb);
939 struct net_device *dev = skb->dev;
940 struct in_device *in_dev;
941 struct inet_peer *peer;
942 unsigned long now;
943 struct net *net;
944 bool send;
945 int code;
947 if (netif_is_l3_master(skb->dev)) {
948 dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
949 if (!dev)
950 goto out;
953 in_dev = __in_dev_get_rcu(dev);
955 /* IP on this device is disabled. */
956 if (!in_dev)
957 goto out;
959 net = dev_net(rt->dst.dev);
960 if (!IN_DEV_FORWARD(in_dev)) {
961 switch (rt->dst.error) {
962 case EHOSTUNREACH:
963 __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
964 break;
966 case ENETUNREACH:
967 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
968 break;
970 goto out;
973 switch (rt->dst.error) {
974 case EINVAL:
975 default:
976 goto out;
977 case EHOSTUNREACH:
978 code = ICMP_HOST_UNREACH;
979 break;
980 case ENETUNREACH:
981 code = ICMP_NET_UNREACH;
982 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
983 break;
984 case EACCES:
985 code = ICMP_PKT_FILTERED;
986 break;
989 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
990 l3mdev_master_ifindex(skb->dev), 1);
992 send = true;
993 if (peer) {
994 now = jiffies;
995 peer->rate_tokens += now - peer->rate_last;
996 if (peer->rate_tokens > ip_rt_error_burst)
997 peer->rate_tokens = ip_rt_error_burst;
998 peer->rate_last = now;
999 if (peer->rate_tokens >= ip_rt_error_cost)
1000 peer->rate_tokens -= ip_rt_error_cost;
1001 else
1002 send = false;
1003 inet_putpeer(peer);
1005 if (send)
1006 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1008 out: kfree_skb(skb);
1009 return 0;
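/* ip_error() handles packets this host cannot forward: it bumps the relevant
 * SNMP counters and, subject to the inet_peer token bucket built from
 * ip_rt_error_cost/ip_rt_error_burst, answers with an ICMP destination
 * unreachable of the appropriate code before freeing the skb.
 */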
1012 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1014 struct dst_entry *dst = &rt->dst;
1015 u32 old_mtu = ipv4_mtu(dst);
1016 struct fib_result res;
1017 bool lock = false;
1019 if (ip_mtu_locked(dst))
1020 return;
1022 if (old_mtu < mtu)
1023 return;
1025 if (mtu < ip_rt_min_pmtu) {
1026 lock = true;
1027 mtu = min(old_mtu, ip_rt_min_pmtu);
1030 if (rt->rt_pmtu == mtu && !lock &&
1031 time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1032 return;
1034 rcu_read_lock();
1035 if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
1036 struct fib_nh_common *nhc = FIB_RES_NHC(res);
1038 update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
1039 jiffies + ip_rt_mtu_expires);
1041 rcu_read_unlock();
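/* __ip_rt_update_pmtu() ignores MTU increases and locked routes, clamps the
 * new value to ip_rt_min_pmtu (locking it when the advertised MTU was even
 * smaller) and records it as a nexthop exception that expires after
 * ip_rt_mtu_expires.
 */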
1044 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1045 struct sk_buff *skb, u32 mtu)
1047 struct rtable *rt = (struct rtable *) dst;
1048 struct flowi4 fl4;
1050 ip_rt_build_flow_key(&fl4, sk, skb);
1051 __ip_rt_update_pmtu(rt, &fl4, mtu);
1054 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1055 int oif, u8 protocol)
1057 const struct iphdr *iph = (const struct iphdr *) skb->data;
1058 struct flowi4 fl4;
1059 struct rtable *rt;
1060 u32 mark = IP4_REPLY_MARK(net, skb->mark);
1062 __build_flow_key(net, &fl4, NULL, iph, oif,
1063 RT_TOS(iph->tos), protocol, mark, 0);
1064 rt = __ip_route_output_key(net, &fl4);
1065 if (!IS_ERR(rt)) {
1066 __ip_rt_update_pmtu(rt, &fl4, mtu);
1067 ip_rt_put(rt);
1070 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1072 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1074 const struct iphdr *iph = (const struct iphdr *) skb->data;
1075 struct flowi4 fl4;
1076 struct rtable *rt;
1078 __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1080 if (!fl4.flowi4_mark)
1081 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1083 rt = __ip_route_output_key(sock_net(sk), &fl4);
1084 if (!IS_ERR(rt)) {
1085 __ip_rt_update_pmtu(rt, &fl4, mtu);
1086 ip_rt_put(rt);
1090 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1092 const struct iphdr *iph = (const struct iphdr *) skb->data;
1093 struct flowi4 fl4;
1094 struct rtable *rt;
1095 struct dst_entry *odst = NULL;
1096 bool new = false;
1097 struct net *net = sock_net(sk);
1099 bh_lock_sock(sk);
1101 if (!ip_sk_accept_pmtu(sk))
1102 goto out;
1104 odst = sk_dst_get(sk);
1106 if (sock_owned_by_user(sk) || !odst) {
1107 __ipv4_sk_update_pmtu(skb, sk, mtu);
1108 goto out;
1111 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1113 rt = (struct rtable *)odst;
1114 if (odst->obsolete && !odst->ops->check(odst, 0)) {
1115 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1116 if (IS_ERR(rt))
1117 goto out;
1119 new = true;
1122 __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);
1124 if (!dst_check(&rt->dst, 0)) {
1125 if (new)
1126 dst_release(&rt->dst);
1128 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1129 if (IS_ERR(rt))
1130 goto out;
1132 new = true;
1135 if (new)
1136 sk_dst_set(sk, &rt->dst);
1138 out:
1139 bh_unlock_sock(sk);
1140 dst_release(odst);
1142 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1144 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1145 int oif, u8 protocol)
1147 const struct iphdr *iph = (const struct iphdr *) skb->data;
1148 struct flowi4 fl4;
1149 struct rtable *rt;
1151 __build_flow_key(net, &fl4, NULL, iph, oif,
1152 RT_TOS(iph->tos), protocol, 0, 0);
1153 rt = __ip_route_output_key(net, &fl4);
1154 if (!IS_ERR(rt)) {
1155 __ip_do_redirect(rt, skb, &fl4, false);
1156 ip_rt_put(rt);
1159 EXPORT_SYMBOL_GPL(ipv4_redirect);
1161 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1163 const struct iphdr *iph = (const struct iphdr *) skb->data;
1164 struct flowi4 fl4;
1165 struct rtable *rt;
1166 struct net *net = sock_net(sk);
1168 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1169 rt = __ip_route_output_key(net, &fl4);
1170 if (!IS_ERR(rt)) {
1171 __ip_do_redirect(rt, skb, &fl4, false);
1172 ip_rt_put(rt);
1175 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1177 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1179 struct rtable *rt = (struct rtable *) dst;
1181 /* All IPV4 dsts are created with ->obsolete set to the value
1182 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1183 * into this function always.
1185 * When a PMTU/redirect information update invalidates a route,
1186 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1187 * DST_OBSOLETE_DEAD.
1189 if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1190 return NULL;
1191 return dst;
1194 static void ipv4_send_dest_unreach(struct sk_buff *skb)
1196 struct ip_options opt;
1197 int res;
1199 /* Recompile ip options since IPCB may not be valid anymore.
1200 * Also check we have a reasonable ipv4 header.
1202 if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1203 ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1204 return;
1206 memset(&opt, 0, sizeof(opt));
1207 if (ip_hdr(skb)->ihl > 5) {
1208 if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1209 return;
1210 opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1212 rcu_read_lock();
1213 res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
1214 rcu_read_unlock();
1216 if (res)
1217 return;
1219 __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1222 static void ipv4_link_failure(struct sk_buff *skb)
1224 struct rtable *rt;
1226 ipv4_send_dest_unreach(skb);
1228 rt = skb_rtable(skb);
1229 if (rt)
1230 dst_set_expires(&rt->dst, 0);
1233 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1235 pr_debug("%s: %pI4 -> %pI4, %s\n",
1236 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1237 skb->dev ? skb->dev->name : "?");
1238 kfree_skb(skb);
1239 WARN_ON(1);
1240 return 0;
1244 We do not cache the source address of the outgoing interface,
1245 because it is used only by IP RR, TS and SRR options,
1246 so it is out of the fast path.
1248 BTW remember: "addr" is allowed to be unaligned
1249 in IP options!
1252 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1254 __be32 src;
1256 if (rt_is_output_route(rt))
1257 src = ip_hdr(skb)->saddr;
1258 else {
1259 struct fib_result res;
1260 struct iphdr *iph = ip_hdr(skb);
1261 struct flowi4 fl4 = {
1262 .daddr = iph->daddr,
1263 .saddr = iph->saddr,
1264 .flowi4_tos = RT_TOS(iph->tos),
1265 .flowi4_oif = rt->dst.dev->ifindex,
1266 .flowi4_iif = skb->dev->ifindex,
1267 .flowi4_mark = skb->mark,
1270 rcu_read_lock();
1271 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1272 src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
1273 else
1274 src = inet_select_addr(rt->dst.dev,
1275 rt_nexthop(rt, iph->daddr),
1276 RT_SCOPE_UNIVERSE);
1277 rcu_read_unlock();
1279 memcpy(addr, &src, 4);
1282 #ifdef CONFIG_IP_ROUTE_CLASSID
1283 static void set_class_tag(struct rtable *rt, u32 tag)
1285 if (!(rt->dst.tclassid & 0xFFFF))
1286 rt->dst.tclassid |= tag & 0xFFFF;
1287 if (!(rt->dst.tclassid & 0xFFFF0000))
1288 rt->dst.tclassid |= tag & 0xFFFF0000;
1290 #endif
1292 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1294 unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1295 unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1296 ip_rt_min_advmss);
1298 return min(advmss, IPV4_MAX_PMTU - header_size);
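/* ipv4_mtu(): a still-valid learned PMTU wins, then the RTAX_MTU metric;
 * otherwise fall back to the device MTU, capped at 576 for locked routes via
 * a gateway and at IP_MAX_MTU, minus any lightweight-tunnel headroom.
 */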
1301 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1303 const struct rtable *rt = (const struct rtable *) dst;
1304 unsigned int mtu = rt->rt_pmtu;
1306 if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1307 mtu = dst_metric_raw(dst, RTAX_MTU);
1309 if (mtu)
1310 return mtu;
1312 mtu = READ_ONCE(dst->dev->mtu);
1314 if (unlikely(ip_mtu_locked(dst))) {
1315 if (rt->rt_gw_family && mtu > 576)
1316 mtu = 576;
1319 mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1321 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1324 static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
1326 struct fnhe_hash_bucket *hash;
1327 struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1328 u32 hval = fnhe_hashfun(daddr);
1330 spin_lock_bh(&fnhe_lock);
1332 hash = rcu_dereference_protected(nhc->nhc_exceptions,
1333 lockdep_is_held(&fnhe_lock));
1334 hash += hval;
1336 fnhe_p = &hash->chain;
1337 fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1338 while (fnhe) {
1339 if (fnhe->fnhe_daddr == daddr) {
1340 rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1341 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1342 /* set fnhe_daddr to 0 to ensure it won't bind with
1343 * new dsts in rt_bind_exception().
1345 fnhe->fnhe_daddr = 0;
1346 fnhe_flush_routes(fnhe);
1347 kfree_rcu(fnhe, rcu);
1348 break;
1350 fnhe_p = &fnhe->fnhe_next;
1351 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1352 lockdep_is_held(&fnhe_lock));
1355 spin_unlock_bh(&fnhe_lock);
1358 static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
1359 __be32 daddr)
1361 struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
1362 struct fib_nh_exception *fnhe;
1363 u32 hval;
1365 if (!hash)
1366 return NULL;
1368 hval = fnhe_hashfun(daddr);
1370 for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1371 fnhe = rcu_dereference(fnhe->fnhe_next)) {
1372 if (fnhe->fnhe_daddr == daddr) {
1373 if (fnhe->fnhe_expires &&
1374 time_after(jiffies, fnhe->fnhe_expires)) {
1375 ip_del_fnhe(nhc, daddr);
1376 break;
1378 return fnhe;
1381 return NULL;
1384 /* MTU selection:
1385 * 1. mtu on route is locked - use it
1386 * 2. mtu from nexthop exception
1387 * 3. mtu from egress device
1390 u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1392 struct fib_nh_common *nhc = res->nhc;
1393 struct net_device *dev = nhc->nhc_dev;
1394 struct fib_info *fi = res->fi;
1395 u32 mtu = 0;
1397 if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
1398 fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1399 mtu = fi->fib_mtu;
1401 if (likely(!mtu)) {
1402 struct fib_nh_exception *fnhe;
1404 fnhe = find_exception(nhc, daddr);
1405 if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1406 mtu = fnhe->fnhe_pmtu;
1409 if (likely(!mtu))
1410 mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1412 return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
1415 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1416 __be32 daddr, const bool do_cache)
1418 bool ret = false;
1420 spin_lock_bh(&fnhe_lock);
1422 if (daddr == fnhe->fnhe_daddr) {
1423 struct rtable __rcu **porig;
1424 struct rtable *orig;
1425 int genid = fnhe_genid(dev_net(rt->dst.dev));
1427 if (rt_is_input_route(rt))
1428 porig = &fnhe->fnhe_rth_input;
1429 else
1430 porig = &fnhe->fnhe_rth_output;
1431 orig = rcu_dereference(*porig);
1433 if (fnhe->fnhe_genid != genid) {
1434 fnhe->fnhe_genid = genid;
1435 fnhe->fnhe_gw = 0;
1436 fnhe->fnhe_pmtu = 0;
1437 fnhe->fnhe_expires = 0;
1438 fnhe->fnhe_mtu_locked = false;
1439 fnhe_flush_routes(fnhe);
1440 orig = NULL;
1442 fill_route_from_fnhe(rt, fnhe);
1443 if (!rt->rt_gw4) {
1444 rt->rt_gw4 = daddr;
1445 rt->rt_gw_family = AF_INET;
1448 if (do_cache) {
1449 dst_hold(&rt->dst);
1450 rcu_assign_pointer(*porig, rt);
1451 if (orig) {
1452 dst_dev_put(&orig->dst);
1453 dst_release(&orig->dst);
1455 ret = true;
1458 fnhe->fnhe_stamp = jiffies;
1460 spin_unlock_bh(&fnhe_lock);
1462 return ret;
1465 static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
1467 struct rtable *orig, *prev, **p;
1468 bool ret = true;
1470 if (rt_is_input_route(rt)) {
1471 p = (struct rtable **)&nhc->nhc_rth_input;
1472 } else {
1473 p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
1475 orig = *p;
1477 /* hold dst before doing cmpxchg() to avoid race condition
1478 * on this dst
1480 dst_hold(&rt->dst);
1481 prev = cmpxchg(p, orig, rt);
1482 if (prev == orig) {
1483 if (orig) {
1484 dst_dev_put(&orig->dst);
1485 dst_release(&orig->dst);
1487 } else {
1488 dst_release(&rt->dst);
1489 ret = false;
1492 return ret;
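/* rt_cache_route() parks the dst in the nexthop: input routes in
 * nhc_rth_input, output routes in the per-CPU nhc_pcpu_rth_output slot.
 * cmpxchg() keeps this lockless; on success the previous entry is released,
 * on failure the extra hold taken on the new route is dropped again.
 */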
1495 struct uncached_list {
1496 spinlock_t lock;
1497 struct list_head head;
1500 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1502 void rt_add_uncached_list(struct rtable *rt)
1504 struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1506 rt->rt_uncached_list = ul;
1508 spin_lock_bh(&ul->lock);
1509 list_add_tail(&rt->rt_uncached, &ul->head);
1510 spin_unlock_bh(&ul->lock);
1513 void rt_del_uncached_list(struct rtable *rt)
1515 if (!list_empty(&rt->rt_uncached)) {
1516 struct uncached_list *ul = rt->rt_uncached_list;
1518 spin_lock_bh(&ul->lock);
1519 list_del(&rt->rt_uncached);
1520 spin_unlock_bh(&ul->lock);
1524 static void ipv4_dst_destroy(struct dst_entry *dst)
1526 struct rtable *rt = (struct rtable *)dst;
1528 ip_dst_metrics_put(dst);
1529 rt_del_uncached_list(rt);
1532 void rt_flush_dev(struct net_device *dev)
1534 struct net *net = dev_net(dev);
1535 struct rtable *rt;
1536 int cpu;
1538 for_each_possible_cpu(cpu) {
1539 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1541 spin_lock_bh(&ul->lock);
1542 list_for_each_entry(rt, &ul->head, rt_uncached) {
1543 if (rt->dst.dev != dev)
1544 continue;
1545 rt->dst.dev = net->loopback_dev;
1546 dev_hold(rt->dst.dev);
1547 dev_put(dev);
1549 spin_unlock_bh(&ul->lock);
1553 static bool rt_cache_valid(const struct rtable *rt)
1555 return rt &&
1556 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1557 !rt_is_expired(rt);
1560 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1561 const struct fib_result *res,
1562 struct fib_nh_exception *fnhe,
1563 struct fib_info *fi, u16 type, u32 itag,
1564 const bool do_cache)
1566 bool cached = false;
1568 if (fi) {
1569 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1571 if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1572 rt->rt_gw_family = nhc->nhc_gw_family;
1573 /* only INET and INET6 are supported */
1574 if (likely(nhc->nhc_gw_family == AF_INET))
1575 rt->rt_gw4 = nhc->nhc_gw.ipv4;
1576 else
1577 rt->rt_gw6 = nhc->nhc_gw.ipv6;
1580 ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1582 #ifdef CONFIG_IP_ROUTE_CLASSID
1584 struct fib_nh *nh;
1586 nh = container_of(nhc, struct fib_nh, nh_common);
1587 rt->dst.tclassid = nh->nh_tclassid;
1589 #endif
1590 rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1591 if (unlikely(fnhe))
1592 cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1593 else if (do_cache)
1594 cached = rt_cache_route(nhc, rt);
1595 if (unlikely(!cached)) {
1596 /* Routes we intend to cache in nexthop exception or
1597 * FIB nexthop have the DST_NOCACHE bit clear.
1598 * However, if we are unsuccessful at storing this
1599 * route into the cache we really need to set it.
1601 if (!rt->rt_gw4) {
1602 rt->rt_gw_family = AF_INET;
1603 rt->rt_gw4 = daddr;
1605 rt_add_uncached_list(rt);
1607 } else
1608 rt_add_uncached_list(rt);
1610 #ifdef CONFIG_IP_ROUTE_CLASSID
1611 #ifdef CONFIG_IP_MULTIPLE_TABLES
1612 set_class_tag(rt, res->tclassid);
1613 #endif
1614 set_class_tag(rt, itag);
1615 #endif
1618 struct rtable *rt_dst_alloc(struct net_device *dev,
1619 unsigned int flags, u16 type,
1620 bool nopolicy, bool noxfrm, bool will_cache)
1622 struct rtable *rt;
1624 rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1625 (will_cache ? 0 : DST_HOST) |
1626 (nopolicy ? DST_NOPOLICY : 0) |
1627 (noxfrm ? DST_NOXFRM : 0));
1629 if (rt) {
1630 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1631 rt->rt_flags = flags;
1632 rt->rt_type = type;
1633 rt->rt_is_input = 0;
1634 rt->rt_iif = 0;
1635 rt->rt_pmtu = 0;
1636 rt->rt_mtu_locked = 0;
1637 rt->rt_gw_family = 0;
1638 rt->rt_gw4 = 0;
1639 INIT_LIST_HEAD(&rt->rt_uncached);
1641 rt->dst.output = ip_output;
1642 if (flags & RTCF_LOCAL)
1643 rt->dst.input = ip_local_deliver;
1646 return rt;
1648 EXPORT_SYMBOL(rt_dst_alloc);
1650 struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1652 struct rtable *new_rt;
1654 new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1655 rt->dst.flags);
1657 if (new_rt) {
1658 new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1659 new_rt->rt_flags = rt->rt_flags;
1660 new_rt->rt_type = rt->rt_type;
1661 new_rt->rt_is_input = rt->rt_is_input;
1662 new_rt->rt_iif = rt->rt_iif;
1663 new_rt->rt_pmtu = rt->rt_pmtu;
1664 new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1665 new_rt->rt_gw_family = rt->rt_gw_family;
1666 if (rt->rt_gw_family == AF_INET)
1667 new_rt->rt_gw4 = rt->rt_gw4;
1668 else if (rt->rt_gw_family == AF_INET6)
1669 new_rt->rt_gw6 = rt->rt_gw6;
1670 INIT_LIST_HEAD(&new_rt->rt_uncached);
1672 new_rt->dst.flags |= DST_HOST;
1673 new_rt->dst.input = rt->dst.input;
1674 new_rt->dst.output = rt->dst.output;
1675 new_rt->dst.error = rt->dst.error;
1676 new_rt->dst.lastuse = jiffies;
1677 new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1679 return new_rt;
1681 EXPORT_SYMBOL(rt_dst_clone);
1683 /* called in rcu_read_lock() section */
1684 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1685 u8 tos, struct net_device *dev,
1686 struct in_device *in_dev, u32 *itag)
1688 int err;
1690 /* Primary sanity checks. */
1691 if (!in_dev)
1692 return -EINVAL;
1694 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1695 skb->protocol != htons(ETH_P_IP))
1696 return -EINVAL;
1698 if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1699 return -EINVAL;
1701 if (ipv4_is_zeronet(saddr)) {
1702 if (!ipv4_is_local_multicast(daddr) &&
1703 ip_hdr(skb)->protocol != IPPROTO_IGMP)
1704 return -EINVAL;
1705 } else {
1706 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1707 in_dev, itag);
1708 if (err < 0)
1709 return err;
1711 return 0;
1714 /* called in rcu_read_lock() section */
1715 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1716 u8 tos, struct net_device *dev, int our)
1718 struct in_device *in_dev = __in_dev_get_rcu(dev);
1719 unsigned int flags = RTCF_MULTICAST;
1720 struct rtable *rth;
1721 u32 itag = 0;
1722 int err;
1724 err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1725 if (err)
1726 return err;
1728 if (our)
1729 flags |= RTCF_LOCAL;
1731 rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1732 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1733 if (!rth)
1734 return -ENOBUFS;
1736 #ifdef CONFIG_IP_ROUTE_CLASSID
1737 rth->dst.tclassid = itag;
1738 #endif
1739 rth->dst.output = ip_rt_bug;
1740 rth->rt_is_input= 1;
1742 #ifdef CONFIG_IP_MROUTE
1743 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1744 rth->dst.input = ip_mr_input;
1745 #endif
1746 RT_CACHE_STAT_INC(in_slow_mc);
1748 skb_dst_set(skb, &rth->dst);
1749 return 0;
1753 static void ip_handle_martian_source(struct net_device *dev,
1754 struct in_device *in_dev,
1755 struct sk_buff *skb,
1756 __be32 daddr,
1757 __be32 saddr)
1759 RT_CACHE_STAT_INC(in_martian_src);
1760 #ifdef CONFIG_IP_ROUTE_VERBOSE
1761 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1763 * RFC1812 recommendation: if the source is martian,
1764 * the only hint is the MAC header.
1766 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1767 &daddr, &saddr, dev->name);
1768 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1769 print_hex_dump(KERN_WARNING, "ll header: ",
1770 DUMP_PREFIX_OFFSET, 16, 1,
1771 skb_mac_header(skb),
1772 dev->hard_header_len, false);
1775 #endif
1778 /* called in rcu_read_lock() section */
1779 static int __mkroute_input(struct sk_buff *skb,
1780 const struct fib_result *res,
1781 struct in_device *in_dev,
1782 __be32 daddr, __be32 saddr, u32 tos)
1784 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1785 struct net_device *dev = nhc->nhc_dev;
1786 struct fib_nh_exception *fnhe;
1787 struct rtable *rth;
1788 int err;
1789 struct in_device *out_dev;
1790 bool do_cache;
1791 u32 itag = 0;
1793 /* get a working reference to the output device */
1794 out_dev = __in_dev_get_rcu(dev);
1795 if (!out_dev) {
1796 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1797 return -EINVAL;
1800 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1801 in_dev->dev, in_dev, &itag);
1802 if (err < 0) {
1803 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1804 saddr);
1806 goto cleanup;
1809 do_cache = res->fi && !itag;
1810 if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1811 skb->protocol == htons(ETH_P_IP)) {
1812 __be32 gw;
1814 gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1815 if (IN_DEV_SHARED_MEDIA(out_dev) ||
1816 inet_addr_onlink(out_dev, saddr, gw))
1817 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1820 if (skb->protocol != htons(ETH_P_IP)) {
1821 /* Not IP (i.e. ARP). Do not create a route if it is
1822 * invalid for proxy arp. DNAT routes are always valid.
1824 * The proxy arp feature has been extended to allow ARP
1825 * replies back to the same interface, to support
1826 * Private VLAN switch technologies. See arp.c.
1828 if (out_dev == in_dev &&
1829 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1830 err = -EINVAL;
1831 goto cleanup;
1835 fnhe = find_exception(nhc, daddr);
1836 if (do_cache) {
1837 if (fnhe)
1838 rth = rcu_dereference(fnhe->fnhe_rth_input);
1839 else
1840 rth = rcu_dereference(nhc->nhc_rth_input);
1841 if (rt_cache_valid(rth)) {
1842 skb_dst_set_noref(skb, &rth->dst);
1843 goto out;
1847 rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1848 IN_DEV_CONF_GET(in_dev, NOPOLICY),
1849 IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1850 if (!rth) {
1851 err = -ENOBUFS;
1852 goto cleanup;
1855 rth->rt_is_input = 1;
1856 RT_CACHE_STAT_INC(in_slow_tot);
1858 rth->dst.input = ip_forward;
1860 rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1861 do_cache);
1862 lwtunnel_set_redirect(&rth->dst);
1863 skb_dst_set(skb, &rth->dst);
1864 out:
1865 err = 0;
1866 cleanup:
1867 return err;
1870 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1871 /* To make ICMP packets follow the right flow, the multipath hash is
1872 * calculated from the inner IP addresses.
1874 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1875 struct flow_keys *hash_keys)
1877 const struct iphdr *outer_iph = ip_hdr(skb);
1878 const struct iphdr *key_iph = outer_iph;
1879 const struct iphdr *inner_iph;
1880 const struct icmphdr *icmph;
1881 struct iphdr _inner_iph;
1882 struct icmphdr _icmph;
1884 if (likely(outer_iph->protocol != IPPROTO_ICMP))
1885 goto out;
1887 if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1888 goto out;
1890 icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1891 &_icmph);
1892 if (!icmph)
1893 goto out;
1895 if (icmph->type != ICMP_DEST_UNREACH &&
1896 icmph->type != ICMP_REDIRECT &&
1897 icmph->type != ICMP_TIME_EXCEEDED &&
1898 icmph->type != ICMP_PARAMETERPROB)
1899 goto out;
1901 inner_iph = skb_header_pointer(skb,
1902 outer_iph->ihl * 4 + sizeof(_icmph),
1903 sizeof(_inner_iph), &_inner_iph);
1904 if (!inner_iph)
1905 goto out;
1907 key_iph = inner_iph;
1908 out:
1909 hash_keys->addrs.v4addrs.src = key_iph->saddr;
1910 hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1913 /* if skb is set it will be used and fl4 can be NULL */
1914 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1915 const struct sk_buff *skb, struct flow_keys *flkeys)
1917 u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1918 struct flow_keys hash_keys;
1919 u32 mhash;
1921 switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1922 case 0:
1923 memset(&hash_keys, 0, sizeof(hash_keys));
1924 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1925 if (skb) {
1926 ip_multipath_l3_keys(skb, &hash_keys);
1927 } else {
1928 hash_keys.addrs.v4addrs.src = fl4->saddr;
1929 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1931 break;
1932 case 1:
1933 /* skb is currently provided only when forwarding */
1934 if (skb) {
1935 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1936 struct flow_keys keys;
1938 /* short-circuit if we already have L4 hash present */
1939 if (skb->l4_hash)
1940 return skb_get_hash_raw(skb) >> 1;
1942 memset(&hash_keys, 0, sizeof(hash_keys));
1944 if (!flkeys) {
1945 skb_flow_dissect_flow_keys(skb, &keys, flag);
1946 flkeys = &keys;
1949 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1950 hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1951 hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1952 hash_keys.ports.src = flkeys->ports.src;
1953 hash_keys.ports.dst = flkeys->ports.dst;
1954 hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1955 } else {
1956 memset(&hash_keys, 0, sizeof(hash_keys));
1957 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1958 hash_keys.addrs.v4addrs.src = fl4->saddr;
1959 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1960 hash_keys.ports.src = fl4->fl4_sport;
1961 hash_keys.ports.dst = fl4->fl4_dport;
1962 hash_keys.basic.ip_proto = fl4->flowi4_proto;
1964 break;
1966 mhash = flow_hash_from_keys(&hash_keys);
1968 if (multipath_hash)
1969 mhash = jhash_2words(mhash, multipath_hash, 0);
1971 return mhash >> 1;
1973 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
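/* fib_multipath_hash(): policy 0 hashes on L3 addresses (using the inner IP
 * header of ICMP errors so they follow the flow they refer to), policy 1 on
 * the L4 five-tuple; a caller-supplied flowi4_multipath_hash, when set, is
 * folded into the result.
 */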
1975 static int ip_mkroute_input(struct sk_buff *skb,
1976 struct fib_result *res,
1977 struct in_device *in_dev,
1978 __be32 daddr, __be32 saddr, u32 tos,
1979 struct flow_keys *hkeys)
1981 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1982 if (res->fi && res->fi->fib_nhs > 1) {
1983 int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
1985 fib_select_multipath(res, h);
1987 #endif
1989 /* create a routing cache entry */
1990 return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1994 * NOTE. We drop all packets that have local source
1995 * addresses, because every properly looped-back packet
1996 * must have the correct destination already attached by the output routine.
1998 * Such an approach solves two big problems:
1999 * 1. Non-simplex devices are handled properly.
2000 * 2. IP spoofing attempts are filtered with a 100% guarantee.
2001 * called with rcu_read_lock()
2004 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2005 u8 tos, struct net_device *dev,
2006 struct fib_result *res)
2008 struct in_device *in_dev = __in_dev_get_rcu(dev);
2009 struct flow_keys *flkeys = NULL, _flkeys;
2010 struct net *net = dev_net(dev);
2011 struct ip_tunnel_info *tun_info;
2012 int err = -EINVAL;
2013 unsigned int flags = 0;
2014 u32 itag = 0;
2015 struct rtable *rth;
2016 struct flowi4 fl4;
2017 bool do_cache = true;
2019 /* IP on this device is disabled. */
2021 if (!in_dev)
2022 goto out;
2024 /* Check for the most weird martians, which cannot be detected
2025 by fib_lookup.
2028 tun_info = skb_tunnel_info(skb);
2029 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2030 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2031 else
2032 fl4.flowi4_tun_key.tun_id = 0;
2033 skb_dst_drop(skb);
2035 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2036 goto martian_source;
2038 res->fi = NULL;
2039 res->table = NULL;
2040 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2041 goto brd_input;
2043 /* Accept zero addresses only for the limited broadcast;
2044 * I do not even know whether to fix it or not. Waiting for complaints :-)
2046 if (ipv4_is_zeronet(saddr))
2047 goto martian_source;
2049 if (ipv4_is_zeronet(daddr))
2050 goto martian_destination;
2052 /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
2053 * calling it at most once when daddr and/or saddr are loopback addresses.
2055 if (ipv4_is_loopback(daddr)) {
2056 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2057 goto martian_destination;
2058 } else if (ipv4_is_loopback(saddr)) {
2059 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2060 goto martian_source;
2064 * Now we are ready to route the packet.
2066 fl4.flowi4_oif = 0;
2067 fl4.flowi4_iif = dev->ifindex;
2068 fl4.flowi4_mark = skb->mark;
2069 fl4.flowi4_tos = tos;
2070 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2071 fl4.flowi4_flags = 0;
2072 fl4.daddr = daddr;
2073 fl4.saddr = saddr;
2074 fl4.flowi4_uid = sock_net_uid(net, NULL);
2076 if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2077 flkeys = &_flkeys;
2078 } else {
2079 fl4.flowi4_proto = 0;
2080 fl4.fl4_sport = 0;
2081 fl4.fl4_dport = 0;
2084 err = fib_lookup(net, &fl4, res, 0);
2085 if (err != 0) {
2086 if (!IN_DEV_FORWARD(in_dev))
2087 err = -EHOSTUNREACH;
2088 goto no_route;
2091 if (res->type == RTN_BROADCAST) {
2092 if (IN_DEV_BFORWARD(in_dev))
2093 goto make_route;
2094 /* do not cache if bc_forwarding is enabled */
2095 if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2096 do_cache = false;
2097 goto brd_input;
2100 if (res->type == RTN_LOCAL) {
2101 err = fib_validate_source(skb, saddr, daddr, tos,
2102 0, dev, in_dev, &itag);
2103 if (err < 0)
2104 goto martian_source;
2105 goto local_input;
2108 if (!IN_DEV_FORWARD(in_dev)) {
2109 err = -EHOSTUNREACH;
2110 goto no_route;
2112 if (res->type != RTN_UNICAST)
2113 goto martian_destination;
2115 make_route:
2116 err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2117 out: return err;
2119 brd_input:
2120 if (skb->protocol != htons(ETH_P_IP))
2121 goto e_inval;
2123 if (!ipv4_is_zeronet(saddr)) {
2124 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2125 in_dev, &itag);
2126 if (err < 0)
2127 goto martian_source;
2129 flags |= RTCF_BROADCAST;
2130 res->type = RTN_BROADCAST;
2131 RT_CACHE_STAT_INC(in_brd);
2133 local_input:
2134 do_cache &= res->fi && !itag;
2135 if (do_cache) {
2136 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2138 rth = rcu_dereference(nhc->nhc_rth_input);
2139 if (rt_cache_valid(rth)) {
2140 skb_dst_set_noref(skb, &rth->dst);
2141 err = 0;
2142 goto out;
2146 rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2147 flags | RTCF_LOCAL, res->type,
2148 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2149 if (!rth)
2150 goto e_nobufs;
2152 rth->dst.output = ip_rt_bug;
2153 #ifdef CONFIG_IP_ROUTE_CLASSID
2154 rth->dst.tclassid = itag;
2155 #endif
2156 rth->rt_is_input = 1;
2158 RT_CACHE_STAT_INC(in_slow_tot);
2159 if (res->type == RTN_UNREACHABLE) {
2160 rth->dst.input = ip_error;
2161 rth->dst.error = -err;
2162 rth->rt_flags &= ~RTCF_LOCAL;
2165 if (do_cache) {
2166 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2168 rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2169 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2170 WARN_ON(rth->dst.input == lwtunnel_input);
2171 rth->dst.lwtstate->orig_input = rth->dst.input;
2172 rth->dst.input = lwtunnel_input;
2175 if (unlikely(!rt_cache_route(nhc, rth)))
2176 rt_add_uncached_list(rth);
2178 skb_dst_set(skb, &rth->dst);
2179 err = 0;
2180 goto out;
2182 no_route:
2183 RT_CACHE_STAT_INC(in_no_route);
2184 res->type = RTN_UNREACHABLE;
2185 res->fi = NULL;
2186 res->table = NULL;
2187 goto local_input;
2190 * Do not cache martian addresses: they should be logged (RFC1812)
2192 martian_destination:
2193 RT_CACHE_STAT_INC(in_martian_dst);
2194 #ifdef CONFIG_IP_ROUTE_VERBOSE
2195 if (IN_DEV_LOG_MARTIANS(in_dev))
2196 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2197 &daddr, &saddr, dev->name);
2198 #endif
2200 e_inval:
2201 err = -EINVAL;
2202 goto out;
2204 e_nobufs:
2205 err = -ENOBUFS;
2206 goto out;
2208 martian_source:
2209 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2210 goto out;
2213 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2214 u8 tos, struct net_device *dev)
2216 struct fib_result res;
2217 int err;
2219 tos &= IPTOS_RT_MASK;
2220 rcu_read_lock();
2221 err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2222 rcu_read_unlock();
2224 return err;
2226 EXPORT_SYMBOL(ip_route_input_noref);
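/* A minimal usage sketch (illustrative, not taken from this file): the
 * receive path typically resolves an input route straight from the IP
 * header, roughly
 *
 *	err = ip_route_input_noref(skb, ip_hdr(skb)->daddr,
 *				   ip_hdr(skb)->saddr, ip_hdr(skb)->tos,
 *				   skb->dev);
 *	if (unlikely(err))
 *		goto drop;
 */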
2228 /* called with rcu_read_lock held */
2229 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2230 u8 tos, struct net_device *dev, struct fib_result *res)
2232 /* Multicast recognition logic is moved from the route cache to here.
2233 The problem was that too many Ethernet cards have broken/missing
2234 hardware multicast filters :-( As a result, a host on a multicast
2235 network acquires a lot of useless route cache entries, e.g. for
2236 SDR messages from all over the world. Now we try to get rid of them.
2237 Really, provided the software IP multicast filter is organized
2238 reasonably (at least, hashed), it does not result in a slowdown
2239 compared with route cache reject entries.
2240 Note that multicast routers are not affected, because a
2241 route cache entry is created eventually.
2243 if (ipv4_is_multicast(daddr)) {
2244 struct in_device *in_dev = __in_dev_get_rcu(dev);
2245 int our = 0;
2246 int err = -EINVAL;
2248 if (!in_dev)
2249 return err;
2250 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2251 ip_hdr(skb)->protocol);
2253 /* check l3 master if no match yet */
2254 if (!our && netif_is_l3_slave(dev)) {
2255 struct in_device *l3_in_dev;
2257 l3_in_dev = __in_dev_get_rcu(skb->dev);
2258 if (l3_in_dev)
2259 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2260 ip_hdr(skb)->protocol);
2263 if (our
2264 #ifdef CONFIG_IP_MROUTE
2265 ||
2266 (!ipv4_is_local_multicast(daddr) &&
2267 IN_DEV_MFORWARD(in_dev))
2268 #endif
2269 ) {
2270 err = ip_route_input_mc(skb, daddr, saddr,
2271 tos, dev, our);
2272 }
2273 return err;
2276 return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2279 /* called with rcu_read_lock() */
2280 static struct rtable *__mkroute_output(const struct fib_result *res,
2281 const struct flowi4 *fl4, int orig_oif,
2282 struct net_device *dev_out,
2283 unsigned int flags)
2285 struct fib_info *fi = res->fi;
2286 struct fib_nh_exception *fnhe;
2287 struct in_device *in_dev;
2288 u16 type = res->type;
2289 struct rtable *rth;
2290 bool do_cache;
2292 in_dev = __in_dev_get_rcu(dev_out);
2293 if (!in_dev)
2294 return ERR_PTR(-EINVAL);
2296 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2297 if (ipv4_is_loopback(fl4->saddr) &&
2298 !(dev_out->flags & IFF_LOOPBACK) &&
2299 !netif_is_l3_master(dev_out))
2300 return ERR_PTR(-EINVAL);
2302 if (ipv4_is_lbcast(fl4->daddr))
2303 type = RTN_BROADCAST;
2304 else if (ipv4_is_multicast(fl4->daddr))
2305 type = RTN_MULTICAST;
2306 else if (ipv4_is_zeronet(fl4->daddr))
2307 return ERR_PTR(-EINVAL);
2309 if (dev_out->flags & IFF_LOOPBACK)
2310 flags |= RTCF_LOCAL;
2312 do_cache = true;
2313 if (type == RTN_BROADCAST) {
2314 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2315 fi = NULL;
2316 } else if (type == RTN_MULTICAST) {
2317 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2318 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2319 fl4->flowi4_proto))
2320 flags &= ~RTCF_LOCAL;
2321 else
2322 do_cache = false;
2323 /* If a multicast route does not exist, use
2324 * the default one, but do not gateway in this case.
2325 * Yes, it is a hack.
2327 if (fi && res->prefixlen < 4)
2328 fi = NULL;
2329 } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2330 (orig_oif != dev_out->ifindex)) {
2331 /* For local routes that require a particular output interface
2332 * we do not want to cache the result. Caching the result
2333 * causes incorrect behaviour when there are multiple source
2334 * addresses on the interface, the end result being that if the
2335 * intended recipient is waiting on that interface for the
2336 * packet, he won't receive it because it will be delivered on
2337 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2338 * be set to the loopback interface as well.
2340 do_cache = false;
2343 fnhe = NULL;
2344 do_cache &= fi != NULL;
2345 if (fi) {
2346 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2347 struct rtable __rcu **prth;
2349 fnhe = find_exception(nhc, fl4->daddr);
2350 if (!do_cache)
2351 goto add;
2352 if (fnhe) {
2353 prth = &fnhe->fnhe_rth_output;
2354 } else {
2355 if (unlikely(fl4->flowi4_flags &
2356 FLOWI_FLAG_KNOWN_NH &&
2357 !(nhc->nhc_gw_family &&
2358 nhc->nhc_scope == RT_SCOPE_LINK))) {
2359 do_cache = false;
2360 goto add;
2362 prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2364 rth = rcu_dereference(*prth);
2365 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2366 return rth;
2369 add:
2370 rth = rt_dst_alloc(dev_out, flags, type,
2371 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2372 IN_DEV_CONF_GET(in_dev, NOXFRM),
2373 do_cache);
2374 if (!rth)
2375 return ERR_PTR(-ENOBUFS);
2377 rth->rt_iif = orig_oif;
2379 RT_CACHE_STAT_INC(out_slow_tot);
2381 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2382 if (flags & RTCF_LOCAL &&
2383 !(dev_out->flags & IFF_LOOPBACK)) {
2384 rth->dst.output = ip_mc_output;
2385 RT_CACHE_STAT_INC(out_slow_mc);
2387 #ifdef CONFIG_IP_MROUTE
2388 if (type == RTN_MULTICAST) {
2389 if (IN_DEV_MFORWARD(in_dev) &&
2390 !ipv4_is_local_multicast(fl4->daddr)) {
2391 rth->dst.input = ip_mr_input;
2392 rth->dst.output = ip_mc_output;
2395 #endif
2398 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2399 lwtunnel_set_redirect(&rth->dst);
2401 return rth;
2405 * Major route resolver routine.
2408 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2409 const struct sk_buff *skb)
2411 __u8 tos = RT_FL_TOS(fl4);
2412 struct fib_result res = {
2413 .type = RTN_UNSPEC,
2414 .fi = NULL,
2415 .table = NULL,
2416 .tclassid = 0,
2418 struct rtable *rth;
2420 fl4->flowi4_iif = LOOPBACK_IFINDEX;
2421 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2422 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2423 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2425 rcu_read_lock();
2426 rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2427 rcu_read_unlock();
2429 return rth;
2431 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
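/* RCU variant of ip_route_output_key_hash(): the caller must already hold
 * rcu_read_lock(), and the fib_result used for the lookup is handed back
 * through @res (see inet_rtm_getroute() below for one such caller).
 */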
2433 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2434 struct fib_result *res,
2435 const struct sk_buff *skb)
2437 struct net_device *dev_out = NULL;
2438 int orig_oif = fl4->flowi4_oif;
2439 unsigned int flags = 0;
2440 struct rtable *rth;
2441 int err = -ENETUNREACH;
2443 if (fl4->saddr) {
2444 rth = ERR_PTR(-EINVAL);
2445 if (ipv4_is_multicast(fl4->saddr) ||
2446 ipv4_is_lbcast(fl4->saddr) ||
2447 ipv4_is_zeronet(fl4->saddr))
2448 goto out;
2450 /* I removed the check for oif == dev_out->oif here.
2451 It was wrong for two reasons:
2452 1. ip_dev_find(net, saddr) can return the wrong iface, if saddr
2453 is assigned to multiple interfaces.
2454 2. Moreover, we are allowed to send packets with saddr
2455 of another iface. --ANK
2458 if (fl4->flowi4_oif == 0 &&
2459 (ipv4_is_multicast(fl4->daddr) ||
2460 ipv4_is_lbcast(fl4->daddr))) {
2461 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2462 dev_out = __ip_dev_find(net, fl4->saddr, false);
2463 if (!dev_out)
2464 goto out;
2466 /* Special hack: user can direct multicasts
2467 and limited broadcast via the required interface
2468 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2469 This hack is not just for fun, it allows
2470 vic, vat and friends to work.
2471 They bind a socket to loopback, set ttl to zero
2472 and expect that it will work.
2473 From the viewpoint of the routing cache they are broken,
2474 because we are not allowed to build a multicast path
2475 with a loopback source addr (look, the routing cache
2476 cannot know that ttl is zero, so the packet
2477 will not leave this host and the route is valid).
2478 Luckily, this hack is a good workaround.
2481 fl4->flowi4_oif = dev_out->ifindex;
2482 goto make_route;
2485 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2486 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2487 if (!__ip_dev_find(net, fl4->saddr, false))
2488 goto out;
2493 if (fl4->flowi4_oif) {
2494 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2495 rth = ERR_PTR(-ENODEV);
2496 if (!dev_out)
2497 goto out;
2499 /* RACE: Check return value of inet_select_addr instead. */
2500 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2501 rth = ERR_PTR(-ENETUNREACH);
2502 goto out;
2504 if (ipv4_is_local_multicast(fl4->daddr) ||
2505 ipv4_is_lbcast(fl4->daddr) ||
2506 fl4->flowi4_proto == IPPROTO_IGMP) {
2507 if (!fl4->saddr)
2508 fl4->saddr = inet_select_addr(dev_out, 0,
2509 RT_SCOPE_LINK);
2510 goto make_route;
2512 if (!fl4->saddr) {
2513 if (ipv4_is_multicast(fl4->daddr))
2514 fl4->saddr = inet_select_addr(dev_out, 0,
2515 fl4->flowi4_scope);
2516 else if (!fl4->daddr)
2517 fl4->saddr = inet_select_addr(dev_out, 0,
2518 RT_SCOPE_HOST);
2522 if (!fl4->daddr) {
2523 fl4->daddr = fl4->saddr;
2524 if (!fl4->daddr)
2525 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2526 dev_out = net->loopback_dev;
2527 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2528 res->type = RTN_LOCAL;
2529 flags |= RTCF_LOCAL;
2530 goto make_route;
2533 err = fib_lookup(net, fl4, res, 0);
2534 if (err) {
2535 res->fi = NULL;
2536 res->table = NULL;
2537 if (fl4->flowi4_oif &&
2538 (ipv4_is_multicast(fl4->daddr) ||
2539 !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2540 /* Apparently, the routing tables are wrong. Assume
2541 that the destination is on-link.
2543 WHY? DW.
2544 Because we are allowed to send to an iface
2545 even if it has NO routes and NO assigned
2546 addresses. When oif is specified, the routing
2547 tables are looked up with only one purpose:
2548 to catch whether the destination is gatewayed rather than
2549 direct. Moreover, if MSG_DONTROUTE is set,
2550 we send the packet, ignoring both routing tables
2551 and ifaddr state. --ANK
2554 We could do this even if oif is unknown,
2555 likely as IPv6 does, but we do not.
2558 if (fl4->saddr == 0)
2559 fl4->saddr = inet_select_addr(dev_out, 0,
2560 RT_SCOPE_LINK);
2561 res->type = RTN_UNICAST;
2562 goto make_route;
2564 rth = ERR_PTR(err);
2565 goto out;
2568 if (res->type == RTN_LOCAL) {
2569 if (!fl4->saddr) {
2570 if (res->fi->fib_prefsrc)
2571 fl4->saddr = res->fi->fib_prefsrc;
2572 else
2573 fl4->saddr = fl4->daddr;
2576 /* L3 master device is the loopback for that domain */
2577 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2578 net->loopback_dev;
2580 /* make sure orig_oif points to fib result device even
2581 * though packet rx/tx happens over loopback or l3mdev
2583 orig_oif = FIB_RES_OIF(*res);
2585 fl4->flowi4_oif = dev_out->ifindex;
2586 flags |= RTCF_LOCAL;
2587 goto make_route;
2590 fib_select_path(net, res, fl4, skb);
2592 dev_out = FIB_RES_DEV(*res);
2593 fl4->flowi4_oif = dev_out->ifindex;
2596 make_route:
2597 rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2599 out:
2600 return rth;
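/* The blackhole dst_ops below back ipv4_blackhole_route(): ->check() always
 * reports the dst as stale, and the PMTU/redirect/cow_metrics handlers are
 * deliberate no-ops, so a blackhole route never mutates.
 */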
2603 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2605 return NULL;
2608 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2610 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2612 return mtu ? : dst->dev->mtu;
2615 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2616 struct sk_buff *skb, u32 mtu)
2620 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2621 struct sk_buff *skb)
2625 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2626 unsigned long old)
2628 return NULL;
2631 static struct dst_ops ipv4_dst_blackhole_ops = {
2632 .family = AF_INET,
2633 .check = ipv4_blackhole_dst_check,
2634 .mtu = ipv4_blackhole_mtu,
2635 .default_advmss = ipv4_default_advmss,
2636 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2637 .redirect = ipv4_rt_blackhole_redirect,
2638 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2639 .neigh_lookup = ipv4_neigh_lookup,
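/* Clone the routing metadata of @dst_orig into a new dst whose input and
 * output handlers simply discard packets; @dst_orig is released before
 * returning.
 */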
2642 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2644 struct rtable *ort = (struct rtable *) dst_orig;
2645 struct rtable *rt;
2647 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2648 if (rt) {
2649 struct dst_entry *new = &rt->dst;
2651 new->__use = 1;
2652 new->input = dst_discard;
2653 new->output = dst_discard_out;
2655 new->dev = net->loopback_dev;
2656 if (new->dev)
2657 dev_hold(new->dev);
2659 rt->rt_is_input = ort->rt_is_input;
2660 rt->rt_iif = ort->rt_iif;
2661 rt->rt_pmtu = ort->rt_pmtu;
2662 rt->rt_mtu_locked = ort->rt_mtu_locked;
2664 rt->rt_genid = rt_genid_ipv4(net);
2665 rt->rt_flags = ort->rt_flags;
2666 rt->rt_type = ort->rt_type;
2667 rt->rt_gw_family = ort->rt_gw_family;
2668 if (rt->rt_gw_family == AF_INET)
2669 rt->rt_gw4 = ort->rt_gw4;
2670 else if (rt->rt_gw_family == AF_INET6)
2671 rt->rt_gw6 = ort->rt_gw6;
2673 INIT_LIST_HEAD(&rt->rt_uncached);
2676 dst_release(dst_orig);
2678 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2681 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2682 const struct sock *sk)
2684 struct rtable *rt = __ip_route_output_key(net, flp4);
2686 if (IS_ERR(rt))
2687 return rt;
2689 if (flp4->flowi4_proto)
2690 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2691 flowi4_to_flowi(flp4),
2692 sk, 0);
2694 return rt;
2696 EXPORT_SYMBOL_GPL(ip_route_output_flow);
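/* A minimal output-lookup sketch (illustrative, not taken from this file):
 * fill in a flowi4 and let ip_route_output_flow() do the FIB lookup plus the
 * optional xfrm step, roughly
 *
 *	struct flowi4 fl4 = {
 *		.daddr = daddr,
 *		.saddr = saddr,
 *		.flowi4_oif = oif,
 *		.flowi4_proto = IPPROTO_UDP,
 *	};
 *	struct rtable *rt = ip_route_output_flow(net, &fl4, sk);
 *
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 */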
2698 /* called with rcu_read_lock held */
2699 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2700 struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2701 struct sk_buff *skb, u32 portid, u32 seq)
2703 struct rtmsg *r;
2704 struct nlmsghdr *nlh;
2705 unsigned long expires = 0;
2706 u32 error;
2707 u32 metrics[RTAX_MAX];
2709 nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
2710 if (!nlh)
2711 return -EMSGSIZE;
2713 r = nlmsg_data(nlh);
2714 r->rtm_family = AF_INET;
2715 r->rtm_dst_len = 32;
2716 r->rtm_src_len = 0;
2717 r->rtm_tos = fl4->flowi4_tos;
2718 r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2719 if (nla_put_u32(skb, RTA_TABLE, table_id))
2720 goto nla_put_failure;
2721 r->rtm_type = rt->rt_type;
2722 r->rtm_scope = RT_SCOPE_UNIVERSE;
2723 r->rtm_protocol = RTPROT_UNSPEC;
2724 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2725 if (rt->rt_flags & RTCF_NOTIFY)
2726 r->rtm_flags |= RTM_F_NOTIFY;
2727 if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2728 r->rtm_flags |= RTCF_DOREDIRECT;
2730 if (nla_put_in_addr(skb, RTA_DST, dst))
2731 goto nla_put_failure;
2732 if (src) {
2733 r->rtm_src_len = 32;
2734 if (nla_put_in_addr(skb, RTA_SRC, src))
2735 goto nla_put_failure;
2737 if (rt->dst.dev &&
2738 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2739 goto nla_put_failure;
2740 #ifdef CONFIG_IP_ROUTE_CLASSID
2741 if (rt->dst.tclassid &&
2742 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2743 goto nla_put_failure;
2744 #endif
2745 if (!rt_is_input_route(rt) &&
2746 fl4->saddr != src) {
2747 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2748 goto nla_put_failure;
2750 if (rt->rt_gw_family == AF_INET &&
2751 nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2752 goto nla_put_failure;
2753 } else if (rt->rt_gw_family == AF_INET6) {
2754 int alen = sizeof(struct in6_addr);
2755 struct nlattr *nla;
2756 struct rtvia *via;
2758 nla = nla_reserve(skb, RTA_VIA, alen + 2);
2759 if (!nla)
2760 goto nla_put_failure;
2762 via = nla_data(nla);
2763 via->rtvia_family = AF_INET6;
2764 memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2767 expires = rt->dst.expires;
2768 if (expires) {
2769 unsigned long now = jiffies;
2771 if (time_before(now, expires))
2772 expires -= now;
2773 else
2774 expires = 0;
2777 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2778 if (rt->rt_pmtu && expires)
2779 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2780 if (rt->rt_mtu_locked && expires)
2781 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2782 if (rtnetlink_put_metrics(skb, metrics) < 0)
2783 goto nla_put_failure;
2785 if (fl4->flowi4_mark &&
2786 nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2787 goto nla_put_failure;
2789 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2790 nla_put_u32(skb, RTA_UID,
2791 from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2792 goto nla_put_failure;
2794 error = rt->dst.error;
2796 if (rt_is_input_route(rt)) {
2797 #ifdef CONFIG_IP_MROUTE
2798 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2799 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2800 int err = ipmr_get_route(net, skb,
2801 fl4->saddr, fl4->daddr,
2802 r, portid);
2804 if (err <= 0) {
2805 if (err == 0)
2806 return 0;
2807 goto nla_put_failure;
2809 } else
2810 #endif
2811 if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2812 goto nla_put_failure;
2815 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2816 goto nla_put_failure;
2818 nlmsg_end(skb, nlh);
2819 return 0;
2821 nla_put_failure:
2822 nlmsg_cancel(skb, nlh);
2823 return -EMSGSIZE;
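/* Build a dummy skb for an RTM_GETROUTE request: a bare IPv4 header plus a
 * minimal UDP, TCP or ICMP header, just enough for the routing code (e.g.
 * early flow dissection) to treat it like real traffic.
 */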
2826 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
2827 u8 ip_proto, __be16 sport,
2828 __be16 dport)
2830 struct sk_buff *skb;
2831 struct iphdr *iph;
2833 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2834 if (!skb)
2835 return NULL;
2837 /* Reserve room for dummy headers; this skb can pass
2838 * through a good chunk of the routing engine.
2840 skb_reset_mac_header(skb);
2841 skb_reset_network_header(skb);
2842 skb->protocol = htons(ETH_P_IP);
2843 iph = skb_put(skb, sizeof(struct iphdr));
2844 iph->protocol = ip_proto;
2845 iph->saddr = src;
2846 iph->daddr = dst;
2847 iph->version = 0x4;
2848 iph->frag_off = 0;
2849 iph->ihl = 0x5;
2850 skb_set_transport_header(skb, skb->len);
2852 switch (iph->protocol) {
2853 case IPPROTO_UDP: {
2854 struct udphdr *udph;
2856 udph = skb_put_zero(skb, sizeof(struct udphdr));
2857 udph->source = sport;
2858 udph->dest = dport;
2859 udph->len = sizeof(struct udphdr);
2860 udph->check = 0;
2861 break;
2863 case IPPROTO_TCP: {
2864 struct tcphdr *tcph;
2866 tcph = skb_put_zero(skb, sizeof(struct tcphdr));
2867 tcph->source = sport;
2868 tcph->dest = dport;
2869 tcph->doff = sizeof(struct tcphdr) / 4;
2870 tcph->rst = 1;
2871 tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
2872 src, dst, 0);
2873 break;
2875 case IPPROTO_ICMP: {
2876 struct icmphdr *icmph;
2878 icmph = skb_put_zero(skb, sizeof(struct icmphdr));
2879 icmph->type = ICMP_ECHO;
2880 icmph->code = 0;
2884 return skb;
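/* Validate an RTM_GETROUTE request; with strict checking enabled, only the
 * attributes handled by inet_rtm_getroute() (IIF/OIF/SRC/DST/IP_PROTO/SPORT/
 * DPORT/MARK/UID) and a small set of rtm_flags are accepted.
 */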
2887 static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
2888 const struct nlmsghdr *nlh,
2889 struct nlattr **tb,
2890 struct netlink_ext_ack *extack)
2892 struct rtmsg *rtm;
2893 int i, err;
2895 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
2896 NL_SET_ERR_MSG(extack,
2897 "ipv4: Invalid header for route get request");
2898 return -EINVAL;
2901 if (!netlink_strict_get_check(skb))
2902 return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
2903 rtm_ipv4_policy, extack);
2905 rtm = nlmsg_data(nlh);
2906 if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
2907 (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
2908 rtm->rtm_table || rtm->rtm_protocol ||
2909 rtm->rtm_scope || rtm->rtm_type) {
2910 NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
2911 return -EINVAL;
2914 if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
2915 RTM_F_LOOKUP_TABLE |
2916 RTM_F_FIB_MATCH)) {
2917 NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
2918 return -EINVAL;
2921 err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
2922 rtm_ipv4_policy, extack);
2923 if (err)
2924 return err;
2926 if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
2927 (tb[RTA_DST] && !rtm->rtm_dst_len)) {
2928 NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
2929 return -EINVAL;
2932 for (i = 0; i <= RTA_MAX; i++) {
2933 if (!tb[i])
2934 continue;
2936 switch (i) {
2937 case RTA_IIF:
2938 case RTA_OIF:
2939 case RTA_SRC:
2940 case RTA_DST:
2941 case RTA_IP_PROTO:
2942 case RTA_SPORT:
2943 case RTA_DPORT:
2944 case RTA_MARK:
2945 case RTA_UID:
2946 break;
2947 default:
2948 NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
2949 return -EINVAL;
2953 return 0;
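/* RTM_GETROUTE handler: do an input route lookup when RTA_IIF is given,
 * otherwise an output lookup, and unicast either the matched FIB entry
 * (RTM_F_FIB_MATCH) or the resolved route back to the requester.
 */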
2956 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2957 struct netlink_ext_ack *extack)
2959 struct net *net = sock_net(in_skb->sk);
2960 struct nlattr *tb[RTA_MAX+1];
2961 u32 table_id = RT_TABLE_MAIN;
2962 __be16 sport = 0, dport = 0;
2963 struct fib_result res = {};
2964 u8 ip_proto = IPPROTO_UDP;
2965 struct rtable *rt = NULL;
2966 struct sk_buff *skb;
2967 struct rtmsg *rtm;
2968 struct flowi4 fl4 = {};
2969 __be32 dst = 0;
2970 __be32 src = 0;
2971 kuid_t uid;
2972 u32 iif;
2973 int err;
2974 int mark;
2976 err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
2977 if (err < 0)
2978 return err;
2980 rtm = nlmsg_data(nlh);
2981 src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2982 dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2983 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2984 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2985 if (tb[RTA_UID])
2986 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2987 else
2988 uid = (iif ? INVALID_UID : current_uid());
2990 if (tb[RTA_IP_PROTO]) {
2991 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
2992 &ip_proto, AF_INET, extack);
2993 if (err)
2994 return err;
2997 if (tb[RTA_SPORT])
2998 sport = nla_get_be16(tb[RTA_SPORT]);
3000 if (tb[RTA_DPORT])
3001 dport = nla_get_be16(tb[RTA_DPORT]);
3003 skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
3004 if (!skb)
3005 return -ENOBUFS;
3007 fl4.daddr = dst;
3008 fl4.saddr = src;
3009 fl4.flowi4_tos = rtm->rtm_tos;
3010 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
3011 fl4.flowi4_mark = mark;
3012 fl4.flowi4_uid = uid;
3013 if (sport)
3014 fl4.fl4_sport = sport;
3015 if (dport)
3016 fl4.fl4_dport = dport;
3017 fl4.flowi4_proto = ip_proto;
3019 rcu_read_lock();
3021 if (iif) {
3022 struct net_device *dev;
3024 dev = dev_get_by_index_rcu(net, iif);
3025 if (!dev) {
3026 err = -ENODEV;
3027 goto errout_rcu;
3030 fl4.flowi4_iif = iif; /* for rt_fill_info */
3031 skb->dev = dev;
3032 skb->mark = mark;
3033 err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
3034 dev, &res);
3036 rt = skb_rtable(skb);
3037 if (err == 0 && rt->dst.error)
3038 err = -rt->dst.error;
3039 } else {
3040 fl4.flowi4_iif = LOOPBACK_IFINDEX;
3041 skb->dev = net->loopback_dev;
3042 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
3043 err = 0;
3044 if (IS_ERR(rt))
3045 err = PTR_ERR(rt);
3046 else
3047 skb_dst_set(skb, &rt->dst);
3050 if (err)
3051 goto errout_rcu;
3053 if (rtm->rtm_flags & RTM_F_NOTIFY)
3054 rt->rt_flags |= RTCF_NOTIFY;
3056 if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
3057 table_id = res.table ? res.table->tb_id : 0;
3059 /* reset skb for netlink reply msg */
3060 skb_trim(skb, 0);
3061 skb_reset_network_header(skb);
3062 skb_reset_transport_header(skb);
3063 skb_reset_mac_header(skb);
3065 if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3066 if (!res.fi) {
3067 err = fib_props[res.type].error;
3068 if (!err)
3069 err = -EHOSTUNREACH;
3070 goto errout_rcu;
3072 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
3073 nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
3074 rt->rt_type, res.prefix, res.prefixlen,
3075 fl4.flowi4_tos, res.fi, 0);
3076 } else {
3077 err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
3078 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
3080 if (err < 0)
3081 goto errout_rcu;
3083 rcu_read_unlock();
3085 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3087 errout_free:
3088 return err;
3089 errout_rcu:
3090 rcu_read_unlock();
3091 kfree_skb(skb);
3092 goto errout_free;
3095 void ip_rt_multicast_event(struct in_device *in_dev)
3097 rt_cache_flush(dev_net(in_dev->dev));
3100 #ifdef CONFIG_SYSCTL
3101 static int ip_rt_gc_interval __read_mostly = 60 * HZ;
3102 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
3103 static int ip_rt_gc_elasticity __read_mostly = 8;
3104 static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU;
3106 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
3107 void __user *buffer,
3108 size_t *lenp, loff_t *ppos)
3110 struct net *net = (struct net *)__ctl->extra1;
3112 if (write) {
3113 rt_cache_flush(net);
3114 fnhe_genid_bump(net);
3115 return 0;
3118 return -EINVAL;
3121 static struct ctl_table ipv4_route_table[] = {
3123 .procname = "gc_thresh",
3124 .data = &ipv4_dst_ops.gc_thresh,
3125 .maxlen = sizeof(int),
3126 .mode = 0644,
3127 .proc_handler = proc_dointvec,
3130 .procname = "max_size",
3131 .data = &ip_rt_max_size,
3132 .maxlen = sizeof(int),
3133 .mode = 0644,
3134 .proc_handler = proc_dointvec,
3137 /* Deprecated. Use gc_min_interval_ms */
3139 .procname = "gc_min_interval",
3140 .data = &ip_rt_gc_min_interval,
3141 .maxlen = sizeof(int),
3142 .mode = 0644,
3143 .proc_handler = proc_dointvec_jiffies,
3146 .procname = "gc_min_interval_ms",
3147 .data = &ip_rt_gc_min_interval,
3148 .maxlen = sizeof(int),
3149 .mode = 0644,
3150 .proc_handler = proc_dointvec_ms_jiffies,
3153 .procname = "gc_timeout",
3154 .data = &ip_rt_gc_timeout,
3155 .maxlen = sizeof(int),
3156 .mode = 0644,
3157 .proc_handler = proc_dointvec_jiffies,
3160 .procname = "gc_interval",
3161 .data = &ip_rt_gc_interval,
3162 .maxlen = sizeof(int),
3163 .mode = 0644,
3164 .proc_handler = proc_dointvec_jiffies,
3167 .procname = "redirect_load",
3168 .data = &ip_rt_redirect_load,
3169 .maxlen = sizeof(int),
3170 .mode = 0644,
3171 .proc_handler = proc_dointvec,
3174 .procname = "redirect_number",
3175 .data = &ip_rt_redirect_number,
3176 .maxlen = sizeof(int),
3177 .mode = 0644,
3178 .proc_handler = proc_dointvec,
3181 .procname = "redirect_silence",
3182 .data = &ip_rt_redirect_silence,
3183 .maxlen = sizeof(int),
3184 .mode = 0644,
3185 .proc_handler = proc_dointvec,
3188 .procname = "error_cost",
3189 .data = &ip_rt_error_cost,
3190 .maxlen = sizeof(int),
3191 .mode = 0644,
3192 .proc_handler = proc_dointvec,
3195 .procname = "error_burst",
3196 .data = &ip_rt_error_burst,
3197 .maxlen = sizeof(int),
3198 .mode = 0644,
3199 .proc_handler = proc_dointvec,
3202 .procname = "gc_elasticity",
3203 .data = &ip_rt_gc_elasticity,
3204 .maxlen = sizeof(int),
3205 .mode = 0644,
3206 .proc_handler = proc_dointvec,
3209 .procname = "mtu_expires",
3210 .data = &ip_rt_mtu_expires,
3211 .maxlen = sizeof(int),
3212 .mode = 0644,
3213 .proc_handler = proc_dointvec_jiffies,
3216 .procname = "min_pmtu",
3217 .data = &ip_rt_min_pmtu,
3218 .maxlen = sizeof(int),
3219 .mode = 0644,
3220 .proc_handler = proc_dointvec_minmax,
3221 .extra1 = &ip_min_valid_pmtu,
3224 .procname = "min_adv_mss",
3225 .data = &ip_rt_min_advmss,
3226 .maxlen = sizeof(int),
3227 .mode = 0644,
3228 .proc_handler = proc_dointvec,
3233 static struct ctl_table ipv4_route_flush_table[] = {
3235 .procname = "flush",
3236 .maxlen = sizeof(int),
3237 .mode = 0200,
3238 .proc_handler = ipv4_sysctl_rtcache_flush,
3240 { },
3243 static __net_init int sysctl_route_net_init(struct net *net)
3245 struct ctl_table *tbl;
3247 tbl = ipv4_route_flush_table;
3248 if (!net_eq(net, &init_net)) {
3249 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3250 if (!tbl)
3251 goto err_dup;
3253 /* Don't export sysctls to unprivileged users */
3254 if (net->user_ns != &init_user_ns)
3255 tbl[0].procname = NULL;
3257 tbl[0].extra1 = net;
3259 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3260 if (!net->ipv4.route_hdr)
3261 goto err_reg;
3262 return 0;
3264 err_reg:
3265 if (tbl != ipv4_route_flush_table)
3266 kfree(tbl);
3267 err_dup:
3268 return -ENOMEM;
3271 static __net_exit void sysctl_route_net_exit(struct net *net)
3273 struct ctl_table *tbl;
3275 tbl = net->ipv4.route_hdr->ctl_table_arg;
3276 unregister_net_sysctl_table(net->ipv4.route_hdr);
3277 BUG_ON(tbl == ipv4_route_flush_table);
3278 kfree(tbl);
3281 static __net_initdata struct pernet_operations sysctl_route_ops = {
3282 .init = sysctl_route_net_init,
3283 .exit = sysctl_route_net_exit,
3285 #endif
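/* Per-netns generation counters: rt_genid and fnhe_genid start at zero,
 * while dev_addr_genid is seeded with a random value at namespace creation.
 */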
3287 static __net_init int rt_genid_init(struct net *net)
3289 atomic_set(&net->ipv4.rt_genid, 0);
3290 atomic_set(&net->fnhe_genid, 0);
3291 atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3292 return 0;
3295 static __net_initdata struct pernet_operations rt_genid_ops = {
3296 .init = rt_genid_init,
3299 static int __net_init ipv4_inetpeer_init(struct net *net)
3301 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3303 if (!bp)
3304 return -ENOMEM;
3305 inet_peer_base_init(bp);
3306 net->ipv4.peers = bp;
3307 return 0;
3310 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3312 struct inet_peer_base *bp = net->ipv4.peers;
3314 net->ipv4.peers = NULL;
3315 inetpeer_invalidate_tree(bp);
3316 kfree(bp);
3319 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3320 .init = ipv4_inetpeer_init,
3321 .exit = ipv4_inetpeer_exit,
3324 #ifdef CONFIG_IP_ROUTE_CLASSID
3325 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3326 #endif /* CONFIG_IP_ROUTE_CLASSID */
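/* One-time boot initialization: allocate the ip_idents/ip_tstamps arrays,
 * set up the per-cpu uncached route lists, create the dst kmem cache, and
 * register the proc, netlink (RTM_GETROUTE), sysctl and pernet hooks.
 */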
3328 int __init ip_rt_init(void)
3330 int cpu;
3332 ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents),
3333 GFP_KERNEL);
3334 if (!ip_idents)
3335 panic("IP: failed to allocate ip_idents\n");
3337 prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3339 ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3340 if (!ip_tstamps)
3341 panic("IP: failed to allocate ip_tstamps\n");
3343 for_each_possible_cpu(cpu) {
3344 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3346 INIT_LIST_HEAD(&ul->head);
3347 spin_lock_init(&ul->lock);
3349 #ifdef CONFIG_IP_ROUTE_CLASSID
3350 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3351 if (!ip_rt_acct)
3352 panic("IP: failed to allocate ip_rt_acct\n");
3353 #endif
3355 ipv4_dst_ops.kmem_cachep =
3356 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3357 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3359 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3361 if (dst_entries_init(&ipv4_dst_ops) < 0)
3362 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3364 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3365 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3367 ipv4_dst_ops.gc_thresh = ~0;
3368 ip_rt_max_size = INT_MAX;
3370 devinet_init();
3371 ip_fib_init();
3373 if (ip_rt_proc_init())
3374 pr_err("Unable to create route proc files\n");
3375 #ifdef CONFIG_XFRM
3376 xfrm_init();
3377 xfrm4_init();
3378 #endif
3379 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3380 RTNL_FLAG_DOIT_UNLOCKED);
3382 #ifdef CONFIG_SYSCTL
3383 register_pernet_subsys(&sysctl_route_ops);
3384 #endif
3385 register_pernet_subsys(&rt_genid_ops);
3386 register_pernet_subsys(&ipv4_inetpeer_ops);
3387 return 0;
3390 #ifdef CONFIG_SYSCTL
3392 * We really need to sanitize the damn ipv4 init order, then all
3393 * this nonsense will go away.
3395 void __init ip_static_sysctl_init(void)
3397 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3399 #endif