ipv6: fix uninit-value in ip6_multipath_l3_keys()
[linux/fpc-iii.git] / net / ipv6 / route.c
1 /*
2 * Linux INET6 implementation
3 * FIB front-end.
5 * Authors:
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 2 of the License, or (at your option) any later version.
12 */
14 /* Changes:
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
23 * Ville Nuorvala
24 * Fixed routing subtrees.
25 */
27 #define pr_fmt(fmt) "IPv6: " fmt
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <trace/events/fib6.h>
68 #include <linux/uaccess.h>
70 #ifdef CONFIG_SYSCTL
71 #include <linux/sysctl.h>
72 #endif
74 enum rt6_nud_state {
75 RT6_NUD_FAIL_HARD = -3,
76 RT6_NUD_FAIL_PROBE = -2,
77 RT6_NUD_FAIL_DO_RR = -1,
78 RT6_NUD_SUCCEED = 1
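/* The negative values are distinct failure modes rather than ranks:
 * RT6_NUD_FAIL_HARD drops a candidate route from consideration
 * entirely, RT6_NUD_FAIL_PROBE skips it for the current pass (a
 * reachability probe may still be scheduled), and RT6_NUD_FAIL_DO_RR
 * makes find_match() treat it as the lowest valid score and rotate the
 * round-robin pointer to a sibling route.
 */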
81 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
82 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
83 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
84 static unsigned int ip6_mtu(const struct dst_entry *dst);
85 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
86 static void ip6_dst_destroy(struct dst_entry *);
87 static void ip6_dst_ifdown(struct dst_entry *,
88 struct net_device *dev, int how);
89 static int ip6_dst_gc(struct dst_ops *ops);
91 static int ip6_pkt_discard(struct sk_buff *skb);
92 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
93 static int ip6_pkt_prohibit(struct sk_buff *skb);
94 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
95 static void ip6_link_failure(struct sk_buff *skb);
96 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
97 struct sk_buff *skb, u32 mtu);
98 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
99 struct sk_buff *skb);
100 static void rt6_dst_from_metrics_check(struct rt6_info *rt);
101 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
102 static size_t rt6_nlmsg_size(struct rt6_info *rt);
103 static int rt6_fill_node(struct net *net,
104 struct sk_buff *skb, struct rt6_info *rt,
105 struct in6_addr *dst, struct in6_addr *src,
106 int iif, int type, u32 portid, u32 seq,
107 unsigned int flags);
108 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
109 struct in6_addr *daddr,
110 struct in6_addr *saddr);
112 #ifdef CONFIG_IPV6_ROUTE_INFO
113 static struct rt6_info *rt6_add_route_info(struct net *net,
114 const struct in6_addr *prefix, int prefixlen,
115 const struct in6_addr *gwaddr,
116 struct net_device *dev,
117 unsigned int pref);
118 static struct rt6_info *rt6_get_route_info(struct net *net,
119 const struct in6_addr *prefix, int prefixlen,
120 const struct in6_addr *gwaddr,
121 struct net_device *dev);
122 #endif
124 struct uncached_list {
125 spinlock_t lock;
126 struct list_head head;
129 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
131 void rt6_uncached_list_add(struct rt6_info *rt)
133 struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
135 rt->rt6i_uncached_list = ul;
137 spin_lock_bh(&ul->lock);
138 list_add_tail(&rt->rt6i_uncached, &ul->head);
139 spin_unlock_bh(&ul->lock);
142 void rt6_uncached_list_del(struct rt6_info *rt)
144 if (!list_empty(&rt->rt6i_uncached)) {
145 struct uncached_list *ul = rt->rt6i_uncached_list;
146 struct net *net = dev_net(rt->dst.dev);
148 spin_lock_bh(&ul->lock);
149 list_del(&rt->rt6i_uncached);
150 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
151 spin_unlock_bh(&ul->lock);
155 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
157 struct net_device *loopback_dev = net->loopback_dev;
158 int cpu;
160 if (dev == loopback_dev)
161 return;
163 for_each_possible_cpu(cpu) {
164 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
165 struct rt6_info *rt;
167 spin_lock_bh(&ul->lock);
168 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
169 struct inet6_dev *rt_idev = rt->rt6i_idev;
170 struct net_device *rt_dev = rt->dst.dev;
172 if (rt_idev->dev == dev) {
173 rt->rt6i_idev = in6_dev_get(loopback_dev);
174 in6_dev_put(rt_idev);
177 if (rt_dev == dev) {
178 rt->dst.dev = loopback_dev;
179 dev_hold(rt->dst.dev);
180 dev_put(rt_dev);
183 spin_unlock_bh(&ul->lock);
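/* The flush above is what keeps uncached RTF_CACHE clones safe across
 * device unregistration: such clones live outside the fib tree and are
 * only reachable through this per-cpu list, so their idev and dst.dev
 * references are re-homed to the loopback device here rather than
 * pinning the disappearing device.
 */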
187 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
189 return dst_metrics_write_ptr(&rt->from->dst);
192 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
194 struct rt6_info *rt = (struct rt6_info *)dst;
196 if (rt->rt6i_flags & RTF_PCPU)
197 return rt6_pcpu_cow_metrics(rt);
198 else if (rt->rt6i_flags & RTF_CACHE)
199 return NULL;
200 else
201 return dst_cow_metrics_generic(dst, old);
204 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
205 struct sk_buff *skb,
206 const void *daddr)
208 struct in6_addr *p = &rt->rt6i_gateway;
210 if (!ipv6_addr_any(p))
211 return (const void *) p;
212 else if (skb)
213 return &ipv6_hdr(skb)->daddr;
214 return daddr;
217 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
218 struct sk_buff *skb,
219 const void *daddr)
221 struct rt6_info *rt = (struct rt6_info *) dst;
222 struct neighbour *n;
224 daddr = choose_neigh_daddr(rt, skb, daddr);
225 n = __ipv6_neigh_lookup(dst->dev, daddr);
226 if (n)
227 return n;
228 return neigh_create(&nd_tbl, daddr, dst->dev);
231 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
233 struct net_device *dev = dst->dev;
234 struct rt6_info *rt = (struct rt6_info *)dst;
236 daddr = choose_neigh_daddr(rt, NULL, daddr);
237 if (!daddr)
238 return;
239 if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
240 return;
241 if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
242 return;
243 __ipv6_confirm_neigh(dev, daddr);
246 static struct dst_ops ip6_dst_ops_template = {
247 .family = AF_INET6,
248 .gc = ip6_dst_gc,
249 .gc_thresh = 1024,
250 .check = ip6_dst_check,
251 .default_advmss = ip6_default_advmss,
252 .mtu = ip6_mtu,
253 .cow_metrics = ipv6_cow_metrics,
254 .destroy = ip6_dst_destroy,
255 .ifdown = ip6_dst_ifdown,
256 .negative_advice = ip6_negative_advice,
257 .link_failure = ip6_link_failure,
258 .update_pmtu = ip6_rt_update_pmtu,
259 .redirect = rt6_do_redirect,
260 .local_out = __ip6_local_out,
261 .neigh_lookup = ip6_neigh_lookup,
262 .confirm_neigh = ip6_confirm_neigh,
265 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
267 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
269 return mtu ? : dst->dev->mtu;
272 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
273 struct sk_buff *skb, u32 mtu)
277 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
278 struct sk_buff *skb)
282 static struct dst_ops ip6_dst_blackhole_ops = {
283 .family = AF_INET6,
284 .destroy = ip6_dst_destroy,
285 .check = ip6_dst_check,
286 .mtu = ip6_blackhole_mtu,
287 .default_advmss = ip6_default_advmss,
288 .update_pmtu = ip6_rt_blackhole_update_pmtu,
289 .redirect = ip6_rt_blackhole_redirect,
290 .cow_metrics = dst_cow_metrics_generic,
291 .neigh_lookup = ip6_neigh_lookup,
294 static const u32 ip6_template_metrics[RTAX_MAX] = {
295 [RTAX_HOPLIMIT - 1] = 0,
298 static const struct rt6_info ip6_null_entry_template = {
299 .dst = {
300 .__refcnt = ATOMIC_INIT(1),
301 .__use = 1,
302 .obsolete = DST_OBSOLETE_FORCE_CHK,
303 .error = -ENETUNREACH,
304 .input = ip6_pkt_discard,
305 .output = ip6_pkt_discard_out,
307 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
308 .rt6i_protocol = RTPROT_KERNEL,
309 .rt6i_metric = ~(u32) 0,
310 .rt6i_ref = ATOMIC_INIT(1),
313 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
315 static const struct rt6_info ip6_prohibit_entry_template = {
316 .dst = {
317 .__refcnt = ATOMIC_INIT(1),
318 .__use = 1,
319 .obsolete = DST_OBSOLETE_FORCE_CHK,
320 .error = -EACCES,
321 .input = ip6_pkt_prohibit,
322 .output = ip6_pkt_prohibit_out,
324 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
325 .rt6i_protocol = RTPROT_KERNEL,
326 .rt6i_metric = ~(u32) 0,
327 .rt6i_ref = ATOMIC_INIT(1),
330 static const struct rt6_info ip6_blk_hole_entry_template = {
331 .dst = {
332 .__refcnt = ATOMIC_INIT(1),
333 .__use = 1,
334 .obsolete = DST_OBSOLETE_FORCE_CHK,
335 .error = -EINVAL,
336 .input = dst_discard,
337 .output = dst_discard_out,
339 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
340 .rt6i_protocol = RTPROT_KERNEL,
341 .rt6i_metric = ~(u32) 0,
342 .rt6i_ref = ATOMIC_INIT(1),
345 #endif
347 static void rt6_info_init(struct rt6_info *rt)
349 struct dst_entry *dst = &rt->dst;
351 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
352 INIT_LIST_HEAD(&rt->rt6i_siblings);
353 INIT_LIST_HEAD(&rt->rt6i_uncached);
356 /* allocate dst with ip6_dst_ops */
357 static struct rt6_info *__ip6_dst_alloc(struct net *net,
358 struct net_device *dev,
359 int flags)
361 struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
362 1, DST_OBSOLETE_FORCE_CHK, flags);
364 if (rt) {
365 rt6_info_init(rt);
366 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
369 return rt;
372 struct rt6_info *ip6_dst_alloc(struct net *net,
373 struct net_device *dev,
374 int flags)
376 struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
378 if (rt) {
379 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
380 if (!rt->rt6i_pcpu) {
381 dst_release_immediate(&rt->dst);
382 return NULL;
386 return rt;
388 EXPORT_SYMBOL(ip6_dst_alloc);
390 static void ip6_dst_destroy(struct dst_entry *dst)
392 struct rt6_info *rt = (struct rt6_info *)dst;
393 struct rt6_exception_bucket *bucket;
394 struct rt6_info *from = rt->from;
395 struct inet6_dev *idev;
397 dst_destroy_metrics_generic(dst);
398 free_percpu(rt->rt6i_pcpu);
399 rt6_uncached_list_del(rt);
401 idev = rt->rt6i_idev;
402 if (idev) {
403 rt->rt6i_idev = NULL;
404 in6_dev_put(idev);
406 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
407 if (bucket) {
408 rt->rt6i_exception_bucket = NULL;
409 kfree(bucket);
412 rt->from = NULL;
413 dst_release(&from->dst);
416 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
417 int how)
419 struct rt6_info *rt = (struct rt6_info *)dst;
420 struct inet6_dev *idev = rt->rt6i_idev;
421 struct net_device *loopback_dev =
422 dev_net(dev)->loopback_dev;
424 if (idev && idev->dev != loopback_dev) {
425 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
426 if (loopback_idev) {
427 rt->rt6i_idev = loopback_idev;
428 in6_dev_put(idev);
433 static bool __rt6_check_expired(const struct rt6_info *rt)
435 if (rt->rt6i_flags & RTF_EXPIRES)
436 return time_after(jiffies, rt->dst.expires);
437 else
438 return false;
441 static bool rt6_check_expired(const struct rt6_info *rt)
443 if (rt->rt6i_flags & RTF_EXPIRES) {
444 if (time_after(jiffies, rt->dst.expires))
445 return true;
446 } else if (rt->from) {
447 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
448 rt6_check_expired(rt->from);
450 return false;
453 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
454 struct flowi6 *fl6, int oif,
455 int strict)
457 struct rt6_info *sibling, *next_sibling;
459 /* We might have already computed the hash for ICMPv6 errors. In such a
460 * case it will always be non-zero. Otherwise now is the time to do it.
461 */
462 if (!fl6->mp_hash)
463 fl6->mp_hash = rt6_multipath_hash(fl6, NULL);
465 if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound))
466 return match;
468 list_for_each_entry_safe(sibling, next_sibling, &match->rt6i_siblings,
469 rt6i_siblings) {
470 if (fl6->mp_hash > atomic_read(&sibling->rt6i_nh_upper_bound))
471 continue;
472 if (rt6_score_route(sibling, oif, strict) < 0)
473 break;
474 match = sibling;
475 break;
478 return match;
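/* The selection above appears to be hash-threshold based: each sibling
 * route carries a precomputed upper bound (rt6i_nh_upper_bound) that
 * slices the hash space in proportion to its weight, and the first
 * sibling whose bound covers fl6->mp_hash is chosen.  Packets of a
 * single flow hash identically and therefore stick to one nexthop
 * while the sibling set is stable.
 */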
481 /*
482 * Route lookup. rcu_read_lock() should be held.
483 */
485 static inline struct rt6_info *rt6_device_match(struct net *net,
486 struct rt6_info *rt,
487 const struct in6_addr *saddr,
488 int oif,
489 int flags)
491 struct rt6_info *local = NULL;
492 struct rt6_info *sprt;
494 if (!oif && ipv6_addr_any(saddr) && !(rt->rt6i_nh_flags & RTNH_F_DEAD))
495 return rt;
497 for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) {
498 struct net_device *dev = sprt->dst.dev;
500 if (sprt->rt6i_nh_flags & RTNH_F_DEAD)
501 continue;
503 if (oif) {
504 if (dev->ifindex == oif)
505 return sprt;
506 if (dev->flags & IFF_LOOPBACK) {
507 if (!sprt->rt6i_idev ||
508 sprt->rt6i_idev->dev->ifindex != oif) {
509 if (flags & RT6_LOOKUP_F_IFACE)
510 continue;
511 if (local &&
512 local->rt6i_idev->dev->ifindex == oif)
513 continue;
515 local = sprt;
517 } else {
518 if (ipv6_chk_addr(net, saddr, dev,
519 flags & RT6_LOOKUP_F_IFACE))
520 return sprt;
524 if (oif) {
525 if (local)
526 return local;
528 if (flags & RT6_LOOKUP_F_IFACE)
529 return net->ipv6.ip6_null_entry;
532 return rt->rt6i_nh_flags & RTNH_F_DEAD ? net->ipv6.ip6_null_entry : rt;
535 #ifdef CONFIG_IPV6_ROUTER_PREF
536 struct __rt6_probe_work {
537 struct work_struct work;
538 struct in6_addr target;
539 struct net_device *dev;
542 static void rt6_probe_deferred(struct work_struct *w)
544 struct in6_addr mcaddr;
545 struct __rt6_probe_work *work =
546 container_of(w, struct __rt6_probe_work, work);
548 addrconf_addr_solict_mult(&work->target, &mcaddr);
549 ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
550 dev_put(work->dev);
551 kfree(work);
554 static void rt6_probe(struct rt6_info *rt)
556 struct __rt6_probe_work *work;
557 struct neighbour *neigh;
558 /*
559 * Okay, this does not seem to be appropriate
560 * for now, however, we need to check if it
561 * is really so; aka Router Reachability Probing.
562 *
563 * Router Reachability Probe MUST be rate-limited
564 * to no more than one per minute.
565 */
566 if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
567 return;
568 rcu_read_lock_bh();
569 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
570 if (neigh) {
571 if (neigh->nud_state & NUD_VALID)
572 goto out;
574 work = NULL;
575 write_lock(&neigh->lock);
576 if (!(neigh->nud_state & NUD_VALID) &&
577 time_after(jiffies,
578 neigh->updated +
579 rt->rt6i_idev->cnf.rtr_probe_interval)) {
580 work = kmalloc(sizeof(*work), GFP_ATOMIC);
581 if (work)
582 __neigh_set_probe_once(neigh);
584 write_unlock(&neigh->lock);
585 } else {
586 work = kmalloc(sizeof(*work), GFP_ATOMIC);
589 if (work) {
590 INIT_WORK(&work->work, rt6_probe_deferred);
591 work->target = rt->rt6i_gateway;
592 dev_hold(rt->dst.dev);
593 work->dev = rt->dst.dev;
594 schedule_work(&work->work);
597 out:
598 rcu_read_unlock_bh();
600 #else
601 static inline void rt6_probe(struct rt6_info *rt)
604 #endif
606 /*
607 * Default Router Selection (RFC 2461 6.3.6)
608 */
609 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
611 struct net_device *dev = rt->dst.dev;
612 if (!oif || dev->ifindex == oif)
613 return 2;
614 if ((dev->flags & IFF_LOOPBACK) &&
615 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
616 return 1;
617 return 0;
620 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
622 struct neighbour *neigh;
623 enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
625 if (rt->rt6i_flags & RTF_NONEXTHOP ||
626 !(rt->rt6i_flags & RTF_GATEWAY))
627 return RT6_NUD_SUCCEED;
629 rcu_read_lock_bh();
630 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
631 if (neigh) {
632 read_lock(&neigh->lock);
633 if (neigh->nud_state & NUD_VALID)
634 ret = RT6_NUD_SUCCEED;
635 #ifdef CONFIG_IPV6_ROUTER_PREF
636 else if (!(neigh->nud_state & NUD_FAILED))
637 ret = RT6_NUD_SUCCEED;
638 else
639 ret = RT6_NUD_FAIL_PROBE;
640 #endif
641 read_unlock(&neigh->lock);
642 } else {
643 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
644 RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
646 rcu_read_unlock_bh();
648 return ret;
651 static int rt6_score_route(struct rt6_info *rt, int oif,
652 int strict)
654 int m;
656 m = rt6_check_dev(rt, oif);
657 if (!m && (strict & RT6_LOOKUP_F_IFACE))
658 return RT6_NUD_FAIL_HARD;
659 #ifdef CONFIG_IPV6_ROUTER_PREF
660 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
661 #endif
662 if (strict & RT6_LOOKUP_F_REACHABLE) {
663 int n = rt6_check_neigh(rt);
664 if (n < 0)
665 return n;
667 return m;
670 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
671 int *mpri, struct rt6_info *match,
672 bool *do_rr)
674 int m;
675 bool match_do_rr = false;
676 struct inet6_dev *idev = rt->rt6i_idev;
678 if (rt->rt6i_nh_flags & RTNH_F_DEAD)
679 goto out;
681 if (idev->cnf.ignore_routes_with_linkdown &&
682 rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
683 !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
684 goto out;
686 if (rt6_check_expired(rt))
687 goto out;
689 m = rt6_score_route(rt, oif, strict);
690 if (m == RT6_NUD_FAIL_DO_RR) {
691 match_do_rr = true;
692 m = 0; /* lowest valid score */
693 } else if (m == RT6_NUD_FAIL_HARD) {
694 goto out;
697 if (strict & RT6_LOOKUP_F_REACHABLE)
698 rt6_probe(rt);
700 /* note that m can be RT6_NUD_FAIL_PROBE at this point */
701 if (m > *mpri) {
702 *do_rr = match_do_rr;
703 *mpri = m;
704 match = rt;
706 out:
707 return match;
710 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
711 struct rt6_info *leaf,
712 struct rt6_info *rr_head,
713 u32 metric, int oif, int strict,
714 bool *do_rr)
716 struct rt6_info *rt, *match, *cont;
717 int mpri = -1;
719 match = NULL;
720 cont = NULL;
721 for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) {
722 if (rt->rt6i_metric != metric) {
723 cont = rt;
724 break;
727 match = find_match(rt, oif, strict, &mpri, match, do_rr);
730 for (rt = leaf; rt && rt != rr_head;
731 rt = rcu_dereference(rt->rt6_next)) {
732 if (rt->rt6i_metric != metric) {
733 cont = rt;
734 break;
737 match = find_match(rt, oif, strict, &mpri, match, do_rr);
740 if (match || !cont)
741 return match;
743 for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next))
744 match = find_match(rt, oif, strict, &mpri, match, do_rr);
746 return match;
749 static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
750 int oif, int strict)
752 struct rt6_info *leaf = rcu_dereference(fn->leaf);
753 struct rt6_info *match, *rt0;
754 bool do_rr = false;
755 int key_plen;
757 if (!leaf || leaf == net->ipv6.ip6_null_entry)
758 return net->ipv6.ip6_null_entry;
760 rt0 = rcu_dereference(fn->rr_ptr);
761 if (!rt0)
762 rt0 = leaf;
764 /* Double check to make sure fn is not an intermediate node
765 * and fn->leaf does not point to its child's leaf
766 * (This might happen if all routes under fn are deleted from
767 * the tree and fib6_repair_tree() is called on the node.)
768 */
769 key_plen = rt0->rt6i_dst.plen;
770 #ifdef CONFIG_IPV6_SUBTREES
771 if (rt0->rt6i_src.plen)
772 key_plen = rt0->rt6i_src.plen;
773 #endif
774 if (fn->fn_bit != key_plen)
775 return net->ipv6.ip6_null_entry;
777 match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
778 &do_rr);
780 if (do_rr) {
781 struct rt6_info *next = rcu_dereference(rt0->rt6_next);
783 /* no entries matched; do round-robin */
784 if (!next || next->rt6i_metric != rt0->rt6i_metric)
785 next = leaf;
787 if (next != rt0) {
788 spin_lock_bh(&leaf->rt6i_table->tb6_lock);
789 /* make sure next is not being deleted from the tree */
790 if (next->rt6i_node)
791 rcu_assign_pointer(fn->rr_ptr, next);
792 spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
796 return match ? match : net->ipv6.ip6_null_entry;
799 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
801 return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
804 #ifdef CONFIG_IPV6_ROUTE_INFO
805 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
806 const struct in6_addr *gwaddr)
808 struct net *net = dev_net(dev);
809 struct route_info *rinfo = (struct route_info *) opt;
810 struct in6_addr prefix_buf, *prefix;
811 unsigned int pref;
812 unsigned long lifetime;
813 struct rt6_info *rt;
815 if (len < sizeof(struct route_info)) {
816 return -EINVAL;
819 /* Sanity check for prefix_len and length */
820 if (rinfo->length > 3) {
821 return -EINVAL;
822 } else if (rinfo->prefix_len > 128) {
823 return -EINVAL;
824 } else if (rinfo->prefix_len > 64) {
825 if (rinfo->length < 2) {
826 return -EINVAL;
828 } else if (rinfo->prefix_len > 0) {
829 if (rinfo->length < 1) {
830 return -EINVAL;
834 pref = rinfo->route_pref;
835 if (pref == ICMPV6_ROUTER_PREF_INVALID)
836 return -EINVAL;
838 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
840 if (rinfo->length == 3)
841 prefix = (struct in6_addr *)rinfo->prefix;
842 else {
843 /* this function is safe */
844 ipv6_addr_prefix(&prefix_buf,
845 (struct in6_addr *)rinfo->prefix,
846 rinfo->prefix_len);
847 prefix = &prefix_buf;
850 if (rinfo->prefix_len == 0)
851 rt = rt6_get_dflt_router(gwaddr, dev);
852 else
853 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
854 gwaddr, dev);
856 if (rt && !lifetime) {
857 ip6_del_rt(rt);
858 rt = NULL;
861 if (!rt && lifetime)
862 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
863 dev, pref);
864 else if (rt)
865 rt->rt6i_flags = RTF_ROUTEINFO |
866 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
868 if (rt) {
869 if (!addrconf_finite_timeout(lifetime))
870 rt6_clean_expires(rt);
871 else
872 rt6_set_expires(rt, jiffies + HZ * lifetime);
874 ip6_rt_put(rt);
876 return 0;
878 #endif
880 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
881 struct in6_addr *saddr)
883 struct fib6_node *pn, *sn;
884 while (1) {
885 if (fn->fn_flags & RTN_TL_ROOT)
886 return NULL;
887 pn = rcu_dereference(fn->parent);
888 sn = FIB6_SUBTREE(pn);
889 if (sn && sn != fn)
890 fn = fib6_lookup(sn, NULL, saddr);
891 else
892 fn = pn;
893 if (fn->fn_flags & RTN_RTINFO)
894 return fn;
898 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
899 bool null_fallback)
901 struct rt6_info *rt = *prt;
903 if (dst_hold_safe(&rt->dst))
904 return true;
905 if (null_fallback) {
906 rt = net->ipv6.ip6_null_entry;
907 dst_hold(&rt->dst);
908 } else {
909 rt = NULL;
911 *prt = rt;
912 return false;
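/* dst_hold_safe() only fails when the refcount has already dropped to
 * zero, i.e. the entry is being destroyed concurrently.  Lookup paths
 * that must always return a usable dst pass null_fallback=true and
 * receive the held ip6_null_entry instead; other callers see
 * *prt == NULL and have to handle the miss themselves.
 */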
915 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
916 struct fib6_table *table,
917 struct flowi6 *fl6, int flags)
919 struct rt6_info *rt, *rt_cache;
920 struct fib6_node *fn;
922 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
923 flags &= ~RT6_LOOKUP_F_IFACE;
925 rcu_read_lock();
926 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
927 restart:
928 rt = rcu_dereference(fn->leaf);
929 if (!rt) {
930 rt = net->ipv6.ip6_null_entry;
931 } else {
932 rt = rt6_device_match(net, rt, &fl6->saddr,
933 fl6->flowi6_oif, flags);
934 if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
935 rt = rt6_multipath_select(rt, fl6,
936 fl6->flowi6_oif, flags);
938 if (rt == net->ipv6.ip6_null_entry) {
939 fn = fib6_backtrack(fn, &fl6->saddr);
940 if (fn)
941 goto restart;
943 /* Search through exception table */
944 rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
945 if (rt_cache)
946 rt = rt_cache;
948 if (ip6_hold_safe(net, &rt, true))
949 dst_use_noref(&rt->dst, jiffies);
951 rcu_read_unlock();
953 trace_fib6_table_lookup(net, rt, table, fl6);
955 return rt;
959 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
960 int flags)
962 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
964 EXPORT_SYMBOL_GPL(ip6_route_lookup);
966 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
967 const struct in6_addr *saddr, int oif, int strict)
969 struct flowi6 fl6 = {
970 .flowi6_oif = oif,
971 .daddr = *daddr,
973 struct dst_entry *dst;
974 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
976 if (saddr) {
977 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
978 flags |= RT6_LOOKUP_F_HAS_SADDR;
981 dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
982 if (dst->error == 0)
983 return (struct rt6_info *) dst;
985 dst_release(dst);
987 return NULL;
989 EXPORT_SYMBOL(rt6_lookup);
991 /* ip6_ins_rt is called with FREE table->tb6_lock.
992 * It takes a new route entry; if the addition fails for any reason, the
993 * route is released.
994 * Caller must hold dst before calling it.
997 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
998 struct mx6_config *mxc,
999 struct netlink_ext_ack *extack)
1001 int err;
1002 struct fib6_table *table;
1004 table = rt->rt6i_table;
1005 spin_lock_bh(&table->tb6_lock);
1006 err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
1007 spin_unlock_bh(&table->tb6_lock);
1009 return err;
1012 int ip6_ins_rt(struct rt6_info *rt)
1014 struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
1015 struct mx6_config mxc = { .mx = NULL, };
1017 /* Hold dst to account for the reference from the fib6 tree */
1018 dst_hold(&rt->dst);
1019 return __ip6_ins_rt(rt, &info, &mxc, NULL);
1022 /* called with rcu_lock held */
1023 static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
1025 struct net_device *dev = rt->dst.dev;
1027 if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) {
1028 /* for copies of local routes, dst->dev needs to be the
1029 * device itself if it is a master device, the master device if the
1030 * device is enslaved, and the loopback device as the default
1032 if (netif_is_l3_slave(dev) &&
1033 !rt6_need_strict(&rt->rt6i_dst.addr))
1034 dev = l3mdev_master_dev_rcu(dev);
1035 else if (!netif_is_l3_master(dev))
1036 dev = dev_net(dev)->loopback_dev;
1037 /* last case is netif_is_l3_master(dev) is true in which
1038 * case we want dev returned to be dev
1039 */
1042 return dev;
1045 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
1046 const struct in6_addr *daddr,
1047 const struct in6_addr *saddr)
1049 struct net_device *dev;
1050 struct rt6_info *rt;
1052 /*
1053 * Clone the route.
1054 */
1056 if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
1057 ort = ort->from;
1059 rcu_read_lock();
1060 dev = ip6_rt_get_dev_rcu(ort);
1061 rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
1062 rcu_read_unlock();
1063 if (!rt)
1064 return NULL;
1066 ip6_rt_copy_init(rt, ort);
1067 rt->rt6i_flags |= RTF_CACHE;
1068 rt->rt6i_metric = 0;
1069 rt->dst.flags |= DST_HOST;
1070 rt->rt6i_dst.addr = *daddr;
1071 rt->rt6i_dst.plen = 128;
1073 if (!rt6_is_gw_or_nonexthop(ort)) {
1074 if (ort->rt6i_dst.plen != 128 &&
1075 ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
1076 rt->rt6i_flags |= RTF_ANYCAST;
1077 #ifdef CONFIG_IPV6_SUBTREES
1078 if (rt->rt6i_src.plen && saddr) {
1079 rt->rt6i_src.addr = *saddr;
1080 rt->rt6i_src.plen = 128;
1082 #endif
1085 return rt;
1088 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
1090 struct net_device *dev;
1091 struct rt6_info *pcpu_rt;
1093 rcu_read_lock();
1094 dev = ip6_rt_get_dev_rcu(rt);
1095 pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
1096 rcu_read_unlock();
1097 if (!pcpu_rt)
1098 return NULL;
1099 ip6_rt_copy_init(pcpu_rt, rt);
1100 pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1101 pcpu_rt->rt6i_flags |= RTF_PCPU;
1102 return pcpu_rt;
1105 /* It should be called with rcu_read_lock() acquired */
1106 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1108 struct rt6_info *pcpu_rt, **p;
1110 p = this_cpu_ptr(rt->rt6i_pcpu);
1111 pcpu_rt = *p;
1113 if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
1114 rt6_dst_from_metrics_check(pcpu_rt);
1116 return pcpu_rt;
1119 static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
1121 struct rt6_info *pcpu_rt, *prev, **p;
1123 pcpu_rt = ip6_rt_pcpu_alloc(rt);
1124 if (!pcpu_rt) {
1125 struct net *net = dev_net(rt->dst.dev);
1127 dst_hold(&net->ipv6.ip6_null_entry->dst);
1128 return net->ipv6.ip6_null_entry;
1131 dst_hold(&pcpu_rt->dst);
1132 p = this_cpu_ptr(rt->rt6i_pcpu);
1133 prev = cmpxchg(p, NULL, pcpu_rt);
1134 BUG_ON(prev);
1136 rt6_dst_from_metrics_check(pcpu_rt);
1137 return pcpu_rt;
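/* The cmpxchg() publishes the clone into this CPU's pcpu slot.  The
 * caller (see ip6_pol_route()) runs with BHs disabled after having
 * observed the slot empty, so nothing else can have installed an entry
 * on this CPU in between; a non-NULL prev would indicate a bug, hence
 * the BUG_ON().
 */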
1140 /* exception hash table implementation
1141 */
1142 static DEFINE_SPINLOCK(rt6_exception_lock);
1144 /* Remove rt6_ex from hash table and free the memory
1145 * Caller must hold rt6_exception_lock
1146 */
1147 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1148 struct rt6_exception *rt6_ex)
1150 struct net *net;
1152 if (!bucket || !rt6_ex)
1153 return;
1155 net = dev_net(rt6_ex->rt6i->dst.dev);
1156 rt6_ex->rt6i->rt6i_node = NULL;
1157 hlist_del_rcu(&rt6_ex->hlist);
1158 rt6_release(rt6_ex->rt6i);
1159 kfree_rcu(rt6_ex, rcu);
1160 WARN_ON_ONCE(!bucket->depth);
1161 bucket->depth--;
1162 net->ipv6.rt6_stats->fib_rt_cache--;
1165 /* Remove oldest rt6_ex in bucket and free the memory
1166 * Caller must hold rt6_exception_lock
1167 */
1168 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1170 struct rt6_exception *rt6_ex, *oldest = NULL;
1172 if (!bucket)
1173 return;
1175 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1176 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1177 oldest = rt6_ex;
1179 rt6_remove_exception(bucket, oldest);
1182 static u32 rt6_exception_hash(const struct in6_addr *dst,
1183 const struct in6_addr *src)
1185 static u32 seed __read_mostly;
1186 u32 val;
1188 net_get_random_once(&seed, sizeof(seed));
1189 val = jhash(dst, sizeof(*dst), seed);
1191 #ifdef CONFIG_IPV6_SUBTREES
1192 if (src)
1193 val = jhash(src, sizeof(*src), val);
1194 #endif
1195 return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
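/* Keying the jhash with a per-boot random seed (net_get_random_once())
 * means remote peers cannot predict bucket placement and flood a
 * single chain with crafted destinations; hash_32() then folds the
 * 32-bit value down to FIB6_EXCEPTION_BUCKET_SIZE_SHIFT bits to form
 * the bucket index.
 */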
1198 /* Helper function to find the cached rt in the hash table
1199 * and update bucket pointer to point to the bucket for this
1200 * (daddr, saddr) pair
1201 * Caller must hold rt6_exception_lock
1202 */
1203 static struct rt6_exception *
1204 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1205 const struct in6_addr *daddr,
1206 const struct in6_addr *saddr)
1208 struct rt6_exception *rt6_ex;
1209 u32 hval;
1211 if (!(*bucket) || !daddr)
1212 return NULL;
1214 hval = rt6_exception_hash(daddr, saddr);
1215 *bucket += hval;
1217 hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1218 struct rt6_info *rt6 = rt6_ex->rt6i;
1219 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1221 #ifdef CONFIG_IPV6_SUBTREES
1222 if (matched && saddr)
1223 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1224 #endif
1225 if (matched)
1226 return rt6_ex;
1228 return NULL;
1231 /* Helper function to find the cached rt in the hash table
1232 * and update bucket pointer to point to the bucket for this
1233 * (daddr, saddr) pair
1234 * Caller must hold rcu_read_lock()
1235 */
1236 static struct rt6_exception *
1237 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1238 const struct in6_addr *daddr,
1239 const struct in6_addr *saddr)
1241 struct rt6_exception *rt6_ex;
1242 u32 hval;
1244 WARN_ON_ONCE(!rcu_read_lock_held());
1246 if (!(*bucket) || !daddr)
1247 return NULL;
1249 hval = rt6_exception_hash(daddr, saddr);
1250 *bucket += hval;
1252 hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1253 struct rt6_info *rt6 = rt6_ex->rt6i;
1254 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1256 #ifdef CONFIG_IPV6_SUBTREES
1257 if (matched && saddr)
1258 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1259 #endif
1260 if (matched)
1261 return rt6_ex;
1263 return NULL;
1266 static int rt6_insert_exception(struct rt6_info *nrt,
1267 struct rt6_info *ort)
1269 struct net *net = dev_net(ort->dst.dev);
1270 struct rt6_exception_bucket *bucket;
1271 struct in6_addr *src_key = NULL;
1272 struct rt6_exception *rt6_ex;
1273 int err = 0;
1275 /* ort can't be a cache or pcpu route */
1276 if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
1277 ort = ort->from;
1278 WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));
1280 spin_lock_bh(&rt6_exception_lock);
1282 if (ort->exception_bucket_flushed) {
1283 err = -EINVAL;
1284 goto out;
1287 bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1288 lockdep_is_held(&rt6_exception_lock));
1289 if (!bucket) {
1290 bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1291 GFP_ATOMIC);
1292 if (!bucket) {
1293 err = -ENOMEM;
1294 goto out;
1296 rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1299 #ifdef CONFIG_IPV6_SUBTREES
1300 /* rt6i_src.plen != 0 indicates ort is in subtree
1301 * and exception table is indexed by a hash of
1302 * both rt6i_dst and rt6i_src.
1303 * Otherwise, the exception table is indexed by
1304 * a hash of only rt6i_dst.
1305 */
1306 if (ort->rt6i_src.plen)
1307 src_key = &nrt->rt6i_src.addr;
1308 #endif
1310 /* Update rt6i_prefsrc as it could be changed
1311 * in rt6_remove_prefsrc()
1312 */
1313 nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
1314 /* rt6_mtu_change() might lower mtu on ort.
1315 * Only insert this exception route if its mtu
1316 * is less than ort's mtu value.
1317 */
1318 if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
1319 err = -EINVAL;
1320 goto out;
1323 rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1324 src_key);
1325 if (rt6_ex)
1326 rt6_remove_exception(bucket, rt6_ex);
1328 rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1329 if (!rt6_ex) {
1330 err = -ENOMEM;
1331 goto out;
1333 rt6_ex->rt6i = nrt;
1334 rt6_ex->stamp = jiffies;
1335 atomic_inc(&nrt->rt6i_ref);
1336 nrt->rt6i_node = ort->rt6i_node;
1337 hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1338 bucket->depth++;
1339 net->ipv6.rt6_stats->fib_rt_cache++;
1341 if (bucket->depth > FIB6_MAX_DEPTH)
1342 rt6_exception_remove_oldest(bucket);
1344 out:
1345 spin_unlock_bh(&rt6_exception_lock);
1347 /* Update fn->fn_sernum to invalidate all cached dst */
1348 if (!err) {
1349 spin_lock_bh(&ort->rt6i_table->tb6_lock);
1350 fib6_update_sernum(ort);
1351 spin_unlock_bh(&ort->rt6i_table->tb6_lock);
1352 fib6_force_start_gc(net);
1355 return err;
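/* Exception entries are the RTF_CACHE clones learned from PMTU and
 * redirect events.  They hang off their parent route instead of the
 * fib tree: insertion replaces any previous entry for the same
 * (daddr[, saddr]) key, evicts the oldest entry once a bucket exceeds
 * FIB6_MAX_DEPTH, and bumps the node's sernum so dsts cached in
 * sockets fail their next ->check() and trigger a fresh lookup.
 */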
1358 void rt6_flush_exceptions(struct rt6_info *rt)
1360 struct rt6_exception_bucket *bucket;
1361 struct rt6_exception *rt6_ex;
1362 struct hlist_node *tmp;
1363 int i;
1365 spin_lock_bh(&rt6_exception_lock);
1366 /* Prevent rt6_insert_exception() from recreating the bucket list */
1367 rt->exception_bucket_flushed = 1;
1369 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1370 lockdep_is_held(&rt6_exception_lock));
1371 if (!bucket)
1372 goto out;
1374 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1375 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1376 rt6_remove_exception(bucket, rt6_ex);
1377 WARN_ON_ONCE(bucket->depth);
1378 bucket++;
1381 out:
1382 spin_unlock_bh(&rt6_exception_lock);
1385 /* Find cached rt in the hash table inside passed in rt
1386 * Caller has to hold rcu_read_lock()
1387 */
1388 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
1389 struct in6_addr *daddr,
1390 struct in6_addr *saddr)
1392 struct rt6_exception_bucket *bucket;
1393 struct in6_addr *src_key = NULL;
1394 struct rt6_exception *rt6_ex;
1395 struct rt6_info *res = NULL;
1397 bucket = rcu_dereference(rt->rt6i_exception_bucket);
1399 #ifdef CONFIG_IPV6_SUBTREES
1400 /* rt6i_src.plen != 0 indicates rt is in subtree
1401 * and exception table is indexed by a hash of
1402 * both rt6i_dst and rt6i_src.
1403 * Otherwise, the exception table is indexed by
1404 * a hash of only rt6i_dst.
1405 */
1406 if (rt->rt6i_src.plen)
1407 src_key = saddr;
1408 #endif
1409 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1411 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1412 res = rt6_ex->rt6i;
1414 return res;
1417 /* Remove the passed in cached rt from the hash table that contains it */
1418 int rt6_remove_exception_rt(struct rt6_info *rt)
1420 struct rt6_exception_bucket *bucket;
1421 struct rt6_info *from = rt->from;
1422 struct in6_addr *src_key = NULL;
1423 struct rt6_exception *rt6_ex;
1424 int err;
1426 if (!from ||
1427 !(rt->rt6i_flags & RTF_CACHE))
1428 return -EINVAL;
1430 if (!rcu_access_pointer(from->rt6i_exception_bucket))
1431 return -ENOENT;
1433 spin_lock_bh(&rt6_exception_lock);
1434 bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1435 lockdep_is_held(&rt6_exception_lock));
1436 #ifdef CONFIG_IPV6_SUBTREES
1437 /* rt6i_src.plen != 0 indicates 'from' is in subtree
1438 * and exception table is indexed by a hash of
1439 * both rt6i_dst and rt6i_src.
1440 * Otherwise, the exception table is indexed by
1441 * a hash of only rt6i_dst.
1442 */
1443 if (from->rt6i_src.plen)
1444 src_key = &rt->rt6i_src.addr;
1445 #endif
1446 rt6_ex = __rt6_find_exception_spinlock(&bucket,
1447 &rt->rt6i_dst.addr,
1448 src_key);
1449 if (rt6_ex) {
1450 rt6_remove_exception(bucket, rt6_ex);
1451 err = 0;
1452 } else {
1453 err = -ENOENT;
1456 spin_unlock_bh(&rt6_exception_lock);
1457 return err;
1460 /* Find rt6_ex which contains the passed in rt cache and
1461 * refresh its stamp
1462 */
1463 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1465 struct rt6_exception_bucket *bucket;
1466 struct rt6_info *from = rt->from;
1467 struct in6_addr *src_key = NULL;
1468 struct rt6_exception *rt6_ex;
1470 if (!from ||
1471 !(rt->rt6i_flags & RTF_CACHE))
1472 return;
1474 rcu_read_lock();
1475 bucket = rcu_dereference(from->rt6i_exception_bucket);
1477 #ifdef CONFIG_IPV6_SUBTREES
1478 /* rt6i_src.plen != 0 indicates 'from' is in subtree
1479 * and exception table is indexed by a hash of
1480 * both rt6i_dst and rt6i_src.
1481 * Otherwise, the exception table is indexed by
1482 * a hash of only rt6i_dst.
1483 */
1484 if (from->rt6i_src.plen)
1485 src_key = &rt->rt6i_src.addr;
1486 #endif
1487 rt6_ex = __rt6_find_exception_rcu(&bucket,
1488 &rt->rt6i_dst.addr,
1489 src_key);
1490 if (rt6_ex)
1491 rt6_ex->stamp = jiffies;
1493 rcu_read_unlock();
1496 static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
1498 struct rt6_exception_bucket *bucket;
1499 struct rt6_exception *rt6_ex;
1500 int i;
1502 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1503 lockdep_is_held(&rt6_exception_lock));
1505 if (bucket) {
1506 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1507 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1508 rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
1510 bucket++;
1515 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1516 struct rt6_info *rt, int mtu)
1518 /* If the new MTU is lower than the route PMTU, this new MTU will be the
1519 * lowest MTU in the path: always allow updating the route PMTU to
1520 * reflect PMTU decreases.
1522 * If the new MTU is higher, and the route PMTU is equal to the local
1523 * MTU, this means the old MTU is the lowest in the path, so allow
1524 * updating it: if other nodes now have lower MTUs, PMTU discovery will
1525 * handle this.
1526 */
1528 if (dst_mtu(&rt->dst) >= mtu)
1529 return true;
1531 if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1532 return true;
1534 return false;
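/* Worked example: if the link MTU is raised from 1500 to 9000, a
 * cached route whose PMTU is 1500 (equal to the old local MTU) may be
 * raised along with it, while one whose PMTU of 1280 was learned from
 * a remote Packet Too Big keeps its lower value.  A decrease of the
 * link MTU always propagates via the first test.
 */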
1537 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
1538 struct rt6_info *rt, int mtu)
1540 struct rt6_exception_bucket *bucket;
1541 struct rt6_exception *rt6_ex;
1542 int i;
1544 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1545 lockdep_is_held(&rt6_exception_lock));
1547 if (!bucket)
1548 return;
1550 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1551 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1552 struct rt6_info *entry = rt6_ex->rt6i;
1554 /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
1555 * route), the metrics of its rt->dst.from have already
1556 * been updated.
1557 */
1558 if (entry->rt6i_pmtu &&
1559 rt6_mtu_change_route_allowed(idev, entry, mtu))
1560 entry->rt6i_pmtu = mtu;
1562 bucket++;
1566 #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE)
1568 static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
1569 struct in6_addr *gateway)
1571 struct rt6_exception_bucket *bucket;
1572 struct rt6_exception *rt6_ex;
1573 struct hlist_node *tmp;
1574 int i;
1576 if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1577 return;
1579 spin_lock_bh(&rt6_exception_lock);
1580 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1581 lockdep_is_held(&rt6_exception_lock));
1583 if (bucket) {
1584 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1585 hlist_for_each_entry_safe(rt6_ex, tmp,
1586 &bucket->chain, hlist) {
1587 struct rt6_info *entry = rt6_ex->rt6i;
1589 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1590 RTF_CACHE_GATEWAY &&
1591 ipv6_addr_equal(gateway,
1592 &entry->rt6i_gateway)) {
1593 rt6_remove_exception(bucket, rt6_ex);
1596 bucket++;
1600 spin_unlock_bh(&rt6_exception_lock);
1603 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1604 struct rt6_exception *rt6_ex,
1605 struct fib6_gc_args *gc_args,
1606 unsigned long now)
1608 struct rt6_info *rt = rt6_ex->rt6i;
1610 /* we are pruning and obsoleting aged-out and non-gateway exceptions
1611 * even if others still have references to them, so that on the next
1612 * dst_check() such references can be dropped.
1613 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
1614 * expired, independently from their aging, as per RFC 8201 section 4
1615 */
1616 if (!(rt->rt6i_flags & RTF_EXPIRES)) {
1617 if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1618 RT6_TRACE("aging clone %p\n", rt);
1619 rt6_remove_exception(bucket, rt6_ex);
1620 return;
1622 } else if (time_after(jiffies, rt->dst.expires)) {
1623 RT6_TRACE("purging expired route %p\n", rt);
1624 rt6_remove_exception(bucket, rt6_ex);
1625 return;
1628 if (rt->rt6i_flags & RTF_GATEWAY) {
1629 struct neighbour *neigh;
1630 __u8 neigh_flags = 0;
1632 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
1633 if (neigh)
1634 neigh_flags = neigh->flags;
1636 if (!(neigh_flags & NTF_ROUTER)) {
1637 RT6_TRACE("purging route %p via non-router but gateway\n",
1638 rt);
1639 rt6_remove_exception(bucket, rt6_ex);
1640 return;
1644 gc_args->more++;
1647 void rt6_age_exceptions(struct rt6_info *rt,
1648 struct fib6_gc_args *gc_args,
1649 unsigned long now)
1651 struct rt6_exception_bucket *bucket;
1652 struct rt6_exception *rt6_ex;
1653 struct hlist_node *tmp;
1654 int i;
1656 if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1657 return;
1659 rcu_read_lock_bh();
1660 spin_lock(&rt6_exception_lock);
1661 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1662 lockdep_is_held(&rt6_exception_lock));
1664 if (bucket) {
1665 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1666 hlist_for_each_entry_safe(rt6_ex, tmp,
1667 &bucket->chain, hlist) {
1668 rt6_age_examine_exception(bucket, rt6_ex,
1669 gc_args, now);
1671 bucket++;
1674 spin_unlock(&rt6_exception_lock);
1675 rcu_read_unlock_bh();
1678 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1679 int oif, struct flowi6 *fl6, int flags)
1681 struct fib6_node *fn, *saved_fn;
1682 struct rt6_info *rt, *rt_cache;
1683 int strict = 0;
1685 strict |= flags & RT6_LOOKUP_F_IFACE;
1686 strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1687 if (net->ipv6.devconf_all->forwarding == 0)
1688 strict |= RT6_LOOKUP_F_REACHABLE;
1690 rcu_read_lock();
1692 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1693 saved_fn = fn;
1695 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1696 oif = 0;
1698 redo_rt6_select:
1699 rt = rt6_select(net, fn, oif, strict);
1700 if (rt->rt6i_nsiblings)
1701 rt = rt6_multipath_select(rt, fl6, oif, strict);
1702 if (rt == net->ipv6.ip6_null_entry) {
1703 fn = fib6_backtrack(fn, &fl6->saddr);
1704 if (fn)
1705 goto redo_rt6_select;
1706 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1707 /* also consider unreachable route */
1708 strict &= ~RT6_LOOKUP_F_REACHABLE;
1709 fn = saved_fn;
1710 goto redo_rt6_select;
1714 /* Search through exception table */
1715 rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
1716 if (rt_cache)
1717 rt = rt_cache;
1719 if (rt == net->ipv6.ip6_null_entry) {
1720 rcu_read_unlock();
1721 dst_hold(&rt->dst);
1722 trace_fib6_table_lookup(net, rt, table, fl6);
1723 return rt;
1724 } else if (rt->rt6i_flags & RTF_CACHE) {
1725 if (ip6_hold_safe(net, &rt, true)) {
1726 dst_use_noref(&rt->dst, jiffies);
1727 rt6_dst_from_metrics_check(rt);
1729 rcu_read_unlock();
1730 trace_fib6_table_lookup(net, rt, table, fl6);
1731 return rt;
1732 } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1733 !(rt->rt6i_flags & RTF_GATEWAY))) {
1734 /* Create a RTF_CACHE clone which will not be
1735 * owned by the fib6 tree. It is for the special case where
1736 * the daddr in the skb during the neighbor look-up is different
1737 * from the fl6->daddr used to look up the route here.
1738 */
1740 struct rt6_info *uncached_rt;
1742 if (ip6_hold_safe(net, &rt, true)) {
1743 dst_use_noref(&rt->dst, jiffies);
1744 } else {
1745 rcu_read_unlock();
1746 uncached_rt = rt;
1747 goto uncached_rt_out;
1749 rcu_read_unlock();
1751 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1752 dst_release(&rt->dst);
1754 if (uncached_rt) {
1755 /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1756 * No need for another dst_hold()
1757 */
1758 rt6_uncached_list_add(uncached_rt);
1759 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1760 } else {
1761 uncached_rt = net->ipv6.ip6_null_entry;
1762 dst_hold(&uncached_rt->dst);
1765 uncached_rt_out:
1766 trace_fib6_table_lookup(net, uncached_rt, table, fl6);
1767 return uncached_rt;
1769 } else {
1770 /* Get a percpu copy */
1772 struct rt6_info *pcpu_rt;
1774 dst_use_noref(&rt->dst, jiffies);
1775 local_bh_disable();
1776 pcpu_rt = rt6_get_pcpu_route(rt);
1778 if (!pcpu_rt) {
1779 /* atomic_inc_not_zero() is needed when using rcu */
1780 if (atomic_inc_not_zero(&rt->rt6i_ref)) {
1781 /* No dst_hold() on rt is needed because grabbing
1782 * rt->rt6i_ref makes sure rt can't be released.
1783 */
1784 pcpu_rt = rt6_make_pcpu_route(rt);
1785 rt6_release(rt);
1786 } else {
1787 /* rt is already removed from tree */
1788 pcpu_rt = net->ipv6.ip6_null_entry;
1789 dst_hold(&pcpu_rt->dst);
1792 local_bh_enable();
1793 rcu_read_unlock();
1794 trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
1795 return pcpu_rt;
1798 EXPORT_SYMBOL_GPL(ip6_pol_route);
1800 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1801 struct flowi6 *fl6, int flags)
1803 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1806 struct dst_entry *ip6_route_input_lookup(struct net *net,
1807 struct net_device *dev,
1808 struct flowi6 *fl6, int flags)
1810 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1811 flags |= RT6_LOOKUP_F_IFACE;
1813 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1815 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1817 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1818 struct flow_keys *keys)
1820 const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1821 const struct ipv6hdr *key_iph = outer_iph;
1822 const struct ipv6hdr *inner_iph;
1823 const struct icmp6hdr *icmph;
1824 struct ipv6hdr _inner_iph;
1825 struct icmp6hdr _icmph;
1827 if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1828 goto out;
1830 icmph = skb_header_pointer(skb, skb_transport_offset(skb),
1831 sizeof(_icmph), &_icmph);
1832 if (!icmph)
1833 goto out;
1835 if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1836 icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1837 icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1838 icmph->icmp6_type != ICMPV6_PARAMPROB)
1839 goto out;
1841 inner_iph = skb_header_pointer(skb,
1842 skb_transport_offset(skb) + sizeof(*icmph),
1843 sizeof(_inner_iph), &_inner_iph);
1844 if (!inner_iph)
1845 goto out;
1847 key_iph = inner_iph;
1848 out:
1849 memset(keys, 0, sizeof(*keys));
1850 keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1851 keys->addrs.v6addrs.src = key_iph->saddr;
1852 keys->addrs.v6addrs.dst = key_iph->daddr;
1853 keys->tags.flow_label = ip6_flowinfo(key_iph);
1854 keys->basic.ip_proto = key_iph->nexthdr;
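/* This structure is what the uninit-value fix in the subject line
 * relies on: key_iph starts out pointing at the always-present outer
 * header, and every bail-out path -- non-ICMPv6 packet, truncated
 * ICMPv6 header, non-error ICMPv6 type, truncated inner header --
 * jumps to "out", so the flow keys are always filled from a fully
 * validated header and never from uninitialized data.
 */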
1857 /* if skb is set it will be used and fl6 can be NULL */
1858 u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
1860 struct flow_keys hash_keys;
1862 if (skb) {
1863 ip6_multipath_l3_keys(skb, &hash_keys);
1864 return flow_hash_from_keys(&hash_keys) >> 1;
1867 return get_hash_from_flowi6(fl6) >> 1;
1870 void ip6_route_input(struct sk_buff *skb)
1872 const struct ipv6hdr *iph = ipv6_hdr(skb);
1873 struct net *net = dev_net(skb->dev);
1874 int flags = RT6_LOOKUP_F_HAS_SADDR;
1875 struct ip_tunnel_info *tun_info;
1876 struct flowi6 fl6 = {
1877 .flowi6_iif = skb->dev->ifindex,
1878 .daddr = iph->daddr,
1879 .saddr = iph->saddr,
1880 .flowlabel = ip6_flowinfo(iph),
1881 .flowi6_mark = skb->mark,
1882 .flowi6_proto = iph->nexthdr,
1885 tun_info = skb_tunnel_info(skb);
1886 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1887 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1888 if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
1889 fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
1890 skb_dst_drop(skb);
1891 skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1894 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1895 struct flowi6 *fl6, int flags)
1897 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1900 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1901 struct flowi6 *fl6, int flags)
1903 bool any_src;
1905 if (rt6_need_strict(&fl6->daddr)) {
1906 struct dst_entry *dst;
1908 dst = l3mdev_link_scope_lookup(net, fl6);
1909 if (dst)
1910 return dst;
1913 fl6->flowi6_iif = LOOPBACK_IFINDEX;
1915 any_src = ipv6_addr_any(&fl6->saddr);
1916 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1917 (fl6->flowi6_oif && any_src))
1918 flags |= RT6_LOOKUP_F_IFACE;
1920 if (!any_src)
1921 flags |= RT6_LOOKUP_F_HAS_SADDR;
1922 else if (sk)
1923 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1925 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1927 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1929 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1931 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1932 struct net_device *loopback_dev = net->loopback_dev;
1933 struct dst_entry *new = NULL;
1935 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
1936 DST_OBSOLETE_DEAD, 0);
1937 if (rt) {
1938 rt6_info_init(rt);
1939 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
1941 new = &rt->dst;
1942 new->__use = 1;
1943 new->input = dst_discard;
1944 new->output = dst_discard_out;
1946 dst_copy_metrics(new, &ort->dst);
1948 rt->rt6i_idev = in6_dev_get(loopback_dev);
1949 rt->rt6i_gateway = ort->rt6i_gateway;
1950 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
1951 rt->rt6i_metric = 0;
1953 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1954 #ifdef CONFIG_IPV6_SUBTREES
1955 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1956 #endif
1959 dst_release(dst_orig);
1960 return new ? new : ERR_PTR(-ENOMEM);
1963 /*
1964 * Destination cache support functions
1965 */
1967 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1969 if (rt->from &&
1970 dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(&rt->from->dst))
1971 dst_init_metrics(&rt->dst, dst_metrics_ptr(&rt->from->dst), true);
1974 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1976 u32 rt_cookie = 0;
1978 if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
1979 return NULL;
1981 if (rt6_check_expired(rt))
1982 return NULL;
1984 return &rt->dst;
1987 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1989 if (!__rt6_check_expired(rt) &&
1990 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1991 rt6_check(rt->from, cookie))
1992 return &rt->dst;
1993 else
1994 return NULL;
1997 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1999 struct rt6_info *rt;
2001 rt = (struct rt6_info *) dst;
2003 /* All IPV6 dsts are created with ->obsolete set to the value
2004 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2005 * into this function always.
2006 */
2008 rt6_dst_from_metrics_check(rt);
2010 if (rt->rt6i_flags & RTF_PCPU ||
2011 (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from))
2012 return rt6_dst_from_check(rt, cookie);
2013 else
2014 return rt6_check(rt, cookie);
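/* The cookie is the fn_sernum snapshot taken when the dst was handed
 * out.  Any later change to the subtree -- a route or exception insert
 * (fib6_update_sernum()), or ip6_link_failure() poisoning fn_sernum --
 * makes rt6_get_cookie_safe() disagree with the stored cookie, so
 * socket-cached dsts are dropped and looked up afresh.
 * DST_OBSOLETE_FORCE_CHK guarantees this path is always consulted.
 */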
2017 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2019 struct rt6_info *rt = (struct rt6_info *) dst;
2021 if (rt) {
2022 if (rt->rt6i_flags & RTF_CACHE) {
2023 if (rt6_check_expired(rt)) {
2024 ip6_del_rt(rt);
2025 dst = NULL;
2027 } else {
2028 dst_release(dst);
2029 dst = NULL;
2032 return dst;
2035 static void ip6_link_failure(struct sk_buff *skb)
2037 struct rt6_info *rt;
2039 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2041 rt = (struct rt6_info *) skb_dst(skb);
2042 if (rt) {
2043 if (rt->rt6i_flags & RTF_CACHE) {
2044 if (dst_hold_safe(&rt->dst))
2045 ip6_del_rt(rt);
2046 } else {
2047 struct fib6_node *fn;
2049 rcu_read_lock();
2050 fn = rcu_dereference(rt->rt6i_node);
2051 if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2052 fn->fn_sernum = -1;
2053 rcu_read_unlock();
2058 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2060 struct net *net = dev_net(rt->dst.dev);
2062 rt->rt6i_flags |= RTF_MODIFIED;
2063 rt->rt6i_pmtu = mtu;
2064 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2067 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2069 return !(rt->rt6i_flags & RTF_CACHE) &&
2070 (rt->rt6i_flags & RTF_PCPU ||
2071 rcu_access_pointer(rt->rt6i_node));
2074 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2075 const struct ipv6hdr *iph, u32 mtu)
2077 const struct in6_addr *daddr, *saddr;
2078 struct rt6_info *rt6 = (struct rt6_info *)dst;
2080 if (rt6->rt6i_flags & RTF_LOCAL)
2081 return;
2083 if (dst_metric_locked(dst, RTAX_MTU))
2084 return;
2086 if (iph) {
2087 daddr = &iph->daddr;
2088 saddr = &iph->saddr;
2089 } else if (sk) {
2090 daddr = &sk->sk_v6_daddr;
2091 saddr = &inet6_sk(sk)->saddr;
2092 } else {
2093 daddr = NULL;
2094 saddr = NULL;
2096 dst_confirm_neigh(dst, daddr);
2097 mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2098 if (mtu >= dst_mtu(dst))
2099 return;
2101 if (!rt6_cache_allowed_for_pmtu(rt6)) {
2102 rt6_do_update_pmtu(rt6, mtu);
2103 /* update rt6_ex->stamp for cache */
2104 if (rt6->rt6i_flags & RTF_CACHE)
2105 rt6_update_exception_stamp_rt(rt6);
2106 } else if (daddr) {
2107 struct rt6_info *nrt6;
2109 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
2110 if (nrt6) {
2111 rt6_do_update_pmtu(nrt6, mtu);
2112 if (rt6_insert_exception(nrt6, rt6))
2113 dst_release_immediate(&nrt6->dst);
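/* PMTU updates are only accepted downwards, clamped to IPV6_MIN_MTU
 * (1280).  Entries that are already cache or uncached clones are
 * updated in place (a cached clone also refreshes its exception
 * stamp); a tree or pcpu route is left untouched and an RTF_CACHE
 * exception clone carrying the new PMTU is inserted for this
 * (daddr, saddr) pair instead.
 */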
2118 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2119 struct sk_buff *skb, u32 mtu)
2121 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2124 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2125 int oif, u32 mark, kuid_t uid)
2127 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2128 struct dst_entry *dst;
2129 struct flowi6 fl6;
2131 memset(&fl6, 0, sizeof(fl6));
2132 fl6.flowi6_oif = oif;
2133 fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2134 fl6.daddr = iph->daddr;
2135 fl6.saddr = iph->saddr;
2136 fl6.flowlabel = ip6_flowinfo(iph);
2137 fl6.flowi6_uid = uid;
2139 dst = ip6_route_output(net, NULL, &fl6);
2140 if (!dst->error)
2141 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2142 dst_release(dst);
2144 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2146 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2148 struct dst_entry *dst;
2150 ip6_update_pmtu(skb, sock_net(sk), mtu,
2151 sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
2153 dst = __sk_dst_get(sk);
2154 if (!dst || !dst->obsolete ||
2155 dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2156 return;
2158 bh_lock_sock(sk);
2159 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2160 ip6_datagram_dst_update(sk, false);
2161 bh_unlock_sock(sk);
2163 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2165 /* Handle redirects */
2166 struct ip6rd_flowi {
2167 struct flowi6 fl6;
2168 struct in6_addr gateway;
2171 static struct rt6_info *__ip6_route_redirect(struct net *net,
2172 struct fib6_table *table,
2173 struct flowi6 *fl6,
2174 int flags)
2176 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2177 struct rt6_info *rt, *rt_cache;
2178 struct fib6_node *fn;
2180 /* Get the "current" route for this destination and
2181 * check if the redirect has come from the appropriate router.
2183 * RFC 4861 specifies that redirects should only be
2184 * accepted if they come from the nexthop to the target.
2185 * Due to the way the routes are chosen, this notion
2186 * is a bit fuzzy and one might need to check all possible
2187 * routes.
2190 rcu_read_lock();
2191 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2192 restart:
2193 for_each_fib6_node_rt_rcu(fn) {
2194 if (rt->rt6i_nh_flags & RTNH_F_DEAD)
2195 continue;
2196 if (rt6_check_expired(rt))
2197 continue;
2198 if (rt->dst.error)
2199 break;
2200 if (!(rt->rt6i_flags & RTF_GATEWAY))
2201 continue;
2202 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
2203 continue;
2204 /* rt_cache's gateway might be different from its 'parent'
2205 * in the case of an ip redirect.
2206 * So we keep searching in the exception table if the gateway
2207 * is different.
2209 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
2210 rt_cache = rt6_find_cached_rt(rt,
2211 &fl6->daddr,
2212 &fl6->saddr);
2213 if (rt_cache &&
2214 ipv6_addr_equal(&rdfl->gateway,
2215 &rt_cache->rt6i_gateway)) {
2216 rt = rt_cache;
2217 break;
2219 continue;
2221 break;
2224 if (!rt)
2225 rt = net->ipv6.ip6_null_entry;
2226 else if (rt->dst.error) {
2227 rt = net->ipv6.ip6_null_entry;
2228 goto out;
2231 if (rt == net->ipv6.ip6_null_entry) {
2232 fn = fib6_backtrack(fn, &fl6->saddr);
2233 if (fn)
2234 goto restart;
2237 out:
2238 ip6_hold_safe(net, &rt, true);
2240 rcu_read_unlock();
2242 trace_fib6_table_lookup(net, rt, table, fl6);
2243 return rt;
2246 static struct dst_entry *ip6_route_redirect(struct net *net,
2247 const struct flowi6 *fl6,
2248 const struct in6_addr *gateway)
2250 int flags = RT6_LOOKUP_F_HAS_SADDR;
2251 struct ip6rd_flowi rdfl;
2253 rdfl.fl6 = *fl6;
2254 rdfl.gateway = *gateway;
2256 return fib6_rule_lookup(net, &rdfl.fl6,
2257 flags, __ip6_route_redirect);
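/* Note: struct ip6rd_flowi embeds the redirecting router's address
 * right after the flowi6, so __ip6_route_redirect() can recover it
 * with a simple container cast of the flowi6 pointer passed through
 * fib6_rule_lookup().
 */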
2260 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2261 kuid_t uid)
2263 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2264 struct dst_entry *dst;
2265 struct flowi6 fl6;
2267 memset(&fl6, 0, sizeof(fl6));
2268 fl6.flowi6_iif = LOOPBACK_IFINDEX;
2269 fl6.flowi6_oif = oif;
2270 fl6.flowi6_mark = mark;
2271 fl6.daddr = iph->daddr;
2272 fl6.saddr = iph->saddr;
2273 fl6.flowlabel = ip6_flowinfo(iph);
2274 fl6.flowi6_uid = uid;
2276 dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
2277 rt6_do_redirect(dst, NULL, skb);
2278 dst_release(dst);
2280 EXPORT_SYMBOL_GPL(ip6_redirect);
2282 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2283 u32 mark)
2285 const struct ipv6hdr *iph = ipv6_hdr(skb);
2286 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2287 struct dst_entry *dst;
2288 struct flowi6 fl6;
2290 memset(&fl6, 0, sizeof(fl6));
2291 fl6.flowi6_iif = LOOPBACK_IFINDEX;
2292 fl6.flowi6_oif = oif;
2293 fl6.flowi6_mark = mark;
2294 fl6.daddr = msg->dest;
2295 fl6.saddr = iph->daddr;
2296 fl6.flowi6_uid = sock_net_uid(net, NULL);
2298 dst = ip6_route_redirect(net, &fl6, &iph->saddr);
2299 rt6_do_redirect(dst, NULL, skb);
2300 dst_release(dst);
2303 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2305 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2306 sk->sk_uid);
2308 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2310 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2312 struct net_device *dev = dst->dev;
2313 unsigned int mtu = dst_mtu(dst);
2314 struct net *net = dev_net(dev);
2316 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2318 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2319 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2322 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and the
2323 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2324 * IPV6_MAXPLEN is also valid and means: "any MSS,
2325 * rely only on pmtu discovery"
2327 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2328 mtu = IPV6_MAXPLEN;
2329 return mtu;
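/* Worked example (assuming the usual 40-byte IPv6 and 20-byte TCP
 * headers): a 1500-byte link MTU advertises an MSS of
 * 1500 - 40 - 20 = 1440 bytes, subject to the sysctl floor above.
 */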
2332 static unsigned int ip6_mtu(const struct dst_entry *dst)
2334 const struct rt6_info *rt = (const struct rt6_info *)dst;
2335 unsigned int mtu = rt->rt6i_pmtu;
2336 struct inet6_dev *idev;
2338 if (mtu)
2339 goto out;
2341 mtu = dst_metric_raw(dst, RTAX_MTU);
2342 if (mtu)
2343 goto out;
2345 mtu = IPV6_MIN_MTU;
2347 rcu_read_lock();
2348 idev = __in6_dev_get(dst->dev);
2349 if (idev)
2350 mtu = idev->cnf.mtu6;
2351 rcu_read_unlock();
2353 out:
2354 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2356 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
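/* Note: the effective MTU is resolved in order of preference -
 * learned PMTU (rt6i_pmtu), then the RTAX_MTU route metric, then the
 * device's IPv6 MTU (cnf.mtu6) - clamped to IP6_MAX_MTU and reduced
 * by any lightweight-tunnel encapsulation headroom.
 */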
2359 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2360 struct flowi6 *fl6)
2362 struct dst_entry *dst;
2363 struct rt6_info *rt;
2364 struct inet6_dev *idev = in6_dev_get(dev);
2365 struct net *net = dev_net(dev);
2367 if (unlikely(!idev))
2368 return ERR_PTR(-ENODEV);
2370 rt = ip6_dst_alloc(net, dev, 0);
2371 if (unlikely(!rt)) {
2372 in6_dev_put(idev);
2373 dst = ERR_PTR(-ENOMEM);
2374 goto out;
2377 rt->dst.flags |= DST_HOST;
2378 rt->dst.input = ip6_input;
2379 rt->dst.output = ip6_output;
2380 rt->rt6i_gateway = fl6->daddr;
2381 rt->rt6i_dst.addr = fl6->daddr;
2382 rt->rt6i_dst.plen = 128;
2383 rt->rt6i_idev = idev;
2384 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2386 /* Add this dst into uncached_list so that rt6_disable_ip() can
2387 * properly release the net_device
2389 rt6_uncached_list_add(rt);
2390 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2392 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2394 out:
2395 return dst;
2398 static int ip6_dst_gc(struct dst_ops *ops)
2400 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2401 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2402 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2403 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2404 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2405 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2406 int entries;
2408 entries = dst_entries_get_fast(ops);
2409 if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2410 entries <= rt_max_size)
2411 goto out;
2413 net->ipv6.ip6_rt_gc_expire++;
2414 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2415 entries = dst_entries_get_slow(ops);
2416 if (entries < ops->gc_thresh)
2417 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2418 out:
2419 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2420 return entries > rt_max_size;
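/* Note: GC is skipped while the minimum interval has not elapsed and
 * the entry count is at or below ip6_rt_max_size; otherwise
 * ip6_rt_gc_expire is bumped and fib6_run_gc() invoked (the horizon
 * resets to half the gc timeout once usage drops below gc_thresh).
 * Either way the expire value decays by the elasticity shift, and the
 * return value reports whether the table is still over rt_max_size.
 */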
2423 static int ip6_convert_metrics(struct mx6_config *mxc,
2424 const struct fib6_config *cfg)
2426 struct net *net = cfg->fc_nlinfo.nl_net;
2427 bool ecn_ca = false;
2428 struct nlattr *nla;
2429 int remaining;
2430 u32 *mp;
2432 if (!cfg->fc_mx)
2433 return 0;
2435 mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
2436 if (unlikely(!mp))
2437 return -ENOMEM;
2439 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
2440 int type = nla_type(nla);
2441 u32 val;
2443 if (!type)
2444 continue;
2445 if (unlikely(type > RTAX_MAX))
2446 goto err;
2448 if (type == RTAX_CC_ALGO) {
2449 char tmp[TCP_CA_NAME_MAX];
2451 nla_strlcpy(tmp, nla, sizeof(tmp));
2452 val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca);
2453 if (val == TCP_CA_UNSPEC)
2454 goto err;
2455 } else {
2456 val = nla_get_u32(nla);
2458 if (type == RTAX_HOPLIMIT && val > 255)
2459 val = 255;
2460 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
2461 goto err;
2463 mp[type - 1] = val;
2464 __set_bit(type - 1, mxc->mx_valid);
2467 if (ecn_ca) {
2468 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
2469 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
2472 mxc->mx = mp;
2473 return 0;
2474 err:
2475 kfree(mp);
2476 return -EINVAL;
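/* Note: metrics land in a 0-indexed array, i.e. mp[type - 1], with a
 * matching bit set in mxc->mx_valid. For example, an RTAX_HOPLIMIT
 * attribute of 64 ends up as mp[RTAX_HOPLIMIT - 1] = 64; values above
 * 255 are clamped since hop limit is an 8-bit field.
 */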
2479 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2480 struct fib6_config *cfg,
2481 const struct in6_addr *gw_addr,
2482 u32 tbid, int flags)
2484 struct flowi6 fl6 = {
2485 .flowi6_oif = cfg->fc_ifindex,
2486 .daddr = *gw_addr,
2487 .saddr = cfg->fc_prefsrc,
2489 struct fib6_table *table;
2490 struct rt6_info *rt;
2492 table = fib6_get_table(net, tbid);
2493 if (!table)
2494 return NULL;
2496 if (!ipv6_addr_any(&cfg->fc_prefsrc))
2497 flags |= RT6_LOOKUP_F_HAS_SADDR;
2499 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2500 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
2502 /* if table lookup failed, fall back to full lookup */
2503 if (rt == net->ipv6.ip6_null_entry) {
2504 ip6_rt_put(rt);
2505 rt = NULL;
2508 return rt;
2511 static int ip6_route_check_nh_onlink(struct net *net,
2512 struct fib6_config *cfg,
2513 struct net_device *dev,
2514 struct netlink_ext_ack *extack)
2516 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2517 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2518 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2519 struct rt6_info *grt;
2520 int err;
2522 err = 0;
2523 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2524 if (grt) {
2525 if (!grt->dst.error &&
2526 (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2527 NL_SET_ERR_MSG(extack,
2528 "Nexthop has invalid gateway or device mismatch");
2529 err = -EINVAL;
2532 ip6_rt_put(grt);
2535 return err;
2538 static int ip6_route_check_nh(struct net *net,
2539 struct fib6_config *cfg,
2540 struct net_device **_dev,
2541 struct inet6_dev **idev)
2543 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2544 struct net_device *dev = _dev ? *_dev : NULL;
2545 struct rt6_info *grt = NULL;
2546 int err = -EHOSTUNREACH;
2548 if (cfg->fc_table) {
2549 int flags = RT6_LOOKUP_F_IFACE;
2551 grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2552 cfg->fc_table, flags);
2553 if (grt) {
2554 if (grt->rt6i_flags & RTF_GATEWAY ||
2555 (dev && dev != grt->dst.dev)) {
2556 ip6_rt_put(grt);
2557 grt = NULL;
2562 if (!grt)
2563 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
2565 if (!grt)
2566 goto out;
2568 if (dev) {
2569 if (dev != grt->dst.dev) {
2570 ip6_rt_put(grt);
2571 goto out;
2573 } else {
2574 *_dev = dev = grt->dst.dev;
2575 *idev = grt->rt6i_idev;
2576 dev_hold(dev);
2577 in6_dev_hold(grt->rt6i_idev);
2580 if (!(grt->rt6i_flags & RTF_GATEWAY))
2581 err = 0;
2583 ip6_rt_put(grt);
2585 out:
2586 return err;
2589 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
2590 struct netlink_ext_ack *extack)
2592 struct net *net = cfg->fc_nlinfo.nl_net;
2593 struct rt6_info *rt = NULL;
2594 struct net_device *dev = NULL;
2595 struct inet6_dev *idev = NULL;
2596 struct fib6_table *table;
2597 int addr_type;
2598 int err = -EINVAL;
2600 /* RTF_PCPU is an internal flag; cannot be set by userspace */
2601 if (cfg->fc_flags & RTF_PCPU) {
2602 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2603 goto out;
2606 /* RTF_CACHE is an internal flag; cannot be set by userspace */
2607 if (cfg->fc_flags & RTF_CACHE) {
2608 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2609 goto out;
2612 if (cfg->fc_dst_len > 128) {
2613 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2614 goto out;
2616 if (cfg->fc_src_len > 128) {
2617 NL_SET_ERR_MSG(extack, "Invalid source address length");
2618 goto out;
2620 #ifndef CONFIG_IPV6_SUBTREES
2621 if (cfg->fc_src_len) {
2622 NL_SET_ERR_MSG(extack,
2623 "Specifying source address requires IPV6_SUBTREES to be enabled");
2624 goto out;
2626 #endif
2627 if (cfg->fc_ifindex) {
2628 err = -ENODEV;
2629 dev = dev_get_by_index(net, cfg->fc_ifindex);
2630 if (!dev)
2631 goto out;
2632 idev = in6_dev_get(dev);
2633 if (!idev)
2634 goto out;
2637 if (cfg->fc_metric == 0)
2638 cfg->fc_metric = IP6_RT_PRIO_USER;
2640 if (cfg->fc_flags & RTNH_F_ONLINK) {
2641 if (!dev) {
2642 NL_SET_ERR_MSG(extack,
2643 "Nexthop device required for onlink");
2644 err = -ENODEV;
2645 goto out;
2648 if (!(dev->flags & IFF_UP)) {
2649 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2650 err = -ENETDOWN;
2651 goto out;
2655 err = -ENOBUFS;
2656 if (cfg->fc_nlinfo.nlh &&
2657 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2658 table = fib6_get_table(net, cfg->fc_table);
2659 if (!table) {
2660 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2661 table = fib6_new_table(net, cfg->fc_table);
2663 } else {
2664 table = fib6_new_table(net, cfg->fc_table);
2667 if (!table)
2668 goto out;
2670 rt = ip6_dst_alloc(net, NULL,
2671 (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
2673 if (!rt) {
2674 err = -ENOMEM;
2675 goto out;
2678 if (cfg->fc_flags & RTF_EXPIRES)
2679 rt6_set_expires(rt, jiffies +
2680 clock_t_to_jiffies(cfg->fc_expires));
2681 else
2682 rt6_clean_expires(rt);
2684 if (cfg->fc_protocol == RTPROT_UNSPEC)
2685 cfg->fc_protocol = RTPROT_BOOT;
2686 rt->rt6i_protocol = cfg->fc_protocol;
2688 addr_type = ipv6_addr_type(&cfg->fc_dst);
2690 if (addr_type & IPV6_ADDR_MULTICAST)
2691 rt->dst.input = ip6_mc_input;
2692 else if (cfg->fc_flags & RTF_LOCAL)
2693 rt->dst.input = ip6_input;
2694 else
2695 rt->dst.input = ip6_forward;
2697 rt->dst.output = ip6_output;
2699 if (cfg->fc_encap) {
2700 struct lwtunnel_state *lwtstate;
2702 err = lwtunnel_build_state(cfg->fc_encap_type,
2703 cfg->fc_encap, AF_INET6, cfg,
2704 &lwtstate, extack);
2705 if (err)
2706 goto out;
2707 rt->dst.lwtstate = lwtstate_get(lwtstate);
2708 if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
2709 rt->dst.lwtstate->orig_output = rt->dst.output;
2710 rt->dst.output = lwtunnel_output;
2712 if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
2713 rt->dst.lwtstate->orig_input = rt->dst.input;
2714 rt->dst.input = lwtunnel_input;
2718 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
2719 rt->rt6i_dst.plen = cfg->fc_dst_len;
2720 if (rt->rt6i_dst.plen == 128)
2721 rt->dst.flags |= DST_HOST;
2723 #ifdef CONFIG_IPV6_SUBTREES
2724 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
2725 rt->rt6i_src.plen = cfg->fc_src_len;
2726 #endif
2728 rt->rt6i_metric = cfg->fc_metric;
2729 rt->rt6i_nh_weight = 1;
2731 /* We cannot add true routes via loopback here,
2732 they would result in kernel looping; promote them to reject routes
2734 if ((cfg->fc_flags & RTF_REJECT) ||
2735 (dev && (dev->flags & IFF_LOOPBACK) &&
2736 !(addr_type & IPV6_ADDR_LOOPBACK) &&
2737 !(cfg->fc_flags & RTF_LOCAL))) {
2738 /* hold loopback dev/idev if we haven't done so. */
2739 if (dev != net->loopback_dev) {
2740 if (dev) {
2741 dev_put(dev);
2742 in6_dev_put(idev);
2744 dev = net->loopback_dev;
2745 dev_hold(dev);
2746 idev = in6_dev_get(dev);
2747 if (!idev) {
2748 err = -ENODEV;
2749 goto out;
2752 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
2753 switch (cfg->fc_type) {
2754 case RTN_BLACKHOLE:
2755 rt->dst.error = -EINVAL;
2756 rt->dst.output = dst_discard_out;
2757 rt->dst.input = dst_discard;
2758 break;
2759 case RTN_PROHIBIT:
2760 rt->dst.error = -EACCES;
2761 rt->dst.output = ip6_pkt_prohibit_out;
2762 rt->dst.input = ip6_pkt_prohibit;
2763 break;
2764 case RTN_THROW:
2765 case RTN_UNREACHABLE:
2766 default:
2767 rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
2768 : (cfg->fc_type == RTN_UNREACHABLE)
2769 ? -EHOSTUNREACH : -ENETUNREACH;
2770 rt->dst.output = ip6_pkt_discard_out;
2771 rt->dst.input = ip6_pkt_discard;
2772 break;
2774 goto install_route;
2777 if (cfg->fc_flags & RTF_GATEWAY) {
2778 const struct in6_addr *gw_addr;
2779 int gwa_type;
2781 gw_addr = &cfg->fc_gateway;
2782 gwa_type = ipv6_addr_type(gw_addr);
2784 /* if gw_addr is local we will fail to detect this in case the
2785 * address is still TENTATIVE (DAD in progress). rt6_lookup()
2786 * will return the already-added prefix route via the interface
2787 * that the prefix route was assigned to, which might be non-loopback.
2789 err = -EINVAL;
2790 if (ipv6_chk_addr_and_flags(net, gw_addr,
2791 gwa_type & IPV6_ADDR_LINKLOCAL ?
2792 dev : NULL, 0, 0)) {
2793 NL_SET_ERR_MSG(extack, "Invalid gateway address");
2794 goto out;
2796 rt->rt6i_gateway = *gw_addr;
2798 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
2799 /* IPv6 strictly inhibits using non-link-local
2800 addresses as nexthop addresses.
2801 Otherwise, a router will not be able to send redirects.
2802 That is usually desirable, but in some (rare!) circumstances
2803 (SIT, PtP, NBMA NOARP links) it is handy to allow
2804 some exceptions. --ANK
2805 We allow IPv4-mapped nexthops to support RFC 4798-style
2806 addressing
2808 if (!(gwa_type & (IPV6_ADDR_UNICAST |
2809 IPV6_ADDR_MAPPED))) {
2810 NL_SET_ERR_MSG(extack,
2811 "Invalid gateway address");
2812 goto out;
2815 if (cfg->fc_flags & RTNH_F_ONLINK) {
2816 err = ip6_route_check_nh_onlink(net, cfg, dev,
2817 extack);
2818 } else {
2819 err = ip6_route_check_nh(net, cfg, &dev, &idev);
2821 if (err)
2822 goto out;
2824 err = -EINVAL;
2825 if (!dev) {
2826 NL_SET_ERR_MSG(extack, "Egress device not specified");
2827 goto out;
2828 } else if (dev->flags & IFF_LOOPBACK) {
2829 NL_SET_ERR_MSG(extack,
2830 "Egress device can not be loopback device for this route");
2831 goto out;
2835 err = -ENODEV;
2836 if (!dev)
2837 goto out;
2839 if (!(dev->flags & IFF_UP)) {
2840 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2841 err = -ENETDOWN;
2842 goto out;
2845 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2846 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2847 NL_SET_ERR_MSG(extack, "Invalid source address");
2848 err = -EINVAL;
2849 goto out;
2851 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2852 rt->rt6i_prefsrc.plen = 128;
2853 } else
2854 rt->rt6i_prefsrc.plen = 0;
2856 rt->rt6i_flags = cfg->fc_flags;
2858 install_route:
2859 if (!(rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
2860 !netif_carrier_ok(dev))
2861 rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
2862 rt->rt6i_nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
2863 rt->dst.dev = dev;
2864 rt->rt6i_idev = idev;
2865 rt->rt6i_table = table;
2867 cfg->fc_nlinfo.nl_net = dev_net(dev);
2869 return rt;
2870 out:
2871 if (dev)
2872 dev_put(dev);
2873 if (idev)
2874 in6_dev_put(idev);
2875 if (rt)
2876 dst_release_immediate(&rt->dst);
2878 return ERR_PTR(err);
2881 int ip6_route_add(struct fib6_config *cfg,
2882 struct netlink_ext_ack *extack)
2884 struct mx6_config mxc = { .mx = NULL, };
2885 struct rt6_info *rt;
2886 int err;
2888 rt = ip6_route_info_create(cfg, extack);
2889 if (IS_ERR(rt)) {
2890 err = PTR_ERR(rt);
2891 rt = NULL;
2892 goto out;
2895 err = ip6_convert_metrics(&mxc, cfg);
2896 if (err)
2897 goto out;
2899 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
2901 kfree(mxc.mx);
2903 return err;
2904 out:
2905 if (rt)
2906 dst_release_immediate(&rt->dst);
2908 return err;
2911 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2913 int err;
2914 struct fib6_table *table;
2915 struct net *net = dev_net(rt->dst.dev);
2917 if (rt == net->ipv6.ip6_null_entry) {
2918 err = -ENOENT;
2919 goto out;
2922 table = rt->rt6i_table;
2923 spin_lock_bh(&table->tb6_lock);
2924 err = fib6_del(rt, info);
2925 spin_unlock_bh(&table->tb6_lock);
2927 out:
2928 ip6_rt_put(rt);
2929 return err;
2932 int ip6_del_rt(struct rt6_info *rt)
2934 struct nl_info info = {
2935 .nl_net = dev_net(rt->dst.dev),
2937 return __ip6_del_rt(rt, &info);
2940 static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
2942 struct nl_info *info = &cfg->fc_nlinfo;
2943 struct net *net = info->nl_net;
2944 struct sk_buff *skb = NULL;
2945 struct fib6_table *table;
2946 int err = -ENOENT;
2948 if (rt == net->ipv6.ip6_null_entry)
2949 goto out_put;
2950 table = rt->rt6i_table;
2951 spin_lock_bh(&table->tb6_lock);
2953 if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
2954 struct rt6_info *sibling, *next_sibling;
2956 /* prefer to send a single notification with all hops */
2957 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
2958 if (skb) {
2959 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2961 if (rt6_fill_node(net, skb, rt,
2962 NULL, NULL, 0, RTM_DELROUTE,
2963 info->portid, seq, 0) < 0) {
2964 kfree_skb(skb);
2965 skb = NULL;
2966 } else
2967 info->skip_notify = 1;
2970 list_for_each_entry_safe(sibling, next_sibling,
2971 &rt->rt6i_siblings,
2972 rt6i_siblings) {
2973 err = fib6_del(sibling, info);
2974 if (err)
2975 goto out_unlock;
2979 err = fib6_del(rt, info);
2980 out_unlock:
2981 spin_unlock_bh(&table->tb6_lock);
2982 out_put:
2983 ip6_rt_put(rt);
2985 if (skb) {
2986 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
2987 info->nlh, gfp_any());
2989 return err;
2992 static int ip6_route_del(struct fib6_config *cfg,
2993 struct netlink_ext_ack *extack)
2995 struct rt6_info *rt, *rt_cache;
2996 struct fib6_table *table;
2997 struct fib6_node *fn;
2998 int err = -ESRCH;
3000 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3001 if (!table) {
3002 NL_SET_ERR_MSG(extack, "FIB table does not exist");
3003 return err;
3006 rcu_read_lock();
3008 fn = fib6_locate(&table->tb6_root,
3009 &cfg->fc_dst, cfg->fc_dst_len,
3010 &cfg->fc_src, cfg->fc_src_len,
3011 !(cfg->fc_flags & RTF_CACHE));
3013 if (fn) {
3014 for_each_fib6_node_rt_rcu(fn) {
3015 if (cfg->fc_flags & RTF_CACHE) {
3016 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3017 &cfg->fc_src);
3018 if (!rt_cache)
3019 continue;
3020 rt = rt_cache;
3022 if (cfg->fc_ifindex &&
3023 (!rt->dst.dev ||
3024 rt->dst.dev->ifindex != cfg->fc_ifindex))
3025 continue;
3026 if (cfg->fc_flags & RTF_GATEWAY &&
3027 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3028 continue;
3029 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
3030 continue;
3031 if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
3032 continue;
3033 if (!dst_hold_safe(&rt->dst))
3034 break;
3035 rcu_read_unlock();
3037 /* if a gateway was specified, only delete that one hop */
3038 if (cfg->fc_flags & RTF_GATEWAY)
3039 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3041 return __ip6_del_rt_siblings(rt, cfg);
3044 rcu_read_unlock();
3046 return err;
3049 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3051 struct netevent_redirect netevent;
3052 struct rt6_info *rt, *nrt = NULL;
3053 struct ndisc_options ndopts;
3054 struct inet6_dev *in6_dev;
3055 struct neighbour *neigh;
3056 struct rd_msg *msg;
3057 int optlen, on_link;
3058 u8 *lladdr;
3060 optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3061 optlen -= sizeof(*msg);
3063 if (optlen < 0) {
3064 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3065 return;
3068 msg = (struct rd_msg *)icmp6_hdr(skb);
3070 if (ipv6_addr_is_multicast(&msg->dest)) {
3071 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3072 return;
3075 on_link = 0;
3076 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3077 on_link = 1;
3078 } else if (ipv6_addr_type(&msg->target) !=
3079 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3080 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3081 return;
3084 in6_dev = __in6_dev_get(skb->dev);
3085 if (!in6_dev)
3086 return;
3087 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3088 return;
3090 /* RFC2461 8.1:
3091 * The IP source address of the Redirect MUST be the same as the current
3092 * first-hop router for the specified ICMP Destination Address.
3095 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3096 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3097 return;
3100 lladdr = NULL;
3101 if (ndopts.nd_opts_tgt_lladdr) {
3102 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3103 skb->dev);
3104 if (!lladdr) {
3105 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3106 return;
3110 rt = (struct rt6_info *) dst;
3111 if (rt->rt6i_flags & RTF_REJECT) {
3112 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3113 return;
3116 /* Redirect received -> path was valid.
3117 * Look, redirects are sent only in response to data packets,
3118 * so this nexthop is apparently reachable. --ANK
3120 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3122 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3123 if (!neigh)
3124 return;
3127 /* We have finally decided to accept it. */
3130 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3131 NEIGH_UPDATE_F_WEAK_OVERRIDE|
3132 NEIGH_UPDATE_F_OVERRIDE|
3133 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3134 NEIGH_UPDATE_F_ISROUTER)),
3135 NDISC_REDIRECT, &ndopts);
3137 nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
3138 if (!nrt)
3139 goto out;
3141 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3142 if (on_link)
3143 nrt->rt6i_flags &= ~RTF_GATEWAY;
3145 nrt->rt6i_protocol = RTPROT_REDIRECT;
3146 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3148 /* No need to remove rt from the exception table if rt is
3149 * a cached route because rt6_insert_exception() will
3150 * take care of it
3152 if (rt6_insert_exception(nrt, rt)) {
3153 dst_release_immediate(&nrt->dst);
3154 goto out;
3157 netevent.old = &rt->dst;
3158 netevent.new = &nrt->dst;
3159 netevent.daddr = &msg->dest;
3160 netevent.neigh = neigh;
3161 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3163 out:
3164 neigh_release(neigh);
3168 /* Misc support functions */
3171 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
3173 BUG_ON(from->from);
3175 rt->rt6i_flags &= ~RTF_EXPIRES;
3176 dst_hold(&from->dst);
3177 rt->from = from;
3178 dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
3181 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
3183 rt->dst.input = ort->dst.input;
3184 rt->dst.output = ort->dst.output;
3185 rt->rt6i_dst = ort->rt6i_dst;
3186 rt->dst.error = ort->dst.error;
3187 rt->rt6i_idev = ort->rt6i_idev;
3188 if (rt->rt6i_idev)
3189 in6_dev_hold(rt->rt6i_idev);
3190 rt->dst.lastuse = jiffies;
3191 rt->rt6i_gateway = ort->rt6i_gateway;
3192 rt->rt6i_flags = ort->rt6i_flags;
3193 rt6_set_from(rt, ort);
3194 rt->rt6i_metric = ort->rt6i_metric;
3195 #ifdef CONFIG_IPV6_SUBTREES
3196 rt->rt6i_src = ort->rt6i_src;
3197 #endif
3198 rt->rt6i_prefsrc = ort->rt6i_prefsrc;
3199 rt->rt6i_table = ort->rt6i_table;
3200 rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
3203 #ifdef CONFIG_IPV6_ROUTE_INFO
3204 static struct rt6_info *rt6_get_route_info(struct net *net,
3205 const struct in6_addr *prefix, int prefixlen,
3206 const struct in6_addr *gwaddr,
3207 struct net_device *dev)
3209 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3210 int ifindex = dev->ifindex;
3211 struct fib6_node *fn;
3212 struct rt6_info *rt = NULL;
3213 struct fib6_table *table;
3215 table = fib6_get_table(net, tb_id);
3216 if (!table)
3217 return NULL;
3219 rcu_read_lock();
3220 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3221 if (!fn)
3222 goto out;
3224 for_each_fib6_node_rt_rcu(fn) {
3225 if (rt->dst.dev->ifindex != ifindex)
3226 continue;
3227 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3228 continue;
3229 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
3230 continue;
3231 ip6_hold_safe(NULL, &rt, false);
3232 break;
3234 out:
3235 rcu_read_unlock();
3236 return rt;
3239 static struct rt6_info *rt6_add_route_info(struct net *net,
3240 const struct in6_addr *prefix, int prefixlen,
3241 const struct in6_addr *gwaddr,
3242 struct net_device *dev,
3243 unsigned int pref)
3245 struct fib6_config cfg = {
3246 .fc_metric = IP6_RT_PRIO_USER,
3247 .fc_ifindex = dev->ifindex,
3248 .fc_dst_len = prefixlen,
3249 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3250 RTF_UP | RTF_PREF(pref),
3251 .fc_protocol = RTPROT_RA,
3252 .fc_nlinfo.portid = 0,
3253 .fc_nlinfo.nlh = NULL,
3254 .fc_nlinfo.nl_net = net,
3257 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3258 cfg.fc_dst = *prefix;
3259 cfg.fc_gateway = *gwaddr;
3261 /* We should treat it as a default route if prefix length is 0. */
3262 if (!prefixlen)
3263 cfg.fc_flags |= RTF_DEFAULT;
3265 ip6_route_add(&cfg, NULL);
3267 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3269 #endif
3271 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
3273 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3274 struct rt6_info *rt;
3275 struct fib6_table *table;
3277 table = fib6_get_table(dev_net(dev), tb_id);
3278 if (!table)
3279 return NULL;
3281 rcu_read_lock();
3282 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3283 if (dev == rt->dst.dev &&
3284 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3285 ipv6_addr_equal(&rt->rt6i_gateway, addr))
3286 break;
3288 if (rt)
3289 ip6_hold_safe(NULL, &rt, false);
3290 rcu_read_unlock();
3291 return rt;
3294 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
3295 struct net_device *dev,
3296 unsigned int pref)
3298 struct fib6_config cfg = {
3299 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3300 .fc_metric = IP6_RT_PRIO_USER,
3301 .fc_ifindex = dev->ifindex,
3302 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3303 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3304 .fc_protocol = RTPROT_RA,
3305 .fc_nlinfo.portid = 0,
3306 .fc_nlinfo.nlh = NULL,
3307 .fc_nlinfo.nl_net = dev_net(dev),
3310 cfg.fc_gateway = *gwaddr;
3312 if (!ip6_route_add(&cfg, NULL)) {
3313 struct fib6_table *table;
3315 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3316 if (table)
3317 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3320 return rt6_get_dflt_router(gwaddr, dev);
3323 static void __rt6_purge_dflt_routers(struct fib6_table *table)
3325 struct rt6_info *rt;
3327 restart:
3328 rcu_read_lock();
3329 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3330 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3331 (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
3332 if (dst_hold_safe(&rt->dst)) {
3333 rcu_read_unlock();
3334 ip6_del_rt(rt);
3335 } else {
3336 rcu_read_unlock();
3338 goto restart;
3341 rcu_read_unlock();
3343 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3346 void rt6_purge_dflt_routers(struct net *net)
3348 struct fib6_table *table;
3349 struct hlist_head *head;
3350 unsigned int h;
3352 rcu_read_lock();
3354 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3355 head = &net->ipv6.fib_table_hash[h];
3356 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3357 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3358 __rt6_purge_dflt_routers(table);
3362 rcu_read_unlock();
3365 static void rtmsg_to_fib6_config(struct net *net,
3366 struct in6_rtmsg *rtmsg,
3367 struct fib6_config *cfg)
3369 memset(cfg, 0, sizeof(*cfg));
3371 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3372 : RT6_TABLE_MAIN;
3373 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3374 cfg->fc_metric = rtmsg->rtmsg_metric;
3375 cfg->fc_expires = rtmsg->rtmsg_info;
3376 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3377 cfg->fc_src_len = rtmsg->rtmsg_src_len;
3378 cfg->fc_flags = rtmsg->rtmsg_flags;
3380 cfg->fc_nlinfo.nl_net = net;
3382 cfg->fc_dst = rtmsg->rtmsg_dst;
3383 cfg->fc_src = rtmsg->rtmsg_src;
3384 cfg->fc_gateway = rtmsg->rtmsg_gateway;
3387 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3389 struct fib6_config cfg;
3390 struct in6_rtmsg rtmsg;
3391 int err;
3393 switch (cmd) {
3394 case SIOCADDRT: /* Add a route */
3395 case SIOCDELRT: /* Delete a route */
3396 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3397 return -EPERM;
3398 err = copy_from_user(&rtmsg, arg,
3399 sizeof(struct in6_rtmsg));
3400 if (err)
3401 return -EFAULT;
3403 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3405 rtnl_lock();
3406 switch (cmd) {
3407 case SIOCADDRT:
3408 err = ip6_route_add(&cfg, NULL);
3409 break;
3410 case SIOCDELRT:
3411 err = ip6_route_del(&cfg, NULL);
3412 break;
3413 default:
3414 err = -EINVAL;
3416 rtnl_unlock();
3418 return err;
3421 return -EINVAL;
3425 /* Drop the packet on the floor */
3428 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3430 int type;
3431 struct dst_entry *dst = skb_dst(skb);
3432 switch (ipstats_mib_noroutes) {
3433 case IPSTATS_MIB_INNOROUTES:
3434 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3435 if (type == IPV6_ADDR_ANY) {
3436 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3437 IPSTATS_MIB_INADDRERRORS);
3438 break;
3440 /* FALLTHROUGH */
3441 case IPSTATS_MIB_OUTNOROUTES:
3442 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3443 ipstats_mib_noroutes);
3444 break;
3446 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3447 kfree_skb(skb);
3448 return 0;
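/* Note: an input no-route whose destination is the unspecified
 * address counts as an address error (IPSTATS_MIB_INADDRERRORS)
 * rather than a no-route; every other case bumps the caller-supplied
 * no-route counter. In all cases an ICMPv6 destination-unreachable
 * is sent before the packet is freed.
 */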
3451 static int ip6_pkt_discard(struct sk_buff *skb)
3453 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3456 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3458 skb->dev = skb_dst(skb)->dev;
3459 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3462 static int ip6_pkt_prohibit(struct sk_buff *skb)
3464 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3467 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3469 skb->dev = skb_dst(skb)->dev;
3470 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3474 /* Allocate a dst for local (unicast / anycast) address. */
3477 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
3478 const struct in6_addr *addr,
3479 bool anycast)
3481 u32 tb_id;
3482 struct net *net = dev_net(idev->dev);
3483 struct net_device *dev = idev->dev;
3484 struct rt6_info *rt;
3486 rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
3487 if (!rt)
3488 return ERR_PTR(-ENOMEM);
3490 in6_dev_hold(idev);
3492 rt->dst.flags |= DST_HOST;
3493 rt->dst.input = ip6_input;
3494 rt->dst.output = ip6_output;
3495 rt->rt6i_idev = idev;
3497 rt->rt6i_protocol = RTPROT_KERNEL;
3498 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
3499 if (anycast)
3500 rt->rt6i_flags |= RTF_ANYCAST;
3501 else
3502 rt->rt6i_flags |= RTF_LOCAL;
3504 rt->rt6i_gateway = *addr;
3505 rt->rt6i_dst.addr = *addr;
3506 rt->rt6i_dst.plen = 128;
3507 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3508 rt->rt6i_table = fib6_get_table(net, tb_id);
3510 return rt;
3513 /* remove a deleted IP from prefsrc entries */
3514 struct arg_dev_net_ip {
3515 struct net_device *dev;
3516 struct net *net;
3517 struct in6_addr *addr;
3520 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
3522 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3523 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3524 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3526 if (((void *)rt->dst.dev == dev || !dev) &&
3527 rt != net->ipv6.ip6_null_entry &&
3528 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
3529 spin_lock_bh(&rt6_exception_lock);
3530 /* remove prefsrc entry */
3531 rt->rt6i_prefsrc.plen = 0;
3532 /* need to update cache as well */
3533 rt6_exceptions_remove_prefsrc(rt);
3534 spin_unlock_bh(&rt6_exception_lock);
3536 return 0;
3539 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3541 struct net *net = dev_net(ifp->idev->dev);
3542 struct arg_dev_net_ip adni = {
3543 .dev = ifp->idev->dev,
3544 .net = net,
3545 .addr = &ifp->addr,
3547 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3550 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3552 /* Remove routers and update dst entries when a gateway turns into a host. */
3553 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
3555 struct in6_addr *gateway = (struct in6_addr *)arg;
3557 if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3558 ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
3559 return -1;
3562 /* Further clean up cached routes in exception table.
3563 * This is needed because a cached route may have a different
3564 * gateway than its 'parent' in the case of an ip redirect.
3566 rt6_exceptions_clean_tohost(rt, gateway);
3568 return 0;
3571 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3573 fib6_clean_all(net, fib6_clean_tohost, gateway);
3576 struct arg_netdev_event {
3577 const struct net_device *dev;
3578 union {
3579 unsigned int nh_flags;
3580 unsigned long event;
3584 static struct rt6_info *rt6_multipath_first_sibling(const struct rt6_info *rt)
3586 struct rt6_info *iter;
3587 struct fib6_node *fn;
3589 fn = rcu_dereference_protected(rt->rt6i_node,
3590 lockdep_is_held(&rt->rt6i_table->tb6_lock));
3591 iter = rcu_dereference_protected(fn->leaf,
3592 lockdep_is_held(&rt->rt6i_table->tb6_lock));
3593 while (iter) {
3594 if (iter->rt6i_metric == rt->rt6i_metric &&
3595 rt6_qualify_for_ecmp(iter))
3596 return iter;
3597 iter = rcu_dereference_protected(iter->rt6_next,
3598 lockdep_is_held(&rt->rt6i_table->tb6_lock));
3601 return NULL;
3604 static bool rt6_is_dead(const struct rt6_info *rt)
3606 if (rt->rt6i_nh_flags & RTNH_F_DEAD ||
3607 (rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
3608 rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
3609 return true;
3611 return false;
3614 static int rt6_multipath_total_weight(const struct rt6_info *rt)
3616 struct rt6_info *iter;
3617 int total = 0;
3619 if (!rt6_is_dead(rt))
3620 total += rt->rt6i_nh_weight;
3622 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) {
3623 if (!rt6_is_dead(iter))
3624 total += iter->rt6i_nh_weight;
3627 return total;
3630 static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total)
3632 int upper_bound = -1;
3634 if (!rt6_is_dead(rt)) {
3635 *weight += rt->rt6i_nh_weight;
3636 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3637 total) - 1;
3639 atomic_set(&rt->rt6i_nh_upper_bound, upper_bound);
3642 static void rt6_multipath_upper_bound_set(struct rt6_info *rt, int total)
3644 struct rt6_info *iter;
3645 int weight = 0;
3647 rt6_upper_bound_set(rt, &weight, total);
3649 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3650 rt6_upper_bound_set(iter, &weight, total);
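/* Worked example: two live siblings with weights 1 and 2 give
 * total = 3; the cumulative upper bounds become
 * round(1 * 2^31 / 3) - 1 = 715827882 and
 * round(3 * 2^31 / 3) - 1 = 2^31 - 1, so a 31-bit multipath hash at
 * or below the first bound selects the first nexthop.
 */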
3653 void rt6_multipath_rebalance(struct rt6_info *rt)
3655 struct rt6_info *first;
3656 int total;
3658 /* If the entire multipath route was marked for flushing,
3659 * there is no need to rebalance upon the removal of every
3660 * sibling route.
3662 if (!rt->rt6i_nsiblings || rt->should_flush)
3663 return;
3665 /* During lookup routes are evaluated in order, so we need to
3666 * make sure upper bounds are assigned from the first sibling
3667 * onwards.
3669 first = rt6_multipath_first_sibling(rt);
3670 if (WARN_ON_ONCE(!first))
3671 return;
3673 total = rt6_multipath_total_weight(first);
3674 rt6_multipath_upper_bound_set(first, total);
3677 static int fib6_ifup(struct rt6_info *rt, void *p_arg)
3679 const struct arg_netdev_event *arg = p_arg;
3680 const struct net *net = dev_net(arg->dev);
3682 if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev) {
3683 rt->rt6i_nh_flags &= ~arg->nh_flags;
3684 fib6_update_sernum_upto_root(dev_net(rt->dst.dev), rt);
3685 rt6_multipath_rebalance(rt);
3688 return 0;
3691 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3693 struct arg_netdev_event arg = {
3694 .dev = dev,
3696 .nh_flags = nh_flags,
3700 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3701 arg.nh_flags |= RTNH_F_LINKDOWN;
3703 fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3706 static bool rt6_multipath_uses_dev(const struct rt6_info *rt,
3707 const struct net_device *dev)
3709 struct rt6_info *iter;
3711 if (rt->dst.dev == dev)
3712 return true;
3713 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3714 if (iter->dst.dev == dev)
3715 return true;
3717 return false;
3720 static void rt6_multipath_flush(struct rt6_info *rt)
3722 struct rt6_info *iter;
3724 rt->should_flush = 1;
3725 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3726 iter->should_flush = 1;
3729 static unsigned int rt6_multipath_dead_count(const struct rt6_info *rt,
3730 const struct net_device *down_dev)
3732 struct rt6_info *iter;
3733 unsigned int dead = 0;
3735 if (rt->dst.dev == down_dev || rt->rt6i_nh_flags & RTNH_F_DEAD)
3736 dead++;
3737 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3738 if (iter->dst.dev == down_dev ||
3739 iter->rt6i_nh_flags & RTNH_F_DEAD)
3740 dead++;
3742 return dead;
3745 static void rt6_multipath_nh_flags_set(struct rt6_info *rt,
3746 const struct net_device *dev,
3747 unsigned int nh_flags)
3749 struct rt6_info *iter;
3751 if (rt->dst.dev == dev)
3752 rt->rt6i_nh_flags |= nh_flags;
3753 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3754 if (iter->dst.dev == dev)
3755 iter->rt6i_nh_flags |= nh_flags;
3758 /* called with write lock held for table with rt */
3759 static int fib6_ifdown(struct rt6_info *rt, void *p_arg)
3761 const struct arg_netdev_event *arg = p_arg;
3762 const struct net_device *dev = arg->dev;
3763 const struct net *net = dev_net(dev);
3765 if (rt == net->ipv6.ip6_null_entry)
3766 return 0;
3768 switch (arg->event) {
3769 case NETDEV_UNREGISTER:
3770 return rt->dst.dev == dev ? -1 : 0;
3771 case NETDEV_DOWN:
3772 if (rt->should_flush)
3773 return -1;
3774 if (!rt->rt6i_nsiblings)
3775 return rt->dst.dev == dev ? -1 : 0;
3776 if (rt6_multipath_uses_dev(rt, dev)) {
3777 unsigned int count;
3779 count = rt6_multipath_dead_count(rt, dev);
3780 if (rt->rt6i_nsiblings + 1 == count) {
3781 rt6_multipath_flush(rt);
3782 return -1;
3784 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
3785 RTNH_F_LINKDOWN);
3786 fib6_update_sernum(rt);
3787 rt6_multipath_rebalance(rt);
3789 return -2;
3790 case NETDEV_CHANGE:
3791 if (rt->dst.dev != dev ||
3792 rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST))
3793 break;
3794 rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
3795 rt6_multipath_rebalance(rt);
3796 break;
3799 return 0;
3802 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
3804 struct arg_netdev_event arg = {
3805 .dev = dev,
3807 .event = event,
3811 fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
3814 void rt6_disable_ip(struct net_device *dev, unsigned long event)
3816 rt6_sync_down_dev(dev, event);
3817 rt6_uncached_list_flush_dev(dev_net(dev), dev);
3818 neigh_ifdown(&nd_tbl, dev);
3821 struct rt6_mtu_change_arg {
3822 struct net_device *dev;
3823 unsigned int mtu;
3826 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
3828 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
3829 struct inet6_dev *idev;
3831 /* In IPv6, PMTU discovery is not optional,
3832 so an RTAX_MTU lock cannot disable it.
3833 We still use this lock to block changes
3834 caused by addrconf/ndisc.
3837 idev = __in6_dev_get(arg->dev);
3838 if (!idev)
3839 return 0;
3841 /* For an administrative MTU increase there is no way to discover
3842 an IPv6 PMTU increase, so the PMTU must be updated here.
3843 Since RFC 1981 doesn't cover administrative MTU increases,
3844 updating the PMTU on an increase is a MUST (e.g. jumbo frames).
3846 if (rt->dst.dev == arg->dev &&
3847 !dst_metric_locked(&rt->dst, RTAX_MTU)) {
3848 spin_lock_bh(&rt6_exception_lock);
3849 if (dst_metric_raw(&rt->dst, RTAX_MTU) &&
3850 rt6_mtu_change_route_allowed(idev, rt, arg->mtu))
3851 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
3852 rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
3853 spin_unlock_bh(&rt6_exception_lock);
3855 return 0;
3858 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
3860 struct rt6_mtu_change_arg arg = {
3861 .dev = dev,
3862 .mtu = mtu,
3865 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
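/* Note: this walks every FIB entry and, for routes egressing the
 * changed device whose MTU metric is not administratively locked,
 * rewrites the RTAX_MTU metric where permitted and propagates the
 * new value to cached exception routes.
 */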
3868 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
3869 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
3870 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) },
3871 [RTA_OIF] = { .type = NLA_U32 },
3872 [RTA_IIF] = { .type = NLA_U32 },
3873 [RTA_PRIORITY] = { .type = NLA_U32 },
3874 [RTA_METRICS] = { .type = NLA_NESTED },
3875 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
3876 [RTA_PREF] = { .type = NLA_U8 },
3877 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
3878 [RTA_ENCAP] = { .type = NLA_NESTED },
3879 [RTA_EXPIRES] = { .type = NLA_U32 },
3880 [RTA_UID] = { .type = NLA_U32 },
3881 [RTA_MARK] = { .type = NLA_U32 },
3882 [RTA_TABLE] = { .type = NLA_U32 },
3885 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
3886 struct fib6_config *cfg,
3887 struct netlink_ext_ack *extack)
3889 struct rtmsg *rtm;
3890 struct nlattr *tb[RTA_MAX+1];
3891 unsigned int pref;
3892 int err;
3894 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
3895 NULL);
3896 if (err < 0)
3897 goto errout;
3899 err = -EINVAL;
3900 rtm = nlmsg_data(nlh);
3901 memset(cfg, 0, sizeof(*cfg));
3903 cfg->fc_table = rtm->rtm_table;
3904 cfg->fc_dst_len = rtm->rtm_dst_len;
3905 cfg->fc_src_len = rtm->rtm_src_len;
3906 cfg->fc_flags = RTF_UP;
3907 cfg->fc_protocol = rtm->rtm_protocol;
3908 cfg->fc_type = rtm->rtm_type;
3910 if (rtm->rtm_type == RTN_UNREACHABLE ||
3911 rtm->rtm_type == RTN_BLACKHOLE ||
3912 rtm->rtm_type == RTN_PROHIBIT ||
3913 rtm->rtm_type == RTN_THROW)
3914 cfg->fc_flags |= RTF_REJECT;
3916 if (rtm->rtm_type == RTN_LOCAL)
3917 cfg->fc_flags |= RTF_LOCAL;
3919 if (rtm->rtm_flags & RTM_F_CLONED)
3920 cfg->fc_flags |= RTF_CACHE;
3922 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
3924 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
3925 cfg->fc_nlinfo.nlh = nlh;
3926 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
3928 if (tb[RTA_GATEWAY]) {
3929 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
3930 cfg->fc_flags |= RTF_GATEWAY;
3933 if (tb[RTA_DST]) {
3934 int plen = (rtm->rtm_dst_len + 7) >> 3;
3936 if (nla_len(tb[RTA_DST]) < plen)
3937 goto errout;
3939 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
3942 if (tb[RTA_SRC]) {
3943 int plen = (rtm->rtm_src_len + 7) >> 3;
3945 if (nla_len(tb[RTA_SRC]) < plen)
3946 goto errout;
3948 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
3951 if (tb[RTA_PREFSRC])
3952 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
3954 if (tb[RTA_OIF])
3955 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
3957 if (tb[RTA_PRIORITY])
3958 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
3960 if (tb[RTA_METRICS]) {
3961 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
3962 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
3965 if (tb[RTA_TABLE])
3966 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
3968 if (tb[RTA_MULTIPATH]) {
3969 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
3970 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
3972 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
3973 cfg->fc_mp_len, extack);
3974 if (err < 0)
3975 goto errout;
3978 if (tb[RTA_PREF]) {
3979 pref = nla_get_u8(tb[RTA_PREF]);
3980 if (pref != ICMPV6_ROUTER_PREF_LOW &&
3981 pref != ICMPV6_ROUTER_PREF_HIGH)
3982 pref = ICMPV6_ROUTER_PREF_MEDIUM;
3983 cfg->fc_flags |= RTF_PREF(pref);
3986 if (tb[RTA_ENCAP])
3987 cfg->fc_encap = tb[RTA_ENCAP];
3989 if (tb[RTA_ENCAP_TYPE]) {
3990 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
3992 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
3993 if (err < 0)
3994 goto errout;
3997 if (tb[RTA_EXPIRES]) {
3998 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4000 if (addrconf_finite_timeout(timeout)) {
4001 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4002 cfg->fc_flags |= RTF_EXPIRES;
4006 err = 0;
4007 errout:
4008 return err;
4011 struct rt6_nh {
4012 struct rt6_info *rt6_info;
4013 struct fib6_config r_cfg;
4014 struct mx6_config mxc;
4015 struct list_head next;
4018 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4020 struct rt6_nh *nh;
4022 list_for_each_entry(nh, rt6_nh_list, next) {
4023 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4024 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4025 nh->r_cfg.fc_ifindex);
4029 static int ip6_route_info_append(struct list_head *rt6_nh_list,
4030 struct rt6_info *rt, struct fib6_config *r_cfg)
4032 struct rt6_nh *nh;
4033 int err = -EEXIST;
4035 list_for_each_entry(nh, rt6_nh_list, next) {
4036 /* check if rt6_info already exists */
4037 if (rt6_duplicate_nexthop(nh->rt6_info, rt))
4038 return err;
4041 nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4042 if (!nh)
4043 return -ENOMEM;
4044 nh->rt6_info = rt;
4045 err = ip6_convert_metrics(&nh->mxc, r_cfg);
4046 if (err) {
4047 kfree(nh);
4048 return err;
4050 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4051 list_add_tail(&nh->next, rt6_nh_list);
4053 return 0;
4056 static void ip6_route_mpath_notify(struct rt6_info *rt,
4057 struct rt6_info *rt_last,
4058 struct nl_info *info,
4059 __u16 nlflags)
4061 /* if this is an APPEND route, then rt points to the first route
4062 * inserted and rt_last points to the last route inserted. Userspace
4063 * wants a consistent dump of the route which starts at the first
4064 * nexthop. Since sibling routes are always added at the end of
4065 * the list, find the first sibling of the last route appended
4067 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
4068 rt = list_first_entry(&rt_last->rt6i_siblings,
4069 struct rt6_info,
4070 rt6i_siblings);
4073 if (rt)
4074 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4077 static int ip6_route_multipath_add(struct fib6_config *cfg,
4078 struct netlink_ext_ack *extack)
4080 struct rt6_info *rt_notif = NULL, *rt_last = NULL;
4081 struct nl_info *info = &cfg->fc_nlinfo;
4082 struct fib6_config r_cfg;
4083 struct rtnexthop *rtnh;
4084 struct rt6_info *rt;
4085 struct rt6_nh *err_nh;
4086 struct rt6_nh *nh, *nh_safe;
4087 __u16 nlflags;
4088 int remaining;
4089 int attrlen;
4090 int err = 1;
4091 int nhn = 0;
4092 int replace = (cfg->fc_nlinfo.nlh &&
4093 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4094 LIST_HEAD(rt6_nh_list);
4096 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4097 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4098 nlflags |= NLM_F_APPEND;
4100 remaining = cfg->fc_mp_len;
4101 rtnh = (struct rtnexthop *)cfg->fc_mp;
4103 /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4104 * rt6_info structs per nexthop
4106 while (rtnh_ok(rtnh, remaining)) {
4107 memcpy(&r_cfg, cfg, sizeof(*cfg));
4108 if (rtnh->rtnh_ifindex)
4109 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4111 attrlen = rtnh_attrlen(rtnh);
4112 if (attrlen > 0) {
4113 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4115 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4116 if (nla) {
4117 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4118 r_cfg.fc_flags |= RTF_GATEWAY;
4120 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4121 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4122 if (nla)
4123 r_cfg.fc_encap_type = nla_get_u16(nla);
4126 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4127 rt = ip6_route_info_create(&r_cfg, extack);
4128 if (IS_ERR(rt)) {
4129 err = PTR_ERR(rt);
4130 rt = NULL;
4131 goto cleanup;
4134 rt->rt6i_nh_weight = rtnh->rtnh_hops + 1;
4136 err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
4137 if (err) {
4138 dst_release_immediate(&rt->dst);
4139 goto cleanup;
4142 rtnh = rtnh_next(rtnh, &remaining);
4145 /* for add and replace send one notification with all nexthops.
4146 * Skip the notification in fib6_add_rt2node and send one with
4147 * the full route when done
4149 info->skip_notify = 1;
4151 err_nh = NULL;
4152 list_for_each_entry(nh, &rt6_nh_list, next) {
4153 rt_last = nh->rt6_info;
4154 err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
4155 /* save reference to first route for notification */
4156 if (!rt_notif && !err)
4157 rt_notif = nh->rt6_info;
4159 /* nh->rt6_info is used or freed at this point, reset to NULL */
4160 nh->rt6_info = NULL;
4161 if (err) {
4162 if (replace && nhn)
4163 ip6_print_replace_route_err(&rt6_nh_list);
4164 err_nh = nh;
4165 goto add_errout;
4168 /* Because each route is added like a single route, we remove
4169 * these flags after the first nexthop: if there is a collision,
4170 * we have already failed to add the first nexthop
4171 * (fib6_add_rt2node() has rejected it); when replacing, the old
4172 * nexthops have been replaced by the first new one, and the rest
4173 * should be appended to it.
4175 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4176 NLM_F_REPLACE);
4177 nhn++;
4180 /* success ... tell user about new route */
4181 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4182 goto cleanup;
4184 add_errout:
4185 /* send notification for routes that were added so that
4186 * the delete notifications sent by ip6_route_del are
4187 * coherent
4189 if (rt_notif)
4190 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4192 /* Delete routes that were already added */
4193 list_for_each_entry(nh, &rt6_nh_list, next) {
4194 if (err_nh == nh)
4195 break;
4196 ip6_route_del(&nh->r_cfg, extack);
4199 cleanup:
4200 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4201 if (nh->rt6_info)
4202 dst_release_immediate(&nh->rt6_info->dst);
4203 kfree(nh->mxc.mx);
4204 list_del(&nh->next);
4205 kfree(nh);
4208 return err;
4211 static int ip6_route_multipath_del(struct fib6_config *cfg,
4212 struct netlink_ext_ack *extack)
4214 struct fib6_config r_cfg;
4215 struct rtnexthop *rtnh;
4216 int remaining;
4217 int attrlen;
4218 int err = 1, last_err = 0;
4220 remaining = cfg->fc_mp_len;
4221 rtnh = (struct rtnexthop *)cfg->fc_mp;
4223 /* Parse a Multipath Entry */
4224 while (rtnh_ok(rtnh, remaining)) {
4225 memcpy(&r_cfg, cfg, sizeof(*cfg));
4226 if (rtnh->rtnh_ifindex)
4227 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4229 attrlen = rtnh_attrlen(rtnh);
4230 if (attrlen > 0) {
4231 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4233 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4234 if (nla) {
4235 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4236 r_cfg.fc_flags |= RTF_GATEWAY;
4239 err = ip6_route_del(&r_cfg, extack);
4240 if (err)
4241 last_err = err;
4243 rtnh = rtnh_next(rtnh, &remaining);
4246 return last_err;
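/* Note: nexthops are deleted one by one; a failure on one entry does
 * not stop the walk, and only the last error (if any) is returned to
 * the caller.
 */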
4249 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4250 struct netlink_ext_ack *extack)
4252 struct fib6_config cfg;
4253 int err;
4255 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4256 if (err < 0)
4257 return err;
4259 if (cfg.fc_mp)
4260 return ip6_route_multipath_del(&cfg, extack);
4261 else {
4262 cfg.fc_delete_all_nh = 1;
4263 return ip6_route_del(&cfg, extack);
4267 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4268 struct netlink_ext_ack *extack)
4270 struct fib6_config cfg;
4271 int err;
4273 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4274 if (err < 0)
4275 return err;
4277 if (cfg.fc_mp)
4278 return ip6_route_multipath_add(&cfg, extack);
4279 else
4280 return ip6_route_add(&cfg, extack);
4283 static size_t rt6_nlmsg_size(struct rt6_info *rt)
4285 int nexthop_len = 0;
4287 if (rt->rt6i_nsiblings) {
4288 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
4289 + NLA_ALIGN(sizeof(struct rtnexthop))
4290 + nla_total_size(16) /* RTA_GATEWAY */
4291 + lwtunnel_get_encap_size(rt->dst.lwtstate);
4293 nexthop_len *= rt->rt6i_nsiblings;
4296 return NLMSG_ALIGN(sizeof(struct rtmsg))
4297 + nla_total_size(16) /* RTA_SRC */
4298 + nla_total_size(16) /* RTA_DST */
4299 + nla_total_size(16) /* RTA_GATEWAY */
4300 + nla_total_size(16) /* RTA_PREFSRC */
4301 + nla_total_size(4) /* RTA_TABLE */
4302 + nla_total_size(4) /* RTA_IIF */
4303 + nla_total_size(4) /* RTA_OIF */
4304 + nla_total_size(4) /* RTA_PRIORITY */
4305 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4306 + nla_total_size(sizeof(struct rta_cacheinfo))
4307 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4308 + nla_total_size(1) /* RTA_PREF */
4309 + lwtunnel_get_encap_size(rt->dst.lwtstate)
4310 + nexthop_len;
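/* Note: this is an upper-bound estimate for nlmsg_new(); per-sibling
 * space (rtnexthop + RTA_GATEWAY + encap) is multiplied by the
 * sibling count and added on top of the single-route attributes.
 */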
4313 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
4314 unsigned int *flags, bool skip_oif)
4316 if (rt->rt6i_nh_flags & RTNH_F_DEAD)
4317 *flags |= RTNH_F_DEAD;
4319 if (rt->rt6i_nh_flags & RTNH_F_LINKDOWN) {
4320 *flags |= RTNH_F_LINKDOWN;
4321 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
4322 *flags |= RTNH_F_DEAD;
4325 if (rt->rt6i_flags & RTF_GATEWAY) {
4326 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
4327 goto nla_put_failure;
4330 *flags |= (rt->rt6i_nh_flags & RTNH_F_ONLINK);
4331 if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
4332 *flags |= RTNH_F_OFFLOAD;
4334 /* not needed for multipath encoding because it has a rtnexthop struct */
4335 if (!skip_oif && rt->dst.dev &&
4336 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
4337 goto nla_put_failure;
4339 if (rt->dst.lwtstate &&
4340 lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
4341 goto nla_put_failure;
4343 return 0;
4345 nla_put_failure:
4346 return -EMSGSIZE;
4349 /* add multipath next hop */
4350 static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
4352 struct rtnexthop *rtnh;
4353 unsigned int flags = 0;
4355 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4356 if (!rtnh)
4357 goto nla_put_failure;
4359 rtnh->rtnh_hops = rt->rt6i_nh_weight - 1;
4360 rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
4362 if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4363 goto nla_put_failure;
4365 rtnh->rtnh_flags = flags;
4367 /* length of rtnetlink header + attributes */
4368 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4370 return 0;
4372 nla_put_failure:
4373 return -EMSGSIZE;
4374 }
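/* Fill one route message: rtmsg header plus table, dst/src, prefsrc,
 * metrics, priority, nexthop(s) and cacheinfo. Any attribute overflow
 * cancels the message and returns -EMSGSIZE.
 */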
4376 static int rt6_fill_node(struct net *net,
4377 struct sk_buff *skb, struct rt6_info *rt,
4378 struct in6_addr *dst, struct in6_addr *src,
4379 int iif, int type, u32 portid, u32 seq,
4380 unsigned int flags)
4381 {
4382 u32 metrics[RTAX_MAX];
4383 struct rtmsg *rtm;
4384 struct nlmsghdr *nlh;
4385 long expires;
4386 u32 table;
4388 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4389 if (!nlh)
4390 return -EMSGSIZE;
4392 rtm = nlmsg_data(nlh);
4393 rtm->rtm_family = AF_INET6;
4394 rtm->rtm_dst_len = rt->rt6i_dst.plen;
4395 rtm->rtm_src_len = rt->rt6i_src.plen;
4396 rtm->rtm_tos = 0;
4397 if (rt->rt6i_table)
4398 table = rt->rt6i_table->tb6_id;
4399 else
4400 table = RT6_TABLE_UNSPEC;
4401 rtm->rtm_table = table;
4402 if (nla_put_u32(skb, RTA_TABLE, table))
4403 goto nla_put_failure;
4404 if (rt->rt6i_flags & RTF_REJECT) {
4405 switch (rt->dst.error) {
4406 case -EINVAL:
4407 rtm->rtm_type = RTN_BLACKHOLE;
4408 break;
4409 case -EACCES:
4410 rtm->rtm_type = RTN_PROHIBIT;
4411 break;
4412 case -EAGAIN:
4413 rtm->rtm_type = RTN_THROW;
4414 break;
4415 default:
4416 rtm->rtm_type = RTN_UNREACHABLE;
4417 break;
4418 }
4419 }
4420 else if (rt->rt6i_flags & RTF_LOCAL)
4421 rtm->rtm_type = RTN_LOCAL;
4422 else if (rt->rt6i_flags & RTF_ANYCAST)
4423 rtm->rtm_type = RTN_ANYCAST;
4424 else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
4425 rtm->rtm_type = RTN_LOCAL;
4426 else
4427 rtm->rtm_type = RTN_UNICAST;
4428 rtm->rtm_flags = 0;
4429 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4430 rtm->rtm_protocol = rt->rt6i_protocol;
4432 if (rt->rt6i_flags & RTF_CACHE)
4433 rtm->rtm_flags |= RTM_F_CLONED;
4435 if (dst) {
4436 if (nla_put_in6_addr(skb, RTA_DST, dst))
4437 goto nla_put_failure;
4438 rtm->rtm_dst_len = 128;
4439 } else if (rtm->rtm_dst_len)
4440 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
4441 goto nla_put_failure;
4442 #ifdef CONFIG_IPV6_SUBTREES
4443 if (src) {
4444 if (nla_put_in6_addr(skb, RTA_SRC, src))
4445 goto nla_put_failure;
4446 rtm->rtm_src_len = 128;
4447 } else if (rtm->rtm_src_len &&
4448 nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
4449 goto nla_put_failure;
4450 #endif
4451 if (iif) {
4452 #ifdef CONFIG_IPV6_MROUTE
4453 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
4454 int err = ip6mr_get_route(net, skb, rtm, portid);
4456 if (err == 0)
4457 return 0;
4458 if (err < 0)
4459 goto nla_put_failure;
4460 } else
4461 #endif
4462 if (nla_put_u32(skb, RTA_IIF, iif))
4463 goto nla_put_failure;
4464 } else if (dst) {
4465 struct in6_addr saddr_buf;
4466 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
4467 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4468 goto nla_put_failure;
4469 }
4471 if (rt->rt6i_prefsrc.plen) {
4472 struct in6_addr saddr_buf;
4473 saddr_buf = rt->rt6i_prefsrc.addr;
4474 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4475 goto nla_put_failure;
4476 }
4478 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
4479 if (rt->rt6i_pmtu)
4480 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
4481 if (rtnetlink_put_metrics(skb, metrics) < 0)
4482 goto nla_put_failure;
4484 if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
4485 goto nla_put_failure;
4487 /* For multipath routes, walk the siblings list and add
4488 * each as a nexthop within RTA_MULTIPATH.
4489 */
4490 if (rt->rt6i_nsiblings) {
4491 struct rt6_info *sibling, *next_sibling;
4492 struct nlattr *mp;
4494 mp = nla_nest_start(skb, RTA_MULTIPATH);
4495 if (!mp)
4496 goto nla_put_failure;
4498 if (rt6_add_nexthop(skb, rt) < 0)
4499 goto nla_put_failure;
4501 list_for_each_entry_safe(sibling, next_sibling,
4502 &rt->rt6i_siblings, rt6i_siblings) {
4503 if (rt6_add_nexthop(skb, sibling) < 0)
4504 goto nla_put_failure;
4505 }
4507 nla_nest_end(skb, mp);
4508 } else {
4509 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4510 goto nla_put_failure;
4511 }
4513 expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
4515 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
4516 goto nla_put_failure;
4518 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
4519 goto nla_put_failure;
4522 nlmsg_end(skb, nlh);
4523 return 0;
4525 nla_put_failure:
4526 nlmsg_cancel(skb, nlh);
4527 return -EMSGSIZE;
4528 }
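/* fib6 dump callback: skip the null entry, honor the RTM_F_PREFIX
 * filter, and emit one NLM_F_MULTI message per route.
 */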
4530 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
4531 {
4532 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4533 struct net *net = arg->net;
4535 if (rt == net->ipv6.ip6_null_entry)
4536 return 0;
4538 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4539 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4541 /* user wants prefix routes only */
4542 if (rtm->rtm_flags & RTM_F_PREFIX &&
4543 !(rt->rt6i_flags & RTF_PREFIX_RT)) {
4544 /* success since this is not a prefix route */
4545 return 1;
4546 }
4547 }
4549 return rt6_fill_node(net,
4550 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
4551 NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
4552 NLM_F_MULTI);
4553 }
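/* RTM_GETROUTE handler (e.g. "ip -6 route get"): look up the route for
 * the given source/destination and unicast the result back to the
 * requester. With RTM_F_FIB_MATCH set, report the matching FIB entry
 * (rt->from) rather than the dst cache clone.
 */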
4555 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4556 struct netlink_ext_ack *extack)
4557 {
4558 struct net *net = sock_net(in_skb->sk);
4559 struct nlattr *tb[RTA_MAX+1];
4560 int err, iif = 0, oif = 0;
4561 struct dst_entry *dst;
4562 struct rt6_info *rt;
4563 struct sk_buff *skb;
4564 struct rtmsg *rtm;
4565 struct flowi6 fl6;
4566 bool fibmatch;
4568 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4569 extack);
4570 if (err < 0)
4571 goto errout;
4573 err = -EINVAL;
4574 memset(&fl6, 0, sizeof(fl6));
4575 rtm = nlmsg_data(nlh);
4576 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4577 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4579 if (tb[RTA_SRC]) {
4580 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4581 goto errout;
4583 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4584 }
4586 if (tb[RTA_DST]) {
4587 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4588 goto errout;
4590 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4591 }
4593 if (tb[RTA_IIF])
4594 iif = nla_get_u32(tb[RTA_IIF]);
4596 if (tb[RTA_OIF])
4597 oif = nla_get_u32(tb[RTA_OIF]);
4599 if (tb[RTA_MARK])
4600 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4602 if (tb[RTA_UID])
4603 fl6.flowi6_uid = make_kuid(current_user_ns(),
4604 nla_get_u32(tb[RTA_UID]));
4605 else
4606 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4608 if (iif) {
4609 struct net_device *dev;
4610 int flags = 0;
4612 rcu_read_lock();
4614 dev = dev_get_by_index_rcu(net, iif);
4615 if (!dev) {
4616 rcu_read_unlock();
4617 err = -ENODEV;
4618 goto errout;
4619 }
4621 fl6.flowi6_iif = iif;
4623 if (!ipv6_addr_any(&fl6.saddr))
4624 flags |= RT6_LOOKUP_F_HAS_SADDR;
4626 dst = ip6_route_input_lookup(net, dev, &fl6, flags);
4628 rcu_read_unlock();
4629 } else {
4630 fl6.flowi6_oif = oif;
4632 dst = ip6_route_output(net, NULL, &fl6);
4633 }
4636 rt = container_of(dst, struct rt6_info, dst);
4637 if (rt->dst.error) {
4638 err = rt->dst.error;
4639 ip6_rt_put(rt);
4640 goto errout;
4641 }
4643 if (rt == net->ipv6.ip6_null_entry) {
4644 err = rt->dst.error;
4645 ip6_rt_put(rt);
4646 goto errout;
4647 }
4649 if (fibmatch && rt->from) {
4650 struct rt6_info *ort = rt->from;
4652 dst_hold(&ort->dst);
4653 ip6_rt_put(rt);
4654 rt = ort;
4655 }
4657 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4658 if (!skb) {
4659 ip6_rt_put(rt);
4660 err = -ENOBUFS;
4661 goto errout;
4662 }
4664 skb_dst_set(skb, &rt->dst);
4665 if (fibmatch)
4666 err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
4667 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4668 nlh->nlmsg_seq, 0);
4669 else
4670 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
4671 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4672 nlh->nlmsg_seq, 0);
4673 if (err < 0) {
4674 kfree_skb(skb);
4675 goto errout;
4676 }
4678 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4679 errout:
4680 return err;
4681 }
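/* Notify RTNLGRP_IPV6_ROUTE listeners of a route change. The skb is
 * sized by rt6_nlmsg_size(), so -EMSGSIZE from rt6_fill_node() would
 * indicate a sizing bug (hence the WARN_ON below).
 */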
4683 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
4684 unsigned int nlm_flags)
4685 {
4686 struct sk_buff *skb;
4687 struct net *net = info->nl_net;
4688 u32 seq;
4689 int err;
4691 err = -ENOBUFS;
4692 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4694 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4695 if (!skb)
4696 goto errout;
4698 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
4699 event, info->portid, seq, nlm_flags);
4700 if (err < 0) {
4701 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4702 WARN_ON(err == -EMSGSIZE);
4703 kfree_skb(skb);
4704 goto errout;
4705 }
4706 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4707 info->nlh, gfp_any());
4708 return;
4709 errout:
4710 if (err < 0)
4711 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4712 }
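/* Loopback netdev notifier: bind the per-netns special routes (null,
 * and with multiple tables prohibit/blackhole) to the loopback device
 * on NETDEV_REGISTER, and drop their idev references on
 * NETDEV_UNREGISTER.
 */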
4714 static int ip6_route_dev_notify(struct notifier_block *this,
4715 unsigned long event, void *ptr)
4716 {
4717 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4718 struct net *net = dev_net(dev);
4720 if (!(dev->flags & IFF_LOOPBACK))
4721 return NOTIFY_OK;
4723 if (event == NETDEV_REGISTER) {
4724 net->ipv6.ip6_null_entry->dst.dev = dev;
4725 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
4726 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4727 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
4728 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
4729 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
4730 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
4731 #endif
4732 } else if (event == NETDEV_UNREGISTER &&
4733 dev->reg_state != NETREG_UNREGISTERED) {
4734 /* NETDEV_UNREGISTER can be fired multiple times by
4735 * netdev_wait_allrefs(). Make sure we only call this once.
4736 */
4737 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
4738 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4739 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
4740 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
4741 #endif
4742 }
4744 return NOTIFY_OK;
4745 }
4747 /*
4748 * /proc
4749 */
4751 #ifdef CONFIG_PROC_FS
4753 static const struct file_operations ipv6_route_proc_fops = {
4754 .open = ipv6_route_open,
4755 .read = seq_read,
4756 .llseek = seq_lseek,
4757 .release = seq_release_net,
4758 };
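/* Backs /proc/net/rt6_stats: seven space-separated hex counters, in
 * the order printed by rt6_stats_seq_show() below.
 */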
4760 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
4761 {
4762 struct net *net = (struct net *)seq->private;
4763 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
4764 net->ipv6.rt6_stats->fib_nodes,
4765 net->ipv6.rt6_stats->fib_route_nodes,
4766 atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
4767 net->ipv6.rt6_stats->fib_rt_entries,
4768 net->ipv6.rt6_stats->fib_rt_cache,
4769 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
4770 net->ipv6.rt6_stats->fib_discarded_routes);
4772 return 0;
4773 }
4775 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
4776 {
4777 return single_open_net(inode, file, rt6_stats_seq_show);
4778 }
4780 static const struct file_operations rt6_stats_seq_fops = {
4781 .open = rt6_stats_seq_open,
4782 .read = seq_read,
4783 .llseek = seq_lseek,
4784 .release = single_release_net,
4785 };
4786 #endif /* CONFIG_PROC_FS */
4788 #ifdef CONFIG_SYSCTL
4790 static
4791 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4792 void __user *buffer, size_t *lenp, loff_t *ppos)
4793 {
4794 struct net *net;
4795 int delay;
4796 if (!write)
4797 return -EINVAL;
4799 net = (struct net *)ctl->extra1;
4800 delay = net->ipv6.sysctl.flush_delay;
4801 proc_dointvec(ctl, write, buffer, lenp, ppos);
4802 fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
4803 return 0;
4804 }
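/* Template for the per-netns route sysctls, typically exposed under
 * /proc/sys/net/ipv6/route/. "flush" is write-only (0200); writing to
 * it (e.g. "echo 1 > /proc/sys/net/ipv6/route/flush") triggers
 * fib6_run_gc() via ipv6_sysctl_rtcache_flush() above.
 */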
4806 struct ctl_table ipv6_route_table_template[] = {
4807 {
4808 .procname = "flush",
4809 .data = &init_net.ipv6.sysctl.flush_delay,
4810 .maxlen = sizeof(int),
4811 .mode = 0200,
4812 .proc_handler = ipv6_sysctl_rtcache_flush
4813 },
4814 {
4815 .procname = "gc_thresh",
4816 .data = &ip6_dst_ops_template.gc_thresh,
4817 .maxlen = sizeof(int),
4818 .mode = 0644,
4819 .proc_handler = proc_dointvec,
4820 },
4821 {
4822 .procname = "max_size",
4823 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
4824 .maxlen = sizeof(int),
4825 .mode = 0644,
4826 .proc_handler = proc_dointvec,
4827 },
4828 {
4829 .procname = "gc_min_interval",
4830 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4831 .maxlen = sizeof(int),
4832 .mode = 0644,
4833 .proc_handler = proc_dointvec_jiffies,
4834 },
4835 {
4836 .procname = "gc_timeout",
4837 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
4838 .maxlen = sizeof(int),
4839 .mode = 0644,
4840 .proc_handler = proc_dointvec_jiffies,
4841 },
4842 {
4843 .procname = "gc_interval",
4844 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
4845 .maxlen = sizeof(int),
4846 .mode = 0644,
4847 .proc_handler = proc_dointvec_jiffies,
4848 },
4849 {
4850 .procname = "gc_elasticity",
4851 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
4852 .maxlen = sizeof(int),
4853 .mode = 0644,
4854 .proc_handler = proc_dointvec,
4855 },
4856 {
4857 .procname = "mtu_expires",
4858 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
4859 .maxlen = sizeof(int),
4860 .mode = 0644,
4861 .proc_handler = proc_dointvec_jiffies,
4862 },
4863 {
4864 .procname = "min_adv_mss",
4865 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
4866 .maxlen = sizeof(int),
4867 .mode = 0644,
4868 .proc_handler = proc_dointvec,
4869 },
4870 {
4871 .procname = "gc_min_interval_ms",
4872 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4873 .maxlen = sizeof(int),
4874 .mode = 0644,
4875 .proc_handler = proc_dointvec_ms_jiffies,
4876 },
4877 { }
4878 };
4880 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
4881 {
4882 struct ctl_table *table;
4884 table = kmemdup(ipv6_route_table_template,
4885 sizeof(ipv6_route_table_template),
4886 GFP_KERNEL);
4888 if (table) {
4889 table[0].data = &net->ipv6.sysctl.flush_delay;
4890 table[0].extra1 = net;
4891 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
4892 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
4893 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4894 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
4895 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
4896 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
4897 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
4898 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
4899 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4901 /* Don't export sysctls to unprivileged users */
4902 if (net->user_ns != &init_user_ns)
4903 table[0].procname = NULL;
4904 }
4906 return table;
4907 }
4908 #endif
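/* Per-namespace init: clone dst_ops and the special route templates,
 * then seed the route-GC sysctl defaults. Failures unwind in reverse
 * order through the labels at the end.
 */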
4910 static int __net_init ip6_route_net_init(struct net *net)
4911 {
4912 int ret = -ENOMEM;
4914 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
4915 sizeof(net->ipv6.ip6_dst_ops));
4917 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
4918 goto out_ip6_dst_ops;
4920 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
4921 sizeof(*net->ipv6.ip6_null_entry),
4922 GFP_KERNEL);
4923 if (!net->ipv6.ip6_null_entry)
4924 goto out_ip6_dst_entries;
4925 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4926 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
4927 ip6_template_metrics, true);
4929 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4930 net->ipv6.fib6_has_custom_rules = false;
4931 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
4932 sizeof(*net->ipv6.ip6_prohibit_entry),
4933 GFP_KERNEL);
4934 if (!net->ipv6.ip6_prohibit_entry)
4935 goto out_ip6_null_entry;
4936 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4937 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
4938 ip6_template_metrics, true);
4940 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
4941 sizeof(*net->ipv6.ip6_blk_hole_entry),
4942 GFP_KERNEL);
4943 if (!net->ipv6.ip6_blk_hole_entry)
4944 goto out_ip6_prohibit_entry;
4945 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4946 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
4947 ip6_template_metrics, true);
4948 #endif
4950 net->ipv6.sysctl.flush_delay = 0;
4951 net->ipv6.sysctl.ip6_rt_max_size = 4096;
4952 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
4953 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
4954 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
4955 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
4956 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
4957 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
4959 net->ipv6.ip6_rt_gc_expire = 30*HZ;
4961 ret = 0;
4962 out:
4963 return ret;
4965 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4966 out_ip6_prohibit_entry:
4967 kfree(net->ipv6.ip6_prohibit_entry);
4968 out_ip6_null_entry:
4969 kfree(net->ipv6.ip6_null_entry);
4970 #endif
4971 out_ip6_dst_entries:
4972 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
4973 out_ip6_dst_ops:
4974 goto out;
4975 }
4977 static void __net_exit ip6_route_net_exit(struct net *net)
4978 {
4979 kfree(net->ipv6.ip6_null_entry);
4980 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4981 kfree(net->ipv6.ip6_prohibit_entry);
4982 kfree(net->ipv6.ip6_blk_hole_entry);
4983 #endif
4984 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
4985 }
4987 static int __net_init ip6_route_net_init_late(struct net *net)
4988 {
4989 #ifdef CONFIG_PROC_FS
4990 proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
4991 proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
4992 #endif
4993 return 0;
4994 }
4996 static void __net_exit ip6_route_net_exit_late(struct net *net)
4997 {
4998 #ifdef CONFIG_PROC_FS
4999 remove_proc_entry("ipv6_route", net->proc_net);
5000 remove_proc_entry("rt6_stats", net->proc_net);
5001 #endif
5002 }
5004 static struct pernet_operations ip6_route_net_ops = {
5005 .init = ip6_route_net_init,
5006 .exit = ip6_route_net_exit,
5007 };
5009 static int __net_init ipv6_inetpeer_init(struct net *net)
5010 {
5011 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5013 if (!bp)
5014 return -ENOMEM;
5015 inet_peer_base_init(bp);
5016 net->ipv6.peers = bp;
5017 return 0;
5018 }
5020 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5021 {
5022 struct inet_peer_base *bp = net->ipv6.peers;
5024 net->ipv6.peers = NULL;
5025 inetpeer_invalidate_tree(bp);
5026 kfree(bp);
5027 }
5029 static struct pernet_operations ipv6_inetpeer_ops = {
5030 .init = ipv6_inetpeer_init,
5031 .exit = ipv6_inetpeer_exit,
5032 };
5034 static struct pernet_operations ip6_route_net_late_ops = {
5035 .init = ip6_route_net_init_late,
5036 .exit = ip6_route_net_exit_late,
5037 };
5039 static struct notifier_block ip6_route_dev_notifier = {
5040 .notifier_call = ip6_route_dev_notify,
5041 .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
5042 };
5044 void __init ip6_route_init_special_entries(void)
5045 {
5046 /* The loopback device is registered before this code runs, so the
5047 * loopback reference in rt6_info is not taken automatically; take it
5048 * manually for init_net */
5049 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5050 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5051 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5052 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5053 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5054 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5055 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5056 #endif
5057 }
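/* Module init: create the ip6_dst_cache slab, register the pernet
 * subsystems, fib6/xfrm6/fib6-rules, the RTM_{NEW,DEL,GET}ROUTE
 * handlers and the netdev notifier, then set up the per-CPU uncached
 * route lists. Each error label unwinds everything registered so far.
 */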
5059 int __init ip6_route_init(void)
5060 {
5061 int ret;
5062 int cpu;
5064 ret = -ENOMEM;
5065 ip6_dst_ops_template.kmem_cachep =
5066 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
5067 SLAB_HWCACHE_ALIGN, NULL);
5068 if (!ip6_dst_ops_template.kmem_cachep)
5069 goto out;
5071 ret = dst_entries_init(&ip6_dst_blackhole_ops);
5072 if (ret)
5073 goto out_kmem_cache;
5075 ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5076 if (ret)
5077 goto out_dst_entries;
5079 ret = register_pernet_subsys(&ip6_route_net_ops);
5080 if (ret)
5081 goto out_register_inetpeer;
5083 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5085 ret = fib6_init();
5086 if (ret)
5087 goto out_register_subsys;
5089 ret = xfrm6_init();
5090 if (ret)
5091 goto out_fib6_init;
5093 ret = fib6_rules_init();
5094 if (ret)
5095 goto xfrm6_init;
5097 ret = register_pernet_subsys(&ip6_route_net_late_ops);
5098 if (ret)
5099 goto fib6_rules_init;
5101 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5102 inet6_rtm_newroute, NULL, 0);
5103 if (ret < 0)
5104 goto out_register_late_subsys;
5106 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5107 inet6_rtm_delroute, NULL, 0);
5108 if (ret < 0)
5109 goto out_register_late_subsys;
5111 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5112 inet6_rtm_getroute, NULL,
5113 RTNL_FLAG_DOIT_UNLOCKED);
5114 if (ret < 0)
5115 goto out_register_late_subsys;
5117 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5118 if (ret)
5119 goto out_register_late_subsys;
5121 for_each_possible_cpu(cpu) {
5122 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5124 INIT_LIST_HEAD(&ul->head);
5125 spin_lock_init(&ul->lock);
5126 }
5128 out:
5129 return ret;
5131 out_register_late_subsys:
5132 rtnl_unregister_all(PF_INET6);
5133 unregister_pernet_subsys(&ip6_route_net_late_ops);
5134 fib6_rules_init:
5135 fib6_rules_cleanup();
5136 xfrm6_init:
5137 xfrm6_fini();
5138 out_fib6_init:
5139 fib6_gc_cleanup();
5140 out_register_subsys:
5141 unregister_pernet_subsys(&ip6_route_net_ops);
5142 out_register_inetpeer:
5143 unregister_pernet_subsys(&ipv6_inetpeer_ops);
5144 out_dst_entries:
5145 dst_entries_destroy(&ip6_dst_blackhole_ops);
5146 out_kmem_cache:
5147 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5148 goto out;
5149 }
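/* Teardown in the reverse order of ip6_route_init(). */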
5151 void ip6_route_cleanup(void)
5152 {
5153 unregister_netdevice_notifier(&ip6_route_dev_notifier);
5154 unregister_pernet_subsys(&ip6_route_net_late_ops);
5155 fib6_rules_cleanup();
5156 xfrm6_fini();
5157 fib6_gc_cleanup();
5158 unregister_pernet_subsys(&ipv6_inetpeer_ops);
5159 unregister_pernet_subsys(&ip6_route_net_ops);
5160 dst_entries_destroy(&ip6_dst_blackhole_ops);
5161 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5162 }