Linux 4.13.16
[linux/fpc-iii.git] / net / ipv6 / route.c
blob44eebe738c09b2bddd55c5a676f7f3e85d5907d5
1 /*
2 * Linux INET6 implementation
3 * FIB front-end.
5 * Authors:
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
14 /* Changes:
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
23 * Ville Nuorvala
24 * Fixed routing subtrees.
27 #define pr_fmt(fmt) "IPv6: " fmt
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
48 #include <net/snmp.h>
49 #include <net/ipv6.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
54 #include <net/tcp.h>
55 #include <linux/rtnetlink.h>
56 #include <net/dst.h>
57 #include <net/dst_metadata.h>
58 #include <net/xfrm.h>
59 #include <net/netevent.h>
60 #include <net/netlink.h>
61 #include <net/nexthop.h>
62 #include <net/lwtunnel.h>
63 #include <net/ip_tunnels.h>
64 #include <net/l3mdev.h>
65 #include <trace/events/fib6.h>
67 #include <linux/uaccess.h>
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
/* Outcome of the neighbour (NUD) reachability check used when scoring
 * routes; negative values are failures of increasing severity, so they
 * can be compared numerically by callers like rt6_score_route().
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,		/* do not use this route at all */
	RT6_NUD_FAIL_PROBE = -2,	/* unusable, neighbour probing failed */
	RT6_NUD_FAIL_DO_RR = -1,	/* unusable, fall back to round-robin */
	RT6_NUD_SUCCEED = 1		/* neighbour (probably) reachable */
};
80 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
81 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
82 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
83 static unsigned int ip6_mtu(const struct dst_entry *dst);
84 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
85 static void ip6_dst_destroy(struct dst_entry *);
86 static void ip6_dst_ifdown(struct dst_entry *,
87 struct net_device *dev, int how);
88 static int ip6_dst_gc(struct dst_ops *ops);
90 static int ip6_pkt_discard(struct sk_buff *skb);
91 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
92 static int ip6_pkt_prohibit(struct sk_buff *skb);
93 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
94 static void ip6_link_failure(struct sk_buff *skb);
95 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
96 struct sk_buff *skb, u32 mtu);
97 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
98 struct sk_buff *skb);
99 static void rt6_dst_from_metrics_check(struct rt6_info *rt);
100 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
101 static size_t rt6_nlmsg_size(struct rt6_info *rt);
102 static int rt6_fill_node(struct net *net,
103 struct sk_buff *skb, struct rt6_info *rt,
104 struct in6_addr *dst, struct in6_addr *src,
105 int iif, int type, u32 portid, u32 seq,
106 unsigned int flags);
108 #ifdef CONFIG_IPV6_ROUTE_INFO
109 static struct rt6_info *rt6_add_route_info(struct net *net,
110 const struct in6_addr *prefix, int prefixlen,
111 const struct in6_addr *gwaddr,
112 struct net_device *dev,
113 unsigned int pref);
114 static struct rt6_info *rt6_get_route_info(struct net *net,
115 const struct in6_addr *prefix, int prefixlen,
116 const struct in6_addr *gwaddr,
117 struct net_device *dev);
118 #endif
/* Per-cpu list of RTF_CACHE routes that live outside the fib6 tree,
 * so they can be found and re-homed when a device goes away.
 */
struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
/* Register an uncached (not fib6-tree-owned) route on the current CPU's
 * list so rt6_uncached_list_flush_dev() can re-home it when its device
 * disappears.
 */
static void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}
/* Unlink @rt from its per-cpu uncached list, if it was ever added.
 * Routes that were never added have an empty list_head (set up by
 * rt6_info_init()) and are skipped.
 */
static void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		spin_unlock_bh(&ul->lock);
	}
}
/* Called on device unregistration: re-point every uncached route that
 * references @dev (either via its inet6_dev or its dst.dev) at the
 * netns loopback device, fixing up the reference counts as we go.
 * A no-op when the disappearing device is loopback itself.
 */
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
/* Per-cpu clones share (and copy-on-write) the metrics of the parent
 * route they were cloned from (dst.from).
 */
static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
{
	return dst_metrics_write_ptr(rt->dst.from);
}

/* dst_ops->cow_metrics hook: pick the metrics storage strategy based on
 * the kind of route.  RTF_CACHE routes never COW their metrics.
 */
static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (rt->rt6i_flags & RTF_PCPU)
		return rt6_pcpu_cow_metrics(rt);
	else if (rt->rt6i_flags & RTF_CACHE)
		return NULL;
	else
		return dst_cow_metrics_generic(dst, old);
}
/* Pick the address to resolve for @rt: prefer the route's gateway, fall
 * back to the packet's destination, then to the caller-supplied @daddr.
 */
static inline const void *choose_neigh_daddr(struct rt6_info *rt,
					     struct sk_buff *skb,
					     const void *daddr)
{
	struct in6_addr *p = &rt->rt6i_gateway;

	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}

/* dst_ops->neigh_lookup hook: find (or create) the neighbour entry for
 * the next hop of this route.
 */
static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
					  struct sk_buff *skb,
					  const void *daddr)
{
	struct rt6_info *rt = (struct rt6_info *) dst;
	struct neighbour *n;

	daddr = choose_neigh_daddr(rt, skb, daddr);
	n = __ipv6_neigh_lookup(dst->dev, daddr);
	if (n)
		return n;
	/* No cached entry: create one in the IPv6 neighbour table. */
	return neigh_create(&nd_tbl, daddr, dst->dev);
}

/* dst_ops->confirm_neigh hook: mark the next-hop neighbour as recently
 * confirmed.  Skipped for devices that do no neighbour resolution and
 * for multicast destinations.
 */
static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(rt, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}
/* dst_ops shared by all regular IPv6 routes (copied per netns). */
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	ipv6_cow_metrics,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};

/* MTU for blackhole routes: the stored metric, or the device MTU if
 * none was set.
 */
static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

/* Blackhole routes deliberately ignore PMTU updates and redirects. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

/* dst_ops for blackhole copies made by ip6_blackhole_route(). */
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_neigh_lookup,
};
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

/* Terminal route returned when a lookup matches nothing: discards the
 * packet and reports -ENETUNREACH.
 */
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

/* Like the null entry, but reports -EACCES (administratively
 * prohibited) to the sender.
 */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

/* Silent blackhole: discards packets without any notification. */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

#endif
/* Zero everything after the embedded dst_entry and set up the list
 * heads, so list_empty() checks work before any insertion.
 */
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_siblings);
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}

/* allocate dst with ip6_dst_ops */
static struct rt6_info *__ip6_dst_alloc(struct net *net,
					struct net_device *dev,
					int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt)
		rt6_info_init(rt);

	return rt;
}

/* Allocate a route together with its per-cpu clone cache.  Returns
 * NULL on any allocation failure (the partially built route is
 * released).
 */
struct rt6_info *ip6_dst_alloc(struct net *net,
			       struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);

	if (rt) {
		rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
		if (rt->rt6i_pcpu) {
			int cpu;

			for_each_possible_cpu(cpu) {
				struct rt6_info **p;

				p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
				/* no one shares rt */
				*p = NULL;
			}
		} else {
			/* Could not allocate the per-cpu cache: release
			 * the route rather than hand back a
			 * half-initialized entry.
			 */
			dst_release_immediate(&rt->dst);
			return NULL;
		}
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);
/* dst_ops->destroy hook: tear down everything a rt6_info owns — its
 * metrics, per-cpu cache, uncached-list membership, inet6_dev reference
 * and, last, the reference on its parent (dst.from).
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct dst_entry *from = dst->from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	free_percpu(rt->rt6i_pcpu);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	/* Drop the parent reference only after clearing the pointer. */
	dst->from = NULL;
	dst_release(from);
}
/* dst_ops->ifdown hook: when @dev goes down, move the route's inet6_dev
 * reference over to the netns loopback device so the disappearing
 * device's state can be released.
 */
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}
/* Expiry check for this route only; dst.expires is meaningful only
 * when RTF_EXPIRES is set.
 */
static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

/* Full expiry check: a clone (dst.from set) is also considered expired
 * when it has been obsoleted or when its parent route has expired.
 */
static bool rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (rt->dst.from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
		       rt6_check_expired((struct rt6_info *)rt->dst.from);
	}
	return false;
}
/* Multipath route selection:
 *   Hash based function using packet header and flowlabel.
 * Adapted from fib_info_hashfn()
 */
static int rt6_info_hash_nhsfn(unsigned int candidate_count,
			       const struct flowi6 *fl6)
{
	return get_hash_from_flowi6(fl6) % candidate_count;
}

/* Pick one route out of @match and its ECMP siblings based on the flow
 * hash, so a given flow consistently uses the same next hop.  Falls
 * back to @match if the selected sibling does not score as usable.
 */
static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
					     struct flowi6 *fl6, int oif,
					     int strict)
{
	struct rt6_info *sibling, *next_sibling;
	int route_choosen;

	route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
	/* Don't change the route, if route_choosen == 0
	 * (siblings does not include ourself)
	 */
	if (route_choosen)
		list_for_each_entry_safe(sibling, next_sibling,
				&match->rt6i_siblings, rt6i_siblings) {
			route_choosen--;
			if (route_choosen == 0) {
				if (rt6_score_route(sibling, oif, strict) < 0)
					break;
				match = sibling;
				break;
			}
		}
	return match;
}
/*
 *	Route lookup. Any table->tb6_lock is implied.
 */

/* Walk the routes hanging off a fib6 node and pick the one that best
 * matches the requested output interface (@oif) or source address
 * (@saddr).  A loopback route whose inet6_dev belongs to @oif is kept
 * as a fallback in @local.  With RT6_LOOKUP_F_IFACE and no match at
 * all, the null entry is returned instead of the head route.
 */
static inline struct rt6_info *rt6_device_match(struct net *net,
						    struct rt6_info *rt,
						    const struct in6_addr *saddr,
						    int oif,
						    int flags)
{
	struct rt6_info *local = NULL;
	struct rt6_info *sprt;

	/* Nothing to filter on without an oif or a source address. */
	if (!oif && ipv6_addr_any(saddr))
		goto out;

	for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
		struct net_device *dev = sprt->dst.dev;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
			if (dev->flags & IFF_LOOPBACK) {
				if (!sprt->rt6i_idev ||
				    sprt->rt6i_idev->dev->ifindex != oif) {
					if (flags & RT6_LOOKUP_F_IFACE)
						continue;
					if (local &&
					    local->rt6i_idev->dev->ifindex == oif)
						continue;
				}
				local = sprt;
			}
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif) {
		if (local)
			return local;

		if (flags & RT6_LOOKUP_F_IFACE)
			return net->ipv6.ip6_null_entry;
	}
out:
	return rt;
}
#ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred-work context for a single router reachability probe. */
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

/* Workqueue handler: send a neighbour solicitation to the target's
 * solicited-node multicast address, then drop the device reference
 * taken by rt6_probe().
 */
static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}

/* Schedule a reachability probe for @rt's gateway if its neighbour
 * entry is absent or stale and the per-device probe interval has
 * elapsed.  The actual NS transmission is deferred to a workqueue.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct __rt6_probe_work *work;
	struct neighbour *neigh;
	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
		return;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		work = NULL;
		write_lock(&neigh->lock);
		/* Re-check under the lock and rate-limit using the
		 * per-device rtr_probe_interval.
		 */
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated +
			       rt->rt6i_idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		/* No neighbour entry yet: always worth probing. */
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = rt->rt6i_gateway;
		dev_hold(rt->dst.dev);
		work->dev = rt->dst.dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
607 * Default Router Selection (RFC 2461 6.3.6)
609 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
611 struct net_device *dev = rt->dst.dev;
612 if (!oif || dev->ifindex == oif)
613 return 2;
614 if ((dev->flags & IFF_LOOPBACK) &&
615 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
616 return 1;
617 return 0;
/* Check neighbour reachability of @rt's gateway and map it onto a
 * rt6_nud_state score.  Routes without a gateway trivially succeed.
 */
static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
{
	struct neighbour *neigh;
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;

	if (rt->rt6i_flags & RTF_NONEXTHOP ||
	    !(rt->rt6i_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		/* With router preference support, anything short of a
		 * hard failure still counts as usable; NUD_FAILED asks
		 * for a probe instead.
		 */
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
/* Combined route score: device match in the low bits, router
 * preference (if configured) above them.  Returns a negative
 * rt6_nud_state value when the route is unusable under @strict.
 */
static int rt6_score_route(struct rt6_info *rt, int oif,
			   int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}
/* Compare @rt against the current best candidate (@match, score
 * *@mpri) and return the better of the two.  Sets *@do_rr when the
 * selected route asked for round-robin fallback.
 */
static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
				   int *mpri, struct rt6_info *match,
				   bool *do_rr)
{
	int m;
	bool match_do_rr = false;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *dev = rt->dst.dev;

	/* Optionally skip routes whose device lost its carrier. */
	if (dev && !netif_carrier_ok(dev) &&
	    idev->cnf.ignore_routes_with_linkdown &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (rt6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}
/* Find the best route among the node's leaf list, starting the scan at
 * the round-robin head (@rr_head) and wrapping around within the group
 * of routes sharing @metric.  If nothing in that group is usable, the
 * remaining (worse-metric) routes from @cont onward are considered.
 */
static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
				     struct rt6_info *rr_head,
				     u32 metric, int oif, int strict,
				     bool *do_rr)
{
	struct rt6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	/* First pass: rr_head to the end of the current metric group. */
	for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	/* Second pass: leaf up to (but excluding) rr_head. */
	for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	for (rt = cont; rt; rt = rt->dst.rt6_next)
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
/* Select the best route from node @fn, maintaining the per-node
 * round-robin pointer (fn->rr_ptr) used when equally-scored routes ask
 * for rotation.  Falls back to the null entry when nothing matches.
 */
static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
{
	struct rt6_info *match, *rt0;
	struct net *net;
	bool do_rr = false;

	rt0 = fn->rr_ptr;
	if (!rt0)
		fn->rr_ptr = rt0 = fn->leaf;
	/* NOTE(review): rt0 is dereferenced below without a NULL check,
	 * so this relies on fn->leaf being non-NULL for any node passed
	 * in — TODO confirm against fib6 tree invariants.
	 */

	match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct rt6_info *next = rt0->dst.rt6_next;

		/* no entries matched; do round-robin */
		if (!next || next->rt6i_metric != rt0->rt6i_metric)
			next = fn->leaf;

		if (next != rt0)
			fn->rr_ptr = next;
	}

	net = dev_net(rt0->dst.dev);
	return match ? match : net->ipv6.ip6_null_entry;
}
/* True for gateway routes and RTF_NONEXTHOP routes — the cases where a
 * cached clone must not narrow the source prefix.
 */
static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
{
	return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
}
#ifdef CONFIG_IPV6_ROUTE_INFO
/* Process a Route Information option (RFC 4191) received in a Router
 * Advertisement from @gwaddr: validate it, then add, refresh or (on a
 * zero lifetime) delete the corresponding RTF_ROUTEINFO route.
 * Returns 0 on success or -EINVAL for a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	/* A zero prefix length advertises a default route. */
	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	/* Zero lifetime withdraws an existing route. */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			rt6_clean_expires(rt);
		else
			rt6_set_expires(rt, jiffies + HZ * lifetime);

		ip6_rt_put(rt);
	}
	return 0;
}
#endif
/* Walk from @fn towards the tree root until a node that carries routes
 * (RTN_RTINFO) is found, descending into source-routed subtrees along
 * the way.  Returns NULL once the top-level root is reached.
 */
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = fn->parent;
		if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
			fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}
/* Simple (no clone / no pcpu) policy lookup in one table, used by
 * rt6_lookup() and ip6_route_lookup().  Takes a usage reference on the
 * returned route, which may be the null entry.
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6, int flags)
{
	struct fib6_node *fn;
	struct rt6_info *rt;

	read_lock_bh(&table->tb6_lock);
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	rt = fn->leaf;
	rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
	if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
		rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
	if (rt == net->ipv6.ip6_null_entry) {
		/* No usable route in this node: back up and retry. */
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}
	dst_use(&rt->dst, jiffies);
	read_unlock_bh(&table->tb6_lock);

	trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);

	return rt;

}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				    int flags)
{
	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
/* Convenience lookup by destination/source/oif.  Returns a referenced
 * route on success, or NULL (after releasing the dst) when the lookup
 * resolved to an error route.
 */
struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);
/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes new route entry, the addition fails by any reason the
 * route is released.
 * Caller must hold dst before calling it.
 */

/* Insert @rt into its table under the table write lock. */
static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
			struct mx6_config *mxc,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->rt6i_table;
	write_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
	write_unlock_bh(&table->tb6_lock);

	return err;
}

/* Insert @rt with default netlink info and no extra metrics. */
int ip6_ins_rt(struct rt6_info *rt)
{
	struct nl_info info = {	.nl_net = dev_net(rt->dst.dev), };
	struct mx6_config mxc = { .mx = NULL, };

	/* Hold dst to account for the reference from the fib6 tree */
	dst_hold(&rt->dst);
	return __ip6_ins_rt(rt, &info, &mxc, NULL);
}
/* Create an RTF_CACHE host-route (/128) clone of @ort for @daddr.
 * Returns the new route with its allocation reference, or NULL on
 * allocation failure.
 */
static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	/* Clone from the original tree route, never from another clone. */
	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = (struct rt6_info *)ort->dst.from;

	rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev, 0);

	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->rt6i_metric = 0;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		/* Narrowing a wider on-link prefix to this exact
		 * address makes the clone an anycast route.
		 */
		if (ort->rt6i_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
/* Allocate an RTF_PCPU clone of @rt for the per-cpu route cache. */
static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt;

	pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
				  rt->dst.dev, rt->dst.flags);

	if (!pcpu_rt)
		return NULL;
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with read_lock_bh(&tb6_lock) acquired */
static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	/* Return the cached clone (with a reference) if one exists. */
	if (pcpu_rt) {
		dst_hold(&pcpu_rt->dst);
		rt6_dst_from_metrics_check(pcpu_rt);
	}
	return pcpu_rt;
}
/* Create and install a per-cpu clone of @rt, handling the races where
 * another CPU installed one first (cmpxchg loser) or @rt was removed
 * from the tree meanwhile.  Always returns a route with a reference
 * held (possibly @rt itself or the null entry).
 */
static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
{
	struct fib6_table *table = rt->rt6i_table;
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		struct net *net = dev_net(rt->dst.dev);

		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	read_lock_bh(&table->tb6_lock);
	if (rt->rt6i_pcpu) {
		p = this_cpu_ptr(rt->rt6i_pcpu);
		prev = cmpxchg(p, NULL, pcpu_rt);
		if (prev) {
			/* If someone did it before us, return prev instead */
			dst_release_immediate(&pcpu_rt->dst);
			pcpu_rt = prev;
		}
	} else {
		/* rt has been removed from the fib6 tree
		 * before we have a chance to acquire the read_lock.
		 * In this case, don't brother to create a pcpu rt
		 * since rt is going away anyway.  The next
		 * dst_check() will trigger a re-lookup.
		 */
		dst_release_immediate(&pcpu_rt->dst);
		pcpu_rt = rt;
	}
	dst_hold(&pcpu_rt->dst);
	rt6_dst_from_metrics_check(pcpu_rt);
	read_unlock_bh(&table->tb6_lock);
	return pcpu_rt;
}
/* Main policy-routing lookup.  Resolves @fl6 in @table and returns a
 * referenced dst via one of three paths:
 *   - the route itself (null entry or an RTF_CACHE clone),
 *   - a new uncached RTF_CACHE clone (FLOWI_FLAG_KNOWN_NH without a
 *     gateway, where the skb daddr may differ from fl6->daddr), or
 *   - a per-cpu clone of the tree route.
 */
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6, int flags)
{
	struct fib6_node *fn, *saved_fn;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	/* Hosts (forwarding disabled) prefer reachable routers. */
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	read_lock_bh(&table->tb6_lock);

	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	rt = rt6_select(fn, oif, strict);
	if (rt->rt6i_nsiblings)
		rt = rt6_multipath_select(rt, fl6, oif, strict);
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}


	if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
		dst_use(&rt->dst, jiffies);
		read_unlock_bh(&table->tb6_lock);

		rt6_dst_from_metrics_check(rt);

		trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(rt->rt6i_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */

		struct rt6_info *uncached_rt;

		dst_use(&rt->dst, jiffies);
		read_unlock_bh(&table->tb6_lock);

		uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
		dst_release(&rt->dst);

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
		return uncached_rt;

	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		rt->dst.lastuse = jiffies;
		rt->dst.__use++;
		pcpu_rt = rt6_get_pcpu_route(rt);

		if (pcpu_rt) {
			read_unlock_bh(&table->tb6_lock);
		} else {
			/* We have to do the read_unlock first
			 * because rt6_make_pcpu_route() may trigger
			 * ip6_dst_gc() which will take the write_lock.
			 */
			dst_hold(&rt->dst);
			read_unlock_bh(&table->tb6_lock);
			pcpu_rt = rt6_make_pcpu_route(rt);
			dst_release(&rt->dst);
		}

		trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
		return pcpu_rt;

	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
/* Policy lookup for received packets: key on the input interface. */
static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
					    struct flowi6 *fl6, int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
}

struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6, int flags)
{
	/* Link-local/multicast destinations must honour the inbound
	 * interface, except on PIM register pseudo-devices.
	 */
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
/* Attach the routing decision for a received packet to its skb dst. */
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};

	/* Carry RX-side tunnel metadata (the tunnel key) into the
	 * lookup so lightweight-tunnel routes can match on it.
	 */
	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
	skb_dst_drop(skb);
	skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
}
/* Policy lookup for locally generated packets: key on the output
 * interface.
 */
static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
					     struct flowi6 *fl6, int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
}

struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	/* Link-local destinations may be resolved by an L3 master
	 * (VRF) device first.
	 */
	if (rt6_need_strict(&fl6->daddr)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		/* Let the socket's address-selection preferences steer
		 * source address selection.
		 */
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
/* Replace @dst_orig with a blackhole copy that silently discards
 * traffic (used e.g. while an xfrm state is being resolved).  Always
 * consumes the caller's reference on @dst_orig; returns the new dst or
 * ERR_PTR(-ENOMEM).
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
	if (rt) {
		rt6_info_init(rt);

		new = &rt->dst;
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
		rt->rt6i_metric = 0;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}
1281 * Destination cache support functions
1284 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1286 if (rt->dst.from &&
1287 dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1288 dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
1291 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1293 u32 rt_cookie = 0;
1295 if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
1296 return NULL;
1298 if (rt6_check_expired(rt))
1299 return NULL;
1301 return &rt->dst;
1304 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1306 if (!__rt6_check_expired(rt) &&
1307 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1308 rt6_check((struct rt6_info *)(rt->dst.from), cookie))
1309 return &rt->dst;
1310 else
1311 return NULL;
1314 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1316 struct rt6_info *rt;
1318 rt = (struct rt6_info *) dst;
1320 /* All IPV6 dsts are created with ->obsolete set to the value
1321 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1322 * into this function always.
1325 rt6_dst_from_metrics_check(rt);
1327 if (rt->rt6i_flags & RTF_PCPU ||
1328 (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from))
1329 return rt6_dst_from_check(rt, cookie);
1330 else
1331 return rt6_check(rt, cookie);
1334 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1336 struct rt6_info *rt = (struct rt6_info *) dst;
1338 if (rt) {
1339 if (rt->rt6i_flags & RTF_CACHE) {
1340 if (rt6_check_expired(rt)) {
1341 ip6_del_rt(rt);
1342 dst = NULL;
1344 } else {
1345 dst_release(dst);
1346 dst = NULL;
1349 return dst;
1352 static void ip6_link_failure(struct sk_buff *skb)
1354 struct rt6_info *rt;
1356 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1358 rt = (struct rt6_info *) skb_dst(skb);
1359 if (rt) {
1360 if (rt->rt6i_flags & RTF_CACHE) {
1361 if (dst_hold_safe(&rt->dst))
1362 ip6_del_rt(rt);
1363 } else {
1364 struct fib6_node *fn;
1366 rcu_read_lock();
1367 fn = rcu_dereference(rt->rt6i_node);
1368 if (fn && (rt->rt6i_flags & RTF_DEFAULT))
1369 fn->fn_sernum = -1;
1370 rcu_read_unlock();
1375 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
1377 struct net *net = dev_net(rt->dst.dev);
1379 rt->rt6i_flags |= RTF_MODIFIED;
1380 rt->rt6i_pmtu = mtu;
1381 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1384 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
1386 return !(rt->rt6i_flags & RTF_CACHE) &&
1387 (rt->rt6i_flags & RTF_PCPU ||
1388 rcu_access_pointer(rt->rt6i_node));
/* Core PMTU update.  Apply a newly learned path MTU @mtu to @dst; the
 * flow's addresses come either from an IPv6 header @iph (ICMPv6 Packet
 * Too Big path) or from a connected socket @sk.  Depending on the route
 * type, the MTU is either recorded in place or stored in a newly
 * inserted cached host-route clone.
 */
1391 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
1392 const struct ipv6hdr *iph, u32 mtu)
1394 const struct in6_addr *daddr, *saddr;
1395 struct rt6_info *rt6 = (struct rt6_info *)dst;
/* Local delivery and locked-MTU routes never take PMTU updates. */
1397 if (rt6->rt6i_flags & RTF_LOCAL)
1398 return;
1400 if (dst_metric_locked(dst, RTAX_MTU))
1401 return;
/* Pick the flow addresses: header first, then socket, else none. */
1403 if (iph) {
1404 daddr = &iph->daddr;
1405 saddr = &iph->saddr;
1406 } else if (sk) {
1407 daddr = &sk->sk_v6_daddr;
1408 saddr = &inet6_sk(sk)->saddr;
1409 } else {
1410 daddr = NULL;
1411 saddr = NULL;
1413 dst_confirm_neigh(dst, daddr);
/* Clamp to the IPv6 minimum; ignore updates that would not shrink
 * the current MTU.
 */
1414 mtu = max_t(u32, mtu, IPV6_MIN_MTU);
1415 if (mtu >= dst_mtu(dst))
1416 return;
1418 if (!rt6_cache_allowed_for_pmtu(rt6)) {
/* Cached clone (or unlinked route): update it directly. */
1419 rt6_do_update_pmtu(rt6, mtu);
1420 } else if (daddr) {
/* Otherwise install a cached host-route clone carrying the MTU. */
1421 struct rt6_info *nrt6;
1423 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
1424 if (nrt6) {
1425 rt6_do_update_pmtu(nrt6, mtu);
1427 /* ip6_ins_rt(nrt6) will bump the
1428 * rt6->rt6i_node->fn_sernum
1429 * which will fail the next rt6_check() and
1430 * invalidate the sk->sk_dst_cache.
1432 ip6_ins_rt(nrt6);
1433 /* Release the reference taken in
1434 * ip6_rt_cache_alloc()
1436 dst_release(&nrt6->dst);
1441 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1442 struct sk_buff *skb, u32 mtu)
1444 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
1447 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1448 int oif, u32 mark, kuid_t uid)
1450 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1451 struct dst_entry *dst;
1452 struct flowi6 fl6;
1454 memset(&fl6, 0, sizeof(fl6));
1455 fl6.flowi6_oif = oif;
1456 fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
1457 fl6.daddr = iph->daddr;
1458 fl6.saddr = iph->saddr;
1459 fl6.flowlabel = ip6_flowinfo(iph);
1460 fl6.flowi6_uid = uid;
1462 dst = ip6_route_output(net, NULL, &fl6);
1463 if (!dst->error)
1464 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
1465 dst_release(dst);
1467 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
/* PMTU update for a socket: apply the route update, then refresh the
 * socket's cached dst if it has become invalid.
 */
1469 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1471 struct dst_entry *dst;
1473 ip6_update_pmtu(skb, sock_net(sk), mtu,
1474 sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
/* Nothing to refresh if there is no cached dst or it still checks out. */
1476 dst = __sk_dst_get(sk);
1477 if (!dst || !dst->obsolete ||
1478 dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
1479 return;
/* Only update when the socket is not owned by user context and the
 * destination is genuinely IPv6 (not a v4-mapped address).
 */
1481 bh_lock_sock(sk);
1482 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
1483 ip6_datagram_dst_update(sk, false);
1484 bh_unlock_sock(sk);
1486 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1488 /* Handle redirects */
/* Flow key used while validating an ICMPv6 redirect: the ordinary
 * flowi6 plus the address of the router that sent the redirect, which
 * must match the current route's nexthop (RFC 4861).
 */
1489 struct ip6rd_flowi {
1490 struct flowi6 fl6;
1491 struct in6_addr gateway;
/* Table-lookup callback for redirect processing.  @fl6 is really the
 * embedded flowi6 of a struct ip6rd_flowi; the search accepts only the
 * route whose nexthop matches the redirecting router.  Returns the
 * matching route (or the null entry) with a dst reference held.
 */
1494 static struct rt6_info *__ip6_route_redirect(struct net *net,
1495 struct fib6_table *table,
1496 struct flowi6 *fl6,
1497 int flags)
1499 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1500 struct rt6_info *rt;
1501 struct fib6_node *fn;
1503 /* Get the "current" route for this destination and
1504 * check if the redirect has come from appropriate router.
1506 * RFC 4861 specifies that redirects should only be
1507 * accepted if they come from the nexthop to the target.
1508 * Due to the way the routes are chosen, this notion
1509 * is a bit fuzzy and one might need to check all possible
1510 * routes.
1513 read_lock_bh(&table->tb6_lock);
1514 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1515 restart:
1516 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1517 if (rt6_check_expired(rt))
1518 continue;
1519 if (rt->dst.error)
1520 break;
1521 if (!(rt->rt6i_flags & RTF_GATEWAY))
1522 continue;
1523 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1524 continue;
/* Accept only the route whose nexthop is the redirecting router. */
1525 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1526 continue;
1527 break;
1530 if (!rt)
1531 rt = net->ipv6.ip6_null_entry;
1532 else if (rt->dst.error) {
1533 rt = net->ipv6.ip6_null_entry;
1534 goto out;
/* No usable match at this node: back up the tree and rescan. */
1537 if (rt == net->ipv6.ip6_null_entry) {
1538 fn = fib6_backtrack(fn, &fl6->saddr);
1539 if (fn)
1540 goto restart;
1543 out:
/* Take the reference while still under tb6_lock so the entry cannot
 * disappear before the caller releases it.
 */
1544 dst_hold(&rt->dst);
1546 read_unlock_bh(&table->tb6_lock);
1548 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1549 return rt;
1552 static struct dst_entry *ip6_route_redirect(struct net *net,
1553 const struct flowi6 *fl6,
1554 const struct in6_addr *gateway)
1556 int flags = RT6_LOOKUP_F_HAS_SADDR;
1557 struct ip6rd_flowi rdfl;
1559 rdfl.fl6 = *fl6;
1560 rdfl.gateway = *gateway;
1562 return fib6_rule_lookup(net, &rdfl.fl6,
1563 flags, __ip6_route_redirect);
1566 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
1567 kuid_t uid)
1569 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1570 struct dst_entry *dst;
1571 struct flowi6 fl6;
1573 memset(&fl6, 0, sizeof(fl6));
1574 fl6.flowi6_iif = LOOPBACK_IFINDEX;
1575 fl6.flowi6_oif = oif;
1576 fl6.flowi6_mark = mark;
1577 fl6.daddr = iph->daddr;
1578 fl6.saddr = iph->saddr;
1579 fl6.flowlabel = ip6_flowinfo(iph);
1580 fl6.flowi6_uid = uid;
1582 dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
1583 rt6_do_redirect(dst, NULL, skb);
1584 dst_release(dst);
1586 EXPORT_SYMBOL_GPL(ip6_redirect);
1588 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
1589 u32 mark)
1591 const struct ipv6hdr *iph = ipv6_hdr(skb);
1592 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
1593 struct dst_entry *dst;
1594 struct flowi6 fl6;
1596 memset(&fl6, 0, sizeof(fl6));
1597 fl6.flowi6_iif = LOOPBACK_IFINDEX;
1598 fl6.flowi6_oif = oif;
1599 fl6.flowi6_mark = mark;
1600 fl6.daddr = msg->dest;
1601 fl6.saddr = iph->daddr;
1602 fl6.flowi6_uid = sock_net_uid(net, NULL);
1604 dst = ip6_route_redirect(net, &fl6, &iph->saddr);
1605 rt6_do_redirect(dst, NULL, skb);
1606 dst_release(dst);
/* Socket convenience wrapper around ip6_redirect(): use the socket's
 * bound device, mark and uid.
 */
1609 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
1611 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
1612 sk->sk_uid);
1614 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
1616 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1618 struct net_device *dev = dst->dev;
1619 unsigned int mtu = dst_mtu(dst);
1620 struct net *net = dev_net(dev);
1622 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1624 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1625 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1628 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1629 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1630 * IPV6_MAXPLEN is also valid and means: "any MSS,
1631 * rely only on pmtu discovery"
1633 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1634 mtu = IPV6_MAXPLEN;
1635 return mtu;
1638 static unsigned int ip6_mtu(const struct dst_entry *dst)
1640 const struct rt6_info *rt = (const struct rt6_info *)dst;
1641 unsigned int mtu = rt->rt6i_pmtu;
1642 struct inet6_dev *idev;
1644 if (mtu)
1645 goto out;
1647 mtu = dst_metric_raw(dst, RTAX_MTU);
1648 if (mtu)
1649 goto out;
1651 mtu = IPV6_MIN_MTU;
1653 rcu_read_lock();
1654 idev = __in6_dev_get(dst->dev);
1655 if (idev)
1656 mtu = idev->cnf.mtu6;
1657 rcu_read_unlock();
1659 out:
1660 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1662 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1665 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1666 struct flowi6 *fl6)
1668 struct dst_entry *dst;
1669 struct rt6_info *rt;
1670 struct inet6_dev *idev = in6_dev_get(dev);
1671 struct net *net = dev_net(dev);
1673 if (unlikely(!idev))
1674 return ERR_PTR(-ENODEV);
1676 rt = ip6_dst_alloc(net, dev, 0);
1677 if (unlikely(!rt)) {
1678 in6_dev_put(idev);
1679 dst = ERR_PTR(-ENOMEM);
1680 goto out;
1683 rt->dst.flags |= DST_HOST;
1684 rt->dst.output = ip6_output;
1685 rt->rt6i_gateway = fl6->daddr;
1686 rt->rt6i_dst.addr = fl6->daddr;
1687 rt->rt6i_dst.plen = 128;
1688 rt->rt6i_idev = idev;
1689 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1691 /* Add this dst into uncached_list so that rt6_ifdown() can
1692 * do proper release of the net_device
1694 rt6_uncached_list_add(rt);
1696 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1698 out:
1699 return dst;
1702 static int ip6_dst_gc(struct dst_ops *ops)
1704 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1705 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1706 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1707 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1708 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1709 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1710 int entries;
1712 entries = dst_entries_get_fast(ops);
1713 if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
1714 entries <= rt_max_size)
1715 goto out;
1717 net->ipv6.ip6_rt_gc_expire++;
1718 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
1719 entries = dst_entries_get_slow(ops);
1720 if (entries < ops->gc_thresh)
1721 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1722 out:
1723 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1724 return entries > rt_max_size;
1727 static int ip6_convert_metrics(struct mx6_config *mxc,
1728 const struct fib6_config *cfg)
1730 bool ecn_ca = false;
1731 struct nlattr *nla;
1732 int remaining;
1733 u32 *mp;
1735 if (!cfg->fc_mx)
1736 return 0;
1738 mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1739 if (unlikely(!mp))
1740 return -ENOMEM;
1742 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1743 int type = nla_type(nla);
1744 u32 val;
1746 if (!type)
1747 continue;
1748 if (unlikely(type > RTAX_MAX))
1749 goto err;
1751 if (type == RTAX_CC_ALGO) {
1752 char tmp[TCP_CA_NAME_MAX];
1754 nla_strlcpy(tmp, nla, sizeof(tmp));
1755 val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
1756 if (val == TCP_CA_UNSPEC)
1757 goto err;
1758 } else {
1759 val = nla_get_u32(nla);
1761 if (type == RTAX_HOPLIMIT && val > 255)
1762 val = 255;
1763 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
1764 goto err;
1766 mp[type - 1] = val;
1767 __set_bit(type - 1, mxc->mx_valid);
1770 if (ecn_ca) {
1771 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
1772 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
1775 mxc->mx = mp;
1776 return 0;
1777 err:
1778 kfree(mp);
1779 return -EINVAL;
1782 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
1783 struct fib6_config *cfg,
1784 const struct in6_addr *gw_addr)
1786 struct flowi6 fl6 = {
1787 .flowi6_oif = cfg->fc_ifindex,
1788 .daddr = *gw_addr,
1789 .saddr = cfg->fc_prefsrc,
1791 struct fib6_table *table;
1792 struct rt6_info *rt;
1793 int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE;
1795 table = fib6_get_table(net, cfg->fc_table);
1796 if (!table)
1797 return NULL;
1799 if (!ipv6_addr_any(&cfg->fc_prefsrc))
1800 flags |= RT6_LOOKUP_F_HAS_SADDR;
1802 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
1804 /* if table lookup failed, fall back to full lookup */
1805 if (rt == net->ipv6.ip6_null_entry) {
1806 ip6_rt_put(rt);
1807 rt = NULL;
1810 return rt;
/* Build (but do not insert) an rt6_info from a fib6_config supplied by
 * netlink or ioctl.  Validates the config, resolves the egress device,
 * idev and table, and classifies the route (reject/blackhole, gateway,
 * direct).  On success the returned route carries references on its
 * device and idev; on failure all acquired references are dropped and
 * an ERR_PTR is returned.
 */
1813 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
1814 struct netlink_ext_ack *extack)
1816 struct net *net = cfg->fc_nlinfo.nl_net;
1817 struct rt6_info *rt = NULL;
1818 struct net_device *dev = NULL;
1819 struct inet6_dev *idev = NULL;
1820 struct fib6_table *table;
1821 int addr_type;
1822 int err = -EINVAL;
/* --- validate user-supplied configuration --- */
1824 /* RTF_PCPU is an internal flag; can not be set by userspace */
1825 if (cfg->fc_flags & RTF_PCPU) {
1826 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
1827 goto out;
1830 if (cfg->fc_dst_len > 128) {
1831 NL_SET_ERR_MSG(extack, "Invalid prefix length");
1832 goto out;
1834 if (cfg->fc_src_len > 128) {
1835 NL_SET_ERR_MSG(extack, "Invalid source address length");
1836 goto out;
1838 #ifndef CONFIG_IPV6_SUBTREES
1839 if (cfg->fc_src_len) {
1840 NL_SET_ERR_MSG(extack,
1841 "Specifying source address requires IPV6_SUBTREES to be enabled");
1842 goto out;
1844 #endif
/* --- resolve egress device and its IPv6 state, taking references --- */
1845 if (cfg->fc_ifindex) {
1846 err = -ENODEV;
1847 dev = dev_get_by_index(net, cfg->fc_ifindex);
1848 if (!dev)
1849 goto out;
1850 idev = in6_dev_get(dev);
1851 if (!idev)
1852 goto out;
1855 if (cfg->fc_metric == 0)
1856 cfg->fc_metric = IP6_RT_PRIO_USER;
/* --- pick (or create) the fib table --- */
1858 err = -ENOBUFS;
1859 if (cfg->fc_nlinfo.nlh &&
1860 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1861 table = fib6_get_table(net, cfg->fc_table);
1862 if (!table) {
1863 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1864 table = fib6_new_table(net, cfg->fc_table);
1866 } else {
1867 table = fib6_new_table(net, cfg->fc_table);
1870 if (!table)
1871 goto out;
1873 rt = ip6_dst_alloc(net, NULL,
1874 (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
1876 if (!rt) {
1877 err = -ENOMEM;
1878 goto out;
/* --- fill in lifetime, protocol and I/O handlers --- */
1881 if (cfg->fc_flags & RTF_EXPIRES)
1882 rt6_set_expires(rt, jiffies +
1883 clock_t_to_jiffies(cfg->fc_expires));
1884 else
1885 rt6_clean_expires(rt);
1887 if (cfg->fc_protocol == RTPROT_UNSPEC)
1888 cfg->fc_protocol = RTPROT_BOOT;
1889 rt->rt6i_protocol = cfg->fc_protocol;
1891 addr_type = ipv6_addr_type(&cfg->fc_dst);
1893 if (addr_type & IPV6_ADDR_MULTICAST)
1894 rt->dst.input = ip6_mc_input;
1895 else if (cfg->fc_flags & RTF_LOCAL)
1896 rt->dst.input = ip6_input;
1897 else
1898 rt->dst.input = ip6_forward;
1900 rt->dst.output = ip6_output;
/* Lightweight tunnel encapsulation may redirect input/output. */
1902 if (cfg->fc_encap) {
1903 struct lwtunnel_state *lwtstate;
1905 err = lwtunnel_build_state(cfg->fc_encap_type,
1906 cfg->fc_encap, AF_INET6, cfg,
1907 &lwtstate, extack);
1908 if (err)
1909 goto out;
1910 rt->dst.lwtstate = lwtstate_get(lwtstate);
1911 if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
1912 rt->dst.lwtstate->orig_output = rt->dst.output;
1913 rt->dst.output = lwtunnel_output;
1915 if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
1916 rt->dst.lwtstate->orig_input = rt->dst.input;
1917 rt->dst.input = lwtunnel_input;
1921 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1922 rt->rt6i_dst.plen = cfg->fc_dst_len;
1923 if (rt->rt6i_dst.plen == 128)
1924 rt->dst.flags |= DST_HOST;
1926 #ifdef CONFIG_IPV6_SUBTREES
1927 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1928 rt->rt6i_src.plen = cfg->fc_src_len;
1929 #endif
1931 rt->rt6i_metric = cfg->fc_metric;
1933 /* We cannot add true routes via loopback here,
1934 they would result in kernel looping; promote them to reject routes
1936 if ((cfg->fc_flags & RTF_REJECT) ||
1937 (dev && (dev->flags & IFF_LOOPBACK) &&
1938 !(addr_type & IPV6_ADDR_LOOPBACK) &&
1939 !(cfg->fc_flags & RTF_LOCAL))) {
1940 /* hold loopback dev/idev if we haven't done so. */
1941 if (dev != net->loopback_dev) {
1942 if (dev) {
1943 dev_put(dev);
1944 in6_dev_put(idev);
1946 dev = net->loopback_dev;
1947 dev_hold(dev);
1948 idev = in6_dev_get(dev);
1949 if (!idev) {
1950 err = -ENODEV;
1951 goto out;
1954 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
/* Error code and handlers depend on the reject flavour requested. */
1955 switch (cfg->fc_type) {
1956 case RTN_BLACKHOLE:
1957 rt->dst.error = -EINVAL;
1958 rt->dst.output = dst_discard_out;
1959 rt->dst.input = dst_discard;
1960 break;
1961 case RTN_PROHIBIT:
1962 rt->dst.error = -EACCES;
1963 rt->dst.output = ip6_pkt_prohibit_out;
1964 rt->dst.input = ip6_pkt_prohibit;
1965 break;
1966 case RTN_THROW:
1967 case RTN_UNREACHABLE:
1968 default:
1969 rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
1970 : (cfg->fc_type == RTN_UNREACHABLE)
1971 ? -EHOSTUNREACH : -ENETUNREACH;
1972 rt->dst.output = ip6_pkt_discard_out;
1973 rt->dst.input = ip6_pkt_discard;
1974 break;
1976 goto install_route;
/* --- gateway routes: validate and resolve the nexthop --- */
1979 if (cfg->fc_flags & RTF_GATEWAY) {
1980 const struct in6_addr *gw_addr;
1981 int gwa_type;
1983 gw_addr = &cfg->fc_gateway;
1984 gwa_type = ipv6_addr_type(gw_addr);
1986 /* if gw_addr is local we will fail to detect this in case
1987 * address is still TENTATIVE (DAD in progress). rt6_lookup()
1988 * will return already-added prefix route via interface that
1989 * prefix route was assigned to, which might be non-loopback.
1991 err = -EINVAL;
1992 if (ipv6_chk_addr_and_flags(net, gw_addr,
1993 gwa_type & IPV6_ADDR_LINKLOCAL ?
1994 dev : NULL, 0, 0)) {
1995 NL_SET_ERR_MSG(extack, "Invalid gateway address");
1996 goto out;
1998 rt->rt6i_gateway = *gw_addr;
2000 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
2001 struct rt6_info *grt = NULL;
2003 /* IPv6 strictly inhibits using not link-local
2004 addresses as nexthop address.
2005 Otherwise, router will not able to send redirects.
2006 It is very good, but in some (rare!) circumstances
2007 (SIT, PtP, NBMA NOARP links) it is handy to allow
2008 some exceptions. --ANK
2009 We allow IPv4-mapped nexthops to support RFC4798-type
2010 addressing
2012 if (!(gwa_type & (IPV6_ADDR_UNICAST |
2013 IPV6_ADDR_MAPPED))) {
2014 NL_SET_ERR_MSG(extack,
2015 "Invalid gateway address");
2016 goto out;
/* Prefer resolving the gateway within the requested table. */
2019 if (cfg->fc_table) {
2020 grt = ip6_nh_lookup_table(net, cfg, gw_addr);
2022 if (grt) {
2023 if (grt->rt6i_flags & RTF_GATEWAY ||
2024 (dev && dev != grt->dst.dev)) {
2025 ip6_rt_put(grt);
2026 grt = NULL;
2031 if (!grt)
2032 grt = rt6_lookup(net, gw_addr, NULL,
2033 cfg->fc_ifindex, 1);
2035 err = -EHOSTUNREACH;
2036 if (!grt)
2037 goto out;
2038 if (dev) {
2039 if (dev != grt->dst.dev) {
2040 ip6_rt_put(grt);
2041 goto out;
2043 } else {
/* Adopt the device/idev of the resolved route, taking refs. */
2044 dev = grt->dst.dev;
2045 idev = grt->rt6i_idev;
2046 dev_hold(dev);
2047 in6_dev_hold(grt->rt6i_idev);
2049 if (!(grt->rt6i_flags & RTF_GATEWAY))
2050 err = 0;
2051 ip6_rt_put(grt);
2053 if (err)
2054 goto out;
2056 err = -EINVAL;
2057 if (!dev) {
2058 NL_SET_ERR_MSG(extack, "Egress device not specified");
2059 goto out;
2060 } else if (dev->flags & IFF_LOOPBACK) {
2061 NL_SET_ERR_MSG(extack,
2062 "Egress device can not be loopback device for this route");
2063 goto out;
2067 err = -ENODEV;
2068 if (!dev)
2069 goto out;
/* Preferred source address, if any, must live on the egress device. */
2071 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2072 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2073 NL_SET_ERR_MSG(extack, "Invalid source address");
2074 err = -EINVAL;
2075 goto out;
2077 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2078 rt->rt6i_prefsrc.plen = 128;
2079 } else
2080 rt->rt6i_prefsrc.plen = 0;
2082 rt->rt6i_flags = cfg->fc_flags;
2084 install_route:
/* Ownership of the dev/idev references moves into the route here. */
2085 rt->dst.dev = dev;
2086 rt->rt6i_idev = idev;
2087 rt->rt6i_table = table;
2089 cfg->fc_nlinfo.nl_net = dev_net(dev);
2091 return rt;
2092 out:
/* Error path: drop everything acquired so far. */
2093 if (dev)
2094 dev_put(dev);
2095 if (idev)
2096 in6_dev_put(idev);
2097 if (rt)
2098 dst_release_immediate(&rt->dst);
2100 return ERR_PTR(err);
2103 int ip6_route_add(struct fib6_config *cfg,
2104 struct netlink_ext_ack *extack)
2106 struct mx6_config mxc = { .mx = NULL, };
2107 struct rt6_info *rt;
2108 int err;
2110 rt = ip6_route_info_create(cfg, extack);
2111 if (IS_ERR(rt)) {
2112 err = PTR_ERR(rt);
2113 rt = NULL;
2114 goto out;
2117 err = ip6_convert_metrics(&mxc, cfg);
2118 if (err)
2119 goto out;
2121 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
2123 kfree(mxc.mx);
2125 return err;
2126 out:
2127 if (rt)
2128 dst_release_immediate(&rt->dst);
2130 return err;
2133 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2135 int err;
2136 struct fib6_table *table;
2137 struct net *net = dev_net(rt->dst.dev);
2139 if (rt == net->ipv6.ip6_null_entry) {
2140 err = -ENOENT;
2141 goto out;
2144 table = rt->rt6i_table;
2145 write_lock_bh(&table->tb6_lock);
2146 err = fib6_del(rt, info);
2147 write_unlock_bh(&table->tb6_lock);
2149 out:
2150 ip6_rt_put(rt);
2151 return err;
/* Delete @rt with default netlink notification info; consumes the
 * caller's reference.
 */
2154 int ip6_del_rt(struct rt6_info *rt)
2156 struct nl_info info = {
2157 .nl_net = dev_net(rt->dst.dev),
2159 return __ip6_del_rt(rt, &info);
/* Delete a multipath route: @rt and, when requested, all of its
 * siblings, emitting a single netlink notification covering every hop.
 * Consumes the caller's reference on @rt.
 */
2162 static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
2164 struct nl_info *info = &cfg->fc_nlinfo;
2165 struct net *net = info->nl_net;
2166 struct sk_buff *skb = NULL;
2167 struct fib6_table *table;
2168 int err = -ENOENT;
2170 if (rt == net->ipv6.ip6_null_entry)
2171 goto out_put;
2172 table = rt->rt6i_table;
2173 write_lock_bh(&table->tb6_lock);
2175 if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
2176 struct rt6_info *sibling, *next_sibling;
2178 /* prefer to send a single notification with all hops */
2179 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
2180 if (skb) {
2181 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2183 if (rt6_fill_node(net, skb, rt,
2184 NULL, NULL, 0, RTM_DELROUTE,
2185 info->portid, seq, 0) < 0) {
2186 kfree_skb(skb);
2187 skb = NULL;
/* skip_notify suppresses fib6_del()'s per-hop notifications. */
2188 } else
2189 info->skip_notify = 1;
2192 list_for_each_entry_safe(sibling, next_sibling,
2193 &rt->rt6i_siblings,
2194 rt6i_siblings) {
2195 err = fib6_del(sibling, info);
2196 if (err)
2197 goto out_unlock;
2201 err = fib6_del(rt, info);
2202 out_unlock:
2203 write_unlock_bh(&table->tb6_lock);
2204 out_put:
2205 ip6_rt_put(rt);
/* Send the combined notification after dropping the table lock. */
2207 if (skb) {
2208 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
2209 info->nlh, gfp_any());
2211 return err;
/* Find and delete the route matching @cfg (destination, and optional
 * ifindex, gateway, metric and protocol filters).  Returns 0 on
 * success or -ESRCH when nothing matches.
 */
2214 static int ip6_route_del(struct fib6_config *cfg,
2215 struct netlink_ext_ack *extack)
2217 struct fib6_table *table;
2218 struct fib6_node *fn;
2219 struct rt6_info *rt;
2220 int err = -ESRCH;
2222 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
2223 if (!table) {
2224 NL_SET_ERR_MSG(extack, "FIB table does not exist");
2225 return err;
2228 read_lock_bh(&table->tb6_lock);
2230 fn = fib6_locate(&table->tb6_root,
2231 &cfg->fc_dst, cfg->fc_dst_len,
2232 &cfg->fc_src, cfg->fc_src_len);
2234 if (fn) {
2235 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
/* Cache clones are deleted only when explicitly requested. */
2236 if ((rt->rt6i_flags & RTF_CACHE) &&
2237 !(cfg->fc_flags & RTF_CACHE))
2238 continue;
2239 if (cfg->fc_ifindex &&
2240 (!rt->dst.dev ||
2241 rt->dst.dev->ifindex != cfg->fc_ifindex))
2242 continue;
2243 if (cfg->fc_flags & RTF_GATEWAY &&
2244 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
2245 continue;
2246 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
2247 continue;
2248 if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
2249 continue;
/* Hold the route, drop the lock; the callee consumes the hold. */
2250 dst_hold(&rt->dst);
2251 read_unlock_bh(&table->tb6_lock);
2253 /* if gateway was specified only delete the one hop */
2254 if (cfg->fc_flags & RTF_GATEWAY)
2255 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
2257 return __ip6_del_rt_siblings(rt, cfg);
2260 read_unlock_bh(&table->tb6_lock);
2262 return err;
/* Validate and apply an ICMPv6 redirect carried in @skb against the
 * current route @dst (RFC 4861 section 8).  On acceptance, updates the
 * neighbour cache for the new first hop and installs a cached clone
 * pointing at it, replacing any previous cached entry.
 */
2265 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
2267 struct netevent_redirect netevent;
2268 struct rt6_info *rt, *nrt = NULL;
2269 struct ndisc_options ndopts;
2270 struct inet6_dev *in6_dev;
2271 struct neighbour *neigh;
2272 struct rd_msg *msg;
2273 int optlen, on_link;
2274 u8 *lladdr;
/* --- basic sanity checks on the redirect message --- */
2276 optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
2277 optlen -= sizeof(*msg);
2279 if (optlen < 0) {
2280 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
2281 return;
2284 msg = (struct rd_msg *)icmp6_hdr(skb);
2286 if (ipv6_addr_is_multicast(&msg->dest)) {
2287 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
2288 return;
/* dest == target means the destination itself is on-link; otherwise
 * the target must be a link-local router address.
 */
2291 on_link = 0;
2292 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
2293 on_link = 1;
2294 } else if (ipv6_addr_type(&msg->target) !=
2295 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
2296 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
2297 return;
2300 in6_dev = __in6_dev_get(skb->dev);
2301 if (!in6_dev)
2302 return;
/* Routers and interfaces configured to ignore redirects bail out. */
2303 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
2304 return;
2306 /* RFC2461 8.1:
2307 * The IP source address of the Redirect MUST be the same as the current
2308 * first-hop router for the specified ICMP Destination Address.
2311 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
2312 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
2313 return;
2316 lladdr = NULL;
2317 if (ndopts.nd_opts_tgt_lladdr) {
2318 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
2319 skb->dev);
2320 if (!lladdr) {
2321 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
2322 return;
2326 rt = (struct rt6_info *) dst;
2327 if (rt->rt6i_flags & RTF_REJECT) {
2328 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
2329 return;
2332 /* Redirect received -> path was valid.
2333 * Look, redirects are sent only in response to data packets,
2334 * so that this nexthop apparently is reachable. --ANK
2336 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
2338 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
2339 if (!neigh)
2340 return;
2343 * We have finally decided to accept it.
/* Record the new first hop's link-layer address; for off-link targets
 * the entry is also flagged as a router.
 */
2346 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
2347 NEIGH_UPDATE_F_WEAK_OVERRIDE|
2348 NEIGH_UPDATE_F_OVERRIDE|
2349 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
2350 NEIGH_UPDATE_F_ISROUTER)),
2351 NDISC_REDIRECT, &ndopts);
/* Install a cached clone routed via the new nexthop. */
2353 nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
2354 if (!nrt)
2355 goto out;
2357 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
2358 if (on_link)
2359 nrt->rt6i_flags &= ~RTF_GATEWAY;
2361 nrt->rt6i_protocol = RTPROT_REDIRECT;
2362 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
2364 if (ip6_ins_rt(nrt))
2365 goto out_release;
2367 netevent.old = &rt->dst;
2368 netevent.new = &nrt->dst;
2369 netevent.daddr = &msg->dest;
2370 netevent.neigh = neigh;
2371 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
/* Drop the superseded cached entry, if the old route was one. */
2373 if (rt->rt6i_flags & RTF_CACHE) {
2374 rt = (struct rt6_info *) dst_clone(&rt->dst);
2375 ip6_del_rt(rt);
2378 out_release:
2379 /* Release the reference taken in
2380 * ip6_rt_cache_alloc()
2382 dst_release(&nrt->dst);
2384 out:
2385 neigh_release(neigh);
2389 * Misc support functions
/* Link @rt to its parent route @from: take a reference on the parent,
 * record it in dst.from and share the parent's metrics.  @from must
 * not itself be derived from another route.
 */
2392 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
2394 BUG_ON(from->dst.from);
/* A derived route's lifetime follows its parent, not its own timer. */
2396 rt->rt6i_flags &= ~RTF_EXPIRES;
2397 dst_hold(&from->dst);
2398 rt->dst.from = &from->dst;
2399 dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
/* Initialize @rt as a copy of @ort: duplicate handlers, addressing,
 * flags and table linkage, taking references on the shared idev and
 * lwtunnel state, and link @rt to @ort via rt6_set_from().
 */
2402 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
2404 rt->dst.input = ort->dst.input;
2405 rt->dst.output = ort->dst.output;
2406 rt->rt6i_dst = ort->rt6i_dst;
2407 rt->dst.error = ort->dst.error;
2408 rt->rt6i_idev = ort->rt6i_idev;
2409 if (rt->rt6i_idev)
2410 in6_dev_hold(rt->rt6i_idev);
2411 rt->dst.lastuse = jiffies;
2412 rt->rt6i_gateway = ort->rt6i_gateway;
2413 rt->rt6i_flags = ort->rt6i_flags;
/* Also shares the parent's metrics and holds a ref on it. */
2414 rt6_set_from(rt, ort);
2415 rt->rt6i_metric = ort->rt6i_metric;
2416 #ifdef CONFIG_IPV6_SUBTREES
2417 rt->rt6i_src = ort->rt6i_src;
2418 #endif
2419 rt->rt6i_prefsrc = ort->rt6i_prefsrc;
2420 rt->rt6i_table = ort->rt6i_table;
2421 rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
2424 #ifdef CONFIG_IPV6_ROUTE_INFO
2425 static struct rt6_info *rt6_get_route_info(struct net *net,
2426 const struct in6_addr *prefix, int prefixlen,
2427 const struct in6_addr *gwaddr,
2428 struct net_device *dev)
2430 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
2431 int ifindex = dev->ifindex;
2432 struct fib6_node *fn;
2433 struct rt6_info *rt = NULL;
2434 struct fib6_table *table;
2436 table = fib6_get_table(net, tb_id);
2437 if (!table)
2438 return NULL;
2440 read_lock_bh(&table->tb6_lock);
2441 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
2442 if (!fn)
2443 goto out;
2445 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2446 if (rt->dst.dev->ifindex != ifindex)
2447 continue;
2448 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
2449 continue;
2450 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
2451 continue;
2452 dst_hold(&rt->dst);
2453 break;
2455 out:
2456 read_unlock_bh(&table->tb6_lock);
2457 return rt;
2460 static struct rt6_info *rt6_add_route_info(struct net *net,
2461 const struct in6_addr *prefix, int prefixlen,
2462 const struct in6_addr *gwaddr,
2463 struct net_device *dev,
2464 unsigned int pref)
2466 struct fib6_config cfg = {
2467 .fc_metric = IP6_RT_PRIO_USER,
2468 .fc_ifindex = dev->ifindex,
2469 .fc_dst_len = prefixlen,
2470 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
2471 RTF_UP | RTF_PREF(pref),
2472 .fc_protocol = RTPROT_RA,
2473 .fc_nlinfo.portid = 0,
2474 .fc_nlinfo.nlh = NULL,
2475 .fc_nlinfo.nl_net = net,
2478 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
2479 cfg.fc_dst = *prefix;
2480 cfg.fc_gateway = *gwaddr;
2482 /* We should treat it as a default route if prefix length is 0. */
2483 if (!prefixlen)
2484 cfg.fc_flags |= RTF_DEFAULT;
2486 ip6_route_add(&cfg, NULL);
2488 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
2490 #endif
/* Look up the RA-learnt default route through gateway @addr on @dev.
 * Scans the leaf chain of the default-route table root under the table
 * read lock; on a match, returns the route with a dst reference held,
 * otherwise NULL.
 */
struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
	struct rt6_info *rt;
	struct fib6_table *table;

	table = fib6_get_table(dev_net(dev), tb_id);
	if (!table)
		return NULL;

	read_lock_bh(&table->tb6_lock);
	for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
		/* must match device, both ADDRCONF and DEFAULT flags,
		 * and the gateway address
		 */
		if (dev == rt->dst.dev &&
		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
			break;
	}
	if (rt)
		dst_hold(&rt->dst);	/* caller owns a reference */
	read_unlock_bh(&table->tb6_lock);
	return rt;
}
/* Install a default route via @gwaddr learnt from a Router Advertisement
 * on @dev, mark the owning table as holding a default router, and return
 * a held reference to the installed route (NULL on failure).
 */
struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
				     struct net_device *dev,
				     unsigned int pref)
{
	struct fib6_config cfg = {
		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
		.fc_metric	= IP6_RT_PRIO_USER,
		.fc_ifindex	= dev->ifindex,
		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
		.fc_protocol = RTPROT_RA,
		.fc_nlinfo.portid = 0,
		.fc_nlinfo.nlh = NULL,
		.fc_nlinfo.nl_net = dev_net(dev),
	};

	cfg.fc_gateway = *gwaddr;

	if (!ip6_route_add(&cfg, NULL)) {
		struct fib6_table *table;

		/* remember the table has RA default routers so that
		 * rt6_purge_dflt_routers() knows which tables to scan
		 */
		table = fib6_get_table(dev_net(dev), cfg.fc_table);
		if (table)
			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
	}

	return rt6_get_dflt_router(gwaddr, dev);
}
/* Delete all RA-learnt (default/addrconf) routes from @table, except on
 * interfaces with accept_ra == 2 (accept RA even when forwarding).
 * ip6_del_rt() takes the table write lock, so we must drop the read lock
 * before deleting and restart the scan from the top each time.
 */
static void __rt6_purge_dflt_routers(struct fib6_table *table)
{
	struct rt6_info *rt;

restart:
	read_lock_bh(&table->tb6_lock);
	for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
			/* hold rt across the unlock so it cannot be freed
			 * before ip6_del_rt() consumes the reference
			 */
			dst_hold(&rt->dst);
			read_unlock_bh(&table->tb6_lock);
			ip6_del_rt(rt);
			goto restart;
		}
	}
	read_unlock_bh(&table->tb6_lock);

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}
/* Purge RA default routers from every FIB table in @net that has the
 * RT6_TABLE_HAS_DFLT_ROUTER flag set.  The table hash is walked under
 * RCU; the per-table work is done by __rt6_purge_dflt_routers().
 */
void rt6_purge_dflt_routers(struct net *net)
{
	struct fib6_table *table;
	struct hlist_head *head;
	unsigned int h;

	rcu_read_lock();

	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
		head = &net->ipv6.fib_table_hash[h];
		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
				__rt6_purge_dflt_routers(table);
		}
	}

	rcu_read_unlock();
}
2583 static void rtmsg_to_fib6_config(struct net *net,
2584 struct in6_rtmsg *rtmsg,
2585 struct fib6_config *cfg)
2587 memset(cfg, 0, sizeof(*cfg));
2589 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
2590 : RT6_TABLE_MAIN;
2591 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
2592 cfg->fc_metric = rtmsg->rtmsg_metric;
2593 cfg->fc_expires = rtmsg->rtmsg_info;
2594 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
2595 cfg->fc_src_len = rtmsg->rtmsg_src_len;
2596 cfg->fc_flags = rtmsg->rtmsg_flags;
2598 cfg->fc_nlinfo.nl_net = net;
2600 cfg->fc_dst = rtmsg->rtmsg_dst;
2601 cfg->fc_src = rtmsg->rtmsg_src;
2602 cfg->fc_gateway = rtmsg->rtmsg_gateway;
/* Handle the legacy SIOCADDRT/SIOCDELRT route ioctls.
 * Requires CAP_NET_ADMIN in the owning user namespace.  Returns 0 on
 * success, -EPERM/-EFAULT/-EINVAL or the ip6_route_add/del error.
 */
int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
{
	struct fib6_config cfg;
	struct in6_rtmsg rtmsg;
	int err;

	switch (cmd) {
	case SIOCADDRT:		/* Add a route */
	case SIOCDELRT:		/* Delete a route */
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			return -EPERM;
		err = copy_from_user(&rtmsg, arg,
				     sizeof(struct in6_rtmsg));
		if (err)
			return -EFAULT;

		rtmsg_to_fib6_config(net, &rtmsg, &cfg);

		/* route table mutation requires the RTNL lock */
		rtnl_lock();
		switch (cmd) {
		case SIOCADDRT:
			err = ip6_route_add(&cfg, NULL);
			break;
		case SIOCDELRT:
			err = ip6_route_del(&cfg, NULL);
			break;
		default:
			err = -EINVAL;
		}
		rtnl_unlock();

		return err;
	}

	return -EINVAL;
}
/*
 *	Drop the packet on the floor
 */

/* Common drop path for reject routes: bump the relevant SNMP counter
 * (unknown-source packets to :: count as INADDRERRORS instead), send an
 * ICMPv6 Destination Unreachable with @code, and free the skb.
 */
static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
	int type;
	struct dst_entry *dst = skb_dst(skb);
	switch (ipstats_mib_noroutes) {
	case IPSTATS_MIB_INNOROUTES:
		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
		if (type == IPV6_ADDR_ANY) {
			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
				      IPSTATS_MIB_INADDRERRORS);
			break;
		}
		/* FALLTHROUGH */
	case IPSTATS_MIB_OUTNOROUTES:
		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
			      ipstats_mib_noroutes);
		break;
	}
	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
	kfree_skb(skb);
	return 0;
}
/* dst.input handler for blackhole routes: drop with "no route" stats. */
static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
/* dst.output handler for blackhole routes: drop, counting OUTNOROUTES. */
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}
/* dst.input handler for prohibit routes: drop with "admin prohibited". */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}
/* dst.output handler for prohibit routes: drop, counting OUTNOROUTES. */
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}
/*
 *	Allocate a dst for local (unicast / anycast) address.
 */

/* Build (but do not insert) a host route for a local address @addr on
 * @idev, flagged RTF_ANYCAST or RTF_LOCAL per @anycast.  The route's
 * device is the loopback device (or the L3 master device for enslaved,
 * non-strict addresses).  Returns the new route or ERR_PTR(-ENOMEM);
 * the idev reference taken here is owned by the returned route.
 */
struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
				    const struct in6_addr *addr,
				    bool anycast)
{
	u32 tb_id;
	struct net *net = dev_net(idev->dev);
	struct net_device *dev = net->loopback_dev;
	struct rt6_info *rt;

	/* use L3 Master device as loopback for host routes if device
	 * is enslaved and address is not link local or multicast
	 */
	if (!rt6_need_strict(addr))
		dev = l3mdev_master_dev_rcu(idev->dev) ? : dev;

	rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
	if (!rt)
		return ERR_PTR(-ENOMEM);

	in6_dev_hold(idev);

	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output = ip6_output;
	rt->rt6i_idev = idev;

	rt->rt6i_protocol = RTPROT_KERNEL;
	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
	if (anycast)
		rt->rt6i_flags |= RTF_ANYCAST;
	else
		rt->rt6i_flags |= RTF_LOCAL;

	rt->rt6i_gateway  = *addr;	/* local routes: gateway == destination */
	rt->rt6i_dst.addr = *addr;
	rt->rt6i_dst.plen = 128;
	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
	rt->rt6i_table = fib6_get_table(net, tb_id);

	return rt;
}
/* remove deleted ip from prefsrc entries */

/* Walker argument for fib6_remove_prefsrc(): the device and address
 * being removed, plus the owning netns.  A NULL dev matches any device.
 */
struct arg_dev_net_ip {
	struct net_device *dev;
	struct net *net;
	struct in6_addr *addr;
};
2744 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2746 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2747 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2748 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2750 if (((void *)rt->dst.dev == dev || !dev) &&
2751 rt != net->ipv6.ip6_null_entry &&
2752 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2753 /* remove prefsrc entry */
2754 rt->rt6i_prefsrc.plen = 0;
2756 return 0;
/* Called when address @ifp is deleted: scrub it from the prefsrc field
 * of every route in the netns via the fib6_remove_prefsrc() walker.
 */
void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
{
	struct net *net = dev_net(ifp->idev->dev);
	struct arg_dev_net_ip adni = {
		.dev = ifp->idev->dev,
		.net = net,
		.addr = &ifp->addr,
	};
	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
}
/* flag combinations identifying RA-learnt router routes and cached
 * gateway clones, used by fib6_clean_tohost() below
 */
#define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

/* Remove routers and update dst entries when gateway turn into host. */
static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
{
	struct in6_addr *gateway = (struct in6_addr *)arg;

	/* returning -1 tells fib6_clean_all() to delete this route */
	if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
	     ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
	     ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
		return -1;
	}
	return 0;
}
/* Drop all router/cached-gateway routes through @gateway, used when a
 * former gateway is discovered to actually be a host (RFC 4861 ndisc).
 */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}
/* Walker argument for fib6_ifdown(): device going down (NULL matches
 * all devices) and its netns.
 */
struct arg_dev_net {
	struct net_device *dev;
	struct net *net;
};
/* called with write lock held for table with rt */

/* fib6_clean_all() callback: return -1 (delete) for routes on the
 * downed device, except the null entry.  Multipath siblings are kept
 * alive when link-down routes are merely to be ignored rather than
 * removed, unless the device is unregistering entirely.
 */
static int fib6_ifdown(struct rt6_info *rt, void *arg)
{
	const struct arg_dev_net *adn = arg;
	const struct net_device *dev = adn->dev;

	if ((rt->dst.dev == dev || !dev) &&
	    rt != adn->net->ipv6.ip6_null_entry &&
	    (rt->rt6i_nsiblings == 0 ||
	     (dev && netdev_unregistering(dev)) ||
	     !rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
		return -1;

	return 0;
}
/* Purge routes for a device that went down (NULL dev == all devices),
 * then flush any uncached dst entries still referencing it.
 */
void rt6_ifdown(struct net *net, struct net_device *dev)
{
	struct arg_dev_net adn = {
		.dev = dev,
		.net = net,
	};

	fib6_clean_all(net, fib6_ifdown, &adn);
	if (dev)
		rt6_uncached_list_flush_dev(net, dev);
}
/* Walker argument for rt6_mtu_change_route(): the device whose MTU
 * changed and its new MTU value.
 */
struct rt6_mtu_change_arg {
	struct net_device *dev;
	unsigned int mtu;
};
/* fib6_clean_all() callback: propagate a device MTU change into the
 * route's PMTU metric (RTF_CACHE routes keep PMTU in rt6i_pmtu, others
 * in the RTAX_MTU dst metric).  Locked MTU metrics are left alone.
 * Always returns 0 — routes are updated, never deleted.
 */
static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e. jumbo frame)

	   If new MTU is less than route PMTU, this new MTU will be the
	   lowest MTU in the path, update the route PMTU to reflect PMTU
	   decreases; if new MTU is greater than route PMTU, and the
	   old MTU is the lowest MTU in the path, update the route PMTU
	   to reflect the increase. In this case if the other nodes' MTU
	   also have the lowest MTU, TOO BIG MESSAGE will be lead to
	   PMTU discovery.
	 */
	if (rt->dst.dev == arg->dev &&
	    dst_metric_raw(&rt->dst, RTAX_MTU) &&
	    !dst_metric_locked(&rt->dst, RTAX_MTU)) {
		if (rt->rt6i_flags & RTF_CACHE) {
			/* For RTF_CACHE with rt6i_pmtu == 0
			 * (i.e. a redirected route),
			 * the metrics of its rt->dst.from has already
			 * been updated.
			 */
			if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
				rt->rt6i_pmtu = arg->mtu;
		} else if (dst_mtu(&rt->dst) >= arg->mtu ||
			   (dst_mtu(&rt->dst) < arg->mtu &&
			    dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
			dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
		}
	}
	return 0;
}
/* Device MTU changed: walk all routes and update their PMTU metrics
 * via rt6_mtu_change_route().
 */
void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
{
	struct rt6_mtu_change_arg arg = {
		.dev = dev,
		.mtu = mtu,
	};

	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
}
/* Netlink attribute validation policy for RTM_{NEW,DEL,GET}ROUTE.
 * Attributes not listed (e.g. RTA_DST/RTA_SRC) are length-checked by
 * hand in rtm_to_fib6_config() against the rtm prefix lengths.
 */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
	[RTA_OIF]               = { .type = NLA_U32 },
	[RTA_IIF]		= { .type = NLA_U32 },
	[RTA_PRIORITY]          = { .type = NLA_U32 },
	[RTA_METRICS]           = { .type = NLA_NESTED },
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]              = { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
	[RTA_ENCAP]		= { .type = NLA_NESTED },
	[RTA_EXPIRES]		= { .type = NLA_U32 },
	[RTA_UID]		= { .type = NLA_U32 },
	[RTA_MARK]		= { .type = NLA_U32 },
};
/* Parse an RTM_NEWROUTE/RTM_DELROUTE message into a fib6_config.
 * Validates attribute lengths against the rtm prefix lengths, maps
 * rtm_type to RTF_* flags, and validates lwtunnel encap types.
 * Returns 0 on success or a negative errno.
 * Note: cfg keeps pointers into the skb's attribute data (fc_mx,
 * fc_mp, fc_encap) — it must not outlive the message.
 */
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg,
			      struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  NULL);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	memset(cfg, 0, sizeof(*cfg));

	cfg->fc_table = rtm->rtm_table;
	cfg->fc_dst_len = rtm->rtm_dst_len;
	cfg->fc_src_len = rtm->rtm_src_len;
	cfg->fc_flags = RTF_UP;
	cfg->fc_protocol = rtm->rtm_protocol;
	cfg->fc_type = rtm->rtm_type;

	/* reject-type routes all become RTF_REJECT; the specific error
	 * code is reconstructed later from cfg->fc_type
	 */
	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
	cfg->fc_nlinfo.nlh = nlh;
	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}

	if (tb[RTA_DST]) {
		/* attribute may carry only prefix-len worth of bytes */
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
						     cfg->fc_mp_len, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_PREF]) {
		/* clamp invalid preference to MEDIUM per RFC 4191 */
		pref = nla_get_u8(tb[RTA_PREF]);
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

	if (tb[RTA_ENCAP_TYPE]) {
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_EXPIRES]) {
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
		}
	}

	err = 0;
errout:
	return err;
}
/* Per-nexthop bookkeeping for multipath add: the built route, the
 * config it came from (for rollback deletes), its metrics, and the
 * list linkage used by ip6_route_multipath_add().
 */
struct rt6_nh {
	struct rt6_info *rt6_info;
	struct fib6_config r_cfg;
	struct mx6_config mxc;
	struct list_head next;
};
3034 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
3036 struct rt6_nh *nh;
3038 list_for_each_entry(nh, rt6_nh_list, next) {
3039 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
3040 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
3041 nh->r_cfg.fc_ifindex);
/* Append route @rt (with its originating config) to @rt6_nh_list.
 * Rejects duplicate nexthops with -EEXIST, -ENOMEM on allocation
 * failure.  On success the list node takes over tracking of rt.
 */
static int ip6_route_info_append(struct list_head *rt6_nh_list,
				 struct rt6_info *rt, struct fib6_config *r_cfg)
{
	struct rt6_nh *nh;
	int err = -EEXIST;

	list_for_each_entry(nh, rt6_nh_list, next) {
		/* check if rt6_info already exists */
		if (rt6_duplicate_nexthop(nh->rt6_info, rt))
			return err;
	}

	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
	if (!nh)
		return -ENOMEM;
	nh->rt6_info = rt;
	err = ip6_convert_metrics(&nh->mxc, r_cfg);
	if (err) {
		kfree(nh);
		return err;
	}
	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
	list_add_tail(&nh->next, rt6_nh_list);

	return 0;
}
/* Send a single RTM_NEWROUTE notification for a multipath add/replace,
 * starting at the first nexthop of the sibling group.
 */
static void ip6_route_mpath_notify(struct rt6_info *rt,
				   struct rt6_info *rt_last,
				   struct nl_info *info,
				   __u16 nlflags)
{
	/* if this is an APPEND route, then rt points to the first route
	 * inserted and rt_last points to last route inserted. Userspace
	 * wants a consistent dump of the route which starts at the first
	 * nexthop. Since sibling routes are always added at the end of
	 * the list, find the first sibling of the last route appended
	 */
	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
		rt = list_first_entry(&rt_last->rt6i_siblings,
				      struct rt6_info,
				      rt6i_siblings);
	}

	if (rt)
		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
}
/* Add (or replace) a multipath route: build one rt6_info per nexthop in
 * the RTA_MULTIPATH blob, insert them one by one, send a single
 * consolidated notification, and roll back already-inserted nexthops on
 * mid-sequence failure.  Returns 0 on success or a negative errno.
 */
static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct rt6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct rt6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	__u16 nlflags;
	int remaining;
	int attrlen;
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);

	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * rt6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		/* per-nexthop config starts as a copy of the base config */
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		rt = ip6_route_info_create(&r_cfg, extack);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}

		err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
		if (err) {
			dst_release_immediate(&rt->dst);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* for add and replace send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		rt_last = nh->rt6_info;
		err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
		/* save reference to first route for notification */
		if (!rt_notif && !err)
			rt_notif = nh->rt6_info;

		/* nh->rt6_info is used or freed at this point, reset to NULL*/
		nh->rt6_info = NULL;
		if (err) {
			if (replace && nhn)
				ip6_print_replace_route_err(&rt6_nh_list);
			err_nh = nh;
			goto add_errout;
		}

		/* Because each route is added like a single route we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		nhn++;
	}

	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	goto cleanup;

add_errout:
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg, extack);
	}

cleanup:
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->rt6_info)
			dst_release_immediate(&nh->rt6_info->dst);
		kfree(nh->mxc.mx);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}
/* Delete each nexthop of a multipath route independently.  Continues
 * past individual failures and returns the last error seen (0 if all
 * deletes succeeded).
 */
static int ip6_route_multipath_del(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	int remaining;
	int attrlen;
	int err = 1, last_err = 0;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry */
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
		}
		err = ip6_route_del(&r_cfg, extack);
		if (err)
			last_err = err;

		rtnh = rtnh_next(rtnh, &remaining);
	}

	return last_err;
}
/* RTM_DELROUTE handler: parse the request and dispatch to multipath or
 * single-route delete.
 */
static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct fib6_config cfg;
	int err;

	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
	if (err < 0)
		return err;

	if (cfg.fc_mp)
		return ip6_route_multipath_del(&cfg, extack);
	else {
		/* single delete removes all sibling nexthops as well */
		cfg.fc_delete_all_nh = 1;
		return ip6_route_del(&cfg, extack);
	}
}
3280 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3281 struct netlink_ext_ack *extack)
3283 struct fib6_config cfg;
3284 int err;
3286 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3287 if (err < 0)
3288 return err;
3290 if (cfg.fc_mp)
3291 return ip6_route_multipath_add(&cfg, extack);
3292 else
3293 return ip6_route_add(&cfg, extack);
/* Worst-case netlink message size needed to dump @rt, including one
 * RTA_MULTIPATH nexthop entry per sibling when the route is multipath.
 */
static size_t rt6_nlmsg_size(struct rt6_info *rt)
{
	int nexthop_len = 0;

	if (rt->rt6i_nsiblings) {
		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
			    + NLA_ALIGN(sizeof(struct rtnexthop))
			    + nla_total_size(16) /* RTA_GATEWAY */
			    + lwtunnel_get_encap_size(rt->dst.lwtstate);

		nexthop_len *= rt->rt6i_nsiblings;
	}

	return NLMSG_ALIGN(sizeof(struct rtmsg))
	       + nla_total_size(16) /* RTA_SRC */
	       + nla_total_size(16) /* RTA_DST */
	       + nla_total_size(16) /* RTA_GATEWAY */
	       + nla_total_size(16) /* RTA_PREFSRC */
	       + nla_total_size(4) /* RTA_TABLE */
	       + nla_total_size(4) /* RTA_IIF */
	       + nla_total_size(4) /* RTA_OIF */
	       + nla_total_size(4) /* RTA_PRIORITY */
	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
	       + nla_total_size(sizeof(struct rta_cacheinfo))
	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
	       + nla_total_size(1) /* RTA_PREF */
	       + lwtunnel_get_encap_size(rt->dst.lwtstate)
	       + nexthop_len;
}
/* Emit the nexthop attributes (gateway, oif, lwtunnel encap) for @rt
 * into @skb and OR link-state flags into *@flags.  @skip_oif suppresses
 * RTA_OIF for multipath entries, which carry the ifindex in their
 * rtnexthop header instead.  Returns 0 or -EMSGSIZE.
 */
static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
			    unsigned int *flags, bool skip_oif)
{
	if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) {
		*flags |= RTNH_F_LINKDOWN;
		if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
			*flags |= RTNH_F_DEAD;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
			goto nla_put_failure;
	}

	/* not needed for multipath encoding b/c it has a rtnexthop struct */
	if (!skip_oif && rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;

	if (rt->dst.lwtstate &&
	    lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
/* add multipath next hop */

/* Append one rtnexthop entry (header + nested attributes) for @rt to
 * the RTA_MULTIPATH attribute being built in @skb.  Returns 0 or
 * -EMSGSIZE.
 */
static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
{
	struct rtnexthop *rtnh;
	unsigned int flags = 0;

	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
	if (!rtnh)
		goto nla_put_failure;

	rtnh->rtnh_hops = 0;
	rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;

	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
		goto nla_put_failure;

	rtnh->rtnh_flags = flags;

	/* length of rtnetlink header + attributes */
	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
/* Serialize route @rt into an RTM_* netlink message appended to @skb.
 * @dst/@src, when non-NULL, are the exact lookup addresses (full /128
 * prefixes are reported); @iif is the input interface for input-path
 * lookups.  On any attribute overflow the partial message is cancelled
 * and -EMSGSIZE returned; 0 on success.
 */
static int rt6_fill_node(struct net *net,
			 struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	u32 metrics[RTAX_MAX];
	struct rtmsg *rtm;
	struct nlmsghdr *nlh;
	long expires;
	u32 table;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt->rt6i_dst.plen;
	rtm->rtm_src_len = rt->rt6i_src.plen;
	rtm->rtm_tos = 0;
	if (rt->rt6i_table)
		table = rt->rt6i_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	rtm->rtm_table = table;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;
	if (rt->rt6i_flags & RTF_REJECT) {
		/* map the reject route's error code back to an rtm_type */
		switch (rt->dst.error) {
		case -EINVAL:
			rtm->rtm_type = RTN_BLACKHOLE;
			break;
		case -EACCES:
			rtm->rtm_type = RTN_PROHIBIT;
			break;
		case -EAGAIN:
			rtm->rtm_type = RTN_THROW;
			break;
		default:
			rtm->rtm_type = RTN_UNREACHABLE;
			break;
		}
	}
	else if (rt->rt6i_flags & RTF_LOCAL)
		rtm->rtm_type = RTN_LOCAL;
	else if (rt->rt6i_flags & RTF_ANYCAST)
		rtm->rtm_type = RTN_ANYCAST;
	else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
		rtm->rtm_type = RTN_LOCAL;
	else
		rtm->rtm_type = RTN_UNICAST;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->rt6i_protocol;

	if (rt->rt6i_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	if (dst) {
		if (nla_put_in6_addr(skb, RTA_DST, dst))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
			/* multicast routes are resolved by ip6mr; 0 means
			 * the cache entry was reported asynchronously
			 */
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dst) {
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->rt6i_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->rt6i_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	/* report the cached PMTU rather than the raw metric when set */
	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	if (rt->rt6i_pmtu)
		metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt->rt6i_nsiblings) {
		struct rt6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (rt6_add_nexthop(skb, rt) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->rt6i_siblings, rt6i_siblings) {
			if (rt6_add_nexthop(skb, sibling) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
			goto nla_put_failure;
	}

	expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
		goto nla_put_failure;


	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
/* fib6 dump callback: emit one RTM_NEWROUTE entry for @rt into the dump
 * skb.  Skips the null entry and (when RTM_F_PREFIX was requested)
 * non-prefix routes; returns rt6_fill_node()'s result otherwise.
 */
int rt6_dump_route(struct rt6_info *rt, void *p_arg)
{
	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
	struct net *net = arg->net;

	if (rt == net->ipv6.ip6_null_entry)
		return 0;

	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);

		/* user wants prefix routes only */
		if (rtm->rtm_flags & RTM_F_PREFIX &&
		    !(rt->rt6i_flags & RTF_PREFIX_RT)) {
			/* success since this is not a prefix route */
			return 1;
		}
	}

	return rt6_fill_node(net,
		     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
		     NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
		     NLM_F_MULTI);
}
/* RTM_GETROUTE handler: resolve a single route for the given flow
 * (input-path lookup when RTA_IIF is set, output otherwise; an exact
 * FIB lookup when RTM_F_FIB_MATCH is requested) and unicast the result
 * back to the requester as an RTM_NEWROUTE message.
 */
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6;
	bool fibmatch;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	memset(&fl6, 0, sizeof(fl6));
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (iif) {
		/* input-path lookup: needs the ingress device */
		struct net_device *dev;
		int flags = 0;

		dev = __dev_get_by_index(net, iif);
		if (!dev) {
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		if (!fibmatch)
			dst = ip6_route_input_lookup(net, dev, &fl6, flags);
	} else {
		fl6.flowi6_oif = oif;

		if (!fibmatch)
			dst = ip6_route_output(net, NULL, &fl6);
	}

	/* RTM_F_FIB_MATCH: return the matching FIB entry itself rather
	 * than the resolved dst
	 */
	if (fibmatch)
		dst = ip6_route_lookup(net, &fl6, 0);

	rt = container_of(dst, struct rt6_info, dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	/* skb takes over the route reference */
	skb_dst_set(skb, &rt->dst);
	if (fibmatch)
		err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	else
		err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}
/* Broadcast an RTM_NEWROUTE/RTM_DELROUTE notification for @rt to the
 * RTNLGRP_IPV6_ROUTE multicast group; on failure, record the error on
 * the group so listeners see ENOBUFS.
 */
void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
		     unsigned int nlm_flags)
{
	struct sk_buff *skb;
	struct net *net = info->nl_net;
	u32 seq;
	int err;

	err = -ENOBUFS;
	seq = info->nlh ? info->nlh->nlmsg_seq : 0;

	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
	if (!skb)
		goto errout;

	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
			    event, info->portid, seq, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}
/* Netdevice notifier: bind the netns's special routes (null / prohibit /
 * blackhole entries) to the loopback device when it registers, and drop
 * those idev references when it unregisters.
 */
static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	} else if (event == NETDEV_UNREGISTER &&
		   dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER could be fired for multiple times by
		 * netdev_wait_allrefs(). Make sure we only call this once.
		 */
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}
/*
 *	/proc
 */
3748 #ifdef CONFIG_PROC_FS
/* File operations for /proc/net/ipv6_route (seq_file based). */
static const struct file_operations ipv6_route_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= ipv6_route_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};
3758 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
3760 struct net *net = (struct net *)seq->private;
3761 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
3762 net->ipv6.rt6_stats->fib_nodes,
3763 net->ipv6.rt6_stats->fib_route_nodes,
3764 net->ipv6.rt6_stats->fib_rt_alloc,
3765 net->ipv6.rt6_stats->fib_rt_entries,
3766 net->ipv6.rt6_stats->fib_rt_cache,
3767 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
3768 net->ipv6.rt6_stats->fib_discarded_routes);
3770 return 0;
3773 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
3775 return single_open_net(inode, file, rt6_stats_seq_show);
/* File operations for /proc/net/rt6_stats (single_open based). */
static const struct file_operations rt6_stats_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= rt6_stats_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release_net,
};
#endif	/* CONFIG_PROC_FS */
3787 #ifdef CONFIG_SYSCTL
3789 static
3790 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
3791 void __user *buffer, size_t *lenp, loff_t *ppos)
3793 struct net *net;
3794 int delay;
3795 if (!write)
3796 return -EINVAL;
3798 net = (struct net *)ctl->extra1;
3799 delay = net->ipv6.sysctl.flush_delay;
3800 proc_dointvec(ctl, write, buffer, lenp, ppos);
3801 fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
3802 return 0;
/* Template for the per-netns net.ipv6.route.* sysctl table; cloned and
 * re-pointed at per-netns storage in ipv6_route_sysctl_init().
 */
struct ctl_table ipv6_route_table_template[] = {
	{
		/* Write-only trigger: flush the route cache. */
		.procname	= "flush",
		.data		= &init_net.ipv6.sysctl.flush_delay,
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv6_sysctl_rtcache_flush
	},
	{
		.procname	= "gc_thresh",
		.data		= &ip6_dst_ops_template.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_min_interval",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Same backing variable as gc_min_interval, in milliseconds. */
		.procname	= "gc_min_interval_ms",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{ }	/* sentinel */
};
3879 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
3881 struct ctl_table *table;
3883 table = kmemdup(ipv6_route_table_template,
3884 sizeof(ipv6_route_table_template),
3885 GFP_KERNEL);
3887 if (table) {
3888 table[0].data = &net->ipv6.sysctl.flush_delay;
3889 table[0].extra1 = net;
3890 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
3891 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
3892 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3893 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
3894 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
3895 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
3896 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
3897 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
3898 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3900 /* Don't export sysctls to unprivileged users */
3901 if (net->user_ns != &init_user_ns)
3902 table[0].procname = NULL;
3905 return table;
3907 #endif
/*
 * Per-netns route setup: clone the dst_ops template, allocate the
 * special route entries (null / prohibit / blackhole) and seed sysctl
 * defaults. On failure, unwinds via the goto chain below.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	/* "null" entry: default drop route returned on lookup failure. */
	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_ip6_dst_entries;
	net->ipv6.ip6_null_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	/* Policy-routing actions need dedicated prohibit/blackhole dsts. */
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	/* Sysctl defaults (overridable via net.ipv6.route.*). */
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
3981 static void __net_exit ip6_route_net_exit(struct net *net)
3983 kfree(net->ipv6.ip6_null_entry);
3984 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3985 kfree(net->ipv6.ip6_prohibit_entry);
3986 kfree(net->ipv6.ip6_blk_hole_entry);
3987 #endif
3988 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3991 static int __net_init ip6_route_net_init_late(struct net *net)
3993 #ifdef CONFIG_PROC_FS
3994 proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
3995 proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
3996 #endif
3997 return 0;
4000 static void __net_exit ip6_route_net_exit_late(struct net *net)
4002 #ifdef CONFIG_PROC_FS
4003 remove_proc_entry("ipv6_route", net->proc_net);
4004 remove_proc_entry("rt6_stats", net->proc_net);
4005 #endif
/* Main per-netns hooks for IPv6 routing state. */
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
4013 static int __net_init ipv6_inetpeer_init(struct net *net)
4015 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
4017 if (!bp)
4018 return -ENOMEM;
4019 inet_peer_base_init(bp);
4020 net->ipv6.peers = bp;
4021 return 0;
4024 static void __net_exit ipv6_inetpeer_exit(struct net *net)
4026 struct inet_peer_base *bp = net->ipv6.peers;
4028 net->ipv6.peers = NULL;
4029 inetpeer_invalidate_tree(bp);
4030 kfree(bp);
/* Per-netns hooks for the IPv6 inetpeer base. */
static struct pernet_operations ipv6_inetpeer_ops = {
	.init = ipv6_inetpeer_init,
	.exit = ipv6_inetpeer_exit,
};
/* Late per-netns hooks (procfs entries), run after fib/rules setup. */
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};
/* Runs after addrconf's notifier (lower priority = later) so inet6_dev
 * state is settled before the special routes are (un)bound.
 */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
4048 void __init ip6_route_init_special_entries(void)
4050 /* Registering of the loopback is done before this portion of code,
4051 * the loopback reference in rt6_info will not be taken, do it
4052 * manually for init_net */
4053 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
4054 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4055 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4056 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
4057 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4058 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
4059 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4060 #endif
/*
 * Boot-time initialization of the IPv6 routing subsystem: slab cache,
 * pernet subsystems, FIB, xfrm, policy rules, rtnetlink handlers,
 * netdevice notifier and the per-CPU uncached-route lists.
 * Unwinds in reverse order on any failure.
 */
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	/* Blackhole dsts share the same slab cache as regular routes. */
	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	ret = -ENOBUFS;
	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

	/* Success falls through here with ret == 0. */
out:
	return ret;

out_register_late_subsys:
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
/*
 * Module/exit-time teardown: strict reverse of ip6_route_init().
 * Order matters: the notifier goes first so no more device events
 * touch routing state while it is being dismantled.
 */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}