Linux 5.1.15 - net/ipv6/route.c
/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable. otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <net/ip.h>
#include <linux/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

static int ip6_rt_type_to_error(u8 fib6_type);

#define CREATE_TRACE_POINTS
#include <trace/events/fib6.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
#undef CREATE_TRACE_POINTS

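/* Neighbour reachability verdicts used when scoring routes: the negative
 * values are failure modes of increasing severity (round-robin, probe,
 * hard failure); RT6_NUD_SUCCEED marks a usable next hop.
 */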
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};

static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
static unsigned int	 ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
static int		 ip6_dst_gc(struct dst_ops *ops);

static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int		ip6_pkt_prohibit(struct sk_buff *skb);
static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct fib6_info *rt);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_add_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev,
					    unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev);
#endif

struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

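/* Per-CPU lists of dst entries that are not owned by a fib6_node (e.g.
 * uncached clones); rt6_uncached_list_flush_dev() retargets them to the
 * loopback device when their device is unregistered.
 */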
void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}

static inline const void *choose_neigh_daddr(const struct in6_addr *p,
					     struct sk_buff *skb,
					     const void *daddr)
{
	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}

struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
{
	struct neighbour *n;

	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	if (n)
		return n;

	n = neigh_create(&nd_tbl, daddr, dev);
	return IS_ERR(n) ? NULL : n;
}

static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
}

static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}

static struct dst_ops ip6_dst_ops_template = {
	.family			= AF_INET6,
	.gc			= ip6_dst_gc,
	.gc_thresh		= 1024,
	.check			= ip6_dst_check,
	.default_advmss		= ip6_default_advmss,
	.mtu			= ip6_mtu,
	.cow_metrics		= dst_cow_metrics_generic,
	.destroy		= ip6_dst_destroy,
	.ifdown			= ip6_dst_ifdown,
	.negative_advice	= ip6_negative_advice,
	.link_failure		= ip6_link_failure,
	.update_pmtu		= ip6_rt_update_pmtu,
	.redirect		= rt6_do_redirect,
	.local_out		= __ip6_local_out,
	.neigh_lookup		= ip6_dst_neigh_lookup,
	.confirm_neigh		= ip6_confirm_neigh,
};

static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

static struct dst_ops ip6_dst_blackhole_ops = {
	.family			= AF_INET6,
	.destroy		= ip6_dst_destroy,
	.check			= ip6_dst_check,
	.mtu			= ip6_blackhole_mtu,
	.default_advmss		= ip6_default_advmss,
	.update_pmtu		= ip6_rt_blackhole_update_pmtu,
	.redirect		= ip6_rt_blackhole_redirect,
	.cow_metrics		= dst_cow_metrics_generic,
	.neigh_lookup		= ip6_dst_neigh_lookup,
};

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol	= RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};

static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif

static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}

/* allocate dst with ip6_dst_ops */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);

static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	ip_dst_metrics_put(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	from = xchg((__force struct fib6_info **)&rt->from, NULL);
	fib6_info_release(from);
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}

static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}

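/* Hash-threshold multipath selection: keep @match if the flow hash falls
 * within its next hop's upper bound, otherwise pick the first sibling
 * whose bound covers the hash and whose next hop scores as usable.
 */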
struct fib6_info *fib6_multipath_select(const struct net *net,
					struct fib6_info *match,
					struct flowi6 *fl6, int oif,
					const struct sk_buff *skb,
					int strict)
{
	struct fib6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}

/*
 *	Route lookup. rcu_read_lock() should be held.
 */

static inline struct fib6_info *rt6_device_match(struct net *net,
						 struct fib6_info *rt,
						 const struct in6_addr *saddr,
						 int oif,
						 int flags)
{
	struct fib6_info *sprt;

	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
		const struct net_device *dev = sprt->fib6_nh.nh_dev;

		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE)
		return net->ipv6.fib6_null_entry;

	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
}

#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}

static void rt6_probe(struct fib6_info *rt)
{
	struct __rt6_probe_work *work = NULL;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;
	struct inet6_dev *idev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
		return;

	nh_gw = &rt->fib6_nh.nh_gw;
	dev = rt->fib6_nh.nh_dev;
	rcu_read_lock_bh();
	idev = __in6_dev_get(dev);
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else if (time_after(jiffies, rt->last_probe +
			      idev->cnf.rtr_probe_interval)) {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		rt->last_probe = jiffies;
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct fib6_info *rt)
{
}
#endif

/*
 *	Default Router Selection (RFC 2461 6.3.6)
 */
static inline int rt6_check_dev(struct fib6_info *rt, int oif)
{
	const struct net_device *dev = rt->fib6_nh.nh_dev;

	if (!oif || dev->ifindex == oif)
		return 2;
	return 0;
}

static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	if (rt->fib6_flags & RTF_NONEXTHOP ||
	    !(rt->fib6_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
					  &rt->fib6_nh.nh_gw);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}

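/* Score a route for default router selection: an interface match
 * contributes the low bits, the RFC 4191 route preference is folded in
 * above them, and a negative return is an rt6_nud_state failure code.
 */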
static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}

/* called with rcu_read_lock held */
static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
{
	const struct net_device *dev = fib6_info_nh_dev(f6i);
	bool rc = false;

	if (dev) {
		const struct inet6_dev *idev = __in6_dev_get(dev);

		rc = !!idev->cnf.ignore_routes_with_linkdown;
	}

	return rc;
}

static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
				    int *mpri, struct fib6_info *match,
				    bool *do_rr)
{
	int m;
	bool match_do_rr = false;

	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		goto out;

	if (fib6_ignore_linkdown(rt) &&
	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (fib6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}

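/* Scan the routes that share @metric in round-robin order, starting at
 * @rr_head and wrapping around to @leaf; routes with other metrics are
 * deferred and only scanned if nothing matched at this metric.
 */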
static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				      struct fib6_info *leaf,
				      struct fib6_info *rr_head,
				      u32 metric, int oif, int strict,
				      bool *do_rr)
{
	struct fib6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}

static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
				    int oif, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.fib6_null_entry;
}

static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
{
	return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
}

#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
#endif

/*
 *	Misc support functions
 */

/* called with rcu_read_lock held */
static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
{
	struct net_device *dev = rt->fib6_nh.nh_dev;

	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}

static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};

static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}

static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
{
	unsigned short flags = 0;

	if (rt->dst_nocount)
		flags |= DST_NOCOUNT;
	if (rt->dst_nopolicy)
		flags |= DST_NOPOLICY;
	if (rt->dst_host)
		flags |= DST_HOST;

	return flags;
}

static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);

	switch (ort->fib6_type) {
	case RTN_BLACKHOLE:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}

static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
{
	if (ort->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, ort);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (ort->fib6_nh.nh_lwtstate) {
		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}

/* Caller must already hold reference to @from */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	rcu_assign_pointer(rt->from, from);
	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
}

/* Caller must already hold reference to @ort */
static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
{
	struct net_device *dev = fib6_info_nh_dev(ort);

	ip6_rt_init_dst(rt, ort);

	rt->rt6i_dst = ort->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
	rt->rt6i_flags = ort->fib6_flags;
	rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->fib6_src;
#endif
}

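/* Climb back up the trie after a failed lookup: descend into a parent's
 * source-address subtree when there is one, and stop at the first node
 * that carries routes (RTN_RTINFO) or at the tree root.
 */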
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}

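/* Take a reference on *prt unless its refcount already dropped to zero;
 * on failure substitute the null entry (if @null_fallback) or NULL.
 */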
static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
			  bool null_fallback)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (null_fallback) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}

/* called with rcu_read_lock held */
static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rt6_info *nrt;

	if (!fib6_info_hold_safe(rt))
		goto fallback;

	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (!nrt) {
		fib6_info_release(rt);
		goto fallback;
	}

	ip6_rt_copy_init(nrt, rt);
	return nrt;

fallback:
	nrt = dev_net(dev)->ipv6.ip6_null_entry;
	dst_hold(&nrt->dst);
	return nrt;
}

static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_info *f6i;
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	f6i = rcu_dereference(fn->leaf);
	if (!f6i) {
		f6i = net->ipv6.fib6_null_entry;
	} else {
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
				       fl6->flowi6_oif, flags);
		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
			f6i = fib6_multipath_select(net, f6i, fl6,
						    fl6->flowi6_oif, skb,
						    flags);
	}
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);
	} else if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = ip6_create_rt_rcu(f6i);
	}

	rcu_read_unlock();

	return rt;
}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);

struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);

/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason, the
 * route is released.
 * Caller must hold dst before calling it.
 */

static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}

static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (!fib6_info_hold_safe(ort))
		return NULL;

	dev = ip6_rt_get_dev_rcu(ort);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt) {
		fib6_info_release(ort);
		return NULL;
	}

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}

static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	if (!fib6_info_hold_safe(rt))
		return NULL;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt) {
		fib6_info_release(rt);
		return NULL;
	}
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt, false);

	return pcpu_rt;
}

static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	if (rt->fib6_destroying) {
		struct fib6_info *from;

		from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL);
		fib6_info_release(from);
	}

	return pcpu_rt;
}

/* exception hash table implementation
 */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct fib6_info *from;
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	net->ipv6.rt6_stats->fib_rt_cache--;

	/* purge completely the exception to allow releasing the held resources:
	 * some [sk] cache may keep the dst around for unlimited time
	 */
	from = xchg((__force struct fib6_info **)&rt6_ex->rt6i->from, NULL);
	fib6_info_release(from);
	dst_dev_put(&rt6_ex->rt6i->dst);

	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
}

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}

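/* Map a (daddr, saddr) pair to an exception bucket index; the hash is
 * seeded once per boot with a random value (net_get_random_once) so
 * bucket placement is not predictable from the addresses alone.
 */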
static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

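/* Effective MTU of a fib entry: the stored route PMTU if set, otherwise
 * the device MTU, capped at IP6_MAX_MTU and reduced by any lwtunnel
 * encapsulation headroom.
 */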
static unsigned int fib6_mtu(const struct fib6_info *rt)
{
	unsigned int mtu;

	if (rt->fib6_pmtu) {
		mtu = rt->fib6_pmtu;
	} else {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
}

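/* Insert @nrt as a cached (RTF_CACHE) exception of @ort, replacing any
 * existing exception for the same (daddr, saddr) pair and evicting the
 * oldest entry once a bucket grows beyond FIB6_MAX_DEPTH.
 */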
static int rt6_insert_exception(struct rt6_info *nrt,
				struct fib6_info *ort)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->fib6_table->tb6_lock);
		fib6_update_sernum(net, ort);
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}

void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() from recreating the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}

/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	const struct in6_addr *src_key = NULL;
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * However, the src addr used to create the hash
	 * might not be exactly the passed in saddr which
	 * is a /128 addr from the flow.
	 * So we need to use f6i->fib6_src to redo lookup
	 * if the passed in saddr does not find anything.
	 * (See the logic in ip6_rt_cache_alloc() on how
	 * rt->rt6i_src is updated.)
	 */
	if (rt->fib6_src.plen)
		src_key = saddr;
find_ex:
#endif
	bucket = rcu_dereference(rt->rt6i_exception_bucket);
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

#ifdef CONFIG_IPV6_SUBTREES
	/* Use fib6_src as src_key and redo lookup */
	if (!res && src_key && src_key != &rt->fib6_src.addr) {
		src_key = &rt->fib6_src.addr;
		goto find_ex;
	}
#endif

	return res;
}

/* Remove the passed in cached rt from the hash table that contains it */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	from = rcu_dereference(rt->from);
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}

/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	if (!from || !(rt->rt6i_flags & RTF_CACHE))
		goto unlock;

	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

unlock:
	rcu_read_unlock();
}

static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
	 *
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * handle this.
	 */

	if (dst_mtu(&rt->dst) >= mtu)
		return true;

	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
		return true;

	return false;
}

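/* Propagate an MTU change on @rt to its cached exceptions, subject to
 * the rules checked by rt6_mtu_change_route_allowed().
 * Caller must hold rt6_exception_lock.
 */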
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}

#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}

static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others have still references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}

void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}

/* must be called with rcu lock held */
struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
				    int oif, struct flowi6 *fl6, int strict)
{
	struct fib6_node *fn, *saved_fn;
	struct fib6_info *f6i;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	f6i = rt6_select(net, fn, oif, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	return f6i;
}

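/* Core policy lookup: resolve a fib6_info for @fl6 and hand back a dst
 * in one of three forms: a cached exception route, an uncached clone
 * (FLOWI_FLAG_KNOWN_NH without a gateway), or a per-CPU copy.
 */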
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_info *f6i;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	f6i = fib6_table_lookup(net, table, oif, fl6, strict);
	if (f6i->fib6_nsiblings)
		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);

	if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		return rt;
	}

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(f6i->fib6_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree. It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		return uncached_rt;
	} else {
		/* Get a percpu copy */
		struct rt6_info *pcpu_rt;

		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(f6i);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, f6i);

		local_bh_enable();
		rcu_read_unlock();

		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);

static struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}

struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6,
					 const struct sk_buff *skb,
					 int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);

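/* For ICMPv6 errors, hash on the offending packet's embedded (inner)
 * header so the error takes the same multipath route as the flow that
 * triggered it; otherwise use the outer header or pre-dissected keys.
 */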
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowlabel(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}

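/* Multipath hash policy 0 hashes the L3 fields only; policy 1 also
 * folds in the L4 ports, taken from @flkeys or dissected from the skb.
 */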
/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}

void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}

static struct rt6_info *ip6_pol_route_output(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
}

struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	if (ipv6_addr_type(&fl6->daddr) &
	    (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);

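/* Clone @dst_orig into a blackhole dst that keeps its metrics but
 * discards all traffic; used, e.g., while an xfrm bundle is still being
 * resolved.
 */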
2130 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2132 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
2133 struct net_device *loopback_dev = net->loopback_dev;
2134 struct dst_entry *new = NULL;
2136 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2137 DST_OBSOLETE_DEAD, 0);
2138 if (rt) {
2139 rt6_info_init(rt);
2140 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2142 new = &rt->dst;
2143 new->__use = 1;
2144 new->input = dst_discard;
2145 new->output = dst_discard_out;
2147 dst_copy_metrics(new, &ort->dst);
2149 rt->rt6i_idev = in6_dev_get(loopback_dev);
2150 rt->rt6i_gateway = ort->rt6i_gateway;
2151 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2153 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2154 #ifdef CONFIG_IPV6_SUBTREES
2155 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2156 #endif
2159 dst_release(dst_orig);
2160 return new ? new : ERR_PTR(-ENOMEM);
2161 }
2163 /*
2164 * Destination cache support functions
2165 */
2167 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2169 u32 rt_cookie = 0;
2171 if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2172 return false;
2174 if (fib6_check_expired(f6i))
2175 return false;
2177 return true;
2180 static struct dst_entry *rt6_check(struct rt6_info *rt,
2181 struct fib6_info *from,
2182 u32 cookie)
2184 u32 rt_cookie = 0;
2186 if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2187 rt_cookie != cookie)
2188 return NULL;
2190 if (rt6_check_expired(rt))
2191 return NULL;
2193 return &rt->dst;
2196 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2197 struct fib6_info *from,
2198 u32 cookie)
2200 if (!__rt6_check_expired(rt) &&
2201 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2202 fib6_check(from, cookie))
2203 return &rt->dst;
2204 else
2205 return NULL;
2208 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2210 struct dst_entry *dst_ret;
2211 struct fib6_info *from;
2212 struct rt6_info *rt;
2214 rt = container_of(dst, struct rt6_info, dst);
2216 rcu_read_lock();
2218 /* All IPV6 dsts are created with ->obsolete set to the value
2219 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2220 * into this function always.
2221 */
2223 from = rcu_dereference(rt->from);
2225 if (from && (rt->rt6i_flags & RTF_PCPU ||
2226 unlikely(!list_empty(&rt->rt6i_uncached))))
2227 dst_ret = rt6_dst_from_check(rt, from, cookie);
2228 else
2229 dst_ret = rt6_check(rt, from, cookie);
2231 rcu_read_unlock();
2233 return dst_ret;
2234 }
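/* Consumer-side sketch (illustrative): sockets cache a dst together with
 * a cookie and revalidate it through dst->ops->check(), which lands
 * here. A NULL return means the FIB has changed (or the route expired)
 * and the caller must perform a fresh lookup:
 *
 *	dst = __sk_dst_get(sk);
 *	if (!dst || !dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
 *		dst = NULL;	... stale: redo the route lookup ...
 *
 * Compare the check in ip6_sk_update_pmtu() further below.
 */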
2236 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2238 struct rt6_info *rt = (struct rt6_info *) dst;
2240 if (rt) {
2241 if (rt->rt6i_flags & RTF_CACHE) {
2242 rcu_read_lock();
2243 if (rt6_check_expired(rt)) {
2244 rt6_remove_exception_rt(rt);
2245 dst = NULL;
2247 rcu_read_unlock();
2248 } else {
2249 dst_release(dst);
2250 dst = NULL;
2253 return dst;
2256 static void ip6_link_failure(struct sk_buff *skb)
2258 struct rt6_info *rt;
2260 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2262 rt = (struct rt6_info *) skb_dst(skb);
2263 if (rt) {
2264 rcu_read_lock();
2265 if (rt->rt6i_flags & RTF_CACHE) {
2266 rt6_remove_exception_rt(rt);
2267 } else {
2268 struct fib6_info *from;
2269 struct fib6_node *fn;
2271 from = rcu_dereference(rt->from);
2272 if (from) {
2273 fn = rcu_dereference(from->fib6_node);
2274 if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2275 fn->fn_sernum = -1;
2278 rcu_read_unlock();
2282 static void rt6_update_expires(struct rt6_info *rt0, int timeout)
2284 if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
2285 struct fib6_info *from;
2287 rcu_read_lock();
2288 from = rcu_dereference(rt0->from);
2289 if (from)
2290 rt0->dst.expires = from->expires;
2291 rcu_read_unlock();
2294 dst_set_expires(&rt0->dst, timeout);
2295 rt0->rt6i_flags |= RTF_EXPIRES;
2298 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2300 struct net *net = dev_net(rt->dst.dev);
2302 dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2303 rt->rt6i_flags |= RTF_MODIFIED;
2304 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2307 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2309 return !(rt->rt6i_flags & RTF_CACHE) &&
2310 (rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from));
2313 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2314 const struct ipv6hdr *iph, u32 mtu)
2316 const struct in6_addr *daddr, *saddr;
2317 struct rt6_info *rt6 = (struct rt6_info *)dst;
2319 if (dst_metric_locked(dst, RTAX_MTU))
2320 return;
2322 if (iph) {
2323 daddr = &iph->daddr;
2324 saddr = &iph->saddr;
2325 } else if (sk) {
2326 daddr = &sk->sk_v6_daddr;
2327 saddr = &inet6_sk(sk)->saddr;
2328 } else {
2329 daddr = NULL;
2330 saddr = NULL;
2332 dst_confirm_neigh(dst, daddr);
2333 mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2334 if (mtu >= dst_mtu(dst))
2335 return;
2337 if (!rt6_cache_allowed_for_pmtu(rt6)) {
2338 rt6_do_update_pmtu(rt6, mtu);
2339 /* update rt6_ex->stamp for cache */
2340 if (rt6->rt6i_flags & RTF_CACHE)
2341 rt6_update_exception_stamp_rt(rt6);
2342 } else if (daddr) {
2343 struct fib6_info *from;
2344 struct rt6_info *nrt6;
2346 rcu_read_lock();
2347 from = rcu_dereference(rt6->from);
2348 if (!from) {
2349 rcu_read_unlock();
2350 return;
2352 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
2353 if (nrt6) {
2354 rt6_do_update_pmtu(nrt6, mtu);
2355 if (rt6_insert_exception(nrt6, from))
2356 dst_release_immediate(&nrt6->dst);
2358 rcu_read_unlock();
2359 }
2360 }
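/* Summary (editorial): RTF_CACHE clones, and routes no longer tied to a
 * fib6_info, are updated in place; a route still backed by a fib6_info
 * instead gets a new RTF_CACHE exception carrying the reduced MTU,
 * allocated with ip6_rt_cache_alloc() and stored via
 * rt6_insert_exception(), so later lookups for this daddr/saddr pair
 * pick up the learned PMTU through rt6_find_cached_rt().
 */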
2362 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2363 struct sk_buff *skb, u32 mtu)
2365 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2368 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2369 int oif, u32 mark, kuid_t uid)
2371 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2372 struct dst_entry *dst;
2373 struct flowi6 fl6 = {
2374 .flowi6_oif = oif,
2375 .flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
2376 .daddr = iph->daddr,
2377 .saddr = iph->saddr,
2378 .flowlabel = ip6_flowinfo(iph),
2379 .flowi6_uid = uid,
2382 dst = ip6_route_output(net, NULL, &fl6);
2383 if (!dst->error)
2384 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2385 dst_release(dst);
2386 }
2387 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
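/* Illustrative caller sketch (assumptions marked): a tunnel error
 * handler reacting to an ICMPV6_PKT_TOOBIG message, where "info" is the
 * MTU taken from the ICMPv6 header, still in network byte order as this
 * function expects a __be32:
 *
 *	if (type == ICMPV6_PKT_TOOBIG)
 *		ip6_update_pmtu(skb, net, info, 0, 0,
 *				sock_net_uid(net, NULL));
 */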
2389 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2391 int oif = sk->sk_bound_dev_if;
2392 struct dst_entry *dst;
2394 if (!oif && skb->dev)
2395 oif = l3mdev_master_ifindex(skb->dev);
2397 ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);
2399 dst = __sk_dst_get(sk);
2400 if (!dst || !dst->obsolete ||
2401 dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2402 return;
2404 bh_lock_sock(sk);
2405 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2406 ip6_datagram_dst_update(sk, false);
2407 bh_unlock_sock(sk);
2409 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2411 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2412 const struct flowi6 *fl6)
2414 #ifdef CONFIG_IPV6_SUBTREES
2415 struct ipv6_pinfo *np = inet6_sk(sk);
2416 #endif
2418 ip6_dst_store(sk, dst,
2419 ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2420 &sk->sk_v6_daddr : NULL,
2421 #ifdef CONFIG_IPV6_SUBTREES
2422 ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2423 &np->saddr :
2424 #endif
2425 NULL);
2428 /* Handle redirects */
2429 struct ip6rd_flowi {
2430 struct flowi6 fl6;
2431 struct in6_addr gateway;
2434 static struct rt6_info *__ip6_route_redirect(struct net *net,
2435 struct fib6_table *table,
2436 struct flowi6 *fl6,
2437 const struct sk_buff *skb,
2438 int flags)
2440 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2441 struct rt6_info *ret = NULL, *rt_cache;
2442 struct fib6_info *rt;
2443 struct fib6_node *fn;
2445 /* l3mdev_update_flow overrides oif if the device is enslaved; in
2446 * this case we must match on the real ingress device, so reset it
2447 */
2448 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
2449 fl6->flowi6_oif = skb->dev->ifindex;
2451 /* Get the "current" route for this destination and
2452 * check if the redirect has come from the appropriate router.
2453 *
2454 * RFC 4861 specifies that redirects should only be
2455 * accepted if they come from the nexthop to the target.
2456 * Due to the way the routes are chosen, this notion
2457 * is a bit fuzzy and one might need to check all possible
2458 * routes.
2459 */
2461 rcu_read_lock();
2462 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2463 restart:
2464 for_each_fib6_node_rt_rcu(fn) {
2465 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
2466 continue;
2467 if (fib6_check_expired(rt))
2468 continue;
2469 if (rt->fib6_flags & RTF_REJECT)
2470 break;
2471 if (!(rt->fib6_flags & RTF_GATEWAY))
2472 continue;
2473 if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
2474 continue;
2475 /* rt_cache's gateway might be different from its 'parent'
2476 * in the case of an ip redirect.
2477 * So we keep searching in the exception table if the gateway
2478 * is different.
2479 */
2480 if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
2481 rt_cache = rt6_find_cached_rt(rt,
2482 &fl6->daddr,
2483 &fl6->saddr);
2484 if (rt_cache &&
2485 ipv6_addr_equal(&rdfl->gateway,
2486 &rt_cache->rt6i_gateway)) {
2487 ret = rt_cache;
2488 break;
2490 continue;
2492 break;
2495 if (!rt)
2496 rt = net->ipv6.fib6_null_entry;
2497 else if (rt->fib6_flags & RTF_REJECT) {
2498 ret = net->ipv6.ip6_null_entry;
2499 goto out;
2502 if (rt == net->ipv6.fib6_null_entry) {
2503 fn = fib6_backtrack(fn, &fl6->saddr);
2504 if (fn)
2505 goto restart;
2508 out:
2509 if (ret)
2510 ip6_hold_safe(net, &ret, true);
2511 else
2512 ret = ip6_create_rt_rcu(rt);
2514 rcu_read_unlock();
2516 trace_fib6_table_lookup(net, rt, table, fl6);
2517 return ret;
2520 static struct dst_entry *ip6_route_redirect(struct net *net,
2521 const struct flowi6 *fl6,
2522 const struct sk_buff *skb,
2523 const struct in6_addr *gateway)
2525 int flags = RT6_LOOKUP_F_HAS_SADDR;
2526 struct ip6rd_flowi rdfl;
2528 rdfl.fl6 = *fl6;
2529 rdfl.gateway = *gateway;
2531 return fib6_rule_lookup(net, &rdfl.fl6, skb,
2532 flags, __ip6_route_redirect);
2535 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2536 kuid_t uid)
2538 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2539 struct dst_entry *dst;
2540 struct flowi6 fl6 = {
2541 .flowi6_iif = LOOPBACK_IFINDEX,
2542 .flowi6_oif = oif,
2543 .flowi6_mark = mark,
2544 .daddr = iph->daddr,
2545 .saddr = iph->saddr,
2546 .flowlabel = ip6_flowinfo(iph),
2547 .flowi6_uid = uid,
2550 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2551 rt6_do_redirect(dst, NULL, skb);
2552 dst_release(dst);
2554 EXPORT_SYMBOL_GPL(ip6_redirect);
2556 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
2558 const struct ipv6hdr *iph = ipv6_hdr(skb);
2559 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2560 struct dst_entry *dst;
2561 struct flowi6 fl6 = {
2562 .flowi6_iif = LOOPBACK_IFINDEX,
2563 .flowi6_oif = oif,
2564 .daddr = msg->dest,
2565 .saddr = iph->daddr,
2566 .flowi6_uid = sock_net_uid(net, NULL),
2569 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2570 rt6_do_redirect(dst, NULL, skb);
2571 dst_release(dst);
2574 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2576 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2577 sk->sk_uid);
2579 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2581 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2583 struct net_device *dev = dst->dev;
2584 unsigned int mtu = dst_mtu(dst);
2585 struct net *net = dev_net(dev);
2587 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2589 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2590 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2592 /*
2593 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2594 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2595 * IPV6_MAXPLEN is also valid and means: "any MSS,
2596 * rely only on pmtu discovery"
2597 */
2598 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2599 mtu = IPV6_MAXPLEN;
2600 return mtu;
2601 }
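/* Worked example (editorial): on a standard 1500-byte Ethernet MTU the
 * advertised MSS is
 *	1500 - sizeof(struct ipv6hdr) - sizeof(struct tcphdr)
 *	     = 1500 - 40 - 20 = 1440 bytes,
 * subject to the ip6_rt_min_advmss sysctl floor, and capped so that
 * anything above IPV6_MAXPLEN - 20 is reported as IPV6_MAXPLEN
 * ("any MSS, rely on PMTU discovery").
 */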
2603 static unsigned int ip6_mtu(const struct dst_entry *dst)
2605 struct inet6_dev *idev;
2606 unsigned int mtu;
2608 mtu = dst_metric_raw(dst, RTAX_MTU);
2609 if (mtu)
2610 goto out;
2612 mtu = IPV6_MIN_MTU;
2614 rcu_read_lock();
2615 idev = __in6_dev_get(dst->dev);
2616 if (idev)
2617 mtu = idev->cnf.mtu6;
2618 rcu_read_unlock();
2620 out:
2621 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2623 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2626 /* MTU selection:
2627 * 1. mtu on route is locked - use it
2628 * 2. mtu from nexthop exception
2629 * 3. mtu from egress device
2631 * based on ip6_dst_mtu_forward and exception logic of
2632 * rt6_find_cached_rt; called with rcu_read_lock
2633 */
2634 u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
2635 struct in6_addr *saddr)
2637 struct inet6_dev *idev;
2638 struct rt6_info *rt;
2639 u32 mtu = 0;
2641 if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
2642 mtu = f6i->fib6_pmtu;
2643 if (mtu)
2644 goto out;
2647 rt = rt6_find_cached_rt(f6i, daddr, saddr);
2648 if (unlikely(rt)) {
2649 mtu = dst_metric_raw(&rt->dst, RTAX_MTU);
2650 } else {
2651 struct net_device *dev = fib6_info_nh_dev(f6i);
2653 mtu = IPV6_MIN_MTU;
2654 idev = __in6_dev_get(dev);
2655 if (idev && idev->cnf.mtu6 > mtu)
2656 mtu = idev->cnf.mtu6;
2659 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2660 out:
2661 return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
2664 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2665 struct flowi6 *fl6)
2667 struct dst_entry *dst;
2668 struct rt6_info *rt;
2669 struct inet6_dev *idev = in6_dev_get(dev);
2670 struct net *net = dev_net(dev);
2672 if (unlikely(!idev))
2673 return ERR_PTR(-ENODEV);
2675 rt = ip6_dst_alloc(net, dev, 0);
2676 if (unlikely(!rt)) {
2677 in6_dev_put(idev);
2678 dst = ERR_PTR(-ENOMEM);
2679 goto out;
2682 rt->dst.flags |= DST_HOST;
2683 rt->dst.input = ip6_input;
2684 rt->dst.output = ip6_output;
2685 rt->rt6i_gateway = fl6->daddr;
2686 rt->rt6i_dst.addr = fl6->daddr;
2687 rt->rt6i_dst.plen = 128;
2688 rt->rt6i_idev = idev;
2689 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2691 /* Add this dst into uncached_list so that rt6_disable_ip() can
2692 * do proper release of the net_device
2693 */
2694 rt6_uncached_list_add(rt);
2695 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2697 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2699 out:
2700 return dst;
2703 static int ip6_dst_gc(struct dst_ops *ops)
2705 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2706 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2707 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2708 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2709 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2710 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2711 int entries;
2713 entries = dst_entries_get_fast(ops);
2714 if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2715 entries <= rt_max_size)
2716 goto out;
2718 net->ipv6.ip6_rt_gc_expire++;
2719 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2720 entries = dst_entries_get_slow(ops);
2721 if (entries < ops->gc_thresh)
2722 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2723 out:
2724 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2725 return entries > rt_max_size;
2726 }
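/* GC pacing (editorial note): ip6_rt_gc_expire grows by one on each
 * forced pass and afterwards decays by expire >> elasticity, so with
 * the default ip6_rt_gc_elasticity of 9 roughly 1/512 of the
 * accumulated aggressiveness drains per call; a pass that brings
 * entries under gc_thresh resets it to ip6_rt_gc_timeout / 2.
 */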
2728 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2729 struct fib6_config *cfg,
2730 const struct in6_addr *gw_addr,
2731 u32 tbid, int flags)
2733 struct flowi6 fl6 = {
2734 .flowi6_oif = cfg->fc_ifindex,
2735 .daddr = *gw_addr,
2736 .saddr = cfg->fc_prefsrc,
2738 struct fib6_table *table;
2739 struct rt6_info *rt;
2741 table = fib6_get_table(net, tbid);
2742 if (!table)
2743 return NULL;
2745 if (!ipv6_addr_any(&cfg->fc_prefsrc))
2746 flags |= RT6_LOOKUP_F_HAS_SADDR;
2748 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2749 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2751 /* if table lookup failed, fall back to full lookup */
2752 if (rt == net->ipv6.ip6_null_entry) {
2753 ip6_rt_put(rt);
2754 rt = NULL;
2757 return rt;
2760 static int ip6_route_check_nh_onlink(struct net *net,
2761 struct fib6_config *cfg,
2762 const struct net_device *dev,
2763 struct netlink_ext_ack *extack)
2765 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2766 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2767 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2768 struct fib6_info *from;
2769 struct rt6_info *grt;
2770 int err;
2772 err = 0;
2773 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2774 if (grt) {
2775 rcu_read_lock();
2776 from = rcu_dereference(grt->from);
2777 if (!grt->dst.error &&
2778 /* ignore match if it is the default route */
2779 from && !ipv6_addr_any(&from->fib6_dst.addr) &&
2780 (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2781 NL_SET_ERR_MSG(extack,
2782 "Nexthop has invalid gateway or device mismatch");
2783 err = -EINVAL;
2785 rcu_read_unlock();
2787 ip6_rt_put(grt);
2790 return err;
2791 }
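/* Userspace equivalent of the onlink case (illustrative example using
 * documentation addresses):
 *
 *	ip -6 route add 2001:db8::/64 via 2001:db8:1::1 dev eth0 onlink
 *
 * The gateway is asserted to be reachable on the link even though no
 * connected route covers it, so only local/anycast/reject conflicts and
 * device mismatches are rejected above.
 */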
2793 static int ip6_route_check_nh(struct net *net,
2794 struct fib6_config *cfg,
2795 struct net_device **_dev,
2796 struct inet6_dev **idev)
2798 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2799 struct net_device *dev = _dev ? *_dev : NULL;
2800 struct rt6_info *grt = NULL;
2801 int err = -EHOSTUNREACH;
2803 if (cfg->fc_table) {
2804 int flags = RT6_LOOKUP_F_IFACE;
2806 grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2807 cfg->fc_table, flags);
2808 if (grt) {
2809 if (grt->rt6i_flags & RTF_GATEWAY ||
2810 (dev && dev != grt->dst.dev)) {
2811 ip6_rt_put(grt);
2812 grt = NULL;
2817 if (!grt)
2818 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2820 if (!grt)
2821 goto out;
2823 if (dev) {
2824 if (dev != grt->dst.dev) {
2825 ip6_rt_put(grt);
2826 goto out;
2828 } else {
2829 *_dev = dev = grt->dst.dev;
2830 *idev = grt->rt6i_idev;
2831 dev_hold(dev);
2832 in6_dev_hold(grt->rt6i_idev);
2835 if (!(grt->rt6i_flags & RTF_GATEWAY))
2836 err = 0;
2838 ip6_rt_put(grt);
2840 out:
2841 return err;
2844 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2845 struct net_device **_dev, struct inet6_dev **idev,
2846 struct netlink_ext_ack *extack)
2848 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2849 int gwa_type = ipv6_addr_type(gw_addr);
2850 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2851 const struct net_device *dev = *_dev;
2852 bool need_addr_check = !dev;
2853 int err = -EINVAL;
2855 /* if gw_addr is local we will fail to detect this in case
2856 * the address is still TENTATIVE (DAD in progress). rt6_lookup()
2857 * will return the already-added prefix route via the interface
2858 * the prefix route was assigned to, which might be non-loopback.
2859 */
2860 if (dev &&
2861 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2862 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2863 goto out;
2866 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2867 /* IPv6 strictly forbids using non-link-local
2868 * addresses as a nexthop address.
2869 * Otherwise, the router will not be able to send redirects.
2870 * That is very good, but in some (rare!) circumstances
2871 * (SIT, PtP, NBMA NOARP links) it is handy to allow
2872 * some exceptions. --ANK
2873 * We allow IPv4-mapped nexthops to support RFC 4798-type
2874 * addressing.
2875 */
2876 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2877 NL_SET_ERR_MSG(extack, "Invalid gateway address");
2878 goto out;
2881 if (cfg->fc_flags & RTNH_F_ONLINK)
2882 err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2883 else
2884 err = ip6_route_check_nh(net, cfg, _dev, idev);
2886 if (err)
2887 goto out;
2890 /* reload in case device was changed */
2891 dev = *_dev;
2893 err = -EINVAL;
2894 if (!dev) {
2895 NL_SET_ERR_MSG(extack, "Egress device not specified");
2896 goto out;
2897 } else if (dev->flags & IFF_LOOPBACK) {
2898 NL_SET_ERR_MSG(extack,
2899 "Egress device can not be loopback device for this route");
2900 goto out;
2903 /* if we did not check gw_addr above, do so now that the
2904 * egress device has been resolved.
2905 */
2906 if (need_addr_check &&
2907 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2908 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2909 goto out;
2912 err = 0;
2913 out:
2914 return err;
2917 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
2918 gfp_t gfp_flags,
2919 struct netlink_ext_ack *extack)
2921 struct net *net = cfg->fc_nlinfo.nl_net;
2922 struct fib6_info *rt = NULL;
2923 struct net_device *dev = NULL;
2924 struct inet6_dev *idev = NULL;
2925 struct fib6_table *table;
2926 int addr_type;
2927 int err = -EINVAL;
2929 /* RTF_PCPU is an internal flag; cannot be set by userspace */
2930 if (cfg->fc_flags & RTF_PCPU) {
2931 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2932 goto out;
2935 /* RTF_CACHE is an internal flag; cannot be set by userspace */
2936 if (cfg->fc_flags & RTF_CACHE) {
2937 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2938 goto out;
2941 if (cfg->fc_type > RTN_MAX) {
2942 NL_SET_ERR_MSG(extack, "Invalid route type");
2943 goto out;
2946 if (cfg->fc_dst_len > 128) {
2947 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2948 goto out;
2950 if (cfg->fc_src_len > 128) {
2951 NL_SET_ERR_MSG(extack, "Invalid source address length");
2952 goto out;
2954 #ifndef CONFIG_IPV6_SUBTREES
2955 if (cfg->fc_src_len) {
2956 NL_SET_ERR_MSG(extack,
2957 "Specifying source address requires IPV6_SUBTREES to be enabled");
2958 goto out;
2960 #endif
2961 if (cfg->fc_ifindex) {
2962 err = -ENODEV;
2963 dev = dev_get_by_index(net, cfg->fc_ifindex);
2964 if (!dev)
2965 goto out;
2966 idev = in6_dev_get(dev);
2967 if (!idev)
2968 goto out;
2971 if (cfg->fc_metric == 0)
2972 cfg->fc_metric = IP6_RT_PRIO_USER;
2974 if (cfg->fc_flags & RTNH_F_ONLINK) {
2975 if (!dev) {
2976 NL_SET_ERR_MSG(extack,
2977 "Nexthop device required for onlink");
2978 err = -ENODEV;
2979 goto out;
2982 if (!(dev->flags & IFF_UP)) {
2983 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2984 err = -ENETDOWN;
2985 goto out;
2989 err = -ENOBUFS;
2990 if (cfg->fc_nlinfo.nlh &&
2991 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2992 table = fib6_get_table(net, cfg->fc_table);
2993 if (!table) {
2994 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2995 table = fib6_new_table(net, cfg->fc_table);
2997 } else {
2998 table = fib6_new_table(net, cfg->fc_table);
3001 if (!table)
3002 goto out;
3004 err = -ENOMEM;
3005 rt = fib6_info_alloc(gfp_flags);
3006 if (!rt)
3007 goto out;
3009 rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
3010 extack);
3011 if (IS_ERR(rt->fib6_metrics)) {
3012 err = PTR_ERR(rt->fib6_metrics);
3013 /* Do not leave garbage there. */
3014 rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
3015 goto out;
3018 if (cfg->fc_flags & RTF_ADDRCONF)
3019 rt->dst_nocount = true;
3021 if (cfg->fc_flags & RTF_EXPIRES)
3022 fib6_set_expires(rt, jiffies +
3023 clock_t_to_jiffies(cfg->fc_expires));
3024 else
3025 fib6_clean_expires(rt);
3027 if (cfg->fc_protocol == RTPROT_UNSPEC)
3028 cfg->fc_protocol = RTPROT_BOOT;
3029 rt->fib6_protocol = cfg->fc_protocol;
3031 addr_type = ipv6_addr_type(&cfg->fc_dst);
3033 if (cfg->fc_encap) {
3034 struct lwtunnel_state *lwtstate;
3036 err = lwtunnel_build_state(cfg->fc_encap_type,
3037 cfg->fc_encap, AF_INET6, cfg,
3038 &lwtstate, extack);
3039 if (err)
3040 goto out;
3041 rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
3044 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
3045 rt->fib6_dst.plen = cfg->fc_dst_len;
3046 if (rt->fib6_dst.plen == 128)
3047 rt->dst_host = true;
3049 #ifdef CONFIG_IPV6_SUBTREES
3050 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
3051 rt->fib6_src.plen = cfg->fc_src_len;
3052 #endif
3054 rt->fib6_metric = cfg->fc_metric;
3055 rt->fib6_nh.nh_weight = 1;
3057 rt->fib6_type = cfg->fc_type;
3059 /* We cannot add true routes via loopback here,
3060 they would result in kernel looping; promote them to reject routes
3061 */
3062 if ((cfg->fc_flags & RTF_REJECT) ||
3063 (dev && (dev->flags & IFF_LOOPBACK) &&
3064 !(addr_type & IPV6_ADDR_LOOPBACK) &&
3065 !(cfg->fc_flags & RTF_LOCAL))) {
3066 /* hold loopback dev/idev if we haven't done so. */
3067 if (dev != net->loopback_dev) {
3068 if (dev) {
3069 dev_put(dev);
3070 in6_dev_put(idev);
3072 dev = net->loopback_dev;
3073 dev_hold(dev);
3074 idev = in6_dev_get(dev);
3075 if (!idev) {
3076 err = -ENODEV;
3077 goto out;
3080 rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
3081 goto install_route;
3084 if (cfg->fc_flags & RTF_GATEWAY) {
3085 err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
3086 if (err)
3087 goto out;
3089 rt->fib6_nh.nh_gw = cfg->fc_gateway;
3092 err = -ENODEV;
3093 if (!dev)
3094 goto out;
3096 if (idev->cnf.disable_ipv6) {
3097 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3098 err = -EACCES;
3099 goto out;
3102 if (!(dev->flags & IFF_UP)) {
3103 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3104 err = -ENETDOWN;
3105 goto out;
3108 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3109 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3110 NL_SET_ERR_MSG(extack, "Invalid source address");
3111 err = -EINVAL;
3112 goto out;
3114 rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3115 rt->fib6_prefsrc.plen = 128;
3116 } else
3117 rt->fib6_prefsrc.plen = 0;
3119 rt->fib6_flags = cfg->fc_flags;
3121 install_route:
3122 if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3123 !netif_carrier_ok(dev))
3124 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
3125 rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
3126 rt->fib6_nh.nh_dev = dev;
3127 rt->fib6_table = table;
3129 if (idev)
3130 in6_dev_put(idev);
3132 return rt;
3133 out:
3134 if (dev)
3135 dev_put(dev);
3136 if (idev)
3137 in6_dev_put(idev);
3139 fib6_info_release(rt);
3140 return ERR_PTR(err);
3143 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3144 struct netlink_ext_ack *extack)
3146 struct fib6_info *rt;
3147 int err;
3149 rt = ip6_route_info_create(cfg, gfp_flags, extack);
3150 if (IS_ERR(rt))
3151 return PTR_ERR(rt);
3153 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3154 fib6_info_release(rt);
3156 return err;
3157 }
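/* Minimal in-kernel usage sketch (illustrative; "prefix" and "dev" are
 * hypothetical locals): adding a user-priority unicast route:
 *
 *	struct fib6_config cfg = {
 *		.fc_table	  = RT6_TABLE_MAIN,
 *		.fc_metric	  = IP6_RT_PRIO_USER,
 *		.fc_dst		  = *prefix,
 *		.fc_dst_len	  = 64,
 *		.fc_ifindex	  = dev->ifindex,
 *		.fc_flags	  = RTF_UP,
 *		.fc_nlinfo.nl_net = net,
 *	};
 *	int err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
 */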
3159 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3161 struct net *net = info->nl_net;
3162 struct fib6_table *table;
3163 int err;
3165 if (rt == net->ipv6.fib6_null_entry) {
3166 err = -ENOENT;
3167 goto out;
3170 table = rt->fib6_table;
3171 spin_lock_bh(&table->tb6_lock);
3172 err = fib6_del(rt, info);
3173 spin_unlock_bh(&table->tb6_lock);
3175 out:
3176 fib6_info_release(rt);
3177 return err;
3180 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3182 struct nl_info info = { .nl_net = net };
3184 return __ip6_del_rt(rt, &info);
3187 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3189 struct nl_info *info = &cfg->fc_nlinfo;
3190 struct net *net = info->nl_net;
3191 struct sk_buff *skb = NULL;
3192 struct fib6_table *table;
3193 int err = -ENOENT;
3195 if (rt == net->ipv6.fib6_null_entry)
3196 goto out_put;
3197 table = rt->fib6_table;
3198 spin_lock_bh(&table->tb6_lock);
3200 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3201 struct fib6_info *sibling, *next_sibling;
3203 /* prefer to send a single notification with all hops */
3204 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3205 if (skb) {
3206 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3208 if (rt6_fill_node(net, skb, rt, NULL,
3209 NULL, NULL, 0, RTM_DELROUTE,
3210 info->portid, seq, 0) < 0) {
3211 kfree_skb(skb);
3212 skb = NULL;
3213 } else
3214 info->skip_notify = 1;
3217 list_for_each_entry_safe(sibling, next_sibling,
3218 &rt->fib6_siblings,
3219 fib6_siblings) {
3220 err = fib6_del(sibling, info);
3221 if (err)
3222 goto out_unlock;
3226 err = fib6_del(rt, info);
3227 out_unlock:
3228 spin_unlock_bh(&table->tb6_lock);
3229 out_put:
3230 fib6_info_release(rt);
3232 if (skb) {
3233 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3234 info->nlh, gfp_any());
3236 return err;
3239 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3241 int rc = -ESRCH;
3243 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3244 goto out;
3246 if (cfg->fc_flags & RTF_GATEWAY &&
3247 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3248 goto out;
3250 rc = rt6_remove_exception_rt(rt);
3251 out:
3252 return rc;
3255 static int ip6_route_del(struct fib6_config *cfg,
3256 struct netlink_ext_ack *extack)
3258 struct rt6_info *rt_cache;
3259 struct fib6_table *table;
3260 struct fib6_info *rt;
3261 struct fib6_node *fn;
3262 int err = -ESRCH;
3264 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3265 if (!table) {
3266 NL_SET_ERR_MSG(extack, "FIB table does not exist");
3267 return err;
3270 rcu_read_lock();
3272 fn = fib6_locate(&table->tb6_root,
3273 &cfg->fc_dst, cfg->fc_dst_len,
3274 &cfg->fc_src, cfg->fc_src_len,
3275 !(cfg->fc_flags & RTF_CACHE));
3277 if (fn) {
3278 for_each_fib6_node_rt_rcu(fn) {
3279 if (cfg->fc_flags & RTF_CACHE) {
3280 int rc;
3282 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3283 &cfg->fc_src);
3284 if (rt_cache) {
3285 rc = ip6_del_cached_rt(rt_cache, cfg);
3286 if (rc != -ESRCH) {
3287 rcu_read_unlock();
3288 return rc;
3291 continue;
3293 if (cfg->fc_ifindex &&
3294 (!rt->fib6_nh.nh_dev ||
3295 rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
3296 continue;
3297 if (cfg->fc_flags & RTF_GATEWAY &&
3298 !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
3299 continue;
3300 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3301 continue;
3302 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3303 continue;
3304 if (!fib6_info_hold_safe(rt))
3305 continue;
3306 rcu_read_unlock();
3308 /* if gateway was specified only delete the one hop */
3309 if (cfg->fc_flags & RTF_GATEWAY)
3310 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3312 return __ip6_del_rt_siblings(rt, cfg);
3315 rcu_read_unlock();
3317 return err;
3320 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3322 struct netevent_redirect netevent;
3323 struct rt6_info *rt, *nrt = NULL;
3324 struct ndisc_options ndopts;
3325 struct inet6_dev *in6_dev;
3326 struct neighbour *neigh;
3327 struct fib6_info *from;
3328 struct rd_msg *msg;
3329 int optlen, on_link;
3330 u8 *lladdr;
3332 optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3333 optlen -= sizeof(*msg);
3335 if (optlen < 0) {
3336 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3337 return;
3340 msg = (struct rd_msg *)icmp6_hdr(skb);
3342 if (ipv6_addr_is_multicast(&msg->dest)) {
3343 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3344 return;
3347 on_link = 0;
3348 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3349 on_link = 1;
3350 } else if (ipv6_addr_type(&msg->target) !=
3351 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3352 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3353 return;
3356 in6_dev = __in6_dev_get(skb->dev);
3357 if (!in6_dev)
3358 return;
3359 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3360 return;
3362 /* RFC2461 8.1:
3363 * The IP source address of the Redirect MUST be the same as the current
3364 * first-hop router for the specified ICMP Destination Address.
3365 */
3367 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3368 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3369 return;
3372 lladdr = NULL;
3373 if (ndopts.nd_opts_tgt_lladdr) {
3374 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3375 skb->dev);
3376 if (!lladdr) {
3377 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3378 return;
3382 rt = (struct rt6_info *) dst;
3383 if (rt->rt6i_flags & RTF_REJECT) {
3384 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3385 return;
3388 /* Redirect received -> path was valid.
3389 * Look, redirects are sent only in response to data packets,
3390 * so this nexthop is apparently reachable. --ANK
3391 */
3392 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3394 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3395 if (!neigh)
3396 return;
3398 /*
3399 * We have finally decided to accept it.
3400 */
3402 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3403 NEIGH_UPDATE_F_WEAK_OVERRIDE|
3404 NEIGH_UPDATE_F_OVERRIDE|
3405 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3406 NEIGH_UPDATE_F_ISROUTER)),
3407 NDISC_REDIRECT, &ndopts);
3409 rcu_read_lock();
3410 from = rcu_dereference(rt->from);
3411 if (!from)
3412 goto out;
3414 nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3415 if (!nrt)
3416 goto out;
3418 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3419 if (on_link)
3420 nrt->rt6i_flags &= ~RTF_GATEWAY;
3422 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3424 /* rt6_insert_exception() will take care of duplicated exceptions */
3425 if (rt6_insert_exception(nrt, from)) {
3426 dst_release_immediate(&nrt->dst);
3427 goto out;
3430 netevent.old = &rt->dst;
3431 netevent.new = &nrt->dst;
3432 netevent.daddr = &msg->dest;
3433 netevent.neigh = neigh;
3434 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3436 out:
3437 rcu_read_unlock();
3438 neigh_release(neigh);
3441 #ifdef CONFIG_IPV6_ROUTE_INFO
3442 static struct fib6_info *rt6_get_route_info(struct net *net,
3443 const struct in6_addr *prefix, int prefixlen,
3444 const struct in6_addr *gwaddr,
3445 struct net_device *dev)
3447 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3448 int ifindex = dev->ifindex;
3449 struct fib6_node *fn;
3450 struct fib6_info *rt = NULL;
3451 struct fib6_table *table;
3453 table = fib6_get_table(net, tb_id);
3454 if (!table)
3455 return NULL;
3457 rcu_read_lock();
3458 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3459 if (!fn)
3460 goto out;
3462 for_each_fib6_node_rt_rcu(fn) {
3463 if (rt->fib6_nh.nh_dev->ifindex != ifindex)
3464 continue;
3465 if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3466 continue;
3467 if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
3468 continue;
3469 if (!fib6_info_hold_safe(rt))
3470 continue;
3471 break;
3473 out:
3474 rcu_read_unlock();
3475 return rt;
3478 static struct fib6_info *rt6_add_route_info(struct net *net,
3479 const struct in6_addr *prefix, int prefixlen,
3480 const struct in6_addr *gwaddr,
3481 struct net_device *dev,
3482 unsigned int pref)
3484 struct fib6_config cfg = {
3485 .fc_metric = IP6_RT_PRIO_USER,
3486 .fc_ifindex = dev->ifindex,
3487 .fc_dst_len = prefixlen,
3488 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3489 RTF_UP | RTF_PREF(pref),
3490 .fc_protocol = RTPROT_RA,
3491 .fc_type = RTN_UNICAST,
3492 .fc_nlinfo.portid = 0,
3493 .fc_nlinfo.nlh = NULL,
3494 .fc_nlinfo.nl_net = net,
3497 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3498 cfg.fc_dst = *prefix;
3499 cfg.fc_gateway = *gwaddr;
3501 /* We should treat it as a default route if prefix length is 0. */
3502 if (!prefixlen)
3503 cfg.fc_flags |= RTF_DEFAULT;
3505 ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3507 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3509 #endif
3511 struct fib6_info *rt6_get_dflt_router(struct net *net,
3512 const struct in6_addr *addr,
3513 struct net_device *dev)
3515 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3516 struct fib6_info *rt;
3517 struct fib6_table *table;
3519 table = fib6_get_table(net, tb_id);
3520 if (!table)
3521 return NULL;
3523 rcu_read_lock();
3524 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3525 if (dev == rt->fib6_nh.nh_dev &&
3526 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3527 ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
3528 break;
3530 if (rt && !fib6_info_hold_safe(rt))
3531 rt = NULL;
3532 rcu_read_unlock();
3533 return rt;
3536 struct fib6_info *rt6_add_dflt_router(struct net *net,
3537 const struct in6_addr *gwaddr,
3538 struct net_device *dev,
3539 unsigned int pref)
3541 struct fib6_config cfg = {
3542 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3543 .fc_metric = IP6_RT_PRIO_USER,
3544 .fc_ifindex = dev->ifindex,
3545 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3546 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3547 .fc_protocol = RTPROT_RA,
3548 .fc_type = RTN_UNICAST,
3549 .fc_nlinfo.portid = 0,
3550 .fc_nlinfo.nlh = NULL,
3551 .fc_nlinfo.nl_net = net,
3554 cfg.fc_gateway = *gwaddr;
3556 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3557 struct fib6_table *table;
3559 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3560 if (table)
3561 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3564 return rt6_get_dflt_router(net, gwaddr, dev);
3565 }
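/* Editorial note: default routers learned from Router Advertisements
 * carry RTF_ADDRCONF | RTF_DEFAULT; rt6_purge_dflt_routers() below
 * removes them again unless the interface has accept_ra == 2
 * (i.e. it accepts RAs even while forwarding is enabled).
 */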
3567 static void __rt6_purge_dflt_routers(struct net *net,
3568 struct fib6_table *table)
3570 struct fib6_info *rt;
3572 restart:
3573 rcu_read_lock();
3574 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3575 struct net_device *dev = fib6_info_nh_dev(rt);
3576 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3578 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3579 (!idev || idev->cnf.accept_ra != 2) &&
3580 fib6_info_hold_safe(rt)) {
3581 rcu_read_unlock();
3582 ip6_del_rt(net, rt);
3583 goto restart;
3586 rcu_read_unlock();
3588 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3591 void rt6_purge_dflt_routers(struct net *net)
3593 struct fib6_table *table;
3594 struct hlist_head *head;
3595 unsigned int h;
3597 rcu_read_lock();
3599 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3600 head = &net->ipv6.fib_table_hash[h];
3601 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3602 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3603 __rt6_purge_dflt_routers(net, table);
3607 rcu_read_unlock();
3610 static void rtmsg_to_fib6_config(struct net *net,
3611 struct in6_rtmsg *rtmsg,
3612 struct fib6_config *cfg)
3614 *cfg = (struct fib6_config){
3615 .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3616 : RT6_TABLE_MAIN,
3617 .fc_ifindex = rtmsg->rtmsg_ifindex,
3618 .fc_metric = rtmsg->rtmsg_metric,
3619 .fc_expires = rtmsg->rtmsg_info,
3620 .fc_dst_len = rtmsg->rtmsg_dst_len,
3621 .fc_src_len = rtmsg->rtmsg_src_len,
3622 .fc_flags = rtmsg->rtmsg_flags,
3623 .fc_type = rtmsg->rtmsg_type,
3625 .fc_nlinfo.nl_net = net,
3627 .fc_dst = rtmsg->rtmsg_dst,
3628 .fc_src = rtmsg->rtmsg_src,
3629 .fc_gateway = rtmsg->rtmsg_gateway,
3630 };
3631 }
3633 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3635 struct fib6_config cfg;
3636 struct in6_rtmsg rtmsg;
3637 int err;
3639 switch (cmd) {
3640 case SIOCADDRT: /* Add a route */
3641 case SIOCDELRT: /* Delete a route */
3642 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3643 return -EPERM;
3644 err = copy_from_user(&rtmsg, arg,
3645 sizeof(struct in6_rtmsg));
3646 if (err)
3647 return -EFAULT;
3649 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3651 rtnl_lock();
3652 switch (cmd) {
3653 case SIOCADDRT:
3654 err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3655 break;
3656 case SIOCDELRT:
3657 err = ip6_route_del(&cfg, NULL);
3658 break;
3659 default:
3660 err = -EINVAL;
3662 rtnl_unlock();
3664 return err;
3667 return -EINVAL;
3668 }
3670 /*
3671 * Drop the packet on the floor
3672 */
3674 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3676 struct dst_entry *dst = skb_dst(skb);
3677 struct net *net = dev_net(dst->dev);
3678 struct inet6_dev *idev;
3679 int type;
3681 if (netif_is_l3_master(skb->dev) &&
3682 dst->dev == net->loopback_dev)
3683 idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
3684 else
3685 idev = ip6_dst_idev(dst);
3687 switch (ipstats_mib_noroutes) {
3688 case IPSTATS_MIB_INNOROUTES:
3689 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3690 if (type == IPV6_ADDR_ANY) {
3691 IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
3692 break;
3694 /* FALLTHROUGH */
3695 case IPSTATS_MIB_OUTNOROUTES:
3696 IP6_INC_STATS(net, idev, ipstats_mib_noroutes);
3697 break;
3700 /* Start over by dropping the dst for l3mdev case */
3701 if (netif_is_l3_master(skb->dev))
3702 skb_dst_drop(skb);
3704 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3705 kfree_skb(skb);
3706 return 0;
3709 static int ip6_pkt_discard(struct sk_buff *skb)
3711 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3714 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3716 skb->dev = skb_dst(skb)->dev;
3717 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3720 static int ip6_pkt_prohibit(struct sk_buff *skb)
3722 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3725 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3727 skb->dev = skb_dst(skb)->dev;
3728 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3729 }
3731 /*
3732 * Allocate a dst for local (unicast / anycast) address.
3733 */
3735 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3736 struct inet6_dev *idev,
3737 const struct in6_addr *addr,
3738 bool anycast, gfp_t gfp_flags)
3740 u32 tb_id;
3741 struct net_device *dev = idev->dev;
3742 struct fib6_info *f6i;
3744 f6i = fib6_info_alloc(gfp_flags);
3745 if (!f6i)
3746 return ERR_PTR(-ENOMEM);
3748 f6i->fib6_metrics = ip_fib_metrics_init(net, NULL, 0, NULL);
3749 f6i->dst_nocount = true;
3750 f6i->dst_host = true;
3751 f6i->fib6_protocol = RTPROT_KERNEL;
3752 f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
3753 if (anycast) {
3754 f6i->fib6_type = RTN_ANYCAST;
3755 f6i->fib6_flags |= RTF_ANYCAST;
3756 } else {
3757 f6i->fib6_type = RTN_LOCAL;
3758 f6i->fib6_flags |= RTF_LOCAL;
3761 f6i->fib6_nh.nh_gw = *addr;
3762 dev_hold(dev);
3763 f6i->fib6_nh.nh_dev = dev;
3764 f6i->fib6_dst.addr = *addr;
3765 f6i->fib6_dst.plen = 128;
3766 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3767 f6i->fib6_table = fib6_get_table(net, tb_id);
3769 return f6i;
3772 /* remove deleted ip from prefsrc entries */
3773 struct arg_dev_net_ip {
3774 struct net_device *dev;
3775 struct net *net;
3776 struct in6_addr *addr;
3779 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3781 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3782 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3783 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3785 if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3786 rt != net->ipv6.fib6_null_entry &&
3787 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3788 spin_lock_bh(&rt6_exception_lock);
3789 /* remove prefsrc entry */
3790 rt->fib6_prefsrc.plen = 0;
3791 spin_unlock_bh(&rt6_exception_lock);
3793 return 0;
3796 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3798 struct net *net = dev_net(ifp->idev->dev);
3799 struct arg_dev_net_ip adni = {
3800 .dev = ifp->idev->dev,
3801 .net = net,
3802 .addr = &ifp->addr,
3804 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3807 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3809 /* Remove routers and update dst entries when a gateway turns into a host. */
3810 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3812 struct in6_addr *gateway = (struct in6_addr *)arg;
3814 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3815 ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
3816 return -1;
3819 /* Further clean up cached routes in the exception table.
3820 * This is needed because a cached route may have a different
3821 * gateway than its 'parent' in the case of an ip redirect.
3822 */
3823 rt6_exceptions_clean_tohost(rt, gateway);
3825 return 0;
3828 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3830 fib6_clean_all(net, fib6_clean_tohost, gateway);
3833 struct arg_netdev_event {
3834 const struct net_device *dev;
3835 union {
3836 unsigned int nh_flags;
3837 unsigned long event;
3841 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3843 struct fib6_info *iter;
3844 struct fib6_node *fn;
3846 fn = rcu_dereference_protected(rt->fib6_node,
3847 lockdep_is_held(&rt->fib6_table->tb6_lock));
3848 iter = rcu_dereference_protected(fn->leaf,
3849 lockdep_is_held(&rt->fib6_table->tb6_lock));
3850 while (iter) {
3851 if (iter->fib6_metric == rt->fib6_metric &&
3852 rt6_qualify_for_ecmp(iter))
3853 return iter;
3854 iter = rcu_dereference_protected(iter->fib6_next,
3855 lockdep_is_held(&rt->fib6_table->tb6_lock));
3858 return NULL;
3861 static bool rt6_is_dead(const struct fib6_info *rt)
3863 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3864 (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3865 fib6_ignore_linkdown(rt)))
3866 return true;
3868 return false;
3871 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3873 struct fib6_info *iter;
3874 int total = 0;
3876 if (!rt6_is_dead(rt))
3877 total += rt->fib6_nh.nh_weight;
3879 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3880 if (!rt6_is_dead(iter))
3881 total += iter->fib6_nh.nh_weight;
3884 return total;
3887 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3889 int upper_bound = -1;
3891 if (!rt6_is_dead(rt)) {
3892 *weight += rt->fib6_nh.nh_weight;
3893 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3894 total) - 1;
3896 atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
3897 }
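/* Worked example (editorial): two live nexthops with weights 1 and 3
 * give total = 4. The running *weight after each is 1, then 4, so the
 * bounds become
 *	(1 << 31) / 4 - 1 = 0x1fffffff   and
 *	(4 << 31) / 4 - 1 = 0x7fffffff.
 * A 31-bit multipath hash h picks the first nexthop whose bound is
 * >= h, i.e. a 25% / 75% traffic split.
 */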
3899 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3901 struct fib6_info *iter;
3902 int weight = 0;
3904 rt6_upper_bound_set(rt, &weight, total);
3906 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3907 rt6_upper_bound_set(iter, &weight, total);
3910 void rt6_multipath_rebalance(struct fib6_info *rt)
3912 struct fib6_info *first;
3913 int total;
3915 /* If the entire multipath route was marked for flushing,
3916 * there is no need to rebalance upon the removal of every
3917 * sibling route.
3918 */
3919 if (!rt->fib6_nsiblings || rt->should_flush)
3920 return;
3922 /* During lookup routes are evaluated in order, so we need to
3923 * make sure upper bounds are assigned from the first sibling
3924 * onwards.
3925 */
3926 first = rt6_multipath_first_sibling(rt);
3927 if (WARN_ON_ONCE(!first))
3928 return;
3930 total = rt6_multipath_total_weight(first);
3931 rt6_multipath_upper_bound_set(first, total);
3934 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3936 const struct arg_netdev_event *arg = p_arg;
3937 struct net *net = dev_net(arg->dev);
3939 if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
3940 rt->fib6_nh.nh_flags &= ~arg->nh_flags;
3941 fib6_update_sernum_upto_root(net, rt);
3942 rt6_multipath_rebalance(rt);
3945 return 0;
3948 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3950 struct arg_netdev_event arg = {
3951 .dev = dev,
3953 .nh_flags = nh_flags,
3957 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3958 arg.nh_flags |= RTNH_F_LINKDOWN;
3960 fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3963 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3964 const struct net_device *dev)
3966 struct fib6_info *iter;
3968 if (rt->fib6_nh.nh_dev == dev)
3969 return true;
3970 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3971 if (iter->fib6_nh.nh_dev == dev)
3972 return true;
3974 return false;
3977 static void rt6_multipath_flush(struct fib6_info *rt)
3979 struct fib6_info *iter;
3981 rt->should_flush = 1;
3982 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3983 iter->should_flush = 1;
3986 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
3987 const struct net_device *down_dev)
3989 struct fib6_info *iter;
3990 unsigned int dead = 0;
3992 if (rt->fib6_nh.nh_dev == down_dev ||
3993 rt->fib6_nh.nh_flags & RTNH_F_DEAD)
3994 dead++;
3995 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3996 if (iter->fib6_nh.nh_dev == down_dev ||
3997 iter->fib6_nh.nh_flags & RTNH_F_DEAD)
3998 dead++;
4000 return dead;
4003 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
4004 const struct net_device *dev,
4005 unsigned int nh_flags)
4007 struct fib6_info *iter;
4009 if (rt->fib6_nh.nh_dev == dev)
4010 rt->fib6_nh.nh_flags |= nh_flags;
4011 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4012 if (iter->fib6_nh.nh_dev == dev)
4013 iter->fib6_nh.nh_flags |= nh_flags;
4016 /* called with write lock held for table with rt */
4017 static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
4019 const struct arg_netdev_event *arg = p_arg;
4020 const struct net_device *dev = arg->dev;
4021 struct net *net = dev_net(dev);
4023 if (rt == net->ipv6.fib6_null_entry)
4024 return 0;
4026 switch (arg->event) {
4027 case NETDEV_UNREGISTER:
4028 return rt->fib6_nh.nh_dev == dev ? -1 : 0;
4029 case NETDEV_DOWN:
4030 if (rt->should_flush)
4031 return -1;
4032 if (!rt->fib6_nsiblings)
4033 return rt->fib6_nh.nh_dev == dev ? -1 : 0;
4034 if (rt6_multipath_uses_dev(rt, dev)) {
4035 unsigned int count;
4037 count = rt6_multipath_dead_count(rt, dev);
4038 if (rt->fib6_nsiblings + 1 == count) {
4039 rt6_multipath_flush(rt);
4040 return -1;
4042 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4043 RTNH_F_LINKDOWN);
4044 fib6_update_sernum(net, rt);
4045 rt6_multipath_rebalance(rt);
4047 return -2;
4048 case NETDEV_CHANGE:
4049 if (rt->fib6_nh.nh_dev != dev ||
4050 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4051 break;
4052 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
4053 rt6_multipath_rebalance(rt);
4054 break;
4057 return 0;
4060 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4062 struct arg_netdev_event arg = {
4063 .dev = dev,
4065 .event = event,
4068 struct net *net = dev_net(dev);
4070 if (net->ipv6.sysctl.skip_notify_on_dev_down)
4071 fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
4072 else
4073 fib6_clean_all(net, fib6_ifdown, &arg);
4076 void rt6_disable_ip(struct net_device *dev, unsigned long event)
4078 rt6_sync_down_dev(dev, event);
4079 rt6_uncached_list_flush_dev(dev_net(dev), dev);
4080 neigh_ifdown(&nd_tbl, dev);
4083 struct rt6_mtu_change_arg {
4084 struct net_device *dev;
4085 unsigned int mtu;
4088 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4090 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4091 struct inet6_dev *idev;
4093 /* In IPv6 pmtu discovery is not optional,
4094 so the RTAX_MTU lock cannot disable it.
4095 We still use this lock to block changes
4096 caused by addrconf/ndisc.
4097 */
4099 idev = __in6_dev_get(arg->dev);
4100 if (!idev)
4101 return 0;
4103 /* For an administrative MTU increase, there is no way to discover
4104 an IPv6 PMTU increase, so the PMTU increase should be updated here.
4105 Since RFC 1981 doesn't include administrative MTU increase,
4106 updating the PMTU on increase is a MUST. (i.e. jumbo frame)
4107 */
4108 if (rt->fib6_nh.nh_dev == arg->dev &&
4109 !fib6_metric_locked(rt, RTAX_MTU)) {
4110 u32 mtu = rt->fib6_pmtu;
4112 if (mtu >= arg->mtu ||
4113 (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4114 fib6_metric_set(rt, RTAX_MTU, arg->mtu);
4116 spin_lock_bh(&rt6_exception_lock);
4117 rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4118 spin_unlock_bh(&rt6_exception_lock);
4120 return 0;
4123 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4125 struct rt6_mtu_change_arg arg = {
4126 .dev = dev,
4127 .mtu = mtu,
4130 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4133 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4134 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
4135 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) },
4136 [RTA_OIF] = { .type = NLA_U32 },
4137 [RTA_IIF] = { .type = NLA_U32 },
4138 [RTA_PRIORITY] = { .type = NLA_U32 },
4139 [RTA_METRICS] = { .type = NLA_NESTED },
4140 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
4141 [RTA_PREF] = { .type = NLA_U8 },
4142 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
4143 [RTA_ENCAP] = { .type = NLA_NESTED },
4144 [RTA_EXPIRES] = { .type = NLA_U32 },
4145 [RTA_UID] = { .type = NLA_U32 },
4146 [RTA_MARK] = { .type = NLA_U32 },
4147 [RTA_TABLE] = { .type = NLA_U32 },
4148 [RTA_IP_PROTO] = { .type = NLA_U8 },
4149 [RTA_SPORT] = { .type = NLA_U16 },
4150 [RTA_DPORT] = { .type = NLA_U16 },
4151 };
4153 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4154 struct fib6_config *cfg,
4155 struct netlink_ext_ack *extack)
4157 struct rtmsg *rtm;
4158 struct nlattr *tb[RTA_MAX+1];
4159 unsigned int pref;
4160 int err;
4162 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4163 extack);
4164 if (err < 0)
4165 goto errout;
4167 err = -EINVAL;
4168 rtm = nlmsg_data(nlh);
4170 *cfg = (struct fib6_config){
4171 .fc_table = rtm->rtm_table,
4172 .fc_dst_len = rtm->rtm_dst_len,
4173 .fc_src_len = rtm->rtm_src_len,
4174 .fc_flags = RTF_UP,
4175 .fc_protocol = rtm->rtm_protocol,
4176 .fc_type = rtm->rtm_type,
4178 .fc_nlinfo.portid = NETLINK_CB(skb).portid,
4179 .fc_nlinfo.nlh = nlh,
4180 .fc_nlinfo.nl_net = sock_net(skb->sk),
4183 if (rtm->rtm_type == RTN_UNREACHABLE ||
4184 rtm->rtm_type == RTN_BLACKHOLE ||
4185 rtm->rtm_type == RTN_PROHIBIT ||
4186 rtm->rtm_type == RTN_THROW)
4187 cfg->fc_flags |= RTF_REJECT;
4189 if (rtm->rtm_type == RTN_LOCAL)
4190 cfg->fc_flags |= RTF_LOCAL;
4192 if (rtm->rtm_flags & RTM_F_CLONED)
4193 cfg->fc_flags |= RTF_CACHE;
4195 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4197 if (tb[RTA_GATEWAY]) {
4198 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4199 cfg->fc_flags |= RTF_GATEWAY;
4201 if (tb[RTA_VIA]) {
4202 NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
4203 goto errout;
4206 if (tb[RTA_DST]) {
4207 int plen = (rtm->rtm_dst_len + 7) >> 3;
4209 if (nla_len(tb[RTA_DST]) < plen)
4210 goto errout;
4212 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4215 if (tb[RTA_SRC]) {
4216 int plen = (rtm->rtm_src_len + 7) >> 3;
4218 if (nla_len(tb[RTA_SRC]) < plen)
4219 goto errout;
4221 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4224 if (tb[RTA_PREFSRC])
4225 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4227 if (tb[RTA_OIF])
4228 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4230 if (tb[RTA_PRIORITY])
4231 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4233 if (tb[RTA_METRICS]) {
4234 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4235 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4238 if (tb[RTA_TABLE])
4239 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4241 if (tb[RTA_MULTIPATH]) {
4242 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4243 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4245 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4246 cfg->fc_mp_len, extack);
4247 if (err < 0)
4248 goto errout;
4251 if (tb[RTA_PREF]) {
4252 pref = nla_get_u8(tb[RTA_PREF]);
4253 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4254 pref != ICMPV6_ROUTER_PREF_HIGH)
4255 pref = ICMPV6_ROUTER_PREF_MEDIUM;
4256 cfg->fc_flags |= RTF_PREF(pref);
4259 if (tb[RTA_ENCAP])
4260 cfg->fc_encap = tb[RTA_ENCAP];
4262 if (tb[RTA_ENCAP_TYPE]) {
4263 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4265 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4266 if (err < 0)
4267 goto errout;
4270 if (tb[RTA_EXPIRES]) {
4271 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4273 if (addrconf_finite_timeout(timeout)) {
4274 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4275 cfg->fc_flags |= RTF_EXPIRES;
4279 err = 0;
4280 errout:
4281 return err;
4284 struct rt6_nh {
4285 struct fib6_info *fib6_info;
4286 struct fib6_config r_cfg;
4287 struct list_head next;
4290 static int ip6_route_info_append(struct net *net,
4291 struct list_head *rt6_nh_list,
4292 struct fib6_info *rt,
4293 struct fib6_config *r_cfg)
4295 struct rt6_nh *nh;
4296 int err = -EEXIST;
4298 list_for_each_entry(nh, rt6_nh_list, next) {
4299 /* check if fib6_info already exists */
4300 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4301 return err;
4304 nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4305 if (!nh)
4306 return -ENOMEM;
4307 nh->fib6_info = rt;
4308 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4309 list_add_tail(&nh->next, rt6_nh_list);
4311 return 0;
4314 static void ip6_route_mpath_notify(struct fib6_info *rt,
4315 struct fib6_info *rt_last,
4316 struct nl_info *info,
4317 __u16 nlflags)
4319 /* if this is an APPEND route, then rt points to the first route
4320 * inserted and rt_last points to last route inserted. Userspace
4321 * wants a consistent dump of the route which starts at the first
4322 * nexthop. Since sibling routes are always added at the end of
4323 * the list, find the first sibling of the last route appended
4324 */
4325 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4326 rt = list_first_entry(&rt_last->fib6_siblings,
4327 struct fib6_info,
4328 fib6_siblings);
4331 if (rt)
4332 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4335 static int ip6_route_multipath_add(struct fib6_config *cfg,
4336 struct netlink_ext_ack *extack)
4338 struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4339 struct nl_info *info = &cfg->fc_nlinfo;
4340 struct fib6_config r_cfg;
4341 struct rtnexthop *rtnh;
4342 struct fib6_info *rt;
4343 struct rt6_nh *err_nh;
4344 struct rt6_nh *nh, *nh_safe;
4345 __u16 nlflags;
4346 int remaining;
4347 int attrlen;
4348 int err = 1;
4349 int nhn = 0;
4350 int replace = (cfg->fc_nlinfo.nlh &&
4351 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4352 LIST_HEAD(rt6_nh_list);
4354 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4355 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4356 nlflags |= NLM_F_APPEND;
4358 remaining = cfg->fc_mp_len;
4359 rtnh = (struct rtnexthop *)cfg->fc_mp;
4361 /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4362 * fib6_info structs per nexthop
4363 */
4364 while (rtnh_ok(rtnh, remaining)) {
4365 memcpy(&r_cfg, cfg, sizeof(*cfg));
4366 if (rtnh->rtnh_ifindex)
4367 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4369 attrlen = rtnh_attrlen(rtnh);
4370 if (attrlen > 0) {
4371 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4373 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4374 if (nla) {
4375 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4376 r_cfg.fc_flags |= RTF_GATEWAY;
4377 }
4378 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4379 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4380 if (nla)
4381 r_cfg.fc_encap_type = nla_get_u16(nla);
4382 }
4384 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4385 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4386 if (IS_ERR(rt)) {
4387 err = PTR_ERR(rt);
4388 rt = NULL;
4389 goto cleanup;
4390 }
4391 if (!rt6_qualify_for_ecmp(rt)) {
4392 err = -EINVAL;
4393 NL_SET_ERR_MSG(extack,
4394 "Device-only routes cannot be added for IPv6 using the multipath API.");
4395 fib6_info_release(rt);
4396 goto cleanup;
4397 }
4399 rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
4401 err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4402 rt, &r_cfg);
4403 if (err) {
4404 fib6_info_release(rt);
4405 goto cleanup;
4406 }
4408 rtnh = rtnh_next(rtnh, &remaining);
4409 }
4411 /* for add and replace send one notification with all nexthops.
4412 * Skip the notification in fib6_add_rt2node and send one with
4413 * the full route when done
4414 */
4415 info->skip_notify = 1;
4417 err_nh = NULL;
4418 list_for_each_entry(nh, &rt6_nh_list, next) {
4419 err = __ip6_ins_rt(nh->fib6_info, info, extack);
4420 fib6_info_release(nh->fib6_info);
4422 if (!err) {
4423 /* save reference to last route successfully inserted */
4424 rt_last = nh->fib6_info;
4426 /* save reference to first route for notification */
4427 if (!rt_notif)
4428 rt_notif = nh->fib6_info;
4429 }
4431 /* nh->fib6_info is used or freed at this point, reset to NULL */
4432 nh->fib6_info = NULL;
4433 if (err) {
4434 if (replace && nhn)
4435 NL_SET_ERR_MSG_MOD(extack,
4436 "multipath route replace failed (check consistency of installed routes)");
4437 err_nh = nh;
4438 goto add_errout;
4439 }
4441 /* Because each route is added as if it were a single route, we
4442 * remove these flags after the first nexthop: if there is a
4443 * collision, we have already failed to add the first nexthop
4444 * because fib6_add_rt2node() has rejected it; when replacing, the
4445 * old nexthops have been replaced by the first new one, and the
4446 * rest should be appended to it.
4447 */
4448 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4449 NLM_F_REPLACE);
4450 nhn++;
4451 }
4453 /* success ... tell user about new route */
4454 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4455 goto cleanup;
4457 add_errout:
4458 /* send notification for routes that were added so that
4459 * the delete notifications sent by ip6_route_del are
4460 * coherent
4461 */
4462 if (rt_notif)
4463 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4465 /* Delete routes that were already added */
4466 list_for_each_entry(nh, &rt6_nh_list, next) {
4467 if (err_nh == nh)
4468 break;
4469 ip6_route_del(&nh->r_cfg, extack);
4470 }
4472 cleanup:
4473 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4474 if (nh->fib6_info)
4475 fib6_info_release(nh->fib6_info);
4476 list_del(&nh->next);
4477 kfree(nh);
4478 }
4480 return err;
4481 }
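/* Delete each nexthop listed in RTA_MULTIPATH individually. A failed
 * deletion does not abort the walk; the last error seen is returned
 * so the caller knows at least one entry could not be removed.
 */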
4483 static int ip6_route_multipath_del(struct fib6_config *cfg,
4484 struct netlink_ext_ack *extack)
4485 {
4486 struct fib6_config r_cfg;
4487 struct rtnexthop *rtnh;
4488 int remaining;
4489 int attrlen;
4490 int err = 1, last_err = 0;
4492 remaining = cfg->fc_mp_len;
4493 rtnh = (struct rtnexthop *)cfg->fc_mp;
4495 /* Parse a Multipath Entry */
4496 while (rtnh_ok(rtnh, remaining)) {
4497 memcpy(&r_cfg, cfg, sizeof(*cfg));
4498 if (rtnh->rtnh_ifindex)
4499 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4501 attrlen = rtnh_attrlen(rtnh);
4502 if (attrlen > 0) {
4503 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4505 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4506 if (nla) {
4507 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4508 r_cfg.fc_flags |= RTF_GATEWAY;
4509 }
4510 }
4511 err = ip6_route_del(&r_cfg, extack);
4512 if (err)
4513 last_err = err;
4515 rtnh = rtnh_next(rtnh, &remaining);
4516 }
4518 return last_err;
4519 }
4521 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4522 struct netlink_ext_ack *extack)
4523 {
4524 struct fib6_config cfg;
4525 int err;
4527 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4528 if (err < 0)
4529 return err;
4531 if (cfg.fc_mp)
4532 return ip6_route_multipath_del(&cfg, extack);
4533 else {
4534 cfg.fc_delete_all_nh = 1;
4535 return ip6_route_del(&cfg, extack);
4536 }
4537 }
4539 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4540 struct netlink_ext_ack *extack)
4541 {
4542 struct fib6_config cfg;
4543 int err;
4545 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4546 if (err < 0)
4547 return err;
4549 if (cfg.fc_mp)
4550 return ip6_route_multipath_add(&cfg, extack);
4551 else
4552 return ip6_route_add(&cfg, GFP_KERNEL, extack);
4553 }
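/* Worst-case netlink message size for this route, used to size the
 * notification skb; it must stay in sync with rt6_fill_node(), which
 * otherwise fails with -EMSGSIZE.
 */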
4555 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4556 {
4557 int nexthop_len = 0;
4559 if (rt->fib6_nsiblings) {
4560 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
4561 + NLA_ALIGN(sizeof(struct rtnexthop))
4562 + nla_total_size(16) /* RTA_GATEWAY */
4563 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
4565 nexthop_len *= rt->fib6_nsiblings;
4566 }
4568 return NLMSG_ALIGN(sizeof(struct rtmsg))
4569 + nla_total_size(16) /* RTA_SRC */
4570 + nla_total_size(16) /* RTA_DST */
4571 + nla_total_size(16) /* RTA_GATEWAY */
4572 + nla_total_size(16) /* RTA_PREFSRC */
4573 + nla_total_size(4) /* RTA_TABLE */
4574 + nla_total_size(4) /* RTA_IIF */
4575 + nla_total_size(4) /* RTA_OIF */
4576 + nla_total_size(4) /* RTA_PRIORITY */
4577 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4578 + nla_total_size(sizeof(struct rta_cacheinfo))
4579 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4580 + nla_total_size(1) /* RTA_PREF */
4581 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
4582 + nexthop_len;
4583 }
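/* Emit the attributes describing a single nexthop: dead, linkdown,
 * onlink and offload flags, gateway, output interface and lightweight
 * tunnel encap. skip_oif is true for multipath encoding, where the
 * ifindex is carried in the rtnexthop struct rather than RTA_OIF.
 */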
4585 static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
4586 unsigned int *flags, bool skip_oif)
4587 {
4588 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4589 *flags |= RTNH_F_DEAD;
4591 if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
4592 *flags |= RTNH_F_LINKDOWN;
4594 rcu_read_lock();
4595 if (fib6_ignore_linkdown(rt))
4596 *flags |= RTNH_F_DEAD;
4597 rcu_read_unlock();
4598 }
4600 if (rt->fib6_flags & RTF_GATEWAY) {
4601 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
4602 goto nla_put_failure;
4603 }
4605 *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
4606 if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
4607 *flags |= RTNH_F_OFFLOAD;
4609 /* not needed for multipath encoding because it has a rtnexthop struct */
4610 if (!skip_oif && rt->fib6_nh.nh_dev &&
4611 nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
4612 goto nla_put_failure;
4614 if (rt->fib6_nh.nh_lwtstate &&
4615 lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
4616 goto nla_put_failure;
4618 return 0;
4620 nla_put_failure:
4621 return -EMSGSIZE;
4622 }
4624 /* add multipath next hop */
4625 static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
4626 {
4627 const struct net_device *dev = rt->fib6_nh.nh_dev;
4628 struct rtnexthop *rtnh;
4629 unsigned int flags = 0;
4631 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4632 if (!rtnh)
4633 goto nla_put_failure;
4635 rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
4636 rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
4638 if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4639 goto nla_put_failure;
4641 rtnh->rtnh_flags = flags;
4643 /* length of rtnetlink header + attributes */
4644 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4646 return 0;
4648 nla_put_failure:
4649 return -EMSGSIZE;
4650 }
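/* Encode one route as a netlink message. rt is the FIB entry; dst,
 * dest and src are only set when reporting the result of a route
 * lookup, in which case the dst cache entry supplies the keys and
 * flags instead of the FIB entry.
 */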
4652 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4653 struct fib6_info *rt, struct dst_entry *dst,
4654 struct in6_addr *dest, struct in6_addr *src,
4655 int iif, int type, u32 portid, u32 seq,
4656 unsigned int flags)
4657 {
4658 struct rt6_info *rt6 = (struct rt6_info *)dst;
4659 struct rt6key *rt6_dst, *rt6_src;
4660 u32 *pmetrics, table, rt6_flags;
4661 struct nlmsghdr *nlh;
4662 struct rtmsg *rtm;
4663 long expires = 0;
4665 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4666 if (!nlh)
4667 return -EMSGSIZE;
4669 if (rt6) {
4670 rt6_dst = &rt6->rt6i_dst;
4671 rt6_src = &rt6->rt6i_src;
4672 rt6_flags = rt6->rt6i_flags;
4673 } else {
4674 rt6_dst = &rt->fib6_dst;
4675 rt6_src = &rt->fib6_src;
4676 rt6_flags = rt->fib6_flags;
4677 }
4679 rtm = nlmsg_data(nlh);
4680 rtm->rtm_family = AF_INET6;
4681 rtm->rtm_dst_len = rt6_dst->plen;
4682 rtm->rtm_src_len = rt6_src->plen;
4683 rtm->rtm_tos = 0;
4684 if (rt->fib6_table)
4685 table = rt->fib6_table->tb6_id;
4686 else
4687 table = RT6_TABLE_UNSPEC;
4688 rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
4689 if (nla_put_u32(skb, RTA_TABLE, table))
4690 goto nla_put_failure;
4692 rtm->rtm_type = rt->fib6_type;
4693 rtm->rtm_flags = 0;
4694 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4695 rtm->rtm_protocol = rt->fib6_protocol;
4697 if (rt6_flags & RTF_CACHE)
4698 rtm->rtm_flags |= RTM_F_CLONED;
4700 if (dest) {
4701 if (nla_put_in6_addr(skb, RTA_DST, dest))
4702 goto nla_put_failure;
4703 rtm->rtm_dst_len = 128;
4704 } else if (rtm->rtm_dst_len)
4705 if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
4706 goto nla_put_failure;
4707 #ifdef CONFIG_IPV6_SUBTREES
4708 if (src) {
4709 if (nla_put_in6_addr(skb, RTA_SRC, src))
4710 goto nla_put_failure;
4711 rtm->rtm_src_len = 128;
4712 } else if (rtm->rtm_src_len &&
4713 nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
4714 goto nla_put_failure;
4715 #endif
4716 if (iif) {
4717 #ifdef CONFIG_IPV6_MROUTE
4718 if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
4719 int err = ip6mr_get_route(net, skb, rtm, portid);
4721 if (err == 0)
4722 return 0;
4723 if (err < 0)
4724 goto nla_put_failure;
4725 } else
4726 #endif
4727 if (nla_put_u32(skb, RTA_IIF, iif))
4728 goto nla_put_failure;
4729 } else if (dest) {
4730 struct in6_addr saddr_buf;
4731 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4732 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4733 goto nla_put_failure;
4734 }
4736 if (rt->fib6_prefsrc.plen) {
4737 struct in6_addr saddr_buf;
4738 saddr_buf = rt->fib6_prefsrc.addr;
4739 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4740 goto nla_put_failure;
4741 }
4743 pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
4744 if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4745 goto nla_put_failure;
4747 if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
4748 goto nla_put_failure;
4750 /* For multipath routes, walk the siblings list and add
4751 * each as a nexthop within RTA_MULTIPATH.
4752 */
4753 if (rt6) {
4754 if (rt6_flags & RTF_GATEWAY &&
4755 nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
4756 goto nla_put_failure;
4758 if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
4759 goto nla_put_failure;
4760 } else if (rt->fib6_nsiblings) {
4761 struct fib6_info *sibling, *next_sibling;
4762 struct nlattr *mp;
4764 mp = nla_nest_start(skb, RTA_MULTIPATH);
4765 if (!mp)
4766 goto nla_put_failure;
4768 if (rt6_add_nexthop(skb, rt) < 0)
4769 goto nla_put_failure;
4771 list_for_each_entry_safe(sibling, next_sibling,
4772 &rt->fib6_siblings, fib6_siblings) {
4773 if (rt6_add_nexthop(skb, sibling) < 0)
4774 goto nla_put_failure;
4775 }
4777 nla_nest_end(skb, mp);
4778 } else {
4779 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4780 goto nla_put_failure;
4781 }
4783 if (rt6_flags & RTF_EXPIRES) {
4784 expires = dst ? dst->expires : rt->expires;
4785 expires -= jiffies;
4786 }
4788 if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4789 goto nla_put_failure;
4791 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
4792 goto nla_put_failure;
4795 nlmsg_end(skb, nlh);
4796 return 0;
4798 nla_put_failure:
4799 nlmsg_cancel(skb, nlh);
4800 return -EMSGSIZE;
4801 }
4803 static bool fib6_info_uses_dev(const struct fib6_info *f6i,
4804 const struct net_device *dev)
4805 {
4806 if (f6i->fib6_nh.nh_dev == dev)
4807 return true;
4809 if (f6i->fib6_nsiblings) {
4810 struct fib6_info *sibling, *next_sibling;
4812 list_for_each_entry_safe(sibling, next_sibling,
4813 &f6i->fib6_siblings, fib6_siblings) {
4814 if (sibling->fib6_nh.nh_dev == dev)
4815 return true;
4816 }
4817 }
4819 return false;
4820 }
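/* Dump callback invoked per fib6_info during RTM_GETROUTE dumps.
 * Returns a positive value for entries rejected by the dump filter
 * and a negative value when the skb is full, which suspends the tree
 * walk.
 */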
4822 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4823 {
4824 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4825 struct fib_dump_filter *filter = &arg->filter;
4826 unsigned int flags = NLM_F_MULTI;
4827 struct net *net = arg->net;
4829 if (rt == net->ipv6.fib6_null_entry)
4830 return 0;
4832 if ((filter->flags & RTM_F_PREFIX) &&
4833 !(rt->fib6_flags & RTF_PREFIX_RT)) {
4834 /* success since this is not a prefix route */
4835 return 1;
4836 }
4837 if (filter->filter_set) {
4838 if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
4839 (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
4840 (filter->protocol && rt->fib6_protocol != filter->protocol)) {
4841 return 1;
4842 }
4843 flags |= NLM_F_DUMP_FILTERED;
4844 }
4846 return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4847 RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4848 arg->cb->nlh->nlmsg_seq, flags);
4849 }
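/* Validate an RTM_GETROUTE request. With strict checking enabled,
 * header fields and attributes that the handler does not implement
 * are rejected with an extack message rather than silently ignored.
 */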
4851 static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
4852 const struct nlmsghdr *nlh,
4853 struct nlattr **tb,
4854 struct netlink_ext_ack *extack)
4855 {
4856 struct rtmsg *rtm;
4857 int i, err;
4859 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
4860 NL_SET_ERR_MSG_MOD(extack,
4861 "Invalid header for get route request");
4862 return -EINVAL;
4863 }
4865 if (!netlink_strict_get_check(skb))
4866 return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX,
4867 rtm_ipv6_policy, extack);
4869 rtm = nlmsg_data(nlh);
4870 if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
4871 (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
4872 rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
4873 rtm->rtm_type) {
4874 NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
4875 return -EINVAL;
4876 }
4877 if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
4878 NL_SET_ERR_MSG_MOD(extack,
4879 "Invalid flags for get route request");
4880 return -EINVAL;
4881 }
4883 err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
4884 rtm_ipv6_policy, extack);
4885 if (err)
4886 return err;
4888 if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
4889 (tb[RTA_DST] && !rtm->rtm_dst_len)) {
4890 NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
4891 return -EINVAL;
4892 }
4894 for (i = 0; i <= RTA_MAX; i++) {
4895 if (!tb[i])
4896 continue;
4898 switch (i) {
4899 case RTA_SRC:
4900 case RTA_DST:
4901 case RTA_IIF:
4902 case RTA_OIF:
4903 case RTA_MARK:
4904 case RTA_UID:
4905 case RTA_SPORT:
4906 case RTA_DPORT:
4907 case RTA_IP_PROTO:
4908 break;
4909 default:
4910 NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
4911 return -EINVAL;
4912 }
4913 }
4915 return 0;
4916 }
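/* RTM_GETROUTE handler: build a flowi6 from the request attributes,
 * resolve it through the input path (when RTA_IIF is given) or the
 * output path, and unicast the encoded result back to the sender.
 * With RTM_F_FIB_MATCH the matching FIB entry is reported instead of
 * the resolved dst.
 */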
4918 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4919 struct netlink_ext_ack *extack)
4920 {
4921 struct net *net = sock_net(in_skb->sk);
4922 struct nlattr *tb[RTA_MAX+1];
4923 int err, iif = 0, oif = 0;
4924 struct fib6_info *from;
4925 struct dst_entry *dst;
4926 struct rt6_info *rt;
4927 struct sk_buff *skb;
4928 struct rtmsg *rtm;
4929 struct flowi6 fl6 = {};
4930 bool fibmatch;
4932 err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
4933 if (err < 0)
4934 goto errout;
4936 err = -EINVAL;
4937 rtm = nlmsg_data(nlh);
4938 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4939 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4941 if (tb[RTA_SRC]) {
4942 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4943 goto errout;
4945 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4946 }
4948 if (tb[RTA_DST]) {
4949 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4950 goto errout;
4952 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4953 }
4955 if (tb[RTA_IIF])
4956 iif = nla_get_u32(tb[RTA_IIF]);
4958 if (tb[RTA_OIF])
4959 oif = nla_get_u32(tb[RTA_OIF]);
4961 if (tb[RTA_MARK])
4962 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4964 if (tb[RTA_UID])
4965 fl6.flowi6_uid = make_kuid(current_user_ns(),
4966 nla_get_u32(tb[RTA_UID]));
4967 else
4968 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4970 if (tb[RTA_SPORT])
4971 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
4973 if (tb[RTA_DPORT])
4974 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
4976 if (tb[RTA_IP_PROTO]) {
4977 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
4978 &fl6.flowi6_proto, AF_INET6,
4979 extack);
4980 if (err)
4981 goto errout;
4982 }
4984 if (iif) {
4985 struct net_device *dev;
4986 int flags = 0;
4988 rcu_read_lock();
4990 dev = dev_get_by_index_rcu(net, iif);
4991 if (!dev) {
4992 rcu_read_unlock();
4993 err = -ENODEV;
4994 goto errout;
4995 }
4997 fl6.flowi6_iif = iif;
4999 if (!ipv6_addr_any(&fl6.saddr))
5000 flags |= RT6_LOOKUP_F_HAS_SADDR;
5002 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
5004 rcu_read_unlock();
5005 } else {
5006 fl6.flowi6_oif = oif;
5008 dst = ip6_route_output(net, NULL, &fl6);
5009 }
5012 rt = container_of(dst, struct rt6_info, dst);
5013 if (rt->dst.error) {
5014 err = rt->dst.error;
5015 ip6_rt_put(rt);
5016 goto errout;
5017 }
5019 if (rt == net->ipv6.ip6_null_entry) {
5020 err = rt->dst.error;
5021 ip6_rt_put(rt);
5022 goto errout;
5023 }
5025 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
5026 if (!skb) {
5027 ip6_rt_put(rt);
5028 err = -ENOBUFS;
5029 goto errout;
5030 }
5032 skb_dst_set(skb, &rt->dst);
5034 rcu_read_lock();
5035 from = rcu_dereference(rt->from);
5036 if (from) {
5037 if (fibmatch)
5038 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL,
5039 iif, RTM_NEWROUTE,
5040 NETLINK_CB(in_skb).portid,
5041 nlh->nlmsg_seq, 0);
5042 else
5043 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
5044 &fl6.saddr, iif, RTM_NEWROUTE,
5045 NETLINK_CB(in_skb).portid,
5046 nlh->nlmsg_seq, 0);
5047 } else {
5048 err = -ENETUNREACH;
5049 }
5050 rcu_read_unlock();
5052 if (err < 0) {
5053 kfree_skb(skb);
5054 goto errout;
5055 }
5057 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
5058 errout:
5059 return err;
5060 }
5062 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
5063 unsigned int nlm_flags)
5064 {
5065 struct sk_buff *skb;
5066 struct net *net = info->nl_net;
5067 u32 seq;
5068 int err;
5070 err = -ENOBUFS;
5071 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
5073 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
5074 if (!skb)
5075 goto errout;
5077 err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
5078 event, info->portid, seq, nlm_flags);
5079 if (err < 0) {
5080 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
5081 WARN_ON(err == -EMSGSIZE);
5082 kfree_skb(skb);
5083 goto errout;
5084 }
5085 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
5086 info->nlh, gfp_any());
5087 return;
5088 errout:
5089 if (err < 0)
5090 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
5091 }
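/* The special null (and, with multiple tables, prohibit and blackhole)
 * entries are not bound to a real device, so point them at the
 * loopback device when it registers and drop the references again
 * when it unregisters.
 */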
5093 static int ip6_route_dev_notify(struct notifier_block *this,
5094 unsigned long event, void *ptr)
5095 {
5096 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
5097 struct net *net = dev_net(dev);
5099 if (!(dev->flags & IFF_LOOPBACK))
5100 return NOTIFY_OK;
5102 if (event == NETDEV_REGISTER) {
5103 net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
5104 net->ipv6.ip6_null_entry->dst.dev = dev;
5105 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
5106 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5107 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
5108 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
5109 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
5110 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
5111 #endif
5112 } else if (event == NETDEV_UNREGISTER &&
5113 dev->reg_state != NETREG_UNREGISTERED) {
5114 /* NETDEV_UNREGISTER can be fired multiple times by
5115 * netdev_wait_allrefs(). Make sure we only call this once.
5116 */
5117 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
5118 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5119 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
5120 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
5121 #endif
5122 }
5124 return NOTIFY_OK;
5125 }
5127 /*
5128 * /proc
5129 */
5131 #ifdef CONFIG_PROC_FS
5132 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
5133 {
5134 struct net *net = (struct net *)seq->private;
5135 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
5136 net->ipv6.rt6_stats->fib_nodes,
5137 net->ipv6.rt6_stats->fib_route_nodes,
5138 atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
5139 net->ipv6.rt6_stats->fib_rt_entries,
5140 net->ipv6.rt6_stats->fib_rt_cache,
5141 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
5142 net->ipv6.rt6_stats->fib_discarded_routes);
5144 return 0;
5145 }
5146 #endif /* CONFIG_PROC_FS */
5148 #ifdef CONFIG_SYSCTL
5150 static
5151 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
5152 void __user *buffer, size_t *lenp, loff_t *ppos)
5153 {
5154 struct net *net;
5155 int delay;
5156 int ret;
5157 if (!write)
5158 return -EINVAL;
5160 net = (struct net *)ctl->extra1;
5161 delay = net->ipv6.sysctl.flush_delay;
5162 ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
5163 if (ret)
5164 return ret;
5166 fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
5167 return 0;
5168 }
5170 static int zero;
5171 static int one = 1;
5173 static struct ctl_table ipv6_route_table_template[] = {
5174 {
5175 .procname = "flush",
5176 .data = &init_net.ipv6.sysctl.flush_delay,
5177 .maxlen = sizeof(int),
5178 .mode = 0200,
5179 .proc_handler = ipv6_sysctl_rtcache_flush
5180 },
5181 {
5182 .procname = "gc_thresh",
5183 .data = &ip6_dst_ops_template.gc_thresh,
5184 .maxlen = sizeof(int),
5185 .mode = 0644,
5186 .proc_handler = proc_dointvec,
5187 },
5188 {
5189 .procname = "max_size",
5190 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
5191 .maxlen = sizeof(int),
5192 .mode = 0644,
5193 .proc_handler = proc_dointvec,
5194 },
5195 {
5196 .procname = "gc_min_interval",
5197 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5198 .maxlen = sizeof(int),
5199 .mode = 0644,
5200 .proc_handler = proc_dointvec_jiffies,
5201 },
5202 {
5203 .procname = "gc_timeout",
5204 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
5205 .maxlen = sizeof(int),
5206 .mode = 0644,
5207 .proc_handler = proc_dointvec_jiffies,
5208 },
5209 {
5210 .procname = "gc_interval",
5211 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
5212 .maxlen = sizeof(int),
5213 .mode = 0644,
5214 .proc_handler = proc_dointvec_jiffies,
5215 },
5216 {
5217 .procname = "gc_elasticity",
5218 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
5219 .maxlen = sizeof(int),
5220 .mode = 0644,
5221 .proc_handler = proc_dointvec,
5222 },
5223 {
5224 .procname = "mtu_expires",
5225 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
5226 .maxlen = sizeof(int),
5227 .mode = 0644,
5228 .proc_handler = proc_dointvec_jiffies,
5229 },
5230 {
5231 .procname = "min_adv_mss",
5232 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
5233 .maxlen = sizeof(int),
5234 .mode = 0644,
5235 .proc_handler = proc_dointvec,
5236 },
5237 {
5238 .procname = "gc_min_interval_ms",
5239 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5240 .maxlen = sizeof(int),
5241 .mode = 0644,
5242 .proc_handler = proc_dointvec_ms_jiffies,
5243 },
5244 {
5245 .procname = "skip_notify_on_dev_down",
5246 .data = &init_net.ipv6.sysctl.skip_notify_on_dev_down,
5247 .maxlen = sizeof(int),
5248 .mode = 0644,
5249 .proc_handler = proc_dointvec,
5250 .extra1 = &zero,
5251 .extra2 = &one,
5252 },
5253 { }
5254 };
5256 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
5257 {
5258 struct ctl_table *table;
5260 table = kmemdup(ipv6_route_table_template,
5261 sizeof(ipv6_route_table_template),
5262 GFP_KERNEL);
5264 if (table) {
5265 table[0].data = &net->ipv6.sysctl.flush_delay;
5266 table[0].extra1 = net;
5267 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
5268 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
5269 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5270 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
5271 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
5272 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
5273 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5274 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5275 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5276 table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;
5278 /* Don't export sysctls to unprivileged users */
5279 if (net->user_ns != &init_user_ns)
5280 table[0].procname = NULL;
5281 }
5283 return table;
5284 }
5285 #endif
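/* Per-namespace setup: clone the dst_ops template, allocate the
 * special FIB entries and seed the routing sysctl defaults for this
 * netns.
 */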
5287 static int __net_init ip6_route_net_init(struct net *net)
5288 {
5289 int ret = -ENOMEM;
5291 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
5292 sizeof(net->ipv6.ip6_dst_ops));
5294 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
5295 goto out_ip6_dst_ops;
5297 net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
5298 sizeof(*net->ipv6.fib6_null_entry),
5299 GFP_KERNEL);
5300 if (!net->ipv6.fib6_null_entry)
5301 goto out_ip6_dst_entries;
5303 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
5304 sizeof(*net->ipv6.ip6_null_entry),
5305 GFP_KERNEL);
5306 if (!net->ipv6.ip6_null_entry)
5307 goto out_fib6_null_entry;
5308 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5309 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
5310 ip6_template_metrics, true);
5312 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5313 net->ipv6.fib6_has_custom_rules = false;
5314 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
5315 sizeof(*net->ipv6.ip6_prohibit_entry),
5316 GFP_KERNEL);
5317 if (!net->ipv6.ip6_prohibit_entry)
5318 goto out_ip6_null_entry;
5319 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5320 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
5321 ip6_template_metrics, true);
5323 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
5324 sizeof(*net->ipv6.ip6_blk_hole_entry),
5325 GFP_KERNEL);
5326 if (!net->ipv6.ip6_blk_hole_entry)
5327 goto out_ip6_prohibit_entry;
5328 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5329 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
5330 ip6_template_metrics, true);
5331 #endif
5333 net->ipv6.sysctl.flush_delay = 0;
5334 net->ipv6.sysctl.ip6_rt_max_size = 4096;
5335 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
5336 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
5337 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
5338 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
5339 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
5340 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
5341 net->ipv6.sysctl.skip_notify_on_dev_down = 0;
5343 net->ipv6.ip6_rt_gc_expire = 30*HZ;
5345 ret = 0;
5346 out:
5347 return ret;
5349 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5350 out_ip6_prohibit_entry:
5351 kfree(net->ipv6.ip6_prohibit_entry);
5352 out_ip6_null_entry:
5353 kfree(net->ipv6.ip6_null_entry);
5354 #endif
5355 out_fib6_null_entry:
5356 kfree(net->ipv6.fib6_null_entry);
5357 out_ip6_dst_entries:
5358 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5359 out_ip6_dst_ops:
5360 goto out;
5361 }
5363 static void __net_exit ip6_route_net_exit(struct net *net)
5364 {
5365 kfree(net->ipv6.fib6_null_entry);
5366 kfree(net->ipv6.ip6_null_entry);
5367 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5368 kfree(net->ipv6.ip6_prohibit_entry);
5369 kfree(net->ipv6.ip6_blk_hole_entry);
5370 #endif
5371 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5372 }
5374 static int __net_init ip6_route_net_init_late(struct net *net)
5375 {
5376 #ifdef CONFIG_PROC_FS
5377 proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
5378 sizeof(struct ipv6_route_iter));
5379 proc_create_net_single("rt6_stats", 0444, net->proc_net,
5380 rt6_stats_seq_show, NULL);
5381 #endif
5382 return 0;
5383 }
5385 static void __net_exit ip6_route_net_exit_late(struct net *net)
5386 {
5387 #ifdef CONFIG_PROC_FS
5388 remove_proc_entry("ipv6_route", net->proc_net);
5389 remove_proc_entry("rt6_stats", net->proc_net);
5390 #endif
5391 }
5393 static struct pernet_operations ip6_route_net_ops = {
5394 .init = ip6_route_net_init,
5395 .exit = ip6_route_net_exit,
5396 };
5398 static int __net_init ipv6_inetpeer_init(struct net *net)
5399 {
5400 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5402 if (!bp)
5403 return -ENOMEM;
5404 inet_peer_base_init(bp);
5405 net->ipv6.peers = bp;
5406 return 0;
5407 }
5409 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5410 {
5411 struct inet_peer_base *bp = net->ipv6.peers;
5413 net->ipv6.peers = NULL;
5414 inetpeer_invalidate_tree(bp);
5415 kfree(bp);
5416 }
5418 static struct pernet_operations ipv6_inetpeer_ops = {
5419 .init = ipv6_inetpeer_init,
5420 .exit = ipv6_inetpeer_exit,
5421 };
5423 static struct pernet_operations ip6_route_net_late_ops = {
5424 .init = ip6_route_net_init_late,
5425 .exit = ip6_route_net_exit_late,
5426 };
5428 static struct notifier_block ip6_route_dev_notifier = {
5429 .notifier_call = ip6_route_dev_notify,
5430 .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
5431 };
5433 void __init ip6_route_init_special_entries(void)
5434 {
5435 /* Registration of the loopback device is done before this portion
5436 * of code runs, so the loopback reference in rt6_info has not been
5437 * taken; take it manually for init_net. */
5438 init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
5439 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5440 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5441 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5442 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5443 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5444 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5445 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5446 #endif
5447 }
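/* Subsystem init: dst caches and pernet subsystems are brought up
 * before fib6, xfrm6 and fib6 rules; the rtnetlink handlers and the
 * netdevice notifier are registered last. The error labels unwind in
 * the reverse order.
 */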
5449 int __init ip6_route_init(void)
5450 {
5451 int ret;
5452 int cpu;
5454 ret = -ENOMEM;
5455 ip6_dst_ops_template.kmem_cachep =
5456 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
5457 SLAB_HWCACHE_ALIGN, NULL);
5458 if (!ip6_dst_ops_template.kmem_cachep)
5459 goto out;
5461 ret = dst_entries_init(&ip6_dst_blackhole_ops);
5462 if (ret)
5463 goto out_kmem_cache;
5465 ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5466 if (ret)
5467 goto out_dst_entries;
5469 ret = register_pernet_subsys(&ip6_route_net_ops);
5470 if (ret)
5471 goto out_register_inetpeer;
5473 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5475 ret = fib6_init();
5476 if (ret)
5477 goto out_register_subsys;
5479 ret = xfrm6_init();
5480 if (ret)
5481 goto out_fib6_init;
5483 ret = fib6_rules_init();
5484 if (ret)
5485 goto xfrm6_init;
5487 ret = register_pernet_subsys(&ip6_route_net_late_ops);
5488 if (ret)
5489 goto fib6_rules_init;
5491 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5492 inet6_rtm_newroute, NULL, 0);
5493 if (ret < 0)
5494 goto out_register_late_subsys;
5496 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5497 inet6_rtm_delroute, NULL, 0);
5498 if (ret < 0)
5499 goto out_register_late_subsys;
5501 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5502 inet6_rtm_getroute, NULL,
5503 RTNL_FLAG_DOIT_UNLOCKED);
5504 if (ret < 0)
5505 goto out_register_late_subsys;
5507 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5508 if (ret)
5509 goto out_register_late_subsys;
5511 for_each_possible_cpu(cpu) {
5512 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5514 INIT_LIST_HEAD(&ul->head);
5515 spin_lock_init(&ul->lock);
5516 }
5518 out:
5519 return ret;
5521 out_register_late_subsys:
5522 rtnl_unregister_all(PF_INET6);
5523 unregister_pernet_subsys(&ip6_route_net_late_ops);
5524 fib6_rules_init:
5525 fib6_rules_cleanup();
5526 xfrm6_init:
5527 xfrm6_fini();
5528 out_fib6_init:
5529 fib6_gc_cleanup();
5530 out_register_subsys:
5531 unregister_pernet_subsys(&ip6_route_net_ops);
5532 out_register_inetpeer:
5533 unregister_pernet_subsys(&ipv6_inetpeer_ops);
5534 out_dst_entries:
5535 dst_entries_destroy(&ip6_dst_blackhole_ops);
5536 out_kmem_cache:
5537 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5538 goto out;
5539 }
5541 void ip6_route_cleanup(void)
5542 {
5543 unregister_netdevice_notifier(&ip6_route_dev_notifier);
5544 unregister_pernet_subsys(&ip6_route_net_late_ops);
5545 fib6_rules_cleanup();
5546 xfrm6_fini();
5547 fib6_gc_cleanup();
5548 unregister_pernet_subsys(&ipv6_inetpeer_ops);
5549 unregister_pernet_subsys(&ip6_route_net_ops);
5550 dst_entries_destroy(&ip6_dst_blackhole_ops);
5551 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5552 }