/*
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable.  otherwise, round-robin the list.
 *	Fixed routing subtrees.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <trace/events/fib6.h>
#include <linux/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};
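/* A negative value marks a candidate next-hop that must be skipped entirely
 * (FAIL_HARD), that should trigger a reachability probe (FAIL_PROBE), or
 * that should make the selection round-robin to a sibling route (FAIL_DO_RR);
 * see rt6_check_neigh() and find_match() below.
 */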
static void		ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
static unsigned int	 ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
static int		 ip6_dst_gc(struct dst_ops *ops);

static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int		ip6_pkt_prohibit(struct sk_buff *skb);
static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static void		rt6_dst_from_metrics_check(struct rt6_info *rt);
static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct rt6_info *rt);
static int rt6_fill_node(struct net *net,
			 struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct rt6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev,
					   unsigned int pref);
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev);
#endif

struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
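/* Uncached routes (e.g. the clones created by icmp6_dst_alloc() below) are
 * not reachable from the fib6 tree, so on device teardown the per-cpu lists
 * above are walked instead and surviving entries are re-pointed at the
 * loopback device, allowing the real net_device to be released.
 */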
static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
{
	return dst_metrics_write_ptr(&rt->from->dst);
}

static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (rt->rt6i_flags & RTF_PCPU)
		return rt6_pcpu_cow_metrics(rt);
	else if (rt->rt6i_flags & RTF_CACHE)
		return NULL;
	else
		return dst_cow_metrics_generic(dst, old);
}

static inline const void *choose_neigh_daddr(struct rt6_info *rt,
					     struct sk_buff *skb,
					     const void *daddr)
{
	struct in6_addr *p = &rt->rt6i_gateway;

	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}

static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
					  struct sk_buff *skb,
					  const void *daddr)
{
	struct rt6_info *rt = (struct rt6_info *) dst;
	struct neighbour *n;

	daddr = choose_neigh_daddr(rt, skb, daddr);
	n = __ipv6_neigh_lookup(dst->dev, daddr);
	if (n)
		return n;
	return neigh_create(&nd_tbl, daddr, dst->dev);
}

static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(rt, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}
static struct dst_ops ip6_dst_ops_template = {
	.family			= AF_INET6,
	.gc			= ip6_dst_gc,
	.gc_thresh		= 1024,
	.check			= ip6_dst_check,
	.default_advmss		= ip6_default_advmss,
	.mtu			= ip6_mtu,
	.cow_metrics		= ipv6_cow_metrics,
	.destroy		= ip6_dst_destroy,
	.ifdown			= ip6_dst_ifdown,
	.negative_advice	= ip6_negative_advice,
	.link_failure		= ip6_link_failure,
	.update_pmtu		= ip6_rt_update_pmtu,
	.redirect		= rt6_do_redirect,
	.local_out		= __ip6_local_out,
	.neigh_lookup		= ip6_neigh_lookup,
	.confirm_neigh		= ip6_confirm_neigh,
};

static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

static struct dst_ops ip6_dst_blackhole_ops = {
	.family			= AF_INET6,
	.destroy		= ip6_dst_destroy,
	.check			= ip6_dst_check,
	.mtu			= ip6_blackhole_mtu,
	.default_advmss		= ip6_default_advmss,
	.update_pmtu		= ip6_rt_blackhole_update_pmtu,
	.redirect		= ip6_rt_blackhole_redirect,
	.cow_metrics		= dst_cow_metrics_generic,
	.neigh_lookup		= ip6_neigh_lookup,
};

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

#endif
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_siblings);
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}

/* allocate dst with ip6_dst_ops */
static struct rt6_info *__ip6_dst_alloc(struct net *net,
					struct net_device *dev,
					int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}

struct rt6_info *ip6_dst_alloc(struct net *net,
			       struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);

	if (rt) {
		rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
		if (!rt->rt6i_pcpu) {
			dst_release_immediate(&rt->dst);
			return NULL;
		}
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);
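/* rt6i_pcpu is the per-cpu slot array that rt6_get_pcpu_route() and
 * rt6_make_pcpu_route() below use to hand each CPU its own copy of a fib6
 * route, avoiding refcount contention on the shared tree entry.
 */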
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct rt6_exception_bucket *bucket;
	struct rt6_info *from = rt->from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	free_percpu(rt->rt6i_pcpu);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
	if (bucket) {
		rt->rt6i_exception_bucket = NULL;
		kfree(bucket);
	}

	rt->from = NULL;
	dst_release(&from->dst);
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}

static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

static bool rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (rt->from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			rt6_check_expired(rt->from);
	}
	return false;
}
static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
					     struct flowi6 *fl6, int oif,
					     int strict)
{
	struct rt6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(fl6, NULL);

	if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->rt6i_siblings,
				 rt6i_siblings) {
		if (fl6->mp_hash > atomic_read(&sibling->rt6i_nh_upper_bound))
			continue;
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}

/*
 *	Route lookup. rcu_read_lock() should be held.
 */

static inline struct rt6_info *rt6_device_match(struct net *net,
						struct rt6_info *rt,
						const struct in6_addr *saddr,
						int oif,
						int flags)
{
	struct rt6_info *local = NULL;
	struct rt6_info *sprt;

	if (!oif && ipv6_addr_any(saddr) && !(rt->rt6i_nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) {
		struct net_device *dev = sprt->dst.dev;

		if (sprt->rt6i_nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
			if (dev->flags & IFF_LOOPBACK) {
				if (!sprt->rt6i_idev ||
				    sprt->rt6i_idev->dev->ifindex != oif) {
					if (flags & RT6_LOOKUP_F_IFACE)
						continue;
					if (local &&
					    local->rt6i_idev->dev->ifindex == oif)
						continue;
				}
				local = sprt;
			}
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif) {
		if (local)
			return local;

		if (flags & RT6_LOOKUP_F_IFACE)
			return net->ipv6.ip6_null_entry;
	}

	return rt->rt6i_nh_flags & RTNH_F_DEAD ? net->ipv6.ip6_null_entry : rt;
}
#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}

static void rt6_probe(struct rt6_info *rt)
{
	struct __rt6_probe_work *work;
	struct neighbour *neigh;
	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
		return;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		work = NULL;
		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated +
			       rt->rt6i_idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = rt->rt6i_gateway;
		dev_hold(rt->dst.dev);
		work->dev = rt->dst.dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
static inline int rt6_check_dev(struct rt6_info *rt, int oif)
{
	struct net_device *dev = rt->dst.dev;
	if (!oif || dev->ifindex == oif)
		return 2;
	if ((dev->flags & IFF_LOOPBACK) &&
	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
		return 1;
	return 0;
}

static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
{
	struct neighbour *neigh;
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;

	if (rt->rt6i_flags & RTF_NONEXTHOP ||
	    !(rt->rt6i_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}

static int rt6_score_route(struct rt6_info *rt, int oif, int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}
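/* Illustrative scoring (assuming CONFIG_IPV6_ROUTER_PREF): the device check
 * contributes 0, 1 or 2 in the low bits, and the router preference decoded
 * from rt6i_flags is OR-ed in shifted left by two, so a preferred router on
 * the requested interface outranks one that merely matches the interface.
 * Negative RT6_NUD_* values from rt6_check_neigh() bypass the score entirely.
 */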
static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
				   int *mpri, struct rt6_info *match,
				   bool *do_rr)
{
	int m;
	bool match_do_rr = false;
	struct inet6_dev *idev = rt->rt6i_idev;

	if (rt->rt6i_nh_flags & RTNH_F_DEAD)
		goto out;

	if (idev->cnf.ignore_routes_with_linkdown &&
	    rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (rt6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}

static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
				     struct rt6_info *leaf,
				     struct rt6_info *rr_head,
				     u32 metric, int oif, int strict,
				     bool *do_rr)
{
	struct rt6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->rt6_next)) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}

static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
				   int oif, int strict)
{
	struct rt6_info *leaf = rcu_dereference(fn->leaf);
	struct rt6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.ip6_null_entry)
		return net->ipv6.ip6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->rt6i_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->rt6i_src.plen)
		key_plen = rt0->rt6i_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.ip6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct rt6_info *next = rcu_dereference(rt0->rt6_next);

		/* no entries matched; do round-robin */
		if (!next || next->rt6i_metric != rt0->rt6i_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->rt6i_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->rt6i_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.ip6_null_entry;
}
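/* This is the round-robin part of default router selection mentioned in the
 * file header: when no candidate at the current metric is usable (do_rr is
 * set), fn->rr_ptr is advanced so that the next lookup starts from the
 * following sibling instead of retrying the same unreachable router.
 */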
static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
{
	return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
}

#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			rt6_clean_expires(rt);
		else
			rt6_set_expires(rt, jiffies + HZ * lifetime);

		ip6_rt_put(rt);
	}
	return 0;
}
#endif
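/* The route-information option length is expressed in 8-octet units
 * (RFC 4191), which is what the sanity checks above enforce: length > 3 is
 * rejected outright, prefixes longer than /64 need at least length 2, and
 * any non-zero prefix length needs at least length 1.
 */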
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}

static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
			  bool null_fallback)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (null_fallback) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}

static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6, int flags)
{
	struct rt6_info *rt, *rt_cache;
	struct fib6_node *fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	rt = rcu_dereference(fn->leaf);
	if (!rt) {
		rt = net->ipv6.ip6_null_entry;
	} else {
		rt = rt6_device_match(net, rt, &fl6->saddr,
				      fl6->flowi6_oif, flags);
		if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
			rt = rt6_multipath_select(rt, fl6,
						  fl6->flowi6_oif, flags);
	}
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}
	/* Search through exception table */
	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
	if (rt_cache)
		rt = rt_cache;

	if (ip6_hold_safe(net, &rt, true))
		dst_use_noref(&rt->dst, jiffies);

	rcu_read_unlock();

	trace_fib6_table_lookup(net, rt, table, fl6);

	return rt;
}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   int flags)
{
	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);

/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason, the
 * route is released.
 * Caller must hold dst before calling it.
 */

static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
			struct mx6_config *mxc,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->rt6i_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct rt6_info *rt)
{
	struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
	struct mx6_config mxc = { .mx = NULL, };

	/* Hold dst to account for the reference from the fib6 tree */
	dst_hold(&rt->dst);
	return __ip6_ins_rt(rt, &info, &mxc, NULL);
}
/* called with rcu_lock held */
static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
{
	struct net_device *dev = rt->dst.dev;

	if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->rt6i_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}

static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = ort->from;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(ort);
	rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
	rcu_read_unlock();
	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->rt6i_metric = 0;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->rt6i_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
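/* A clone produced here is an RTF_CACHE host route (/128) for one exact
 * destination: it inherits everything from its parent route but can then
 * carry destination-specific state, such as a PMTU learned from ICMPv6,
 * without touching the parent entry shared by all other destinations.
 */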
static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
{
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
	rcu_read_unlock();
	if (!pcpu_rt)
		return NULL;
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
		rt6_dst_from_metrics_check(pcpu_rt);

	return pcpu_rt;
}

static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		struct net *net = dev_net(rt->dst.dev);

		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	rt6_dst_from_metrics_check(pcpu_rt);
	return pcpu_rt;
}
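/* The cmpxchg() above publishes the new per-cpu copy only if the slot is
 * still NULL; since each CPU fills only its own slot, a non-NULL previous
 * value would indicate a logic error, hence the BUG_ON().
 */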
/* exception hash table implementation
 */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	rt6_ex->rt6i->rt6i_node = NULL;
	hlist_del_rcu(&rt6_ex->hlist);
	rt6_release(rt6_ex->rt6i);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
	net->ipv6.rt6_stats->fib_rt_cache--;
}

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}
static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
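/* hash_32() folds the jhash value down to FIB6_EXCEPTION_BUCKET_SIZE_SHIFT
 * bits, i.e. a bucket index in [0, FIB6_EXCEPTION_BUCKET_SIZE - 1], so that
 * insert and lookup always agree on which chain to scan for a given
 * (dst, src) pair.
 */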
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
static int rt6_insert_exception(struct rt6_info *nrt,
				struct rt6_info *ort)
{
	struct net *net = dev_net(ort->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	/* ort can't be a cache or pcpu route */
	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = ort->from;
	WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));

	spin_lock_bh(&rt6_exception_lock);

	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->rt6i_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	atomic_inc(&nrt->rt6i_ref);
	nrt->rt6i_node = ort->rt6i_node;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->rt6i_table->tb6_lock);
		fib6_update_sernum(ort);
		spin_unlock_bh(&ort->rt6i_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}
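/* Exceptions are how destination-specific state (PMTU updates, redirects)
 * is kept without inserting RTF_CACHE clones into the fib6 tree itself:
 * they live in this hash table attached to the parent route and are found
 * again via rt6_find_cached_rt() during lookup.
 */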
void rt6_flush_exceptions(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() from recreating the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->rt6i_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}

/* Remove the passed in cached rt from the hash table that contains it */
int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_info *from = rt->from;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err;

	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->rt6i_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}

/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_info *from = rt->from;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;

	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return;

	rcu_read_lock();
	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->rt6i_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

	rcu_read_unlock();
}
static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
				rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
			}
			bucket++;
		}
	}
}

static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
	 *
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * handle this.
	 */

	if (dst_mtu(&rt->dst) >= mtu)
		return true;

	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
		return true;

	return false;
}
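/* Worked example: with a link MTU of 1500 and a cached route PMTU of 1400,
 * raising the link MTU is not propagated to the route (the 1400 was learned
 * from a narrower hop elsewhere on the path); but if the route PMTU equals
 * the local 1500, the route merely mirrored the link and may follow it up.
 */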
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct rt6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->dst.from have already
			 * been updated.
			 */
			if (entry->rt6i_pmtu &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				entry->rt6i_pmtu = mtu;
		}
		bucket++;
	}
}

#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others have still references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}

void rt6_age_exceptions(struct rt6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6, int flags)
{
	struct fib6_node *fn, *saved_fn;
	struct rt6_info *rt, *rt_cache;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	rt = rt6_select(net, fn, oif, strict);
	if (rt->rt6i_nsiblings)
		rt = rt6_multipath_select(rt, fl6, oif, strict);
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	/* Search through exception table */
	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
	if (rt_cache)
		rt = rt_cache;

	if (rt == net->ipv6.ip6_null_entry) {
		rcu_read_unlock();
		dst_hold(&rt->dst);
		trace_fib6_table_lookup(net, rt, table, fl6);
		return rt;
	} else if (rt->rt6i_flags & RTF_CACHE) {
		if (ip6_hold_safe(net, &rt, true)) {
			dst_use_noref(&rt->dst, jiffies);
			rt6_dst_from_metrics_check(rt);
		}
		rcu_read_unlock();
		trace_fib6_table_lookup(net, rt, table, fl6);
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(rt->rt6i_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */

		struct rt6_info *uncached_rt;

		if (ip6_hold_safe(net, &rt, true)) {
			dst_use_noref(&rt->dst, jiffies);
		} else {
			rcu_read_unlock();
			uncached_rt = rt;
			goto uncached_rt_out;
		}
		rcu_read_unlock();

		uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
		dst_release(&rt->dst);

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

uncached_rt_out:
		trace_fib6_table_lookup(net, uncached_rt, table, fl6);
		return uncached_rt;

	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		dst_use_noref(&rt->dst, jiffies);
		pcpu_rt = rt6_get_pcpu_route(rt);

		if (!pcpu_rt) {
			/* atomic_inc_not_zero() is needed when using rcu */
			if (atomic_inc_not_zero(&rt->rt6i_ref)) {
				/* No dst_hold() on rt is needed because grabbing
				 * rt->rt6i_ref makes sure rt can't be released.
				 */
				pcpu_rt = rt6_make_pcpu_route(rt);
				rt6_release(rt);
			} else {
				/* rt is already removed from tree */
				pcpu_rt = net->ipv6.ip6_null_entry;
				dst_hold(&pcpu_rt->dst);
			}
		}

		rcu_read_unlock();
		trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
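/* Summary of the return paths above: a matching RTF_CACHE exception is
 * returned directly; a FLOWI_FLAG_KNOWN_NH lookup on a gateway-less route
 * gets an uncached clone tracked only on rt6_uncached_list; everything else
 * is served from (or installed into) the per-cpu route cache.
 */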
static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
					    struct flowi6 *fl6, int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
}

struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6, int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
out:
	memset(keys, 0, sizeof(*keys));
	keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
	keys->addrs.v6addrs.src = key_iph->saddr;
	keys->addrs.v6addrs.dst = key_iph->daddr;
	keys->tags.flow_label = ip6_flowinfo(key_iph);
	keys->basic.ip_proto = key_iph->nexthdr;
}
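/* For ICMPv6 errors the hash keys are taken from the *inner* (offending)
 * header, so the error travels the same multipath leg as the flow that
 * triggered it; a PKT_TOOBIG for a TCP flow, for instance, must update the
 * path that flow actually uses.
 */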
/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
{
	struct flow_keys hash_keys;

	if (skb) {
		ip6_multipath_l3_keys(skb, &hash_keys);
		return flow_hash_from_keys(&hash_keys) >> 1;
	}

	return get_hash_from_flowi6(fl6) >> 1;
}

void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
	skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
}
static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
					     struct flowi6 *fl6, int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
}

struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	if (rt6_need_strict(&fl6->daddr)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);

struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new = &rt->dst;
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
		rt->rt6i_metric = 0;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}
/*
 *	Destination cache support functions
 */

static void rt6_dst_from_metrics_check(struct rt6_info *rt)
{
	if (rt->from &&
	    dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(&rt->from->dst))
		dst_init_metrics(&rt->dst, dst_metrics_ptr(&rt->from->dst), true);
}

static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
{
	u32 rt_cookie = 0;

	if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
		return NULL;

	if (rt6_check_expired(rt))
		return NULL;

	return &rt->dst;
}

static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
{
	if (!__rt6_check_expired(rt) &&
	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
	    rt6_check(rt->from, cookie))
		return &rt->dst;
	else
		return NULL;
}

static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rt6_info *rt;

	rt = (struct rt6_info *) dst;

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	rt6_dst_from_metrics_check(rt);

	if (rt->rt6i_flags & RTF_PCPU ||
	    (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from))
		return rt6_dst_from_check(rt, cookie);
	else
		return rt6_check(rt, cookie);
}

static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			if (rt6_check_expired(rt)) {
				ip6_del_rt(rt);
				dst = NULL;
			}
		} else {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}

static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			if (dst_hold_safe(&rt->dst))
				ip6_del_rt(rt);
		} else {
			struct fib6_node *fn;

			rcu_read_lock();
			fn = rcu_dereference(rt->rt6i_node);
			if (fn && (rt->rt6i_flags & RTF_DEFAULT))
				fn->fn_sernum = -1;
			rcu_read_unlock();
		}
	}
}
static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
	struct net *net = dev_net(rt->dst.dev);

	rt->rt6i_flags |= RTF_MODIFIED;
	rt->rt6i_pmtu = mtu;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}

static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
{
	return !(rt->rt6i_flags & RTF_CACHE) &&
		(rt->rt6i_flags & RTF_PCPU ||
		 rcu_access_pointer(rt->rt6i_node));
}

static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	if (rt6->rt6i_flags & RTF_LOCAL)
		return;

	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}
	dst_confirm_neigh(dst, daddr);
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
	} else if (daddr) {
		struct rt6_info *nrt6;

		nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);
			if (rt6_insert_exception(nrt6, rt6))
				dst_release_immediate(&nrt6->dst);
		}
	}
}

static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu)
{
	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
}
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
		     int oif, u32 mark, kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
	fl6.flowlabel = ip6_flowinfo(iph);
	fl6.flowi6_uid = uid;

	dst = ip6_route_output(net, NULL, &fl6);
	if (!dst->error)
		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);

void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	struct dst_entry *dst;

	ip6_update_pmtu(skb, sock_net(sk), mtu,
			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);

	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
/* Handle redirects */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;
};

static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *rt, *rt_cache;
	struct fib6_node *fn;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	rcu_read_lock();
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for_each_fib6_node_rt_rcu(fn) {
		if (rt->rt6i_nh_flags & RTNH_F_DEAD)
			continue;
		if (rt6_check_expired(rt))
			continue;
		if (rt->dst.error)
			break;
		if (!(rt->rt6i_flags & RTF_GATEWAY))
			continue;
		if (fl6->flowi6_oif != rt->dst.dev->ifindex)
			continue;
		/* rt_cache's gateway might be different from its 'parent'
		 * in the case of an ip redirect.
		 * So we keep searching in the exception table if the gateway
		 * is different.
		 */
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
			rt_cache = rt6_find_cached_rt(rt,
						      &fl6->daddr,
						      &fl6->saddr);
			if (rt_cache &&
			    ipv6_addr_equal(&rdfl->gateway,
					    &rt_cache->rt6i_gateway)) {
				rt = rt_cache;
				break;
			}
			continue;
		}
		break;
	}

	if (!rt)
		rt = net->ipv6.ip6_null_entry;
	else if (rt->dst.error) {
		rt = net->ipv6.ip6_null_entry;
		goto out;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

out:
	ip6_hold_safe(net, &rt, true);

	rcu_read_unlock();

	trace_fib6_table_lookup(net, rt, table, fl6);
	return rt;
}

static struct dst_entry *ip6_route_redirect(struct net *net,
					    const struct flowi6 *fl6,
					    const struct in6_addr *gateway)
{
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip6rd_flowi rdfl;

	rdfl.fl6 = *fl6;
	rdfl.gateway = *gateway;

	return fib6_rule_lookup(net, &rdfl.fl6,
				flags, __ip6_route_redirect);
}
void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
		  kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_iif = LOOPBACK_IFINDEX;
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark;
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
	fl6.flowlabel = ip6_flowinfo(iph);
	fl6.flowi6_uid = uid;

	dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_redirect);

void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
			    u32 mark)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_iif = LOOPBACK_IFINDEX;
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark;
	fl6.daddr = msg->dest;
	fl6.saddr = iph->daddr;
	fl6.flowi6_uid = sock_net_uid(net, NULL);

	dst = ip6_route_redirect(net, &fl6, &iph->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}

void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
		     sk->sk_uid);
}
EXPORT_SYMBOL_GPL(ip6_sk_redirect);
static unsigned int ip6_default_advmss(const struct dst_entry *dst)
{
	struct net_device *dev = dst->dev;
	unsigned int mtu = dst_mtu(dst);
	struct net *net = dev_net(dev);

	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);

	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;

	/*
	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
	 * IPV6_MAXPLEN is also valid and means: "any MSS,
	 * rely only on pmtu discovery"
	 */
	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
		mtu = IPV6_MAXPLEN;
	return mtu;
}
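/* Example: a 1500-byte link MTU yields 1500 - 40 (ipv6hdr) - 20 (tcphdr)
 * = 1440 bytes of advertised MSS, clamped from below by the
 * ip6_rt_min_advmss sysctl and from above by IPV6_MAXPLEN.
 */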
static unsigned int ip6_mtu(const struct dst_entry *dst)
{
	const struct rt6_info *rt = (const struct rt6_info *)dst;
	unsigned int mtu = rt->rt6i_pmtu;
	struct inet6_dev *idev;

	if (mtu)
		goto out;

	mtu = dst_metric_raw(dst, RTAX_MTU);
	if (mtu)
		goto out;

	mtu = IPV6_MIN_MTU;

	rcu_read_lock();
	idev = __in6_dev_get(dst->dev);
	if (idev)
		mtu = idev->cnf.mtu6;
	rcu_read_unlock();

out:
	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}

struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		in6_dev_put(idev);
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output = ip6_output;
	rt->rt6i_gateway = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev = idev;
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Add this dst into uncached_list so that rt6_disable_ip() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);
	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}
static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
	return entries > rt_max_size;
}
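/* Convert the RTA_METRICS attribute stream carried by a fib6_config into
 * the kzalloc'ed u32 metrics array (plus validity bitmap) of a
 * struct mx6_config. RTAX_CC_ALGO names are translated to TCP
 * congestion-control keys here.
 */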
static int ip6_convert_metrics(struct mx6_config *mxc,
			       const struct fib6_config *cfg)
{
	struct net *net = cfg->fc_nlinfo.nl_net;
	bool ecn_ca = false;
	struct nlattr *nla;
	int remaining;
	u32 *mp;

	if (!cfg->fc_mx)
		return 0;

	mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
	if (unlikely(!mp))
		return -ENOMEM;

	nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
		int type = nla_type(nla);
		u32 val;

		if (!type)
			continue;
		if (unlikely(type > RTAX_MAX))
			goto err;

		if (type == RTAX_CC_ALGO) {
			char tmp[TCP_CA_NAME_MAX];

			nla_strlcpy(tmp, nla, sizeof(tmp));
			val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca);
			if (val == TCP_CA_UNSPEC)
				goto err;
		} else {
			val = nla_get_u32(nla);
		}
		if (type == RTAX_HOPLIMIT && val > 255)
			val = 255;
		if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
			goto err;

		mp[type - 1] = val;
		__set_bit(type - 1, mxc->mx_valid);
	}

	if (ecn_ca) {
		__set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
		mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
	}

	mxc->mx = mp;
	return 0;
 err:
	kfree(mp);
	return -EINVAL;
}
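/* Resolve a configured gateway against one specific FIB table (used when
 * validating a route's nexthop); returns NULL when the table does not
 * exist or the lookup only hit the null entry.
 */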
static struct rt6_info *ip6_nh_lookup_table(struct net *net,
					    struct fib6_config *cfg,
					    const struct in6_addr *gw_addr,
					    u32 tbid, int flags)
{
	struct flowi6 fl6 = {
		.flowi6_oif = cfg->fc_ifindex,
		.daddr = *gw_addr,
		.saddr = cfg->fc_prefsrc,
	};
	struct fib6_table *table;
	struct rt6_info *rt;

	table = fib6_get_table(net, tbid);
	if (!table)
		return NULL;

	if (!ipv6_addr_any(&cfg->fc_prefsrc))
		flags |= RT6_LOOKUP_F_HAS_SADDR;

	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);

	/* if table lookup failed, fall back to full lookup */
	if (rt == net->ipv6.ip6_null_entry) {
		ip6_rt_put(rt);
		rt = NULL;
	}

	return rt;
}

static int ip6_route_check_nh_onlink(struct net *net,
				     struct fib6_config *cfg,
				     struct net_device *dev,
				     struct netlink_ext_ack *extack)
{
	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
	struct rt6_info *grt;
	int err;

	err = 0;
	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
	if (grt) {
		if (!grt->dst.error &&
		    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop has invalid gateway or device mismatch");
			err = -EINVAL;
		}

		ip6_rt_put(grt);
	}

	return err;
}

static int ip6_route_check_nh(struct net *net,
			      struct fib6_config *cfg,
			      struct net_device **_dev,
			      struct inet6_dev **idev)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	struct net_device *dev = _dev ? *_dev : NULL;
	struct rt6_info *grt = NULL;
	int err = -EHOSTUNREACH;

	if (cfg->fc_table) {
		int flags = RT6_LOOKUP_F_IFACE;

		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
					  cfg->fc_table, flags);
		if (grt) {
			if (grt->rt6i_flags & RTF_GATEWAY ||
			    (dev && dev != grt->dst.dev)) {
				ip6_rt_put(grt);
				grt = NULL;
			}
		}
	}

	if (!grt)
		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);

	if (!grt)
		goto out;

	if (dev) {
		if (dev != grt->dst.dev) {
			ip6_rt_put(grt);
			goto out;
		}
	} else {
		*_dev = dev = grt->dst.dev;
		*idev = grt->rt6i_idev;
		dev_hold(dev);
		in6_dev_hold(grt->rt6i_idev);
	}

	if (!(grt->rt6i_flags & RTF_GATEWAY))
		err = 0;

	ip6_rt_put(grt);

out:
	return err;
}
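/* Build (but do not insert) a rt6_info from a fib6_config: validate the
 * user-supplied flags, prefix lengths, device and gateway, promote
 * loopback routes to reject routes, and wire up lwtunnel state.
 */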
static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
					      struct netlink_ext_ack *extack)
{
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct rt6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	struct fib6_table *table;
	int addr_type;
	int err = -EINVAL;

	/* RTF_PCPU is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_PCPU) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
		goto out;
	}

	/* RTF_CACHE is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_CACHE) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
		goto out;
	}

	if (cfg->fc_dst_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		goto out;
	}
	if (cfg->fc_src_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid source address length");
		goto out;
	}
#ifndef CONFIG_IPV6_SUBTREES
	if (cfg->fc_src_len) {
		NL_SET_ERR_MSG(extack,
			       "Specifying source address requires IPV6_SUBTREES to be enabled");
		goto out;
	}
#endif
	if (cfg->fc_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_metric == 0)
		cfg->fc_metric = IP6_RT_PRIO_USER;

	if (cfg->fc_flags & RTNH_F_ONLINK) {
		if (!dev) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop device required for onlink");
			err = -ENODEV;
			goto out;
		}

		if (!(dev->flags & IFF_UP)) {
			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
			err = -ENETDOWN;
			goto out;
		}
	}

	err = -ENOBUFS;
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
		if (!table) {
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}

	if (!table)
		goto out;

	rt = ip6_dst_alloc(net, NULL,
			   (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);

	if (!rt) {
		err = -ENOMEM;
		goto out;
	}

	if (cfg->fc_flags & RTF_EXPIRES)
		rt6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
	else
		rt6_clean_expires(rt);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->rt6i_protocol = cfg->fc_protocol;

	addr_type = ipv6_addr_type(&cfg->fc_dst);

	if (addr_type & IPV6_ADDR_MULTICAST)
		rt->dst.input = ip6_mc_input;
	else if (cfg->fc_flags & RTF_LOCAL)
		rt->dst.input = ip6_input;
	else
		rt->dst.input = ip6_forward;

	rt->dst.output = ip6_output;

	if (cfg->fc_encap) {
		struct lwtunnel_state *lwtstate;

		err = lwtunnel_build_state(cfg->fc_encap_type,
					   cfg->fc_encap, AF_INET6, cfg,
					   &lwtstate, extack);
		if (err)
			goto out;
		rt->dst.lwtstate = lwtstate_get(lwtstate);
		if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
			rt->dst.lwtstate->orig_output = rt->dst.output;
			rt->dst.output = lwtunnel_output;
		}
		if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
			rt->dst.lwtstate->orig_input = rt->dst.input;
			rt->dst.input = lwtunnel_input;
		}
	}

	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->rt6i_dst.plen = cfg->fc_dst_len;
	if (rt->rt6i_dst.plen == 128)
		rt->dst.flags |= DST_HOST;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->rt6i_src.plen = cfg->fc_src_len;
#endif

	rt->rt6i_metric = cfg->fc_metric;
	rt->rt6i_nh_weight = 1;

	/* We cannot add true routes via loopback here,
	   they would result in kernel looping; promote them to reject routes
	 */
	if ((cfg->fc_flags & RTF_REJECT) ||
	    (dev && (dev->flags & IFF_LOOPBACK) &&
	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
	     !(cfg->fc_flags & RTF_LOCAL))) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
		switch (cfg->fc_type) {
		case RTN_BLACKHOLE:
			rt->dst.error = -EINVAL;
			rt->dst.output = dst_discard_out;
			rt->dst.input = dst_discard;
			break;
		case RTN_PROHIBIT:
			rt->dst.error = -EACCES;
			rt->dst.output = ip6_pkt_prohibit_out;
			rt->dst.input = ip6_pkt_prohibit;
			break;
		case RTN_THROW:
		case RTN_UNREACHABLE:
		default:
			rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
					: (cfg->fc_type == RTN_UNREACHABLE)
					? -EHOSTUNREACH : -ENETUNREACH;
			rt->dst.output = ip6_pkt_discard_out;
			rt->dst.input = ip6_pkt_discard;
			break;
		}
		goto install_route;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		const struct in6_addr *gw_addr;
		int gwa_type;

		gw_addr = &cfg->fc_gateway;
		gwa_type = ipv6_addr_type(gw_addr);

		/* if gw_addr is local we will fail to detect this in case
		 * address is still TENTATIVE (DAD in progress). rt6_lookup()
		 * will return already-added prefix route via interface that
		 * prefix route was assigned to, which might be non-loopback.
		 */
		err = -EINVAL;
		if (ipv6_chk_addr_and_flags(net, gw_addr,
					    gwa_type & IPV6_ADDR_LINKLOCAL ?
					    dev : NULL, 0, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid gateway address");
			goto out;
		}
		rt->rt6i_gateway = *gw_addr;

		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
			/* IPv6 strictly inhibits using not link-local
			   addresses as nexthop address.
			   Otherwise, router will not be able to send redirects.
			   It is very good, but in some (rare!) circumstances
			   (SIT, PtP, NBMA NOARP links) it is handy to allow
			   some exceptions. --ANK
			   We allow IPv4-mapped nexthops to support RFC4798-type
			   addressing
			 */
			if (!(gwa_type & (IPV6_ADDR_UNICAST |
					  IPV6_ADDR_MAPPED))) {
				NL_SET_ERR_MSG(extack,
					       "Invalid gateway address");
				goto out;
			}

			if (cfg->fc_flags & RTNH_F_ONLINK) {
				err = ip6_route_check_nh_onlink(net, cfg, dev,
								extack);
			} else {
				err = ip6_route_check_nh(net, cfg, &dev, &idev);
			}
			if (err)
				goto out;
		}
		err = -EINVAL;
		if (!dev) {
			NL_SET_ERR_MSG(extack, "Egress device not specified");
			goto out;
		} else if (dev->flags & IFF_LOOPBACK) {
			NL_SET_ERR_MSG(extack,
				       "Egress device can not be loopback device for this route");
			goto out;
		}
	}

	err = -ENODEV;
	if (!dev)
		goto out;

	if (!(dev->flags & IFF_UP)) {
		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
		err = -ENETDOWN;
		goto out;
	}

	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid source address");
			err = -EINVAL;
			goto out;
		}
		rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
		rt->rt6i_prefsrc.plen = 128;
	} else
		rt->rt6i_prefsrc.plen = 0;

	rt->rt6i_flags = cfg->fc_flags;

install_route:
	if (!(rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
	    !netif_carrier_ok(dev))
		rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
	rt->rt6i_nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
	rt->dst.dev = dev;
	rt->rt6i_idev = idev;
	rt->rt6i_table = table;

	cfg->fc_nlinfo.nl_net = dev_net(dev);

	return rt;
out:
	if (dev)
		dev_put(dev);
	if (idev)
		in6_dev_put(idev);
	if (rt)
		dst_release_immediate(&rt->dst);

	return ERR_PTR(err);
}
int ip6_route_add(struct fib6_config *cfg,
		  struct netlink_ext_ack *extack)
{
	struct mx6_config mxc = { .mx = NULL, };
	struct rt6_info *rt;
	int err;

	rt = ip6_route_info_create(cfg, extack);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto out;
	}

	err = ip6_convert_metrics(&mxc, cfg);
	if (err)
		goto out;

	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);

	kfree(mxc.mx);

	return err;
out:
	if (rt)
		dst_release_immediate(&rt->dst);

	return err;
}

static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
{
	int err;
	struct fib6_table *table;
	struct net *net = dev_net(rt->dst.dev);

	if (rt == net->ipv6.ip6_null_entry) {
		err = -ENOENT;
		goto out;
	}

	table = rt->rt6i_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_del(rt, info);
	spin_unlock_bh(&table->tb6_lock);

out:
	ip6_rt_put(rt);
	return err;
}

int ip6_del_rt(struct rt6_info *rt)
{
	struct nl_info info = {
		.nl_net = dev_net(rt->dst.dev),
	};
	return __ip6_del_rt(rt, &info);
}

static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	int err = -ENOENT;

	if (rt == net->ipv6.ip6_null_entry)
		goto out_put;
	table = rt->rt6i_table;
	spin_lock_bh(&table->tb6_lock);

	if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
		struct rt6_info *sibling, *next_sibling;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			if (rt6_fill_node(net, skb, rt,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				kfree_skb(skb);
				skb = NULL;
			} else
				info->skip_notify = 1;
		}

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->rt6i_siblings,
					 rt6i_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
		}
	}

	err = fib6_del(rt, info);
out_unlock:
	spin_unlock_bh(&table->tb6_lock);
out_put:
	ip6_rt_put(rt);

	if (skb) {
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
	return err;
}
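/* Delete the FIB entry (or cached exception, for RTF_CACHE requests)
 * matching the fib6_config; with RTF_GATEWAY only the one hop is
 * removed, otherwise all sibling nexthops go with it.
 */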
static int ip6_route_del(struct fib6_config *cfg,
			 struct netlink_ext_ack *extack)
{
	struct rt6_info *rt, *rt_cache;
	struct fib6_table *table;
	struct fib6_node *fn;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table) {
		NL_SET_ERR_MSG(extack, "FIB table does not exist");
		return err;
	}

	rcu_read_lock();

	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len,
			 !(cfg->fc_flags & RTF_CACHE));

	if (fn) {
		for_each_fib6_node_rt_rcu(fn) {
			if (cfg->fc_flags & RTF_CACHE) {
				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
							      &cfg->fc_src);
				if (!rt_cache)
					continue;
				rt = rt_cache;
			}
			if (cfg->fc_ifindex &&
			    (!rt->dst.dev ||
			     rt->dst.dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
				continue;
			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
				continue;
			if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
				continue;
			if (!dst_hold_safe(&rt->dst))
				break;
			rcu_read_unlock();

			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				return __ip6_del_rt(rt, &cfg->fc_nlinfo);

			return __ip6_del_rt_siblings(rt, cfg);
		}
	}
	rcu_read_unlock();

	return err;
}
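/* Handle a received ICMPv6 Redirect: validate the message and its ND
 * options, update the neighbour cache, and install an RTF_CACHE clone
 * pointing at the new first hop.
 */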
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct netevent_redirect netevent;
	struct rt6_info *rt, *nrt = NULL;
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	struct rd_msg *msg;
	int optlen, on_link;
	u8 *lladdr;

	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
	optlen -= sizeof(*msg);

	if (optlen < 0) {
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
		return;
	}

	msg = (struct rd_msg *)icmp6_hdr(skb);

	if (ipv6_addr_is_multicast(&msg->dest)) {
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
		return;
	}

	on_link = 0;
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
		on_link = 1;
	} else if (ipv6_addr_type(&msg->target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
		return;
	}

	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}

	lladdr = NULL;
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

	rt = (struct rt6_info *) dst;
	if (rt->rt6i_flags & RTF_REJECT) {
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
		return;
	}

	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);

	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
	if (!neigh)
		return;

	/*
	 *	We have finally decided to accept it.
	 */

	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER)),
		     NDISC_REDIRECT, &ndopts);

	nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
	if (!nrt)
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	nrt->rt6i_protocol = RTPROT_REDIRECT;
	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

	/* No need to remove rt from the exception table if rt is
	 * a cached route because rt6_insert_exception() will
	 * take care of it
	 */
	if (rt6_insert_exception(nrt, rt)) {
		dst_release_immediate(&nrt->dst);
		goto out;
	}

	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
	netevent.daddr = &msg->dest;
	netevent.neigh = neigh;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

out:
	neigh_release(neigh);
}
/*
 *	Misc support functions
 */

static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	dst_hold(&from->dst);
	rt->from = from;
	dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
}

static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
{
	rt->dst.input = ort->dst.input;
	rt->dst.output = ort->dst.output;
	rt->rt6i_dst = ort->rt6i_dst;
	rt->dst.error = ort->dst.error;
	rt->rt6i_idev = ort->rt6i_idev;
	if (rt->rt6i_idev)
		in6_dev_hold(rt->rt6i_idev);
	rt->dst.lastuse = jiffies;
	rt->rt6i_gateway = ort->rt6i_gateway;
	rt->rt6i_flags = ort->rt6i_flags;
	rt6_set_from(rt, ort);
	rt->rt6i_metric = ort->rt6i_metric;
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->rt6i_src;
#endif
	rt->rt6i_prefsrc = ort->rt6i_prefsrc;
	rt->rt6i_table = ort->rt6i_table;
	rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
}
#ifdef CONFIG_IPV6_ROUTE_INFO
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	int ifindex = dev->ifindex;
	struct fib6_node *fn;
	struct rt6_info *rt = NULL;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
	if (!fn)
		goto out;

	for_each_fib6_node_rt_rcu(fn) {
		if (rt->dst.dev->ifindex != ifindex)
			continue;
		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
			continue;
		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
			continue;
		ip6_hold_safe(NULL, &rt, false);
		break;
	}
out:
	rcu_read_unlock();
	return rt;
}

static struct rt6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev,
					   unsigned int pref)
{
	struct fib6_config cfg = {
		.fc_metric	= IP6_RT_PRIO_USER,
		.fc_ifindex	= dev->ifindex,
		.fc_dst_len	= prefixlen,
		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
				  RTF_UP | RTF_PREF(pref),
		.fc_protocol = RTPROT_RA,
		.fc_nlinfo.portid = 0,
		.fc_nlinfo.nlh = NULL,
		.fc_nlinfo.nl_net = net,
	};

	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	cfg.fc_dst = *prefix;
	cfg.fc_gateway = *gwaddr;

	/* We should treat it as a default route if prefix length is 0. */
	if (!prefixlen)
		cfg.fc_flags |= RTF_DEFAULT;

	ip6_route_add(&cfg, NULL);

	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
}
#endif
struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
	struct rt6_info *rt;
	struct fib6_table *table;

	table = fib6_get_table(dev_net(dev), tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		if (dev == rt->dst.dev &&
		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
			break;
	}
	if (rt)
		ip6_hold_safe(NULL, &rt, false);
	rcu_read_unlock();
	return rt;
}

struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
				     struct net_device *dev,
				     unsigned int pref)
{
	struct fib6_config cfg = {
		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
		.fc_metric	= IP6_RT_PRIO_USER,
		.fc_ifindex	= dev->ifindex,
		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
		.fc_protocol = RTPROT_RA,
		.fc_nlinfo.portid = 0,
		.fc_nlinfo.nlh = NULL,
		.fc_nlinfo.nl_net = dev_net(dev),
	};

	cfg.fc_gateway = *gwaddr;

	if (!ip6_route_add(&cfg, NULL)) {
		struct fib6_table *table;

		table = fib6_get_table(dev_net(dev), cfg.fc_table);
		if (table)
			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
	}

	return rt6_get_dflt_router(gwaddr, dev);
}
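/* Flush RA-learned default routers from one table, skipping entries whose
 * interface has accept_ra == 2 (keep RA routes even when forwarding).
 */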
static void __rt6_purge_dflt_routers(struct fib6_table *table)
{
	struct rt6_info *rt;

restart:
	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
			if (dst_hold_safe(&rt->dst)) {
				rcu_read_unlock();
				ip6_del_rt(rt);
			} else {
				rcu_read_unlock();
			}
			goto restart;
		}
	}
	rcu_read_unlock();

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}

void rt6_purge_dflt_routers(struct net *net)
{
	struct fib6_table *table;
	struct hlist_head *head;
	unsigned int h;

	rcu_read_lock();

	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
		head = &net->ipv6.fib_table_hash[h];
		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
				__rt6_purge_dflt_routers(table);
		}
	}

	rcu_read_unlock();
}

static void rtmsg_to_fib6_config(struct net *net,
				 struct in6_rtmsg *rtmsg,
				 struct fib6_config *cfg)
{
	memset(cfg, 0, sizeof(*cfg));

	cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
			 : RT6_TABLE_MAIN;
	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
	cfg->fc_metric = rtmsg->rtmsg_metric;
	cfg->fc_expires = rtmsg->rtmsg_info;
	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
	cfg->fc_src_len = rtmsg->rtmsg_src_len;
	cfg->fc_flags = rtmsg->rtmsg_flags;

	cfg->fc_nlinfo.nl_net = net;

	cfg->fc_dst = rtmsg->rtmsg_dst;
	cfg->fc_src = rtmsg->rtmsg_src;
	cfg->fc_gateway = rtmsg->rtmsg_gateway;
}
int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
{
	struct fib6_config cfg;
	struct in6_rtmsg rtmsg;
	int err;

	switch (cmd) {
	case SIOCADDRT:		/* Add a route */
	case SIOCDELRT:		/* Delete a route */
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			return -EPERM;
		err = copy_from_user(&rtmsg, arg,
				     sizeof(struct in6_rtmsg));
		if (err)
			return -EFAULT;

		rtmsg_to_fib6_config(net, &rtmsg, &cfg);

		rtnl_lock();
		switch (cmd) {
		case SIOCADDRT:
			err = ip6_route_add(&cfg, NULL);
			break;
		case SIOCDELRT:
			err = ip6_route_del(&cfg, NULL);
			break;
		default:
			err = -EINVAL;
		}
		rtnl_unlock();

		return err;
	}

	return -EINVAL;
}
/*
 *	Drop the packet on the floor
 */

static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
	int type;
	struct dst_entry *dst = skb_dst(skb);
	switch (ipstats_mib_noroutes) {
	case IPSTATS_MIB_INNOROUTES:
		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
		if (type == IPV6_ADDR_ANY) {
			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
				      IPSTATS_MIB_INADDRERRORS);
			break;
		}
		/* FALLTHROUGH */
	case IPSTATS_MIB_OUTNOROUTES:
		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
			      ipstats_mib_noroutes);
		break;
	}
	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
	kfree_skb(skb);
	return 0;
}

static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}

static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}

static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}

static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}
/*
 *	Allocate a dst for local (unicast / anycast) address.
 */

struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
				    const struct in6_addr *addr,
				    bool anycast)
{
	u32 tb_id;
	struct net *net = dev_net(idev->dev);
	struct net_device *dev = idev->dev;
	struct rt6_info *rt;

	rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
	if (!rt)
		return ERR_PTR(-ENOMEM);

	in6_dev_hold(idev);

	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output = ip6_output;
	rt->rt6i_idev = idev;

	rt->rt6i_protocol = RTPROT_KERNEL;
	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
	if (anycast)
		rt->rt6i_flags |= RTF_ANYCAST;
	else
		rt->rt6i_flags |= RTF_LOCAL;

	rt->rt6i_gateway  = *addr;
	rt->rt6i_dst.addr = *addr;
	rt->rt6i_dst.plen = 128;
	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
	rt->rt6i_table = fib6_get_table(net, tb_id);

	return rt;
}
/* remove deleted ip from prefsrc entries */
struct arg_dev_net_ip {
	struct net_device *dev;
	struct net *net;
	struct in6_addr *addr;
};

static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
{
	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;

	if (((void *)rt->dst.dev == dev || !dev) &&
	    rt != net->ipv6.ip6_null_entry &&
	    ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
		spin_lock_bh(&rt6_exception_lock);
		/* remove prefsrc entry */
		rt->rt6i_prefsrc.plen = 0;
		/* need to update cache as well */
		rt6_exceptions_remove_prefsrc(rt);
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}

void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
{
	struct net *net = dev_net(ifp->idev->dev);
	struct arg_dev_net_ip adni = {
		.dev = ifp->idev->dev,
		.net = net,
		.addr = &ifp->addr,
	};
	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
}

#define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)

/* Remove routers and update dst entries when a gateway turns into a host. */
static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
{
	struct in6_addr *gateway = (struct in6_addr *)arg;

	if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
	    ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
		return -1;
	}

	/* Further clean up cached routes in exception table.
	 * This is needed because cached route may have a different
	 * gateway than its 'parent' in the case of an ip redirect.
	 */
	rt6_exceptions_clean_tohost(rt, gateway);

	return 0;
}

void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}
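/* The helpers below keep multipath (ECMP) sibling state consistent across
 * netdev events. Each live nexthop's weight is accumulated and scaled to
 * an upper bound in [0, 2^31 - 1]; lookup appears to use these bounds for
 * hash-threshold nexthop selection with a single comparison.
 */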
struct arg_netdev_event {
	const struct net_device *dev;
	union {
		unsigned int nh_flags;
		unsigned long event;
	};
};

static struct rt6_info *rt6_multipath_first_sibling(const struct rt6_info *rt)
{
	struct rt6_info *iter;
	struct fib6_node *fn;

	fn = rcu_dereference_protected(rt->rt6i_node,
			lockdep_is_held(&rt->rt6i_table->tb6_lock));
	iter = rcu_dereference_protected(fn->leaf,
			lockdep_is_held(&rt->rt6i_table->tb6_lock));
	while (iter) {
		if (iter->rt6i_metric == rt->rt6i_metric &&
		    rt6_qualify_for_ecmp(iter))
			return iter;
		iter = rcu_dereference_protected(iter->rt6_next,
				lockdep_is_held(&rt->rt6i_table->tb6_lock));
	}

	return NULL;
}

static bool rt6_is_dead(const struct rt6_info *rt)
{
	if (rt->rt6i_nh_flags & RTNH_F_DEAD ||
	    (rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
	     rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
		return true;

	return false;
}

static int rt6_multipath_total_weight(const struct rt6_info *rt)
{
	struct rt6_info *iter;
	int total = 0;

	if (!rt6_is_dead(rt))
		total += rt->rt6i_nh_weight;

	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) {
		if (!rt6_is_dead(iter))
			total += iter->rt6i_nh_weight;
	}

	return total;
}

static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total)
{
	int upper_bound = -1;

	if (!rt6_is_dead(rt)) {
		*weight += rt->rt6i_nh_weight;
		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
						    total) - 1;
	}
	atomic_set(&rt->rt6i_nh_upper_bound, upper_bound);
}

static void rt6_multipath_upper_bound_set(struct rt6_info *rt, int total)
{
	struct rt6_info *iter;
	int weight = 0;

	rt6_upper_bound_set(rt, &weight, total);

	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
		rt6_upper_bound_set(iter, &weight, total);
}

void rt6_multipath_rebalance(struct rt6_info *rt)
{
	struct rt6_info *first;
	int total;

	/* In case the entire multipath route was marked for flushing,
	 * then there is no need to rebalance upon the removal of every
	 * sibling route.
	 */
	if (!rt->rt6i_nsiblings || rt->should_flush)
		return;

	/* During lookup routes are evaluated in order, so we need to
	 * make sure upper bounds are assigned from the first sibling
	 * onwards.
	 */
	first = rt6_multipath_first_sibling(rt);
	if (WARN_ON_ONCE(!first))
		return;

	total = rt6_multipath_total_weight(first);
	rt6_multipath_upper_bound_set(first, total);
}
static int fib6_ifup(struct rt6_info *rt, void *p_arg)
{
	const struct arg_netdev_event *arg = p_arg;
	const struct net *net = dev_net(arg->dev);

	if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev) {
		rt->rt6i_nh_flags &= ~arg->nh_flags;
		fib6_update_sernum_upto_root(dev_net(rt->dst.dev), rt);
		rt6_multipath_rebalance(rt);
	}

	return 0;
}

void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.nh_flags = nh_flags,
		},
	};

	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
		arg.nh_flags |= RTNH_F_LINKDOWN;

	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
}

static bool rt6_multipath_uses_dev(const struct rt6_info *rt,
				   const struct net_device *dev)
{
	struct rt6_info *iter;

	if (rt->dst.dev == dev)
		return true;
	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
		if (iter->dst.dev == dev)
			return true;

	return false;
}

static void rt6_multipath_flush(struct rt6_info *rt)
{
	struct rt6_info *iter;

	rt->should_flush = 1;
	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
		iter->should_flush = 1;
}

static unsigned int rt6_multipath_dead_count(const struct rt6_info *rt,
					     const struct net_device *down_dev)
{
	struct rt6_info *iter;
	unsigned int dead = 0;

	if (rt->dst.dev == down_dev || rt->rt6i_nh_flags & RTNH_F_DEAD)
		dead++;
	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
		if (iter->dst.dev == down_dev ||
		    iter->rt6i_nh_flags & RTNH_F_DEAD)
			dead++;

	return dead;
}

static void rt6_multipath_nh_flags_set(struct rt6_info *rt,
				       const struct net_device *dev,
				       unsigned int nh_flags)
{
	struct rt6_info *iter;

	if (rt->dst.dev == dev)
		rt->rt6i_nh_flags |= nh_flags;
	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
		if (iter->dst.dev == dev)
			iter->rt6i_nh_flags |= nh_flags;
}
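/* fib6_clean_all() callback for device down/unregister events; returning
 * a negative value asks the walker to delete the route. A multipath route
 * is flushed as a unit once every sibling nexthop is dead.
 */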
/* called with write lock held for table with rt */
static int fib6_ifdown(struct rt6_info *rt, void *p_arg)
{
	const struct arg_netdev_event *arg = p_arg;
	const struct net_device *dev = arg->dev;
	const struct net *net = dev_net(dev);

	if (rt == net->ipv6.ip6_null_entry)
		return 0;

	switch (arg->event) {
	case NETDEV_UNREGISTER:
		return rt->dst.dev == dev ? -1 : 0;
	case NETDEV_DOWN:
		if (rt->should_flush)
			return -1;
		if (!rt->rt6i_nsiblings)
			return rt->dst.dev == dev ? -1 : 0;
		if (rt6_multipath_uses_dev(rt, dev)) {
			unsigned int count;

			count = rt6_multipath_dead_count(rt, dev);
			if (rt->rt6i_nsiblings + 1 == count) {
				rt6_multipath_flush(rt);
				return -1;
			}
			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
						   RTNH_F_LINKDOWN);
			fib6_update_sernum(rt);
			rt6_multipath_rebalance(rt);
		}
		return -2;
	case NETDEV_CHANGE:
		if (rt->dst.dev != dev ||
		    rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST))
			break;
		rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
		rt6_multipath_rebalance(rt);
		break;
	}

	return 0;
}

void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.event = event,
		},
	};

	fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
}

void rt6_disable_ip(struct net_device *dev, unsigned long event)
{
	rt6_sync_down_dev(dev, event);
	rt6_uncached_list_flush_dev(dev_net(dev), dev);
	neigh_ifdown(&nd_tbl, dev);
}

struct rt6_mtu_change_arg {
	struct net_device *dev;
	unsigned int mtu;
};

static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase,
	   updating PMTU increase is a MUST. (i.e. jumbo frame)
	 */
	if (rt->dst.dev == arg->dev &&
	    !dst_metric_locked(&rt->dst, RTAX_MTU)) {
		spin_lock_bh(&rt6_exception_lock);
		if (dst_metric_raw(&rt->dst, RTAX_MTU) &&
		    rt6_mtu_change_route_allowed(idev, rt, arg->mtu))
			dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}

void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
{
	struct rt6_mtu_change_arg arg = {
		.dev = dev,
		.mtu = mtu,
	};

	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
}
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]		= { .len = sizeof(struct in6_addr) },
	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
	[RTA_OIF]		= { .type = NLA_U32 },
	[RTA_IIF]		= { .type = NLA_U32 },
	[RTA_PRIORITY]		= { .type = NLA_U32 },
	[RTA_METRICS]		= { .type = NLA_NESTED },
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]		= { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
	[RTA_ENCAP]		= { .type = NLA_NESTED },
	[RTA_EXPIRES]		= { .type = NLA_U32 },
	[RTA_UID]		= { .type = NLA_U32 },
	[RTA_MARK]		= { .type = NLA_U32 },
	[RTA_TABLE]		= { .type = NLA_U32 },
};

static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg,
			      struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  NULL);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	memset(cfg, 0, sizeof(*cfg));

	cfg->fc_table = rtm->rtm_table;
	cfg->fc_dst_len = rtm->rtm_dst_len;
	cfg->fc_src_len = rtm->rtm_src_len;
	cfg->fc_flags = RTF_UP;
	cfg->fc_protocol = rtm->rtm_protocol;
	cfg->fc_type = rtm->rtm_type;

	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);

	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
	cfg->fc_nlinfo.nlh = nlh;
	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}

	if (tb[RTA_DST]) {
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
						     cfg->fc_mp_len, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_PREF]) {
		pref = nla_get_u8(tb[RTA_PREF]);
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

	if (tb[RTA_ENCAP_TYPE]) {
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_EXPIRES]) {
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
		}
	}

	err = 0;
errout:
	return err;
}
struct rt6_nh {
	struct rt6_info *rt6_info;
	struct fib6_config r_cfg;
	struct mx6_config mxc;
	struct list_head next;
};

static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
{
	struct rt6_nh *nh;

	list_for_each_entry(nh, rt6_nh_list, next) {
		pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
			&nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
			nh->r_cfg.fc_ifindex);
	}
}

static int ip6_route_info_append(struct list_head *rt6_nh_list,
				 struct rt6_info *rt, struct fib6_config *r_cfg)
{
	struct rt6_nh *nh;
	int err = -EEXIST;

	list_for_each_entry(nh, rt6_nh_list, next) {
		/* check if rt6_info already exists */
		if (rt6_duplicate_nexthop(nh->rt6_info, rt))
			return err;
	}

	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
	if (!nh)
		return -ENOMEM;
	nh->rt6_info = rt;
	err = ip6_convert_metrics(&nh->mxc, r_cfg);
	if (err) {
		kfree(nh);
		return err;
	}
	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
	list_add_tail(&nh->next, rt6_nh_list);

	return 0;
}

static void ip6_route_mpath_notify(struct rt6_info *rt,
				   struct rt6_info *rt_last,
				   struct nl_info *info,
				   __u16 nlflags)
{
	/* if this is an APPEND route, then rt points to the first route
	 * inserted and rt_last points to last route inserted. Userspace
	 * wants a consistent dump of the route which starts at the first
	 * nexthop. Since sibling routes are always added at the end of
	 * the list, find the first sibling of the last route appended
	 */
	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
		rt = list_first_entry(&rt_last->rt6i_siblings,
				      struct rt6_info,
				      rt6i_siblings);
	}

	if (rt)
		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
}
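/* Insert an RTA_MULTIPATH route: each rtnexthop entry becomes its own
 * rt6_info, staged on a local list and inserted one by one; on failure
 * the already-inserted nexthops are rolled back, and one coherent
 * netlink notification is sent either way.
 */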
static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct rt6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct rt6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	__u16 nlflags;
	int remaining;
	int attrlen;
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);

	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * rt6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
		rt = ip6_route_info_create(&r_cfg, extack);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}

		rt->rt6i_nh_weight = rtnh->rtnh_hops + 1;

		err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
		if (err) {
			dst_release_immediate(&rt->dst);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* for add and replace send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		rt_last = nh->rt6_info;
		err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
		/* save reference to first route for notification */
		if (!rt_notif && !err)
			rt_notif = nh->rt6_info;

		/* nh->rt6_info is used or freed at this point, reset to NULL*/
		nh->rt6_info = NULL;
		if (err) {
			if (replace && nhn)
				ip6_print_replace_route_err(&rt6_nh_list);
			err_nh = nh;
			goto add_errout;
		}

		/* Because each route is added like a single route we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		nhn++;
	}

	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	goto cleanup;

add_errout:
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg, extack);
	}

cleanup:
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->rt6_info)
			dst_release_immediate(&nh->rt6_info->dst);
		kfree(nh->mxc.mx);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}

static int ip6_route_multipath_del(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	int remaining;
	int attrlen;
	int err = 1, last_err = 0;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry */
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
		}
		err = ip6_route_del(&r_cfg, extack);
		if (err)
			last_err = err;

		rtnh = rtnh_next(rtnh, &remaining);
	}

	return last_err;
}

static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct fib6_config cfg;
	int err;

	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
	if (err < 0)
		return err;

	if (cfg.fc_mp)
		return ip6_route_multipath_del(&cfg, extack);
	else {
		cfg.fc_delete_all_nh = 1;
		return ip6_route_del(&cfg, extack);
	}
}

static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct fib6_config cfg;
	int err;

	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
	if (err < 0)
		return err;

	if (cfg.fc_mp)
		return ip6_route_multipath_add(&cfg, extack);
	else
		return ip6_route_add(&cfg, extack);
}
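/* Upper bound on the netlink message size needed by rt6_fill_node(),
 * including one rtnexthop block per sibling for multipath routes.
 */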
static size_t rt6_nlmsg_size(struct rt6_info *rt)
{
	int nexthop_len = 0;

	if (rt->rt6i_nsiblings) {
		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
			    + NLA_ALIGN(sizeof(struct rtnexthop))
			    + nla_total_size(16) /* RTA_GATEWAY */
			    + lwtunnel_get_encap_size(rt->dst.lwtstate);

		nexthop_len *= rt->rt6i_nsiblings;
	}

	return NLMSG_ALIGN(sizeof(struct rtmsg))
	       + nla_total_size(16) /* RTA_SRC */
	       + nla_total_size(16) /* RTA_DST */
	       + nla_total_size(16) /* RTA_GATEWAY */
	       + nla_total_size(16) /* RTA_PREFSRC */
	       + nla_total_size(4) /* RTA_TABLE */
	       + nla_total_size(4) /* RTA_IIF */
	       + nla_total_size(4) /* RTA_OIF */
	       + nla_total_size(4) /* RTA_PRIORITY */
	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
	       + nla_total_size(sizeof(struct rta_cacheinfo))
	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
	       + nla_total_size(1) /* RTA_PREF */
	       + lwtunnel_get_encap_size(rt->dst.lwtstate)
	       + nexthop_len;
}

static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
			    unsigned int *flags, bool skip_oif)
{
	if (rt->rt6i_nh_flags & RTNH_F_DEAD)
		*flags |= RTNH_F_DEAD;

	if (rt->rt6i_nh_flags & RTNH_F_LINKDOWN) {
		*flags |= RTNH_F_LINKDOWN;
		if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
			*flags |= RTNH_F_DEAD;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
			goto nla_put_failure;
	}

	*flags |= (rt->rt6i_nh_flags & RTNH_F_ONLINK);
	if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
		*flags |= RTNH_F_OFFLOAD;

	/* not needed for multipath encoding b/c it has a rtnexthop struct */
	if (!skip_oif && rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;

	if (rt->dst.lwtstate &&
	    lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

/* add multipath next hop */
static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
{
	struct rtnexthop *rtnh;
	unsigned int flags = 0;

	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
	if (!rtnh)
		goto nla_put_failure;

	rtnh->rtnh_hops = rt->rt6i_nh_weight - 1;
	rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;

	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
		goto nla_put_failure;

	rtnh->rtnh_flags = flags;

	/* length of rtnetlink header + attributes */
	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
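/* Fill one RTM_NEWROUTE/RTM_DELROUTE message for rt. The reject-route
 * dst.error is mapped back to the RTN_* type userspace originally
 * requested, and multipath siblings are emitted inside RTA_MULTIPATH.
 */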
static int rt6_fill_node(struct net *net,
			 struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	u32 metrics[RTAX_MAX];
	struct rtmsg *rtm;
	struct nlmsghdr *nlh;
	long expires;
	u32 table;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt->rt6i_dst.plen;
	rtm->rtm_src_len = rt->rt6i_src.plen;
	rtm->rtm_tos = 0;
	if (rt->rt6i_table)
		table = rt->rt6i_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	rtm->rtm_table = table;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;
	if (rt->rt6i_flags & RTF_REJECT) {
		switch (rt->dst.error) {
		case -EINVAL:
			rtm->rtm_type = RTN_BLACKHOLE;
			break;
		case -EACCES:
			rtm->rtm_type = RTN_PROHIBIT;
			break;
		case -EAGAIN:
			rtm->rtm_type = RTN_THROW;
			break;
		default:
			rtm->rtm_type = RTN_UNREACHABLE;
			break;
		}
	}
	else if (rt->rt6i_flags & RTF_LOCAL)
		rtm->rtm_type = RTN_LOCAL;
	else if (rt->rt6i_flags & RTF_ANYCAST)
		rtm->rtm_type = RTN_ANYCAST;
	else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
		rtm->rtm_type = RTN_LOCAL;
	else
		rtm->rtm_type = RTN_UNICAST;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->rt6i_protocol;

	if (rt->rt6i_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	if (dst) {
		if (nla_put_in6_addr(skb, RTA_DST, dst))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dst) {
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->rt6i_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->rt6i_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	if (rt->rt6i_pmtu)
		metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt->rt6i_nsiblings) {
		struct rt6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (rt6_add_nexthop(skb, rt) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->rt6i_siblings, rt6i_siblings) {
			if (rt6_add_nexthop(skb, sibling) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
			goto nla_put_failure;
	}

	expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

int rt6_dump_route(struct rt6_info *rt, void *p_arg)
{
	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
	struct net *net = arg->net;

	if (rt == net->ipv6.ip6_null_entry)
		return 0;

	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);

		/* user wants prefix routes only */
		if (rtm->rtm_flags & RTM_F_PREFIX &&
		    !(rt->rt6i_flags & RTF_PREFIX_RT)) {
			/* success since this is not a prefix route */
			return 1;
		}
	}

	return rt6_fill_node(net,
		     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
		     NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
		     NLM_F_MULTI);
}
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6;
	bool fibmatch;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	memset(&fl6, 0, sizeof(fl6));
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (iif) {
		struct net_device *dev;
		int flags = 0;

		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			rcu_read_unlock();
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		dst = ip6_route_input_lookup(net, dev, &fl6, flags);

		rcu_read_unlock();
	} else {
		fl6.flowi6_oif = oif;

		dst = ip6_route_output(net, NULL, &fl6);
	}

	rt = container_of(dst, struct rt6_info, dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	if (fibmatch && rt->from) {
		struct rt6_info *ort = rt->from;

		dst_hold(&ort->dst);
		ip6_rt_put(rt);
		rt = ort;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	skb_dst_set(skb, &rt->dst);
	if (fibmatch)
		err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	else
		err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}
void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
		     unsigned int nlm_flags)
{
	struct sk_buff *skb;
	struct net *net = info->nl_net;
	u32 seq;
	int err;

	err = -ENOBUFS;
	seq = info->nlh ? info->nlh->nlmsg_seq : 0;

	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
	if (!skb)
		goto errout;

	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
			    event, info->portid, seq, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}

static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	} else if (event == NETDEV_UNREGISTER &&
		   dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER could be fired for multiple times by
		 * netdev_wait_allrefs(). Make sure we only call this once.
		 */
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}

#ifdef CONFIG_PROC_FS

static const struct file_operations ipv6_route_proc_fops = {
	.open		= ipv6_route_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};

static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;
	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}

static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
	return single_open_net(inode, file, rt6_stats_seq_show);
}

static const struct file_operations rt6_stats_seq_fops = {
	.open	 = rt6_stats_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release_net,
};
#endif	/* CONFIG_PROC_FS */
#ifdef CONFIG_SYSCTL

static
int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
			      void __user *buffer, size_t *lenp, loff_t *ppos)
{
	struct net *net;
	int delay;
	if (!write)
		return -EINVAL;

	net = (struct net *)ctl->extra1;
	delay = net->ipv6.sysctl.flush_delay;
	proc_dointvec(ctl, write, buffer, lenp, ppos);
	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
	return 0;
}

struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{ }
};
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
        struct ctl_table *table;

        table = kmemdup(ipv6_route_table_template,
                        sizeof(ipv6_route_table_template),
                        GFP_KERNEL);

        if (table) {
                table[0].data = &net->ipv6.sysctl.flush_delay;
                table[0].extra1 = net;
                table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
                table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
                table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
                table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
                table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
                table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
                table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
                table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
                table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;

                /* Don't export sysctls to unprivileged users */
                if (net->user_ns != &init_user_ns)
                        table[0].procname = NULL;
        }

        return table;
}
#endif
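
/* Per-netns initialization: copy the dst ops template, then allocate the
 * special route entries for this namespace: the null (unreachable) entry,
 * and, with CONFIG_IPV6_MULTIPLE_TABLES, the prohibit and blackhole
 * entries. Finally seed this namespace's routing sysctl defaults.
 */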
static int __net_init ip6_route_net_init(struct net *net)
{
        int ret = -ENOMEM;

        memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
               sizeof(net->ipv6.ip6_dst_ops));

        if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
                goto out_ip6_dst_ops;

        net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
                                           sizeof(*net->ipv6.ip6_null_entry),
                                           GFP_KERNEL);
        if (!net->ipv6.ip6_null_entry)
                goto out_ip6_dst_entries;
        net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
        dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
                         ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
        net->ipv6.fib6_has_custom_rules = false;
        net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
                                               sizeof(*net->ipv6.ip6_prohibit_entry),
                                               GFP_KERNEL);
        if (!net->ipv6.ip6_prohibit_entry)
                goto out_ip6_null_entry;
        net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
        dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
                         ip6_template_metrics, true);

        net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
                                               sizeof(*net->ipv6.ip6_blk_hole_entry),
                                               GFP_KERNEL);
        if (!net->ipv6.ip6_blk_hole_entry)
                goto out_ip6_prohibit_entry;
        net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
        dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
                         ip6_template_metrics, true);
#endif

        net->ipv6.sysctl.flush_delay = 0;
        net->ipv6.sysctl.ip6_rt_max_size = 4096;
        net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
        net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
        net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
        net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
        net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
        net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

        net->ipv6.ip6_rt_gc_expire = 30*HZ;

        ret = 0;
out:
        return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
        kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
        kfree(net->ipv6.ip6_null_entry);
#endif
out_ip6_dst_entries:
        dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
        goto out;
}

static void __net_exit ip6_route_net_exit(struct net *net)
{
        kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
        kfree(net->ipv6.ip6_prohibit_entry);
        kfree(net->ipv6.ip6_blk_hole_entry);
#endif
        dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}

static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
        proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
        proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
#endif
        return 0;
}

static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
        remove_proc_entry("ipv6_route", net->proc_net);
        remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
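
/* Pernet ops: ip6_route_net_ops covers the core per-netns routing state;
 * the proc entries live in the separate "late" ops below, which
 * ip6_route_init() registers only after the core ops and fib rules, so
 * the files never expose half-initialized state.
 */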
static struct pernet_operations ip6_route_net_ops = {
        .init = ip6_route_net_init,
        .exit = ip6_route_net_exit,
};

static int __net_init ipv6_inetpeer_init(struct net *net)
{
        struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

        if (!bp)
                return -ENOMEM;
        inet_peer_base_init(bp);
        net->ipv6.peers = bp;
        return 0;
}

static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
        struct inet_peer_base *bp = net->ipv6.peers;

        net->ipv6.peers = NULL;
        inetpeer_invalidate_tree(bp);
        kfree(bp);
}

static struct pernet_operations ipv6_inetpeer_ops = {
        .init = ipv6_inetpeer_init,
        .exit = ipv6_inetpeer_exit,
};

static struct pernet_operations ip6_route_net_late_ops = {
        .init = ip6_route_net_init_late,
        .exit = ip6_route_net_exit_late,
};

static struct notifier_block ip6_route_dev_notifier = {
        .notifier_call = ip6_route_dev_notify,
        /* run after addrconf's notifier: higher priority runs first */
        .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};

void __init ip6_route_init_special_entries(void)
{
        /* Registering of the loopback is done before this portion of code,
         * the loopback reference in rt6_info will not be taken, do it
         * manually for init_net */
        init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
        init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
        init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
        init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
        init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
        init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#endif
}
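
/* Module init: build the dst slab cache, register the pernet subsystems,
 * bring up the fib6, xfrm6 and fib rules layers, then register the
 * rtnetlink route handlers and the netdevice notifier, and initialize the
 * per-cpu uncached route lists. On any failure, unwind everything done so
 * far in reverse order via the goto chain at the bottom.
 */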
int __init ip6_route_init(void)
{
        int ret;
        int cpu;

        ret = -ENOMEM;
        ip6_dst_ops_template.kmem_cachep =
                kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
                                  SLAB_HWCACHE_ALIGN, NULL);
        if (!ip6_dst_ops_template.kmem_cachep)
                goto out;

        ret = dst_entries_init(&ip6_dst_blackhole_ops);
        if (ret)
                goto out_kmem_cache;

        ret = register_pernet_subsys(&ipv6_inetpeer_ops);
        if (ret)
                goto out_dst_entries;

        ret = register_pernet_subsys(&ip6_route_net_ops);
        if (ret)
                goto out_register_inetpeer;

        ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

        ret = fib6_init();
        if (ret)
                goto out_register_subsys;

        ret = xfrm6_init();
        if (ret)
                goto out_fib6_init;

        ret = fib6_rules_init();
        if (ret)
                goto xfrm6_init;

        ret = register_pernet_subsys(&ip6_route_net_late_ops);
        if (ret)
                goto fib6_rules_init;

        ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
                                   inet6_rtm_newroute, NULL, 0);
        if (ret < 0)
                goto out_register_late_subsys;

        ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
                                   inet6_rtm_delroute, NULL, 0);
        if (ret < 0)
                goto out_register_late_subsys;

        ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
                                   inet6_rtm_getroute, NULL,
                                   RTNL_FLAG_DOIT_UNLOCKED);
        if (ret < 0)
                goto out_register_late_subsys;

        ret = register_netdevice_notifier(&ip6_route_dev_notifier);
        if (ret)
                goto out_register_late_subsys;

        for_each_possible_cpu(cpu) {
                struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

                INIT_LIST_HEAD(&ul->head);
                spin_lock_init(&ul->lock);
        }

out:
        return ret;

out_register_late_subsys:
        rtnl_unregister_all(PF_INET6);
        unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
        fib6_rules_cleanup();
xfrm6_init:
        xfrm6_fini();
out_fib6_init:
        fib6_gc_cleanup();
out_register_subsys:
        unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
        unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
        dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
        kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
        goto out;
}
void ip6_route_cleanup(void)
{
        unregister_netdevice_notifier(&ip6_route_dev_notifier);
        unregister_pernet_subsys(&ip6_route_net_late_ops);
        fib6_rules_cleanup();
        xfrm6_fini();
        fib6_gc_cleanup();
        unregister_pernet_subsys(&ipv6_inetpeer_ops);
        unregister_pernet_subsys(&ip6_route_net_ops);
        dst_entries_destroy(&ip6_dst_blackhole_ops);
        kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}