// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable.  otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */
#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <linux/rtnetlink.h>
#include <net/dst_metadata.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <linux/uaccess.h>
#include <linux/btf_ids.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
static int ip6_rt_type_to_error(u8 fib6_type);

#define CREATE_TRACE_POINTS
#include <trace/events/fib6.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
#undef CREATE_TRACE_POINTS
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};
static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
static unsigned int	 ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
			       struct net_device *dev, int how);
static int		 ip6_dst_gc(struct dst_ops *ops);
static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int		ip6_pkt_prohibit(struct sk_buff *skb);
static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu,
					   bool confirm_neigh);
static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
			   int strict);
static size_t rt6_nlmsg_size(struct fib6_info *f6i);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr);
#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_add_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev,
					    unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev);
#endif
struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}
void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = blackhole_netdev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
static inline const void *choose_neigh_daddr(const struct in6_addr *p,
					     struct sk_buff *skb,
					     const void *daddr)
{
	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}
struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
{
	struct neighbour *n;

	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	if (n)
		return n;

	n = neigh_create(&nd_tbl, daddr, dev);
	return IS_ERR(n) ? NULL : n;
}
static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(rt6_nexthop(rt, &in6addr_any),
				dst->dev, skb, daddr);
}
static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(rt6_nexthop(rt, &in6addr_any), NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	dst_cow_metrics_generic,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};
static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu,
					 bool confirm_neigh)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			= AF_INET6,
	.destroy		= ip6_dst_destroy,
	.check			= ip6_dst_check,
	.mtu			= ip6_blackhole_mtu,
	.default_advmss		= ip6_default_advmss,
	.update_pmtu		= ip6_rt_blackhole_update_pmtu,
	.redirect		= ip6_rt_blackhole_redirect,
	.cow_metrics		= dst_cow_metrics_generic,
	.neigh_lookup		= ip6_dst_neigh_lookup,
};
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};
static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol	= RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= REFCOUNT_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}
/* allocate dst with ip6_dst_ops */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	ip_dst_metrics_put(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	from = xchg((__force struct fib6_info **)&rt->from, NULL);
	fib6_info_release(from);
}
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);

		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}
static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}
static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}
void fib6_select_path(const struct net *net, struct fib6_result *res,
		      struct flowi6 *fl6, int oif, bool have_oif_match,
		      const struct sk_buff *skb, int strict)
{
	struct fib6_info *sibling, *next_sibling;
	struct fib6_info *match = res->f6i;

	if (!match->nh && (!match->fib6_nsiblings || have_oif_match))
		goto out;

	if (match->nh && have_oif_match && res->nh)
		return;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash &&
	    (!match->nh || nexthop_is_multipath(match->nh)))
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (unlikely(match->nh)) {
		nexthop_path_fib6_result(res, fl6->mp_hash);
		return;
	}

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh->fib_nh_upper_bound))
		goto out;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		const struct fib6_nh *nh = sibling->fib6_nh;
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

out:
	res->f6i = match;
	res->nh = match->fib6_nh;
}
/*
 *	Route lookup. rcu_read_lock() should be held.
 */

static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh,
			       const struct in6_addr *saddr, int oif, int flags)
{
	const struct net_device *dev;

	if (nh->fib_nh_flags & RTNH_F_DEAD)
		return false;

	dev = nh->fib_nh_dev;
	if (oif) {
		if (dev->ifindex == oif)
			return true;
	} else {
		if (ipv6_chk_addr(net, saddr, dev,
				  flags & RT6_LOOKUP_F_IFACE))
			return true;
	}

	return false;
}
{
502 const struct in6_addr
*saddr
;
508 static int __rt6_nh_dev_match(struct fib6_nh
*nh
, void *_arg
)
510 struct fib6_nh_dm_arg
*arg
= _arg
;
513 return __rt6_device_match(arg
->net
, nh
, arg
->saddr
, arg
->oif
,
517 /* returns fib6_nh from nexthop or NULL */
518 static struct fib6_nh
*rt6_nh_dev_match(struct net
*net
, struct nexthop
*nh
,
519 struct fib6_result
*res
,
520 const struct in6_addr
*saddr
,
523 struct fib6_nh_dm_arg arg
= {
530 if (nexthop_is_blackhole(nh
))
533 if (nexthop_for_each_fib6_nh(nh
, __rt6_nh_dev_match
, &arg
))
static void rt6_device_match(struct net *net, struct fib6_result *res,
			     const struct in6_addr *saddr, int oif, int flags)
{
	struct fib6_info *f6i = res->f6i;
	struct fib6_info *spf6i;
	struct fib6_nh *nh;

	if (!oif && ipv6_addr_any(saddr)) {
		if (unlikely(f6i->nh)) {
			nh = nexthop_fib6_nh(f6i->nh);
			if (nexthop_is_blackhole(f6i->nh))
				goto out_blackhole;
		} else {
			nh = f6i->fib6_nh;
		}
		if (!(nh->fib_nh_flags & RTNH_F_DEAD))
			goto out;
	}

	for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) {
		bool matched = false;

		if (unlikely(spf6i->nh)) {
			nh = rt6_nh_dev_match(net, spf6i->nh, res, saddr,
					      oif, flags);
			if (nh)
				matched = true;
		} else {
			nh = spf6i->fib6_nh;
			if (__rt6_device_match(net, nh, saddr, oif, flags))
				matched = true;
		}
		if (matched) {
			res->f6i = spf6i;
			goto out;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE) {
		res->f6i = net->ipv6.fib6_null_entry;
		nh = res->f6i->fib6_nh;
		goto out;
	}

	if (unlikely(f6i->nh)) {
		nh = nexthop_fib6_nh(f6i->nh);
		if (nexthop_is_blackhole(f6i->nh))
			goto out_blackhole;
	} else {
		nh = f6i->fib6_nh;
	}

	if (nh->fib_nh_flags & RTNH_F_DEAD) {
		res->f6i = net->ipv6.fib6_null_entry;
		nh = res->f6i->fib6_nh;
	}
out:
	res->nh = nh;
	res->fib6_type = res->f6i->fib6_type;
	res->fib6_flags = res->f6i->fib6_flags;
	return;

out_blackhole:
	res->nh = nh;
	res->fib6_flags |= RTF_REJECT;
	res->fib6_type = RTN_BLACKHOLE;
}
#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}
static void rt6_probe(struct fib6_nh *fib6_nh)
{
	struct __rt6_probe_work *work = NULL;
	const struct in6_addr *nh_gw;
	unsigned long last_probe;
	struct neighbour *neigh;
	struct net_device *dev;
	struct inet6_dev *idev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!fib6_nh->fib_nh_gw_family)
		return;

	nh_gw = &fib6_nh->fib_nh_gw6;
	dev = fib6_nh->fib_nh_dev;
	rcu_read_lock_bh();
	last_probe = READ_ONCE(fib6_nh->last_probe);
	idev = __in6_dev_get(dev);
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else if (time_after(jiffies, last_probe +
				       idev->cnf.rtr_probe_interval)) {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (!work || cmpxchg(&fib6_nh->last_probe,
			     last_probe, jiffies) != last_probe) {
		kfree(work);
	} else {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct fib6_nh *fib6_nh)
{
}
#endif
/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev,
					  &fib6_nh->fib_nh_gw6);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
			   int strict)
{
	int m = 0;

	if (!oif || nh->fib_nh_dev->ifindex == oif)
		m = 2;

	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2;
#endif
	if ((strict & RT6_LOOKUP_F_REACHABLE) &&
	    !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) {
		int n = rt6_check_neigh(nh);

		if (n < 0)
			return n;
	}
	return m;
}
static bool find_match(struct fib6_nh *nh, u32 fib6_flags,
		       int oif, int strict, int *mpri, bool *do_rr)
{
	bool match_do_rr = false;
	bool rc = false;
	int m;

	if (nh->fib_nh_flags & RTNH_F_DEAD)
		goto out;

	if (ip6_ignore_linkdown(nh->fib_nh_dev) &&
	    nh->fib_nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	m = rt6_score_route(nh, fib6_flags, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(nh);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		rc = true;
	}
out:
	return rc;
}
struct fib6_nh_frl_arg {
	u32		flags;
	int		oif;
	int		strict;
	int		*mpri;
	bool		*do_rr;
	struct fib6_nh	*nh;
};

static int rt6_nh_find_match(struct fib6_nh *nh, void *_arg)
{
	struct fib6_nh_frl_arg *arg = _arg;

	arg->nh = nh;
	return find_match(nh, arg->flags, arg->oif, arg->strict,
			  arg->mpri, arg->do_rr);
}
static void __find_rr_leaf(struct fib6_info *f6i_start,
			   struct fib6_info *nomatch, u32 metric,
			   struct fib6_result *res, struct fib6_info **cont,
			   int oif, int strict, bool *do_rr, int *mpri)
{
	struct fib6_info *f6i;

	for (f6i = f6i_start;
	     f6i && f6i != nomatch;
	     f6i = rcu_dereference(f6i->fib6_next)) {
		bool matched = false;
		struct fib6_nh *nh;

		if (cont && f6i->fib6_metric != metric) {
			*cont = f6i;
			return;
		}

		if (fib6_check_expired(f6i))
			continue;

		if (unlikely(f6i->nh)) {
			struct fib6_nh_frl_arg arg = {
				.flags	= f6i->fib6_flags,
				.oif	= oif,
				.strict	= strict,
				.mpri	= mpri,
				.do_rr	= do_rr
			};

			if (nexthop_is_blackhole(f6i->nh)) {
				res->fib6_flags = RTF_REJECT;
				res->fib6_type = RTN_BLACKHOLE;
				res->f6i = f6i;
				res->nh = nexthop_fib6_nh(f6i->nh);
				break;
			}
			if (nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_find_match,
						     &arg)) {
				matched = true;
				nh = arg.nh;
			}
		} else {
			nh = f6i->fib6_nh;
			if (find_match(nh, f6i->fib6_flags, oif, strict,
				       mpri, do_rr))
				matched = true;
		}
		if (matched) {
			res->f6i = f6i;
			res->nh = nh;
			res->fib6_flags = f6i->fib6_flags;
			res->fib6_type = f6i->fib6_type;
		}
	}
}
static void find_rr_leaf(struct fib6_node *fn, struct fib6_info *leaf,
			 struct fib6_info *rr_head, int oif, int strict,
			 bool *do_rr, struct fib6_result *res)
{
	u32 metric = rr_head->fib6_metric;
	struct fib6_info *cont = NULL;
	int mpri = -1;

	__find_rr_leaf(rr_head, NULL, metric, res, &cont,
		       oif, strict, do_rr, &mpri);

	__find_rr_leaf(leaf, rr_head, metric, res, &cont,
		       oif, strict, do_rr, &mpri);

	if (res->f6i || !cont)
		return;

	__find_rr_leaf(cont, NULL, metric, res, NULL,
		       oif, strict, do_rr, &mpri);
}
static void rt6_select(struct net *net, struct fib6_node *fn, int oif,
		       struct fib6_result *res, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *rt0;
	bool do_rr = false;
	int key_plen;

	/* make sure this function or its helpers sets f6i */
	res->f6i = NULL;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		goto out;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		goto out;

	find_rr_leaf(fn, leaf, rt0, oif, strict, &do_rr, res);
	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

out:
	if (!res->f6i) {
		res->f6i = net->ipv6.fib6_null_entry;
		res->nh = res->f6i->fib6_nh;
		res->fib6_flags = res->f6i->fib6_flags;
		res->fib6_type = res->f6i->fib6_type;
	}
}
static bool rt6_is_gw_or_nonexthop(const struct fib6_result *res)
{
	return (res->f6i->fib6_flags & RTF_NONEXTHOP) ||
	       res->nh->fib_nh_gw_family;
}
#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(net, rt, false);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
#endif
/*
 *	Misc support functions
 */

/* called with rcu_lock held */
static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res)
{
	struct net_device *dev = res->nh->fib_nh_dev;

	if (res->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&res->f6i->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}
static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};

static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}
static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
{
	unsigned short flags = 0;

	if (rt->dst_nocount)
		flags |= DST_NOCOUNT;
	if (rt->dst_nopolicy)
		flags |= DST_NOPOLICY;

	return flags;
}
static void ip6_rt_init_dst_reject(struct rt6_info *rt, u8 fib6_type)
{
	rt->dst.error = ip6_rt_type_to_error(fib6_type);

	switch (fib6_type) {
	case RTN_BLACKHOLE:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}
static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res)
{
	struct fib6_info *f6i = res->f6i;

	if (res->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, res->fib6_type);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	if (res->fib6_type == RTN_LOCAL || res->fib6_type == RTN_ANYCAST) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&f6i->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (res->nh->fib_nh_lws) {
		rt->dst.lwtstate = lwtstate_get(res->nh->fib_nh_lws);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}
/* Caller must already hold reference to @from */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	rcu_assign_pointer(rt->from, from);
	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
}
/* Caller must already hold reference to f6i in result */
static void ip6_rt_copy_init(struct rt6_info *rt, const struct fib6_result *res)
{
	const struct fib6_nh *nh = res->nh;
	const struct net_device *dev = nh->fib_nh_dev;
	struct fib6_info *f6i = res->f6i;

	ip6_rt_init_dst(rt, res);

	rt->rt6i_dst = f6i->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_flags = res->fib6_flags;
	if (nh->fib_nh_gw_family) {
		rt->rt6i_gateway = nh->fib_nh_gw6;
		rt->rt6i_flags |= RTF_GATEWAY;
	}
	rt6_set_from(rt, f6i);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = f6i->fib6_src;
#endif
}
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}
static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (net) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}
/* called with rcu_lock held */
static struct rt6_info *ip6_create_rt_rcu(const struct fib6_result *res)
{
	struct net_device *dev = res->nh->fib_nh_dev;
	struct fib6_info *f6i = res->f6i;
	unsigned short flags;
	struct rt6_info *nrt;

	if (!fib6_info_hold_safe(f6i))
		goto fallback;

	flags = fib6_info_dst_flags(f6i);
	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (!nrt) {
		fib6_info_release(f6i);
		goto fallback;
	}

	ip6_rt_copy_init(nrt, res);
	return nrt;

fallback:
	nrt = dev_net(dev)->ipv6.ip6_null_entry;
	dst_hold(&nrt->dst);
	return nrt;
}
INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_result res = {};
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	res.f6i = rcu_dereference(fn->leaf);
	if (!res.f6i)
		res.f6i = net->ipv6.fib6_null_entry;
	else
		rt6_device_match(net, &res, &fl6->saddr, fl6->flowi6_oif,
				 flags);

	if (res.f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;

		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
		goto out;
	} else if (res.fib6_flags & RTF_REJECT) {
		goto do_create;
	}

	fib6_select_path(net, &res, fl6, fl6->flowi6_oif,
			 fl6->flowi6_oif != 0, skb, flags);

	/* Search through exception table */
	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);
	} else {
do_create:
		rt = ip6_create_rt_rcu(&res);
	}

out:
	trace_fib6_table_lookup(net, &res, table, fl6);

	rcu_read_unlock();

	return rt;
}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);
/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason, the
 * route is released.
 * Caller must hold dst before calling it.
 */

static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}
static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct fib6_info *f6i = res->f6i;
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (!fib6_info_hold_safe(f6i))
		return NULL;

	dev = ip6_rt_get_dev_rcu(res);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt) {
		fib6_info_release(f6i);
		return NULL;
	}

	ip6_rt_copy_init(rt, res);
	rt->rt6i_flags |= RTF_CACHE;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(res)) {
		if (f6i->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&f6i->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res)
{
	struct fib6_info *f6i = res->f6i;
	unsigned short flags = fib6_info_dst_flags(f6i);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	if (!fib6_info_hold_safe(f6i))
		return NULL;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(res);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags | DST_NOCOUNT);
	rcu_read_unlock();
	if (!pcpu_rt) {
		fib6_info_release(f6i);
		return NULL;
	}
	ip6_rt_copy_init(pcpu_rt, res);
	pcpu_rt->rt6i_flags |= RTF_PCPU;

	if (f6i->nh)
		pcpu_rt->sernum = rt_genid_ipv6(dev_net(dev));

	return pcpu_rt;
}
static bool rt6_is_valid(const struct rt6_info *rt6)
{
	return rt6->sernum == rt_genid_ipv6(dev_net(rt6->dst.dev));
}
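/* Note (added by the editor for clarity): pcpu routes built for a
 * fib6_info with an external nexthop record the per-netns route
 * generation id in ->sernum (see ip6_rt_pcpu_alloc() above).  Bumping the
 * genid, e.g. via rt_genid_bump_ipv6(net), makes rt6_is_valid() return
 * false, so the stale pcpu dst is dropped and rebuilt on the next lookup.
 */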
/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)
{
	struct rt6_info *pcpu_rt;

	pcpu_rt = this_cpu_read(*res->nh->rt6i_pcpu);

	if (pcpu_rt && pcpu_rt->sernum && !rt6_is_valid(pcpu_rt)) {
		struct rt6_info *prev, **p;

		p = this_cpu_ptr(res->nh->rt6i_pcpu);
		prev = xchg(p, NULL);
		if (prev) {
			dst_dev_put(&prev->dst);
			dst_release(&prev->dst);
		}

		pcpu_rt = NULL;
	}

	return pcpu_rt;
}
static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    const struct fib6_result *res)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(res);
	if (!pcpu_rt)
		return NULL;

	p = this_cpu_ptr(res->nh->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	if (res->f6i->fib6_destroying) {
		struct fib6_info *from;

		from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL);
		fib6_info_release(from);
	}

	return pcpu_rt;
}
/* exception hash table implementation
 */
static DEFINE_SPINLOCK(rt6_exception_lock);
/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct fib6_info *from;
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	net->ipv6.rt6_stats->fib_rt_cache--;

	/* purge completely the exception to allow releasing the held resources:
	 * some [sk] cache may keep the dst around for unlimited time
	 */
	from = xchg((__force struct fib6_info **)&rt6_ex->rt6i->from, NULL);
	fib6_info_release(from);
	dst_dev_put(&rt6_ex->rt6i->dst);

	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
}
/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}
static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash2((const u32 *)dst, sizeof(*dst)/sizeof(u32), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash2((const u32 *)src, sizeof(*src)/sizeof(u32), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
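/* Illustrative sketch (added by the editor; bucket-size constants come
 * from ip6_fib.h): the hash folds the 128-bit destination (and, with
 * subtrees, the source) into a small bucket index, which the lookup
 * helpers use as an offset into the kcalloc'ed bucket array:
 *
 *	u32 hval = rt6_exception_hash(daddr, saddr);
 *
 *	bucket += hval;		// as done in __rt6_find_exception_*()
 */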
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
static unsigned int fib6_mtu(const struct fib6_result *res)
{
	const struct fib6_nh *nh = res->nh;
	unsigned int mtu;

	if (res->f6i->fib6_pmtu) {
		mtu = res->f6i->fib6_pmtu;
	} else {
		struct net_device *dev = nh->fib_nh_dev;
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
}
#define FIB6_EXCEPTION_BUCKET_FLUSHED  0x1UL

/* used when the flushed bit is not relevant, only access to the bucket
 * (ie., all bucket users except rt6_insert_exception);
 *
 * called under rcu lock; sometimes called with rt6_exception_lock held
 */
static
struct rt6_exception_bucket *fib6_nh_get_excptn_bucket(const struct fib6_nh *nh,
						       spinlock_t *lock)
{
	struct rt6_exception_bucket *bucket;

	if (lock)
		bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
						   lockdep_is_held(lock));
	else
		bucket = rcu_dereference(nh->rt6i_exception_bucket);

	/* remove bucket flushed bit if set */
	if (bucket) {
		unsigned long p = (unsigned long)bucket;

		p &= ~FIB6_EXCEPTION_BUCKET_FLUSHED;
		bucket = (struct rt6_exception_bucket *)p;
	}

	return bucket;
}
static bool fib6_nh_excptn_bucket_flushed(struct rt6_exception_bucket *bucket)
{
	unsigned long p = (unsigned long)bucket;

	return !!(p & FIB6_EXCEPTION_BUCKET_FLUSHED);
}
/* called with rt6_exception_lock held */
static void fib6_nh_excptn_bucket_set_flushed(struct fib6_nh *nh,
					      spinlock_t *lock)
{
	struct rt6_exception_bucket *bucket;
	unsigned long p;

	bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
					   lockdep_is_held(lock));

	p = (unsigned long)bucket;
	p |= FIB6_EXCEPTION_BUCKET_FLUSHED;
	bucket = (struct rt6_exception_bucket *)p;
	rcu_assign_pointer(nh->rt6i_exception_bucket, bucket);
}
static int rt6_insert_exception(struct rt6_info *nrt,
				const struct fib6_result *res)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct fib6_info *f6i = res->f6i;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_nh *nh = res->nh;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
					  lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(nh->rt6i_exception_bucket, bucket);
	} else if (fib6_nh_excptn_bucket_flushed(bucket)) {
		err = -EINVAL;
		goto out;
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* fib6_src.plen != 0 indicates f6i is in subtree
	 * and exception table is indexed by a hash of
	 * both fib6_dst and fib6_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only fib6_dst.
	 */
	if (f6i->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif
	/* rt6_mtu_change() might lower mtu on f6i.
	 * Only insert this exception route if its mtu
	 * is less than f6i's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(res)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&f6i->fib6_table->tb6_lock);
		fib6_update_sernum(net, f6i);
		spin_unlock_bh(&f6i->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}
static void fib6_nh_flush_exceptions(struct fib6_nh *nh, struct fib6_info *from)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);

	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
	if (!bucket)
		goto out;

	/* Prevent rt6_insert_exception() to recreate the bucket list */
	if (!from)
		fib6_nh_excptn_bucket_set_flushed(nh, &rt6_exception_lock);

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) {
			if (!from ||
			    rcu_access_pointer(rt6_ex->rt6i->from) == from)
				rt6_remove_exception(bucket, rt6_ex);
		}
		WARN_ON_ONCE(!from && bucket->depth);
		bucket++;
	}
out:
	spin_unlock_bh(&rt6_exception_lock);
}

static int rt6_nh_flush_exceptions(struct fib6_nh *nh, void *arg)
{
	struct fib6_info *f6i = arg;

	fib6_nh_flush_exceptions(nh, f6i);

	return 0;
}

void rt6_flush_exceptions(struct fib6_info *f6i)
{
	if (f6i->nh)
		nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_flush_exceptions,
					 f6i);
	else
		fib6_nh_flush_exceptions(f6i->fib6_nh, f6i);
}
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	const struct in6_addr *src_key = NULL;
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct rt6_info *ret = NULL;

#ifdef CONFIG_IPV6_SUBTREES
	/* fib6i_src.plen != 0 indicates f6i is in subtree
	 * and exception table is indexed by a hash of
	 * both fib6_dst and fib6_src.
	 * However, the src addr used to create the hash
	 * might not be exactly the passed in saddr which
	 * is a /128 addr from the flow.
	 * So we need to use f6i->fib6_src to redo lookup
	 * if the passed in saddr does not find anything.
	 * (See the logic in ip6_rt_cache_alloc() on how
	 * rt->rt6i_src is updated.)
	 */
	if (res->f6i->fib6_src.plen)
		src_key = saddr;
find_ex:
#endif
	bucket = fib6_nh_get_excptn_bucket(res->nh, NULL);
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		ret = rt6_ex->rt6i;

#ifdef CONFIG_IPV6_SUBTREES
	/* Use fib6_src as src_key and redo lookup */
	if (!ret && src_key && src_key != &res->f6i->fib6_src.addr) {
		src_key = &res->f6i->fib6_src.addr;
		goto find_ex;
	}
#endif

	return ret;
}
/* Remove the passed in cached rt from the hash table that contains it */
static int fib6_nh_remove_exception(const struct fib6_nh *nh, int plen,
				    const struct rt6_info *rt)
{
	const struct in6_addr *src_key = NULL;
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int err;

	if (!rcu_access_pointer(nh->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}
struct fib6_nh_excptn_arg {
	struct rt6_info	*rt;
	int		plen;
};

static int rt6_nh_remove_exception_rt(struct fib6_nh *nh, void *_arg)
{
	struct fib6_nh_excptn_arg *arg = _arg;
	int err;

	err = fib6_nh_remove_exception(nh, arg->plen, arg->rt);
	if (err == 0)
		return 1;

	return 0;
}

static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);
	if (!from || !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (from->nh) {
		struct fib6_nh_excptn_arg arg = {
			.rt = rt,
			.plen = from->fib6_src.plen
		};
		int rc;

		/* rc = 1 means an entry was found */
		rc = nexthop_for_each_fib6_nh(from->nh,
					      rt6_nh_remove_exception_rt,
					      &arg);
		return rc ? 0 : -ENOENT;
	}

	return fib6_nh_remove_exception(from->fib6_nh,
					from->fib6_src.plen, rt);
}
/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void fib6_nh_update_exception(const struct fib6_nh *nh, int plen,
				     const struct rt6_info *rt)
{
	const struct in6_addr *src_key = NULL;
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;

	bucket = fib6_nh_get_excptn_bucket(nh, NULL);
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, &rt->rt6i_dst.addr, src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;
}
struct fib6_nh_match_arg {
	const struct net_device *dev;
	const struct in6_addr	*gw;
	struct fib6_nh		*match;
};

/* determine if fib6_nh has given device and gateway */
static int fib6_nh_find_match(struct fib6_nh *nh, void *_arg)
{
	struct fib6_nh_match_arg *arg = _arg;

	if (arg->dev != nh->fib_nh_dev ||
	    (arg->gw && !nh->fib_nh_gw_family) ||
	    (!arg->gw && nh->fib_nh_gw_family) ||
	    (arg->gw && !ipv6_addr_equal(arg->gw, &nh->fib_nh_gw6)))
		return 0;

	arg->match = nh;

	/* found a match, break the loop */
	return 1;
}
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct fib6_info *from;
	struct fib6_nh *fib6_nh;

	rcu_read_lock();

	from = rcu_dereference(rt->from);
	if (!from || !(rt->rt6i_flags & RTF_CACHE))
		goto unlock;

	if (from->nh) {
		struct fib6_nh_match_arg arg = {
			.dev = rt->dst.dev,
			.gw = &rt->rt6i_gateway,
		};

		nexthop_for_each_fib6_nh(from->nh, fib6_nh_find_match, &arg);

		if (!arg.match)
			goto unlock;
		fib6_nh = arg.match;
	} else {
		fib6_nh = from->fib6_nh;
	}
	fib6_nh_update_exception(fib6_nh, from->fib6_src.plen, rt);
unlock:
	rcu_read_unlock();
}
static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
	 *
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * handle this.
	 */

	if (dst_mtu(&rt->dst) >= mtu)
		return true;

	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
		return true;

	return false;
}
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       const struct fib6_nh *nh, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}
#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

static void fib6_nh_exceptions_clean_tohost(const struct fib6_nh *nh,
					    const struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(nh->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others have still references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}
static void fib6_nh_age_exceptions(const struct fib6_nh *nh,
				   struct fib6_gc_args *gc_args,
				   unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(nh->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}
struct fib6_nh_age_excptn_arg {
	struct fib6_gc_args	*gc_args;
	unsigned long		now;
};

static int rt6_nh_age_exceptions(struct fib6_nh *nh, void *_arg)
{
	struct fib6_nh_age_excptn_arg *arg = _arg;

	fib6_nh_age_exceptions(nh, arg->gc_args, arg->now);
	return 0;
}

void rt6_age_exceptions(struct fib6_info *f6i,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	if (f6i->nh) {
		struct fib6_nh_age_excptn_arg arg = {
			.gc_args = gc_args,
			.now = now
		};

		nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_age_exceptions,
					 &arg);
	} else {
		fib6_nh_age_exceptions(f6i->fib6_nh, gc_args, now);
	}
}
/* must be called with rcu lock held */
int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif,
		      struct flowi6 *fl6, struct fib6_result *res, int strict)
{
	struct fib6_node *fn, *saved_fn;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	rt6_select(net, fn, oif, res, strict);
	if (res->f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, res, table, fl6);

	return 0;
}
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_result res = {};
	struct rt6_info *rt = NULL;
	int strict = 0;

	WARN_ON_ONCE((flags & RT6_LOOKUP_F_DST_NOREF) &&
		     !rcu_read_lock_held());

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	fib6_table_lookup(net, table, oif, fl6, &res, strict);
	if (res.f6i == net->ipv6.fib6_null_entry)
		goto out;

	fib6_select_path(net, &res, fl6, oif, false, skb, strict);

	/* Search through exception table */
	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
	if (rt) {
		goto out;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !res.nh->fib_nh_gw_family)) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);

		if (rt) {
			/* 1 refcnt is taken during ip6_rt_cache_alloc().
			 * As rt6_uncached_list_add() does not consume refcnt,
			 * this refcnt is always returned to the caller even
			 * if caller sets RT6_LOOKUP_F_DST_NOREF flag.
			 */
			rt6_uncached_list_add(rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
			rcu_read_unlock();

			return rt;
		}
	} else {
		/* Get a percpu copy */
		local_bh_disable();
		rt = rt6_get_pcpu_route(&res);

		if (!rt)
			rt = rt6_make_pcpu_route(net, &res);

		local_bh_enable();
	}
out:
	if (!rt)
		rt = net->ipv6.ip6_null_entry;
	if (!(flags & RT6_LOOKUP_F_DST_NOREF))
		ip6_hold_safe(net, &rt);
	rcu_read_unlock();

	return rt;
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}
struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6,
					 const struct sk_buff *skb,
					 int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	if (!icmpv6_is_err(icmph->icmp6_type))
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowlabel(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}
/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 2:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			struct flow_keys keys;

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, 0);
				flkeys = &keys;
			}

			/* Inner can be v4 or v6 */
			if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
				hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
				hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
			} else if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
				hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
				hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
				hash_keys.tags.flow_label = flkeys->tags.flow_label;
				hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
			} else {
				/* Same as case 0 */
				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
				ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
			}
		} else {
			/* Same as case 0 */
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}
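/* Illustrative note (not part of this file's logic): the policy switched on
 * above is selected per network namespace through the
 * fib_multipath_hash_policy sysctl, e.g. from userspace:
 *
 *	# sysctl -w net.ipv6.fib_multipath_hash_policy=1
 *
 * Policy 0 hashes on L3 fields (addresses, flow label, protocol), policy 1
 * additionally mixes in the L4 ports, and policy 2 prefers the inner
 * headers of encapsulated traffic when an skb is available.
 */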
/* Called with rcu held */
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_DST_NOREF;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);

	skb_dst_drop(skb);
	skb_dst_set_noref(skb, ip6_route_input_lookup(net, skb->dev,
						      &fl6, skb, flags));
}
INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_output(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
}
struct dst_entry *ip6_route_output_flags_noref(struct net *net,
					       const struct sock *sk,
					       struct flowi6 *fl6, int flags)
{
	bool any_src;

	if (ipv6_addr_type(&fl6->daddr) &
	    (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
		struct dst_entry *dst;

		/* This function does not take refcnt on the dst */
		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	flags |= RT6_LOOKUP_F_DST_NOREF;
	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags_noref);
struct dst_entry *ip6_route_output_flags(struct net *net,
					 const struct sock *sk,
					 struct flowi6 *fl6,
					 int flags)
{
	struct dst_entry *dst;
	struct rt6_info *rt6;

	rcu_read_lock();
	dst = ip6_route_output_flags_noref(net, sk, fl6, flags);
	rt6 = (struct rt6_info *)dst;
	/* For dst cached in uncached_list, refcnt is already taken. */
	if (list_empty(&rt6->rt6i_uncached) && !dst_hold_safe(dst)) {
		dst = &net->ipv6.ip6_null_entry->dst;
		dst_hold(dst);
	}
	rcu_read_unlock();

	return dst;
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
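/* Example (a minimal sketch of how a caller is expected to drive the output
 * lookup; the destination, ifindex and surrounding function are hypothetical):
 *
 *	struct flowi6 fl6 = {
 *		.daddr = some_destination,
 *		.flowi6_oif = some_ifindex,
 *	};
 *	struct dst_entry *dst;
 *
 *	dst = ip6_route_output(net, sk, &fl6);
 *	if (dst->error) {
 *		dst_release(dst);
 *		return -EHOSTUNREACH;
 *	}
 *	... transmit via dst ...
 *	dst_release(dst);
 *
 * ip6_route_output() is the inline wrapper that calls
 * ip6_route_output_flags() with flags == 0.
 */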
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new = &rt->dst;
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}
/*
 *	Destination cache support functions
 */

static bool fib6_check(struct fib6_info *f6i, u32 cookie)
{
	u32 rt_cookie = 0;

	if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
		return false;

	if (fib6_check_expired(f6i))
		return false;

	return true;
}
static struct dst_entry *rt6_check(struct rt6_info *rt,
				   struct fib6_info *from,
				   u32 cookie)
{
	u32 rt_cookie = 0;

	if (!from || !fib6_get_cookie_safe(from, &rt_cookie) ||
	    rt_cookie != cookie)
		return NULL;

	if (rt6_check_expired(rt))
		return NULL;

	return &rt->dst;
}
static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
					    struct fib6_info *from,
					    u32 cookie)
{
	if (!__rt6_check_expired(rt) &&
	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
	    fib6_check(from, cookie))
		return &rt->dst;
	else
		return NULL;
}
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct dst_entry *dst_ret;
	struct fib6_info *from;
	struct rt6_info *rt;

	rt = container_of(dst, struct rt6_info, dst);

	if (rt->sernum)
		return rt6_is_valid(rt) ? dst : NULL;

	rcu_read_lock();

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	from = rcu_dereference(rt->from);

	if (from && (rt->rt6i_flags & RTF_PCPU ||
	    unlikely(!list_empty(&rt->rt6i_uncached))))
		dst_ret = rt6_dst_from_check(rt, from, cookie);
	else
		dst_ret = rt6_check(rt, from, cookie);

	rcu_read_unlock();

	return dst_ret;
}
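/* Illustrative sketch: ->check() is normally reached through helpers such as
 * dst_check()/__sk_dst_check(), with the cookie the socket captured when it
 * stored the dst. A hypothetical caller looks roughly like:
 *
 *	dst = dst_check(dst, inet6_sk(sk)->dst_cookie);
 *	if (!dst)
 *		... route again and store a fresh dst ...
 */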
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			rcu_read_lock();
			if (rt6_check_expired(rt)) {
				rt6_remove_exception_rt(rt);
				dst = NULL;
			}
			rcu_read_unlock();
		} else {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}
static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		rcu_read_lock();
		if (rt->rt6i_flags & RTF_CACHE) {
			rt6_remove_exception_rt(rt);
		} else {
			struct fib6_info *from;
			struct fib6_node *fn;

			from = rcu_dereference(rt->from);
			if (from) {
				fn = rcu_dereference(from->fib6_node);
				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
					fn->fn_sernum = -1;
			}
		}
		rcu_read_unlock();
	}
}
static void rt6_update_expires(struct rt6_info *rt0, int timeout)
{
	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
		struct fib6_info *from;

		rcu_read_lock();
		from = rcu_dereference(rt0->from);
		if (from)
			rt0->dst.expires = from->expires;
		rcu_read_unlock();
	}

	dst_set_expires(&rt0->dst, timeout);
	rt0->rt6i_flags |= RTF_EXPIRES;
}
static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
	struct net *net = dev_net(rt->dst.dev);

	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
	rt->rt6i_flags |= RTF_MODIFIED;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}
static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
{
	return !(rt->rt6i_flags & RTF_CACHE) &&
		(rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from));
}
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu,
				 bool confirm_neigh)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	/* Note: do *NOT* check dst_metric_locked(dst, RTAX_MTU)
	 * IPv6 pmtu discovery isn't optional, so 'mtu lock' cannot disable it.
	 * [see also comment in rt6_mtu_change_route()]
	 */

	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}

	if (confirm_neigh)
		dst_confirm_neigh(dst, daddr);

	if (mtu < IPV6_MIN_MTU)
		return;
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
	} else if (daddr) {
		struct fib6_result res = {};
		struct rt6_info *nrt6;

		rcu_read_lock();
		res.f6i = rcu_dereference(rt6->from);
		if (!res.f6i)
			goto out_unlock;

		res.fib6_flags = res.f6i->fib6_flags;
		res.fib6_type = res.f6i->fib6_type;

		if (res.f6i->nh) {
			struct fib6_nh_match_arg arg = {
				.dev = dst->dev,
				.gw = &rt6->rt6i_gateway,
			};

			nexthop_for_each_fib6_nh(res.f6i->nh,
						 fib6_nh_find_match, &arg);

			/* fib6_info uses a nexthop that does not have fib6_nh
			 * using the dst->dev + gw. Should be impossible.
			 */
			if (!arg.match)
				goto out_unlock;

			res.nh = arg.match;
		} else {
			res.nh = res.f6i->fib6_nh;
		}

		nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);
			if (rt6_insert_exception(nrt6, &res))
				dst_release_immediate(&nrt6->dst);
		}
out_unlock:
		rcu_read_unlock();
	}
}
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu,
			       bool confirm_neigh)
{
	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu,
			     confirm_neigh);
}
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
		     int oif, u32 mark, kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_uid = uid,
	};

	dst = ip6_route_output(net, NULL, &fl6);
	if (!dst->error)
		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu), true);
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);
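/* Example (hedged sketch): tunnel drivers typically call this from their
 * ICMPv6 error handler when a PKT_TOOBIG arrives for an encapsulated flow,
 * roughly:
 *
 *	if (type == ICMPV6_PKT_TOOBIG)
 *		ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
 *
 * where 'info' carries the MTU reported by the router, in network byte
 * order as the __be32 argument above expects.
 */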
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	int oif = sk->sk_bound_dev_if;
	struct dst_entry *dst;

	if (!oif && skb->dev)
		oif = l3mdev_master_ifindex(skb->dev);

	ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);

	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
			   const struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_SUBTREES
	struct ipv6_pinfo *np = inet6_sk(sk);
#endif

	ip6_dst_store(sk, dst,
		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
		      &sk->sk_v6_daddr : NULL,
#ifdef CONFIG_IPV6_SUBTREES
		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
		      &np->saddr :
#endif
		      NULL);
}
static bool ip6_redirect_nh_match(const struct fib6_result *res,
				  struct flowi6 *fl6,
				  const struct in6_addr *gw,
				  struct rt6_info **ret)
{
	const struct fib6_nh *nh = res->nh;

	if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family ||
	    fl6->flowi6_oif != nh->fib_nh_dev->ifindex)
		return false;

	/* rt_cache's gateway might be different from its 'parent'
	 * in the case of an ip redirect.
	 * So we keep searching in the exception table if the gateway
	 * is different.
	 */
	if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) {
		struct rt6_info *rt_cache;

		rt_cache = rt6_find_cached_rt(res, &fl6->daddr, &fl6->saddr);
		if (rt_cache &&
		    ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) {
			*ret = rt_cache;
			return true;
		}
		return false;
	}
	return true;
}
struct fib6_nh_rd_arg {
	struct fib6_result	*res;
	struct flowi6		*fl6;
	const struct in6_addr	*gw;
	struct rt6_info		**ret;
};
static int fib6_nh_redirect_match(struct fib6_nh *nh, void *_arg)
{
	struct fib6_nh_rd_arg *arg = _arg;

	arg->res->nh = nh;
	return ip6_redirect_nh_match(arg->res, arg->fl6, arg->gw, arg->ret);
}
/* Handle redirects */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;
};

INDIRECT_CALLABLE_SCOPE struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *ret = NULL;
	struct fib6_result res = {};
	struct fib6_nh_rd_arg arg = {
		.res = &res,
		.fl6 = fl6,
		.gw  = &rdfl->gateway,
		.ret = &ret
	};
	struct fib6_info *rt;
	struct fib6_node *fn;

	/* l3mdev_update_flow overrides oif if the device is enslaved; in
	 * this case we must match on the real ingress device, so reset it
	 */
	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		fl6->flowi6_oif = skb->dev->ifindex;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for_each_fib6_node_rt_rcu(fn) {
		res.f6i = rt;
		if (fib6_check_expired(rt))
			continue;
		if (rt->fib6_flags & RTF_REJECT)
			break;
		if (unlikely(rt->nh)) {
			if (nexthop_is_blackhole(rt->nh))
				continue;
			/* on match, res->nh is filled in and potentially ret */
			if (nexthop_for_each_fib6_nh(rt->nh,
						     fib6_nh_redirect_match,
						     &arg))
				goto out;
		} else {
			res.nh = rt->fib6_nh;
			if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway,
						  &ret))
				goto out;
		}
	}

	if (!rt)
		rt = net->ipv6.fib6_null_entry;
	else if (rt->fib6_flags & RTF_REJECT) {
		ret = net->ipv6.ip6_null_entry;
		goto out;
	}

	if (rt == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	res.f6i = rt;
	res.nh = rt->fib6_nh;
out:
	if (ret) {
		ip6_hold_safe(net, &ret);
	} else {
		res.fib6_flags = res.f6i->fib6_flags;
		res.fib6_type = res.f6i->fib6_type;
		ret = ip6_create_rt_rcu(&res);
	}

	rcu_read_unlock();

	trace_fib6_table_lookup(net, &res, table, fl6);
	return ret;
}
static struct dst_entry *ip6_route_redirect(struct net *net,
					    const struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    const struct in6_addr *gateway)
{
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip6rd_flowi rdfl;

	rdfl.fl6 = *fl6;
	rdfl.gateway = *gateway;

	return fib6_rule_lookup(net, &rdfl.fl6, skb,
				flags, __ip6_route_redirect);
}
void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
		  kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_iif = LOOPBACK_IFINDEX,
		.flowi6_oif = oif,
		.flowi6_mark = mark,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_uid = uid,
	};

	dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_redirect);
void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_iif = LOOPBACK_IFINDEX,
		.flowi6_oif = oif,
		.daddr = msg->dest,
		.saddr = iph->daddr,
		.flowi6_uid = sock_net_uid(net, NULL),
	};

	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
		     sk->sk_uid);
}
EXPORT_SYMBOL_GPL(ip6_sk_redirect);
static unsigned int ip6_default_advmss(const struct dst_entry *dst)
{
	struct net_device *dev = dst->dev;
	unsigned int mtu = dst_mtu(dst);
	struct net *net = dev_net(dev);

	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);

	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;

	/*
	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
	 * IPV6_MAXPLEN is also valid and means: "any MSS,
	 * rely only on pmtu discovery"
	 */
	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
		mtu = IPV6_MAXPLEN;
	return mtu;
}
static unsigned int ip6_mtu(const struct dst_entry *dst)
{
	struct inet6_dev *idev;
	unsigned int mtu;

	mtu = dst_metric_raw(dst, RTAX_MTU);
	if (mtu)
		goto out;

	mtu = IPV6_MIN_MTU;

	rcu_read_lock();
	idev = __in6_dev_get(dst->dev);
	if (idev)
		mtu = idev->cnf.mtu6;
	rcu_read_unlock();

out:
	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}
/* MTU selection:
 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 *
 * based on ip6_dst_mtu_forward and exception logic of
 * rt6_find_cached_rt; called with rcu_read_lock
 */
u32 ip6_mtu_from_fib6(const struct fib6_result *res,
		      const struct in6_addr *daddr,
		      const struct in6_addr *saddr)
{
	const struct fib6_nh *nh = res->nh;
	struct fib6_info *f6i = res->f6i;
	struct inet6_dev *idev;
	struct rt6_info *rt;
	u32 mtu = 0;

	if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
		mtu = f6i->fib6_pmtu;
		if (mtu)
			goto out;
	}

	rt = rt6_find_cached_rt(res, daddr, saddr);
	if (unlikely(rt)) {
		mtu = dst_metric_raw(&rt->dst, RTAX_MTU);
	} else {
		struct net_device *dev = nh->fib_nh_dev;

		mtu = IPV6_MIN_MTU;
		idev = __in6_dev_get(dev);
		if (idev && idev->cnf.mtu6 > mtu)
			mtu = idev->cnf.mtu6;
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
out:
	return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
}
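/* Worked example (illustrative): with RTAX_MTU locked to 1400 on the route,
 * a cached exception of 1280 and an egress device mtu6 of 1500, the order
 * above yields 1400; without the lock the exception's 1280 wins, and with
 * neither, the device's 1500 is used (all minus any lwtunnel encap
 * headroom).
 */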
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		in6_dev_put(idev);
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	rt->dst.input = ip6_input;
	rt->dst.output  = ip6_output;
	rt->rt6i_gateway  = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev     = idev;
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Add this dst into uncached_list so that rt6_disable_ip() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);
	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}
static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	if (entries > rt_max_size)
		entries = dst_entries_get_slow(ops);

	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
	return entries > rt_max_size;
}
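/* Illustrative note: the thresholds consumed above map onto per-netns
 * sysctls, so GC frequency and aggressiveness can be tuned at runtime, e.g.:
 *
 *	# sysctl net.ipv6.route.gc_min_interval
 *	# sysctl net.ipv6.route.max_size
 *	# sysctl net.ipv6.route.gc_elasticity
 *	# sysctl net.ipv6.route.gc_timeout
 */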
static int ip6_nh_lookup_table(struct net *net, struct fib6_config *cfg,
			       const struct in6_addr *gw_addr, u32 tbid,
			       int flags, struct fib6_result *res)
{
	struct flowi6 fl6 = {
		.flowi6_oif = cfg->fc_ifindex,
		.daddr = *gw_addr,
		.saddr = cfg->fc_prefsrc,
	};
	struct fib6_table *table;
	int err;

	table = fib6_get_table(net, tbid);
	if (!table)
		return -EINVAL;

	if (!ipv6_addr_any(&cfg->fc_prefsrc))
		flags |= RT6_LOOKUP_F_HAS_SADDR;

	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;

	err = fib6_table_lookup(net, table, cfg->fc_ifindex, &fl6, res, flags);
	if (!err && res->f6i != net->ipv6.fib6_null_entry)
		fib6_select_path(net, res, &fl6, cfg->fc_ifindex,
				 cfg->fc_ifindex != 0, NULL, flags);

	return err;
}
static int ip6_route_check_nh_onlink(struct net *net,
				     struct fib6_config *cfg,
				     const struct net_device *dev,
				     struct netlink_ext_ack *extack)
{
	u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	struct fib6_result res = {};
	int err;

	err = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0, &res);
	if (!err && !(res.fib6_flags & RTF_REJECT) &&
	    /* ignore match if it is the default route */
	    !ipv6_addr_any(&res.f6i->fib6_dst.addr) &&
	    (res.fib6_type != RTN_UNICAST || dev != res.nh->fib_nh_dev)) {
		NL_SET_ERR_MSG(extack,
			       "Nexthop has invalid gateway or device mismatch");
		err = -EINVAL;
	}

	return err;
}
static int ip6_route_check_nh(struct net *net,
			      struct fib6_config *cfg,
			      struct net_device **_dev,
			      struct inet6_dev **idev)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	struct net_device *dev = _dev ? *_dev : NULL;
	int flags = RT6_LOOKUP_F_IFACE;
	struct fib6_result res = {};
	int err = -EHOSTUNREACH;

	if (cfg->fc_table) {
		err = ip6_nh_lookup_table(net, cfg, gw_addr,
					  cfg->fc_table, flags, &res);
		/* gw_addr can not require a gateway or resolve to a reject
		 * route. If a device is given, it must match the result.
		 */
		if (err || res.fib6_flags & RTF_REJECT ||
		    res.nh->fib_nh_gw_family ||
		    (dev && dev != res.nh->fib_nh_dev))
			err = -EHOSTUNREACH;
	}

	if (err < 0) {
		struct flowi6 fl6 = {
			.flowi6_oif = cfg->fc_ifindex,
			.daddr = *gw_addr,
		};

		err = fib6_lookup(net, cfg->fc_ifindex, &fl6, &res, flags);
		if (err || res.fib6_flags & RTF_REJECT ||
		    res.nh->fib_nh_gw_family)
			err = -EHOSTUNREACH;

		if (err)
			return err;

		fib6_select_path(net, &res, &fl6, cfg->fc_ifindex,
				 cfg->fc_ifindex != 0, NULL, flags);
	}

	err = 0;
	if (dev) {
		if (dev != res.nh->fib_nh_dev)
			err = -EHOSTUNREACH;
	} else {
		*_dev = dev = res.nh->fib_nh_dev;
		dev_hold(dev);
		*idev = in6_dev_get(dev);
	}

	return err;
}
static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
			   struct net_device **_dev, struct inet6_dev **idev,
			   struct netlink_ext_ack *extack)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	int gwa_type = ipv6_addr_type(gw_addr);
	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
	const struct net_device *dev = *_dev;
	bool need_addr_check = !dev;
	int err = -EINVAL;

	/* if gw_addr is local we will fail to detect this in case
	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
	 * will return already-added prefix route via interface that
	 * prefix route was assigned to, which might be non-loopback.
	 */
	if (dev &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
		/* IPv6 strictly inhibits using not link-local
		 * addresses as nexthop address.
		 * Otherwise, router will not able to send redirects.
		 * It is very good, but in some (rare!) circumstances
		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
		 * some exceptions. --ANK
		 * We allow IPv4-mapped nexthops to support RFC4798-type
		 * addressing
		 */
		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
			NL_SET_ERR_MSG(extack, "Invalid gateway address");
			goto out;
		}

		rcu_read_lock();

		if (cfg->fc_flags & RTNH_F_ONLINK)
			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
		else
			err = ip6_route_check_nh(net, cfg, _dev, idev);

		rcu_read_unlock();

		if (err)
			goto out;
	}

	/* reload in case device was changed */
	dev = *_dev;

	err = -EINVAL;
	if (!dev) {
		NL_SET_ERR_MSG(extack, "Egress device not specified");
		goto out;
	} else if (dev->flags & IFF_LOOPBACK) {
		NL_SET_ERR_MSG(extack,
			       "Egress device can not be loopback device for this route");
		goto out;
	}

	/* if we did not check gw_addr above, do so now that the
	 * egress device has been resolved.
	 */
	if (need_addr_check &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	err = 0;
out:
	return err;
}
static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type)
{
	if ((flags & RTF_REJECT) ||
	    (dev && (dev->flags & IFF_LOOPBACK) &&
	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
	     !(flags & (RTF_ANYCAST | RTF_LOCAL))))
		return true;

	return false;
}
int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
		 struct fib6_config *cfg, gfp_t gfp_flags,
		 struct netlink_ext_ack *extack)
{
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	int addr_type;
	int err;

	fib6_nh->fib_nh_family = AF_INET6;
#ifdef CONFIG_IPV6_ROUTER_PREF
	fib6_nh->last_probe = jiffies;
#endif
	if (cfg->fc_is_fdb) {
		fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
		fib6_nh->fib_nh_gw_family = AF_INET6;
		return 0;
	}

	err = -ENODEV;
	if (cfg->fc_ifindex) {
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_flags & RTNH_F_ONLINK) {
		if (!dev) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop device required for onlink");
			goto out;
		}

		if (!(dev->flags & IFF_UP)) {
			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
			err = -ENETDOWN;
			goto out;
		}

		fib6_nh->fib_nh_flags |= RTNH_F_ONLINK;
	}

	fib6_nh->fib_nh_weight = 1;

	/* We cannot add true routes via loopback here,
	 * they would result in kernel looping; promote them to reject routes
	 */
	addr_type = ipv6_addr_type(&cfg->fc_dst);
	if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		goto pcpu_alloc;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
		if (err)
			goto out;

		fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
		fib6_nh->fib_nh_gw_family = AF_INET6;
	}

	err = -ENODEV;
	if (!dev)
		goto out;

	if (idev->cnf.disable_ipv6) {
		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
		err = -EACCES;
		goto out;
	}

	if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) {
		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
		err = -ENETDOWN;
		goto out;
	}

	if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
	    !netif_carrier_ok(dev))
		fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;

	err = fib_nh_common_init(net, &fib6_nh->nh_common, cfg->fc_encap,
				 cfg->fc_encap_type, cfg, gfp_flags, extack);
	if (err)
		goto out;

pcpu_alloc:
	fib6_nh->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags);
	if (!fib6_nh->rt6i_pcpu) {
		err = -ENOMEM;
		goto out;
	}

	fib6_nh->fib_nh_dev = dev;
	fib6_nh->fib_nh_oif = dev->ifindex;
	err = 0;
out:
	if (idev)
		in6_dev_put(idev);

	if (err) {
		lwtstate_put(fib6_nh->fib_nh_lws);
		fib6_nh->fib_nh_lws = NULL;
		if (dev)
			dev_put(dev);
	}

	return err;
}
void fib6_nh_release(struct fib6_nh *fib6_nh)
{
	struct rt6_exception_bucket *bucket;

	rcu_read_lock();

	fib6_nh_flush_exceptions(fib6_nh, NULL);
	bucket = fib6_nh_get_excptn_bucket(fib6_nh, NULL);
	if (bucket) {
		rcu_assign_pointer(fib6_nh->rt6i_exception_bucket, NULL);
		kfree(bucket);
	}

	rcu_read_unlock();

	if (fib6_nh->rt6i_pcpu) {
		int cpu;

		for_each_possible_cpu(cpu) {
			struct rt6_info **ppcpu_rt;
			struct rt6_info *pcpu_rt;

			ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu);
			pcpu_rt = *ppcpu_rt;
			if (pcpu_rt) {
				dst_dev_put(&pcpu_rt->dst);
				dst_release(&pcpu_rt->dst);
				*ppcpu_rt = NULL;
			}
		}

		free_percpu(fib6_nh->rt6i_pcpu);
	}

	fib_nh_common_release(&fib6_nh->nh_common);
}
static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
					       gfp_t gfp_flags,
					       struct netlink_ext_ack *extack)
{
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct fib6_info *rt = NULL;
	struct nexthop *nh = NULL;
	struct fib6_table *table;
	struct fib6_nh *fib6_nh;
	int err = -EINVAL;
	int addr_type;

	/* RTF_PCPU is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_PCPU) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
		goto out;
	}

	/* RTF_CACHE is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_CACHE) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
		goto out;
	}

	if (cfg->fc_type > RTN_MAX) {
		NL_SET_ERR_MSG(extack, "Invalid route type");
		goto out;
	}

	if (cfg->fc_dst_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		goto out;
	}
	if (cfg->fc_src_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid source address length");
		goto out;
	}
#ifndef CONFIG_IPV6_SUBTREES
	if (cfg->fc_src_len) {
		NL_SET_ERR_MSG(extack,
			       "Specifying source address requires IPV6_SUBTREES to be enabled");
		goto out;
	}
#endif
	if (cfg->fc_nh_id) {
		nh = nexthop_find_by_id(net, cfg->fc_nh_id);
		if (!nh) {
			NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
			goto out;
		}
		err = fib6_check_nexthop(nh, cfg, extack);
		if (err)
			goto out;
	}

	err = -ENOBUFS;
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
		if (!table) {
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}

	if (!table)
		goto out;

	err = -ENOMEM;
	rt = fib6_info_alloc(gfp_flags, !nh);
	if (!rt)
		goto out;

	rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
					       extack);
	if (IS_ERR(rt->fib6_metrics)) {
		err = PTR_ERR(rt->fib6_metrics);
		/* Do not leave garbage there. */
		rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
		goto out_free;
	}

	if (cfg->fc_flags & RTF_ADDRCONF)
		rt->dst_nocount = true;

	if (cfg->fc_flags & RTF_EXPIRES)
		fib6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
	else
		fib6_clean_expires(rt);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->fib6_protocol = cfg->fc_protocol;

	rt->fib6_table = table;
	rt->fib6_metric = cfg->fc_metric;
	rt->fib6_type = cfg->fc_type ? : RTN_UNICAST;
	rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY;

	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->fib6_dst.plen = cfg->fc_dst_len;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->fib6_src.plen = cfg->fc_src_len;
#endif
	if (nh) {
		if (rt->fib6_src.plen) {
			NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing");
			goto out_free;
		}
		if (!nexthop_get(nh)) {
			NL_SET_ERR_MSG(extack, "Nexthop has been deleted");
			goto out_free;
		}
		rt->nh = nh;
		fib6_nh = nexthop_fib6_nh(rt->nh);
	} else {
		err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack);
		if (err)
			goto out;

		fib6_nh = rt->fib6_nh;

		/* We cannot add true routes via loopback here, they would
		 * result in kernel looping; promote them to reject routes
		 */
		addr_type = ipv6_addr_type(&cfg->fc_dst);
		if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh->fib_nh_dev,
				   addr_type))
			rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;
	}

	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		struct net_device *dev = fib6_nh->fib_nh_dev;

		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid source address");
			err = -EINVAL;
			goto out;
		}
		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
		rt->fib6_prefsrc.plen = 128;
	} else
		rt->fib6_prefsrc.plen = 0;

	return rt;
out:
	fib6_info_release(rt);
	return ERR_PTR(err);
out_free:
	ip_fib_metrics_put(rt->fib6_metrics);
	kfree(rt);
	return ERR_PTR(err);
}
int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
		  struct netlink_ext_ack *extack)
{
	struct fib6_info *rt;
	int err;

	rt = ip6_route_info_create(cfg, gfp_flags, extack);
	if (IS_ERR(rt))
		return PTR_ERR(rt);

	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
	fib6_info_release(rt);

	return err;
}
static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
{
	struct net *net = info->nl_net;
	struct fib6_table *table;
	int err;

	if (rt == net->ipv6.fib6_null_entry) {
		err = -ENOENT;
		goto out;
	}

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_del(rt, info);
	spin_unlock_bh(&table->tb6_lock);

out:
	fib6_info_release(rt);
	return err;
}
int ip6_del_rt(struct net *net, struct fib6_info *rt, bool skip_notify)
{
	struct nl_info info = {
		.nl_net = net,
		.skip_notify = skip_notify
	};

	return __ip6_del_rt(rt, &info);
}
static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	int err = -ENOENT;

	if (rt == net->ipv6.fib6_null_entry)
		goto out_put;
	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);

	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
		struct fib6_info *sibling, *next_sibling;
		struct fib6_node *fn;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			if (rt6_fill_node(net, skb, rt, NULL,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				kfree_skb(skb);
				skb = NULL;
			} else
				info->skip_notify = 1;
		}

		/* 'rt' points to the first sibling route. If it is not the
		 * leaf, then we do not need to send a notification. Otherwise,
		 * we need to check if the last sibling has a next route or not
		 * and emit a replace or delete notification, respectively.
		 */
		info->skip_notify_kernel = 1;
		fn = rcu_dereference_protected(rt->fib6_node,
					    lockdep_is_held(&table->tb6_lock));
		if (rcu_access_pointer(fn->leaf) == rt) {
			struct fib6_info *last_sibling, *replace_rt;

			last_sibling = list_last_entry(&rt->fib6_siblings,
						       struct fib6_info,
						       fib6_siblings);
			replace_rt = rcu_dereference_protected(
					    last_sibling->fib6_next,
					    lockdep_is_held(&table->tb6_lock));
			if (replace_rt)
				call_fib6_entry_notifiers_replace(net,
								  replace_rt);
			else
				call_fib6_multipath_entry_notifiers(net,
						       FIB_EVENT_ENTRY_DEL,
						       rt, rt->fib6_nsiblings,
						       NULL);
		}
		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings,
					 fib6_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
		}
	}

	err = fib6_del(rt, info);
out_unlock:
	spin_unlock_bh(&table->tb6_lock);
out_put:
	fib6_info_release(rt);

	if (skb) {
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
	return err;
}
static int __ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
{
	int rc = -ESRCH;

	if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
		goto out;

	if (cfg->fc_flags & RTF_GATEWAY &&
	    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
		goto out;

	rc = rt6_remove_exception_rt(rt);
out:
	return rc;
}
static int ip6_del_cached_rt(struct fib6_config *cfg, struct fib6_info *rt,
			     struct fib6_nh *nh)
{
	struct fib6_result res = {
		.f6i = rt,
		.nh = nh,
	};
	struct rt6_info *rt_cache;

	rt_cache = rt6_find_cached_rt(&res, &cfg->fc_dst, &cfg->fc_src);
	if (rt_cache)
		return __ip6_del_cached_rt(rt_cache, cfg);

	return 0;
}
struct fib6_nh_del_cached_rt_arg {
	struct fib6_config *cfg;
	struct fib6_info *f6i;
};

static int fib6_nh_del_cached_rt(struct fib6_nh *nh, void *_arg)
{
	struct fib6_nh_del_cached_rt_arg *arg = _arg;
	int rc;

	rc = ip6_del_cached_rt(arg->cfg, arg->f6i, nh);
	return rc != -ESRCH ? rc : 0;
}

static int ip6_del_cached_rt_nh(struct fib6_config *cfg, struct fib6_info *f6i)
{
	struct fib6_nh_del_cached_rt_arg arg = {
		.cfg = cfg,
		.f6i = f6i
	};

	return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_del_cached_rt, &arg);
}
static int ip6_route_del(struct fib6_config *cfg,
			 struct netlink_ext_ack *extack)
{
	struct fib6_table *table;
	struct fib6_info *rt;
	struct fib6_node *fn;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table) {
		NL_SET_ERR_MSG(extack, "FIB table does not exist");
		return err;
	}

	rcu_read_lock();

	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len,
			 !(cfg->fc_flags & RTF_CACHE));

	if (fn) {
		for_each_fib6_node_rt_rcu(fn) {
			struct fib6_nh *nh;

			if (rt->nh && cfg->fc_nh_id &&
			    rt->nh->id != cfg->fc_nh_id)
				continue;

			if (cfg->fc_flags & RTF_CACHE) {
				int rc = 0;

				if (rt->nh) {
					rc = ip6_del_cached_rt_nh(cfg, rt);
				} else if (cfg->fc_nh_id) {
					continue;
				} else {
					nh = rt->fib6_nh;
					rc = ip6_del_cached_rt(cfg, rt, nh);
				}
				if (rc != -ESRCH) {
					rcu_read_unlock();
					return rc;
				}
				continue;
			}

			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
				continue;
			if (cfg->fc_protocol &&
			    cfg->fc_protocol != rt->fib6_protocol)
				continue;

			if (rt->nh) {
				if (!fib6_info_hold_safe(rt))
					continue;
				rcu_read_unlock();

				return __ip6_del_rt(rt, &cfg->fc_nlinfo);
			}
			if (cfg->fc_nh_id)
				continue;

			nh = rt->fib6_nh;
			if (cfg->fc_ifindex &&
			    (!nh->fib_nh_dev ||
			     nh->fib_nh_dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6))
				continue;
			if (!fib6_info_hold_safe(rt))
				continue;
			rcu_read_unlock();

			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				return __ip6_del_rt(rt, &cfg->fc_nlinfo);

			return __ip6_del_rt_siblings(rt, cfg);
		}
	}
	rcu_read_unlock();

	return err;
}
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct netevent_redirect netevent;
	struct rt6_info *rt, *nrt = NULL;
	struct fib6_result res = {};
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	struct rd_msg *msg;
	int optlen, on_link;
	u8 *lladdr;

	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
	optlen -= sizeof(*msg);

	if (optlen < 0) {
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
		return;
	}

	msg = (struct rd_msg *)icmp6_hdr(skb);

	if (ipv6_addr_is_multicast(&msg->dest)) {
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
		return;
	}

	on_link = 0;
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
		on_link = 1;
	} else if (ipv6_addr_type(&msg->target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
		return;
	}

	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}

	lladdr = NULL;
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

	rt = (struct rt6_info *) dst;
	if (rt->rt6i_flags & RTF_REJECT) {
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
		return;
	}

	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);

	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
	if (!neigh)
		return;

	/*
	 *	We have finally decided to accept it.
	 */

	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER)),
		     NDISC_REDIRECT, &ndopts);

	rcu_read_lock();
	res.f6i = rcu_dereference(rt->from);
	if (!res.f6i)
		goto out;

	if (res.f6i->nh) {
		struct fib6_nh_match_arg arg = {
			.dev = dst->dev,
			.gw = &rt->rt6i_gateway,
		};

		nexthop_for_each_fib6_nh(res.f6i->nh,
					 fib6_nh_find_match, &arg);

		/* fib6_info uses a nexthop that does not have fib6_nh
		 * using the dst->dev. Should be impossible
		 */
		if (!arg.match)
			goto out;
		res.nh = arg.match;
	} else {
		res.nh = res.f6i->fib6_nh;
	}

	res.fib6_flags = res.f6i->fib6_flags;
	res.fib6_type = res.f6i->fib6_type;
	nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL);
	if (!nrt)
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

	/* rt6_insert_exception() will take care of duplicated exceptions */
	if (rt6_insert_exception(nrt, &res)) {
		dst_release_immediate(&nrt->dst);
		goto out;
	}

	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
	netevent.daddr = &msg->dest;
	netevent.neigh = neigh;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

out:
	rcu_read_unlock();
	neigh_release(neigh);
}
#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	int ifindex = dev->ifindex;
	struct fib6_node *fn;
	struct fib6_info *rt = NULL;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
	if (!fn)
		goto out;

	for_each_fib6_node_rt_rcu(fn) {
		/* these routes do not use nexthops */
		if (rt->nh)
			continue;
		if (rt->fib6_nh->fib_nh_dev->ifindex != ifindex)
			continue;
		if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
		    !rt->fib6_nh->fib_nh_gw_family)
			continue;
		if (!ipv6_addr_equal(&rt->fib6_nh->fib_nh_gw6, gwaddr))
			continue;
		if (!fib6_info_hold_safe(rt))
			continue;
		break;
	}
out:
	rcu_read_unlock();
	return rt;
}
static struct fib6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev,
					   unsigned int pref)
{
	struct fib6_config cfg = {
		.fc_metric	= IP6_RT_PRIO_USER,
		.fc_ifindex	= dev->ifindex,
		.fc_dst_len	= prefixlen,
		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
				  RTF_UP | RTF_PREF(pref),
		.fc_protocol = RTPROT_RA,
		.fc_type = RTN_UNICAST,
		.fc_nlinfo.portid = 0,
		.fc_nlinfo.nlh = NULL,
		.fc_nlinfo.nl_net = net,
	};

	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	cfg.fc_dst = *prefix;
	cfg.fc_gateway = *gwaddr;

	/* We should treat it as a default route if prefix length is 0. */
	if (!prefixlen)
		cfg.fc_flags |= RTF_DEFAULT;

	ip6_route_add(&cfg, GFP_ATOMIC, NULL);

	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
}
#endif
struct fib6_info *rt6_get_dflt_router(struct net *net,
				     const struct in6_addr *addr,
				     struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
	struct fib6_info *rt;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		struct fib6_nh *nh;

		/* RA routes do not use nexthops */
		if (rt->nh)
			continue;

		nh = rt->fib6_nh;
		if (dev == nh->fib_nh_dev &&
		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&nh->fib_nh_gw6, addr))
			break;
	}
	if (rt && !fib6_info_hold_safe(rt))
		rt = NULL;
	rcu_read_unlock();
	return rt;
}
*rt6_add_dflt_router(struct net
*net
,
4253 const struct in6_addr
*gwaddr
,
4254 struct net_device
*dev
,
4257 struct fib6_config cfg
= {
4258 .fc_table
= l3mdev_fib_table(dev
) ? : RT6_TABLE_DFLT
,
4259 .fc_metric
= IP6_RT_PRIO_USER
,
4260 .fc_ifindex
= dev
->ifindex
,
4261 .fc_flags
= RTF_GATEWAY
| RTF_ADDRCONF
| RTF_DEFAULT
|
4262 RTF_UP
| RTF_EXPIRES
| RTF_PREF(pref
),
4263 .fc_protocol
= RTPROT_RA
,
4264 .fc_type
= RTN_UNICAST
,
4265 .fc_nlinfo
.portid
= 0,
4266 .fc_nlinfo
.nlh
= NULL
,
4267 .fc_nlinfo
.nl_net
= net
,
4270 cfg
.fc_gateway
= *gwaddr
;
4272 if (!ip6_route_add(&cfg
, GFP_ATOMIC
, NULL
)) {
4273 struct fib6_table
*table
;
4275 table
= fib6_get_table(dev_net(dev
), cfg
.fc_table
);
4277 table
->flags
|= RT6_TABLE_HAS_DFLT_ROUTER
;
4280 return rt6_get_dflt_router(net
, gwaddr
, dev
);
static void __rt6_purge_dflt_routers(struct net *net,
				     struct fib6_table *table)
{
	struct fib6_info *rt;

restart:
	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;

		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!idev || idev->cnf.accept_ra != 2) &&
		    fib6_info_hold_safe(rt)) {
			rcu_read_unlock();
			ip6_del_rt(net, rt, false);
			goto restart;
		}
	}
	rcu_read_unlock();

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}
void rt6_purge_dflt_routers(struct net *net)
{
	struct fib6_table *table;
	struct hlist_head *head;
	unsigned int h;

	rcu_read_lock();

	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
		head = &net->ipv6.fib_table_hash[h];
		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
				__rt6_purge_dflt_routers(net, table);
		}
	}

	rcu_read_unlock();
}
static void rtmsg_to_fib6_config(struct net *net,
				 struct in6_rtmsg *rtmsg,
				 struct fib6_config *cfg)
{
	*cfg = (struct fib6_config){
		.fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
			 : RT6_TABLE_MAIN,
		.fc_ifindex = rtmsg->rtmsg_ifindex,
		.fc_metric = rtmsg->rtmsg_metric ? : IP6_RT_PRIO_USER,
		.fc_expires = rtmsg->rtmsg_info,
		.fc_dst_len = rtmsg->rtmsg_dst_len,
		.fc_src_len = rtmsg->rtmsg_src_len,
		.fc_flags = rtmsg->rtmsg_flags,
		.fc_type = rtmsg->rtmsg_type,

		.fc_nlinfo.nl_net = net,

		.fc_dst = rtmsg->rtmsg_dst,
		.fc_src = rtmsg->rtmsg_src,
		.fc_gateway = rtmsg->rtmsg_gateway,
	};
}
int ipv6_route_ioctl(struct net *net, unsigned int cmd, struct in6_rtmsg *rtmsg)
{
	struct fib6_config cfg;
	int err;

	if (cmd != SIOCADDRT && cmd != SIOCDELRT)
		return -EINVAL;
	if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	rtmsg_to_fib6_config(net, rtmsg, &cfg);

	rtnl_lock();
	switch (cmd) {
	case SIOCADDRT:
		err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
		break;
	case SIOCDELRT:
		err = ip6_route_del(&cfg, NULL);
		break;
	}
	rtnl_unlock();
	return err;
}
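/* Example (legacy interface, illustrative only): this ioctl path is what an
 * old route(8)-style tool exercises from userspace, roughly:
 *
 *	struct in6_rtmsg rtm = { 0 };
 *	rtm.rtmsg_dst_len = 64;
 *	rtm.rtmsg_flags = RTF_UP;
 *	... fill rtmsg_dst / rtmsg_gateway / rtmsg_ifindex ...
 *	ioctl(fd, SIOCADDRT, &rtm);  // fd is an AF_INET6 socket
 *
 * New code is expected to use rtnetlink (RTM_NEWROUTE) instead.
 */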
/*
 *	Drop the packet on the floor
 */

static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net *net = dev_net(dst->dev);
	struct inet6_dev *idev;
	int type;

	if (netif_is_l3_master(skb->dev) &&
	    dst->dev == net->loopback_dev)
		idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
	else
		idev = ip6_dst_idev(dst);

	switch (ipstats_mib_noroutes) {
	case IPSTATS_MIB_INNOROUTES:
		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
		if (type == IPV6_ADDR_ANY) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
			break;
		}
		fallthrough;
	case IPSTATS_MIB_OUTNOROUTES:
		IP6_INC_STATS(net, idev, ipstats_mib_noroutes);
		break;
	}

	/* Start over by dropping the dst for l3mdev case */
	if (netif_is_l3_master(skb->dev))
		skb_dst_drop(skb);

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
	kfree_skb(skb);
	return 0;
}

static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}

static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}

static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}

static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}
/*
 *	Allocate a dst for local (unicast / anycast) address.
 */

struct fib6_info *addrconf_f6i_alloc(struct net *net,
				     struct inet6_dev *idev,
				     const struct in6_addr *addr,
				     bool anycast, gfp_t gfp_flags)
{
	struct fib6_config cfg = {
		.fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL,
		.fc_ifindex = idev->dev->ifindex,
		.fc_flags = RTF_UP | RTF_NONEXTHOP,
		.fc_dst = *addr,
		.fc_dst_len = 128,
		.fc_protocol = RTPROT_KERNEL,
		.fc_nlinfo.nl_net = net,
		.fc_ignore_dev_down = true,
	};
	struct fib6_info *f6i;

	if (anycast) {
		cfg.fc_type = RTN_ANYCAST;
		cfg.fc_flags |= RTF_ANYCAST;
	} else {
		cfg.fc_type = RTN_LOCAL;
		cfg.fc_flags |= RTF_LOCAL;
	}

	f6i = ip6_route_info_create(&cfg, gfp_flags, NULL);
	if (!IS_ERR(f6i))
		f6i->dst_nocount = true;
	return f6i;
}
/* remove deleted ip from prefsrc entries */
struct arg_dev_net_ip {
	struct net_device *dev;
	struct net *net;
	struct in6_addr *addr;
};

static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
{
	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;

	if (!rt->nh &&
	    ((void *)rt->fib6_nh->fib_nh_dev == dev || !dev) &&
	    rt != net->ipv6.fib6_null_entry &&
	    ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
		spin_lock_bh(&rt6_exception_lock);
		/* remove prefsrc entry */
		rt->fib6_prefsrc.plen = 0;
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}

void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
{
	struct net *net = dev_net(ifp->idev->dev);
	struct arg_dev_net_ip adni = {
		.dev = ifp->idev->dev,
		.net = net,
		.addr = &ifp->addr,
	};
	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
}
#define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT)

/* Remove routers and update dst entries when gateway turn into host. */
static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
{
	struct in6_addr *gateway = (struct in6_addr *)arg;
	struct fib6_nh *nh;

	/* RA routes do not use nexthops */
	if (rt->nh)
		return 0;

	nh = rt->fib6_nh;
	if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
	    nh->fib_nh_gw_family && ipv6_addr_equal(gateway, &nh->fib_nh_gw6))
		return -1;

	/* Further clean up cached routes in exception table.
	 * This is needed because cached route may have a different
	 * gateway than its 'parent' in the case of an ip redirect.
	 */
	fib6_nh_exceptions_clean_tohost(nh, gateway);

	return 0;
}

void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}
struct arg_netdev_event {
	const struct net_device *dev;
	union {
		unsigned char nh_flags;
		unsigned long event;
	};
};

static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
{
	struct fib6_info *iter;
	struct fib6_node *fn;

	fn = rcu_dereference_protected(rt->fib6_node,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
	iter = rcu_dereference_protected(fn->leaf,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
	while (iter) {
		if (iter->fib6_metric == rt->fib6_metric &&
		    rt6_qualify_for_ecmp(iter))
			return iter;
		iter = rcu_dereference_protected(iter->fib6_next,
				lockdep_is_held(&rt->fib6_table->tb6_lock));
	}

	return NULL;
}
/* only called for fib entries with builtin fib6_nh */
static bool rt6_is_dead(const struct fib6_info *rt)
{
	if (rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD ||
	    (rt->fib6_nh->fib_nh_flags & RTNH_F_LINKDOWN &&
	     ip6_ignore_linkdown(rt->fib6_nh->fib_nh_dev)))
		return true;

	return false;
}

static int rt6_multipath_total_weight(const struct fib6_info *rt)
{
	struct fib6_info *iter;
	int total = 0;

	if (!rt6_is_dead(rt))
		total += rt->fib6_nh->fib_nh_weight;

	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
		if (!rt6_is_dead(iter))
			total += iter->fib6_nh->fib_nh_weight;
	}

	return total;
}

static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
{
	int upper_bound = -1;

	if (!rt6_is_dead(rt)) {
		*weight += rt->fib6_nh->fib_nh_weight;
		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
						    total) - 1;
	}
	atomic_set(&rt->fib6_nh->fib_nh_upper_bound, upper_bound);
}

static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
{
	struct fib6_info *iter;
	int weight = 0;

	rt6_upper_bound_set(rt, &weight, total);

	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		rt6_upper_bound_set(iter, &weight, total);
}

void rt6_multipath_rebalance(struct fib6_info *rt)
{
	struct fib6_info *first;
	int total;

	/* In case the entire multipath route was marked for flushing,
	 * then there is no need to rebalance upon the removal of every
	 * sibling route.
	 */
	if (!rt->fib6_nsiblings || rt->should_flush)
		return;

	/* During lookup routes are evaluated in order, so we need to
	 * make sure upper bounds are assigned from the first sibling
	 * onwards.
	 */
	first = rt6_multipath_first_sibling(rt);
	if (WARN_ON_ONCE(!first))
		return;

	total = rt6_multipath_total_weight(first);
	rt6_multipath_upper_bound_set(first, total);
}
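/* Worked example (illustrative): for two live siblings with weights 1 and 2,
 * total = 3, so the first nexthop gets upper_bound
 * DIV_ROUND_CLOSEST_ULL((u64)1 << 31, 3) - 1 (roughly a third of the 31-bit
 * hash space) and the second gets 2^31 - 1; a flow hash at or below the
 * first bound selects nexthop one, anything above falls through to
 * nexthop two.
 */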
static int fib6_ifup(struct fib6_info *rt, void *p_arg)
{
	const struct arg_netdev_event *arg = p_arg;
	struct net *net = dev_net(arg->dev);

	if (rt != net->ipv6.fib6_null_entry && !rt->nh &&
	    rt->fib6_nh->fib_nh_dev == arg->dev) {
		rt->fib6_nh->fib_nh_flags &= ~arg->nh_flags;
		fib6_update_sernum_upto_root(net, rt);
		rt6_multipath_rebalance(rt);
	}

	return 0;
}

void rt6_sync_up(struct net_device *dev, unsigned char nh_flags)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.nh_flags = nh_flags,
		},
	};

	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
		arg.nh_flags |= RTNH_F_LINKDOWN;

	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
}
/* only called for fib entries with inline fib6_nh */
static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
				   const struct net_device *dev)
{
	struct fib6_info *iter;

	if (rt->fib6_nh->fib_nh_dev == dev)
		return true;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		if (iter->fib6_nh->fib_nh_dev == dev)
			return true;

	return false;
}

static void rt6_multipath_flush(struct fib6_info *rt)
{
	struct fib6_info *iter;

	rt->should_flush = 1;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		iter->should_flush = 1;
}

static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
					     const struct net_device *down_dev)
{
	struct fib6_info *iter;
	unsigned int dead = 0;

	if (rt->fib6_nh->fib_nh_dev == down_dev ||
	    rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD)
		dead++;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		if (iter->fib6_nh->fib_nh_dev == down_dev ||
		    iter->fib6_nh->fib_nh_flags & RTNH_F_DEAD)
			dead++;

	return dead;
}

static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
				       const struct net_device *dev,
				       unsigned char nh_flags)
{
	struct fib6_info *iter;

	if (rt->fib6_nh->fib_nh_dev == dev)
		rt->fib6_nh->fib_nh_flags |= nh_flags;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		if (iter->fib6_nh->fib_nh_dev == dev)
			iter->fib6_nh->fib_nh_flags |= nh_flags;
}
/* called with write lock held for table with rt */
static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
{
	const struct arg_netdev_event *arg = p_arg;
	const struct net_device *dev = arg->dev;
	struct net *net = dev_net(dev);

	if (rt == net->ipv6.fib6_null_entry || rt->nh)
		return 0;

	switch (arg->event) {
	case NETDEV_UNREGISTER:
		return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0;
	case NETDEV_DOWN:
		if (rt->should_flush)
			return -1;
		if (!rt->fib6_nsiblings)
			return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0;
		if (rt6_multipath_uses_dev(rt, dev)) {
			unsigned int count;

			count = rt6_multipath_dead_count(rt, dev);
			if (rt->fib6_nsiblings + 1 == count) {
				rt6_multipath_flush(rt);
				return -1;
			}
			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
						   RTNH_F_LINKDOWN);
			fib6_update_sernum(net, rt);
			rt6_multipath_rebalance(rt);
		}
		return -2;
	case NETDEV_CHANGE:
		if (rt->fib6_nh->fib_nh_dev != dev ||
		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
			break;
		rt->fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
		rt6_multipath_rebalance(rt);
		break;
	}

	return 0;
}
void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.event = event,
		},
	};
	struct net *net = dev_net(dev);

	if (net->ipv6.sysctl.skip_notify_on_dev_down)
		fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
	else
		fib6_clean_all(net, fib6_ifdown, &arg);
}

void rt6_disable_ip(struct net_device *dev, unsigned long event)
{
	rt6_sync_down_dev(dev, event);
	rt6_uncached_list_flush_dev(dev_net(dev), dev);
	neigh_ifdown(&nd_tbl, dev);
}
struct rt6_mtu_change_arg {
	struct net_device *dev;
	unsigned int mtu;
	struct fib6_info *f6i;
};

static int fib6_nh_mtu_change(struct fib6_nh *nh, void *_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *)_arg;
	struct fib6_info *f6i = arg->f6i;

	/* For administrative MTU increase, there is no way to discover
	 * IPv6 PMTU increase, so PMTU increase should be updated here.
	 * Since RFC 1981 doesn't include administrative MTU increase
	 * update PMTU increase is a MUST. (i.e. jumbo frame)
	 */
	if (nh->fib_nh_dev == arg->dev) {
		struct inet6_dev *idev = __in6_dev_get(arg->dev);
		u32 mtu = f6i->fib6_pmtu;

		if (mtu >= arg->mtu ||
		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
			fib6_metric_set(f6i, RTAX_MTU, arg->mtu);

		spin_lock_bh(&rt6_exception_lock);
		rt6_exceptions_update_pmtu(idev, nh, arg->mtu);
		spin_unlock_bh(&rt6_exception_lock);
	}

	return 0;
}

static int rt6_mtu_change_route(struct fib6_info *f6i, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	if (fib6_metric_locked(f6i, RTAX_MTU))
		return 0;

	arg->f6i = f6i;
	if (f6i->nh) {
		/* fib6_nh_mtu_change only returns 0, so this is safe */
		return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_mtu_change,
						arg);
	}

	return fib6_nh_mtu_change(f6i->fib6_nh, arg);
}

void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
{
	struct rt6_mtu_change_arg arg = {
		.dev = dev,
		.mtu = mtu,
	};

	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
}
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_UNSPEC]		= { .strict_start_type = RTA_DPORT + 1 },
	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
	[RTA_OIF]               = { .type = NLA_U32 },
	[RTA_IIF]		= { .type = NLA_U32 },
	[RTA_PRIORITY]          = { .type = NLA_U32 },
	[RTA_METRICS]           = { .type = NLA_NESTED },
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]              = { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
	[RTA_ENCAP]		= { .type = NLA_NESTED },
	[RTA_EXPIRES]		= { .type = NLA_U32 },
	[RTA_UID]		= { .type = NLA_U32 },
	[RTA_MARK]		= { .type = NLA_U32 },
	[RTA_TABLE]		= { .type = NLA_U32 },
	[RTA_IP_PROTO]		= { .type = NLA_U8 },
	[RTA_SPORT]		= { .type = NLA_U16 },
	[RTA_DPORT]		= { .type = NLA_U16 },
	[RTA_NH_ID]		= { .type = NLA_U32 },
};
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg,
			      struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
				     rtm_ipv6_policy, extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);

	*cfg = (struct fib6_config){
		.fc_table = rtm->rtm_table,
		.fc_dst_len = rtm->rtm_dst_len,
		.fc_src_len = rtm->rtm_src_len,
		.fc_flags = RTF_UP,
		.fc_protocol = rtm->rtm_protocol,
		.fc_type = rtm->rtm_type,

		.fc_nlinfo.portid = NETLINK_CB(skb).portid,
		.fc_nlinfo.nlh = nlh,
		.fc_nlinfo.nl_net = sock_net(skb->sk),
	};

	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);

	if (tb[RTA_NH_ID]) {
		if (tb[RTA_GATEWAY]   || tb[RTA_OIF] ||
		    tb[RTA_MULTIPATH] || tb[RTA_ENCAP]) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop specification and nexthop id are mutually exclusive");
			goto errout;
		}
		cfg->fc_nh_id = nla_get_u32(tb[RTA_NH_ID]);
	}

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}
	if (tb[RTA_VIA]) {
		NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
		goto errout;
	}

	if (tb[RTA_DST]) {
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
						     cfg->fc_mp_len, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_PREF]) {
		pref = nla_get_u8(tb[RTA_PREF]);
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

	if (tb[RTA_ENCAP_TYPE]) {
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_EXPIRES]) {
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
		}
	}

	err = 0;
errout:
	return err;
}
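
/* Prefix-length arithmetic, for illustration: rtm_dst_len = 64 gives
 * plen = (64 + 7) >> 3 = 8, so only the first 8 bytes of RTA_DST are
 * copied into cfg->fc_dst; shorter attributes are rejected above.
 */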
struct rt6_nh {
	struct fib6_info *fib6_info;
	struct fib6_config r_cfg;
	struct list_head next;
};
static int ip6_route_info_append(struct net *net,
				 struct list_head *rt6_nh_list,
				 struct fib6_info *rt,
				 struct fib6_config *r_cfg)
{
	struct rt6_nh *nh;

	list_for_each_entry(nh, rt6_nh_list, next) {
		/* check if fib6_info already exists */
		if (rt6_duplicate_nexthop(nh->fib6_info, rt))
			return -EEXIST;
	}

	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
	if (!nh)
		return -ENOMEM;
	nh->fib6_info = rt;
	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
	list_add_tail(&nh->next, rt6_nh_list);

	return 0;
}
static void ip6_route_mpath_notify(struct fib6_info *rt,
				   struct fib6_info *rt_last,
				   struct nl_info *info,
				   __u16 nlflags)
{
	/* if this is an APPEND route, then rt points to the first route
	 * inserted and rt_last points to last route inserted. Userspace
	 * wants a consistent dump of the route which starts at the first
	 * nexthop. Since sibling routes are always added at the end of
	 * the list, find the first sibling of the last route appended
	 */
	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
		rt = list_first_entry(&rt_last->fib6_siblings,
				      struct fib6_info,
				      fib6_siblings);
	}

	if (rt)
		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
}
static bool ip6_route_mpath_should_notify(const struct fib6_info *rt)
{
	bool rt_can_ecmp = rt6_qualify_for_ecmp(rt);
	bool should_notify = false;
	struct fib6_info *leaf;
	struct fib6_node *fn;

	rcu_read_lock();
	fn = rcu_dereference(rt->fib6_node);
	if (!fn)
		goto out;

	leaf = rcu_dereference(fn->leaf);
	if (!leaf)
		goto out;

	if (rt == leaf ||
	    (rt_can_ecmp && rt->fib6_metric == leaf->fib6_metric &&
	     rt6_qualify_for_ecmp(leaf)))
		should_notify = true;
out:
	rcu_read_unlock();

	return should_notify;
}
static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct fib6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	__u16 nlflags;
	int remaining;
	int attrlen;
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);

	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * fib6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}
		if (!rt6_qualify_for_ecmp(rt)) {
			err = -EINVAL;
			NL_SET_ERR_MSG(extack,
				       "Device only routes can not be added for IPv6 using the multipath API.");
			fib6_info_release(rt);
			goto cleanup;
		}

		rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1;

		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
					    rt, &r_cfg);
		if (err) {
			fib6_info_release(rt);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

	if (list_empty(&rt6_nh_list)) {
		NL_SET_ERR_MSG(extack,
			       "Invalid nexthop configuration - no valid nexthops");
		return -EINVAL;
	}

	/* for add and replace send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;

	/* For add and replace, send one notification with all nexthops. For
	 * append, send one notification with all appended nexthops.
	 */
	info->skip_notify_kernel = 1;

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		err = __ip6_ins_rt(nh->fib6_info, info, extack);
		fib6_info_release(nh->fib6_info);

		if (!err) {
			/* save reference to last route successfully inserted */
			rt_last = nh->fib6_info;

			/* save reference to first route for notification */
			if (!rt_notif)
				rt_notif = nh->fib6_info;
		}

		/* nh->fib6_info is used or freed at this point, reset to NULL*/
		nh->fib6_info = NULL;
		if (err) {
			if (replace && nhn)
				NL_SET_ERR_MSG_MOD(extack,
						   "multipath route replace failed (check consistency of installed routes)");
			err_nh = nh;
			goto add_errout;
		}

		/* Because each route is added like a single route we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_CREATE;
		nhn++;
	}

	/* An in-kernel notification should only be sent in case the new
	 * multipath route is added as the first route in the node, or if
	 * it was appended to it. We pass 'rt_notif' since it is the first
	 * sibling and might allow us to skip some checks in the replace case.
	 */
	if (ip6_route_mpath_should_notify(rt_notif)) {
		enum fib_event_type fib_event;

		if (rt_notif->fib6_nsiblings != nhn - 1)
			fib_event = FIB_EVENT_ENTRY_APPEND;
		else
			fib_event = FIB_EVENT_ENTRY_REPLACE;

		err = call_fib6_multipath_entry_notifiers(info->nl_net,
							  fib_event, rt_notif,
							  nhn - 1, extack);
		if (err) {
			/* Delete all the siblings that were just added */
			err_nh = NULL;
			goto add_errout;
		}
	}

	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	goto cleanup;

add_errout:
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg, extack);
	}

cleanup:
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->fib6_info)
			fib6_info_release(nh->fib6_info);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}
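
/* For illustration: a command like
 *   ip -6 route add 2001:db8::/64 \
 *           nexthop via fe80::1 dev eth0 nexthop via fe80::2 dev eth1
 * delivers a single RTA_MULTIPATH blob that the loop above splits into
 * one fib6_info per nexthop; the entries are inserted as siblings and
 * announced to userspace with a single notification.
 */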
static int ip6_route_multipath_del(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	int last_err = 0;
	int remaining;
	int attrlen;
	int err;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry */
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
		}
		err = ip6_route_del(&r_cfg, extack);
		if (err)
			last_err = err;

		rtnh = rtnh_next(rtnh, &remaining);
	}

	return last_err;
}
static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct fib6_config cfg;
	int err;

	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
	if (err < 0)
		return err;

	if (cfg.fc_nh_id &&
	    !nexthop_find_by_id(sock_net(skb->sk), cfg.fc_nh_id)) {
		NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
		return -EINVAL;
	}

	if (cfg.fc_mp)
		return ip6_route_multipath_del(&cfg, extack);

	cfg.fc_delete_all_nh = 1;
	return ip6_route_del(&cfg, extack);
}
static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct fib6_config cfg;
	int err;

	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
	if (err < 0)
		return err;

	if (cfg.fc_metric == 0)
		cfg.fc_metric = IP6_RT_PRIO_USER;

	if (cfg.fc_mp)
		return ip6_route_multipath_add(&cfg, extack);

	return ip6_route_add(&cfg, GFP_KERNEL, extack);
}
/* add the overhead of this fib6_nh to nexthop_len */
static int rt6_nh_nlmsg_size(struct fib6_nh *nh, void *arg)
{
	int *nexthop_len = arg;

	*nexthop_len += nla_total_size(0)	 /* RTA_MULTIPATH */
		     + NLA_ALIGN(sizeof(struct rtnexthop))
		     + nla_total_size(16); /* RTA_GATEWAY */

	if (nh->fib_nh_lws) {
		/* RTA_ENCAP_TYPE */
		*nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws);
		/* RTA_ENCAP */
		*nexthop_len += nla_total_size(2);
	}

	return 0;
}
static size_t rt6_nlmsg_size(struct fib6_info *f6i)
{
	int nexthop_len;

	if (f6i->nh) {
		nexthop_len = nla_total_size(4); /* RTA_NH_ID */
		nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_nlmsg_size,
					 &nexthop_len);
	} else {
		struct fib6_nh *nh = f6i->fib6_nh;

		nexthop_len = 0;
		if (f6i->fib6_nsiblings) {
			nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
				    + NLA_ALIGN(sizeof(struct rtnexthop))
				    + nla_total_size(16) /* RTA_GATEWAY */
				    + lwtunnel_get_encap_size(nh->fib_nh_lws);

			nexthop_len *= f6i->fib6_nsiblings;
		}
		nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws);
	}

	return NLMSG_ALIGN(sizeof(struct rtmsg))
	       + nla_total_size(16) /* RTA_SRC */
	       + nla_total_size(16) /* RTA_DST */
	       + nla_total_size(16) /* RTA_GATEWAY */
	       + nla_total_size(16) /* RTA_PREFSRC */
	       + nla_total_size(4) /* RTA_TABLE */
	       + nla_total_size(4) /* RTA_IIF */
	       + nla_total_size(4) /* RTA_OIF */
	       + nla_total_size(4) /* RTA_PRIORITY */
	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
	       + nla_total_size(sizeof(struct rta_cacheinfo))
	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
	       + nla_total_size(1) /* RTA_PREF */
	       + nexthop_len;
}
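
/* The value computed above is a worst-case upper bound used to size
 * notification skbs; if rt6_fill_node() still returns -EMSGSIZE, the
 * accounting here is buggy (see the WARN_ON in inet6_rt_notify() below).
 */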
static int rt6_fill_node_nexthop(struct sk_buff *skb, struct nexthop *nh,
				 unsigned char *flags)
{
	if (nexthop_is_multipath(nh)) {
		struct nlattr *mp;

		mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (nexthop_mpath_fill_node(skb, nh, AF_INET6))
			goto nla_put_failure;

		nla_nest_end(skb, mp);
	} else {
		struct fib6_nh *fib6_nh;

		fib6_nh = nexthop_fib6_nh(nh);
		if (fib_nexthop_info(skb, &fib6_nh->nh_common, AF_INET6,
				     flags, false) < 0)
			goto nla_put_failure;
	}

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	struct rt6_info *rt6 = (struct rt6_info *)dst;
	struct rt6key *rt6_dst, *rt6_src;
	u32 *pmetrics, table, rt6_flags;
	unsigned char nh_flags = 0;
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;
	long expires = 0;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	if (rt6) {
		rt6_dst = &rt6->rt6i_dst;
		rt6_src = &rt6->rt6i_src;
		rt6_flags = rt6->rt6i_flags;
	} else {
		rt6_dst = &rt->fib6_dst;
		rt6_src = &rt->fib6_src;
		rt6_flags = rt->fib6_flags;
	}

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt6_dst->plen;
	rtm->rtm_src_len = rt6_src->plen;
	rtm->rtm_tos = 0;
	if (rt->fib6_table)
		table = rt->fib6_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;

	rtm->rtm_type = rt->fib6_type;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->fib6_protocol;

	if (rt6_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	if (dest) {
		if (nla_put_in6_addr(skb, RTA_DST, dest))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dest) {
		struct in6_addr saddr_buf;

		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->fib6_prefsrc.plen) {
		struct in6_addr saddr_buf;

		saddr_buf = rt->fib6_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt6) {
		if (rt6_flags & RTF_GATEWAY &&
		    nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
			goto nla_put_failure;

		if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
			goto nla_put_failure;

		if (dst->lwtstate &&
		    lwtunnel_fill_encap(skb, dst->lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
			goto nla_put_failure;
	} else if (rt->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (fib_add_nexthop(skb, &rt->fib6_nh->nh_common,
				    rt->fib6_nh->fib_nh_weight, AF_INET6) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings, fib6_siblings) {
			if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common,
					    sibling->fib6_nh->fib_nh_weight,
					    AF_INET6) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else if (rt->nh) {
		if (nla_put_u32(skb, RTA_NH_ID, rt->nh->id))
			goto nla_put_failure;

		if (nexthop_is_blackhole(rt->nh))
			rtm->rtm_type = RTN_BLACKHOLE;

		if (net->ipv4.sysctl_nexthop_compat_mode &&
		    rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0)
			goto nla_put_failure;

		rtm->rtm_flags |= nh_flags;
	} else {
		if (fib_nexthop_info(skb, &rt->fib6_nh->nh_common, AF_INET6,
				     &nh_flags, false) < 0)
			goto nla_put_failure;

		rtm->rtm_flags |= nh_flags;
	}

	if (rt6_flags & RTF_EXPIRES) {
		expires = dst ? dst->expires : rt->expires;
		expires -= jiffies;
	}

	if (!dst) {
		if (rt->offload)
			rtm->rtm_flags |= RTM_F_OFFLOAD;
		if (rt->trap)
			rtm->rtm_flags |= RTM_F_TRAP;
	}

	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
static int fib6_info_nh_uses_dev(struct fib6_nh *nh, void *arg)
{
	const struct net_device *dev = arg;

	if (nh->fib_nh_dev == dev)
		return 1;

	return 0;
}

static bool fib6_info_uses_dev(const struct fib6_info *f6i,
			       const struct net_device *dev)
{
	if (f6i->nh) {
		struct net_device *_dev = (struct net_device *)dev;

		return !!nexthop_for_each_fib6_nh(f6i->nh,
						  fib6_info_nh_uses_dev,
						  _dev);
	}

	if (f6i->fib6_nh->fib_nh_dev == dev)
		return true;

	if (f6i->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;

		list_for_each_entry_safe(sibling, next_sibling,
					 &f6i->fib6_siblings, fib6_siblings) {
			if (sibling->fib6_nh->fib_nh_dev == dev)
				return true;
		}
	}

	return false;
}
struct fib6_nh_exception_dump_walker {
	struct rt6_rtnl_dump_arg *dump;
	struct fib6_info *rt;
	unsigned int flags;
	unsigned int skip;
	unsigned int count;
};
static int rt6_nh_dump_exceptions(struct fib6_nh *nh, void *arg)
{
	struct fib6_nh_exception_dump_walker *w = arg;
	struct rt6_rtnl_dump_arg *dump = w->dump;
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i, err;

	bucket = fib6_nh_get_excptn_bucket(nh, NULL);
	if (!bucket)
		return 0;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			if (w->skip) {
				w->skip--;
				continue;
			}

			/* Expiration of entries doesn't bump sernum, insertion
			 * does. Removal is triggered by insertion, so we can
			 * rely on the fact that if entries change between two
			 * partial dumps, this node is scanned again completely,
			 * see rt6_insert_exception() and fib6_dump_table().
			 *
			 * Count expired entries we go through as handled
			 * entries that we'll skip next time, in case of partial
			 * node dump. Otherwise, if entries expire meanwhile,
			 * we'll skip the wrong amount.
			 */
			if (rt6_check_expired(rt6_ex->rt6i)) {
				w->count++;
				continue;
			}

			err = rt6_fill_node(dump->net, dump->skb, w->rt,
					    &rt6_ex->rt6i->dst, NULL, NULL, 0,
					    RTM_NEWROUTE,
					    NETLINK_CB(dump->cb->skb).portid,
					    dump->cb->nlh->nlmsg_seq, w->flags);
			if (err)
				return err;

			w->count++;
		}
		bucket++;
	}

	return 0;
}
/* Return -1 if done with node, number of handled routes on partial dump */
int rt6_dump_route(struct fib6_info *rt, void *p_arg, unsigned int skip)
{
	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
	struct fib_dump_filter *filter = &arg->filter;
	unsigned int flags = NLM_F_MULTI;
	struct net *net = arg->net;
	int count = 0;

	if (rt == net->ipv6.fib6_null_entry)
		return -1;

	if ((filter->flags & RTM_F_PREFIX) &&
	    !(rt->fib6_flags & RTF_PREFIX_RT)) {
		/* success since this is not a prefix route */
		return -1;
	}
	if (filter->filter_set &&
	    ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
	     (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
	     (filter->protocol && rt->fib6_protocol != filter->protocol))) {
		return -1;
	}

	if (filter->filter_set ||
	    !filter->dump_routes || !filter->dump_exceptions) {
		flags |= NLM_F_DUMP_FILTERED;
	}

	if (filter->dump_routes) {
		if (skip) {
			skip--;
		} else {
			if (rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL,
					  0, RTM_NEWROUTE,
					  NETLINK_CB(arg->cb->skb).portid,
					  arg->cb->nlh->nlmsg_seq, flags)) {
				return 0;
			}
			count++;
		}
	}

	if (filter->dump_exceptions) {
		struct fib6_nh_exception_dump_walker w = { .dump = arg,
							   .rt = rt,
							   .flags = flags,
							   .skip = skip,
							   .count = 0 };
		int err;

		rcu_read_lock();
		if (rt->nh) {
			err = nexthop_for_each_fib6_nh(rt->nh,
						       rt6_nh_dump_exceptions,
						       &w);
		} else {
			err = rt6_nh_dump_exceptions(rt->fib6_nh, &w);
		}
		rcu_read_unlock();

		if (err)
			return count += w.count;
	}

	return -1;
}
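
/* Partial-dump bookkeeping, for illustration: if a previous pass of a
 * netlink dump handled the route itself plus two of this node's
 * exceptions, the caller passes skip == 3 on the next pass; one unit is
 * consumed by the route fill above and the walker counts off the rest
 * before it resumes emitting entries.
 */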
static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
					const struct nlmsghdr *nlh,
					struct nlattr **tb,
					struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	int i, err;

	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
		NL_SET_ERR_MSG_MOD(extack,
				   "Invalid header for get route request");
		return -EINVAL;
	}

	if (!netlink_strict_get_check(skb))
		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
					      rtm_ipv6_policy, extack);

	rtm = nlmsg_data(nlh);
	if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
	    rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
	    rtm->rtm_type) {
		NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
		return -EINVAL;
	}
	if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
		NL_SET_ERR_MSG_MOD(extack,
				   "Invalid flags for get route request");
		return -EINVAL;
	}

	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
					    rtm_ipv6_policy, extack);
	if (err)
		return err;

	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
		NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
		return -EINVAL;
	}

	for (i = 0; i <= RTA_MAX; i++) {
		if (!tb[i])
			continue;

		switch (i) {
		case RTA_SRC:
		case RTA_DST:
		case RTA_IIF:
		case RTA_OIF:
		case RTA_MARK:
		case RTA_UID:
		case RTA_SPORT:
		case RTA_DPORT:
		case RTA_IP_PROTO:
			break;
		default:
			NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
			return -EINVAL;
		}
	}

	return 0;
}
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct fib6_info *from;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6 = {};
	bool fibmatch;

	err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (tb[RTA_SPORT])
		fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);

	if (tb[RTA_DPORT])
		fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);

	if (tb[RTA_IP_PROTO]) {
		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
						  &fl6.flowi6_proto, AF_INET6,
						  extack);
		if (err)
			goto errout;
	}

	if (iif) {
		struct net_device *dev;
		int flags = 0;

		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			rcu_read_unlock();
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);

		rcu_read_unlock();
	} else {
		fl6.flowi6_oif = oif;

		dst = ip6_route_output(net, NULL, &fl6);
	}


	rt = container_of(dst, struct rt6_info, dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	skb_dst_set(skb, &rt->dst);

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	if (from) {
		if (fibmatch)
			err = rt6_fill_node(net, skb, from, NULL, NULL, NULL,
					    iif, RTM_NEWROUTE,
					    NETLINK_CB(in_skb).portid,
					    nlh->nlmsg_seq, 0);
		else
			err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
					    &fl6.saddr, iif, RTM_NEWROUTE,
					    NETLINK_CB(in_skb).portid,
					    nlh->nlmsg_seq, 0);
	} else {
		err = -ENETUNREACH;
	}
	rcu_read_unlock();

	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}
void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
		     unsigned int nlm_flags)
{
	struct sk_buff *skb;
	struct net *net = info->nl_net;
	u32 seq;
	int err;

	err = -ENOBUFS;
	seq = info->nlh ? info->nlh->nlmsg_seq : 0;

	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
	if (!skb)
		goto errout;

	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
			    event, info->portid, seq, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
errout:
	rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}
void fib6_rt_update(struct net *net, struct fib6_info *rt,
		    struct nl_info *info)
{
	u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
	struct sk_buff *skb;
	int err = -ENOBUFS;

	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
	if (!skb)
		goto errout;

	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
			    RTM_NEWROUTE, info->portid, seq, NLM_F_REPLACE);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
errout:
	rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}
static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		net->ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = dev;
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	} else if (event == NETDEV_UNREGISTER &&
		   dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER could be fired for multiple times by
		 * netdev_wait_allrefs(). Make sure we only call this once.
		 */
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}
#ifdef CONFIG_PROC_FS
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;

	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}
#endif	/* CONFIG_PROC_FS */
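
/* For illustration: the seven hex fields of /proc/net/rt6_stats are, in
 * order, FIB nodes, route nodes, allocated fib6_info, route entries,
 * cached routes, dst entries in use, and discarded routes, matching the
 * seq_printf() above.
 */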
#ifdef CONFIG_SYSCTL

static int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
				     void *buffer, size_t *lenp, loff_t *ppos)
{
	struct net *net;
	int delay;
	int ret;

	if (!write)
		return -EINVAL;

	net = (struct net *)ctl->extra1;
	delay = net->ipv6.sysctl.flush_delay;
	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
	if (ret)
		return ret;

	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
	return 0;
}
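
/* Usage sketch: the handler is write-only, so
 *   echo 1 > /proc/sys/net/ipv6/route/flush
 * kicks fib6_run_gc() with the previously configured delay, while a read
 * of the file returns -EINVAL.
 */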
static struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{
		.procname	=	"skip_notify_on_dev_down",
		.data		=	&init_net.ipv6.sysctl.skip_notify_on_dev_down,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_minmax,
		.extra1		=	SYSCTL_ZERO,
		.extra2		=	SYSCTL_ONE,
	},
	{ }
};
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
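
/* Maintenance note (editorial, not from the original source): the
 * table[N].data fixups above are positional and must stay in sync with
 * the entry order in ipv6_route_table_template; inserting a template
 * entry without renumbering this function silently rewires sysctls.
 */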
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.fib6_null_entry = fib6_info_alloc(GFP_KERNEL, true);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;
	memcpy(net->ipv6.fib6_null_entry, &fib6_null_entry_template,
	       sizeof(*net->ipv6.fib6_null_entry));

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);
	INIT_LIST_HEAD(&net->ipv6.ip6_null_entry->rt6i_uncached);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);
	INIT_LIST_HEAD(&net->ipv6.ip6_prohibit_entry->rt6i_uncached);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
	INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->rt6i_uncached);
#ifdef CONFIG_IPV6_SUBTREES
	net->ipv6.fib6_routes_require_src = 0;
#endif
#endif

	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
	net->ipv6.sysctl.skip_notify_on_dev_down = 0;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
			sizeof(struct ipv6_route_iter));
	proc_create_net_single("rt6_stats", 0444, net->proc_net,
			       rt6_stats_seq_show, NULL);
#endif
	return 0;
}
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
static int __net_init ipv6_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv6.peers = bp;
	return 0;
}
static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	net->ipv6.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}
static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
void __init ip6_route_init_special_entries(void)
{
	/* Registering of the loopback is done before this portion of code,
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for init_net */
	init_net.ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#endif
}
#if IS_BUILTIN(CONFIG_IPV6)
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
DEFINE_BPF_ITER_FUNC(ipv6_route, struct bpf_iter_meta *meta, struct fib6_info *rt)

BTF_ID_LIST(btf_fib6_info_id)
BTF_ID(struct, fib6_info)

static const struct bpf_iter_seq_info ipv6_route_seq_info = {
	.seq_ops		= &ipv6_route_seq_ops,
	.init_seq_private	= bpf_iter_init_seq_net,
	.fini_seq_private	= bpf_iter_fini_seq_net,
	.seq_priv_size		= sizeof(struct ipv6_route_iter),
};

static struct bpf_iter_reg ipv6_route_reg_info = {
	.target			= "ipv6_route",
	.ctx_arg_info_size	= 1,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__ipv6_route, rt),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.seq_info		= &ipv6_route_seq_info,
};

static int __init bpf_iter_register(void)
{
	ipv6_route_reg_info.ctx_arg_info[0].btf_id = *btf_fib6_info_id;
	return bpf_iter_reg_target(&ipv6_route_reg_info);
}

static void bpf_iter_unregister(void)
{
	bpf_iter_unreg_target(&ipv6_route_reg_info);
}
#endif
#endif
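
/* For illustration: with this registration in place, a BPF program of
 * type BPF_TRACE_ITER attached to the "ipv6_route" target (for example
 * via bpftool's iterator support) is invoked once per fib6_info,
 * receiving the rt pointer declared in DEFINE_BPF_ITER_FUNC above.
 */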
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

#if IS_BUILTIN(CONFIG_IPV6)
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
	ret = bpf_iter_register();
	if (ret)
		goto out_register_late_subsys;
#endif
#endif

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
void ip6_route_cleanup(void)
{
#if IS_BUILTIN(CONFIG_IPV6)
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
	bpf_iter_unregister();
#endif
#endif
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}