1 // SPDX-License-Identifier: GPL-2.0-or-later
3 * Linux INET6 implementation
7 * Pedro Roque <roque@di.fc.ul.pt>
12 * YOSHIFUJI Hideaki @USAGI
13 * reworked default router selection.
14 * - respect outgoing interface
15 * - select from (probably) reachable routers (i.e.
16 * routers in REACHABLE, STALE, DELAY or PROBE states).
17 * - always select the same router if it is (probably)
18 * reachable. otherwise, round-robin the list.
20 * Fixed routing subtrees.
23 #define pr_fmt(fmt) "IPv6: " fmt
25 #include <linux/capability.h>
26 #include <linux/errno.h>
27 #include <linux/export.h>
28 #include <linux/types.h>
29 #include <linux/times.h>
30 #include <linux/socket.h>
31 #include <linux/sockios.h>
32 #include <linux/net.h>
33 #include <linux/route.h>
34 #include <linux/netdevice.h>
35 #include <linux/in6.h>
36 #include <linux/mroute6.h>
37 #include <linux/init.h>
38 #include <linux/if_arp.h>
39 #include <linux/proc_fs.h>
40 #include <linux/seq_file.h>
41 #include <linux/nsproxy.h>
42 #include <linux/slab.h>
43 #include <linux/jhash.h>
44 #include <linux/siphash.h>
45 #include <net/net_namespace.h>
48 #include <net/ip6_fib.h>
49 #include <net/ip6_route.h>
50 #include <net/ndisc.h>
51 #include <net/addrconf.h>
53 #include <linux/rtnetlink.h>
55 #include <net/dst_metadata.h>
57 #include <net/netevent.h>
58 #include <net/netlink.h>
60 #include <net/lwtunnel.h>
61 #include <net/ip_tunnels.h>
62 #include <net/l3mdev.h>
64 #include <linux/uaccess.h>
65 #include <linux/btf_ids.h>
68 #include <linux/sysctl.h>
71 static int ip6_rt_type_to_error(u8 fib6_type
);
73 #define CREATE_TRACE_POINTS
74 #include <trace/events/fib6.h>
75 EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup
);
76 #undef CREATE_TRACE_POINTS
79 RT6_NUD_FAIL_HARD
= -3,
80 RT6_NUD_FAIL_PROBE
= -2,
81 RT6_NUD_FAIL_DO_RR
= -1,
85 INDIRECT_CALLABLE_SCOPE
86 struct dst_entry
*ip6_dst_check(struct dst_entry
*dst
, u32 cookie
);
87 static unsigned int ip6_default_advmss(const struct dst_entry
*dst
);
88 INDIRECT_CALLABLE_SCOPE
89 unsigned int ip6_mtu(const struct dst_entry
*dst
);
90 static void ip6_negative_advice(struct sock
*sk
,
91 struct dst_entry
*dst
);
92 static void ip6_dst_destroy(struct dst_entry
*);
93 static void ip6_dst_ifdown(struct dst_entry
*,
94 struct net_device
*dev
);
95 static void ip6_dst_gc(struct dst_ops
*ops
);
97 static int ip6_pkt_discard(struct sk_buff
*skb
);
98 static int ip6_pkt_discard_out(struct net
*net
, struct sock
*sk
, struct sk_buff
*skb
);
99 static int ip6_pkt_prohibit(struct sk_buff
*skb
);
100 static int ip6_pkt_prohibit_out(struct net
*net
, struct sock
*sk
, struct sk_buff
*skb
);
101 static void ip6_link_failure(struct sk_buff
*skb
);
102 static void ip6_rt_update_pmtu(struct dst_entry
*dst
, struct sock
*sk
,
103 struct sk_buff
*skb
, u32 mtu
,
105 static void rt6_do_redirect(struct dst_entry
*dst
, struct sock
*sk
,
106 struct sk_buff
*skb
);
107 static int rt6_score_route(const struct fib6_nh
*nh
, u32 fib6_flags
, int oif
,
109 static size_t rt6_nlmsg_size(struct fib6_info
*f6i
);
110 static int rt6_fill_node(struct net
*net
, struct sk_buff
*skb
,
111 struct fib6_info
*rt
, struct dst_entry
*dst
,
112 struct in6_addr
*dest
, struct in6_addr
*src
,
113 int iif
, int type
, u32 portid
, u32 seq
,
115 static struct rt6_info
*rt6_find_cached_rt(const struct fib6_result
*res
,
116 const struct in6_addr
*daddr
,
117 const struct in6_addr
*saddr
);
119 #ifdef CONFIG_IPV6_ROUTE_INFO
120 static struct fib6_info
*rt6_add_route_info(struct net
*net
,
121 const struct in6_addr
*prefix
, int prefixlen
,
122 const struct in6_addr
*gwaddr
,
123 struct net_device
*dev
,
125 static struct fib6_info
*rt6_get_route_info(struct net
*net
,
126 const struct in6_addr
*prefix
, int prefixlen
,
127 const struct in6_addr
*gwaddr
,
128 struct net_device
*dev
);
131 struct uncached_list
{
133 struct list_head head
;
136 static DEFINE_PER_CPU_ALIGNED(struct uncached_list
, rt6_uncached_list
);
138 void rt6_uncached_list_add(struct rt6_info
*rt
)
140 struct uncached_list
*ul
= raw_cpu_ptr(&rt6_uncached_list
);
142 rt
->dst
.rt_uncached_list
= ul
;
144 spin_lock_bh(&ul
->lock
);
145 list_add_tail(&rt
->dst
.rt_uncached
, &ul
->head
);
146 spin_unlock_bh(&ul
->lock
);
149 void rt6_uncached_list_del(struct rt6_info
*rt
)
151 if (!list_empty(&rt
->dst
.rt_uncached
)) {
152 struct uncached_list
*ul
= rt
->dst
.rt_uncached_list
;
154 spin_lock_bh(&ul
->lock
);
155 list_del_init(&rt
->dst
.rt_uncached
);
156 spin_unlock_bh(&ul
->lock
);
160 static void rt6_uncached_list_flush_dev(struct net_device
*dev
)
164 for_each_possible_cpu(cpu
) {
165 struct uncached_list
*ul
= per_cpu_ptr(&rt6_uncached_list
, cpu
);
166 struct rt6_info
*rt
, *safe
;
168 if (list_empty(&ul
->head
))
171 spin_lock_bh(&ul
->lock
);
172 list_for_each_entry_safe(rt
, safe
, &ul
->head
, dst
.rt_uncached
) {
173 struct inet6_dev
*rt_idev
= rt
->rt6i_idev
;
174 struct net_device
*rt_dev
= rt
->dst
.dev
;
175 bool handled
= false;
177 if (rt_idev
&& rt_idev
->dev
== dev
) {
178 rt
->rt6i_idev
= in6_dev_get(blackhole_netdev
);
179 in6_dev_put(rt_idev
);
184 rt
->dst
.dev
= blackhole_netdev
;
185 netdev_ref_replace(rt_dev
, blackhole_netdev
,
186 &rt
->dst
.dev_tracker
,
191 list_del_init(&rt
->dst
.rt_uncached
);
193 spin_unlock_bh(&ul
->lock
);
197 static inline const void *choose_neigh_daddr(const struct in6_addr
*p
,
201 if (!ipv6_addr_any(p
))
202 return (const void *) p
;
204 return &ipv6_hdr(skb
)->daddr
;
208 struct neighbour
*ip6_neigh_lookup(const struct in6_addr
*gw
,
209 struct net_device
*dev
,
215 daddr
= choose_neigh_daddr(gw
, skb
, daddr
);
216 n
= __ipv6_neigh_lookup(dev
, daddr
);
220 n
= neigh_create(&nd_tbl
, daddr
, dev
);
221 return IS_ERR(n
) ? NULL
: n
;
224 static struct neighbour
*ip6_dst_neigh_lookup(const struct dst_entry
*dst
,
228 const struct rt6_info
*rt
= dst_rt6_info(dst
);
230 return ip6_neigh_lookup(rt6_nexthop(rt
, &in6addr_any
),
231 dst
->dev
, skb
, daddr
);
234 static void ip6_confirm_neigh(const struct dst_entry
*dst
, const void *daddr
)
236 const struct rt6_info
*rt
= dst_rt6_info(dst
);
237 struct net_device
*dev
= dst
->dev
;
239 daddr
= choose_neigh_daddr(rt6_nexthop(rt
, &in6addr_any
), NULL
, daddr
);
242 if (dev
->flags
& (IFF_NOARP
| IFF_LOOPBACK
))
244 if (ipv6_addr_is_multicast((const struct in6_addr
*)daddr
))
246 __ipv6_confirm_neigh(dev
, daddr
);
249 static struct dst_ops ip6_dst_ops_template
= {
253 .check
= ip6_dst_check
,
254 .default_advmss
= ip6_default_advmss
,
256 .cow_metrics
= dst_cow_metrics_generic
,
257 .destroy
= ip6_dst_destroy
,
258 .ifdown
= ip6_dst_ifdown
,
259 .negative_advice
= ip6_negative_advice
,
260 .link_failure
= ip6_link_failure
,
261 .update_pmtu
= ip6_rt_update_pmtu
,
262 .redirect
= rt6_do_redirect
,
263 .local_out
= __ip6_local_out
,
264 .neigh_lookup
= ip6_dst_neigh_lookup
,
265 .confirm_neigh
= ip6_confirm_neigh
,
268 static struct dst_ops ip6_dst_blackhole_ops
= {
270 .default_advmss
= ip6_default_advmss
,
271 .neigh_lookup
= ip6_dst_neigh_lookup
,
272 .check
= ip6_dst_check
,
273 .destroy
= ip6_dst_destroy
,
274 .cow_metrics
= dst_cow_metrics_generic
,
275 .update_pmtu
= dst_blackhole_update_pmtu
,
276 .redirect
= dst_blackhole_redirect
,
277 .mtu
= dst_blackhole_mtu
,
280 static const u32 ip6_template_metrics
[RTAX_MAX
] = {
281 [RTAX_HOPLIMIT
- 1] = 0,
284 static const struct fib6_info fib6_null_entry_template
= {
285 .fib6_flags
= (RTF_REJECT
| RTF_NONEXTHOP
),
286 .fib6_protocol
= RTPROT_KERNEL
,
287 .fib6_metric
= ~(u32
)0,
288 .fib6_ref
= REFCOUNT_INIT(1),
289 .fib6_type
= RTN_UNREACHABLE
,
290 .fib6_metrics
= (struct dst_metrics
*)&dst_default_metrics
,
293 static const struct rt6_info ip6_null_entry_template
= {
295 .__rcuref
= RCUREF_INIT(1),
297 .obsolete
= DST_OBSOLETE_FORCE_CHK
,
298 .error
= -ENETUNREACH
,
299 .input
= ip6_pkt_discard
,
300 .output
= ip6_pkt_discard_out
,
302 .rt6i_flags
= (RTF_REJECT
| RTF_NONEXTHOP
),
305 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
307 static const struct rt6_info ip6_prohibit_entry_template
= {
309 .__rcuref
= RCUREF_INIT(1),
311 .obsolete
= DST_OBSOLETE_FORCE_CHK
,
313 .input
= ip6_pkt_prohibit
,
314 .output
= ip6_pkt_prohibit_out
,
316 .rt6i_flags
= (RTF_REJECT
| RTF_NONEXTHOP
),
319 static const struct rt6_info ip6_blk_hole_entry_template
= {
321 .__rcuref
= RCUREF_INIT(1),
323 .obsolete
= DST_OBSOLETE_FORCE_CHK
,
325 .input
= dst_discard
,
326 .output
= dst_discard_out
,
328 .rt6i_flags
= (RTF_REJECT
| RTF_NONEXTHOP
),
333 static void rt6_info_init(struct rt6_info
*rt
)
335 memset_after(rt
, 0, dst
);
338 /* allocate dst with ip6_dst_ops */
339 struct rt6_info
*ip6_dst_alloc(struct net
*net
, struct net_device
*dev
,
342 struct rt6_info
*rt
= dst_alloc(&net
->ipv6
.ip6_dst_ops
, dev
,
343 DST_OBSOLETE_FORCE_CHK
, flags
);
347 atomic_inc(&net
->ipv6
.rt6_stats
->fib_rt_alloc
);
352 EXPORT_SYMBOL(ip6_dst_alloc
);
354 static void ip6_dst_destroy(struct dst_entry
*dst
)
356 struct rt6_info
*rt
= dst_rt6_info(dst
);
357 struct fib6_info
*from
;
358 struct inet6_dev
*idev
;
360 ip_dst_metrics_put(dst
);
361 rt6_uncached_list_del(rt
);
363 idev
= rt
->rt6i_idev
;
365 rt
->rt6i_idev
= NULL
;
369 from
= unrcu_pointer(xchg(&rt
->from
, NULL
));
370 fib6_info_release(from
);
373 static void ip6_dst_ifdown(struct dst_entry
*dst
, struct net_device
*dev
)
375 struct rt6_info
*rt
= dst_rt6_info(dst
);
376 struct inet6_dev
*idev
= rt
->rt6i_idev
;
377 struct fib6_info
*from
;
379 if (idev
&& idev
->dev
!= blackhole_netdev
) {
380 struct inet6_dev
*blackhole_idev
= in6_dev_get(blackhole_netdev
);
382 if (blackhole_idev
) {
383 rt
->rt6i_idev
= blackhole_idev
;
387 from
= unrcu_pointer(xchg(&rt
->from
, NULL
));
388 fib6_info_release(from
);
391 static bool __rt6_check_expired(const struct rt6_info
*rt
)
393 if (rt
->rt6i_flags
& RTF_EXPIRES
)
394 return time_after(jiffies
, rt
->dst
.expires
);
399 static bool rt6_check_expired(const struct rt6_info
*rt
)
401 struct fib6_info
*from
;
403 from
= rcu_dereference(rt
->from
);
405 if (rt
->rt6i_flags
& RTF_EXPIRES
) {
406 if (time_after(jiffies
, rt
->dst
.expires
))
409 return rt
->dst
.obsolete
!= DST_OBSOLETE_FORCE_CHK
||
410 fib6_check_expired(from
);
415 void fib6_select_path(const struct net
*net
, struct fib6_result
*res
,
416 struct flowi6
*fl6
, int oif
, bool have_oif_match
,
417 const struct sk_buff
*skb
, int strict
)
419 struct fib6_info
*match
= res
->f6i
;
420 struct fib6_info
*sibling
;
422 if (!match
->nh
&& (!match
->fib6_nsiblings
|| have_oif_match
))
425 if (match
->nh
&& have_oif_match
&& res
->nh
)
429 IP6CB(skb
)->flags
|= IP6SKB_MULTIPATH
;
431 /* We might have already computed the hash for ICMPv6 errors. In such
432 * case it will always be non-zero. Otherwise now is the time to do it.
435 (!match
->nh
|| nexthop_is_multipath(match
->nh
)))
436 fl6
->mp_hash
= rt6_multipath_hash(net
, fl6
, skb
, NULL
);
438 if (unlikely(match
->nh
)) {
439 nexthop_path_fib6_result(res
, fl6
->mp_hash
);
443 if (fl6
->mp_hash
<= atomic_read(&match
->fib6_nh
->fib_nh_upper_bound
))
446 list_for_each_entry_rcu(sibling
, &match
->fib6_siblings
,
448 const struct fib6_nh
*nh
= sibling
->fib6_nh
;
451 nh_upper_bound
= atomic_read(&nh
->fib_nh_upper_bound
);
452 if (fl6
->mp_hash
> nh_upper_bound
)
454 if (rt6_score_route(nh
, sibling
->fib6_flags
, oif
, strict
) < 0)
462 res
->nh
= match
->fib6_nh
;
466 * Route lookup. rcu_read_lock() should be held.
469 static bool __rt6_device_match(struct net
*net
, const struct fib6_nh
*nh
,
470 const struct in6_addr
*saddr
, int oif
, int flags
)
472 const struct net_device
*dev
;
474 if (nh
->fib_nh_flags
& RTNH_F_DEAD
)
477 dev
= nh
->fib_nh_dev
;
479 if (dev
->ifindex
== oif
)
482 if (ipv6_chk_addr(net
, saddr
, dev
,
483 flags
& RT6_LOOKUP_F_IFACE
))
490 struct fib6_nh_dm_arg
{
492 const struct in6_addr
*saddr
;
498 static int __rt6_nh_dev_match(struct fib6_nh
*nh
, void *_arg
)
500 struct fib6_nh_dm_arg
*arg
= _arg
;
503 return __rt6_device_match(arg
->net
, nh
, arg
->saddr
, arg
->oif
,
507 /* returns fib6_nh from nexthop or NULL */
508 static struct fib6_nh
*rt6_nh_dev_match(struct net
*net
, struct nexthop
*nh
,
509 struct fib6_result
*res
,
510 const struct in6_addr
*saddr
,
513 struct fib6_nh_dm_arg arg
= {
520 if (nexthop_is_blackhole(nh
))
523 if (nexthop_for_each_fib6_nh(nh
, __rt6_nh_dev_match
, &arg
))
529 static void rt6_device_match(struct net
*net
, struct fib6_result
*res
,
530 const struct in6_addr
*saddr
, int oif
, int flags
)
532 struct fib6_info
*f6i
= res
->f6i
;
533 struct fib6_info
*spf6i
;
536 if (!oif
&& ipv6_addr_any(saddr
)) {
537 if (unlikely(f6i
->nh
)) {
538 nh
= nexthop_fib6_nh(f6i
->nh
);
539 if (nexthop_is_blackhole(f6i
->nh
))
544 if (!(nh
->fib_nh_flags
& RTNH_F_DEAD
))
548 for (spf6i
= f6i
; spf6i
; spf6i
= rcu_dereference(spf6i
->fib6_next
)) {
549 bool matched
= false;
551 if (unlikely(spf6i
->nh
)) {
552 nh
= rt6_nh_dev_match(net
, spf6i
->nh
, res
, saddr
,
558 if (__rt6_device_match(net
, nh
, saddr
, oif
, flags
))
567 if (oif
&& flags
& RT6_LOOKUP_F_IFACE
) {
568 res
->f6i
= net
->ipv6
.fib6_null_entry
;
569 nh
= res
->f6i
->fib6_nh
;
573 if (unlikely(f6i
->nh
)) {
574 nh
= nexthop_fib6_nh(f6i
->nh
);
575 if (nexthop_is_blackhole(f6i
->nh
))
581 if (nh
->fib_nh_flags
& RTNH_F_DEAD
) {
582 res
->f6i
= net
->ipv6
.fib6_null_entry
;
583 nh
= res
->f6i
->fib6_nh
;
587 res
->fib6_type
= res
->f6i
->fib6_type
;
588 res
->fib6_flags
= res
->f6i
->fib6_flags
;
592 res
->fib6_flags
|= RTF_REJECT
;
593 res
->fib6_type
= RTN_BLACKHOLE
;
597 #ifdef CONFIG_IPV6_ROUTER_PREF
598 struct __rt6_probe_work
{
599 struct work_struct work
;
600 struct in6_addr target
;
601 struct net_device
*dev
;
602 netdevice_tracker dev_tracker
;
605 static void rt6_probe_deferred(struct work_struct
*w
)
607 struct in6_addr mcaddr
;
608 struct __rt6_probe_work
*work
=
609 container_of(w
, struct __rt6_probe_work
, work
);
611 addrconf_addr_solict_mult(&work
->target
, &mcaddr
);
612 ndisc_send_ns(work
->dev
, &work
->target
, &mcaddr
, NULL
, 0);
613 netdev_put(work
->dev
, &work
->dev_tracker
);
617 static void rt6_probe(struct fib6_nh
*fib6_nh
)
619 struct __rt6_probe_work
*work
= NULL
;
620 const struct in6_addr
*nh_gw
;
621 unsigned long last_probe
;
622 struct neighbour
*neigh
;
623 struct net_device
*dev
;
624 struct inet6_dev
*idev
;
627 * Okay, this does not seem to be appropriate
628 * for now, however, we need to check if it
629 * is really so; aka Router Reachability Probing.
631 * Router Reachability Probe MUST be rate-limited
632 * to no more than one per minute.
634 if (!fib6_nh
->fib_nh_gw_family
)
637 nh_gw
= &fib6_nh
->fib_nh_gw6
;
638 dev
= fib6_nh
->fib_nh_dev
;
640 last_probe
= READ_ONCE(fib6_nh
->last_probe
);
641 idev
= __in6_dev_get(dev
);
644 neigh
= __ipv6_neigh_lookup_noref(dev
, nh_gw
);
646 if (READ_ONCE(neigh
->nud_state
) & NUD_VALID
)
649 write_lock_bh(&neigh
->lock
);
650 if (!(neigh
->nud_state
& NUD_VALID
) &&
653 READ_ONCE(idev
->cnf
.rtr_probe_interval
))) {
654 work
= kmalloc(sizeof(*work
), GFP_ATOMIC
);
656 __neigh_set_probe_once(neigh
);
658 write_unlock_bh(&neigh
->lock
);
659 } else if (time_after(jiffies
, last_probe
+
660 READ_ONCE(idev
->cnf
.rtr_probe_interval
))) {
661 work
= kmalloc(sizeof(*work
), GFP_ATOMIC
);
664 if (!work
|| cmpxchg(&fib6_nh
->last_probe
,
665 last_probe
, jiffies
) != last_probe
) {
668 INIT_WORK(&work
->work
, rt6_probe_deferred
);
669 work
->target
= *nh_gw
;
670 netdev_hold(dev
, &work
->dev_tracker
, GFP_ATOMIC
);
672 schedule_work(&work
->work
);
679 static inline void rt6_probe(struct fib6_nh
*fib6_nh
)
685 * Default Router Selection (RFC 2461 6.3.6)
687 static enum rt6_nud_state
rt6_check_neigh(const struct fib6_nh
*fib6_nh
)
689 enum rt6_nud_state ret
= RT6_NUD_FAIL_HARD
;
690 struct neighbour
*neigh
;
693 neigh
= __ipv6_neigh_lookup_noref(fib6_nh
->fib_nh_dev
,
694 &fib6_nh
->fib_nh_gw6
);
696 u8 nud_state
= READ_ONCE(neigh
->nud_state
);
698 if (nud_state
& NUD_VALID
)
699 ret
= RT6_NUD_SUCCEED
;
700 #ifdef CONFIG_IPV6_ROUTER_PREF
701 else if (!(nud_state
& NUD_FAILED
))
702 ret
= RT6_NUD_SUCCEED
;
704 ret
= RT6_NUD_FAIL_PROBE
;
707 ret
= IS_ENABLED(CONFIG_IPV6_ROUTER_PREF
) ?
708 RT6_NUD_SUCCEED
: RT6_NUD_FAIL_DO_RR
;
715 static int rt6_score_route(const struct fib6_nh
*nh
, u32 fib6_flags
, int oif
,
720 if (!oif
|| nh
->fib_nh_dev
->ifindex
== oif
)
723 if (!m
&& (strict
& RT6_LOOKUP_F_IFACE
))
724 return RT6_NUD_FAIL_HARD
;
725 #ifdef CONFIG_IPV6_ROUTER_PREF
726 m
|= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags
)) << 2;
728 if ((strict
& RT6_LOOKUP_F_REACHABLE
) &&
729 !(fib6_flags
& RTF_NONEXTHOP
) && nh
->fib_nh_gw_family
) {
730 int n
= rt6_check_neigh(nh
);
737 static bool find_match(struct fib6_nh
*nh
, u32 fib6_flags
,
738 int oif
, int strict
, int *mpri
, bool *do_rr
)
740 bool match_do_rr
= false;
744 if (nh
->fib_nh_flags
& RTNH_F_DEAD
)
747 if (ip6_ignore_linkdown(nh
->fib_nh_dev
) &&
748 nh
->fib_nh_flags
& RTNH_F_LINKDOWN
&&
749 !(strict
& RT6_LOOKUP_F_IGNORE_LINKSTATE
))
752 m
= rt6_score_route(nh
, fib6_flags
, oif
, strict
);
753 if (m
== RT6_NUD_FAIL_DO_RR
) {
755 m
= 0; /* lowest valid score */
756 } else if (m
== RT6_NUD_FAIL_HARD
) {
760 if (strict
& RT6_LOOKUP_F_REACHABLE
)
763 /* note that m can be RT6_NUD_FAIL_PROBE at this point */
765 *do_rr
= match_do_rr
;
773 struct fib6_nh_frl_arg
{
782 static int rt6_nh_find_match(struct fib6_nh
*nh
, void *_arg
)
784 struct fib6_nh_frl_arg
*arg
= _arg
;
787 return find_match(nh
, arg
->flags
, arg
->oif
, arg
->strict
,
788 arg
->mpri
, arg
->do_rr
);
791 static void __find_rr_leaf(struct fib6_info
*f6i_start
,
792 struct fib6_info
*nomatch
, u32 metric
,
793 struct fib6_result
*res
, struct fib6_info
**cont
,
794 int oif
, int strict
, bool *do_rr
, int *mpri
)
796 struct fib6_info
*f6i
;
798 for (f6i
= f6i_start
;
799 f6i
&& f6i
!= nomatch
;
800 f6i
= rcu_dereference(f6i
->fib6_next
)) {
801 bool matched
= false;
804 if (cont
&& f6i
->fib6_metric
!= metric
) {
809 if (fib6_check_expired(f6i
))
812 if (unlikely(f6i
->nh
)) {
813 struct fib6_nh_frl_arg arg
= {
814 .flags
= f6i
->fib6_flags
,
821 if (nexthop_is_blackhole(f6i
->nh
)) {
822 res
->fib6_flags
= RTF_REJECT
;
823 res
->fib6_type
= RTN_BLACKHOLE
;
825 res
->nh
= nexthop_fib6_nh(f6i
->nh
);
828 if (nexthop_for_each_fib6_nh(f6i
->nh
, rt6_nh_find_match
,
835 if (find_match(nh
, f6i
->fib6_flags
, oif
, strict
,
842 res
->fib6_flags
= f6i
->fib6_flags
;
843 res
->fib6_type
= f6i
->fib6_type
;
848 static void find_rr_leaf(struct fib6_node
*fn
, struct fib6_info
*leaf
,
849 struct fib6_info
*rr_head
, int oif
, int strict
,
850 bool *do_rr
, struct fib6_result
*res
)
852 u32 metric
= rr_head
->fib6_metric
;
853 struct fib6_info
*cont
= NULL
;
856 __find_rr_leaf(rr_head
, NULL
, metric
, res
, &cont
,
857 oif
, strict
, do_rr
, &mpri
);
859 __find_rr_leaf(leaf
, rr_head
, metric
, res
, &cont
,
860 oif
, strict
, do_rr
, &mpri
);
862 if (res
->f6i
|| !cont
)
865 __find_rr_leaf(cont
, NULL
, metric
, res
, NULL
,
866 oif
, strict
, do_rr
, &mpri
);
869 static void rt6_select(struct net
*net
, struct fib6_node
*fn
, int oif
,
870 struct fib6_result
*res
, int strict
)
872 struct fib6_info
*leaf
= rcu_dereference(fn
->leaf
);
873 struct fib6_info
*rt0
;
877 /* make sure this function or its helpers sets f6i */
880 if (!leaf
|| leaf
== net
->ipv6
.fib6_null_entry
)
883 rt0
= rcu_dereference(fn
->rr_ptr
);
887 /* Double check to make sure fn is not an intermediate node
888 * and fn->leaf does not points to its child's leaf
889 * (This might happen if all routes under fn are deleted from
890 * the tree and fib6_repair_tree() is called on the node.)
892 key_plen
= rt0
->fib6_dst
.plen
;
893 #ifdef CONFIG_IPV6_SUBTREES
894 if (rt0
->fib6_src
.plen
)
895 key_plen
= rt0
->fib6_src
.plen
;
897 if (fn
->fn_bit
!= key_plen
)
900 find_rr_leaf(fn
, leaf
, rt0
, oif
, strict
, &do_rr
, res
);
902 struct fib6_info
*next
= rcu_dereference(rt0
->fib6_next
);
904 /* no entries matched; do round-robin */
905 if (!next
|| next
->fib6_metric
!= rt0
->fib6_metric
)
909 spin_lock_bh(&leaf
->fib6_table
->tb6_lock
);
910 /* make sure next is not being deleted from the tree */
912 rcu_assign_pointer(fn
->rr_ptr
, next
);
913 spin_unlock_bh(&leaf
->fib6_table
->tb6_lock
);
919 res
->f6i
= net
->ipv6
.fib6_null_entry
;
920 res
->nh
= res
->f6i
->fib6_nh
;
921 res
->fib6_flags
= res
->f6i
->fib6_flags
;
922 res
->fib6_type
= res
->f6i
->fib6_type
;
926 static bool rt6_is_gw_or_nonexthop(const struct fib6_result
*res
)
928 return (res
->f6i
->fib6_flags
& RTF_NONEXTHOP
) ||
929 res
->nh
->fib_nh_gw_family
;
932 #ifdef CONFIG_IPV6_ROUTE_INFO
933 int rt6_route_rcv(struct net_device
*dev
, u8
*opt
, int len
,
934 const struct in6_addr
*gwaddr
)
936 struct net
*net
= dev_net(dev
);
937 struct route_info
*rinfo
= (struct route_info
*) opt
;
938 struct in6_addr prefix_buf
, *prefix
;
939 struct fib6_table
*table
;
941 unsigned long lifetime
;
942 struct fib6_info
*rt
;
944 if (len
< sizeof(struct route_info
)) {
948 /* Sanity check for prefix_len and length */
949 if (rinfo
->length
> 3) {
951 } else if (rinfo
->prefix_len
> 128) {
953 } else if (rinfo
->prefix_len
> 64) {
954 if (rinfo
->length
< 2) {
957 } else if (rinfo
->prefix_len
> 0) {
958 if (rinfo
->length
< 1) {
963 pref
= rinfo
->route_pref
;
964 if (pref
== ICMPV6_ROUTER_PREF_INVALID
)
967 lifetime
= addrconf_timeout_fixup(ntohl(rinfo
->lifetime
), HZ
);
969 if (rinfo
->length
== 3)
970 prefix
= (struct in6_addr
*)rinfo
->prefix
;
972 /* this function is safe */
973 ipv6_addr_prefix(&prefix_buf
,
974 (struct in6_addr
*)rinfo
->prefix
,
976 prefix
= &prefix_buf
;
979 if (rinfo
->prefix_len
== 0)
980 rt
= rt6_get_dflt_router(net
, gwaddr
, dev
);
982 rt
= rt6_get_route_info(net
, prefix
, rinfo
->prefix_len
,
985 if (rt
&& !lifetime
) {
986 ip6_del_rt(net
, rt
, false);
991 rt
= rt6_add_route_info(net
, prefix
, rinfo
->prefix_len
, gwaddr
,
994 rt
->fib6_flags
= RTF_ROUTEINFO
|
995 (rt
->fib6_flags
& ~RTF_PREF_MASK
) | RTF_PREF(pref
);
998 table
= rt
->fib6_table
;
999 spin_lock_bh(&table
->tb6_lock
);
1001 if (!addrconf_finite_timeout(lifetime
)) {
1002 fib6_clean_expires(rt
);
1003 fib6_remove_gc_list(rt
);
1005 fib6_set_expires(rt
, jiffies
+ HZ
* lifetime
);
1006 fib6_add_gc_list(rt
);
1009 spin_unlock_bh(&table
->tb6_lock
);
1011 fib6_info_release(rt
);
1018 * Misc support functions
1021 /* called with rcu_lock held */
1022 static struct net_device
*ip6_rt_get_dev_rcu(const struct fib6_result
*res
)
1024 struct net_device
*dev
= res
->nh
->fib_nh_dev
;
1026 if (res
->fib6_flags
& (RTF_LOCAL
| RTF_ANYCAST
)) {
1027 /* for copies of local routes, dst->dev needs to be the
1028 * device if it is a master device, the master device if
1029 * device is enslaved, and the loopback as the default
1031 if (netif_is_l3_slave(dev
) &&
1032 !rt6_need_strict(&res
->f6i
->fib6_dst
.addr
))
1033 dev
= l3mdev_master_dev_rcu(dev
);
1034 else if (!netif_is_l3_master(dev
))
1035 dev
= dev_net(dev
)->loopback_dev
;
1036 /* last case is netif_is_l3_master(dev) is true in which
1037 * case we want dev returned to be dev
1044 static const int fib6_prop
[RTN_MAX
+ 1] = {
1048 [RTN_BROADCAST
] = 0,
1050 [RTN_MULTICAST
] = 0,
1051 [RTN_BLACKHOLE
] = -EINVAL
,
1052 [RTN_UNREACHABLE
] = -EHOSTUNREACH
,
1053 [RTN_PROHIBIT
] = -EACCES
,
1054 [RTN_THROW
] = -EAGAIN
,
1055 [RTN_NAT
] = -EINVAL
,
1056 [RTN_XRESOLVE
] = -EINVAL
,
1059 static int ip6_rt_type_to_error(u8 fib6_type
)
1061 return fib6_prop
[fib6_type
];
1064 static unsigned short fib6_info_dst_flags(struct fib6_info
*rt
)
1066 unsigned short flags
= 0;
1068 if (rt
->dst_nocount
)
1069 flags
|= DST_NOCOUNT
;
1070 if (rt
->dst_nopolicy
)
1071 flags
|= DST_NOPOLICY
;
1076 static void ip6_rt_init_dst_reject(struct rt6_info
*rt
, u8 fib6_type
)
1078 rt
->dst
.error
= ip6_rt_type_to_error(fib6_type
);
1080 switch (fib6_type
) {
1082 rt
->dst
.output
= dst_discard_out
;
1083 rt
->dst
.input
= dst_discard
;
1086 rt
->dst
.output
= ip6_pkt_prohibit_out
;
1087 rt
->dst
.input
= ip6_pkt_prohibit
;
1090 case RTN_UNREACHABLE
:
1092 rt
->dst
.output
= ip6_pkt_discard_out
;
1093 rt
->dst
.input
= ip6_pkt_discard
;
1098 static void ip6_rt_init_dst(struct rt6_info
*rt
, const struct fib6_result
*res
)
1100 struct fib6_info
*f6i
= res
->f6i
;
1102 if (res
->fib6_flags
& RTF_REJECT
) {
1103 ip6_rt_init_dst_reject(rt
, res
->fib6_type
);
1108 rt
->dst
.output
= ip6_output
;
1110 if (res
->fib6_type
== RTN_LOCAL
|| res
->fib6_type
== RTN_ANYCAST
) {
1111 rt
->dst
.input
= ip6_input
;
1112 } else if (ipv6_addr_type(&f6i
->fib6_dst
.addr
) & IPV6_ADDR_MULTICAST
) {
1113 rt
->dst
.input
= ip6_mc_input
;
1115 rt
->dst
.input
= ip6_forward
;
1118 if (res
->nh
->fib_nh_lws
) {
1119 rt
->dst
.lwtstate
= lwtstate_get(res
->nh
->fib_nh_lws
);
1120 lwtunnel_set_redirect(&rt
->dst
);
1123 rt
->dst
.lastuse
= jiffies
;
1126 /* Caller must already hold reference to @from */
1127 static void rt6_set_from(struct rt6_info
*rt
, struct fib6_info
*from
)
1129 rt
->rt6i_flags
&= ~RTF_EXPIRES
;
1130 rcu_assign_pointer(rt
->from
, from
);
1131 ip_dst_init_metrics(&rt
->dst
, from
->fib6_metrics
);
1134 /* Caller must already hold reference to f6i in result */
1135 static void ip6_rt_copy_init(struct rt6_info
*rt
, const struct fib6_result
*res
)
1137 const struct fib6_nh
*nh
= res
->nh
;
1138 const struct net_device
*dev
= nh
->fib_nh_dev
;
1139 struct fib6_info
*f6i
= res
->f6i
;
1141 ip6_rt_init_dst(rt
, res
);
1143 rt
->rt6i_dst
= f6i
->fib6_dst
;
1144 rt
->rt6i_idev
= dev
? in6_dev_get(dev
) : NULL
;
1145 rt
->rt6i_flags
= res
->fib6_flags
;
1146 if (nh
->fib_nh_gw_family
) {
1147 rt
->rt6i_gateway
= nh
->fib_nh_gw6
;
1148 rt
->rt6i_flags
|= RTF_GATEWAY
;
1150 rt6_set_from(rt
, f6i
);
1151 #ifdef CONFIG_IPV6_SUBTREES
1152 rt
->rt6i_src
= f6i
->fib6_src
;
1156 static struct fib6_node
* fib6_backtrack(struct fib6_node
*fn
,
1157 struct in6_addr
*saddr
)
1159 struct fib6_node
*pn
, *sn
;
1161 if (fn
->fn_flags
& RTN_TL_ROOT
)
1163 pn
= rcu_dereference(fn
->parent
);
1164 sn
= FIB6_SUBTREE(pn
);
1166 fn
= fib6_node_lookup(sn
, NULL
, saddr
);
1169 if (fn
->fn_flags
& RTN_RTINFO
)
1174 static bool ip6_hold_safe(struct net
*net
, struct rt6_info
**prt
)
1176 struct rt6_info
*rt
= *prt
;
1178 if (dst_hold_safe(&rt
->dst
))
1181 rt
= net
->ipv6
.ip6_null_entry
;
1190 /* called with rcu_lock held */
1191 static struct rt6_info
*ip6_create_rt_rcu(const struct fib6_result
*res
)
1193 struct net_device
*dev
= res
->nh
->fib_nh_dev
;
1194 struct fib6_info
*f6i
= res
->f6i
;
1195 unsigned short flags
;
1196 struct rt6_info
*nrt
;
1198 if (!fib6_info_hold_safe(f6i
))
1201 flags
= fib6_info_dst_flags(f6i
);
1202 nrt
= ip6_dst_alloc(dev_net(dev
), dev
, flags
);
1204 fib6_info_release(f6i
);
1208 ip6_rt_copy_init(nrt
, res
);
1212 nrt
= dev_net(dev
)->ipv6
.ip6_null_entry
;
1213 dst_hold(&nrt
->dst
);
1217 INDIRECT_CALLABLE_SCOPE
struct rt6_info
*ip6_pol_route_lookup(struct net
*net
,
1218 struct fib6_table
*table
,
1220 const struct sk_buff
*skb
,
1223 struct fib6_result res
= {};
1224 struct fib6_node
*fn
;
1225 struct rt6_info
*rt
;
1228 fn
= fib6_node_lookup(&table
->tb6_root
, &fl6
->daddr
, &fl6
->saddr
);
1230 res
.f6i
= rcu_dereference(fn
->leaf
);
1232 res
.f6i
= net
->ipv6
.fib6_null_entry
;
1234 rt6_device_match(net
, &res
, &fl6
->saddr
, fl6
->flowi6_oif
,
1237 if (res
.f6i
== net
->ipv6
.fib6_null_entry
) {
1238 fn
= fib6_backtrack(fn
, &fl6
->saddr
);
1242 rt
= net
->ipv6
.ip6_null_entry
;
1245 } else if (res
.fib6_flags
& RTF_REJECT
) {
1249 fib6_select_path(net
, &res
, fl6
, fl6
->flowi6_oif
,
1250 fl6
->flowi6_oif
!= 0, skb
, flags
);
1252 /* Search through exception table */
1253 rt
= rt6_find_cached_rt(&res
, &fl6
->daddr
, &fl6
->saddr
);
1255 if (ip6_hold_safe(net
, &rt
))
1256 dst_use_noref(&rt
->dst
, jiffies
);
1259 rt
= ip6_create_rt_rcu(&res
);
1263 trace_fib6_table_lookup(net
, &res
, table
, fl6
);
1270 struct dst_entry
*ip6_route_lookup(struct net
*net
, struct flowi6
*fl6
,
1271 const struct sk_buff
*skb
, int flags
)
1273 return fib6_rule_lookup(net
, fl6
, skb
, flags
, ip6_pol_route_lookup
);
1275 EXPORT_SYMBOL_GPL(ip6_route_lookup
);
1277 struct rt6_info
*rt6_lookup(struct net
*net
, const struct in6_addr
*daddr
,
1278 const struct in6_addr
*saddr
, int oif
,
1279 const struct sk_buff
*skb
, int strict
)
1281 struct flowi6 fl6
= {
1285 struct dst_entry
*dst
;
1286 int flags
= strict
? RT6_LOOKUP_F_IFACE
: 0;
1289 memcpy(&fl6
.saddr
, saddr
, sizeof(*saddr
));
1290 flags
|= RT6_LOOKUP_F_HAS_SADDR
;
1293 dst
= fib6_rule_lookup(net
, &fl6
, skb
, flags
, ip6_pol_route_lookup
);
1294 if (dst
->error
== 0)
1295 return dst_rt6_info(dst
);
1301 EXPORT_SYMBOL(rt6_lookup
);
1303 /* ip6_ins_rt is called with FREE table->tb6_lock.
1304 * It takes new route entry, the addition fails by any reason the
1305 * route is released.
1306 * Caller must hold dst before calling it.
1309 static int __ip6_ins_rt(struct fib6_info
*rt
, struct nl_info
*info
,
1310 struct netlink_ext_ack
*extack
)
1313 struct fib6_table
*table
;
1315 table
= rt
->fib6_table
;
1316 spin_lock_bh(&table
->tb6_lock
);
1317 err
= fib6_add(&table
->tb6_root
, rt
, info
, extack
);
1318 spin_unlock_bh(&table
->tb6_lock
);
1323 int ip6_ins_rt(struct net
*net
, struct fib6_info
*rt
)
1325 struct nl_info info
= { .nl_net
= net
, };
1327 return __ip6_ins_rt(rt
, &info
, NULL
);
1330 static struct rt6_info
*ip6_rt_cache_alloc(const struct fib6_result
*res
,
1331 const struct in6_addr
*daddr
,
1332 const struct in6_addr
*saddr
)
1334 struct fib6_info
*f6i
= res
->f6i
;
1335 struct net_device
*dev
;
1336 struct rt6_info
*rt
;
1342 if (!fib6_info_hold_safe(f6i
))
1345 dev
= ip6_rt_get_dev_rcu(res
);
1346 rt
= ip6_dst_alloc(dev_net(dev
), dev
, 0);
1348 fib6_info_release(f6i
);
1352 ip6_rt_copy_init(rt
, res
);
1353 rt
->rt6i_flags
|= RTF_CACHE
;
1354 rt
->rt6i_dst
.addr
= *daddr
;
1355 rt
->rt6i_dst
.plen
= 128;
1357 if (!rt6_is_gw_or_nonexthop(res
)) {
1358 if (f6i
->fib6_dst
.plen
!= 128 &&
1359 ipv6_addr_equal(&f6i
->fib6_dst
.addr
, daddr
))
1360 rt
->rt6i_flags
|= RTF_ANYCAST
;
1361 #ifdef CONFIG_IPV6_SUBTREES
1362 if (rt
->rt6i_src
.plen
&& saddr
) {
1363 rt
->rt6i_src
.addr
= *saddr
;
1364 rt
->rt6i_src
.plen
= 128;
1372 static struct rt6_info
*ip6_rt_pcpu_alloc(const struct fib6_result
*res
)
1374 struct fib6_info
*f6i
= res
->f6i
;
1375 unsigned short flags
= fib6_info_dst_flags(f6i
);
1376 struct net_device
*dev
;
1377 struct rt6_info
*pcpu_rt
;
1379 if (!fib6_info_hold_safe(f6i
))
1383 dev
= ip6_rt_get_dev_rcu(res
);
1384 pcpu_rt
= ip6_dst_alloc(dev_net(dev
), dev
, flags
| DST_NOCOUNT
);
1387 fib6_info_release(f6i
);
1390 ip6_rt_copy_init(pcpu_rt
, res
);
1391 pcpu_rt
->rt6i_flags
|= RTF_PCPU
;
1394 pcpu_rt
->sernum
= rt_genid_ipv6(dev_net(dev
));
1399 static bool rt6_is_valid(const struct rt6_info
*rt6
)
1401 return rt6
->sernum
== rt_genid_ipv6(dev_net(rt6
->dst
.dev
));
1404 /* It should be called with rcu_read_lock() acquired */
1405 static struct rt6_info
*rt6_get_pcpu_route(const struct fib6_result
*res
)
1407 struct rt6_info
*pcpu_rt
;
1409 pcpu_rt
= this_cpu_read(*res
->nh
->rt6i_pcpu
);
1411 if (pcpu_rt
&& pcpu_rt
->sernum
&& !rt6_is_valid(pcpu_rt
)) {
1412 struct rt6_info
*prev
, **p
;
1414 p
= this_cpu_ptr(res
->nh
->rt6i_pcpu
);
1415 /* Paired with READ_ONCE() in __fib6_drop_pcpu_from() */
1416 prev
= xchg(p
, NULL
);
1418 dst_dev_put(&prev
->dst
);
1419 dst_release(&prev
->dst
);
1428 static struct rt6_info
*rt6_make_pcpu_route(struct net
*net
,
1429 const struct fib6_result
*res
)
1431 struct rt6_info
*pcpu_rt
, *prev
, **p
;
1433 pcpu_rt
= ip6_rt_pcpu_alloc(res
);
1437 p
= this_cpu_ptr(res
->nh
->rt6i_pcpu
);
1438 prev
= cmpxchg(p
, NULL
, pcpu_rt
);
1441 if (res
->f6i
->fib6_destroying
) {
1442 struct fib6_info
*from
;
1444 from
= unrcu_pointer(xchg(&pcpu_rt
->from
, NULL
));
1445 fib6_info_release(from
);
1451 /* exception hash table implementation
1453 static DEFINE_SPINLOCK(rt6_exception_lock
);
1455 /* Remove rt6_ex from hash table and free the memory
1456 * Caller must hold rt6_exception_lock
1458 static void rt6_remove_exception(struct rt6_exception_bucket
*bucket
,
1459 struct rt6_exception
*rt6_ex
)
1463 if (!bucket
|| !rt6_ex
)
1466 net
= dev_net(rt6_ex
->rt6i
->dst
.dev
);
1467 net
->ipv6
.rt6_stats
->fib_rt_cache
--;
1469 /* purge completely the exception to allow releasing the held resources:
1470 * some [sk] cache may keep the dst around for unlimited time
1472 dst_dev_put(&rt6_ex
->rt6i
->dst
);
1474 hlist_del_rcu(&rt6_ex
->hlist
);
1475 dst_release(&rt6_ex
->rt6i
->dst
);
1476 kfree_rcu(rt6_ex
, rcu
);
1477 WARN_ON_ONCE(!bucket
->depth
);
1481 /* Remove oldest rt6_ex in bucket and free the memory
1482 * Caller must hold rt6_exception_lock
1484 static void rt6_exception_remove_oldest(struct rt6_exception_bucket
*bucket
)
1486 struct rt6_exception
*rt6_ex
, *oldest
= NULL
;
1491 hlist_for_each_entry(rt6_ex
, &bucket
->chain
, hlist
) {
1492 if (!oldest
|| time_before(rt6_ex
->stamp
, oldest
->stamp
))
1495 rt6_remove_exception(bucket
, oldest
);
1498 static u32
rt6_exception_hash(const struct in6_addr
*dst
,
1499 const struct in6_addr
*src
)
1501 static siphash_aligned_key_t rt6_exception_key
;
1503 struct in6_addr dst
;
1504 struct in6_addr src
;
1505 } __aligned(SIPHASH_ALIGNMENT
) combined
= {
1510 net_get_random_once(&rt6_exception_key
, sizeof(rt6_exception_key
));
1512 #ifdef CONFIG_IPV6_SUBTREES
1514 combined
.src
= *src
;
1516 val
= siphash(&combined
, sizeof(combined
), &rt6_exception_key
);
1518 return hash_64(val
, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT
);
1521 /* Helper function to find the cached rt in the hash table
1522 * and update bucket pointer to point to the bucket for this
1523 * (daddr, saddr) pair
1524 * Caller must hold rt6_exception_lock
1526 static struct rt6_exception
*
1527 __rt6_find_exception_spinlock(struct rt6_exception_bucket
**bucket
,
1528 const struct in6_addr
*daddr
,
1529 const struct in6_addr
*saddr
)
1531 struct rt6_exception
*rt6_ex
;
1534 if (!(*bucket
) || !daddr
)
1537 hval
= rt6_exception_hash(daddr
, saddr
);
1540 hlist_for_each_entry(rt6_ex
, &(*bucket
)->chain
, hlist
) {
1541 struct rt6_info
*rt6
= rt6_ex
->rt6i
;
1542 bool matched
= ipv6_addr_equal(daddr
, &rt6
->rt6i_dst
.addr
);
1544 #ifdef CONFIG_IPV6_SUBTREES
1545 if (matched
&& saddr
)
1546 matched
= ipv6_addr_equal(saddr
, &rt6
->rt6i_src
.addr
);
1554 /* Helper function to find the cached rt in the hash table
1555 * and update bucket pointer to point to the bucket for this
1556 * (daddr, saddr) pair
1557 * Caller must hold rcu_read_lock()
1559 static struct rt6_exception
*
1560 __rt6_find_exception_rcu(struct rt6_exception_bucket
**bucket
,
1561 const struct in6_addr
*daddr
,
1562 const struct in6_addr
*saddr
)
1564 struct rt6_exception
*rt6_ex
;
1567 WARN_ON_ONCE(!rcu_read_lock_held());
1569 if (!(*bucket
) || !daddr
)
1572 hval
= rt6_exception_hash(daddr
, saddr
);
1575 hlist_for_each_entry_rcu(rt6_ex
, &(*bucket
)->chain
, hlist
) {
1576 struct rt6_info
*rt6
= rt6_ex
->rt6i
;
1577 bool matched
= ipv6_addr_equal(daddr
, &rt6
->rt6i_dst
.addr
);
1579 #ifdef CONFIG_IPV6_SUBTREES
1580 if (matched
&& saddr
)
1581 matched
= ipv6_addr_equal(saddr
, &rt6
->rt6i_src
.addr
);
1589 static unsigned int fib6_mtu(const struct fib6_result
*res
)
1591 const struct fib6_nh
*nh
= res
->nh
;
1594 if (res
->f6i
->fib6_pmtu
) {
1595 mtu
= res
->f6i
->fib6_pmtu
;
1597 struct net_device
*dev
= nh
->fib_nh_dev
;
1598 struct inet6_dev
*idev
;
1601 idev
= __in6_dev_get(dev
);
1602 mtu
= READ_ONCE(idev
->cnf
.mtu6
);
1606 mtu
= min_t(unsigned int, mtu
, IP6_MAX_MTU
);
1608 return mtu
- lwtunnel_headroom(nh
->fib_nh_lws
, mtu
);
1611 #define FIB6_EXCEPTION_BUCKET_FLUSHED 0x1UL
1613 /* used when the flushed bit is not relevant, only access to the bucket
1614 * (ie., all bucket users except rt6_insert_exception);
1616 * called under rcu lock; sometimes called with rt6_exception_lock held
1619 struct rt6_exception_bucket
*fib6_nh_get_excptn_bucket(const struct fib6_nh
*nh
,
1622 struct rt6_exception_bucket
*bucket
;
1625 bucket
= rcu_dereference_protected(nh
->rt6i_exception_bucket
,
1626 lockdep_is_held(lock
));
1628 bucket
= rcu_dereference(nh
->rt6i_exception_bucket
);
1630 /* remove bucket flushed bit if set */
1632 unsigned long p
= (unsigned long)bucket
;
1634 p
&= ~FIB6_EXCEPTION_BUCKET_FLUSHED
;
1635 bucket
= (struct rt6_exception_bucket
*)p
;
1641 static bool fib6_nh_excptn_bucket_flushed(struct rt6_exception_bucket
*bucket
)
1643 unsigned long p
= (unsigned long)bucket
;
1645 return !!(p
& FIB6_EXCEPTION_BUCKET_FLUSHED
);
1648 /* called with rt6_exception_lock held */
1649 static void fib6_nh_excptn_bucket_set_flushed(struct fib6_nh
*nh
,
1652 struct rt6_exception_bucket
*bucket
;
1655 bucket
= rcu_dereference_protected(nh
->rt6i_exception_bucket
,
1656 lockdep_is_held(lock
));
1658 p
= (unsigned long)bucket
;
1659 p
|= FIB6_EXCEPTION_BUCKET_FLUSHED
;
1660 bucket
= (struct rt6_exception_bucket
*)p
;
1661 rcu_assign_pointer(nh
->rt6i_exception_bucket
, bucket
);
1664 static int rt6_insert_exception(struct rt6_info
*nrt
,
1665 const struct fib6_result
*res
)
1667 struct net
*net
= dev_net(nrt
->dst
.dev
);
1668 struct rt6_exception_bucket
*bucket
;
1669 struct fib6_info
*f6i
= res
->f6i
;
1670 struct in6_addr
*src_key
= NULL
;
1671 struct rt6_exception
*rt6_ex
;
1672 struct fib6_nh
*nh
= res
->nh
;
1676 spin_lock_bh(&rt6_exception_lock
);
1678 bucket
= rcu_dereference_protected(nh
->rt6i_exception_bucket
,
1679 lockdep_is_held(&rt6_exception_lock
));
1681 bucket
= kcalloc(FIB6_EXCEPTION_BUCKET_SIZE
, sizeof(*bucket
),
1687 rcu_assign_pointer(nh
->rt6i_exception_bucket
, bucket
);
1688 } else if (fib6_nh_excptn_bucket_flushed(bucket
)) {
1693 #ifdef CONFIG_IPV6_SUBTREES
1694 /* fib6_src.plen != 0 indicates f6i is in subtree
1695 * and exception table is indexed by a hash of
1696 * both fib6_dst and fib6_src.
1697 * Otherwise, the exception table is indexed by
1698 * a hash of only fib6_dst.
1700 if (f6i
->fib6_src
.plen
)
1701 src_key
= &nrt
->rt6i_src
.addr
;
1703 /* rt6_mtu_change() might lower mtu on f6i.
1704 * Only insert this exception route if its mtu
1705 * is less than f6i's mtu value.
1707 if (dst_metric_raw(&nrt
->dst
, RTAX_MTU
) >= fib6_mtu(res
)) {
1712 rt6_ex
= __rt6_find_exception_spinlock(&bucket
, &nrt
->rt6i_dst
.addr
,
1715 rt6_remove_exception(bucket
, rt6_ex
);
1717 rt6_ex
= kzalloc(sizeof(*rt6_ex
), GFP_ATOMIC
);
1723 rt6_ex
->stamp
= jiffies
;
1724 hlist_add_head_rcu(&rt6_ex
->hlist
, &bucket
->chain
);
1726 net
->ipv6
.rt6_stats
->fib_rt_cache
++;
1728 /* Randomize max depth to avoid some side channels attacks. */
1729 max_depth
= FIB6_MAX_DEPTH
+ get_random_u32_below(FIB6_MAX_DEPTH
);
1730 while (bucket
->depth
> max_depth
)
1731 rt6_exception_remove_oldest(bucket
);
1734 spin_unlock_bh(&rt6_exception_lock
);
1736 /* Update fn->fn_sernum to invalidate all cached dst */
1738 spin_lock_bh(&f6i
->fib6_table
->tb6_lock
);
1739 fib6_update_sernum(net
, f6i
);
1740 spin_unlock_bh(&f6i
->fib6_table
->tb6_lock
);
1741 fib6_force_start_gc(net
);
1747 static void fib6_nh_flush_exceptions(struct fib6_nh
*nh
, struct fib6_info
*from
)
1749 struct rt6_exception_bucket
*bucket
;
1750 struct rt6_exception
*rt6_ex
;
1751 struct hlist_node
*tmp
;
1754 spin_lock_bh(&rt6_exception_lock
);
1756 bucket
= fib6_nh_get_excptn_bucket(nh
, &rt6_exception_lock
);
1760 /* Prevent rt6_insert_exception() to recreate the bucket list */
1762 fib6_nh_excptn_bucket_set_flushed(nh
, &rt6_exception_lock
);
1764 for (i
= 0; i
< FIB6_EXCEPTION_BUCKET_SIZE
; i
++) {
1765 hlist_for_each_entry_safe(rt6_ex
, tmp
, &bucket
->chain
, hlist
) {
1767 rcu_access_pointer(rt6_ex
->rt6i
->from
) == from
)
1768 rt6_remove_exception(bucket
, rt6_ex
);
1770 WARN_ON_ONCE(!from
&& bucket
->depth
);
1774 spin_unlock_bh(&rt6_exception_lock
);
1777 static int rt6_nh_flush_exceptions(struct fib6_nh
*nh
, void *arg
)
1779 struct fib6_info
*f6i
= arg
;
1781 fib6_nh_flush_exceptions(nh
, f6i
);
1786 void rt6_flush_exceptions(struct fib6_info
*f6i
)
1789 nexthop_for_each_fib6_nh(f6i
->nh
, rt6_nh_flush_exceptions
,
1792 fib6_nh_flush_exceptions(f6i
->fib6_nh
, f6i
);
1795 /* Find cached rt in the hash table inside passed in rt
1796 * Caller has to hold rcu_read_lock()
1798 static struct rt6_info
*rt6_find_cached_rt(const struct fib6_result
*res
,
1799 const struct in6_addr
*daddr
,
1800 const struct in6_addr
*saddr
)
1802 const struct in6_addr
*src_key
= NULL
;
1803 struct rt6_exception_bucket
*bucket
;
1804 struct rt6_exception
*rt6_ex
;
1805 struct rt6_info
*ret
= NULL
;
1807 #ifdef CONFIG_IPV6_SUBTREES
1808 /* fib6i_src.plen != 0 indicates f6i is in subtree
1809 * and exception table is indexed by a hash of
1810 * both fib6_dst and fib6_src.
1811 * However, the src addr used to create the hash
1812 * might not be exactly the passed in saddr which
1813 * is a /128 addr from the flow.
1814 * So we need to use f6i->fib6_src to redo lookup
1815 * if the passed in saddr does not find anything.
1816 * (See the logic in ip6_rt_cache_alloc() on how
1817 * rt->rt6i_src is updated.)
1819 if (res
->f6i
->fib6_src
.plen
)
1823 bucket
= fib6_nh_get_excptn_bucket(res
->nh
, NULL
);
1824 rt6_ex
= __rt6_find_exception_rcu(&bucket
, daddr
, src_key
);
1826 if (rt6_ex
&& !rt6_check_expired(rt6_ex
->rt6i
))
1829 #ifdef CONFIG_IPV6_SUBTREES
1830 /* Use fib6_src as src_key and redo lookup */
1831 if (!ret
&& src_key
&& src_key
!= &res
->f6i
->fib6_src
.addr
) {
1832 src_key
= &res
->f6i
->fib6_src
.addr
;
1840 /* Remove the passed in cached rt from the hash table that contains it */
1841 static int fib6_nh_remove_exception(const struct fib6_nh
*nh
, int plen
,
1842 const struct rt6_info
*rt
)
1844 const struct in6_addr
*src_key
= NULL
;
1845 struct rt6_exception_bucket
*bucket
;
1846 struct rt6_exception
*rt6_ex
;
1849 if (!rcu_access_pointer(nh
->rt6i_exception_bucket
))
1852 spin_lock_bh(&rt6_exception_lock
);
1853 bucket
= fib6_nh_get_excptn_bucket(nh
, &rt6_exception_lock
);
1855 #ifdef CONFIG_IPV6_SUBTREES
1856 /* rt6i_src.plen != 0 indicates 'from' is in subtree
1857 * and exception table is indexed by a hash of
1858 * both rt6i_dst and rt6i_src.
1859 * Otherwise, the exception table is indexed by
1860 * a hash of only rt6i_dst.
1863 src_key
= &rt
->rt6i_src
.addr
;
1865 rt6_ex
= __rt6_find_exception_spinlock(&bucket
,
1869 rt6_remove_exception(bucket
, rt6_ex
);
1875 spin_unlock_bh(&rt6_exception_lock
);
1879 struct fib6_nh_excptn_arg
{
1880 struct rt6_info
*rt
;
1884 static int rt6_nh_remove_exception_rt(struct fib6_nh
*nh
, void *_arg
)
1886 struct fib6_nh_excptn_arg
*arg
= _arg
;
1889 err
= fib6_nh_remove_exception(nh
, arg
->plen
, arg
->rt
);
1896 static int rt6_remove_exception_rt(struct rt6_info
*rt
)
1898 struct fib6_info
*from
;
1900 from
= rcu_dereference(rt
->from
);
1901 if (!from
|| !(rt
->rt6i_flags
& RTF_CACHE
))
1905 struct fib6_nh_excptn_arg arg
= {
1907 .plen
= from
->fib6_src
.plen
1911 /* rc = 1 means an entry was found */
1912 rc
= nexthop_for_each_fib6_nh(from
->nh
,
1913 rt6_nh_remove_exception_rt
,
1915 return rc
? 0 : -ENOENT
;
1918 return fib6_nh_remove_exception(from
->fib6_nh
,
1919 from
->fib6_src
.plen
, rt
);
1922 /* Find rt6_ex which contains the passed in rt cache and
1925 static void fib6_nh_update_exception(const struct fib6_nh
*nh
, int plen
,
1926 const struct rt6_info
*rt
)
1928 const struct in6_addr
*src_key
= NULL
;
1929 struct rt6_exception_bucket
*bucket
;
1930 struct rt6_exception
*rt6_ex
;
1932 bucket
= fib6_nh_get_excptn_bucket(nh
, NULL
);
1933 #ifdef CONFIG_IPV6_SUBTREES
1934 /* rt6i_src.plen != 0 indicates 'from' is in subtree
1935 * and exception table is indexed by a hash of
1936 * both rt6i_dst and rt6i_src.
1937 * Otherwise, the exception table is indexed by
1938 * a hash of only rt6i_dst.
1941 src_key
= &rt
->rt6i_src
.addr
;
1943 rt6_ex
= __rt6_find_exception_rcu(&bucket
, &rt
->rt6i_dst
.addr
, src_key
);
1945 rt6_ex
->stamp
= jiffies
;
1948 struct fib6_nh_match_arg
{
1949 const struct net_device
*dev
;
1950 const struct in6_addr
*gw
;
1951 struct fib6_nh
*match
;
1954 /* determine if fib6_nh has given device and gateway */
1955 static int fib6_nh_find_match(struct fib6_nh
*nh
, void *_arg
)
1957 struct fib6_nh_match_arg
*arg
= _arg
;
1959 if (arg
->dev
!= nh
->fib_nh_dev
||
1960 (arg
->gw
&& !nh
->fib_nh_gw_family
) ||
1961 (!arg
->gw
&& nh
->fib_nh_gw_family
) ||
1962 (arg
->gw
&& !ipv6_addr_equal(arg
->gw
, &nh
->fib_nh_gw6
)))
1967 /* found a match, break the loop */
1971 static void rt6_update_exception_stamp_rt(struct rt6_info
*rt
)
1973 struct fib6_info
*from
;
1974 struct fib6_nh
*fib6_nh
;
1978 from
= rcu_dereference(rt
->from
);
1979 if (!from
|| !(rt
->rt6i_flags
& RTF_CACHE
))
1983 struct fib6_nh_match_arg arg
= {
1985 .gw
= &rt
->rt6i_gateway
,
1988 nexthop_for_each_fib6_nh(from
->nh
, fib6_nh_find_match
, &arg
);
1992 fib6_nh
= arg
.match
;
1994 fib6_nh
= from
->fib6_nh
;
1996 fib6_nh_update_exception(fib6_nh
, from
->fib6_src
.plen
, rt
);
2001 static bool rt6_mtu_change_route_allowed(struct inet6_dev
*idev
,
2002 struct rt6_info
*rt
, int mtu
)
2004 /* If the new MTU is lower than the route PMTU, this new MTU will be the
2005 * lowest MTU in the path: always allow updating the route PMTU to
2006 * reflect PMTU decreases.
2008 * If the new MTU is higher, and the route PMTU is equal to the local
2009 * MTU, this means the old MTU is the lowest in the path, so allow
2010 * updating it: if other nodes now have lower MTUs, PMTU discovery will
2014 if (dst_mtu(&rt
->dst
) >= mtu
)
2017 if (dst_mtu(&rt
->dst
) == idev
->cnf
.mtu6
)
2023 static void rt6_exceptions_update_pmtu(struct inet6_dev
*idev
,
2024 const struct fib6_nh
*nh
, int mtu
)
2026 struct rt6_exception_bucket
*bucket
;
2027 struct rt6_exception
*rt6_ex
;
2030 bucket
= fib6_nh_get_excptn_bucket(nh
, &rt6_exception_lock
);
2034 for (i
= 0; i
< FIB6_EXCEPTION_BUCKET_SIZE
; i
++) {
2035 hlist_for_each_entry(rt6_ex
, &bucket
->chain
, hlist
) {
2036 struct rt6_info
*entry
= rt6_ex
->rt6i
;
2038 /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
2039 * route), the metrics of its rt->from have already
2042 if (dst_metric_raw(&entry
->dst
, RTAX_MTU
) &&
2043 rt6_mtu_change_route_allowed(idev
, entry
, mtu
))
2044 dst_metric_set(&entry
->dst
, RTAX_MTU
, mtu
);
2050 #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE)
2052 static void fib6_nh_exceptions_clean_tohost(const struct fib6_nh
*nh
,
2053 const struct in6_addr
*gateway
)
2055 struct rt6_exception_bucket
*bucket
;
2056 struct rt6_exception
*rt6_ex
;
2057 struct hlist_node
*tmp
;
2060 if (!rcu_access_pointer(nh
->rt6i_exception_bucket
))
2063 spin_lock_bh(&rt6_exception_lock
);
2064 bucket
= fib6_nh_get_excptn_bucket(nh
, &rt6_exception_lock
);
2066 for (i
= 0; i
< FIB6_EXCEPTION_BUCKET_SIZE
; i
++) {
2067 hlist_for_each_entry_safe(rt6_ex
, tmp
,
2068 &bucket
->chain
, hlist
) {
2069 struct rt6_info
*entry
= rt6_ex
->rt6i
;
2071 if ((entry
->rt6i_flags
& RTF_CACHE_GATEWAY
) ==
2072 RTF_CACHE_GATEWAY
&&
2073 ipv6_addr_equal(gateway
,
2074 &entry
->rt6i_gateway
)) {
2075 rt6_remove_exception(bucket
, rt6_ex
);
2082 spin_unlock_bh(&rt6_exception_lock
);
2085 static void rt6_age_examine_exception(struct rt6_exception_bucket
*bucket
,
2086 struct rt6_exception
*rt6_ex
,
2087 struct fib6_gc_args
*gc_args
,
2090 struct rt6_info
*rt
= rt6_ex
->rt6i
;
2092 /* we are pruning and obsoleting aged-out and non gateway exceptions
2093 * even if others have still references to them, so that on next
2094 * dst_check() such references can be dropped.
2095 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
2096 * expired, independently from their aging, as per RFC 8201 section 4
2098 if (!(rt
->rt6i_flags
& RTF_EXPIRES
)) {
2099 if (time_after_eq(now
, rt
->dst
.lastuse
+ gc_args
->timeout
)) {
2100 pr_debug("aging clone %p\n", rt
);
2101 rt6_remove_exception(bucket
, rt6_ex
);
2104 } else if (time_after(jiffies
, rt
->dst
.expires
)) {
2105 pr_debug("purging expired route %p\n", rt
);
2106 rt6_remove_exception(bucket
, rt6_ex
);
2110 if (rt
->rt6i_flags
& RTF_GATEWAY
) {
2111 struct neighbour
*neigh
;
2113 neigh
= __ipv6_neigh_lookup_noref(rt
->dst
.dev
, &rt
->rt6i_gateway
);
2115 if (!(neigh
&& (neigh
->flags
& NTF_ROUTER
))) {
2116 pr_debug("purging route %p via non-router but gateway\n",
2118 rt6_remove_exception(bucket
, rt6_ex
);
2126 static void fib6_nh_age_exceptions(const struct fib6_nh
*nh
,
2127 struct fib6_gc_args
*gc_args
,
2130 struct rt6_exception_bucket
*bucket
;
2131 struct rt6_exception
*rt6_ex
;
2132 struct hlist_node
*tmp
;
2135 if (!rcu_access_pointer(nh
->rt6i_exception_bucket
))
2139 spin_lock(&rt6_exception_lock
);
2140 bucket
= fib6_nh_get_excptn_bucket(nh
, &rt6_exception_lock
);
2142 for (i
= 0; i
< FIB6_EXCEPTION_BUCKET_SIZE
; i
++) {
2143 hlist_for_each_entry_safe(rt6_ex
, tmp
,
2144 &bucket
->chain
, hlist
) {
2145 rt6_age_examine_exception(bucket
, rt6_ex
,
2151 spin_unlock(&rt6_exception_lock
);
2152 rcu_read_unlock_bh();
2155 struct fib6_nh_age_excptn_arg
{
2156 struct fib6_gc_args
*gc_args
;
2160 static int rt6_nh_age_exceptions(struct fib6_nh
*nh
, void *_arg
)
2162 struct fib6_nh_age_excptn_arg
*arg
= _arg
;
2164 fib6_nh_age_exceptions(nh
, arg
->gc_args
, arg
->now
);
2168 void rt6_age_exceptions(struct fib6_info
*f6i
,
2169 struct fib6_gc_args
*gc_args
,
2173 struct fib6_nh_age_excptn_arg arg
= {
2178 nexthop_for_each_fib6_nh(f6i
->nh
, rt6_nh_age_exceptions
,
2181 fib6_nh_age_exceptions(f6i
->fib6_nh
, gc_args
, now
);
2185 /* must be called with rcu lock held */
2186 int fib6_table_lookup(struct net
*net
, struct fib6_table
*table
, int oif
,
2187 struct flowi6
*fl6
, struct fib6_result
*res
, int strict
)
2189 struct fib6_node
*fn
, *saved_fn
;
2191 fn
= fib6_node_lookup(&table
->tb6_root
, &fl6
->daddr
, &fl6
->saddr
);
2195 rt6_select(net
, fn
, oif
, res
, strict
);
2196 if (res
->f6i
== net
->ipv6
.fib6_null_entry
) {
2197 fn
= fib6_backtrack(fn
, &fl6
->saddr
);
2199 goto redo_rt6_select
;
2200 else if (strict
& RT6_LOOKUP_F_REACHABLE
) {
2201 /* also consider unreachable route */
2202 strict
&= ~RT6_LOOKUP_F_REACHABLE
;
2204 goto redo_rt6_select
;
2208 trace_fib6_table_lookup(net
, res
, table
, fl6
);
2213 struct rt6_info
*ip6_pol_route(struct net
*net
, struct fib6_table
*table
,
2214 int oif
, struct flowi6
*fl6
,
2215 const struct sk_buff
*skb
, int flags
)
2217 struct fib6_result res
= {};
2218 struct rt6_info
*rt
= NULL
;
2221 WARN_ON_ONCE((flags
& RT6_LOOKUP_F_DST_NOREF
) &&
2222 !rcu_read_lock_held());
2224 strict
|= flags
& RT6_LOOKUP_F_IFACE
;
2225 strict
|= flags
& RT6_LOOKUP_F_IGNORE_LINKSTATE
;
2226 if (READ_ONCE(net
->ipv6
.devconf_all
->forwarding
) == 0)
2227 strict
|= RT6_LOOKUP_F_REACHABLE
;
2231 fib6_table_lookup(net
, table
, oif
, fl6
, &res
, strict
);
2232 if (res
.f6i
== net
->ipv6
.fib6_null_entry
)
2235 fib6_select_path(net
, &res
, fl6
, oif
, false, skb
, strict
);
2237 /*Search through exception table */
2238 rt
= rt6_find_cached_rt(&res
, &fl6
->daddr
, &fl6
->saddr
);
2241 } else if (unlikely((fl6
->flowi6_flags
& FLOWI_FLAG_KNOWN_NH
) &&
2242 !res
.nh
->fib_nh_gw_family
)) {
2243 /* Create a RTF_CACHE clone which will not be
2244 * owned by the fib6 tree. It is for the special case where
2245 * the daddr in the skb during the neighbor look-up is different
2246 * from the fl6->daddr used to look-up route here.
2248 rt
= ip6_rt_cache_alloc(&res
, &fl6
->daddr
, NULL
);
2251 /* 1 refcnt is taken during ip6_rt_cache_alloc().
2252 * As rt6_uncached_list_add() does not consume refcnt,
2253 * this refcnt is always returned to the caller even
2254 * if caller sets RT6_LOOKUP_F_DST_NOREF flag.
2256 rt6_uncached_list_add(rt
);
2262 /* Get a percpu copy */
2264 rt
= rt6_get_pcpu_route(&res
);
2267 rt
= rt6_make_pcpu_route(net
, &res
);
2273 rt
= net
->ipv6
.ip6_null_entry
;
2274 if (!(flags
& RT6_LOOKUP_F_DST_NOREF
))
2275 ip6_hold_safe(net
, &rt
);
2280 EXPORT_SYMBOL_GPL(ip6_pol_route
);
2282 INDIRECT_CALLABLE_SCOPE
struct rt6_info
*ip6_pol_route_input(struct net
*net
,
2283 struct fib6_table
*table
,
2285 const struct sk_buff
*skb
,
2288 return ip6_pol_route(net
, table
, fl6
->flowi6_iif
, fl6
, skb
, flags
);
2291 struct dst_entry
*ip6_route_input_lookup(struct net
*net
,
2292 struct net_device
*dev
,
2294 const struct sk_buff
*skb
,
2297 if (rt6_need_strict(&fl6
->daddr
) && dev
->type
!= ARPHRD_PIMREG
)
2298 flags
|= RT6_LOOKUP_F_IFACE
;
2300 return fib6_rule_lookup(net
, fl6
, skb
, flags
, ip6_pol_route_input
);
2302 EXPORT_SYMBOL_GPL(ip6_route_input_lookup
);
2304 static void ip6_multipath_l3_keys(const struct sk_buff
*skb
,
2305 struct flow_keys
*keys
,
2306 struct flow_keys
*flkeys
)
2308 const struct ipv6hdr
*outer_iph
= ipv6_hdr(skb
);
2309 const struct ipv6hdr
*key_iph
= outer_iph
;
2310 struct flow_keys
*_flkeys
= flkeys
;
2311 const struct ipv6hdr
*inner_iph
;
2312 const struct icmp6hdr
*icmph
;
2313 struct ipv6hdr _inner_iph
;
2314 struct icmp6hdr _icmph
;
2316 if (likely(outer_iph
->nexthdr
!= IPPROTO_ICMPV6
))
2319 icmph
= skb_header_pointer(skb
, skb_transport_offset(skb
),
2320 sizeof(_icmph
), &_icmph
);
2324 if (!icmpv6_is_err(icmph
->icmp6_type
))
2327 inner_iph
= skb_header_pointer(skb
,
2328 skb_transport_offset(skb
) + sizeof(*icmph
),
2329 sizeof(_inner_iph
), &_inner_iph
);
2333 key_iph
= inner_iph
;
2337 keys
->addrs
.v6addrs
.src
= _flkeys
->addrs
.v6addrs
.src
;
2338 keys
->addrs
.v6addrs
.dst
= _flkeys
->addrs
.v6addrs
.dst
;
2339 keys
->tags
.flow_label
= _flkeys
->tags
.flow_label
;
2340 keys
->basic
.ip_proto
= _flkeys
->basic
.ip_proto
;
2342 keys
->addrs
.v6addrs
.src
= key_iph
->saddr
;
2343 keys
->addrs
.v6addrs
.dst
= key_iph
->daddr
;
2344 keys
->tags
.flow_label
= ip6_flowlabel(key_iph
);
2345 keys
->basic
.ip_proto
= key_iph
->nexthdr
;
2349 static u32
rt6_multipath_custom_hash_outer(const struct net
*net
,
2350 const struct sk_buff
*skb
,
2353 u32 hash_fields
= ip6_multipath_hash_fields(net
);
2354 struct flow_keys keys
, hash_keys
;
2356 if (!(hash_fields
& FIB_MULTIPATH_HASH_FIELD_OUTER_MASK
))
2359 memset(&hash_keys
, 0, sizeof(hash_keys
));
2360 skb_flow_dissect_flow_keys(skb
, &keys
, FLOW_DISSECTOR_F_STOP_AT_ENCAP
);
2362 hash_keys
.control
.addr_type
= FLOW_DISSECTOR_KEY_IPV6_ADDRS
;
2363 if (hash_fields
& FIB_MULTIPATH_HASH_FIELD_SRC_IP
)
2364 hash_keys
.addrs
.v6addrs
.src
= keys
.addrs
.v6addrs
.src
;
2365 if (hash_fields
& FIB_MULTIPATH_HASH_FIELD_DST_IP
)
2366 hash_keys
.addrs
.v6addrs
.dst
= keys
.addrs
.v6addrs
.dst
;
2367 if (hash_fields
& FIB_MULTIPATH_HASH_FIELD_IP_PROTO
)
2368 hash_keys
.basic
.ip_proto
= keys
.basic
.ip_proto
;
2369 if (hash_fields
& FIB_MULTIPATH_HASH_FIELD_FLOWLABEL
)
2370 hash_keys
.tags
.flow_label
= keys
.tags
.flow_label
;
2371 if (hash_fields
& FIB_MULTIPATH_HASH_FIELD_SRC_PORT
)
2372 hash_keys
.ports
.src
= keys
.ports
.src
;
2373 if (hash_fields
& FIB_MULTIPATH_HASH_FIELD_DST_PORT
)
2374 hash_keys
.ports
.dst
= keys
.ports
.dst
;
2376 *p_has_inner
= !!(keys
.control
.flags
& FLOW_DIS_ENCAPSULATION
);
2377 return fib_multipath_hash_from_keys(net
, &hash_keys
);
2380 static u32
rt6_multipath_custom_hash_inner(const struct net
*net
,
2381 const struct sk_buff
*skb
,
2384 u32 hash_fields
= ip6_multipath_hash_fields(net
);
2385 struct flow_keys keys
, hash_keys
;
2387 /* We assume the packet carries an encapsulation, but if none was
2388 * encountered during dissection of the outer flow, then there is no
2389 * point in calling the flow dissector again.
2394 if (!(hash_fields
& FIB_MULTIPATH_HASH_FIELD_INNER_MASK
))
2397 memset(&hash_keys
, 0, sizeof(hash_keys
));
2398 skb_flow_dissect_flow_keys(skb
, &keys
, 0);
2400 if (!(keys
.control
.flags
& FLOW_DIS_ENCAPSULATION
))
2403 if (keys
.control
.addr_type
== FLOW_DISSECTOR_KEY_IPV4_ADDRS
) {
2404 hash_keys
.control
.addr_type
= FLOW_DISSECTOR_KEY_IPV4_ADDRS
;
2405 if (hash_fields
& FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP
)
2406 hash_keys
.addrs
.v4addrs
.src
= keys
.addrs
.v4addrs
.src
;
2407 if (hash_fields
& FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP
)
2408 hash_keys
.addrs
.v4addrs
.dst
= keys
.addrs
.v4addrs
.dst
;
2409 } else if (keys
.control
.addr_type
== FLOW_DISSECTOR_KEY_IPV6_ADDRS
) {
2410 hash_keys
.control
.addr_type
= FLOW_DISSECTOR_KEY_IPV6_ADDRS
;
2411 if (hash_fields
& FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP
)
2412 hash_keys
.addrs
.v6addrs
.src
= keys
.addrs
.v6addrs
.src
;
2413 if (hash_fields
& FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP
)
2414 hash_keys
.addrs
.v6addrs
.dst
= keys
.addrs
.v6addrs
.dst
;
2415 if (hash_fields
& FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL
)
2416 hash_keys
.tags
.flow_label
= keys
.tags
.flow_label
;
2419 if (hash_fields
& FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO
)
2420 hash_keys
.basic
.ip_proto
= keys
.basic
.ip_proto
;
2421 if (hash_fields
& FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT
)
2422 hash_keys
.ports
.src
= keys
.ports
.src
;
2423 if (hash_fields
& FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT
)
2424 hash_keys
.ports
.dst
= keys
.ports
.dst
;
2426 return fib_multipath_hash_from_keys(net
, &hash_keys
);
2429 static u32
rt6_multipath_custom_hash_skb(const struct net
*net
,
2430 const struct sk_buff
*skb
)
2432 u32 mhash
, mhash_inner
;
2433 bool has_inner
= true;
2435 mhash
= rt6_multipath_custom_hash_outer(net
, skb
, &has_inner
);
2436 mhash_inner
= rt6_multipath_custom_hash_inner(net
, skb
, has_inner
);
2438 return jhash_2words(mhash
, mhash_inner
, 0);
2441 static u32
rt6_multipath_custom_hash_fl6(const struct net
*net
,
2442 const struct flowi6
*fl6
)
2444 u32 hash_fields
= ip6_multipath_hash_fields(net
);
2445 struct flow_keys hash_keys
;
2447 if (!(hash_fields
& FIB_MULTIPATH_HASH_FIELD_OUTER_MASK
))
2450 memset(&hash_keys
, 0, sizeof(hash_keys
));
2451 hash_keys
.control
.addr_type
= FLOW_DISSECTOR_KEY_IPV6_ADDRS
;
2452 if (hash_fields
& FIB_MULTIPATH_HASH_FIELD_SRC_IP
)
2453 hash_keys
.addrs
.v6addrs
.src
= fl6
->saddr
;
2454 if (hash_fields
& FIB_MULTIPATH_HASH_FIELD_DST_IP
)
2455 hash_keys
.addrs
.v6addrs
.dst
= fl6
->daddr
;
2456 if (hash_fields
& FIB_MULTIPATH_HASH_FIELD_IP_PROTO
)
2457 hash_keys
.basic
.ip_proto
= fl6
->flowi6_proto
;
2458 if (hash_fields
& FIB_MULTIPATH_HASH_FIELD_FLOWLABEL
)
2459 hash_keys
.tags
.flow_label
= (__force u32
)flowi6_get_flowlabel(fl6
);
2460 if (hash_fields
& FIB_MULTIPATH_HASH_FIELD_SRC_PORT
)
2461 hash_keys
.ports
.src
= fl6
->fl6_sport
;
2462 if (hash_fields
& FIB_MULTIPATH_HASH_FIELD_DST_PORT
)
2463 hash_keys
.ports
.dst
= fl6
->fl6_dport
;
2465 return fib_multipath_hash_from_keys(net
, &hash_keys
);
2468 /* if skb is set it will be used and fl6 can be NULL */
2469 u32
rt6_multipath_hash(const struct net
*net
, const struct flowi6
*fl6
,
2470 const struct sk_buff
*skb
, struct flow_keys
*flkeys
)
2472 struct flow_keys hash_keys
;
2475 switch (ip6_multipath_hash_policy(net
)) {
2477 memset(&hash_keys
, 0, sizeof(hash_keys
));
2478 hash_keys
.control
.addr_type
= FLOW_DISSECTOR_KEY_IPV6_ADDRS
;
2480 ip6_multipath_l3_keys(skb
, &hash_keys
, flkeys
);
2482 hash_keys
.addrs
.v6addrs
.src
= fl6
->saddr
;
2483 hash_keys
.addrs
.v6addrs
.dst
= fl6
->daddr
;
2484 hash_keys
.tags
.flow_label
= (__force u32
)flowi6_get_flowlabel(fl6
);
2485 hash_keys
.basic
.ip_proto
= fl6
->flowi6_proto
;
2487 mhash
= fib_multipath_hash_from_keys(net
, &hash_keys
);
2491 unsigned int flag
= FLOW_DISSECTOR_F_STOP_AT_ENCAP
;
2492 struct flow_keys keys
;
2494 /* short-circuit if we already have L4 hash present */
2496 return skb_get_hash_raw(skb
) >> 1;
2498 memset(&hash_keys
, 0, sizeof(hash_keys
));
2501 skb_flow_dissect_flow_keys(skb
, &keys
, flag
);
2504 hash_keys
.control
.addr_type
= FLOW_DISSECTOR_KEY_IPV6_ADDRS
;
2505 hash_keys
.addrs
.v6addrs
.src
= flkeys
->addrs
.v6addrs
.src
;
2506 hash_keys
.addrs
.v6addrs
.dst
= flkeys
->addrs
.v6addrs
.dst
;
2507 hash_keys
.ports
.src
= flkeys
->ports
.src
;
2508 hash_keys
.ports
.dst
= flkeys
->ports
.dst
;
2509 hash_keys
.basic
.ip_proto
= flkeys
->basic
.ip_proto
;
2511 memset(&hash_keys
, 0, sizeof(hash_keys
));
2512 hash_keys
.control
.addr_type
= FLOW_DISSECTOR_KEY_IPV6_ADDRS
;
2513 hash_keys
.addrs
.v6addrs
.src
= fl6
->saddr
;
2514 hash_keys
.addrs
.v6addrs
.dst
= fl6
->daddr
;
2515 hash_keys
.ports
.src
= fl6
->fl6_sport
;
2516 hash_keys
.ports
.dst
= fl6
->fl6_dport
;
2517 hash_keys
.basic
.ip_proto
= fl6
->flowi6_proto
;
2519 mhash
= fib_multipath_hash_from_keys(net
, &hash_keys
);
2522 memset(&hash_keys
, 0, sizeof(hash_keys
));
2523 hash_keys
.control
.addr_type
= FLOW_DISSECTOR_KEY_IPV6_ADDRS
;
2525 struct flow_keys keys
;
2528 skb_flow_dissect_flow_keys(skb
, &keys
, 0);
2532 /* Inner can be v4 or v6 */
2533 if (flkeys
->control
.addr_type
== FLOW_DISSECTOR_KEY_IPV4_ADDRS
) {
2534 hash_keys
.control
.addr_type
= FLOW_DISSECTOR_KEY_IPV4_ADDRS
;
2535 hash_keys
.addrs
.v4addrs
.src
= flkeys
->addrs
.v4addrs
.src
;
2536 hash_keys
.addrs
.v4addrs
.dst
= flkeys
->addrs
.v4addrs
.dst
;
2537 } else if (flkeys
->control
.addr_type
== FLOW_DISSECTOR_KEY_IPV6_ADDRS
) {
2538 hash_keys
.control
.addr_type
= FLOW_DISSECTOR_KEY_IPV6_ADDRS
;
2539 hash_keys
.addrs
.v6addrs
.src
= flkeys
->addrs
.v6addrs
.src
;
2540 hash_keys
.addrs
.v6addrs
.dst
= flkeys
->addrs
.v6addrs
.dst
;
2541 hash_keys
.tags
.flow_label
= flkeys
->tags
.flow_label
;
2542 hash_keys
.basic
.ip_proto
= flkeys
->basic
.ip_proto
;
2544 /* Same as case 0 */
2545 hash_keys
.control
.addr_type
= FLOW_DISSECTOR_KEY_IPV6_ADDRS
;
2546 ip6_multipath_l3_keys(skb
, &hash_keys
, flkeys
);
2549 /* Same as case 0 */
2550 hash_keys
.control
.addr_type
= FLOW_DISSECTOR_KEY_IPV6_ADDRS
;
2551 hash_keys
.addrs
.v6addrs
.src
= fl6
->saddr
;
2552 hash_keys
.addrs
.v6addrs
.dst
= fl6
->daddr
;
2553 hash_keys
.tags
.flow_label
= (__force u32
)flowi6_get_flowlabel(fl6
);
2554 hash_keys
.basic
.ip_proto
= fl6
->flowi6_proto
;
2556 mhash
= fib_multipath_hash_from_keys(net
, &hash_keys
);
2560 mhash
= rt6_multipath_custom_hash_skb(net
, skb
);
2562 mhash
= rt6_multipath_custom_hash_fl6(net
, fl6
);
2569 /* Called with rcu held */
2570 void ip6_route_input(struct sk_buff
*skb
)
2572 const struct ipv6hdr
*iph
= ipv6_hdr(skb
);
2573 struct net
*net
= dev_net(skb
->dev
);
2574 int flags
= RT6_LOOKUP_F_HAS_SADDR
| RT6_LOOKUP_F_DST_NOREF
;
2575 struct ip_tunnel_info
*tun_info
;
2576 struct flowi6 fl6
= {
2577 .flowi6_iif
= skb
->dev
->ifindex
,
2578 .daddr
= iph
->daddr
,
2579 .saddr
= iph
->saddr
,
2580 .flowlabel
= ip6_flowinfo(iph
),
2581 .flowi6_mark
= skb
->mark
,
2582 .flowi6_proto
= iph
->nexthdr
,
2584 struct flow_keys
*flkeys
= NULL
, _flkeys
;
2586 tun_info
= skb_tunnel_info(skb
);
2587 if (tun_info
&& !(tun_info
->mode
& IP_TUNNEL_INFO_TX
))
2588 fl6
.flowi6_tun_key
.tun_id
= tun_info
->key
.tun_id
;
2590 if (fib6_rules_early_flow_dissect(net
, skb
, &fl6
, &_flkeys
))
2593 if (unlikely(fl6
.flowi6_proto
== IPPROTO_ICMPV6
))
2594 fl6
.mp_hash
= rt6_multipath_hash(net
, &fl6
, skb
, flkeys
);
2596 skb_dst_set_noref(skb
, ip6_route_input_lookup(net
, skb
->dev
,
2600 INDIRECT_CALLABLE_SCOPE
struct rt6_info
*ip6_pol_route_output(struct net
*net
,
2601 struct fib6_table
*table
,
2603 const struct sk_buff
*skb
,
2606 return ip6_pol_route(net
, table
, fl6
->flowi6_oif
, fl6
, skb
, flags
);
2609 static struct dst_entry
*ip6_route_output_flags_noref(struct net
*net
,
2610 const struct sock
*sk
,
2616 if (ipv6_addr_type(&fl6
->daddr
) &
2617 (IPV6_ADDR_MULTICAST
| IPV6_ADDR_LINKLOCAL
)) {
2618 struct dst_entry
*dst
;
2620 /* This function does not take refcnt on the dst */
2621 dst
= l3mdev_link_scope_lookup(net
, fl6
);
2626 fl6
->flowi6_iif
= LOOPBACK_IFINDEX
;
2628 flags
|= RT6_LOOKUP_F_DST_NOREF
;
2629 any_src
= ipv6_addr_any(&fl6
->saddr
);
2630 if ((sk
&& sk
->sk_bound_dev_if
) || rt6_need_strict(&fl6
->daddr
) ||
2631 (fl6
->flowi6_oif
&& any_src
))
2632 flags
|= RT6_LOOKUP_F_IFACE
;
2635 flags
|= RT6_LOOKUP_F_HAS_SADDR
;
2637 flags
|= rt6_srcprefs2flags(READ_ONCE(inet6_sk(sk
)->srcprefs
));
2639 return fib6_rule_lookup(net
, fl6
, NULL
, flags
, ip6_pol_route_output
);
2642 struct dst_entry
*ip6_route_output_flags(struct net
*net
,
2643 const struct sock
*sk
,
2647 struct dst_entry
*dst
;
2648 struct rt6_info
*rt6
;
2651 dst
= ip6_route_output_flags_noref(net
, sk
, fl6
, flags
);
2652 rt6
= dst_rt6_info(dst
);
2653 /* For dst cached in uncached_list, refcnt is already taken. */
2654 if (list_empty(&rt6
->dst
.rt_uncached
) && !dst_hold_safe(dst
)) {
2655 dst
= &net
->ipv6
.ip6_null_entry
->dst
;
2662 EXPORT_SYMBOL_GPL(ip6_route_output_flags
);
2664 struct dst_entry
*ip6_blackhole_route(struct net
*net
, struct dst_entry
*dst_orig
)
2666 struct rt6_info
*rt
, *ort
= dst_rt6_info(dst_orig
);
2667 struct net_device
*loopback_dev
= net
->loopback_dev
;
2668 struct dst_entry
*new = NULL
;
2670 rt
= dst_alloc(&ip6_dst_blackhole_ops
, loopback_dev
,
2671 DST_OBSOLETE_DEAD
, 0);
2674 atomic_inc(&net
->ipv6
.rt6_stats
->fib_rt_alloc
);
2678 new->input
= dst_discard
;
2679 new->output
= dst_discard_out
;
2681 dst_copy_metrics(new, &ort
->dst
);
2683 rt
->rt6i_idev
= in6_dev_get(loopback_dev
);
2684 rt
->rt6i_gateway
= ort
->rt6i_gateway
;
2685 rt
->rt6i_flags
= ort
->rt6i_flags
& ~RTF_PCPU
;
2687 memcpy(&rt
->rt6i_dst
, &ort
->rt6i_dst
, sizeof(struct rt6key
));
2688 #ifdef CONFIG_IPV6_SUBTREES
2689 memcpy(&rt
->rt6i_src
, &ort
->rt6i_src
, sizeof(struct rt6key
));
2693 dst_release(dst_orig
);
2694 return new ? new : ERR_PTR(-ENOMEM
);
/*
 *	Destination cache support functions
 */
2701 static bool fib6_check(struct fib6_info
*f6i
, u32 cookie
)
2705 if (!fib6_get_cookie_safe(f6i
, &rt_cookie
) || rt_cookie
!= cookie
)
2708 if (fib6_check_expired(f6i
))
2714 static struct dst_entry
*rt6_check(struct rt6_info
*rt
,
2715 struct fib6_info
*from
,
2720 if (!from
|| !fib6_get_cookie_safe(from
, &rt_cookie
) ||
2721 rt_cookie
!= cookie
)
2724 if (rt6_check_expired(rt
))
2730 static struct dst_entry
*rt6_dst_from_check(struct rt6_info
*rt
,
2731 struct fib6_info
*from
,
2734 if (!__rt6_check_expired(rt
) &&
2735 rt
->dst
.obsolete
== DST_OBSOLETE_FORCE_CHK
&&
2736 fib6_check(from
, cookie
))
2742 INDIRECT_CALLABLE_SCOPE
struct dst_entry
*ip6_dst_check(struct dst_entry
*dst
,
2745 struct dst_entry
*dst_ret
;
2746 struct fib6_info
*from
;
2747 struct rt6_info
*rt
;
2749 rt
= dst_rt6_info(dst
);
2752 return rt6_is_valid(rt
) ? dst
: NULL
;
2756 /* All IPV6 dsts are created with ->obsolete set to the value
2757 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2758 * into this function always.
2761 from
= rcu_dereference(rt
->from
);
2763 if (from
&& (rt
->rt6i_flags
& RTF_PCPU
||
2764 unlikely(!list_empty(&rt
->dst
.rt_uncached
))))
2765 dst_ret
= rt6_dst_from_check(rt
, from
, cookie
);
2767 dst_ret
= rt6_check(rt
, from
, cookie
);
2773 EXPORT_INDIRECT_CALLABLE(ip6_dst_check
);
2775 static void ip6_negative_advice(struct sock
*sk
,
2776 struct dst_entry
*dst
)
2778 struct rt6_info
*rt
= dst_rt6_info(dst
);
2780 if (rt
->rt6i_flags
& RTF_CACHE
) {
2782 if (rt6_check_expired(rt
)) {
2783 /* rt/dst can not be destroyed yet,
2784 * because of rcu_read_lock()
2787 rt6_remove_exception_rt(rt
);
2795 static void ip6_link_failure(struct sk_buff
*skb
)
2797 struct rt6_info
*rt
;
2799 icmpv6_send(skb
, ICMPV6_DEST_UNREACH
, ICMPV6_ADDR_UNREACH
, 0);
2801 rt
= dst_rt6_info(skb_dst(skb
));
2804 if (rt
->rt6i_flags
& RTF_CACHE
) {
2805 rt6_remove_exception_rt(rt
);
2807 struct fib6_info
*from
;
2808 struct fib6_node
*fn
;
2810 from
= rcu_dereference(rt
->from
);
2812 fn
= rcu_dereference(from
->fib6_node
);
2813 if (fn
&& (rt
->rt6i_flags
& RTF_DEFAULT
))
2814 WRITE_ONCE(fn
->fn_sernum
, -1);
2821 static void rt6_update_expires(struct rt6_info
*rt0
, int timeout
)
2823 if (!(rt0
->rt6i_flags
& RTF_EXPIRES
)) {
2824 struct fib6_info
*from
;
2827 from
= rcu_dereference(rt0
->from
);
2829 rt0
->dst
.expires
= from
->expires
;
2833 dst_set_expires(&rt0
->dst
, timeout
);
2834 rt0
->rt6i_flags
|= RTF_EXPIRES
;
2837 static void rt6_do_update_pmtu(struct rt6_info
*rt
, u32 mtu
)
2839 struct net
*net
= dev_net(rt
->dst
.dev
);
2841 dst_metric_set(&rt
->dst
, RTAX_MTU
, mtu
);
2842 rt
->rt6i_flags
|= RTF_MODIFIED
;
2843 rt6_update_expires(rt
, net
->ipv6
.sysctl
.ip6_rt_mtu_expires
);
2846 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info
*rt
)
2848 return !(rt
->rt6i_flags
& RTF_CACHE
) &&
2849 (rt
->rt6i_flags
& RTF_PCPU
|| rcu_access_pointer(rt
->from
));
2852 static void __ip6_rt_update_pmtu(struct dst_entry
*dst
, const struct sock
*sk
,
2853 const struct ipv6hdr
*iph
, u32 mtu
,
2856 const struct in6_addr
*daddr
, *saddr
;
2857 struct rt6_info
*rt6
= dst_rt6_info(dst
);
2859 /* Note: do *NOT* check dst_metric_locked(dst, RTAX_MTU)
2860 * IPv6 pmtu discovery isn't optional, so 'mtu lock' cannot disable it.
2861 * [see also comment in rt6_mtu_change_route()]
2865 daddr
= &iph
->daddr
;
2866 saddr
= &iph
->saddr
;
2868 daddr
= &sk
->sk_v6_daddr
;
2869 saddr
= &inet6_sk(sk
)->saddr
;
2876 dst_confirm_neigh(dst
, daddr
);
2878 if (mtu
< IPV6_MIN_MTU
)
2880 if (mtu
>= dst_mtu(dst
))
2883 if (!rt6_cache_allowed_for_pmtu(rt6
)) {
2884 rt6_do_update_pmtu(rt6
, mtu
);
2885 /* update rt6_ex->stamp for cache */
2886 if (rt6
->rt6i_flags
& RTF_CACHE
)
2887 rt6_update_exception_stamp_rt(rt6
);
2889 struct fib6_result res
= {};
2890 struct rt6_info
*nrt6
;
2893 res
.f6i
= rcu_dereference(rt6
->from
);
2897 res
.fib6_flags
= res
.f6i
->fib6_flags
;
2898 res
.fib6_type
= res
.f6i
->fib6_type
;
2901 struct fib6_nh_match_arg arg
= {
2903 .gw
= &rt6
->rt6i_gateway
,
2906 nexthop_for_each_fib6_nh(res
.f6i
->nh
,
2907 fib6_nh_find_match
, &arg
);
2909 /* fib6_info uses a nexthop that does not have fib6_nh
2910 * using the dst->dev + gw. Should be impossible.
2917 res
.nh
= res
.f6i
->fib6_nh
;
2920 nrt6
= ip6_rt_cache_alloc(&res
, daddr
, saddr
);
2922 rt6_do_update_pmtu(nrt6
, mtu
);
2923 if (rt6_insert_exception(nrt6
, &res
))
2924 dst_release_immediate(&nrt6
->dst
);
2931 static void ip6_rt_update_pmtu(struct dst_entry
*dst
, struct sock
*sk
,
2932 struct sk_buff
*skb
, u32 mtu
,
2935 __ip6_rt_update_pmtu(dst
, sk
, skb
? ipv6_hdr(skb
) : NULL
, mtu
,
2939 void ip6_update_pmtu(struct sk_buff
*skb
, struct net
*net
, __be32 mtu
,
2940 int oif
, u32 mark
, kuid_t uid
)
2942 const struct ipv6hdr
*iph
= (struct ipv6hdr
*) skb
->data
;
2943 struct dst_entry
*dst
;
2944 struct flowi6 fl6
= {
2946 .flowi6_mark
= mark
? mark
: IP6_REPLY_MARK(net
, skb
->mark
),
2947 .daddr
= iph
->daddr
,
2948 .saddr
= iph
->saddr
,
2949 .flowlabel
= ip6_flowinfo(iph
),
2953 dst
= ip6_route_output(net
, NULL
, &fl6
);
2955 __ip6_rt_update_pmtu(dst
, NULL
, iph
, ntohl(mtu
), true);
2958 EXPORT_SYMBOL_GPL(ip6_update_pmtu
);
2960 void ip6_sk_update_pmtu(struct sk_buff
*skb
, struct sock
*sk
, __be32 mtu
)
2962 int oif
= sk
->sk_bound_dev_if
;
2963 struct dst_entry
*dst
;
2965 if (!oif
&& skb
->dev
)
2966 oif
= l3mdev_master_ifindex(skb
->dev
);
2968 ip6_update_pmtu(skb
, sock_net(sk
), mtu
, oif
, READ_ONCE(sk
->sk_mark
),
2971 dst
= __sk_dst_get(sk
);
2972 if (!dst
|| !dst
->obsolete
||
2973 dst
->ops
->check(dst
, inet6_sk(sk
)->dst_cookie
))
2977 if (!sock_owned_by_user(sk
) && !ipv6_addr_v4mapped(&sk
->sk_v6_daddr
))
2978 ip6_datagram_dst_update(sk
, false);
2981 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu
);
2983 void ip6_sk_dst_store_flow(struct sock
*sk
, struct dst_entry
*dst
,
2984 const struct flowi6
*fl6
)
2986 #ifdef CONFIG_IPV6_SUBTREES
2987 struct ipv6_pinfo
*np
= inet6_sk(sk
);
2990 ip6_dst_store(sk
, dst
,
2991 ipv6_addr_equal(&fl6
->daddr
, &sk
->sk_v6_daddr
) ?
2992 &sk
->sk_v6_daddr
: NULL
,
2993 #ifdef CONFIG_IPV6_SUBTREES
2994 ipv6_addr_equal(&fl6
->saddr
, &np
->saddr
) ?
3000 static bool ip6_redirect_nh_match(const struct fib6_result
*res
,
3002 const struct in6_addr
*gw
,
3003 struct rt6_info
**ret
)
3005 const struct fib6_nh
*nh
= res
->nh
;
3007 if (nh
->fib_nh_flags
& RTNH_F_DEAD
|| !nh
->fib_nh_gw_family
||
3008 fl6
->flowi6_oif
!= nh
->fib_nh_dev
->ifindex
)
3011 /* rt_cache's gateway might be different from its 'parent'
3012 * in the case of an ip redirect.
3013 * So we keep searching in the exception table if the gateway
3016 if (!ipv6_addr_equal(gw
, &nh
->fib_nh_gw6
)) {
3017 struct rt6_info
*rt_cache
;
3019 rt_cache
= rt6_find_cached_rt(res
, &fl6
->daddr
, &fl6
->saddr
);
3021 ipv6_addr_equal(gw
, &rt_cache
->rt6i_gateway
)) {
/* Argument bundle handed to fib6_nh_redirect_match() while walking the
 * fib6_nh entries of a nexthop object.
 */
struct fib6_nh_rd_arg {
	struct fib6_result	*res;	/* result being filled in */
	struct flowi6		*fl6;	/* flow being looked up */
	const struct in6_addr	*gw;	/* router that sent the redirect */
	struct rt6_info		**ret;	/* out: matching cached route, if any */
};
3037 static int fib6_nh_redirect_match(struct fib6_nh
*nh
, void *_arg
)
3039 struct fib6_nh_rd_arg
*arg
= _arg
;
3042 return ip6_redirect_nh_match(arg
->res
, arg
->fl6
, arg
->gw
, arg
->ret
);
3045 /* Handle redirects */
3046 struct ip6rd_flowi
{
3048 struct in6_addr gateway
;
3051 INDIRECT_CALLABLE_SCOPE
struct rt6_info
*__ip6_route_redirect(struct net
*net
,
3052 struct fib6_table
*table
,
3054 const struct sk_buff
*skb
,
3057 struct ip6rd_flowi
*rdfl
= (struct ip6rd_flowi
*)fl6
;
3058 struct rt6_info
*ret
= NULL
;
3059 struct fib6_result res
= {};
3060 struct fib6_nh_rd_arg arg
= {
3063 .gw
= &rdfl
->gateway
,
3066 struct fib6_info
*rt
;
3067 struct fib6_node
*fn
;
3069 /* Get the "current" route for this destination and
3070 * check if the redirect has come from appropriate router.
3072 * RFC 4861 specifies that redirects should only be
3073 * accepted if they come from the nexthop to the target.
3074 * Due to the way the routes are chosen, this notion
3075 * is a bit fuzzy and one might need to check all possible
3080 fn
= fib6_node_lookup(&table
->tb6_root
, &fl6
->daddr
, &fl6
->saddr
);
3082 for_each_fib6_node_rt_rcu(fn
) {
3084 if (fib6_check_expired(rt
))
3086 if (rt
->fib6_flags
& RTF_REJECT
)
3088 if (unlikely(rt
->nh
)) {
3089 if (nexthop_is_blackhole(rt
->nh
))
3091 /* on match, res->nh is filled in and potentially ret */
3092 if (nexthop_for_each_fib6_nh(rt
->nh
,
3093 fib6_nh_redirect_match
,
3097 res
.nh
= rt
->fib6_nh
;
3098 if (ip6_redirect_nh_match(&res
, fl6
, &rdfl
->gateway
,
3105 rt
= net
->ipv6
.fib6_null_entry
;
3106 else if (rt
->fib6_flags
& RTF_REJECT
) {
3107 ret
= net
->ipv6
.ip6_null_entry
;
3111 if (rt
== net
->ipv6
.fib6_null_entry
) {
3112 fn
= fib6_backtrack(fn
, &fl6
->saddr
);
3118 res
.nh
= rt
->fib6_nh
;
3121 ip6_hold_safe(net
, &ret
);
3123 res
.fib6_flags
= res
.f6i
->fib6_flags
;
3124 res
.fib6_type
= res
.f6i
->fib6_type
;
3125 ret
= ip6_create_rt_rcu(&res
);
3130 trace_fib6_table_lookup(net
, &res
, table
, fl6
);
3134 static struct dst_entry
*ip6_route_redirect(struct net
*net
,
3135 const struct flowi6
*fl6
,
3136 const struct sk_buff
*skb
,
3137 const struct in6_addr
*gateway
)
3139 int flags
= RT6_LOOKUP_F_HAS_SADDR
;
3140 struct ip6rd_flowi rdfl
;
3143 rdfl
.gateway
= *gateway
;
3145 return fib6_rule_lookup(net
, &rdfl
.fl6
, skb
,
3146 flags
, __ip6_route_redirect
);
3149 void ip6_redirect(struct sk_buff
*skb
, struct net
*net
, int oif
, u32 mark
,
3152 const struct ipv6hdr
*iph
= (struct ipv6hdr
*) skb
->data
;
3153 struct dst_entry
*dst
;
3154 struct flowi6 fl6
= {
3155 .flowi6_iif
= LOOPBACK_IFINDEX
,
3157 .flowi6_mark
= mark
,
3158 .daddr
= iph
->daddr
,
3159 .saddr
= iph
->saddr
,
3160 .flowlabel
= ip6_flowinfo(iph
),
3164 dst
= ip6_route_redirect(net
, &fl6
, skb
, &ipv6_hdr(skb
)->saddr
);
3165 rt6_do_redirect(dst
, NULL
, skb
);
3168 EXPORT_SYMBOL_GPL(ip6_redirect
);
3170 void ip6_redirect_no_header(struct sk_buff
*skb
, struct net
*net
, int oif
)
3172 const struct ipv6hdr
*iph
= ipv6_hdr(skb
);
3173 const struct rd_msg
*msg
= (struct rd_msg
*)icmp6_hdr(skb
);
3174 struct dst_entry
*dst
;
3175 struct flowi6 fl6
= {
3176 .flowi6_iif
= LOOPBACK_IFINDEX
,
3179 .saddr
= iph
->daddr
,
3180 .flowi6_uid
= sock_net_uid(net
, NULL
),
3183 dst
= ip6_route_redirect(net
, &fl6
, skb
, &iph
->saddr
);
3184 rt6_do_redirect(dst
, NULL
, skb
);
3188 void ip6_sk_redirect(struct sk_buff
*skb
, struct sock
*sk
)
3190 ip6_redirect(skb
, sock_net(sk
), sk
->sk_bound_dev_if
,
3191 READ_ONCE(sk
->sk_mark
), sk
->sk_uid
);
3193 EXPORT_SYMBOL_GPL(ip6_sk_redirect
);
3195 static unsigned int ip6_default_advmss(const struct dst_entry
*dst
)
3197 struct net_device
*dev
= dst
->dev
;
3198 unsigned int mtu
= dst_mtu(dst
);
3199 struct net
*net
= dev_net(dev
);
3201 mtu
-= sizeof(struct ipv6hdr
) + sizeof(struct tcphdr
);
3203 if (mtu
< net
->ipv6
.sysctl
.ip6_rt_min_advmss
)
3204 mtu
= net
->ipv6
.sysctl
.ip6_rt_min_advmss
;
3207 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
3208 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
3209 * IPV6_MAXPLEN is also valid and means: "any MSS,
3210 * rely only on pmtu discovery"
3212 if (mtu
> IPV6_MAXPLEN
- sizeof(struct tcphdr
))
3217 INDIRECT_CALLABLE_SCOPE
unsigned int ip6_mtu(const struct dst_entry
*dst
)
3219 return ip6_dst_mtu_maybe_forward(dst
, false);
3221 EXPORT_INDIRECT_CALLABLE(ip6_mtu
);
3224 * 1. mtu on route is locked - use it
3225 * 2. mtu from nexthop exception
3226 * 3. mtu from egress device
3228 * based on ip6_dst_mtu_forward and exception logic of
3229 * rt6_find_cached_rt; called with rcu_read_lock
3231 u32
ip6_mtu_from_fib6(const struct fib6_result
*res
,
3232 const struct in6_addr
*daddr
,
3233 const struct in6_addr
*saddr
)
3235 const struct fib6_nh
*nh
= res
->nh
;
3236 struct fib6_info
*f6i
= res
->f6i
;
3237 struct inet6_dev
*idev
;
3238 struct rt6_info
*rt
;
3241 if (unlikely(fib6_metric_locked(f6i
, RTAX_MTU
))) {
3242 mtu
= f6i
->fib6_pmtu
;
3247 rt
= rt6_find_cached_rt(res
, daddr
, saddr
);
3249 mtu
= dst_metric_raw(&rt
->dst
, RTAX_MTU
);
3251 struct net_device
*dev
= nh
->fib_nh_dev
;
3254 idev
= __in6_dev_get(dev
);
3256 mtu
= max_t(u32
, mtu
, READ_ONCE(idev
->cnf
.mtu6
));
3259 mtu
= min_t(unsigned int, mtu
, IP6_MAX_MTU
);
3261 return mtu
- lwtunnel_headroom(nh
->fib_nh_lws
, mtu
);
3264 struct dst_entry
*icmp6_dst_alloc(struct net_device
*dev
,
3267 struct dst_entry
*dst
;
3268 struct rt6_info
*rt
;
3269 struct inet6_dev
*idev
= in6_dev_get(dev
);
3270 struct net
*net
= dev_net(dev
);
3272 if (unlikely(!idev
))
3273 return ERR_PTR(-ENODEV
);
3275 rt
= ip6_dst_alloc(net
, dev
, 0);
3276 if (unlikely(!rt
)) {
3278 dst
= ERR_PTR(-ENOMEM
);
3282 rt
->dst
.input
= ip6_input
;
3283 rt
->dst
.output
= ip6_output
;
3284 rt
->rt6i_gateway
= fl6
->daddr
;
3285 rt
->rt6i_dst
.addr
= fl6
->daddr
;
3286 rt
->rt6i_dst
.plen
= 128;
3287 rt
->rt6i_idev
= idev
;
3288 dst_metric_set(&rt
->dst
, RTAX_HOPLIMIT
, 0);
3290 /* Add this dst into uncached_list so that rt6_disable_ip() can
3291 * do proper release of the net_device
3293 rt6_uncached_list_add(rt
);
3295 dst
= xfrm_lookup(net
, &rt
->dst
, flowi6_to_flowi(fl6
), NULL
, 0);
3301 static void ip6_dst_gc(struct dst_ops
*ops
)
3303 struct net
*net
= container_of(ops
, struct net
, ipv6
.ip6_dst_ops
);
3304 int rt_min_interval
= net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
;
3305 int rt_elasticity
= net
->ipv6
.sysctl
.ip6_rt_gc_elasticity
;
3306 int rt_gc_timeout
= net
->ipv6
.sysctl
.ip6_rt_gc_timeout
;
3307 unsigned long rt_last_gc
= net
->ipv6
.ip6_rt_last_gc
;
3311 if (time_after(rt_last_gc
+ rt_min_interval
, jiffies
))
3314 fib6_run_gc(atomic_inc_return(&net
->ipv6
.ip6_rt_gc_expire
), net
, true);
3315 entries
= dst_entries_get_slow(ops
);
3316 if (entries
< ops
->gc_thresh
)
3317 atomic_set(&net
->ipv6
.ip6_rt_gc_expire
, rt_gc_timeout
>> 1);
3319 val
= atomic_read(&net
->ipv6
.ip6_rt_gc_expire
);
3320 atomic_set(&net
->ipv6
.ip6_rt_gc_expire
, val
- (val
>> rt_elasticity
));
3323 static int ip6_nh_lookup_table(struct net
*net
, struct fib6_config
*cfg
,
3324 const struct in6_addr
*gw_addr
, u32 tbid
,
3325 int flags
, struct fib6_result
*res
)
3327 struct flowi6 fl6
= {
3328 .flowi6_oif
= cfg
->fc_ifindex
,
3330 .saddr
= cfg
->fc_prefsrc
,
3332 struct fib6_table
*table
;
3335 table
= fib6_get_table(net
, tbid
);
3339 if (!ipv6_addr_any(&cfg
->fc_prefsrc
))
3340 flags
|= RT6_LOOKUP_F_HAS_SADDR
;
3342 flags
|= RT6_LOOKUP_F_IGNORE_LINKSTATE
;
3344 err
= fib6_table_lookup(net
, table
, cfg
->fc_ifindex
, &fl6
, res
, flags
);
3345 if (!err
&& res
->f6i
!= net
->ipv6
.fib6_null_entry
)
3346 fib6_select_path(net
, res
, &fl6
, cfg
->fc_ifindex
,
3347 cfg
->fc_ifindex
!= 0, NULL
, flags
);
3352 static int ip6_route_check_nh_onlink(struct net
*net
,
3353 struct fib6_config
*cfg
,
3354 const struct net_device
*dev
,
3355 struct netlink_ext_ack
*extack
)
3357 u32 tbid
= l3mdev_fib_table_rcu(dev
) ? : RT_TABLE_MAIN
;
3358 const struct in6_addr
*gw_addr
= &cfg
->fc_gateway
;
3359 struct fib6_result res
= {};
3362 err
= ip6_nh_lookup_table(net
, cfg
, gw_addr
, tbid
, 0, &res
);
3363 if (!err
&& !(res
.fib6_flags
& RTF_REJECT
) &&
3364 /* ignore match if it is the default route */
3365 !ipv6_addr_any(&res
.f6i
->fib6_dst
.addr
) &&
3366 (res
.fib6_type
!= RTN_UNICAST
|| dev
!= res
.nh
->fib_nh_dev
)) {
3367 NL_SET_ERR_MSG(extack
,
3368 "Nexthop has invalid gateway or device mismatch");
3375 static int ip6_route_check_nh(struct net
*net
,
3376 struct fib6_config
*cfg
,
3377 struct net_device
**_dev
,
3378 netdevice_tracker
*dev_tracker
,
3379 struct inet6_dev
**idev
)
3381 const struct in6_addr
*gw_addr
= &cfg
->fc_gateway
;
3382 struct net_device
*dev
= _dev
? *_dev
: NULL
;
3383 int flags
= RT6_LOOKUP_F_IFACE
;
3384 struct fib6_result res
= {};
3385 int err
= -EHOSTUNREACH
;
3387 if (cfg
->fc_table
) {
3388 err
= ip6_nh_lookup_table(net
, cfg
, gw_addr
,
3389 cfg
->fc_table
, flags
, &res
);
3390 /* gw_addr can not require a gateway or resolve to a reject
3391 * route. If a device is given, it must match the result.
3393 if (err
|| res
.fib6_flags
& RTF_REJECT
||
3394 res
.nh
->fib_nh_gw_family
||
3395 (dev
&& dev
!= res
.nh
->fib_nh_dev
))
3396 err
= -EHOSTUNREACH
;
3400 struct flowi6 fl6
= {
3401 .flowi6_oif
= cfg
->fc_ifindex
,
3405 err
= fib6_lookup(net
, cfg
->fc_ifindex
, &fl6
, &res
, flags
);
3406 if (err
|| res
.fib6_flags
& RTF_REJECT
||
3407 res
.nh
->fib_nh_gw_family
)
3408 err
= -EHOSTUNREACH
;
3413 fib6_select_path(net
, &res
, &fl6
, cfg
->fc_ifindex
,
3414 cfg
->fc_ifindex
!= 0, NULL
, flags
);
3419 if (dev
!= res
.nh
->fib_nh_dev
)
3420 err
= -EHOSTUNREACH
;
3422 *_dev
= dev
= res
.nh
->fib_nh_dev
;
3423 netdev_hold(dev
, dev_tracker
, GFP_ATOMIC
);
3424 *idev
= in6_dev_get(dev
);
3430 static int ip6_validate_gw(struct net
*net
, struct fib6_config
*cfg
,
3431 struct net_device
**_dev
,
3432 netdevice_tracker
*dev_tracker
,
3433 struct inet6_dev
**idev
,
3434 struct netlink_ext_ack
*extack
)
3436 const struct in6_addr
*gw_addr
= &cfg
->fc_gateway
;
3437 int gwa_type
= ipv6_addr_type(gw_addr
);
3438 bool skip_dev
= gwa_type
& IPV6_ADDR_LINKLOCAL
? false : true;
3439 const struct net_device
*dev
= *_dev
;
3440 bool need_addr_check
= !dev
;
3443 /* if gw_addr is local we will fail to detect this in case
3444 * address is still TENTATIVE (DAD in progress). rt6_lookup()
3445 * will return already-added prefix route via interface that
3446 * prefix route was assigned to, which might be non-loopback.
3449 ipv6_chk_addr_and_flags(net
, gw_addr
, dev
, skip_dev
, 0, 0)) {
3450 NL_SET_ERR_MSG(extack
, "Gateway can not be a local address");
3454 if (gwa_type
!= (IPV6_ADDR_LINKLOCAL
| IPV6_ADDR_UNICAST
)) {
3455 /* IPv6 strictly inhibits using not link-local
3456 * addresses as nexthop address.
3457 * Otherwise, router will not able to send redirects.
3458 * It is very good, but in some (rare!) circumstances
3459 * (SIT, PtP, NBMA NOARP links) it is handy to allow
3460 * some exceptions. --ANK
3461 * We allow IPv4-mapped nexthops to support RFC4798-type
3464 if (!(gwa_type
& (IPV6_ADDR_UNICAST
| IPV6_ADDR_MAPPED
))) {
3465 NL_SET_ERR_MSG(extack
, "Invalid gateway address");
3471 if (cfg
->fc_flags
& RTNH_F_ONLINK
)
3472 err
= ip6_route_check_nh_onlink(net
, cfg
, dev
, extack
);
3474 err
= ip6_route_check_nh(net
, cfg
, _dev
, dev_tracker
,
3483 /* reload in case device was changed */
3488 NL_SET_ERR_MSG(extack
, "Egress device not specified");
3490 } else if (dev
->flags
& IFF_LOOPBACK
) {
3491 NL_SET_ERR_MSG(extack
,
3492 "Egress device can not be loopback device for this route");
3496 /* if we did not check gw_addr above, do so now that the
3497 * egress device has been resolved.
3499 if (need_addr_check
&&
3500 ipv6_chk_addr_and_flags(net
, gw_addr
, dev
, skip_dev
, 0, 0)) {
3501 NL_SET_ERR_MSG(extack
, "Gateway can not be a local address");
3510 static bool fib6_is_reject(u32 flags
, struct net_device
*dev
, int addr_type
)
3512 if ((flags
& RTF_REJECT
) ||
3513 (dev
&& (dev
->flags
& IFF_LOOPBACK
) &&
3514 !(addr_type
& IPV6_ADDR_LOOPBACK
) &&
3515 !(flags
& (RTF_ANYCAST
| RTF_LOCAL
))))
3521 int fib6_nh_init(struct net
*net
, struct fib6_nh
*fib6_nh
,
3522 struct fib6_config
*cfg
, gfp_t gfp_flags
,
3523 struct netlink_ext_ack
*extack
)
3525 netdevice_tracker
*dev_tracker
= &fib6_nh
->fib_nh_dev_tracker
;
3526 struct net_device
*dev
= NULL
;
3527 struct inet6_dev
*idev
= NULL
;
3531 fib6_nh
->fib_nh_family
= AF_INET6
;
3532 #ifdef CONFIG_IPV6_ROUTER_PREF
3533 fib6_nh
->last_probe
= jiffies
;
3535 if (cfg
->fc_is_fdb
) {
3536 fib6_nh
->fib_nh_gw6
= cfg
->fc_gateway
;
3537 fib6_nh
->fib_nh_gw_family
= AF_INET6
;
3542 if (cfg
->fc_ifindex
) {
3543 dev
= netdev_get_by_index(net
, cfg
->fc_ifindex
,
3544 dev_tracker
, gfp_flags
);
3547 idev
= in6_dev_get(dev
);
3552 if (cfg
->fc_flags
& RTNH_F_ONLINK
) {
3554 NL_SET_ERR_MSG(extack
,
3555 "Nexthop device required for onlink");
3559 if (!(dev
->flags
& IFF_UP
)) {
3560 NL_SET_ERR_MSG(extack
, "Nexthop device is not up");
3565 fib6_nh
->fib_nh_flags
|= RTNH_F_ONLINK
;
3568 fib6_nh
->fib_nh_weight
= 1;
3570 /* We cannot add true routes via loopback here,
3571 * they would result in kernel looping; promote them to reject routes
3573 addr_type
= ipv6_addr_type(&cfg
->fc_dst
);
3574 if (fib6_is_reject(cfg
->fc_flags
, dev
, addr_type
)) {
3575 /* hold loopback dev/idev if we haven't done so. */
3576 if (dev
!= net
->loopback_dev
) {
3578 netdev_put(dev
, dev_tracker
);
3581 dev
= net
->loopback_dev
;
3582 netdev_hold(dev
, dev_tracker
, gfp_flags
);
3583 idev
= in6_dev_get(dev
);
3592 if (cfg
->fc_flags
& RTF_GATEWAY
) {
3593 err
= ip6_validate_gw(net
, cfg
, &dev
, dev_tracker
,
3598 fib6_nh
->fib_nh_gw6
= cfg
->fc_gateway
;
3599 fib6_nh
->fib_nh_gw_family
= AF_INET6
;
3606 if (!idev
|| idev
->cnf
.disable_ipv6
) {
3607 NL_SET_ERR_MSG(extack
, "IPv6 is disabled on nexthop device");
3612 if (!(dev
->flags
& IFF_UP
) && !cfg
->fc_ignore_dev_down
) {
3613 NL_SET_ERR_MSG(extack
, "Nexthop device is not up");
3618 if (!(cfg
->fc_flags
& (RTF_LOCAL
| RTF_ANYCAST
)) &&
3619 !netif_carrier_ok(dev
))
3620 fib6_nh
->fib_nh_flags
|= RTNH_F_LINKDOWN
;
3622 err
= fib_nh_common_init(net
, &fib6_nh
->nh_common
, cfg
->fc_encap
,
3623 cfg
->fc_encap_type
, cfg
, gfp_flags
, extack
);
3628 fib6_nh
->rt6i_pcpu
= alloc_percpu_gfp(struct rt6_info
*, gfp_flags
);
3629 if (!fib6_nh
->rt6i_pcpu
) {
3634 fib6_nh
->fib_nh_dev
= dev
;
3635 fib6_nh
->fib_nh_oif
= dev
->ifindex
;
3642 lwtstate_put(fib6_nh
->fib_nh_lws
);
3643 fib6_nh
->fib_nh_lws
= NULL
;
3644 netdev_put(dev
, dev_tracker
);
3650 void fib6_nh_release(struct fib6_nh
*fib6_nh
)
3652 struct rt6_exception_bucket
*bucket
;
3656 fib6_nh_flush_exceptions(fib6_nh
, NULL
);
3657 bucket
= fib6_nh_get_excptn_bucket(fib6_nh
, NULL
);
3659 rcu_assign_pointer(fib6_nh
->rt6i_exception_bucket
, NULL
);
3665 fib6_nh_release_dsts(fib6_nh
);
3666 free_percpu(fib6_nh
->rt6i_pcpu
);
3668 fib_nh_common_release(&fib6_nh
->nh_common
);
3671 void fib6_nh_release_dsts(struct fib6_nh
*fib6_nh
)
3675 if (!fib6_nh
->rt6i_pcpu
)
3678 for_each_possible_cpu(cpu
) {
3679 struct rt6_info
*pcpu_rt
, **ppcpu_rt
;
3681 ppcpu_rt
= per_cpu_ptr(fib6_nh
->rt6i_pcpu
, cpu
);
3682 pcpu_rt
= xchg(ppcpu_rt
, NULL
);
3684 dst_dev_put(&pcpu_rt
->dst
);
3685 dst_release(&pcpu_rt
->dst
);
3690 static struct fib6_info
*ip6_route_info_create(struct fib6_config
*cfg
,
3692 struct netlink_ext_ack
*extack
)
3694 struct net
*net
= cfg
->fc_nlinfo
.nl_net
;
3695 struct fib6_info
*rt
= NULL
;
3696 struct nexthop
*nh
= NULL
;
3697 struct fib6_table
*table
;
3698 struct fib6_nh
*fib6_nh
;
3702 /* RTF_PCPU is an internal flag; can not be set by userspace */
3703 if (cfg
->fc_flags
& RTF_PCPU
) {
3704 NL_SET_ERR_MSG(extack
, "Userspace can not set RTF_PCPU");
3708 /* RTF_CACHE is an internal flag; can not be set by userspace */
3709 if (cfg
->fc_flags
& RTF_CACHE
) {
3710 NL_SET_ERR_MSG(extack
, "Userspace can not set RTF_CACHE");
3714 if (cfg
->fc_type
> RTN_MAX
) {
3715 NL_SET_ERR_MSG(extack
, "Invalid route type");
3719 if (cfg
->fc_dst_len
> 128) {
3720 NL_SET_ERR_MSG(extack
, "Invalid prefix length");
3723 if (cfg
->fc_src_len
> 128) {
3724 NL_SET_ERR_MSG(extack
, "Invalid source address length");
3727 #ifndef CONFIG_IPV6_SUBTREES
3728 if (cfg
->fc_src_len
) {
3729 NL_SET_ERR_MSG(extack
,
3730 "Specifying source address requires IPV6_SUBTREES to be enabled");
3734 if (cfg
->fc_nh_id
) {
3735 nh
= nexthop_find_by_id(net
, cfg
->fc_nh_id
);
3737 NL_SET_ERR_MSG(extack
, "Nexthop id does not exist");
3740 err
= fib6_check_nexthop(nh
, cfg
, extack
);
3746 if (cfg
->fc_nlinfo
.nlh
&&
3747 !(cfg
->fc_nlinfo
.nlh
->nlmsg_flags
& NLM_F_CREATE
)) {
3748 table
= fib6_get_table(net
, cfg
->fc_table
);
3750 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
3751 table
= fib6_new_table(net
, cfg
->fc_table
);
3754 table
= fib6_new_table(net
, cfg
->fc_table
);
3761 rt
= fib6_info_alloc(gfp_flags
, !nh
);
3765 rt
->fib6_metrics
= ip_fib_metrics_init(cfg
->fc_mx
, cfg
->fc_mx_len
,
3767 if (IS_ERR(rt
->fib6_metrics
)) {
3768 err
= PTR_ERR(rt
->fib6_metrics
);
3769 /* Do not leave garbage there. */
3770 rt
->fib6_metrics
= (struct dst_metrics
*)&dst_default_metrics
;
3774 if (cfg
->fc_flags
& RTF_ADDRCONF
)
3775 rt
->dst_nocount
= true;
3777 if (cfg
->fc_flags
& RTF_EXPIRES
)
3778 fib6_set_expires(rt
, jiffies
+
3779 clock_t_to_jiffies(cfg
->fc_expires
));
3781 if (cfg
->fc_protocol
== RTPROT_UNSPEC
)
3782 cfg
->fc_protocol
= RTPROT_BOOT
;
3783 rt
->fib6_protocol
= cfg
->fc_protocol
;
3785 rt
->fib6_table
= table
;
3786 rt
->fib6_metric
= cfg
->fc_metric
;
3787 rt
->fib6_type
= cfg
->fc_type
? : RTN_UNICAST
;
3788 rt
->fib6_flags
= cfg
->fc_flags
& ~RTF_GATEWAY
;
3790 ipv6_addr_prefix(&rt
->fib6_dst
.addr
, &cfg
->fc_dst
, cfg
->fc_dst_len
);
3791 rt
->fib6_dst
.plen
= cfg
->fc_dst_len
;
3793 #ifdef CONFIG_IPV6_SUBTREES
3794 ipv6_addr_prefix(&rt
->fib6_src
.addr
, &cfg
->fc_src
, cfg
->fc_src_len
);
3795 rt
->fib6_src
.plen
= cfg
->fc_src_len
;
3798 if (rt
->fib6_src
.plen
) {
3799 NL_SET_ERR_MSG(extack
, "Nexthops can not be used with source routing");
3802 if (!nexthop_get(nh
)) {
3803 NL_SET_ERR_MSG(extack
, "Nexthop has been deleted");
3807 fib6_nh
= nexthop_fib6_nh(rt
->nh
);
3809 err
= fib6_nh_init(net
, rt
->fib6_nh
, cfg
, gfp_flags
, extack
);
3813 fib6_nh
= rt
->fib6_nh
;
3815 /* We cannot add true routes via loopback here, they would
3816 * result in kernel looping; promote them to reject routes
3818 addr_type
= ipv6_addr_type(&cfg
->fc_dst
);
3819 if (fib6_is_reject(cfg
->fc_flags
, rt
->fib6_nh
->fib_nh_dev
,
3821 rt
->fib6_flags
= RTF_REJECT
| RTF_NONEXTHOP
;
3824 if (!ipv6_addr_any(&cfg
->fc_prefsrc
)) {
3825 struct net_device
*dev
= fib6_nh
->fib_nh_dev
;
3827 if (!ipv6_chk_addr(net
, &cfg
->fc_prefsrc
, dev
, 0)) {
3828 NL_SET_ERR_MSG(extack
, "Invalid source address");
3832 rt
->fib6_prefsrc
.addr
= cfg
->fc_prefsrc
;
3833 rt
->fib6_prefsrc
.plen
= 128;
3835 rt
->fib6_prefsrc
.plen
= 0;
3839 fib6_info_release(rt
);
3840 return ERR_PTR(err
);
3842 ip_fib_metrics_put(rt
->fib6_metrics
);
3844 return ERR_PTR(err
);
3847 int ip6_route_add(struct fib6_config
*cfg
, gfp_t gfp_flags
,
3848 struct netlink_ext_ack
*extack
)
3850 struct fib6_info
*rt
;
3853 rt
= ip6_route_info_create(cfg
, gfp_flags
, extack
);
3857 err
= __ip6_ins_rt(rt
, &cfg
->fc_nlinfo
, extack
);
3858 fib6_info_release(rt
);
3863 static int __ip6_del_rt(struct fib6_info
*rt
, struct nl_info
*info
)
3865 struct net
*net
= info
->nl_net
;
3866 struct fib6_table
*table
;
3869 if (rt
== net
->ipv6
.fib6_null_entry
) {
3874 table
= rt
->fib6_table
;
3875 spin_lock_bh(&table
->tb6_lock
);
3876 err
= fib6_del(rt
, info
);
3877 spin_unlock_bh(&table
->tb6_lock
);
3880 fib6_info_release(rt
);
3884 int ip6_del_rt(struct net
*net
, struct fib6_info
*rt
, bool skip_notify
)
3886 struct nl_info info
= {
3888 .skip_notify
= skip_notify
3891 return __ip6_del_rt(rt
, &info
);
3894 static int __ip6_del_rt_siblings(struct fib6_info
*rt
, struct fib6_config
*cfg
)
3896 struct nl_info
*info
= &cfg
->fc_nlinfo
;
3897 struct net
*net
= info
->nl_net
;
3898 struct sk_buff
*skb
= NULL
;
3899 struct fib6_table
*table
;
3902 if (rt
== net
->ipv6
.fib6_null_entry
)
3904 table
= rt
->fib6_table
;
3905 spin_lock_bh(&table
->tb6_lock
);
3907 if (rt
->fib6_nsiblings
&& cfg
->fc_delete_all_nh
) {
3908 struct fib6_info
*sibling
, *next_sibling
;
3909 struct fib6_node
*fn
;
3911 /* prefer to send a single notification with all hops */
3912 skb
= nlmsg_new(rt6_nlmsg_size(rt
), gfp_any());
3914 u32 seq
= info
->nlh
? info
->nlh
->nlmsg_seq
: 0;
3916 if (rt6_fill_node(net
, skb
, rt
, NULL
,
3917 NULL
, NULL
, 0, RTM_DELROUTE
,
3918 info
->portid
, seq
, 0) < 0) {
3922 info
->skip_notify
= 1;
3925 /* 'rt' points to the first sibling route. If it is not the
3926 * leaf, then we do not need to send a notification. Otherwise,
3927 * we need to check if the last sibling has a next route or not
3928 * and emit a replace or delete notification, respectively.
3930 info
->skip_notify_kernel
= 1;
3931 fn
= rcu_dereference_protected(rt
->fib6_node
,
3932 lockdep_is_held(&table
->tb6_lock
));
3933 if (rcu_access_pointer(fn
->leaf
) == rt
) {
3934 struct fib6_info
*last_sibling
, *replace_rt
;
3936 last_sibling
= list_last_entry(&rt
->fib6_siblings
,
3939 replace_rt
= rcu_dereference_protected(
3940 last_sibling
->fib6_next
,
3941 lockdep_is_held(&table
->tb6_lock
));
3943 call_fib6_entry_notifiers_replace(net
,
3946 call_fib6_multipath_entry_notifiers(net
,
3947 FIB_EVENT_ENTRY_DEL
,
3948 rt
, rt
->fib6_nsiblings
,
3951 list_for_each_entry_safe(sibling
, next_sibling
,
3954 err
= fib6_del(sibling
, info
);
3960 err
= fib6_del(rt
, info
);
3962 spin_unlock_bh(&table
->tb6_lock
);
3964 fib6_info_release(rt
);
3967 rtnl_notify(skb
, net
, info
->portid
, RTNLGRP_IPV6_ROUTE
,
3968 info
->nlh
, gfp_any());
3973 static int __ip6_del_cached_rt(struct rt6_info
*rt
, struct fib6_config
*cfg
)
3977 if (cfg
->fc_ifindex
&& rt
->dst
.dev
->ifindex
!= cfg
->fc_ifindex
)
3980 if (cfg
->fc_flags
& RTF_GATEWAY
&&
3981 !ipv6_addr_equal(&cfg
->fc_gateway
, &rt
->rt6i_gateway
))
3984 rc
= rt6_remove_exception_rt(rt
);
3989 static int ip6_del_cached_rt(struct fib6_config
*cfg
, struct fib6_info
*rt
,
3992 struct fib6_result res
= {
3996 struct rt6_info
*rt_cache
;
3998 rt_cache
= rt6_find_cached_rt(&res
, &cfg
->fc_dst
, &cfg
->fc_src
);
4000 return __ip6_del_cached_rt(rt_cache
, cfg
);
/* Argument bundle handed to fib6_nh_del_cached_rt() for each fib6_nh. */
struct fib6_nh_del_cached_rt_arg {
	struct fib6_config *cfg;	/* delete request being processed */
	struct fib6_info *f6i;		/* FIB entry that owns the nexthops */
};
4010 static int fib6_nh_del_cached_rt(struct fib6_nh
*nh
, void *_arg
)
4012 struct fib6_nh_del_cached_rt_arg
*arg
= _arg
;
4015 rc
= ip6_del_cached_rt(arg
->cfg
, arg
->f6i
, nh
);
4016 return rc
!= -ESRCH
? rc
: 0;
4019 static int ip6_del_cached_rt_nh(struct fib6_config
*cfg
, struct fib6_info
*f6i
)
4021 struct fib6_nh_del_cached_rt_arg arg
= {
4026 return nexthop_for_each_fib6_nh(f6i
->nh
, fib6_nh_del_cached_rt
, &arg
);
4029 static int ip6_route_del(struct fib6_config
*cfg
,
4030 struct netlink_ext_ack
*extack
)
4032 struct fib6_table
*table
;
4033 struct fib6_info
*rt
;
4034 struct fib6_node
*fn
;
4037 table
= fib6_get_table(cfg
->fc_nlinfo
.nl_net
, cfg
->fc_table
);
4039 NL_SET_ERR_MSG(extack
, "FIB table does not exist");
4045 fn
= fib6_locate(&table
->tb6_root
,
4046 &cfg
->fc_dst
, cfg
->fc_dst_len
,
4047 &cfg
->fc_src
, cfg
->fc_src_len
,
4048 !(cfg
->fc_flags
& RTF_CACHE
));
4051 for_each_fib6_node_rt_rcu(fn
) {
4054 if (rt
->nh
&& cfg
->fc_nh_id
&&
4055 rt
->nh
->id
!= cfg
->fc_nh_id
)
4058 if (cfg
->fc_flags
& RTF_CACHE
) {
4062 rc
= ip6_del_cached_rt_nh(cfg
, rt
);
4063 } else if (cfg
->fc_nh_id
) {
4067 rc
= ip6_del_cached_rt(cfg
, rt
, nh
);
4076 if (cfg
->fc_metric
&& cfg
->fc_metric
!= rt
->fib6_metric
)
4078 if (cfg
->fc_protocol
&&
4079 cfg
->fc_protocol
!= rt
->fib6_protocol
)
4083 if (!fib6_info_hold_safe(rt
))
4087 return __ip6_del_rt(rt
, &cfg
->fc_nlinfo
);
4093 if (cfg
->fc_ifindex
&&
4095 nh
->fib_nh_dev
->ifindex
!= cfg
->fc_ifindex
))
4097 if (cfg
->fc_flags
& RTF_GATEWAY
&&
4098 !ipv6_addr_equal(&cfg
->fc_gateway
, &nh
->fib_nh_gw6
))
4100 if (!fib6_info_hold_safe(rt
))
4104 /* if gateway was specified only delete the one hop */
4105 if (cfg
->fc_flags
& RTF_GATEWAY
)
4106 return __ip6_del_rt(rt
, &cfg
->fc_nlinfo
);
4108 return __ip6_del_rt_siblings(rt
, cfg
);
4116 static void rt6_do_redirect(struct dst_entry
*dst
, struct sock
*sk
, struct sk_buff
*skb
)
4118 struct netevent_redirect netevent
;
4119 struct rt6_info
*rt
, *nrt
= NULL
;
4120 struct fib6_result res
= {};
4121 struct ndisc_options ndopts
;
4122 struct inet6_dev
*in6_dev
;
4123 struct neighbour
*neigh
;
4125 int optlen
, on_link
;
4128 optlen
= skb_tail_pointer(skb
) - skb_transport_header(skb
);
4129 optlen
-= sizeof(*msg
);
4132 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
4136 msg
= (struct rd_msg
*)icmp6_hdr(skb
);
4138 if (ipv6_addr_is_multicast(&msg
->dest
)) {
4139 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
4144 if (ipv6_addr_equal(&msg
->dest
, &msg
->target
)) {
4146 } else if (ipv6_addr_type(&msg
->target
) !=
4147 (IPV6_ADDR_UNICAST
|IPV6_ADDR_LINKLOCAL
)) {
4148 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
4152 in6_dev
= __in6_dev_get(skb
->dev
);
4155 if (READ_ONCE(in6_dev
->cnf
.forwarding
) ||
4156 !READ_ONCE(in6_dev
->cnf
.accept_redirects
))
4160 * The IP source address of the Redirect MUST be the same as the current
4161 * first-hop router for the specified ICMP Destination Address.
4164 if (!ndisc_parse_options(skb
->dev
, msg
->opt
, optlen
, &ndopts
)) {
4165 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
4170 if (ndopts
.nd_opts_tgt_lladdr
) {
4171 lladdr
= ndisc_opt_addr_data(ndopts
.nd_opts_tgt_lladdr
,
4174 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
4179 rt
= dst_rt6_info(dst
);
4180 if (rt
->rt6i_flags
& RTF_REJECT
) {
4181 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
4185 /* Redirect received -> path was valid.
4186 * Look, redirects are sent only in response to data packets,
4187 * so that this nexthop apparently is reachable. --ANK
4189 dst_confirm_neigh(&rt
->dst
, &ipv6_hdr(skb
)->saddr
);
4191 neigh
= __neigh_lookup(&nd_tbl
, &msg
->target
, skb
->dev
, 1);
4196 * We have finally decided to accept it.
4199 ndisc_update(skb
->dev
, neigh
, lladdr
, NUD_STALE
,
4200 NEIGH_UPDATE_F_WEAK_OVERRIDE
|
4201 NEIGH_UPDATE_F_OVERRIDE
|
4202 (on_link
? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER
|
4203 NEIGH_UPDATE_F_ISROUTER
)),
4204 NDISC_REDIRECT
, &ndopts
);
4207 res
.f6i
= rcu_dereference(rt
->from
);
4212 struct fib6_nh_match_arg arg
= {
4214 .gw
= &rt
->rt6i_gateway
,
4217 nexthop_for_each_fib6_nh(res
.f6i
->nh
,
4218 fib6_nh_find_match
, &arg
);
4220 /* fib6_info uses a nexthop that does not have fib6_nh
4221 * using the dst->dev. Should be impossible
4227 res
.nh
= res
.f6i
->fib6_nh
;
4230 res
.fib6_flags
= res
.f6i
->fib6_flags
;
4231 res
.fib6_type
= res
.f6i
->fib6_type
;
4232 nrt
= ip6_rt_cache_alloc(&res
, &msg
->dest
, NULL
);
4236 nrt
->rt6i_flags
= RTF_GATEWAY
|RTF_UP
|RTF_DYNAMIC
|RTF_CACHE
;
4238 nrt
->rt6i_flags
&= ~RTF_GATEWAY
;
4240 nrt
->rt6i_gateway
= *(struct in6_addr
*)neigh
->primary_key
;
4242 /* rt6_insert_exception() will take care of duplicated exceptions */
4243 if (rt6_insert_exception(nrt
, &res
)) {
4244 dst_release_immediate(&nrt
->dst
);
4248 netevent
.old
= &rt
->dst
;
4249 netevent
.new = &nrt
->dst
;
4250 netevent
.daddr
= &msg
->dest
;
4251 netevent
.neigh
= neigh
;
4252 call_netevent_notifiers(NETEVENT_REDIRECT
, &netevent
);
4256 neigh_release(neigh
);
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Find the RTF_ROUTEINFO route for @prefix/@prefixlen via @gwaddr on @dev.
 *
 * Returns the entry with a reference held, or NULL when the table or the
 * node does not exist. Entries that use nexthop objects are skipped.
 */
static struct fib6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	int ifindex = dev->ifindex;
	struct fib6_node *fn;
	struct fib6_info *rt = NULL;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
	if (!fn)
		goto out;

	for_each_fib6_node_rt_rcu(fn) {
		/* these routes do not use nexthops */
		if (rt->nh)
			continue;
		if (rt->fib6_nh->fib_nh_dev->ifindex != ifindex)
			continue;
		if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
		    !rt->fib6_nh->fib_nh_gw_family)
			continue;
		if (!ipv6_addr_equal(&rt->fib6_nh->fib_nh_gw6, gwaddr))
			continue;
		if (!fib6_info_hold_safe(rt))
			continue;
		break;
	}
out:
	rcu_read_unlock();
	return rt;
}

/*
 * Install a route-information route for @prefix/@prefixlen via @gwaddr on
 * @dev with router preference @pref, then look it up again.
 *
 * The result of ip6_route_add() is not checked; the return value is
 * whatever rt6_get_route_info() finds afterwards.
 */
static struct fib6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev,
					   unsigned int pref)
{
	struct fib6_config cfg = {
		.fc_metric	= IP6_RT_PRIO_USER,
		.fc_ifindex	= dev->ifindex,
		.fc_dst_len	= prefixlen,
		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
				  RTF_UP | RTF_PREF(pref),
		.fc_protocol = RTPROT_RA,
		.fc_type = RTN_UNICAST,
		.fc_nlinfo.portid = 0,
		.fc_nlinfo.nlh = NULL,
		.fc_nlinfo.nl_net = net,
	};

	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	cfg.fc_dst = *prefix;
	cfg.fc_gateway = *gwaddr;

	/* We should treat it as a default route if prefix length is 0. */
	if (!prefixlen)
		cfg.fc_flags |= RTF_DEFAULT;

	ip6_route_add(&cfg, GFP_ATOMIC, NULL);

	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
}
#endif
4333 struct fib6_info
*rt6_get_dflt_router(struct net
*net
,
4334 const struct in6_addr
*addr
,
4335 struct net_device
*dev
)
4337 u32 tb_id
= l3mdev_fib_table(dev
) ? : RT6_TABLE_DFLT
;
4338 struct fib6_info
*rt
;
4339 struct fib6_table
*table
;
4341 table
= fib6_get_table(net
, tb_id
);
4346 for_each_fib6_node_rt_rcu(&table
->tb6_root
) {
4349 /* RA routes do not use nexthops */
4354 if (dev
== nh
->fib_nh_dev
&&
4355 ((rt
->fib6_flags
& (RTF_ADDRCONF
| RTF_DEFAULT
)) == (RTF_ADDRCONF
| RTF_DEFAULT
)) &&
4356 ipv6_addr_equal(&nh
->fib_nh_gw6
, addr
))
4359 if (rt
&& !fib6_info_hold_safe(rt
))
4365 struct fib6_info
*rt6_add_dflt_router(struct net
*net
,
4366 const struct in6_addr
*gwaddr
,
4367 struct net_device
*dev
,
4369 u32 defrtr_usr_metric
,
4372 struct fib6_config cfg
= {
4373 .fc_table
= l3mdev_fib_table(dev
) ? : RT6_TABLE_DFLT
,
4374 .fc_metric
= defrtr_usr_metric
,
4375 .fc_ifindex
= dev
->ifindex
,
4376 .fc_flags
= RTF_GATEWAY
| RTF_ADDRCONF
| RTF_DEFAULT
|
4377 RTF_UP
| RTF_EXPIRES
| RTF_PREF(pref
),
4378 .fc_protocol
= RTPROT_RA
,
4379 .fc_type
= RTN_UNICAST
,
4380 .fc_nlinfo
.portid
= 0,
4381 .fc_nlinfo
.nlh
= NULL
,
4382 .fc_nlinfo
.nl_net
= net
,
4383 .fc_expires
= jiffies_to_clock_t(lifetime
* HZ
),
4386 cfg
.fc_gateway
= *gwaddr
;
4388 if (!ip6_route_add(&cfg
, GFP_ATOMIC
, NULL
)) {
4389 struct fib6_table
*table
;
4391 table
= fib6_get_table(dev_net(dev
), cfg
.fc_table
);
4393 table
->flags
|= RT6_TABLE_HAS_DFLT_ROUTER
;
4396 return rt6_get_dflt_router(net
, gwaddr
, dev
);
4399 static void __rt6_purge_dflt_routers(struct net
*net
,
4400 struct fib6_table
*table
)
4402 struct fib6_info
*rt
;
4406 for_each_fib6_node_rt_rcu(&table
->tb6_root
) {
4407 struct net_device
*dev
= fib6_info_nh_dev(rt
);
4408 struct inet6_dev
*idev
= dev
? __in6_dev_get(dev
) : NULL
;
4410 if (rt
->fib6_flags
& (RTF_DEFAULT
| RTF_ADDRCONF
) &&
4411 (!idev
|| idev
->cnf
.accept_ra
!= 2) &&
4412 fib6_info_hold_safe(rt
)) {
4414 ip6_del_rt(net
, rt
, false);
4420 table
->flags
&= ~RT6_TABLE_HAS_DFLT_ROUTER
;
4423 void rt6_purge_dflt_routers(struct net
*net
)
4425 struct fib6_table
*table
;
4426 struct hlist_head
*head
;
4431 for (h
= 0; h
< FIB6_TABLE_HASHSZ
; h
++) {
4432 head
= &net
->ipv6
.fib_table_hash
[h
];
4433 hlist_for_each_entry_rcu(table
, head
, tb6_hlist
) {
4434 if (table
->flags
& RT6_TABLE_HAS_DFLT_ROUTER
)
4435 __rt6_purge_dflt_routers(net
, table
);
4442 static void rtmsg_to_fib6_config(struct net
*net
,
4443 struct in6_rtmsg
*rtmsg
,
4444 struct fib6_config
*cfg
)
4446 *cfg
= (struct fib6_config
){
4447 .fc_table
= l3mdev_fib_table_by_index(net
, rtmsg
->rtmsg_ifindex
) ?
4449 .fc_ifindex
= rtmsg
->rtmsg_ifindex
,
4450 .fc_metric
= rtmsg
->rtmsg_metric
,
4451 .fc_expires
= rtmsg
->rtmsg_info
,
4452 .fc_dst_len
= rtmsg
->rtmsg_dst_len
,
4453 .fc_src_len
= rtmsg
->rtmsg_src_len
,
4454 .fc_flags
= rtmsg
->rtmsg_flags
,
4455 .fc_type
= rtmsg
->rtmsg_type
,
4457 .fc_nlinfo
.nl_net
= net
,
4459 .fc_dst
= rtmsg
->rtmsg_dst
,
4460 .fc_src
= rtmsg
->rtmsg_src
,
4461 .fc_gateway
= rtmsg
->rtmsg_gateway
,
4465 int ipv6_route_ioctl(struct net
*net
, unsigned int cmd
, struct in6_rtmsg
*rtmsg
)
4467 struct fib6_config cfg
;
4470 if (cmd
!= SIOCADDRT
&& cmd
!= SIOCDELRT
)
4472 if (!ns_capable(net
->user_ns
, CAP_NET_ADMIN
))
4475 rtmsg_to_fib6_config(net
, rtmsg
, &cfg
);
4480 /* Only do the default setting of fc_metric in route adding */
4481 if (cfg
.fc_metric
== 0)
4482 cfg
.fc_metric
= IP6_RT_PRIO_USER
;
4483 err
= ip6_route_add(&cfg
, GFP_KERNEL
, NULL
);
4486 err
= ip6_route_del(&cfg
, NULL
);
4494 * Drop the packet on the floor
4497 static int ip6_pkt_drop(struct sk_buff
*skb
, u8 code
, int ipstats_mib_noroutes
)
4499 struct dst_entry
*dst
= skb_dst(skb
);
4500 struct net
*net
= dev_net(dst
->dev
);
4501 struct inet6_dev
*idev
;
4505 if (netif_is_l3_master(skb
->dev
) ||
4506 dst
->dev
== net
->loopback_dev
)
4507 idev
= __in6_dev_get_safely(dev_get_by_index_rcu(net
, IP6CB(skb
)->iif
));
4509 idev
= ip6_dst_idev(dst
);
4511 switch (ipstats_mib_noroutes
) {
4512 case IPSTATS_MIB_INNOROUTES
:
4513 type
= ipv6_addr_type(&ipv6_hdr(skb
)->daddr
);
4514 if (type
== IPV6_ADDR_ANY
) {
4515 SKB_DR_SET(reason
, IP_INADDRERRORS
);
4516 IP6_INC_STATS(net
, idev
, IPSTATS_MIB_INADDRERRORS
);
4519 SKB_DR_SET(reason
, IP_INNOROUTES
);
4521 case IPSTATS_MIB_OUTNOROUTES
:
4522 SKB_DR_OR(reason
, IP_OUTNOROUTES
);
4523 IP6_INC_STATS(net
, idev
, ipstats_mib_noroutes
);
4527 /* Start over by dropping the dst for l3mdev case */
4528 if (netif_is_l3_master(skb
->dev
))
4531 icmpv6_send(skb
, ICMPV6_DEST_UNREACH
, code
, 0);
4532 kfree_skb_reason(skb
, reason
);
4536 static int ip6_pkt_discard(struct sk_buff
*skb
)
4538 return ip6_pkt_drop(skb
, ICMPV6_NOROUTE
, IPSTATS_MIB_INNOROUTES
);
4541 static int ip6_pkt_discard_out(struct net
*net
, struct sock
*sk
, struct sk_buff
*skb
)
4543 skb
->dev
= skb_dst(skb
)->dev
;
4544 return ip6_pkt_drop(skb
, ICMPV6_NOROUTE
, IPSTATS_MIB_OUTNOROUTES
);
4547 static int ip6_pkt_prohibit(struct sk_buff
*skb
)
4549 return ip6_pkt_drop(skb
, ICMPV6_ADM_PROHIBITED
, IPSTATS_MIB_INNOROUTES
);
4552 static int ip6_pkt_prohibit_out(struct net
*net
, struct sock
*sk
, struct sk_buff
*skb
)
4554 skb
->dev
= skb_dst(skb
)->dev
;
4555 return ip6_pkt_drop(skb
, ICMPV6_ADM_PROHIBITED
, IPSTATS_MIB_OUTNOROUTES
);
4559 * Allocate a dst for local (unicast / anycast) address.
4562 struct fib6_info
*addrconf_f6i_alloc(struct net
*net
,
4563 struct inet6_dev
*idev
,
4564 const struct in6_addr
*addr
,
4565 bool anycast
, gfp_t gfp_flags
,
4566 struct netlink_ext_ack
*extack
)
4568 struct fib6_config cfg
= {
4569 .fc_table
= l3mdev_fib_table(idev
->dev
) ? : RT6_TABLE_LOCAL
,
4570 .fc_ifindex
= idev
->dev
->ifindex
,
4571 .fc_flags
= RTF_UP
| RTF_NONEXTHOP
,
4574 .fc_protocol
= RTPROT_KERNEL
,
4575 .fc_nlinfo
.nl_net
= net
,
4576 .fc_ignore_dev_down
= true,
4578 struct fib6_info
*f6i
;
4581 cfg
.fc_type
= RTN_ANYCAST
;
4582 cfg
.fc_flags
|= RTF_ANYCAST
;
4584 cfg
.fc_type
= RTN_LOCAL
;
4585 cfg
.fc_flags
|= RTF_LOCAL
;
4588 f6i
= ip6_route_info_create(&cfg
, gfp_flags
, extack
);
4590 f6i
->dst_nocount
= true;
4593 (READ_ONCE(net
->ipv6
.devconf_all
->disable_policy
) ||
4594 READ_ONCE(idev
->cnf
.disable_policy
)))
4595 f6i
->dst_nopolicy
= true;
/* remove deleted ip from prefsrc entries */
struct arg_dev_net_ip {
	struct net *net;	/* namespace being cleaned */
	struct in6_addr *addr;	/* address that went away */
};
4607 static int fib6_remove_prefsrc(struct fib6_info
*rt
, void *arg
)
4609 struct net
*net
= ((struct arg_dev_net_ip
*)arg
)->net
;
4610 struct in6_addr
*addr
= ((struct arg_dev_net_ip
*)arg
)->addr
;
4613 rt
!= net
->ipv6
.fib6_null_entry
&&
4614 ipv6_addr_equal(addr
, &rt
->fib6_prefsrc
.addr
) &&
4615 !ipv6_chk_addr(net
, addr
, rt
->fib6_nh
->fib_nh_dev
, 0)) {
4616 spin_lock_bh(&rt6_exception_lock
);
4617 /* remove prefsrc entry */
4618 rt
->fib6_prefsrc
.plen
= 0;
4619 spin_unlock_bh(&rt6_exception_lock
);
4624 void rt6_remove_prefsrc(struct inet6_ifaddr
*ifp
)
4626 struct net
*net
= dev_net(ifp
->idev
->dev
);
4627 struct arg_dev_net_ip adni
= {
4631 fib6_clean_all(net
, fib6_remove_prefsrc
, &adni
);
4634 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT)
4636 /* Remove routers and update dst entries when gateway turn into host. */
4637 static int fib6_clean_tohost(struct fib6_info
*rt
, void *arg
)
4639 struct in6_addr
*gateway
= (struct in6_addr
*)arg
;
4642 /* RA routes do not use nexthops */
4647 if (((rt
->fib6_flags
& RTF_RA_ROUTER
) == RTF_RA_ROUTER
) &&
4648 nh
->fib_nh_gw_family
&& ipv6_addr_equal(gateway
, &nh
->fib_nh_gw6
))
4651 /* Further clean up cached routes in exception table.
4652 * This is needed because cached route may have a different
4653 * gateway than its 'parent' in the case of an ip redirect.
4655 fib6_nh_exceptions_clean_tohost(nh
, gateway
);
4660 void rt6_clean_tohost(struct net
*net
, struct in6_addr
*gateway
)
4662 fib6_clean_all(net
, fib6_clean_tohost
, gateway
);
/*
 * Argument for the fib6_ifup() / fib6_ifdown() table walkers.
 * fib6_ifup() reads nh_flags, fib6_ifdown() reads event; the two share
 * storage.
 */
struct arg_netdev_event {
	const struct net_device *dev;
	union {
		unsigned char nh_flags;
		unsigned long event;
	};
};
4673 static struct fib6_info
*rt6_multipath_first_sibling(const struct fib6_info
*rt
)
4675 struct fib6_info
*iter
;
4676 struct fib6_node
*fn
;
4678 fn
= rcu_dereference_protected(rt
->fib6_node
,
4679 lockdep_is_held(&rt
->fib6_table
->tb6_lock
));
4680 iter
= rcu_dereference_protected(fn
->leaf
,
4681 lockdep_is_held(&rt
->fib6_table
->tb6_lock
));
4683 if (iter
->fib6_metric
== rt
->fib6_metric
&&
4684 rt6_qualify_for_ecmp(iter
))
4686 iter
= rcu_dereference_protected(iter
->fib6_next
,
4687 lockdep_is_held(&rt
->fib6_table
->tb6_lock
));
4693 /* only called for fib entries with builtin fib6_nh */
4694 static bool rt6_is_dead(const struct fib6_info
*rt
)
4696 if (rt
->fib6_nh
->fib_nh_flags
& RTNH_F_DEAD
||
4697 (rt
->fib6_nh
->fib_nh_flags
& RTNH_F_LINKDOWN
&&
4698 ip6_ignore_linkdown(rt
->fib6_nh
->fib_nh_dev
)))
4704 static int rt6_multipath_total_weight(const struct fib6_info
*rt
)
4706 struct fib6_info
*iter
;
4709 if (!rt6_is_dead(rt
))
4710 total
+= rt
->fib6_nh
->fib_nh_weight
;
4712 list_for_each_entry(iter
, &rt
->fib6_siblings
, fib6_siblings
) {
4713 if (!rt6_is_dead(iter
))
4714 total
+= iter
->fib6_nh
->fib_nh_weight
;
4720 static void rt6_upper_bound_set(struct fib6_info
*rt
, int *weight
, int total
)
4722 int upper_bound
= -1;
4724 if (!rt6_is_dead(rt
)) {
4725 *weight
+= rt
->fib6_nh
->fib_nh_weight
;
4726 upper_bound
= DIV_ROUND_CLOSEST_ULL((u64
) (*weight
) << 31,
4729 atomic_set(&rt
->fib6_nh
->fib_nh_upper_bound
, upper_bound
);
4732 static void rt6_multipath_upper_bound_set(struct fib6_info
*rt
, int total
)
4734 struct fib6_info
*iter
;
4737 rt6_upper_bound_set(rt
, &weight
, total
);
4739 list_for_each_entry(iter
, &rt
->fib6_siblings
, fib6_siblings
)
4740 rt6_upper_bound_set(iter
, &weight
, total
);
4743 void rt6_multipath_rebalance(struct fib6_info
*rt
)
4745 struct fib6_info
*first
;
4748 /* In case the entire multipath route was marked for flushing,
4749 * then there is no need to rebalance upon the removal of every
4752 if (!rt
->fib6_nsiblings
|| rt
->should_flush
)
4755 /* During lookup routes are evaluated in order, so we need to
4756 * make sure upper bounds are assigned from the first sibling
4759 first
= rt6_multipath_first_sibling(rt
);
4760 if (WARN_ON_ONCE(!first
))
4763 total
= rt6_multipath_total_weight(first
);
4764 rt6_multipath_upper_bound_set(first
, total
);
4767 static int fib6_ifup(struct fib6_info
*rt
, void *p_arg
)
4769 const struct arg_netdev_event
*arg
= p_arg
;
4770 struct net
*net
= dev_net(arg
->dev
);
4772 if (rt
!= net
->ipv6
.fib6_null_entry
&& !rt
->nh
&&
4773 rt
->fib6_nh
->fib_nh_dev
== arg
->dev
) {
4774 rt
->fib6_nh
->fib_nh_flags
&= ~arg
->nh_flags
;
4775 fib6_update_sernum_upto_root(net
, rt
);
4776 rt6_multipath_rebalance(rt
);
4782 void rt6_sync_up(struct net_device
*dev
, unsigned char nh_flags
)
4784 struct arg_netdev_event arg
= {
4787 .nh_flags
= nh_flags
,
4791 if (nh_flags
& RTNH_F_DEAD
&& netif_carrier_ok(dev
))
4792 arg
.nh_flags
|= RTNH_F_LINKDOWN
;
4794 fib6_clean_all(dev_net(dev
), fib6_ifup
, &arg
);
4797 /* only called for fib entries with inline fib6_nh */
4798 static bool rt6_multipath_uses_dev(const struct fib6_info
*rt
,
4799 const struct net_device
*dev
)
4801 struct fib6_info
*iter
;
4803 if (rt
->fib6_nh
->fib_nh_dev
== dev
)
4805 list_for_each_entry(iter
, &rt
->fib6_siblings
, fib6_siblings
)
4806 if (iter
->fib6_nh
->fib_nh_dev
== dev
)
4812 static void rt6_multipath_flush(struct fib6_info
*rt
)
4814 struct fib6_info
*iter
;
4816 rt
->should_flush
= 1;
4817 list_for_each_entry(iter
, &rt
->fib6_siblings
, fib6_siblings
)
4818 iter
->should_flush
= 1;
4821 static unsigned int rt6_multipath_dead_count(const struct fib6_info
*rt
,
4822 const struct net_device
*down_dev
)
4824 struct fib6_info
*iter
;
4825 unsigned int dead
= 0;
4827 if (rt
->fib6_nh
->fib_nh_dev
== down_dev
||
4828 rt
->fib6_nh
->fib_nh_flags
& RTNH_F_DEAD
)
4830 list_for_each_entry(iter
, &rt
->fib6_siblings
, fib6_siblings
)
4831 if (iter
->fib6_nh
->fib_nh_dev
== down_dev
||
4832 iter
->fib6_nh
->fib_nh_flags
& RTNH_F_DEAD
)
4838 static void rt6_multipath_nh_flags_set(struct fib6_info
*rt
,
4839 const struct net_device
*dev
,
4840 unsigned char nh_flags
)
4842 struct fib6_info
*iter
;
4844 if (rt
->fib6_nh
->fib_nh_dev
== dev
)
4845 rt
->fib6_nh
->fib_nh_flags
|= nh_flags
;
4846 list_for_each_entry(iter
, &rt
->fib6_siblings
, fib6_siblings
)
4847 if (iter
->fib6_nh
->fib_nh_dev
== dev
)
4848 iter
->fib6_nh
->fib_nh_flags
|= nh_flags
;
4851 /* called with write lock held for table with rt */
4852 static int fib6_ifdown(struct fib6_info
*rt
, void *p_arg
)
4854 const struct arg_netdev_event
*arg
= p_arg
;
4855 const struct net_device
*dev
= arg
->dev
;
4856 struct net
*net
= dev_net(dev
);
4858 if (rt
== net
->ipv6
.fib6_null_entry
|| rt
->nh
)
4861 switch (arg
->event
) {
4862 case NETDEV_UNREGISTER
:
4863 return rt
->fib6_nh
->fib_nh_dev
== dev
? -1 : 0;
4865 if (rt
->should_flush
)
4867 if (!rt
->fib6_nsiblings
)
4868 return rt
->fib6_nh
->fib_nh_dev
== dev
? -1 : 0;
4869 if (rt6_multipath_uses_dev(rt
, dev
)) {
4872 count
= rt6_multipath_dead_count(rt
, dev
);
4873 if (rt
->fib6_nsiblings
+ 1 == count
) {
4874 rt6_multipath_flush(rt
);
4877 rt6_multipath_nh_flags_set(rt
, dev
, RTNH_F_DEAD
|
4879 fib6_update_sernum(net
, rt
);
4880 rt6_multipath_rebalance(rt
);
4884 if (rt
->fib6_nh
->fib_nh_dev
!= dev
||
4885 rt
->fib6_flags
& (RTF_LOCAL
| RTF_ANYCAST
))
4887 rt
->fib6_nh
->fib_nh_flags
|= RTNH_F_LINKDOWN
;
4888 rt6_multipath_rebalance(rt
);
4895 void rt6_sync_down_dev(struct net_device
*dev
, unsigned long event
)
4897 struct arg_netdev_event arg
= {
4903 struct net
*net
= dev_net(dev
);
4905 if (net
->ipv6
.sysctl
.skip_notify_on_dev_down
)
4906 fib6_clean_all_skip_notify(net
, fib6_ifdown
, &arg
);
4908 fib6_clean_all(net
, fib6_ifdown
, &arg
);
4911 void rt6_disable_ip(struct net_device
*dev
, unsigned long event
)
4913 rt6_sync_down_dev(dev
, event
);
4914 rt6_uncached_list_flush_dev(dev
);
4915 neigh_ifdown(&nd_tbl
, dev
);
/* Argument for the MTU-change table walk. */
struct rt6_mtu_change_arg {
	struct net_device *dev;		/* device whose MTU changed */
	unsigned int mtu;		/* the new MTU */
	struct fib6_info *f6i;		/* entry currently being visited */
};
4924 static int fib6_nh_mtu_change(struct fib6_nh
*nh
, void *_arg
)
4926 struct rt6_mtu_change_arg
*arg
= (struct rt6_mtu_change_arg
*)_arg
;
4927 struct fib6_info
*f6i
= arg
->f6i
;
4929 /* For administrative MTU increase, there is no way to discover
4930 * IPv6 PMTU increase, so PMTU increase should be updated here.
4931 * Since RFC 1981 doesn't include administrative MTU increase
4932 * update PMTU increase is a MUST. (i.e. jumbo frame)
4934 if (nh
->fib_nh_dev
== arg
->dev
) {
4935 struct inet6_dev
*idev
= __in6_dev_get(arg
->dev
);
4936 u32 mtu
= f6i
->fib6_pmtu
;
4938 if (mtu
>= arg
->mtu
||
4939 (mtu
< arg
->mtu
&& mtu
== idev
->cnf
.mtu6
))
4940 fib6_metric_set(f6i
, RTAX_MTU
, arg
->mtu
);
4942 spin_lock_bh(&rt6_exception_lock
);
4943 rt6_exceptions_update_pmtu(idev
, nh
, arg
->mtu
);
4944 spin_unlock_bh(&rt6_exception_lock
);
4950 static int rt6_mtu_change_route(struct fib6_info
*f6i
, void *p_arg
)
4952 struct rt6_mtu_change_arg
*arg
= (struct rt6_mtu_change_arg
*) p_arg
;
4953 struct inet6_dev
*idev
;
4955 /* In IPv6 pmtu discovery is not optional,
4956 so that RTAX_MTU lock cannot disable it.
4957 We still use this lock to block changes
4958 caused by addrconf/ndisc.
4961 idev
= __in6_dev_get(arg
->dev
);
4965 if (fib6_metric_locked(f6i
, RTAX_MTU
))
4970 /* fib6_nh_mtu_change only returns 0, so this is safe */
4971 return nexthop_for_each_fib6_nh(f6i
->nh
, fib6_nh_mtu_change
,
4975 return fib6_nh_mtu_change(f6i
->fib6_nh
, arg
);
4978 void rt6_mtu_change(struct net_device
*dev
, unsigned int mtu
)
4980 struct rt6_mtu_change_arg arg
= {
4985 fib6_clean_all(dev_net(dev
), rt6_mtu_change_route
, &arg
);
4988 static const struct nla_policy rtm_ipv6_policy
[RTA_MAX
+1] = {
4989 [RTA_UNSPEC
] = { .strict_start_type
= RTA_DPORT
+ 1 },
4990 [RTA_GATEWAY
] = { .len
= sizeof(struct in6_addr
) },
4991 [RTA_PREFSRC
] = { .len
= sizeof(struct in6_addr
) },
4992 [RTA_OIF
] = { .type
= NLA_U32
},
4993 [RTA_IIF
] = { .type
= NLA_U32
},
4994 [RTA_PRIORITY
] = { .type
= NLA_U32
},
4995 [RTA_METRICS
] = { .type
= NLA_NESTED
},
4996 [RTA_MULTIPATH
] = { .len
= sizeof(struct rtnexthop
) },
4997 [RTA_PREF
] = { .type
= NLA_U8
},
4998 [RTA_ENCAP_TYPE
] = { .type
= NLA_U16
},
4999 [RTA_ENCAP
] = { .type
= NLA_NESTED
},
5000 [RTA_EXPIRES
] = { .type
= NLA_U32
},
5001 [RTA_UID
] = { .type
= NLA_U32
},
5002 [RTA_MARK
] = { .type
= NLA_U32
},
5003 [RTA_TABLE
] = { .type
= NLA_U32
},
5004 [RTA_IP_PROTO
] = { .type
= NLA_U8
},
5005 [RTA_SPORT
] = { .type
= NLA_U16
},
5006 [RTA_DPORT
] = { .type
= NLA_U16
},
5007 [RTA_NH_ID
] = { .type
= NLA_U32
},
5008 [RTA_FLOWLABEL
] = { .type
= NLA_BE32
},
5011 static int rtm_to_fib6_config(struct sk_buff
*skb
, struct nlmsghdr
*nlh
,
5012 struct fib6_config
*cfg
,
5013 struct netlink_ext_ack
*extack
)
5016 struct nlattr
*tb
[RTA_MAX
+1];
5020 err
= nlmsg_parse_deprecated(nlh
, sizeof(*rtm
), tb
, RTA_MAX
,
5021 rtm_ipv6_policy
, extack
);
5026 rtm
= nlmsg_data(nlh
);
5029 NL_SET_ERR_MSG(extack
,
5030 "Invalid dsfield (tos): option not available for IPv6");
5034 if (tb
[RTA_FLOWLABEL
]) {
5035 NL_SET_ERR_MSG_ATTR(extack
, tb
[RTA_FLOWLABEL
],
5036 "Flow label cannot be specified for this operation");
5040 *cfg
= (struct fib6_config
){
5041 .fc_table
= rtm
->rtm_table
,
5042 .fc_dst_len
= rtm
->rtm_dst_len
,
5043 .fc_src_len
= rtm
->rtm_src_len
,
5045 .fc_protocol
= rtm
->rtm_protocol
,
5046 .fc_type
= rtm
->rtm_type
,
5048 .fc_nlinfo
.portid
= NETLINK_CB(skb
).portid
,
5049 .fc_nlinfo
.nlh
= nlh
,
5050 .fc_nlinfo
.nl_net
= sock_net(skb
->sk
),
5053 if (rtm
->rtm_type
== RTN_UNREACHABLE
||
5054 rtm
->rtm_type
== RTN_BLACKHOLE
||
5055 rtm
->rtm_type
== RTN_PROHIBIT
||
5056 rtm
->rtm_type
== RTN_THROW
)
5057 cfg
->fc_flags
|= RTF_REJECT
;
5059 if (rtm
->rtm_type
== RTN_LOCAL
)
5060 cfg
->fc_flags
|= RTF_LOCAL
;
5062 if (rtm
->rtm_flags
& RTM_F_CLONED
)
5063 cfg
->fc_flags
|= RTF_CACHE
;
5065 cfg
->fc_flags
|= (rtm
->rtm_flags
& RTNH_F_ONLINK
);
5067 if (tb
[RTA_NH_ID
]) {
5068 if (tb
[RTA_GATEWAY
] || tb
[RTA_OIF
] ||
5069 tb
[RTA_MULTIPATH
] || tb
[RTA_ENCAP
]) {
5070 NL_SET_ERR_MSG(extack
,
5071 "Nexthop specification and nexthop id are mutually exclusive");
5074 cfg
->fc_nh_id
= nla_get_u32(tb
[RTA_NH_ID
]);
5077 if (tb
[RTA_GATEWAY
]) {
5078 cfg
->fc_gateway
= nla_get_in6_addr(tb
[RTA_GATEWAY
]);
5079 cfg
->fc_flags
|= RTF_GATEWAY
;
5082 NL_SET_ERR_MSG(extack
, "IPv6 does not support RTA_VIA attribute");
5087 int plen
= (rtm
->rtm_dst_len
+ 7) >> 3;
5089 if (nla_len(tb
[RTA_DST
]) < plen
)
5092 nla_memcpy(&cfg
->fc_dst
, tb
[RTA_DST
], plen
);
5096 int plen
= (rtm
->rtm_src_len
+ 7) >> 3;
5098 if (nla_len(tb
[RTA_SRC
]) < plen
)
5101 nla_memcpy(&cfg
->fc_src
, tb
[RTA_SRC
], plen
);
5104 if (tb
[RTA_PREFSRC
])
5105 cfg
->fc_prefsrc
= nla_get_in6_addr(tb
[RTA_PREFSRC
]);
5108 cfg
->fc_ifindex
= nla_get_u32(tb
[RTA_OIF
]);
5110 if (tb
[RTA_PRIORITY
])
5111 cfg
->fc_metric
= nla_get_u32(tb
[RTA_PRIORITY
]);
5113 if (tb
[RTA_METRICS
]) {
5114 cfg
->fc_mx
= nla_data(tb
[RTA_METRICS
]);
5115 cfg
->fc_mx_len
= nla_len(tb
[RTA_METRICS
]);
5119 cfg
->fc_table
= nla_get_u32(tb
[RTA_TABLE
]);
5121 if (tb
[RTA_MULTIPATH
]) {
5122 cfg
->fc_mp
= nla_data(tb
[RTA_MULTIPATH
]);
5123 cfg
->fc_mp_len
= nla_len(tb
[RTA_MULTIPATH
]);
5125 err
= lwtunnel_valid_encap_type_attr(cfg
->fc_mp
,
5126 cfg
->fc_mp_len
, extack
);
5132 pref
= nla_get_u8(tb
[RTA_PREF
]);
5133 if (pref
!= ICMPV6_ROUTER_PREF_LOW
&&
5134 pref
!= ICMPV6_ROUTER_PREF_HIGH
)
5135 pref
= ICMPV6_ROUTER_PREF_MEDIUM
;
5136 cfg
->fc_flags
|= RTF_PREF(pref
);
5140 cfg
->fc_encap
= tb
[RTA_ENCAP
];
5142 if (tb
[RTA_ENCAP_TYPE
]) {
5143 cfg
->fc_encap_type
= nla_get_u16(tb
[RTA_ENCAP_TYPE
]);
5145 err
= lwtunnel_valid_encap_type(cfg
->fc_encap_type
, extack
);
5150 if (tb
[RTA_EXPIRES
]) {
5151 unsigned long timeout
= addrconf_timeout_fixup(nla_get_u32(tb
[RTA_EXPIRES
]), HZ
);
5153 if (addrconf_finite_timeout(timeout
)) {
5154 cfg
->fc_expires
= jiffies_to_clock_t(timeout
* HZ
);
5155 cfg
->fc_flags
|= RTF_EXPIRES
;
5165 struct fib6_info
*fib6_info
;
5166 struct fib6_config r_cfg
;
5167 struct list_head next
;
5170 static int ip6_route_info_append(struct net
*net
,
5171 struct list_head
*rt6_nh_list
,
5172 struct fib6_info
*rt
,
5173 struct fib6_config
*r_cfg
)
5178 list_for_each_entry(nh
, rt6_nh_list
, next
) {
5179 /* check if fib6_info already exists */
5180 if (rt6_duplicate_nexthop(nh
->fib6_info
, rt
))
5184 nh
= kzalloc(sizeof(*nh
), GFP_KERNEL
);
5188 memcpy(&nh
->r_cfg
, r_cfg
, sizeof(*r_cfg
));
5189 list_add_tail(&nh
->next
, rt6_nh_list
);
5194 static void ip6_route_mpath_notify(struct fib6_info
*rt
,
5195 struct fib6_info
*rt_last
,
5196 struct nl_info
*info
,
5199 /* if this is an APPEND route, then rt points to the first route
5200 * inserted and rt_last points to last route inserted. Userspace
5201 * wants a consistent dump of the route which starts at the first
5202 * nexthop. Since sibling routes are always added at the end of
5203 * the list, find the first sibling of the last route appended
5207 if ((nlflags
& NLM_F_APPEND
) && rt_last
&& rt_last
->fib6_nsiblings
) {
5208 rt
= list_first_or_null_rcu(&rt_last
->fib6_siblings
,
5214 inet6_rt_notify(RTM_NEWROUTE
, rt
, info
, nlflags
);
5219 static bool ip6_route_mpath_should_notify(const struct fib6_info
*rt
)
5221 bool rt_can_ecmp
= rt6_qualify_for_ecmp(rt
);
5222 bool should_notify
= false;
5223 struct fib6_info
*leaf
;
5224 struct fib6_node
*fn
;
5227 fn
= rcu_dereference(rt
->fib6_node
);
5231 leaf
= rcu_dereference(fn
->leaf
);
5236 (rt_can_ecmp
&& rt
->fib6_metric
== leaf
->fib6_metric
&&
5237 rt6_qualify_for_ecmp(leaf
)))
5238 should_notify
= true;
5242 return should_notify
;
5245 static int fib6_gw_from_attr(struct in6_addr
*gw
, struct nlattr
*nla
,
5246 struct netlink_ext_ack
*extack
)
5248 if (nla_len(nla
) < sizeof(*gw
)) {
5249 NL_SET_ERR_MSG(extack
, "Invalid IPv6 address in RTA_GATEWAY");
5253 *gw
= nla_get_in6_addr(nla
);
5258 static int ip6_route_multipath_add(struct fib6_config
*cfg
,
5259 struct netlink_ext_ack
*extack
)
5261 struct fib6_info
*rt_notif
= NULL
, *rt_last
= NULL
;
5262 struct nl_info
*info
= &cfg
->fc_nlinfo
;
5263 struct fib6_config r_cfg
;
5264 struct rtnexthop
*rtnh
;
5265 struct fib6_info
*rt
;
5266 struct rt6_nh
*err_nh
;
5267 struct rt6_nh
*nh
, *nh_safe
;
5273 int replace
= (cfg
->fc_nlinfo
.nlh
&&
5274 (cfg
->fc_nlinfo
.nlh
->nlmsg_flags
& NLM_F_REPLACE
));
5275 LIST_HEAD(rt6_nh_list
);
5277 nlflags
= replace
? NLM_F_REPLACE
: NLM_F_CREATE
;
5278 if (info
->nlh
&& info
->nlh
->nlmsg_flags
& NLM_F_APPEND
)
5279 nlflags
|= NLM_F_APPEND
;
5281 remaining
= cfg
->fc_mp_len
;
5282 rtnh
= (struct rtnexthop
*)cfg
->fc_mp
;
5284 /* Parse a Multipath Entry and build a list (rt6_nh_list) of
5285 * fib6_info structs per nexthop
5287 while (rtnh_ok(rtnh
, remaining
)) {
5288 memcpy(&r_cfg
, cfg
, sizeof(*cfg
));
5289 if (rtnh
->rtnh_ifindex
)
5290 r_cfg
.fc_ifindex
= rtnh
->rtnh_ifindex
;
5292 attrlen
= rtnh_attrlen(rtnh
);
5294 struct nlattr
*nla
, *attrs
= rtnh_attrs(rtnh
);
5296 nla
= nla_find(attrs
, attrlen
, RTA_GATEWAY
);
5298 err
= fib6_gw_from_attr(&r_cfg
.fc_gateway
, nla
,
5303 r_cfg
.fc_flags
|= RTF_GATEWAY
;
5305 r_cfg
.fc_encap
= nla_find(attrs
, attrlen
, RTA_ENCAP
);
5307 /* RTA_ENCAP_TYPE length checked in
5308 * lwtunnel_valid_encap_type_attr
5310 nla
= nla_find(attrs
, attrlen
, RTA_ENCAP_TYPE
);
5312 r_cfg
.fc_encap_type
= nla_get_u16(nla
);
5315 r_cfg
.fc_flags
|= (rtnh
->rtnh_flags
& RTNH_F_ONLINK
);
5316 rt
= ip6_route_info_create(&r_cfg
, GFP_KERNEL
, extack
);
5322 if (!rt6_qualify_for_ecmp(rt
)) {
5324 NL_SET_ERR_MSG(extack
,
5325 "Device only routes can not be added for IPv6 using the multipath API.");
5326 fib6_info_release(rt
);
5330 rt
->fib6_nh
->fib_nh_weight
= rtnh
->rtnh_hops
+ 1;
5332 err
= ip6_route_info_append(info
->nl_net
, &rt6_nh_list
,
5335 fib6_info_release(rt
);
5339 rtnh
= rtnh_next(rtnh
, &remaining
);
5342 if (list_empty(&rt6_nh_list
)) {
5343 NL_SET_ERR_MSG(extack
,
5344 "Invalid nexthop configuration - no valid nexthops");
5348 /* for add and replace send one notification with all nexthops.
5349 * Skip the notification in fib6_add_rt2node and send one with
5350 * the full route when done
5352 info
->skip_notify
= 1;
5354 /* For add and replace, send one notification with all nexthops. For
5355 * append, send one notification with all appended nexthops.
5357 info
->skip_notify_kernel
= 1;
5360 list_for_each_entry(nh
, &rt6_nh_list
, next
) {
5361 err
= __ip6_ins_rt(nh
->fib6_info
, info
, extack
);
5365 NL_SET_ERR_MSG_MOD(extack
,
5366 "multipath route replace failed (check consistency of installed routes)");
5370 /* save reference to last route successfully inserted */
5371 rt_last
= nh
->fib6_info
;
5373 /* save reference to first route for notification */
5375 rt_notif
= nh
->fib6_info
;
5377 /* Because each route is added like a single route we remove
5378 * these flags after the first nexthop: if there is a collision,
5379 * we have already failed to add the first nexthop:
5380 * fib6_add_rt2node() has rejected it; when replacing, old
5381 * nexthops have been replaced by first new, the rest should
5384 if (cfg
->fc_nlinfo
.nlh
) {
5385 cfg
->fc_nlinfo
.nlh
->nlmsg_flags
&= ~(NLM_F_EXCL
|
5387 cfg
->fc_nlinfo
.nlh
->nlmsg_flags
|= NLM_F_CREATE
;
5392 /* An in-kernel notification should only be sent in case the new
5393 * multipath route is added as the first route in the node, or if
5394 * it was appended to it. We pass 'rt_notif' since it is the first
5395 * sibling and might allow us to skip some checks in the replace case.
5397 if (ip6_route_mpath_should_notify(rt_notif
)) {
5398 enum fib_event_type fib_event
;
5400 if (rt_notif
->fib6_nsiblings
!= nhn
- 1)
5401 fib_event
= FIB_EVENT_ENTRY_APPEND
;
5403 fib_event
= FIB_EVENT_ENTRY_REPLACE
;
5405 err
= call_fib6_multipath_entry_notifiers(info
->nl_net
,
5406 fib_event
, rt_notif
,
5409 /* Delete all the siblings that were just added */
5415 /* success ... tell user about new route */
5416 ip6_route_mpath_notify(rt_notif
, rt_last
, info
, nlflags
);
5420 /* send notification for routes that were added so that
5421 * the delete notifications sent by ip6_route_del are
5425 ip6_route_mpath_notify(rt_notif
, rt_last
, info
, nlflags
);
5427 /* Delete routes that were already added */
5428 list_for_each_entry(nh
, &rt6_nh_list
, next
) {
5431 ip6_route_del(&nh
->r_cfg
, extack
);
5435 list_for_each_entry_safe(nh
, nh_safe
, &rt6_nh_list
, next
) {
5436 fib6_info_release(nh
->fib6_info
);
5437 list_del(&nh
->next
);
5444 static int ip6_route_multipath_del(struct fib6_config
*cfg
,
5445 struct netlink_ext_ack
*extack
)
5447 struct fib6_config r_cfg
;
5448 struct rtnexthop
*rtnh
;
5454 remaining
= cfg
->fc_mp_len
;
5455 rtnh
= (struct rtnexthop
*)cfg
->fc_mp
;
5457 /* Parse a Multipath Entry */
5458 while (rtnh_ok(rtnh
, remaining
)) {
5459 memcpy(&r_cfg
, cfg
, sizeof(*cfg
));
5460 if (rtnh
->rtnh_ifindex
)
5461 r_cfg
.fc_ifindex
= rtnh
->rtnh_ifindex
;
5463 attrlen
= rtnh_attrlen(rtnh
);
5465 struct nlattr
*nla
, *attrs
= rtnh_attrs(rtnh
);
5467 nla
= nla_find(attrs
, attrlen
, RTA_GATEWAY
);
5469 err
= fib6_gw_from_attr(&r_cfg
.fc_gateway
, nla
,
5476 r_cfg
.fc_flags
|= RTF_GATEWAY
;
5479 err
= ip6_route_del(&r_cfg
, extack
);
5484 rtnh
= rtnh_next(rtnh
, &remaining
);
5490 static int inet6_rtm_delroute(struct sk_buff
*skb
, struct nlmsghdr
*nlh
,
5491 struct netlink_ext_ack
*extack
)
5493 struct fib6_config cfg
;
5496 err
= rtm_to_fib6_config(skb
, nlh
, &cfg
, extack
);
5501 !nexthop_find_by_id(sock_net(skb
->sk
), cfg
.fc_nh_id
)) {
5502 NL_SET_ERR_MSG(extack
, "Nexthop id does not exist");
5507 return ip6_route_multipath_del(&cfg
, extack
);
5509 cfg
.fc_delete_all_nh
= 1;
5510 return ip6_route_del(&cfg
, extack
);
5514 static int inet6_rtm_newroute(struct sk_buff
*skb
, struct nlmsghdr
*nlh
,
5515 struct netlink_ext_ack
*extack
)
5517 struct fib6_config cfg
;
5520 err
= rtm_to_fib6_config(skb
, nlh
, &cfg
, extack
);
5524 if (cfg
.fc_metric
== 0)
5525 cfg
.fc_metric
= IP6_RT_PRIO_USER
;
5528 return ip6_route_multipath_add(&cfg
, extack
);
5530 return ip6_route_add(&cfg
, GFP_KERNEL
, extack
);
5533 /* add the overhead of this fib6_nh to nexthop_len */
5534 static int rt6_nh_nlmsg_size(struct fib6_nh
*nh
, void *arg
)
5536 int *nexthop_len
= arg
;
5538 *nexthop_len
+= nla_total_size(0) /* RTA_MULTIPATH */
5539 + NLA_ALIGN(sizeof(struct rtnexthop
))
5540 + nla_total_size(16); /* RTA_GATEWAY */
5542 if (nh
->fib_nh_lws
) {
5543 /* RTA_ENCAP_TYPE */
5544 *nexthop_len
+= lwtunnel_get_encap_size(nh
->fib_nh_lws
);
5546 *nexthop_len
+= nla_total_size(2);
5552 static size_t rt6_nlmsg_size(struct fib6_info
*f6i
)
5557 nexthop_len
= nla_total_size(4); /* RTA_NH_ID */
5558 nexthop_for_each_fib6_nh(f6i
->nh
, rt6_nh_nlmsg_size
,
5561 struct fib6_nh
*nh
= f6i
->fib6_nh
;
5562 struct fib6_info
*sibling
;
5565 if (f6i
->fib6_nsiblings
) {
5566 rt6_nh_nlmsg_size(nh
, &nexthop_len
);
5570 list_for_each_entry_rcu(sibling
, &f6i
->fib6_siblings
,
5572 rt6_nh_nlmsg_size(sibling
->fib6_nh
, &nexthop_len
);
5577 nexthop_len
+= lwtunnel_get_encap_size(nh
->fib_nh_lws
);
5580 return NLMSG_ALIGN(sizeof(struct rtmsg
))
5581 + nla_total_size(16) /* RTA_SRC */
5582 + nla_total_size(16) /* RTA_DST */
5583 + nla_total_size(16) /* RTA_GATEWAY */
5584 + nla_total_size(16) /* RTA_PREFSRC */
5585 + nla_total_size(4) /* RTA_TABLE */
5586 + nla_total_size(4) /* RTA_IIF */
5587 + nla_total_size(4) /* RTA_OIF */
5588 + nla_total_size(4) /* RTA_PRIORITY */
5589 + RTAX_MAX
* nla_total_size(4) /* RTA_METRICS */
5590 + nla_total_size(sizeof(struct rta_cacheinfo
))
5591 + nla_total_size(TCP_CA_NAME_MAX
) /* RTAX_CC_ALGO */
5592 + nla_total_size(1) /* RTA_PREF */
5596 static int rt6_fill_node_nexthop(struct sk_buff
*skb
, struct nexthop
*nh
,
5597 unsigned char *flags
)
5599 if (nexthop_is_multipath(nh
)) {
5602 mp
= nla_nest_start_noflag(skb
, RTA_MULTIPATH
);
5604 goto nla_put_failure
;
5606 if (nexthop_mpath_fill_node(skb
, nh
, AF_INET6
))
5607 goto nla_put_failure
;
5609 nla_nest_end(skb
, mp
);
5611 struct fib6_nh
*fib6_nh
;
5613 fib6_nh
= nexthop_fib6_nh(nh
);
5614 if (fib_nexthop_info(skb
, &fib6_nh
->nh_common
, AF_INET6
,
5616 goto nla_put_failure
;
5625 static int rt6_fill_node(struct net
*net
, struct sk_buff
*skb
,
5626 struct fib6_info
*rt
, struct dst_entry
*dst
,
5627 struct in6_addr
*dest
, struct in6_addr
*src
,
5628 int iif
, int type
, u32 portid
, u32 seq
,
5631 struct rt6_info
*rt6
= dst_rt6_info(dst
);
5632 struct rt6key
*rt6_dst
, *rt6_src
;
5633 u32
*pmetrics
, table
, rt6_flags
;
5634 unsigned char nh_flags
= 0;
5635 struct nlmsghdr
*nlh
;
5639 nlh
= nlmsg_put(skb
, portid
, seq
, type
, sizeof(*rtm
), flags
);
5644 rt6_dst
= &rt6
->rt6i_dst
;
5645 rt6_src
= &rt6
->rt6i_src
;
5646 rt6_flags
= rt6
->rt6i_flags
;
5648 rt6_dst
= &rt
->fib6_dst
;
5649 rt6_src
= &rt
->fib6_src
;
5650 rt6_flags
= rt
->fib6_flags
;
5653 rtm
= nlmsg_data(nlh
);
5654 rtm
->rtm_family
= AF_INET6
;
5655 rtm
->rtm_dst_len
= rt6_dst
->plen
;
5656 rtm
->rtm_src_len
= rt6_src
->plen
;
5659 table
= rt
->fib6_table
->tb6_id
;
5661 table
= RT6_TABLE_UNSPEC
;
5662 rtm
->rtm_table
= table
< 256 ? table
: RT_TABLE_COMPAT
;
5663 if (nla_put_u32(skb
, RTA_TABLE
, table
))
5664 goto nla_put_failure
;
5666 rtm
->rtm_type
= rt
->fib6_type
;
5668 rtm
->rtm_scope
= RT_SCOPE_UNIVERSE
;
5669 rtm
->rtm_protocol
= rt
->fib6_protocol
;
5671 if (rt6_flags
& RTF_CACHE
)
5672 rtm
->rtm_flags
|= RTM_F_CLONED
;
5675 if (nla_put_in6_addr(skb
, RTA_DST
, dest
))
5676 goto nla_put_failure
;
5677 rtm
->rtm_dst_len
= 128;
5678 } else if (rtm
->rtm_dst_len
)
5679 if (nla_put_in6_addr(skb
, RTA_DST
, &rt6_dst
->addr
))
5680 goto nla_put_failure
;
5681 #ifdef CONFIG_IPV6_SUBTREES
5683 if (nla_put_in6_addr(skb
, RTA_SRC
, src
))
5684 goto nla_put_failure
;
5685 rtm
->rtm_src_len
= 128;
5686 } else if (rtm
->rtm_src_len
&&
5687 nla_put_in6_addr(skb
, RTA_SRC
, &rt6_src
->addr
))
5688 goto nla_put_failure
;
5691 #ifdef CONFIG_IPV6_MROUTE
5692 if (ipv6_addr_is_multicast(&rt6_dst
->addr
)) {
5693 int err
= ip6mr_get_route(net
, skb
, rtm
, portid
);
5698 goto nla_put_failure
;
5701 if (nla_put_u32(skb
, RTA_IIF
, iif
))
5702 goto nla_put_failure
;
5704 struct in6_addr saddr_buf
;
5705 if (ip6_route_get_saddr(net
, rt
, dest
, 0, 0, &saddr_buf
) == 0 &&
5706 nla_put_in6_addr(skb
, RTA_PREFSRC
, &saddr_buf
))
5707 goto nla_put_failure
;
5710 if (rt
->fib6_prefsrc
.plen
) {
5711 struct in6_addr saddr_buf
;
5712 saddr_buf
= rt
->fib6_prefsrc
.addr
;
5713 if (nla_put_in6_addr(skb
, RTA_PREFSRC
, &saddr_buf
))
5714 goto nla_put_failure
;
5717 pmetrics
= dst
? dst_metrics_ptr(dst
) : rt
->fib6_metrics
->metrics
;
5718 if (rtnetlink_put_metrics(skb
, pmetrics
) < 0)
5719 goto nla_put_failure
;
5721 if (nla_put_u32(skb
, RTA_PRIORITY
, rt
->fib6_metric
))
5722 goto nla_put_failure
;
5724 /* For multipath routes, walk the siblings list and add
5725 * each as a nexthop within RTA_MULTIPATH.
5728 if (rt6_flags
& RTF_GATEWAY
&&
5729 nla_put_in6_addr(skb
, RTA_GATEWAY
, &rt6
->rt6i_gateway
))
5730 goto nla_put_failure
;
5732 if (dst
->dev
&& nla_put_u32(skb
, RTA_OIF
, dst
->dev
->ifindex
))
5733 goto nla_put_failure
;
5735 if (dst
->lwtstate
&&
5736 lwtunnel_fill_encap(skb
, dst
->lwtstate
, RTA_ENCAP
, RTA_ENCAP_TYPE
) < 0)
5737 goto nla_put_failure
;
5738 } else if (rt
->fib6_nsiblings
) {
5739 struct fib6_info
*sibling
;
5742 mp
= nla_nest_start_noflag(skb
, RTA_MULTIPATH
);
5744 goto nla_put_failure
;
5746 if (fib_add_nexthop(skb
, &rt
->fib6_nh
->nh_common
,
5747 rt
->fib6_nh
->fib_nh_weight
, AF_INET6
,
5749 goto nla_put_failure
;
5753 list_for_each_entry_rcu(sibling
, &rt
->fib6_siblings
,
5755 if (fib_add_nexthop(skb
, &sibling
->fib6_nh
->nh_common
,
5756 sibling
->fib6_nh
->fib_nh_weight
,
5760 goto nla_put_failure
;
5766 nla_nest_end(skb
, mp
);
5767 } else if (rt
->nh
) {
5768 if (nla_put_u32(skb
, RTA_NH_ID
, rt
->nh
->id
))
5769 goto nla_put_failure
;
5771 if (nexthop_is_blackhole(rt
->nh
))
5772 rtm
->rtm_type
= RTN_BLACKHOLE
;
5774 if (READ_ONCE(net
->ipv4
.sysctl_nexthop_compat_mode
) &&
5775 rt6_fill_node_nexthop(skb
, rt
->nh
, &nh_flags
) < 0)
5776 goto nla_put_failure
;
5778 rtm
->rtm_flags
|= nh_flags
;
5780 if (fib_nexthop_info(skb
, &rt
->fib6_nh
->nh_common
, AF_INET6
,
5781 &nh_flags
, false) < 0)
5782 goto nla_put_failure
;
5784 rtm
->rtm_flags
|= nh_flags
;
5787 if (rt6_flags
& RTF_EXPIRES
) {
5788 expires
= dst
? dst
->expires
: rt
->expires
;
5793 if (READ_ONCE(rt
->offload
))
5794 rtm
->rtm_flags
|= RTM_F_OFFLOAD
;
5795 if (READ_ONCE(rt
->trap
))
5796 rtm
->rtm_flags
|= RTM_F_TRAP
;
5797 if (READ_ONCE(rt
->offload_failed
))
5798 rtm
->rtm_flags
|= RTM_F_OFFLOAD_FAILED
;
5801 if (rtnl_put_cacheinfo(skb
, dst
, 0, expires
, dst
? dst
->error
: 0) < 0)
5802 goto nla_put_failure
;
5804 if (nla_put_u8(skb
, RTA_PREF
, IPV6_EXTRACT_PREF(rt6_flags
)))
5805 goto nla_put_failure
;
5808 nlmsg_end(skb
, nlh
);
5812 nlmsg_cancel(skb
, nlh
);
5816 static int fib6_info_nh_uses_dev(struct fib6_nh
*nh
, void *arg
)
5818 const struct net_device
*dev
= arg
;
5820 if (nh
->fib_nh_dev
== dev
)
5826 static bool fib6_info_uses_dev(const struct fib6_info
*f6i
,
5827 const struct net_device
*dev
)
5830 struct net_device
*_dev
= (struct net_device
*)dev
;
5832 return !!nexthop_for_each_fib6_nh(f6i
->nh
,
5833 fib6_info_nh_uses_dev
,
5837 if (f6i
->fib6_nh
->fib_nh_dev
== dev
)
5840 if (f6i
->fib6_nsiblings
) {
5841 struct fib6_info
*sibling
, *next_sibling
;
5843 list_for_each_entry_safe(sibling
, next_sibling
,
5844 &f6i
->fib6_siblings
, fib6_siblings
) {
5845 if (sibling
->fib6_nh
->fib_nh_dev
== dev
)
5853 struct fib6_nh_exception_dump_walker
{
5854 struct rt6_rtnl_dump_arg
*dump
;
5855 struct fib6_info
*rt
;
5861 static int rt6_nh_dump_exceptions(struct fib6_nh
*nh
, void *arg
)
5863 struct fib6_nh_exception_dump_walker
*w
= arg
;
5864 struct rt6_rtnl_dump_arg
*dump
= w
->dump
;
5865 struct rt6_exception_bucket
*bucket
;
5866 struct rt6_exception
*rt6_ex
;
5869 bucket
= fib6_nh_get_excptn_bucket(nh
, NULL
);
5873 for (i
= 0; i
< FIB6_EXCEPTION_BUCKET_SIZE
; i
++) {
5874 hlist_for_each_entry(rt6_ex
, &bucket
->chain
, hlist
) {
5880 /* Expiration of entries doesn't bump sernum, insertion
5881 * does. Removal is triggered by insertion, so we can
5882 * rely on the fact that if entries change between two
5883 * partial dumps, this node is scanned again completely,
5884 * see rt6_insert_exception() and fib6_dump_table().
5886 * Count expired entries we go through as handled
5887 * entries that we'll skip next time, in case of partial
5888 * node dump. Otherwise, if entries expire meanwhile,
5889 * we'll skip the wrong amount.
5891 if (rt6_check_expired(rt6_ex
->rt6i
)) {
5896 err
= rt6_fill_node(dump
->net
, dump
->skb
, w
->rt
,
5897 &rt6_ex
->rt6i
->dst
, NULL
, NULL
, 0,
5899 NETLINK_CB(dump
->cb
->skb
).portid
,
5900 dump
->cb
->nlh
->nlmsg_seq
, w
->flags
);
5912 /* Return -1 if done with node, number of handled routes on partial dump */
5913 int rt6_dump_route(struct fib6_info
*rt
, void *p_arg
, unsigned int skip
)
5915 struct rt6_rtnl_dump_arg
*arg
= (struct rt6_rtnl_dump_arg
*) p_arg
;
5916 struct fib_dump_filter
*filter
= &arg
->filter
;
5917 unsigned int flags
= NLM_F_MULTI
;
5918 struct net
*net
= arg
->net
;
5921 if (rt
== net
->ipv6
.fib6_null_entry
)
5924 if ((filter
->flags
& RTM_F_PREFIX
) &&
5925 !(rt
->fib6_flags
& RTF_PREFIX_RT
)) {
5926 /* success since this is not a prefix route */
5929 if (filter
->filter_set
&&
5930 ((filter
->rt_type
&& rt
->fib6_type
!= filter
->rt_type
) ||
5931 (filter
->dev
&& !fib6_info_uses_dev(rt
, filter
->dev
)) ||
5932 (filter
->protocol
&& rt
->fib6_protocol
!= filter
->protocol
))) {
5936 if (filter
->filter_set
||
5937 !filter
->dump_routes
|| !filter
->dump_exceptions
) {
5938 flags
|= NLM_F_DUMP_FILTERED
;
5941 if (filter
->dump_routes
) {
5945 if (rt6_fill_node(net
, arg
->skb
, rt
, NULL
, NULL
, NULL
,
5947 NETLINK_CB(arg
->cb
->skb
).portid
,
5948 arg
->cb
->nlh
->nlmsg_seq
, flags
)) {
5955 if (filter
->dump_exceptions
) {
5956 struct fib6_nh_exception_dump_walker w
= { .dump
= arg
,
5965 err
= nexthop_for_each_fib6_nh(rt
->nh
,
5966 rt6_nh_dump_exceptions
,
5969 err
= rt6_nh_dump_exceptions(rt
->fib6_nh
, &w
);
5974 return count
+ w
.count
;
5980 static int inet6_rtm_valid_getroute_req(struct sk_buff
*skb
,
5981 const struct nlmsghdr
*nlh
,
5983 struct netlink_ext_ack
*extack
)
5988 if (nlh
->nlmsg_len
< nlmsg_msg_size(sizeof(*rtm
))) {
5989 NL_SET_ERR_MSG_MOD(extack
,
5990 "Invalid header for get route request");
5994 if (!netlink_strict_get_check(skb
))
5995 return nlmsg_parse_deprecated(nlh
, sizeof(*rtm
), tb
, RTA_MAX
,
5996 rtm_ipv6_policy
, extack
);
5998 rtm
= nlmsg_data(nlh
);
5999 if ((rtm
->rtm_src_len
&& rtm
->rtm_src_len
!= 128) ||
6000 (rtm
->rtm_dst_len
&& rtm
->rtm_dst_len
!= 128) ||
6001 rtm
->rtm_table
|| rtm
->rtm_protocol
|| rtm
->rtm_scope
||
6003 NL_SET_ERR_MSG_MOD(extack
, "Invalid values in header for get route request");
6006 if (rtm
->rtm_flags
& ~RTM_F_FIB_MATCH
) {
6007 NL_SET_ERR_MSG_MOD(extack
,
6008 "Invalid flags for get route request");
6012 err
= nlmsg_parse_deprecated_strict(nlh
, sizeof(*rtm
), tb
, RTA_MAX
,
6013 rtm_ipv6_policy
, extack
);
6017 if ((tb
[RTA_SRC
] && !rtm
->rtm_src_len
) ||
6018 (tb
[RTA_DST
] && !rtm
->rtm_dst_len
)) {
6019 NL_SET_ERR_MSG_MOD(extack
, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
6023 if (tb
[RTA_FLOWLABEL
] &&
6024 (nla_get_be32(tb
[RTA_FLOWLABEL
]) & ~IPV6_FLOWLABEL_MASK
)) {
6025 NL_SET_ERR_MSG_ATTR(extack
, tb
[RTA_FLOWLABEL
],
6026 "Invalid flow label");
6030 for (i
= 0; i
<= RTA_MAX
; i
++) {
6047 NL_SET_ERR_MSG_MOD(extack
, "Unsupported attribute in get route request");
6055 static int inet6_rtm_getroute(struct sk_buff
*in_skb
, struct nlmsghdr
*nlh
,
6056 struct netlink_ext_ack
*extack
)
6058 struct net
*net
= sock_net(in_skb
->sk
);
6059 struct nlattr
*tb
[RTA_MAX
+1];
6060 int err
, iif
= 0, oif
= 0;
6061 struct fib6_info
*from
;
6062 struct dst_entry
*dst
;
6063 struct rt6_info
*rt
;
6064 struct sk_buff
*skb
;
6066 struct flowi6 fl6
= {};
6070 err
= inet6_rtm_valid_getroute_req(in_skb
, nlh
, tb
, extack
);
6075 rtm
= nlmsg_data(nlh
);
6076 fibmatch
= !!(rtm
->rtm_flags
& RTM_F_FIB_MATCH
);
6079 if (nla_len(tb
[RTA_SRC
]) < sizeof(struct in6_addr
))
6082 fl6
.saddr
= *(struct in6_addr
*)nla_data(tb
[RTA_SRC
]);
6086 if (nla_len(tb
[RTA_DST
]) < sizeof(struct in6_addr
))
6089 fl6
.daddr
= *(struct in6_addr
*)nla_data(tb
[RTA_DST
]);
6093 iif
= nla_get_u32(tb
[RTA_IIF
]);
6096 oif
= nla_get_u32(tb
[RTA_OIF
]);
6099 fl6
.flowi6_mark
= nla_get_u32(tb
[RTA_MARK
]);
6102 fl6
.flowi6_uid
= make_kuid(current_user_ns(),
6103 nla_get_u32(tb
[RTA_UID
]));
6105 fl6
.flowi6_uid
= iif
? INVALID_UID
: current_uid();
6108 fl6
.fl6_sport
= nla_get_be16(tb
[RTA_SPORT
]);
6111 fl6
.fl6_dport
= nla_get_be16(tb
[RTA_DPORT
]);
6113 if (tb
[RTA_IP_PROTO
]) {
6114 err
= rtm_getroute_parse_ip_proto(tb
[RTA_IP_PROTO
],
6115 &fl6
.flowi6_proto
, AF_INET6
,
6121 flowlabel
= nla_get_be32_default(tb
[RTA_FLOWLABEL
], 0);
6122 fl6
.flowlabel
= ip6_make_flowinfo(rtm
->rtm_tos
, flowlabel
);
6125 struct net_device
*dev
;
6130 dev
= dev_get_by_index_rcu(net
, iif
);
6137 fl6
.flowi6_iif
= iif
;
6139 if (!ipv6_addr_any(&fl6
.saddr
))
6140 flags
|= RT6_LOOKUP_F_HAS_SADDR
;
6142 dst
= ip6_route_input_lookup(net
, dev
, &fl6
, NULL
, flags
);
6146 fl6
.flowi6_oif
= oif
;
6148 dst
= ip6_route_output(net
, NULL
, &fl6
);
6152 rt
= dst_rt6_info(dst
);
6153 if (rt
->dst
.error
) {
6154 err
= rt
->dst
.error
;
6159 if (rt
== net
->ipv6
.ip6_null_entry
) {
6160 err
= rt
->dst
.error
;
6165 skb
= alloc_skb(NLMSG_GOODSIZE
, GFP_KERNEL
);
6172 skb_dst_set(skb
, &rt
->dst
);
6175 from
= rcu_dereference(rt
->from
);
6178 err
= rt6_fill_node(net
, skb
, from
, NULL
, NULL
, NULL
,
6180 NETLINK_CB(in_skb
).portid
,
6183 err
= rt6_fill_node(net
, skb
, from
, dst
, &fl6
.daddr
,
6184 &fl6
.saddr
, iif
, RTM_NEWROUTE
,
6185 NETLINK_CB(in_skb
).portid
,
6197 err
= rtnl_unicast(skb
, net
, NETLINK_CB(in_skb
).portid
);
6202 void inet6_rt_notify(int event
, struct fib6_info
*rt
, struct nl_info
*info
,
6203 unsigned int nlm_flags
)
6205 struct sk_buff
*skb
;
6206 struct net
*net
= info
->nl_net
;
6211 seq
= info
->nlh
? info
->nlh
->nlmsg_seq
: 0;
6213 skb
= nlmsg_new(rt6_nlmsg_size(rt
), GFP_ATOMIC
);
6217 err
= rt6_fill_node(net
, skb
, rt
, NULL
, NULL
, NULL
, 0,
6218 event
, info
->portid
, seq
, nlm_flags
);
6220 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
6221 WARN_ON(err
== -EMSGSIZE
);
6225 rtnl_notify(skb
, net
, info
->portid
, RTNLGRP_IPV6_ROUTE
,
6226 info
->nlh
, GFP_ATOMIC
);
6229 rtnl_set_sk_err(net
, RTNLGRP_IPV6_ROUTE
, err
);
6232 void fib6_rt_update(struct net
*net
, struct fib6_info
*rt
,
6233 struct nl_info
*info
)
6235 u32 seq
= info
->nlh
? info
->nlh
->nlmsg_seq
: 0;
6236 struct sk_buff
*skb
;
6239 skb
= nlmsg_new(rt6_nlmsg_size(rt
), gfp_any());
6243 err
= rt6_fill_node(net
, skb
, rt
, NULL
, NULL
, NULL
, 0,
6244 RTM_NEWROUTE
, info
->portid
, seq
, NLM_F_REPLACE
);
6246 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
6247 WARN_ON(err
== -EMSGSIZE
);
6251 rtnl_notify(skb
, net
, info
->portid
, RTNLGRP_IPV6_ROUTE
,
6252 info
->nlh
, gfp_any());
6255 rtnl_set_sk_err(net
, RTNLGRP_IPV6_ROUTE
, err
);
6258 void fib6_info_hw_flags_set(struct net
*net
, struct fib6_info
*f6i
,
6259 bool offload
, bool trap
, bool offload_failed
)
6261 struct sk_buff
*skb
;
6264 if (READ_ONCE(f6i
->offload
) == offload
&&
6265 READ_ONCE(f6i
->trap
) == trap
&&
6266 READ_ONCE(f6i
->offload_failed
) == offload_failed
)
6269 WRITE_ONCE(f6i
->offload
, offload
);
6270 WRITE_ONCE(f6i
->trap
, trap
);
6272 /* 2 means send notifications only if offload_failed was changed. */
6273 if (net
->ipv6
.sysctl
.fib_notify_on_flag_change
== 2 &&
6274 READ_ONCE(f6i
->offload_failed
) == offload_failed
)
6277 WRITE_ONCE(f6i
->offload_failed
, offload_failed
);
6279 if (!rcu_access_pointer(f6i
->fib6_node
))
6280 /* The route was removed from the tree, do not send
6285 if (!net
->ipv6
.sysctl
.fib_notify_on_flag_change
)
6288 skb
= nlmsg_new(rt6_nlmsg_size(f6i
), GFP_KERNEL
);
6294 err
= rt6_fill_node(net
, skb
, f6i
, NULL
, NULL
, NULL
, 0, RTM_NEWROUTE
, 0,
6297 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
6298 WARN_ON(err
== -EMSGSIZE
);
6303 rtnl_notify(skb
, net
, 0, RTNLGRP_IPV6_ROUTE
, NULL
, GFP_KERNEL
);
6307 rtnl_set_sk_err(net
, RTNLGRP_IPV6_ROUTE
, err
);
6309 EXPORT_SYMBOL(fib6_info_hw_flags_set
);
6311 static int ip6_route_dev_notify(struct notifier_block
*this,
6312 unsigned long event
, void *ptr
)
6314 struct net_device
*dev
= netdev_notifier_info_to_dev(ptr
);
6315 struct net
*net
= dev_net(dev
);
6317 if (!(dev
->flags
& IFF_LOOPBACK
))
6320 if (event
== NETDEV_REGISTER
) {
6321 net
->ipv6
.fib6_null_entry
->fib6_nh
->fib_nh_dev
= dev
;
6322 net
->ipv6
.ip6_null_entry
->dst
.dev
= dev
;
6323 net
->ipv6
.ip6_null_entry
->rt6i_idev
= in6_dev_get(dev
);
6324 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
6325 net
->ipv6
.ip6_prohibit_entry
->dst
.dev
= dev
;
6326 net
->ipv6
.ip6_prohibit_entry
->rt6i_idev
= in6_dev_get(dev
);
6327 net
->ipv6
.ip6_blk_hole_entry
->dst
.dev
= dev
;
6328 net
->ipv6
.ip6_blk_hole_entry
->rt6i_idev
= in6_dev_get(dev
);
6330 } else if (event
== NETDEV_UNREGISTER
&&
6331 dev
->reg_state
!= NETREG_UNREGISTERED
) {
6332 /* NETDEV_UNREGISTER could be fired for multiple times by
6333 * netdev_wait_allrefs(). Make sure we only call this once.
6335 in6_dev_put_clear(&net
->ipv6
.ip6_null_entry
->rt6i_idev
);
6336 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
6337 in6_dev_put_clear(&net
->ipv6
.ip6_prohibit_entry
->rt6i_idev
);
6338 in6_dev_put_clear(&net
->ipv6
.ip6_blk_hole_entry
->rt6i_idev
);
6349 #ifdef CONFIG_PROC_FS
6350 static int rt6_stats_seq_show(struct seq_file
*seq
, void *v
)
6352 struct net
*net
= (struct net
*)seq
->private;
6353 seq_printf(seq
, "%04x %04x %04x %04x %04x %04x %04x\n",
6354 net
->ipv6
.rt6_stats
->fib_nodes
,
6355 net
->ipv6
.rt6_stats
->fib_route_nodes
,
6356 atomic_read(&net
->ipv6
.rt6_stats
->fib_rt_alloc
),
6357 net
->ipv6
.rt6_stats
->fib_rt_entries
,
6358 net
->ipv6
.rt6_stats
->fib_rt_cache
,
6359 dst_entries_get_slow(&net
->ipv6
.ip6_dst_ops
),
6360 net
->ipv6
.rt6_stats
->fib_discarded_routes
);
6364 #endif /* CONFIG_PROC_FS */
6366 #ifdef CONFIG_SYSCTL
6368 static int ipv6_sysctl_rtcache_flush(const struct ctl_table
*ctl
, int write
,
6369 void *buffer
, size_t *lenp
, loff_t
*ppos
)
6377 ret
= proc_dointvec(ctl
, write
, buffer
, lenp
, ppos
);
6381 net
= (struct net
*)ctl
->extra1
;
6382 delay
= net
->ipv6
.sysctl
.flush_delay
;
6383 fib6_run_gc(delay
<= 0 ? 0 : (unsigned long)delay
, net
, delay
> 0);
6387 static struct ctl_table ipv6_route_table_template
[] = {
6389 .procname
= "max_size",
6390 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_max_size
,
6391 .maxlen
= sizeof(int),
6393 .proc_handler
= proc_dointvec
,
6396 .procname
= "gc_thresh",
6397 .data
= &ip6_dst_ops_template
.gc_thresh
,
6398 .maxlen
= sizeof(int),
6400 .proc_handler
= proc_dointvec
,
6403 .procname
= "flush",
6404 .data
= &init_net
.ipv6
.sysctl
.flush_delay
,
6405 .maxlen
= sizeof(int),
6407 .proc_handler
= ipv6_sysctl_rtcache_flush
6410 .procname
= "gc_min_interval",
6411 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_gc_min_interval
,
6412 .maxlen
= sizeof(int),
6414 .proc_handler
= proc_dointvec_jiffies
,
6417 .procname
= "gc_timeout",
6418 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_gc_timeout
,
6419 .maxlen
= sizeof(int),
6421 .proc_handler
= proc_dointvec_jiffies
,
6424 .procname
= "gc_interval",
6425 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_gc_interval
,
6426 .maxlen
= sizeof(int),
6428 .proc_handler
= proc_dointvec_jiffies
,
6431 .procname
= "gc_elasticity",
6432 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_gc_elasticity
,
6433 .maxlen
= sizeof(int),
6435 .proc_handler
= proc_dointvec
,
6438 .procname
= "mtu_expires",
6439 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_mtu_expires
,
6440 .maxlen
= sizeof(int),
6442 .proc_handler
= proc_dointvec_jiffies
,
6445 .procname
= "min_adv_mss",
6446 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_min_advmss
,
6447 .maxlen
= sizeof(int),
6449 .proc_handler
= proc_dointvec
,
6452 .procname
= "gc_min_interval_ms",
6453 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_gc_min_interval
,
6454 .maxlen
= sizeof(int),
6456 .proc_handler
= proc_dointvec_ms_jiffies
,
6459 .procname
= "skip_notify_on_dev_down",
6460 .data
= &init_net
.ipv6
.sysctl
.skip_notify_on_dev_down
,
6461 .maxlen
= sizeof(u8
),
6463 .proc_handler
= proc_dou8vec_minmax
,
6464 .extra1
= SYSCTL_ZERO
,
6465 .extra2
= SYSCTL_ONE
,
6469 struct ctl_table
* __net_init
ipv6_route_sysctl_init(struct net
*net
)
6471 struct ctl_table
*table
;
6473 table
= kmemdup(ipv6_route_table_template
,
6474 sizeof(ipv6_route_table_template
),
6478 table
[0].data
= &net
->ipv6
.sysctl
.ip6_rt_max_size
;
6479 table
[1].data
= &net
->ipv6
.ip6_dst_ops
.gc_thresh
;
6480 table
[2].data
= &net
->ipv6
.sysctl
.flush_delay
;
6481 table
[2].extra1
= net
;
6482 table
[3].data
= &net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
;
6483 table
[4].data
= &net
->ipv6
.sysctl
.ip6_rt_gc_timeout
;
6484 table
[5].data
= &net
->ipv6
.sysctl
.ip6_rt_gc_interval
;
6485 table
[6].data
= &net
->ipv6
.sysctl
.ip6_rt_gc_elasticity
;
6486 table
[7].data
= &net
->ipv6
.sysctl
.ip6_rt_mtu_expires
;
6487 table
[8].data
= &net
->ipv6
.sysctl
.ip6_rt_min_advmss
;
6488 table
[9].data
= &net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
;
6489 table
[10].data
= &net
->ipv6
.sysctl
.skip_notify_on_dev_down
;
6495 size_t ipv6_route_sysctl_table_size(struct net
*net
)
6497 /* Don't export sysctls to unprivileged users */
6498 if (net
->user_ns
!= &init_user_ns
)
6501 return ARRAY_SIZE(ipv6_route_table_template
);
6505 static int __net_init
ip6_route_net_init(struct net
*net
)
6509 memcpy(&net
->ipv6
.ip6_dst_ops
, &ip6_dst_ops_template
,
6510 sizeof(net
->ipv6
.ip6_dst_ops
));
6512 if (dst_entries_init(&net
->ipv6
.ip6_dst_ops
) < 0)
6513 goto out_ip6_dst_ops
;
6515 net
->ipv6
.fib6_null_entry
= fib6_info_alloc(GFP_KERNEL
, true);
6516 if (!net
->ipv6
.fib6_null_entry
)
6517 goto out_ip6_dst_entries
;
6518 memcpy(net
->ipv6
.fib6_null_entry
, &fib6_null_entry_template
,
6519 sizeof(*net
->ipv6
.fib6_null_entry
));
6521 net
->ipv6
.ip6_null_entry
= kmemdup(&ip6_null_entry_template
,
6522 sizeof(*net
->ipv6
.ip6_null_entry
),
6524 if (!net
->ipv6
.ip6_null_entry
)
6525 goto out_fib6_null_entry
;
6526 net
->ipv6
.ip6_null_entry
->dst
.ops
= &net
->ipv6
.ip6_dst_ops
;
6527 dst_init_metrics(&net
->ipv6
.ip6_null_entry
->dst
,
6528 ip6_template_metrics
, true);
6529 INIT_LIST_HEAD(&net
->ipv6
.ip6_null_entry
->dst
.rt_uncached
);
6531 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
6532 net
->ipv6
.fib6_has_custom_rules
= false;
6533 net
->ipv6
.ip6_prohibit_entry
= kmemdup(&ip6_prohibit_entry_template
,
6534 sizeof(*net
->ipv6
.ip6_prohibit_entry
),
6536 if (!net
->ipv6
.ip6_prohibit_entry
)
6537 goto out_ip6_null_entry
;
6538 net
->ipv6
.ip6_prohibit_entry
->dst
.ops
= &net
->ipv6
.ip6_dst_ops
;
6539 dst_init_metrics(&net
->ipv6
.ip6_prohibit_entry
->dst
,
6540 ip6_template_metrics
, true);
6541 INIT_LIST_HEAD(&net
->ipv6
.ip6_prohibit_entry
->dst
.rt_uncached
);
6543 net
->ipv6
.ip6_blk_hole_entry
= kmemdup(&ip6_blk_hole_entry_template
,
6544 sizeof(*net
->ipv6
.ip6_blk_hole_entry
),
6546 if (!net
->ipv6
.ip6_blk_hole_entry
)
6547 goto out_ip6_prohibit_entry
;
6548 net
->ipv6
.ip6_blk_hole_entry
->dst
.ops
= &net
->ipv6
.ip6_dst_ops
;
6549 dst_init_metrics(&net
->ipv6
.ip6_blk_hole_entry
->dst
,
6550 ip6_template_metrics
, true);
6551 INIT_LIST_HEAD(&net
->ipv6
.ip6_blk_hole_entry
->dst
.rt_uncached
);
6552 #ifdef CONFIG_IPV6_SUBTREES
6553 net
->ipv6
.fib6_routes_require_src
= 0;
6557 net
->ipv6
.sysctl
.flush_delay
= 0;
6558 net
->ipv6
.sysctl
.ip6_rt_max_size
= INT_MAX
;
6559 net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
= HZ
/ 2;
6560 net
->ipv6
.sysctl
.ip6_rt_gc_timeout
= 60*HZ
;
6561 net
->ipv6
.sysctl
.ip6_rt_gc_interval
= 30*HZ
;
6562 net
->ipv6
.sysctl
.ip6_rt_gc_elasticity
= 9;
6563 net
->ipv6
.sysctl
.ip6_rt_mtu_expires
= 10*60*HZ
;
6564 net
->ipv6
.sysctl
.ip6_rt_min_advmss
= IPV6_MIN_MTU
- 20 - 40;
6565 net
->ipv6
.sysctl
.skip_notify_on_dev_down
= 0;
6567 atomic_set(&net
->ipv6
.ip6_rt_gc_expire
, 30*HZ
);
6573 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
6574 out_ip6_prohibit_entry
:
6575 kfree(net
->ipv6
.ip6_prohibit_entry
);
6577 kfree(net
->ipv6
.ip6_null_entry
);
6579 out_fib6_null_entry
:
6580 kfree(net
->ipv6
.fib6_null_entry
);
6581 out_ip6_dst_entries
:
6582 dst_entries_destroy(&net
->ipv6
.ip6_dst_ops
);
6587 static void __net_exit
ip6_route_net_exit(struct net
*net
)
6589 kfree(net
->ipv6
.fib6_null_entry
);
6590 kfree(net
->ipv6
.ip6_null_entry
);
6591 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
6592 kfree(net
->ipv6
.ip6_prohibit_entry
);
6593 kfree(net
->ipv6
.ip6_blk_hole_entry
);
6595 dst_entries_destroy(&net
->ipv6
.ip6_dst_ops
);
/*
 * ip6_route_net_init_late - late per-namespace init: procfs entries.
 * @net: namespace being set up
 *
 * Installed as the .init callback of ip6_route_net_late_ops.  Under
 * CONFIG_PROC_FS it creates two entries in net->proc_net:
 *   - "ipv6_route" (mode 0), driven by ipv6_route_seq_ops with a private
 *     area of sizeof(struct ipv6_route_iter);
 *   - "rt6_stats" (mode 0444), a single-show file using rt6_stats_seq_show.
 * If "rt6_stats" cannot be created, the already created "ipv6_route" entry
 * is removed again.
 *
 * NOTE(review): the return statements of the failure and success paths are
 * not visible in this listing - presumably a negative errno on failure and
 * 0 on success; confirm against the full file.
 */
6598 static int __net_init
ip6_route_net_init_late(struct net
*net
)
6600 #ifdef CONFIG_PROC_FS
6601 if (!proc_create_net("ipv6_route", 0, net
->proc_net
,
6602 &ipv6_route_seq_ops
,
6603 sizeof(struct ipv6_route_iter
)))
/* Second entry; on failure undo the first one before bailing out. */
6606 if (!proc_create_net_single("rt6_stats", 0444, net
->proc_net
,
6607 rt6_stats_seq_show
, NULL
)) {
6608 remove_proc_entry("ipv6_route", net
->proc_net
);
/*
 * ip6_route_net_exit_late - late per-namespace exit: remove procfs entries.
 * @net: namespace being torn down
 *
 * Installed as the .exit callback of ip6_route_net_late_ops.  Under
 * CONFIG_PROC_FS it removes the "ipv6_route" and "rt6_stats" entries from
 * net->proc_net, i.e. the two names created by ip6_route_net_init_late().
 */
6615 static void __net_exit
ip6_route_net_exit_late(struct net
*net
)
6617 #ifdef CONFIG_PROC_FS
6618 remove_proc_entry("ipv6_route", net
->proc_net
);
6619 remove_proc_entry("rt6_stats", net
->proc_net
);
/*
 * Per-namespace operations for the core IPv6 routing state:
 * ip6_route_net_init() on namespace creation, ip6_route_net_exit() on
 * teardown.  Registered from ip6_route_init() via register_pernet_subsys().
 */
6623 static struct pernet_operations ip6_route_net_ops
= {
6624 .init
= ip6_route_net_init
,
6625 .exit
= ip6_route_net_exit
,
/*
 * ipv6_inetpeer_init - allocate the per-namespace IPv6 inet_peer base.
 * @net: namespace being set up
 *
 * Allocates a struct inet_peer_base with kmalloc(GFP_KERNEL), initialises it
 * with inet_peer_base_init() and publishes it as net->ipv6.peers.
 *
 * NOTE(review): the listing skips the lines between the allocation and
 * inet_peer_base_init(), and the function's return statement; presumably an
 * allocation-failure check lives there - confirm against the full file.
 */
6628 static int __net_init
ipv6_inetpeer_init(struct net
*net
)
6630 struct inet_peer_base
*bp
= kmalloc(sizeof(*bp
), GFP_KERNEL
);
6634 inet_peer_base_init(bp
);
6635 net
->ipv6
.peers
= bp
;
/*
 * ipv6_inetpeer_exit - tear down the per-namespace IPv6 inet_peer base.
 * @net: namespace being torn down
 *
 * Takes a local copy of net->ipv6.peers, clears the namespace pointer to
 * NULL, and only then calls inetpeer_invalidate_tree() on the saved base.
 *
 * NOTE(review): the end of the function is not visible in this listing, so
 * whether and where the base itself is released cannot be told from here -
 * confirm against the full file.
 */
6639 static void __net_exit
ipv6_inetpeer_exit(struct net
*net
)
6641 struct inet_peer_base
*bp
= net
->ipv6
.peers
;
/* Unpublish the pointer before invalidating what it referred to. */
6643 net
->ipv6
.peers
= NULL
;
6644 inetpeer_invalidate_tree(bp
);
/*
 * Per-namespace operations for the IPv6 inet_peer base:
 * ipv6_inetpeer_init() / ipv6_inetpeer_exit().  ip6_route_init() registers
 * these before ip6_route_net_ops.
 */
6648 static struct pernet_operations ipv6_inetpeer_ops
= {
6649 .init
= ipv6_inetpeer_init
,
6650 .exit
= ipv6_inetpeer_exit
,
/*
 * Per-namespace operations for the late stage (procfs entries):
 * ip6_route_net_init_late() / ip6_route_net_exit_late().  ip6_route_init()
 * registers these after fib6_rules_init().
 */
6653 static struct pernet_operations ip6_route_net_late_ops
= {
6654 .init
= ip6_route_net_init_late
,
6655 .exit
= ip6_route_net_exit_late
,
/*
 * Netdevice notifier for the IPv6 routing code; events are delivered to
 * ip6_route_dev_notify().  The priority is set ten below
 * ADDRCONF_NOTIFY_PRIORITY.
 * NOTE(review): presumably this orders the callback relative to the addrconf
 * notifier - confirm the intended ordering against the notifier chain
 * semantics.
 */
6658 static struct notifier_block ip6_route_dev_notifier
= {
6659 .notifier_call
= ip6_route_dev_notify
,
6660 .priority
= ADDRCONF_NOTIFY_PRIORITY
- 10,
/*
 * ip6_route_init_special_entries - bind init_net's special routes to loopback.
 *
 * For init_net only: points the nexthop device of fib6_null_entry and the
 * dst.dev of ip6_null_entry at init_net.loopback_dev, and stores the result
 * of in6_dev_get(init_net.loopback_dev) in ip6_null_entry->rt6i_idev.
 * Under CONFIG_IPV6_MULTIPLE_TABLES the same dev/idev assignment is done
 * for ip6_prohibit_entry and ip6_blk_hole_entry.  The original comment below
 * gives the reason this is done by hand.
 */
6663 void __init
ip6_route_init_special_entries(void)
6665 /* Registering of the loopback is done before this portion of code,
6666 * the loopback reference in rt6_info will not be taken, do it
6667 * manually for init_net */
6668 init_net
.ipv6
.fib6_null_entry
->fib6_nh
->fib_nh_dev
= init_net
.loopback_dev
;
6669 init_net
.ipv6
.ip6_null_entry
->dst
.dev
= init_net
.loopback_dev
;
/* Each in6_dev_get() call below is separate; one result is stored per entry. */
6670 init_net
.ipv6
.ip6_null_entry
->rt6i_idev
= in6_dev_get(init_net
.loopback_dev
);
6671 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
6672 init_net
.ipv6
.ip6_prohibit_entry
->dst
.dev
= init_net
.loopback_dev
;
6673 init_net
.ipv6
.ip6_prohibit_entry
->rt6i_idev
= in6_dev_get(init_net
.loopback_dev
);
6674 init_net
.ipv6
.ip6_blk_hole_entry
->dst
.dev
= init_net
.loopback_dev
;
6675 init_net
.ipv6
.ip6_blk_hole_entry
->rt6i_idev
= in6_dev_get(init_net
.loopback_dev
);
6679 #if IS_BUILTIN(CONFIG_IPV6)
6680 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
/*
 * Declares the "ipv6_route" BPF iterator function; besides the iterator
 * metadata pointer it takes a struct fib6_info pointer named rt.
 * This section is only compiled when IPv6 is built in and both
 * CONFIG_BPF_SYSCALL and CONFIG_PROC_FS are enabled (see the enclosing #if
 * lines).
 */
6681 DEFINE_BPF_ITER_FUNC(ipv6_route
, struct bpf_iter_meta
*meta
, struct fib6_info
*rt
)
/*
 * One-element BTF id list naming struct fib6_info; bpf_iter_register()
 * copies its first element into ipv6_route_reg_info.
 */
6683 BTF_ID_LIST(btf_fib6_info_id
)
6684 BTF_ID(struct, fib6_info
)
/*
 * seq_file description used by the "ipv6_route" BPF iterator: it reuses
 * ipv6_route_seq_ops (the same ops passed to proc_create_net() for the
 * "ipv6_route" proc entry), uses bpf_iter_init_seq_net / bpf_iter_fini_seq_net
 * as private-data init/fini hooks, and sizes the private area as
 * struct ipv6_route_iter.
 */
6686 static const struct bpf_iter_seq_info ipv6_route_seq_info
= {
6687 .seq_ops
= &ipv6_route_seq_ops
,
6688 .init_seq_private
= bpf_iter_init_seq_net
,
6689 .fini_seq_private
= bpf_iter_fini_seq_net
,
6690 .seq_priv_size
= sizeof(struct ipv6_route_iter
),
/*
 * Registration record for the "ipv6_route" BPF iterator target.  It
 * describes one context argument: the rt member of
 * struct bpf_iter__ipv6_route, typed PTR_TO_BTF_ID_OR_NULL.  The btf_id of
 * that argument is not set here; bpf_iter_register() fills it in before
 * registering.  Not const for that reason.
 */
6693 static struct bpf_iter_reg ipv6_route_reg_info
= {
6694 .target
= "ipv6_route",
6695 .ctx_arg_info_size
= 1,
6697 { offsetof(struct bpf_iter__ipv6_route
, rt
),
6698 PTR_TO_BTF_ID_OR_NULL
},
6700 .seq_info
= &ipv6_route_seq_info
,
/*
 * bpf_iter_register - register the "ipv6_route" BPF iterator target.
 *
 * Stores the first id of btf_fib6_info_id as the BTF id of the single
 * context argument in ipv6_route_reg_info, then returns the result of
 * bpf_iter_reg_target().
 */
6703 static int __init
bpf_iter_register(void)
6705 ipv6_route_reg_info
.ctx_arg_info
[0].btf_id
= *btf_fib6_info_id
;
6706 return bpf_iter_reg_target(&ipv6_route_reg_info
);
/* Counterpart of bpf_iter_register(): unregisters the "ipv6_route" target. */
6709 static void bpf_iter_unregister(void)
6711 bpf_iter_unreg_target(&ipv6_route_reg_info
);
/*
 * rtnetlink handlers for PF_INET6 route messages, registered in one call by
 * ip6_route_init() through rtnl_register_many():
 *   RTM_NEWROUTE -> inet6_rtm_newroute
 *   RTM_DELROUTE -> inet6_rtm_delroute
 *   RTM_GETROUTE -> inet6_rtm_getroute, flagged RTNL_FLAG_DOIT_UNLOCKED
 * Only the GETROUTE entry carries a flag; the other two use the default.
 */
6716 static const struct rtnl_msg_handler ip6_route_rtnl_msg_handlers
[] __initconst_or_module
= {
6717 {.owner
= THIS_MODULE
, .protocol
= PF_INET6
, .msgtype
= RTM_NEWROUTE
,
6718 .doit
= inet6_rtm_newroute
},
6719 {.owner
= THIS_MODULE
, .protocol
= PF_INET6
, .msgtype
= RTM_DELROUTE
,
6720 .doit
= inet6_rtm_delroute
},
6721 {.owner
= THIS_MODULE
, .protocol
= PF_INET6
, .msgtype
= RTM_GETROUTE
,
6722 .doit
= inet6_rtm_getroute
, .flags
= RTNL_FLAG_DOIT_UNLOCKED
},
/*
 * ip6_route_init - one-time initialisation of the IPv6 routing subsystem.
 *
 * Steps visible in this listing, in order:
 *   1. create the "ip6_dst_cache" slab cache (objects of struct rt6_info);
 *   2. dst_entries_init() for ip6_dst_blackhole_ops;
 *   3. register the per-namespace ops ipv6_inetpeer_ops, then
 *      ip6_route_net_ops;
 *   4. let ip6_dst_blackhole_ops share the template's slab cache;
 *   5. fib6_rules_init();
 *   6. register ip6_route_net_late_ops (procfs entries);
 *   7. register the rtnetlink handlers and the netdevice notifier;
 *   8. for built-in IPv6 with BPF syscall + procfs: bpf_iter_register();
 *   9. initialise the list head and lock of every possible CPU's
 *      rt6_uncached_list.
 * Each failing step jumps to a label that undoes the earlier steps in
 * reverse order.
 *
 * NOTE(review): this listing omits lines (local declarations, several
 * "if (ret)" tests, some labels and at least one init step between steps 4
 * and 5 together with its cleanup), so the list above is not exhaustive -
 * read it against the full file.
 *
 * NOTE(review): a failure of bpf_iter_register() jumps to
 * out_register_late_subsys, and no unregister_netdevice_notifier() call is
 * visible in the unwind shown here.  Confirm whether the notifier registered
 * in step 7 is left registered on that path.
 */
6725 int __init
ip6_route_init(void)
/* Step 1: slab cache backing rt6_info allocations. */
6731 ip6_dst_ops_template
.kmem_cachep
=
6732 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info
), 0,
6733 SLAB_HWCACHE_ALIGN
| SLAB_ACCOUNT
, NULL
);
6734 if (!ip6_dst_ops_template
.kmem_cachep
)
/* Step 2. */
6737 ret
= dst_entries_init(&ip6_dst_blackhole_ops
);
6739 goto out_kmem_cache
;
/* Step 3: inetpeer ops first, routing ops second. */
6741 ret
= register_pernet_subsys(&ipv6_inetpeer_ops
);
6743 goto out_dst_entries
;
6745 ret
= register_pernet_subsys(&ip6_route_net_ops
);
6747 goto out_register_inetpeer
;
/* Step 4: blackhole dsts are allocated from the same cache. */
6749 ip6_dst_blackhole_ops
.kmem_cachep
= ip6_dst_ops_template
.kmem_cachep
;
6753 goto out_register_subsys
;
/* Step 5. */
6759 ret
= fib6_rules_init();
/* Step 6. */
6763 ret
= register_pernet_subsys(&ip6_route_net_late_ops
);
6765 goto fib6_rules_init
;
/* Step 7: both failures below unwind from out_register_late_subsys. */
6767 ret
= rtnl_register_many(ip6_route_rtnl_msg_handlers
);
6769 goto out_register_late_subsys
;
6771 ret
= register_netdevice_notifier(&ip6_route_dev_notifier
);
6773 goto out_register_late_subsys
;
/* Step 8: only when IPv6 is built in and BPF syscall + procfs are enabled. */
6775 #if IS_BUILTIN(CONFIG_IPV6)
6776 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
6777 ret
= bpf_iter_register();
6779 goto out_register_late_subsys
;
/* Step 9: per-CPU uncached route lists. */
6783 for_each_possible_cpu(cpu
) {
6784 struct uncached_list
*ul
= per_cpu_ptr(&rt6_uncached_list
, cpu
);
6786 INIT_LIST_HEAD(&ul
->head
);
6787 spin_lock_init(&ul
->lock
);
/* Error unwind: labels are entered top-down, undoing steps in reverse. */
6793 out_register_late_subsys
:
6794 rtnl_unregister_all(PF_INET6
);
6795 unregister_pernet_subsys(&ip6_route_net_late_ops
);
6797 fib6_rules_cleanup();
6802 out_register_subsys
:
6803 unregister_pernet_subsys(&ip6_route_net_ops
);
6804 out_register_inetpeer
:
6805 unregister_pernet_subsys(&ipv6_inetpeer_ops
);
6807 dst_entries_destroy(&ip6_dst_blackhole_ops
);
6809 kmem_cache_destroy(ip6_dst_ops_template
.kmem_cachep
);
/*
 * ip6_route_cleanup - undo ip6_route_init().
 *
 * Visible teardown order: unregister the BPF iterator target (built-in IPv6
 * with BPF syscall + procfs only), unregister the netdevice notifier,
 * unregister the late per-namespace ops, fib6_rules_cleanup(), unregister
 * ipv6_inetpeer_ops and ip6_route_net_ops, destroy the blackhole dst ops
 * accounting and finally destroy the shared slab cache.
 *
 * NOTE(review): ipv6_inetpeer_ops is unregistered before ip6_route_net_ops
 * here, whereas the error unwind in ip6_route_init() unregisters
 * ip6_route_net_ops first - confirm the difference is intentional.
 * NOTE(review): lines are omitted from this listing between
 * fib6_rules_cleanup() and the pernet unregistration; further cleanup calls
 * may live there.
 */
6813 void ip6_route_cleanup(void)
6815 #if IS_BUILTIN(CONFIG_IPV6)
6816 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
6817 bpf_iter_unregister();
6820 unregister_netdevice_notifier(&ip6_route_dev_notifier
);
6821 unregister_pernet_subsys(&ip6_route_net_late_ops
);
6822 fib6_rules_cleanup();
6825 unregister_pernet_subsys(&ipv6_inetpeer_ops
);
6826 unregister_pernet_subsys(&ip6_route_net_ops
);
6827 dst_entries_destroy(&ip6_dst_blackhole_ops
);
/* The cache is shared with ip6_dst_blackhole_ops, so it is destroyed last. */
6828 kmem_cache_destroy(ip6_dst_ops_template
.kmem_cachep
);