/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		reachable.  otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */
#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <linux/rtnetlink.h>
#include <net/dst_metadata.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <linux/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
static int ip6_rt_type_to_error(u8 fib6_type);

#define CREATE_TRACE_POINTS
#include <trace/events/fib6.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
#undef CREATE_TRACE_POINTS
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};
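/* Note on the ordering above: the more negative the value, the worse the
 * candidate. FAIL_HARD excludes a route outright, FAIL_DO_RR asks the
 * caller to round-robin to a sibling, and FAIL_PROBE merely ranks the
 * route below fully reachable ones (see find_match() and rt6_select()
 * below).
 */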
static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
static unsigned int	 ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
static int		 ip6_dst_gc(struct dst_ops *ops);

static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int		ip6_pkt_prohibit(struct sk_buff *skb);
static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct fib6_info *rt);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr);
#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_add_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev,
					    unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev);
#endif
struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}
void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
static inline const void *choose_neigh_daddr(const struct in6_addr *p,
					     struct sk_buff *skb,
					     const void *daddr)
{
	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}
struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
{
	struct neighbour *n;

	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	if (n)
		return n;

	n = neigh_create(&nd_tbl, daddr, dev);
	return IS_ERR(n) ? NULL : n;
}
static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
}
static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	dst_cow_metrics_generic,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};
static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}
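/* `mtu ? : dst->dev->mtu` is the GCC "?:" shorthand for
 * `mtu ? mtu : dst->dev->mtu`: report the raw RTAX_MTU metric when one
 * is set, otherwise fall back to the device MTU.
 */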
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
};
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};
static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol  = RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}
/* allocate dst with ip6_dst_ops */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	ip_dst_metrics_put(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	from = xchg((__force struct fib6_info **)&rt->from, NULL);
	fib6_info_release(from);
}
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);

		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}
static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}
static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}
struct fib6_info *fib6_multipath_select(const struct net *net,
					struct fib6_info *match,
					struct flowi6 *fl6, int oif,
					const struct sk_buff *skb,
					int strict)
{
	struct fib6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}
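/* Illustration of the hash-threshold walk above, assuming three siblings
 * whose nh_upper_bound values split the hash space into thirds: a flow
 * hashing into the middle third skips the first sibling
 * (mp_hash > bound) and stops at the second (mp_hash <= bound), provided
 * rt6_score_route() does not reject it. Flows therefore stick to one
 * next-hop unless the sibling set changes.
 */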
/*
 *	Route lookup. rcu_read_lock() should be held.
 */

static inline struct fib6_info *rt6_device_match(struct net *net,
						 struct fib6_info *rt,
						 const struct in6_addr *saddr,
						 int oif,
						 int flags)
{
	struct fib6_info *sprt;

	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
		const struct net_device *dev = sprt->fib6_nh.nh_dev;

		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE)
		return net->ipv6.fib6_null_entry;

	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
}
#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}
static void rt6_probe(struct fib6_info *rt)
{
	struct __rt6_probe_work *work = NULL;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;
	struct inet6_dev *idev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
		return;

	nh_gw = &rt->fib6_nh.nh_gw;
	dev = rt->fib6_nh.nh_dev;
	rcu_read_lock_bh();
	idev = __in6_dev_get(dev);
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else if (time_after(jiffies, rt->last_probe +
				       idev->cnf.rtr_probe_interval)) {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		rt->last_probe = jiffies;
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct fib6_info *rt)
{
}
#endif
/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
static inline int rt6_check_dev(struct fib6_info *rt, int oif)
{
	const struct net_device *dev = rt->fib6_nh.nh_dev;

	if (!oif || dev->ifindex == oif)
		return 2;
	return 0;
}
static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	if (rt->fib6_flags & RTF_NONEXTHOP ||
	    !(rt->fib6_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
					  &rt->fib6_nh.nh_gw);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}
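/* For example, a route on the requested oif scores m = 2 from
 * rt6_check_dev(), and with CONFIG_IPV6_ROUTER_PREF the decoded RA
 * router preference is folded in at bits 2 and up, so both properties
 * end up in one integer that find_match() can compare. A negative
 * return is one of the RT6_NUD_FAIL_* codes from rt6_check_neigh().
 */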
/* called with rcu_read_lock held */
static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
{
	const struct net_device *dev = fib6_info_nh_dev(f6i);
	bool rc = false;

	if (dev) {
		const struct inet6_dev *idev = __in6_dev_get(dev);

		rc = !!idev->cnf.ignore_routes_with_linkdown;
	}

	return rc;
}
static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
				    int *mpri, struct fib6_info *match,
				    bool *do_rr)
{
	int m;
	bool match_do_rr = false;

	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		goto out;

	if (fib6_ignore_linkdown(rt) &&
	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (fib6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}
static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				      struct fib6_info *leaf,
				      struct fib6_info *rr_head,
				      u32 metric, int oif, int strict,
				      bool *do_rr)
{
	struct fib6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
				    int oif, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.fib6_null_entry;
}
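/* Note on the round-robin step above: fn->rr_ptr only ever advances to
 * the next sibling with the same metric and wraps back to the leaf when
 * the run of equal-metric routes ends, which is what yields the
 * "round-robin the list" behaviour described in the changelog at the
 * top of this file.
 */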
static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
{
	return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
}
#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
#endif
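/* Worked example for the sanity checks in rt6_route_rcv(): the RFC 4191
 * Route Information Option length field counts 8-octet units and may be
 * at most 3, so longer prefixes need a longer option -- a prefix_len
 * above 64 bits is rejected unless the option is large enough to carry
 * the extra prefix octets.
 */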
/*
 *	Misc support functions
 */

/* called with rcu_lock held */
static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
{
	struct net_device *dev = rt->fib6_nh.nh_dev;

	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}
static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_XRESOLVE]	= -EINVAL,
};

static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}
static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
{
	unsigned short flags = 0;

	if (rt->dst_nocount)
		flags |= DST_NOCOUNT;
	if (rt->dst_nopolicy)
		flags |= DST_NOPOLICY;
	if (rt->dst_host)
		flags |= DST_HOST;

	return flags;
}
static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);

	switch (ort->fib6_type) {
	case RTN_BLACKHOLE:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}
static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
{
	if (ort->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, ort);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (ort->fib6_nh.nh_lwtstate) {
		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}
/* Caller must already hold reference to @from */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	rcu_assign_pointer(rt->from, from);
	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
}
/* Caller must already hold reference to @ort */
static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
{
	struct net_device *dev = fib6_info_nh_dev(ort);

	ip6_rt_init_dst(rt, ort);

	rt->rt6i_dst = ort->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
	rt->rt6i_flags = ort->fib6_flags;
	rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->fib6_src;
#endif
}
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}
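/* Backtracking sketch: starting from the node where the match failed,
 * each iteration climbs to the parent; under CONFIG_IPV6_SUBTREES a
 * parent's source-routed subtree is searched by saddr first. The walk
 * ends at the first ancestor that actually carries routes (RTN_RTINFO)
 * or returns NULL at the tree root (RTN_TL_ROOT).
 */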
static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
			  bool null_fallback)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (null_fallback) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}
/* called with rcu_lock held */
static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rt6_info *nrt;

	if (!fib6_info_hold_safe(rt))
		goto fallback;

	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (!nrt) {
		fib6_info_release(rt);
		goto fallback;
	}

	ip6_rt_copy_init(nrt, rt);
	return nrt;

fallback:
	nrt = dev_net(dev)->ipv6.ip6_null_entry;
	dst_hold(&nrt->dst);
	return nrt;
}
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_info *f6i;
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	f6i = rcu_dereference(fn->leaf);
	if (!f6i) {
		f6i = net->ipv6.fib6_null_entry;
	} else {
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
				       fl6->flowi6_oif, flags);
		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
			f6i = fib6_multipath_select(net, f6i, fl6,
						    fl6->flowi6_oif, skb,
						    flags);
	}
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);
	} else if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = ip6_create_rt_rcu(f6i);
	}

	rcu_read_unlock();

	return rt;
}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);
/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason the
 * route is released.
 * Caller must hold dst before calling it.
 */

static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}
static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (!fib6_info_hold_safe(ort))
		return NULL;

	dev = ip6_rt_get_dev_rcu(ort);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt) {
		fib6_info_release(ort);
		return NULL;
	}

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	if (!fib6_info_hold_safe(rt))
		return NULL;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt) {
		fib6_info_release(rt);
		return NULL;
	}
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}
/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt, false);

	return pcpu_rt;
}
static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	if (rt->fib6_destroying) {
		struct fib6_info *from;

		from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL);
		fib6_info_release(from);
	}

	return pcpu_rt;
}
/* exception hash table implementation
 */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct fib6_info *from;
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	net->ipv6.rt6_stats->fib_rt_cache--;

	/* purge completely the exception to allow releasing the held resources:
	 * some [sk] cache may keep the dst around for unlimited time
	 */
	from = xchg((__force struct fib6_info **)&rt6_ex->rt6i->from, NULL);
	fib6_info_release(from);
	dst_dev_put(&rt6_ex->rt6i->dst);

	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
}
/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}
static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
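/* Illustrative use of the hash above, mirroring the lookup helpers that
 * follow: the per-route exception table is an array of
 * FIB6_EXCEPTION_BUCKET_SIZE buckets, and a (daddr, saddr) pair picks
 * its bucket by
 *
 *	bucket = rcu_dereference(rt->rt6i_exception_bucket);
 *	bucket += rt6_exception_hash(daddr, src_key);
 *
 * where src_key only contributes under CONFIG_IPV6_SUBTREES.
 */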
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
static unsigned int fib6_mtu(const struct fib6_info *rt)
{
	unsigned int mtu;

	if (rt->fib6_pmtu) {
		mtu = rt->fib6_pmtu;
	} else {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
}
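/* Example of the computation above: a route with no stored PMTU whose
 * nexthop device advertises mtu6 = 9000 and whose lwtunnel state needs
 * 8 bytes of headroom yields min(9000, IP6_MAX_MTU) - 8 = 8992.
 */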
static int rt6_insert_exception(struct rt6_info *nrt,
				struct fib6_info *ort)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif

	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->fib6_table->tb6_lock);
		fib6_update_sernum(net, ort);
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}
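/* A successful insert also bumps the table's sernum via
 * fib6_update_sernum(), so every cached dst fails the cookie comparison
 * in ip6_dst_check() on next use and gets re-looked-up, picking up the
 * new exception.
 */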
void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() from recreating the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	const struct in6_addr *src_key = NULL;
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * However, the src addr used to create the hash
	 * might not be exactly the passed in saddr which
	 * is a /128 addr from the flow.
	 * So we need to use f6i->fib6_src to redo lookup
	 * if the passed in saddr does not find anything.
	 * (See the logic in ip6_rt_cache_alloc() on how
	 * rt->rt6i_src is updated.)
	 */
	if (rt->fib6_src.plen)
		src_key = saddr;
find_ex:
#endif
	bucket = rcu_dereference(rt->rt6i_exception_bucket);
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

#ifdef CONFIG_IPV6_SUBTREES
	/* Use fib6_src as src_key and redo lookup */
	if (!res && src_key && src_key != &rt->fib6_src.addr) {
		src_key = &rt->fib6_src.addr;
		goto find_ex;
	}
#endif

	return res;
}
/* Remove the passed in cached rt from the hash table that contains it */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	from = rcu_dereference(rt->from);
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}
/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	if (!from || !(rt->rt6i_flags & RTF_CACHE))
		goto unlock;

	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

unlock:
	rcu_read_unlock();
}
static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
	 *
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * handle this.
	 */

	if (dst_mtu(&rt->dst) >= mtu)
		return true;

	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
		return true;

	return false;
}
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}
#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others still hold references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones - are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}
void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}
/* must be called with rcu lock held */
struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
				    int oif, struct flowi6 *fl6, int strict)
{
	struct fib6_node *fn, *saved_fn;
	struct fib6_info *f6i;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	f6i = rt6_select(net, fn, oif, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	return f6i;
}
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_info *f6i;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	f6i = fib6_table_lookup(net, table, oif, fl6, strict);
	if (f6i->fib6_nsiblings)
		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);

	if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		return rt;
	}

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(f6i->fib6_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		return uncached_rt;
	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(f6i);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, f6i);

		local_bh_enable();
		rcu_read_unlock();

		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
static struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}

struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6,
					 const struct sk_buff *skb,
					 int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowlabel(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}
/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}
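/* The two policies above differ only in the keys fed to the hash:
 * policy 0 hashes the L3 triple (addresses plus flow label), using the
 * inner packet's addresses for ICMPv6 errors so an error follows the
 * flow that triggered it, while policy 1 also mixes in L4 ports. Either
 * way the result is shifted right by one to fit the 31-bit space that
 * the nh_upper_bound comparison in fib6_multipath_select() works in.
 */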
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);

	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}
static struct rt6_info *ip6_pol_route_output(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
}

struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	if (ipv6_addr_type(&fl6->daddr) &
	    (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new = &rt->dst;
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}
/*
 *	Destination cache support functions
 */

static bool fib6_check(struct fib6_info *f6i, u32 cookie)
{
	u32 rt_cookie = 0;

	if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
		return false;

	if (fib6_check_expired(f6i))
		return false;

	return true;
}
static struct dst_entry *rt6_check(struct rt6_info *rt,
				   struct fib6_info *from,
				   u32 cookie)
{
	u32 rt_cookie = 0;

	if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
	    rt_cookie != cookie)
		return NULL;

	if (rt6_check_expired(rt))
		return NULL;

	return &rt->dst;
}
static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
					    struct fib6_info *from,
					    u32 cookie)
{
	if (!__rt6_check_expired(rt) &&
	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
	    fib6_check(from, cookie))
		return &rt->dst;
	else
		return NULL;
}
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct dst_entry *dst_ret;
	struct fib6_info *from;
	struct rt6_info *rt;

	rt = container_of(dst, struct rt6_info, dst);

	rcu_read_lock();

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	from = rcu_dereference(rt->from);

	if (from && (rt->rt6i_flags & RTF_PCPU ||
		     unlikely(!list_empty(&rt->rt6i_uncached))))
		dst_ret = rt6_dst_from_check(rt, from, cookie);
	else
		dst_ret = rt6_check(rt, from, cookie);

	rcu_read_unlock();

	return dst_ret;
}
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			rcu_read_lock();
			if (rt6_check_expired(rt)) {
				rt6_remove_exception_rt(rt);
				dst = NULL;
			}
			rcu_read_unlock();
		} else {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}
static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		rcu_read_lock();
		if (rt->rt6i_flags & RTF_CACHE) {
			rt6_remove_exception_rt(rt);
		} else {
			struct fib6_info *from;
			struct fib6_node *fn;

			from = rcu_dereference(rt->from);
			if (from) {
				fn = rcu_dereference(from->fib6_node);
				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
					fn->fn_sernum = -1;
			}
		}
		rcu_read_unlock();
	}
}
static void rt6_update_expires(struct rt6_info *rt0, int timeout)
{
	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
		struct fib6_info *from;

		rcu_read_lock();
		from = rcu_dereference(rt0->from);
		if (from)
			rt0->dst.expires = from->expires;
		rcu_read_unlock();
	}

	dst_set_expires(&rt0->dst, timeout);
	rt0->rt6i_flags |= RTF_EXPIRES;
}
static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
	struct net *net = dev_net(rt->dst.dev);

	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
	rt->rt6i_flags |= RTF_MODIFIED;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}
static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
{
	return !(rt->rt6i_flags & RTF_CACHE) &&
		(rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from));
}
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}
	dst_confirm_neigh(dst, daddr);
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
	} else if (daddr) {
		struct fib6_info *from;
		struct rt6_info *nrt6;

		rcu_read_lock();
		from = rcu_dereference(rt6->from);
		if (!from) {
			rcu_read_unlock();
			return;
		}
		nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);
			if (rt6_insert_exception(nrt6, from))
				dst_release_immediate(&nrt6->dst);
		}
		rcu_read_unlock();
	}
}
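/* Example: a Packet Too Big report carrying mtu = 576 is first raised
 * to IPV6_MIN_MTU (1280), since IPv6 links must support at least a
 * 1280-byte MTU, and any report that would not actually lower
 * dst_mtu() is ignored.
 */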
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu)
{
	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
}
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
		     int oif, u32 mark, kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_uid = uid,
	};

	dst = ip6_route_output(net, NULL, &fl6);
	if (!dst->error)
		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	int oif = sk->sk_bound_dev_if;
	struct dst_entry *dst;

	if (!oif && skb->dev)
		oif = l3mdev_master_ifindex(skb->dev);

	ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);

	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
			   const struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_SUBTREES
	struct ipv6_pinfo *np = inet6_sk(sk);
#endif

	ip6_dst_store(sk, dst,
		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
		      &sk->sk_v6_daddr : NULL,
#ifdef CONFIG_IPV6_SUBTREES
		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
		      &np->saddr :
#endif
		      NULL);
}
/* Handle redirects */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;
};
2434 static struct rt6_info
*__ip6_route_redirect(struct net
*net
,
2435 struct fib6_table
*table
,
2437 const struct sk_buff
*skb
,
2440 struct ip6rd_flowi
*rdfl
= (struct ip6rd_flowi
*)fl6
;
2441 struct rt6_info
*ret
= NULL
, *rt_cache
;
2442 struct fib6_info
*rt
;
2443 struct fib6_node
*fn
;
2445 /* Get the "current" route for this destination and
2446 * check if the redirect has come from appropriate router.
2448 * RFC 4861 specifies that redirects should only be
2449 * accepted if they come from the nexthop to the target.
2450 * Due to the way the routes are chosen, this notion
2451 * is a bit fuzzy and one might need to check all possible
2456 fn
= fib6_node_lookup(&table
->tb6_root
, &fl6
->daddr
, &fl6
->saddr
);
2458 for_each_fib6_node_rt_rcu(fn
) {
2459 if (rt
->fib6_nh
.nh_flags
& RTNH_F_DEAD
)
2461 if (fib6_check_expired(rt
))
2463 if (rt
->fib6_flags
& RTF_REJECT
)
2465 if (!(rt
->fib6_flags
& RTF_GATEWAY
))
2467 if (fl6
->flowi6_oif
!= rt
->fib6_nh
.nh_dev
->ifindex
)
2469 /* rt_cache's gateway might be different from its 'parent'
2470 * in the case of an ip redirect.
2471 * So we keep searching in the exception table if the gateway
2474 if (!ipv6_addr_equal(&rdfl
->gateway
, &rt
->fib6_nh
.nh_gw
)) {
2475 rt_cache
= rt6_find_cached_rt(rt
,
2479 ipv6_addr_equal(&rdfl
->gateway
,
2480 &rt_cache
->rt6i_gateway
)) {
2490 rt
= net
->ipv6
.fib6_null_entry
;
2491 else if (rt
->fib6_flags
& RTF_REJECT
) {
2492 ret
= net
->ipv6
.ip6_null_entry
;
2496 if (rt
== net
->ipv6
.fib6_null_entry
) {
2497 fn
= fib6_backtrack(fn
, &fl6
->saddr
);
2504 ip6_hold_safe(net
, &ret
, true);
2506 ret
= ip6_create_rt_rcu(rt
);
2510 trace_fib6_table_lookup(net
, rt
, table
, fl6
);
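/* Note on the exception-table search above: after a previous redirect, the
 * cached (RTF_CACHE) clone can carry a gateway that differs from its parent
 * fib6_info. For example (addresses illustrative only), if traffic to
 * 2001:db8::1 was redirected from router fe80::1 to fe80::2, the parent
 * route still lists fe80::1 as nh_gw while the cached exception holds
 * fe80::2; a later redirect sourced from fe80::2 must therefore be matched
 * against the exception, not the parent.
 */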
static struct dst_entry *ip6_route_redirect(struct net *net,
					    const struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    const struct in6_addr *gateway)
{
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip6rd_flowi rdfl;

	rdfl.fl6 = *fl6;
	rdfl.gateway = *gateway;

	return fib6_rule_lookup(net, &rdfl.fl6, skb,
				flags, __ip6_route_redirect);
}
void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
		  kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_iif = LOOPBACK_IFINDEX,
		.flowi6_oif = oif,
		.flowi6_mark = mark,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_uid = uid,
	};

	dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_redirect);
void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_iif = LOOPBACK_IFINDEX,
		.flowi6_oif = oif,
		.daddr = msg->dest,
		.saddr = iph->daddr,
		.flowi6_uid = sock_net_uid(net, NULL),
	};

	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
		     sk->sk_uid);
}
EXPORT_SYMBOL_GPL(ip6_sk_redirect);
static unsigned int ip6_default_advmss(const struct dst_entry *dst)
{
	struct net_device *dev = dst->dev;
	unsigned int mtu = dst_mtu(dst);
	struct net *net = dev_net(dev);

	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);

	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;

	/*
	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
	 * IPV6_MAXPLEN is also valid and means: "any MSS,
	 * rely only on pmtu discovery"
	 */
	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
		mtu = IPV6_MAXPLEN;
	return mtu;
}
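/* Worked example (illustrative): with a 1500-byte link MTU the advertised
 * MSS becomes 1500 - sizeof(struct ipv6hdr) - sizeof(struct tcphdr)
 * = 1500 - 40 - 20 = 1440 bytes, subject to the ip6_rt_min_advmss floor
 * and the IPV6_MAXPLEN cap applied above.
 */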
static unsigned int ip6_mtu(const struct dst_entry *dst)
{
	struct inet6_dev *idev;
	unsigned int mtu;

	mtu = dst_metric_raw(dst, RTAX_MTU);
	if (mtu)
		goto out;

	mtu = IPV6_MIN_MTU;

	rcu_read_lock();
	idev = __in6_dev_get(dst->dev);
	if (idev)
		mtu = idev->cnf.mtu6;
	rcu_read_unlock();

out:
	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}
/* MTU selection:
 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 *
 * based on ip6_dst_mtu_forward and exception logic of
 * rt6_find_cached_rt; called with rcu_read_lock
 */
u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
		      struct in6_addr *saddr)
{
	struct inet6_dev *idev;
	struct rt6_info *rt;
	u32 mtu = 0;

	if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
		mtu = f6i->fib6_pmtu;
		if (mtu)
			goto out;
	}

	rt = rt6_find_cached_rt(f6i, daddr, saddr);
	if (unlikely(rt)) {
		mtu = dst_metric_raw(&rt->dst, RTAX_MTU);
	} else {
		struct net_device *dev = fib6_info_nh_dev(f6i);

		mtu = IPV6_MIN_MTU;
		idev = __in6_dev_get(dev);
		if (idev && idev->cnf.mtu6 > mtu)
			mtu = idev->cnf.mtu6;
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
out:
	return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
}
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		in6_dev_put(idev);
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output  = ip6_output;
	rt->rt6i_gateway  = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev     = idev;
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Add this dst into uncached_list so that rt6_disable_ip() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);
	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}
static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout >> 1;
out:
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire >> rt_elasticity;
	return entries > rt_max_size;
}
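/* Sketch of the backoff arithmetic above (illustrative numbers): each GC
 * pass increments ip6_rt_gc_expire, making fib6_run_gc() more aggressive;
 * once entries drop below gc_thresh it is reset to rt_gc_timeout / 2. The
 * final line decays it geometrically: assuming the default elasticity of 9,
 * expire -= expire >> 9 removes roughly 0.2% per call, so pressure built up
 * during a burst drains away slowly afterwards.
 */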
static struct rt6_info *ip6_nh_lookup_table(struct net *net,
					    struct fib6_config *cfg,
					    const struct in6_addr *gw_addr,
					    u32 tbid, int flags)
{
	struct flowi6 fl6 = {
		.flowi6_oif = cfg->fc_ifindex,
		.daddr = *gw_addr,
		.saddr = cfg->fc_prefsrc,
	};
	struct fib6_table *table;
	struct rt6_info *rt;

	table = fib6_get_table(net, tbid);
	if (!table)
		return NULL;

	if (!ipv6_addr_any(&cfg->fc_prefsrc))
		flags |= RT6_LOOKUP_F_HAS_SADDR;

	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);

	/* if table lookup failed, fall back to full lookup */
	if (rt == net->ipv6.ip6_null_entry) {
		ip6_rt_put(rt);
		rt = NULL;
	}

	return rt;
}
static int ip6_route_check_nh_onlink(struct net *net,
				     struct fib6_config *cfg,
				     const struct net_device *dev,
				     struct netlink_ext_ack *extack)
{
	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
	struct fib6_info *from;
	struct rt6_info *grt;
	int err;

	err = 0;
	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
	if (grt) {
		rcu_read_lock();
		from = rcu_dereference(grt->from);
		if (!grt->dst.error &&
		    /* ignore match if it is the default route */
		    from && !ipv6_addr_any(&from->fib6_dst.addr) &&
		    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop has invalid gateway or device mismatch");
			err = -EINVAL;
		}
		rcu_read_unlock();

		ip6_rt_put(grt);
	}

	return err;
}
static int ip6_route_check_nh(struct net *net,
			      struct fib6_config *cfg,
			      struct net_device **_dev,
			      struct inet6_dev **idev)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	struct net_device *dev = _dev ? *_dev : NULL;
	struct rt6_info *grt = NULL;
	int err = -EHOSTUNREACH;

	if (cfg->fc_table) {
		int flags = RT6_LOOKUP_F_IFACE;

		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
					  cfg->fc_table, flags);
		if (grt) {
			if (grt->rt6i_flags & RTF_GATEWAY ||
			    (dev && dev != grt->dst.dev)) {
				ip6_rt_put(grt);
				grt = NULL;
			}
		}
	}

	if (!grt)
		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);

	if (!grt)
		goto out;

	if (dev) {
		if (dev != grt->dst.dev) {
			ip6_rt_put(grt);
			goto out;
		}
	} else {
		*_dev = dev = grt->dst.dev;
		*idev = grt->rt6i_idev;
		dev_hold(dev);
		in6_dev_hold(grt->rt6i_idev);
	}

	if (!(grt->rt6i_flags & RTF_GATEWAY))
		err = 0;

	ip6_rt_put(grt);

out:
	return err;
}
static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
			   struct net_device **_dev, struct inet6_dev **idev,
			   struct netlink_ext_ack *extack)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	int gwa_type = ipv6_addr_type(gw_addr);
	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
	const struct net_device *dev = *_dev;
	bool need_addr_check = !dev;
	int err = -EINVAL;

	/* if gw_addr is local we will fail to detect this in case
	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
	 * will return already-added prefix route via interface that
	 * prefix route was assigned to, which might be non-loopback.
	 */
	if (dev &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
		/* IPv6 strictly inhibits using not link-local
		 * addresses as nexthop address.
		 * Otherwise, router will not able to send redirects.
		 * It is very good, but in some (rare!) circumstances
		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
		 * some exceptions. --ANK
		 * We allow IPv4-mapped nexthops to support RFC4798-type
		 * addressing
		 */
		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
			NL_SET_ERR_MSG(extack, "Invalid gateway address");
			goto out;
		}

		if (cfg->fc_flags & RTNH_F_ONLINK)
			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
		else
			err = ip6_route_check_nh(net, cfg, _dev, idev);

		if (err)
			goto out;
	}

	/* reload in case device was changed */
	dev = *_dev;

	err = -EINVAL;
	if (!dev) {
		NL_SET_ERR_MSG(extack, "Egress device not specified");
		goto out;
	} else if (dev->flags & IFF_LOOPBACK) {
		NL_SET_ERR_MSG(extack,
			       "Egress device can not be loopback device for this route");
		goto out;
	}

	/* if we did not check gw_addr above, do so now that the
	 * egress device has been resolved.
	 */
	if (need_addr_check &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	err = 0;
out:
	return err;
}
static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
					       gfp_t gfp_flags,
					       struct netlink_ext_ack *extack)
{
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct fib6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	struct fib6_table *table;
	int addr_type;
	int err = -EINVAL;

	/* RTF_PCPU is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_PCPU) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
		goto out;
	}

	/* RTF_CACHE is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_CACHE) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
		goto out;
	}

	if (cfg->fc_type > RTN_MAX) {
		NL_SET_ERR_MSG(extack, "Invalid route type");
		goto out;
	}

	if (cfg->fc_dst_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		goto out;
	}
	if (cfg->fc_src_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid source address length");
		goto out;
	}
#ifndef CONFIG_IPV6_SUBTREES
	if (cfg->fc_src_len) {
		NL_SET_ERR_MSG(extack,
			       "Specifying source address requires IPV6_SUBTREES to be enabled");
		goto out;
	}
#endif
	if (cfg->fc_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_metric == 0)
		cfg->fc_metric = IP6_RT_PRIO_USER;

	if (cfg->fc_flags & RTNH_F_ONLINK) {
		if (!dev) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop device required for onlink");
			err = -ENODEV;
			goto out;
		}

		if (!(dev->flags & IFF_UP)) {
			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
			err = -ENETDOWN;
			goto out;
		}
	}

	err = -ENOBUFS;
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
		if (!table) {
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}

	if (!table)
		goto out;

	err = -ENOMEM;
	rt = fib6_info_alloc(gfp_flags);
	if (!rt)
		goto out;

	rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
					       extack);
	if (IS_ERR(rt->fib6_metrics)) {
		err = PTR_ERR(rt->fib6_metrics);
		/* Do not leave garbage there. */
		rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
		goto out;
	}

	if (cfg->fc_flags & RTF_ADDRCONF)
		rt->dst_nocount = true;

	if (cfg->fc_flags & RTF_EXPIRES)
		fib6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
	else
		fib6_clean_expires(rt);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->fib6_protocol = cfg->fc_protocol;

	addr_type = ipv6_addr_type(&cfg->fc_dst);

	if (cfg->fc_encap) {
		struct lwtunnel_state *lwtstate;

		err = lwtunnel_build_state(cfg->fc_encap_type,
					   cfg->fc_encap, AF_INET6, cfg,
					   &lwtstate, extack);
		if (err)
			goto out;
		rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
	}

	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->fib6_dst.plen = cfg->fc_dst_len;
	if (rt->fib6_dst.plen == 128)
		rt->dst_host = true;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->fib6_src.plen = cfg->fc_src_len;
#endif

	rt->fib6_metric = cfg->fc_metric;
	rt->fib6_nh.nh_weight = 1;

	rt->fib6_type = cfg->fc_type;

	/* We cannot add true routes via loopback here,
	   they would result in kernel looping; promote them to reject routes
	 */
	if ((cfg->fc_flags & RTF_REJECT) ||
	    (dev && (dev->flags & IFF_LOOPBACK) &&
	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
	     !(cfg->fc_flags & RTF_LOCAL))) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
		goto install_route;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
		if (err)
			goto out;

		rt->fib6_nh.nh_gw = cfg->fc_gateway;
	}

	err = -ENODEV;
	if (!dev)
		goto out;

	if (idev->cnf.disable_ipv6) {
		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
		err = -EACCES;
		goto out;
	}

	if (!(dev->flags & IFF_UP)) {
		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
		err = -ENETDOWN;
		goto out;
	}

	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid source address");
			err = -EINVAL;
			goto out;
		}
		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
		rt->fib6_prefsrc.plen = 128;
	} else
		rt->fib6_prefsrc.plen = 0;

	rt->fib6_flags = cfg->fc_flags;

install_route:
	if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
	    !netif_carrier_ok(dev))
		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
	rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
	rt->fib6_nh.nh_dev = dev;
	rt->fib6_table = table;

	if (idev)
		in6_dev_put(idev);

	return rt;
out:
	if (dev)
		dev_put(dev);
	if (idev)
		in6_dev_put(idev);

	fib6_info_release(rt);
	return ERR_PTR(err);
}
int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
		  struct netlink_ext_ack *extack)
{
	struct fib6_info *rt;
	int err;

	rt = ip6_route_info_create(cfg, gfp_flags, extack);
	if (IS_ERR(rt))
		return PTR_ERR(rt);

	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
	fib6_info_release(rt);

	return err;
}
static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
{
	struct net *net = info->nl_net;
	struct fib6_table *table;
	int err;

	if (rt == net->ipv6.fib6_null_entry) {
		err = -ENOENT;
		goto out;
	}

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_del(rt, info);
	spin_unlock_bh(&table->tb6_lock);

out:
	fib6_info_release(rt);
	return err;
}
int ip6_del_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net };

	return __ip6_del_rt(rt, &info);
}
static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	int err = -ENOENT;

	if (rt == net->ipv6.fib6_null_entry)
		goto out_put;
	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);

	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
		struct fib6_info *sibling, *next_sibling;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			if (rt6_fill_node(net, skb, rt, NULL,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				kfree_skb(skb);
				skb = NULL;
			} else
				info->skip_notify = 1;
		}

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings,
					 fib6_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
		}
	}

	err = fib6_del(rt, info);
out_unlock:
	spin_unlock_bh(&table->tb6_lock);
out_put:
	fib6_info_release(rt);

	if (skb) {
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
	return err;
}
static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
{
	int rc = -ESRCH;

	if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
		goto out;

	if (cfg->fc_flags & RTF_GATEWAY &&
	    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
		goto out;

	rc = rt6_remove_exception_rt(rt);
out:
	return rc;
}
static int ip6_route_del(struct fib6_config *cfg,
			 struct netlink_ext_ack *extack)
{
	struct rt6_info *rt_cache;
	struct fib6_table *table;
	struct fib6_info *rt;
	struct fib6_node *fn;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table) {
		NL_SET_ERR_MSG(extack, "FIB table does not exist");
		return err;
	}

	rcu_read_lock();

	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len,
			 !(cfg->fc_flags & RTF_CACHE));

	if (fn) {
		for_each_fib6_node_rt_rcu(fn) {
			if (cfg->fc_flags & RTF_CACHE) {
				int rc;

				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
							      &cfg->fc_src);
				if (rt_cache) {
					rc = ip6_del_cached_rt(rt_cache, cfg);
					if (rc != -ESRCH) {
						rcu_read_unlock();
						return rc;
					}
				}
				continue;
			}
			if (cfg->fc_ifindex &&
			    (!rt->fib6_nh.nh_dev ||
			     rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
				continue;
			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
				continue;
			if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
				continue;
			if (!fib6_info_hold_safe(rt))
				continue;
			rcu_read_unlock();

			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				return __ip6_del_rt(rt, &cfg->fc_nlinfo);

			return __ip6_del_rt_siblings(rt, cfg);
		}
	}
	rcu_read_unlock();

	return err;
}
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct netevent_redirect netevent;
	struct rt6_info *rt, *nrt = NULL;
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	struct fib6_info *from;
	struct rd_msg *msg;
	int optlen, on_link;
	u8 *lladdr;

	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
	optlen -= sizeof(*msg);

	if (optlen < 0) {
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
		return;
	}

	msg = (struct rd_msg *)icmp6_hdr(skb);

	if (ipv6_addr_is_multicast(&msg->dest)) {
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
		return;
	}

	on_link = 0;
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
		on_link = 1;
	} else if (ipv6_addr_type(&msg->target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
		return;
	}

	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}

	lladdr = NULL;
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

	rt = (struct rt6_info *) dst;
	if (rt->rt6i_flags & RTF_REJECT) {
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
		return;
	}

	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);

	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
	if (!neigh)
		return;

	/*
	 *	We have finally decided to accept it.
	 */

	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER)),
		     NDISC_REDIRECT, &ndopts);

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	if (!from)
		goto out;

	nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
	if (!nrt)
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

	/* rt6_insert_exception() will take care of duplicated exceptions */
	if (rt6_insert_exception(nrt, from)) {
		dst_release_immediate(&nrt->dst);
		goto out;
	}

	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
	netevent.daddr = &msg->dest;
	netevent.neigh = neigh;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

out:
	rcu_read_unlock();
	neigh_release(neigh);
}
#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_get_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	int ifindex = dev->ifindex;
	struct fib6_node *fn;
	struct fib6_info *rt = NULL;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
	if (!fn)
		goto out;

	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.nh_dev->ifindex != ifindex)
			continue;
		if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
			continue;
		if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
			continue;
		if (!fib6_info_hold_safe(rt))
			continue;
		break;
	}
out:
	rcu_read_unlock();
	return rt;
}
static struct fib6_info *rt6_add_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev,
					    unsigned int pref)
{
	struct fib6_config cfg = {
		.fc_metric	= IP6_RT_PRIO_USER,
		.fc_ifindex	= dev->ifindex,
		.fc_dst_len	= prefixlen,
		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
				  RTF_UP | RTF_PREF(pref),
		.fc_protocol = RTPROT_RA,
		.fc_type = RTN_UNICAST,
		.fc_nlinfo.portid = 0,
		.fc_nlinfo.nlh = NULL,
		.fc_nlinfo.nl_net = net,
	};

	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	cfg.fc_dst = *prefix;
	cfg.fc_gateway = *gwaddr;

	/* We should treat it as a default route if prefix length is 0. */
	if (!prefixlen)
		cfg.fc_flags |= RTF_DEFAULT;

	ip6_route_add(&cfg, GFP_ATOMIC, NULL);

	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
}
#endif
struct fib6_info *rt6_get_dflt_router(struct net *net,
				      const struct in6_addr *addr,
				      struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
	struct fib6_info *rt;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		if (dev == rt->fib6_nh.nh_dev &&
		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
			break;
	}
	if (rt && !fib6_info_hold_safe(rt))
		rt = NULL;
	rcu_read_unlock();
	return rt;
}
struct fib6_info *rt6_add_dflt_router(struct net *net,
				      const struct in6_addr *gwaddr,
				      struct net_device *dev,
				      unsigned int pref)
{
	struct fib6_config cfg = {
		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
		.fc_metric	= IP6_RT_PRIO_USER,
		.fc_ifindex	= dev->ifindex,
		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
		.fc_protocol = RTPROT_RA,
		.fc_type = RTN_UNICAST,
		.fc_nlinfo.portid = 0,
		.fc_nlinfo.nlh = NULL,
		.fc_nlinfo.nl_net = net,
	};

	cfg.fc_gateway = *gwaddr;

	if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
		struct fib6_table *table;

		table = fib6_get_table(dev_net(dev), cfg.fc_table);
		if (table)
			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
	}

	return rt6_get_dflt_router(net, gwaddr, dev);
}
static void __rt6_purge_dflt_routers(struct net *net,
				     struct fib6_table *table)
{
	struct fib6_info *rt;

restart:
	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;

		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!idev || idev->cnf.accept_ra != 2) &&
		    fib6_info_hold_safe(rt)) {
			rcu_read_unlock();
			ip6_del_rt(net, rt);
			goto restart;
		}
	}
	rcu_read_unlock();

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}
void rt6_purge_dflt_routers(struct net *net)
{
	struct fib6_table *table;
	struct hlist_head *head;
	unsigned int h;

	rcu_read_lock();

	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
		head = &net->ipv6.fib_table_hash[h];
		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
				__rt6_purge_dflt_routers(net, table);
		}
	}

	rcu_read_unlock();
}
static void rtmsg_to_fib6_config(struct net *net,
				 struct in6_rtmsg *rtmsg,
				 struct fib6_config *cfg)
{
	*cfg = (struct fib6_config){
		.fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
			 : RT6_TABLE_MAIN,
		.fc_ifindex = rtmsg->rtmsg_ifindex,
		.fc_metric = rtmsg->rtmsg_metric,
		.fc_expires = rtmsg->rtmsg_info,
		.fc_dst_len = rtmsg->rtmsg_dst_len,
		.fc_src_len = rtmsg->rtmsg_src_len,
		.fc_flags = rtmsg->rtmsg_flags,
		.fc_type = rtmsg->rtmsg_type,

		.fc_nlinfo.nl_net = net,

		.fc_dst = rtmsg->rtmsg_dst,
		.fc_src = rtmsg->rtmsg_src,
		.fc_gateway = rtmsg->rtmsg_gateway,
	};
}
int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
{
	struct fib6_config cfg;
	struct in6_rtmsg rtmsg;
	int err;

	switch (cmd) {
	case SIOCADDRT:		/* Add a route */
	case SIOCDELRT:		/* Delete a route */
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			return -EPERM;
		err = copy_from_user(&rtmsg, arg,
				     sizeof(struct in6_rtmsg));
		if (err)
			return -EFAULT;

		rtmsg_to_fib6_config(net, &rtmsg, &cfg);

		rtnl_lock();
		switch (cmd) {
		case SIOCADDRT:
			err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
			break;
		case SIOCDELRT:
			err = ip6_route_del(&cfg, NULL);
			break;
		default:
			err = -EINVAL;
		}
		rtnl_unlock();

		return err;
	}

	return -EINVAL;
}
/*
 *	Drop the packet on the floor
 */

static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net *net = dev_net(dst->dev);
	struct inet6_dev *idev;
	int type;

	if (netif_is_l3_master(skb->dev) &&
	    dst->dev == net->loopback_dev)
		idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
	else
		idev = ip6_dst_idev(dst);

	switch (ipstats_mib_noroutes) {
	case IPSTATS_MIB_INNOROUTES:
		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
		if (type == IPV6_ADDR_ANY) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
			break;
		}
		/* FALLTHROUGH */
	case IPSTATS_MIB_OUTNOROUTES:
		IP6_INC_STATS(net, idev, ipstats_mib_noroutes);
		break;
	}

	/* Start over by dropping the dst for l3mdev case */
	if (netif_is_l3_master(skb->dev))
		skb_dst_drop(skb);

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
	kfree_skb(skb);
	return 0;
}
static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}

static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}

static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}

static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}
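/* These handlers back the reject route types: prohibit-style entries wire
 * their dst to ip6_pkt_prohibit/ip6_pkt_prohibit_out (ICMPV6_ADM_PROHIBITED)
 * while unreachable/null entries use the ip6_pkt_discard* pair
 * (ICMPV6_NOROUTE). The _out variants differ only in attributing the drop
 * to the output MIB counter and fixing up skb->dev first.
 */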
/*
 *	Allocate a dst for local (unicast / anycast) address.
 */

struct fib6_info *addrconf_f6i_alloc(struct net *net,
				     struct inet6_dev *idev,
				     const struct in6_addr *addr,
				     bool anycast, gfp_t gfp_flags)
{
	u32 tb_id;
	struct net_device *dev = idev->dev;
	struct fib6_info *f6i;

	f6i = fib6_info_alloc(gfp_flags);
	if (!f6i)
		return ERR_PTR(-ENOMEM);

	f6i->fib6_metrics = ip_fib_metrics_init(net, NULL, 0, NULL);
	f6i->dst_nocount = true;
	f6i->dst_host = true;
	f6i->fib6_protocol = RTPROT_KERNEL;
	f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
	if (anycast) {
		f6i->fib6_type = RTN_ANYCAST;
		f6i->fib6_flags |= RTF_ANYCAST;
	} else {
		f6i->fib6_type = RTN_LOCAL;
		f6i->fib6_flags |= RTF_LOCAL;
	}

	f6i->fib6_nh.nh_gw = *addr;
	dev_hold(dev);
	f6i->fib6_nh.nh_dev = dev;
	f6i->fib6_dst.addr = *addr;
	f6i->fib6_dst.plen = 128;
	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
	f6i->fib6_table = fib6_get_table(net, tb_id);

	return f6i;
}
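/* Sketch of a caller (illustrative): addrconf uses this allocator when an
 * address is added to an interface, e.g. for the local /128 route of a
 * unicast address or for an anycast entry, and then inserts the returned
 * fib6_info into the table chosen above (RT6_TABLE_LOCAL unless an l3mdev
 * supplies its own table).
 */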
/* remove deleted ip from prefsrc entries */
struct arg_dev_net_ip {
	struct net_device *dev;
	struct net *net;
	struct in6_addr *addr;
};

static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
{
	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;

	if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
	    rt != net->ipv6.fib6_null_entry &&
	    ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
		spin_lock_bh(&rt6_exception_lock);
		/* remove prefsrc entry */
		rt->fib6_prefsrc.plen = 0;
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}
void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
{
	struct net *net = dev_net(ifp->idev->dev);
	struct arg_dev_net_ip adni = {
		.dev = ifp->idev->dev,
		.net = net,
		.addr = &ifp->addr,
	};
	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
}
#define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)

/* Remove routers and update dst entries when gateway turn into host. */
static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
{
	struct in6_addr *gateway = (struct in6_addr *)arg;

	if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
	    ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
		return -1;
	}

	/* Further clean up cached routes in exception table.
	 * This is needed because cached route may have a different
	 * gateway than its 'parent' in the case of an ip redirect.
	 */
	rt6_exceptions_clean_tohost(rt, gateway);

	return 0;
}
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}
struct arg_netdev_event {
	const struct net_device *dev;
	union {
		unsigned int nh_flags;
		unsigned long event;
	};
};
static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
{
	struct fib6_info *iter;
	struct fib6_node *fn;

	fn = rcu_dereference_protected(rt->fib6_node,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
	iter = rcu_dereference_protected(fn->leaf,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
	while (iter) {
		if (iter->fib6_metric == rt->fib6_metric &&
		    rt6_qualify_for_ecmp(iter))
			return iter;
		iter = rcu_dereference_protected(iter->fib6_next,
				lockdep_is_held(&rt->fib6_table->tb6_lock));
	}

	return NULL;
}
static bool rt6_is_dead(const struct fib6_info *rt)
{
	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
	    (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	     fib6_ignore_linkdown(rt)))
		return true;

	return false;
}
static int rt6_multipath_total_weight(const struct fib6_info *rt)
{
	struct fib6_info *iter;
	int total = 0;

	if (!rt6_is_dead(rt))
		total += rt->fib6_nh.nh_weight;

	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
		if (!rt6_is_dead(iter))
			total += iter->fib6_nh.nh_weight;
	}

	return total;
}
static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
{
	int upper_bound = -1;

	if (!rt6_is_dead(rt)) {
		*weight += rt->fib6_nh.nh_weight;
		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
						    total) - 1;
	}
	atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
}
static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
{
	struct fib6_info *iter;
	int weight = 0;

	rt6_upper_bound_set(rt, &weight, total);

	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		rt6_upper_bound_set(iter, &weight, total);
}
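/* Worked example for the bounds above (illustrative): two sibling nexthops
 * with nh_weight 1 and 3 give total = 4. The running *weight is 1 then 4,
 * so the stored bounds are DIV_ROUND_CLOSEST_ULL(1ULL << 31, 4) - 1 =
 * 0x1fffffff and DIV_ROUND_CLOSEST_ULL(4ULL << 31, 4) - 1 = 0x7fffffff.
 * A flow hash <= 0x1fffffff selects the first nexthop (1/4 of flows); the
 * rest fall through to the second. Dead nexthops keep bound -1 and are
 * never selected.
 */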
void rt6_multipath_rebalance(struct fib6_info *rt)
{
	struct fib6_info *first;
	int total;

	/* In case the entire multipath route was marked for flushing,
	 * then there is no need to rebalance upon the removal of every
	 * sibling route.
	 */
	if (!rt->fib6_nsiblings || rt->should_flush)
		return;

	/* During lookup routes are evaluated in order, so we need to
	 * make sure upper bounds are assigned from the first sibling
	 * onwards.
	 */
	first = rt6_multipath_first_sibling(rt);
	if (WARN_ON_ONCE(!first))
		return;

	total = rt6_multipath_total_weight(first);
	rt6_multipath_upper_bound_set(first, total);
}
static int fib6_ifup(struct fib6_info *rt, void *p_arg)
{
	const struct arg_netdev_event *arg = p_arg;
	struct net *net = dev_net(arg->dev);

	if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
		rt->fib6_nh.nh_flags &= ~arg->nh_flags;
		fib6_update_sernum_upto_root(net, rt);
		rt6_multipath_rebalance(rt);
	}

	return 0;
}
void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.nh_flags = nh_flags,
		},
	};

	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
		arg.nh_flags |= RTNH_F_LINKDOWN;

	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
}
static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
				   const struct net_device *dev)
{
	struct fib6_info *iter;

	if (rt->fib6_nh.nh_dev == dev)
		return true;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		if (iter->fib6_nh.nh_dev == dev)
			return true;

	return false;
}
static void rt6_multipath_flush(struct fib6_info *rt)
{
	struct fib6_info *iter;

	rt->should_flush = 1;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		iter->should_flush = 1;
}
static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
					     const struct net_device *down_dev)
{
	struct fib6_info *iter;
	unsigned int dead = 0;

	if (rt->fib6_nh.nh_dev == down_dev ||
	    rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		dead++;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		if (iter->fib6_nh.nh_dev == down_dev ||
		    iter->fib6_nh.nh_flags & RTNH_F_DEAD)
			dead++;

	return dead;
}
static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
				       const struct net_device *dev,
				       unsigned int nh_flags)
{
	struct fib6_info *iter;

	if (rt->fib6_nh.nh_dev == dev)
		rt->fib6_nh.nh_flags |= nh_flags;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		if (iter->fib6_nh.nh_dev == dev)
			iter->fib6_nh.nh_flags |= nh_flags;
}
/* called with write lock held for table with rt */
static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
{
	const struct arg_netdev_event *arg = p_arg;
	const struct net_device *dev = arg->dev;
	struct net *net = dev_net(dev);

	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	switch (arg->event) {
	case NETDEV_UNREGISTER:
		return rt->fib6_nh.nh_dev == dev ? -1 : 0;
	case NETDEV_DOWN:
		if (rt->should_flush)
			return -1;
		if (!rt->fib6_nsiblings)
			return rt->fib6_nh.nh_dev == dev ? -1 : 0;
		if (rt6_multipath_uses_dev(rt, dev)) {
			unsigned int count;

			count = rt6_multipath_dead_count(rt, dev);
			if (rt->fib6_nsiblings + 1 == count) {
				rt6_multipath_flush(rt);
				return -1;
			}
			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
						   RTNH_F_LINKDOWN);
			fib6_update_sernum(net, rt);
			rt6_multipath_rebalance(rt);
		}
		return -2;
	case NETDEV_CHANGE:
		if (rt->fib6_nh.nh_dev != dev ||
		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
			break;
		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
		rt6_multipath_rebalance(rt);
		break;
	}

	return 0;
}
*dev
, unsigned long event
)
4056 struct arg_netdev_event arg
= {
4062 struct net
*net
= dev_net(dev
);
4064 if (net
->ipv6
.sysctl
.skip_notify_on_dev_down
)
4065 fib6_clean_all_skip_notify(net
, fib6_ifdown
, &arg
);
4067 fib6_clean_all(net
, fib6_ifdown
, &arg
);
void rt6_disable_ip(struct net_device *dev, unsigned long event)
{
	rt6_sync_down_dev(dev, event);
	rt6_uncached_list_flush_dev(dev_net(dev), dev);
	neigh_ifdown(&nd_tbl, dev);
}
struct rt6_mtu_change_arg {
	struct net_device *dev;
	unsigned int mtu;
};
static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e. jumbo frame)
	 */
	if (rt->fib6_nh.nh_dev == arg->dev &&
	    !fib6_metric_locked(rt, RTAX_MTU)) {
		u32 mtu = rt->fib6_pmtu;

		if (mtu >= arg->mtu ||
		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
			fib6_metric_set(rt, RTAX_MTU, arg->mtu);

		spin_lock_bh(&rt6_exception_lock);
		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}
void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
{
	struct rt6_mtu_change_arg arg = {
		.dev = dev,
		.mtu = mtu,
	};

	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
}
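/* Sketch of a caller (illustrative): addrconf's netdev notifier invokes
 * rt6_mtu_change() when a device's MTU changes (NETDEV_CHANGEMTU) so that
 * routes and cached PMTU exceptions tied to the device follow an
 * administrative MTU change, per the RFC 1981 note in
 * rt6_mtu_change_route() above.
 */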
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]		= { .len = sizeof(struct in6_addr) },
	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
	[RTA_OIF]		= { .type = NLA_U32 },
	[RTA_IIF]		= { .type = NLA_U32 },
	[RTA_PRIORITY]		= { .type = NLA_U32 },
	[RTA_METRICS]		= { .type = NLA_NESTED },
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]		= { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
	[RTA_ENCAP]		= { .type = NLA_NESTED },
	[RTA_EXPIRES]		= { .type = NLA_U32 },
	[RTA_UID]		= { .type = NLA_U32 },
	[RTA_MARK]		= { .type = NLA_U32 },
	[RTA_TABLE]		= { .type = NLA_U32 },
	[RTA_IP_PROTO]		= { .type = NLA_U8 },
	[RTA_SPORT]		= { .type = NLA_U16 },
	[RTA_DPORT]		= { .type = NLA_U16 },
};
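/* Illustrative mapping (not from the original source): a request like
 *
 *	ip -6 route add 2001:db8::/64 via fe80::1 dev eth0 metric 512
 *
 * arrives as an RTM_NEWROUTE message whose rtmsg carries rtm_dst_len = 64
 * and whose attributes include RTA_DST (2001:db8::), RTA_GATEWAY (fe80::1),
 * RTA_OIF (eth0's ifindex) and RTA_PRIORITY (512), all validated against
 * the policy table above before rtm_to_fib6_config() copies them out.
 */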
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg,
			      struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);

	*cfg = (struct fib6_config){
		.fc_table = rtm->rtm_table,
		.fc_dst_len = rtm->rtm_dst_len,
		.fc_src_len = rtm->rtm_src_len,
		.fc_flags = RTF_UP,
		.fc_protocol = rtm->rtm_protocol,
		.fc_type = rtm->rtm_type,

		.fc_nlinfo.portid = NETLINK_CB(skb).portid,
		.fc_nlinfo.nlh = nlh,
		.fc_nlinfo.nl_net = sock_net(skb->sk),
	};

	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}
	if (tb[RTA_VIA]) {
		NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
		goto errout;
	}

	if (tb[RTA_DST]) {
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
						     cfg->fc_mp_len, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_PREF]) {
		pref = nla_get_u8(tb[RTA_PREF]);
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

	if (tb[RTA_ENCAP_TYPE]) {
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_EXPIRES]) {
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
		}
	}

	err = 0;
errout:
	return err;
}
struct rt6_nh {
	struct fib6_info *fib6_info;
	struct fib6_config r_cfg;
	struct list_head next;
};
static int ip6_route_info_append(struct net *net,
				 struct list_head *rt6_nh_list,
				 struct fib6_info *rt,
				 struct fib6_config *r_cfg)
{
	struct rt6_nh *nh;
	int err = -EEXIST;

	list_for_each_entry(nh, rt6_nh_list, next) {
		/* check if fib6_info already exists */
		if (rt6_duplicate_nexthop(nh->fib6_info, rt))
			return err;
	}

	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
	if (!nh)
		return -ENOMEM;
	nh->fib6_info = rt;
	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
	list_add_tail(&nh->next, rt6_nh_list);

	return 0;
}
static void ip6_route_mpath_notify(struct fib6_info *rt,
				   struct fib6_info *rt_last,
				   struct nl_info *info,
				   __u16 nlflags)
{
	/* if this is an APPEND route, then rt points to the first route
	 * inserted and rt_last points to last route inserted. Userspace
	 * wants a consistent dump of the route which starts at the first
	 * nexthop. Since sibling routes are always added at the end of
	 * the list, find the first sibling of the last route appended
	 */
	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
		rt = list_first_entry(&rt_last->fib6_siblings,
				      struct fib6_info,
				      fib6_siblings);
	}

	if (rt)
		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
}
static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct fib6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	__u16 nlflags;
	int remaining;
	int attrlen;
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);

	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * fib6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}
		if (!rt6_qualify_for_ecmp(rt)) {
			err = -EINVAL;
			NL_SET_ERR_MSG(extack,
				       "Device only routes can not be added for IPv6 using the multipath API.");
			fib6_info_release(rt);
			goto cleanup;
		}

		rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;

		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
					    rt, &r_cfg);
		if (err) {
			fib6_info_release(rt);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* for add and replace send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		err = __ip6_ins_rt(nh->fib6_info, info, extack);
		fib6_info_release(nh->fib6_info);

		if (!err) {
			/* save reference to last route successfully inserted */
			rt_last = nh->fib6_info;

			/* save reference to first route for notification */
			if (!rt_notif)
				rt_notif = nh->fib6_info;
		}

		/* nh->fib6_info is used or freed at this point, reset to NULL*/
		nh->fib6_info = NULL;
		if (err) {
			if (replace && nhn)
				NL_SET_ERR_MSG_MOD(extack,
						   "multipath route replace failed (check consistency of installed routes)");
			err_nh = nh;
			goto add_errout;
		}

		/* Because each route is added like a single route we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		nhn++;
	}

	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	goto cleanup;

add_errout:
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg, extack);
	}

cleanup:
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->fib6_info)
			fib6_info_release(nh->fib6_info);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}
*cfg
,
4478 struct netlink_ext_ack
*extack
)
4480 struct fib6_config r_cfg
;
4481 struct rtnexthop
*rtnh
;
4484 int err
= 1, last_err
= 0;
4486 remaining
= cfg
->fc_mp_len
;
4487 rtnh
= (struct rtnexthop
*)cfg
->fc_mp
;
4489 /* Parse a Multipath Entry */
4490 while (rtnh_ok(rtnh
, remaining
)) {
4491 memcpy(&r_cfg
, cfg
, sizeof(*cfg
));
4492 if (rtnh
->rtnh_ifindex
)
4493 r_cfg
.fc_ifindex
= rtnh
->rtnh_ifindex
;
4495 attrlen
= rtnh_attrlen(rtnh
);
4497 struct nlattr
*nla
, *attrs
= rtnh_attrs(rtnh
);
4499 nla
= nla_find(attrs
, attrlen
, RTA_GATEWAY
);
4501 nla_memcpy(&r_cfg
.fc_gateway
, nla
, 16);
4502 r_cfg
.fc_flags
|= RTF_GATEWAY
;
4505 err
= ip6_route_del(&r_cfg
, extack
);
4509 rtnh
= rtnh_next(rtnh
, &remaining
);
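/* Illustrative request for the loop above (gateways are examples only):
 *
 *	ip -6 route del default \
 *		nexthop via fe80::1 dev eth0 \
 *		nexthop via fe80::2 dev eth1
 *
 * walks both rtnexthop entries, building an r_cfg per hop; each hop is
 * removed via ip6_route_del() and the last failure, if any, is returned.
 */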
static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct fib6_config cfg;
	int err;

	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
	if (err < 0)
		return err;

	if (cfg.fc_mp)
		return ip6_route_multipath_del(&cfg, extack);

	cfg.fc_delete_all_nh = 1;
	return ip6_route_del(&cfg, extack);
}
static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct fib6_config cfg;
	int err;

	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
	if (err < 0)
		return err;

	if (cfg.fc_mp)
		return ip6_route_multipath_add(&cfg, extack);

	return ip6_route_add(&cfg, GFP_KERNEL, extack);
}
static size_t rt6_nlmsg_size(struct fib6_info *rt)
{
	int nexthop_len = 0;

	if (rt->fib6_nsiblings) {
		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
			    + NLA_ALIGN(sizeof(struct rtnexthop))
			    + nla_total_size(16) /* RTA_GATEWAY */
			    + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);

		nexthop_len *= rt->fib6_nsiblings;
	}

	return NLMSG_ALIGN(sizeof(struct rtmsg))
	       + nla_total_size(16) /* RTA_SRC */
	       + nla_total_size(16) /* RTA_DST */
	       + nla_total_size(16) /* RTA_GATEWAY */
	       + nla_total_size(16) /* RTA_PREFSRC */
	       + nla_total_size(4) /* RTA_TABLE */
	       + nla_total_size(4) /* RTA_IIF */
	       + nla_total_size(4) /* RTA_OIF */
	       + nla_total_size(4) /* RTA_PRIORITY */
	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
	       + nla_total_size(sizeof(struct rta_cacheinfo))
	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
	       + nla_total_size(1) /* RTA_PREF */
	       + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
	       + nexthop_len;
}
static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
			    unsigned int *flags, bool skip_oif)
{
	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		*flags |= RTNH_F_DEAD;

	if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
		*flags |= RTNH_F_LINKDOWN;

		rcu_read_lock();
		if (fib6_ignore_linkdown(rt))
			*flags |= RTNH_F_DEAD;
		rcu_read_unlock();
	}

	if (rt->fib6_flags & RTF_GATEWAY) {
		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
			goto nla_put_failure;
	}

	*flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
	if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
		*flags |= RTNH_F_OFFLOAD;

	/* not needed for multipath encoding b/c it has a rtnexthop struct */
	if (!skip_oif && rt->fib6_nh.nh_dev &&
	    nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
		goto nla_put_failure;

	if (rt->fib6_nh.nh_lwtstate &&
	    lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
4619 static int rt6_add_nexthop(struct sk_buff
*skb
, struct fib6_info
*rt
)
4621 const struct net_device
*dev
= rt
->fib6_nh
.nh_dev
;
4622 struct rtnexthop
*rtnh
;
4623 unsigned int flags
= 0;
4625 rtnh
= nla_reserve_nohdr(skb
, sizeof(*rtnh
));
4627 goto nla_put_failure
;
4629 rtnh
->rtnh_hops
= rt
->fib6_nh
.nh_weight
- 1;
4630 rtnh
->rtnh_ifindex
= dev
? dev
->ifindex
: 0;
4632 if (rt6_nexthop_info(skb
, rt
, &flags
, true) < 0)
4633 goto nla_put_failure
;
4635 rtnh
->rtnh_flags
= flags
;
4637 /* length of rtnetlink header + attributes */
4638 rtnh
->rtnh_len
= nlmsg_get_pos(skb
) - (void *)rtnh
;
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	struct rt6_info *rt6 = (struct rt6_info *)dst;
	struct rt6key *rt6_dst, *rt6_src;
	u32 *pmetrics, table, rt6_flags;
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;
	long expires = 0;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	if (rt6) {
		rt6_dst = &rt6->rt6i_dst;
		rt6_src = &rt6->rt6i_src;
		rt6_flags = rt6->rt6i_flags;
	} else {
		rt6_dst = &rt->fib6_dst;
		rt6_src = &rt->fib6_src;
		rt6_flags = rt->fib6_flags;
	}

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt6_dst->plen;
	rtm->rtm_src_len = rt6_src->plen;
	rtm->rtm_tos = 0;
	if (rt->fib6_table)
		table = rt->fib6_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;

	rtm->rtm_type = rt->fib6_type;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->fib6_protocol;

	if (rt6_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	if (dest) {
		if (nla_put_in6_addr(skb, RTA_DST, dest))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dest) {
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->fib6_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->fib6_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt6) {
		if (rt6_flags & RTF_GATEWAY &&
		    nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
			goto nla_put_failure;

		if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
			goto nla_put_failure;
	} else if (rt->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (rt6_add_nexthop(skb, rt) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings, fib6_siblings) {
			if (rt6_add_nexthop(skb, sibling) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
			goto nla_put_failure;
	}

	if (rt6_flags & RTF_EXPIRES) {
		expires = dst ? dst->expires : rt->expires;
		expires -= jiffies;
	}

	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
static bool fib6_info_uses_dev(const struct fib6_info *f6i,
			       const struct net_device *dev)
{
	if (f6i->fib6_nh.nh_dev == dev)
		return true;

	if (f6i->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;

		list_for_each_entry_safe(sibling, next_sibling,
					 &f6i->fib6_siblings, fib6_siblings) {
			if (sibling->fib6_nh.nh_dev == dev)
				return true;
		}
	}

	return false;
}
int rt6_dump_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
	struct fib_dump_filter *filter = &arg->filter;
	unsigned int flags = NLM_F_MULTI;
	struct net *net = arg->net;

	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	if ((filter->flags & RTM_F_PREFIX) &&
	    !(rt->fib6_flags & RTF_PREFIX_RT)) {
		/* success since this is not a prefix route */
		return 1;
	}
	if (filter->filter_set) {
		if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
		    (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
		    (filter->protocol && rt->fib6_protocol != filter->protocol)) {
			return 1;
		}
		flags |= NLM_F_DUMP_FILTERED;
	}

	return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
			     RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
			     arg->cb->nlh->nlmsg_seq, flags);
}
static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
					const struct nlmsghdr *nlh,
					struct nlattr **tb,
					struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	int i, err;

	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
		NL_SET_ERR_MSG_MOD(extack,
				   "Invalid header for get route request");
		return -EINVAL;
	}

	if (!netlink_strict_get_check(skb))
		return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX,
				   rtm_ipv6_policy, extack);

	rtm = nlmsg_data(nlh);
	if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
	    rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
	    rtm->rtm_type) {
		NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
		return -EINVAL;
	}
	if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
		NL_SET_ERR_MSG_MOD(extack,
				   "Invalid flags for get route request");
		return -EINVAL;
	}

	err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
				 rtm_ipv6_policy, extack);
	if (err)
		return err;

	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
		NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
		return -EINVAL;
	}

	for (i = 0; i <= RTA_MAX; i++) {
		if (!tb[i])
			continue;

		switch (i) {
		case RTA_SRC:
		case RTA_DST:
		case RTA_IIF:
		case RTA_OIF:
		case RTA_MARK:
		case RTA_UID:
		case RTA_SPORT:
		case RTA_DPORT:
		case RTA_IP_PROTO:
			break;
		default:
			NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
			return -EINVAL;
		}
	}

	return 0;
}
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct fib6_info *from;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6 = {};
	bool fibmatch;

	err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (tb[RTA_SPORT])
		fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);

	if (tb[RTA_DPORT])
		fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);

	if (tb[RTA_IP_PROTO]) {
		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
						  &fl6.flowi6_proto, AF_INET6,
						  extack);
		if (err)
			goto errout;
	}

	if (iif) {
		struct net_device *dev;
		int flags = 0;

		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			rcu_read_unlock();
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);

		rcu_read_unlock();
	} else {
		fl6.flowi6_oif = oif;

		dst = ip6_route_output(net, NULL, &fl6);
	}

	rt = container_of(dst, struct rt6_info, dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	skb_dst_set(skb, &rt->dst);

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	if (from) {
		if (fibmatch)
			err = rt6_fill_node(net, skb, from, NULL, NULL, NULL,
					    iif, RTM_NEWROUTE,
					    NETLINK_CB(in_skb).portid,
					    nlh->nlmsg_seq, 0);
		else
			err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
					    &fl6.saddr, iif, RTM_NEWROUTE,
					    NETLINK_CB(in_skb).portid,
					    nlh->nlmsg_seq, 0);
	} else {
		err = -ENETUNREACH;
	}
	rcu_read_unlock();

	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}
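
/* Broadcast a route change to RTNLGRP_IPV6_ROUTE listeners.
 * rt6_nlmsg_size() must account for every attribute rt6_fill_node()
 * can emit; the WARN_ON below fires if the fill ever outgrows the
 * allocation (-EMSGSIZE).
 */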
void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
		     unsigned int nlm_flags)
{
	struct sk_buff *skb;
	struct net *net = info->nl_net;
	u32 seq;
	int err;

	err = -ENOBUFS;
	seq = info->nlh ? info->nlh->nlmsg_seq : 0;

	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
	if (!skb)
		goto errout;

	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
			    event, info->portid, seq, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
errout:
	rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}
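
/* The per-netns null/prohibit/blackhole template routes are created
 * before the loopback device exists, so this notifier points them at
 * loopback on NETDEV_REGISTER and drops the references again when
 * loopback unregisters.
 */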
static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	} else if (event == NETDEV_UNREGISTER &&
		   dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER could be fired for multiple times by
		 * netdev_wait_allrefs(). Make sure we only call this once.
		 */
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}
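
/* /proc/net/rt6_stats: seven hex fields, in the order printed below:
 * fib nodes, route nodes, rt6_info allocs, route entries, cached
 * routes, dst entries (slow count), discarded routes.
 */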
#ifdef CONFIG_PROC_FS
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;

	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}
#endif	/* CONFIG_PROC_FS */
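
/* Handler for net.ipv6.route.flush: any write triggers an immediate
 * fib6 garbage-collection pass. Illustrative usage from userspace:
 *
 *   # sysctl -w net.ipv6.route.flush=1
 *
 * Note the handler snapshots flush_delay before proc_dointvec() stores
 * the new value, so the GC expiry argument comes from the previous
 * write.
 */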
#ifdef CONFIG_SYSCTL

static
int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
			      void __user *buffer, size_t *lenp, loff_t *ppos)
{
	struct net *net;
	int delay;
	int ret;

	if (!write)
		return -EINVAL;

	net = (struct net *)ctl->extra1;
	delay = net->ipv6.sysctl.flush_delay;
	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
	if (ret)
		return ret;

	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
	return 0;
}
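
/* Template for the per-netns net.ipv6.route.* sysctl table. The .data
 * pointers reference init_net here; ipv6_route_sysctl_init() below
 * duplicates the template and rewires each entry by index, so entry
 * order must stay in sync with that function.
 */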
static struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	= "flush",
		.data		= &init_net.ipv6.sysctl.flush_delay,
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv6_sysctl_rtcache_flush
	},
	{
		.procname	= "gc_thresh",
		.data		= &ip6_dst_ops_template.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_min_interval",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "skip_notify_on_dev_down",
		.data		= &init_net.ipv6.sysctl.skip_notify_on_dev_down,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};
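
/* Per-netns instantiation of the template above; the table[N] indices
 * must match the template entry order. The "flush" entry is hidden
 * from network namespaces not owned by the init user namespace.
 */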
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
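
/* Per-netns bring-up: each namespace gets its own dst_ops and its own
 * copies of the special null (and, with CONFIG_IPV6_MULTIPLE_TABLES,
 * prohibit and blackhole) routes, duplicated from the static templates,
 * plus the default GC tunables set below.
 */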
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
	net->ipv6.sysctl.skip_notify_on_dev_down = 0;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
			sizeof(struct ipv6_route_iter));
	proc_create_net_single("rt6_stats", 0444, net->proc_net,
			       rt6_stats_seq_show, NULL);
#endif
	return 0;
}

static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};

static int __net_init ipv6_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv6.peers = bp;
	return 0;
}

static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	net->ipv6.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static struct pernet_operations ipv6_inetpeer_ops = {
	.init = ipv6_inetpeer_init,
	.exit = ipv6_inetpeer_exit,
};

static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};

static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
void __init ip6_route_init_special_entries(void)
{
	/* Registering of the loopback is done before this portion of code,
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for init_net */
	init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#endif
}
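
/* Subsystem bring-up: the dst kmem cache and blackhole dst ops come
 * first, then the pernet subsystems and rtnetlink handlers; the error
 * unwind below releases everything in reverse order.
 */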
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
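
/* Module teardown: mirrors ip6_route_init(), releasing resources in
 * reverse order of their registration.
 */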
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}