/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable.  otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <net/ip.h>
#include <linux/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

static int ip6_rt_type_to_error(u8 fib6_type);

#define CREATE_TRACE_POINTS
#include <trace/events/fib6.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
#undef CREATE_TRACE_POINTS
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};

static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
static unsigned int	 ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
static int		 ip6_dst_gc(struct dst_ops *ops);

static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int		ip6_pkt_prohibit(struct sk_buff *skb);
static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct fib6_info *rt);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_add_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev,
					    unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev);
#endif

struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}

static inline const void *choose_neigh_daddr(const struct in6_addr *p,
					     struct sk_buff *skb,
					     const void *daddr)
{
	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}

struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
{
	struct neighbour *n;

	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	if (n)
		return n;

	n = neigh_create(&nd_tbl, daddr, dev);
	return IS_ERR(n) ? NULL : n;
}

static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
}

static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}

static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	dst_cow_metrics_generic,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};

static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
};

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol  = RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};

static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif

static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}

/* allocate dst with ip6_dst_ops */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);

static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	ip_dst_metrics_put(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	from = xchg((__force struct fib6_info **)&rt->from, NULL);
	fib6_info_release(from);
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}

static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}

struct fib6_info *fib6_multipath_select(const struct net *net,
					struct fib6_info *match,
					struct flowi6 *fl6, int oif,
					const struct sk_buff *skb,
					int strict)
{
	struct fib6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}
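
/* Selection note: each sibling carries a precomputed threshold in
 * fib6_nh.nh_upper_bound, so taking the first sibling whose bound is
 * >= fl6->mp_hash implements hash-threshold next-hop selection in the
 * style of RFC 2992 without re-walking the weights on every lookup.
 */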
/*
 *	Route lookup. rcu_read_lock() should be held.
 */

static inline struct fib6_info *rt6_device_match(struct net *net,
						 struct fib6_info *rt,
						 const struct in6_addr *saddr,
						 int oif,
						 int flags)
{
	struct fib6_info *sprt;

	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
		const struct net_device *dev = sprt->fib6_nh.nh_dev;

		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE)
		return net->ipv6.fib6_null_entry;

	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
}

#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}

static void rt6_probe(struct fib6_info *rt)
{
	struct __rt6_probe_work *work = NULL;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;
	struct inet6_dev *idev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
		return;

	nh_gw = &rt->fib6_nh.nh_gw;
	dev = rt->fib6_nh.nh_dev;
	rcu_read_lock_bh();
	idev = __in6_dev_get(dev);
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else if (time_after(jiffies, rt->last_probe +
				       idev->cnf.rtr_probe_interval)) {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		rt->last_probe = jiffies;
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct fib6_info *rt)
{
}
#endif

/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
static inline int rt6_check_dev(struct fib6_info *rt, int oif)
{
	const struct net_device *dev = rt->fib6_nh.nh_dev;

	if (!oif || dev->ifindex == oif)
		return 2;
	return 0;
}

static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	if (rt->fib6_flags & RTF_NONEXTHOP ||
	    !(rt->fib6_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
					  &rt->fib6_nh.nh_gw);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}

static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}

/* called with rcu_read_lock held */
static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
{
	const struct net_device *dev = fib6_info_nh_dev(f6i);
	bool rc = false;

	if (dev) {
		const struct inet6_dev *idev = __in6_dev_get(dev);

		rc = !!idev->cnf.ignore_routes_with_linkdown;
	}

	return rc;
}

static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
				    int *mpri, struct fib6_info *match,
				    bool *do_rr)
{
	int m;
	bool match_do_rr = false;

	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		goto out;

	if (fib6_ignore_linkdown(rt) &&
	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (fib6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}

static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				      struct fib6_info *leaf,
				      struct fib6_info *rr_head,
				      u32 metric, int oif, int strict,
				      bool *do_rr)
{
	struct fib6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}

static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
				    int oif, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.fib6_null_entry;
}
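
/* Round-robin note: rt6_select() keeps handing out the router cached in
 * fn->rr_ptr while it stays (probably) reachable; only when find_rr_leaf()
 * reports RT6_NUD_FAIL_DO_RR is rr_ptr rotated to the next route of equal
 * metric, which yields the behaviour described in the header comment.
 */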
static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
{
	return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
}

#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
#endif

/*
 *	Misc support functions
 */

/* called with rcu_lock held */
static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
{
	struct net_device *dev = rt->fib6_nh.nh_dev;

	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}

static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};

static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}

static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
{
	unsigned short flags = 0;

	if (rt->dst_nocount)
		flags |= DST_NOCOUNT;
	if (rt->dst_nopolicy)
		flags |= DST_NOPOLICY;
	if (rt->dst_host)
		flags |= DST_HOST;

	return flags;
}

static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);

	switch (ort->fib6_type) {
	case RTN_BLACKHOLE:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}

static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
{
	if (ort->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, ort);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (ort->fib6_nh.nh_lwtstate) {
		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}

/* Caller must already hold reference to @from */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	rcu_assign_pointer(rt->from, from);
	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
}

/* Caller must already hold reference to @ort */
static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
{
	struct net_device *dev = fib6_info_nh_dev(ort);

	ip6_rt_init_dst(rt, ort);

	rt->rt6i_dst = ort->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
	rt->rt6i_flags = ort->fib6_flags;
	rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->fib6_src;
#endif
}

static struct fib6_node *fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}

static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
			  bool null_fallback)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (null_fallback) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}

/* called with rcu_lock held */
static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rt6_info *nrt;

	if (!fib6_info_hold_safe(rt))
		goto fallback;

	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (!nrt) {
		fib6_info_release(rt);
		goto fallback;
	}

	ip6_rt_copy_init(nrt, rt);
	return nrt;

fallback:
	nrt = dev_net(dev)->ipv6.ip6_null_entry;
	dst_hold(&nrt->dst);
	return nrt;
}

static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_info *f6i;
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	f6i = rcu_dereference(fn->leaf);
	if (!f6i) {
		f6i = net->ipv6.fib6_null_entry;
	} else {
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
				       fl6->flowi6_oif, flags);
		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
			f6i = fib6_multipath_select(net, f6i, fl6,
						    fl6->flowi6_oif, skb,
						    flags);
	}
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);
	} else if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = ip6_create_rt_rcu(f6i);
	}

	rcu_read_unlock();

	return rt;
}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);

struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);

/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason,
 * the route is released.
 * Caller must hold dst before calling it.
 */
static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = {	.nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}

static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (!fib6_info_hold_safe(ort))
		return NULL;

	dev = ip6_rt_get_dev_rcu(ort);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt) {
		fib6_info_release(ort);
		return NULL;
	}

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}

static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	if (!fib6_info_hold_safe(rt))
		return NULL;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt) {
		fib6_info_release(rt);
		return NULL;
	}
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt, false);

	return pcpu_rt;
}

static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	if (rt->fib6_destroying) {
		struct fib6_info *from;

		from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL);
		fib6_info_release(from);
	}

	return pcpu_rt;
}
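
/* Caching note: the per-cpu dst is installed lock-free; cmpxchg() can only
 * succeed for the first writer on this CPU (hence the BUG_ON(prev)), and
 * the fib6_destroying re-check drops the 'from' reference again when the
 * fib entry is concurrently being torn down.
 */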
/* exception hash table implementation
 */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct fib6_info *from;
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	net->ipv6.rt6_stats->fib_rt_cache--;

	/* purge completely the exception to allow releasing the held resources:
	 * some [sk] cache may keep the dst around for unlimited time
	 */
	from = xchg((__force struct fib6_info **)&rt6_ex->rt6i->from, NULL);
	fib6_info_release(from);
	dst_dev_put(&rt6_ex->rt6i->dst);

	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
}

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}

static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif

	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
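
/* Hashing note: exceptions (PMTU and redirect clones) are bucketed by a
 * jhash of the destination (plus the source under CONFIG_IPV6_SUBTREES)
 * mixed with a once-per-boot random seed, then folded down to
 * FIB6_EXCEPTION_BUCKET_SIZE_SHIFT bits; the seed keeps bucket placement
 * unpredictable to remote senders.
 */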
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

static unsigned int fib6_mtu(const struct fib6_info *rt)
{
	unsigned int mtu;

	if (rt->fib6_pmtu) {
		mtu = rt->fib6_pmtu;
	} else {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
}

static int rt6_insert_exception(struct rt6_info *nrt,
				struct fib6_info *ort)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->fib6_table->tb6_lock);
		fib6_update_sernum(net, ort);
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}
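
/* Invalidation note: bumping the node's sernum here is what makes cookies
 * captured by sockets go stale, so ip6_dst_check() fails on their cached
 * dst and the next lookup can pick up the freshly inserted exception.
 */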
void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() from recreating the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}

/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	const struct in6_addr *src_key = NULL;
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * However, the src addr used to create the hash
	 * might not be exactly the passed in saddr which
	 * is a /128 addr from the flow.
	 * So we need to use f6i->fib6_src to redo lookup
	 * if the passed in saddr does not find anything.
	 * (See the logic in ip6_rt_cache_alloc() on how
	 * rt->rt6i_src is updated.)
	 */
	if (rt->fib6_src.plen)
		src_key = saddr;
find_ex:
#endif
	bucket = rcu_dereference(rt->rt6i_exception_bucket);
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

#ifdef CONFIG_IPV6_SUBTREES
	/* Use fib6_src as src_key and redo lookup */
	if (!res && src_key && src_key != &rt->fib6_src.addr) {
		src_key = &rt->fib6_src.addr;
		goto find_ex;
	}
#endif

	return res;
}

/* Remove the passed in cached rt from the hash table that contains it */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	from = rcu_dereference(rt->from);
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}

/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	if (!from || !(rt->rt6i_flags & RTF_CACHE))
		goto unlock;

	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

unlock:
	rcu_read_unlock();
}

static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
	 *
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * handle this.
	 */

	if (dst_mtu(&rt->dst) >= mtu)
		return true;

	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
		return true;

	return false;
}

static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}

#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}

static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others have still references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}

void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}

/* must be called with rcu lock held */
struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
				    int oif, struct flowi6 *fl6, int strict)
{
	struct fib6_node *fn, *saved_fn;
	struct fib6_info *f6i;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	f6i = rt6_select(net, fn, oif, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	return f6i;
}

struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_info *f6i;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	f6i = fib6_table_lookup(net, table, oif, fl6, strict);
	if (f6i->fib6_nsiblings)
		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);

	if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		return rt;
	}

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(f6i->fib6_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		return uncached_rt;
	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(f6i);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, f6i);

		local_bh_enable();
		rcu_read_unlock();

		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);

static struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}

struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6,
					 const struct sk_buff *skb,
					 int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);

static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowlabel(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}

/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}
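
/* Policy note: case 0 above hashes on L3 only (addresses, flow label and
 * protocol, taken from the inner packet for ICMPv6 errors so errors follow
 * the flow they refer to), while case 1 also mixes in L4 ports; the final
 * ">> 1" keeps the hash within the positive range that is compared against
 * fib6_nh.nh_upper_bound in fib6_multipath_select().
 */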
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);

	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}

static struct rt6_info *ip6_pol_route_output(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
}

struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	if (ipv6_addr_type(&fl6->daddr) &
	    (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);

struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new = &rt->dst;
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}

/*
 *	Destination cache support functions
 */

static bool fib6_check(struct fib6_info *f6i, u32 cookie)
{
	u32 rt_cookie = 0;

	if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
		return false;

	if (fib6_check_expired(f6i))
		return false;

	return true;
}

static struct dst_entry *rt6_check(struct rt6_info *rt,
				   struct fib6_info *from,
				   u32 cookie)
{
	u32 rt_cookie = 0;

	if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
	    rt_cookie != cookie)
		return NULL;

	if (rt6_check_expired(rt))
		return NULL;

	return &rt->dst;
}

static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
					    struct fib6_info *from,
					    u32 cookie)
{
	if (!__rt6_check_expired(rt) &&
	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
	    fib6_check(from, cookie))
		return &rt->dst;
	else
		return NULL;
}

static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct dst_entry *dst_ret;
	struct fib6_info *from;
	struct rt6_info *rt;

	rt = container_of(dst, struct rt6_info, dst);

	rcu_read_lock();

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	from = rcu_dereference(rt->from);

	if (from && (rt->rt6i_flags & RTF_PCPU ||
	    unlikely(!list_empty(&rt->rt6i_uncached))))
		dst_ret = rt6_dst_from_check(rt, from, cookie);
	else
		dst_ret = rt6_check(rt, from, cookie);

	rcu_read_unlock();

	return dst_ret;
}
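
/* Validation note: the cookie checked here is the fib6 node's sernum that
 * was captured when the dst was handed out; any later change to the subtree
 * (route add/del or a new exception) bumps the sernum, fails the check and
 * forces callers such as sk_dst_check() into a fresh route lookup.
 */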
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			rcu_read_lock();
			if (rt6_check_expired(rt)) {
				rt6_remove_exception_rt(rt);
				dst = NULL;
			}
			rcu_read_unlock();
		} else {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}

static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		rcu_read_lock();
		if (rt->rt6i_flags & RTF_CACHE) {
			rt6_remove_exception_rt(rt);
		} else {
			struct fib6_info *from;
			struct fib6_node *fn;

			from = rcu_dereference(rt->from);
			if (from) {
				fn = rcu_dereference(from->fib6_node);
				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
					fn->fn_sernum = -1;
			}
		}
		rcu_read_unlock();
	}
}

static void rt6_update_expires(struct rt6_info *rt0, int timeout)
{
	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
		struct fib6_info *from;

		rcu_read_lock();
		from = rcu_dereference(rt0->from);
		if (from)
			rt0->dst.expires = from->expires;
		rcu_read_unlock();
	}

	dst_set_expires(&rt0->dst, timeout);
	rt0->rt6i_flags |= RTF_EXPIRES;
}

static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
	struct net *net = dev_net(rt->dst.dev);

	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
	rt->rt6i_flags |= RTF_MODIFIED;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}

static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
{
	return !(rt->rt6i_flags & RTF_CACHE) &&
		(rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from));
}

static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}
	dst_confirm_neigh(dst, daddr);
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
	} else if (daddr) {
		struct fib6_info *from;
		struct rt6_info *nrt6;

		rcu_read_lock();
		from = rcu_dereference(rt6->from);
		if (!from) {
			rcu_read_unlock();
			return;
		}
		nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);
			if (rt6_insert_exception(nrt6, from))
				dst_release_immediate(&nrt6->dst);
		}
		rcu_read_unlock();
	}
}
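
/* PMTU note: a report against a shared (per-cpu or uncached) dst is never
 * applied in place; a private RTF_CACHE clone carrying the lowered MTU is
 * inserted into the exception table instead, matching the per-destination
 * path MTU caching model of RFC 8201.
 */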
2362 static void ip6_rt_update_pmtu(struct dst_entry
*dst
, struct sock
*sk
,
2363 struct sk_buff
*skb
, u32 mtu
)
2365 __ip6_rt_update_pmtu(dst
, sk
, skb
? ipv6_hdr(skb
) : NULL
, mtu
);
2368 void ip6_update_pmtu(struct sk_buff
*skb
, struct net
*net
, __be32 mtu
,
2369 int oif
, u32 mark
, kuid_t uid
)
2371 const struct ipv6hdr
*iph
= (struct ipv6hdr
*) skb
->data
;
2372 struct dst_entry
*dst
;
2373 struct flowi6 fl6
= {
2375 .flowi6_mark
= mark
? mark
: IP6_REPLY_MARK(net
, skb
->mark
),
2376 .daddr
= iph
->daddr
,
2377 .saddr
= iph
->saddr
,
2378 .flowlabel
= ip6_flowinfo(iph
),
2382 dst
= ip6_route_output(net
, NULL
, &fl6
);
2384 __ip6_rt_update_pmtu(dst
, NULL
, iph
, ntohl(mtu
));
2387 EXPORT_SYMBOL_GPL(ip6_update_pmtu
);
2389 void ip6_sk_update_pmtu(struct sk_buff
*skb
, struct sock
*sk
, __be32 mtu
)
2391 int oif
= sk
->sk_bound_dev_if
;
2392 struct dst_entry
*dst
;
2394 if (!oif
&& skb
->dev
)
2395 oif
= l3mdev_master_ifindex(skb
->dev
);
2397 ip6_update_pmtu(skb
, sock_net(sk
), mtu
, oif
, sk
->sk_mark
, sk
->sk_uid
);
2399 dst
= __sk_dst_get(sk
);
2400 if (!dst
|| !dst
->obsolete
||
2401 dst
->ops
->check(dst
, inet6_sk(sk
)->dst_cookie
))
2405 if (!sock_owned_by_user(sk
) && !ipv6_addr_v4mapped(&sk
->sk_v6_daddr
))
2406 ip6_datagram_dst_update(sk
, false);
2409 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu
);
2411 void ip6_sk_dst_store_flow(struct sock
*sk
, struct dst_entry
*dst
,
2412 const struct flowi6
*fl6
)
2414 #ifdef CONFIG_IPV6_SUBTREES
2415 struct ipv6_pinfo
*np
= inet6_sk(sk
);
2418 ip6_dst_store(sk
, dst
,
2419 ipv6_addr_equal(&fl6
->daddr
, &sk
->sk_v6_daddr
) ?
2420 &sk
->sk_v6_daddr
: NULL
,
2421 #ifdef CONFIG_IPV6_SUBTREES
2422 ipv6_addr_equal(&fl6
->saddr
, &np
->saddr
) ?
2428 /* Handle redirects */
2429 struct ip6rd_flowi
{
2431 struct in6_addr gateway
;
static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *ret = NULL, *rt_cache;
	struct fib6_info *rt;
	struct fib6_node *fn;

	/* l3mdev_update_flow overrides oif if the device is enslaved; in
	 * this case we must match on the real ingress device, so reset it
	 */
	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		fl6->flowi6_oif = skb->dev->ifindex;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;
		if (fib6_check_expired(rt))
			continue;
		if (rt->fib6_flags & RTF_REJECT)
			break;
		if (!(rt->fib6_flags & RTF_GATEWAY))
			continue;
		if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
			continue;
		/* rt_cache's gateway might be different from its 'parent'
		 * in the case of an ip redirect.
		 * So we keep searching in the exception table if the gateway
		 * is different.
		 */
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
			rt_cache = rt6_find_cached_rt(rt,
						      &fl6->daddr,
						      &fl6->saddr);
			if (rt_cache &&
			    ipv6_addr_equal(&rdfl->gateway,
					    &rt_cache->rt6i_gateway)) {
				ret = rt_cache;
				break;
			}
			continue;
		}
		break;
	}

	if (!rt)
		rt = net->ipv6.fib6_null_entry;
	else if (rt->fib6_flags & RTF_REJECT) {
		ret = net->ipv6.ip6_null_entry;
		goto out;
	}

	if (rt == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

out:
	if (ret)
		ip6_hold_safe(net, &ret, true);
	else
		ret = ip6_create_rt_rcu(rt);

	rcu_read_unlock();

	trace_fib6_table_lookup(net, rt, table, fl6);
	return ret;
};

static struct dst_entry *ip6_route_redirect(struct net *net,
					    const struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    const struct in6_addr *gateway)
{
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip6rd_flowi rdfl;

	rdfl.fl6 = *fl6;
	rdfl.gateway = *gateway;

	return fib6_rule_lookup(net, &rdfl.fl6, skb,
				flags, __ip6_route_redirect);
}

void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
		  kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_iif = LOOPBACK_IFINDEX,
		.flowi6_oif = oif,
		.flowi6_mark = mark,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_uid = uid,
	};

	dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_redirect);

void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_iif = LOOPBACK_IFINDEX,
		.flowi6_oif = oif,
		.daddr = msg->dest,
		.saddr = iph->daddr,
		.flowi6_uid = sock_net_uid(net, NULL),
	};

	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}

void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
		     sk->sk_uid);
}
EXPORT_SYMBOL_GPL(ip6_sk_redirect);

static unsigned int ip6_default_advmss(const struct dst_entry *dst)
{
	struct net_device *dev = dst->dev;
	unsigned int mtu = dst_mtu(dst);
	struct net *net = dev_net(dev);

	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);

	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;

	/*
	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
	 * IPV6_MAXPLEN is also valid and means: "any MSS,
	 * rely only on pmtu discovery"
	 */
	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
		mtu = IPV6_MAXPLEN;
	return mtu;
}

static unsigned int ip6_mtu(const struct dst_entry *dst)
{
	struct inet6_dev *idev;
	unsigned int mtu;

	mtu = dst_metric_raw(dst, RTAX_MTU);
	if (mtu)
		goto out;

	mtu = IPV6_MIN_MTU;

	rcu_read_lock();
	idev = __in6_dev_get(dst->dev);
	if (idev)
		mtu = idev->cnf.mtu6;
	rcu_read_unlock();

out:
	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}

/* MTU selection:
 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 *
 * based on ip6_dst_mtu_forward and exception logic of
 * rt6_find_cached_rt; called with rcu_read_lock
 */
u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
		      struct in6_addr *saddr)
{
	struct inet6_dev *idev;
	struct rt6_info *rt;
	u32 mtu = 0;

	if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
		mtu = f6i->fib6_pmtu;
		if (mtu)
			goto out;
	}

	rt = rt6_find_cached_rt(f6i, daddr, saddr);
	if (unlikely(rt)) {
		mtu = dst_metric_raw(&rt->dst, RTAX_MTU);
	} else {
		struct net_device *dev = fib6_info_nh_dev(f6i);

		mtu = IPV6_MIN_MTU;
		idev = __in6_dev_get(dev);
		if (idev && idev->cnf.mtu6 > mtu)
			mtu = idev->cnf.mtu6;
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
out:
	return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
}

struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		in6_dev_put(idev);
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output  = ip6_output;
	rt->rt6i_gateway  = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev     = idev;
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Add this dst into uncached_list so that rt6_disable_ip() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);
	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}

static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout >> 1;
out:
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire >> rt_elasticity;
	return entries > rt_max_size;
}

static struct rt6_info *ip6_nh_lookup_table(struct net *net,
					    struct fib6_config *cfg,
					    const struct in6_addr *gw_addr,
					    u32 tbid, int flags)
{
	struct flowi6 fl6 = {
		.flowi6_oif = cfg->fc_ifindex,
		.daddr = *gw_addr,
		.saddr = cfg->fc_prefsrc,
	};
	struct fib6_table *table;
	struct rt6_info *rt;

	table = fib6_get_table(net, tbid);
	if (!table)
		return NULL;

	if (!ipv6_addr_any(&cfg->fc_prefsrc))
		flags |= RT6_LOOKUP_F_HAS_SADDR;

	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);

	/* if table lookup failed, fall back to full lookup */
	if (rt == net->ipv6.ip6_null_entry) {
		ip6_rt_put(rt);
		rt = NULL;
	}

	return rt;
}

static int ip6_route_check_nh_onlink(struct net *net,
				     struct fib6_config *cfg,
				     const struct net_device *dev,
				     struct netlink_ext_ack *extack)
{
	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
	struct fib6_info *from;
	struct rt6_info *grt;
	int err;

	err = 0;
	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
	if (grt) {
		rcu_read_lock();
		from = rcu_dereference(grt->from);
		if (!grt->dst.error &&
		    /* ignore match if it is the default route */
		    from && !ipv6_addr_any(&from->fib6_dst.addr) &&
		    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop has invalid gateway or device mismatch");
			err = -EINVAL;
		}
		rcu_read_unlock();

		ip6_rt_put(grt);
	}

	return err;
}

static int ip6_route_check_nh(struct net *net,
			      struct fib6_config *cfg,
			      struct net_device **_dev,
			      struct inet6_dev **idev)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	struct net_device *dev = _dev ? *_dev : NULL;
	struct rt6_info *grt = NULL;
	int err = -EHOSTUNREACH;

	if (cfg->fc_table) {
		int flags = RT6_LOOKUP_F_IFACE;

		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
					  cfg->fc_table, flags);
		if (grt) {
			if (grt->rt6i_flags & RTF_GATEWAY ||
			    (dev && dev != grt->dst.dev)) {
				ip6_rt_put(grt);
				grt = NULL;
			}
		}
	}

	if (!grt)
		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);

	if (!grt)
		goto out;

	if (dev) {
		if (dev != grt->dst.dev) {
			ip6_rt_put(grt);
			goto out;
		}
	} else {
		*_dev = dev = grt->dst.dev;
		*idev = grt->rt6i_idev;
		dev_hold(dev);
		in6_dev_hold(grt->rt6i_idev);
	}

	if (!(grt->rt6i_flags & RTF_GATEWAY))
		err = 0;

	ip6_rt_put(grt);

out:
	return err;
}

static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
			   struct net_device **_dev, struct inet6_dev **idev,
			   struct netlink_ext_ack *extack)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	int gwa_type = ipv6_addr_type(gw_addr);
	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
	const struct net_device *dev = *_dev;
	bool need_addr_check = !dev;
	int err = -EINVAL;

	/* if gw_addr is local we will fail to detect this in case
	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
	 * will return already-added prefix route via interface that
	 * prefix route was assigned to, which might be non-loopback.
	 */
	if (dev &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
		/* IPv6 strictly inhibits using not link-local
		 * addresses as nexthop address.
		 * Otherwise, router will not able to send redirects.
		 * It is very good, but in some (rare!) circumstances
		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
		 * some exceptions. --ANK
		 * We allow IPv4-mapped nexthops to support RFC4798-type
		 * addressing
		 */
		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
			NL_SET_ERR_MSG(extack, "Invalid gateway address");
			goto out;
		}

		if (cfg->fc_flags & RTNH_F_ONLINK)
			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
		else
			err = ip6_route_check_nh(net, cfg, _dev, idev);

		if (err)
			goto out;
	}

	/* reload in case device was changed */
	dev = *_dev;

	err = -EINVAL;
	if (!dev) {
		NL_SET_ERR_MSG(extack, "Egress device not specified");
		goto out;
	} else if (dev->flags & IFF_LOOPBACK) {
		NL_SET_ERR_MSG(extack,
			       "Egress device can not be loopback device for this route");
		goto out;
	}

	/* if we did not check gw_addr above, do so now that the
	 * egress device has been resolved.
	 */
	if (need_addr_check &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	err = 0;
out:
	return err;
}

static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
					       gfp_t gfp_flags,
					       struct netlink_ext_ack *extack)
{
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct fib6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	struct fib6_table *table;
	int addr_type;
	int err = -EINVAL;

	/* RTF_PCPU is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_PCPU) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
		goto out;
	}

	/* RTF_CACHE is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_CACHE) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
		goto out;
	}

	if (cfg->fc_type > RTN_MAX) {
		NL_SET_ERR_MSG(extack, "Invalid route type");
		goto out;
	}

	if (cfg->fc_dst_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		goto out;
	}
	if (cfg->fc_src_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid source address length");
		goto out;
	}
#ifndef CONFIG_IPV6_SUBTREES
	if (cfg->fc_src_len) {
		NL_SET_ERR_MSG(extack,
			       "Specifying source address requires IPV6_SUBTREES to be enabled");
		goto out;
	}
#endif
	if (cfg->fc_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_metric == 0)
		cfg->fc_metric = IP6_RT_PRIO_USER;

	if (cfg->fc_flags & RTNH_F_ONLINK) {
		if (!dev) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop device required for onlink");
			err = -ENODEV;
			goto out;
		}

		if (!(dev->flags & IFF_UP)) {
			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
			err = -ENETDOWN;
			goto out;
		}
	}

	err = -ENOBUFS;
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
		if (!table) {
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}

	if (!table)
		goto out;

	err = -ENOMEM;
	rt = fib6_info_alloc(gfp_flags);
	if (!rt)
		goto out;

	rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
					       extack);
	if (IS_ERR(rt->fib6_metrics)) {
		err = PTR_ERR(rt->fib6_metrics);
		/* Do not leave garbage there. */
		rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
		goto out;
	}

	if (cfg->fc_flags & RTF_ADDRCONF)
		rt->dst_nocount = true;

	if (cfg->fc_flags & RTF_EXPIRES)
		fib6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
	else
		fib6_clean_expires(rt);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->fib6_protocol = cfg->fc_protocol;

	addr_type = ipv6_addr_type(&cfg->fc_dst);

	if (cfg->fc_encap) {
		struct lwtunnel_state *lwtstate;

		err = lwtunnel_build_state(cfg->fc_encap_type,
					   cfg->fc_encap, AF_INET6, cfg,
					   &lwtstate, extack);
		if (err)
			goto out;
		rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
	}

	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->fib6_dst.plen = cfg->fc_dst_len;
	if (rt->fib6_dst.plen == 128)
		rt->dst_host = true;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->fib6_src.plen = cfg->fc_src_len;
#endif

	rt->fib6_metric = cfg->fc_metric;
	rt->fib6_nh.nh_weight = 1;

	rt->fib6_type = cfg->fc_type;

	/* We cannot add true routes via loopback here,
	   they would result in kernel looping; promote them to reject routes
	 */
	if ((cfg->fc_flags & RTF_REJECT) ||
	    (dev && (dev->flags & IFF_LOOPBACK) &&
	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
	     !(cfg->fc_flags & RTF_LOCAL))) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
		goto install_route;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
		if (err)
			goto out;

		rt->fib6_nh.nh_gw = cfg->fc_gateway;
	}

	err = -ENODEV;
	if (!dev)
		goto out;

	if (idev->cnf.disable_ipv6) {
		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
		err = -EACCES;
		goto out;
	}

	if (!(dev->flags & IFF_UP)) {
		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
		err = -ENETDOWN;
		goto out;
	}

	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid source address");
			err = -EINVAL;
			goto out;
		}
		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
		rt->fib6_prefsrc.plen = 128;
	} else
		rt->fib6_prefsrc.plen = 0;

	rt->fib6_flags = cfg->fc_flags;

install_route:
	if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
	    !netif_carrier_ok(dev))
		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
	rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
	rt->fib6_nh.nh_dev = dev;
	rt->fib6_table = table;

	if (idev)
		in6_dev_put(idev);

	return rt;
out:
	if (dev)
		dev_put(dev);
	if (idev)
		in6_dev_put(idev);

	fib6_info_release(rt);
	return ERR_PTR(err);
}

int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
		  struct netlink_ext_ack *extack)
{
	struct fib6_info *rt;
	int err;

	rt = ip6_route_info_create(cfg, gfp_flags, extack);
	if (IS_ERR(rt))
		return PTR_ERR(rt);

	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
	fib6_info_release(rt);

	return err;
}

static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
{
	struct net *net = info->nl_net;
	struct fib6_table *table;
	int err;

	if (rt == net->ipv6.fib6_null_entry) {
		err = -ENOENT;
		goto out;
	}

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_del(rt, info);
	spin_unlock_bh(&table->tb6_lock);

out:
	fib6_info_release(rt);
	return err;
}

int ip6_del_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net };

	return __ip6_del_rt(rt, &info);
}

static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	int err = -ENOENT;

	if (rt == net->ipv6.fib6_null_entry)
		goto out_put;
	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);

	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
		struct fib6_info *sibling, *next_sibling;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			if (rt6_fill_node(net, skb, rt, NULL,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				kfree_skb(skb);
				skb = NULL;
			} else
				info->skip_notify = 1;
		}

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings,
					 fib6_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
		}
	}

	err = fib6_del(rt, info);
out_unlock:
	spin_unlock_bh(&table->tb6_lock);
out_put:
	fib6_info_release(rt);

	if (skb) {
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
	return err;
}

static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
{
	int rc = -ESRCH;

	if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
		goto out;

	if (cfg->fc_flags & RTF_GATEWAY &&
	    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
		goto out;

	rc = rt6_remove_exception_rt(rt);
out:
	return rc;
}

static int ip6_route_del(struct fib6_config *cfg,
			 struct netlink_ext_ack *extack)
{
	struct rt6_info *rt_cache;
	struct fib6_table *table;
	struct fib6_info *rt;
	struct fib6_node *fn;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table) {
		NL_SET_ERR_MSG(extack, "FIB table does not exist");
		return err;
	}

	rcu_read_lock();

	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len,
			 !(cfg->fc_flags & RTF_CACHE));

	if (fn) {
		for_each_fib6_node_rt_rcu(fn) {
			if (cfg->fc_flags & RTF_CACHE) {
				int rc;

				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
							      &cfg->fc_src);
				if (rt_cache) {
					rc = ip6_del_cached_rt(rt_cache, cfg);
					if (rc != -ESRCH) {
						rcu_read_unlock();
						return rc;
					}
				}
				continue;
			}
			if (cfg->fc_ifindex &&
			    (!rt->fib6_nh.nh_dev ||
			     rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
				continue;
			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
				continue;
			if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
				continue;
			if (!fib6_info_hold_safe(rt))
				continue;
			rcu_read_unlock();

			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				return __ip6_del_rt(rt, &cfg->fc_nlinfo);

			return __ip6_del_rt_siblings(rt, cfg);
		}
	}
	rcu_read_unlock();

	return err;
}

static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct netevent_redirect netevent;
	struct rt6_info *rt, *nrt = NULL;
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	struct fib6_info *from;
	struct rd_msg *msg;
	int optlen, on_link;
	u8 *lladdr;

	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
	optlen -= sizeof(*msg);

	if (optlen < 0) {
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
		return;
	}

	msg = (struct rd_msg *)icmp6_hdr(skb);

	if (ipv6_addr_is_multicast(&msg->dest)) {
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
		return;
	}

	on_link = 0;
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
		on_link = 1;
	} else if (ipv6_addr_type(&msg->target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
		return;
	}

	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}

	lladdr = NULL;
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

	rt = (struct rt6_info *) dst;
	if (rt->rt6i_flags & RTF_REJECT) {
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
		return;
	}

	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);

	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
	if (!neigh)
		return;

	/*
	 *	We have finally decided to accept it.
	 */

	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER)),
		     NDISC_REDIRECT, &ndopts);

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	if (!from)
		goto out;

	nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
	if (!nrt)
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

	/* rt6_insert_exception() will take care of duplicated exceptions */
	if (rt6_insert_exception(nrt, from)) {
		dst_release_immediate(&nrt->dst);
		goto out;
	}

	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
	netevent.daddr = &msg->dest;
	netevent.neigh = neigh;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

out:
	rcu_read_unlock();
	neigh_release(neigh);
}

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	int ifindex = dev->ifindex;
	struct fib6_node *fn;
	struct fib6_info *rt = NULL;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
	if (!fn)
		goto out;

	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.nh_dev->ifindex != ifindex)
			continue;
		if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
			continue;
		if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
			continue;
		if (!fib6_info_hold_safe(rt))
			continue;
		break;
	}
out:
	rcu_read_unlock();
	return rt;
}

static struct fib6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev,
					   unsigned int pref)
{
	struct fib6_config cfg = {
		.fc_metric	= IP6_RT_PRIO_USER,
		.fc_ifindex	= dev->ifindex,
		.fc_dst_len	= prefixlen,
		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
				  RTF_UP | RTF_PREF(pref),
		.fc_protocol = RTPROT_RA,
		.fc_type = RTN_UNICAST,
		.fc_nlinfo.portid = 0,
		.fc_nlinfo.nlh = NULL,
		.fc_nlinfo.nl_net = net,
	};

	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	cfg.fc_dst = *prefix;
	cfg.fc_gateway = *gwaddr;

	/* We should treat it as a default route if prefix length is 0. */
	if (!prefixlen)
		cfg.fc_flags |= RTF_DEFAULT;

	ip6_route_add(&cfg, GFP_ATOMIC, NULL);

	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
}
#endif

struct fib6_info *rt6_get_dflt_router(struct net *net,
				     const struct in6_addr *addr,
				     struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
	struct fib6_info *rt;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		if (dev == rt->fib6_nh.nh_dev &&
		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
			break;
	}
	if (rt && !fib6_info_hold_safe(rt))
		rt = NULL;
	rcu_read_unlock();
	return rt;
}

struct fib6_info *rt6_add_dflt_router(struct net *net,
				     const struct in6_addr *gwaddr,
				     struct net_device *dev,
				     unsigned int pref)
{
	struct fib6_config cfg = {
		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
		.fc_metric	= IP6_RT_PRIO_USER,
		.fc_ifindex	= dev->ifindex,
		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
		.fc_protocol = RTPROT_RA,
		.fc_type = RTN_UNICAST,
		.fc_nlinfo.portid = 0,
		.fc_nlinfo.nlh = NULL,
		.fc_nlinfo.nl_net = net,
	};

	cfg.fc_gateway = *gwaddr;

	if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
		struct fib6_table *table;

		table = fib6_get_table(dev_net(dev), cfg.fc_table);
		if (table)
			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
	}

	return rt6_get_dflt_router(net, gwaddr, dev);
}

static void __rt6_purge_dflt_routers(struct net *net,
				     struct fib6_table *table)
{
	struct fib6_info *rt;

restart:
	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;

		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!idev || idev->cnf.accept_ra != 2) &&
		    fib6_info_hold_safe(rt)) {
			rcu_read_unlock();
			ip6_del_rt(net, rt);
			goto restart;
		}
	}
	rcu_read_unlock();

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}

void rt6_purge_dflt_routers(struct net *net)
{
	struct fib6_table *table;
	struct hlist_head *head;
	unsigned int h;

	rcu_read_lock();

	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
		head = &net->ipv6.fib_table_hash[h];
		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
				__rt6_purge_dflt_routers(net, table);
		}
	}

	rcu_read_unlock();
}

static void rtmsg_to_fib6_config(struct net *net,
				 struct in6_rtmsg *rtmsg,
				 struct fib6_config *cfg)
{
	*cfg = (struct fib6_config){
		.fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
			 : RT6_TABLE_MAIN,
		.fc_ifindex = rtmsg->rtmsg_ifindex,
		.fc_metric = rtmsg->rtmsg_metric,
		.fc_expires = rtmsg->rtmsg_info,
		.fc_dst_len = rtmsg->rtmsg_dst_len,
		.fc_src_len = rtmsg->rtmsg_src_len,
		.fc_flags = rtmsg->rtmsg_flags,
		.fc_type = rtmsg->rtmsg_type,

		.fc_nlinfo.nl_net = net,

		.fc_dst = rtmsg->rtmsg_dst,
		.fc_src = rtmsg->rtmsg_src,
		.fc_gateway = rtmsg->rtmsg_gateway,
	};
}

int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
{
	struct fib6_config cfg;
	struct in6_rtmsg rtmsg;
	int err;

	switch (cmd) {
	case SIOCADDRT:		/* Add a route */
	case SIOCDELRT:		/* Delete a route */
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			return -EPERM;
		err = copy_from_user(&rtmsg, arg,
				     sizeof(struct in6_rtmsg));
		if (err)
			return -EFAULT;

		rtmsg_to_fib6_config(net, &rtmsg, &cfg);

		rtnl_lock();
		switch (cmd) {
		case SIOCADDRT:
			err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
			break;
		case SIOCDELRT:
			err = ip6_route_del(&cfg, NULL);
			break;
		default:
			err = -EINVAL;
		}
		rtnl_unlock();

		return err;
	}

	return -EINVAL;
}

/*
 *	Drop the packet on the floor
 */

static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net *net = dev_net(dst->dev);
	struct inet6_dev *idev;
	int type;

	if (netif_is_l3_master(skb->dev) &&
	    dst->dev == net->loopback_dev)
		idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
	else
		idev = ip6_dst_idev(dst);

	switch (ipstats_mib_noroutes) {
	case IPSTATS_MIB_INNOROUTES:
		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
		if (type == IPV6_ADDR_ANY) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
			break;
		}
		/* FALLTHROUGH */
	case IPSTATS_MIB_OUTNOROUTES:
		IP6_INC_STATS(net, idev, ipstats_mib_noroutes);
		break;
	}

	/* Start over by dropping the dst for l3mdev case */
	if (netif_is_l3_master(skb->dev))
		skb_dst_drop(skb);

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
	kfree_skb(skb);
	return 0;
}

static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}

static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}

static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}

static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}

/*
 *	Allocate a dst for local (unicast / anycast) address.
 */

struct fib6_info *addrconf_f6i_alloc(struct net *net,
				     struct inet6_dev *idev,
				     const struct in6_addr *addr,
				     bool anycast, gfp_t gfp_flags)
{
	u32 tb_id;
	struct net_device *dev = idev->dev;
	struct fib6_info *f6i;

	f6i = fib6_info_alloc(gfp_flags);
	if (!f6i)
		return ERR_PTR(-ENOMEM);

	f6i->fib6_metrics = ip_fib_metrics_init(net, NULL, 0, NULL);
	f6i->dst_nocount = true;
	f6i->dst_host = true;
	f6i->fib6_protocol = RTPROT_KERNEL;
	f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
	if (anycast) {
		f6i->fib6_type = RTN_ANYCAST;
		f6i->fib6_flags |= RTF_ANYCAST;
	} else {
		f6i->fib6_type = RTN_LOCAL;
		f6i->fib6_flags |= RTF_LOCAL;
	}

	f6i->fib6_nh.nh_gw = *addr;
	dev_hold(dev);
	f6i->fib6_nh.nh_dev = dev;
	f6i->fib6_dst.addr = *addr;
	f6i->fib6_dst.plen = 128;
	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
	f6i->fib6_table = fib6_get_table(net, tb_id);

	return f6i;
}

/* remove deleted ip from prefsrc entries */
struct arg_dev_net_ip {
	struct net_device *dev;
	struct net *net;
	struct in6_addr *addr;
};

static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
{
	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;

	if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
	    rt != net->ipv6.fib6_null_entry &&
	    ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
		spin_lock_bh(&rt6_exception_lock);
		/* remove prefsrc entry */
		rt->fib6_prefsrc.plen = 0;
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}

void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
{
	struct net *net = dev_net(ifp->idev->dev);
	struct arg_dev_net_ip adni = {
		.dev = ifp->idev->dev,
		.net = net,
		.addr = &ifp->addr,
	};
	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
}

#define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)

/* Remove routers and update dst entries when gateway turn into host. */
static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
{
	struct in6_addr *gateway = (struct in6_addr *)arg;

	if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
	    ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
		return -1;
	}

	/* Further clean up cached routes in exception table.
	 * This is needed because cached route may have a different
	 * gateway than its 'parent' in the case of an ip redirect.
	 */
	rt6_exceptions_clean_tohost(rt, gateway);

	return 0;
}

void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}

struct arg_netdev_event {
	const struct net_device *dev;
	union {
		unsigned int nh_flags;
		unsigned long event;
	};
};

static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
{
	struct fib6_info *iter;
	struct fib6_node *fn;

	fn = rcu_dereference_protected(rt->fib6_node,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
	iter = rcu_dereference_protected(fn->leaf,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
	while (iter) {
		if (iter->fib6_metric == rt->fib6_metric &&
		    rt6_qualify_for_ecmp(iter))
			return iter;
		iter = rcu_dereference_protected(iter->fib6_next,
				lockdep_is_held(&rt->fib6_table->tb6_lock));
	}

	return NULL;
}

static bool rt6_is_dead(const struct fib6_info *rt)
{
	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
	    (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	     fib6_ignore_linkdown(rt)))
		return true;

	return false;
}

static int rt6_multipath_total_weight(const struct fib6_info *rt)
{
	struct fib6_info *iter;
	int total = 0;

	if (!rt6_is_dead(rt))
		total += rt->fib6_nh.nh_weight;

	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
		if (!rt6_is_dead(iter))
			total += iter->fib6_nh.nh_weight;
	}

	return total;
}

static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
{
	int upper_bound = -1;

	if (!rt6_is_dead(rt)) {
		*weight += rt->fib6_nh.nh_weight;
		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
						    total) - 1;
	}
	atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
}

static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
{
	struct fib6_info *iter;
	int weight = 0;

	rt6_upper_bound_set(rt, &weight, total);

	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		rt6_upper_bound_set(iter, &weight, total);
}

void rt6_multipath_rebalance(struct fib6_info *rt)
{
	struct fib6_info *first;
	int total;

	/* In case the entire multipath route was marked for flushing,
	 * then there is no need to rebalance upon the removal of every
	 * sibling route.
	 */
	if (!rt->fib6_nsiblings || rt->should_flush)
		return;

	/* During lookup routes are evaluated in order, so we need to
	 * make sure upper bounds are assigned from the first sibling
	 * onwards.
	 */
	first = rt6_multipath_first_sibling(rt);
	if (WARN_ON_ONCE(!first))
		return;

	total = rt6_multipath_total_weight(first);
	rt6_multipath_upper_bound_set(first, total);
}

static int fib6_ifup(struct fib6_info *rt, void *p_arg)
{
	const struct arg_netdev_event *arg = p_arg;
	struct net *net = dev_net(arg->dev);

	if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
		rt->fib6_nh.nh_flags &= ~arg->nh_flags;
		fib6_update_sernum_upto_root(net, rt);
		rt6_multipath_rebalance(rt);
	}

	return 0;
}

void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.nh_flags = nh_flags,
		},
	};

	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
		arg.nh_flags |= RTNH_F_LINKDOWN;

	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
}

static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
				   const struct net_device *dev)
{
	struct fib6_info *iter;

	if (rt->fib6_nh.nh_dev == dev)
		return true;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		if (iter->fib6_nh.nh_dev == dev)
			return true;

	return false;
}

static void rt6_multipath_flush(struct fib6_info *rt)
{
	struct fib6_info *iter;

	rt->should_flush = 1;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		iter->should_flush = 1;
}

static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
					     const struct net_device *down_dev)
{
	struct fib6_info *iter;
	unsigned int dead = 0;

	if (rt->fib6_nh.nh_dev == down_dev ||
	    rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		dead++;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		if (iter->fib6_nh.nh_dev == down_dev ||
		    iter->fib6_nh.nh_flags & RTNH_F_DEAD)
			dead++;

	return dead;
}

static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
				       const struct net_device *dev,
				       unsigned int nh_flags)
{
	struct fib6_info *iter;

	if (rt->fib6_nh.nh_dev == dev)
		rt->fib6_nh.nh_flags |= nh_flags;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		if (iter->fib6_nh.nh_dev == dev)
			iter->fib6_nh.nh_flags |= nh_flags;
}

/* called with write lock held for table with rt */
static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
{
	const struct arg_netdev_event *arg = p_arg;
	const struct net_device *dev = arg->dev;
	struct net *net = dev_net(dev);

	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	switch (arg->event) {
	case NETDEV_UNREGISTER:
		return rt->fib6_nh.nh_dev == dev ? -1 : 0;
	case NETDEV_DOWN:
		if (rt->should_flush)
			return -1;
		if (!rt->fib6_nsiblings)
			return rt->fib6_nh.nh_dev == dev ? -1 : 0;
		if (rt6_multipath_uses_dev(rt, dev)) {
			unsigned int count;

			count = rt6_multipath_dead_count(rt, dev);
			if (rt->fib6_nsiblings + 1 == count) {
				rt6_multipath_flush(rt);
				return -1;
			}
			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
						   RTNH_F_LINKDOWN);
			fib6_update_sernum(net, rt);
			rt6_multipath_rebalance(rt);
		}
		return -2;
	case NETDEV_CHANGE:
		if (rt->fib6_nh.nh_dev != dev ||
		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
			break;
		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
		rt6_multipath_rebalance(rt);
		break;
	}

	return 0;
}

void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.event = event,
		},
	};
	struct net *net = dev_net(dev);

	if (net->ipv6.sysctl.skip_notify_on_dev_down)
		fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
	else
		fib6_clean_all(net, fib6_ifdown, &arg);
}

void rt6_disable_ip(struct net_device *dev, unsigned long event)
{
	rt6_sync_down_dev(dev, event);
	rt6_uncached_list_flush_dev(dev_net(dev), dev);
	neigh_ifdown(&nd_tbl, dev);
}

struct rt6_mtu_change_arg {
	struct net_device *dev;
	unsigned int mtu;
};

static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e. jumbo frame)
	 */
	if (rt->fib6_nh.nh_dev == arg->dev &&
	    !fib6_metric_locked(rt, RTAX_MTU)) {
		u32 mtu = rt->fib6_pmtu;

		if (mtu >= arg->mtu ||
		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
			fib6_metric_set(rt, RTAX_MTU, arg->mtu);

		spin_lock_bh(&rt6_exception_lock);
		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}

void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
{
	struct rt6_mtu_change_arg arg = {
		.dev = dev,
		.mtu = mtu,
	};

	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
}

static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]		= { .len = sizeof(struct in6_addr) },
	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
	[RTA_OIF]		= { .type = NLA_U32 },
	[RTA_IIF]		= { .type = NLA_U32 },
	[RTA_PRIORITY]		= { .type = NLA_U32 },
	[RTA_METRICS]		= { .type = NLA_NESTED },
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]		= { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
	[RTA_ENCAP]		= { .type = NLA_NESTED },
	[RTA_EXPIRES]		= { .type = NLA_U32 },
	[RTA_UID]		= { .type = NLA_U32 },
	[RTA_MARK]		= { .type = NLA_U32 },
	[RTA_TABLE]		= { .type = NLA_U32 },
	[RTA_IP_PROTO]		= { .type = NLA_U8 },
	[RTA_SPORT]		= { .type = NLA_U16 },
	[RTA_DPORT]		= { .type = NLA_U16 },
};

static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg,
			      struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);

	*cfg = (struct fib6_config){
		.fc_table = rtm->rtm_table,
		.fc_dst_len = rtm->rtm_dst_len,
		.fc_src_len = rtm->rtm_src_len,
		.fc_flags = RTF_UP,
		.fc_protocol = rtm->rtm_protocol,
		.fc_type = rtm->rtm_type,

		.fc_nlinfo.portid = NETLINK_CB(skb).portid,
		.fc_nlinfo.nlh = nlh,
		.fc_nlinfo.nl_net = sock_net(skb->sk),
	};

	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}
	if (tb[RTA_VIA]) {
		NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
		goto errout;
	}

	if (tb[RTA_DST]) {
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
						     cfg->fc_mp_len, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_PREF]) {
		pref = nla_get_u8(tb[RTA_PREF]);
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

	if (tb[RTA_ENCAP_TYPE]) {
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_EXPIRES]) {
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
		}
	}

	err = 0;
errout:
	return err;
}

*fib6_info
;
4286 struct fib6_config r_cfg
;
4287 struct list_head next
;
static int ip6_route_info_append(struct net *net,
				 struct list_head *rt6_nh_list,
				 struct fib6_info *rt,
				 struct fib6_config *r_cfg)
{
	struct rt6_nh *nh;
	int err = -EEXIST;

	list_for_each_entry(nh, rt6_nh_list, next) {
		/* check if fib6_info already exists */
		if (rt6_duplicate_nexthop(nh->fib6_info, rt))
			return err;
	}

	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
	if (!nh)
		return -ENOMEM;
	nh->fib6_info = rt;
	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
	list_add_tail(&nh->next, rt6_nh_list);

	return 0;
}

static void ip6_route_mpath_notify(struct fib6_info *rt,
				   struct fib6_info *rt_last,
				   struct nl_info *info,
				   __u16 nlflags)
{
	/* if this is an APPEND route, then rt points to the first route
	 * inserted and rt_last points to last route inserted. Userspace
	 * wants a consistent dump of the route which starts at the first
	 * nexthop. Since sibling routes are always added at the end of
	 * the list, find the first sibling of the last route appended
	 */
	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
		rt = list_first_entry(&rt_last->fib6_siblings,
				      struct fib6_info,
				      fib6_siblings);
	}

	if (rt)
		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
}

static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct fib6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	__u16 nlflags;
	int remaining;
	int attrlen;
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);

	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * fib6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}
		if (!rt6_qualify_for_ecmp(rt)) {
			err = -EINVAL;
			NL_SET_ERR_MSG(extack,
				       "Device only routes can not be added for IPv6 using the multipath API.");
			fib6_info_release(rt);
			goto cleanup;
		}

		rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;

		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
					    rt, &r_cfg);
		if (err) {
			fib6_info_release(rt);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* for add and replace send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		err = __ip6_ins_rt(nh->fib6_info, info, extack);
		fib6_info_release(nh->fib6_info);

		if (!err) {
			/* save reference to last route successfully inserted */
			rt_last = nh->fib6_info;

			/* save reference to first route for notification */
			if (!rt_notif)
				rt_notif = nh->fib6_info;
		}

		/* nh->fib6_info is used or freed at this point, reset to NULL*/
		nh->fib6_info = NULL;
		if (err) {
			if (replace && nhn)
				NL_SET_ERR_MSG_MOD(extack,
						   "multipath route replace failed (check consistency of installed routes)");
			err_nh = nh;
			goto add_errout;
		}

		/* Because each route is added like a single route we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		nhn++;
	}

	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	goto cleanup;

add_errout:
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg, extack);
	}

cleanup:
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->fib6_info)
			fib6_info_release(nh->fib6_info);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}

static int ip6_route_multipath_del(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	int remaining;
	int attrlen;
	int err = 1, last_err = 0;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry */
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
		}
		err = ip6_route_del(&r_cfg, extack);
		if (err)
			last_err = err;

		rtnh = rtnh_next(rtnh, &remaining);
	}

	return last_err;
}

*skb
, struct nlmsghdr
*nlh
,
4522 struct netlink_ext_ack
*extack
)
4524 struct fib6_config cfg
;
4527 err
= rtm_to_fib6_config(skb
, nlh
, &cfg
, extack
);
4532 return ip6_route_multipath_del(&cfg
, extack
);
4534 cfg
.fc_delete_all_nh
= 1;
4535 return ip6_route_del(&cfg
, extack
);
static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct fib6_config cfg;
	int err;

	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
	if (err < 0)
		return err;

	if (cfg.fc_mp)
		return ip6_route_multipath_add(&cfg, extack);

	return ip6_route_add(&cfg, GFP_KERNEL, extack);
}

static size_t rt6_nlmsg_size(struct fib6_info *rt)
{
	int nexthop_len = 0;

	if (rt->fib6_nsiblings) {
		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
			    + NLA_ALIGN(sizeof(struct rtnexthop))
			    + nla_total_size(16) /* RTA_GATEWAY */
			    + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);

		nexthop_len *= rt->fib6_nsiblings;
	}

	return NLMSG_ALIGN(sizeof(struct rtmsg))
	       + nla_total_size(16) /* RTA_SRC */
	       + nla_total_size(16) /* RTA_DST */
	       + nla_total_size(16) /* RTA_GATEWAY */
	       + nla_total_size(16) /* RTA_PREFSRC */
	       + nla_total_size(4) /* RTA_TABLE */
	       + nla_total_size(4) /* RTA_IIF */
	       + nla_total_size(4) /* RTA_OIF */
	       + nla_total_size(4) /* RTA_PRIORITY */
	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
	       + nla_total_size(sizeof(struct rta_cacheinfo))
	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
	       + nla_total_size(1) /* RTA_PREF */
	       + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
	       + nexthop_len;
}

*skb
, struct fib6_info
*rt
,
4586 unsigned int *flags
, bool skip_oif
)
4588 if (rt
->fib6_nh
.nh_flags
& RTNH_F_DEAD
)
4589 *flags
|= RTNH_F_DEAD
;
4591 if (rt
->fib6_nh
.nh_flags
& RTNH_F_LINKDOWN
) {
4592 *flags
|= RTNH_F_LINKDOWN
;
4595 if (fib6_ignore_linkdown(rt
))
4596 *flags
|= RTNH_F_DEAD
;
4600 if (rt
->fib6_flags
& RTF_GATEWAY
) {
4601 if (nla_put_in6_addr(skb
, RTA_GATEWAY
, &rt
->fib6_nh
.nh_gw
) < 0)
4602 goto nla_put_failure
;
4605 *flags
|= (rt
->fib6_nh
.nh_flags
& RTNH_F_ONLINK
);
4606 if (rt
->fib6_nh
.nh_flags
& RTNH_F_OFFLOAD
)
4607 *flags
|= RTNH_F_OFFLOAD
;
4609 /* not needed for multipath encoding b/c it has a rtnexthop struct */
4610 if (!skip_oif
&& rt
->fib6_nh
.nh_dev
&&
4611 nla_put_u32(skb
, RTA_OIF
, rt
->fib6_nh
.nh_dev
->ifindex
))
4612 goto nla_put_failure
;
4614 if (rt
->fib6_nh
.nh_lwtstate
&&
4615 lwtunnel_fill_encap(skb
, rt
->fib6_nh
.nh_lwtstate
) < 0)
4616 goto nla_put_failure
;
/* add multipath next hop */
static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
{
	const struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rtnexthop *rtnh;
	unsigned int flags = 0;

	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
	if (!rtnh)
		goto nla_put_failure;

	rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
	rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;

	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
		goto nla_put_failure;

	rtnh->rtnh_flags = flags;

	/* length of rtnetlink header + attributes */
	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	struct rt6_info *rt6 = (struct rt6_info *)dst;
	struct rt6key *rt6_dst, *rt6_src;
	u32 *pmetrics, table, rt6_flags;
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;
	long expires = 0;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	if (rt6) {
		rt6_dst = &rt6->rt6i_dst;
		rt6_src = &rt6->rt6i_src;
		rt6_flags = rt6->rt6i_flags;
	} else {
		rt6_dst = &rt->fib6_dst;
		rt6_src = &rt->fib6_src;
		rt6_flags = rt->fib6_flags;
	}

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt6_dst->plen;
	rtm->rtm_src_len = rt6_src->plen;
	rtm->rtm_tos = 0;
	if (rt->fib6_table)
		table = rt->fib6_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;

	rtm->rtm_type = rt->fib6_type;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->fib6_protocol;

	if (rt6_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	if (dest) {
		if (nla_put_in6_addr(skb, RTA_DST, dest))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dest) {
		struct in6_addr saddr_buf;

		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->fib6_prefsrc.plen) {
		struct in6_addr saddr_buf;

		saddr_buf = rt->fib6_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt6) {
		if (rt6_flags & RTF_GATEWAY &&
		    nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
			goto nla_put_failure;

		if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
			goto nla_put_failure;
	} else if (rt->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (rt6_add_nexthop(skb, rt) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings, fib6_siblings) {
			if (rt6_add_nexthop(skb, sibling) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
			goto nla_put_failure;
	}

	if (rt6_flags & RTF_EXPIRES) {
		expires = dst ? dst->expires : rt->expires;
		expires -= jiffies;
	}

	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
		goto nla_put_failure;


	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

static bool fib6_info_uses_dev(const struct fib6_info *f6i,
			       const struct net_device *dev)
{
	if (f6i->fib6_nh.nh_dev == dev)
		return true;

	if (f6i->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;

		list_for_each_entry_safe(sibling, next_sibling,
					 &f6i->fib6_siblings, fib6_siblings) {
			if (sibling->fib6_nh.nh_dev == dev)
				return true;
		}
	}

	return false;
}

int rt6_dump_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
	struct fib_dump_filter *filter = &arg->filter;
	unsigned int flags = NLM_F_MULTI;
	struct net *net = arg->net;

	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	if ((filter->flags & RTM_F_PREFIX) &&
	    !(rt->fib6_flags & RTF_PREFIX_RT)) {
		/* success since this is not a prefix route */
		return 1;
	}
	if (filter->filter_set) {
		if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
		    (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
		    (filter->protocol && rt->fib6_protocol != filter->protocol)) {
			return 1;
		}
		flags |= NLM_F_DUMP_FILTERED;
	}

	return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
			     RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
			     arg->cb->nlh->nlmsg_seq, flags);
}

static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
					const struct nlmsghdr *nlh,
					struct nlattr **tb,
					struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	int i, err;

	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
		NL_SET_ERR_MSG_MOD(extack,
				   "Invalid header for get route request");
		return -EINVAL;
	}

	if (!netlink_strict_get_check(skb))
		return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX,
				   rtm_ipv6_policy, extack);

	rtm = nlmsg_data(nlh);
	if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
	    rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
	    rtm->rtm_type) {
		NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
		return -EINVAL;
	}
	if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
		NL_SET_ERR_MSG_MOD(extack,
				   "Invalid flags for get route request");
		return -EINVAL;
	}

	err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
				 rtm_ipv6_policy, extack);
	if (err)
		return err;

	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
		NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
		return -EINVAL;
	}

	for (i = 0; i <= RTA_MAX; i++) {
		if (!tb[i])
			continue;

		switch (i) {
		case RTA_SRC:
		case RTA_DST:
		case RTA_IIF:
		case RTA_OIF:
		case RTA_MARK:
		case RTA_UID:
		case RTA_SPORT:
		case RTA_DPORT:
		case RTA_IP_PROTO:
			break;
		default:
			NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
			return -EINVAL;
		}
	}

	return 0;
}
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct fib6_info *from;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6 = {};
	bool fibmatch;

	err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (tb[RTA_SPORT])
		fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);

	if (tb[RTA_DPORT])
		fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);

	if (tb[RTA_IP_PROTO]) {
		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
						  &fl6.flowi6_proto, AF_INET6,
						  extack);
		if (err)
			goto errout;
	}

	if (iif) {
		struct net_device *dev;
		int flags = 0;

		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			rcu_read_unlock();
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);

		rcu_read_unlock();
	} else {
		fl6.flowi6_oif = oif;

		dst = ip6_route_output(net, NULL, &fl6);
	}

	rt = container_of(dst, struct rt6_info, dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	skb_dst_set(skb, &rt->dst);

	rcu_read_lock();
	from = rcu_dereference(rt->from);

	if (fibmatch)
		err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
				    RTM_NEWROUTE,
				    NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	else
		err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
				    &fl6.saddr, iif, RTM_NEWROUTE,
				    NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	rcu_read_unlock();

	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}
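
/* Broadcast a route change to RTNLGRP_IPV6_ROUTE listeners such as
 * "ip -6 monitor route". The skb is sized via rt6_nlmsg_size(), so
 * rt6_fill_node() failing with -EMSGSIZE here would indicate a bug
 * in the size calculation.
 */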
void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
		     unsigned int nlm_flags)
{
	struct sk_buff *skb;
	struct net *net = info->nl_net;
	u32 seq;
	int err;

	err = -ENOBUFS;
	seq = info->nlh ? info->nlh->nlmsg_seq : 0;

	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
	if (!skb)
		goto errout;

	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
			    event, info->portid, seq, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}
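
/* Loopback device notifier: the per-netns null (and, with multiple
 * tables, prohibit/blackhole) dst templates hold a device and idev
 * reference; attach them when loopback registers and release them
 * exactly once on NETDEV_UNREGISTER.
 */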
static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	} else if (event == NETDEV_UNREGISTER &&
		   dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER could be fired for multiple times by
		 * netdev_wait_allrefs(). Make sure we only call this once.
		 */
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}
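
/* /proc/net/rt6_stats: seven hex fields - fib nodes, route nodes,
 * rt6_info allocations, route entries, cached routes, current dst
 * entries and discarded routes.
 */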
#ifdef CONFIG_PROC_FS
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;

	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}
#endif	/* CONFIG_PROC_FS */
#ifdef CONFIG_SYSCTL

static int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
				     void __user *buffer, size_t *lenp,
				     loff_t *ppos)
{
	struct net *net;
	int delay;
	int ret;

	if (!write)
		return -EINVAL;

	net = (struct net *)ctl->extra1;
	delay = net->ipv6.sysctl.flush_delay;
	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
	if (ret)
		return ret;

	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
	return 0;
}
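
/* net.ipv6.route.flush is write-only; writing any value kicks off a
 * garbage-collection pass over cached routes, e.g.:
 *
 *	sysctl -w net.ipv6.route.flush=1
 */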
static struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{
		.procname	=	"skip_notify_on_dev_down",
		.data		=	&init_net.ipv6.sysctl.skip_notify_on_dev_down,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{ }
};
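
/* Clone the sysctl template for a new namespace and repoint each
 * entry's ->data at the per-netns variable. The indices below must
 * stay in sync with the order of ipv6_route_table_template.
 */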
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
#endif	/* CONFIG_SYSCTL */
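
/* Per-netns initialisation: each namespace gets its own dst_ops and
 * its own copies of the special null/prohibit/blackhole routes, plus
 * defaults for the sysctls above. Error paths unwind in reverse
 * order of allocation.
 */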
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
	net->ipv6.sysctl.skip_notify_on_dev_down = 0;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
			sizeof(struct ipv6_route_iter));
	proc_create_net_single("rt6_stats", 0444, net->proc_net,
			       rt6_stats_seq_show, NULL);
#endif
	return 0;
}
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
static int __net_init ipv6_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv6.peers = bp;
	return 0;
}
static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	net->ipv6.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}
static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};

static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};

static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
void __init ip6_route_init_special_entries(void)
{
	/* Registering of the loopback is done before this portion of code,
	 * so the loopback reference in rt6_info will not be taken;
	 * do it manually for init_net.
	 */
	init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#endif
}
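
/* Subsystem init: create the rt6_info slab cache, register the
 * pernet subsystems, fib rules and the RTM_{NEW,DEL,GET}ROUTE
 * handlers, and initialise the per-cpu uncached lists. Note that
 * RTM_GETROUTE is registered with RTNL_FLAG_DOIT_UNLOCKED and so
 * runs without the RTNL lock.
 */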
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}