2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
24 * Fixed routing subtrees.
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <linux/slab.h>
44 #include <net/net_namespace.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
52 #include <linux/rtnetlink.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
58 #include <asm/uaccess.h>
61 #include <linux/sysctl.h>
64 /* Set to 3 to get tracing. */
68 #define RDBG(x) printk x
69 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
72 #define RT6_TRACE(x...) do { ; } while (0)
75 static struct rt6_info
* ip6_rt_copy(struct rt6_info
*ort
);
76 static struct dst_entry
*ip6_dst_check(struct dst_entry
*dst
, u32 cookie
);
77 static unsigned int ip6_default_advmss(const struct dst_entry
*dst
);
78 static unsigned int ip6_default_mtu(const struct dst_entry
*dst
);
79 static struct dst_entry
*ip6_negative_advice(struct dst_entry
*);
80 static void ip6_dst_destroy(struct dst_entry
*);
81 static void ip6_dst_ifdown(struct dst_entry
*,
82 struct net_device
*dev
, int how
);
83 static int ip6_dst_gc(struct dst_ops
*ops
);
85 static int ip6_pkt_discard(struct sk_buff
*skb
);
86 static int ip6_pkt_discard_out(struct sk_buff
*skb
);
87 static void ip6_link_failure(struct sk_buff
*skb
);
88 static void ip6_rt_update_pmtu(struct dst_entry
*dst
, u32 mtu
);
90 #ifdef CONFIG_IPV6_ROUTE_INFO
91 static struct rt6_info
*rt6_add_route_info(struct net
*net
,
92 struct in6_addr
*prefix
, int prefixlen
,
93 struct in6_addr
*gwaddr
, int ifindex
,
95 static struct rt6_info
*rt6_get_route_info(struct net
*net
,
96 struct in6_addr
*prefix
, int prefixlen
,
97 struct in6_addr
*gwaddr
, int ifindex
);
100 static struct dst_ops ip6_dst_ops_template
= {
102 .protocol
= cpu_to_be16(ETH_P_IPV6
),
105 .check
= ip6_dst_check
,
106 .default_advmss
= ip6_default_advmss
,
107 .default_mtu
= ip6_default_mtu
,
108 .destroy
= ip6_dst_destroy
,
109 .ifdown
= ip6_dst_ifdown
,
110 .negative_advice
= ip6_negative_advice
,
111 .link_failure
= ip6_link_failure
,
112 .update_pmtu
= ip6_rt_update_pmtu
,
113 .local_out
= __ip6_local_out
,
116 static void ip6_rt_blackhole_update_pmtu(struct dst_entry
*dst
, u32 mtu
)
120 static struct dst_ops ip6_dst_blackhole_ops
= {
122 .protocol
= cpu_to_be16(ETH_P_IPV6
),
123 .destroy
= ip6_dst_destroy
,
124 .check
= ip6_dst_check
,
125 .update_pmtu
= ip6_rt_blackhole_update_pmtu
,
128 static struct rt6_info ip6_null_entry_template
= {
130 .__refcnt
= ATOMIC_INIT(1),
133 .error
= -ENETUNREACH
,
134 .input
= ip6_pkt_discard
,
135 .output
= ip6_pkt_discard_out
,
137 .rt6i_flags
= (RTF_REJECT
| RTF_NONEXTHOP
),
138 .rt6i_protocol
= RTPROT_KERNEL
,
139 .rt6i_metric
= ~(u32
) 0,
140 .rt6i_ref
= ATOMIC_INIT(1),
143 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
145 static int ip6_pkt_prohibit(struct sk_buff
*skb
);
146 static int ip6_pkt_prohibit_out(struct sk_buff
*skb
);
148 static struct rt6_info ip6_prohibit_entry_template
= {
150 .__refcnt
= ATOMIC_INIT(1),
154 .input
= ip6_pkt_prohibit
,
155 .output
= ip6_pkt_prohibit_out
,
157 .rt6i_flags
= (RTF_REJECT
| RTF_NONEXTHOP
),
158 .rt6i_protocol
= RTPROT_KERNEL
,
159 .rt6i_metric
= ~(u32
) 0,
160 .rt6i_ref
= ATOMIC_INIT(1),
163 static struct rt6_info ip6_blk_hole_entry_template
= {
165 .__refcnt
= ATOMIC_INIT(1),
169 .input
= dst_discard
,
170 .output
= dst_discard
,
172 .rt6i_flags
= (RTF_REJECT
| RTF_NONEXTHOP
),
173 .rt6i_protocol
= RTPROT_KERNEL
,
174 .rt6i_metric
= ~(u32
) 0,
175 .rt6i_ref
= ATOMIC_INIT(1),
/* Allocate a fresh rt6_info entry from the given ip6 dst_ops. */
static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops)
{
	struct dst_entry *dst = dst_alloc(ops);

	return (struct rt6_info *)dst;
}
186 static void ip6_dst_destroy(struct dst_entry
*dst
)
188 struct rt6_info
*rt
= (struct rt6_info
*)dst
;
189 struct inet6_dev
*idev
= rt
->rt6i_idev
;
190 struct inet_peer
*peer
= rt
->rt6i_peer
;
193 rt
->rt6i_idev
= NULL
;
197 BUG_ON(!(rt
->rt6i_flags
& RTF_CACHE
));
198 rt
->rt6i_peer
= NULL
;
203 void rt6_bind_peer(struct rt6_info
*rt
, int create
)
205 struct inet_peer
*peer
;
207 if (WARN_ON(!(rt
->rt6i_flags
& RTF_CACHE
)))
210 peer
= inet_getpeer_v6(&rt
->rt6i_dst
.addr
, create
);
211 if (peer
&& cmpxchg(&rt
->rt6i_peer
, NULL
, peer
) != NULL
)
215 static void ip6_dst_ifdown(struct dst_entry
*dst
, struct net_device
*dev
,
218 struct rt6_info
*rt
= (struct rt6_info
*)dst
;
219 struct inet6_dev
*idev
= rt
->rt6i_idev
;
220 struct net_device
*loopback_dev
=
221 dev_net(dev
)->loopback_dev
;
223 if (dev
!= loopback_dev
&& idev
!= NULL
&& idev
->dev
== dev
) {
224 struct inet6_dev
*loopback_idev
=
225 in6_dev_get(loopback_dev
);
226 if (loopback_idev
!= NULL
) {
227 rt
->rt6i_idev
= loopback_idev
;
233 static __inline__
int rt6_check_expired(const struct rt6_info
*rt
)
235 return (rt
->rt6i_flags
& RTF_EXPIRES
) &&
236 time_after(jiffies
, rt
->rt6i_expires
);
239 static inline int rt6_need_strict(struct in6_addr
*daddr
)
241 return ipv6_addr_type(daddr
) &
242 (IPV6_ADDR_MULTICAST
| IPV6_ADDR_LINKLOCAL
| IPV6_ADDR_LOOPBACK
);
246 * Route lookup. Any table->tb6_lock is implied.
249 static inline struct rt6_info
*rt6_device_match(struct net
*net
,
251 struct in6_addr
*saddr
,
255 struct rt6_info
*local
= NULL
;
256 struct rt6_info
*sprt
;
258 if (!oif
&& ipv6_addr_any(saddr
))
261 for (sprt
= rt
; sprt
; sprt
= sprt
->dst
.rt6_next
) {
262 struct net_device
*dev
= sprt
->rt6i_dev
;
265 if (dev
->ifindex
== oif
)
267 if (dev
->flags
& IFF_LOOPBACK
) {
268 if (sprt
->rt6i_idev
== NULL
||
269 sprt
->rt6i_idev
->dev
->ifindex
!= oif
) {
270 if (flags
& RT6_LOOKUP_F_IFACE
&& oif
)
272 if (local
&& (!oif
||
273 local
->rt6i_idev
->dev
->ifindex
== oif
))
279 if (ipv6_chk_addr(net
, saddr
, dev
,
280 flags
& RT6_LOOKUP_F_IFACE
))
289 if (flags
& RT6_LOOKUP_F_IFACE
)
290 return net
->ipv6
.ip6_null_entry
;
296 #ifdef CONFIG_IPV6_ROUTER_PREF
297 static void rt6_probe(struct rt6_info
*rt
)
299 struct neighbour
*neigh
= rt
? rt
->rt6i_nexthop
: NULL
;
301 * Okay, this does not seem to be appropriate
302 * for now, however, we need to check if it
303 * is really so; aka Router Reachability Probing.
305 * Router Reachability Probe MUST be rate-limited
306 * to no more than one per minute.
308 if (!neigh
|| (neigh
->nud_state
& NUD_VALID
))
310 read_lock_bh(&neigh
->lock
);
311 if (!(neigh
->nud_state
& NUD_VALID
) &&
312 time_after(jiffies
, neigh
->updated
+ rt
->rt6i_idev
->cnf
.rtr_probe_interval
)) {
313 struct in6_addr mcaddr
;
314 struct in6_addr
*target
;
316 neigh
->updated
= jiffies
;
317 read_unlock_bh(&neigh
->lock
);
319 target
= (struct in6_addr
*)&neigh
->primary_key
;
320 addrconf_addr_solict_mult(target
, &mcaddr
);
321 ndisc_send_ns(rt
->rt6i_dev
, NULL
, target
, &mcaddr
, NULL
);
323 read_unlock_bh(&neigh
->lock
);
326 static inline void rt6_probe(struct rt6_info
*rt
)
332 * Default Router Selection (RFC 2461 6.3.6)
334 static inline int rt6_check_dev(struct rt6_info
*rt
, int oif
)
336 struct net_device
*dev
= rt
->rt6i_dev
;
337 if (!oif
|| dev
->ifindex
== oif
)
339 if ((dev
->flags
& IFF_LOOPBACK
) &&
340 rt
->rt6i_idev
&& rt
->rt6i_idev
->dev
->ifindex
== oif
)
345 static inline int rt6_check_neigh(struct rt6_info
*rt
)
347 struct neighbour
*neigh
= rt
->rt6i_nexthop
;
349 if (rt
->rt6i_flags
& RTF_NONEXTHOP
||
350 !(rt
->rt6i_flags
& RTF_GATEWAY
))
353 read_lock_bh(&neigh
->lock
);
354 if (neigh
->nud_state
& NUD_VALID
)
356 #ifdef CONFIG_IPV6_ROUTER_PREF
357 else if (neigh
->nud_state
& NUD_FAILED
)
362 read_unlock_bh(&neigh
->lock
);
368 static int rt6_score_route(struct rt6_info
*rt
, int oif
,
373 m
= rt6_check_dev(rt
, oif
);
374 if (!m
&& (strict
& RT6_LOOKUP_F_IFACE
))
376 #ifdef CONFIG_IPV6_ROUTER_PREF
377 m
|= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt
->rt6i_flags
)) << 2;
379 n
= rt6_check_neigh(rt
);
380 if (!n
&& (strict
& RT6_LOOKUP_F_REACHABLE
))
385 static struct rt6_info
*find_match(struct rt6_info
*rt
, int oif
, int strict
,
386 int *mpri
, struct rt6_info
*match
)
390 if (rt6_check_expired(rt
))
393 m
= rt6_score_route(rt
, oif
, strict
);
398 if (strict
& RT6_LOOKUP_F_REACHABLE
)
402 } else if (strict
& RT6_LOOKUP_F_REACHABLE
) {
410 static struct rt6_info
*find_rr_leaf(struct fib6_node
*fn
,
411 struct rt6_info
*rr_head
,
412 u32 metric
, int oif
, int strict
)
414 struct rt6_info
*rt
, *match
;
418 for (rt
= rr_head
; rt
&& rt
->rt6i_metric
== metric
;
419 rt
= rt
->dst
.rt6_next
)
420 match
= find_match(rt
, oif
, strict
, &mpri
, match
);
421 for (rt
= fn
->leaf
; rt
&& rt
!= rr_head
&& rt
->rt6i_metric
== metric
;
422 rt
= rt
->dst
.rt6_next
)
423 match
= find_match(rt
, oif
, strict
, &mpri
, match
);
428 static struct rt6_info
*rt6_select(struct fib6_node
*fn
, int oif
, int strict
)
430 struct rt6_info
*match
, *rt0
;
433 RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
434 __func__
, fn
->leaf
, oif
);
438 fn
->rr_ptr
= rt0
= fn
->leaf
;
440 match
= find_rr_leaf(fn
, rt0
, rt0
->rt6i_metric
, oif
, strict
);
443 (strict
& RT6_LOOKUP_F_REACHABLE
)) {
444 struct rt6_info
*next
= rt0
->dst
.rt6_next
;
446 /* no entries matched; do round-robin */
447 if (!next
|| next
->rt6i_metric
!= rt0
->rt6i_metric
)
454 RT6_TRACE("%s() => %p\n",
457 net
= dev_net(rt0
->rt6i_dev
);
458 return match
? match
: net
->ipv6
.ip6_null_entry
;
461 #ifdef CONFIG_IPV6_ROUTE_INFO
462 int rt6_route_rcv(struct net_device
*dev
, u8
*opt
, int len
,
463 struct in6_addr
*gwaddr
)
465 struct net
*net
= dev_net(dev
);
466 struct route_info
*rinfo
= (struct route_info
*) opt
;
467 struct in6_addr prefix_buf
, *prefix
;
469 unsigned long lifetime
;
472 if (len
< sizeof(struct route_info
)) {
476 /* Sanity check for prefix_len and length */
477 if (rinfo
->length
> 3) {
479 } else if (rinfo
->prefix_len
> 128) {
481 } else if (rinfo
->prefix_len
> 64) {
482 if (rinfo
->length
< 2) {
485 } else if (rinfo
->prefix_len
> 0) {
486 if (rinfo
->length
< 1) {
491 pref
= rinfo
->route_pref
;
492 if (pref
== ICMPV6_ROUTER_PREF_INVALID
)
495 lifetime
= addrconf_timeout_fixup(ntohl(rinfo
->lifetime
), HZ
);
497 if (rinfo
->length
== 3)
498 prefix
= (struct in6_addr
*)rinfo
->prefix
;
500 /* this function is safe */
501 ipv6_addr_prefix(&prefix_buf
,
502 (struct in6_addr
*)rinfo
->prefix
,
504 prefix
= &prefix_buf
;
507 rt
= rt6_get_route_info(net
, prefix
, rinfo
->prefix_len
, gwaddr
,
510 if (rt
&& !lifetime
) {
516 rt
= rt6_add_route_info(net
, prefix
, rinfo
->prefix_len
, gwaddr
, dev
->ifindex
,
519 rt
->rt6i_flags
= RTF_ROUTEINFO
|
520 (rt
->rt6i_flags
& ~RTF_PREF_MASK
) | RTF_PREF(pref
);
523 if (!addrconf_finite_timeout(lifetime
)) {
524 rt
->rt6i_flags
&= ~RTF_EXPIRES
;
526 rt
->rt6i_expires
= jiffies
+ HZ
* lifetime
;
527 rt
->rt6i_flags
|= RTF_EXPIRES
;
529 dst_release(&rt
->dst
);
535 #define BACKTRACK(__net, saddr) \
537 if (rt == __net->ipv6.ip6_null_entry) { \
538 struct fib6_node *pn; \
540 if (fn->fn_flags & RTN_TL_ROOT) \
543 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
544 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
547 if (fn->fn_flags & RTN_RTINFO) \
553 static struct rt6_info
*ip6_pol_route_lookup(struct net
*net
,
554 struct fib6_table
*table
,
555 struct flowi
*fl
, int flags
)
557 struct fib6_node
*fn
;
560 read_lock_bh(&table
->tb6_lock
);
561 fn
= fib6_lookup(&table
->tb6_root
, &fl
->fl6_dst
, &fl
->fl6_src
);
564 rt
= rt6_device_match(net
, rt
, &fl
->fl6_src
, fl
->oif
, flags
);
565 BACKTRACK(net
, &fl
->fl6_src
);
567 dst_use(&rt
->dst
, jiffies
);
568 read_unlock_bh(&table
->tb6_lock
);
573 struct rt6_info
*rt6_lookup(struct net
*net
, const struct in6_addr
*daddr
,
574 const struct in6_addr
*saddr
, int oif
, int strict
)
580 struct dst_entry
*dst
;
581 int flags
= strict
? RT6_LOOKUP_F_IFACE
: 0;
584 memcpy(&fl
.fl6_src
, saddr
, sizeof(*saddr
));
585 flags
|= RT6_LOOKUP_F_HAS_SADDR
;
588 dst
= fib6_rule_lookup(net
, &fl
, flags
, ip6_pol_route_lookup
);
590 return (struct rt6_info
*) dst
;
597 EXPORT_SYMBOL(rt6_lookup
);
599 /* ip6_ins_rt is called with FREE table->tb6_lock.
600 It takes new route entry, the addition fails by any reason the
601 route is freed. In any case, if caller does not hold it, it may
605 static int __ip6_ins_rt(struct rt6_info
*rt
, struct nl_info
*info
)
608 struct fib6_table
*table
;
610 table
= rt
->rt6i_table
;
611 write_lock_bh(&table
->tb6_lock
);
612 err
= fib6_add(&table
->tb6_root
, rt
, info
);
613 write_unlock_bh(&table
->tb6_lock
);
618 int ip6_ins_rt(struct rt6_info
*rt
)
620 struct nl_info info
= {
621 .nl_net
= dev_net(rt
->rt6i_dev
),
623 return __ip6_ins_rt(rt
, &info
);
626 static struct rt6_info
*rt6_alloc_cow(struct rt6_info
*ort
, struct in6_addr
*daddr
,
627 struct in6_addr
*saddr
)
635 rt
= ip6_rt_copy(ort
);
638 struct neighbour
*neigh
;
639 int attempts
= !in_softirq();
641 if (!(rt
->rt6i_flags
&RTF_GATEWAY
)) {
642 if (rt
->rt6i_dst
.plen
!= 128 &&
643 ipv6_addr_equal(&rt
->rt6i_dst
.addr
, daddr
))
644 rt
->rt6i_flags
|= RTF_ANYCAST
;
645 ipv6_addr_copy(&rt
->rt6i_gateway
, daddr
);
648 ipv6_addr_copy(&rt
->rt6i_dst
.addr
, daddr
);
649 rt
->rt6i_dst
.plen
= 128;
650 rt
->rt6i_flags
|= RTF_CACHE
;
651 rt
->dst
.flags
|= DST_HOST
;
653 #ifdef CONFIG_IPV6_SUBTREES
654 if (rt
->rt6i_src
.plen
&& saddr
) {
655 ipv6_addr_copy(&rt
->rt6i_src
.addr
, saddr
);
656 rt
->rt6i_src
.plen
= 128;
661 neigh
= ndisc_get_neigh(rt
->rt6i_dev
, &rt
->rt6i_gateway
);
663 struct net
*net
= dev_net(rt
->rt6i_dev
);
664 int saved_rt_min_interval
=
665 net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
;
666 int saved_rt_elasticity
=
667 net
->ipv6
.sysctl
.ip6_rt_gc_elasticity
;
669 if (attempts
-- > 0) {
670 net
->ipv6
.sysctl
.ip6_rt_gc_elasticity
= 1;
671 net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
= 0;
673 ip6_dst_gc(&net
->ipv6
.ip6_dst_ops
);
675 net
->ipv6
.sysctl
.ip6_rt_gc_elasticity
=
677 net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
=
678 saved_rt_min_interval
;
684 "ipv6: Neighbour table overflow.\n");
688 rt
->rt6i_nexthop
= neigh
;
695 static struct rt6_info
*rt6_alloc_clone(struct rt6_info
*ort
, struct in6_addr
*daddr
)
697 struct rt6_info
*rt
= ip6_rt_copy(ort
);
699 ipv6_addr_copy(&rt
->rt6i_dst
.addr
, daddr
);
700 rt
->rt6i_dst
.plen
= 128;
701 rt
->rt6i_flags
|= RTF_CACHE
;
702 rt
->dst
.flags
|= DST_HOST
;
703 rt
->rt6i_nexthop
= neigh_clone(ort
->rt6i_nexthop
);
708 static struct rt6_info
*ip6_pol_route(struct net
*net
, struct fib6_table
*table
, int oif
,
709 struct flowi
*fl
, int flags
)
711 struct fib6_node
*fn
;
712 struct rt6_info
*rt
, *nrt
;
716 int reachable
= net
->ipv6
.devconf_all
->forwarding
? 0 : RT6_LOOKUP_F_REACHABLE
;
718 strict
|= flags
& RT6_LOOKUP_F_IFACE
;
721 read_lock_bh(&table
->tb6_lock
);
724 fn
= fib6_lookup(&table
->tb6_root
, &fl
->fl6_dst
, &fl
->fl6_src
);
727 rt
= rt6_select(fn
, oif
, strict
| reachable
);
729 BACKTRACK(net
, &fl
->fl6_src
);
730 if (rt
== net
->ipv6
.ip6_null_entry
||
731 rt
->rt6i_flags
& RTF_CACHE
)
735 read_unlock_bh(&table
->tb6_lock
);
737 if (!rt
->rt6i_nexthop
&& !(rt
->rt6i_flags
& RTF_NONEXTHOP
))
738 nrt
= rt6_alloc_cow(rt
, &fl
->fl6_dst
, &fl
->fl6_src
);
740 nrt
= rt6_alloc_clone(rt
, &fl
->fl6_dst
);
742 dst_release(&rt
->dst
);
743 rt
= nrt
? : net
->ipv6
.ip6_null_entry
;
747 err
= ip6_ins_rt(nrt
);
756 * Race condition! In the gap, when table->tb6_lock was
757 * released someone could insert this route. Relookup.
759 dst_release(&rt
->dst
);
768 read_unlock_bh(&table
->tb6_lock
);
770 rt
->dst
.lastuse
= jiffies
;
776 static struct rt6_info
*ip6_pol_route_input(struct net
*net
, struct fib6_table
*table
,
777 struct flowi
*fl
, int flags
)
779 return ip6_pol_route(net
, table
, fl
->iif
, fl
, flags
);
782 void ip6_route_input(struct sk_buff
*skb
)
784 struct ipv6hdr
*iph
= ipv6_hdr(skb
);
785 struct net
*net
= dev_net(skb
->dev
);
786 int flags
= RT6_LOOKUP_F_HAS_SADDR
;
788 .iif
= skb
->dev
->ifindex
,
789 .fl6_dst
= iph
->daddr
,
790 .fl6_src
= iph
->saddr
,
791 .fl6_flowlabel
= (* (__be32
*) iph
)&IPV6_FLOWINFO_MASK
,
793 .proto
= iph
->nexthdr
,
796 if (rt6_need_strict(&iph
->daddr
) && skb
->dev
->type
!= ARPHRD_PIMREG
)
797 flags
|= RT6_LOOKUP_F_IFACE
;
799 skb_dst_set(skb
, fib6_rule_lookup(net
, &fl
, flags
, ip6_pol_route_input
));
802 static struct rt6_info
*ip6_pol_route_output(struct net
*net
, struct fib6_table
*table
,
803 struct flowi
*fl
, int flags
)
805 return ip6_pol_route(net
, table
, fl
->oif
, fl
, flags
);
808 struct dst_entry
* ip6_route_output(struct net
*net
, struct sock
*sk
,
813 if ((sk
&& sk
->sk_bound_dev_if
) || rt6_need_strict(&fl
->fl6_dst
))
814 flags
|= RT6_LOOKUP_F_IFACE
;
816 if (!ipv6_addr_any(&fl
->fl6_src
))
817 flags
|= RT6_LOOKUP_F_HAS_SADDR
;
819 flags
|= rt6_srcprefs2flags(inet6_sk(sk
)->srcprefs
);
821 return fib6_rule_lookup(net
, fl
, flags
, ip6_pol_route_output
);
824 EXPORT_SYMBOL(ip6_route_output
);
826 int ip6_dst_blackhole(struct sock
*sk
, struct dst_entry
**dstp
, struct flowi
*fl
)
828 struct rt6_info
*ort
= (struct rt6_info
*) *dstp
;
829 struct rt6_info
*rt
= (struct rt6_info
*)
830 dst_alloc(&ip6_dst_blackhole_ops
);
831 struct dst_entry
*new = NULL
;
836 atomic_set(&new->__refcnt
, 1);
838 new->input
= dst_discard
;
839 new->output
= dst_discard
;
841 dst_copy_metrics(new, &ort
->dst
);
842 new->dev
= ort
->dst
.dev
;
845 rt
->rt6i_idev
= ort
->rt6i_idev
;
847 in6_dev_hold(rt
->rt6i_idev
);
848 rt
->rt6i_expires
= 0;
850 ipv6_addr_copy(&rt
->rt6i_gateway
, &ort
->rt6i_gateway
);
851 rt
->rt6i_flags
= ort
->rt6i_flags
& ~RTF_EXPIRES
;
854 memcpy(&rt
->rt6i_dst
, &ort
->rt6i_dst
, sizeof(struct rt6key
));
855 #ifdef CONFIG_IPV6_SUBTREES
856 memcpy(&rt
->rt6i_src
, &ort
->rt6i_src
, sizeof(struct rt6key
));
864 return new ? 0 : -ENOMEM
;
866 EXPORT_SYMBOL_GPL(ip6_dst_blackhole
);
869 * Destination cache support functions
872 static struct dst_entry
*ip6_dst_check(struct dst_entry
*dst
, u32 cookie
)
876 rt
= (struct rt6_info
*) dst
;
878 if (rt
->rt6i_node
&& (rt
->rt6i_node
->fn_sernum
== cookie
))
884 static struct dst_entry
*ip6_negative_advice(struct dst_entry
*dst
)
886 struct rt6_info
*rt
= (struct rt6_info
*) dst
;
889 if (rt
->rt6i_flags
& RTF_CACHE
) {
890 if (rt6_check_expired(rt
)) {
902 static void ip6_link_failure(struct sk_buff
*skb
)
906 icmpv6_send(skb
, ICMPV6_DEST_UNREACH
, ICMPV6_ADDR_UNREACH
, 0);
908 rt
= (struct rt6_info
*) skb_dst(skb
);
910 if (rt
->rt6i_flags
&RTF_CACHE
) {
911 dst_set_expires(&rt
->dst
, 0);
912 rt
->rt6i_flags
|= RTF_EXPIRES
;
913 } else if (rt
->rt6i_node
&& (rt
->rt6i_flags
& RTF_DEFAULT
))
914 rt
->rt6i_node
->fn_sernum
= -1;
918 static void ip6_rt_update_pmtu(struct dst_entry
*dst
, u32 mtu
)
920 struct rt6_info
*rt6
= (struct rt6_info
*)dst
;
922 if (mtu
< dst_mtu(dst
) && rt6
->rt6i_dst
.plen
== 128) {
923 rt6
->rt6i_flags
|= RTF_MODIFIED
;
924 if (mtu
< IPV6_MIN_MTU
) {
925 u32 features
= dst_metric(dst
, RTAX_FEATURES
);
927 features
|= RTAX_FEATURE_ALLFRAG
;
928 dst_metric_set(dst
, RTAX_FEATURES
, features
);
930 dst_metric_set(dst
, RTAX_MTU
, mtu
);
931 call_netevent_notifiers(NETEVENT_PMTU_UPDATE
, dst
);
935 static unsigned int ip6_default_advmss(const struct dst_entry
*dst
)
937 struct net_device
*dev
= dst
->dev
;
938 unsigned int mtu
= dst_mtu(dst
);
939 struct net
*net
= dev_net(dev
);
941 mtu
-= sizeof(struct ipv6hdr
) + sizeof(struct tcphdr
);
943 if (mtu
< net
->ipv6
.sysctl
.ip6_rt_min_advmss
)
944 mtu
= net
->ipv6
.sysctl
.ip6_rt_min_advmss
;
947 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
948 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
949 * IPV6_MAXPLEN is also valid and means: "any MSS,
950 * rely only on pmtu discovery"
952 if (mtu
> IPV6_MAXPLEN
- sizeof(struct tcphdr
))
957 static unsigned int ip6_default_mtu(const struct dst_entry
*dst
)
959 unsigned int mtu
= IPV6_MIN_MTU
;
960 struct inet6_dev
*idev
;
963 idev
= __in6_dev_get(dst
->dev
);
965 mtu
= idev
->cnf
.mtu6
;
971 static struct dst_entry
*icmp6_dst_gc_list
;
972 static DEFINE_SPINLOCK(icmp6_dst_lock
);
974 struct dst_entry
*icmp6_dst_alloc(struct net_device
*dev
,
975 struct neighbour
*neigh
,
976 const struct in6_addr
*addr
)
979 struct inet6_dev
*idev
= in6_dev_get(dev
);
980 struct net
*net
= dev_net(dev
);
982 if (unlikely(idev
== NULL
))
985 rt
= ip6_dst_alloc(&net
->ipv6
.ip6_dst_ops
);
986 if (unlikely(rt
== NULL
)) {
995 neigh
= ndisc_get_neigh(dev
, addr
);
1001 rt
->rt6i_idev
= idev
;
1002 rt
->rt6i_nexthop
= neigh
;
1003 atomic_set(&rt
->dst
.__refcnt
, 1);
1004 dst_metric_set(&rt
->dst
, RTAX_HOPLIMIT
, 255);
1005 rt
->dst
.output
= ip6_output
;
1007 #if 0 /* there's no chance to use these for ndisc */
1008 rt
->dst
.flags
= ipv6_addr_type(addr
) & IPV6_ADDR_UNICAST
1011 ipv6_addr_copy(&rt
->rt6i_dst
.addr
, addr
);
1012 rt
->rt6i_dst
.plen
= 128;
1015 spin_lock_bh(&icmp6_dst_lock
);
1016 rt
->dst
.next
= icmp6_dst_gc_list
;
1017 icmp6_dst_gc_list
= &rt
->dst
;
1018 spin_unlock_bh(&icmp6_dst_lock
);
1020 fib6_force_start_gc(net
);
1026 int icmp6_dst_gc(void)
1028 struct dst_entry
*dst
, *next
, **pprev
;
1033 spin_lock_bh(&icmp6_dst_lock
);
1034 pprev
= &icmp6_dst_gc_list
;
1036 while ((dst
= *pprev
) != NULL
) {
1037 if (!atomic_read(&dst
->__refcnt
)) {
1046 spin_unlock_bh(&icmp6_dst_lock
);
1051 static void icmp6_clean_all(int (*func
)(struct rt6_info
*rt
, void *arg
),
1054 struct dst_entry
*dst
, **pprev
;
1056 spin_lock_bh(&icmp6_dst_lock
);
1057 pprev
= &icmp6_dst_gc_list
;
1058 while ((dst
= *pprev
) != NULL
) {
1059 struct rt6_info
*rt
= (struct rt6_info
*) dst
;
1060 if (func(rt
, arg
)) {
1067 spin_unlock_bh(&icmp6_dst_lock
);
1070 static int ip6_dst_gc(struct dst_ops
*ops
)
1072 unsigned long now
= jiffies
;
1073 struct net
*net
= container_of(ops
, struct net
, ipv6
.ip6_dst_ops
);
1074 int rt_min_interval
= net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
;
1075 int rt_max_size
= net
->ipv6
.sysctl
.ip6_rt_max_size
;
1076 int rt_elasticity
= net
->ipv6
.sysctl
.ip6_rt_gc_elasticity
;
1077 int rt_gc_timeout
= net
->ipv6
.sysctl
.ip6_rt_gc_timeout
;
1078 unsigned long rt_last_gc
= net
->ipv6
.ip6_rt_last_gc
;
1081 entries
= dst_entries_get_fast(ops
);
1082 if (time_after(rt_last_gc
+ rt_min_interval
, now
) &&
1083 entries
<= rt_max_size
)
1086 net
->ipv6
.ip6_rt_gc_expire
++;
1087 fib6_run_gc(net
->ipv6
.ip6_rt_gc_expire
, net
);
1088 net
->ipv6
.ip6_rt_last_gc
= now
;
1089 entries
= dst_entries_get_slow(ops
);
1090 if (entries
< ops
->gc_thresh
)
1091 net
->ipv6
.ip6_rt_gc_expire
= rt_gc_timeout
>>1;
1093 net
->ipv6
.ip6_rt_gc_expire
-= net
->ipv6
.ip6_rt_gc_expire
>>rt_elasticity
;
1094 return entries
> rt_max_size
;
1097 /* Clean host part of a prefix. Not necessary in radix tree,
1098 but results in cleaner routing tables.
1100 Remove it only when all the things will work!
1103 int ip6_dst_hoplimit(struct dst_entry
*dst
)
1105 int hoplimit
= dst_metric_raw(dst
, RTAX_HOPLIMIT
);
1106 if (hoplimit
== 0) {
1107 struct net_device
*dev
= dst
->dev
;
1108 struct inet6_dev
*idev
;
1111 idev
= __in6_dev_get(dev
);
1113 hoplimit
= idev
->cnf
.hop_limit
;
1115 hoplimit
= dev_net(dev
)->ipv6
.devconf_all
->hop_limit
;
1120 EXPORT_SYMBOL(ip6_dst_hoplimit
);
1126 int ip6_route_add(struct fib6_config
*cfg
)
1129 struct net
*net
= cfg
->fc_nlinfo
.nl_net
;
1130 struct rt6_info
*rt
= NULL
;
1131 struct net_device
*dev
= NULL
;
1132 struct inet6_dev
*idev
= NULL
;
1133 struct fib6_table
*table
;
1136 if (cfg
->fc_dst_len
> 128 || cfg
->fc_src_len
> 128)
1138 #ifndef CONFIG_IPV6_SUBTREES
1139 if (cfg
->fc_src_len
)
1142 if (cfg
->fc_ifindex
) {
1144 dev
= dev_get_by_index(net
, cfg
->fc_ifindex
);
1147 idev
= in6_dev_get(dev
);
1152 if (cfg
->fc_metric
== 0)
1153 cfg
->fc_metric
= IP6_RT_PRIO_USER
;
1155 table
= fib6_new_table(net
, cfg
->fc_table
);
1156 if (table
== NULL
) {
1161 rt
= ip6_dst_alloc(&net
->ipv6
.ip6_dst_ops
);
1168 rt
->dst
.obsolete
= -1;
1169 rt
->rt6i_expires
= (cfg
->fc_flags
& RTF_EXPIRES
) ?
1170 jiffies
+ clock_t_to_jiffies(cfg
->fc_expires
) :
1173 if (cfg
->fc_protocol
== RTPROT_UNSPEC
)
1174 cfg
->fc_protocol
= RTPROT_BOOT
;
1175 rt
->rt6i_protocol
= cfg
->fc_protocol
;
1177 addr_type
= ipv6_addr_type(&cfg
->fc_dst
);
1179 if (addr_type
& IPV6_ADDR_MULTICAST
)
1180 rt
->dst
.input
= ip6_mc_input
;
1181 else if (cfg
->fc_flags
& RTF_LOCAL
)
1182 rt
->dst
.input
= ip6_input
;
1184 rt
->dst
.input
= ip6_forward
;
1186 rt
->dst
.output
= ip6_output
;
1188 ipv6_addr_prefix(&rt
->rt6i_dst
.addr
, &cfg
->fc_dst
, cfg
->fc_dst_len
);
1189 rt
->rt6i_dst
.plen
= cfg
->fc_dst_len
;
1190 if (rt
->rt6i_dst
.plen
== 128)
1191 rt
->dst
.flags
= DST_HOST
;
1193 #ifdef CONFIG_IPV6_SUBTREES
1194 ipv6_addr_prefix(&rt
->rt6i_src
.addr
, &cfg
->fc_src
, cfg
->fc_src_len
);
1195 rt
->rt6i_src
.plen
= cfg
->fc_src_len
;
1198 rt
->rt6i_metric
= cfg
->fc_metric
;
1200 /* We cannot add true routes via loopback here,
1201 they would result in kernel looping; promote them to reject routes
1203 if ((cfg
->fc_flags
& RTF_REJECT
) ||
1204 (dev
&& (dev
->flags
&IFF_LOOPBACK
) && !(addr_type
&IPV6_ADDR_LOOPBACK
)
1205 && !(cfg
->fc_flags
&RTF_LOCAL
))) {
1206 /* hold loopback dev/idev if we haven't done so. */
1207 if (dev
!= net
->loopback_dev
) {
1212 dev
= net
->loopback_dev
;
1214 idev
= in6_dev_get(dev
);
1220 rt
->dst
.output
= ip6_pkt_discard_out
;
1221 rt
->dst
.input
= ip6_pkt_discard
;
1222 rt
->dst
.error
= -ENETUNREACH
;
1223 rt
->rt6i_flags
= RTF_REJECT
|RTF_NONEXTHOP
;
1227 if (cfg
->fc_flags
& RTF_GATEWAY
) {
1228 struct in6_addr
*gw_addr
;
1231 gw_addr
= &cfg
->fc_gateway
;
1232 ipv6_addr_copy(&rt
->rt6i_gateway
, gw_addr
);
1233 gwa_type
= ipv6_addr_type(gw_addr
);
1235 if (gwa_type
!= (IPV6_ADDR_LINKLOCAL
|IPV6_ADDR_UNICAST
)) {
1236 struct rt6_info
*grt
;
1238 /* IPv6 strictly inhibits using not link-local
1239 addresses as nexthop address.
1240 Otherwise, router will not able to send redirects.
1241 It is very good, but in some (rare!) circumstances
1242 (SIT, PtP, NBMA NOARP links) it is handy to allow
1243 some exceptions. --ANK
1246 if (!(gwa_type
&IPV6_ADDR_UNICAST
))
1249 grt
= rt6_lookup(net
, gw_addr
, NULL
, cfg
->fc_ifindex
, 1);
1251 err
= -EHOSTUNREACH
;
1255 if (dev
!= grt
->rt6i_dev
) {
1256 dst_release(&grt
->dst
);
1260 dev
= grt
->rt6i_dev
;
1261 idev
= grt
->rt6i_idev
;
1263 in6_dev_hold(grt
->rt6i_idev
);
1265 if (!(grt
->rt6i_flags
&RTF_GATEWAY
))
1267 dst_release(&grt
->dst
);
1273 if (dev
== NULL
|| (dev
->flags
&IFF_LOOPBACK
))
1281 if (cfg
->fc_flags
& (RTF_GATEWAY
| RTF_NONEXTHOP
)) {
1282 rt
->rt6i_nexthop
= __neigh_lookup_errno(&nd_tbl
, &rt
->rt6i_gateway
, dev
);
1283 if (IS_ERR(rt
->rt6i_nexthop
)) {
1284 err
= PTR_ERR(rt
->rt6i_nexthop
);
1285 rt
->rt6i_nexthop
= NULL
;
1290 rt
->rt6i_flags
= cfg
->fc_flags
;
1297 nla_for_each_attr(nla
, cfg
->fc_mx
, cfg
->fc_mx_len
, remaining
) {
1298 int type
= nla_type(nla
);
1301 if (type
> RTAX_MAX
) {
1306 dst_metric_set(&rt
->dst
, type
, nla_get_u32(nla
));
1312 rt
->rt6i_idev
= idev
;
1313 rt
->rt6i_table
= table
;
1315 cfg
->fc_nlinfo
.nl_net
= dev_net(dev
);
1317 return __ip6_ins_rt(rt
, &cfg
->fc_nlinfo
);
1329 static int __ip6_del_rt(struct rt6_info
*rt
, struct nl_info
*info
)
1332 struct fib6_table
*table
;
1333 struct net
*net
= dev_net(rt
->rt6i_dev
);
1335 if (rt
== net
->ipv6
.ip6_null_entry
)
1338 table
= rt
->rt6i_table
;
1339 write_lock_bh(&table
->tb6_lock
);
1341 err
= fib6_del(rt
, info
);
1342 dst_release(&rt
->dst
);
1344 write_unlock_bh(&table
->tb6_lock
);
1349 int ip6_del_rt(struct rt6_info
*rt
)
1351 struct nl_info info
= {
1352 .nl_net
= dev_net(rt
->rt6i_dev
),
1354 return __ip6_del_rt(rt
, &info
);
1357 static int ip6_route_del(struct fib6_config
*cfg
)
1359 struct fib6_table
*table
;
1360 struct fib6_node
*fn
;
1361 struct rt6_info
*rt
;
1364 table
= fib6_get_table(cfg
->fc_nlinfo
.nl_net
, cfg
->fc_table
);
1368 read_lock_bh(&table
->tb6_lock
);
1370 fn
= fib6_locate(&table
->tb6_root
,
1371 &cfg
->fc_dst
, cfg
->fc_dst_len
,
1372 &cfg
->fc_src
, cfg
->fc_src_len
);
1375 for (rt
= fn
->leaf
; rt
; rt
= rt
->dst
.rt6_next
) {
1376 if (cfg
->fc_ifindex
&&
1377 (rt
->rt6i_dev
== NULL
||
1378 rt
->rt6i_dev
->ifindex
!= cfg
->fc_ifindex
))
1380 if (cfg
->fc_flags
& RTF_GATEWAY
&&
1381 !ipv6_addr_equal(&cfg
->fc_gateway
, &rt
->rt6i_gateway
))
1383 if (cfg
->fc_metric
&& cfg
->fc_metric
!= rt
->rt6i_metric
)
1386 read_unlock_bh(&table
->tb6_lock
);
1388 return __ip6_del_rt(rt
, &cfg
->fc_nlinfo
);
1391 read_unlock_bh(&table
->tb6_lock
);
1399 struct ip6rd_flowi
{
1401 struct in6_addr gateway
;
1404 static struct rt6_info
*__ip6_route_redirect(struct net
*net
,
1405 struct fib6_table
*table
,
1409 struct ip6rd_flowi
*rdfl
= (struct ip6rd_flowi
*)fl
;
1410 struct rt6_info
*rt
;
1411 struct fib6_node
*fn
;
1414 * Get the "current" route for this destination and
1415 * check if the redirect has come from the appropriate router.
1417 * RFC 2461 specifies that redirects should only be
1418 * accepted if they come from the nexthop to the target.
1419 * Due to the way the routes are chosen, this notion
1420 * is a bit fuzzy and one might need to check all possible
1424 read_lock_bh(&table
->tb6_lock
);
1425 fn
= fib6_lookup(&table
->tb6_root
, &fl
->fl6_dst
, &fl
->fl6_src
);
1427 for (rt
= fn
->leaf
; rt
; rt
= rt
->dst
.rt6_next
) {
1429 * Current route is on-link; redirect is always invalid.
1431 * Seems, previous statement is not true. It could
1432 * be node, which looks for us as on-link (f.e. proxy ndisc)
1433 * But then router serving it might decide, that we should
1434 * know truth 8)8) --ANK (980726).
1436 if (rt6_check_expired(rt
))
1438 if (!(rt
->rt6i_flags
& RTF_GATEWAY
))
1440 if (fl
->oif
!= rt
->rt6i_dev
->ifindex
)
1442 if (!ipv6_addr_equal(&rdfl
->gateway
, &rt
->rt6i_gateway
))
1448 rt
= net
->ipv6
.ip6_null_entry
;
1449 BACKTRACK(net
, &fl
->fl6_src
);
1453 read_unlock_bh(&table
->tb6_lock
);
1458 static struct rt6_info
*ip6_route_redirect(struct in6_addr
*dest
,
1459 struct in6_addr
*src
,
1460 struct in6_addr
*gateway
,
1461 struct net_device
*dev
)
1463 int flags
= RT6_LOOKUP_F_HAS_SADDR
;
1464 struct net
*net
= dev_net(dev
);
1465 struct ip6rd_flowi rdfl
= {
1467 .oif
= dev
->ifindex
,
1473 ipv6_addr_copy(&rdfl
.gateway
, gateway
);
1475 if (rt6_need_strict(dest
))
1476 flags
|= RT6_LOOKUP_F_IFACE
;
1478 return (struct rt6_info
*)fib6_rule_lookup(net
, (struct flowi
*)&rdfl
,
1479 flags
, __ip6_route_redirect
);
1482 void rt6_redirect(struct in6_addr
*dest
, struct in6_addr
*src
,
1483 struct in6_addr
*saddr
,
1484 struct neighbour
*neigh
, u8
*lladdr
, int on_link
)
1486 struct rt6_info
*rt
, *nrt
= NULL
;
1487 struct netevent_redirect netevent
;
1488 struct net
*net
= dev_net(neigh
->dev
);
1490 rt
= ip6_route_redirect(dest
, src
, saddr
, neigh
->dev
);
1492 if (rt
== net
->ipv6
.ip6_null_entry
) {
1493 if (net_ratelimit())
1494 printk(KERN_DEBUG
"rt6_redirect: source isn't a valid nexthop "
1495 "for redirect target\n");
1500 * We have finally decided to accept it.
1503 neigh_update(neigh
, lladdr
, NUD_STALE
,
1504 NEIGH_UPDATE_F_WEAK_OVERRIDE
|
1505 NEIGH_UPDATE_F_OVERRIDE
|
1506 (on_link
? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER
|
1507 NEIGH_UPDATE_F_ISROUTER
))
1511 * Redirect received -> path was valid.
1512 * Look, redirects are sent only in response to data packets,
1513 * so that this nexthop apparently is reachable. --ANK
1515 dst_confirm(&rt
->dst
);
1517 /* Duplicate redirect: silently ignore. */
1518 if (neigh
== rt
->dst
.neighbour
)
1521 nrt
= ip6_rt_copy(rt
);
1525 nrt
->rt6i_flags
= RTF_GATEWAY
|RTF_UP
|RTF_DYNAMIC
|RTF_CACHE
;
1527 nrt
->rt6i_flags
&= ~RTF_GATEWAY
;
1529 ipv6_addr_copy(&nrt
->rt6i_dst
.addr
, dest
);
1530 nrt
->rt6i_dst
.plen
= 128;
1531 nrt
->dst
.flags
|= DST_HOST
;
1533 ipv6_addr_copy(&nrt
->rt6i_gateway
, (struct in6_addr
*)neigh
->primary_key
);
1534 nrt
->rt6i_nexthop
= neigh_clone(neigh
);
1536 if (ip6_ins_rt(nrt
))
1539 netevent
.old
= &rt
->dst
;
1540 netevent
.new = &nrt
->dst
;
1541 call_netevent_notifiers(NETEVENT_REDIRECT
, &netevent
);
1543 if (rt
->rt6i_flags
&RTF_CACHE
) {
1549 dst_release(&rt
->dst
);
1553 * Handle ICMP "packet too big" messages
1554 * i.e. Path MTU discovery
1557 static void rt6_do_pmtu_disc(struct in6_addr
*daddr
, struct in6_addr
*saddr
,
1558 struct net
*net
, u32 pmtu
, int ifindex
)
1560 struct rt6_info
*rt
, *nrt
;
1563 rt
= rt6_lookup(net
, daddr
, saddr
, ifindex
, 0);
1567 if (rt6_check_expired(rt
)) {
1572 if (pmtu
>= dst_mtu(&rt
->dst
))
1575 if (pmtu
< IPV6_MIN_MTU
) {
1577 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1578 * MTU (1280) and a fragment header should always be included
1579 * after a node receiving Too Big message reporting PMTU is
1580 * less than the IPv6 Minimum Link MTU.
1582 pmtu
= IPV6_MIN_MTU
;
1586 /* New mtu received -> path was valid.
1587 They are sent only in response to data packets,
1588 so that this nexthop apparently is reachable. --ANK
1590 dst_confirm(&rt
->dst
);
1592 /* Host route. If it is static, it would be better
1593 not to override it, but add new one, so that
1594 when cache entry will expire old pmtu
1595 would return automatically.
1597 if (rt
->rt6i_flags
& RTF_CACHE
) {
1598 dst_metric_set(&rt
->dst
, RTAX_MTU
, pmtu
);
1600 u32 features
= dst_metric(&rt
->dst
, RTAX_FEATURES
);
1601 features
|= RTAX_FEATURE_ALLFRAG
;
1602 dst_metric_set(&rt
->dst
, RTAX_FEATURES
, features
);
1604 dst_set_expires(&rt
->dst
, net
->ipv6
.sysctl
.ip6_rt_mtu_expires
);
1605 rt
->rt6i_flags
|= RTF_MODIFIED
|RTF_EXPIRES
;
1610 Two cases are possible:
1611 1. It is connected route. Action: COW
1612 2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1614 if (!rt
->rt6i_nexthop
&& !(rt
->rt6i_flags
& RTF_NONEXTHOP
))
1615 nrt
= rt6_alloc_cow(rt
, daddr
, saddr
);
1617 nrt
= rt6_alloc_clone(rt
, daddr
);
1620 dst_metric_set(&nrt
->dst
, RTAX_MTU
, pmtu
);
1622 u32 features
= dst_metric(&nrt
->dst
, RTAX_FEATURES
);
1623 features
|= RTAX_FEATURE_ALLFRAG
;
1624 dst_metric_set(&nrt
->dst
, RTAX_FEATURES
, features
);
1627 /* According to RFC 1981, detecting PMTU increase shouldn't be
1628 * happened within 5 mins, the recommended timer is 10 mins.
1629 * Here this route expiration time is set to ip6_rt_mtu_expires
1630 * which is 10 mins. After 10 mins the decreased pmtu is expired
1631 * and detecting PMTU increase will be automatically happened.
1633 dst_set_expires(&nrt
->dst
, net
->ipv6
.sysctl
.ip6_rt_mtu_expires
);
1634 nrt
->rt6i_flags
|= RTF_DYNAMIC
|RTF_EXPIRES
;
1639 dst_release(&rt
->dst
);
1642 void rt6_pmtu_discovery(struct in6_addr
*daddr
, struct in6_addr
*saddr
,
1643 struct net_device
*dev
, u32 pmtu
)
1645 struct net
*net
= dev_net(dev
);
1648 * RFC 1981 states that a node "MUST reduce the size of the packets it
1649 * is sending along the path" that caused the Packet Too Big message.
1650 * Since it's not possible in the general case to determine which
1651 * interface was used to send the original packet, we update the MTU
1652 * on the interface that will be used to send future packets. We also
1653 * update the MTU on the interface that received the Packet Too Big in
1654 * case the original packet was forced out that interface with
1655 * SO_BINDTODEVICE or similar. This is the next best thing to the
1656 * correct behaviour, which would be to update the MTU on all
1659 rt6_do_pmtu_disc(daddr
, saddr
, net
, pmtu
, 0);
1660 rt6_do_pmtu_disc(daddr
, saddr
, net
, pmtu
, dev
->ifindex
);
1664 * Misc support functions
1667 static struct rt6_info
* ip6_rt_copy(struct rt6_info
*ort
)
1669 struct net
*net
= dev_net(ort
->rt6i_dev
);
1670 struct rt6_info
*rt
= ip6_dst_alloc(&net
->ipv6
.ip6_dst_ops
);
1673 rt
->dst
.input
= ort
->dst
.input
;
1674 rt
->dst
.output
= ort
->dst
.output
;
1676 dst_copy_metrics(&rt
->dst
, &ort
->dst
);
1677 rt
->dst
.error
= ort
->dst
.error
;
1678 rt
->dst
.dev
= ort
->dst
.dev
;
1680 dev_hold(rt
->dst
.dev
);
1681 rt
->rt6i_idev
= ort
->rt6i_idev
;
1683 in6_dev_hold(rt
->rt6i_idev
);
1684 rt
->dst
.lastuse
= jiffies
;
1685 rt
->rt6i_expires
= 0;
1687 ipv6_addr_copy(&rt
->rt6i_gateway
, &ort
->rt6i_gateway
);
1688 rt
->rt6i_flags
= ort
->rt6i_flags
& ~RTF_EXPIRES
;
1689 rt
->rt6i_metric
= 0;
1691 memcpy(&rt
->rt6i_dst
, &ort
->rt6i_dst
, sizeof(struct rt6key
));
1692 #ifdef CONFIG_IPV6_SUBTREES
1693 memcpy(&rt
->rt6i_src
, &ort
->rt6i_src
, sizeof(struct rt6key
));
1695 rt
->rt6i_table
= ort
->rt6i_table
;
1700 #ifdef CONFIG_IPV6_ROUTE_INFO
1701 static struct rt6_info
*rt6_get_route_info(struct net
*net
,
1702 struct in6_addr
*prefix
, int prefixlen
,
1703 struct in6_addr
*gwaddr
, int ifindex
)
1705 struct fib6_node
*fn
;
1706 struct rt6_info
*rt
= NULL
;
1707 struct fib6_table
*table
;
1709 table
= fib6_get_table(net
, RT6_TABLE_INFO
);
1713 write_lock_bh(&table
->tb6_lock
);
1714 fn
= fib6_locate(&table
->tb6_root
, prefix
,prefixlen
, NULL
, 0);
1718 for (rt
= fn
->leaf
; rt
; rt
= rt
->dst
.rt6_next
) {
1719 if (rt
->rt6i_dev
->ifindex
!= ifindex
)
1721 if ((rt
->rt6i_flags
& (RTF_ROUTEINFO
|RTF_GATEWAY
)) != (RTF_ROUTEINFO
|RTF_GATEWAY
))
1723 if (!ipv6_addr_equal(&rt
->rt6i_gateway
, gwaddr
))
1729 write_unlock_bh(&table
->tb6_lock
);
1733 static struct rt6_info
*rt6_add_route_info(struct net
*net
,
1734 struct in6_addr
*prefix
, int prefixlen
,
1735 struct in6_addr
*gwaddr
, int ifindex
,
1738 struct fib6_config cfg
= {
1739 .fc_table
= RT6_TABLE_INFO
,
1740 .fc_metric
= IP6_RT_PRIO_USER
,
1741 .fc_ifindex
= ifindex
,
1742 .fc_dst_len
= prefixlen
,
1743 .fc_flags
= RTF_GATEWAY
| RTF_ADDRCONF
| RTF_ROUTEINFO
|
1744 RTF_UP
| RTF_PREF(pref
),
1746 .fc_nlinfo
.nlh
= NULL
,
1747 .fc_nlinfo
.nl_net
= net
,
1750 ipv6_addr_copy(&cfg
.fc_dst
, prefix
);
1751 ipv6_addr_copy(&cfg
.fc_gateway
, gwaddr
);
1753 /* We should treat it as a default route if prefix length is 0. */
1755 cfg
.fc_flags
|= RTF_DEFAULT
;
1757 ip6_route_add(&cfg
);
1759 return rt6_get_route_info(net
, prefix
, prefixlen
, gwaddr
, ifindex
);
1763 struct rt6_info
*rt6_get_dflt_router(struct in6_addr
*addr
, struct net_device
*dev
)
1765 struct rt6_info
*rt
;
1766 struct fib6_table
*table
;
1768 table
= fib6_get_table(dev_net(dev
), RT6_TABLE_DFLT
);
1772 write_lock_bh(&table
->tb6_lock
);
1773 for (rt
= table
->tb6_root
.leaf
; rt
; rt
=rt
->dst
.rt6_next
) {
1774 if (dev
== rt
->rt6i_dev
&&
1775 ((rt
->rt6i_flags
& (RTF_ADDRCONF
| RTF_DEFAULT
)) == (RTF_ADDRCONF
| RTF_DEFAULT
)) &&
1776 ipv6_addr_equal(&rt
->rt6i_gateway
, addr
))
1781 write_unlock_bh(&table
->tb6_lock
);
1785 struct rt6_info
*rt6_add_dflt_router(struct in6_addr
*gwaddr
,
1786 struct net_device
*dev
,
1789 struct fib6_config cfg
= {
1790 .fc_table
= RT6_TABLE_DFLT
,
1791 .fc_metric
= IP6_RT_PRIO_USER
,
1792 .fc_ifindex
= dev
->ifindex
,
1793 .fc_flags
= RTF_GATEWAY
| RTF_ADDRCONF
| RTF_DEFAULT
|
1794 RTF_UP
| RTF_EXPIRES
| RTF_PREF(pref
),
1796 .fc_nlinfo
.nlh
= NULL
,
1797 .fc_nlinfo
.nl_net
= dev_net(dev
),
1800 ipv6_addr_copy(&cfg
.fc_gateway
, gwaddr
);
1802 ip6_route_add(&cfg
);
1804 return rt6_get_dflt_router(gwaddr
, dev
);
1807 void rt6_purge_dflt_routers(struct net
*net
)
1809 struct rt6_info
*rt
;
1810 struct fib6_table
*table
;
1812 /* NOTE: Keep consistent with rt6_get_dflt_router */
1813 table
= fib6_get_table(net
, RT6_TABLE_DFLT
);
1818 read_lock_bh(&table
->tb6_lock
);
1819 for (rt
= table
->tb6_root
.leaf
; rt
; rt
= rt
->dst
.rt6_next
) {
1820 if (rt
->rt6i_flags
& (RTF_DEFAULT
| RTF_ADDRCONF
)) {
1822 read_unlock_bh(&table
->tb6_lock
);
1827 read_unlock_bh(&table
->tb6_lock
);
1830 static void rtmsg_to_fib6_config(struct net
*net
,
1831 struct in6_rtmsg
*rtmsg
,
1832 struct fib6_config
*cfg
)
1834 memset(cfg
, 0, sizeof(*cfg
));
1836 cfg
->fc_table
= RT6_TABLE_MAIN
;
1837 cfg
->fc_ifindex
= rtmsg
->rtmsg_ifindex
;
1838 cfg
->fc_metric
= rtmsg
->rtmsg_metric
;
1839 cfg
->fc_expires
= rtmsg
->rtmsg_info
;
1840 cfg
->fc_dst_len
= rtmsg
->rtmsg_dst_len
;
1841 cfg
->fc_src_len
= rtmsg
->rtmsg_src_len
;
1842 cfg
->fc_flags
= rtmsg
->rtmsg_flags
;
1844 cfg
->fc_nlinfo
.nl_net
= net
;
1846 ipv6_addr_copy(&cfg
->fc_dst
, &rtmsg
->rtmsg_dst
);
1847 ipv6_addr_copy(&cfg
->fc_src
, &rtmsg
->rtmsg_src
);
1848 ipv6_addr_copy(&cfg
->fc_gateway
, &rtmsg
->rtmsg_gateway
);
1851 int ipv6_route_ioctl(struct net
*net
, unsigned int cmd
, void __user
*arg
)
1853 struct fib6_config cfg
;
1854 struct in6_rtmsg rtmsg
;
1858 case SIOCADDRT
: /* Add a route */
1859 case SIOCDELRT
: /* Delete a route */
1860 if (!capable(CAP_NET_ADMIN
))
1862 err
= copy_from_user(&rtmsg
, arg
,
1863 sizeof(struct in6_rtmsg
));
1867 rtmsg_to_fib6_config(net
, &rtmsg
, &cfg
);
1872 err
= ip6_route_add(&cfg
);
1875 err
= ip6_route_del(&cfg
);
1889 * Drop the packet on the floor
1892 static int ip6_pkt_drop(struct sk_buff
*skb
, u8 code
, int ipstats_mib_noroutes
)
1895 struct dst_entry
*dst
= skb_dst(skb
);
1896 switch (ipstats_mib_noroutes
) {
1897 case IPSTATS_MIB_INNOROUTES
:
1898 type
= ipv6_addr_type(&ipv6_hdr(skb
)->daddr
);
1899 if (type
== IPV6_ADDR_ANY
) {
1900 IP6_INC_STATS(dev_net(dst
->dev
), ip6_dst_idev(dst
),
1901 IPSTATS_MIB_INADDRERRORS
);
1905 case IPSTATS_MIB_OUTNOROUTES
:
1906 IP6_INC_STATS(dev_net(dst
->dev
), ip6_dst_idev(dst
),
1907 ipstats_mib_noroutes
);
1910 icmpv6_send(skb
, ICMPV6_DEST_UNREACH
, code
, 0);
1915 static int ip6_pkt_discard(struct sk_buff
*skb
)
1917 return ip6_pkt_drop(skb
, ICMPV6_NOROUTE
, IPSTATS_MIB_INNOROUTES
);
1920 static int ip6_pkt_discard_out(struct sk_buff
*skb
)
1922 skb
->dev
= skb_dst(skb
)->dev
;
1923 return ip6_pkt_drop(skb
, ICMPV6_NOROUTE
, IPSTATS_MIB_OUTNOROUTES
);
1926 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1928 static int ip6_pkt_prohibit(struct sk_buff
*skb
)
1930 return ip6_pkt_drop(skb
, ICMPV6_ADM_PROHIBITED
, IPSTATS_MIB_INNOROUTES
);
1933 static int ip6_pkt_prohibit_out(struct sk_buff
*skb
)
1935 skb
->dev
= skb_dst(skb
)->dev
;
1936 return ip6_pkt_drop(skb
, ICMPV6_ADM_PROHIBITED
, IPSTATS_MIB_OUTNOROUTES
);
1942 * Allocate a dst for local (unicast / anycast) address.
1945 struct rt6_info
*addrconf_dst_alloc(struct inet6_dev
*idev
,
1946 const struct in6_addr
*addr
,
1949 struct net
*net
= dev_net(idev
->dev
);
1950 struct rt6_info
*rt
= ip6_dst_alloc(&net
->ipv6
.ip6_dst_ops
);
1951 struct neighbour
*neigh
;
1954 if (net_ratelimit())
1955 pr_warning("IPv6: Maximum number of routes reached,"
1956 " consider increasing route/max_size.\n");
1957 return ERR_PTR(-ENOMEM
);
1960 dev_hold(net
->loopback_dev
);
1963 rt
->dst
.flags
= DST_HOST
;
1964 rt
->dst
.input
= ip6_input
;
1965 rt
->dst
.output
= ip6_output
;
1966 rt
->rt6i_dev
= net
->loopback_dev
;
1967 rt
->rt6i_idev
= idev
;
1968 dst_metric_set(&rt
->dst
, RTAX_HOPLIMIT
, -1);
1969 rt
->dst
.obsolete
= -1;
1971 rt
->rt6i_flags
= RTF_UP
| RTF_NONEXTHOP
;
1973 rt
->rt6i_flags
|= RTF_ANYCAST
;
1975 rt
->rt6i_flags
|= RTF_LOCAL
;
1976 neigh
= ndisc_get_neigh(rt
->rt6i_dev
, &rt
->rt6i_gateway
);
1977 if (IS_ERR(neigh
)) {
1980 /* We are casting this because that is the return
1981 * value type. But an errno encoded pointer is the
1982 * same regardless of the underlying pointer type,
1983 * and that's what we are returning. So this is OK.
1985 return (struct rt6_info
*) neigh
;
1987 rt
->rt6i_nexthop
= neigh
;
1989 ipv6_addr_copy(&rt
->rt6i_dst
.addr
, addr
);
1990 rt
->rt6i_dst
.plen
= 128;
1991 rt
->rt6i_table
= fib6_get_table(net
, RT6_TABLE_LOCAL
);
1993 atomic_set(&rt
->dst
.__refcnt
, 1);
/* Callback argument for fib6_ifdown: device going away plus its netns. */
struct arg_dev_net {
	struct net_device *dev;
	struct net *net;
};
2003 static int fib6_ifdown(struct rt6_info
*rt
, void *arg
)
2005 const struct arg_dev_net
*adn
= arg
;
2006 const struct net_device
*dev
= adn
->dev
;
2008 if ((rt
->rt6i_dev
== dev
|| dev
== NULL
) &&
2009 rt
!= adn
->net
->ipv6
.ip6_null_entry
) {
2010 RT6_TRACE("deleted by ifdown %p\n", rt
);
2016 void rt6_ifdown(struct net
*net
, struct net_device
*dev
)
2018 struct arg_dev_net adn
= {
2023 fib6_clean_all(net
, fib6_ifdown
, 0, &adn
);
2024 icmp6_clean_all(fib6_ifdown
, &adn
);
/* Callback argument for rt6_mtu_change_route: device and its new MTU. */
struct rt6_mtu_change_arg
{
	struct net_device *dev;
	unsigned mtu;
};
2033 static int rt6_mtu_change_route(struct rt6_info
*rt
, void *p_arg
)
2035 struct rt6_mtu_change_arg
*arg
= (struct rt6_mtu_change_arg
*) p_arg
;
2036 struct inet6_dev
*idev
;
2038 /* In IPv6 pmtu discovery is not optional,
2039 so that RTAX_MTU lock cannot disable it.
2040 We still use this lock to block changes
2041 caused by addrconf/ndisc.
2044 idev
= __in6_dev_get(arg
->dev
);
2048 /* For administrative MTU increase, there is no way to discover
2049 IPv6 PMTU increase, so PMTU increase should be updated here.
2050 Since RFC 1981 doesn't include administrative MTU increase
2051 update PMTU increase is a MUST. (i.e. jumbo frame)
2054 If new MTU is less than route PMTU, this new MTU will be the
2055 lowest MTU in the path, update the route PMTU to reflect PMTU
2056 decreases; if new MTU is greater than route PMTU, and the
2057 old MTU is the lowest MTU in the path, update the route PMTU
2058 to reflect the increase. In this case if the other nodes' MTU
2059 also have the lowest MTU, TOO BIG MESSAGE will be lead to
2062 if (rt
->rt6i_dev
== arg
->dev
&&
2063 !dst_metric_locked(&rt
->dst
, RTAX_MTU
) &&
2064 (dst_mtu(&rt
->dst
) >= arg
->mtu
||
2065 (dst_mtu(&rt
->dst
) < arg
->mtu
&&
2066 dst_mtu(&rt
->dst
) == idev
->cnf
.mtu6
))) {
2067 dst_metric_set(&rt
->dst
, RTAX_MTU
, arg
->mtu
);
2072 void rt6_mtu_change(struct net_device
*dev
, unsigned mtu
)
2074 struct rt6_mtu_change_arg arg
= {
2079 fib6_clean_all(dev_net(dev
), rt6_mtu_change_route
, 0, &arg
);
2082 static const struct nla_policy rtm_ipv6_policy
[RTA_MAX
+1] = {
2083 [RTA_GATEWAY
] = { .len
= sizeof(struct in6_addr
) },
2084 [RTA_OIF
] = { .type
= NLA_U32
},
2085 [RTA_IIF
] = { .type
= NLA_U32
},
2086 [RTA_PRIORITY
] = { .type
= NLA_U32
},
2087 [RTA_METRICS
] = { .type
= NLA_NESTED
},
2090 static int rtm_to_fib6_config(struct sk_buff
*skb
, struct nlmsghdr
*nlh
,
2091 struct fib6_config
*cfg
)
2094 struct nlattr
*tb
[RTA_MAX
+1];
2097 err
= nlmsg_parse(nlh
, sizeof(*rtm
), tb
, RTA_MAX
, rtm_ipv6_policy
);
2102 rtm
= nlmsg_data(nlh
);
2103 memset(cfg
, 0, sizeof(*cfg
));
2105 cfg
->fc_table
= rtm
->rtm_table
;
2106 cfg
->fc_dst_len
= rtm
->rtm_dst_len
;
2107 cfg
->fc_src_len
= rtm
->rtm_src_len
;
2108 cfg
->fc_flags
= RTF_UP
;
2109 cfg
->fc_protocol
= rtm
->rtm_protocol
;
2111 if (rtm
->rtm_type
== RTN_UNREACHABLE
)
2112 cfg
->fc_flags
|= RTF_REJECT
;
2114 if (rtm
->rtm_type
== RTN_LOCAL
)
2115 cfg
->fc_flags
|= RTF_LOCAL
;
2117 cfg
->fc_nlinfo
.pid
= NETLINK_CB(skb
).pid
;
2118 cfg
->fc_nlinfo
.nlh
= nlh
;
2119 cfg
->fc_nlinfo
.nl_net
= sock_net(skb
->sk
);
2121 if (tb
[RTA_GATEWAY
]) {
2122 nla_memcpy(&cfg
->fc_gateway
, tb
[RTA_GATEWAY
], 16);
2123 cfg
->fc_flags
|= RTF_GATEWAY
;
2127 int plen
= (rtm
->rtm_dst_len
+ 7) >> 3;
2129 if (nla_len(tb
[RTA_DST
]) < plen
)
2132 nla_memcpy(&cfg
->fc_dst
, tb
[RTA_DST
], plen
);
2136 int plen
= (rtm
->rtm_src_len
+ 7) >> 3;
2138 if (nla_len(tb
[RTA_SRC
]) < plen
)
2141 nla_memcpy(&cfg
->fc_src
, tb
[RTA_SRC
], plen
);
2145 cfg
->fc_ifindex
= nla_get_u32(tb
[RTA_OIF
]);
2147 if (tb
[RTA_PRIORITY
])
2148 cfg
->fc_metric
= nla_get_u32(tb
[RTA_PRIORITY
]);
2150 if (tb
[RTA_METRICS
]) {
2151 cfg
->fc_mx
= nla_data(tb
[RTA_METRICS
]);
2152 cfg
->fc_mx_len
= nla_len(tb
[RTA_METRICS
]);
2156 cfg
->fc_table
= nla_get_u32(tb
[RTA_TABLE
]);
2163 static int inet6_rtm_delroute(struct sk_buff
*skb
, struct nlmsghdr
* nlh
, void *arg
)
2165 struct fib6_config cfg
;
2168 err
= rtm_to_fib6_config(skb
, nlh
, &cfg
);
2172 return ip6_route_del(&cfg
);
2175 static int inet6_rtm_newroute(struct sk_buff
*skb
, struct nlmsghdr
* nlh
, void *arg
)
2177 struct fib6_config cfg
;
2180 err
= rtm_to_fib6_config(skb
, nlh
, &cfg
);
2184 return ip6_route_add(&cfg
);
2187 static inline size_t rt6_nlmsg_size(void)
2189 return NLMSG_ALIGN(sizeof(struct rtmsg
))
2190 + nla_total_size(16) /* RTA_SRC */
2191 + nla_total_size(16) /* RTA_DST */
2192 + nla_total_size(16) /* RTA_GATEWAY */
2193 + nla_total_size(16) /* RTA_PREFSRC */
2194 + nla_total_size(4) /* RTA_TABLE */
2195 + nla_total_size(4) /* RTA_IIF */
2196 + nla_total_size(4) /* RTA_OIF */
2197 + nla_total_size(4) /* RTA_PRIORITY */
2198 + RTAX_MAX
* nla_total_size(4) /* RTA_METRICS */
2199 + nla_total_size(sizeof(struct rta_cacheinfo
));
2202 static int rt6_fill_node(struct net
*net
,
2203 struct sk_buff
*skb
, struct rt6_info
*rt
,
2204 struct in6_addr
*dst
, struct in6_addr
*src
,
2205 int iif
, int type
, u32 pid
, u32 seq
,
2206 int prefix
, int nowait
, unsigned int flags
)
2209 struct nlmsghdr
*nlh
;
2213 if (prefix
) { /* user wants prefix routes only */
2214 if (!(rt
->rt6i_flags
& RTF_PREFIX_RT
)) {
2215 /* success since this is not a prefix route */
2220 nlh
= nlmsg_put(skb
, pid
, seq
, type
, sizeof(*rtm
), flags
);
2224 rtm
= nlmsg_data(nlh
);
2225 rtm
->rtm_family
= AF_INET6
;
2226 rtm
->rtm_dst_len
= rt
->rt6i_dst
.plen
;
2227 rtm
->rtm_src_len
= rt
->rt6i_src
.plen
;
2230 table
= rt
->rt6i_table
->tb6_id
;
2232 table
= RT6_TABLE_UNSPEC
;
2233 rtm
->rtm_table
= table
;
2234 NLA_PUT_U32(skb
, RTA_TABLE
, table
);
2235 if (rt
->rt6i_flags
&RTF_REJECT
)
2236 rtm
->rtm_type
= RTN_UNREACHABLE
;
2237 else if (rt
->rt6i_flags
&RTF_LOCAL
)
2238 rtm
->rtm_type
= RTN_LOCAL
;
2239 else if (rt
->rt6i_dev
&& (rt
->rt6i_dev
->flags
&IFF_LOOPBACK
))
2240 rtm
->rtm_type
= RTN_LOCAL
;
2242 rtm
->rtm_type
= RTN_UNICAST
;
2244 rtm
->rtm_scope
= RT_SCOPE_UNIVERSE
;
2245 rtm
->rtm_protocol
= rt
->rt6i_protocol
;
2246 if (rt
->rt6i_flags
&RTF_DYNAMIC
)
2247 rtm
->rtm_protocol
= RTPROT_REDIRECT
;
2248 else if (rt
->rt6i_flags
& RTF_ADDRCONF
)
2249 rtm
->rtm_protocol
= RTPROT_KERNEL
;
2250 else if (rt
->rt6i_flags
&RTF_DEFAULT
)
2251 rtm
->rtm_protocol
= RTPROT_RA
;
2253 if (rt
->rt6i_flags
&RTF_CACHE
)
2254 rtm
->rtm_flags
|= RTM_F_CLONED
;
2257 NLA_PUT(skb
, RTA_DST
, 16, dst
);
2258 rtm
->rtm_dst_len
= 128;
2259 } else if (rtm
->rtm_dst_len
)
2260 NLA_PUT(skb
, RTA_DST
, 16, &rt
->rt6i_dst
.addr
);
2261 #ifdef CONFIG_IPV6_SUBTREES
2263 NLA_PUT(skb
, RTA_SRC
, 16, src
);
2264 rtm
->rtm_src_len
= 128;
2265 } else if (rtm
->rtm_src_len
)
2266 NLA_PUT(skb
, RTA_SRC
, 16, &rt
->rt6i_src
.addr
);
2269 #ifdef CONFIG_IPV6_MROUTE
2270 if (ipv6_addr_is_multicast(&rt
->rt6i_dst
.addr
)) {
2271 int err
= ip6mr_get_route(net
, skb
, rtm
, nowait
);
2276 goto nla_put_failure
;
2278 if (err
== -EMSGSIZE
)
2279 goto nla_put_failure
;
2284 NLA_PUT_U32(skb
, RTA_IIF
, iif
);
2286 struct inet6_dev
*idev
= ip6_dst_idev(&rt
->dst
);
2287 struct in6_addr saddr_buf
;
2288 if (ipv6_dev_get_saddr(net
, idev
? idev
->dev
: NULL
,
2289 dst
, 0, &saddr_buf
) == 0)
2290 NLA_PUT(skb
, RTA_PREFSRC
, 16, &saddr_buf
);
2293 if (rtnetlink_put_metrics(skb
, dst_metrics_ptr(&rt
->dst
)) < 0)
2294 goto nla_put_failure
;
2296 if (rt
->dst
.neighbour
)
2297 NLA_PUT(skb
, RTA_GATEWAY
, 16, &rt
->dst
.neighbour
->primary_key
);
2300 NLA_PUT_U32(skb
, RTA_OIF
, rt
->rt6i_dev
->ifindex
);
2302 NLA_PUT_U32(skb
, RTA_PRIORITY
, rt
->rt6i_metric
);
2304 if (!(rt
->rt6i_flags
& RTF_EXPIRES
))
2306 else if (rt
->rt6i_expires
- jiffies
< INT_MAX
)
2307 expires
= rt
->rt6i_expires
- jiffies
;
2311 if (rtnl_put_cacheinfo(skb
, &rt
->dst
, 0, 0, 0,
2312 expires
, rt
->dst
.error
) < 0)
2313 goto nla_put_failure
;
2315 return nlmsg_end(skb
, nlh
);
2318 nlmsg_cancel(skb
, nlh
);
2322 int rt6_dump_route(struct rt6_info
*rt
, void *p_arg
)
2324 struct rt6_rtnl_dump_arg
*arg
= (struct rt6_rtnl_dump_arg
*) p_arg
;
2327 if (nlmsg_len(arg
->cb
->nlh
) >= sizeof(struct rtmsg
)) {
2328 struct rtmsg
*rtm
= nlmsg_data(arg
->cb
->nlh
);
2329 prefix
= (rtm
->rtm_flags
& RTM_F_PREFIX
) != 0;
2333 return rt6_fill_node(arg
->net
,
2334 arg
->skb
, rt
, NULL
, NULL
, 0, RTM_NEWROUTE
,
2335 NETLINK_CB(arg
->cb
->skb
).pid
, arg
->cb
->nlh
->nlmsg_seq
,
2336 prefix
, 0, NLM_F_MULTI
);
2339 static int inet6_rtm_getroute(struct sk_buff
*in_skb
, struct nlmsghdr
* nlh
, void *arg
)
2341 struct net
*net
= sock_net(in_skb
->sk
);
2342 struct nlattr
*tb
[RTA_MAX
+1];
2343 struct rt6_info
*rt
;
2344 struct sk_buff
*skb
;
2349 err
= nlmsg_parse(nlh
, sizeof(*rtm
), tb
, RTA_MAX
, rtm_ipv6_policy
);
2354 memset(&fl
, 0, sizeof(fl
));
2357 if (nla_len(tb
[RTA_SRC
]) < sizeof(struct in6_addr
))
2360 ipv6_addr_copy(&fl
.fl6_src
, nla_data(tb
[RTA_SRC
]));
2364 if (nla_len(tb
[RTA_DST
]) < sizeof(struct in6_addr
))
2367 ipv6_addr_copy(&fl
.fl6_dst
, nla_data(tb
[RTA_DST
]));
2371 iif
= nla_get_u32(tb
[RTA_IIF
]);
2374 fl
.oif
= nla_get_u32(tb
[RTA_OIF
]);
2377 struct net_device
*dev
;
2378 dev
= __dev_get_by_index(net
, iif
);
2385 skb
= alloc_skb(NLMSG_GOODSIZE
, GFP_KERNEL
);
2391 /* Reserve room for dummy headers, this skb can pass
2392 through good chunk of routing engine.
2394 skb_reset_mac_header(skb
);
2395 skb_reserve(skb
, MAX_HEADER
+ sizeof(struct ipv6hdr
));
2397 rt
= (struct rt6_info
*) ip6_route_output(net
, NULL
, &fl
);
2398 skb_dst_set(skb
, &rt
->dst
);
2400 err
= rt6_fill_node(net
, skb
, rt
, &fl
.fl6_dst
, &fl
.fl6_src
, iif
,
2401 RTM_NEWROUTE
, NETLINK_CB(in_skb
).pid
,
2402 nlh
->nlmsg_seq
, 0, 0, 0);
2408 err
= rtnl_unicast(skb
, net
, NETLINK_CB(in_skb
).pid
);
2413 void inet6_rt_notify(int event
, struct rt6_info
*rt
, struct nl_info
*info
)
2415 struct sk_buff
*skb
;
2416 struct net
*net
= info
->nl_net
;
2421 seq
= info
->nlh
!= NULL
? info
->nlh
->nlmsg_seq
: 0;
2423 skb
= nlmsg_new(rt6_nlmsg_size(), gfp_any());
2427 err
= rt6_fill_node(net
, skb
, rt
, NULL
, NULL
, 0,
2428 event
, info
->pid
, seq
, 0, 0, 0);
2430 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2431 WARN_ON(err
== -EMSGSIZE
);
2435 rtnl_notify(skb
, net
, info
->pid
, RTNLGRP_IPV6_ROUTE
,
2436 info
->nlh
, gfp_any());
2440 rtnl_set_sk_err(net
, RTNLGRP_IPV6_ROUTE
, err
);
2443 static int ip6_route_dev_notify(struct notifier_block
*this,
2444 unsigned long event
, void *data
)
2446 struct net_device
*dev
= (struct net_device
*)data
;
2447 struct net
*net
= dev_net(dev
);
2449 if (event
== NETDEV_REGISTER
&& (dev
->flags
& IFF_LOOPBACK
)) {
2450 net
->ipv6
.ip6_null_entry
->dst
.dev
= dev
;
2451 net
->ipv6
.ip6_null_entry
->rt6i_idev
= in6_dev_get(dev
);
2452 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2453 net
->ipv6
.ip6_prohibit_entry
->dst
.dev
= dev
;
2454 net
->ipv6
.ip6_prohibit_entry
->rt6i_idev
= in6_dev_get(dev
);
2455 net
->ipv6
.ip6_blk_hole_entry
->dst
.dev
= dev
;
2456 net
->ipv6
.ip6_blk_hole_entry
->rt6i_idev
= in6_dev_get(dev
);
2467 #ifdef CONFIG_PROC_FS
2478 static int rt6_info_route(struct rt6_info
*rt
, void *p_arg
)
2480 struct seq_file
*m
= p_arg
;
2482 seq_printf(m
, "%pi6 %02x ", &rt
->rt6i_dst
.addr
, rt
->rt6i_dst
.plen
);
2484 #ifdef CONFIG_IPV6_SUBTREES
2485 seq_printf(m
, "%pi6 %02x ", &rt
->rt6i_src
.addr
, rt
->rt6i_src
.plen
);
2487 seq_puts(m
, "00000000000000000000000000000000 00 ");
2490 if (rt
->rt6i_nexthop
) {
2491 seq_printf(m
, "%pi6", rt
->rt6i_nexthop
->primary_key
);
2493 seq_puts(m
, "00000000000000000000000000000000");
2495 seq_printf(m
, " %08x %08x %08x %08x %8s\n",
2496 rt
->rt6i_metric
, atomic_read(&rt
->dst
.__refcnt
),
2497 rt
->dst
.__use
, rt
->rt6i_flags
,
2498 rt
->rt6i_dev
? rt
->rt6i_dev
->name
: "");
2502 static int ipv6_route_show(struct seq_file
*m
, void *v
)
2504 struct net
*net
= (struct net
*)m
->private;
2505 fib6_clean_all(net
, rt6_info_route
, 0, m
);
2509 static int ipv6_route_open(struct inode
*inode
, struct file
*file
)
2511 return single_open_net(inode
, file
, ipv6_route_show
);
2514 static const struct file_operations ipv6_route_proc_fops
= {
2515 .owner
= THIS_MODULE
,
2516 .open
= ipv6_route_open
,
2518 .llseek
= seq_lseek
,
2519 .release
= single_release_net
,
2522 static int rt6_stats_seq_show(struct seq_file
*seq
, void *v
)
2524 struct net
*net
= (struct net
*)seq
->private;
2525 seq_printf(seq
, "%04x %04x %04x %04x %04x %04x %04x\n",
2526 net
->ipv6
.rt6_stats
->fib_nodes
,
2527 net
->ipv6
.rt6_stats
->fib_route_nodes
,
2528 net
->ipv6
.rt6_stats
->fib_rt_alloc
,
2529 net
->ipv6
.rt6_stats
->fib_rt_entries
,
2530 net
->ipv6
.rt6_stats
->fib_rt_cache
,
2531 dst_entries_get_slow(&net
->ipv6
.ip6_dst_ops
),
2532 net
->ipv6
.rt6_stats
->fib_discarded_routes
);
2537 static int rt6_stats_seq_open(struct inode
*inode
, struct file
*file
)
2539 return single_open_net(inode
, file
, rt6_stats_seq_show
);
2542 static const struct file_operations rt6_stats_seq_fops
= {
2543 .owner
= THIS_MODULE
,
2544 .open
= rt6_stats_seq_open
,
2546 .llseek
= seq_lseek
,
2547 .release
= single_release_net
,
2549 #endif /* CONFIG_PROC_FS */
2551 #ifdef CONFIG_SYSCTL
2554 int ipv6_sysctl_rtcache_flush(ctl_table
*ctl
, int write
,
2555 void __user
*buffer
, size_t *lenp
, loff_t
*ppos
)
2557 struct net
*net
= current
->nsproxy
->net_ns
;
2558 int delay
= net
->ipv6
.sysctl
.flush_delay
;
2560 proc_dointvec(ctl
, write
, buffer
, lenp
, ppos
);
2561 fib6_run_gc(delay
<= 0 ? ~0UL : (unsigned long)delay
, net
);
2567 ctl_table ipv6_route_table_template
[] = {
2569 .procname
= "flush",
2570 .data
= &init_net
.ipv6
.sysctl
.flush_delay
,
2571 .maxlen
= sizeof(int),
2573 .proc_handler
= ipv6_sysctl_rtcache_flush
2576 .procname
= "gc_thresh",
2577 .data
= &ip6_dst_ops_template
.gc_thresh
,
2578 .maxlen
= sizeof(int),
2580 .proc_handler
= proc_dointvec
,
2583 .procname
= "max_size",
2584 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_max_size
,
2585 .maxlen
= sizeof(int),
2587 .proc_handler
= proc_dointvec
,
2590 .procname
= "gc_min_interval",
2591 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_gc_min_interval
,
2592 .maxlen
= sizeof(int),
2594 .proc_handler
= proc_dointvec_jiffies
,
2597 .procname
= "gc_timeout",
2598 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_gc_timeout
,
2599 .maxlen
= sizeof(int),
2601 .proc_handler
= proc_dointvec_jiffies
,
2604 .procname
= "gc_interval",
2605 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_gc_interval
,
2606 .maxlen
= sizeof(int),
2608 .proc_handler
= proc_dointvec_jiffies
,
2611 .procname
= "gc_elasticity",
2612 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_gc_elasticity
,
2613 .maxlen
= sizeof(int),
2615 .proc_handler
= proc_dointvec
,
2618 .procname
= "mtu_expires",
2619 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_mtu_expires
,
2620 .maxlen
= sizeof(int),
2622 .proc_handler
= proc_dointvec_jiffies
,
2625 .procname
= "min_adv_mss",
2626 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_min_advmss
,
2627 .maxlen
= sizeof(int),
2629 .proc_handler
= proc_dointvec
,
2632 .procname
= "gc_min_interval_ms",
2633 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_gc_min_interval
,
2634 .maxlen
= sizeof(int),
2636 .proc_handler
= proc_dointvec_ms_jiffies
,
2641 struct ctl_table
* __net_init
ipv6_route_sysctl_init(struct net
*net
)
2643 struct ctl_table
*table
;
2645 table
= kmemdup(ipv6_route_table_template
,
2646 sizeof(ipv6_route_table_template
),
2650 table
[0].data
= &net
->ipv6
.sysctl
.flush_delay
;
2651 table
[1].data
= &net
->ipv6
.ip6_dst_ops
.gc_thresh
;
2652 table
[2].data
= &net
->ipv6
.sysctl
.ip6_rt_max_size
;
2653 table
[3].data
= &net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
;
2654 table
[4].data
= &net
->ipv6
.sysctl
.ip6_rt_gc_timeout
;
2655 table
[5].data
= &net
->ipv6
.sysctl
.ip6_rt_gc_interval
;
2656 table
[6].data
= &net
->ipv6
.sysctl
.ip6_rt_gc_elasticity
;
2657 table
[7].data
= &net
->ipv6
.sysctl
.ip6_rt_mtu_expires
;
2658 table
[8].data
= &net
->ipv6
.sysctl
.ip6_rt_min_advmss
;
2659 table
[9].data
= &net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
;
2666 static int __net_init
ip6_route_net_init(struct net
*net
)
2670 memcpy(&net
->ipv6
.ip6_dst_ops
, &ip6_dst_ops_template
,
2671 sizeof(net
->ipv6
.ip6_dst_ops
));
2673 if (dst_entries_init(&net
->ipv6
.ip6_dst_ops
) < 0)
2674 goto out_ip6_dst_ops
;
2676 net
->ipv6
.ip6_null_entry
= kmemdup(&ip6_null_entry_template
,
2677 sizeof(*net
->ipv6
.ip6_null_entry
),
2679 if (!net
->ipv6
.ip6_null_entry
)
2680 goto out_ip6_dst_entries
;
2681 net
->ipv6
.ip6_null_entry
->dst
.path
=
2682 (struct dst_entry
*)net
->ipv6
.ip6_null_entry
;
2683 net
->ipv6
.ip6_null_entry
->dst
.ops
= &net
->ipv6
.ip6_dst_ops
;
2684 dst_metric_set(&net
->ipv6
.ip6_null_entry
->dst
, RTAX_HOPLIMIT
, 255);
2686 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2687 net
->ipv6
.ip6_prohibit_entry
= kmemdup(&ip6_prohibit_entry_template
,
2688 sizeof(*net
->ipv6
.ip6_prohibit_entry
),
2690 if (!net
->ipv6
.ip6_prohibit_entry
)
2691 goto out_ip6_null_entry
;
2692 net
->ipv6
.ip6_prohibit_entry
->dst
.path
=
2693 (struct dst_entry
*)net
->ipv6
.ip6_prohibit_entry
;
2694 net
->ipv6
.ip6_prohibit_entry
->dst
.ops
= &net
->ipv6
.ip6_dst_ops
;
2695 dst_metric_set(&net
->ipv6
.ip6_prohibit_entry
->dst
, RTAX_HOPLIMIT
, 255);
2697 net
->ipv6
.ip6_blk_hole_entry
= kmemdup(&ip6_blk_hole_entry_template
,
2698 sizeof(*net
->ipv6
.ip6_blk_hole_entry
),
2700 if (!net
->ipv6
.ip6_blk_hole_entry
)
2701 goto out_ip6_prohibit_entry
;
2702 net
->ipv6
.ip6_blk_hole_entry
->dst
.path
=
2703 (struct dst_entry
*)net
->ipv6
.ip6_blk_hole_entry
;
2704 net
->ipv6
.ip6_blk_hole_entry
->dst
.ops
= &net
->ipv6
.ip6_dst_ops
;
2705 dst_metric_set(&net
->ipv6
.ip6_blk_hole_entry
->dst
, RTAX_HOPLIMIT
, 255);
2708 net
->ipv6
.sysctl
.flush_delay
= 0;
2709 net
->ipv6
.sysctl
.ip6_rt_max_size
= 4096;
2710 net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
= HZ
/ 2;
2711 net
->ipv6
.sysctl
.ip6_rt_gc_timeout
= 60*HZ
;
2712 net
->ipv6
.sysctl
.ip6_rt_gc_interval
= 30*HZ
;
2713 net
->ipv6
.sysctl
.ip6_rt_gc_elasticity
= 9;
2714 net
->ipv6
.sysctl
.ip6_rt_mtu_expires
= 10*60*HZ
;
2715 net
->ipv6
.sysctl
.ip6_rt_min_advmss
= IPV6_MIN_MTU
- 20 - 40;
2717 #ifdef CONFIG_PROC_FS
2718 proc_net_fops_create(net
, "ipv6_route", 0, &ipv6_route_proc_fops
);
2719 proc_net_fops_create(net
, "rt6_stats", S_IRUGO
, &rt6_stats_seq_fops
);
2721 net
->ipv6
.ip6_rt_gc_expire
= 30*HZ
;
2727 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2728 out_ip6_prohibit_entry
:
2729 kfree(net
->ipv6
.ip6_prohibit_entry
);
2731 kfree(net
->ipv6
.ip6_null_entry
);
2733 out_ip6_dst_entries
:
2734 dst_entries_destroy(&net
->ipv6
.ip6_dst_ops
);
2739 static void __net_exit
ip6_route_net_exit(struct net
*net
)
2741 #ifdef CONFIG_PROC_FS
2742 proc_net_remove(net
, "ipv6_route");
2743 proc_net_remove(net
, "rt6_stats");
2745 kfree(net
->ipv6
.ip6_null_entry
);
2746 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2747 kfree(net
->ipv6
.ip6_prohibit_entry
);
2748 kfree(net
->ipv6
.ip6_blk_hole_entry
);
2750 dst_entries_destroy(&net
->ipv6
.ip6_dst_ops
);
2753 static struct pernet_operations ip6_route_net_ops
= {
2754 .init
= ip6_route_net_init
,
2755 .exit
= ip6_route_net_exit
,
2758 static struct notifier_block ip6_route_dev_notifier
= {
2759 .notifier_call
= ip6_route_dev_notify
,
2763 int __init
ip6_route_init(void)
2768 ip6_dst_ops_template
.kmem_cachep
=
2769 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info
), 0,
2770 SLAB_HWCACHE_ALIGN
, NULL
);
2771 if (!ip6_dst_ops_template
.kmem_cachep
)
2774 ret
= dst_entries_init(&ip6_dst_blackhole_ops
);
2776 goto out_kmem_cache
;
2778 ret
= register_pernet_subsys(&ip6_route_net_ops
);
2780 goto out_dst_entries
;
2782 ip6_dst_blackhole_ops
.kmem_cachep
= ip6_dst_ops_template
.kmem_cachep
;
2784 /* Registering of the loopback is done before this portion of code,
2785 * the loopback reference in rt6_info will not be taken, do it
2786 * manually for init_net */
2787 init_net
.ipv6
.ip6_null_entry
->dst
.dev
= init_net
.loopback_dev
;
2788 init_net
.ipv6
.ip6_null_entry
->rt6i_idev
= in6_dev_get(init_net
.loopback_dev
);
2789 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2790 init_net
.ipv6
.ip6_prohibit_entry
->dst
.dev
= init_net
.loopback_dev
;
2791 init_net
.ipv6
.ip6_prohibit_entry
->rt6i_idev
= in6_dev_get(init_net
.loopback_dev
);
2792 init_net
.ipv6
.ip6_blk_hole_entry
->dst
.dev
= init_net
.loopback_dev
;
2793 init_net
.ipv6
.ip6_blk_hole_entry
->rt6i_idev
= in6_dev_get(init_net
.loopback_dev
);
2797 goto out_register_subsys
;
2803 ret
= fib6_rules_init();
2808 if (__rtnl_register(PF_INET6
, RTM_NEWROUTE
, inet6_rtm_newroute
, NULL
) ||
2809 __rtnl_register(PF_INET6
, RTM_DELROUTE
, inet6_rtm_delroute
, NULL
) ||
2810 __rtnl_register(PF_INET6
, RTM_GETROUTE
, inet6_rtm_getroute
, NULL
))
2811 goto fib6_rules_init
;
2813 ret
= register_netdevice_notifier(&ip6_route_dev_notifier
);
2815 goto fib6_rules_init
;
2821 fib6_rules_cleanup();
2826 out_register_subsys
:
2827 unregister_pernet_subsys(&ip6_route_net_ops
);
2829 dst_entries_destroy(&ip6_dst_blackhole_ops
);
2831 kmem_cache_destroy(ip6_dst_ops_template
.kmem_cachep
);
2835 void ip6_route_cleanup(void)
2837 unregister_netdevice_notifier(&ip6_route_dev_notifier
);
2838 fib6_rules_cleanup();
2841 unregister_pernet_subsys(&ip6_route_net_ops
);
2842 dst_entries_destroy(&ip6_dst_blackhole_ops
);
2843 kmem_cache_destroy(ip6_dst_ops_template
.kmem_cachep
);