2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. Otherwise, round-robin the list.
24 * Fixed routing subtrees.
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <net/net_namespace.h>
46 #include <net/ip6_fib.h>
47 #include <net/ip6_route.h>
48 #include <net/ndisc.h>
49 #include <net/addrconf.h>
51 #include <linux/rtnetlink.h>
54 #include <net/netevent.h>
55 #include <net/netlink.h>
57 #include <asm/uaccess.h>
60 #include <linux/sysctl.h>
63 /* Set to 3 to get tracing. */
67 #define RDBG(x) printk x
68 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
71 #define RT6_TRACE(x...) do { ; } while (0)
74 #define CLONE_OFFLINK_ROUTE 0
76 static struct rt6_info
* ip6_rt_copy(struct rt6_info
*ort
);
77 static struct dst_entry
*ip6_dst_check(struct dst_entry
*dst
, u32 cookie
);
78 static struct dst_entry
*ip6_negative_advice(struct dst_entry
*);
79 static void ip6_dst_destroy(struct dst_entry
*);
80 static void ip6_dst_ifdown(struct dst_entry
*,
81 struct net_device
*dev
, int how
);
82 static int ip6_dst_gc(struct dst_ops
*ops
);
84 static int ip6_pkt_discard(struct sk_buff
*skb
);
85 static int ip6_pkt_discard_out(struct sk_buff
*skb
);
86 static void ip6_link_failure(struct sk_buff
*skb
);
87 static void ip6_rt_update_pmtu(struct dst_entry
*dst
, u32 mtu
);
89 #ifdef CONFIG_IPV6_ROUTE_INFO
90 static struct rt6_info
*rt6_add_route_info(struct net
*net
,
91 struct in6_addr
*prefix
, int prefixlen
,
92 struct in6_addr
*gwaddr
, int ifindex
,
94 static struct rt6_info
*rt6_get_route_info(struct net
*net
,
95 struct in6_addr
*prefix
, int prefixlen
,
96 struct in6_addr
*gwaddr
, int ifindex
);
99 static struct dst_ops ip6_dst_ops_template
= {
101 .protocol
= cpu_to_be16(ETH_P_IPV6
),
104 .check
= ip6_dst_check
,
105 .destroy
= ip6_dst_destroy
,
106 .ifdown
= ip6_dst_ifdown
,
107 .negative_advice
= ip6_negative_advice
,
108 .link_failure
= ip6_link_failure
,
109 .update_pmtu
= ip6_rt_update_pmtu
,
110 .local_out
= __ip6_local_out
,
111 .entries
= ATOMIC_INIT(0),
114 static void ip6_rt_blackhole_update_pmtu(struct dst_entry
*dst
, u32 mtu
)
118 static struct dst_ops ip6_dst_blackhole_ops
= {
120 .protocol
= cpu_to_be16(ETH_P_IPV6
),
121 .destroy
= ip6_dst_destroy
,
122 .check
= ip6_dst_check
,
123 .update_pmtu
= ip6_rt_blackhole_update_pmtu
,
124 .entries
= ATOMIC_INIT(0),
127 static struct rt6_info ip6_null_entry_template
= {
130 .__refcnt
= ATOMIC_INIT(1),
133 .error
= -ENETUNREACH
,
134 .metrics
= { [RTAX_HOPLIMIT
- 1] = 255, },
135 .input
= ip6_pkt_discard
,
136 .output
= ip6_pkt_discard_out
,
139 .rt6i_flags
= (RTF_REJECT
| RTF_NONEXTHOP
),
140 .rt6i_protocol
= RTPROT_KERNEL
,
141 .rt6i_metric
= ~(u32
) 0,
142 .rt6i_ref
= ATOMIC_INIT(1),
145 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
147 static int ip6_pkt_prohibit(struct sk_buff
*skb
);
148 static int ip6_pkt_prohibit_out(struct sk_buff
*skb
);
150 static struct rt6_info ip6_prohibit_entry_template
= {
153 .__refcnt
= ATOMIC_INIT(1),
157 .metrics
= { [RTAX_HOPLIMIT
- 1] = 255, },
158 .input
= ip6_pkt_prohibit
,
159 .output
= ip6_pkt_prohibit_out
,
162 .rt6i_flags
= (RTF_REJECT
| RTF_NONEXTHOP
),
163 .rt6i_protocol
= RTPROT_KERNEL
,
164 .rt6i_metric
= ~(u32
) 0,
165 .rt6i_ref
= ATOMIC_INIT(1),
168 static struct rt6_info ip6_blk_hole_entry_template
= {
171 .__refcnt
= ATOMIC_INIT(1),
175 .metrics
= { [RTAX_HOPLIMIT
- 1] = 255, },
176 .input
= dst_discard
,
177 .output
= dst_discard
,
180 .rt6i_flags
= (RTF_REJECT
| RTF_NONEXTHOP
),
181 .rt6i_protocol
= RTPROT_KERNEL
,
182 .rt6i_metric
= ~(u32
) 0,
183 .rt6i_ref
= ATOMIC_INIT(1),
188 /* allocate dst with ip6_dst_ops */
189 static inline struct rt6_info
*ip6_dst_alloc(struct dst_ops
*ops
)
191 return (struct rt6_info
*)dst_alloc(ops
);
194 static void ip6_dst_destroy(struct dst_entry
*dst
)
196 struct rt6_info
*rt
= (struct rt6_info
*)dst
;
197 struct inet6_dev
*idev
= rt
->rt6i_idev
;
200 rt
->rt6i_idev
= NULL
;
205 static void ip6_dst_ifdown(struct dst_entry
*dst
, struct net_device
*dev
,
208 struct rt6_info
*rt
= (struct rt6_info
*)dst
;
209 struct inet6_dev
*idev
= rt
->rt6i_idev
;
210 struct net_device
*loopback_dev
=
211 dev_net(dev
)->loopback_dev
;
213 if (dev
!= loopback_dev
&& idev
!= NULL
&& idev
->dev
== dev
) {
214 struct inet6_dev
*loopback_idev
=
215 in6_dev_get(loopback_dev
);
216 if (loopback_idev
!= NULL
) {
217 rt
->rt6i_idev
= loopback_idev
;
223 static __inline__
int rt6_check_expired(const struct rt6_info
*rt
)
225 return (rt
->rt6i_flags
& RTF_EXPIRES
&&
226 time_after(jiffies
, rt
->rt6i_expires
));
229 static inline int rt6_need_strict(struct in6_addr
*daddr
)
231 return (ipv6_addr_type(daddr
) &
232 (IPV6_ADDR_MULTICAST
| IPV6_ADDR_LINKLOCAL
| IPV6_ADDR_LOOPBACK
));
236 * Route lookup. Any table->tb6_lock is implied.
239 static inline struct rt6_info
*rt6_device_match(struct net
*net
,
241 struct in6_addr
*saddr
,
245 struct rt6_info
*local
= NULL
;
246 struct rt6_info
*sprt
;
248 if (!oif
&& ipv6_addr_any(saddr
))
251 for (sprt
= rt
; sprt
; sprt
= sprt
->u
.dst
.rt6_next
) {
252 struct net_device
*dev
= sprt
->rt6i_dev
;
255 if (dev
->ifindex
== oif
)
257 if (dev
->flags
& IFF_LOOPBACK
) {
258 if (sprt
->rt6i_idev
== NULL
||
259 sprt
->rt6i_idev
->dev
->ifindex
!= oif
) {
260 if (flags
& RT6_LOOKUP_F_IFACE
&& oif
)
262 if (local
&& (!oif
||
263 local
->rt6i_idev
->dev
->ifindex
== oif
))
269 if (ipv6_chk_addr(net
, saddr
, dev
,
270 flags
& RT6_LOOKUP_F_IFACE
))
279 if (flags
& RT6_LOOKUP_F_IFACE
)
280 return net
->ipv6
.ip6_null_entry
;
286 #ifdef CONFIG_IPV6_ROUTER_PREF
287 static void rt6_probe(struct rt6_info
*rt
)
289 struct neighbour
*neigh
= rt
? rt
->rt6i_nexthop
: NULL
;
291 * Okay, this does not seem to be appropriate
292 * for now, however, we need to check if it
293 * is really so; aka Router Reachability Probing.
295 * Router Reachability Probe MUST be rate-limited
296 * to no more than one per minute.
298 if (!neigh
|| (neigh
->nud_state
& NUD_VALID
))
300 read_lock_bh(&neigh
->lock
);
301 if (!(neigh
->nud_state
& NUD_VALID
) &&
302 time_after(jiffies
, neigh
->updated
+ rt
->rt6i_idev
->cnf
.rtr_probe_interval
)) {
303 struct in6_addr mcaddr
;
304 struct in6_addr
*target
;
306 neigh
->updated
= jiffies
;
307 read_unlock_bh(&neigh
->lock
);
309 target
= (struct in6_addr
*)&neigh
->primary_key
;
310 addrconf_addr_solict_mult(target
, &mcaddr
);
311 ndisc_send_ns(rt
->rt6i_dev
, NULL
, target
, &mcaddr
, NULL
);
313 read_unlock_bh(&neigh
->lock
);
316 static inline void rt6_probe(struct rt6_info
*rt
)
323 * Default Router Selection (RFC 2461 6.3.6)
325 static inline int rt6_check_dev(struct rt6_info
*rt
, int oif
)
327 struct net_device
*dev
= rt
->rt6i_dev
;
328 if (!oif
|| dev
->ifindex
== oif
)
330 if ((dev
->flags
& IFF_LOOPBACK
) &&
331 rt
->rt6i_idev
&& rt
->rt6i_idev
->dev
->ifindex
== oif
)
336 static inline int rt6_check_neigh(struct rt6_info
*rt
)
338 struct neighbour
*neigh
= rt
->rt6i_nexthop
;
340 if (rt
->rt6i_flags
& RTF_NONEXTHOP
||
341 !(rt
->rt6i_flags
& RTF_GATEWAY
))
344 read_lock_bh(&neigh
->lock
);
345 if (neigh
->nud_state
& NUD_VALID
)
347 #ifdef CONFIG_IPV6_ROUTER_PREF
348 else if (neigh
->nud_state
& NUD_FAILED
)
353 read_unlock_bh(&neigh
->lock
);
359 static int rt6_score_route(struct rt6_info
*rt
, int oif
,
364 m
= rt6_check_dev(rt
, oif
);
365 if (!m
&& (strict
& RT6_LOOKUP_F_IFACE
))
367 #ifdef CONFIG_IPV6_ROUTER_PREF
368 m
|= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt
->rt6i_flags
)) << 2;
370 n
= rt6_check_neigh(rt
);
371 if (!n
&& (strict
& RT6_LOOKUP_F_REACHABLE
))
376 static struct rt6_info
*find_match(struct rt6_info
*rt
, int oif
, int strict
,
377 int *mpri
, struct rt6_info
*match
)
381 if (rt6_check_expired(rt
))
384 m
= rt6_score_route(rt
, oif
, strict
);
389 if (strict
& RT6_LOOKUP_F_REACHABLE
)
393 } else if (strict
& RT6_LOOKUP_F_REACHABLE
) {
401 static struct rt6_info
*find_rr_leaf(struct fib6_node
*fn
,
402 struct rt6_info
*rr_head
,
403 u32 metric
, int oif
, int strict
)
405 struct rt6_info
*rt
, *match
;
409 for (rt
= rr_head
; rt
&& rt
->rt6i_metric
== metric
;
410 rt
= rt
->u
.dst
.rt6_next
)
411 match
= find_match(rt
, oif
, strict
, &mpri
, match
);
412 for (rt
= fn
->leaf
; rt
&& rt
!= rr_head
&& rt
->rt6i_metric
== metric
;
413 rt
= rt
->u
.dst
.rt6_next
)
414 match
= find_match(rt
, oif
, strict
, &mpri
, match
);
419 static struct rt6_info
*rt6_select(struct fib6_node
*fn
, int oif
, int strict
)
421 struct rt6_info
*match
, *rt0
;
424 RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
425 __func__
, fn
->leaf
, oif
);
429 fn
->rr_ptr
= rt0
= fn
->leaf
;
431 match
= find_rr_leaf(fn
, rt0
, rt0
->rt6i_metric
, oif
, strict
);
434 (strict
& RT6_LOOKUP_F_REACHABLE
)) {
435 struct rt6_info
*next
= rt0
->u
.dst
.rt6_next
;
437 /* no entries matched; do round-robin */
438 if (!next
|| next
->rt6i_metric
!= rt0
->rt6i_metric
)
445 RT6_TRACE("%s() => %p\n",
448 net
= dev_net(rt0
->rt6i_dev
);
449 return (match
? match
: net
->ipv6
.ip6_null_entry
);
452 #ifdef CONFIG_IPV6_ROUTE_INFO
453 int rt6_route_rcv(struct net_device
*dev
, u8
*opt
, int len
,
454 struct in6_addr
*gwaddr
)
456 struct net
*net
= dev_net(dev
);
457 struct route_info
*rinfo
= (struct route_info
*) opt
;
458 struct in6_addr prefix_buf
, *prefix
;
460 unsigned long lifetime
;
463 if (len
< sizeof(struct route_info
)) {
467 /* Sanity check for prefix_len and length */
468 if (rinfo
->length
> 3) {
470 } else if (rinfo
->prefix_len
> 128) {
472 } else if (rinfo
->prefix_len
> 64) {
473 if (rinfo
->length
< 2) {
476 } else if (rinfo
->prefix_len
> 0) {
477 if (rinfo
->length
< 1) {
482 pref
= rinfo
->route_pref
;
483 if (pref
== ICMPV6_ROUTER_PREF_INVALID
)
486 lifetime
= addrconf_timeout_fixup(ntohl(rinfo
->lifetime
), HZ
);
488 if (rinfo
->length
== 3)
489 prefix
= (struct in6_addr
*)rinfo
->prefix
;
491 /* this function is safe */
492 ipv6_addr_prefix(&prefix_buf
,
493 (struct in6_addr
*)rinfo
->prefix
,
495 prefix
= &prefix_buf
;
498 rt
= rt6_get_route_info(net
, prefix
, rinfo
->prefix_len
, gwaddr
,
501 if (rt
&& !lifetime
) {
507 rt
= rt6_add_route_info(net
, prefix
, rinfo
->prefix_len
, gwaddr
, dev
->ifindex
,
510 rt
->rt6i_flags
= RTF_ROUTEINFO
|
511 (rt
->rt6i_flags
& ~RTF_PREF_MASK
) | RTF_PREF(pref
);
514 if (!addrconf_finite_timeout(lifetime
)) {
515 rt
->rt6i_flags
&= ~RTF_EXPIRES
;
517 rt
->rt6i_expires
= jiffies
+ HZ
* lifetime
;
518 rt
->rt6i_flags
|= RTF_EXPIRES
;
520 dst_release(&rt
->u
.dst
);
526 #define BACKTRACK(__net, saddr) \
528 if (rt == __net->ipv6.ip6_null_entry) { \
529 struct fib6_node *pn; \
531 if (fn->fn_flags & RTN_TL_ROOT) \
534 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
535 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
538 if (fn->fn_flags & RTN_RTINFO) \
544 static struct rt6_info
*ip6_pol_route_lookup(struct net
*net
,
545 struct fib6_table
*table
,
546 struct flowi
*fl
, int flags
)
548 struct fib6_node
*fn
;
551 read_lock_bh(&table
->tb6_lock
);
552 fn
= fib6_lookup(&table
->tb6_root
, &fl
->fl6_dst
, &fl
->fl6_src
);
555 rt
= rt6_device_match(net
, rt
, &fl
->fl6_src
, fl
->oif
, flags
);
556 BACKTRACK(net
, &fl
->fl6_src
);
558 dst_use(&rt
->u
.dst
, jiffies
);
559 read_unlock_bh(&table
->tb6_lock
);
564 struct rt6_info
*rt6_lookup(struct net
*net
, const struct in6_addr
*daddr
,
565 const struct in6_addr
*saddr
, int oif
, int strict
)
575 struct dst_entry
*dst
;
576 int flags
= strict
? RT6_LOOKUP_F_IFACE
: 0;
579 memcpy(&fl
.fl6_src
, saddr
, sizeof(*saddr
));
580 flags
|= RT6_LOOKUP_F_HAS_SADDR
;
583 dst
= fib6_rule_lookup(net
, &fl
, flags
, ip6_pol_route_lookup
);
585 return (struct rt6_info
*) dst
;
592 EXPORT_SYMBOL(rt6_lookup
);
594 /* ip6_ins_rt is called with FREE table->tb6_lock.
595 It takes a new route entry; if the addition fails for any reason, the
596 route is freed. In any case, if the caller does not hold it, it may
600 static int __ip6_ins_rt(struct rt6_info
*rt
, struct nl_info
*info
)
603 struct fib6_table
*table
;
605 table
= rt
->rt6i_table
;
606 write_lock_bh(&table
->tb6_lock
);
607 err
= fib6_add(&table
->tb6_root
, rt
, info
);
608 write_unlock_bh(&table
->tb6_lock
);
613 int ip6_ins_rt(struct rt6_info
*rt
)
615 struct nl_info info
= {
616 .nl_net
= dev_net(rt
->rt6i_dev
),
618 return __ip6_ins_rt(rt
, &info
);
621 static struct rt6_info
*rt6_alloc_cow(struct rt6_info
*ort
, struct in6_addr
*daddr
,
622 struct in6_addr
*saddr
)
630 rt
= ip6_rt_copy(ort
);
633 struct neighbour
*neigh
;
634 int attempts
= !in_softirq();
636 if (!(rt
->rt6i_flags
&RTF_GATEWAY
)) {
637 if (rt
->rt6i_dst
.plen
!= 128 &&
638 ipv6_addr_equal(&rt
->rt6i_dst
.addr
, daddr
))
639 rt
->rt6i_flags
|= RTF_ANYCAST
;
640 ipv6_addr_copy(&rt
->rt6i_gateway
, daddr
);
643 ipv6_addr_copy(&rt
->rt6i_dst
.addr
, daddr
);
644 rt
->rt6i_dst
.plen
= 128;
645 rt
->rt6i_flags
|= RTF_CACHE
;
646 rt
->u
.dst
.flags
|= DST_HOST
;
648 #ifdef CONFIG_IPV6_SUBTREES
649 if (rt
->rt6i_src
.plen
&& saddr
) {
650 ipv6_addr_copy(&rt
->rt6i_src
.addr
, saddr
);
651 rt
->rt6i_src
.plen
= 128;
656 neigh
= ndisc_get_neigh(rt
->rt6i_dev
, &rt
->rt6i_gateway
);
658 struct net
*net
= dev_net(rt
->rt6i_dev
);
659 int saved_rt_min_interval
=
660 net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
;
661 int saved_rt_elasticity
=
662 net
->ipv6
.sysctl
.ip6_rt_gc_elasticity
;
664 if (attempts
-- > 0) {
665 net
->ipv6
.sysctl
.ip6_rt_gc_elasticity
= 1;
666 net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
= 0;
668 ip6_dst_gc(&net
->ipv6
.ip6_dst_ops
);
670 net
->ipv6
.sysctl
.ip6_rt_gc_elasticity
=
672 net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
=
673 saved_rt_min_interval
;
679 "Neighbour table overflow.\n");
680 dst_free(&rt
->u
.dst
);
683 rt
->rt6i_nexthop
= neigh
;
690 static struct rt6_info
*rt6_alloc_clone(struct rt6_info
*ort
, struct in6_addr
*daddr
)
692 struct rt6_info
*rt
= ip6_rt_copy(ort
);
694 ipv6_addr_copy(&rt
->rt6i_dst
.addr
, daddr
);
695 rt
->rt6i_dst
.plen
= 128;
696 rt
->rt6i_flags
|= RTF_CACHE
;
697 rt
->u
.dst
.flags
|= DST_HOST
;
698 rt
->rt6i_nexthop
= neigh_clone(ort
->rt6i_nexthop
);
703 static struct rt6_info
*ip6_pol_route(struct net
*net
, struct fib6_table
*table
, int oif
,
704 struct flowi
*fl
, int flags
)
706 struct fib6_node
*fn
;
707 struct rt6_info
*rt
, *nrt
;
711 int reachable
= net
->ipv6
.devconf_all
->forwarding
? 0 : RT6_LOOKUP_F_REACHABLE
;
713 strict
|= flags
& RT6_LOOKUP_F_IFACE
;
716 read_lock_bh(&table
->tb6_lock
);
719 fn
= fib6_lookup(&table
->tb6_root
, &fl
->fl6_dst
, &fl
->fl6_src
);
722 rt
= rt6_select(fn
, oif
, strict
| reachable
);
724 BACKTRACK(net
, &fl
->fl6_src
);
725 if (rt
== net
->ipv6
.ip6_null_entry
||
726 rt
->rt6i_flags
& RTF_CACHE
)
729 dst_hold(&rt
->u
.dst
);
730 read_unlock_bh(&table
->tb6_lock
);
732 if (!rt
->rt6i_nexthop
&& !(rt
->rt6i_flags
& RTF_NONEXTHOP
))
733 nrt
= rt6_alloc_cow(rt
, &fl
->fl6_dst
, &fl
->fl6_src
);
735 #if CLONE_OFFLINK_ROUTE
736 nrt
= rt6_alloc_clone(rt
, &fl
->fl6_dst
);
742 dst_release(&rt
->u
.dst
);
743 rt
= nrt
? : net
->ipv6
.ip6_null_entry
;
745 dst_hold(&rt
->u
.dst
);
747 err
= ip6_ins_rt(nrt
);
756 * Race condition! In the gap while table->tb6_lock was
757 * released, someone could have inserted this route. Relookup.
759 dst_release(&rt
->u
.dst
);
767 dst_hold(&rt
->u
.dst
);
768 read_unlock_bh(&table
->tb6_lock
);
770 rt
->u
.dst
.lastuse
= jiffies
;
776 static struct rt6_info
*ip6_pol_route_input(struct net
*net
, struct fib6_table
*table
,
777 struct flowi
*fl
, int flags
)
779 return ip6_pol_route(net
, table
, fl
->iif
, fl
, flags
);
782 void ip6_route_input(struct sk_buff
*skb
)
784 struct ipv6hdr
*iph
= ipv6_hdr(skb
);
785 struct net
*net
= dev_net(skb
->dev
);
786 int flags
= RT6_LOOKUP_F_HAS_SADDR
;
788 .iif
= skb
->dev
->ifindex
,
793 .flowlabel
= (* (__be32
*) iph
)&IPV6_FLOWINFO_MASK
,
797 .proto
= iph
->nexthdr
,
800 if (rt6_need_strict(&iph
->daddr
) && skb
->dev
->type
!= ARPHRD_PIMREG
)
801 flags
|= RT6_LOOKUP_F_IFACE
;
803 skb_dst_set(skb
, fib6_rule_lookup(net
, &fl
, flags
, ip6_pol_route_input
));
806 static struct rt6_info
*ip6_pol_route_output(struct net
*net
, struct fib6_table
*table
,
807 struct flowi
*fl
, int flags
)
809 return ip6_pol_route(net
, table
, fl
->oif
, fl
, flags
);
812 struct dst_entry
* ip6_route_output(struct net
*net
, struct sock
*sk
,
817 if (rt6_need_strict(&fl
->fl6_dst
))
818 flags
|= RT6_LOOKUP_F_IFACE
;
820 if (!ipv6_addr_any(&fl
->fl6_src
))
821 flags
|= RT6_LOOKUP_F_HAS_SADDR
;
823 unsigned int prefs
= inet6_sk(sk
)->srcprefs
;
824 if (prefs
& IPV6_PREFER_SRC_TMP
)
825 flags
|= RT6_LOOKUP_F_SRCPREF_TMP
;
826 if (prefs
& IPV6_PREFER_SRC_PUBLIC
)
827 flags
|= RT6_LOOKUP_F_SRCPREF_PUBLIC
;
828 if (prefs
& IPV6_PREFER_SRC_COA
)
829 flags
|= RT6_LOOKUP_F_SRCPREF_COA
;
832 return fib6_rule_lookup(net
, fl
, flags
, ip6_pol_route_output
);
835 EXPORT_SYMBOL(ip6_route_output
);
837 int ip6_dst_blackhole(struct sock
*sk
, struct dst_entry
**dstp
, struct flowi
*fl
)
839 struct rt6_info
*ort
= (struct rt6_info
*) *dstp
;
840 struct rt6_info
*rt
= (struct rt6_info
*)
841 dst_alloc(&ip6_dst_blackhole_ops
);
842 struct dst_entry
*new = NULL
;
847 atomic_set(&new->__refcnt
, 1);
849 new->input
= dst_discard
;
850 new->output
= dst_discard
;
852 memcpy(new->metrics
, ort
->u
.dst
.metrics
, RTAX_MAX
*sizeof(u32
));
853 new->dev
= ort
->u
.dst
.dev
;
856 rt
->rt6i_idev
= ort
->rt6i_idev
;
858 in6_dev_hold(rt
->rt6i_idev
);
859 rt
->rt6i_expires
= 0;
861 ipv6_addr_copy(&rt
->rt6i_gateway
, &ort
->rt6i_gateway
);
862 rt
->rt6i_flags
= ort
->rt6i_flags
& ~RTF_EXPIRES
;
865 memcpy(&rt
->rt6i_dst
, &ort
->rt6i_dst
, sizeof(struct rt6key
));
866 #ifdef CONFIG_IPV6_SUBTREES
867 memcpy(&rt
->rt6i_src
, &ort
->rt6i_src
, sizeof(struct rt6key
));
875 return (new ? 0 : -ENOMEM
);
877 EXPORT_SYMBOL_GPL(ip6_dst_blackhole
);
880 * Destination cache support functions
883 static struct dst_entry
*ip6_dst_check(struct dst_entry
*dst
, u32 cookie
)
887 rt
= (struct rt6_info
*) dst
;
889 if (rt
&& rt
->rt6i_node
&& (rt
->rt6i_node
->fn_sernum
== cookie
))
895 static struct dst_entry
*ip6_negative_advice(struct dst_entry
*dst
)
897 struct rt6_info
*rt
= (struct rt6_info
*) dst
;
900 if (rt
->rt6i_flags
& RTF_CACHE
)
908 static void ip6_link_failure(struct sk_buff
*skb
)
912 icmpv6_send(skb
, ICMPV6_DEST_UNREACH
, ICMPV6_ADDR_UNREACH
, 0, skb
->dev
);
914 rt
= (struct rt6_info
*) skb_dst(skb
);
916 if (rt
->rt6i_flags
&RTF_CACHE
) {
917 dst_set_expires(&rt
->u
.dst
, 0);
918 rt
->rt6i_flags
|= RTF_EXPIRES
;
919 } else if (rt
->rt6i_node
&& (rt
->rt6i_flags
& RTF_DEFAULT
))
920 rt
->rt6i_node
->fn_sernum
= -1;
924 static void ip6_rt_update_pmtu(struct dst_entry
*dst
, u32 mtu
)
926 struct rt6_info
*rt6
= (struct rt6_info
*)dst
;
928 if (mtu
< dst_mtu(dst
) && rt6
->rt6i_dst
.plen
== 128) {
929 rt6
->rt6i_flags
|= RTF_MODIFIED
;
930 if (mtu
< IPV6_MIN_MTU
) {
932 dst
->metrics
[RTAX_FEATURES
-1] |= RTAX_FEATURE_ALLFRAG
;
934 dst
->metrics
[RTAX_MTU
-1] = mtu
;
935 call_netevent_notifiers(NETEVENT_PMTU_UPDATE
, dst
);
939 static int ipv6_get_mtu(struct net_device
*dev
);
941 static inline unsigned int ipv6_advmss(struct net
*net
, unsigned int mtu
)
943 mtu
-= sizeof(struct ipv6hdr
) + sizeof(struct tcphdr
);
945 if (mtu
< net
->ipv6
.sysctl
.ip6_rt_min_advmss
)
946 mtu
= net
->ipv6
.sysctl
.ip6_rt_min_advmss
;
949 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
950 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
951 * IPV6_MAXPLEN is also valid and means: "any MSS,
952 * rely only on pmtu discovery"
954 if (mtu
> IPV6_MAXPLEN
- sizeof(struct tcphdr
))
959 static struct dst_entry
*icmp6_dst_gc_list
;
960 static DEFINE_SPINLOCK(icmp6_dst_lock
);
962 struct dst_entry
*icmp6_dst_alloc(struct net_device
*dev
,
963 struct neighbour
*neigh
,
964 const struct in6_addr
*addr
)
967 struct inet6_dev
*idev
= in6_dev_get(dev
);
968 struct net
*net
= dev_net(dev
);
970 if (unlikely(idev
== NULL
))
973 rt
= ip6_dst_alloc(&net
->ipv6
.ip6_dst_ops
);
974 if (unlikely(rt
== NULL
)) {
983 neigh
= ndisc_get_neigh(dev
, addr
);
989 rt
->rt6i_idev
= idev
;
990 rt
->rt6i_nexthop
= neigh
;
991 atomic_set(&rt
->u
.dst
.__refcnt
, 1);
992 rt
->u
.dst
.metrics
[RTAX_HOPLIMIT
-1] = 255;
993 rt
->u
.dst
.metrics
[RTAX_MTU
-1] = ipv6_get_mtu(rt
->rt6i_dev
);
994 rt
->u
.dst
.metrics
[RTAX_ADVMSS
-1] = ipv6_advmss(net
, dst_mtu(&rt
->u
.dst
));
995 rt
->u
.dst
.output
= ip6_output
;
997 #if 0 /* there's no chance to use these for ndisc */
998 rt
->u
.dst
.flags
= ipv6_addr_type(addr
) & IPV6_ADDR_UNICAST
1001 ipv6_addr_copy(&rt
->rt6i_dst
.addr
, addr
);
1002 rt
->rt6i_dst
.plen
= 128;
1005 spin_lock_bh(&icmp6_dst_lock
);
1006 rt
->u
.dst
.next
= icmp6_dst_gc_list
;
1007 icmp6_dst_gc_list
= &rt
->u
.dst
;
1008 spin_unlock_bh(&icmp6_dst_lock
);
1010 fib6_force_start_gc(net
);
1016 int icmp6_dst_gc(void)
1018 struct dst_entry
*dst
, *next
, **pprev
;
1023 spin_lock_bh(&icmp6_dst_lock
);
1024 pprev
= &icmp6_dst_gc_list
;
1026 while ((dst
= *pprev
) != NULL
) {
1027 if (!atomic_read(&dst
->__refcnt
)) {
1036 spin_unlock_bh(&icmp6_dst_lock
);
1041 static void icmp6_clean_all(int (*func
)(struct rt6_info
*rt
, void *arg
),
1044 struct dst_entry
*dst
, **pprev
;
1046 spin_lock_bh(&icmp6_dst_lock
);
1047 pprev
= &icmp6_dst_gc_list
;
1048 while ((dst
= *pprev
) != NULL
) {
1049 struct rt6_info
*rt
= (struct rt6_info
*) dst
;
1050 if (func(rt
, arg
)) {
1057 spin_unlock_bh(&icmp6_dst_lock
);
1060 static int ip6_dst_gc(struct dst_ops
*ops
)
1062 unsigned long now
= jiffies
;
1063 struct net
*net
= container_of(ops
, struct net
, ipv6
.ip6_dst_ops
);
1064 int rt_min_interval
= net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
;
1065 int rt_max_size
= net
->ipv6
.sysctl
.ip6_rt_max_size
;
1066 int rt_elasticity
= net
->ipv6
.sysctl
.ip6_rt_gc_elasticity
;
1067 int rt_gc_timeout
= net
->ipv6
.sysctl
.ip6_rt_gc_timeout
;
1068 unsigned long rt_last_gc
= net
->ipv6
.ip6_rt_last_gc
;
1070 if (time_after(rt_last_gc
+ rt_min_interval
, now
) &&
1071 atomic_read(&ops
->entries
) <= rt_max_size
)
1074 net
->ipv6
.ip6_rt_gc_expire
++;
1075 fib6_run_gc(net
->ipv6
.ip6_rt_gc_expire
, net
);
1076 net
->ipv6
.ip6_rt_last_gc
= now
;
1077 if (atomic_read(&ops
->entries
) < ops
->gc_thresh
)
1078 net
->ipv6
.ip6_rt_gc_expire
= rt_gc_timeout
>>1;
1080 net
->ipv6
.ip6_rt_gc_expire
-= net
->ipv6
.ip6_rt_gc_expire
>>rt_elasticity
;
1081 return (atomic_read(&ops
->entries
) > rt_max_size
);
1084 /* Clean host part of a prefix. Not necessary in radix tree,
1085 but results in cleaner routing tables.
1087 Remove it only when everything works!
1090 static int ipv6_get_mtu(struct net_device
*dev
)
1092 int mtu
= IPV6_MIN_MTU
;
1093 struct inet6_dev
*idev
;
1095 idev
= in6_dev_get(dev
);
1097 mtu
= idev
->cnf
.mtu6
;
1103 int ip6_dst_hoplimit(struct dst_entry
*dst
)
1105 int hoplimit
= dst_metric(dst
, RTAX_HOPLIMIT
);
1107 struct net_device
*dev
= dst
->dev
;
1108 struct inet6_dev
*idev
= in6_dev_get(dev
);
1110 hoplimit
= idev
->cnf
.hop_limit
;
1113 hoplimit
= dev_net(dev
)->ipv6
.devconf_all
->hop_limit
;
1122 int ip6_route_add(struct fib6_config
*cfg
)
1125 struct net
*net
= cfg
->fc_nlinfo
.nl_net
;
1126 struct rt6_info
*rt
= NULL
;
1127 struct net_device
*dev
= NULL
;
1128 struct inet6_dev
*idev
= NULL
;
1129 struct fib6_table
*table
;
1132 if (cfg
->fc_dst_len
> 128 || cfg
->fc_src_len
> 128)
1134 #ifndef CONFIG_IPV6_SUBTREES
1135 if (cfg
->fc_src_len
)
1138 if (cfg
->fc_ifindex
) {
1140 dev
= dev_get_by_index(net
, cfg
->fc_ifindex
);
1143 idev
= in6_dev_get(dev
);
1148 if (cfg
->fc_metric
== 0)
1149 cfg
->fc_metric
= IP6_RT_PRIO_USER
;
1151 table
= fib6_new_table(net
, cfg
->fc_table
);
1152 if (table
== NULL
) {
1157 rt
= ip6_dst_alloc(&net
->ipv6
.ip6_dst_ops
);
1164 rt
->u
.dst
.obsolete
= -1;
1165 rt
->rt6i_expires
= (cfg
->fc_flags
& RTF_EXPIRES
) ?
1166 jiffies
+ clock_t_to_jiffies(cfg
->fc_expires
) :
1169 if (cfg
->fc_protocol
== RTPROT_UNSPEC
)
1170 cfg
->fc_protocol
= RTPROT_BOOT
;
1171 rt
->rt6i_protocol
= cfg
->fc_protocol
;
1173 addr_type
= ipv6_addr_type(&cfg
->fc_dst
);
1175 if (addr_type
& IPV6_ADDR_MULTICAST
)
1176 rt
->u
.dst
.input
= ip6_mc_input
;
1178 rt
->u
.dst
.input
= ip6_forward
;
1180 rt
->u
.dst
.output
= ip6_output
;
1182 ipv6_addr_prefix(&rt
->rt6i_dst
.addr
, &cfg
->fc_dst
, cfg
->fc_dst_len
);
1183 rt
->rt6i_dst
.plen
= cfg
->fc_dst_len
;
1184 if (rt
->rt6i_dst
.plen
== 128)
1185 rt
->u
.dst
.flags
= DST_HOST
;
1187 #ifdef CONFIG_IPV6_SUBTREES
1188 ipv6_addr_prefix(&rt
->rt6i_src
.addr
, &cfg
->fc_src
, cfg
->fc_src_len
);
1189 rt
->rt6i_src
.plen
= cfg
->fc_src_len
;
1192 rt
->rt6i_metric
= cfg
->fc_metric
;
1194 /* We cannot add true routes via loopback here,
1195 they would result in kernel looping; promote them to reject routes
1197 if ((cfg
->fc_flags
& RTF_REJECT
) ||
1198 (dev
&& (dev
->flags
&IFF_LOOPBACK
) && !(addr_type
&IPV6_ADDR_LOOPBACK
))) {
1199 /* hold loopback dev/idev if we haven't done so. */
1200 if (dev
!= net
->loopback_dev
) {
1205 dev
= net
->loopback_dev
;
1207 idev
= in6_dev_get(dev
);
1213 rt
->u
.dst
.output
= ip6_pkt_discard_out
;
1214 rt
->u
.dst
.input
= ip6_pkt_discard
;
1215 rt
->u
.dst
.error
= -ENETUNREACH
;
1216 rt
->rt6i_flags
= RTF_REJECT
|RTF_NONEXTHOP
;
1220 if (cfg
->fc_flags
& RTF_GATEWAY
) {
1221 struct in6_addr
*gw_addr
;
1224 gw_addr
= &cfg
->fc_gateway
;
1225 ipv6_addr_copy(&rt
->rt6i_gateway
, gw_addr
);
1226 gwa_type
= ipv6_addr_type(gw_addr
);
1228 if (gwa_type
!= (IPV6_ADDR_LINKLOCAL
|IPV6_ADDR_UNICAST
)) {
1229 struct rt6_info
*grt
;
1231 /* IPv6 strictly prohibits using non-link-local
1232 addresses as the nexthop address.
1233 Otherwise, the router will not be able to send redirects.
1234 This is very good, but in some (rare!) circumstances
1235 (SIT, PtP, NBMA NOARP links) it is handy to allow
1236 some exceptions. --ANK
1239 if (!(gwa_type
&IPV6_ADDR_UNICAST
))
1242 grt
= rt6_lookup(net
, gw_addr
, NULL
, cfg
->fc_ifindex
, 1);
1244 err
= -EHOSTUNREACH
;
1248 if (dev
!= grt
->rt6i_dev
) {
1249 dst_release(&grt
->u
.dst
);
1253 dev
= grt
->rt6i_dev
;
1254 idev
= grt
->rt6i_idev
;
1256 in6_dev_hold(grt
->rt6i_idev
);
1258 if (!(grt
->rt6i_flags
&RTF_GATEWAY
))
1260 dst_release(&grt
->u
.dst
);
1266 if (dev
== NULL
|| (dev
->flags
&IFF_LOOPBACK
))
1274 if (cfg
->fc_flags
& (RTF_GATEWAY
| RTF_NONEXTHOP
)) {
1275 rt
->rt6i_nexthop
= __neigh_lookup_errno(&nd_tbl
, &rt
->rt6i_gateway
, dev
);
1276 if (IS_ERR(rt
->rt6i_nexthop
)) {
1277 err
= PTR_ERR(rt
->rt6i_nexthop
);
1278 rt
->rt6i_nexthop
= NULL
;
1283 rt
->rt6i_flags
= cfg
->fc_flags
;
1290 nla_for_each_attr(nla
, cfg
->fc_mx
, cfg
->fc_mx_len
, remaining
) {
1291 int type
= nla_type(nla
);
1294 if (type
> RTAX_MAX
) {
1299 rt
->u
.dst
.metrics
[type
- 1] = nla_get_u32(nla
);
1304 if (dst_metric(&rt
->u
.dst
, RTAX_HOPLIMIT
) == 0)
1305 rt
->u
.dst
.metrics
[RTAX_HOPLIMIT
-1] = -1;
1306 if (!dst_mtu(&rt
->u
.dst
))
1307 rt
->u
.dst
.metrics
[RTAX_MTU
-1] = ipv6_get_mtu(dev
);
1308 if (!dst_metric(&rt
->u
.dst
, RTAX_ADVMSS
))
1309 rt
->u
.dst
.metrics
[RTAX_ADVMSS
-1] = ipv6_advmss(net
, dst_mtu(&rt
->u
.dst
));
1310 rt
->u
.dst
.dev
= dev
;
1311 rt
->rt6i_idev
= idev
;
1312 rt
->rt6i_table
= table
;
1314 cfg
->fc_nlinfo
.nl_net
= dev_net(dev
);
1316 return __ip6_ins_rt(rt
, &cfg
->fc_nlinfo
);
1324 dst_free(&rt
->u
.dst
);
1328 static int __ip6_del_rt(struct rt6_info
*rt
, struct nl_info
*info
)
1331 struct fib6_table
*table
;
1332 struct net
*net
= dev_net(rt
->rt6i_dev
);
1334 if (rt
== net
->ipv6
.ip6_null_entry
)
1337 table
= rt
->rt6i_table
;
1338 write_lock_bh(&table
->tb6_lock
);
1340 err
= fib6_del(rt
, info
);
1341 dst_release(&rt
->u
.dst
);
1343 write_unlock_bh(&table
->tb6_lock
);
1348 int ip6_del_rt(struct rt6_info
*rt
)
1350 struct nl_info info
= {
1351 .nl_net
= dev_net(rt
->rt6i_dev
),
1353 return __ip6_del_rt(rt
, &info
);
1356 static int ip6_route_del(struct fib6_config
*cfg
)
1358 struct fib6_table
*table
;
1359 struct fib6_node
*fn
;
1360 struct rt6_info
*rt
;
1363 table
= fib6_get_table(cfg
->fc_nlinfo
.nl_net
, cfg
->fc_table
);
1367 read_lock_bh(&table
->tb6_lock
);
1369 fn
= fib6_locate(&table
->tb6_root
,
1370 &cfg
->fc_dst
, cfg
->fc_dst_len
,
1371 &cfg
->fc_src
, cfg
->fc_src_len
);
1374 for (rt
= fn
->leaf
; rt
; rt
= rt
->u
.dst
.rt6_next
) {
1375 if (cfg
->fc_ifindex
&&
1376 (rt
->rt6i_dev
== NULL
||
1377 rt
->rt6i_dev
->ifindex
!= cfg
->fc_ifindex
))
1379 if (cfg
->fc_flags
& RTF_GATEWAY
&&
1380 !ipv6_addr_equal(&cfg
->fc_gateway
, &rt
->rt6i_gateway
))
1382 if (cfg
->fc_metric
&& cfg
->fc_metric
!= rt
->rt6i_metric
)
1384 dst_hold(&rt
->u
.dst
);
1385 read_unlock_bh(&table
->tb6_lock
);
1387 return __ip6_del_rt(rt
, &cfg
->fc_nlinfo
);
1390 read_unlock_bh(&table
->tb6_lock
);
1398 struct ip6rd_flowi
{
1400 struct in6_addr gateway
;
1403 static struct rt6_info
*__ip6_route_redirect(struct net
*net
,
1404 struct fib6_table
*table
,
1408 struct ip6rd_flowi
*rdfl
= (struct ip6rd_flowi
*)fl
;
1409 struct rt6_info
*rt
;
1410 struct fib6_node
*fn
;
1413 * Get the "current" route for this destination and
1414 * check if the redirect has come from an appropriate router.
1416 * RFC 2461 specifies that redirects should only be
1417 * accepted if they come from the nexthop to the target.
1418 * Due to the way the routes are chosen, this notion
1419 * is a bit fuzzy and one might need to check all possible
1423 read_lock_bh(&table
->tb6_lock
);
1424 fn
= fib6_lookup(&table
->tb6_root
, &fl
->fl6_dst
, &fl
->fl6_src
);
1426 for (rt
= fn
->leaf
; rt
; rt
= rt
->u
.dst
.rt6_next
) {
1428 * Current route is on-link; redirect is always invalid.
1430 * It seems the previous statement is not true. It could
1431 * be a node which looks on-link to us (e.g. proxy ndisc).
1432 * But then the router serving it might decide that we should
1433 * know the truth 8)8) --ANK (980726).
1435 if (rt6_check_expired(rt
))
1437 if (!(rt
->rt6i_flags
& RTF_GATEWAY
))
1439 if (fl
->oif
!= rt
->rt6i_dev
->ifindex
)
1441 if (!ipv6_addr_equal(&rdfl
->gateway
, &rt
->rt6i_gateway
))
1447 rt
= net
->ipv6
.ip6_null_entry
;
1448 BACKTRACK(net
, &fl
->fl6_src
);
1450 dst_hold(&rt
->u
.dst
);
1452 read_unlock_bh(&table
->tb6_lock
);
1457 static struct rt6_info
*ip6_route_redirect(struct in6_addr
*dest
,
1458 struct in6_addr
*src
,
1459 struct in6_addr
*gateway
,
1460 struct net_device
*dev
)
1462 int flags
= RT6_LOOKUP_F_HAS_SADDR
;
1463 struct net
*net
= dev_net(dev
);
1464 struct ip6rd_flowi rdfl
= {
1466 .oif
= dev
->ifindex
,
1476 ipv6_addr_copy(&rdfl
.gateway
, gateway
);
1478 if (rt6_need_strict(dest
))
1479 flags
|= RT6_LOOKUP_F_IFACE
;
1481 return (struct rt6_info
*)fib6_rule_lookup(net
, (struct flowi
*)&rdfl
,
1482 flags
, __ip6_route_redirect
);
1485 void rt6_redirect(struct in6_addr
*dest
, struct in6_addr
*src
,
1486 struct in6_addr
*saddr
,
1487 struct neighbour
*neigh
, u8
*lladdr
, int on_link
)
1489 struct rt6_info
*rt
, *nrt
= NULL
;
1490 struct netevent_redirect netevent
;
1491 struct net
*net
= dev_net(neigh
->dev
);
1493 rt
= ip6_route_redirect(dest
, src
, saddr
, neigh
->dev
);
1495 if (rt
== net
->ipv6
.ip6_null_entry
) {
1496 if (net_ratelimit())
1497 printk(KERN_DEBUG
"rt6_redirect: source isn't a valid nexthop "
1498 "for redirect target\n");
1503 * We have finally decided to accept it.
1506 neigh_update(neigh
, lladdr
, NUD_STALE
,
1507 NEIGH_UPDATE_F_WEAK_OVERRIDE
|
1508 NEIGH_UPDATE_F_OVERRIDE
|
1509 (on_link
? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER
|
1510 NEIGH_UPDATE_F_ISROUTER
))
1514 * Redirect received -> path was valid.
1515 * Look, redirects are sent only in response to data packets,
1516 * so that this nexthop apparently is reachable. --ANK
1518 dst_confirm(&rt
->u
.dst
);
1520 /* Duplicate redirect: silently ignore. */
1521 if (neigh
== rt
->u
.dst
.neighbour
)
1524 nrt
= ip6_rt_copy(rt
);
1528 nrt
->rt6i_flags
= RTF_GATEWAY
|RTF_UP
|RTF_DYNAMIC
|RTF_CACHE
;
1530 nrt
->rt6i_flags
&= ~RTF_GATEWAY
;
1532 ipv6_addr_copy(&nrt
->rt6i_dst
.addr
, dest
);
1533 nrt
->rt6i_dst
.plen
= 128;
1534 nrt
->u
.dst
.flags
|= DST_HOST
;
1536 ipv6_addr_copy(&nrt
->rt6i_gateway
, (struct in6_addr
*)neigh
->primary_key
);
1537 nrt
->rt6i_nexthop
= neigh_clone(neigh
);
1538 /* Reset pmtu, it may be better */
1539 nrt
->u
.dst
.metrics
[RTAX_MTU
-1] = ipv6_get_mtu(neigh
->dev
);
1540 nrt
->u
.dst
.metrics
[RTAX_ADVMSS
-1] = ipv6_advmss(dev_net(neigh
->dev
),
1541 dst_mtu(&nrt
->u
.dst
));
1543 if (ip6_ins_rt(nrt
))
1546 netevent
.old
= &rt
->u
.dst
;
1547 netevent
.new = &nrt
->u
.dst
;
1548 call_netevent_notifiers(NETEVENT_REDIRECT
, &netevent
);
1550 if (rt
->rt6i_flags
&RTF_CACHE
) {
1556 dst_release(&rt
->u
.dst
);
1561 * Handle ICMP "packet too big" messages
1562 * i.e. Path MTU discovery
1565 void rt6_pmtu_discovery(struct in6_addr
*daddr
, struct in6_addr
*saddr
,
1566 struct net_device
*dev
, u32 pmtu
)
1568 struct rt6_info
*rt
, *nrt
;
1569 struct net
*net
= dev_net(dev
);
1572 rt
= rt6_lookup(net
, daddr
, saddr
, dev
->ifindex
, 0);
1576 if (pmtu
>= dst_mtu(&rt
->u
.dst
))
1579 if (pmtu
< IPV6_MIN_MTU
) {
1581 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1582 * MTU (1280) and a fragment header should always be included
1583 * after a node receiving Too Big message reporting PMTU is
1584 * less than the IPv6 Minimum Link MTU.
1586 pmtu
= IPV6_MIN_MTU
;
1590 /* New mtu received -> path was valid.
1591 They are sent only in response to data packets,
1592 so that this nexthop apparently is reachable. --ANK
1594 dst_confirm(&rt
->u
.dst
);
1596 /* Host route. If it is static, it would be better
1597 not to override it, but add new one, so that
1598 when cache entry will expire old pmtu
1599 would return automatically.
1601 if (rt
->rt6i_flags
& RTF_CACHE
) {
1602 rt
->u
.dst
.metrics
[RTAX_MTU
-1] = pmtu
;
1604 rt
->u
.dst
.metrics
[RTAX_FEATURES
-1] |= RTAX_FEATURE_ALLFRAG
;
1605 dst_set_expires(&rt
->u
.dst
, net
->ipv6
.sysctl
.ip6_rt_mtu_expires
);
1606 rt
->rt6i_flags
|= RTF_MODIFIED
|RTF_EXPIRES
;
1611 Two cases are possible:
1612 1. It is connected route. Action: COW
1613 2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1615 if (!rt
->rt6i_nexthop
&& !(rt
->rt6i_flags
& RTF_NONEXTHOP
))
1616 nrt
= rt6_alloc_cow(rt
, daddr
, saddr
);
1618 nrt
= rt6_alloc_clone(rt
, daddr
);
1621 nrt
->u
.dst
.metrics
[RTAX_MTU
-1] = pmtu
;
1623 nrt
->u
.dst
.metrics
[RTAX_FEATURES
-1] |= RTAX_FEATURE_ALLFRAG
;
1625 /* According to RFC 1981, detecting PMTU increase shouldn't be
1626 * happened within 5 mins, the recommended timer is 10 mins.
1627 * Here this route expiration time is set to ip6_rt_mtu_expires
1628 * which is 10 mins. After 10 mins the decreased pmtu is expired
1629 * and detecting PMTU increase will be automatically happened.
1631 dst_set_expires(&nrt
->u
.dst
, net
->ipv6
.sysctl
.ip6_rt_mtu_expires
);
1632 nrt
->rt6i_flags
|= RTF_DYNAMIC
|RTF_EXPIRES
;
1637 dst_release(&rt
->u
.dst
);
1641 * Misc support functions
1644 static struct rt6_info
* ip6_rt_copy(struct rt6_info
*ort
)
1646 struct net
*net
= dev_net(ort
->rt6i_dev
);
1647 struct rt6_info
*rt
= ip6_dst_alloc(&net
->ipv6
.ip6_dst_ops
);
1650 rt
->u
.dst
.input
= ort
->u
.dst
.input
;
1651 rt
->u
.dst
.output
= ort
->u
.dst
.output
;
1653 memcpy(rt
->u
.dst
.metrics
, ort
->u
.dst
.metrics
, RTAX_MAX
*sizeof(u32
));
1654 rt
->u
.dst
.error
= ort
->u
.dst
.error
;
1655 rt
->u
.dst
.dev
= ort
->u
.dst
.dev
;
1657 dev_hold(rt
->u
.dst
.dev
);
1658 rt
->rt6i_idev
= ort
->rt6i_idev
;
1660 in6_dev_hold(rt
->rt6i_idev
);
1661 rt
->u
.dst
.lastuse
= jiffies
;
1662 rt
->rt6i_expires
= 0;
1664 ipv6_addr_copy(&rt
->rt6i_gateway
, &ort
->rt6i_gateway
);
1665 rt
->rt6i_flags
= ort
->rt6i_flags
& ~RTF_EXPIRES
;
1666 rt
->rt6i_metric
= 0;
1668 memcpy(&rt
->rt6i_dst
, &ort
->rt6i_dst
, sizeof(struct rt6key
));
1669 #ifdef CONFIG_IPV6_SUBTREES
1670 memcpy(&rt
->rt6i_src
, &ort
->rt6i_src
, sizeof(struct rt6key
));
1672 rt
->rt6i_table
= ort
->rt6i_table
;
1677 #ifdef CONFIG_IPV6_ROUTE_INFO
1678 static struct rt6_info
*rt6_get_route_info(struct net
*net
,
1679 struct in6_addr
*prefix
, int prefixlen
,
1680 struct in6_addr
*gwaddr
, int ifindex
)
1682 struct fib6_node
*fn
;
1683 struct rt6_info
*rt
= NULL
;
1684 struct fib6_table
*table
;
1686 table
= fib6_get_table(net
, RT6_TABLE_INFO
);
1690 write_lock_bh(&table
->tb6_lock
);
1691 fn
= fib6_locate(&table
->tb6_root
, prefix
,prefixlen
, NULL
, 0);
1695 for (rt
= fn
->leaf
; rt
; rt
= rt
->u
.dst
.rt6_next
) {
1696 if (rt
->rt6i_dev
->ifindex
!= ifindex
)
1698 if ((rt
->rt6i_flags
& (RTF_ROUTEINFO
|RTF_GATEWAY
)) != (RTF_ROUTEINFO
|RTF_GATEWAY
))
1700 if (!ipv6_addr_equal(&rt
->rt6i_gateway
, gwaddr
))
1702 dst_hold(&rt
->u
.dst
);
1706 write_unlock_bh(&table
->tb6_lock
);
1710 static struct rt6_info
*rt6_add_route_info(struct net
*net
,
1711 struct in6_addr
*prefix
, int prefixlen
,
1712 struct in6_addr
*gwaddr
, int ifindex
,
1715 struct fib6_config cfg
= {
1716 .fc_table
= RT6_TABLE_INFO
,
1717 .fc_metric
= IP6_RT_PRIO_USER
,
1718 .fc_ifindex
= ifindex
,
1719 .fc_dst_len
= prefixlen
,
1720 .fc_flags
= RTF_GATEWAY
| RTF_ADDRCONF
| RTF_ROUTEINFO
|
1721 RTF_UP
| RTF_PREF(pref
),
1723 .fc_nlinfo
.nlh
= NULL
,
1724 .fc_nlinfo
.nl_net
= net
,
1727 ipv6_addr_copy(&cfg
.fc_dst
, prefix
);
1728 ipv6_addr_copy(&cfg
.fc_gateway
, gwaddr
);
1730 /* We should treat it as a default route if prefix length is 0. */
1732 cfg
.fc_flags
|= RTF_DEFAULT
;
1734 ip6_route_add(&cfg
);
1736 return rt6_get_route_info(net
, prefix
, prefixlen
, gwaddr
, ifindex
);
1740 struct rt6_info
*rt6_get_dflt_router(struct in6_addr
*addr
, struct net_device
*dev
)
1742 struct rt6_info
*rt
;
1743 struct fib6_table
*table
;
1745 table
= fib6_get_table(dev_net(dev
), RT6_TABLE_DFLT
);
1749 write_lock_bh(&table
->tb6_lock
);
1750 for (rt
= table
->tb6_root
.leaf
; rt
; rt
=rt
->u
.dst
.rt6_next
) {
1751 if (dev
== rt
->rt6i_dev
&&
1752 ((rt
->rt6i_flags
& (RTF_ADDRCONF
| RTF_DEFAULT
)) == (RTF_ADDRCONF
| RTF_DEFAULT
)) &&
1753 ipv6_addr_equal(&rt
->rt6i_gateway
, addr
))
1757 dst_hold(&rt
->u
.dst
);
1758 write_unlock_bh(&table
->tb6_lock
);
1762 struct rt6_info
*rt6_add_dflt_router(struct in6_addr
*gwaddr
,
1763 struct net_device
*dev
,
1766 struct fib6_config cfg
= {
1767 .fc_table
= RT6_TABLE_DFLT
,
1768 .fc_metric
= IP6_RT_PRIO_USER
,
1769 .fc_ifindex
= dev
->ifindex
,
1770 .fc_flags
= RTF_GATEWAY
| RTF_ADDRCONF
| RTF_DEFAULT
|
1771 RTF_UP
| RTF_EXPIRES
| RTF_PREF(pref
),
1773 .fc_nlinfo
.nlh
= NULL
,
1774 .fc_nlinfo
.nl_net
= dev_net(dev
),
1777 ipv6_addr_copy(&cfg
.fc_gateway
, gwaddr
);
1779 ip6_route_add(&cfg
);
1781 return rt6_get_dflt_router(gwaddr
, dev
);
1784 void rt6_purge_dflt_routers(struct net
*net
)
1786 struct rt6_info
*rt
;
1787 struct fib6_table
*table
;
1789 /* NOTE: Keep consistent with rt6_get_dflt_router */
1790 table
= fib6_get_table(net
, RT6_TABLE_DFLT
);
1795 read_lock_bh(&table
->tb6_lock
);
1796 for (rt
= table
->tb6_root
.leaf
; rt
; rt
= rt
->u
.dst
.rt6_next
) {
1797 if (rt
->rt6i_flags
& (RTF_DEFAULT
| RTF_ADDRCONF
)) {
1798 dst_hold(&rt
->u
.dst
);
1799 read_unlock_bh(&table
->tb6_lock
);
1804 read_unlock_bh(&table
->tb6_lock
);
1807 static void rtmsg_to_fib6_config(struct net
*net
,
1808 struct in6_rtmsg
*rtmsg
,
1809 struct fib6_config
*cfg
)
1811 memset(cfg
, 0, sizeof(*cfg
));
1813 cfg
->fc_table
= RT6_TABLE_MAIN
;
1814 cfg
->fc_ifindex
= rtmsg
->rtmsg_ifindex
;
1815 cfg
->fc_metric
= rtmsg
->rtmsg_metric
;
1816 cfg
->fc_expires
= rtmsg
->rtmsg_info
;
1817 cfg
->fc_dst_len
= rtmsg
->rtmsg_dst_len
;
1818 cfg
->fc_src_len
= rtmsg
->rtmsg_src_len
;
1819 cfg
->fc_flags
= rtmsg
->rtmsg_flags
;
1821 cfg
->fc_nlinfo
.nl_net
= net
;
1823 ipv6_addr_copy(&cfg
->fc_dst
, &rtmsg
->rtmsg_dst
);
1824 ipv6_addr_copy(&cfg
->fc_src
, &rtmsg
->rtmsg_src
);
1825 ipv6_addr_copy(&cfg
->fc_gateway
, &rtmsg
->rtmsg_gateway
);
1828 int ipv6_route_ioctl(struct net
*net
, unsigned int cmd
, void __user
*arg
)
1830 struct fib6_config cfg
;
1831 struct in6_rtmsg rtmsg
;
1835 case SIOCADDRT
: /* Add a route */
1836 case SIOCDELRT
: /* Delete a route */
1837 if (!capable(CAP_NET_ADMIN
))
1839 err
= copy_from_user(&rtmsg
, arg
,
1840 sizeof(struct in6_rtmsg
));
1844 rtmsg_to_fib6_config(net
, &rtmsg
, &cfg
);
1849 err
= ip6_route_add(&cfg
);
1852 err
= ip6_route_del(&cfg
);
1866 * Drop the packet on the floor
1869 static int ip6_pkt_drop(struct sk_buff
*skb
, u8 code
, int ipstats_mib_noroutes
)
1872 struct dst_entry
*dst
= skb_dst(skb
);
1873 switch (ipstats_mib_noroutes
) {
1874 case IPSTATS_MIB_INNOROUTES
:
1875 type
= ipv6_addr_type(&ipv6_hdr(skb
)->daddr
);
1876 if (type
== IPV6_ADDR_ANY
|| type
== IPV6_ADDR_RESERVED
) {
1877 IP6_INC_STATS(dev_net(dst
->dev
), ip6_dst_idev(dst
),
1878 IPSTATS_MIB_INADDRERRORS
);
1882 case IPSTATS_MIB_OUTNOROUTES
:
1883 IP6_INC_STATS(dev_net(dst
->dev
), ip6_dst_idev(dst
),
1884 ipstats_mib_noroutes
);
1887 icmpv6_send(skb
, ICMPV6_DEST_UNREACH
, code
, 0, skb
->dev
);
1892 static int ip6_pkt_discard(struct sk_buff
*skb
)
1894 return ip6_pkt_drop(skb
, ICMPV6_NOROUTE
, IPSTATS_MIB_INNOROUTES
);
1897 static int ip6_pkt_discard_out(struct sk_buff
*skb
)
1899 skb
->dev
= skb_dst(skb
)->dev
;
1900 return ip6_pkt_drop(skb
, ICMPV6_NOROUTE
, IPSTATS_MIB_OUTNOROUTES
);
1903 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1905 static int ip6_pkt_prohibit(struct sk_buff
*skb
)
1907 return ip6_pkt_drop(skb
, ICMPV6_ADM_PROHIBITED
, IPSTATS_MIB_INNOROUTES
);
1910 static int ip6_pkt_prohibit_out(struct sk_buff
*skb
)
1912 skb
->dev
= skb_dst(skb
)->dev
;
1913 return ip6_pkt_drop(skb
, ICMPV6_ADM_PROHIBITED
, IPSTATS_MIB_OUTNOROUTES
);
1919 * Allocate a dst for local (unicast / anycast) address.
1922 struct rt6_info
*addrconf_dst_alloc(struct inet6_dev
*idev
,
1923 const struct in6_addr
*addr
,
1926 struct net
*net
= dev_net(idev
->dev
);
1927 struct rt6_info
*rt
= ip6_dst_alloc(&net
->ipv6
.ip6_dst_ops
);
1928 struct neighbour
*neigh
;
1931 return ERR_PTR(-ENOMEM
);
1933 dev_hold(net
->loopback_dev
);
1936 rt
->u
.dst
.flags
= DST_HOST
;
1937 rt
->u
.dst
.input
= ip6_input
;
1938 rt
->u
.dst
.output
= ip6_output
;
1939 rt
->rt6i_dev
= net
->loopback_dev
;
1940 rt
->rt6i_idev
= idev
;
1941 rt
->u
.dst
.metrics
[RTAX_MTU
-1] = ipv6_get_mtu(rt
->rt6i_dev
);
1942 rt
->u
.dst
.metrics
[RTAX_ADVMSS
-1] = ipv6_advmss(net
, dst_mtu(&rt
->u
.dst
));
1943 rt
->u
.dst
.metrics
[RTAX_HOPLIMIT
-1] = -1;
1944 rt
->u
.dst
.obsolete
= -1;
1946 rt
->rt6i_flags
= RTF_UP
| RTF_NONEXTHOP
;
1948 rt
->rt6i_flags
|= RTF_ANYCAST
;
1950 rt
->rt6i_flags
|= RTF_LOCAL
;
1951 neigh
= ndisc_get_neigh(rt
->rt6i_dev
, &rt
->rt6i_gateway
);
1952 if (IS_ERR(neigh
)) {
1953 dst_free(&rt
->u
.dst
);
1955 /* We are casting this because that is the return
1956 * value type. But an errno encoded pointer is the
1957 * same regardless of the underlying pointer type,
1958 * and that's what we are returning. So this is OK.
1960 return (struct rt6_info
*) neigh
;
1962 rt
->rt6i_nexthop
= neigh
;
1964 ipv6_addr_copy(&rt
->rt6i_dst
.addr
, addr
);
1965 rt
->rt6i_dst
.plen
= 128;
1966 rt
->rt6i_table
= fib6_get_table(net
, RT6_TABLE_LOCAL
);
1968 atomic_set(&rt
->u
.dst
.__refcnt
, 1);
1973 struct arg_dev_net
{
1974 struct net_device
*dev
;
1978 static int fib6_ifdown(struct rt6_info
*rt
, void *arg
)
1980 struct net_device
*dev
= ((struct arg_dev_net
*)arg
)->dev
;
1981 struct net
*net
= ((struct arg_dev_net
*)arg
)->net
;
1983 if (((void *)rt
->rt6i_dev
== dev
|| dev
== NULL
) &&
1984 rt
!= net
->ipv6
.ip6_null_entry
) {
1985 RT6_TRACE("deleted by ifdown %p\n", rt
);
1991 void rt6_ifdown(struct net
*net
, struct net_device
*dev
)
1993 struct arg_dev_net adn
= {
1998 fib6_clean_all(net
, fib6_ifdown
, 0, &adn
);
1999 icmp6_clean_all(fib6_ifdown
, &adn
);
2002 struct rt6_mtu_change_arg
2004 struct net_device
*dev
;
2008 static int rt6_mtu_change_route(struct rt6_info
*rt
, void *p_arg
)
2010 struct rt6_mtu_change_arg
*arg
= (struct rt6_mtu_change_arg
*) p_arg
;
2011 struct inet6_dev
*idev
;
2012 struct net
*net
= dev_net(arg
->dev
);
2014 /* In IPv6 pmtu discovery is not optional,
2015 so that RTAX_MTU lock cannot disable it.
2016 We still use this lock to block changes
2017 caused by addrconf/ndisc.
2020 idev
= __in6_dev_get(arg
->dev
);
2024 /* For administrative MTU increase, there is no way to discover
2025 IPv6 PMTU increase, so PMTU increase should be updated here.
2026 Since RFC 1981 doesn't include administrative MTU increase
2027 update PMTU increase is a MUST. (i.e. jumbo frame)
2030 If new MTU is less than route PMTU, this new MTU will be the
2031 lowest MTU in the path, update the route PMTU to reflect PMTU
2032 decreases; if new MTU is greater than route PMTU, and the
2033 old MTU is the lowest MTU in the path, update the route PMTU
2034 to reflect the increase. In this case if the other nodes' MTU
2035 also have the lowest MTU, TOO BIG MESSAGE will be lead to
2038 if (rt
->rt6i_dev
== arg
->dev
&&
2039 !dst_metric_locked(&rt
->u
.dst
, RTAX_MTU
) &&
2040 (dst_mtu(&rt
->u
.dst
) >= arg
->mtu
||
2041 (dst_mtu(&rt
->u
.dst
) < arg
->mtu
&&
2042 dst_mtu(&rt
->u
.dst
) == idev
->cnf
.mtu6
))) {
2043 rt
->u
.dst
.metrics
[RTAX_MTU
-1] = arg
->mtu
;
2044 rt
->u
.dst
.metrics
[RTAX_ADVMSS
-1] = ipv6_advmss(net
, arg
->mtu
);
2049 void rt6_mtu_change(struct net_device
*dev
, unsigned mtu
)
2051 struct rt6_mtu_change_arg arg
= {
2056 fib6_clean_all(dev_net(dev
), rt6_mtu_change_route
, 0, &arg
);
2059 static const struct nla_policy rtm_ipv6_policy
[RTA_MAX
+1] = {
2060 [RTA_GATEWAY
] = { .len
= sizeof(struct in6_addr
) },
2061 [RTA_OIF
] = { .type
= NLA_U32
},
2062 [RTA_IIF
] = { .type
= NLA_U32
},
2063 [RTA_PRIORITY
] = { .type
= NLA_U32
},
2064 [RTA_METRICS
] = { .type
= NLA_NESTED
},
2067 static int rtm_to_fib6_config(struct sk_buff
*skb
, struct nlmsghdr
*nlh
,
2068 struct fib6_config
*cfg
)
2071 struct nlattr
*tb
[RTA_MAX
+1];
2074 err
= nlmsg_parse(nlh
, sizeof(*rtm
), tb
, RTA_MAX
, rtm_ipv6_policy
);
2079 rtm
= nlmsg_data(nlh
);
2080 memset(cfg
, 0, sizeof(*cfg
));
2082 cfg
->fc_table
= rtm
->rtm_table
;
2083 cfg
->fc_dst_len
= rtm
->rtm_dst_len
;
2084 cfg
->fc_src_len
= rtm
->rtm_src_len
;
2085 cfg
->fc_flags
= RTF_UP
;
2086 cfg
->fc_protocol
= rtm
->rtm_protocol
;
2088 if (rtm
->rtm_type
== RTN_UNREACHABLE
)
2089 cfg
->fc_flags
|= RTF_REJECT
;
2091 cfg
->fc_nlinfo
.pid
= NETLINK_CB(skb
).pid
;
2092 cfg
->fc_nlinfo
.nlh
= nlh
;
2093 cfg
->fc_nlinfo
.nl_net
= sock_net(skb
->sk
);
2095 if (tb
[RTA_GATEWAY
]) {
2096 nla_memcpy(&cfg
->fc_gateway
, tb
[RTA_GATEWAY
], 16);
2097 cfg
->fc_flags
|= RTF_GATEWAY
;
2101 int plen
= (rtm
->rtm_dst_len
+ 7) >> 3;
2103 if (nla_len(tb
[RTA_DST
]) < plen
)
2106 nla_memcpy(&cfg
->fc_dst
, tb
[RTA_DST
], plen
);
2110 int plen
= (rtm
->rtm_src_len
+ 7) >> 3;
2112 if (nla_len(tb
[RTA_SRC
]) < plen
)
2115 nla_memcpy(&cfg
->fc_src
, tb
[RTA_SRC
], plen
);
2119 cfg
->fc_ifindex
= nla_get_u32(tb
[RTA_OIF
]);
2121 if (tb
[RTA_PRIORITY
])
2122 cfg
->fc_metric
= nla_get_u32(tb
[RTA_PRIORITY
]);
2124 if (tb
[RTA_METRICS
]) {
2125 cfg
->fc_mx
= nla_data(tb
[RTA_METRICS
]);
2126 cfg
->fc_mx_len
= nla_len(tb
[RTA_METRICS
]);
2130 cfg
->fc_table
= nla_get_u32(tb
[RTA_TABLE
]);
2137 static int inet6_rtm_delroute(struct sk_buff
*skb
, struct nlmsghdr
* nlh
, void *arg
)
2139 struct fib6_config cfg
;
2142 err
= rtm_to_fib6_config(skb
, nlh
, &cfg
);
2146 return ip6_route_del(&cfg
);
2149 static int inet6_rtm_newroute(struct sk_buff
*skb
, struct nlmsghdr
* nlh
, void *arg
)
2151 struct fib6_config cfg
;
2154 err
= rtm_to_fib6_config(skb
, nlh
, &cfg
);
2158 return ip6_route_add(&cfg
);
2161 static inline size_t rt6_nlmsg_size(void)
2163 return NLMSG_ALIGN(sizeof(struct rtmsg
))
2164 + nla_total_size(16) /* RTA_SRC */
2165 + nla_total_size(16) /* RTA_DST */
2166 + nla_total_size(16) /* RTA_GATEWAY */
2167 + nla_total_size(16) /* RTA_PREFSRC */
2168 + nla_total_size(4) /* RTA_TABLE */
2169 + nla_total_size(4) /* RTA_IIF */
2170 + nla_total_size(4) /* RTA_OIF */
2171 + nla_total_size(4) /* RTA_PRIORITY */
2172 + RTAX_MAX
* nla_total_size(4) /* RTA_METRICS */
2173 + nla_total_size(sizeof(struct rta_cacheinfo
));
2176 static int rt6_fill_node(struct net
*net
,
2177 struct sk_buff
*skb
, struct rt6_info
*rt
,
2178 struct in6_addr
*dst
, struct in6_addr
*src
,
2179 int iif
, int type
, u32 pid
, u32 seq
,
2180 int prefix
, int nowait
, unsigned int flags
)
2183 struct nlmsghdr
*nlh
;
2187 if (prefix
) { /* user wants prefix routes only */
2188 if (!(rt
->rt6i_flags
& RTF_PREFIX_RT
)) {
2189 /* success since this is not a prefix route */
2194 nlh
= nlmsg_put(skb
, pid
, seq
, type
, sizeof(*rtm
), flags
);
2198 rtm
= nlmsg_data(nlh
);
2199 rtm
->rtm_family
= AF_INET6
;
2200 rtm
->rtm_dst_len
= rt
->rt6i_dst
.plen
;
2201 rtm
->rtm_src_len
= rt
->rt6i_src
.plen
;
2204 table
= rt
->rt6i_table
->tb6_id
;
2206 table
= RT6_TABLE_UNSPEC
;
2207 rtm
->rtm_table
= table
;
2208 NLA_PUT_U32(skb
, RTA_TABLE
, table
);
2209 if (rt
->rt6i_flags
&RTF_REJECT
)
2210 rtm
->rtm_type
= RTN_UNREACHABLE
;
2211 else if (rt
->rt6i_dev
&& (rt
->rt6i_dev
->flags
&IFF_LOOPBACK
))
2212 rtm
->rtm_type
= RTN_LOCAL
;
2214 rtm
->rtm_type
= RTN_UNICAST
;
2216 rtm
->rtm_scope
= RT_SCOPE_UNIVERSE
;
2217 rtm
->rtm_protocol
= rt
->rt6i_protocol
;
2218 if (rt
->rt6i_flags
&RTF_DYNAMIC
)
2219 rtm
->rtm_protocol
= RTPROT_REDIRECT
;
2220 else if (rt
->rt6i_flags
& RTF_ADDRCONF
)
2221 rtm
->rtm_protocol
= RTPROT_KERNEL
;
2222 else if (rt
->rt6i_flags
&RTF_DEFAULT
)
2223 rtm
->rtm_protocol
= RTPROT_RA
;
2225 if (rt
->rt6i_flags
&RTF_CACHE
)
2226 rtm
->rtm_flags
|= RTM_F_CLONED
;
2229 NLA_PUT(skb
, RTA_DST
, 16, dst
);
2230 rtm
->rtm_dst_len
= 128;
2231 } else if (rtm
->rtm_dst_len
)
2232 NLA_PUT(skb
, RTA_DST
, 16, &rt
->rt6i_dst
.addr
);
2233 #ifdef CONFIG_IPV6_SUBTREES
2235 NLA_PUT(skb
, RTA_SRC
, 16, src
);
2236 rtm
->rtm_src_len
= 128;
2237 } else if (rtm
->rtm_src_len
)
2238 NLA_PUT(skb
, RTA_SRC
, 16, &rt
->rt6i_src
.addr
);
2241 #ifdef CONFIG_IPV6_MROUTE
2242 if (ipv6_addr_is_multicast(&rt
->rt6i_dst
.addr
)) {
2243 int err
= ip6mr_get_route(net
, skb
, rtm
, nowait
);
2248 goto nla_put_failure
;
2250 if (err
== -EMSGSIZE
)
2251 goto nla_put_failure
;
2256 NLA_PUT_U32(skb
, RTA_IIF
, iif
);
2258 struct inet6_dev
*idev
= ip6_dst_idev(&rt
->u
.dst
);
2259 struct in6_addr saddr_buf
;
2260 if (ipv6_dev_get_saddr(net
, idev
? idev
->dev
: NULL
,
2261 dst
, 0, &saddr_buf
) == 0)
2262 NLA_PUT(skb
, RTA_PREFSRC
, 16, &saddr_buf
);
2265 if (rtnetlink_put_metrics(skb
, rt
->u
.dst
.metrics
) < 0)
2266 goto nla_put_failure
;
2268 if (rt
->u
.dst
.neighbour
)
2269 NLA_PUT(skb
, RTA_GATEWAY
, 16, &rt
->u
.dst
.neighbour
->primary_key
);
2272 NLA_PUT_U32(skb
, RTA_OIF
, rt
->rt6i_dev
->ifindex
);
2274 NLA_PUT_U32(skb
, RTA_PRIORITY
, rt
->rt6i_metric
);
2276 if (!(rt
->rt6i_flags
& RTF_EXPIRES
))
2278 else if (rt
->rt6i_expires
- jiffies
< INT_MAX
)
2279 expires
= rt
->rt6i_expires
- jiffies
;
2283 if (rtnl_put_cacheinfo(skb
, &rt
->u
.dst
, 0, 0, 0,
2284 expires
, rt
->u
.dst
.error
) < 0)
2285 goto nla_put_failure
;
2287 return nlmsg_end(skb
, nlh
);
2290 nlmsg_cancel(skb
, nlh
);
2294 int rt6_dump_route(struct rt6_info
*rt
, void *p_arg
)
2296 struct rt6_rtnl_dump_arg
*arg
= (struct rt6_rtnl_dump_arg
*) p_arg
;
2299 if (nlmsg_len(arg
->cb
->nlh
) >= sizeof(struct rtmsg
)) {
2300 struct rtmsg
*rtm
= nlmsg_data(arg
->cb
->nlh
);
2301 prefix
= (rtm
->rtm_flags
& RTM_F_PREFIX
) != 0;
2305 return rt6_fill_node(arg
->net
,
2306 arg
->skb
, rt
, NULL
, NULL
, 0, RTM_NEWROUTE
,
2307 NETLINK_CB(arg
->cb
->skb
).pid
, arg
->cb
->nlh
->nlmsg_seq
,
2308 prefix
, 0, NLM_F_MULTI
);
2311 static int inet6_rtm_getroute(struct sk_buff
*in_skb
, struct nlmsghdr
* nlh
, void *arg
)
2313 struct net
*net
= sock_net(in_skb
->sk
);
2314 struct nlattr
*tb
[RTA_MAX
+1];
2315 struct rt6_info
*rt
;
2316 struct sk_buff
*skb
;
2321 err
= nlmsg_parse(nlh
, sizeof(*rtm
), tb
, RTA_MAX
, rtm_ipv6_policy
);
2326 memset(&fl
, 0, sizeof(fl
));
2329 if (nla_len(tb
[RTA_SRC
]) < sizeof(struct in6_addr
))
2332 ipv6_addr_copy(&fl
.fl6_src
, nla_data(tb
[RTA_SRC
]));
2336 if (nla_len(tb
[RTA_DST
]) < sizeof(struct in6_addr
))
2339 ipv6_addr_copy(&fl
.fl6_dst
, nla_data(tb
[RTA_DST
]));
2343 iif
= nla_get_u32(tb
[RTA_IIF
]);
2346 fl
.oif
= nla_get_u32(tb
[RTA_OIF
]);
2349 struct net_device
*dev
;
2350 dev
= __dev_get_by_index(net
, iif
);
2357 skb
= alloc_skb(NLMSG_GOODSIZE
, GFP_KERNEL
);
2363 /* Reserve room for dummy headers, this skb can pass
2364 through good chunk of routing engine.
2366 skb_reset_mac_header(skb
);
2367 skb_reserve(skb
, MAX_HEADER
+ sizeof(struct ipv6hdr
));
2369 rt
= (struct rt6_info
*) ip6_route_output(net
, NULL
, &fl
);
2370 skb_dst_set(skb
, &rt
->u
.dst
);
2372 err
= rt6_fill_node(net
, skb
, rt
, &fl
.fl6_dst
, &fl
.fl6_src
, iif
,
2373 RTM_NEWROUTE
, NETLINK_CB(in_skb
).pid
,
2374 nlh
->nlmsg_seq
, 0, 0, 0);
2380 err
= rtnl_unicast(skb
, net
, NETLINK_CB(in_skb
).pid
);
2385 void inet6_rt_notify(int event
, struct rt6_info
*rt
, struct nl_info
*info
)
2387 struct sk_buff
*skb
;
2388 struct net
*net
= info
->nl_net
;
2393 seq
= info
->nlh
!= NULL
? info
->nlh
->nlmsg_seq
: 0;
2395 skb
= nlmsg_new(rt6_nlmsg_size(), gfp_any());
2399 err
= rt6_fill_node(net
, skb
, rt
, NULL
, NULL
, 0,
2400 event
, info
->pid
, seq
, 0, 0, 0);
2402 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2403 WARN_ON(err
== -EMSGSIZE
);
2407 rtnl_notify(skb
, net
, info
->pid
, RTNLGRP_IPV6_ROUTE
,
2408 info
->nlh
, gfp_any());
2412 rtnl_set_sk_err(net
, RTNLGRP_IPV6_ROUTE
, err
);
2415 static int ip6_route_dev_notify(struct notifier_block
*this,
2416 unsigned long event
, void *data
)
2418 struct net_device
*dev
= (struct net_device
*)data
;
2419 struct net
*net
= dev_net(dev
);
2421 if (event
== NETDEV_REGISTER
&& (dev
->flags
& IFF_LOOPBACK
)) {
2422 net
->ipv6
.ip6_null_entry
->u
.dst
.dev
= dev
;
2423 net
->ipv6
.ip6_null_entry
->rt6i_idev
= in6_dev_get(dev
);
2424 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2425 net
->ipv6
.ip6_prohibit_entry
->u
.dst
.dev
= dev
;
2426 net
->ipv6
.ip6_prohibit_entry
->rt6i_idev
= in6_dev_get(dev
);
2427 net
->ipv6
.ip6_blk_hole_entry
->u
.dst
.dev
= dev
;
2428 net
->ipv6
.ip6_blk_hole_entry
->rt6i_idev
= in6_dev_get(dev
);
2439 #ifdef CONFIG_PROC_FS
2441 #define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)
2452 static int rt6_info_route(struct rt6_info
*rt
, void *p_arg
)
2454 struct seq_file
*m
= p_arg
;
2456 seq_printf(m
, "%pi6 %02x ", &rt
->rt6i_dst
.addr
, rt
->rt6i_dst
.plen
);
2458 #ifdef CONFIG_IPV6_SUBTREES
2459 seq_printf(m
, "%pi6 %02x ", &rt
->rt6i_src
.addr
, rt
->rt6i_src
.plen
);
2461 seq_puts(m
, "00000000000000000000000000000000 00 ");
2464 if (rt
->rt6i_nexthop
) {
2465 seq_printf(m
, "%pi6", rt
->rt6i_nexthop
->primary_key
);
2467 seq_puts(m
, "00000000000000000000000000000000");
2469 seq_printf(m
, " %08x %08x %08x %08x %8s\n",
2470 rt
->rt6i_metric
, atomic_read(&rt
->u
.dst
.__refcnt
),
2471 rt
->u
.dst
.__use
, rt
->rt6i_flags
,
2472 rt
->rt6i_dev
? rt
->rt6i_dev
->name
: "");
2476 static int ipv6_route_show(struct seq_file
*m
, void *v
)
2478 struct net
*net
= (struct net
*)m
->private;
2479 fib6_clean_all(net
, rt6_info_route
, 0, m
);
2483 static int ipv6_route_open(struct inode
*inode
, struct file
*file
)
2485 return single_open_net(inode
, file
, ipv6_route_show
);
2488 static const struct file_operations ipv6_route_proc_fops
= {
2489 .owner
= THIS_MODULE
,
2490 .open
= ipv6_route_open
,
2492 .llseek
= seq_lseek
,
2493 .release
= single_release_net
,
2496 static int rt6_stats_seq_show(struct seq_file
*seq
, void *v
)
2498 struct net
*net
= (struct net
*)seq
->private;
2499 seq_printf(seq
, "%04x %04x %04x %04x %04x %04x %04x\n",
2500 net
->ipv6
.rt6_stats
->fib_nodes
,
2501 net
->ipv6
.rt6_stats
->fib_route_nodes
,
2502 net
->ipv6
.rt6_stats
->fib_rt_alloc
,
2503 net
->ipv6
.rt6_stats
->fib_rt_entries
,
2504 net
->ipv6
.rt6_stats
->fib_rt_cache
,
2505 atomic_read(&net
->ipv6
.ip6_dst_ops
.entries
),
2506 net
->ipv6
.rt6_stats
->fib_discarded_routes
);
2511 static int rt6_stats_seq_open(struct inode
*inode
, struct file
*file
)
2513 return single_open_net(inode
, file
, rt6_stats_seq_show
);
2516 static const struct file_operations rt6_stats_seq_fops
= {
2517 .owner
= THIS_MODULE
,
2518 .open
= rt6_stats_seq_open
,
2520 .llseek
= seq_lseek
,
2521 .release
= single_release_net
,
2523 #endif /* CONFIG_PROC_FS */
2525 #ifdef CONFIG_SYSCTL
2528 int ipv6_sysctl_rtcache_flush(ctl_table
*ctl
, int write
,
2529 void __user
*buffer
, size_t *lenp
, loff_t
*ppos
)
2531 struct net
*net
= current
->nsproxy
->net_ns
;
2532 int delay
= net
->ipv6
.sysctl
.flush_delay
;
2534 proc_dointvec(ctl
, write
, buffer
, lenp
, ppos
);
2535 fib6_run_gc(delay
<= 0 ? ~0UL : (unsigned long)delay
, net
);
2541 ctl_table ipv6_route_table_template
[] = {
2543 .procname
= "flush",
2544 .data
= &init_net
.ipv6
.sysctl
.flush_delay
,
2545 .maxlen
= sizeof(int),
2547 .proc_handler
= ipv6_sysctl_rtcache_flush
2550 .procname
= "gc_thresh",
2551 .data
= &ip6_dst_ops_template
.gc_thresh
,
2552 .maxlen
= sizeof(int),
2554 .proc_handler
= proc_dointvec
,
2557 .procname
= "max_size",
2558 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_max_size
,
2559 .maxlen
= sizeof(int),
2561 .proc_handler
= proc_dointvec
,
2564 .procname
= "gc_min_interval",
2565 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_gc_min_interval
,
2566 .maxlen
= sizeof(int),
2568 .proc_handler
= proc_dointvec_jiffies
,
2571 .procname
= "gc_timeout",
2572 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_gc_timeout
,
2573 .maxlen
= sizeof(int),
2575 .proc_handler
= proc_dointvec_jiffies
,
2578 .procname
= "gc_interval",
2579 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_gc_interval
,
2580 .maxlen
= sizeof(int),
2582 .proc_handler
= proc_dointvec_jiffies
,
2585 .procname
= "gc_elasticity",
2586 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_gc_elasticity
,
2587 .maxlen
= sizeof(int),
2589 .proc_handler
= proc_dointvec_jiffies
,
2592 .procname
= "mtu_expires",
2593 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_mtu_expires
,
2594 .maxlen
= sizeof(int),
2596 .proc_handler
= proc_dointvec_jiffies
,
2599 .procname
= "min_adv_mss",
2600 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_min_advmss
,
2601 .maxlen
= sizeof(int),
2603 .proc_handler
= proc_dointvec_jiffies
,
2606 .procname
= "gc_min_interval_ms",
2607 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_gc_min_interval
,
2608 .maxlen
= sizeof(int),
2610 .proc_handler
= proc_dointvec_ms_jiffies
,
2615 struct ctl_table
*ipv6_route_sysctl_init(struct net
*net
)
2617 struct ctl_table
*table
;
2619 table
= kmemdup(ipv6_route_table_template
,
2620 sizeof(ipv6_route_table_template
),
2624 table
[0].data
= &net
->ipv6
.sysctl
.flush_delay
;
2625 table
[1].data
= &net
->ipv6
.ip6_dst_ops
.gc_thresh
;
2626 table
[2].data
= &net
->ipv6
.sysctl
.ip6_rt_max_size
;
2627 table
[3].data
= &net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
;
2628 table
[4].data
= &net
->ipv6
.sysctl
.ip6_rt_gc_timeout
;
2629 table
[5].data
= &net
->ipv6
.sysctl
.ip6_rt_gc_interval
;
2630 table
[6].data
= &net
->ipv6
.sysctl
.ip6_rt_gc_elasticity
;
2631 table
[7].data
= &net
->ipv6
.sysctl
.ip6_rt_mtu_expires
;
2632 table
[8].data
= &net
->ipv6
.sysctl
.ip6_rt_min_advmss
;
2633 table
[9].data
= &net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
;
2640 static int ip6_route_net_init(struct net
*net
)
2644 memcpy(&net
->ipv6
.ip6_dst_ops
, &ip6_dst_ops_template
,
2645 sizeof(net
->ipv6
.ip6_dst_ops
));
2647 net
->ipv6
.ip6_null_entry
= kmemdup(&ip6_null_entry_template
,
2648 sizeof(*net
->ipv6
.ip6_null_entry
),
2650 if (!net
->ipv6
.ip6_null_entry
)
2651 goto out_ip6_dst_ops
;
2652 net
->ipv6
.ip6_null_entry
->u
.dst
.path
=
2653 (struct dst_entry
*)net
->ipv6
.ip6_null_entry
;
2654 net
->ipv6
.ip6_null_entry
->u
.dst
.ops
= &net
->ipv6
.ip6_dst_ops
;
2656 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2657 net
->ipv6
.ip6_prohibit_entry
= kmemdup(&ip6_prohibit_entry_template
,
2658 sizeof(*net
->ipv6
.ip6_prohibit_entry
),
2660 if (!net
->ipv6
.ip6_prohibit_entry
)
2661 goto out_ip6_null_entry
;
2662 net
->ipv6
.ip6_prohibit_entry
->u
.dst
.path
=
2663 (struct dst_entry
*)net
->ipv6
.ip6_prohibit_entry
;
2664 net
->ipv6
.ip6_prohibit_entry
->u
.dst
.ops
= &net
->ipv6
.ip6_dst_ops
;
2666 net
->ipv6
.ip6_blk_hole_entry
= kmemdup(&ip6_blk_hole_entry_template
,
2667 sizeof(*net
->ipv6
.ip6_blk_hole_entry
),
2669 if (!net
->ipv6
.ip6_blk_hole_entry
)
2670 goto out_ip6_prohibit_entry
;
2671 net
->ipv6
.ip6_blk_hole_entry
->u
.dst
.path
=
2672 (struct dst_entry
*)net
->ipv6
.ip6_blk_hole_entry
;
2673 net
->ipv6
.ip6_blk_hole_entry
->u
.dst
.ops
= &net
->ipv6
.ip6_dst_ops
;
2676 net
->ipv6
.sysctl
.flush_delay
= 0;
2677 net
->ipv6
.sysctl
.ip6_rt_max_size
= 4096;
2678 net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
= HZ
/ 2;
2679 net
->ipv6
.sysctl
.ip6_rt_gc_timeout
= 60*HZ
;
2680 net
->ipv6
.sysctl
.ip6_rt_gc_interval
= 30*HZ
;
2681 net
->ipv6
.sysctl
.ip6_rt_gc_elasticity
= 9;
2682 net
->ipv6
.sysctl
.ip6_rt_mtu_expires
= 10*60*HZ
;
2683 net
->ipv6
.sysctl
.ip6_rt_min_advmss
= IPV6_MIN_MTU
- 20 - 40;
2685 #ifdef CONFIG_PROC_FS
2686 proc_net_fops_create(net
, "ipv6_route", 0, &ipv6_route_proc_fops
);
2687 proc_net_fops_create(net
, "rt6_stats", S_IRUGO
, &rt6_stats_seq_fops
);
2689 net
->ipv6
.ip6_rt_gc_expire
= 30*HZ
;
2695 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2696 out_ip6_prohibit_entry
:
2697 kfree(net
->ipv6
.ip6_prohibit_entry
);
2699 kfree(net
->ipv6
.ip6_null_entry
);
2705 static void ip6_route_net_exit(struct net
*net
)
2707 #ifdef CONFIG_PROC_FS
2708 proc_net_remove(net
, "ipv6_route");
2709 proc_net_remove(net
, "rt6_stats");
2711 kfree(net
->ipv6
.ip6_null_entry
);
2712 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2713 kfree(net
->ipv6
.ip6_prohibit_entry
);
2714 kfree(net
->ipv6
.ip6_blk_hole_entry
);
2718 static struct pernet_operations ip6_route_net_ops
= {
2719 .init
= ip6_route_net_init
,
2720 .exit
= ip6_route_net_exit
,
2723 static struct notifier_block ip6_route_dev_notifier
= {
2724 .notifier_call
= ip6_route_dev_notify
,
2728 int __init
ip6_route_init(void)
2733 ip6_dst_ops_template
.kmem_cachep
=
2734 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info
), 0,
2735 SLAB_HWCACHE_ALIGN
, NULL
);
2736 if (!ip6_dst_ops_template
.kmem_cachep
)
2739 ret
= register_pernet_subsys(&ip6_route_net_ops
);
2741 goto out_kmem_cache
;
2743 ip6_dst_blackhole_ops
.kmem_cachep
= ip6_dst_ops_template
.kmem_cachep
;
2745 /* Registering of the loopback is done before this portion of code,
2746 * the loopback reference in rt6_info will not be taken, do it
2747 * manually for init_net */
2748 init_net
.ipv6
.ip6_null_entry
->u
.dst
.dev
= init_net
.loopback_dev
;
2749 init_net
.ipv6
.ip6_null_entry
->rt6i_idev
= in6_dev_get(init_net
.loopback_dev
);
2750 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2751 init_net
.ipv6
.ip6_prohibit_entry
->u
.dst
.dev
= init_net
.loopback_dev
;
2752 init_net
.ipv6
.ip6_prohibit_entry
->rt6i_idev
= in6_dev_get(init_net
.loopback_dev
);
2753 init_net
.ipv6
.ip6_blk_hole_entry
->u
.dst
.dev
= init_net
.loopback_dev
;
2754 init_net
.ipv6
.ip6_blk_hole_entry
->rt6i_idev
= in6_dev_get(init_net
.loopback_dev
);
2758 goto out_register_subsys
;
2764 ret
= fib6_rules_init();
2769 if (__rtnl_register(PF_INET6
, RTM_NEWROUTE
, inet6_rtm_newroute
, NULL
) ||
2770 __rtnl_register(PF_INET6
, RTM_DELROUTE
, inet6_rtm_delroute
, NULL
) ||
2771 __rtnl_register(PF_INET6
, RTM_GETROUTE
, inet6_rtm_getroute
, NULL
))
2772 goto fib6_rules_init
;
2774 ret
= register_netdevice_notifier(&ip6_route_dev_notifier
);
2776 goto fib6_rules_init
;
2782 fib6_rules_cleanup();
2787 out_register_subsys
:
2788 unregister_pernet_subsys(&ip6_route_net_ops
);
2790 kmem_cache_destroy(ip6_dst_ops_template
.kmem_cachep
);
2794 void ip6_route_cleanup(void)
2796 unregister_netdevice_notifier(&ip6_route_dev_notifier
);
2797 fib6_rules_cleanup();
2800 unregister_pernet_subsys(&ip6_route_net_ops
);
2801 kmem_cache_destroy(ip6_dst_ops_template
.kmem_cachep
);