net/ipv6/route.c
1 /*
2 * Linux INET6 implementation
3 * FIB front-end.
5 * Authors:
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
14 /* Changes:
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
23 * Ville Nuorvala
24 * Fixed routing subtrees.
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <linux/slab.h>
44 #include <net/net_namespace.h>
45 #include <net/snmp.h>
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
51 #include <net/tcp.h>
52 #include <linux/rtnetlink.h>
53 #include <net/dst.h>
54 #include <net/xfrm.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
58 #include <asm/uaccess.h>
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
64 /* Set to 3 to get tracing. */
65 #define RT6_DEBUG 2
67 #if RT6_DEBUG >= 3
68 #define RDBG(x) printk x
69 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
70 #else
71 #define RDBG(x)
72 #define RT6_TRACE(x...) do { ; } while (0)
73 #endif
75 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
76 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
77 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
78 static unsigned int ip6_default_mtu(const struct dst_entry *dst);
79 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
80 static void ip6_dst_destroy(struct dst_entry *);
81 static void ip6_dst_ifdown(struct dst_entry *,
82 struct net_device *dev, int how);
83 static int ip6_dst_gc(struct dst_ops *ops);
85 static int ip6_pkt_discard(struct sk_buff *skb);
86 static int ip6_pkt_discard_out(struct sk_buff *skb);
87 static void ip6_link_failure(struct sk_buff *skb);
88 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
90 #ifdef CONFIG_IPV6_ROUTE_INFO
91 static struct rt6_info *rt6_add_route_info(struct net *net,
92 struct in6_addr *prefix, int prefixlen,
93 struct in6_addr *gwaddr, int ifindex,
94 unsigned pref);
95 static struct rt6_info *rt6_get_route_info(struct net *net,
96 struct in6_addr *prefix, int prefixlen,
97 struct in6_addr *gwaddr, int ifindex);
98 #endif
100 static struct dst_ops ip6_dst_ops_template = {
101 .family = AF_INET6,
102 .protocol = cpu_to_be16(ETH_P_IPV6),
103 .gc = ip6_dst_gc,
104 .gc_thresh = 1024,
105 .check = ip6_dst_check,
106 .default_advmss = ip6_default_advmss,
107 .default_mtu = ip6_default_mtu,
108 .destroy = ip6_dst_destroy,
109 .ifdown = ip6_dst_ifdown,
110 .negative_advice = ip6_negative_advice,
111 .link_failure = ip6_link_failure,
112 .update_pmtu = ip6_rt_update_pmtu,
113 .local_out = __ip6_local_out,
116 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
120 static struct dst_ops ip6_dst_blackhole_ops = {
121 .family = AF_INET6,
122 .protocol = cpu_to_be16(ETH_P_IPV6),
123 .destroy = ip6_dst_destroy,
124 .check = ip6_dst_check,
125 .update_pmtu = ip6_rt_blackhole_update_pmtu,
128 static struct rt6_info ip6_null_entry_template = {
129 .dst = {
130 .__refcnt = ATOMIC_INIT(1),
131 .__use = 1,
132 .obsolete = -1,
133 .error = -ENETUNREACH,
134 .input = ip6_pkt_discard,
135 .output = ip6_pkt_discard_out,
137 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
138 .rt6i_protocol = RTPROT_KERNEL,
139 .rt6i_metric = ~(u32) 0,
140 .rt6i_ref = ATOMIC_INIT(1),
143 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
145 static int ip6_pkt_prohibit(struct sk_buff *skb);
146 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
148 static struct rt6_info ip6_prohibit_entry_template = {
149 .dst = {
150 .__refcnt = ATOMIC_INIT(1),
151 .__use = 1,
152 .obsolete = -1,
153 .error = -EACCES,
154 .input = ip6_pkt_prohibit,
155 .output = ip6_pkt_prohibit_out,
157 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
158 .rt6i_protocol = RTPROT_KERNEL,
159 .rt6i_metric = ~(u32) 0,
160 .rt6i_ref = ATOMIC_INIT(1),
163 static struct rt6_info ip6_blk_hole_entry_template = {
164 .dst = {
165 .__refcnt = ATOMIC_INIT(1),
166 .__use = 1,
167 .obsolete = -1,
168 .error = -EINVAL,
169 .input = dst_discard,
170 .output = dst_discard,
172 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
173 .rt6i_protocol = RTPROT_KERNEL,
174 .rt6i_metric = ~(u32) 0,
175 .rt6i_ref = ATOMIC_INIT(1),
178 #endif
180 /* allocate dst with ip6_dst_ops */
181 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops)
183 return (struct rt6_info *)dst_alloc(ops);
186 static void ip6_dst_destroy(struct dst_entry *dst)
188 struct rt6_info *rt = (struct rt6_info *)dst;
189 struct inet6_dev *idev = rt->rt6i_idev;
190 struct inet_peer *peer = rt->rt6i_peer;
192 if (idev != NULL) {
193 rt->rt6i_idev = NULL;
194 in6_dev_put(idev);
196 if (peer) {
197 BUG_ON(!(rt->rt6i_flags & RTF_CACHE));
198 rt->rt6i_peer = NULL;
199 inet_putpeer(peer);
203 void rt6_bind_peer(struct rt6_info *rt, int create)
205 struct inet_peer *peer;
207 if (WARN_ON(!(rt->rt6i_flags & RTF_CACHE)))
208 return;
210 peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
211 if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
212 inet_putpeer(peer);
215 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
216 int how)
218 struct rt6_info *rt = (struct rt6_info *)dst;
219 struct inet6_dev *idev = rt->rt6i_idev;
220 struct net_device *loopback_dev =
221 dev_net(dev)->loopback_dev;
223 if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
224 struct inet6_dev *loopback_idev =
225 in6_dev_get(loopback_dev);
226 if (loopback_idev != NULL) {
227 rt->rt6i_idev = loopback_idev;
228 in6_dev_put(idev);
233 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
235 return (rt->rt6i_flags & RTF_EXPIRES) &&
236 time_after(jiffies, rt->rt6i_expires);
239 static inline int rt6_need_strict(struct in6_addr *daddr)
241 return ipv6_addr_type(daddr) &
242 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
246 * Route lookup. table->tb6_lock is assumed to be held by the caller.
249 static inline struct rt6_info *rt6_device_match(struct net *net,
250 struct rt6_info *rt,
251 struct in6_addr *saddr,
252 int oif,
253 int flags)
255 struct rt6_info *local = NULL;
256 struct rt6_info *sprt;
258 if (!oif && ipv6_addr_any(saddr))
259 goto out;
261 for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
262 struct net_device *dev = sprt->rt6i_dev;
264 if (oif) {
265 if (dev->ifindex == oif)
266 return sprt;
267 if (dev->flags & IFF_LOOPBACK) {
268 if (sprt->rt6i_idev == NULL ||
269 sprt->rt6i_idev->dev->ifindex != oif) {
270 if (flags & RT6_LOOKUP_F_IFACE && oif)
271 continue;
272 if (local && (!oif ||
273 local->rt6i_idev->dev->ifindex == oif))
274 continue;
276 local = sprt;
278 } else {
279 if (ipv6_chk_addr(net, saddr, dev,
280 flags & RT6_LOOKUP_F_IFACE))
281 return sprt;
285 if (oif) {
286 if (local)
287 return local;
289 if (flags & RT6_LOOKUP_F_IFACE)
290 return net->ipv6.ip6_null_entry;
292 out:
293 return rt;
296 #ifdef CONFIG_IPV6_ROUTER_PREF
297 static void rt6_probe(struct rt6_info *rt)
299 struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
301 * Okay, this does not seem to be appropriate
302 * for now, however, we need to check if it
303 * is really so; aka Router Reachability Probing.
305 * Router Reachability Probe MUST be rate-limited
306 * to no more than one per minute.
308 if (!neigh || (neigh->nud_state & NUD_VALID))
309 return;
310 read_lock_bh(&neigh->lock);
311 if (!(neigh->nud_state & NUD_VALID) &&
312 time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
313 struct in6_addr mcaddr;
314 struct in6_addr *target;
316 neigh->updated = jiffies;
317 read_unlock_bh(&neigh->lock);
319 target = (struct in6_addr *)&neigh->primary_key;
320 addrconf_addr_solict_mult(target, &mcaddr);
321 ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
322 } else
323 read_unlock_bh(&neigh->lock);
325 #else
326 static inline void rt6_probe(struct rt6_info *rt)
329 #endif
332 * Default Router Selection (RFC 2461 6.3.6)
334 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
336 struct net_device *dev = rt->rt6i_dev;
337 if (!oif || dev->ifindex == oif)
338 return 2;
339 if ((dev->flags & IFF_LOOPBACK) &&
340 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
341 return 1;
342 return 0;
345 static inline int rt6_check_neigh(struct rt6_info *rt)
347 struct neighbour *neigh = rt->rt6i_nexthop;
348 int m;
349 if (rt->rt6i_flags & RTF_NONEXTHOP ||
350 !(rt->rt6i_flags & RTF_GATEWAY))
351 m = 1;
352 else if (neigh) {
353 read_lock_bh(&neigh->lock);
354 if (neigh->nud_state & NUD_VALID)
355 m = 2;
356 #ifdef CONFIG_IPV6_ROUTER_PREF
357 else if (neigh->nud_state & NUD_FAILED)
358 m = 0;
359 #endif
360 else
361 m = 1;
362 read_unlock_bh(&neigh->lock);
363 } else
364 m = 0;
365 return m;
368 static int rt6_score_route(struct rt6_info *rt, int oif,
369 int strict)
371 int m, n;
373 m = rt6_check_dev(rt, oif);
374 if (!m && (strict & RT6_LOOKUP_F_IFACE))
375 return -1;
376 #ifdef CONFIG_IPV6_ROUTER_PREF
377 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
378 #endif
379 n = rt6_check_neigh(rt);
380 if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
381 return -1;
382 return m;
385 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
386 int *mpri, struct rt6_info *match)
388 int m;
390 if (rt6_check_expired(rt))
391 goto out;
393 m = rt6_score_route(rt, oif, strict);
394 if (m < 0)
395 goto out;
397 if (m > *mpri) {
398 if (strict & RT6_LOOKUP_F_REACHABLE)
399 rt6_probe(match);
400 *mpri = m;
401 match = rt;
402 } else if (strict & RT6_LOOKUP_F_REACHABLE) {
403 rt6_probe(rt);
406 out:
407 return match;
410 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
411 struct rt6_info *rr_head,
412 u32 metric, int oif, int strict)
414 struct rt6_info *rt, *match;
415 int mpri = -1;
417 match = NULL;
418 for (rt = rr_head; rt && rt->rt6i_metric == metric;
419 rt = rt->dst.rt6_next)
420 match = find_match(rt, oif, strict, &mpri, match);
421 for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
422 rt = rt->dst.rt6_next)
423 match = find_match(rt, oif, strict, &mpri, match);
425 return match;
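/*
 * rt6_select() implements the round-robin default router selection
 * described in the header comment: find_rr_leaf() scores every route
 * sharing the current rr_ptr's metric, and if nothing matched while
 * RT6_LOOKUP_F_REACHABLE was requested, fn->rr_ptr is advanced to the
 * next same-metric route so that later lookups try a different router.
 */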
428 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
430 struct rt6_info *match, *rt0;
431 struct net *net;
433 RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
434 __func__, fn->leaf, oif);
436 rt0 = fn->rr_ptr;
437 if (!rt0)
438 fn->rr_ptr = rt0 = fn->leaf;
440 match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
442 if (!match &&
443 (strict & RT6_LOOKUP_F_REACHABLE)) {
444 struct rt6_info *next = rt0->dst.rt6_next;
446 /* no entries matched; do round-robin */
447 if (!next || next->rt6i_metric != rt0->rt6i_metric)
448 next = fn->leaf;
450 if (next != rt0)
451 fn->rr_ptr = next;
454 RT6_TRACE("%s() => %p\n",
455 __func__, match);
457 net = dev_net(rt0->rt6i_dev);
458 return match ? match : net->ipv6.ip6_null_entry;
461 #ifdef CONFIG_IPV6_ROUTE_INFO
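/*
 * rt6_route_rcv() handles a Route Information option (RFC 4191) carried
 * in a Router Advertisement: it sanity-checks the option length, prefix
 * length and preference, then adds, refreshes or deletes the matching
 * RTF_ROUTEINFO route depending on the advertised route lifetime.
 */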
462 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
463 struct in6_addr *gwaddr)
465 struct net *net = dev_net(dev);
466 struct route_info *rinfo = (struct route_info *) opt;
467 struct in6_addr prefix_buf, *prefix;
468 unsigned int pref;
469 unsigned long lifetime;
470 struct rt6_info *rt;
472 if (len < sizeof(struct route_info)) {
473 return -EINVAL;
476 /* Sanity check for prefix_len and length */
477 if (rinfo->length > 3) {
478 return -EINVAL;
479 } else if (rinfo->prefix_len > 128) {
480 return -EINVAL;
481 } else if (rinfo->prefix_len > 64) {
482 if (rinfo->length < 2) {
483 return -EINVAL;
485 } else if (rinfo->prefix_len > 0) {
486 if (rinfo->length < 1) {
487 return -EINVAL;
491 pref = rinfo->route_pref;
492 if (pref == ICMPV6_ROUTER_PREF_INVALID)
493 return -EINVAL;
495 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
497 if (rinfo->length == 3)
498 prefix = (struct in6_addr *)rinfo->prefix;
499 else {
500 /* this function is safe */
501 ipv6_addr_prefix(&prefix_buf,
502 (struct in6_addr *)rinfo->prefix,
503 rinfo->prefix_len);
504 prefix = &prefix_buf;
507 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
508 dev->ifindex);
510 if (rt && !lifetime) {
511 ip6_del_rt(rt);
512 rt = NULL;
515 if (!rt && lifetime)
516 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
517 pref);
518 else if (rt)
519 rt->rt6i_flags = RTF_ROUTEINFO |
520 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
522 if (rt) {
523 if (!addrconf_finite_timeout(lifetime)) {
524 rt->rt6i_flags &= ~RTF_EXPIRES;
525 } else {
526 rt->rt6i_expires = jiffies + HZ * lifetime;
527 rt->rt6i_flags |= RTF_EXPIRES;
529 dst_release(&rt->dst);
531 return 0;
533 #endif
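/*
 * BACKTRACK() is shared by the ip6_pol_route*() lookup functions below.
 * When the selected route is the null entry it walks back up the fib6
 * tree (descending into source-address subtrees where they exist) and
 * jumps to the caller's "restart" label at the next node carrying route
 * information, or to "out" once the tree root has been reached.
 */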
535 #define BACKTRACK(__net, saddr) \
536 do { \
537 if (rt == __net->ipv6.ip6_null_entry) { \
538 struct fib6_node *pn; \
539 while (1) { \
540 if (fn->fn_flags & RTN_TL_ROOT) \
541 goto out; \
542 pn = fn->parent; \
543 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
544 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
545 else \
546 fn = pn; \
547 if (fn->fn_flags & RTN_RTINFO) \
548 goto restart; \
551 } while(0)
553 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
554 struct fib6_table *table,
555 struct flowi *fl, int flags)
557 struct fib6_node *fn;
558 struct rt6_info *rt;
560 read_lock_bh(&table->tb6_lock);
561 fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
562 restart:
563 rt = fn->leaf;
564 rt = rt6_device_match(net, rt, &fl->fl6_src, fl->oif, flags);
565 BACKTRACK(net, &fl->fl6_src);
566 out:
567 dst_use(&rt->dst, jiffies);
568 read_unlock_bh(&table->tb6_lock);
569 return rt;
573 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
574 const struct in6_addr *saddr, int oif, int strict)
576 struct flowi fl = {
577 .oif = oif,
578 .fl6_dst = *daddr,
580 struct dst_entry *dst;
581 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
583 if (saddr) {
584 memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
585 flags |= RT6_LOOKUP_F_HAS_SADDR;
588 dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_lookup);
589 if (dst->error == 0)
590 return (struct rt6_info *) dst;
592 dst_release(dst);
594 return NULL;
597 EXPORT_SYMBOL(rt6_lookup);
599 /* ip6_ins_rt is called with table->tb6_lock NOT held.
600 It takes ownership of the new route entry; if the addition fails
601 for any reason, the route is freed. In any case, if the caller does
602 not hold a reference, the entry may be destroyed.
605 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
607 int err;
608 struct fib6_table *table;
610 table = rt->rt6i_table;
611 write_lock_bh(&table->tb6_lock);
612 err = fib6_add(&table->tb6_root, rt, info);
613 write_unlock_bh(&table->tb6_lock);
615 return err;
618 int ip6_ins_rt(struct rt6_info *rt)
620 struct nl_info info = {
621 .nl_net = dev_net(rt->rt6i_dev),
623 return __ip6_ins_rt(rt, &info);
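/*
 * rt6_alloc_cow() and rt6_alloc_clone() below derive a per-destination
 * RTF_CACHE host route (/128) from a matching network route.
 * rt6_alloc_cow() also resolves a neighbour entry for the gateway; on
 * neighbour table overflow it temporarily relaxes the GC sysctls, forces
 * a GC run and retries once (only outside softirq context).
 */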
626 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
627 struct in6_addr *saddr)
629 struct rt6_info *rt;
632 * Clone the route.
635 rt = ip6_rt_copy(ort);
637 if (rt) {
638 struct neighbour *neigh;
639 int attempts = !in_softirq();
641 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
642 if (rt->rt6i_dst.plen != 128 &&
643 ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
644 rt->rt6i_flags |= RTF_ANYCAST;
645 ipv6_addr_copy(&rt->rt6i_gateway, daddr);
648 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
649 rt->rt6i_dst.plen = 128;
650 rt->rt6i_flags |= RTF_CACHE;
651 rt->dst.flags |= DST_HOST;
653 #ifdef CONFIG_IPV6_SUBTREES
654 if (rt->rt6i_src.plen && saddr) {
655 ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
656 rt->rt6i_src.plen = 128;
658 #endif
660 retry:
661 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
662 if (IS_ERR(neigh)) {
663 struct net *net = dev_net(rt->rt6i_dev);
664 int saved_rt_min_interval =
665 net->ipv6.sysctl.ip6_rt_gc_min_interval;
666 int saved_rt_elasticity =
667 net->ipv6.sysctl.ip6_rt_gc_elasticity;
669 if (attempts-- > 0) {
670 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
671 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
673 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
675 net->ipv6.sysctl.ip6_rt_gc_elasticity =
676 saved_rt_elasticity;
677 net->ipv6.sysctl.ip6_rt_gc_min_interval =
678 saved_rt_min_interval;
679 goto retry;
682 if (net_ratelimit())
683 printk(KERN_WARNING
684 "ipv6: Neighbour table overflow.\n");
685 dst_free(&rt->dst);
686 return NULL;
688 rt->rt6i_nexthop = neigh;
692 return rt;
695 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
697 struct rt6_info *rt = ip6_rt_copy(ort);
698 if (rt) {
699 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
700 rt->rt6i_dst.plen = 128;
701 rt->rt6i_flags |= RTF_CACHE;
702 rt->dst.flags |= DST_HOST;
703 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
705 return rt;
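/*
 * ip6_pol_route() is the core resolver shared by the input and output
 * lookup paths.  It first tries with RT6_LOOKUP_F_REACHABLE (unless
 * forwarding is enabled), backtracking through the tree as needed, and
 * falls back to a second pass without the reachability requirement if
 * nothing matched.  For a route that is not yet cached it installs an
 * RTF_CACHE clone (copy-on-write for connected routes, a plain clone
 * for gatewayed or NONEXTHOP routes), retrying a few times to cope with
 * insert races once tb6_lock has been dropped.
 */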
708 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
709 struct flowi *fl, int flags)
711 struct fib6_node *fn;
712 struct rt6_info *rt, *nrt;
713 int strict = 0;
714 int attempts = 3;
715 int err;
716 int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
718 strict |= flags & RT6_LOOKUP_F_IFACE;
720 relookup:
721 read_lock_bh(&table->tb6_lock);
723 restart_2:
724 fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
726 restart:
727 rt = rt6_select(fn, oif, strict | reachable);
729 BACKTRACK(net, &fl->fl6_src);
730 if (rt == net->ipv6.ip6_null_entry ||
731 rt->rt6i_flags & RTF_CACHE)
732 goto out;
734 dst_hold(&rt->dst);
735 read_unlock_bh(&table->tb6_lock);
737 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
738 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
739 else
740 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
742 dst_release(&rt->dst);
743 rt = nrt ? : net->ipv6.ip6_null_entry;
745 dst_hold(&rt->dst);
746 if (nrt) {
747 err = ip6_ins_rt(nrt);
748 if (!err)
749 goto out2;
752 if (--attempts <= 0)
753 goto out2;
756 * Race condition! In the gap, when table->tb6_lock was
757 * released someone could insert this route. Relookup.
759 dst_release(&rt->dst);
760 goto relookup;
762 out:
763 if (reachable) {
764 reachable = 0;
765 goto restart_2;
767 dst_hold(&rt->dst);
768 read_unlock_bh(&table->tb6_lock);
769 out2:
770 rt->dst.lastuse = jiffies;
771 rt->dst.__use++;
773 return rt;
776 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
777 struct flowi *fl, int flags)
779 return ip6_pol_route(net, table, fl->iif, fl, flags);
782 void ip6_route_input(struct sk_buff *skb)
784 struct ipv6hdr *iph = ipv6_hdr(skb);
785 struct net *net = dev_net(skb->dev);
786 int flags = RT6_LOOKUP_F_HAS_SADDR;
787 struct flowi fl = {
788 .iif = skb->dev->ifindex,
789 .fl6_dst = iph->daddr,
790 .fl6_src = iph->saddr,
791 .fl6_flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
792 .mark = skb->mark,
793 .proto = iph->nexthdr,
796 if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
797 flags |= RT6_LOOKUP_F_IFACE;
799 skb_dst_set(skb, fib6_rule_lookup(net, &fl, flags, ip6_pol_route_input));
802 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
803 struct flowi *fl, int flags)
805 return ip6_pol_route(net, table, fl->oif, fl, flags);
808 struct dst_entry * ip6_route_output(struct net *net, struct sock *sk,
809 struct flowi *fl)
811 int flags = 0;
813 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl->fl6_dst))
814 flags |= RT6_LOOKUP_F_IFACE;
816 if (!ipv6_addr_any(&fl->fl6_src))
817 flags |= RT6_LOOKUP_F_HAS_SADDR;
818 else if (sk)
819 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
821 return fib6_rule_lookup(net, fl, flags, ip6_pol_route_output);
824 EXPORT_SYMBOL(ip6_route_output);
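/*
 * ip6_dst_blackhole() replaces *dstp with a standalone copy allocated
 * from ip6_dst_blackhole_ops whose input and output handlers simply
 * discard packets.  The copy inherits metrics, device and gateway from
 * the original route but is never inserted into the FIB; callers
 * presumably substitute it when the original dst must not carry real
 * traffic (for instance while an XFRM resolution is still pending).
 */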
826 int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl)
828 struct rt6_info *ort = (struct rt6_info *) *dstp;
829 struct rt6_info *rt = (struct rt6_info *)
830 dst_alloc(&ip6_dst_blackhole_ops);
831 struct dst_entry *new = NULL;
833 if (rt) {
834 new = &rt->dst;
836 atomic_set(&new->__refcnt, 1);
837 new->__use = 1;
838 new->input = dst_discard;
839 new->output = dst_discard;
841 dst_copy_metrics(new, &ort->dst);
842 new->dev = ort->dst.dev;
843 if (new->dev)
844 dev_hold(new->dev);
845 rt->rt6i_idev = ort->rt6i_idev;
846 if (rt->rt6i_idev)
847 in6_dev_hold(rt->rt6i_idev);
848 rt->rt6i_expires = 0;
850 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
851 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
852 rt->rt6i_metric = 0;
854 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
855 #ifdef CONFIG_IPV6_SUBTREES
856 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
857 #endif
859 dst_free(new);
862 dst_release(*dstp);
863 *dstp = new;
864 return new ? 0 : -ENOMEM;
866 EXPORT_SYMBOL_GPL(ip6_dst_blackhole);
869 * Destination cache support functions
872 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
874 struct rt6_info *rt;
876 rt = (struct rt6_info *) dst;
878 if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
879 return dst;
881 return NULL;
884 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
886 struct rt6_info *rt = (struct rt6_info *) dst;
888 if (rt) {
889 if (rt->rt6i_flags & RTF_CACHE) {
890 if (rt6_check_expired(rt)) {
891 ip6_del_rt(rt);
892 dst = NULL;
894 } else {
895 dst_release(dst);
896 dst = NULL;
899 return dst;
902 static void ip6_link_failure(struct sk_buff *skb)
904 struct rt6_info *rt;
906 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
908 rt = (struct rt6_info *) skb_dst(skb);
909 if (rt) {
910 if (rt->rt6i_flags&RTF_CACHE) {
911 dst_set_expires(&rt->dst, 0);
912 rt->rt6i_flags |= RTF_EXPIRES;
913 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
914 rt->rt6i_node->fn_sernum = -1;
918 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
920 struct rt6_info *rt6 = (struct rt6_info*)dst;
922 if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
923 rt6->rt6i_flags |= RTF_MODIFIED;
924 if (mtu < IPV6_MIN_MTU) {
925 u32 features = dst_metric(dst, RTAX_FEATURES);
926 mtu = IPV6_MIN_MTU;
927 features |= RTAX_FEATURE_ALLFRAG;
928 dst_metric_set(dst, RTAX_FEATURES, features);
930 dst_metric_set(dst, RTAX_MTU, mtu);
931 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
935 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
937 struct net_device *dev = dst->dev;
938 unsigned int mtu = dst_mtu(dst);
939 struct net *net = dev_net(dev);
941 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
943 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
944 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
947 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
948 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
949 * IPV6_MAXPLEN is also valid and means: "any MSS,
950 * rely only on pmtu discovery"
952 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
953 mtu = IPV6_MAXPLEN;
954 return mtu;
957 static unsigned int ip6_default_mtu(const struct dst_entry *dst)
959 unsigned int mtu = IPV6_MIN_MTU;
960 struct inet6_dev *idev;
962 rcu_read_lock();
963 idev = __in6_dev_get(dst->dev);
964 if (idev)
965 mtu = idev->cnf.mtu6;
966 rcu_read_unlock();
968 return mtu;
971 static struct dst_entry *icmp6_dst_gc_list;
972 static DEFINE_SPINLOCK(icmp6_dst_lock);
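/*
 * dst entries allocated by icmp6_dst_alloc() (used for ndisc/ICMPv6
 * messages) are never inserted into the FIB.  They are chained on
 * icmp6_dst_gc_list and reclaimed by icmp6_dst_gc() once unreferenced,
 * or by icmp6_clean_all() when their device is going away.
 */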
974 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
975 struct neighbour *neigh,
976 const struct in6_addr *addr)
978 struct rt6_info *rt;
979 struct inet6_dev *idev = in6_dev_get(dev);
980 struct net *net = dev_net(dev);
982 if (unlikely(idev == NULL))
983 return NULL;
985 rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
986 if (unlikely(rt == NULL)) {
987 in6_dev_put(idev);
988 goto out;
991 dev_hold(dev);
992 if (neigh)
993 neigh_hold(neigh);
994 else {
995 neigh = ndisc_get_neigh(dev, addr);
996 if (IS_ERR(neigh))
997 neigh = NULL;
1000 rt->rt6i_dev = dev;
1001 rt->rt6i_idev = idev;
1002 rt->rt6i_nexthop = neigh;
1003 atomic_set(&rt->dst.__refcnt, 1);
1004 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1005 rt->dst.output = ip6_output;
1007 #if 0 /* there's no chance to use these for ndisc */
1008 rt->dst.flags = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
1009 ? DST_HOST
1010 : 0;
1011 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1012 rt->rt6i_dst.plen = 128;
1013 #endif
1015 spin_lock_bh(&icmp6_dst_lock);
1016 rt->dst.next = icmp6_dst_gc_list;
1017 icmp6_dst_gc_list = &rt->dst;
1018 spin_unlock_bh(&icmp6_dst_lock);
1020 fib6_force_start_gc(net);
1022 out:
1023 return &rt->dst;
1026 int icmp6_dst_gc(void)
1028 struct dst_entry *dst, *next, **pprev;
1029 int more = 0;
1031 next = NULL;
1033 spin_lock_bh(&icmp6_dst_lock);
1034 pprev = &icmp6_dst_gc_list;
1036 while ((dst = *pprev) != NULL) {
1037 if (!atomic_read(&dst->__refcnt)) {
1038 *pprev = dst->next;
1039 dst_free(dst);
1040 } else {
1041 pprev = &dst->next;
1042 ++more;
1046 spin_unlock_bh(&icmp6_dst_lock);
1048 return more;
1051 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1052 void *arg)
1054 struct dst_entry *dst, **pprev;
1056 spin_lock_bh(&icmp6_dst_lock);
1057 pprev = &icmp6_dst_gc_list;
1058 while ((dst = *pprev) != NULL) {
1059 struct rt6_info *rt = (struct rt6_info *) dst;
1060 if (func(rt, arg)) {
1061 *pprev = dst->next;
1062 dst_free(dst);
1063 } else {
1064 pprev = &dst->next;
1067 spin_unlock_bh(&icmp6_dst_lock);
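/*
 * ip6_dst_gc() throttles dst garbage collection: it returns immediately
 * if the last run was less than ip6_rt_gc_min_interval ago and the cache
 * is still below ip6_rt_max_size.  Otherwise it bumps ip6_rt_gc_expire
 * and runs fib6_run_gc() with it; if that brings the entry count below
 * gc_thresh the expire value is reset to half of ip6_rt_gc_timeout, and
 * it always decays by the elasticity shift before returning.
 */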
1070 static int ip6_dst_gc(struct dst_ops *ops)
1072 unsigned long now = jiffies;
1073 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1074 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1075 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1076 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1077 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1078 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1079 int entries;
1081 entries = dst_entries_get_fast(ops);
1082 if (time_after(rt_last_gc + rt_min_interval, now) &&
1083 entries <= rt_max_size)
1084 goto out;
1086 net->ipv6.ip6_rt_gc_expire++;
1087 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1088 net->ipv6.ip6_rt_last_gc = now;
1089 entries = dst_entries_get_slow(ops);
1090 if (entries < ops->gc_thresh)
1091 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1092 out:
1093 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1094 return entries > rt_max_size;
1097 /* Clean host part of a prefix. Not necessary in radix tree,
1098 but results in cleaner routing tables.
1100 Remove it only when all the things will work!
1103 int ip6_dst_hoplimit(struct dst_entry *dst)
1105 int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1106 if (hoplimit == 0) {
1107 struct net_device *dev = dst->dev;
1108 struct inet6_dev *idev;
1110 rcu_read_lock();
1111 idev = __in6_dev_get(dev);
1112 if (idev)
1113 hoplimit = idev->cnf.hop_limit;
1114 else
1115 hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1116 rcu_read_unlock();
1118 return hoplimit;
1120 EXPORT_SYMBOL(ip6_dst_hoplimit);
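/*
 * ip6_route_add() turns a fib6_config (built from a netlink request or
 * an ioctl) into a fully initialised rt6_info and inserts it with
 * __ip6_ins_rt().  Reject routes and true routes via loopback become
 * discarding entries, an RTF_GATEWAY next hop must be link-local or
 * reachable through an existing non-gateway route, and any RTA_METRICS
 * attributes are copied into the dst metrics.
 */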
1126 int ip6_route_add(struct fib6_config *cfg)
1128 int err;
1129 struct net *net = cfg->fc_nlinfo.nl_net;
1130 struct rt6_info *rt = NULL;
1131 struct net_device *dev = NULL;
1132 struct inet6_dev *idev = NULL;
1133 struct fib6_table *table;
1134 int addr_type;
1136 if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1137 return -EINVAL;
1138 #ifndef CONFIG_IPV6_SUBTREES
1139 if (cfg->fc_src_len)
1140 return -EINVAL;
1141 #endif
1142 if (cfg->fc_ifindex) {
1143 err = -ENODEV;
1144 dev = dev_get_by_index(net, cfg->fc_ifindex);
1145 if (!dev)
1146 goto out;
1147 idev = in6_dev_get(dev);
1148 if (!idev)
1149 goto out;
1152 if (cfg->fc_metric == 0)
1153 cfg->fc_metric = IP6_RT_PRIO_USER;
1155 table = fib6_new_table(net, cfg->fc_table);
1156 if (table == NULL) {
1157 err = -ENOBUFS;
1158 goto out;
1161 rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1163 if (rt == NULL) {
1164 err = -ENOMEM;
1165 goto out;
1168 rt->dst.obsolete = -1;
1169 rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1170 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1173 if (cfg->fc_protocol == RTPROT_UNSPEC)
1174 cfg->fc_protocol = RTPROT_BOOT;
1175 rt->rt6i_protocol = cfg->fc_protocol;
1177 addr_type = ipv6_addr_type(&cfg->fc_dst);
1179 if (addr_type & IPV6_ADDR_MULTICAST)
1180 rt->dst.input = ip6_mc_input;
1181 else if (cfg->fc_flags & RTF_LOCAL)
1182 rt->dst.input = ip6_input;
1183 else
1184 rt->dst.input = ip6_forward;
1186 rt->dst.output = ip6_output;
1188 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1189 rt->rt6i_dst.plen = cfg->fc_dst_len;
1190 if (rt->rt6i_dst.plen == 128)
1191 rt->dst.flags = DST_HOST;
1193 #ifdef CONFIG_IPV6_SUBTREES
1194 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1195 rt->rt6i_src.plen = cfg->fc_src_len;
1196 #endif
1198 rt->rt6i_metric = cfg->fc_metric;
1200 /* We cannot add true routes via loopback here,
1201 they would result in kernel looping; promote them to reject routes
1203 if ((cfg->fc_flags & RTF_REJECT) ||
1204 (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
1205 && !(cfg->fc_flags&RTF_LOCAL))) {
1206 /* hold loopback dev/idev if we haven't done so. */
1207 if (dev != net->loopback_dev) {
1208 if (dev) {
1209 dev_put(dev);
1210 in6_dev_put(idev);
1212 dev = net->loopback_dev;
1213 dev_hold(dev);
1214 idev = in6_dev_get(dev);
1215 if (!idev) {
1216 err = -ENODEV;
1217 goto out;
1220 rt->dst.output = ip6_pkt_discard_out;
1221 rt->dst.input = ip6_pkt_discard;
1222 rt->dst.error = -ENETUNREACH;
1223 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1224 goto install_route;
1227 if (cfg->fc_flags & RTF_GATEWAY) {
1228 struct in6_addr *gw_addr;
1229 int gwa_type;
1231 gw_addr = &cfg->fc_gateway;
1232 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1233 gwa_type = ipv6_addr_type(gw_addr);
1235 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1236 struct rt6_info *grt;
1238 /* IPv6 strictly prohibits using non-link-local
1239 addresses as nexthop addresses.
1240 Otherwise, the router will not be able to send redirects.
1241 It is very good, but in some (rare!) circumstances
1242 (SIT, PtP, NBMA NOARP links) it is handy to allow
1243 some exceptions. --ANK
1245 err = -EINVAL;
1246 if (!(gwa_type&IPV6_ADDR_UNICAST))
1247 goto out;
1249 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1251 err = -EHOSTUNREACH;
1252 if (grt == NULL)
1253 goto out;
1254 if (dev) {
1255 if (dev != grt->rt6i_dev) {
1256 dst_release(&grt->dst);
1257 goto out;
1259 } else {
1260 dev = grt->rt6i_dev;
1261 idev = grt->rt6i_idev;
1262 dev_hold(dev);
1263 in6_dev_hold(grt->rt6i_idev);
1265 if (!(grt->rt6i_flags&RTF_GATEWAY))
1266 err = 0;
1267 dst_release(&grt->dst);
1269 if (err)
1270 goto out;
1272 err = -EINVAL;
1273 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1274 goto out;
1277 err = -ENODEV;
1278 if (dev == NULL)
1279 goto out;
1281 if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1282 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1283 if (IS_ERR(rt->rt6i_nexthop)) {
1284 err = PTR_ERR(rt->rt6i_nexthop);
1285 rt->rt6i_nexthop = NULL;
1286 goto out;
1290 rt->rt6i_flags = cfg->fc_flags;
1292 install_route:
1293 if (cfg->fc_mx) {
1294 struct nlattr *nla;
1295 int remaining;
1297 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1298 int type = nla_type(nla);
1300 if (type) {
1301 if (type > RTAX_MAX) {
1302 err = -EINVAL;
1303 goto out;
1306 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1311 rt->dst.dev = dev;
1312 rt->rt6i_idev = idev;
1313 rt->rt6i_table = table;
1315 cfg->fc_nlinfo.nl_net = dev_net(dev);
1317 return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1319 out:
1320 if (dev)
1321 dev_put(dev);
1322 if (idev)
1323 in6_dev_put(idev);
1324 if (rt)
1325 dst_free(&rt->dst);
1326 return err;
1329 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1331 int err;
1332 struct fib6_table *table;
1333 struct net *net = dev_net(rt->rt6i_dev);
1335 if (rt == net->ipv6.ip6_null_entry)
1336 return -ENOENT;
1338 table = rt->rt6i_table;
1339 write_lock_bh(&table->tb6_lock);
1341 err = fib6_del(rt, info);
1342 dst_release(&rt->dst);
1344 write_unlock_bh(&table->tb6_lock);
1346 return err;
1349 int ip6_del_rt(struct rt6_info *rt)
1351 struct nl_info info = {
1352 .nl_net = dev_net(rt->rt6i_dev),
1354 return __ip6_del_rt(rt, &info);
1357 static int ip6_route_del(struct fib6_config *cfg)
1359 struct fib6_table *table;
1360 struct fib6_node *fn;
1361 struct rt6_info *rt;
1362 int err = -ESRCH;
1364 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1365 if (table == NULL)
1366 return err;
1368 read_lock_bh(&table->tb6_lock);
1370 fn = fib6_locate(&table->tb6_root,
1371 &cfg->fc_dst, cfg->fc_dst_len,
1372 &cfg->fc_src, cfg->fc_src_len);
1374 if (fn) {
1375 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1376 if (cfg->fc_ifindex &&
1377 (rt->rt6i_dev == NULL ||
1378 rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1379 continue;
1380 if (cfg->fc_flags & RTF_GATEWAY &&
1381 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1382 continue;
1383 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1384 continue;
1385 dst_hold(&rt->dst);
1386 read_unlock_bh(&table->tb6_lock);
1388 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1391 read_unlock_bh(&table->tb6_lock);
1393 return err;
1397 * Handle redirects
1399 struct ip6rd_flowi {
1400 struct flowi fl;
1401 struct in6_addr gateway;
1404 static struct rt6_info *__ip6_route_redirect(struct net *net,
1405 struct fib6_table *table,
1406 struct flowi *fl,
1407 int flags)
1409 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1410 struct rt6_info *rt;
1411 struct fib6_node *fn;
1414 * Get the "current" route for this destination and
1415 * check if the redirect has come from an appropriate router.
1417 * RFC 2461 specifies that redirects should only be
1418 * accepted if they come from the nexthop to the target.
1419 * Due to the way the routes are chosen, this notion
1420 * is a bit fuzzy and one might need to check all possible
1421 * routes.
1424 read_lock_bh(&table->tb6_lock);
1425 fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1426 restart:
1427 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1429 * Current route is on-link; redirect is always invalid.
1431 * It seems the previous statement is not true. It could
1432 * be a node which regards us as on-link (e.g. proxy ndisc),
1433 * but then the router serving it might decide that we should
1434 * know the truth 8)8) --ANK (980726).
1436 if (rt6_check_expired(rt))
1437 continue;
1438 if (!(rt->rt6i_flags & RTF_GATEWAY))
1439 continue;
1440 if (fl->oif != rt->rt6i_dev->ifindex)
1441 continue;
1442 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1443 continue;
1444 break;
1447 if (!rt)
1448 rt = net->ipv6.ip6_null_entry;
1449 BACKTRACK(net, &fl->fl6_src);
1450 out:
1451 dst_hold(&rt->dst);
1453 read_unlock_bh(&table->tb6_lock);
1455 return rt;
1458 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1459 struct in6_addr *src,
1460 struct in6_addr *gateway,
1461 struct net_device *dev)
1463 int flags = RT6_LOOKUP_F_HAS_SADDR;
1464 struct net *net = dev_net(dev);
1465 struct ip6rd_flowi rdfl = {
1466 .fl = {
1467 .oif = dev->ifindex,
1468 .fl6_dst = *dest,
1469 .fl6_src = *src,
1473 ipv6_addr_copy(&rdfl.gateway, gateway);
1475 if (rt6_need_strict(dest))
1476 flags |= RT6_LOOKUP_F_IFACE;
1478 return (struct rt6_info *)fib6_rule_lookup(net, (struct flowi *)&rdfl,
1479 flags, __ip6_route_redirect);
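/*
 * rt6_redirect() applies a validated ICMPv6 redirect.  __ip6_route_redirect()
 * above only returns the current route if the advertising router is its
 * gateway on the receiving interface; the neighbour cache is then updated
 * with the new link-layer address and a host RTF_CACHE|RTF_DYNAMIC route
 * (with RTF_GATEWAY cleared for on-link targets) is installed toward the
 * new next hop, while a previously cached entry is removed.
 */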
1482 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1483 struct in6_addr *saddr,
1484 struct neighbour *neigh, u8 *lladdr, int on_link)
1486 struct rt6_info *rt, *nrt = NULL;
1487 struct netevent_redirect netevent;
1488 struct net *net = dev_net(neigh->dev);
1490 rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1492 if (rt == net->ipv6.ip6_null_entry) {
1493 if (net_ratelimit())
1494 printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1495 "for redirect target\n");
1496 goto out;
1500 * We have finally decided to accept it.
1503 neigh_update(neigh, lladdr, NUD_STALE,
1504 NEIGH_UPDATE_F_WEAK_OVERRIDE|
1505 NEIGH_UPDATE_F_OVERRIDE|
1506 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1507 NEIGH_UPDATE_F_ISROUTER))
1511 * Redirect received -> path was valid.
1512 * Look, redirects are sent only in response to data packets,
1513 * so that this nexthop apparently is reachable. --ANK
1515 dst_confirm(&rt->dst);
1517 /* Duplicate redirect: silently ignore. */
1518 if (neigh == rt->dst.neighbour)
1519 goto out;
1521 nrt = ip6_rt_copy(rt);
1522 if (nrt == NULL)
1523 goto out;
1525 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1526 if (on_link)
1527 nrt->rt6i_flags &= ~RTF_GATEWAY;
1529 ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1530 nrt->rt6i_dst.plen = 128;
1531 nrt->dst.flags |= DST_HOST;
1533 ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1534 nrt->rt6i_nexthop = neigh_clone(neigh);
1536 if (ip6_ins_rt(nrt))
1537 goto out;
1539 netevent.old = &rt->dst;
1540 netevent.new = &nrt->dst;
1541 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1543 if (rt->rt6i_flags&RTF_CACHE) {
1544 ip6_del_rt(rt);
1545 return;
1548 out:
1549 dst_release(&rt->dst);
1553 * Handle ICMP "packet too big" messages
1554 * i.e. Path MTU discovery
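 *
 * rt6_do_pmtu_disc() below updates the route toward the destination: a
 * reported MTU smaller than IPV6_MIN_MTU is clamped to 1280 with
 * RTAX_FEATURE_ALLFRAG set, an existing RTF_CACHE host route is modified
 * in place, and otherwise a host clone is created; in both cases the
 * entry expires after ip6_rt_mtu_expires so a larger PMTU can later be
 * rediscovered.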
1557 static void rt6_do_pmtu_disc(struct in6_addr *daddr, struct in6_addr *saddr,
1558 struct net *net, u32 pmtu, int ifindex)
1560 struct rt6_info *rt, *nrt;
1561 int allfrag = 0;
1562 again:
1563 rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1564 if (rt == NULL)
1565 return;
1567 if (rt6_check_expired(rt)) {
1568 ip6_del_rt(rt);
1569 goto again;
1572 if (pmtu >= dst_mtu(&rt->dst))
1573 goto out;
1575 if (pmtu < IPV6_MIN_MTU) {
1577 * According to RFC 2460, when a node receives a Packet Too Big
1578 * message reporting a PMTU smaller than the IPv6 Minimum Link MTU
1579 * (1280), the PMTU is set to that minimum and a fragment header
1580 * should always be included from then on.
1582 pmtu = IPV6_MIN_MTU;
1583 allfrag = 1;
1586 /* New mtu received -> path was valid.
1587 They are sent only in response to data packets,
1588 so that this nexthop apparently is reachable. --ANK
1590 dst_confirm(&rt->dst);
1592 /* Host route. If it is static, it would be better
1593 not to override it but to add a new one, so that
1594 when the cache entry expires the old pmtu
1595 is restored automatically.
1597 if (rt->rt6i_flags & RTF_CACHE) {
1598 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1599 if (allfrag) {
1600 u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1601 features |= RTAX_FEATURE_ALLFRAG;
1602 dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1604 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1605 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1606 goto out;
1609 /* Network route.
1610 Two cases are possible:
1611 1. It is connected route. Action: COW
1612 2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1614 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1615 nrt = rt6_alloc_cow(rt, daddr, saddr);
1616 else
1617 nrt = rt6_alloc_clone(rt, daddr);
1619 if (nrt) {
1620 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1621 if (allfrag) {
1622 u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1623 features |= RTAX_FEATURE_ALLFRAG;
1624 dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1627 /* According to RFC 1981, a PMTU increase shouldn't be detected
1628 * within 5 minutes; the recommended timer is 10 minutes.
1629 * Here this route expiration time is set to ip6_rt_mtu_expires
1630 * which is 10 mins. After 10 mins the decreased pmtu is expired
1631 * and detecting PMTU increase will be automatically happened.
1633 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1634 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1636 ip6_ins_rt(nrt);
1638 out:
1639 dst_release(&rt->dst);
1642 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1643 struct net_device *dev, u32 pmtu)
1645 struct net *net = dev_net(dev);
1648 * RFC 1981 states that a node "MUST reduce the size of the packets it
1649 * is sending along the path" that caused the Packet Too Big message.
1650 * Since it's not possible in the general case to determine which
1651 * interface was used to send the original packet, we update the MTU
1652 * on the interface that will be used to send future packets. We also
1653 * update the MTU on the interface that received the Packet Too Big in
1654 * case the original packet was forced out that interface with
1655 * SO_BINDTODEVICE or similar. This is the next best thing to the
1656 * correct behaviour, which would be to update the MTU on all
1657 * interfaces.
1659 rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1660 rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1664 * Misc support functions
1667 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1669 struct net *net = dev_net(ort->rt6i_dev);
1670 struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1672 if (rt) {
1673 rt->dst.input = ort->dst.input;
1674 rt->dst.output = ort->dst.output;
1676 dst_copy_metrics(&rt->dst, &ort->dst);
1677 rt->dst.error = ort->dst.error;
1678 rt->dst.dev = ort->dst.dev;
1679 if (rt->dst.dev)
1680 dev_hold(rt->dst.dev);
1681 rt->rt6i_idev = ort->rt6i_idev;
1682 if (rt->rt6i_idev)
1683 in6_dev_hold(rt->rt6i_idev);
1684 rt->dst.lastuse = jiffies;
1685 rt->rt6i_expires = 0;
1687 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1688 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1689 rt->rt6i_metric = 0;
1691 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1692 #ifdef CONFIG_IPV6_SUBTREES
1693 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1694 #endif
1695 rt->rt6i_table = ort->rt6i_table;
1697 return rt;
1700 #ifdef CONFIG_IPV6_ROUTE_INFO
1701 static struct rt6_info *rt6_get_route_info(struct net *net,
1702 struct in6_addr *prefix, int prefixlen,
1703 struct in6_addr *gwaddr, int ifindex)
1705 struct fib6_node *fn;
1706 struct rt6_info *rt = NULL;
1707 struct fib6_table *table;
1709 table = fib6_get_table(net, RT6_TABLE_INFO);
1710 if (table == NULL)
1711 return NULL;
1713 write_lock_bh(&table->tb6_lock);
1714 fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1715 if (!fn)
1716 goto out;
1718 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1719 if (rt->rt6i_dev->ifindex != ifindex)
1720 continue;
1721 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1722 continue;
1723 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1724 continue;
1725 dst_hold(&rt->dst);
1726 break;
1728 out:
1729 write_unlock_bh(&table->tb6_lock);
1730 return rt;
1733 static struct rt6_info *rt6_add_route_info(struct net *net,
1734 struct in6_addr *prefix, int prefixlen,
1735 struct in6_addr *gwaddr, int ifindex,
1736 unsigned pref)
1738 struct fib6_config cfg = {
1739 .fc_table = RT6_TABLE_INFO,
1740 .fc_metric = IP6_RT_PRIO_USER,
1741 .fc_ifindex = ifindex,
1742 .fc_dst_len = prefixlen,
1743 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1744 RTF_UP | RTF_PREF(pref),
1745 .fc_nlinfo.pid = 0,
1746 .fc_nlinfo.nlh = NULL,
1747 .fc_nlinfo.nl_net = net,
1750 ipv6_addr_copy(&cfg.fc_dst, prefix);
1751 ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1753 /* We should treat it as a default route if prefix length is 0. */
1754 if (!prefixlen)
1755 cfg.fc_flags |= RTF_DEFAULT;
1757 ip6_route_add(&cfg);
1759 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1761 #endif
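/*
 * rt6_get_dflt_router(), rt6_add_dflt_router() and rt6_purge_dflt_routers()
 * manage the RTF_ADDRCONF|RTF_DEFAULT routes learned from Router
 * Advertisements, all of which live in the RT6_TABLE_DFLT table.
 */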
1763 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1765 struct rt6_info *rt;
1766 struct fib6_table *table;
1768 table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1769 if (table == NULL)
1770 return NULL;
1772 write_lock_bh(&table->tb6_lock);
1773 for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1774 if (dev == rt->rt6i_dev &&
1775 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1776 ipv6_addr_equal(&rt->rt6i_gateway, addr))
1777 break;
1779 if (rt)
1780 dst_hold(&rt->dst);
1781 write_unlock_bh(&table->tb6_lock);
1782 return rt;
1785 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1786 struct net_device *dev,
1787 unsigned int pref)
1789 struct fib6_config cfg = {
1790 .fc_table = RT6_TABLE_DFLT,
1791 .fc_metric = IP6_RT_PRIO_USER,
1792 .fc_ifindex = dev->ifindex,
1793 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1794 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1795 .fc_nlinfo.pid = 0,
1796 .fc_nlinfo.nlh = NULL,
1797 .fc_nlinfo.nl_net = dev_net(dev),
1800 ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1802 ip6_route_add(&cfg);
1804 return rt6_get_dflt_router(gwaddr, dev);
1807 void rt6_purge_dflt_routers(struct net *net)
1809 struct rt6_info *rt;
1810 struct fib6_table *table;
1812 /* NOTE: Keep consistent with rt6_get_dflt_router */
1813 table = fib6_get_table(net, RT6_TABLE_DFLT);
1814 if (table == NULL)
1815 return;
1817 restart:
1818 read_lock_bh(&table->tb6_lock);
1819 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1820 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1821 dst_hold(&rt->dst);
1822 read_unlock_bh(&table->tb6_lock);
1823 ip6_del_rt(rt);
1824 goto restart;
1827 read_unlock_bh(&table->tb6_lock);
1830 static void rtmsg_to_fib6_config(struct net *net,
1831 struct in6_rtmsg *rtmsg,
1832 struct fib6_config *cfg)
1834 memset(cfg, 0, sizeof(*cfg));
1836 cfg->fc_table = RT6_TABLE_MAIN;
1837 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1838 cfg->fc_metric = rtmsg->rtmsg_metric;
1839 cfg->fc_expires = rtmsg->rtmsg_info;
1840 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1841 cfg->fc_src_len = rtmsg->rtmsg_src_len;
1842 cfg->fc_flags = rtmsg->rtmsg_flags;
1844 cfg->fc_nlinfo.nl_net = net;
1846 ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1847 ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1848 ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1851 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1853 struct fib6_config cfg;
1854 struct in6_rtmsg rtmsg;
1855 int err;
1857 switch(cmd) {
1858 case SIOCADDRT: /* Add a route */
1859 case SIOCDELRT: /* Delete a route */
1860 if (!capable(CAP_NET_ADMIN))
1861 return -EPERM;
1862 err = copy_from_user(&rtmsg, arg,
1863 sizeof(struct in6_rtmsg));
1864 if (err)
1865 return -EFAULT;
1867 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1869 rtnl_lock();
1870 switch (cmd) {
1871 case SIOCADDRT:
1872 err = ip6_route_add(&cfg);
1873 break;
1874 case SIOCDELRT:
1875 err = ip6_route_del(&cfg);
1876 break;
1877 default:
1878 err = -EINVAL;
1880 rtnl_unlock();
1882 return err;
1885 return -EINVAL;
1889 * Drop the packet on the floor
1892 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1894 int type;
1895 struct dst_entry *dst = skb_dst(skb);
1896 switch (ipstats_mib_noroutes) {
1897 case IPSTATS_MIB_INNOROUTES:
1898 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1899 if (type == IPV6_ADDR_ANY) {
1900 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1901 IPSTATS_MIB_INADDRERRORS);
1902 break;
1904 /* FALLTHROUGH */
1905 case IPSTATS_MIB_OUTNOROUTES:
1906 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1907 ipstats_mib_noroutes);
1908 break;
1910 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
1911 kfree_skb(skb);
1912 return 0;
1915 static int ip6_pkt_discard(struct sk_buff *skb)
1917 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1920 static int ip6_pkt_discard_out(struct sk_buff *skb)
1922 skb->dev = skb_dst(skb)->dev;
1923 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1926 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1928 static int ip6_pkt_prohibit(struct sk_buff *skb)
1930 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
1933 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1935 skb->dev = skb_dst(skb)->dev;
1936 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
1939 #endif
1942 * Allocate a dst for local (unicast / anycast) address.
1945 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1946 const struct in6_addr *addr,
1947 int anycast)
1949 struct net *net = dev_net(idev->dev);
1950 struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1951 struct neighbour *neigh;
1953 if (rt == NULL) {
1954 if (net_ratelimit())
1955 pr_warning("IPv6: Maximum number of routes reached,"
1956 " consider increasing route/max_size.\n");
1957 return ERR_PTR(-ENOMEM);
1960 dev_hold(net->loopback_dev);
1961 in6_dev_hold(idev);
1963 rt->dst.flags = DST_HOST;
1964 rt->dst.input = ip6_input;
1965 rt->dst.output = ip6_output;
1966 rt->rt6i_dev = net->loopback_dev;
1967 rt->rt6i_idev = idev;
1968 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, -1);
1969 rt->dst.obsolete = -1;
1971 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1972 if (anycast)
1973 rt->rt6i_flags |= RTF_ANYCAST;
1974 else
1975 rt->rt6i_flags |= RTF_LOCAL;
1976 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1977 if (IS_ERR(neigh)) {
1978 dst_free(&rt->dst);
1980 /* We are casting this because that is the return
1981 * value type. But an errno encoded pointer is the
1982 * same regardless of the underlying pointer type,
1983 * and that's what we are returning. So this is OK.
1985 return (struct rt6_info *) neigh;
1987 rt->rt6i_nexthop = neigh;
1989 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1990 rt->rt6i_dst.plen = 128;
1991 rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
1993 atomic_set(&rt->dst.__refcnt, 1);
1995 return rt;
1998 struct arg_dev_net {
1999 struct net_device *dev;
2000 struct net *net;
2003 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2005 const struct arg_dev_net *adn = arg;
2006 const struct net_device *dev = adn->dev;
2008 if ((rt->rt6i_dev == dev || dev == NULL) &&
2009 rt != adn->net->ipv6.ip6_null_entry) {
2010 RT6_TRACE("deleted by ifdown %p\n", rt);
2011 return -1;
2013 return 0;
2016 void rt6_ifdown(struct net *net, struct net_device *dev)
2018 struct arg_dev_net adn = {
2019 .dev = dev,
2020 .net = net,
2023 fib6_clean_all(net, fib6_ifdown, 0, &adn);
2024 icmp6_clean_all(fib6_ifdown, &adn);
2027 struct rt6_mtu_change_arg
2029 struct net_device *dev;
2030 unsigned mtu;
2033 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2035 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2036 struct inet6_dev *idev;
2038 /* In IPv6 pmtu discovery is not optional,
2039 so that RTAX_MTU lock cannot disable it.
2040 We still use this lock to block changes
2041 caused by addrconf/ndisc.
2044 idev = __in6_dev_get(arg->dev);
2045 if (idev == NULL)
2046 return 0;
2048 /* After an administrative MTU increase there is no way to discover
2049 an IPv6 PMTU increase, so the PMTU must be updated here.
2050 Since RFC 1981 doesn't cover administrative MTU increases,
2051 updating the PMTU on such an increase is a MUST. (e.g. jumbo frames)
2054 If new MTU is less than route PMTU, this new MTU will be the
2055 lowest MTU in the path, update the route PMTU to reflect PMTU
2056 decreases; if new MTU is greater than route PMTU, and the
2057 old MTU is the lowest MTU in the path, update the route PMTU
2058 to reflect the increase. In this case, if another node on the path
2059 also has the lowest MTU, a Packet Too Big message will lead to
2060 PMTU discovery.
2062 if (rt->rt6i_dev == arg->dev &&
2063 !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2064 (dst_mtu(&rt->dst) >= arg->mtu ||
2065 (dst_mtu(&rt->dst) < arg->mtu &&
2066 dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2067 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2069 return 0;
2072 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2074 struct rt6_mtu_change_arg arg = {
2075 .dev = dev,
2076 .mtu = mtu,
2079 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
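/*
 * The netlink attribute policy and rtm_to_fib6_config() below translate
 * RTM_NEWROUTE / RTM_DELROUTE requests into a fib6_config that is handed
 * to ip6_route_add() or ip6_route_del().
 */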
2082 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2083 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
2084 [RTA_OIF] = { .type = NLA_U32 },
2085 [RTA_IIF] = { .type = NLA_U32 },
2086 [RTA_PRIORITY] = { .type = NLA_U32 },
2087 [RTA_METRICS] = { .type = NLA_NESTED },
2090 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2091 struct fib6_config *cfg)
2093 struct rtmsg *rtm;
2094 struct nlattr *tb[RTA_MAX+1];
2095 int err;
2097 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2098 if (err < 0)
2099 goto errout;
2101 err = -EINVAL;
2102 rtm = nlmsg_data(nlh);
2103 memset(cfg, 0, sizeof(*cfg));
2105 cfg->fc_table = rtm->rtm_table;
2106 cfg->fc_dst_len = rtm->rtm_dst_len;
2107 cfg->fc_src_len = rtm->rtm_src_len;
2108 cfg->fc_flags = RTF_UP;
2109 cfg->fc_protocol = rtm->rtm_protocol;
2111 if (rtm->rtm_type == RTN_UNREACHABLE)
2112 cfg->fc_flags |= RTF_REJECT;
2114 if (rtm->rtm_type == RTN_LOCAL)
2115 cfg->fc_flags |= RTF_LOCAL;
2117 cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2118 cfg->fc_nlinfo.nlh = nlh;
2119 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2121 if (tb[RTA_GATEWAY]) {
2122 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2123 cfg->fc_flags |= RTF_GATEWAY;
2126 if (tb[RTA_DST]) {
2127 int plen = (rtm->rtm_dst_len + 7) >> 3;
2129 if (nla_len(tb[RTA_DST]) < plen)
2130 goto errout;
2132 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2135 if (tb[RTA_SRC]) {
2136 int plen = (rtm->rtm_src_len + 7) >> 3;
2138 if (nla_len(tb[RTA_SRC]) < plen)
2139 goto errout;
2141 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2144 if (tb[RTA_OIF])
2145 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2147 if (tb[RTA_PRIORITY])
2148 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2150 if (tb[RTA_METRICS]) {
2151 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2152 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2155 if (tb[RTA_TABLE])
2156 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2158 err = 0;
2159 errout:
2160 return err;
2163 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2165 struct fib6_config cfg;
2166 int err;
2168 err = rtm_to_fib6_config(skb, nlh, &cfg);
2169 if (err < 0)
2170 return err;
2172 return ip6_route_del(&cfg);
2175 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2177 struct fib6_config cfg;
2178 int err;
2180 err = rtm_to_fib6_config(skb, nlh, &cfg);
2181 if (err < 0)
2182 return err;
2184 return ip6_route_add(&cfg);
2187 static inline size_t rt6_nlmsg_size(void)
2189 return NLMSG_ALIGN(sizeof(struct rtmsg))
2190 + nla_total_size(16) /* RTA_SRC */
2191 + nla_total_size(16) /* RTA_DST */
2192 + nla_total_size(16) /* RTA_GATEWAY */
2193 + nla_total_size(16) /* RTA_PREFSRC */
2194 + nla_total_size(4) /* RTA_TABLE */
2195 + nla_total_size(4) /* RTA_IIF */
2196 + nla_total_size(4) /* RTA_OIF */
2197 + nla_total_size(4) /* RTA_PRIORITY */
2198 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2199 + nla_total_size(sizeof(struct rta_cacheinfo));
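/*
 * rt6_nlmsg_size() above is the worst-case size estimate for the message
 * built by rt6_fill_node(), which emits an rtmsg header plus the RTA_*
 * attributes (table, dst/src, gateway taken from the neighbour entry,
 * oif, priority, metrics and cacheinfo) describing a single route; with
 * the "prefix" argument set, routes without RTF_PREFIX_RT are skipped.
 */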
2202 static int rt6_fill_node(struct net *net,
2203 struct sk_buff *skb, struct rt6_info *rt,
2204 struct in6_addr *dst, struct in6_addr *src,
2205 int iif, int type, u32 pid, u32 seq,
2206 int prefix, int nowait, unsigned int flags)
2208 struct rtmsg *rtm;
2209 struct nlmsghdr *nlh;
2210 long expires;
2211 u32 table;
2213 if (prefix) { /* user wants prefix routes only */
2214 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2215 /* success since this is not a prefix route */
2216 return 1;
2220 nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2221 if (nlh == NULL)
2222 return -EMSGSIZE;
2224 rtm = nlmsg_data(nlh);
2225 rtm->rtm_family = AF_INET6;
2226 rtm->rtm_dst_len = rt->rt6i_dst.plen;
2227 rtm->rtm_src_len = rt->rt6i_src.plen;
2228 rtm->rtm_tos = 0;
2229 if (rt->rt6i_table)
2230 table = rt->rt6i_table->tb6_id;
2231 else
2232 table = RT6_TABLE_UNSPEC;
2233 rtm->rtm_table = table;
2234 NLA_PUT_U32(skb, RTA_TABLE, table);
2235 if (rt->rt6i_flags&RTF_REJECT)
2236 rtm->rtm_type = RTN_UNREACHABLE;
2237 else if (rt->rt6i_flags&RTF_LOCAL)
2238 rtm->rtm_type = RTN_LOCAL;
2239 else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2240 rtm->rtm_type = RTN_LOCAL;
2241 else
2242 rtm->rtm_type = RTN_UNICAST;
2243 rtm->rtm_flags = 0;
2244 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2245 rtm->rtm_protocol = rt->rt6i_protocol;
2246 if (rt->rt6i_flags&RTF_DYNAMIC)
2247 rtm->rtm_protocol = RTPROT_REDIRECT;
2248 else if (rt->rt6i_flags & RTF_ADDRCONF)
2249 rtm->rtm_protocol = RTPROT_KERNEL;
2250 else if (rt->rt6i_flags&RTF_DEFAULT)
2251 rtm->rtm_protocol = RTPROT_RA;
2253 if (rt->rt6i_flags&RTF_CACHE)
2254 rtm->rtm_flags |= RTM_F_CLONED;
2256 if (dst) {
2257 NLA_PUT(skb, RTA_DST, 16, dst);
2258 rtm->rtm_dst_len = 128;
2259 } else if (rtm->rtm_dst_len)
2260 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2261 #ifdef CONFIG_IPV6_SUBTREES
2262 if (src) {
2263 NLA_PUT(skb, RTA_SRC, 16, src);
2264 rtm->rtm_src_len = 128;
2265 } else if (rtm->rtm_src_len)
2266 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2267 #endif
2268 if (iif) {
2269 #ifdef CONFIG_IPV6_MROUTE
2270 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2271 int err = ip6mr_get_route(net, skb, rtm, nowait);
2272 if (err <= 0) {
2273 if (!nowait) {
2274 if (err == 0)
2275 return 0;
2276 goto nla_put_failure;
2277 } else {
2278 if (err == -EMSGSIZE)
2279 goto nla_put_failure;
2280 }
2281 }
2282 } else
2283 #endif
2284 NLA_PUT_U32(skb, RTA_IIF, iif);
2285 } else if (dst) {
2286 struct inet6_dev *idev = ip6_dst_idev(&rt->dst);
2287 struct in6_addr saddr_buf;
2288 if (ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2289 dst, 0, &saddr_buf) == 0)
2290 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2291 }
2293 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2294 goto nla_put_failure;
2296 if (rt->dst.neighbour)
2297 NLA_PUT(skb, RTA_GATEWAY, 16, &rt->dst.neighbour->primary_key);
2299 if (rt->dst.dev)
2300 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2302 NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2304 if (!(rt->rt6i_flags & RTF_EXPIRES))
2305 expires = 0;
2306 else if (rt->rt6i_expires - jiffies < INT_MAX)
2307 expires = rt->rt6i_expires - jiffies;
2308 else
2309 expires = INT_MAX;
2311 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2312 expires, rt->dst.error) < 0)
2313 goto nla_put_failure;
2315 return nlmsg_end(skb, nlh);
2317 nla_put_failure:
2318 nlmsg_cancel(skb, nlh);
2319 return -EMSGSIZE;
2320 }
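/* Per-route callback for route dumps: honours the RTM_F_PREFIX flag from
 * the request and emits one RTM_NEWROUTE message per route.
 */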
2322 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2323 {
2324 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2325 int prefix;
2327 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2328 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2329 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2330 } else
2331 prefix = 0;
2333 return rt6_fill_node(arg->net,
2334 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2335 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2336 prefix, 0, NLM_F_MULTI);
2337 }
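/* RTM_GETROUTE handler: build a flow from the request attributes, look up
 * the route and unicast the answer back to the requester.
 */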
2339 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2340 {
2341 struct net *net = sock_net(in_skb->sk);
2342 struct nlattr *tb[RTA_MAX+1];
2343 struct rt6_info *rt;
2344 struct sk_buff *skb;
2345 struct rtmsg *rtm;
2346 struct flowi fl;
2347 int err, iif = 0;
2349 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2350 if (err < 0)
2351 goto errout;
2353 err = -EINVAL;
2354 memset(&fl, 0, sizeof(fl));
2356 if (tb[RTA_SRC]) {
2357 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2358 goto errout;
2360 ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2361 }
2363 if (tb[RTA_DST]) {
2364 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2365 goto errout;
2367 ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2368 }
2370 if (tb[RTA_IIF])
2371 iif = nla_get_u32(tb[RTA_IIF]);
2373 if (tb[RTA_OIF])
2374 fl.oif = nla_get_u32(tb[RTA_OIF]);
2376 if (iif) {
2377 struct net_device *dev;
2378 dev = __dev_get_by_index(net, iif);
2379 if (!dev) {
2380 err = -ENODEV;
2381 goto errout;
2382 }
2383 }
2385 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2386 if (skb == NULL) {
2387 err = -ENOBUFS;
2388 goto errout;
2389 }
2391 /* Reserve room for dummy headers; this skb can pass
2392 through a good chunk of the routing engine.
2393 */
2394 skb_reset_mac_header(skb);
2395 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2397 rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl);
2398 skb_dst_set(skb, &rt->dst);
2400 err = rt6_fill_node(net, skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2401 RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2402 nlh->nlmsg_seq, 0, 0, 0);
2403 if (err < 0) {
2404 kfree_skb(skb);
2405 goto errout;
2406 }
2408 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2409 errout:
2410 return err;
2411 }
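/* Broadcast a route change (@event is RTM_NEWROUTE or RTM_DELROUTE) to
 * RTNLGRP_IPV6_ROUTE listeners.
 */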
2413 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2414 {
2415 struct sk_buff *skb;
2416 struct net *net = info->nl_net;
2417 u32 seq;
2418 int err;
2420 err = -ENOBUFS;
2421 seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2423 skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2424 if (skb == NULL)
2425 goto errout;
2427 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2428 event, info->pid, seq, 0, 0, 0);
2429 if (err < 0) {
2430 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2431 WARN_ON(err == -EMSGSIZE);
2432 kfree_skb(skb);
2433 goto errout;
2434 }
2435 rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2436 info->nlh, gfp_any());
2437 return;
2438 errout:
2439 if (err < 0)
2440 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2441 }
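/* Netdevice notifier: once the loopback device registers, point the
 * special null (and, with multiple tables, prohibit/blackhole) routes
 * at it.
 */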
2443 static int ip6_route_dev_notify(struct notifier_block *this,
2444 unsigned long event, void *data)
2445 {
2446 struct net_device *dev = (struct net_device *)data;
2447 struct net *net = dev_net(dev);
2449 if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2450 net->ipv6.ip6_null_entry->dst.dev = dev;
2451 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2452 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2453 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2454 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2455 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2456 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2457 #endif
2458 }
2460 return NOTIFY_OK;
2461 }
2463 /*
2464 * /proc
2465 */
2467 #ifdef CONFIG_PROC_FS
2469 struct rt6_proc_arg
2470 {
2471 char *buffer;
2472 int offset;
2473 int length;
2474 int skip;
2475 int len;
2476 };
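/* Emit one /proc/net/ipv6_route line: destination, source (with subtrees),
 * next hop, metric, refcount, use count, flags and device name.
 */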
2478 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2479 {
2480 struct seq_file *m = p_arg;
2482 seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2484 #ifdef CONFIG_IPV6_SUBTREES
2485 seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2486 #else
2487 seq_puts(m, "00000000000000000000000000000000 00 ");
2488 #endif
2490 if (rt->rt6i_nexthop) {
2491 seq_printf(m, "%pi6", rt->rt6i_nexthop->primary_key);
2492 } else {
2493 seq_puts(m, "00000000000000000000000000000000");
2495 seq_printf(m, " %08x %08x %08x %08x %8s\n",
2496 rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2497 rt->dst.__use, rt->rt6i_flags,
2498 rt->rt6i_dev ? rt->rt6i_dev->name : "");
2499 return 0;
2500 }
2502 static int ipv6_route_show(struct seq_file *m, void *v)
2503 {
2504 struct net *net = (struct net *)m->private;
2505 fib6_clean_all(net, rt6_info_route, 0, m);
2506 return 0;
2507 }
2509 static int ipv6_route_open(struct inode *inode, struct file *file)
2510 {
2511 return single_open_net(inode, file, ipv6_route_show);
2512 }
2514 static const struct file_operations ipv6_route_proc_fops = {
2515 .owner = THIS_MODULE,
2516 .open = ipv6_route_open,
2517 .read = seq_read,
2518 .llseek = seq_lseek,
2519 .release = single_release_net,
2520 };
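/* /proc/net/rt6_stats: fib node/route counters and the current dst entry
 * count for this namespace.
 */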
2522 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2523 {
2524 struct net *net = (struct net *)seq->private;
2525 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2526 net->ipv6.rt6_stats->fib_nodes,
2527 net->ipv6.rt6_stats->fib_route_nodes,
2528 net->ipv6.rt6_stats->fib_rt_alloc,
2529 net->ipv6.rt6_stats->fib_rt_entries,
2530 net->ipv6.rt6_stats->fib_rt_cache,
2531 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2532 net->ipv6.rt6_stats->fib_discarded_routes);
2534 return 0;
2535 }
2537 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2538 {
2539 return single_open_net(inode, file, rt6_stats_seq_show);
2540 }
2542 static const struct file_operations rt6_stats_seq_fops = {
2543 .owner = THIS_MODULE,
2544 .open = rt6_stats_seq_open,
2545 .read = seq_read,
2546 .llseek = seq_lseek,
2547 .release = single_release_net,
2548 };
2549 #endif /* CONFIG_PROC_FS */
2551 #ifdef CONFIG_SYSCTL
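/* Handler for the "flush" sysctl below: a write runs fib6_run_gc() using
 * the previously configured flush_delay; reads are rejected with -EINVAL.
 */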
2553 static
2554 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2555 void __user *buffer, size_t *lenp, loff_t *ppos)
2556 {
2557 struct net *net = current->nsproxy->net_ns;
2558 int delay = net->ipv6.sysctl.flush_delay;
2559 if (write) {
2560 proc_dointvec(ctl, write, buffer, lenp, ppos);
2561 fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2562 return 0;
2563 } else
2564 return -EINVAL;
2565 }
2567 ctl_table ipv6_route_table_template[] = {
2568 {
2569 .procname = "flush",
2570 .data = &init_net.ipv6.sysctl.flush_delay,
2571 .maxlen = sizeof(int),
2572 .mode = 0200,
2573 .proc_handler = ipv6_sysctl_rtcache_flush
2574 },
2575 {
2576 .procname = "gc_thresh",
2577 .data = &ip6_dst_ops_template.gc_thresh,
2578 .maxlen = sizeof(int),
2579 .mode = 0644,
2580 .proc_handler = proc_dointvec,
2581 },
2582 {
2583 .procname = "max_size",
2584 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
2585 .maxlen = sizeof(int),
2586 .mode = 0644,
2587 .proc_handler = proc_dointvec,
2588 },
2589 {
2590 .procname = "gc_min_interval",
2591 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2592 .maxlen = sizeof(int),
2593 .mode = 0644,
2594 .proc_handler = proc_dointvec_jiffies,
2595 },
2596 {
2597 .procname = "gc_timeout",
2598 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2599 .maxlen = sizeof(int),
2600 .mode = 0644,
2601 .proc_handler = proc_dointvec_jiffies,
2602 },
2603 {
2604 .procname = "gc_interval",
2605 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2606 .maxlen = sizeof(int),
2607 .mode = 0644,
2608 .proc_handler = proc_dointvec_jiffies,
2609 },
2610 {
2611 .procname = "gc_elasticity",
2612 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2613 .maxlen = sizeof(int),
2614 .mode = 0644,
2615 .proc_handler = proc_dointvec,
2616 },
2617 {
2618 .procname = "mtu_expires",
2619 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2620 .maxlen = sizeof(int),
2621 .mode = 0644,
2622 .proc_handler = proc_dointvec_jiffies,
2623 },
2624 {
2625 .procname = "min_adv_mss",
2626 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2627 .maxlen = sizeof(int),
2628 .mode = 0644,
2629 .proc_handler = proc_dointvec,
2630 },
2631 {
2632 .procname = "gc_min_interval_ms",
2633 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2634 .maxlen = sizeof(int),
2635 .mode = 0644,
2636 .proc_handler = proc_dointvec_ms_jiffies,
2637 },
2638 { }
2639 };
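/* Clone the template table for a namespace and rebind each entry's ->data
 * pointer to the per-namespace copy of the corresponding variable.
 */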
2641 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2642 {
2643 struct ctl_table *table;
2645 table = kmemdup(ipv6_route_table_template,
2646 sizeof(ipv6_route_table_template),
2647 GFP_KERNEL);
2649 if (table) {
2650 table[0].data = &net->ipv6.sysctl.flush_delay;
2651 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2652 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2653 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2654 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2655 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2656 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2657 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2658 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2659 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2660 }
2662 return table;
2663 }
2664 #endif
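/* Per-namespace init: copy the dst_ops and special route templates, set
 * the sysctl defaults and create the /proc files.
 */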
2666 static int __net_init ip6_route_net_init(struct net *net)
2667 {
2668 int ret = -ENOMEM;
2670 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2671 sizeof(net->ipv6.ip6_dst_ops));
2673 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2674 goto out_ip6_dst_ops;
2676 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2677 sizeof(*net->ipv6.ip6_null_entry),
2678 GFP_KERNEL);
2679 if (!net->ipv6.ip6_null_entry)
2680 goto out_ip6_dst_entries;
2681 net->ipv6.ip6_null_entry->dst.path =
2682 (struct dst_entry *)net->ipv6.ip6_null_entry;
2683 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2684 dst_metric_set(&net->ipv6.ip6_null_entry->dst, RTAX_HOPLIMIT, 255);
2686 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2687 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2688 sizeof(*net->ipv6.ip6_prohibit_entry),
2689 GFP_KERNEL);
2690 if (!net->ipv6.ip6_prohibit_entry)
2691 goto out_ip6_null_entry;
2692 net->ipv6.ip6_prohibit_entry->dst.path =
2693 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2694 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2695 dst_metric_set(&net->ipv6.ip6_prohibit_entry->dst, RTAX_HOPLIMIT, 255);
2697 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2698 sizeof(*net->ipv6.ip6_blk_hole_entry),
2699 GFP_KERNEL);
2700 if (!net->ipv6.ip6_blk_hole_entry)
2701 goto out_ip6_prohibit_entry;
2702 net->ipv6.ip6_blk_hole_entry->dst.path =
2703 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2704 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2705 dst_metric_set(&net->ipv6.ip6_blk_hole_entry->dst, RTAX_HOPLIMIT, 255);
2706 #endif
2708 net->ipv6.sysctl.flush_delay = 0;
2709 net->ipv6.sysctl.ip6_rt_max_size = 4096;
2710 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2711 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2712 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2713 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2714 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2715 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2717 #ifdef CONFIG_PROC_FS
2718 proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2719 proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2720 #endif
2721 net->ipv6.ip6_rt_gc_expire = 30*HZ;
2723 ret = 0;
2724 out:
2725 return ret;
2727 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2728 out_ip6_prohibit_entry:
2729 kfree(net->ipv6.ip6_prohibit_entry);
2730 out_ip6_null_entry:
2731 kfree(net->ipv6.ip6_null_entry);
2732 #endif
2733 out_ip6_dst_entries:
2734 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2735 out_ip6_dst_ops:
2736 goto out;
2737 }
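/* Per-namespace teardown, undoing ip6_route_net_init(). */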
2739 static void __net_exit ip6_route_net_exit(struct net *net)
2740 {
2741 #ifdef CONFIG_PROC_FS
2742 proc_net_remove(net, "ipv6_route");
2743 proc_net_remove(net, "rt6_stats");
2744 #endif
2745 kfree(net->ipv6.ip6_null_entry);
2746 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2747 kfree(net->ipv6.ip6_prohibit_entry);
2748 kfree(net->ipv6.ip6_blk_hole_entry);
2749 #endif
2750 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2751 }
2753 static struct pernet_operations ip6_route_net_ops = {
2754 .init = ip6_route_net_init,
2755 .exit = ip6_route_net_exit,
2756 };
2758 static struct notifier_block ip6_route_dev_notifier = {
2759 .notifier_call = ip6_route_dev_notify,
2760 .priority = 0,
2761 };
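/* Subsystem init: slab cache, pernet ops, fib6, xfrm6, policy rules,
 * rtnetlink handlers and the device notifier; error paths unwind in
 * reverse order.
 */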
2763 int __init ip6_route_init(void)
2764 {
2765 int ret;
2767 ret = -ENOMEM;
2768 ip6_dst_ops_template.kmem_cachep =
2769 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2770 SLAB_HWCACHE_ALIGN, NULL);
2771 if (!ip6_dst_ops_template.kmem_cachep)
2772 goto out;
2774 ret = dst_entries_init(&ip6_dst_blackhole_ops);
2775 if (ret)
2776 goto out_kmem_cache;
2778 ret = register_pernet_subsys(&ip6_route_net_ops);
2779 if (ret)
2780 goto out_dst_entries;
2782 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2784 /* Registration of the loopback device happens before this code runs,
2785 * so the loopback reference in rt6_info is not taken automatically;
2786 * do it manually for init_net */
2787 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2788 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2789 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2790 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2791 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2792 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2793 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2794 #endif
2795 ret = fib6_init();
2796 if (ret)
2797 goto out_register_subsys;
2799 ret = xfrm6_init();
2800 if (ret)
2801 goto out_fib6_init;
2803 ret = fib6_rules_init();
2804 if (ret)
2805 goto xfrm6_init;
2807 ret = -ENOBUFS;
2808 if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) ||
2809 __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) ||
2810 __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL))
2811 goto fib6_rules_init;
2813 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2814 if (ret)
2815 goto fib6_rules_init;
2817 out:
2818 return ret;
2820 fib6_rules_init:
2821 fib6_rules_cleanup();
2822 xfrm6_init:
2823 xfrm6_fini();
2824 out_fib6_init:
2825 fib6_gc_cleanup();
2826 out_register_subsys:
2827 unregister_pernet_subsys(&ip6_route_net_ops);
2828 out_dst_entries:
2829 dst_entries_destroy(&ip6_dst_blackhole_ops);
2830 out_kmem_cache:
2831 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2832 goto out;
2833 }
2835 void ip6_route_cleanup(void)
2836 {
2837 unregister_netdevice_notifier(&ip6_route_dev_notifier);
2838 fib6_rules_cleanup();
2839 xfrm6_fini();
2840 fib6_gc_cleanup();
2841 unregister_pernet_subsys(&ip6_route_net_ops);
2842 dst_entries_destroy(&ip6_dst_blackhole_ops);
2843 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2844 }