2 * Linux INET6 implementation
3 * Forwarding Information Database
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
14 * Yuji SEKIYA @USAGI: Support default route on router node;
15 * remove ip6_null_entry from the top of
17 * Ville Nuorvala: Fixed routing subtrees.
20 #define pr_fmt(fmt) "IPv6: " fmt
22 #include <linux/errno.h>
23 #include <linux/types.h>
24 #include <linux/net.h>
25 #include <linux/route.h>
26 #include <linux/netdevice.h>
27 #include <linux/in6.h>
28 #include <linux/init.h>
29 #include <linux/list.h>
30 #include <linux/slab.h>
33 #include <net/ndisc.h>
34 #include <net/addrconf.h>
35 #include <net/lwtunnel.h>
36 #include <net/fib_notifier.h>
38 #include <net/ip6_fib.h>
39 #include <net/ip6_route.h>
41 static struct kmem_cache
*fib6_node_kmem __read_mostly
;
46 int (*func
)(struct rt6_info
*, void *arg
);
51 #ifdef CONFIG_IPV6_SUBTREES
52 #define FWS_INIT FWS_S
54 #define FWS_INIT FWS_L
57 static struct rt6_info
*fib6_find_prefix(struct net
*net
,
58 struct fib6_table
*table
,
59 struct fib6_node
*fn
);
60 static struct fib6_node
*fib6_repair_tree(struct net
*net
,
61 struct fib6_table
*table
,
62 struct fib6_node
*fn
);
63 static int fib6_walk(struct net
*net
, struct fib6_walker
*w
);
64 static int fib6_walk_continue(struct fib6_walker
*w
);
67 * A routing update causes an increase of the serial number on the
68 * affected subtree. This allows for cached routes to be asynchronously
69 * tested when modifications are made to the destination cache as a
70 * result of redirects, path MTU changes, etc.
73 static void fib6_gc_timer_cb(struct timer_list
*t
);
75 #define FOR_WALKERS(net, w) \
76 list_for_each_entry(w, &(net)->ipv6.fib6_walkers, lh)
/*
 * fib6_walker_link - put walker @w on the walker list of @net.
 *
 * Inserts w->lh into net->ipv6.fib6_walkers with list_add(). The
 * insertion is done with net->ipv6.fib6_walker_lock held for writing,
 * using the _bh variants of the lock/unlock calls.
 */
78 static void fib6_walker_link(struct net
*net
, struct fib6_walker
*w
)
80 write_lock_bh(&net
->ipv6
.fib6_walker_lock
);
81 list_add(&w
->lh
, &net
->ipv6
.fib6_walkers
);
82 write_unlock_bh(&net
->ipv6
.fib6_walker_lock
);
85 static void fib6_walker_unlink(struct net
*net
, struct fib6_walker
*w
)
87 write_lock_bh(&net
->ipv6
.fib6_walker_lock
);
89 write_unlock_bh(&net
->ipv6
.fib6_walker_lock
);
/*
 * fib6_new_sernum - compute the next serial number for @net.
 *
 * Reads net->ipv6.fib6_sernum atomically and derives the candidate
 * value old + 1, restarting at 1 instead of stepping past INT_MAX.
 * The candidate is published through atomic_cmpxchg() on the same
 * counter inside a do/while retry loop.
 *
 * NOTE(review): the remaining cmpxchg arguments, the loop condition and
 * the return statement are not visible in this excerpt - confirm that
 * the function returns the newly installed value.
 */
92 static int fib6_new_sernum(struct net
*net
)
97 old
= atomic_read(&net
->ipv6
.fib6_sernum
);
98 new = old
< INT_MAX
? old
+ 1 : 1;
99 } while (atomic_cmpxchg(&net
->ipv6
.fib6_sernum
,
105 FIB6_NO_SERNUM_CHANGE
= 0,
/*
 * fib6_update_sernum - stamp the node holding @rt with a new serial number.
 *
 * The namespace is taken from the route's device (dev_net(rt->dst.dev)).
 * rt->rt6i_node is fetched with rcu_dereference_protected(), whose
 * lockdep condition is that rt->rt6i_table->tb6_lock is held, and the
 * node's fn_sernum is set to the value returned by fib6_new_sernum().
 *
 * NOTE(review): original line 115 is missing from this excerpt;
 * presumably it guards the assignment against a NULL node - confirm.
 */
108 void fib6_update_sernum(struct rt6_info
*rt
)
110 struct net
*net
= dev_net(rt
->dst
.dev
);
111 struct fib6_node
*fn
;
113 fn
= rcu_dereference_protected(rt
->rt6i_node
,
114 lockdep_is_held(&rt
->rt6i_table
->tb6_lock
));
116 fn
->fn_sernum
= fib6_new_sernum(net
);
120 * Auxiliary address test functions for the radix tree.
122 * These assume a 32bit processor (although it will work on
129 #if defined(__LITTLE_ENDIAN)
130 # define BITOP_BE32_SWIZZLE (0x1F & ~7)
132 # define BITOP_BE32_SWIZZLE 0
135 static __be32
addr_bit_set(const void *token
, int fn_bit
)
137 const __be32
*addr
= token
;
140 * 1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)
141 * is optimized version of
142 * htonl(1 << ((~fn_bit)&0x1F))
143 * See include/asm-generic/bitops/le.h.
145 return (__force __be32
)(1 << ((~fn_bit
^ BITOP_BE32_SWIZZLE
) & 0x1f)) &
/*
 * node_alloc - allocate a fib6_node for @net.
 *
 * The node comes zero-filled from the fib6_node_kmem cache
 * (kmem_cache_zalloc() with GFP_ATOMIC), and the per-namespace
 * net->ipv6.rt6_stats->fib_nodes counter is incremented.
 *
 * NOTE(review): the allocation-failure check and the return statement
 * are not visible in this excerpt; presumably the counter is bumped
 * only when the allocation succeeded and the node is returned - confirm.
 */
149 static struct fib6_node
*node_alloc(struct net
*net
)
151 struct fib6_node
*fn
;
153 fn
= kmem_cache_zalloc(fib6_node_kmem
, GFP_ATOMIC
);
155 net
->ipv6
.rt6_stats
->fib_nodes
++;
/*
 * node_free_immediate - release @fn without going through RCU.
 *
 * Hands the node straight back to the fib6_node_kmem cache with
 * kmem_cache_free() and decrements net->ipv6.rt6_stats->fib_nodes.
 */
160 static void node_free_immediate(struct net
*net
, struct fib6_node
*fn
)
162 kmem_cache_free(fib6_node_kmem
, fn
);
163 net
->ipv6
.rt6_stats
->fib_nodes
--;
/*
 * node_free_rcu - rcu_head callback that frees a fib6_node.
 *
 * @head is the 'rcu' member embedded in a struct fib6_node; the
 * enclosing node is recovered with container_of() and returned to the
 * fib6_node_kmem cache.
 */
166 static void node_free_rcu(struct rcu_head
*head
)
168 struct fib6_node
*fn
= container_of(head
, struct fib6_node
, rcu
);
170 kmem_cache_free(fib6_node_kmem
, fn
);
/*
 * node_free - release @fn through call_rcu().
 *
 * Queues fn->rcu with node_free_rcu as the callback, so the memory is
 * given back to the cache from that callback rather than here. The
 * net->ipv6.rt6_stats->fib_nodes counter is decremented immediately.
 */
173 static void node_free(struct net
*net
, struct fib6_node
*fn
)
175 call_rcu(&fn
->rcu
, node_free_rcu
);
176 net
->ipv6
.rt6_stats
->fib_nodes
--;
179 void rt6_free_pcpu(struct rt6_info
*non_pcpu_rt
)
183 if (!non_pcpu_rt
->rt6i_pcpu
)
186 for_each_possible_cpu(cpu
) {
187 struct rt6_info
**ppcpu_rt
;
188 struct rt6_info
*pcpu_rt
;
190 ppcpu_rt
= per_cpu_ptr(non_pcpu_rt
->rt6i_pcpu
, cpu
);
193 dst_dev_put(&pcpu_rt
->dst
);
194 dst_release(&pcpu_rt
->dst
);
199 EXPORT_SYMBOL_GPL(rt6_free_pcpu
);
201 static void fib6_free_table(struct fib6_table
*table
)
203 inetpeer_invalidate_tree(&table
->tb6_peers
);
207 static void fib6_link_table(struct net
*net
, struct fib6_table
*tb
)
212 * Initialize table lock at a single place to give lockdep a key,
213 * tables aren't visible prior to being linked to the list.
215 spin_lock_init(&tb
->tb6_lock
);
216 h
= tb
->tb6_id
& (FIB6_TABLE_HASHSZ
- 1);
219 * No protection necessary, this is the only list mutation
220 * operation, tables never disappear once they exist.
222 hlist_add_head_rcu(&tb
->tb6_hlist
, &net
->ipv6
.fib_table_hash
[h
]);
225 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
227 static struct fib6_table
*fib6_alloc_table(struct net
*net
, u32 id
)
229 struct fib6_table
*table
;
231 table
= kzalloc(sizeof(*table
), GFP_ATOMIC
);
234 rcu_assign_pointer(table
->tb6_root
.leaf
,
235 net
->ipv6
.ip6_null_entry
);
236 table
->tb6_root
.fn_flags
= RTN_ROOT
| RTN_TL_ROOT
| RTN_RTINFO
;
237 inet_peer_base_init(&table
->tb6_peers
);
243 struct fib6_table
*fib6_new_table(struct net
*net
, u32 id
)
245 struct fib6_table
*tb
;
249 tb
= fib6_get_table(net
, id
);
253 tb
= fib6_alloc_table(net
, id
);
255 fib6_link_table(net
, tb
);
259 EXPORT_SYMBOL_GPL(fib6_new_table
);
261 struct fib6_table
*fib6_get_table(struct net
*net
, u32 id
)
263 struct fib6_table
*tb
;
264 struct hlist_head
*head
;
269 h
= id
& (FIB6_TABLE_HASHSZ
- 1);
271 head
= &net
->ipv6
.fib_table_hash
[h
];
272 hlist_for_each_entry_rcu(tb
, head
, tb6_hlist
) {
273 if (tb
->tb6_id
== id
) {
282 EXPORT_SYMBOL_GPL(fib6_get_table
);
/*
 * fib6_tables_init - link the built-in tables of @net.
 *
 * This definition passes both net->ipv6.fib6_main_tbl and
 * net->ipv6.fib6_local_tbl to fib6_link_table().
 */
284 static void __net_init
fib6_tables_init(struct net
*net
)
286 fib6_link_table(net
, net
->ipv6
.fib6_main_tbl
);
287 fib6_link_table(net
, net
->ipv6
.fib6_local_tbl
);
/*
 * fib6_new_table - table lookup entry point, pass-through definition.
 *
 * This definition creates nothing: it forwards @net and @id to
 * fib6_get_table() and returns whatever that call returns.
 */
291 struct fib6_table
*fib6_new_table(struct net
*net
, u32 id
)
293 return fib6_get_table(net
, id
);
/*
 * fib6_get_table - return the table for @id, fixed-table definition.
 *
 * This definition always returns net->ipv6.fib6_main_tbl; @id is not
 * consulted in the visible body.
 */
296 struct fib6_table
*fib6_get_table(struct net
*net
, u32 id
)
298 return net
->ipv6
.fib6_main_tbl
;
301 struct dst_entry
*fib6_rule_lookup(struct net
*net
, struct flowi6
*fl6
,
302 const struct sk_buff
*skb
,
303 int flags
, pol_lookup_t lookup
)
307 rt
= lookup(net
, net
->ipv6
.fib6_main_tbl
, fl6
, skb
, flags
);
308 if (rt
->dst
.error
== -EAGAIN
) {
310 rt
= net
->ipv6
.ip6_null_entry
;
/*
 * fib6_tables_init - link the built-in table of @net.
 *
 * This definition passes only net->ipv6.fib6_main_tbl to
 * fib6_link_table().
 */
317 static void __net_init
fib6_tables_init(struct net
*net
)
319 fib6_link_table(net
, net
->ipv6
.fib6_main_tbl
);
324 unsigned int fib6_tables_seq_read(struct net
*net
)
326 unsigned int h
, fib_seq
= 0;
329 for (h
= 0; h
< FIB6_TABLE_HASHSZ
; h
++) {
330 struct hlist_head
*head
= &net
->ipv6
.fib_table_hash
[h
];
331 struct fib6_table
*tb
;
333 hlist_for_each_entry_rcu(tb
, head
, tb6_hlist
)
334 fib_seq
+= tb
->fib_seq
;
341 static int call_fib6_entry_notifier(struct notifier_block
*nb
, struct net
*net
,
342 enum fib_event_type event_type
,
345 struct fib6_entry_notifier_info info
= {
349 return call_fib6_notifier(nb
, net
, event_type
, &info
.info
);
352 static int call_fib6_entry_notifiers(struct net
*net
,
353 enum fib_event_type event_type
,
355 struct netlink_ext_ack
*extack
)
357 struct fib6_entry_notifier_info info
= {
358 .info
.extack
= extack
,
362 rt
->rt6i_table
->fib_seq
++;
363 return call_fib6_notifiers(net
, event_type
, &info
.info
);
366 struct fib6_dump_arg
{
368 struct notifier_block
*nb
;
371 static void fib6_rt_dump(struct rt6_info
*rt
, struct fib6_dump_arg
*arg
)
373 if (rt
== arg
->net
->ipv6
.ip6_null_entry
)
375 call_fib6_entry_notifier(arg
->nb
, arg
->net
, FIB_EVENT_ENTRY_ADD
, rt
);
378 static int fib6_node_dump(struct fib6_walker
*w
)
382 for_each_fib6_walker_rt(w
)
383 fib6_rt_dump(rt
, w
->args
);
388 static void fib6_table_dump(struct net
*net
, struct fib6_table
*tb
,
389 struct fib6_walker
*w
)
391 w
->root
= &tb
->tb6_root
;
392 spin_lock_bh(&tb
->tb6_lock
);
394 spin_unlock_bh(&tb
->tb6_lock
);
397 /* Called with rcu_read_lock() */
398 int fib6_tables_dump(struct net
*net
, struct notifier_block
*nb
)
400 struct fib6_dump_arg arg
;
401 struct fib6_walker
*w
;
404 w
= kzalloc(sizeof(*w
), GFP_ATOMIC
);
408 w
->func
= fib6_node_dump
;
413 for (h
= 0; h
< FIB6_TABLE_HASHSZ
; h
++) {
414 struct hlist_head
*head
= &net
->ipv6
.fib_table_hash
[h
];
415 struct fib6_table
*tb
;
417 hlist_for_each_entry_rcu(tb
, head
, tb6_hlist
)
418 fib6_table_dump(net
, tb
, w
);
426 static int fib6_dump_node(struct fib6_walker
*w
)
431 for_each_fib6_walker_rt(w
) {
432 res
= rt6_dump_route(rt
, w
->args
);
434 /* Frame is full, suspend walking */
439 /* Multipath routes are dumped in one route with the
440 * RTA_MULTIPATH attribute. Jump 'rt' to point to the
441 * last sibling of this route (no need to dump the
442 * sibling routes again)
444 if (rt
->rt6i_nsiblings
)
445 rt
= list_last_entry(&rt
->rt6i_siblings
,
453 static void fib6_dump_end(struct netlink_callback
*cb
)
455 struct net
*net
= sock_net(cb
->skb
->sk
);
456 struct fib6_walker
*w
= (void *)cb
->args
[2];
461 fib6_walker_unlink(net
, w
);
466 cb
->done
= (void *)cb
->args
[3];
470 static int fib6_dump_done(struct netlink_callback
*cb
)
473 return cb
->done
? cb
->done(cb
) : 0;
476 static int fib6_dump_table(struct fib6_table
*table
, struct sk_buff
*skb
,
477 struct netlink_callback
*cb
)
479 struct net
*net
= sock_net(skb
->sk
);
480 struct fib6_walker
*w
;
483 w
= (void *)cb
->args
[2];
484 w
->root
= &table
->tb6_root
;
486 if (cb
->args
[4] == 0) {
490 spin_lock_bh(&table
->tb6_lock
);
491 res
= fib6_walk(net
, w
);
492 spin_unlock_bh(&table
->tb6_lock
);
495 cb
->args
[5] = w
->root
->fn_sernum
;
498 if (cb
->args
[5] != w
->root
->fn_sernum
) {
499 /* Begin at the root if the tree changed */
500 cb
->args
[5] = w
->root
->fn_sernum
;
507 spin_lock_bh(&table
->tb6_lock
);
508 res
= fib6_walk_continue(w
);
509 spin_unlock_bh(&table
->tb6_lock
);
511 fib6_walker_unlink(net
, w
);
519 static int inet6_dump_fib(struct sk_buff
*skb
, struct netlink_callback
*cb
)
521 struct net
*net
= sock_net(skb
->sk
);
523 unsigned int e
= 0, s_e
;
524 struct rt6_rtnl_dump_arg arg
;
525 struct fib6_walker
*w
;
526 struct fib6_table
*tb
;
527 struct hlist_head
*head
;
533 w
= (void *)cb
->args
[2];
537 * 1. hook callback destructor.
539 cb
->args
[3] = (long)cb
->done
;
540 cb
->done
= fib6_dump_done
;
543 * 2. allocate and initialize walker.
545 w
= kzalloc(sizeof(*w
), GFP_ATOMIC
);
548 w
->func
= fib6_dump_node
;
549 cb
->args
[2] = (long)w
;
558 for (h
= s_h
; h
< FIB6_TABLE_HASHSZ
; h
++, s_e
= 0) {
560 head
= &net
->ipv6
.fib_table_hash
[h
];
561 hlist_for_each_entry_rcu(tb
, head
, tb6_hlist
) {
564 res
= fib6_dump_table(tb
, skb
, cb
);
576 res
= res
< 0 ? res
: skb
->len
;
585 * return the appropriate node for a routing tree "add" operation
586 * by either creating and inserting or by returning an existing
590 static struct fib6_node
*fib6_add_1(struct net
*net
,
591 struct fib6_table
*table
,
592 struct fib6_node
*root
,
593 struct in6_addr
*addr
, int plen
,
594 int offset
, int allow_create
,
595 int replace_required
,
596 struct netlink_ext_ack
*extack
)
598 struct fib6_node
*fn
, *in
, *ln
;
599 struct fib6_node
*pn
= NULL
;
604 RT6_TRACE("fib6_add_1\n");
606 /* insert node in tree */
611 struct rt6_info
*leaf
= rcu_dereference_protected(fn
->leaf
,
612 lockdep_is_held(&table
->tb6_lock
));
613 key
= (struct rt6key
*)((u8
*)leaf
+ offset
);
618 if (plen
< fn
->fn_bit
||
619 !ipv6_prefix_equal(&key
->addr
, addr
, fn
->fn_bit
)) {
621 if (replace_required
) {
622 NL_SET_ERR_MSG(extack
,
623 "Can not replace route - no match found");
624 pr_warn("Can't replace route, no match found\n");
625 return ERR_PTR(-ENOENT
);
627 pr_warn("NLM_F_CREATE should be set when creating new route\n");
636 if (plen
== fn
->fn_bit
) {
637 /* clean up an intermediate node */
638 if (!(fn
->fn_flags
& RTN_RTINFO
)) {
639 RCU_INIT_POINTER(fn
->leaf
, NULL
);
641 /* remove null_entry in the root node */
642 } else if (fn
->fn_flags
& RTN_TL_ROOT
&&
643 rcu_access_pointer(fn
->leaf
) ==
644 net
->ipv6
.ip6_null_entry
) {
645 RCU_INIT_POINTER(fn
->leaf
, NULL
);
652 * We have more bits to go
655 /* Try to walk down on tree. */
656 dir
= addr_bit_set(addr
, fn
->fn_bit
);
659 rcu_dereference_protected(fn
->right
,
660 lockdep_is_held(&table
->tb6_lock
)) :
661 rcu_dereference_protected(fn
->left
,
662 lockdep_is_held(&table
->tb6_lock
));
666 /* We should not create new node because
667 * NLM_F_REPLACE was specified without NLM_F_CREATE
668 * I assume it is safe to require NLM_F_CREATE when
669 * REPLACE flag is used! Later we may want to remove the
670 * check for replace_required, because according
671 * to netlink specification, NLM_F_CREATE
672 * MUST be specified if new route is created.
673 * That would keep IPv6 consistent with IPv4
675 if (replace_required
) {
676 NL_SET_ERR_MSG(extack
,
677 "Can not replace route - no match found");
678 pr_warn("Can't replace route, no match found\n");
679 return ERR_PTR(-ENOENT
);
681 pr_warn("NLM_F_CREATE should be set when creating new route\n");
684 * We walked to the bottom of tree.
685 * Create new leaf node without children.
688 ln
= node_alloc(net
);
691 return ERR_PTR(-ENOMEM
);
693 RCU_INIT_POINTER(ln
->parent
, pn
);
696 rcu_assign_pointer(pn
->right
, ln
);
698 rcu_assign_pointer(pn
->left
, ln
);
705 * split since we don't have a common prefix anymore or
706 * we have a less significant route.
707 * we've to insert an intermediate node on the list
708 * this new node will point to the one we need to create
712 pn
= rcu_dereference_protected(fn
->parent
,
713 lockdep_is_held(&table
->tb6_lock
));
715 /* find 1st bit in difference between the 2 addrs.
717 See comment in __ipv6_addr_diff: bit may be an invalid value,
718 but if it is >= plen, the value is ignored in any case.
721 bit
= __ipv6_addr_diff(addr
, &key
->addr
, sizeof(*addr
));
726 * (new leaf node)[ln] (old node)[fn]
729 in
= node_alloc(net
);
730 ln
= node_alloc(net
);
734 node_free_immediate(net
, in
);
736 node_free_immediate(net
, ln
);
737 return ERR_PTR(-ENOMEM
);
741 * new intermediate node.
743 * be off since that an address that chooses one of
744 * the branches would not match less specific routes
745 * in the other branch
750 RCU_INIT_POINTER(in
->parent
, pn
);
752 atomic_inc(&rcu_dereference_protected(in
->leaf
,
753 lockdep_is_held(&table
->tb6_lock
))->rt6i_ref
);
755 /* update parent pointer */
757 rcu_assign_pointer(pn
->right
, in
);
759 rcu_assign_pointer(pn
->left
, in
);
763 RCU_INIT_POINTER(ln
->parent
, in
);
764 rcu_assign_pointer(fn
->parent
, in
);
766 if (addr_bit_set(addr
, bit
)) {
767 rcu_assign_pointer(in
->right
, ln
);
768 rcu_assign_pointer(in
->left
, fn
);
770 rcu_assign_pointer(in
->left
, ln
);
771 rcu_assign_pointer(in
->right
, fn
);
773 } else { /* plen <= bit */
776 * (new leaf node)[ln]
778 * (old node)[fn] NULL
781 ln
= node_alloc(net
);
784 return ERR_PTR(-ENOMEM
);
788 RCU_INIT_POINTER(ln
->parent
, pn
);
790 if (addr_bit_set(&key
->addr
, plen
))
791 RCU_INIT_POINTER(ln
->right
, fn
);
793 RCU_INIT_POINTER(ln
->left
, fn
);
795 rcu_assign_pointer(fn
->parent
, ln
);
798 rcu_assign_pointer(pn
->right
, ln
);
800 rcu_assign_pointer(pn
->left
, ln
);
805 static void fib6_copy_metrics(u32
*mp
, const struct mx6_config
*mxc
)
809 for (i
= 0; i
< RTAX_MAX
; i
++) {
810 if (test_bit(i
, mxc
->mx_valid
))
815 static int fib6_commit_metrics(struct dst_entry
*dst
, struct mx6_config
*mxc
)
820 if (dst
->flags
& DST_HOST
) {
821 u32
*mp
= dst_metrics_write_ptr(dst
);
826 fib6_copy_metrics(mp
, mxc
);
828 dst_init_metrics(dst
, mxc
->mx
, false);
830 /* We've stolen mx now. */
837 static void fib6_purge_rt(struct rt6_info
*rt
, struct fib6_node
*fn
,
840 struct fib6_table
*table
= rt
->rt6i_table
;
842 if (atomic_read(&rt
->rt6i_ref
) != 1) {
843 /* This route is used as dummy address holder in some split
844 * nodes. It is not leaked, but it still holds other resources,
845 * which must be released in time. So, scan ascendant nodes
846 * and replace dummy references to this route with references
847 * to still alive ones.
850 struct rt6_info
*leaf
= rcu_dereference_protected(fn
->leaf
,
851 lockdep_is_held(&table
->tb6_lock
));
852 struct rt6_info
*new_leaf
;
853 if (!(fn
->fn_flags
& RTN_RTINFO
) && leaf
== rt
) {
854 new_leaf
= fib6_find_prefix(net
, table
, fn
);
855 atomic_inc(&new_leaf
->rt6i_ref
);
856 rcu_assign_pointer(fn
->leaf
, new_leaf
);
859 fn
= rcu_dereference_protected(fn
->parent
,
860 lockdep_is_held(&table
->tb6_lock
));
866 * Insert routing information in a node.
869 static int fib6_add_rt2node(struct fib6_node
*fn
, struct rt6_info
*rt
,
870 struct nl_info
*info
, struct mx6_config
*mxc
,
871 struct netlink_ext_ack
*extack
)
873 struct rt6_info
*leaf
= rcu_dereference_protected(fn
->leaf
,
874 lockdep_is_held(&rt
->rt6i_table
->tb6_lock
));
875 struct rt6_info
*iter
= NULL
;
876 struct rt6_info __rcu
**ins
;
877 struct rt6_info __rcu
**fallback_ins
= NULL
;
878 int replace
= (info
->nlh
&&
879 (info
->nlh
->nlmsg_flags
& NLM_F_REPLACE
));
880 int add
= (!info
->nlh
||
881 (info
->nlh
->nlmsg_flags
& NLM_F_CREATE
));
883 bool rt_can_ecmp
= rt6_qualify_for_ecmp(rt
);
884 u16 nlflags
= NLM_F_EXCL
;
887 if (info
->nlh
&& (info
->nlh
->nlmsg_flags
& NLM_F_APPEND
))
888 nlflags
|= NLM_F_APPEND
;
892 for (iter
= leaf
; iter
;
893 iter
= rcu_dereference_protected(iter
->rt6_next
,
894 lockdep_is_held(&rt
->rt6i_table
->tb6_lock
))) {
896 * Search for duplicates
899 if (iter
->rt6i_metric
== rt
->rt6i_metric
) {
901 * Same priority level
904 (info
->nlh
->nlmsg_flags
& NLM_F_EXCL
))
907 nlflags
&= ~NLM_F_EXCL
;
909 if (rt_can_ecmp
== rt6_qualify_for_ecmp(iter
)) {
914 fallback_ins
= fallback_ins
?: ins
;
918 if (rt6_duplicate_nexthop(iter
, rt
)) {
919 if (rt
->rt6i_nsiblings
)
920 rt
->rt6i_nsiblings
= 0;
921 if (!(iter
->rt6i_flags
& RTF_EXPIRES
))
923 if (!(rt
->rt6i_flags
& RTF_EXPIRES
))
924 rt6_clean_expires(iter
);
926 rt6_set_expires(iter
, rt
->dst
.expires
);
927 iter
->rt6i_pmtu
= rt
->rt6i_pmtu
;
930 /* If we have the same destination and the same metric,
931 * but not the same gateway, then the route we try to
932 * add is sibling to this route, increment our counter
933 * of siblings, and later we will add our route to the
935 * Only static routes (which don't have flag
936 * RTF_EXPIRES) are used for ECMPv6.
938 * To avoid long list, we only add siblings if the
939 * route has a gateway.
942 rt6_qualify_for_ecmp(iter
))
943 rt
->rt6i_nsiblings
++;
946 if (iter
->rt6i_metric
> rt
->rt6i_metric
)
950 ins
= &iter
->rt6_next
;
953 if (fallback_ins
&& !found
) {
954 /* No ECMP-able route found, replace first non-ECMP one */
956 iter
= rcu_dereference_protected(*ins
,
957 lockdep_is_held(&rt
->rt6i_table
->tb6_lock
));
961 /* Reset round-robin state, if necessary */
962 if (ins
== &fn
->leaf
)
965 /* Link this route to others same route. */
966 if (rt
->rt6i_nsiblings
) {
967 unsigned int rt6i_nsiblings
;
968 struct rt6_info
*sibling
, *temp_sibling
;
970 /* Find the first route that has the same metric */
973 if (sibling
->rt6i_metric
== rt
->rt6i_metric
&&
974 rt6_qualify_for_ecmp(sibling
)) {
975 list_add_tail(&rt
->rt6i_siblings
,
976 &sibling
->rt6i_siblings
);
979 sibling
= rcu_dereference_protected(sibling
->rt6_next
,
980 lockdep_is_held(&rt
->rt6i_table
->tb6_lock
));
982 /* For each sibling in the list, increment the counter of
983 * siblings. BUG() if counters do not match, list of siblings
987 list_for_each_entry_safe(sibling
, temp_sibling
,
988 &rt
->rt6i_siblings
, rt6i_siblings
) {
989 sibling
->rt6i_nsiblings
++;
990 BUG_ON(sibling
->rt6i_nsiblings
!= rt
->rt6i_nsiblings
);
993 BUG_ON(rt6i_nsiblings
!= rt
->rt6i_nsiblings
);
994 rt6_multipath_rebalance(temp_sibling
);
1002 pr_warn("NLM_F_CREATE should be set when creating new route\n");
1005 nlflags
|= NLM_F_CREATE
;
1006 err
= fib6_commit_metrics(&rt
->dst
, mxc
);
1010 err
= call_fib6_entry_notifiers(info
->nl_net
,
1011 FIB_EVENT_ENTRY_ADD
,
1016 rcu_assign_pointer(rt
->rt6_next
, iter
);
1017 atomic_inc(&rt
->rt6i_ref
);
1018 rcu_assign_pointer(rt
->rt6i_node
, fn
);
1019 rcu_assign_pointer(*ins
, rt
);
1020 if (!info
->skip_notify
)
1021 inet6_rt_notify(RTM_NEWROUTE
, rt
, info
, nlflags
);
1022 info
->nl_net
->ipv6
.rt6_stats
->fib_rt_entries
++;
1024 if (!(fn
->fn_flags
& RTN_RTINFO
)) {
1025 info
->nl_net
->ipv6
.rt6_stats
->fib_route_nodes
++;
1026 fn
->fn_flags
|= RTN_RTINFO
;
1035 pr_warn("NLM_F_REPLACE set, but no existing node found!\n");
1039 err
= fib6_commit_metrics(&rt
->dst
, mxc
);
1043 err
= call_fib6_entry_notifiers(info
->nl_net
,
1044 FIB_EVENT_ENTRY_REPLACE
,
1049 atomic_inc(&rt
->rt6i_ref
);
1050 rcu_assign_pointer(rt
->rt6i_node
, fn
);
1051 rt
->rt6_next
= iter
->rt6_next
;
1052 rcu_assign_pointer(*ins
, rt
);
1053 if (!info
->skip_notify
)
1054 inet6_rt_notify(RTM_NEWROUTE
, rt
, info
, NLM_F_REPLACE
);
1055 if (!(fn
->fn_flags
& RTN_RTINFO
)) {
1056 info
->nl_net
->ipv6
.rt6_stats
->fib_route_nodes
++;
1057 fn
->fn_flags
|= RTN_RTINFO
;
1059 nsiblings
= iter
->rt6i_nsiblings
;
1060 iter
->rt6i_node
= NULL
;
1061 fib6_purge_rt(iter
, fn
, info
->nl_net
);
1062 if (rcu_access_pointer(fn
->rr_ptr
) == iter
)
1067 /* Replacing an ECMP route, remove all siblings */
1068 ins
= &rt
->rt6_next
;
1069 iter
= rcu_dereference_protected(*ins
,
1070 lockdep_is_held(&rt
->rt6i_table
->tb6_lock
));
1072 if (iter
->rt6i_metric
> rt
->rt6i_metric
)
1074 if (rt6_qualify_for_ecmp(iter
)) {
1075 *ins
= iter
->rt6_next
;
1076 iter
->rt6i_node
= NULL
;
1077 fib6_purge_rt(iter
, fn
, info
->nl_net
);
1078 if (rcu_access_pointer(fn
->rr_ptr
) == iter
)
1082 info
->nl_net
->ipv6
.rt6_stats
->fib_rt_entries
--;
1084 ins
= &iter
->rt6_next
;
1086 iter
= rcu_dereference_protected(*ins
,
1087 lockdep_is_held(&rt
->rt6i_table
->tb6_lock
));
1089 WARN_ON(nsiblings
!= 0);
/*
 * fib6_start_gc - arm the GC timer of @net on behalf of @rt.
 *
 * The timer net->ipv6.ip6_fib_timer is (re)armed only when both hold:
 *  - it is not already pending, and
 *  - @rt has RTF_EXPIRES or RTF_CACHE set in rt6i_flags.
 * The expiry is jiffies + net->ipv6.sysctl.ip6_rt_gc_interval.
 */
1096 static void fib6_start_gc(struct net
*net
, struct rt6_info
*rt
)
1098 if (!timer_pending(&net
->ipv6
.ip6_fib_timer
) &&
1099 (rt
->rt6i_flags
& (RTF_EXPIRES
| RTF_CACHE
)))
1100 mod_timer(&net
->ipv6
.ip6_fib_timer
,
1101 jiffies
+ net
->ipv6
.sysctl
.ip6_rt_gc_interval
);
/*
 * fib6_force_start_gc - arm the GC timer of @net unconditionally.
 *
 * Unlike fib6_start_gc() there is no route-flag test: if
 * net->ipv6.ip6_fib_timer is not already pending it is set to fire at
 * jiffies + net->ipv6.sysctl.ip6_rt_gc_interval.
 */
1104 void fib6_force_start_gc(struct net
*net
)
1106 if (!timer_pending(&net
->ipv6
.ip6_fib_timer
))
1107 mod_timer(&net
->ipv6
.ip6_fib_timer
,
1108 jiffies
+ net
->ipv6
.sysctl
.ip6_rt_gc_interval
);
1111 static void __fib6_update_sernum_upto_root(struct rt6_info
*rt
,
1114 struct fib6_node
*fn
= rcu_dereference_protected(rt
->rt6i_node
,
1115 lockdep_is_held(&rt
->rt6i_table
->tb6_lock
));
1117 /* paired with smp_rmb() in rt6_get_cookie_safe() */
1120 fn
->fn_sernum
= sernum
;
1121 fn
= rcu_dereference_protected(fn
->parent
,
1122 lockdep_is_held(&rt
->rt6i_table
->tb6_lock
));
/*
 * fib6_update_sernum_upto_root - public wrapper around the __ helper.
 *
 * Obtains a fresh serial number from fib6_new_sernum(@net) and passes
 * it, together with @rt, to __fib6_update_sernum_upto_root().
 */
1126 void fib6_update_sernum_upto_root(struct net
*net
, struct rt6_info
*rt
)
1128 __fib6_update_sernum_upto_root(rt
, fib6_new_sernum(net
));
1132 * Add routing information to the routing tree.
1133 * <destination addr>/<source addr>
1134 * with source addr info in sub-trees
1135 * Need to own table->tb6_lock
1138 int fib6_add(struct fib6_node
*root
, struct rt6_info
*rt
,
1139 struct nl_info
*info
, struct mx6_config
*mxc
,
1140 struct netlink_ext_ack
*extack
)
1142 struct fib6_table
*table
= rt
->rt6i_table
;
1143 struct fib6_node
*fn
, *pn
= NULL
;
1145 int allow_create
= 1;
1146 int replace_required
= 0;
1147 int sernum
= fib6_new_sernum(info
->nl_net
);
1149 if (WARN_ON_ONCE(!atomic_read(&rt
->dst
.__refcnt
)))
1151 if (WARN_ON_ONCE(rt
->rt6i_flags
& RTF_CACHE
))
1155 if (!(info
->nlh
->nlmsg_flags
& NLM_F_CREATE
))
1157 if (info
->nlh
->nlmsg_flags
& NLM_F_REPLACE
)
1158 replace_required
= 1;
1160 if (!allow_create
&& !replace_required
)
1161 pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n");
1163 fn
= fib6_add_1(info
->nl_net
, table
, root
,
1164 &rt
->rt6i_dst
.addr
, rt
->rt6i_dst
.plen
,
1165 offsetof(struct rt6_info
, rt6i_dst
), allow_create
,
1166 replace_required
, extack
);
1175 #ifdef CONFIG_IPV6_SUBTREES
1176 if (rt
->rt6i_src
.plen
) {
1177 struct fib6_node
*sn
;
1179 if (!rcu_access_pointer(fn
->subtree
)) {
1180 struct fib6_node
*sfn
;
1192 /* Create subtree root node */
1193 sfn
= node_alloc(info
->nl_net
);
1197 atomic_inc(&info
->nl_net
->ipv6
.ip6_null_entry
->rt6i_ref
);
1198 rcu_assign_pointer(sfn
->leaf
,
1199 info
->nl_net
->ipv6
.ip6_null_entry
);
1200 sfn
->fn_flags
= RTN_ROOT
;
1202 /* Now add the first leaf node to new subtree */
1204 sn
= fib6_add_1(info
->nl_net
, table
, sfn
,
1205 &rt
->rt6i_src
.addr
, rt
->rt6i_src
.plen
,
1206 offsetof(struct rt6_info
, rt6i_src
),
1207 allow_create
, replace_required
, extack
);
1210 /* If it is failed, discard just allocated
1211 root, and then (in failure) stale node
1214 node_free_immediate(info
->nl_net
, sfn
);
1219 /* Now link new subtree to main tree */
1220 rcu_assign_pointer(sfn
->parent
, fn
);
1221 rcu_assign_pointer(fn
->subtree
, sfn
);
1223 sn
= fib6_add_1(info
->nl_net
, table
, FIB6_SUBTREE(fn
),
1224 &rt
->rt6i_src
.addr
, rt
->rt6i_src
.plen
,
1225 offsetof(struct rt6_info
, rt6i_src
),
1226 allow_create
, replace_required
, extack
);
1234 if (!rcu_access_pointer(fn
->leaf
)) {
1235 if (fn
->fn_flags
& RTN_TL_ROOT
) {
1236 /* put back null_entry for root node */
1237 rcu_assign_pointer(fn
->leaf
,
1238 info
->nl_net
->ipv6
.ip6_null_entry
);
1240 atomic_inc(&rt
->rt6i_ref
);
1241 rcu_assign_pointer(fn
->leaf
, rt
);
1248 err
= fib6_add_rt2node(fn
, rt
, info
, mxc
, extack
);
1250 __fib6_update_sernum_upto_root(rt
, sernum
);
1251 fib6_start_gc(info
->nl_net
, rt
);
1256 #ifdef CONFIG_IPV6_SUBTREES
1258 * If fib6_add_1 has cleared the old leaf pointer in the
1259 * super-tree leaf node we have to find a new one for it.
1262 struct rt6_info
*pn_leaf
=
1263 rcu_dereference_protected(pn
->leaf
,
1264 lockdep_is_held(&table
->tb6_lock
));
1265 if (pn_leaf
== rt
) {
1267 RCU_INIT_POINTER(pn
->leaf
, NULL
);
1268 atomic_dec(&rt
->rt6i_ref
);
1270 if (!pn_leaf
&& !(pn
->fn_flags
& RTN_RTINFO
)) {
1271 pn_leaf
= fib6_find_prefix(info
->nl_net
, table
,
1277 info
->nl_net
->ipv6
.ip6_null_entry
;
1280 atomic_inc(&pn_leaf
->rt6i_ref
);
1281 rcu_assign_pointer(pn
->leaf
, pn_leaf
);
1290 /* fn->leaf could be NULL and fib6_repair_tree() needs to be called if:
1291 * 1. fn is an intermediate node and we failed to add the new
1292 * route to it in both subtree creation failure and fib6_add_rt2node()
1294 * 2. fn is the root node in the table and we fail to add the first
1295 * default route to it.
1298 (!(fn
->fn_flags
& (RTN_RTINFO
|RTN_ROOT
)) ||
1299 (fn
->fn_flags
& RTN_TL_ROOT
&&
1300 !rcu_access_pointer(fn
->leaf
))))
1301 fib6_repair_tree(info
->nl_net
, table
, fn
);
1302 /* Always release dst as dst->__refcnt is guaranteed
1303 * to be taken before entering this function
1305 dst_release_immediate(&rt
->dst
);
1310 * Routing tree lookup
1314 struct lookup_args
{
1315 int offset
; /* key offset on rt6_info */
1316 const struct in6_addr
*addr
; /* search key */
1319 static struct fib6_node
*fib6_lookup_1(struct fib6_node
*root
,
1320 struct lookup_args
*args
)
1322 struct fib6_node
*fn
;
1325 if (unlikely(args
->offset
== 0))
1335 struct fib6_node
*next
;
1337 dir
= addr_bit_set(args
->addr
, fn
->fn_bit
);
1339 next
= dir
? rcu_dereference(fn
->right
) :
1340 rcu_dereference(fn
->left
);
1350 struct fib6_node
*subtree
= FIB6_SUBTREE(fn
);
1352 if (subtree
|| fn
->fn_flags
& RTN_RTINFO
) {
1353 struct rt6_info
*leaf
= rcu_dereference(fn
->leaf
);
1359 key
= (struct rt6key
*) ((u8
*)leaf
+ args
->offset
);
1361 if (ipv6_prefix_equal(&key
->addr
, args
->addr
, key
->plen
)) {
1362 #ifdef CONFIG_IPV6_SUBTREES
1364 struct fib6_node
*sfn
;
1365 sfn
= fib6_lookup_1(subtree
, args
+ 1);
1371 if (fn
->fn_flags
& RTN_RTINFO
)
1376 if (fn
->fn_flags
& RTN_ROOT
)
1379 fn
= rcu_dereference(fn
->parent
);
1385 /* called with rcu_read_lock() held
1387 struct fib6_node
*fib6_lookup(struct fib6_node
*root
, const struct in6_addr
*daddr
,
1388 const struct in6_addr
*saddr
)
1390 struct fib6_node
*fn
;
1391 struct lookup_args args
[] = {
1393 .offset
= offsetof(struct rt6_info
, rt6i_dst
),
1396 #ifdef CONFIG_IPV6_SUBTREES
1398 .offset
= offsetof(struct rt6_info
, rt6i_src
),
1403 .offset
= 0, /* sentinel */
1407 fn
= fib6_lookup_1(root
, daddr
? args
: args
+ 1);
1408 if (!fn
|| fn
->fn_flags
& RTN_TL_ROOT
)
1415 * Get node with specified destination prefix (and source prefix,
1416 * if subtrees are used)
1417 * exact_match == true means we try to find fn with exact match of
1418 * the passed in prefix addr
1419 * exact_match == false means we try to find fn with longest prefix
1420 * match of the passed in prefix addr. This is useful for finding fn
1421 * for cached route as it will be stored in the exception table under
1422 * the node with longest prefix length.
1426 static struct fib6_node
*fib6_locate_1(struct fib6_node
*root
,
1427 const struct in6_addr
*addr
,
1428 int plen
, int offset
,
1431 struct fib6_node
*fn
, *prev
= NULL
;
1433 for (fn
= root
; fn
; ) {
1434 struct rt6_info
*leaf
= rcu_dereference(fn
->leaf
);
1437 /* This node is being deleted */
1439 if (plen
<= fn
->fn_bit
)
1445 key
= (struct rt6key
*)((u8
*)leaf
+ offset
);
1450 if (plen
< fn
->fn_bit
||
1451 !ipv6_prefix_equal(&key
->addr
, addr
, fn
->fn_bit
))
1454 if (plen
== fn
->fn_bit
)
1461 * We have more bits to go
1463 if (addr_bit_set(addr
, fn
->fn_bit
))
1464 fn
= rcu_dereference(fn
->right
);
1466 fn
= rcu_dereference(fn
->left
);
1475 struct fib6_node
*fib6_locate(struct fib6_node
*root
,
1476 const struct in6_addr
*daddr
, int dst_len
,
1477 const struct in6_addr
*saddr
, int src_len
,
1480 struct fib6_node
*fn
;
1482 fn
= fib6_locate_1(root
, daddr
, dst_len
,
1483 offsetof(struct rt6_info
, rt6i_dst
),
1486 #ifdef CONFIG_IPV6_SUBTREES
1488 WARN_ON(saddr
== NULL
);
1490 struct fib6_node
*subtree
= FIB6_SUBTREE(fn
);
1493 fn
= fib6_locate_1(subtree
, saddr
, src_len
,
1494 offsetof(struct rt6_info
, rt6i_src
),
1501 if (fn
&& fn
->fn_flags
& RTN_RTINFO
)
1513 static struct rt6_info
*fib6_find_prefix(struct net
*net
,
1514 struct fib6_table
*table
,
1515 struct fib6_node
*fn
)
1517 struct fib6_node
*child_left
, *child_right
;
1519 if (fn
->fn_flags
& RTN_ROOT
)
1520 return net
->ipv6
.ip6_null_entry
;
1523 child_left
= rcu_dereference_protected(fn
->left
,
1524 lockdep_is_held(&table
->tb6_lock
));
1525 child_right
= rcu_dereference_protected(fn
->right
,
1526 lockdep_is_held(&table
->tb6_lock
));
1528 return rcu_dereference_protected(child_left
->leaf
,
1529 lockdep_is_held(&table
->tb6_lock
));
1531 return rcu_dereference_protected(child_right
->leaf
,
1532 lockdep_is_held(&table
->tb6_lock
));
1534 fn
= FIB6_SUBTREE(fn
);
1540 * Called to trim the tree of intermediate nodes when possible. "fn"
1541 * is the node we want to try and remove.
1542 * Need to own table->tb6_lock
1545 static struct fib6_node
*fib6_repair_tree(struct net
*net
,
1546 struct fib6_table
*table
,
1547 struct fib6_node
*fn
)
1551 struct fib6_node
*child
;
1552 struct fib6_walker
*w
;
1555 /* Set fn->leaf to null_entry for root node. */
1556 if (fn
->fn_flags
& RTN_TL_ROOT
) {
1557 rcu_assign_pointer(fn
->leaf
, net
->ipv6
.ip6_null_entry
);
1562 struct fib6_node
*fn_r
= rcu_dereference_protected(fn
->right
,
1563 lockdep_is_held(&table
->tb6_lock
));
1564 struct fib6_node
*fn_l
= rcu_dereference_protected(fn
->left
,
1565 lockdep_is_held(&table
->tb6_lock
));
1566 struct fib6_node
*pn
= rcu_dereference_protected(fn
->parent
,
1567 lockdep_is_held(&table
->tb6_lock
));
1568 struct fib6_node
*pn_r
= rcu_dereference_protected(pn
->right
,
1569 lockdep_is_held(&table
->tb6_lock
));
1570 struct fib6_node
*pn_l
= rcu_dereference_protected(pn
->left
,
1571 lockdep_is_held(&table
->tb6_lock
));
1572 struct rt6_info
*fn_leaf
= rcu_dereference_protected(fn
->leaf
,
1573 lockdep_is_held(&table
->tb6_lock
));
1574 struct rt6_info
*pn_leaf
= rcu_dereference_protected(pn
->leaf
,
1575 lockdep_is_held(&table
->tb6_lock
));
1576 struct rt6_info
*new_fn_leaf
;
1578 RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn
->fn_bit
, iter
);
1581 WARN_ON(fn
->fn_flags
& RTN_RTINFO
);
1582 WARN_ON(fn
->fn_flags
& RTN_TL_ROOT
);
1588 child
= fn_r
, children
|= 1;
1590 child
= fn_l
, children
|= 2;
1592 if (children
== 3 || FIB6_SUBTREE(fn
)
1593 #ifdef CONFIG_IPV6_SUBTREES
1594 /* Subtree root (i.e. fn) may have one child */
1595 || (children
&& fn
->fn_flags
& RTN_ROOT
)
1598 new_fn_leaf
= fib6_find_prefix(net
, table
, fn
);
1601 WARN_ON(!new_fn_leaf
);
1602 new_fn_leaf
= net
->ipv6
.ip6_null_entry
;
1605 atomic_inc(&new_fn_leaf
->rt6i_ref
);
1606 rcu_assign_pointer(fn
->leaf
, new_fn_leaf
);
1610 #ifdef CONFIG_IPV6_SUBTREES
1611 if (FIB6_SUBTREE(pn
) == fn
) {
1612 WARN_ON(!(fn
->fn_flags
& RTN_ROOT
));
1613 RCU_INIT_POINTER(pn
->subtree
, NULL
);
1616 WARN_ON(fn
->fn_flags
& RTN_ROOT
);
1619 rcu_assign_pointer(pn
->right
, child
);
1620 else if (pn_l
== fn
)
1621 rcu_assign_pointer(pn
->left
, child
);
1627 rcu_assign_pointer(child
->parent
, pn
);
1629 #ifdef CONFIG_IPV6_SUBTREES
1633 read_lock(&net
->ipv6
.fib6_walker_lock
);
1634 FOR_WALKERS(net
, w
) {
1636 if (w
->node
== fn
) {
1637 RT6_TRACE("W %p adjusted by delnode 1, s=%d/%d\n", w
, w
->state
, nstate
);
1642 if (w
->node
== fn
) {
1645 RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w
, w
->state
);
1646 w
->state
= w
->state
>= FWS_R
? FWS_U
: FWS_INIT
;
1648 RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w
, w
->state
);
1649 w
->state
= w
->state
>= FWS_C
? FWS_U
: FWS_INIT
;
1654 read_unlock(&net
->ipv6
.fib6_walker_lock
);
1657 if (pn
->fn_flags
& RTN_RTINFO
|| FIB6_SUBTREE(pn
))
1660 RCU_INIT_POINTER(pn
->leaf
, NULL
);
1661 rt6_release(pn_leaf
);
1666 static void fib6_del_route(struct fib6_table
*table
, struct fib6_node
*fn
,
1667 struct rt6_info __rcu
**rtp
, struct nl_info
*info
)
1669 struct fib6_walker
*w
;
1670 struct rt6_info
*rt
= rcu_dereference_protected(*rtp
,
1671 lockdep_is_held(&table
->tb6_lock
));
1672 struct net
*net
= info
->nl_net
;
1674 RT6_TRACE("fib6_del_route\n");
1676 WARN_ON_ONCE(rt
->rt6i_flags
& RTF_CACHE
);
1679 *rtp
= rt
->rt6_next
;
1680 rt
->rt6i_node
= NULL
;
1681 net
->ipv6
.rt6_stats
->fib_rt_entries
--;
1682 net
->ipv6
.rt6_stats
->fib_discarded_routes
++;
1684 /* Flush all cached dst in exception table */
1685 rt6_flush_exceptions(rt
);
1687 /* Reset round-robin state, if necessary */
1688 if (rcu_access_pointer(fn
->rr_ptr
) == rt
)
1691 /* Remove this entry from other siblings */
1692 if (rt
->rt6i_nsiblings
) {
1693 struct rt6_info
*sibling
, *next_sibling
;
1695 list_for_each_entry_safe(sibling
, next_sibling
,
1696 &rt
->rt6i_siblings
, rt6i_siblings
)
1697 sibling
->rt6i_nsiblings
--;
1698 rt
->rt6i_nsiblings
= 0;
1699 list_del_init(&rt
->rt6i_siblings
);
1700 rt6_multipath_rebalance(next_sibling
);
1703 /* Adjust walkers */
1704 read_lock(&net
->ipv6
.fib6_walker_lock
);
1705 FOR_WALKERS(net
, w
) {
1706 if (w
->state
== FWS_C
&& w
->leaf
== rt
) {
1707 RT6_TRACE("walker %p adjusted by delroute\n", w
);
1708 w
->leaf
= rcu_dereference_protected(rt
->rt6_next
,
1709 lockdep_is_held(&table
->tb6_lock
));
1714 read_unlock(&net
->ipv6
.fib6_walker_lock
);
1716 /* If it was last route, call fib6_repair_tree() to:
1717 * 1. For root node, put back null_entry as how the table was created.
1718 * 2. For other nodes, expunge its radix tree node.
1720 if (!rcu_access_pointer(fn
->leaf
)) {
1721 if (!(fn
->fn_flags
& RTN_TL_ROOT
)) {
1722 fn
->fn_flags
&= ~RTN_RTINFO
;
1723 net
->ipv6
.rt6_stats
->fib_route_nodes
--;
1725 fn
= fib6_repair_tree(net
, table
, fn
);
1728 fib6_purge_rt(rt
, fn
, net
);
1730 call_fib6_entry_notifiers(net
, FIB_EVENT_ENTRY_DEL
, rt
, NULL
);
1731 if (!info
->skip_notify
)
1732 inet6_rt_notify(RTM_DELROUTE
, rt
, info
, 0);
1736 /* Need to own table->tb6_lock */
1737 int fib6_del(struct rt6_info
*rt
, struct nl_info
*info
)
1739 struct fib6_node
*fn
= rcu_dereference_protected(rt
->rt6i_node
,
1740 lockdep_is_held(&rt
->rt6i_table
->tb6_lock
));
1741 struct fib6_table
*table
= rt
->rt6i_table
;
1742 struct net
*net
= info
->nl_net
;
1743 struct rt6_info __rcu
**rtp
;
1744 struct rt6_info __rcu
**rtp_next
;
1747 if (rt
->dst
.obsolete
> 0) {
1752 if (!fn
|| rt
== net
->ipv6
.ip6_null_entry
)
1755 WARN_ON(!(fn
->fn_flags
& RTN_RTINFO
));
1757 /* remove cached dst from exception table */
1758 if (rt
->rt6i_flags
& RTF_CACHE
)
1759 return rt6_remove_exception_rt(rt
);
1762 * Walk the leaf entries looking for ourself
1765 for (rtp
= &fn
->leaf
; *rtp
; rtp
= rtp_next
) {
1766 struct rt6_info
*cur
= rcu_dereference_protected(*rtp
,
1767 lockdep_is_held(&table
->tb6_lock
));
1769 fib6_del_route(table
, fn
, rtp
, info
);
1772 rtp_next
= &cur
->rt6_next
;
1778 * Tree traversal function.
1780 * Certainly, it is not interrupt safe.
1781 * However, it is internally reenterable wrt itself and fib6_add/fib6_del.
1782 * It means, that we can modify tree during walking
1783 * and use this function for garbage collection, clone pruning,
1784 * cleaning tree when a device goes down etc. etc.
1786 * It guarantees that every node will be traversed,
1787 * and that it will be traversed only once.
1789 * Callback function w->func may return:
1790 * 0 -> continue walking.
1791 * positive value -> walking is suspended (used by tree dumps,
1792 * and probably by gc, if it will be split to several slices)
1793 * negative value -> terminate walking.
1795 * The function itself returns:
1796 * 0 -> walk is complete.
1797 * >0 -> walk is incomplete (i.e. suspended)
1798 * <0 -> walk is terminated by an error.
1800 * This function is called with tb6_lock held.
1803 static int fib6_walk_continue(struct fib6_walker
*w
)
1805 struct fib6_node
*fn
, *pn
, *left
, *right
;
1807 /* w->root should always be table->tb6_root */
1808 WARN_ON_ONCE(!(w
->root
->fn_flags
& RTN_TL_ROOT
));
1816 #ifdef CONFIG_IPV6_SUBTREES
1818 if (FIB6_SUBTREE(fn
)) {
1819 w
->node
= FIB6_SUBTREE(fn
);
1826 left
= rcu_dereference_protected(fn
->left
, 1);
1829 w
->state
= FWS_INIT
;
1835 right
= rcu_dereference_protected(fn
->right
, 1);
1838 w
->state
= FWS_INIT
;
1842 w
->leaf
= rcu_dereference_protected(fn
->leaf
, 1);
1845 if (w
->leaf
&& fn
->fn_flags
& RTN_RTINFO
) {
1866 pn
= rcu_dereference_protected(fn
->parent
, 1);
1867 left
= rcu_dereference_protected(pn
->left
, 1);
1868 right
= rcu_dereference_protected(pn
->right
, 1);
1870 #ifdef CONFIG_IPV6_SUBTREES
1871 if (FIB6_SUBTREE(pn
) == fn
) {
1872 WARN_ON(!(fn
->fn_flags
& RTN_ROOT
));
1883 w
->leaf
= rcu_dereference_protected(w
->node
->leaf
, 1);
1893 static int fib6_walk(struct net
*net
, struct fib6_walker
*w
)
1897 w
->state
= FWS_INIT
;
1900 fib6_walker_link(net
, w
);
1901 res
= fib6_walk_continue(w
);
1903 fib6_walker_unlink(net
, w
);
1907 static int fib6_clean_node(struct fib6_walker
*w
)
1910 struct rt6_info
*rt
;
1911 struct fib6_cleaner
*c
= container_of(w
, struct fib6_cleaner
, w
);
1912 struct nl_info info
= {
1916 if (c
->sernum
!= FIB6_NO_SERNUM_CHANGE
&&
1917 w
->node
->fn_sernum
!= c
->sernum
)
1918 w
->node
->fn_sernum
= c
->sernum
;
1921 WARN_ON_ONCE(c
->sernum
== FIB6_NO_SERNUM_CHANGE
);
1926 for_each_fib6_walker_rt(w
) {
1927 res
= c
->func(rt
, c
->arg
);
1930 res
= fib6_del(rt
, &info
);
1933 pr_debug("%s: del failed: rt=%p@%p err=%d\n",
1935 rcu_access_pointer(rt
->rt6i_node
),
1941 } else if (res
== -2) {
1942 if (WARN_ON(!rt
->rt6i_nsiblings
))
1944 rt
= list_last_entry(&rt
->rt6i_siblings
,
1945 struct rt6_info
, rt6i_siblings
);
1955 * Convenient frontend to tree walker.
1957 * func is called on each route.
1958 * It may return -2 -> skip multipath route.
1959 * -1 -> delete this route.
1960 * 0 -> continue walking
1963 static void fib6_clean_tree(struct net
*net
, struct fib6_node
*root
,
1964 int (*func
)(struct rt6_info
*, void *arg
),
1965 int sernum
, void *arg
)
1967 struct fib6_cleaner c
;
1970 c
.w
.func
= fib6_clean_node
;
1978 fib6_walk(net
, &c
.w
);
1981 static void __fib6_clean_all(struct net
*net
,
1982 int (*func
)(struct rt6_info
*, void *),
1983 int sernum
, void *arg
)
1985 struct fib6_table
*table
;
1986 struct hlist_head
*head
;
1990 for (h
= 0; h
< FIB6_TABLE_HASHSZ
; h
++) {
1991 head
= &net
->ipv6
.fib_table_hash
[h
];
1992 hlist_for_each_entry_rcu(table
, head
, tb6_hlist
) {
1993 spin_lock_bh(&table
->tb6_lock
);
1994 fib6_clean_tree(net
, &table
->tb6_root
,
1996 spin_unlock_bh(&table
->tb6_lock
);
2002 void fib6_clean_all(struct net
*net
, int (*func
)(struct rt6_info
*, void *),
2005 __fib6_clean_all(net
, func
, FIB6_NO_SERNUM_CHANGE
, arg
);
2008 static void fib6_flush_trees(struct net
*net
)
2010 int new_sernum
= fib6_new_sernum(net
);
2012 __fib6_clean_all(net
, NULL
, new_sernum
, NULL
);
2016 * Garbage collection
2019 static int fib6_age(struct rt6_info
*rt
, void *arg
)
2021 struct fib6_gc_args
*gc_args
= arg
;
2022 unsigned long now
= jiffies
;
2025 * check addrconf expiration here.
2026 * Routes are expired even if they are in use.
2029 if (rt
->rt6i_flags
& RTF_EXPIRES
&& rt
->dst
.expires
) {
2030 if (time_after(now
, rt
->dst
.expires
)) {
2031 RT6_TRACE("expiring %p\n", rt
);
2037 /* Also age clones in the exception table.
2038 * Note, that clones are aged out
2039 * only if they are not in use now.
2041 rt6_age_exceptions(rt
, gc_args
, now
);
2046 void fib6_run_gc(unsigned long expires
, struct net
*net
, bool force
)
2048 struct fib6_gc_args gc_args
;
2052 spin_lock_bh(&net
->ipv6
.fib6_gc_lock
);
2053 } else if (!spin_trylock_bh(&net
->ipv6
.fib6_gc_lock
)) {
2054 mod_timer(&net
->ipv6
.ip6_fib_timer
, jiffies
+ HZ
);
2057 gc_args
.timeout
= expires
? (int)expires
:
2058 net
->ipv6
.sysctl
.ip6_rt_gc_interval
;
2061 fib6_clean_all(net
, fib6_age
, &gc_args
);
2063 net
->ipv6
.ip6_rt_last_gc
= now
;
2066 mod_timer(&net
->ipv6
.ip6_fib_timer
,
2068 + net
->ipv6
.sysctl
.ip6_rt_gc_interval
));
2070 del_timer(&net
->ipv6
.ip6_fib_timer
);
2071 spin_unlock_bh(&net
->ipv6
.fib6_gc_lock
);
2074 static void fib6_gc_timer_cb(struct timer_list
*t
)
2076 struct net
*arg
= from_timer(arg
, t
, ipv6
.ip6_fib_timer
);
2078 fib6_run_gc(0, arg
, true);
2081 static int __net_init
fib6_net_init(struct net
*net
)
2083 size_t size
= sizeof(struct hlist_head
) * FIB6_TABLE_HASHSZ
;
2086 err
= fib6_notifier_init(net
);
2090 spin_lock_init(&net
->ipv6
.fib6_gc_lock
);
2091 rwlock_init(&net
->ipv6
.fib6_walker_lock
);
2092 INIT_LIST_HEAD(&net
->ipv6
.fib6_walkers
);
2093 timer_setup(&net
->ipv6
.ip6_fib_timer
, fib6_gc_timer_cb
, 0);
2095 net
->ipv6
.rt6_stats
= kzalloc(sizeof(*net
->ipv6
.rt6_stats
), GFP_KERNEL
);
2096 if (!net
->ipv6
.rt6_stats
)
2099 /* Avoid false sharing : Use at least a full cache line */
2100 size
= max_t(size_t, size
, L1_CACHE_BYTES
);
2102 net
->ipv6
.fib_table_hash
= kzalloc(size
, GFP_KERNEL
);
2103 if (!net
->ipv6
.fib_table_hash
)
2106 net
->ipv6
.fib6_main_tbl
= kzalloc(sizeof(*net
->ipv6
.fib6_main_tbl
),
2108 if (!net
->ipv6
.fib6_main_tbl
)
2109 goto out_fib_table_hash
;
2111 net
->ipv6
.fib6_main_tbl
->tb6_id
= RT6_TABLE_MAIN
;
2112 rcu_assign_pointer(net
->ipv6
.fib6_main_tbl
->tb6_root
.leaf
,
2113 net
->ipv6
.ip6_null_entry
);
2114 net
->ipv6
.fib6_main_tbl
->tb6_root
.fn_flags
=
2115 RTN_ROOT
| RTN_TL_ROOT
| RTN_RTINFO
;
2116 inet_peer_base_init(&net
->ipv6
.fib6_main_tbl
->tb6_peers
);
2118 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2119 net
->ipv6
.fib6_local_tbl
= kzalloc(sizeof(*net
->ipv6
.fib6_local_tbl
),
2121 if (!net
->ipv6
.fib6_local_tbl
)
2122 goto out_fib6_main_tbl
;
2123 net
->ipv6
.fib6_local_tbl
->tb6_id
= RT6_TABLE_LOCAL
;
2124 rcu_assign_pointer(net
->ipv6
.fib6_local_tbl
->tb6_root
.leaf
,
2125 net
->ipv6
.ip6_null_entry
);
2126 net
->ipv6
.fib6_local_tbl
->tb6_root
.fn_flags
=
2127 RTN_ROOT
| RTN_TL_ROOT
| RTN_RTINFO
;
2128 inet_peer_base_init(&net
->ipv6
.fib6_local_tbl
->tb6_peers
);
2130 fib6_tables_init(net
);
2134 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2136 kfree(net
->ipv6
.fib6_main_tbl
);
2139 kfree(net
->ipv6
.fib_table_hash
);
2141 kfree(net
->ipv6
.rt6_stats
);
2143 fib6_notifier_exit(net
);
2147 static void fib6_net_exit(struct net
*net
)
2151 del_timer_sync(&net
->ipv6
.ip6_fib_timer
);
2153 for (i
= 0; i
< FIB6_TABLE_HASHSZ
; i
++) {
2154 struct hlist_head
*head
= &net
->ipv6
.fib_table_hash
[i
];
2155 struct hlist_node
*tmp
;
2156 struct fib6_table
*tb
;
2158 hlist_for_each_entry_safe(tb
, tmp
, head
, tb6_hlist
) {
2159 hlist_del(&tb
->tb6_hlist
);
2160 fib6_free_table(tb
);
2164 kfree(net
->ipv6
.fib_table_hash
);
2165 kfree(net
->ipv6
.rt6_stats
);
2166 fib6_notifier_exit(net
);
2169 static struct pernet_operations fib6_net_ops
= {
2170 .init
= fib6_net_init
,
2171 .exit
= fib6_net_exit
,
2174 int __init
fib6_init(void)
2178 fib6_node_kmem
= kmem_cache_create("fib6_nodes",
2179 sizeof(struct fib6_node
),
2180 0, SLAB_HWCACHE_ALIGN
,
2182 if (!fib6_node_kmem
)
2185 ret
= register_pernet_subsys(&fib6_net_ops
);
2187 goto out_kmem_cache_create
;
2189 ret
= rtnl_register_module(THIS_MODULE
, PF_INET6
, RTM_GETROUTE
, NULL
,
2192 goto out_unregister_subsys
;
2194 __fib6_flush_trees
= fib6_flush_trees
;
2198 out_unregister_subsys
:
2199 unregister_pernet_subsys(&fib6_net_ops
);
2200 out_kmem_cache_create
:
2201 kmem_cache_destroy(fib6_node_kmem
);
2205 void fib6_gc_cleanup(void)
2207 unregister_pernet_subsys(&fib6_net_ops
);
2208 kmem_cache_destroy(fib6_node_kmem
);
2211 #ifdef CONFIG_PROC_FS
2213 struct ipv6_route_iter
{
2214 struct seq_net_private p
;
2215 struct fib6_walker w
;
2217 struct fib6_table
*tbl
;
2221 static int ipv6_route_seq_show(struct seq_file
*seq
, void *v
)
2223 struct rt6_info
*rt
= v
;
2224 struct ipv6_route_iter
*iter
= seq
->private;
2226 seq_printf(seq
, "%pi6 %02x ", &rt
->rt6i_dst
.addr
, rt
->rt6i_dst
.plen
);
2228 #ifdef CONFIG_IPV6_SUBTREES
2229 seq_printf(seq
, "%pi6 %02x ", &rt
->rt6i_src
.addr
, rt
->rt6i_src
.plen
);
2231 seq_puts(seq
, "00000000000000000000000000000000 00 ");
2233 if (rt
->rt6i_flags
& RTF_GATEWAY
)
2234 seq_printf(seq
, "%pi6", &rt
->rt6i_gateway
);
2236 seq_puts(seq
, "00000000000000000000000000000000");
2238 seq_printf(seq
, " %08x %08x %08x %08x %8s\n",
2239 rt
->rt6i_metric
, atomic_read(&rt
->dst
.__refcnt
),
2240 rt
->dst
.__use
, rt
->rt6i_flags
,
2241 rt
->dst
.dev
? rt
->dst
.dev
->name
: "");
2242 iter
->w
.leaf
= NULL
;
2246 static int ipv6_route_yield(struct fib6_walker
*w
)
2248 struct ipv6_route_iter
*iter
= w
->args
;
2254 iter
->w
.leaf
= rcu_dereference_protected(
2255 iter
->w
.leaf
->rt6_next
,
2256 lockdep_is_held(&iter
->tbl
->tb6_lock
));
2258 if (!iter
->skip
&& iter
->w
.leaf
)
2260 } while (iter
->w
.leaf
);
2265 static void ipv6_route_seq_setup_walk(struct ipv6_route_iter
*iter
,
2268 memset(&iter
->w
, 0, sizeof(iter
->w
));
2269 iter
->w
.func
= ipv6_route_yield
;
2270 iter
->w
.root
= &iter
->tbl
->tb6_root
;
2271 iter
->w
.state
= FWS_INIT
;
2272 iter
->w
.node
= iter
->w
.root
;
2273 iter
->w
.args
= iter
;
2274 iter
->sernum
= iter
->w
.root
->fn_sernum
;
2275 INIT_LIST_HEAD(&iter
->w
.lh
);
2276 fib6_walker_link(net
, &iter
->w
);
2279 static struct fib6_table
*ipv6_route_seq_next_table(struct fib6_table
*tbl
,
2283 struct hlist_node
*node
;
2286 h
= (tbl
->tb6_id
& (FIB6_TABLE_HASHSZ
- 1)) + 1;
2287 node
= rcu_dereference_bh(hlist_next_rcu(&tbl
->tb6_hlist
));
2293 while (!node
&& h
< FIB6_TABLE_HASHSZ
) {
2294 node
= rcu_dereference_bh(
2295 hlist_first_rcu(&net
->ipv6
.fib_table_hash
[h
++]));
2297 return hlist_entry_safe(node
, struct fib6_table
, tb6_hlist
);
2300 static void ipv6_route_check_sernum(struct ipv6_route_iter
*iter
)
2302 if (iter
->sernum
!= iter
->w
.root
->fn_sernum
) {
2303 iter
->sernum
= iter
->w
.root
->fn_sernum
;
2304 iter
->w
.state
= FWS_INIT
;
2305 iter
->w
.node
= iter
->w
.root
;
2306 WARN_ON(iter
->w
.skip
);
2307 iter
->w
.skip
= iter
->w
.count
;
2311 static void *ipv6_route_seq_next(struct seq_file
*seq
, void *v
, loff_t
*pos
)
2315 struct net
*net
= seq_file_net(seq
);
2316 struct ipv6_route_iter
*iter
= seq
->private;
2321 n
= rcu_dereference_bh(((struct rt6_info
*)v
)->rt6_next
);
2328 ipv6_route_check_sernum(iter
);
2329 spin_lock_bh(&iter
->tbl
->tb6_lock
);
2330 r
= fib6_walk_continue(&iter
->w
);
2331 spin_unlock_bh(&iter
->tbl
->tb6_lock
);
2335 return iter
->w
.leaf
;
2337 fib6_walker_unlink(net
, &iter
->w
);
2340 fib6_walker_unlink(net
, &iter
->w
);
2342 iter
->tbl
= ipv6_route_seq_next_table(iter
->tbl
, net
);
2346 ipv6_route_seq_setup_walk(iter
, net
);
2350 static void *ipv6_route_seq_start(struct seq_file
*seq
, loff_t
*pos
)
2353 struct net
*net
= seq_file_net(seq
);
2354 struct ipv6_route_iter
*iter
= seq
->private;
2357 iter
->tbl
= ipv6_route_seq_next_table(NULL
, net
);
2361 ipv6_route_seq_setup_walk(iter
, net
);
2362 return ipv6_route_seq_next(seq
, NULL
, pos
);
2368 static bool ipv6_route_iter_active(struct ipv6_route_iter
*iter
)
2370 struct fib6_walker
*w
= &iter
->w
;
2371 return w
->node
&& !(w
->state
== FWS_U
&& w
->node
== w
->root
);
2374 static void ipv6_route_seq_stop(struct seq_file
*seq
, void *v
)
2377 struct net
*net
= seq_file_net(seq
);
2378 struct ipv6_route_iter
*iter
= seq
->private;
2380 if (ipv6_route_iter_active(iter
))
2381 fib6_walker_unlink(net
, &iter
->w
);
2383 rcu_read_unlock_bh();
2386 static const struct seq_operations ipv6_route_seq_ops
= {
2387 .start
= ipv6_route_seq_start
,
2388 .next
= ipv6_route_seq_next
,
2389 .stop
= ipv6_route_seq_stop
,
2390 .show
= ipv6_route_seq_show
2393 int ipv6_route_open(struct inode
*inode
, struct file
*file
)
2395 return seq_open_net(inode
, file
, &ipv6_route_seq_ops
,
2396 sizeof(struct ipv6_route_iter
));
2399 #endif /* CONFIG_PROC_FS */