// SPDX-License-Identifier: GPL-2.0
/* Generic nexthop implementation
 *
 * Copyright (c) 2017-19 Cumulus Networks
 * Copyright (c) 2017-19 David Ahern <dsa@cumulusnetworks.com>
 */

#include <linux/nexthop.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/ipv6_stubs.h>
#include <net/lwtunnel.h>
#include <net/ndisc.h>
#include <net/nexthop.h>
#include <net/route.h>

#define NH_RES_DEFAULT_IDLE_TIMER	(120 * HZ)
#define NH_RES_DEFAULT_UNBALANCED_TIMER	0	/* No forced rebalancing. */
static void remove_nexthop(struct net *net, struct nexthop *nh,
			   struct nl_info *nlinfo);
#define NH_DEV_HASHBITS  8
#define NH_DEV_HASHSIZE (1U << NH_DEV_HASHBITS)

#define NHA_OP_FLAGS_DUMP_ALL (NHA_OP_FLAG_DUMP_STATS | \
			       NHA_OP_FLAG_DUMP_HW_STATS)
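/* Netlink attribute policies for the RTM_{NEW,DEL,GET}NEXTHOP and nexthop
 * bucket requests handled below.
 */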
static const struct nla_policy rtm_nh_policy_new[] = {
	[NHA_ID]		= { .type = NLA_U32 },
	[NHA_GROUP]		= { .type = NLA_BINARY },
	[NHA_GROUP_TYPE]	= { .type = NLA_U16 },
	[NHA_BLACKHOLE]		= { .type = NLA_FLAG },
	[NHA_OIF]		= { .type = NLA_U32 },
	[NHA_GATEWAY]		= { .type = NLA_BINARY },
	[NHA_ENCAP_TYPE]	= { .type = NLA_U16 },
	[NHA_ENCAP]		= { .type = NLA_NESTED },
	[NHA_FDB]		= { .type = NLA_FLAG },
	[NHA_RES_GROUP]		= { .type = NLA_NESTED },
	[NHA_HW_STATS_ENABLE]	= NLA_POLICY_MAX(NLA_U32, true),
};
static const struct nla_policy rtm_nh_policy_get[] = {
	[NHA_ID]		= { .type = NLA_U32 },
	[NHA_OP_FLAGS]		= NLA_POLICY_MASK(NLA_U32,
						  NHA_OP_FLAGS_DUMP_ALL),
};
static const struct nla_policy rtm_nh_policy_del[] = {
	[NHA_ID]		= { .type = NLA_U32 },
};
static const struct nla_policy rtm_nh_policy_dump[] = {
	[NHA_OIF]		= { .type = NLA_U32 },
	[NHA_GROUPS]		= { .type = NLA_FLAG },
	[NHA_MASTER]		= { .type = NLA_U32 },
	[NHA_FDB]		= { .type = NLA_FLAG },
	[NHA_OP_FLAGS]		= NLA_POLICY_MASK(NLA_U32,
						  NHA_OP_FLAGS_DUMP_ALL),
};
static const struct nla_policy rtm_nh_res_policy_new[] = {
	[NHA_RES_GROUP_BUCKETS]			= { .type = NLA_U16 },
	[NHA_RES_GROUP_IDLE_TIMER]		= { .type = NLA_U32 },
	[NHA_RES_GROUP_UNBALANCED_TIMER]	= { .type = NLA_U32 },
};
static const struct nla_policy rtm_nh_policy_dump_bucket[] = {
	[NHA_ID]		= { .type = NLA_U32 },
	[NHA_OIF]		= { .type = NLA_U32 },
	[NHA_MASTER]		= { .type = NLA_U32 },
	[NHA_RES_BUCKET]	= { .type = NLA_NESTED },
};
static const struct nla_policy rtm_nh_res_bucket_policy_dump[] = {
	[NHA_RES_BUCKET_NH_ID]	= { .type = NLA_U32 },
};
static const struct nla_policy rtm_nh_policy_get_bucket[] = {
	[NHA_ID]		= { .type = NLA_U32 },
	[NHA_RES_BUCKET]	= { .type = NLA_NESTED },
};
static const struct nla_policy rtm_nh_res_bucket_policy_get[] = {
	[NHA_RES_BUCKET_INDEX]	= { .type = NLA_U16 },
};
static bool nexthop_notifiers_is_empty(struct net *net)
{
	return !net->nexthop.notifier_chain.head;
}
static void
__nh_notifier_single_info_init(struct nh_notifier_single_info *nh_info,
			       const struct nh_info *nhi)
{
	nh_info->dev = nhi->fib_nhc.nhc_dev;
	nh_info->gw_family = nhi->fib_nhc.nhc_gw_family;
	if (nh_info->gw_family == AF_INET)
		nh_info->ipv4 = nhi->fib_nhc.nhc_gw.ipv4;
	else if (nh_info->gw_family == AF_INET6)
		nh_info->ipv6 = nhi->fib_nhc.nhc_gw.ipv6;

	nh_info->id = nhi->nh_parent->id;
	nh_info->is_reject = nhi->reject_nh;
	nh_info->is_fdb = nhi->fdb_nh;
	nh_info->has_encap = !!nhi->fib_nhc.nhc_lwtstate;
}
static int nh_notifier_single_info_init(struct nh_notifier_info *info,
					const struct nexthop *nh)
{
	struct nh_info *nhi = rtnl_dereference(nh->nh_info);

	info->type = NH_NOTIFIER_INFO_TYPE_SINGLE;
	info->nh = kzalloc(sizeof(*info->nh), GFP_KERNEL);
	if (!info->nh)
		return -ENOMEM;

	__nh_notifier_single_info_init(info->nh, nhi);

	return 0;
}

static void nh_notifier_single_info_fini(struct nh_notifier_info *info)
{
	kfree(info->nh);
}
static int nh_notifier_mpath_info_init(struct nh_notifier_info *info,
				       struct nh_group *nhg)
{
	u16 num_nh = nhg->num_nh;
	int i;

	info->type = NH_NOTIFIER_INFO_TYPE_GRP;
	info->nh_grp = kzalloc(struct_size(info->nh_grp, nh_entries, num_nh),
			       GFP_KERNEL);
	if (!info->nh_grp)
		return -ENOMEM;

	info->nh_grp->num_nh = num_nh;
	info->nh_grp->is_fdb = nhg->fdb_nh;
	info->nh_grp->hw_stats = nhg->hw_stats;

	for (i = 0; i < num_nh; i++) {
		struct nh_grp_entry *nhge = &nhg->nh_entries[i];
		struct nh_info *nhi;

		nhi = rtnl_dereference(nhge->nh->nh_info);
		info->nh_grp->nh_entries[i].weight = nhge->weight;
		__nh_notifier_single_info_init(&info->nh_grp->nh_entries[i].nh,
					       nhi);
	}

	return 0;
}
static int nh_notifier_res_table_info_init(struct nh_notifier_info *info,
					   struct nh_group *nhg)
{
	struct nh_res_table *res_table = rtnl_dereference(nhg->res_table);
	u16 num_nh_buckets = res_table->num_nh_buckets;
	unsigned long size;
	u16 i;

	info->type = NH_NOTIFIER_INFO_TYPE_RES_TABLE;
	size = struct_size(info->nh_res_table, nhs, num_nh_buckets);
	info->nh_res_table = __vmalloc(size, GFP_KERNEL | __GFP_ZERO |
				       __GFP_NOWARN);
	if (!info->nh_res_table)
		return -ENOMEM;

	info->nh_res_table->num_nh_buckets = num_nh_buckets;
	info->nh_res_table->hw_stats = nhg->hw_stats;

	for (i = 0; i < num_nh_buckets; i++) {
		struct nh_res_bucket *bucket = &res_table->nh_buckets[i];
		struct nh_grp_entry *nhge;
		struct nh_info *nhi;

		nhge = rtnl_dereference(bucket->nh_entry);
		nhi = rtnl_dereference(nhge->nh->nh_info);
		__nh_notifier_single_info_init(&info->nh_res_table->nhs[i],
					       nhi);
	}

	return 0;
}
static int nh_notifier_grp_info_init(struct nh_notifier_info *info,
				     const struct nexthop *nh)
{
	struct nh_group *nhg = rtnl_dereference(nh->nh_grp);

	if (nhg->hash_threshold)
		return nh_notifier_mpath_info_init(info, nhg);
	else if (nhg->resilient)
		return nh_notifier_res_table_info_init(info, nhg);
	return -EINVAL;
}

static void nh_notifier_grp_info_fini(struct nh_notifier_info *info,
				      const struct nexthop *nh)
{
	struct nh_group *nhg = rtnl_dereference(nh->nh_grp);

	if (nhg->hash_threshold)
		kfree(info->nh_grp);
	else if (nhg->resilient)
		vfree(info->nh_res_table);
}
static int nh_notifier_info_init(struct nh_notifier_info *info,
				 const struct nexthop *nh)
{
	info->id = nh->id;

	if (nh->is_group)
		return nh_notifier_grp_info_init(info, nh);
	else
		return nh_notifier_single_info_init(info, nh);
}

static void nh_notifier_info_fini(struct nh_notifier_info *info,
				  const struct nexthop *nh)
{
	if (nh->is_group)
		nh_notifier_grp_info_fini(info, nh);
	else
		nh_notifier_single_info_fini(info);
}
static int call_nexthop_notifiers(struct net *net,
				  enum nexthop_event_type event_type,
				  struct nexthop *nh,
				  struct netlink_ext_ack *extack)
{
	struct nh_notifier_info info = {
		.net = net,
		.extack = extack,
	};
	int err;

	if (nexthop_notifiers_is_empty(net))
		return 0;

	err = nh_notifier_info_init(&info, nh);
	if (err) {
		NL_SET_ERR_MSG(extack, "Failed to initialize nexthop notifier info");
		return err;
	}

	err = blocking_notifier_call_chain(&net->nexthop.notifier_chain,
					   event_type, &info);
	nh_notifier_info_fini(&info, nh);

	return notifier_to_errno(err);
}
static int
nh_notifier_res_bucket_idle_timer_get(const struct nh_notifier_info *info,
				      bool force, unsigned int *p_idle_timer_ms)
{
	struct nh_res_table *res_table;
	struct nh_group *nhg;
	struct nexthop *nh;
	int err = 0;

	/* When 'force' is false, nexthop bucket replacement is performed
	 * because the bucket was deemed to be idle. In this case, capable
	 * listeners can choose to perform an atomic replacement: The bucket is
	 * only replaced if it is inactive. However, if the idle timer interval
	 * is smaller than the interval in which a listener is querying
	 * buckets' activity from the device, then atomic replacement should
	 * not be tried. Pass the idle timer value to listeners, so that they
	 * could determine which type of replacement to perform.
	 */
	if (force) {
		*p_idle_timer_ms = 0;
		return 0;
	}

	rcu_read_lock();

	nh = nexthop_find_by_id(info->net, info->id);
	if (!nh) {
		err = -EINVAL;
		goto out;
	}

	nhg = rcu_dereference(nh->nh_grp);
	res_table = rcu_dereference(nhg->res_table);
	*p_idle_timer_ms = jiffies_to_msecs(res_table->idle_timer);

out:
	rcu_read_unlock();

	return err;
}
static int nh_notifier_res_bucket_info_init(struct nh_notifier_info *info,
					    u16 bucket_index, bool force,
					    struct nh_info *oldi,
					    struct nh_info *newi)
{
	unsigned int idle_timer_ms;
	int err;

	err = nh_notifier_res_bucket_idle_timer_get(info, force,
						    &idle_timer_ms);
	if (err)
		return err;

	info->type = NH_NOTIFIER_INFO_TYPE_RES_BUCKET;
	info->nh_res_bucket = kzalloc(sizeof(*info->nh_res_bucket),
				      GFP_KERNEL);
	if (!info->nh_res_bucket)
		return -ENOMEM;

	info->nh_res_bucket->bucket_index = bucket_index;
	info->nh_res_bucket->idle_timer_ms = idle_timer_ms;
	info->nh_res_bucket->force = force;
	__nh_notifier_single_info_init(&info->nh_res_bucket->old_nh, oldi);
	__nh_notifier_single_info_init(&info->nh_res_bucket->new_nh, newi);

	return 0;
}

static void nh_notifier_res_bucket_info_fini(struct nh_notifier_info *info)
{
	kfree(info->nh_res_bucket);
}
static int __call_nexthop_res_bucket_notifiers(struct net *net, u32 nhg_id,
					       u16 bucket_index, bool force,
					       struct nh_info *oldi,
					       struct nh_info *newi,
					       struct netlink_ext_ack *extack)
{
	struct nh_notifier_info info = {
		.net = net,
		.extack = extack,
		.id = nhg_id,
	};
	int err;

	if (nexthop_notifiers_is_empty(net))
		return 0;

	err = nh_notifier_res_bucket_info_init(&info, bucket_index, force,
					       oldi, newi);
	if (err)
		return err;

	err = blocking_notifier_call_chain(&net->nexthop.notifier_chain,
					   NEXTHOP_EVENT_BUCKET_REPLACE, &info);
	nh_notifier_res_bucket_info_fini(&info);

	return notifier_to_errno(err);
}
/* There are three users of RES_TABLE, and NHs etc. referenced from there:
 *
 * 1) a collection of callbacks for NH maintenance. This operates under
 *    RTNL,
 * 2) the delayed work that gradually balances the resilient table,
 * 3) and nexthop_select_path(), operating under RCU.
 *
 * Both the delayed work and the RTNL block are writers, and need to
 * maintain mutual exclusion. Since there are only two and well-known
 * writers for each table, the RTNL code can make sure it has exclusive
 * access thus:
 *
 * - Have the DW operate without locking;
 * - synchronously cancel the DW;
 * - do the writing;
 * - if the write was not actually a delete, call upkeep, which schedules
 *   DW again if necessary.
 *
 * The functions that are always called from the RTNL context use
 * rtnl_dereference(). The functions that can also be called from the DW do
 * a raw dereference and rely on the above mutual exclusion scheme.
 */
#define nh_res_dereference(p) (rcu_dereference_raw(p))
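/* Notify listeners that a single resilient-group bucket is being migrated
 * from old_nh to new_nh.
 */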
static int call_nexthop_res_bucket_notifiers(struct net *net, u32 nhg_id,
					     u16 bucket_index, bool force,
					     struct nexthop *old_nh,
					     struct nexthop *new_nh,
					     struct netlink_ext_ack *extack)
{
	struct nh_info *oldi = nh_res_dereference(old_nh->nh_info);
	struct nh_info *newi = nh_res_dereference(new_nh->nh_info);

	return __call_nexthop_res_bucket_notifiers(net, nhg_id, bucket_index,
						   force, oldi, newi, extack);
}
static int call_nexthop_res_table_notifiers(struct net *net, struct nexthop *nh,
					    struct netlink_ext_ack *extack)
{
	struct nh_notifier_info info = {
		.net = net,
		.extack = extack,
	};
	struct nh_group *nhg;
	int err;

	if (nexthop_notifiers_is_empty(net))
		return 0;

	/* At this point, the nexthop buckets are still not populated. Only
	 * emit a notification with the logical nexthops, so that a listener
	 * could potentially veto it in case of unsupported configuration.
	 */
	nhg = rtnl_dereference(nh->nh_grp);
	err = nh_notifier_mpath_info_init(&info, nhg);
	if (err) {
		NL_SET_ERR_MSG(extack, "Failed to initialize nexthop notifier info");
		return err;
	}

	err = blocking_notifier_call_chain(&net->nexthop.notifier_chain,
					   NEXTHOP_EVENT_RES_TABLE_PRE_REPLACE,
					   &info);
	kfree(info.nh_grp);

	return notifier_to_errno(err);
}
static int call_nexthop_notifier(struct notifier_block *nb, struct net *net,
				 enum nexthop_event_type event_type,
				 struct nexthop *nh,
				 struct netlink_ext_ack *extack)
{
	struct nh_notifier_info info = {
		.net = net,
		.extack = extack,
	};
	int err;

	err = nh_notifier_info_init(&info, nh);
	if (err)
		return err;

	err = nb->notifier_call(nb, event_type, &info);
	nh_notifier_info_fini(&info, nh);

	return notifier_to_errno(err);
}
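/* Fold an ifindex into NH_DEV_HASHBITS bits by XOR-ing successive chunks of
 * the value, then mask it to the size of the per-netns device hash table.
 */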
static unsigned int nh_dev_hashfn(unsigned int val)
{
	unsigned int mask = NH_DEV_HASHSIZE - 1;

	return (val ^
		(val >> NH_DEV_HASHBITS) ^
		(val >> (NH_DEV_HASHBITS * 2))) & mask;
}
static void nexthop_devhash_add(struct net *net, struct nh_info *nhi)
{
	struct net_device *dev = nhi->fib_nhc.nhc_dev;
	struct hlist_head *head;
	unsigned int hash;

	hash = nh_dev_hashfn(dev->ifindex);
	head = &net->nexthop.devhash[hash];
	hlist_add_head(&nhi->dev_hash, head);
}
static void nexthop_free_group(struct nexthop *nh)
{
	struct nh_group *nhg;
	int i;

	nhg = rcu_dereference_raw(nh->nh_grp);
	for (i = 0; i < nhg->num_nh; ++i) {
		struct nh_grp_entry *nhge = &nhg->nh_entries[i];

		WARN_ON(!list_empty(&nhge->nh_list));
		free_percpu(nhge->stats);
		nexthop_put(nhge->nh);
	}

	WARN_ON(nhg->spare == nhg);

	if (nhg->resilient)
		vfree(rcu_dereference_raw(nhg->res_table));

	kfree(nhg->spare);
	kfree(nhg);
}
static void nexthop_free_single(struct nexthop *nh)
{
	struct nh_info *nhi;

	nhi = rcu_dereference_raw(nh->nh_info);
	switch (nhi->family) {
	case AF_INET:
		fib_nh_release(nh->net, &nhi->fib_nh);
		break;
	case AF_INET6:
		ipv6_stub->fib6_nh_release(&nhi->fib6_nh);
		break;
	}
	kfree(nhi);
}

void nexthop_free_rcu(struct rcu_head *head)
{
	struct nexthop *nh = container_of(head, struct nexthop, rcu);

	if (nh->is_group)
		nexthop_free_group(nh);
	else
		nexthop_free_single(nh);

	kfree(nh);
}
EXPORT_SYMBOL_GPL(nexthop_free_rcu);
static struct nexthop *nexthop_alloc(void)
{
	struct nexthop *nh;

	nh = kzalloc(sizeof(struct nexthop), GFP_KERNEL);
	if (nh) {
		INIT_LIST_HEAD(&nh->fi_list);
		INIT_LIST_HEAD(&nh->f6i_list);
		INIT_LIST_HEAD(&nh->grp_list);
		INIT_LIST_HEAD(&nh->fdb_list);
	}
	return nh;
}
static struct nh_group *nexthop_grp_alloc(u16 num_nh)
{
	struct nh_group *nhg;

	nhg = kzalloc(struct_size(nhg, nh_entries, num_nh), GFP_KERNEL);
	if (nhg)
		nhg->num_nh = num_nh;

	return nhg;
}

static void nh_res_table_upkeep_dw(struct work_struct *work);
static struct nh_res_table *
nexthop_res_table_alloc(struct net *net, u32 nhg_id, struct nh_config *cfg)
{
	const u16 num_nh_buckets = cfg->nh_grp_res_num_buckets;
	struct nh_res_table *res_table;
	unsigned long size;

	size = struct_size(res_table, nh_buckets, num_nh_buckets);
	res_table = __vmalloc(size, GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN);
	if (!res_table)
		return NULL;

	res_table->net = net;
	res_table->nhg_id = nhg_id;
	INIT_DELAYED_WORK(&res_table->upkeep_dw, &nh_res_table_upkeep_dw);
	INIT_LIST_HEAD(&res_table->uw_nh_entries);
	res_table->idle_timer = cfg->nh_grp_res_idle_timer;
	res_table->unbalanced_timer = cfg->nh_grp_res_unbalanced_timer;
	res_table->num_nh_buckets = num_nh_buckets;
	return res_table;
}
static void nh_base_seq_inc(struct net *net)
{
	while (++net->nexthop.seq == 0)
		;
}
/* no reference taken; rcu lock or rtnl must be held */
struct nexthop *nexthop_find_by_id(struct net *net, u32 id)
{
	struct rb_node **pp, *parent = NULL, *next;

	pp = &net->nexthop.rb_root.rb_node;
	while (1) {
		struct nexthop *nh;

		next = rcu_dereference_raw(*pp);
		if (!next)
			break;
		parent = next;

		nh = rb_entry(parent, struct nexthop, rb_node);
		if (id < nh->id)
			pp = &next->rb_left;
		else if (id > nh->id)
			pp = &next->rb_right;
		else
			return nh;
	}
	return NULL;
}
EXPORT_SYMBOL_GPL(nexthop_find_by_id);
/* used for auto id allocation; called with rtnl held */
static u32 nh_find_unused_id(struct net *net)
{
	u32 id_start = net->nexthop.last_id_allocated;

	while (1) {
		net->nexthop.last_id_allocated++;
		if (net->nexthop.last_id_allocated == id_start)
			break;

		if (!nexthop_find_by_id(net, net->nexthop.last_id_allocated))
			return net->nexthop.last_id_allocated;
	}
	return 0;
}
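/* Resilient-group upkeep tracks the earliest point in time at which some
 * bucket may need to be migrated; nh_res_time_set_deadline() narrows that
 * deadline as individual bucket time points are examined.
 */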
static void nh_res_time_set_deadline(unsigned long next_time,
				     unsigned long *deadline)
{
	if (time_before(next_time, *deadline))
		*deadline = next_time;
}

static clock_t nh_res_table_unbalanced_time(struct nh_res_table *res_table)
{
	if (list_empty(&res_table->uw_nh_entries))
		return 0;
	return jiffies_delta_to_clock_t(jiffies - res_table->unbalanced_since);
}
static int nla_put_nh_group_res(struct sk_buff *skb, struct nh_group *nhg)
{
	struct nh_res_table *res_table = rtnl_dereference(nhg->res_table);
	struct nlattr *nest;

	nest = nla_nest_start(skb, NHA_RES_GROUP);
	if (!nest)
		return -EMSGSIZE;

	if (nla_put_u16(skb, NHA_RES_GROUP_BUCKETS,
			res_table->num_nh_buckets) ||
	    nla_put_u32(skb, NHA_RES_GROUP_IDLE_TIMER,
			jiffies_to_clock_t(res_table->idle_timer)) ||
	    nla_put_u32(skb, NHA_RES_GROUP_UNBALANCED_TIMER,
			jiffies_to_clock_t(res_table->unbalanced_timer)) ||
	    nla_put_u64_64bit(skb, NHA_RES_GROUP_UNBALANCED_TIME,
			      nh_res_table_unbalanced_time(res_table),
			      NHA_RES_GROUP_PAD))
		goto nla_put_failure;

	nla_nest_end(skb, nest);
	return 0;

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -EMSGSIZE;
}
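/* Per-nexthop packet counters are kept per CPU; writers bump them under the
 * u64_stats syncp, and readers sum all CPUs using the fetch/retry helpers.
 */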
static void nh_grp_entry_stats_inc(struct nh_grp_entry *nhge)
{
	struct nh_grp_entry_stats *cpu_stats;

	cpu_stats = get_cpu_ptr(nhge->stats);
	u64_stats_update_begin(&cpu_stats->syncp);
	u64_stats_inc(&cpu_stats->packets);
	u64_stats_update_end(&cpu_stats->syncp);
	put_cpu_ptr(cpu_stats);
}
static void nh_grp_entry_stats_read(struct nh_grp_entry *nhge,
				    u64 *ret_packets)
{
	int i;

	*ret_packets = 0;

	for_each_possible_cpu(i) {
		struct nh_grp_entry_stats *cpu_stats;
		unsigned int start;
		u64 packets;

		cpu_stats = per_cpu_ptr(nhge->stats, i);
		do {
			start = u64_stats_fetch_begin(&cpu_stats->syncp);
			packets = u64_stats_read(&cpu_stats->packets);
		} while (u64_stats_fetch_retry(&cpu_stats->syncp, start));

		*ret_packets += packets;
	}
}
static int nh_notifier_grp_hw_stats_init(struct nh_notifier_info *info,
					 const struct nexthop *nh)
{
	struct nh_group *nhg;
	int i;

	nhg = rtnl_dereference(nh->nh_grp);

	info->id = nh->id;
	info->type = NH_NOTIFIER_INFO_TYPE_GRP_HW_STATS;
	info->nh_grp_hw_stats = kzalloc(struct_size(info->nh_grp_hw_stats,
						    stats, nhg->num_nh),
					GFP_KERNEL);
	if (!info->nh_grp_hw_stats)
		return -ENOMEM;

	info->nh_grp_hw_stats->num_nh = nhg->num_nh;
	for (i = 0; i < nhg->num_nh; i++) {
		struct nh_grp_entry *nhge = &nhg->nh_entries[i];

		info->nh_grp_hw_stats->stats[i].id = nhge->nh->id;
	}

	return 0;
}

static void nh_notifier_grp_hw_stats_fini(struct nh_notifier_info *info)
{
	kfree(info->nh_grp_hw_stats);
}
void nh_grp_hw_stats_report_delta(struct nh_notifier_grp_hw_stats_info *info,
				  unsigned int nh_idx,
				  u64 delta_packets)
{
	info->hw_stats_used = true;
	info->stats[nh_idx].packets += delta_packets;
}
EXPORT_SYMBOL(nh_grp_hw_stats_report_delta);
static void nh_grp_hw_stats_apply_update(struct nexthop *nh,
					 struct nh_notifier_info *info)
{
	struct nh_group *nhg;
	int i;

	nhg = rtnl_dereference(nh->nh_grp);

	for (i = 0; i < nhg->num_nh; i++) {
		struct nh_grp_entry *nhge = &nhg->nh_entries[i];

		nhge->packets_hw += info->nh_grp_hw_stats->stats[i].packets;
	}
}
static int nh_grp_hw_stats_update(struct nexthop *nh, bool *hw_stats_used)
{
	struct nh_notifier_info info = {
		.net = nh->net,
	};
	struct net *net = nh->net;
	int err;

	if (nexthop_notifiers_is_empty(net)) {
		*hw_stats_used = false;
		return 0;
	}

	err = nh_notifier_grp_hw_stats_init(&info, nh);
	if (err)
		return err;

	err = blocking_notifier_call_chain(&net->nexthop.notifier_chain,
					   NEXTHOP_EVENT_HW_STATS_REPORT_DELTA,
					   &info);

	/* Cache whatever we got, even if there was an error, otherwise the
	 * successful stats retrievals would get lost.
	 */
	nh_grp_hw_stats_apply_update(nh, &info);
	*hw_stats_used = info.nh_grp_hw_stats->hw_stats_used;

	nh_notifier_grp_hw_stats_fini(&info);
	return notifier_to_errno(err);
}
static int nla_put_nh_group_stats_entry(struct sk_buff *skb,
					struct nh_grp_entry *nhge,
					u32 op_flags)
{
	struct nlattr *nest;
	u64 packets;

	nh_grp_entry_stats_read(nhge, &packets);

	nest = nla_nest_start(skb, NHA_GROUP_STATS_ENTRY);
	if (!nest)
		return -EMSGSIZE;

	if (nla_put_u32(skb, NHA_GROUP_STATS_ENTRY_ID, nhge->nh->id) ||
	    nla_put_uint(skb, NHA_GROUP_STATS_ENTRY_PACKETS,
			 packets + nhge->packets_hw))
		goto nla_put_failure;

	if (op_flags & NHA_OP_FLAG_DUMP_HW_STATS &&
	    nla_put_uint(skb, NHA_GROUP_STATS_ENTRY_PACKETS_HW,
			 nhge->packets_hw))
		goto nla_put_failure;

	nla_nest_end(skb, nest);
	return 0;

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -EMSGSIZE;
}
static int nla_put_nh_group_stats(struct sk_buff *skb, struct nexthop *nh,
				  u32 op_flags)
{
	struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
	struct nlattr *nest;
	bool hw_stats_used;
	int err;
	int i;

	if (nla_put_u32(skb, NHA_HW_STATS_ENABLE, nhg->hw_stats))
		goto err_out;

	if (op_flags & NHA_OP_FLAG_DUMP_HW_STATS &&
	    nhg->hw_stats) {
		err = nh_grp_hw_stats_update(nh, &hw_stats_used);
		if (err)
			goto out;

		if (nla_put_u32(skb, NHA_HW_STATS_USED, hw_stats_used))
			goto err_out;
	}

	nest = nla_nest_start(skb, NHA_GROUP_STATS);
	if (!nest)
		goto err_out;

	for (i = 0; i < nhg->num_nh; i++)
		if (nla_put_nh_group_stats_entry(skb, &nhg->nh_entries[i],
						 op_flags))
			goto cancel_out;

	nla_nest_end(skb, nest);
	return 0;

cancel_out:
	nla_nest_cancel(skb, nest);
err_out:
	err = -EMSGSIZE;
out:
	return err;
}
static int nla_put_nh_group(struct sk_buff *skb, struct nexthop *nh,
			    u32 op_flags, u32 *resp_op_flags)
{
	struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
	struct nexthop_grp *p;
	size_t len = nhg->num_nh * sizeof(*p);
	struct nlattr *nla;
	u16 group_type = 0;
	u16 weight;
	int i;

	*resp_op_flags |= NHA_OP_FLAG_RESP_GRP_RESVD_0;

	if (nhg->hash_threshold)
		group_type = NEXTHOP_GRP_TYPE_MPATH;
	else if (nhg->resilient)
		group_type = NEXTHOP_GRP_TYPE_RES;

	if (nla_put_u16(skb, NHA_GROUP_TYPE, group_type))
		goto nla_put_failure;

	nla = nla_reserve(skb, NHA_GROUP, len);
	if (!nla)
		goto nla_put_failure;

	p = nla_data(nla);
	for (i = 0; i < nhg->num_nh; ++i) {
		weight = nhg->nh_entries[i].weight - 1;

		*p++ = (struct nexthop_grp) {
			.id = nhg->nh_entries[i].nh->id,
			.weight = weight,
			.weight_high = weight >> 8,
		};
	}

	if (nhg->resilient && nla_put_nh_group_res(skb, nhg))
		goto nla_put_failure;

	if (op_flags & NHA_OP_FLAG_DUMP_STATS &&
	    (nla_put_u32(skb, NHA_HW_STATS_ENABLE, nhg->hw_stats) ||
	     nla_put_nh_group_stats(skb, nh, op_flags)))
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
917 static int nh_fill_node(struct sk_buff
*skb
, struct nexthop
*nh
,
918 int event
, u32 portid
, u32 seq
, unsigned int nlflags
,
921 struct fib6_nh
*fib6_nh
;
922 struct fib_nh
*fib_nh
;
923 struct nlmsghdr
*nlh
;
927 nlh
= nlmsg_put(skb
, portid
, seq
, event
, sizeof(*nhm
), nlflags
);
931 nhm
= nlmsg_data(nlh
);
932 nhm
->nh_family
= AF_UNSPEC
;
933 nhm
->nh_flags
= nh
->nh_flags
;
934 nhm
->nh_protocol
= nh
->protocol
;
938 if (nla_put_u32(skb
, NHA_ID
, nh
->id
))
939 goto nla_put_failure
;
942 struct nh_group
*nhg
= rtnl_dereference(nh
->nh_grp
);
943 u32 resp_op_flags
= 0;
945 if (nhg
->fdb_nh
&& nla_put_flag(skb
, NHA_FDB
))
946 goto nla_put_failure
;
947 if (nla_put_nh_group(skb
, nh
, op_flags
, &resp_op_flags
) ||
948 nla_put_u32(skb
, NHA_OP_FLAGS
, resp_op_flags
))
949 goto nla_put_failure
;
953 nhi
= rtnl_dereference(nh
->nh_info
);
954 nhm
->nh_family
= nhi
->family
;
955 if (nhi
->reject_nh
) {
956 if (nla_put_flag(skb
, NHA_BLACKHOLE
))
957 goto nla_put_failure
;
959 } else if (nhi
->fdb_nh
) {
960 if (nla_put_flag(skb
, NHA_FDB
))
961 goto nla_put_failure
;
963 const struct net_device
*dev
;
965 dev
= nhi
->fib_nhc
.nhc_dev
;
966 if (dev
&& nla_put_u32(skb
, NHA_OIF
, dev
->ifindex
))
967 goto nla_put_failure
;
970 nhm
->nh_scope
= nhi
->fib_nhc
.nhc_scope
;
971 switch (nhi
->family
) {
973 fib_nh
= &nhi
->fib_nh
;
974 if (fib_nh
->fib_nh_gw_family
&&
975 nla_put_be32(skb
, NHA_GATEWAY
, fib_nh
->fib_nh_gw4
))
976 goto nla_put_failure
;
980 fib6_nh
= &nhi
->fib6_nh
;
981 if (fib6_nh
->fib_nh_gw_family
&&
982 nla_put_in6_addr(skb
, NHA_GATEWAY
, &fib6_nh
->fib_nh_gw6
))
983 goto nla_put_failure
;
987 if (nhi
->fib_nhc
.nhc_lwtstate
&&
988 lwtunnel_fill_encap(skb
, nhi
->fib_nhc
.nhc_lwtstate
,
989 NHA_ENCAP
, NHA_ENCAP_TYPE
) < 0)
990 goto nla_put_failure
;
997 nlmsg_cancel(skb
, nlh
);
1001 static size_t nh_nlmsg_size_grp_res(struct nh_group
*nhg
)
1003 return nla_total_size(0) + /* NHA_RES_GROUP */
1004 nla_total_size(2) + /* NHA_RES_GROUP_BUCKETS */
1005 nla_total_size(4) + /* NHA_RES_GROUP_IDLE_TIMER */
1006 nla_total_size(4) + /* NHA_RES_GROUP_UNBALANCED_TIMER */
1007 nla_total_size_64bit(8);/* NHA_RES_GROUP_UNBALANCED_TIME */
1010 static size_t nh_nlmsg_size_grp(struct nexthop
*nh
)
1012 struct nh_group
*nhg
= rtnl_dereference(nh
->nh_grp
);
1013 size_t sz
= sizeof(struct nexthop_grp
) * nhg
->num_nh
;
1014 size_t tot
= nla_total_size(sz
) +
1015 nla_total_size(2); /* NHA_GROUP_TYPE */
1018 tot
+= nh_nlmsg_size_grp_res(nhg
);
1023 static size_t nh_nlmsg_size_single(struct nexthop
*nh
)
1025 struct nh_info
*nhi
= rtnl_dereference(nh
->nh_info
);
1028 /* covers NHA_BLACKHOLE since NHA_OIF and BLACKHOLE
1029 * are mutually exclusive
1031 sz
= nla_total_size(4); /* NHA_OIF */
1033 switch (nhi
->family
) {
1035 if (nhi
->fib_nh
.fib_nh_gw_family
)
1036 sz
+= nla_total_size(4); /* NHA_GATEWAY */
1041 if (nhi
->fib6_nh
.fib_nh_gw_family
)
1042 sz
+= nla_total_size(sizeof(const struct in6_addr
));
1046 if (nhi
->fib_nhc
.nhc_lwtstate
) {
1047 sz
+= lwtunnel_get_encap_size(nhi
->fib_nhc
.nhc_lwtstate
);
1048 sz
+= nla_total_size(2); /* NHA_ENCAP_TYPE */
1054 static size_t nh_nlmsg_size(struct nexthop
*nh
)
1056 size_t sz
= NLMSG_ALIGN(sizeof(struct nhmsg
));
1058 sz
+= nla_total_size(4); /* NHA_ID */
1061 sz
+= nh_nlmsg_size_grp(nh
) +
1062 nla_total_size(4) + /* NHA_OP_FLAGS */
1065 sz
+= nh_nlmsg_size_single(nh
);
1070 static void nexthop_notify(int event
, struct nexthop
*nh
, struct nl_info
*info
)
1072 unsigned int nlflags
= info
->nlh
? info
->nlh
->nlmsg_flags
: 0;
1073 u32 seq
= info
->nlh
? info
->nlh
->nlmsg_seq
: 0;
1074 struct sk_buff
*skb
;
1077 skb
= nlmsg_new(nh_nlmsg_size(nh
), gfp_any());
1081 err
= nh_fill_node(skb
, nh
, event
, info
->portid
, seq
, nlflags
, 0);
1083 /* -EMSGSIZE implies BUG in nh_nlmsg_size() */
1084 WARN_ON(err
== -EMSGSIZE
);
1089 rtnl_notify(skb
, info
->nl_net
, info
->portid
, RTNLGRP_NEXTHOP
,
1090 info
->nlh
, gfp_any());
1093 rtnl_set_sk_err(info
->nl_net
, RTNLGRP_NEXTHOP
, err
);
1096 static unsigned long nh_res_bucket_used_time(const struct nh_res_bucket
*bucket
)
1098 return (unsigned long)atomic_long_read(&bucket
->used_time
);
1101 static unsigned long
1102 nh_res_bucket_idle_point(const struct nh_res_table
*res_table
,
1103 const struct nh_res_bucket
*bucket
,
1106 unsigned long time
= nh_res_bucket_used_time(bucket
);
1108 /* Bucket was not used since it was migrated. The idle time is now. */
1109 if (time
== bucket
->migrated_time
)
1112 return time
+ res_table
->idle_timer
;
1115 static unsigned long
1116 nh_res_table_unb_point(const struct nh_res_table
*res_table
)
1118 return res_table
->unbalanced_since
+ res_table
->unbalanced_timer
;
1121 static void nh_res_bucket_set_idle(const struct nh_res_table
*res_table
,
1122 struct nh_res_bucket
*bucket
)
1124 unsigned long now
= jiffies
;
1126 atomic_long_set(&bucket
->used_time
, (long)now
);
1127 bucket
->migrated_time
= now
;
1130 static void nh_res_bucket_set_busy(struct nh_res_bucket
*bucket
)
1132 atomic_long_set(&bucket
->used_time
, (long)jiffies
);
1135 static clock_t nh_res_bucket_idle_time(const struct nh_res_bucket
*bucket
)
1137 unsigned long used_time
= nh_res_bucket_used_time(bucket
);
1139 return jiffies_delta_to_clock_t(jiffies
- used_time
);
1142 static int nh_fill_res_bucket(struct sk_buff
*skb
, struct nexthop
*nh
,
1143 struct nh_res_bucket
*bucket
, u16 bucket_index
,
1144 int event
, u32 portid
, u32 seq
,
1145 unsigned int nlflags
,
1146 struct netlink_ext_ack
*extack
)
1148 struct nh_grp_entry
*nhge
= nh_res_dereference(bucket
->nh_entry
);
1149 struct nlmsghdr
*nlh
;
1150 struct nlattr
*nest
;
1153 nlh
= nlmsg_put(skb
, portid
, seq
, event
, sizeof(*nhm
), nlflags
);
1157 nhm
= nlmsg_data(nlh
);
1158 nhm
->nh_family
= AF_UNSPEC
;
1159 nhm
->nh_flags
= bucket
->nh_flags
;
1160 nhm
->nh_protocol
= nh
->protocol
;
1164 if (nla_put_u32(skb
, NHA_ID
, nh
->id
))
1165 goto nla_put_failure
;
1167 nest
= nla_nest_start(skb
, NHA_RES_BUCKET
);
1169 goto nla_put_failure
;
1171 if (nla_put_u16(skb
, NHA_RES_BUCKET_INDEX
, bucket_index
) ||
1172 nla_put_u32(skb
, NHA_RES_BUCKET_NH_ID
, nhge
->nh
->id
) ||
1173 nla_put_u64_64bit(skb
, NHA_RES_BUCKET_IDLE_TIME
,
1174 nh_res_bucket_idle_time(bucket
),
1175 NHA_RES_BUCKET_PAD
))
1176 goto nla_put_failure_nest
;
1178 nla_nest_end(skb
, nest
);
1179 nlmsg_end(skb
, nlh
);
1182 nla_put_failure_nest
:
1183 nla_nest_cancel(skb
, nest
);
1185 nlmsg_cancel(skb
, nlh
);
1189 static void nexthop_bucket_notify(struct nh_res_table
*res_table
,
1192 struct nh_res_bucket
*bucket
= &res_table
->nh_buckets
[bucket_index
];
1193 struct nh_grp_entry
*nhge
= nh_res_dereference(bucket
->nh_entry
);
1194 struct nexthop
*nh
= nhge
->nh_parent
;
1195 struct sk_buff
*skb
;
1198 skb
= alloc_skb(NLMSG_GOODSIZE
, GFP_KERNEL
);
1202 err
= nh_fill_res_bucket(skb
, nh
, bucket
, bucket_index
,
1203 RTM_NEWNEXTHOPBUCKET
, 0, 0, NLM_F_REPLACE
,
1210 rtnl_notify(skb
, nh
->net
, 0, RTNLGRP_NEXTHOP
, NULL
, GFP_KERNEL
);
1213 rtnl_set_sk_err(nh
->net
, RTNLGRP_NEXTHOP
, err
);
1216 static bool valid_group_nh(struct nexthop
*nh
, unsigned int npaths
,
1217 bool *is_fdb
, struct netlink_ext_ack
*extack
)
1220 struct nh_group
*nhg
= rtnl_dereference(nh
->nh_grp
);
1222 /* Nesting groups within groups is not supported. */
1223 if (nhg
->hash_threshold
) {
1224 NL_SET_ERR_MSG(extack
,
1225 "Hash-threshold group can not be a nexthop within a group");
1228 if (nhg
->resilient
) {
1229 NL_SET_ERR_MSG(extack
,
1230 "Resilient group can not be a nexthop within a group");
1233 *is_fdb
= nhg
->fdb_nh
;
1235 struct nh_info
*nhi
= rtnl_dereference(nh
->nh_info
);
1237 if (nhi
->reject_nh
&& npaths
> 1) {
1238 NL_SET_ERR_MSG(extack
,
1239 "Blackhole nexthop can not be used in a group with more than 1 path");
1242 *is_fdb
= nhi
->fdb_nh
;
1248 static int nh_check_attr_fdb_group(struct nexthop
*nh
, u8
*nh_family
,
1249 struct netlink_ext_ack
*extack
)
1251 struct nh_info
*nhi
;
1253 nhi
= rtnl_dereference(nh
->nh_info
);
1256 NL_SET_ERR_MSG(extack
, "FDB nexthop group can only have fdb nexthops");
1260 if (*nh_family
== AF_UNSPEC
) {
1261 *nh_family
= nhi
->family
;
1262 } else if (*nh_family
!= nhi
->family
) {
1263 NL_SET_ERR_MSG(extack
, "FDB nexthop group cannot have mixed family nexthops");
1270 static int nh_check_attr_group(struct net
*net
,
1271 struct nlattr
*tb
[], size_t tb_size
,
1272 u16 nh_grp_type
, struct netlink_ext_ack
*extack
)
1274 unsigned int len
= nla_len(tb
[NHA_GROUP
]);
1275 u8 nh_family
= AF_UNSPEC
;
1276 struct nexthop_grp
*nhg
;
1280 if (!len
|| len
& (sizeof(struct nexthop_grp
) - 1)) {
1281 NL_SET_ERR_MSG(extack
,
1282 "Invalid length for nexthop group attribute");
1286 /* convert len to number of nexthop ids */
1287 len
/= sizeof(*nhg
);
1289 nhg
= nla_data(tb
[NHA_GROUP
]);
1290 for (i
= 0; i
< len
; ++i
) {
1291 if (nhg
[i
].resvd2
) {
1292 NL_SET_ERR_MSG(extack
, "Reserved field in nexthop_grp must be 0");
1295 if (nexthop_grp_weight(&nhg
[i
]) == 0) {
1296 /* 0xffff got passed in, representing weight of 0x10000,
1297 * which is too heavy.
1299 NL_SET_ERR_MSG(extack
, "Invalid value for weight");
1302 for (j
= i
+ 1; j
< len
; ++j
) {
1303 if (nhg
[i
].id
== nhg
[j
].id
) {
1304 NL_SET_ERR_MSG(extack
, "Nexthop id can not be used twice in a group");
1312 nhg
= nla_data(tb
[NHA_GROUP
]);
1313 for (i
= 0; i
< len
; ++i
) {
1317 nh
= nexthop_find_by_id(net
, nhg
[i
].id
);
1319 NL_SET_ERR_MSG(extack
, "Invalid nexthop id");
1322 if (!valid_group_nh(nh
, len
, &is_fdb_nh
, extack
))
1325 if (nhg_fdb
&& nh_check_attr_fdb_group(nh
, &nh_family
, extack
))
1328 if (!nhg_fdb
&& is_fdb_nh
) {
1329 NL_SET_ERR_MSG(extack
, "Non FDB nexthop group cannot have fdb nexthops");
1333 for (i
= NHA_GROUP_TYPE
+ 1; i
< tb_size
; ++i
) {
1337 case NHA_HW_STATS_ENABLE
:
1341 if (nh_grp_type
== NEXTHOP_GRP_TYPE_RES
)
1345 NL_SET_ERR_MSG(extack
,
1346 "No other attributes can be set in nexthop groups");
1353 static bool ipv6_good_nh(const struct fib6_nh
*nh
)
1355 int state
= NUD_REACHABLE
;
1356 struct neighbour
*n
;
1360 n
= __ipv6_neigh_lookup_noref_stub(nh
->fib_nh_dev
, &nh
->fib_nh_gw6
);
1362 state
= READ_ONCE(n
->nud_state
);
1366 return !!(state
& NUD_VALID
);
1369 static bool ipv4_good_nh(const struct fib_nh
*nh
)
1371 int state
= NUD_REACHABLE
;
1372 struct neighbour
*n
;
1376 n
= __ipv4_neigh_lookup_noref(nh
->fib_nh_dev
,
1377 (__force u32
)nh
->fib_nh_gw4
);
1379 state
= READ_ONCE(n
->nud_state
);
1383 return !!(state
& NUD_VALID
);
1386 static bool nexthop_is_good_nh(const struct nexthop
*nh
)
1388 struct nh_info
*nhi
= rcu_dereference(nh
->nh_info
);
1390 switch (nhi
->family
) {
1392 return ipv4_good_nh(&nhi
->fib_nh
);
1394 return ipv6_good_nh(&nhi
->fib6_nh
);
1400 static struct nexthop
*nexthop_select_path_fdb(struct nh_group
*nhg
, int hash
)
1404 for (i
= 0; i
< nhg
->num_nh
; i
++) {
1405 struct nh_grp_entry
*nhge
= &nhg
->nh_entries
[i
];
1407 if (hash
> atomic_read(&nhge
->hthr
.upper_bound
))
1410 nh_grp_entry_stats_inc(nhge
);
1418 static struct nexthop
*nexthop_select_path_hthr(struct nh_group
*nhg
, int hash
)
1420 struct nh_grp_entry
*nhge0
= NULL
;
1424 return nexthop_select_path_fdb(nhg
, hash
);
1426 for (i
= 0; i
< nhg
->num_nh
; ++i
) {
1427 struct nh_grp_entry
*nhge
= &nhg
->nh_entries
[i
];
1429 /* nexthops always check if it is good and does
1430 * not rely on a sysctl for this behavior
1432 if (!nexthop_is_good_nh(nhge
->nh
))
1438 if (hash
> atomic_read(&nhge
->hthr
.upper_bound
))
1441 nh_grp_entry_stats_inc(nhge
);
1446 nhge0
= &nhg
->nh_entries
[0];
1447 nh_grp_entry_stats_inc(nhge0
);
1451 static struct nexthop
*nexthop_select_path_res(struct nh_group
*nhg
, int hash
)
1453 struct nh_res_table
*res_table
= rcu_dereference(nhg
->res_table
);
1454 u16 bucket_index
= hash
% res_table
->num_nh_buckets
;
1455 struct nh_res_bucket
*bucket
;
1456 struct nh_grp_entry
*nhge
;
1458 /* nexthop_select_path() is expected to return a non-NULL value, so
1459 * skip protocol validation and just hand out whatever there is.
1461 bucket
= &res_table
->nh_buckets
[bucket_index
];
1462 nh_res_bucket_set_busy(bucket
);
1463 nhge
= rcu_dereference(bucket
->nh_entry
);
1464 nh_grp_entry_stats_inc(nhge
);
1468 struct nexthop
*nexthop_select_path(struct nexthop
*nh
, int hash
)
1470 struct nh_group
*nhg
;
1475 nhg
= rcu_dereference(nh
->nh_grp
);
1476 if (nhg
->hash_threshold
)
1477 return nexthop_select_path_hthr(nhg
, hash
);
1478 else if (nhg
->resilient
)
1479 return nexthop_select_path_res(nhg
, hash
);
1484 EXPORT_SYMBOL_GPL(nexthop_select_path
);
1486 int nexthop_for_each_fib6_nh(struct nexthop
*nh
,
1487 int (*cb
)(struct fib6_nh
*nh
, void *arg
),
1490 struct nh_info
*nhi
;
1494 struct nh_group
*nhg
;
1497 nhg
= rcu_dereference_rtnl(nh
->nh_grp
);
1498 for (i
= 0; i
< nhg
->num_nh
; i
++) {
1499 struct nh_grp_entry
*nhge
= &nhg
->nh_entries
[i
];
1501 nhi
= rcu_dereference_rtnl(nhge
->nh
->nh_info
);
1502 err
= cb(&nhi
->fib6_nh
, arg
);
1507 nhi
= rcu_dereference_rtnl(nh
->nh_info
);
1508 err
= cb(&nhi
->fib6_nh
, arg
);
1515 EXPORT_SYMBOL_GPL(nexthop_for_each_fib6_nh
);
1517 static int check_src_addr(const struct in6_addr
*saddr
,
1518 struct netlink_ext_ack
*extack
)
1520 if (!ipv6_addr_any(saddr
)) {
1521 NL_SET_ERR_MSG(extack
, "IPv6 routes using source address can not use nexthop objects");
1527 int fib6_check_nexthop(struct nexthop
*nh
, struct fib6_config
*cfg
,
1528 struct netlink_ext_ack
*extack
)
1530 struct nh_info
*nhi
;
1533 /* fib6_src is unique to a fib6_info and limits the ability to cache
1534 * routes in fib6_nh within a nexthop that is potentially shared
1535 * across multiple fib entries. If the config wants to use source
1536 * routing it can not use nexthop objects. mlxsw also does not allow
1537 * fib6_src on routes.
1539 if (cfg
&& check_src_addr(&cfg
->fc_src
, extack
) < 0)
1543 struct nh_group
*nhg
;
1545 nhg
= rtnl_dereference(nh
->nh_grp
);
1548 is_fdb_nh
= nhg
->fdb_nh
;
1550 nhi
= rtnl_dereference(nh
->nh_info
);
1551 if (nhi
->family
== AF_INET
)
1553 is_fdb_nh
= nhi
->fdb_nh
;
1557 NL_SET_ERR_MSG(extack
, "Route cannot point to a fdb nexthop");
1563 NL_SET_ERR_MSG(extack
, "IPv6 routes can not use an IPv4 nexthop");
1566 EXPORT_SYMBOL_GPL(fib6_check_nexthop
);
1568 /* if existing nexthop has ipv6 routes linked to it, need
1569 * to verify this new spec works with ipv6
1571 static int fib6_check_nh_list(struct nexthop
*old
, struct nexthop
*new,
1572 struct netlink_ext_ack
*extack
)
1574 struct fib6_info
*f6i
;
1576 if (list_empty(&old
->f6i_list
))
1579 list_for_each_entry(f6i
, &old
->f6i_list
, nh_list
) {
1580 if (check_src_addr(&f6i
->fib6_src
.addr
, extack
) < 0)
1584 return fib6_check_nexthop(new, NULL
, extack
);
1587 static int nexthop_check_scope(struct nh_info
*nhi
, u8 scope
,
1588 struct netlink_ext_ack
*extack
)
1590 if (scope
== RT_SCOPE_HOST
&& nhi
->fib_nhc
.nhc_gw_family
) {
1591 NL_SET_ERR_MSG(extack
,
1592 "Route with host scope can not have a gateway");
1596 if (nhi
->fib_nhc
.nhc_flags
& RTNH_F_ONLINK
&& scope
>= RT_SCOPE_LINK
) {
1597 NL_SET_ERR_MSG(extack
, "Scope mismatch with nexthop");
1604 /* Invoked by fib add code to verify nexthop by id is ok with
1605 * config for prefix; parts of fib_check_nh not done when nexthop
1608 int fib_check_nexthop(struct nexthop
*nh
, u8 scope
,
1609 struct netlink_ext_ack
*extack
)
1611 struct nh_info
*nhi
;
1615 struct nh_group
*nhg
;
1617 nhg
= rtnl_dereference(nh
->nh_grp
);
1619 NL_SET_ERR_MSG(extack
, "Route cannot point to a fdb nexthop");
1624 if (scope
== RT_SCOPE_HOST
) {
1625 NL_SET_ERR_MSG(extack
, "Route with host scope can not have multiple nexthops");
1630 /* all nexthops in a group have the same scope */
1631 nhi
= rtnl_dereference(nhg
->nh_entries
[0].nh
->nh_info
);
1632 err
= nexthop_check_scope(nhi
, scope
, extack
);
1634 nhi
= rtnl_dereference(nh
->nh_info
);
1636 NL_SET_ERR_MSG(extack
, "Route cannot point to a fdb nexthop");
1640 err
= nexthop_check_scope(nhi
, scope
, extack
);
1647 static int fib_check_nh_list(struct nexthop
*old
, struct nexthop
*new,
1648 struct netlink_ext_ack
*extack
)
1650 struct fib_info
*fi
;
1652 list_for_each_entry(fi
, &old
->fi_list
, nh_list
) {
1655 err
= fib_check_nexthop(new, fi
->fib_scope
, extack
);
1662 static bool nh_res_nhge_is_balanced(const struct nh_grp_entry
*nhge
)
1664 return nhge
->res
.count_buckets
== nhge
->res
.wants_buckets
;
1667 static bool nh_res_nhge_is_ow(const struct nh_grp_entry
*nhge
)
1669 return nhge
->res
.count_buckets
> nhge
->res
.wants_buckets
;
1672 static bool nh_res_nhge_is_uw(const struct nh_grp_entry
*nhge
)
1674 return nhge
->res
.count_buckets
< nhge
->res
.wants_buckets
;
1677 static bool nh_res_table_is_balanced(const struct nh_res_table
*res_table
)
1679 return list_empty(&res_table
->uw_nh_entries
);
1682 static void nh_res_bucket_unset_nh(struct nh_res_bucket
*bucket
)
1684 struct nh_grp_entry
*nhge
;
1686 if (bucket
->occupied
) {
1687 nhge
= nh_res_dereference(bucket
->nh_entry
);
1688 nhge
->res
.count_buckets
--;
1689 bucket
->occupied
= false;
1693 static void nh_res_bucket_set_nh(struct nh_res_bucket
*bucket
,
1694 struct nh_grp_entry
*nhge
)
1696 nh_res_bucket_unset_nh(bucket
);
1698 bucket
->occupied
= true;
1699 rcu_assign_pointer(bucket
->nh_entry
, nhge
);
1700 nhge
->res
.count_buckets
++;
1703 static bool nh_res_bucket_should_migrate(struct nh_res_table
*res_table
,
1704 struct nh_res_bucket
*bucket
,
1705 unsigned long *deadline
, bool *force
)
1707 unsigned long now
= jiffies
;
1708 struct nh_grp_entry
*nhge
;
1709 unsigned long idle_point
;
1711 if (!bucket
->occupied
) {
1712 /* The bucket is not occupied, its NHGE pointer is either
1713 * NULL or obsolete. We _have to_ migrate: set force.
1719 nhge
= nh_res_dereference(bucket
->nh_entry
);
1721 /* If the bucket is populated by an underweight or balanced
1722 * nexthop, do not migrate.
1724 if (!nh_res_nhge_is_ow(nhge
))
1727 /* At this point we know that the bucket is populated with an
1728 * overweight nexthop. It needs to be migrated to a new nexthop if
1729 * the idle timer of unbalanced timer expired.
1732 idle_point
= nh_res_bucket_idle_point(res_table
, bucket
, now
);
1733 if (time_after_eq(now
, idle_point
)) {
1734 /* The bucket is idle. We _can_ migrate: unset force. */
1739 /* Unbalanced timer of 0 means "never force". */
1740 if (res_table
->unbalanced_timer
) {
1741 unsigned long unb_point
;
1743 unb_point
= nh_res_table_unb_point(res_table
);
1744 if (time_after(now
, unb_point
)) {
1745 /* The bucket is not idle, but the unbalanced timer
1746 * expired. We _can_ migrate, but set force anyway,
1747 * so that drivers know to ignore activity reports
1754 nh_res_time_set_deadline(unb_point
, deadline
);
1757 nh_res_time_set_deadline(idle_point
, deadline
);
1761 static bool nh_res_bucket_migrate(struct nh_res_table
*res_table
,
1762 u16 bucket_index
, bool notify
,
1763 bool notify_nl
, bool force
)
1765 struct nh_res_bucket
*bucket
= &res_table
->nh_buckets
[bucket_index
];
1766 struct nh_grp_entry
*new_nhge
;
1767 struct netlink_ext_ack extack
;
1770 new_nhge
= list_first_entry_or_null(&res_table
->uw_nh_entries
,
1771 struct nh_grp_entry
,
1773 if (WARN_ON_ONCE(!new_nhge
))
1774 /* If this function is called, "bucket" is either not
1775 * occupied, or it belongs to a next hop that is
1776 * overweight. In either case, there ought to be a
1777 * corresponding underweight next hop.
1782 struct nh_grp_entry
*old_nhge
;
1784 old_nhge
= nh_res_dereference(bucket
->nh_entry
);
1785 err
= call_nexthop_res_bucket_notifiers(res_table
->net
,
1787 bucket_index
, force
,
1789 new_nhge
->nh
, &extack
);
1791 pr_err_ratelimited("%s\n", extack
._msg
);
1794 /* It is not possible to veto a forced replacement, so
1795 * just clear the hardware flags from the nexthop
1796 * bucket to indicate to user space that this bucket is
1797 * not correctly populated in hardware.
1799 bucket
->nh_flags
&= ~(RTNH_F_OFFLOAD
| RTNH_F_TRAP
);
1803 nh_res_bucket_set_nh(bucket
, new_nhge
);
1804 nh_res_bucket_set_idle(res_table
, bucket
);
1807 nexthop_bucket_notify(res_table
, bucket_index
);
1809 if (nh_res_nhge_is_balanced(new_nhge
))
1810 list_del(&new_nhge
->res
.uw_nh_entry
);
1814 #define NH_RES_UPKEEP_DW_MINIMUM_INTERVAL (HZ / 2)
1816 static void nh_res_table_upkeep(struct nh_res_table
*res_table
,
1817 bool notify
, bool notify_nl
)
1819 unsigned long now
= jiffies
;
1820 unsigned long deadline
;
1823 /* Deadline is the next time that upkeep should be run. It is the
1824 * earliest time at which one of the buckets might be migrated.
1825 * Start at the most pessimistic estimate: either unbalanced_timer
1826 * from now, or if there is none, idle_timer from now. For each
1827 * encountered time point, call nh_res_time_set_deadline() to
1828 * refine the estimate.
1830 if (res_table
->unbalanced_timer
)
1831 deadline
= now
+ res_table
->unbalanced_timer
;
1833 deadline
= now
+ res_table
->idle_timer
;
1835 for (i
= 0; i
< res_table
->num_nh_buckets
; i
++) {
1836 struct nh_res_bucket
*bucket
= &res_table
->nh_buckets
[i
];
1839 if (nh_res_bucket_should_migrate(res_table
, bucket
,
1840 &deadline
, &force
)) {
1841 if (!nh_res_bucket_migrate(res_table
, i
, notify
,
1842 notify_nl
, force
)) {
1843 unsigned long idle_point
;
1845 /* A driver can override the migration
1846 * decision if the HW reports that the
1847 * bucket is actually not idle. Therefore
1848 * remark the bucket as busy again and
1849 * update the deadline.
1851 nh_res_bucket_set_busy(bucket
);
1852 idle_point
= nh_res_bucket_idle_point(res_table
,
1855 nh_res_time_set_deadline(idle_point
, &deadline
);
1860 /* If the group is still unbalanced, schedule the next upkeep to
1861 * either the deadline computed above, or the minimum deadline,
1862 * whichever comes later.
1864 if (!nh_res_table_is_balanced(res_table
)) {
1865 unsigned long now
= jiffies
;
1866 unsigned long min_deadline
;
1868 min_deadline
= now
+ NH_RES_UPKEEP_DW_MINIMUM_INTERVAL
;
1869 if (time_before(deadline
, min_deadline
))
1870 deadline
= min_deadline
;
1872 queue_delayed_work(system_power_efficient_wq
,
1873 &res_table
->upkeep_dw
, deadline
- now
);
1877 static void nh_res_table_upkeep_dw(struct work_struct
*work
)
1879 struct delayed_work
*dw
= to_delayed_work(work
);
1880 struct nh_res_table
*res_table
;
1882 res_table
= container_of(dw
, struct nh_res_table
, upkeep_dw
);
1883 nh_res_table_upkeep(res_table
, true, true);
1886 static void nh_res_table_cancel_upkeep(struct nh_res_table
*res_table
)
1888 cancel_delayed_work_sync(&res_table
->upkeep_dw
);
1891 static void nh_res_group_rebalance(struct nh_group
*nhg
,
1892 struct nh_res_table
*res_table
)
1894 u16 prev_upper_bound
= 0;
1899 INIT_LIST_HEAD(&res_table
->uw_nh_entries
);
1901 for (i
= 0; i
< nhg
->num_nh
; ++i
)
1902 total
+= nhg
->nh_entries
[i
].weight
;
1904 for (i
= 0; i
< nhg
->num_nh
; ++i
) {
1905 struct nh_grp_entry
*nhge
= &nhg
->nh_entries
[i
];
1910 btw
= ((u64
)res_table
->num_nh_buckets
) * w
;
1911 upper_bound
= DIV_ROUND_CLOSEST_ULL(btw
, total
);
1912 nhge
->res
.wants_buckets
= upper_bound
- prev_upper_bound
;
1913 prev_upper_bound
= upper_bound
;
1915 if (nh_res_nhge_is_uw(nhge
)) {
1916 if (list_empty(&res_table
->uw_nh_entries
))
1917 res_table
->unbalanced_since
= jiffies
;
1918 list_add(&nhge
->res
.uw_nh_entry
,
1919 &res_table
->uw_nh_entries
);
1924 /* Migrate buckets in res_table so that they reference NHGE's from NHG with
1925 * the right NH ID. Set those buckets that do not have a corresponding NHGE
1926 * entry in NHG as not occupied.
1928 static void nh_res_table_migrate_buckets(struct nh_res_table
*res_table
,
1929 struct nh_group
*nhg
)
1933 for (i
= 0; i
< res_table
->num_nh_buckets
; i
++) {
1934 struct nh_res_bucket
*bucket
= &res_table
->nh_buckets
[i
];
1935 u32 id
= rtnl_dereference(bucket
->nh_entry
)->nh
->id
;
1939 for (j
= 0; j
< nhg
->num_nh
; j
++) {
1940 struct nh_grp_entry
*nhge
= &nhg
->nh_entries
[j
];
1942 if (nhge
->nh
->id
== id
) {
1943 nh_res_bucket_set_nh(bucket
, nhge
);
1950 nh_res_bucket_unset_nh(bucket
);
1954 static void replace_nexthop_grp_res(struct nh_group
*oldg
,
1955 struct nh_group
*newg
)
1957 /* For NH group replacement, the new NHG might only have a stub
1958 * hash table with 0 buckets, because the number of buckets was not
1959 * specified. For NH removal, oldg and newg both reference the same
1960 * res_table. So in any case, in the following, we want to work
1961 * with oldg->res_table.
1963 struct nh_res_table
*old_res_table
= rtnl_dereference(oldg
->res_table
);
1964 unsigned long prev_unbalanced_since
= old_res_table
->unbalanced_since
;
1965 bool prev_has_uw
= !list_empty(&old_res_table
->uw_nh_entries
);
1967 nh_res_table_cancel_upkeep(old_res_table
);
1968 nh_res_table_migrate_buckets(old_res_table
, newg
);
1969 nh_res_group_rebalance(newg
, old_res_table
);
1970 if (prev_has_uw
&& !list_empty(&old_res_table
->uw_nh_entries
))
1971 old_res_table
->unbalanced_since
= prev_unbalanced_since
;
1972 nh_res_table_upkeep(old_res_table
, true, false);
1975 static void nh_hthr_group_rebalance(struct nh_group
*nhg
)
1981 for (i
= 0; i
< nhg
->num_nh
; ++i
)
1982 total
+= nhg
->nh_entries
[i
].weight
;
1984 for (i
= 0; i
< nhg
->num_nh
; ++i
) {
1985 struct nh_grp_entry
*nhge
= &nhg
->nh_entries
[i
];
1989 upper_bound
= DIV_ROUND_CLOSEST_ULL((u64
)w
<< 31, total
) - 1;
1990 atomic_set(&nhge
->hthr
.upper_bound
, upper_bound
);
1994 static void remove_nh_grp_entry(struct net
*net
, struct nh_grp_entry
*nhge
,
1995 struct nl_info
*nlinfo
)
1997 struct nh_grp_entry
*nhges
, *new_nhges
;
1998 struct nexthop
*nhp
= nhge
->nh_parent
;
1999 struct netlink_ext_ack extack
;
2000 struct nexthop
*nh
= nhge
->nh
;
2001 struct nh_group
*nhg
, *newg
;
2006 nhg
= rtnl_dereference(nhp
->nh_grp
);
2009 /* last entry, keep it visible and remove the parent */
2010 if (nhg
->num_nh
== 1) {
2011 remove_nexthop(net
, nhp
, nlinfo
);
2015 newg
->has_v4
= false;
2016 newg
->is_multipath
= nhg
->is_multipath
;
2017 newg
->hash_threshold
= nhg
->hash_threshold
;
2018 newg
->resilient
= nhg
->resilient
;
2019 newg
->fdb_nh
= nhg
->fdb_nh
;
2020 newg
->num_nh
= nhg
->num_nh
;
2022 /* copy old entries to new except the one getting removed */
2023 nhges
= nhg
->nh_entries
;
2024 new_nhges
= newg
->nh_entries
;
2025 for (i
= 0, j
= 0; i
< nhg
->num_nh
; ++i
) {
2026 struct nh_info
*nhi
;
2028 /* current nexthop getting removed */
2029 if (nhg
->nh_entries
[i
].nh
== nh
) {
2034 nhi
= rtnl_dereference(nhges
[i
].nh
->nh_info
);
2035 if (nhi
->family
== AF_INET
)
2036 newg
->has_v4
= true;
2038 list_del(&nhges
[i
].nh_list
);
2039 new_nhges
[j
].stats
= nhges
[i
].stats
;
2040 new_nhges
[j
].nh_parent
= nhges
[i
].nh_parent
;
2041 new_nhges
[j
].nh
= nhges
[i
].nh
;
2042 new_nhges
[j
].weight
= nhges
[i
].weight
;
2043 list_add(&new_nhges
[j
].nh_list
, &new_nhges
[j
].nh
->grp_list
);
2047 if (newg
->hash_threshold
)
2048 nh_hthr_group_rebalance(newg
);
2049 else if (newg
->resilient
)
2050 replace_nexthop_grp_res(nhg
, newg
);
2052 rcu_assign_pointer(nhp
->nh_grp
, newg
);
2054 list_del(&nhge
->nh_list
);
2055 free_percpu(nhge
->stats
);
2056 nexthop_put(nhge
->nh
);
2058 /* Removal of a NH from a resilient group is notified through
2059 * bucket notifications.
2061 if (newg
->hash_threshold
) {
2062 err
= call_nexthop_notifiers(net
, NEXTHOP_EVENT_REPLACE
, nhp
,
2065 pr_err("%s\n", extack
._msg
);
2069 nexthop_notify(RTM_NEWNEXTHOP
, nhp
, nlinfo
);
2072 static void remove_nexthop_from_groups(struct net
*net
, struct nexthop
*nh
,
2073 struct nl_info
*nlinfo
)
2075 struct nh_grp_entry
*nhge
, *tmp
;
2077 list_for_each_entry_safe(nhge
, tmp
, &nh
->grp_list
, nh_list
)
2078 remove_nh_grp_entry(net
, nhge
, nlinfo
);
2080 /* make sure all see the newly published array before releasing rtnl */
2084 static void remove_nexthop_group(struct nexthop
*nh
, struct nl_info
*nlinfo
)
2086 struct nh_group
*nhg
= rcu_dereference_rtnl(nh
->nh_grp
);
2087 struct nh_res_table
*res_table
;
2088 int i
, num_nh
= nhg
->num_nh
;
2090 for (i
= 0; i
< num_nh
; ++i
) {
2091 struct nh_grp_entry
*nhge
= &nhg
->nh_entries
[i
];
2093 if (WARN_ON(!nhge
->nh
))
2096 list_del_init(&nhge
->nh_list
);
2099 if (nhg
->resilient
) {
2100 res_table
= rtnl_dereference(nhg
->res_table
);
2101 nh_res_table_cancel_upkeep(res_table
);
2105 /* not called for nexthop replace */
2106 static void __remove_nexthop_fib(struct net
*net
, struct nexthop
*nh
)
2108 struct fib6_info
*f6i
, *tmp
;
2109 bool do_flush
= false;
2110 struct fib_info
*fi
;
2112 list_for_each_entry(fi
, &nh
->fi_list
, nh_list
) {
2113 fi
->fib_flags
|= RTNH_F_DEAD
;
2119 /* ip6_del_rt removes the entry from this list hence the _safe */
2120 list_for_each_entry_safe(f6i
, tmp
, &nh
->f6i_list
, nh_list
) {
2121 /* __ip6_del_rt does a release, so do a hold here */
2122 fib6_info_hold(f6i
);
2123 ipv6_stub
->ip6_del_rt(net
, f6i
,
2124 !READ_ONCE(net
->ipv4
.sysctl_nexthop_compat_mode
));
2128 static void __remove_nexthop(struct net
*net
, struct nexthop
*nh
,
2129 struct nl_info
*nlinfo
)
2131 __remove_nexthop_fib(net
, nh
);
2134 remove_nexthop_group(nh
, nlinfo
);
2136 struct nh_info
*nhi
;
2138 nhi
= rtnl_dereference(nh
->nh_info
);
2139 if (nhi
->fib_nhc
.nhc_dev
)
2140 hlist_del(&nhi
->dev_hash
);
2142 remove_nexthop_from_groups(net
, nh
, nlinfo
);
2146 static void remove_nexthop(struct net
*net
, struct nexthop
*nh
,
2147 struct nl_info
*nlinfo
)
2149 call_nexthop_notifiers(net
, NEXTHOP_EVENT_DEL
, nh
, NULL
);
2151 /* remove from the tree */
2152 rb_erase(&nh
->rb_node
, &net
->nexthop
.rb_root
);
2155 nexthop_notify(RTM_DELNEXTHOP
, nh
, nlinfo
);
2157 __remove_nexthop(net
, nh
, nlinfo
);
2158 nh_base_seq_inc(net
);
2163 /* if any FIB entries reference this nexthop, any dst entries
2164 * need to be regenerated
2166 static void nh_rt_cache_flush(struct net
*net
, struct nexthop
*nh
,
2167 struct nexthop
*replaced_nh
)
2169 struct fib6_info
*f6i
;
2170 struct nh_group
*nhg
;
2173 if (!list_empty(&nh
->fi_list
))
2174 rt_cache_flush(net
);
2176 list_for_each_entry(f6i
, &nh
->f6i_list
, nh_list
)
2177 ipv6_stub
->fib6_update_sernum(net
, f6i
);
2179 /* if an IPv6 group was replaced, we have to release all old
2180 * dsts to make sure all refcounts are released
2182 if (!replaced_nh
->is_group
)
2185 nhg
= rtnl_dereference(replaced_nh
->nh_grp
);
2186 for (i
= 0; i
< nhg
->num_nh
; i
++) {
2187 struct nh_grp_entry
*nhge
= &nhg
->nh_entries
[i
];
2188 struct nh_info
*nhi
= rtnl_dereference(nhge
->nh
->nh_info
);
2190 if (nhi
->family
== AF_INET6
)
2191 ipv6_stub
->fib6_nh_release_dsts(&nhi
->fib6_nh
);
2195 static int replace_nexthop_grp(struct net
*net
, struct nexthop
*old
,
2196 struct nexthop
*new, const struct nh_config
*cfg
,
2197 struct netlink_ext_ack
*extack
)
2199 struct nh_res_table
*tmp_table
= NULL
;
2200 struct nh_res_table
*new_res_table
;
2201 struct nh_res_table
*old_res_table
;
2202 struct nh_group
*oldg
, *newg
;
2205 if (!new->is_group
) {
2206 NL_SET_ERR_MSG(extack
, "Can not replace a nexthop group with a nexthop.");
2210 oldg
= rtnl_dereference(old
->nh_grp
);
2211 newg
= rtnl_dereference(new->nh_grp
);
2213 if (newg
->hash_threshold
!= oldg
->hash_threshold
) {
2214 NL_SET_ERR_MSG(extack
, "Can not replace a nexthop group with one of a different type.");
2218 if (newg
->hash_threshold
) {
2219 err
= call_nexthop_notifiers(net
, NEXTHOP_EVENT_REPLACE
, new,
2223 } else if (newg
->resilient
) {
2224 new_res_table
= rtnl_dereference(newg
->res_table
);
2225 old_res_table
= rtnl_dereference(oldg
->res_table
);
2227 /* Accept if num_nh_buckets was not given, but if it was
2228 * given, demand that the value be correct.
2230 if (cfg
->nh_grp_res_has_num_buckets
&&
2231 cfg
->nh_grp_res_num_buckets
!=
2232 old_res_table
->num_nh_buckets
) {
2233 NL_SET_ERR_MSG(extack
, "Can not change number of buckets of a resilient nexthop group.");
2237 /* Emit a pre-replace notification so that listeners could veto
2238 * a potentially unsupported configuration. Otherwise,
2239 * individual bucket replacement notifications would need to be
2240 * vetoed, which is something that should only happen if the
2241 * bucket is currently active.
2243 err
= call_nexthop_res_table_notifiers(net
, new, extack
);
2247 if (cfg
->nh_grp_res_has_idle_timer
)
2248 old_res_table
->idle_timer
= cfg
->nh_grp_res_idle_timer
;
2249 if (cfg
->nh_grp_res_has_unbalanced_timer
)
2250 old_res_table
->unbalanced_timer
=
2251 cfg
->nh_grp_res_unbalanced_timer
;
2253 replace_nexthop_grp_res(oldg
, newg
);
2255 tmp_table
= new_res_table
;
2256 rcu_assign_pointer(newg
->res_table
, old_res_table
);
2257 rcu_assign_pointer(newg
->spare
->res_table
, old_res_table
);
2260 /* update parents - used by nexthop code for cleanup */
2261 for (i
= 0; i
< newg
->num_nh
; i
++)
2262 newg
->nh_entries
[i
].nh_parent
= old
;
2264 rcu_assign_pointer(old
->nh_grp
, newg
);
2266 /* Make sure concurrent readers are not using 'oldg' anymore. */
2269 if (newg
->resilient
) {
2270 rcu_assign_pointer(oldg
->res_table
, tmp_table
);
2271 rcu_assign_pointer(oldg
->spare
->res_table
, tmp_table
);
2274 for (i
= 0; i
< oldg
->num_nh
; i
++)
2275 oldg
->nh_entries
[i
].nh_parent
= new;
2277 rcu_assign_pointer(new->nh_grp
, oldg
);
2282 static void nh_group_v4_update(struct nh_group
*nhg
)
2284 struct nh_grp_entry
*nhges
;
2285 bool has_v4
= false;
2288 nhges
= nhg
->nh_entries
;
2289 for (i
= 0; i
< nhg
->num_nh
; i
++) {
2290 struct nh_info
*nhi
;
2292 nhi
= rtnl_dereference(nhges
[i
].nh
->nh_info
);
2293 if (nhi
->family
== AF_INET
)
2296 nhg
->has_v4
= has_v4
;
2299 static int replace_nexthop_single_notify_res(struct net
*net
,
2300 struct nh_res_table
*res_table
,
2301 struct nexthop
*old
,
2302 struct nh_info
*oldi
,
2303 struct nh_info
*newi
,
2304 struct netlink_ext_ack
*extack
)
2306 u32 nhg_id
= res_table
->nhg_id
;
2310 for (i
= 0; i
< res_table
->num_nh_buckets
; i
++) {
2311 struct nh_res_bucket
*bucket
= &res_table
->nh_buckets
[i
];
2312 struct nh_grp_entry
*nhge
;
2314 nhge
= rtnl_dereference(bucket
->nh_entry
);
2315 if (nhge
->nh
== old
) {
2316 err
= __call_nexthop_res_bucket_notifiers(net
, nhg_id
,
2329 struct nh_res_bucket
*bucket
= &res_table
->nh_buckets
[i
];
2330 struct nh_grp_entry
*nhge
;
2332 nhge
= rtnl_dereference(bucket
->nh_entry
);
2333 if (nhge
->nh
== old
)
2334 __call_nexthop_res_bucket_notifiers(net
, nhg_id
, i
,
2341 static int replace_nexthop_single_notify(struct net
*net
,
2342 struct nexthop
*group_nh
,
2343 struct nexthop
*old
,
2344 struct nh_info
*oldi
,
2345 struct nh_info
*newi
,
2346 struct netlink_ext_ack
*extack
)
2348 struct nh_group
*nhg
= rtnl_dereference(group_nh
->nh_grp
);
2349 struct nh_res_table
*res_table
;
2351 if (nhg
->hash_threshold
) {
2352 return call_nexthop_notifiers(net
, NEXTHOP_EVENT_REPLACE
,
2354 } else if (nhg
->resilient
) {
2355 res_table
= rtnl_dereference(nhg
->res_table
);
2356 return replace_nexthop_single_notify_res(net
, res_table
,
static int replace_nexthop_single(struct net *net, struct nexthop *old,
				  struct nexthop *new,
				  struct netlink_ext_ack *extack)
{
	u8 old_protocol, old_nh_flags;
	struct nh_info *oldi, *newi;
	struct nh_grp_entry *nhge;
	int err;

	if (new->is_group) {
		NL_SET_ERR_MSG(extack, "Can not replace a nexthop with a nexthop group.");
		return -EINVAL;
	}

	err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new, extack);
	if (err)
		return err;

	/* Hardware flags were set on 'old' as 'new' is not in the red-black
	 * tree. Therefore, inherit the flags from 'old' to 'new'.
	 */
	new->nh_flags |= old->nh_flags & (RTNH_F_OFFLOAD | RTNH_F_TRAP);

	oldi = rtnl_dereference(old->nh_info);
	newi = rtnl_dereference(new->nh_info);

	newi->nh_parent = old;
	oldi->nh_parent = new;

	old_protocol = old->protocol;
	old_nh_flags = old->nh_flags;

	old->protocol = new->protocol;
	old->nh_flags = new->nh_flags;

	rcu_assign_pointer(old->nh_info, newi);
	rcu_assign_pointer(new->nh_info, oldi);

	/* Send a replace notification for all the groups using the nexthop. */
	list_for_each_entry(nhge, &old->grp_list, nh_list) {
		struct nexthop *nhp = nhge->nh_parent;

		err = replace_nexthop_single_notify(net, nhp, old, oldi, newi,
						    extack);
		if (err)
			goto err_notify;
	}

	/* When replacing an IPv4 nexthop with an IPv6 nexthop, potentially
	 * update IPv4 indication in all the groups using the nexthop.
	 */
	if (oldi->family == AF_INET && newi->family == AF_INET6) {
		list_for_each_entry(nhge, &old->grp_list, nh_list) {
			struct nexthop *nhp = nhge->nh_parent;
			struct nh_group *nhg;

			nhg = rtnl_dereference(nhp->nh_grp);
			nh_group_v4_update(nhg);
		}
	}

	return 0;

err_notify:
	rcu_assign_pointer(new->nh_info, newi);
	rcu_assign_pointer(old->nh_info, oldi);
	old->nh_flags = old_nh_flags;
	old->protocol = old_protocol;
	oldi->nh_parent = old;
	newi->nh_parent = new;
	list_for_each_entry_continue_reverse(nhge, &old->grp_list, nh_list) {
		struct nexthop *nhp = nhge->nh_parent;

		replace_nexthop_single_notify(net, nhp, old, newi, oldi, NULL);
	}
	call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, old, extack);
	return err;
}
static void __nexthop_replace_notify(struct net *net, struct nexthop *nh,
				     struct nl_info *info)
{
	struct fib6_info *f6i;

	if (!list_empty(&nh->fi_list)) {
		struct fib_info *fi;

		/* expectation is a few fib_info per nexthop and then
		 * a lot of routes per fib_info. So mark the fib_info
		 * and then walk the fib tables once
		 */
		list_for_each_entry(fi, &nh->fi_list, nh_list)
			fi->nh_updated = true;

		fib_info_notify_update(net, info);

		list_for_each_entry(fi, &nh->fi_list, nh_list)
			fi->nh_updated = false;
	}

	list_for_each_entry(f6i, &nh->f6i_list, nh_list)
		ipv6_stub->fib6_rt_update(net, f6i, info);
}
/* send RTM_NEWROUTE with REPLACE flag set for all FIB entries
 * linked to this nexthop and for all groups that the nexthop
 * is a member of
 */
static void nexthop_replace_notify(struct net *net, struct nexthop *nh,
				   struct nl_info *info)
{
	struct nh_grp_entry *nhge;

	__nexthop_replace_notify(net, nh, info);

	list_for_each_entry(nhge, &nh->grp_list, nh_list)
		__nexthop_replace_notify(net, nhge->nh_parent, info);
}
static int replace_nexthop(struct net *net, struct nexthop *old,
			   struct nexthop *new, const struct nh_config *cfg,
			   struct netlink_ext_ack *extack)
{
	bool new_is_reject = false;
	struct nh_grp_entry *nhge;
	int err;

	/* check that existing FIB entries are ok with the
	 * new nexthop definition
	 */
	err = fib_check_nh_list(old, new, extack);
	if (err)
		return err;

	err = fib6_check_nh_list(old, new, extack);
	if (err)
		return err;

	if (!new->is_group) {
		struct nh_info *nhi = rtnl_dereference(new->nh_info);

		new_is_reject = nhi->reject_nh;
	}

	list_for_each_entry(nhge, &old->grp_list, nh_list) {
		/* if new nexthop is a blackhole, any groups using this
		 * nexthop cannot have more than 1 path
		 */
		if (new_is_reject &&
		    nexthop_num_path(nhge->nh_parent) > 1) {
			NL_SET_ERR_MSG(extack, "Blackhole nexthop can not be a member of a group with more than one path");
			return -EINVAL;
		}

		err = fib_check_nh_list(nhge->nh_parent, new, extack);
		if (err)
			return err;

		err = fib6_check_nh_list(nhge->nh_parent, new, extack);
		if (err)
			return err;
	}

	if (old->is_group)
		err = replace_nexthop_grp(net, old, new, cfg, extack);
	else
		err = replace_nexthop_single(net, old, new, extack);

	if (!err) {
		nh_rt_cache_flush(net, old, new);

		__remove_nexthop(net, new, NULL);
		nexthop_put(new);
	}

	return err;
}
/* called with rtnl_lock held */
static int insert_nexthop(struct net *net, struct nexthop *new_nh,
			  struct nh_config *cfg, struct netlink_ext_ack *extack)
{
	struct rb_node **pp, *parent = NULL, *next;
	struct rb_root *root = &net->nexthop.rb_root;
	bool replace = !!(cfg->nlflags & NLM_F_REPLACE);
	bool create = !!(cfg->nlflags & NLM_F_CREATE);
	u32 new_id = new_nh->id;
	int replace_notify = 0;
	int rc = -EEXIST;

	pp = &root->rb_node;
	while (1) {
		struct nexthop *nh;

		next = *pp;
		if (!next)
			break;

		parent = next;

		nh = rb_entry(parent, struct nexthop, rb_node);
		if (new_id < nh->id) {
			pp = &next->rb_left;
		} else if (new_id > nh->id) {
			pp = &next->rb_right;
		} else if (replace) {
			rc = replace_nexthop(net, nh, new_nh, cfg, extack);
			if (!rc) {
				new_nh = nh; /* send notification with old nh */
				replace_notify = 1;
			}
			goto out;
		} else {
			/* id already exists and not a replace */
			goto out;
		}
	}

	if (replace && !create) {
		NL_SET_ERR_MSG(extack, "Replace specified without create and no entry exists");
		rc = -ENOENT;
		goto out;
	}

	if (new_nh->is_group) {
		struct nh_group *nhg = rtnl_dereference(new_nh->nh_grp);
		struct nh_res_table *res_table;

		if (nhg->resilient) {
			res_table = rtnl_dereference(nhg->res_table);

			/* Not passing the number of buckets is OK when
			 * replacing, but not when creating a new group.
			 */
			if (!cfg->nh_grp_res_has_num_buckets) {
				NL_SET_ERR_MSG(extack, "Number of buckets not specified for nexthop group insertion");
				rc = -EINVAL;
				goto out;
			}

			nh_res_group_rebalance(nhg, res_table);

			/* Do not send bucket notifications, we do full
			 * notification below.
			 */
			nh_res_table_upkeep(res_table, false, false);
		}
	}

	rb_link_node_rcu(&new_nh->rb_node, parent, pp);
	rb_insert_color(&new_nh->rb_node, root);

	/* The initial insertion is a full notification for hash-threshold as
	 * well as resilient groups.
	 */
	rc = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new_nh, extack);
	if (rc)
		rb_erase(&new_nh->rb_node, &net->nexthop.rb_root);

out:
	if (!rc) {
		nh_base_seq_inc(net);
		nexthop_notify(RTM_NEWNEXTHOP, new_nh, &cfg->nlinfo);
		if (replace_notify &&
		    READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode))
			nexthop_replace_notify(net, new_nh, &cfg->nlinfo);
	}

	return rc;
}
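
/* Illustration (not part of the original sources; iproute2 syntax assumed):
 * from user space this insertion path is typically exercised with commands
 * such as
 *   ip nexthop add id 1 via 192.0.2.1 dev eth0
 *   ip nexthop replace id 1 via 192.0.2.2 dev eth0
 * where NLM_F_CREATE / NLM_F_REPLACE in the netlink request map onto the
 * 'create' and 'replace' booleans handled above, and the id becomes the
 * key of the rb-tree node inserted here.
 */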
/* remove all nexthops tied to a device being deleted */
static void nexthop_flush_dev(struct net_device *dev, unsigned long event)
{
	unsigned int hash = nh_dev_hashfn(dev->ifindex);
	struct net *net = dev_net(dev);
	struct hlist_head *head = &net->nexthop.devhash[hash];
	struct hlist_node *n;
	struct nh_info *nhi;

	hlist_for_each_entry_safe(nhi, n, head, dev_hash) {
		if (nhi->fib_nhc.nhc_dev != dev)
			continue;

		if (nhi->reject_nh &&
		    (event == NETDEV_DOWN || event == NETDEV_CHANGE))
			continue;

		remove_nexthop(net, nhi->nh_parent, NULL);
	}
}
/* rtnl; called when net namespace is deleted */
static void flush_all_nexthops(struct net *net)
{
	struct rb_root *root = &net->nexthop.rb_root;
	struct rb_node *node;
	struct nexthop *nh;

	while ((node = rb_first(root))) {
		nh = rb_entry(node, struct nexthop, rb_node);
		remove_nexthop(net, nh, NULL);
	}
}
static struct nexthop *nexthop_create_group(struct net *net,
					    struct nh_config *cfg)
{
	struct nlattr *grps_attr = cfg->nh_grp;
	struct nexthop_grp *entry = nla_data(grps_attr);
	u16 num_nh = nla_len(grps_attr) / sizeof(*entry);
	struct nh_group *nhg;
	struct nexthop *nh;
	int err;
	int i;

	if (WARN_ON(!num_nh))
		return ERR_PTR(-EINVAL);

	nh = nexthop_alloc();
	if (!nh)
		return ERR_PTR(-ENOMEM);

	nh->is_group = 1;

	nhg = nexthop_grp_alloc(num_nh);
	if (!nhg) {
		kfree(nh);
		return ERR_PTR(-ENOMEM);
	}

	/* spare group used for removals */
	nhg->spare = nexthop_grp_alloc(num_nh);
	if (!nhg->spare) {
		kfree(nhg);
		kfree(nh);
		return ERR_PTR(-ENOMEM);
	}
	nhg->spare->spare = nhg;

	for (i = 0; i < nhg->num_nh; ++i) {
		struct nexthop *nhe;
		struct nh_info *nhi;

		nhe = nexthop_find_by_id(net, entry[i].id);
		if (!nexthop_get(nhe)) {
			err = -ENOENT;
			goto out_no_nh;
		}

		nhi = rtnl_dereference(nhe->nh_info);
		if (nhi->family == AF_INET)
			nhg->has_v4 = true;

		nhg->nh_entries[i].stats =
			netdev_alloc_pcpu_stats(struct nh_grp_entry_stats);
		if (!nhg->nh_entries[i].stats) {
			err = -ENOMEM;
			nexthop_put(nhe);
			goto out_no_nh;
		}
		nhg->nh_entries[i].nh = nhe;
		nhg->nh_entries[i].weight = nexthop_grp_weight(&entry[i]);

		list_add(&nhg->nh_entries[i].nh_list, &nhe->grp_list);
		nhg->nh_entries[i].nh_parent = nh;
	}

	if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) {
		nhg->hash_threshold = 1;
		nhg->is_multipath = true;
	} else if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_RES) {
		struct nh_res_table *res_table;

		res_table = nexthop_res_table_alloc(net, cfg->nh_id, cfg);
		if (!res_table) {
			err = -ENOMEM;
			goto out_no_nh;
		}

		rcu_assign_pointer(nhg->spare->res_table, res_table);
		rcu_assign_pointer(nhg->res_table, res_table);
		nhg->resilient = true;
		nhg->is_multipath = true;
	}

	WARN_ON_ONCE(nhg->hash_threshold + nhg->resilient != 1);

	if (nhg->hash_threshold)
		nh_hthr_group_rebalance(nhg);

	if (cfg->nh_hw_stats)
		nhg->hw_stats = true;

	rcu_assign_pointer(nh->nh_grp, nhg);

	return nh;

out_no_nh:
	for (i--; i >= 0; --i) {
		list_del(&nhg->nh_entries[i].nh_list);
		free_percpu(nhg->nh_entries[i].stats);
		nexthop_put(nhg->nh_entries[i].nh);
	}

	kfree(nhg->spare);
	kfree(nhg);
	kfree(nh);

	return ERR_PTR(err);
}
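
/* Illustration (not part of the original sources; iproute2 syntax assumed):
 * a group references previously created nexthops by id, optionally with
 * per-entry weights, e.g.
 *   ip nexthop add id 10 group 1/2
 *   ip nexthop add id 11 group 1,3/2,1
 * The NHA_GROUP payload carries the nexthop_grp array parsed above into
 * nhg->nh_entries[].
 */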
static int nh_create_ipv4(struct net *net, struct nexthop *nh,
			  struct nh_info *nhi, struct nh_config *cfg,
			  struct netlink_ext_ack *extack)
{
	struct fib_nh *fib_nh = &nhi->fib_nh;
	struct fib_config fib_cfg = {
		.fc_oif = cfg->nh_ifindex,
		.fc_gw4 = cfg->gw.ipv4,
		.fc_gw_family = cfg->gw.ipv4 ? AF_INET : 0,
		.fc_flags = cfg->nh_flags,
		.fc_nlinfo = cfg->nlinfo,
		.fc_encap = cfg->nh_encap,
		.fc_encap_type = cfg->nh_encap_type,
	};
	u32 tb_id = (cfg->dev ? l3mdev_fib_table(cfg->dev) : RT_TABLE_MAIN);
	int err;

	err = fib_nh_init(net, fib_nh, &fib_cfg, 1, extack);
	if (err) {
		fib_nh_release(net, fib_nh);
		goto out;
	}

	/* sets nh_dev if successful */
	err = fib_check_nh(net, fib_nh, tb_id, 0, extack);
	if (!err) {
		nh->nh_flags = fib_nh->fib_nh_flags;
		fib_info_update_nhc_saddr(net, &fib_nh->nh_common,
					  !fib_nh->fib_nh_scope ? 0 : fib_nh->fib_nh_scope - 1);
	} else {
		fib_nh_release(net, fib_nh);
	}
out:
	return err;
}
static int nh_create_ipv6(struct net *net, struct nexthop *nh,
			  struct nh_info *nhi, struct nh_config *cfg,
			  struct netlink_ext_ack *extack)
{
	struct fib6_nh *fib6_nh = &nhi->fib6_nh;
	struct fib6_config fib6_cfg = {
		.fc_table = l3mdev_fib_table(cfg->dev),
		.fc_ifindex = cfg->nh_ifindex,
		.fc_gateway = cfg->gw.ipv6,
		.fc_flags = cfg->nh_flags,
		.fc_nlinfo = cfg->nlinfo,
		.fc_encap = cfg->nh_encap,
		.fc_encap_type = cfg->nh_encap_type,
		.fc_is_fdb = cfg->nh_fdb,
	};
	int err;

	if (!ipv6_addr_any(&cfg->gw.ipv6))
		fib6_cfg.fc_flags |= RTF_GATEWAY;

	/* sets nh_dev if successful */
	err = ipv6_stub->fib6_nh_init(net, fib6_nh, &fib6_cfg, GFP_KERNEL,
				      extack);
	if (err) {
		/* IPv6 is not enabled, don't call fib6_nh_release */
		if (err == -EAFNOSUPPORT)
			goto out;
		ipv6_stub->fib6_nh_release(fib6_nh);
	} else {
		nh->nh_flags = fib6_nh->fib_nh_flags;
	}
out:
	return err;
}
static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg,
				      struct netlink_ext_ack *extack)
{
	struct nh_info *nhi;
	struct nexthop *nh;
	int err = 0;

	nh = nexthop_alloc();
	if (!nh)
		return ERR_PTR(-ENOMEM);

	nhi = kzalloc(sizeof(*nhi), GFP_KERNEL);
	if (!nhi) {
		kfree(nh);
		return ERR_PTR(-ENOMEM);
	}

	nh->nh_flags = cfg->nh_flags;

	nhi->nh_parent = nh;
	nhi->family = cfg->nh_family;
	nhi->fib_nhc.nhc_scope = RT_SCOPE_LINK;

	if (cfg->nh_blackhole) {
		nhi->reject_nh = 1;
		cfg->nh_ifindex = net->loopback_dev->ifindex;
	}

	switch (cfg->nh_family) {
	case AF_INET:
		err = nh_create_ipv4(net, nh, nhi, cfg, extack);
		break;
	case AF_INET6:
		err = nh_create_ipv6(net, nh, nhi, cfg, extack);
		break;
	}

	if (err) {
		kfree(nhi);
		kfree(nh);
		return ERR_PTR(err);
	}

	/* add the entry to the device based hash */
	nexthop_devhash_add(net, nhi);

	rcu_assign_pointer(nh->nh_info, nhi);

	return nh;
}
/* called with rtnl lock held */
static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct nexthop *nh;
	int err;

	if (cfg->nlflags & NLM_F_REPLACE && !cfg->nh_id) {
		NL_SET_ERR_MSG(extack, "Replace requires nexthop id");
		return ERR_PTR(-EINVAL);
	}

	if (!cfg->nh_id) {
		cfg->nh_id = nh_find_unused_id(net);
		if (!cfg->nh_id) {
			NL_SET_ERR_MSG(extack, "No unused id");
			return ERR_PTR(-EINVAL);
		}
	}

	if (cfg->nh_grp)
		nh = nexthop_create_group(net, cfg);
	else
		nh = nexthop_create(net, cfg, extack);

	if (IS_ERR(nh))
		return nh;

	refcount_set(&nh->refcnt, 1);
	nh->id = cfg->nh_id;
	nh->protocol = cfg->nh_protocol;

	err = insert_nexthop(net, nh, cfg, extack);
	if (err) {
		__remove_nexthop(net, nh, NULL);
		nexthop_put(nh);
		nh = ERR_PTR(err);
	}

	return nh;
}
static int rtm_nh_get_timer(struct nlattr *attr, unsigned long fallback,
			    unsigned long *timer_p, bool *has_p,
			    struct netlink_ext_ack *extack)
{
	unsigned long timer;
	u32 value;

	if (!attr) {
		*timer_p = fallback;
		*has_p = false;
		return 0;
	}

	value = nla_get_u32(attr);
	timer = clock_t_to_jiffies(value);
	if (timer == ~0UL) {
		NL_SET_ERR_MSG(extack, "Timer value too large");
		return -EINVAL;
	}

	*timer_p = timer;
	*has_p = true;
	return 0;
}
static int rtm_to_nh_config_grp_res(struct nlattr *res, struct nh_config *cfg,
				    struct netlink_ext_ack *extack)
{
	struct nlattr *tb[ARRAY_SIZE(rtm_nh_res_policy_new)] = {};
	int err;

	if (res) {
		err = nla_parse_nested(tb,
				       ARRAY_SIZE(rtm_nh_res_policy_new) - 1,
				       res, rtm_nh_res_policy_new, extack);
		if (err < 0)
			return err;
	}

	if (tb[NHA_RES_GROUP_BUCKETS]) {
		cfg->nh_grp_res_num_buckets =
			nla_get_u16(tb[NHA_RES_GROUP_BUCKETS]);
		cfg->nh_grp_res_has_num_buckets = true;
		if (!cfg->nh_grp_res_num_buckets) {
			NL_SET_ERR_MSG(extack, "Number of buckets needs to be non-0");
			return -EINVAL;
		}
	}

	err = rtm_nh_get_timer(tb[NHA_RES_GROUP_IDLE_TIMER],
			       NH_RES_DEFAULT_IDLE_TIMER,
			       &cfg->nh_grp_res_idle_timer,
			       &cfg->nh_grp_res_has_idle_timer,
			       extack);
	if (err)
		return err;

	return rtm_nh_get_timer(tb[NHA_RES_GROUP_UNBALANCED_TIMER],
				NH_RES_DEFAULT_UNBALANCED_TIMER,
				&cfg->nh_grp_res_unbalanced_timer,
				&cfg->nh_grp_res_has_unbalanced_timer,
				extack);
}
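
/* Illustration (not part of the original sources; iproute2 syntax assumed):
 * the resilient-group attributes parsed above correspond to a request such
 * as
 *   ip nexthop add id 10 group 1/2 type resilient buckets 32 idle_timer 120
 * The timer attributes arrive in userspace clock ticks and are converted to
 * jiffies by rtm_nh_get_timer() above; an unbalanced_timer of 0 keeps the
 * default of no forced rebalancing.
 */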
static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
			    struct nlmsghdr *nlh, struct nh_config *cfg,
			    struct netlink_ext_ack *extack)
{
	struct nhmsg *nhm = nlmsg_data(nlh);
	struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_new)];
	int err;

	err = nlmsg_parse(nlh, sizeof(*nhm), tb,
			  ARRAY_SIZE(rtm_nh_policy_new) - 1,
			  rtm_nh_policy_new, extack);
	if (err < 0)
		return err;

	err = -EINVAL;
	if (nhm->resvd || nhm->nh_scope) {
		NL_SET_ERR_MSG(extack, "Invalid values in ancillary header");
		goto out;
	}
	if (nhm->nh_flags & ~NEXTHOP_VALID_USER_FLAGS) {
		NL_SET_ERR_MSG(extack, "Invalid nexthop flags in ancillary header");
		goto out;
	}

	switch (nhm->nh_family) {
	case AF_INET:
	case AF_INET6:
		break;
	case AF_UNSPEC:
		if (tb[NHA_GROUP])
			break;
		fallthrough;
	default:
		NL_SET_ERR_MSG(extack, "Invalid address family");
		goto out;
	}

	memset(cfg, 0, sizeof(*cfg));
	cfg->nlflags = nlh->nlmsg_flags;
	cfg->nlinfo.portid = NETLINK_CB(skb).portid;
	cfg->nlinfo.nlh = nlh;
	cfg->nlinfo.nl_net = net;

	cfg->nh_family = nhm->nh_family;
	cfg->nh_protocol = nhm->nh_protocol;
	cfg->nh_flags = nhm->nh_flags;

	if (tb[NHA_ID])
		cfg->nh_id = nla_get_u32(tb[NHA_ID]);

	if (tb[NHA_FDB]) {
		if (tb[NHA_OIF] || tb[NHA_BLACKHOLE] ||
		    tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE]) {
			NL_SET_ERR_MSG(extack, "Fdb attribute can not be used with encap, oif or blackhole");
			goto out;
		}
		if (nhm->nh_flags) {
			NL_SET_ERR_MSG(extack, "Unsupported nexthop flags in ancillary header");
			goto out;
		}
		cfg->nh_fdb = nla_get_flag(tb[NHA_FDB]);
	}

	if (tb[NHA_GROUP]) {
		if (nhm->nh_family != AF_UNSPEC) {
			NL_SET_ERR_MSG(extack, "Invalid family for group");
			goto out;
		}
		cfg->nh_grp = tb[NHA_GROUP];

		cfg->nh_grp_type = NEXTHOP_GRP_TYPE_MPATH;
		if (tb[NHA_GROUP_TYPE])
			cfg->nh_grp_type = nla_get_u16(tb[NHA_GROUP_TYPE]);

		if (cfg->nh_grp_type > NEXTHOP_GRP_TYPE_MAX) {
			NL_SET_ERR_MSG(extack, "Invalid group type");
			goto out;
		}
		err = nh_check_attr_group(net, tb, ARRAY_SIZE(tb),
					  cfg->nh_grp_type, extack);
		if (err)
			goto out;

		if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_RES)
			err = rtm_to_nh_config_grp_res(tb[NHA_RES_GROUP],
						       cfg, extack);

		if (tb[NHA_HW_STATS_ENABLE])
			cfg->nh_hw_stats = nla_get_u32(tb[NHA_HW_STATS_ENABLE]);

		/* no other attributes should be set */
		goto out;
	}

	if (tb[NHA_BLACKHOLE]) {
		if (tb[NHA_GATEWAY] || tb[NHA_OIF] ||
		    tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE] || tb[NHA_FDB]) {
			NL_SET_ERR_MSG(extack, "Blackhole attribute can not be used with gateway, oif, encap or fdb");
			goto out;
		}

		cfg->nh_blackhole = 1;
		err = 0;
		goto out;
	}

	if (!cfg->nh_fdb && !tb[NHA_OIF]) {
		NL_SET_ERR_MSG(extack, "Device attribute required for non-blackhole and non-fdb nexthops");
		goto out;
	}

	if (!cfg->nh_fdb && tb[NHA_OIF]) {
		cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]);
		if (cfg->nh_ifindex)
			cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex);

		if (!cfg->dev) {
			NL_SET_ERR_MSG(extack, "Invalid device index");
			goto out;
		} else if (!(cfg->dev->flags & IFF_UP)) {
			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
			err = -ENETDOWN;
			goto out;
		} else if (!netif_carrier_ok(cfg->dev)) {
			NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down");
			err = -ENETDOWN;
			goto out;
		}
	}

	err = -EINVAL;
	if (tb[NHA_GATEWAY]) {
		struct nlattr *gwa = tb[NHA_GATEWAY];

		switch (cfg->nh_family) {
		case AF_INET:
			if (nla_len(gwa) != sizeof(u32)) {
				NL_SET_ERR_MSG(extack, "Invalid gateway");
				goto out;
			}
			cfg->gw.ipv4 = nla_get_be32(gwa);
			break;
		case AF_INET6:
			if (nla_len(gwa) != sizeof(struct in6_addr)) {
				NL_SET_ERR_MSG(extack, "Invalid gateway");
				goto out;
			}
			cfg->gw.ipv6 = nla_get_in6_addr(gwa);
			break;
		default:
			NL_SET_ERR_MSG(extack,
				       "Unknown address family for gateway");
			goto out;
		}
	} else {
		/* device only nexthop (no gateway) */
		if (cfg->nh_flags & RTNH_F_ONLINK) {
			NL_SET_ERR_MSG(extack,
				       "ONLINK flag can not be set for nexthop without a gateway");
			goto out;
		}
	}

	if (tb[NHA_ENCAP]) {
		cfg->nh_encap = tb[NHA_ENCAP];

		if (!tb[NHA_ENCAP_TYPE]) {
			NL_SET_ERR_MSG(extack, "LWT encapsulation type is missing");
			goto out;
		}

		cfg->nh_encap_type = nla_get_u16(tb[NHA_ENCAP_TYPE]);
		err = lwtunnel_valid_encap_type(cfg->nh_encap_type, extack);
		if (err < 0)
			goto out;

	} else if (tb[NHA_ENCAP_TYPE]) {
		NL_SET_ERR_MSG(extack, "LWT encapsulation attribute is missing");
		goto out;
	}

	if (tb[NHA_HW_STATS_ENABLE]) {
		NL_SET_ERR_MSG(extack, "Cannot enable nexthop hardware statistics for non-group nexthops");
		goto out;
	}

	err = 0;
out:
	return err;
}
static int rtm_new_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh,
			   struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct nh_config cfg;
	struct nexthop *nh;
	int err;

	err = rtm_to_nh_config(net, skb, nlh, &cfg, extack);
	if (!err) {
		nh = nexthop_add(net, &cfg, extack);
		if (IS_ERR(nh))
			err = PTR_ERR(nh);
	}

	return err;
}
static int nh_valid_get_del_req(const struct nlmsghdr *nlh,
				struct nlattr **tb, u32 *id, u32 *op_flags,
				struct netlink_ext_ack *extack)
{
	struct nhmsg *nhm = nlmsg_data(nlh);

	if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) {
		NL_SET_ERR_MSG(extack, "Invalid values in header");
		return -EINVAL;
	}

	if (!tb[NHA_ID]) {
		NL_SET_ERR_MSG(extack, "Nexthop id is missing");
		return -EINVAL;
	}

	*id = nla_get_u32(tb[NHA_ID]);
	if (!(*id)) {
		NL_SET_ERR_MSG(extack, "Invalid nexthop id");
		return -EINVAL;
	}

	if (op_flags)
		*op_flags = nla_get_u32_default(tb[NHA_OP_FLAGS], 0);

	return 0;
}
static int rtm_del_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh,
			   struct netlink_ext_ack *extack)
{
	struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_del)];
	struct net *net = sock_net(skb->sk);
	struct nl_info nlinfo = {
		.nlh = nlh,
		.nl_net = net,
		.portid = NETLINK_CB(skb).portid,
	};
	struct nexthop *nh;
	int err;
	u32 id;

	err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
			  ARRAY_SIZE(rtm_nh_policy_del) - 1, rtm_nh_policy_del,
			  extack);
	if (err < 0)
		return err;

	err = nh_valid_get_del_req(nlh, tb, &id, NULL, extack);
	if (err)
		return err;

	nh = nexthop_find_by_id(net, id);
	if (!nh)
		return -ENOENT;

	remove_nexthop(net, nh, &nlinfo);

	return 0;
}
static int rtm_get_nexthop(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			   struct netlink_ext_ack *extack)
{
	struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_get)];
	struct net *net = sock_net(in_skb->sk);
	struct sk_buff *skb = NULL;
	struct nexthop *nh;
	u32 op_flags;
	int err;
	u32 id;

	err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
			  ARRAY_SIZE(rtm_nh_policy_get) - 1, rtm_nh_policy_get,
			  extack);
	if (err < 0)
		return err;

	err = nh_valid_get_del_req(nlh, tb, &id, &op_flags, extack);
	if (err)
		return err;

	err = -ENOBUFS;
	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		goto out;

	err = -ENOENT;
	nh = nexthop_find_by_id(net, id);
	if (!nh)
		goto errout_free;

	err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP, NETLINK_CB(in_skb).portid,
			   nlh->nlmsg_seq, 0, op_flags);
	if (err < 0) {
		WARN_ON(err == -EMSGSIZE);
		goto errout_free;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
out:
	return err;
errout_free:
	kfree_skb(skb);
	goto out;
}
struct nh_dump_filter {
	u32 nh_id;
	int dev_idx;
	int master_idx;
	bool group_filter;
	bool fdb_filter;
	u32 res_bucket_nh_id;
	u32 op_flags;
};

static bool nh_dump_filtered(struct nexthop *nh,
			     struct nh_dump_filter *filter, u8 family)
{
	const struct net_device *dev;
	const struct nh_info *nhi;

	if (filter->group_filter && !nh->is_group)
		return true;

	if (!filter->dev_idx && !filter->master_idx && !family)
		return false;

	if (nh->is_group)
		return true;

	nhi = rtnl_dereference(nh->nh_info);
	if (family && nhi->family != family)
		return true;

	dev = nhi->fib_nhc.nhc_dev;
	if (filter->dev_idx && (!dev || dev->ifindex != filter->dev_idx))
		return true;

	if (filter->master_idx) {
		struct net_device *master;

		if (!dev)
			return true;

		master = netdev_master_upper_dev_get((struct net_device *)dev);
		if (!master || master->ifindex != filter->master_idx)
			return true;
	}

	return false;
}
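
/* Illustration (not part of the original sources; iproute2 syntax assumed):
 * the filter fields above back dump selectors such as
 *   ip nexthop show dev eth0
 *   ip nexthop show master br0
 *   ip nexthop show groups
 * which set NHA_OIF, NHA_MASTER and NHA_GROUPS respectively in the dump
 * request.
 */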
static int __nh_valid_dump_req(const struct nlmsghdr *nlh, struct nlattr **tb,
			       struct nh_dump_filter *filter,
			       struct netlink_ext_ack *extack)
{
	struct nhmsg *nhm;
	u32 idx;

	if (tb[NHA_OIF]) {
		idx = nla_get_u32(tb[NHA_OIF]);
		if (idx > INT_MAX) {
			NL_SET_ERR_MSG(extack, "Invalid device index");
			return -EINVAL;
		}
		filter->dev_idx = idx;
	}
	if (tb[NHA_MASTER]) {
		idx = nla_get_u32(tb[NHA_MASTER]);
		if (idx > INT_MAX) {
			NL_SET_ERR_MSG(extack, "Invalid master device index");
			return -EINVAL;
		}
		filter->master_idx = idx;
	}
	filter->group_filter = nla_get_flag(tb[NHA_GROUPS]);
	filter->fdb_filter = nla_get_flag(tb[NHA_FDB]);

	nhm = nlmsg_data(nlh);
	if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) {
		NL_SET_ERR_MSG(extack, "Invalid values in header for nexthop dump request");
		return -EINVAL;
	}

	return 0;
}
static int nh_valid_dump_req(const struct nlmsghdr *nlh,
			     struct nh_dump_filter *filter,
			     struct netlink_callback *cb)
{
	struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_dump)];
	int err;

	err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
			  ARRAY_SIZE(rtm_nh_policy_dump) - 1,
			  rtm_nh_policy_dump, cb->extack);
	if (err < 0)
		return err;

	filter->op_flags = nla_get_u32_default(tb[NHA_OP_FLAGS], 0);

	return __nh_valid_dump_req(nlh, tb, filter, cb->extack);
}
struct rtm_dump_nh_ctx {
	u32 idx;
};

static struct rtm_dump_nh_ctx *
rtm_dump_nh_ctx(struct netlink_callback *cb)
{
	struct rtm_dump_nh_ctx *ctx = (void *)cb->ctx;

	BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx));
	return ctx;
}
static int rtm_dump_walk_nexthops(struct sk_buff *skb,
				  struct netlink_callback *cb,
				  struct rb_root *root,
				  struct rtm_dump_nh_ctx *ctx,
				  int (*nh_cb)(struct sk_buff *skb,
					       struct netlink_callback *cb,
					       struct nexthop *nh, void *data),
				  void *data)
{
	struct rb_node *node;
	int s_idx;
	int err;

	s_idx = ctx->idx;
	for (node = rb_first(root); node; node = rb_next(node)) {
		struct nexthop *nh;

		nh = rb_entry(node, struct nexthop, rb_node);
		if (nh->id < s_idx)
			continue;

		ctx->idx = nh->id;
		err = nh_cb(skb, cb, nh, data);
		if (err)
			return err;
	}

	return 0;
}
static int rtm_dump_nexthop_cb(struct sk_buff *skb, struct netlink_callback *cb,
			       struct nexthop *nh, void *data)
{
	struct nhmsg *nhm = nlmsg_data(cb->nlh);
	struct nh_dump_filter *filter = data;

	if (nh_dump_filtered(nh, filter, nhm->nh_family))
		return 0;

	return nh_fill_node(skb, nh, RTM_NEWNEXTHOP,
			    NETLINK_CB(cb->skb).portid,
			    cb->nlh->nlmsg_seq, NLM_F_MULTI, filter->op_flags);
}
static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtm_dump_nh_ctx *ctx = rtm_dump_nh_ctx(cb);
	struct net *net = sock_net(skb->sk);
	struct rb_root *root = &net->nexthop.rb_root;
	struct nh_dump_filter filter = {};
	int err;

	err = nh_valid_dump_req(cb->nlh, &filter, cb);
	if (err < 0)
		return err;

	err = rtm_dump_walk_nexthops(skb, cb, root, ctx,
				     &rtm_dump_nexthop_cb, &filter);

	cb->seq = net->nexthop.seq;
	nl_dump_check_consistent(cb, nlmsg_hdr(skb));
	return err;
}
static struct nexthop *
nexthop_find_group_resilient(struct net *net, u32 id,
			     struct netlink_ext_ack *extack)
{
	struct nh_group *nhg;
	struct nexthop *nh;

	nh = nexthop_find_by_id(net, id);
	if (!nh)
		return ERR_PTR(-ENOENT);

	if (!nh->is_group) {
		NL_SET_ERR_MSG(extack, "Not a nexthop group");
		return ERR_PTR(-EINVAL);
	}

	nhg = rtnl_dereference(nh->nh_grp);
	if (!nhg->resilient) {
		NL_SET_ERR_MSG(extack, "Nexthop group not of type resilient");
		return ERR_PTR(-EINVAL);
	}

	return nh;
}
static int nh_valid_dump_nhid(struct nlattr *attr, u32 *nh_id_p,
			      struct netlink_ext_ack *extack)
{
	u32 idx;

	if (attr) {
		idx = nla_get_u32(attr);
		if (!idx) {
			NL_SET_ERR_MSG(extack, "Invalid nexthop id");
			return -EINVAL;
		}
	} else {
		idx = 0;
	}

	*nh_id_p = idx;

	return 0;
}
static int nh_valid_dump_bucket_req(const struct nlmsghdr *nlh,
				    struct nh_dump_filter *filter,
				    struct netlink_callback *cb)
{
	struct nlattr *res_tb[ARRAY_SIZE(rtm_nh_res_bucket_policy_dump)];
	struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_dump_bucket)];
	int err;

	err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
			  ARRAY_SIZE(rtm_nh_policy_dump_bucket) - 1,
			  rtm_nh_policy_dump_bucket, NULL);
	if (err < 0)
		return err;

	err = nh_valid_dump_nhid(tb[NHA_ID], &filter->nh_id, cb->extack);
	if (err)
		return err;

	if (tb[NHA_RES_BUCKET]) {
		size_t max = ARRAY_SIZE(rtm_nh_res_bucket_policy_dump) - 1;

		err = nla_parse_nested(res_tb, max,
				       tb[NHA_RES_BUCKET],
				       rtm_nh_res_bucket_policy_dump,
				       cb->extack);
		if (err < 0)
			return err;

		err = nh_valid_dump_nhid(res_tb[NHA_RES_BUCKET_NH_ID],
					 &filter->res_bucket_nh_id,
					 cb->extack);
		if (err)
			return err;
	}

	return __nh_valid_dump_req(nlh, tb, filter, cb->extack);
}
struct rtm_dump_res_bucket_ctx {
	struct rtm_dump_nh_ctx nh;
	u16 bucket_index;
};

static struct rtm_dump_res_bucket_ctx *
rtm_dump_res_bucket_ctx(struct netlink_callback *cb)
{
	struct rtm_dump_res_bucket_ctx *ctx = (void *)cb->ctx;

	BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx));
	return ctx;
}

struct rtm_dump_nexthop_bucket_data {
	struct rtm_dump_res_bucket_ctx *ctx;
	struct nh_dump_filter filter;
};
static int rtm_dump_nexthop_bucket_nh(struct sk_buff *skb,
				      struct netlink_callback *cb,
				      struct nexthop *nh,
				      struct rtm_dump_nexthop_bucket_data *dd)
{
	u32 portid = NETLINK_CB(cb->skb).portid;
	struct nhmsg *nhm = nlmsg_data(cb->nlh);
	struct nh_res_table *res_table;
	struct nh_group *nhg;
	u16 bucket_index;
	int err;

	nhg = rtnl_dereference(nh->nh_grp);
	res_table = rtnl_dereference(nhg->res_table);
	for (bucket_index = dd->ctx->bucket_index;
	     bucket_index < res_table->num_nh_buckets;
	     bucket_index++) {
		struct nh_res_bucket *bucket = &res_table->nh_buckets[bucket_index];
		struct nh_grp_entry *nhge;

		nhge = rtnl_dereference(bucket->nh_entry);
		if (nh_dump_filtered(nhge->nh, &dd->filter, nhm->nh_family))
			continue;

		if (dd->filter.res_bucket_nh_id &&
		    dd->filter.res_bucket_nh_id != nhge->nh->id)
			continue;

		dd->ctx->bucket_index = bucket_index;
		err = nh_fill_res_bucket(skb, nh, bucket, bucket_index,
					 RTM_NEWNEXTHOPBUCKET, portid,
					 cb->nlh->nlmsg_seq, NLM_F_MULTI,
					 cb->extack);
		if (err)
			return err;
	}

	dd->ctx->bucket_index = 0;

	return 0;
}
static int rtm_dump_nexthop_bucket_cb(struct sk_buff *skb,
				      struct netlink_callback *cb,
				      struct nexthop *nh, void *data)
{
	struct rtm_dump_nexthop_bucket_data *dd = data;
	struct nh_group *nhg;

	if (!nh->is_group)
		return 0;

	nhg = rtnl_dereference(nh->nh_grp);
	if (!nhg->resilient)
		return 0;

	return rtm_dump_nexthop_bucket_nh(skb, cb, nh, dd);
}
static int rtm_dump_nexthop_bucket(struct sk_buff *skb,
				   struct netlink_callback *cb)
{
	struct rtm_dump_res_bucket_ctx *ctx = rtm_dump_res_bucket_ctx(cb);
	struct rtm_dump_nexthop_bucket_data dd = { .ctx = ctx };
	struct net *net = sock_net(skb->sk);
	struct nexthop *nh;
	int err;

	err = nh_valid_dump_bucket_req(cb->nlh, &dd.filter, cb);
	if (err)
		return err;

	if (dd.filter.nh_id) {
		nh = nexthop_find_group_resilient(net, dd.filter.nh_id,
						  cb->extack);
		if (IS_ERR(nh))
			return PTR_ERR(nh);
		err = rtm_dump_nexthop_bucket_nh(skb, cb, nh, &dd);
	} else {
		struct rb_root *root = &net->nexthop.rb_root;

		err = rtm_dump_walk_nexthops(skb, cb, root, &ctx->nh,
					     &rtm_dump_nexthop_bucket_cb, &dd);
	}

	cb->seq = net->nexthop.seq;
	nl_dump_check_consistent(cb, nlmsg_hdr(skb));
	return err;
}
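
/* Illustration (not part of the original sources; iproute2 syntax assumed):
 * bucket dumps for resilient groups are typically requested with commands
 * such as
 *   ip nexthop bucket show id 10
 *   ip nexthop bucket show nhid 1
 * which populate the nh_id and res_bucket_nh_id filters handled above.
 */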
static int nh_valid_get_bucket_req_res_bucket(struct nlattr *res,
					      u16 *bucket_index,
					      struct netlink_ext_ack *extack)
{
	struct nlattr *tb[ARRAY_SIZE(rtm_nh_res_bucket_policy_get)];
	int err;

	err = nla_parse_nested(tb, ARRAY_SIZE(rtm_nh_res_bucket_policy_get) - 1,
			       res, rtm_nh_res_bucket_policy_get, extack);
	if (err < 0)
		return err;

	if (!tb[NHA_RES_BUCKET_INDEX]) {
		NL_SET_ERR_MSG(extack, "Bucket index is missing");
		return -EINVAL;
	}

	*bucket_index = nla_get_u16(tb[NHA_RES_BUCKET_INDEX]);
	return 0;
}
static int nh_valid_get_bucket_req(const struct nlmsghdr *nlh,
				   u32 *id, u16 *bucket_index,
				   struct netlink_ext_ack *extack)
{
	struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_get_bucket)];
	int err;

	err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
			  ARRAY_SIZE(rtm_nh_policy_get_bucket) - 1,
			  rtm_nh_policy_get_bucket, extack);
	if (err < 0)
		return err;

	err = nh_valid_get_del_req(nlh, tb, id, NULL, extack);
	if (err)
		return err;

	if (!tb[NHA_RES_BUCKET]) {
		NL_SET_ERR_MSG(extack, "Bucket information is missing");
		return -EINVAL;
	}

	return nh_valid_get_bucket_req_res_bucket(tb[NHA_RES_BUCKET],
						  bucket_index, extack);
}
static int rtm_get_nexthop_bucket(struct sk_buff *in_skb, struct nlmsghdr *nlh,
				  struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nh_res_table *res_table;
	struct sk_buff *skb = NULL;
	struct nh_group *nhg;
	struct nexthop *nh;
	u16 bucket_index;
	int err;
	u32 id;

	err = nh_valid_get_bucket_req(nlh, &id, &bucket_index, extack);
	if (err)
		return err;

	nh = nexthop_find_group_resilient(net, id, extack);
	if (IS_ERR(nh))
		return PTR_ERR(nh);

	nhg = rtnl_dereference(nh->nh_grp);
	res_table = rtnl_dereference(nhg->res_table);
	if (bucket_index >= res_table->num_nh_buckets) {
		NL_SET_ERR_MSG(extack, "Bucket index out of bounds");
		return -ENOENT;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	err = nh_fill_res_bucket(skb, nh, &res_table->nh_buckets[bucket_index],
				 bucket_index, RTM_NEWNEXTHOPBUCKET,
				 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
				 0, extack);
	if (err < 0) {
		WARN_ON(err == -EMSGSIZE);
		goto errout_free;
	}

	return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);

errout_free:
	kfree_skb(skb);
	return err;
}
static void nexthop_sync_mtu(struct net_device *dev, u32 orig_mtu)
{
	unsigned int hash = nh_dev_hashfn(dev->ifindex);
	struct net *net = dev_net(dev);
	struct hlist_head *head = &net->nexthop.devhash[hash];
	struct hlist_node *n;
	struct nh_info *nhi;

	hlist_for_each_entry_safe(nhi, n, head, dev_hash) {
		if (nhi->fib_nhc.nhc_dev == dev) {
			if (nhi->family == AF_INET)
				fib_nhc_update_mtu(&nhi->fib_nhc, dev->mtu,
						   orig_mtu);
		}
	}
}
static int nh_netdev_event(struct notifier_block *this,
			   unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct netdev_notifier_info_ext *info_ext;

	switch (event) {
	case NETDEV_DOWN:
	case NETDEV_UNREGISTER:
		nexthop_flush_dev(dev, event);
		break;
	case NETDEV_CHANGE:
		if (!(dev_get_flags(dev) & (IFF_RUNNING | IFF_LOWER_UP)))
			nexthop_flush_dev(dev, event);
		break;
	case NETDEV_CHANGEMTU:
		info_ext = ptr;
		nexthop_sync_mtu(dev, info_ext->ext.mtu);
		rt_cache_flush(dev_net(dev));
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block nh_netdev_notifier = {
	.notifier_call = nh_netdev_event,
};
static int nexthops_dump(struct net *net, struct notifier_block *nb,
			 enum nexthop_event_type event_type,
			 struct netlink_ext_ack *extack)
{
	struct rb_root *root = &net->nexthop.rb_root;
	struct rb_node *node;
	int err = 0;

	for (node = rb_first(root); node; node = rb_next(node)) {
		struct nexthop *nh;

		nh = rb_entry(node, struct nexthop, rb_node);
		err = call_nexthop_notifier(nb, net, event_type, nh, extack);
		if (err)
			break;
	}

	return err;
}
int register_nexthop_notifier(struct net *net, struct notifier_block *nb,
			      struct netlink_ext_ack *extack)
{
	int err;

	rtnl_lock();
	err = nexthops_dump(net, nb, NEXTHOP_EVENT_REPLACE, extack);
	if (err)
		goto unlock;
	err = blocking_notifier_chain_register(&net->nexthop.notifier_chain,
					       nb);
unlock:
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(register_nexthop_notifier);
int __unregister_nexthop_notifier(struct net *net, struct notifier_block *nb)
{
	int err;

	err = blocking_notifier_chain_unregister(&net->nexthop.notifier_chain,
						 nb);
	if (!err)
		nexthops_dump(net, nb, NEXTHOP_EVENT_DEL, NULL);
	return err;
}
EXPORT_SYMBOL(__unregister_nexthop_notifier);
int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb)
{
	int err;

	rtnl_lock();
	err = __unregister_nexthop_notifier(net, nb);
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(unregister_nexthop_notifier);
void nexthop_set_hw_flags(struct net *net, u32 id, bool offload, bool trap)
{
	struct nexthop *nexthop;

	rcu_read_lock();

	nexthop = nexthop_find_by_id(net, id);
	if (!nexthop)
		goto out;

	nexthop->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP);
	if (offload)
		nexthop->nh_flags |= RTNH_F_OFFLOAD;
	if (trap)
		nexthop->nh_flags |= RTNH_F_TRAP;

out:
	rcu_read_unlock();
}
EXPORT_SYMBOL(nexthop_set_hw_flags);
void nexthop_bucket_set_hw_flags(struct net *net, u32 id, u16 bucket_index,
				 bool offload, bool trap)
{
	struct nh_res_table *res_table;
	struct nh_res_bucket *bucket;
	struct nexthop *nexthop;
	struct nh_group *nhg;

	rcu_read_lock();

	nexthop = nexthop_find_by_id(net, id);
	if (!nexthop || !nexthop->is_group)
		goto out;

	nhg = rcu_dereference(nexthop->nh_grp);
	if (!nhg->resilient)
		goto out;

	if (bucket_index >= nhg->res_table->num_nh_buckets)
		goto out;

	res_table = rcu_dereference(nhg->res_table);
	bucket = &res_table->nh_buckets[bucket_index];
	bucket->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP);
	if (offload)
		bucket->nh_flags |= RTNH_F_OFFLOAD;
	if (trap)
		bucket->nh_flags |= RTNH_F_TRAP;

out:
	rcu_read_unlock();
}
EXPORT_SYMBOL(nexthop_bucket_set_hw_flags);
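
/* Note (summary added for clarity, not from the original sources):
 * nexthop_set_hw_flags() and nexthop_bucket_set_hw_flags() are intended to
 * be called by drivers from their nexthop notifier handlers to reflect
 * offload/trap state back into the kernel's view. Both only flip flag bits
 * under rcu_read_lock() and silently return when the id no longer resolves
 * to a suitable nexthop or bucket.
 */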
void nexthop_res_grp_activity_update(struct net *net, u32 id, u16 num_buckets,
				     unsigned long *activity)
{
	struct nh_res_table *res_table;
	struct nexthop *nexthop;
	struct nh_group *nhg;
	u16 i;

	rcu_read_lock();

	nexthop = nexthop_find_by_id(net, id);
	if (!nexthop || !nexthop->is_group)
		goto out;

	nhg = rcu_dereference(nexthop->nh_grp);
	if (!nhg->resilient)
		goto out;

	/* Instead of silently ignoring some buckets, demand that the sizes
	 * be the same.
	 */
	res_table = rcu_dereference(nhg->res_table);
	if (num_buckets != res_table->num_nh_buckets)
		goto out;

	for (i = 0; i < num_buckets; i++) {
		if (test_bit(i, activity))
			nh_res_bucket_set_busy(&res_table->nh_buckets[i]);
	}

out:
	rcu_read_unlock();
}
EXPORT_SYMBOL(nexthop_res_grp_activity_update);
static void __net_exit nexthop_net_exit_batch_rtnl(struct list_head *net_list,
						   struct list_head *dev_to_kill)
{
	struct net *net;

	list_for_each_entry(net, net_list, exit_list)
		flush_all_nexthops(net);
}
static void __net_exit nexthop_net_exit(struct net *net)
{
	kfree(net->nexthop.devhash);
	net->nexthop.devhash = NULL;
}
static int __net_init nexthop_net_init(struct net *net)
{
	size_t sz = sizeof(struct hlist_head) * NH_DEV_HASHSIZE;

	net->nexthop.rb_root = RB_ROOT;
	net->nexthop.devhash = kzalloc(sz, GFP_KERNEL);
	if (!net->nexthop.devhash)
		return -ENOMEM;
	BLOCKING_INIT_NOTIFIER_HEAD(&net->nexthop.notifier_chain);

	return 0;
}
static struct pernet_operations nexthop_net_ops = {
	.init = nexthop_net_init,
	.exit = nexthop_net_exit,
	.exit_batch_rtnl = nexthop_net_exit_batch_rtnl,
};
static const struct rtnl_msg_handler nexthop_rtnl_msg_handlers[] __initconst = {
	{.msgtype = RTM_NEWNEXTHOP, .doit = rtm_new_nexthop},
	{.msgtype = RTM_DELNEXTHOP, .doit = rtm_del_nexthop},
	{.msgtype = RTM_GETNEXTHOP, .doit = rtm_get_nexthop,
	 .dumpit = rtm_dump_nexthop},
	{.msgtype = RTM_GETNEXTHOPBUCKET, .doit = rtm_get_nexthop_bucket,
	 .dumpit = rtm_dump_nexthop_bucket},
	{.protocol = PF_INET, .msgtype = RTM_NEWNEXTHOP,
	 .doit = rtm_new_nexthop},
	{.protocol = PF_INET, .msgtype = RTM_GETNEXTHOP,
	 .dumpit = rtm_dump_nexthop},
	{.protocol = PF_INET6, .msgtype = RTM_NEWNEXTHOP,
	 .doit = rtm_new_nexthop},
	{.protocol = PF_INET6, .msgtype = RTM_GETNEXTHOP,
	 .dumpit = rtm_dump_nexthop},
};
static int __init nexthop_init(void)
{
	register_pernet_subsys(&nexthop_net_ops);

	register_netdevice_notifier(&nh_netdev_notifier);

	rtnl_register_many(nexthop_rtnl_msg_handlers);

	return 0;
}
subsys_initcall(nexthop_init);