// SPDX-License-Identifier: GPL-2.0-only
/*
 * VXLAN: Virtual eXtensible Local Area Network
 *
 * Copyright (c) 2012-2013 Vyatta Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/udp.h>
#include <linux/igmp.h>
#include <linux/if_ether.h>
#include <linux/ethtool.h>
#include <net/arp.h>
#include <net/ndisc.h>
#include <net/gro.h>
#include <net/ipv6_stubs.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/rtnetlink.h>
#include <net/inet_ecn.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/tun_proto.h>
#include <net/vxlan.h>
#include <net/nexthop.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ip6_tunnel.h>
#include <net/ip6_checksum.h>
#endif

#include "vxlan_private.h"

#define VXLAN_VERSION	"0.1"

#define FDB_AGE_DEFAULT 300 /* 5 min */
#define FDB_AGE_INTERVAL (10 * HZ)	/* rescan interval */

/* UDP port for VXLAN traffic.
 * The IANA assigned port is 4789, but the Linux default is 8472
 * for compatibility with early adopters.
 */
static unsigned short vxlan_port __read_mostly = 8472;
module_param_named(udp_port, vxlan_port, ushort, 0444);
MODULE_PARM_DESC(udp_port, "Destination UDP port");

static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");

unsigned int vxlan_net_id;

const u8 all_zeros_mac[ETH_ALEN + 2];
static struct rtnl_link_ops vxlan_link_ops;

static int vxlan_sock_add(struct vxlan_dev *vxlan);

static void vxlan_vs_del_dev(struct vxlan_dev *vxlan);

/* salt for hash table */
static u32 vxlan_salt __read_mostly;

static inline bool vxlan_collect_metadata(struct vxlan_sock *vs)
{
	return vs->flags & VXLAN_F_COLLECT_METADATA ||
	       ip_tunnel_collect_metadata();
}

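/* Note: ip_tunnel_collect_metadata() is a global switch that is enabled
 * while any metadata collector (e.g. an lwtunnel/tc user) is active, so
 * metadata may be collected even on sockets that were not created with
 * VXLAN_F_COLLECT_METADATA.
 */
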
/* Find VXLAN socket based on network namespace, address family, UDP port,
 * enabled unshareable flags and socket device binding (see l3mdev with
 * non-default VRF).
 */
static struct vxlan_sock *vxlan_find_sock(struct net *net, sa_family_t family,
					  __be16 port, u32 flags, int ifindex)
{
	struct vxlan_sock *vs;

	flags &= VXLAN_F_RCV_FLAGS;

	hlist_for_each_entry_rcu(vs, vs_head(net, port), hlist) {
		if (inet_sk(vs->sock->sk)->inet_sport == port &&
		    vxlan_get_sk_family(vs) == family &&
		    vs->flags == flags &&
		    vs->sock->sk->sk_bound_dev_if == ifindex)
			return vs;
	}
	return NULL;
}

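/* Only the flags in VXLAN_F_RCV_FLAGS influence receive-side processing,
 * which is why they are masked above: two devices may share one UDP socket
 * iff namespace, family, port, device binding and those flags all match.
 */
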
static struct vxlan_dev *vxlan_vs_find_vni(struct vxlan_sock *vs,
					   int ifindex, __be32 vni,
					   struct vxlan_vni_node **vninode)
{
	struct vxlan_vni_node *vnode;
	struct vxlan_dev_node *node;

	/* For flow based devices, map all packets to VNI 0 */
	if (vs->flags & VXLAN_F_COLLECT_METADATA &&
	    !(vs->flags & VXLAN_F_VNIFILTER))
		vni = 0;

	hlist_for_each_entry_rcu(node, vni_head(vs, vni), hlist) {
		if (!node->vxlan)
			continue;
		vnode = NULL;
		if (node->vxlan->cfg.flags & VXLAN_F_VNIFILTER) {
			vnode = vxlan_vnifilter_lookup(node->vxlan, vni);
			if (!vnode)
				continue;
		} else if (node->vxlan->default_dst.remote_vni != vni) {
			continue;
		}

		if (IS_ENABLED(CONFIG_IPV6)) {
			const struct vxlan_config *cfg = &node->vxlan->cfg;

			if ((cfg->flags & VXLAN_F_IPV6_LINKLOCAL) &&
			    cfg->remote_ifindex != ifindex)
				continue;
		}

		if (vninode)
			*vninode = vnode;
		return node->vxlan;
	}

	return NULL;
}

/* Look up VNI in a per net namespace table */
static struct vxlan_dev *vxlan_find_vni(struct net *net, int ifindex,
					__be32 vni, sa_family_t family,
					__be16 port, u32 flags)
{
	struct vxlan_sock *vs;

	vs = vxlan_find_sock(net, family, port, flags, ifindex);
	if (!vs)
		return NULL;

	return vxlan_vs_find_vni(vs, ifindex, vni, NULL);
}

/* Fill in neighbour message in skbuff. */
static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
			  const struct vxlan_fdb *fdb,
			  u32 portid, u32 seq, int type, unsigned int flags,
			  const struct vxlan_rdst *rdst)
{
	unsigned long now = jiffies;
	struct nda_cacheinfo ci;
	bool send_ip, send_eth;
	struct nlmsghdr *nlh;
	struct nexthop *nh;
	struct ndmsg *ndm;
	int nh_family;
	u32 nh_id;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	ndm = nlmsg_data(nlh);
	memset(ndm, 0, sizeof(*ndm));

	send_eth = send_ip = true;

	rcu_read_lock();
	nh = rcu_dereference(fdb->nh);
	if (nh) {
		nh_family = nexthop_get_family(nh);
		nh_id = nh->id;
	}
	rcu_read_unlock();

	if (type == RTM_GETNEIGH) {
		if (rdst) {
			send_ip = !vxlan_addr_any(&rdst->remote_ip);
			ndm->ndm_family = send_ip ? rdst->remote_ip.sa.sa_family : AF_INET;
		} else if (nh) {
			ndm->ndm_family = nh_family;
		}
		send_eth = !is_zero_ether_addr(fdb->eth_addr);
	} else
		ndm->ndm_family	= AF_BRIDGE;
	ndm->ndm_state = fdb->state;
	ndm->ndm_ifindex = vxlan->dev->ifindex;
	ndm->ndm_flags = fdb->flags;
	if (rdst && rdst->offloaded)
		ndm->ndm_flags |= NTF_OFFLOADED;
	ndm->ndm_type = RTN_UNICAST;

	if (!net_eq(dev_net(vxlan->dev), vxlan->net) &&
	    nla_put_s32(skb, NDA_LINK_NETNSID,
			peernet2id(dev_net(vxlan->dev), vxlan->net)))
		goto nla_put_failure;

	if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr))
		goto nla_put_failure;
	if (nh) {
		if (nla_put_u32(skb, NDA_NH_ID, nh_id))
			goto nla_put_failure;
	} else if (rdst) {
		if (send_ip && vxlan_nla_put_addr(skb, NDA_DST,
						  &rdst->remote_ip))
			goto nla_put_failure;

		if (rdst->remote_port &&
		    rdst->remote_port != vxlan->cfg.dst_port &&
		    nla_put_be16(skb, NDA_PORT, rdst->remote_port))
			goto nla_put_failure;
		if (rdst->remote_vni != vxlan->default_dst.remote_vni &&
		    nla_put_u32(skb, NDA_VNI, be32_to_cpu(rdst->remote_vni)))
			goto nla_put_failure;
		if (rdst->remote_ifindex &&
		    nla_put_u32(skb, NDA_IFINDEX, rdst->remote_ifindex))
			goto nla_put_failure;
	}

	if ((vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) && fdb->vni &&
	    nla_put_u32(skb, NDA_SRC_VNI,
			be32_to_cpu(fdb->vni)))
		goto nla_put_failure;

	ci.ndm_used	 = jiffies_to_clock_t(now - fdb->used);
	ci.ndm_confirmed = 0;
	ci.ndm_updated	 = jiffies_to_clock_t(now - fdb->updated);
	ci.ndm_refcnt	 = 0;

	if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

static inline size_t vxlan_nlmsg_size(void)
{
	return NLMSG_ALIGN(sizeof(struct ndmsg))
		+ nla_total_size(ETH_ALEN) /* NDA_LLADDR */
		+ nla_total_size(sizeof(struct in6_addr)) /* NDA_DST */
		+ nla_total_size(sizeof(__be16)) /* NDA_PORT */
		+ nla_total_size(sizeof(__be32)) /* NDA_VNI */
		+ nla_total_size(sizeof(__u32)) /* NDA_IFINDEX */
		+ nla_total_size(sizeof(__s32)) /* NDA_LINK_NETNSID */
		+ nla_total_size(sizeof(struct nda_cacheinfo));
}

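/* Keep this in sync with vxlan_fdb_info(): any attribute emitted there
 * must be accounted for here, otherwise nlmsg_new() in
 * __vxlan_fdb_notify() under-allocates and the WARN_ON(err == -EMSGSIZE)
 * below triggers.
 */
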
static void __vxlan_fdb_notify(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb,
			       struct vxlan_rdst *rd, int type)
{
	struct net *net = dev_net(vxlan->dev);
	struct sk_buff *skb;
	int err = -ENOBUFS;

	skb = nlmsg_new(vxlan_nlmsg_size(), GFP_ATOMIC);
	if (skb == NULL)
		goto errout;

	err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0, rd);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in vxlan_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}

	rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
	return;
errout:
	rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
}

static void vxlan_fdb_switchdev_notifier_info(const struct vxlan_dev *vxlan,
					      const struct vxlan_fdb *fdb,
					      const struct vxlan_rdst *rd,
					      struct netlink_ext_ack *extack,
					      struct switchdev_notifier_vxlan_fdb_info *fdb_info)
{
	fdb_info->info.dev = vxlan->dev;
	fdb_info->info.extack = extack;
	fdb_info->remote_ip = rd->remote_ip;
	fdb_info->remote_port = rd->remote_port;
	fdb_info->remote_vni = rd->remote_vni;
	fdb_info->remote_ifindex = rd->remote_ifindex;
	memcpy(fdb_info->eth_addr, fdb->eth_addr, ETH_ALEN);
	fdb_info->vni = fdb->vni;
	fdb_info->offloaded = rd->offloaded;
	fdb_info->added_by_user = fdb->flags & NTF_VXLAN_ADDED_BY_USER;
}

static int vxlan_fdb_switchdev_call_notifiers(struct vxlan_dev *vxlan,
					      struct vxlan_fdb *fdb,
					      struct vxlan_rdst *rd,
					      bool adding,
					      struct netlink_ext_ack *extack)
{
	struct switchdev_notifier_vxlan_fdb_info info;
	enum switchdev_notifier_type notifier_type;
	int ret;

	if (WARN_ON(!rd))
		return 0;

	notifier_type = adding ? SWITCHDEV_VXLAN_FDB_ADD_TO_DEVICE
			       : SWITCHDEV_VXLAN_FDB_DEL_TO_DEVICE;
	vxlan_fdb_switchdev_notifier_info(vxlan, fdb, rd, NULL, &info);
	ret = call_switchdev_notifiers(notifier_type, vxlan->dev,
				       &info.info, extack);
	return notifier_to_errno(ret);
}

static int vxlan_fdb_notify(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb,
			    struct vxlan_rdst *rd, int type, bool swdev_notify,
			    struct netlink_ext_ack *extack)
{
	int err;

	if (swdev_notify && rd) {
		switch (type) {
		case RTM_NEWNEIGH:
			err = vxlan_fdb_switchdev_call_notifiers(vxlan, fdb, rd,
								 true, extack);
			if (err)
				return err;
			break;
		case RTM_DELNEIGH:
			vxlan_fdb_switchdev_call_notifiers(vxlan, fdb, rd,
							   false, extack);
			break;
		}
	}

	__vxlan_fdb_notify(vxlan, fdb, rd, type);
	return 0;
}

static void vxlan_ip_miss(struct net_device *dev, union vxlan_addr *ipa)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_fdb f = {
		.state = NUD_STALE,
	};
	struct vxlan_rdst remote = {
		.remote_ip = *ipa, /* goes to NDA_DST */
		.remote_vni = cpu_to_be32(VXLAN_N_VID),
	};

	vxlan_fdb_notify(vxlan, &f, &remote, RTM_GETNEIGH, true, NULL);
}

static void vxlan_fdb_miss(struct vxlan_dev *vxlan, const u8 eth_addr[ETH_ALEN])
{
	struct vxlan_fdb f = {
		.state = NUD_STALE,
	};
	struct vxlan_rdst remote = { };

	memcpy(f.eth_addr, eth_addr, ETH_ALEN);

	vxlan_fdb_notify(vxlan, &f, &remote, RTM_GETNEIGH, true, NULL);
}

/* Hash Ethernet address */
static u32 eth_hash(const unsigned char *addr)
{
	u64 value = get_unaligned((u64 *)addr);

	/* only want 6 bytes */
#ifdef __BIG_ENDIAN
	value >>= 16;
#else
	value <<= 16;
#endif
	return hash_64(value, FDB_HASH_BITS);
}

u32 eth_vni_hash(const unsigned char *addr, __be32 vni)
{
	/* use 1 byte of OUI and 3 bytes of NIC */
	u32 key = get_unaligned((u32 *)(addr + 2));

	return jhash_2words(key, vni, vxlan_salt) & (FDB_HASH_SIZE - 1);
}

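/* vxlan_salt perturbs the bucket choice so that remote senders cannot
 * deliberately craft MAC/VNI pairs that all collide into one FDB chain.
 */
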
u32 fdb_head_index(struct vxlan_dev *vxlan, const u8 *mac, __be32 vni)
{
	if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA)
		return eth_vni_hash(mac, vni);
	else
		return eth_hash(mac);
}

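/* In collect-metadata mode a single device serves many VNIs, so the same
 * MAC may legitimately exist under several VNIs; folding the VNI into the
 * hash keeps those entries in separate chains.
 */
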
/* Hash chain to use given mac address */
static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan,
						const u8 *mac, __be32 vni)
{
	return &vxlan->fdb_head[fdb_head_index(vxlan, mac, vni)];
}

/* Look up Ethernet address in forwarding table */
static struct vxlan_fdb *__vxlan_find_mac(struct vxlan_dev *vxlan,
					  const u8 *mac, __be32 vni)
{
	struct hlist_head *head = vxlan_fdb_head(vxlan, mac, vni);
	struct vxlan_fdb *f;

	hlist_for_each_entry_rcu(f, head, hlist) {
		if (ether_addr_equal(mac, f->eth_addr)) {
			if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) {
				if (vni == f->vni)
					return f;
			} else {
				return f;
			}
		}
	}

	return NULL;
}

static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan,
					const u8 *mac, __be32 vni)
{
	struct vxlan_fdb *f;

	f = __vxlan_find_mac(vxlan, mac, vni);
	if (f && f->used != jiffies)
		f->used = jiffies;

	return f;
}

/* caller should hold vxlan->hash_lock */
static struct vxlan_rdst *vxlan_fdb_find_rdst(struct vxlan_fdb *f,
					      union vxlan_addr *ip, __be16 port,
					      __be32 vni, __u32 ifindex)
{
	struct vxlan_rdst *rd;

	list_for_each_entry(rd, &f->remotes, list) {
		if (vxlan_addr_equal(&rd->remote_ip, ip) &&
		    rd->remote_port == port &&
		    rd->remote_vni == vni &&
		    rd->remote_ifindex == ifindex)
			return rd;
	}

	return NULL;
}

int vxlan_fdb_find_uc(struct net_device *dev, const u8 *mac, __be32 vni,
		      struct switchdev_notifier_vxlan_fdb_info *fdb_info)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	u8 eth_addr[ETH_ALEN + 2] = { 0 };
	struct vxlan_rdst *rdst;
	struct vxlan_fdb *f;
	int rc = 0;

	if (is_multicast_ether_addr(mac) ||
	    is_zero_ether_addr(mac))
		return -EINVAL;

	ether_addr_copy(eth_addr, mac);

	rcu_read_lock();

	f = __vxlan_find_mac(vxlan, eth_addr, vni);
	if (!f) {
		rc = -ENOENT;
		goto out;
	}

	rdst = first_remote_rcu(f);
	vxlan_fdb_switchdev_notifier_info(vxlan, f, rdst, NULL, fdb_info);

out:
	rcu_read_unlock();
	return rc;
}
EXPORT_SYMBOL_GPL(vxlan_fdb_find_uc);

static int vxlan_fdb_notify_one(struct notifier_block *nb,
				const struct vxlan_dev *vxlan,
				const struct vxlan_fdb *f,
				const struct vxlan_rdst *rdst,
				struct netlink_ext_ack *extack)
{
	struct switchdev_notifier_vxlan_fdb_info fdb_info;
	int rc;

	vxlan_fdb_switchdev_notifier_info(vxlan, f, rdst, extack, &fdb_info);
	rc = nb->notifier_call(nb, SWITCHDEV_VXLAN_FDB_ADD_TO_DEVICE,
			       &fdb_info);
	return notifier_to_errno(rc);
}

int vxlan_fdb_replay(const struct net_device *dev, __be32 vni,
		     struct notifier_block *nb,
		     struct netlink_ext_ack *extack)
{
	struct vxlan_dev *vxlan;
	struct vxlan_rdst *rdst;
	struct vxlan_fdb *f;
	unsigned int h;
	int rc = 0;

	if (!netif_is_vxlan(dev))
		return -EINVAL;
	vxlan = netdev_priv(dev);

	for (h = 0; h < FDB_HASH_SIZE; ++h) {
		spin_lock_bh(&vxlan->hash_lock[h]);
		hlist_for_each_entry(f, &vxlan->fdb_head[h], hlist) {
			if (f->vni == vni) {
				list_for_each_entry(rdst, &f->remotes, list) {
					rc = vxlan_fdb_notify_one(nb, vxlan,
								  f, rdst,
								  extack);
					if (rc)
						goto unlock;
				}
			}
		}
		spin_unlock_bh(&vxlan->hash_lock[h]);
	}
	return 0;

unlock:
	spin_unlock_bh(&vxlan->hash_lock[h]);
	return rc;
}
EXPORT_SYMBOL_GPL(vxlan_fdb_replay);

void vxlan_fdb_clear_offload(const struct net_device *dev, __be32 vni)
{
	struct vxlan_dev *vxlan;
	struct vxlan_rdst *rdst;
	struct vxlan_fdb *f;
	unsigned int h;

	if (!netif_is_vxlan(dev))
		return;
	vxlan = netdev_priv(dev);

	for (h = 0; h < FDB_HASH_SIZE; ++h) {
		spin_lock_bh(&vxlan->hash_lock[h]);
		hlist_for_each_entry(f, &vxlan->fdb_head[h], hlist)
			if (f->vni == vni)
				list_for_each_entry(rdst, &f->remotes, list)
					rdst->offloaded = false;
		spin_unlock_bh(&vxlan->hash_lock[h]);
	}
}
EXPORT_SYMBOL_GPL(vxlan_fdb_clear_offload);

/* Replace destination of unicast mac */
static int vxlan_fdb_replace(struct vxlan_fdb *f,
			     union vxlan_addr *ip, __be16 port, __be32 vni,
			     __u32 ifindex, struct vxlan_rdst *oldrd)
{
	struct vxlan_rdst *rd;

	rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex);
	if (rd)
		return 0;

	rd = list_first_entry_or_null(&f->remotes, struct vxlan_rdst, list);
	if (!rd)
		return 0;

	*oldrd = *rd;
	dst_cache_reset(&rd->dst_cache);
	rd->remote_ip = *ip;
	rd->remote_port = port;
	rd->remote_vni = vni;
	rd->remote_ifindex = ifindex;
	rd->offloaded = false;
	return 1;
}

/* Add/update destinations for multicast */
static int vxlan_fdb_append(struct vxlan_fdb *f,
			    union vxlan_addr *ip, __be16 port, __be32 vni,
			    __u32 ifindex, struct vxlan_rdst **rdp)
{
	struct vxlan_rdst *rd;

	rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex);
	if (rd)
		return 0;

	rd = kmalloc(sizeof(*rd), GFP_ATOMIC);
	if (rd == NULL)
		return -ENOMEM;

	if (dst_cache_init(&rd->dst_cache, GFP_ATOMIC)) {
		kfree(rd);
		return -ENOMEM;
	}

	rd->remote_ip = *ip;
	rd->remote_port = port;
	rd->offloaded = false;
	rd->remote_vni = vni;
	rd->remote_ifindex = ifindex;

	list_add_tail_rcu(&rd->list, &f->remotes);

	*rdp = rd;
	return 1;
}

static bool vxlan_parse_gpe_proto(struct vxlanhdr *hdr, __be16 *protocol)
{
	struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)hdr;

	/* Need to have Next Protocol set for interfaces in GPE mode. */
	if (!gpe->np_applied)
		return false;
	/* "The initial version is 0. If a receiver does not support the
	 * version indicated it MUST drop the packet.
	 */
	if (gpe->version != 0)
		return false;
	/* "When the O bit is set to 1, the packet is an OAM packet and OAM
	 * processing MUST occur." However, we don't implement OAM
	 * processing, thus drop the packet.
	 */
	if (gpe->oam_flag)
		return false;

	*protocol = tun_p_to_eth_p(gpe->next_protocol);
	if (!*protocol)
		return false;

	return true;
}

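/* VXLAN-GPE (draft-ietf-nvo3-vxlan-gpe) replaces the implicit inner
 * Ethernet assumption of plain VXLAN with an explicit Next Protocol
 * field, allowing e.g. bare IPv4/IPv6 payloads to be carried.
 */
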
static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb,
					  unsigned int off,
					  struct vxlanhdr *vh, size_t hdrlen,
					  __be32 vni_field,
					  struct gro_remcsum *grc,
					  bool nopartial)
{
	size_t start, offset;

	if (skb->remcsum_offload)
		return vh;

	if (!NAPI_GRO_CB(skb)->csum_valid)
		return NULL;

	start = vxlan_rco_start(vni_field);
	offset = start + vxlan_rco_offset(vni_field);

	vh = skb_gro_remcsum_process(skb, (void *)vh, off, hdrlen,
				     start, offset, grc, nopartial);

	skb->remcsum_offload = 1;

	return vh;
}

*vxlan_gro_prepare_receive(struct sock
*sk
,
678 struct list_head
*head
,
680 struct gro_remcsum
*grc
)
683 struct vxlanhdr
*vh
, *vh2
;
684 unsigned int hlen
, off_vx
;
685 struct vxlan_sock
*vs
= rcu_dereference_sk_user_data(sk
);
688 skb_gro_remcsum_init(grc
);
690 off_vx
= skb_gro_offset(skb
);
691 hlen
= off_vx
+ sizeof(*vh
);
692 vh
= skb_gro_header(skb
, hlen
, off_vx
);
696 skb_gro_postpull_rcsum(skb
, vh
, sizeof(struct vxlanhdr
));
698 flags
= vh
->vx_flags
;
700 if ((flags
& VXLAN_HF_RCO
) && (vs
->flags
& VXLAN_F_REMCSUM_RX
)) {
701 vh
= vxlan_gro_remcsum(skb
, off_vx
, vh
, sizeof(struct vxlanhdr
),
704 VXLAN_F_REMCSUM_NOPARTIAL
));
710 skb_gro_pull(skb
, sizeof(struct vxlanhdr
)); /* pull vxlan header */
712 list_for_each_entry(p
, head
, list
) {
713 if (!NAPI_GRO_CB(p
)->same_flow
)
716 vh2
= (struct vxlanhdr
*)(p
->data
+ off_vx
);
717 if (vh
->vx_flags
!= vh2
->vx_flags
||
718 vh
->vx_vni
!= vh2
->vx_vni
) {
719 NAPI_GRO_CB(p
)->same_flow
= 0;
static struct sk_buff *vxlan_gro_receive(struct sock *sk,
					 struct list_head *head,
					 struct sk_buff *skb)
{
	struct sk_buff *pp = NULL;
	struct gro_remcsum grc;
	int flush = 1;

	if (vxlan_gro_prepare_receive(sk, head, skb, &grc)) {
		pp = call_gro_receive(eth_gro_receive, head, skb);
		flush = 0;
	}
	skb_gro_flush_final_remcsum(skb, pp, flush, &grc);
	return pp;
}

static struct sk_buff *vxlan_gpe_gro_receive(struct sock *sk,
					     struct list_head *head,
					     struct sk_buff *skb)
{
	const struct packet_offload *ptype;
	struct sk_buff *pp = NULL;
	struct gro_remcsum grc;
	int flush = 1;
	struct vxlanhdr *vh;
	__be16 protocol;

	vh = vxlan_gro_prepare_receive(sk, head, skb, &grc);
	if (vh) {
		if (!vxlan_parse_gpe_proto(vh, &protocol))
			goto out;
		ptype = gro_find_receive_by_type(protocol);
		if (!ptype)
			goto out;
		pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb);
		flush = 0;
	}
out:
	skb_gro_flush_final_remcsum(skb, pp, flush, &grc);
	return pp;
}

static int vxlan_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff)
{
	/* Sets 'skb->inner_mac_header' since we are always called with
	 * 'skb->encapsulation' set.
	 */
	return eth_gro_complete(skb, nhoff + sizeof(struct vxlanhdr));
}

static int vxlan_gpe_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff)
{
	struct vxlanhdr *vh = (struct vxlanhdr *)(skb->data + nhoff);
	const struct packet_offload *ptype;
	int err = -ENOSYS;
	__be16 protocol;

	if (!vxlan_parse_gpe_proto(vh, &protocol))
		return err;
	ptype = gro_find_complete_by_type(protocol);
	if (ptype)
		err = ptype->callbacks.gro_complete(skb, nhoff + sizeof(struct vxlanhdr));
	return err;
}

static struct vxlan_fdb *vxlan_fdb_alloc(struct vxlan_dev *vxlan, const u8 *mac,
					 __u16 state, __be32 src_vni,
					 __u16 ndm_flags)
{
	struct vxlan_fdb *f;

	f = kmalloc(sizeof(*f), GFP_ATOMIC);
	if (!f)
		return NULL;
	f->state = state;
	f->flags = ndm_flags;
	f->updated = f->used = jiffies;
	f->vni = src_vni;
	f->nh = NULL;
	RCU_INIT_POINTER(f->vdev, vxlan);
	INIT_LIST_HEAD(&f->nh_list);
	INIT_LIST_HEAD(&f->remotes);
	memcpy(f->eth_addr, mac, ETH_ALEN);

	return f;
}

static void vxlan_fdb_insert(struct vxlan_dev *vxlan, const u8 *mac,
			     __be32 src_vni, struct vxlan_fdb *f)
{
	++vxlan->addrcnt;
	hlist_add_head_rcu(&f->hlist,
			   vxlan_fdb_head(vxlan, mac, src_vni));
}

static int vxlan_fdb_nh_update(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb,
			       u32 nhid, struct netlink_ext_ack *extack)
{
	struct nexthop *old_nh = rtnl_dereference(fdb->nh);
	struct nexthop *nh = NULL;
	int err = -EINVAL;

	if (old_nh && old_nh->id == nhid)
		return 0;

	nh = nexthop_find_by_id(vxlan->net, nhid);
	if (!nh) {
		NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
		goto err_inval;
	}

	if (!nexthop_get(nh)) {
		NL_SET_ERR_MSG(extack, "Nexthop has been deleted");
		nh = NULL;
		goto err_inval;
	}
	if (!nexthop_is_fdb(nh)) {
		NL_SET_ERR_MSG(extack, "Nexthop is not a fdb nexthop");
		goto err_inval;
	}

	if (!nexthop_is_multipath(nh)) {
		NL_SET_ERR_MSG(extack, "Nexthop is not a multipath group");
		goto err_inval;
	}

	/* check nexthop group family */
	switch (vxlan->default_dst.remote_ip.sa.sa_family) {
	case AF_INET:
		if (!nexthop_has_v4(nh)) {
			err = -EAFNOSUPPORT;
			NL_SET_ERR_MSG(extack, "Nexthop group family not supported");
			goto err_inval;
		}
		break;
	case AF_INET6:
		if (nexthop_has_v4(nh)) {
			err = -EAFNOSUPPORT;
			NL_SET_ERR_MSG(extack, "Nexthop group family not supported");
			goto err_inval;
		}
	}

	if (old_nh) {
		list_del_rcu(&fdb->nh_list);
		nexthop_put(old_nh);
	}
	rcu_assign_pointer(fdb->nh, nh);
	list_add_tail_rcu(&fdb->nh_list, &nh->fdb_list);
	return 1;

err_inval:
	if (nh)
		nexthop_put(nh);
	return err;
}

int vxlan_fdb_create(struct vxlan_dev *vxlan,
		     const u8 *mac, union vxlan_addr *ip,
		     __u16 state, __be16 port, __be32 src_vni,
		     __be32 vni, __u32 ifindex, __u16 ndm_flags,
		     u32 nhid, struct vxlan_fdb **fdb,
		     struct netlink_ext_ack *extack)
{
	struct vxlan_rdst *rd = NULL;
	struct vxlan_fdb *f;
	int rc;

	if (vxlan->cfg.addrmax &&
	    vxlan->addrcnt >= vxlan->cfg.addrmax)
		return -ENOSPC;

	netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip);
	f = vxlan_fdb_alloc(vxlan, mac, state, src_vni, ndm_flags);
	if (!f)
		return -ENOMEM;

	if (nhid)
		rc = vxlan_fdb_nh_update(vxlan, f, nhid, extack);
	else
		rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd);
	if (rc < 0)
		goto errout;

	*fdb = f;

	return 0;

errout:
	kfree(f);
	return rc;
}

static void __vxlan_fdb_free(struct vxlan_fdb *f)
{
	struct vxlan_rdst *rd, *nd;
	struct nexthop *nh;

	nh = rcu_dereference_raw(f->nh);
	if (nh) {
		rcu_assign_pointer(f->nh, NULL);
		rcu_assign_pointer(f->vdev, NULL);
		nexthop_put(nh);
	}

	list_for_each_entry_safe(rd, nd, &f->remotes, list) {
		dst_cache_destroy(&rd->dst_cache);
		kfree(rd);
	}
	kfree(f);
}

static void vxlan_fdb_free(struct rcu_head *head)
{
	struct vxlan_fdb *f = container_of(head, struct vxlan_fdb, rcu);

	__vxlan_fdb_free(f);
}

static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f,
			      bool do_notify, bool swdev_notify)
{
	struct vxlan_rdst *rd;

	netdev_dbg(vxlan->dev, "delete %pM\n", f->eth_addr);

	--vxlan->addrcnt;
	if (do_notify) {
		if (rcu_access_pointer(f->nh))
			vxlan_fdb_notify(vxlan, f, NULL, RTM_DELNEIGH,
					 swdev_notify, NULL);
		else
			list_for_each_entry(rd, &f->remotes, list)
				vxlan_fdb_notify(vxlan, f, rd, RTM_DELNEIGH,
						 swdev_notify, NULL);
	}

	hlist_del_rcu(&f->hlist);
	list_del_rcu(&f->nh_list);
	call_rcu(&f->rcu, vxlan_fdb_free);
}

static void vxlan_dst_free(struct rcu_head *head)
{
	struct vxlan_rdst *rd = container_of(head, struct vxlan_rdst, rcu);

	dst_cache_destroy(&rd->dst_cache);
	kfree(rd);
}

static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan,
				     union vxlan_addr *ip,
				     __u16 state, __u16 flags,
				     __be16 port, __be32 vni,
				     __u32 ifindex, __u16 ndm_flags,
				     struct vxlan_fdb *f, u32 nhid,
				     bool swdev_notify,
				     struct netlink_ext_ack *extack)
{
	__u16 fdb_flags = (ndm_flags & ~NTF_USE);
	struct vxlan_rdst *rd = NULL;
	struct vxlan_rdst oldrd;
	int notify = 0;
	int rc = 0;
	int err;

	if (nhid && !rcu_access_pointer(f->nh)) {
		NL_SET_ERR_MSG(extack,
			       "Cannot replace an existing non nexthop fdb with a nexthop");
		return -EOPNOTSUPP;
	}

	if (nhid && (flags & NLM_F_APPEND)) {
		NL_SET_ERR_MSG(extack,
			       "Cannot append to a nexthop fdb");
		return -EOPNOTSUPP;
	}

	/* Do not allow an externally learned entry to take over an entry added
	 * by the user.
	 */
	if (!(fdb_flags & NTF_EXT_LEARNED) ||
	    !(f->flags & NTF_VXLAN_ADDED_BY_USER)) {
		if (f->state != state) {
			f->state = state;
			f->updated = jiffies;
			notify = 1;
		}
		if (f->flags != fdb_flags) {
			f->flags = fdb_flags;
			f->updated = jiffies;
			notify = 1;
		}
	}

	if ((flags & NLM_F_REPLACE)) {
		/* Only change unicasts */
		if (!(is_multicast_ether_addr(f->eth_addr) ||
		      is_zero_ether_addr(f->eth_addr))) {
			if (nhid) {
				rc = vxlan_fdb_nh_update(vxlan, f, nhid, extack);
				if (rc < 0)
					return rc;
			} else {
				rc = vxlan_fdb_replace(f, ip, port, vni,
						       ifindex, &oldrd);
			}
			notify |= rc;
		} else {
			NL_SET_ERR_MSG(extack, "Cannot replace non-unicast fdb entries");
			return -EOPNOTSUPP;
		}
	}
	if ((flags & NLM_F_APPEND) &&
	    (is_multicast_ether_addr(f->eth_addr) ||
	     is_zero_ether_addr(f->eth_addr))) {
		rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd);

		if (rc < 0)
			return rc;
		notify |= rc;
	}

	if (ndm_flags & NTF_USE)
		f->used = jiffies;

	if (notify) {
		if (rd == NULL)
			rd = first_remote_rtnl(f);

		err = vxlan_fdb_notify(vxlan, f, rd, RTM_NEWNEIGH,
				       swdev_notify, extack);
		if (err)
			goto err_notify;
	}

	return 0;

err_notify:
	if (nhid)
		return err;
	if ((flags & NLM_F_REPLACE) && rc)
		*rd = oldrd;
	else if ((flags & NLM_F_APPEND) && rc) {
		list_del_rcu(&rd->list);
		call_rcu(&rd->rcu, vxlan_dst_free);
	}
	return err;
}

static int vxlan_fdb_update_create(struct vxlan_dev *vxlan,
				   const u8 *mac, union vxlan_addr *ip,
				   __u16 state, __u16 flags,
				   __be16 port, __be32 src_vni, __be32 vni,
				   __u32 ifindex, __u16 ndm_flags, u32 nhid,
				   bool swdev_notify,
				   struct netlink_ext_ack *extack)
{
	__u16 fdb_flags = (ndm_flags & ~NTF_USE);
	struct vxlan_fdb *f;
	int rc;

	/* Disallow replace to add a multicast entry */
	if ((flags & NLM_F_REPLACE) &&
	    (is_multicast_ether_addr(mac) || is_zero_ether_addr(mac)))
		return -EOPNOTSUPP;

	netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip);
	rc = vxlan_fdb_create(vxlan, mac, ip, state, port, src_vni,
			      vni, ifindex, fdb_flags, nhid, &f, extack);
	if (rc < 0)
		return rc;

	vxlan_fdb_insert(vxlan, mac, src_vni, f);
	rc = vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f), RTM_NEWNEIGH,
			      swdev_notify, extack);
	if (rc)
		goto err_notify;

	return 0;

err_notify:
	vxlan_fdb_destroy(vxlan, f, false, false);
	return rc;
}

/* Add new entry to forwarding table -- assumes lock held */
int vxlan_fdb_update(struct vxlan_dev *vxlan,
		     const u8 *mac, union vxlan_addr *ip,
		     __u16 state, __u16 flags,
		     __be16 port, __be32 src_vni, __be32 vni,
		     __u32 ifindex, __u16 ndm_flags, u32 nhid,
		     bool swdev_notify,
		     struct netlink_ext_ack *extack)
{
	struct vxlan_fdb *f;

	f = __vxlan_find_mac(vxlan, mac, src_vni);
	if (f) {
		if (flags & NLM_F_EXCL) {
			netdev_dbg(vxlan->dev,
				   "lost race to create %pM\n", mac);
			return -EEXIST;
		}

		return vxlan_fdb_update_existing(vxlan, ip, state, flags, port,
						 vni, ifindex, ndm_flags, f,
						 nhid, swdev_notify, extack);
	} else {
		if (!(flags & NLM_F_CREATE))
			return -ENOENT;

		return vxlan_fdb_update_create(vxlan, mac, ip, state, flags,
					       port, src_vni, vni, ifindex,
					       ndm_flags, nhid, swdev_notify,
					       extack);
	}
}

*vxlan
, struct vxlan_fdb
*f
,
1147 struct vxlan_rdst
*rd
, bool swdev_notify
)
1149 list_del_rcu(&rd
->list
);
1150 vxlan_fdb_notify(vxlan
, f
, rd
, RTM_DELNEIGH
, swdev_notify
, NULL
);
1151 call_rcu(&rd
->rcu
, vxlan_dst_free
);
static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan,
			   union vxlan_addr *ip, __be16 *port, __be32 *src_vni,
			   __be32 *vni, u32 *ifindex, u32 *nhid,
			   struct netlink_ext_ack *extack)
{
	struct net *net = dev_net(vxlan->dev);
	int err;

	if (tb[NDA_NH_ID] &&
	    (tb[NDA_DST] || tb[NDA_VNI] || tb[NDA_IFINDEX] || tb[NDA_PORT])) {
		NL_SET_ERR_MSG(extack, "DST, VNI, ifindex and port are mutually exclusive with NH_ID");
		return -EINVAL;
	}

	if (tb[NDA_DST]) {
		err = vxlan_nla_get_addr(ip, tb[NDA_DST]);
		if (err) {
			NL_SET_ERR_MSG(extack, "Unsupported address family");
			return err;
		}
	} else {
		union vxlan_addr *remote = &vxlan->default_dst.remote_ip;

		if (remote->sa.sa_family == AF_INET) {
			ip->sin.sin_addr.s_addr = htonl(INADDR_ANY);
			ip->sa.sa_family = AF_INET;
#if IS_ENABLED(CONFIG_IPV6)
		} else {
			ip->sin6.sin6_addr = in6addr_any;
			ip->sa.sa_family = AF_INET6;
#endif
		}
	}

	if (tb[NDA_PORT]) {
		if (nla_len(tb[NDA_PORT]) != sizeof(__be16)) {
			NL_SET_ERR_MSG(extack, "Invalid vxlan port");
			return -EINVAL;
		}
		*port = nla_get_be16(tb[NDA_PORT]);
	} else {
		*port = vxlan->cfg.dst_port;
	}

	if (tb[NDA_VNI]) {
		if (nla_len(tb[NDA_VNI]) != sizeof(u32)) {
			NL_SET_ERR_MSG(extack, "Invalid vni");
			return -EINVAL;
		}
		*vni = cpu_to_be32(nla_get_u32(tb[NDA_VNI]));
	} else {
		*vni = vxlan->default_dst.remote_vni;
	}

	if (tb[NDA_SRC_VNI]) {
		if (nla_len(tb[NDA_SRC_VNI]) != sizeof(u32)) {
			NL_SET_ERR_MSG(extack, "Invalid src vni");
			return -EINVAL;
		}
		*src_vni = cpu_to_be32(nla_get_u32(tb[NDA_SRC_VNI]));
	} else {
		*src_vni = vxlan->default_dst.remote_vni;
	}

	if (tb[NDA_IFINDEX]) {
		struct net_device *tdev;

		if (nla_len(tb[NDA_IFINDEX]) != sizeof(u32)) {
			NL_SET_ERR_MSG(extack, "Invalid ifindex");
			return -EINVAL;
		}
		*ifindex = nla_get_u32(tb[NDA_IFINDEX]);
		tdev = __dev_get_by_index(net, *ifindex);
		if (!tdev) {
			NL_SET_ERR_MSG(extack, "Device not found");
			return -EADDRNOTAVAIL;
		}
	} else {
		*ifindex = 0;
	}

	*nhid = nla_get_u32_default(tb[NDA_NH_ID], 0);

	return 0;
}

/* Add static entry (via netlink) */
static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
			 struct net_device *dev,
			 const unsigned char *addr, u16 vid, u16 flags,
			 bool *notified, struct netlink_ext_ack *extack)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	/* struct net *net = dev_net(vxlan->dev); */
	union vxlan_addr ip;
	__be16 port;
	__be32 src_vni, vni;
	u32 ifindex, nhid;
	u32 hash_index;
	int err;

	if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_REACHABLE))) {
		pr_info("RTM_NEWNEIGH with invalid state %#x\n",
			ndm->ndm_state);
		return -EINVAL;
	}

	if (!tb || (!tb[NDA_DST] && !tb[NDA_NH_ID]))
		return -EINVAL;

	err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex,
			      &nhid, extack);
	if (err)
		return err;

	if (vxlan->default_dst.remote_ip.sa.sa_family != ip.sa.sa_family)
		return -EAFNOSUPPORT;

	hash_index = fdb_head_index(vxlan, addr, src_vni);
	spin_lock_bh(&vxlan->hash_lock[hash_index]);
	err = vxlan_fdb_update(vxlan, addr, &ip, ndm->ndm_state, flags,
			       port, src_vni, vni, ifindex,
			       ndm->ndm_flags | NTF_VXLAN_ADDED_BY_USER,
			       nhid, true, extack);
	spin_unlock_bh(&vxlan->hash_lock[hash_index]);

	if (!err)
		*notified = true;

	return err;
}

int __vxlan_fdb_delete(struct vxlan_dev *vxlan,
		       const unsigned char *addr, union vxlan_addr ip,
		       __be16 port, __be32 src_vni, __be32 vni,
		       u32 ifindex, bool swdev_notify)
{
	struct vxlan_rdst *rd = NULL;
	struct vxlan_fdb *f;
	int err = -ENOENT;

	f = vxlan_find_mac(vxlan, addr, src_vni);
	if (!f)
		return err;

	if (!vxlan_addr_any(&ip)) {
		rd = vxlan_fdb_find_rdst(f, &ip, port, vni, ifindex);
		if (!rd)
			goto out;
	}

	/* remove a destination if it's not the only one on the list,
	 * otherwise destroy the fdb entry
	 */
	if (rd && !list_is_singular(&f->remotes)) {
		vxlan_fdb_dst_destroy(vxlan, f, rd, swdev_notify);
		goto out;
	}

	vxlan_fdb_destroy(vxlan, f, true, swdev_notify);

out:
	return 0;
}

/* Delete entry (via netlink) */
static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
			    struct net_device *dev,
			    const unsigned char *addr, u16 vid, bool *notified,
			    struct netlink_ext_ack *extack)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	union vxlan_addr ip;
	__be32 src_vni, vni;
	u32 ifindex, nhid;
	u32 hash_index;
	__be16 port;
	int err;

	err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex,
			      &nhid, extack);
	if (err)
		return err;

	hash_index = fdb_head_index(vxlan, addr, src_vni);
	spin_lock_bh(&vxlan->hash_lock[hash_index]);
	err = __vxlan_fdb_delete(vxlan, addr, ip, port, src_vni, vni, ifindex,
				 true);
	spin_unlock_bh(&vxlan->hash_lock[hash_index]);

	if (!err)
		*notified = true;

	return err;
}

/* Dump forwarding table */
static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
			  struct net_device *dev,
			  struct net_device *filter_dev, int *idx)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	unsigned int h;
	int err = 0;

	for (h = 0; h < FDB_HASH_SIZE; ++h) {
		struct vxlan_fdb *f;

		rcu_read_lock();
		hlist_for_each_entry_rcu(f, &vxlan->fdb_head[h], hlist) {
			struct vxlan_rdst *rd;

			if (rcu_access_pointer(f->nh)) {
				if (*idx < cb->args[2])
					goto skip_nh;
				err = vxlan_fdb_info(skb, vxlan, f,
						     NETLINK_CB(cb->skb).portid,
						     cb->nlh->nlmsg_seq,
						     RTM_NEWNEIGH,
						     NLM_F_MULTI, NULL);
				if (err < 0) {
					rcu_read_unlock();
					goto out;
				}
skip_nh:
				*idx += 1;
				continue;
			}

			list_for_each_entry_rcu(rd, &f->remotes, list) {
				if (*idx < cb->args[2])
					goto skip;

				err = vxlan_fdb_info(skb, vxlan, f,
						     NETLINK_CB(cb->skb).portid,
						     cb->nlh->nlmsg_seq,
						     RTM_NEWNEIGH,
						     NLM_F_MULTI, rd);
				if (err < 0) {
					rcu_read_unlock();
					goto out;
				}
skip:
				*idx += 1;
			}
		}
		rcu_read_unlock();
	}
out:
	return err;
}

*skb
,
1407 struct nlattr
*tb
[],
1408 struct net_device
*dev
,
1409 const unsigned char *addr
,
1410 u16 vid
, u32 portid
, u32 seq
,
1411 struct netlink_ext_ack
*extack
)
1413 struct vxlan_dev
*vxlan
= netdev_priv(dev
);
1414 struct vxlan_fdb
*f
;
1419 vni
= cpu_to_be32(nla_get_u32(tb
[NDA_VNI
]));
1421 vni
= vxlan
->default_dst
.remote_vni
;
1425 f
= __vxlan_find_mac(vxlan
, addr
, vni
);
1427 NL_SET_ERR_MSG(extack
, "Fdb entry not found");
1432 err
= vxlan_fdb_info(skb
, vxlan
, f
, portid
, seq
,
1433 RTM_NEWNEIGH
, 0, first_remote_rcu(f
));
/* Watch incoming packets to learn mapping between Ethernet address
 * and Tunnel endpoint.
 */
static enum skb_drop_reason vxlan_snoop(struct net_device *dev,
					union vxlan_addr *src_ip,
					const u8 *src_mac, u32 src_ifindex,
					__be32 vni)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_fdb *f;
	u32 ifindex = 0;

	/* Ignore packets from invalid src-address */
	if (!is_valid_ether_addr(src_mac))
		return SKB_DROP_REASON_MAC_INVALID_SOURCE;

#if IS_ENABLED(CONFIG_IPV6)
	if (src_ip->sa.sa_family == AF_INET6 &&
	    (ipv6_addr_type(&src_ip->sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL))
		ifindex = src_ifindex;
#endif

	f = vxlan_find_mac(vxlan, src_mac, vni);
	if (likely(f)) {
		struct vxlan_rdst *rdst = first_remote_rcu(f);

		if (likely(vxlan_addr_equal(&rdst->remote_ip, src_ip) &&
			   rdst->remote_ifindex == ifindex))
			return SKB_NOT_DROPPED_YET;

		/* Don't migrate static entries, drop packets */
		if (f->state & (NUD_PERMANENT | NUD_NOARP))
			return SKB_DROP_REASON_VXLAN_ENTRY_EXISTS;

		/* Don't override an fdb with nexthop with a learnt entry */
		if (rcu_access_pointer(f->nh))
			return SKB_DROP_REASON_VXLAN_ENTRY_EXISTS;

		if (net_ratelimit())
			netdev_info(dev,
				    "%pM migrated from %pIS to %pIS\n",
				    src_mac, &rdst->remote_ip.sa, &src_ip->sa);

		rdst->remote_ip = *src_ip;
		f->updated = jiffies;
		vxlan_fdb_notify(vxlan, f, rdst, RTM_NEWNEIGH, true, NULL);
	} else {
		u32 hash_index = fdb_head_index(vxlan, src_mac, vni);

		/* learned new entry */
		spin_lock(&vxlan->hash_lock[hash_index]);

		/* close off race between vxlan_flush and incoming packets */
		if (netif_running(dev))
			vxlan_fdb_update(vxlan, src_mac, src_ip,
					 NUD_REACHABLE,
					 NLM_F_EXCL|NLM_F_CREATE,
					 vxlan->cfg.dst_port,
					 vni,
					 vxlan->default_dst.remote_vni,
					 ifindex, NTF_SELF, 0, true, NULL);
		spin_unlock(&vxlan->hash_lock[hash_index]);
	}

	return SKB_NOT_DROPPED_YET;
}

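/* This mirrors source-MAC learning in an Ethernet switch: the outer
 * source IP of a decapsulated frame becomes the remote for its inner
 * source MAC, except for static (NUD_PERMANENT/NUD_NOARP) and
 * nexthop-backed entries, which are never migrated.
 */
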
static bool __vxlan_sock_release_prep(struct vxlan_sock *vs)
{
	struct vxlan_net *vn;

	if (!vs)
		return false;
	if (!refcount_dec_and_test(&vs->refcnt))
		return false;

	vn = net_generic(sock_net(vs->sock->sk), vxlan_net_id);
	spin_lock(&vn->sock_lock);
	hlist_del_rcu(&vs->hlist);
	udp_tunnel_notify_del_rx_port(vs->sock,
				      (vs->flags & VXLAN_F_GPE) ?
				      UDP_TUNNEL_TYPE_VXLAN_GPE :
				      UDP_TUNNEL_TYPE_VXLAN);
	spin_unlock(&vn->sock_lock);

	return true;
}

static void vxlan_sock_release(struct vxlan_dev *vxlan)
{
	struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock);
#if IS_ENABLED(CONFIG_IPV6)
	struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock);

	RCU_INIT_POINTER(vxlan->vn6_sock, NULL);
#endif

	RCU_INIT_POINTER(vxlan->vn4_sock, NULL);
	synchronize_net();

	if (vxlan->cfg.flags & VXLAN_F_VNIFILTER)
		vxlan_vs_del_vnigrp(vxlan);
	else
		vxlan_vs_del_dev(vxlan);

	if (__vxlan_sock_release_prep(sock4)) {
		udp_tunnel_sock_release(sock4->sock);
		kfree(sock4);
	}

#if IS_ENABLED(CONFIG_IPV6)
	if (__vxlan_sock_release_prep(sock6)) {
		udp_tunnel_sock_release(sock6->sock);
		kfree(sock6);
	}
#endif
}

static enum skb_drop_reason vxlan_remcsum(struct vxlanhdr *unparsed,
					  struct sk_buff *skb,
					  u32 vxflags)
{
	enum skb_drop_reason reason;
	size_t start, offset;

	if (!(unparsed->vx_flags & VXLAN_HF_RCO) || skb->remcsum_offload)
		goto out;

	start = vxlan_rco_start(unparsed->vx_vni);
	offset = start + vxlan_rco_offset(unparsed->vx_vni);

	reason = pskb_may_pull_reason(skb, offset + sizeof(u16));
	if (reason)
		return reason;

	skb_remcsum_process(skb, (void *)(vxlan_hdr(skb) + 1), start, offset,
			    !!(vxflags & VXLAN_F_REMCSUM_NOPARTIAL));
out:
	unparsed->vx_flags &= ~VXLAN_HF_RCO;
	unparsed->vx_vni &= VXLAN_VNI_MASK;

	return SKB_NOT_DROPPED_YET;
}

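/* Remote checksum offload: the sender leaves the inner checksum
 * incomplete and encodes its start/offset in the low bits of the VNI
 * word (VXLAN_HF_RCO); skb_remcsum_process() reconstructs it here on
 * receive. vxlan_build_skb() implements the transmit side via
 * vxlan_compute_rco().
 */
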
static void vxlan_parse_gbp_hdr(struct vxlanhdr *unparsed,
				struct sk_buff *skb, u32 vxflags,
				struct vxlan_metadata *md)
{
	struct vxlanhdr_gbp *gbp = (struct vxlanhdr_gbp *)unparsed;
	struct metadata_dst *tun_dst;

	if (!(unparsed->vx_flags & VXLAN_HF_GBP))
		goto out;

	md->gbp = ntohs(gbp->policy_id);

	tun_dst = (struct metadata_dst *)skb_dst(skb);
	if (tun_dst) {
		__set_bit(IP_TUNNEL_VXLAN_OPT_BIT,
			  tun_dst->u.tun_info.key.tun_flags);
		tun_dst->u.tun_info.options_len = sizeof(*md);
	}
	if (gbp->dont_learn)
		md->gbp |= VXLAN_GBP_DONT_LEARN;

	if (gbp->policy_applied)
		md->gbp |= VXLAN_GBP_POLICY_APPLIED;

	/* In flow-based mode, GBP is carried in dst_metadata */
	if (!(vxflags & VXLAN_F_COLLECT_METADATA))
		skb->mark = md->gbp;
out:
	unparsed->vx_flags &= ~VXLAN_GBP_USED_BITS;
}

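/* Group Based Policy (draft-smith-vxlan-group-policy) reuses reserved
 * header bits for a 16-bit policy id plus D (don't learn) and A (policy
 * applied) flags; outside collect-metadata mode the result is exposed
 * through skb->mark.
 */
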
static enum skb_drop_reason vxlan_set_mac(struct vxlan_dev *vxlan,
					  struct vxlan_sock *vs,
					  struct sk_buff *skb, __be32 vni)
{
	union vxlan_addr saddr;
	u32 ifindex = skb->dev->ifindex;

	skb_reset_mac_header(skb);
	skb->protocol = eth_type_trans(skb, vxlan->dev);
	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);

	/* Ignore packet loops (and multicast echo) */
	if (ether_addr_equal(eth_hdr(skb)->h_source, vxlan->dev->dev_addr))
		return SKB_DROP_REASON_LOCAL_MAC;

	/* Get address from the outer IP header */
	if (vxlan_get_sk_family(vs) == AF_INET) {
		saddr.sin.sin_addr.s_addr = ip_hdr(skb)->saddr;
		saddr.sa.sa_family = AF_INET;
#if IS_ENABLED(CONFIG_IPV6)
	} else {
		saddr.sin6.sin6_addr = ipv6_hdr(skb)->saddr;
		saddr.sa.sa_family = AF_INET6;
#endif
	}

	if (!(vxlan->cfg.flags & VXLAN_F_LEARN))
		return SKB_NOT_DROPPED_YET;

	return vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source,
			   ifindex, vni);
}

static bool vxlan_ecn_decapsulate(struct vxlan_sock *vs, void *oiph,
				  struct sk_buff *skb)
{
	int err = 0;

	if (vxlan_get_sk_family(vs) == AF_INET)
		err = IP_ECN_decapsulate(oiph, skb);
#if IS_ENABLED(CONFIG_IPV6)
	else
		err = IP6_ECN_decapsulate(oiph, skb);
#endif

	if (unlikely(err) && log_ecn_error) {
		if (vxlan_get_sk_family(vs) == AF_INET)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					     &((struct iphdr *)oiph)->saddr,
					     ((struct iphdr *)oiph)->tos);
		else
			net_info_ratelimited("non-ECT from %pI6\n",
					     &((struct ipv6hdr *)oiph)->saddr);
	}
	return err <= 1;
}

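/* RFC 6040 decapsulation: a return value > 1 from IP_ECN_decapsulate()
 * or IP6_ECN_decapsulate() means the outer header signalled CE while the
 * inner packet was not ECN-capable, so the caller must drop the frame
 * (see the !vxlan_ecn_decapsulate() check in vxlan_rcv()).
 */
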
/* Callback from net/ipv4/udp.c to receive packets */
static int vxlan_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct vxlan_vni_node *vninode = NULL;
	struct vxlan_dev *vxlan;
	struct vxlan_sock *vs;
	struct vxlanhdr unparsed;
	struct vxlan_metadata _md;
	struct vxlan_metadata *md = &_md;
	__be16 protocol = htons(ETH_P_TEB);
	enum skb_drop_reason reason;
	bool raw_proto = false;
	void *oiph;
	__be32 vni = 0;
	int nh;

	/* Need UDP and VXLAN header to be present */
	reason = pskb_may_pull_reason(skb, VXLAN_HLEN);
	if (reason)
		goto drop;

	unparsed = *vxlan_hdr(skb);
	/* VNI flag always required to be set */
	if (!(unparsed.vx_flags & VXLAN_HF_VNI)) {
		netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n",
			   ntohl(vxlan_hdr(skb)->vx_flags),
			   ntohl(vxlan_hdr(skb)->vx_vni));
		reason = SKB_DROP_REASON_VXLAN_INVALID_HDR;
		/* Return non vxlan pkt */
		goto drop;
	}
	unparsed.vx_flags &= ~VXLAN_HF_VNI;
	unparsed.vx_vni &= ~VXLAN_VNI_MASK;

	vs = rcu_dereference_sk_user_data(sk);
	if (!vs)
		goto drop;

	vni = vxlan_vni(vxlan_hdr(skb)->vx_vni);

	vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni, &vninode);
	if (!vxlan) {
		reason = SKB_DROP_REASON_VXLAN_VNI_NOT_FOUND;
		goto drop;
	}

	/* For backwards compatibility, only allow reserved fields to be
	 * used by VXLAN extensions if explicitly requested.
	 */
	if (vs->flags & VXLAN_F_GPE) {
		if (!vxlan_parse_gpe_proto(&unparsed, &protocol))
			goto drop;
		unparsed.vx_flags &= ~VXLAN_GPE_USED_BITS;
		raw_proto = true;
	}

	if (__iptunnel_pull_header(skb, VXLAN_HLEN, protocol, raw_proto,
				   !net_eq(vxlan->net, dev_net(vxlan->dev)))) {
		reason = SKB_DROP_REASON_NOMEM;
		goto drop;
	}

	if (vs->flags & VXLAN_F_REMCSUM_RX) {
		reason = vxlan_remcsum(&unparsed, skb, vs->flags);
		if (unlikely(reason))
			goto drop;
	}

	if (vxlan_collect_metadata(vs)) {
		IP_TUNNEL_DECLARE_FLAGS(flags) = { };
		struct metadata_dst *tun_dst;

		__set_bit(IP_TUNNEL_KEY_BIT, flags);
		tun_dst = udp_tun_rx_dst(skb, vxlan_get_sk_family(vs), flags,
					 key32_to_tunnel_id(vni), sizeof(*md));

		if (!tun_dst) {
			reason = SKB_DROP_REASON_NOMEM;
			goto drop;
		}

		md = ip_tunnel_info_opts(&tun_dst->u.tun_info);

		skb_dst_set(skb, (struct dst_entry *)tun_dst);
	} else {
		memset(md, 0, sizeof(*md));
	}

	if (vs->flags & VXLAN_F_GBP)
		vxlan_parse_gbp_hdr(&unparsed, skb, vs->flags, md);
	/* Note that GBP and GPE can never be active together. This is
	 * ensured in vxlan_dev_configure.
	 */

	if (unparsed.vx_flags || unparsed.vx_vni) {
		/* If there are any unprocessed flags remaining treat
		 * this as a malformed packet. This behavior diverges from
		 * VXLAN RFC (RFC7348) which stipulates that bits in reserved
		 * fields are to be ignored. The approach here maintains
		 * compatibility with previous stack code, and also is more
		 * robust and provides a little more security in adding
		 * extensions to VXLAN.
		 */
		reason = SKB_DROP_REASON_VXLAN_INVALID_HDR;
		goto drop;
	}

	if (!raw_proto) {
		reason = vxlan_set_mac(vxlan, vs, skb, vni);
		if (reason)
			goto drop;
	} else {
		skb_reset_mac_header(skb);
		skb->dev = vxlan->dev;
		skb->pkt_type = PACKET_HOST;
	}

	/* Save offset of outer header relative to skb->head,
	 * because we are going to reset the network header to the inner header
	 * and might change skb->head.
	 */
	nh = skb_network_header(skb) - skb->head;

	skb_reset_network_header(skb);

	reason = pskb_inet_may_pull_reason(skb);
	if (reason) {
		DEV_STATS_INC(vxlan->dev, rx_length_errors);
		DEV_STATS_INC(vxlan->dev, rx_errors);
		vxlan_vnifilter_count(vxlan, vni, vninode,
				      VXLAN_VNI_STATS_RX_ERRORS, 0);
		goto drop;
	}

	/* Get the outer header. */
	oiph = skb->head + nh;

	if (!vxlan_ecn_decapsulate(vs, oiph, skb)) {
		reason = SKB_DROP_REASON_IP_TUNNEL_ECN;
		DEV_STATS_INC(vxlan->dev, rx_frame_errors);
		DEV_STATS_INC(vxlan->dev, rx_errors);
		vxlan_vnifilter_count(vxlan, vni, vninode,
				      VXLAN_VNI_STATS_RX_ERRORS, 0);
		goto drop;
	}

	rcu_read_lock();

	if (unlikely(!(vxlan->dev->flags & IFF_UP))) {
		rcu_read_unlock();
		dev_core_stats_rx_dropped_inc(vxlan->dev);
		vxlan_vnifilter_count(vxlan, vni, vninode,
				      VXLAN_VNI_STATS_RX_DROPS, 0);
		reason = SKB_DROP_REASON_DEV_READY;
		goto drop;
	}

	dev_sw_netstats_rx_add(vxlan->dev, skb->len);
	vxlan_vnifilter_count(vxlan, vni, vninode, VXLAN_VNI_STATS_RX, skb->len);
	gro_cells_receive(&vxlan->gro_cells, skb);

	rcu_read_unlock();

	return 0;

drop:
	reason = reason ?: SKB_DROP_REASON_NOT_SPECIFIED;
	/* Consume bad packet */
	kfree_skb_reason(skb, reason);
	return 0;
}

/* Callback from net/ipv{4,6}/udp.c to check that we have a VNI for errors */
static int vxlan_err_lookup(struct sock *sk, struct sk_buff *skb)
{
	struct vxlan_dev *vxlan;
	struct vxlan_sock *vs;
	struct vxlanhdr *hdr;
	__be32 vni;

	if (!pskb_may_pull(skb, skb_transport_offset(skb) + VXLAN_HLEN))
		return -EINVAL;

	hdr = vxlan_hdr(skb);

	if (!(hdr->vx_flags & VXLAN_HF_VNI))
		return -EINVAL;

	vs = rcu_dereference_sk_user_data(sk);
	if (!vs)
		return -ENOENT;

	vni = vxlan_vni(hdr->vx_vni);
	vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni, NULL);
	if (!vxlan)
		return -ENOENT;

	return 0;
}

static int arp_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct arphdr *parp;
	u8 *arpptr, *sha;
	__be32 sip, tip;
	struct neighbour *n;

	if (dev->flags & IFF_NOARP)
		goto out;

	if (!pskb_may_pull(skb, arp_hdr_len(dev))) {
		dev_core_stats_tx_dropped_inc(dev);
		vxlan_vnifilter_count(vxlan, vni, NULL,
				      VXLAN_VNI_STATS_TX_DROPS, 0);
		goto out;
	}
	parp = arp_hdr(skb);

	if ((parp->ar_hrd != htons(ARPHRD_ETHER) &&
	     parp->ar_hrd != htons(ARPHRD_IEEE802)) ||
	    parp->ar_pro != htons(ETH_P_IP) ||
	    parp->ar_op != htons(ARPOP_REQUEST) ||
	    parp->ar_hln != dev->addr_len ||
	    parp->ar_pln != 4)
		goto out;
	arpptr = (u8 *)parp + sizeof(struct arphdr);
	sha = arpptr;
	arpptr += dev->addr_len;	/* sha */
	memcpy(&sip, arpptr, sizeof(sip));
	arpptr += sizeof(sip);
	arpptr += dev->addr_len;	/* tha */
	memcpy(&tip, arpptr, sizeof(tip));

	if (ipv4_is_loopback(tip) ||
	    ipv4_is_multicast(tip))
		goto out;

	n = neigh_lookup(&arp_tbl, &tip, dev);

	if (n) {
		struct vxlan_fdb *f;
		struct sk_buff *reply;

		if (!(READ_ONCE(n->nud_state) & NUD_CONNECTED)) {
			neigh_release(n);
			goto out;
		}

		f = vxlan_find_mac(vxlan, n->ha, vni);
		if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) {
			/* bridge-local neighbor */
			neigh_release(n);
			goto out;
		}

		reply = arp_create(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha,
				   n->ha, sha);

		neigh_release(n);

		if (reply == NULL)
			goto out;

		skb_reset_mac_header(reply);
		__skb_pull(reply, skb_network_offset(reply));
		reply->ip_summed = CHECKSUM_UNNECESSARY;
		reply->pkt_type = PACKET_HOST;

		if (netif_rx(reply) == NET_RX_DROP) {
			dev_core_stats_rx_dropped_inc(dev);
			vxlan_vnifilter_count(vxlan, vni, NULL,
					      VXLAN_VNI_STATS_RX_DROPS, 0);
		}
	} else if (vxlan->cfg.flags & VXLAN_F_L3MISS) {
		union vxlan_addr ipa = {
			.sin.sin_addr.s_addr = tip,
			.sin.sin_family = AF_INET,
		};

		vxlan_ip_miss(dev, &ipa);
	}
out:
	consume_skb(skb);
	return NETDEV_TX_OK;
}

#if IS_ENABLED(CONFIG_IPV6)
static struct sk_buff *vxlan_na_create(struct sk_buff *request,
				       struct neighbour *n, bool isrouter)
{
	struct net_device *dev = request->dev;
	struct sk_buff *reply;
	struct nd_msg *ns, *na;
	struct ipv6hdr *pip6;
	u8 *daddr;
	int na_olen = 8; /* opt hdr + ETH_ALEN for target */
	int ns_olen;
	int i, len;

	if (dev == NULL || !pskb_may_pull(request, request->len))
		return NULL;

	len = LL_RESERVED_SPACE(dev) + sizeof(struct ipv6hdr) +
		sizeof(*na) + na_olen + dev->needed_tailroom;
	reply = alloc_skb(len, GFP_ATOMIC);
	if (reply == NULL)
		return NULL;

	reply->protocol = htons(ETH_P_IPV6);
	reply->dev = dev;
	skb_reserve(reply, LL_RESERVED_SPACE(request->dev));
	skb_push(reply, sizeof(struct ethhdr));
	skb_reset_mac_header(reply);

	ns = (struct nd_msg *)(ipv6_hdr(request) + 1);

	daddr = eth_hdr(request)->h_source;
	ns_olen = request->len - skb_network_offset(request) -
		sizeof(struct ipv6hdr) - sizeof(*ns);
	for (i = 0; i < ns_olen-1; i += (ns->opt[i+1]<<3)) {
		if (!ns->opt[i + 1]) {
			kfree_skb(reply);
			return NULL;
		}
		if (ns->opt[i] == ND_OPT_SOURCE_LL_ADDR) {
			daddr = ns->opt + i + sizeof(struct nd_opt_hdr);
			break;
		}
	}

	/* Ethernet header */
	ether_addr_copy(eth_hdr(reply)->h_dest, daddr);
	ether_addr_copy(eth_hdr(reply)->h_source, n->ha);
	eth_hdr(reply)->h_proto = htons(ETH_P_IPV6);
	reply->protocol = htons(ETH_P_IPV6);

	skb_pull(reply, sizeof(struct ethhdr));
	skb_reset_network_header(reply);
	skb_put(reply, sizeof(struct ipv6hdr));

	/* IPv6 header */

	pip6 = ipv6_hdr(reply);
	memset(pip6, 0, sizeof(struct ipv6hdr));
	pip6->version = 6;
	pip6->priority = ipv6_hdr(request)->priority;
	pip6->nexthdr = IPPROTO_ICMPV6;
	pip6->hop_limit = 255;
	pip6->daddr = ipv6_hdr(request)->saddr;
	pip6->saddr = *(struct in6_addr *)n->primary_key;

	skb_pull(reply, sizeof(struct ipv6hdr));
	skb_reset_transport_header(reply);

	/* Neighbor Advertisement */
	na = skb_put_zero(reply, sizeof(*na) + na_olen);
	na->icmph.icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT;
	na->icmph.icmp6_router = isrouter;
	na->icmph.icmp6_override = 1;
	na->icmph.icmp6_solicited = 1;
	na->target = ns->target;
	ether_addr_copy(&na->opt[2], n->ha);
	na->opt[0] = ND_OPT_TARGET_LL_ADDR;
	na->opt[1] = na_olen >> 3;

	na->icmph.icmp6_cksum = csum_ipv6_magic(&pip6->saddr,
		&pip6->daddr, sizeof(*na)+na_olen, IPPROTO_ICMPV6,
		csum_partial(na, sizeof(*na)+na_olen, 0));

	pip6->payload_len = htons(sizeof(*na)+na_olen);

	skb_push(reply, sizeof(struct ipv6hdr));

	reply->ip_summed = CHECKSUM_UNNECESSARY;

	return reply;
}

static int neigh_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	const struct in6_addr *daddr;
	const struct ipv6hdr *iphdr;
	struct inet6_dev *in6_dev;
	struct neighbour *n;
	struct nd_msg *msg;

	rcu_read_lock();
	in6_dev = __in6_dev_get(dev);
	if (!in6_dev)
		goto out;

	iphdr = ipv6_hdr(skb);
	daddr = &iphdr->daddr;
	msg = (struct nd_msg *)(iphdr + 1);

	if (ipv6_addr_loopback(daddr) ||
	    ipv6_addr_is_multicast(&msg->target))
		goto out;

	n = neigh_lookup(ipv6_stub->nd_tbl, &msg->target, dev);

	if (n) {
		struct vxlan_fdb *f;
		struct sk_buff *reply;

		if (!(READ_ONCE(n->nud_state) & NUD_CONNECTED)) {
			neigh_release(n);
			goto out;
		}

		f = vxlan_find_mac(vxlan, n->ha, vni);
		if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) {
			/* bridge-local neighbor */
			neigh_release(n);
			goto out;
		}

		reply = vxlan_na_create(skb, n,
					!!(f ? f->flags & NTF_ROUTER : 0));

		neigh_release(n);

		if (reply == NULL)
			goto out;

		if (netif_rx(reply) == NET_RX_DROP) {
			dev_core_stats_rx_dropped_inc(dev);
			vxlan_vnifilter_count(vxlan, vni, NULL,
					      VXLAN_VNI_STATS_RX_DROPS, 0);
		}
	} else if (vxlan->cfg.flags & VXLAN_F_L3MISS) {
		union vxlan_addr ipa = {
			.sin6.sin6_addr = msg->target,
			.sin6.sin6_family = AF_INET6,
		};

		vxlan_ip_miss(dev, &ipa);
	}

out:
	rcu_read_unlock();
	consume_skb(skb);
	return NETDEV_TX_OK;
}
#endif

static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct neighbour *n;

	if (is_multicast_ether_addr(eth_hdr(skb)->h_dest))
		return false;

	n = NULL;
	switch (ntohs(eth_hdr(skb)->h_proto)) {
	case ETH_P_IP:
	{
		struct iphdr *pip;

		if (!pskb_may_pull(skb, sizeof(struct iphdr)))
			return false;
		pip = ip_hdr(skb);
		n = neigh_lookup(&arp_tbl, &pip->daddr, dev);
		if (!n && (vxlan->cfg.flags & VXLAN_F_L3MISS)) {
			union vxlan_addr ipa = {
				.sin.sin_addr.s_addr = pip->daddr,
				.sin.sin_family = AF_INET,
			};

			vxlan_ip_miss(dev, &ipa);
			return false;
		}

		break;
	}
#if IS_ENABLED(CONFIG_IPV6)
	case ETH_P_IPV6:
	{
		struct ipv6hdr *pip6;

		if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
			return false;
		pip6 = ipv6_hdr(skb);
		n = neigh_lookup(ipv6_stub->nd_tbl, &pip6->daddr, dev);
		if (!n && (vxlan->cfg.flags & VXLAN_F_L3MISS)) {
			union vxlan_addr ipa = {
				.sin6.sin6_addr = pip6->daddr,
				.sin6.sin6_family = AF_INET6,
			};

			vxlan_ip_miss(dev, &ipa);
			return false;
		}

		break;
	}
#endif
	default:
		return false;
	}

	if (n) {
		bool diff;

		diff = !ether_addr_equal(eth_hdr(skb)->h_dest, n->ha);
		if (diff) {
			memcpy(eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
			       dev->addr_len);
			memcpy(eth_hdr(skb)->h_dest, n->ha, dev->addr_len);
		}
		neigh_release(n);
		return diff;
	}

	return false;
}

static int vxlan_build_gpe_hdr(struct vxlanhdr *vxh, __be16 protocol)
{
	struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)vxh;

	gpe->np_applied = 1;
	gpe->next_protocol = tun_p_from_eth_p(protocol);
	if (!gpe->next_protocol)
		return -EPFNOSUPPORT;
	return 0;
}

static int vxlan_build_skb(struct sk_buff *skb, struct dst_entry *dst,
			   int iphdr_len, __be32 vni,
			   struct vxlan_metadata *md, u32 vxflags,
			   bool udp_sum)
{
	struct vxlanhdr *vxh;
	int min_headroom;
	int err;
	int type = udp_sum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
	__be16 inner_protocol = htons(ETH_P_TEB);

	if ((vxflags & VXLAN_F_REMCSUM_TX) &&
	    skb->ip_summed == CHECKSUM_PARTIAL) {
		int csum_start = skb_checksum_start_offset(skb);

		if (csum_start <= VXLAN_MAX_REMCSUM_START &&
		    !(csum_start & VXLAN_RCO_SHIFT_MASK) &&
		    (skb->csum_offset == offsetof(struct udphdr, check) ||
		     skb->csum_offset == offsetof(struct tcphdr, check)))
			type |= SKB_GSO_TUNNEL_REMCSUM;
	}

	min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len
			+ VXLAN_HLEN + iphdr_len;

	/* Need space for new headers (invalidates iph ptr) */
	err = skb_cow_head(skb, min_headroom);
	if (unlikely(err))
		return err;

	err = iptunnel_handle_offloads(skb, type);
	if (err)
		return err;

	vxh = __skb_push(skb, sizeof(*vxh));
	vxh->vx_flags = VXLAN_HF_VNI;
	vxh->vx_vni = vxlan_vni_field(vni);

	if (type & SKB_GSO_TUNNEL_REMCSUM) {
		unsigned int start;

		start = skb_checksum_start_offset(skb) - sizeof(struct vxlanhdr);
		vxh->vx_vni |= vxlan_compute_rco(start, skb->csum_offset);
		vxh->vx_flags |= VXLAN_HF_RCO;

		if (!skb_is_gso(skb)) {
			skb->ip_summed = CHECKSUM_NONE;
			skb->encapsulation = 0;
		}
	}

	if (vxflags & VXLAN_F_GBP)
		vxlan_build_gbp_hdr(vxh, md);
	if (vxflags & VXLAN_F_GPE) {
		err = vxlan_build_gpe_hdr(vxh, skb->protocol);
		if (err < 0)
			return err;
		inner_protocol = skb->protocol;
	}

	skb_set_inner_protocol(skb, inner_protocol);
	return 0;
}

/* Bypass encapsulation if the destination is local */
static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
			       struct vxlan_dev *dst_vxlan, __be32 vni,
			       bool snoop)
{
	union vxlan_addr loopback;
	union vxlan_addr *remote_ip = &dst_vxlan->default_dst.remote_ip;
	struct net_device *dev;
	int len = skb->len;

	skb->pkt_type = PACKET_HOST;
	skb->encapsulation = 0;
	skb->dev = dst_vxlan->dev;
	__skb_pull(skb, skb_network_offset(skb));

	if (remote_ip->sa.sa_family == AF_INET) {
		loopback.sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
		loopback.sa.sa_family = AF_INET;
#if IS_ENABLED(CONFIG_IPV6)
	} else {
		loopback.sin6.sin6_addr = in6addr_loopback;
		loopback.sa.sa_family = AF_INET6;
#endif
	}

	rcu_read_lock();
	dev = skb->dev;
	if (unlikely(!(dev->flags & IFF_UP))) {
		kfree_skb_reason(skb, SKB_DROP_REASON_DEV_READY);
		goto drop;
	}

	if ((dst_vxlan->cfg.flags & VXLAN_F_LEARN) && snoop)
		vxlan_snoop(dev, &loopback, eth_hdr(skb)->h_source, 0, vni);

	dev_sw_netstats_tx_add(src_vxlan->dev, 1, len);
	vxlan_vnifilter_count(src_vxlan, vni, NULL, VXLAN_VNI_STATS_TX, len);

	if (__netif_rx(skb) == NET_RX_SUCCESS) {
		dev_sw_netstats_rx_add(dst_vxlan->dev, len);
		vxlan_vnifilter_count(dst_vxlan, vni, NULL, VXLAN_VNI_STATS_RX,
				      len);
	} else {
drop:
		dev_core_stats_rx_dropped_inc(dev);
		vxlan_vnifilter_count(dst_vxlan, vni, NULL,
				      VXLAN_VNI_STATS_RX_DROPS, 0);
	}
	rcu_read_unlock();
}

static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev,
				 struct vxlan_dev *vxlan,
				 int addr_family,
				 __be16 dst_port, int dst_ifindex, __be32 vni,
				 struct dst_entry *dst,
				 u32 rt_flags)
{
#if IS_ENABLED(CONFIG_IPV6)
	/* IPv6 rt-flags are checked against RTF_LOCAL, but the value of
	 * RTF_LOCAL is equal to RTCF_LOCAL. So to keep code simple
	 * we can use RTCF_LOCAL which works for ipv4 and ipv6 route entry.
	 */
	BUILD_BUG_ON(RTCF_LOCAL != RTF_LOCAL);
#endif
	/* Bypass encapsulation if the destination is local */
	if (rt_flags & RTCF_LOCAL &&
	    !(rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
	    vxlan->cfg.flags & VXLAN_F_LOCALBYPASS) {
		struct vxlan_dev *dst_vxlan;

		dst_release(dst);
		dst_vxlan = vxlan_find_vni(vxlan->net, dst_ifindex, vni,
					   addr_family, dst_port,
					   vxlan->cfg.flags);
		if (!dst_vxlan) {
			DEV_STATS_INC(dev, tx_errors);
			vxlan_vnifilter_count(vxlan, vni, NULL,
					      VXLAN_VNI_STATS_TX_ERRORS, 0);
			kfree_skb_reason(skb, SKB_DROP_REASON_VXLAN_VNI_NOT_FOUND);

			return -ENOENT;
		}
		vxlan_encap_bypass(skb, vxlan, dst_vxlan, vni, true);
		return 1;
	}

	return 0;
}

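/* With VXLAN_F_LOCALBYPASS, a frame whose underlay destination resolves
 * to a local address skips encapsulation entirely and is injected
 * straight into the peer vxlan device, avoiding a pointless trip through
 * the IP stack.
 */
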
void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
		    __be32 default_vni, struct vxlan_rdst *rdst, bool did_rsc)
{
	struct dst_cache *dst_cache;
	struct ip_tunnel_info *info;
	struct ip_tunnel_key *pkey;
	struct ip_tunnel_key key;
	struct vxlan_dev *vxlan = netdev_priv(dev);
	const struct iphdr *old_iph;
	struct vxlan_metadata _md;
	struct vxlan_metadata *md = &_md;
	unsigned int pkt_len = skb->len;
	__be16 src_port = 0, dst_port;
	struct dst_entry *ndst = NULL;
	int addr_family;
	__u8 tos, ttl;
	int ifindex;
	int err;
	u32 flags = vxlan->cfg.flags;
	bool use_cache;
	bool udp_sum = false;
	bool xnet = !net_eq(vxlan->net, dev_net(vxlan->dev));
	enum skb_drop_reason reason;
	bool no_eth_encap;
	__be32 vni = 0;

	no_eth_encap = flags & VXLAN_F_GPE && skb->protocol != htons(ETH_P_TEB);
	reason = skb_vlan_inet_prepare(skb, no_eth_encap);
	if (reason)
		goto drop;

	reason = SKB_DROP_REASON_NOT_SPECIFIED;
	old_iph = ip_hdr(skb);

	info = skb_tunnel_info(skb);
	use_cache = ip_tunnel_dst_cache_usable(skb, info);

	if (rdst) {
		memset(&key, 0, sizeof(key));
		pkey = &key;

		if (vxlan_addr_any(&rdst->remote_ip)) {
			if (did_rsc) {
				/* short-circuited back to local bridge */
				vxlan_encap_bypass(skb, vxlan, vxlan,
						   default_vni, true);
				return;
			}
			goto drop;
		}

		addr_family = vxlan->cfg.saddr.sa.sa_family;
		dst_port = rdst->remote_port ? rdst->remote_port : vxlan->cfg.dst_port;
		vni = (rdst->remote_vni) ? : default_vni;
		ifindex = rdst->remote_ifindex;

		if (addr_family == AF_INET) {
			key.u.ipv4.src = vxlan->cfg.saddr.sin.sin_addr.s_addr;
			key.u.ipv4.dst = rdst->remote_ip.sin.sin_addr.s_addr;
		} else {
			key.u.ipv6.src = vxlan->cfg.saddr.sin6.sin6_addr;
			key.u.ipv6.dst = rdst->remote_ip.sin6.sin6_addr;
		}

		dst_cache = &rdst->dst_cache;
		md->gbp = skb->mark;
		if (flags & VXLAN_F_TTL_INHERIT) {
			ttl = ip_tunnel_get_ttl(old_iph, skb);
		} else {
			ttl = vxlan->cfg.ttl;
			if (!ttl && vxlan_addr_multicast(&rdst->remote_ip))
				ttl = 1;
		}

		tos = vxlan->cfg.tos;
		if (tos == 1)
			tos = ip_tunnel_get_dsfield(old_iph, skb);
		if (tos && !info)
			use_cache = false;

		if (addr_family == AF_INET)
			udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM_TX);
		else
			udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM6_TX);
#if IS_ENABLED(CONFIG_IPV6)
		switch (vxlan->cfg.label_policy) {
		case VXLAN_LABEL_FIXED:
			key.label = vxlan->cfg.label;
			break;
		case VXLAN_LABEL_INHERIT:
			key.label = ip_tunnel_get_flowlabel(old_iph, skb);
			break;
		default:
			DEBUG_NET_WARN_ON_ONCE(1);
			goto drop;
		}
#endif
	} else {
		if (!info) {
			WARN_ONCE(1, "%s: Missing encapsulation instructions\n",
				  dev->name);
			goto drop;
		}

		addr_family = ip_tunnel_info_af(info);
		dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port;
		vni = tunnel_id_to_key32(info->key.tun_id);
		ifindex = 0;
		dst_cache = &info->dst_cache;
		if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, info->key.tun_flags)) {
			if (info->options_len < sizeof(*md))
				goto drop;
			md = ip_tunnel_info_opts(info);
		}
		ttl = info->key.ttl;
		tos = info->key.tos;
		udp_sum = test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags);
		pkey = &info->key;
	}
	src_port = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min,
				     vxlan->cfg.port_max, true);

	rcu_read_lock();
	if (addr_family == AF_INET) {
		struct vxlan_sock *sock4 = rcu_dereference(vxlan->vn4_sock);
		struct rtable *rt;
		__be16 df = 0;
		__be32 saddr;

		if (!ifindex)
			ifindex = sock4->sock->sk->sk_bound_dev_if;

		rt = udp_tunnel_dst_lookup(skb, dev, vxlan->net, ifindex,
					   &saddr, pkey, src_port, dst_port,
					   tos, use_cache ? dst_cache : NULL);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			reason = SKB_DROP_REASON_IP_OUTNOROUTES;
			goto tx_error;
		}

		if (!info) {
			/* Bypass encapsulation if the destination is local */
			err = encap_bypass_if_local(skb, dev, vxlan, AF_INET,
						    dst_port, ifindex, vni,
						    &rt->dst, rt->rt_flags);
			if (err)
				goto out_unlock;

			if (vxlan->cfg.df == VXLAN_DF_SET) {
				df = htons(IP_DF);
			} else if (vxlan->cfg.df == VXLAN_DF_INHERIT) {
				struct ethhdr *eth = eth_hdr(skb);

				if (ntohs(eth->h_proto) == ETH_P_IPV6 ||
				    (ntohs(eth->h_proto) == ETH_P_IP &&
				     old_iph->frag_off & htons(IP_DF)))
					df = htons(IP_DF);
			}
		} else if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT,
				    info->key.tun_flags)) {
			df = htons(IP_DF);
		}

		ndst = &rt->dst;
		err = skb_tunnel_check_pmtu(skb, ndst, vxlan_headroom(flags & VXLAN_F_GPE),
					    netif_is_any_bridge_port(dev));
		if (err < 0) {
			goto tx_error;
		} else if (err) {
			if (info) {
				struct ip_tunnel_info *unclone;

				unclone = skb_tunnel_info_unclone(skb);
				if (unlikely(!unclone))
					goto tx_error;

				unclone->key.u.ipv4.src = pkey->u.ipv4.dst;
				unclone->key.u.ipv4.dst = saddr;
			}
			vxlan_encap_bypass(skb, vxlan, vxlan, vni, false);
			dst_release(ndst);
			goto out_unlock;
		}

		tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
		ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
		err = vxlan_build_skb(skb, ndst, sizeof(struct iphdr),
				      vni, md, flags, udp_sum);
		if (err < 0) {
			reason = SKB_DROP_REASON_NOMEM;
			goto tx_error;
		}

		udp_tunnel_xmit_skb(rt, sock4->sock->sk, skb, saddr,
				    pkey->u.ipv4.dst, tos, ttl, df,
				    src_port, dst_port, xnet, !udp_sum);
#if IS_ENABLED(CONFIG_IPV6)
	} else {
		struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock);
		struct in6_addr saddr;

		if (!ifindex)
			ifindex = sock6->sock->sk->sk_bound_dev_if;

		ndst = udp_tunnel6_dst_lookup(skb, dev, vxlan->net, sock6->sock,
					      ifindex, &saddr, pkey,
					      src_port, dst_port, tos,
					      use_cache ? dst_cache : NULL);
		if (IS_ERR(ndst)) {
			err = PTR_ERR(ndst);
			ndst = NULL;
			reason = SKB_DROP_REASON_IP_OUTNOROUTES;
			goto tx_error;
		}

		if (!info) {
			u32 rt6i_flags = dst_rt6_info(ndst)->rt6i_flags;

			err = encap_bypass_if_local(skb, dev, vxlan, AF_INET6,
						    dst_port, ifindex, vni,
						    ndst, rt6i_flags);
			if (err)
				goto out_unlock;
		}

		err = skb_tunnel_check_pmtu(skb, ndst,
					    vxlan_headroom((flags & VXLAN_F_GPE) | VXLAN_F_IPV6),
					    netif_is_any_bridge_port(dev));
		if (err < 0) {
			goto tx_error;
		} else if (err) {
			if (info) {
				struct ip_tunnel_info *unclone;

				unclone = skb_tunnel_info_unclone(skb);
				if (unlikely(!unclone))
					goto tx_error;

				unclone->key.u.ipv6.src = pkey->u.ipv6.dst;
				unclone->key.u.ipv6.dst = saddr;
			}

			vxlan_encap_bypass(skb, vxlan, vxlan, vni, false);
			dst_release(ndst);
			goto out_unlock;
		}

		tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
		ttl = ttl ? : ip6_dst_hoplimit(ndst);
		skb_scrub_packet(skb, xnet);
		err = vxlan_build_skb(skb, ndst, sizeof(struct ipv6hdr),
				      vni, md, flags, udp_sum);
		if (err < 0) {
			reason = SKB_DROP_REASON_NOMEM;
			goto tx_error;
		}

		udp_tunnel6_xmit_skb(ndst, sock6->sock->sk, skb, dev,
				     &saddr, &pkey->u.ipv6.dst, tos, ttl,
				     pkey->label, src_port, dst_port, !udp_sum);
#endif
	}
	vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX, pkt_len);
out_unlock:
	rcu_read_unlock();
	return;

drop:
	dev_core_stats_tx_dropped_inc(dev);
	vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_DROPS, 0);
	kfree_skb_reason(skb, reason);
	return;

tx_error:
	rcu_read_unlock();
	if (err == -ELOOP)
		DEV_STATS_INC(dev, collisions);
	else if (err == -ENETUNREACH)
		DEV_STATS_INC(dev, tx_carrier_errors);
	dst_release(ndst);
	DEV_STATS_INC(dev, tx_errors);
	vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_ERRORS, 0);
	kfree_skb_reason(skb, reason);
}
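
/* For reference, the encapsulation performed above yields the classic
 * VXLAN packet layout (illustrative, non-GPE IPv4 case):
 *
 *   outer Ethernet | outer IPv4 (20 B) | outer UDP (8 B) |
 *   VXLAN header (8 B) | inner Ethernet frame
 *
 * which is why vxlan_headroom() amounts to 50 bytes of overhead for
 * IPv4 and 70 bytes for IPv6 in this configuration, and why a device
 * on a 1500-byte lowerdev typically ends up with a 1450-byte MTU.
 */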
static void vxlan_xmit_nh(struct sk_buff *skb, struct net_device *dev,
			  struct vxlan_fdb *f, __be32 vni, bool did_rsc)
{
	struct vxlan_rdst nh_rdst;
	struct nexthop *nh;
	bool do_xmit;
	u32 hash;

	memset(&nh_rdst, 0, sizeof(struct vxlan_rdst));
	hash = skb_get_hash(skb);

	rcu_read_lock();
	nh = rcu_dereference(f->nh);
	if (!nh) {
		rcu_read_unlock();
		goto drop;
	}
	do_xmit = vxlan_fdb_nh_path_select(nh, hash, &nh_rdst);
	rcu_read_unlock();

	if (likely(do_xmit))
		vxlan_xmit_one(skb, dev, vni, &nh_rdst, did_rsc);
	else
		goto drop;

	return;

drop:
	dev_core_stats_tx_dropped_inc(dev);
	vxlan_vnifilter_count(netdev_priv(dev), vni, NULL,
			      VXLAN_VNI_STATS_TX_DROPS, 0);
	dev_kfree_skb(skb);
}
static netdev_tx_t vxlan_xmit_nhid(struct sk_buff *skb, struct net_device *dev,
				   u32 nhid, __be32 vni)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_rdst nh_rdst;
	struct nexthop *nh;
	bool do_xmit;
	u32 hash;

	memset(&nh_rdst, 0, sizeof(struct vxlan_rdst));
	hash = skb_get_hash(skb);

	rcu_read_lock();
	nh = nexthop_find_by_id(dev_net(dev), nhid);
	if (unlikely(!nh || !nexthop_is_fdb(nh) || !nexthop_is_multipath(nh))) {
		rcu_read_unlock();
		goto drop;
	}
	do_xmit = vxlan_fdb_nh_path_select(nh, hash, &nh_rdst);
	rcu_read_unlock();

	if (vxlan->cfg.saddr.sa.sa_family != nh_rdst.remote_ip.sa.sa_family)
		goto drop;

	if (likely(do_xmit))
		vxlan_xmit_one(skb, dev, vni, &nh_rdst, false);
	else
		goto drop;

	return NETDEV_TX_OK;

drop:
	dev_core_stats_tx_dropped_inc(dev);
	vxlan_vnifilter_count(netdev_priv(dev), vni, NULL,
			      VXLAN_VNI_STATS_TX_DROPS, 0);
	dev_kfree_skb(skb);
	return NETDEV_TX_OK;
}
/* Transmit local packets over VXLAN
 *
 * Outer IP header inherits ECN and DF from inner header.
 * Outer UDP destination is the VXLAN assigned port.
 * Outer UDP source port is based on a hash of the flow.
 */
static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_rdst *rdst, *fdst = NULL;
	const struct ip_tunnel_info *info;
	struct vxlan_fdb *f;
	struct ethhdr *eth;
	__be32 vni = 0;
	u32 nhid = 0;
	bool did_rsc;

	info = skb_tunnel_info(skb);

	skb_reset_mac_header(skb);

	if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) {
		if (info && info->mode & IP_TUNNEL_INFO_BRIDGE &&
		    info->mode & IP_TUNNEL_INFO_TX) {
			vni = tunnel_id_to_key32(info->key.tun_id);
			nhid = info->key.nhid;
		} else {
			if (info && info->mode & IP_TUNNEL_INFO_TX)
				vxlan_xmit_one(skb, dev, vni, NULL, false);
			else
				kfree_skb_reason(skb, SKB_DROP_REASON_TUNNEL_TXINFO);
			return NETDEV_TX_OK;
		}
	}

	if (vxlan->cfg.flags & VXLAN_F_PROXY) {
		eth = eth_hdr(skb);
		if (ntohs(eth->h_proto) == ETH_P_ARP)
			return arp_reduce(dev, skb, vni);
#if IS_ENABLED(CONFIG_IPV6)
		else if (ntohs(eth->h_proto) == ETH_P_IPV6 &&
			 pskb_may_pull(skb, sizeof(struct ipv6hdr) +
					    sizeof(struct nd_msg)) &&
			 ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) {
			struct nd_msg *m = (struct nd_msg *)(ipv6_hdr(skb) + 1);

			if (m->icmph.icmp6_code == 0 &&
			    m->icmph.icmp6_type == NDISC_NEIGHBOUR_SOLICITATION)
				return neigh_reduce(dev, skb, vni);
		}
#endif
	}

	if (nhid)
		return vxlan_xmit_nhid(skb, dev, nhid, vni);

	if (vxlan->cfg.flags & VXLAN_F_MDB) {
		struct vxlan_mdb_entry *mdb_entry;

		rcu_read_lock();
		mdb_entry = vxlan_mdb_entry_skb_get(vxlan, skb, vni);
		if (mdb_entry) {
			netdev_tx_t ret;

			ret = vxlan_mdb_xmit(vxlan, mdb_entry, skb);
			rcu_read_unlock();
			return ret;
		}
		rcu_read_unlock();
	}

	eth = eth_hdr(skb);
	f = vxlan_find_mac(vxlan, eth->h_dest, vni);
	did_rsc = false;

	if (f && (f->flags & NTF_ROUTER) && (vxlan->cfg.flags & VXLAN_F_RSC) &&
	    (ntohs(eth->h_proto) == ETH_P_IP ||
	     ntohs(eth->h_proto) == ETH_P_IPV6)) {
		did_rsc = route_shortcircuit(dev, skb);
		if (did_rsc)
			f = vxlan_find_mac(vxlan, eth->h_dest, vni);
	}

	if (f == NULL) {
		f = vxlan_find_mac(vxlan, all_zeros_mac, vni);
		if (f == NULL) {
			if ((vxlan->cfg.flags & VXLAN_F_L2MISS) &&
			    !is_multicast_ether_addr(eth->h_dest))
				vxlan_fdb_miss(vxlan, eth->h_dest);

			dev_core_stats_tx_dropped_inc(dev);
			vxlan_vnifilter_count(vxlan, vni, NULL,
					      VXLAN_VNI_STATS_TX_DROPS, 0);
			kfree_skb_reason(skb, SKB_DROP_REASON_VXLAN_NO_REMOTE);
			return NETDEV_TX_OK;
		}
	}

	if (rcu_access_pointer(f->nh)) {
		vxlan_xmit_nh(skb, dev, f,
			      (vni ? : vxlan->default_dst.remote_vni), did_rsc);
	} else {
		list_for_each_entry_rcu(rdst, &f->remotes, list) {
			struct sk_buff *skb1;

			if (!fdst) {
				fdst = rdst;
				continue;
			}
			skb1 = skb_clone(skb, GFP_ATOMIC);
			if (skb1)
				vxlan_xmit_one(skb1, dev, vni, rdst, did_rsc);
		}
		if (fdst)
			vxlan_xmit_one(skb, dev, vni, fdst, did_rsc);
		else
			kfree_skb_reason(skb, SKB_DROP_REASON_VXLAN_NO_REMOTE);
	}

	return NETDEV_TX_OK;
}
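
/* Example (illustrative, placeholder addresses and names): with a
 * unicast default remote, the all-zeros MAC entry created for the
 * device is the flood destination that vxlan_xmit() falls back to on
 * an FDB miss:
 *
 *   # ip link add vxlan0 type vxlan id 42 remote 192.0.2.2 \
 *         local 192.0.2.1 dev eth0 dstport 4789
 *   # bridge fdb show dev vxlan0
 *   00:00:00:00:00:00 dst 192.0.2.2 via eth0 self permanent
 */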
/* Walk the forwarding table and purge stale entries */
static void vxlan_cleanup(struct timer_list *t)
{
	struct vxlan_dev *vxlan = from_timer(vxlan, t, age_timer);
	unsigned long next_timer = jiffies + FDB_AGE_INTERVAL;
	unsigned int h;

	if (!netif_running(vxlan->dev))
		return;

	for (h = 0; h < FDB_HASH_SIZE; ++h) {
		struct hlist_node *p, *n;

		spin_lock(&vxlan->hash_lock[h]);
		hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
			struct vxlan_fdb *f
				= container_of(p, struct vxlan_fdb, hlist);
			unsigned long timeout;

			if (f->state & (NUD_PERMANENT | NUD_NOARP))
				continue;

			if (f->flags & NTF_EXT_LEARNED)
				continue;

			timeout = f->used + vxlan->cfg.age_interval * HZ;
			if (time_before_eq(timeout, jiffies)) {
				netdev_dbg(vxlan->dev,
					   "garbage collect %pM\n",
					   f->eth_addr);
				f->state = NUD_STALE;
				vxlan_fdb_destroy(vxlan, f, true, true);
			} else if (time_before(timeout, next_timer))
				next_timer = timeout;
		}
		spin_unlock(&vxlan->hash_lock[h]);
	}

	mod_timer(&vxlan->age_timer, next_timer);
}
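
/* Worked example (assuming the defaults above): with age_interval at
 * FDB_AGE_DEFAULT (300 s), a learned entry whose f->used timestamp is
 * more than 300 * HZ jiffies old is destroyed on the next timer run;
 * since the rescan fires at most FDB_AGE_INTERVAL (10 s) apart, a
 * stale entry can outlive its deadline by up to one rescan interval.
 */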
static void vxlan_vs_del_dev(struct vxlan_dev *vxlan)
{
	struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);

	spin_lock(&vn->sock_lock);
	hlist_del_init_rcu(&vxlan->hlist4.hlist);
#if IS_ENABLED(CONFIG_IPV6)
	hlist_del_init_rcu(&vxlan->hlist6.hlist);
#endif
	spin_unlock(&vn->sock_lock);
}
static void vxlan_vs_add_dev(struct vxlan_sock *vs, struct vxlan_dev *vxlan,
			     struct vxlan_dev_node *node)
{
	struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
	__be32 vni = vxlan->default_dst.remote_vni;

	node->vxlan = vxlan;
	spin_lock(&vn->sock_lock);
	hlist_add_head_rcu(&node->hlist, vni_head(vs, vni));
	spin_unlock(&vn->sock_lock);
}
/* Setup stats when device is created */
static int vxlan_init(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	int err;

	if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) {
		err = vxlan_vnigroup_init(vxlan);
		if (err)
			return err;
	}

	err = gro_cells_init(&vxlan->gro_cells, dev);
	if (err)
		goto err_vnigroup_uninit;

	err = vxlan_mdb_init(vxlan);
	if (err)
		goto err_gro_cells_destroy;

	netdev_lockdep_set_classes(dev);
	return 0;

err_gro_cells_destroy:
	gro_cells_destroy(&vxlan->gro_cells);
err_vnigroup_uninit:
	if (vxlan->cfg.flags & VXLAN_F_VNIFILTER)
		vxlan_vnigroup_uninit(vxlan);
	return err;
}
static void vxlan_fdb_delete_default(struct vxlan_dev *vxlan, __be32 vni)
{
	struct vxlan_fdb *f;
	u32 hash_index = fdb_head_index(vxlan, all_zeros_mac, vni);

	spin_lock_bh(&vxlan->hash_lock[hash_index]);
	f = __vxlan_find_mac(vxlan, all_zeros_mac, vni);
	if (f)
		vxlan_fdb_destroy(vxlan, f, true, true);
	spin_unlock_bh(&vxlan->hash_lock[hash_index]);
}
static void vxlan_uninit(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);

	vxlan_mdb_fini(vxlan);

	if (vxlan->cfg.flags & VXLAN_F_VNIFILTER)
		vxlan_vnigroup_uninit(vxlan);

	gro_cells_destroy(&vxlan->gro_cells);

	vxlan_fdb_delete_default(vxlan, vxlan->cfg.vni);
}
/* Start ageing timer and join group when device is brought up */
static int vxlan_open(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	int ret;

	ret = vxlan_sock_add(vxlan);
	if (ret < 0)
		return ret;

	ret = vxlan_multicast_join(vxlan);
	if (ret) {
		vxlan_sock_release(vxlan);
		return ret;
	}

	if (vxlan->cfg.age_interval)
		mod_timer(&vxlan->age_timer, jiffies + FDB_AGE_INTERVAL);

	return 0;
}
struct vxlan_fdb_flush_desc {
	bool				ignore_default_entry;
	unsigned long			state;
	unsigned long			state_mask;
	unsigned long			flags;
	unsigned long			flags_mask;
	__be32				src_vni;
	u32				nhid;
	__be32				vni;
	__be16				port;
	union vxlan_addr		dst_ip;
};
static bool vxlan_fdb_is_default_entry(const struct vxlan_fdb *f,
				       const struct vxlan_dev *vxlan)
{
	return is_zero_ether_addr(f->eth_addr) && f->vni == vxlan->cfg.vni;
}
static bool vxlan_fdb_nhid_matches(const struct vxlan_fdb *f, u32 nhid)
{
	struct nexthop *nh = rtnl_dereference(f->nh);

	return nh && nh->id == nhid;
}
static bool vxlan_fdb_flush_matches(const struct vxlan_fdb *f,
				    const struct vxlan_dev *vxlan,
				    const struct vxlan_fdb_flush_desc *desc)
{
	if (desc->state_mask && (f->state & desc->state_mask) != desc->state)
		return false;

	if (desc->flags_mask && (f->flags & desc->flags_mask) != desc->flags)
		return false;

	if (desc->ignore_default_entry && vxlan_fdb_is_default_entry(f, vxlan))
		return false;

	if (desc->src_vni && f->vni != desc->src_vni)
		return false;

	if (desc->nhid && !vxlan_fdb_nhid_matches(f, desc->nhid))
		return false;

	return true;
}
static bool
vxlan_fdb_flush_should_match_remotes(const struct vxlan_fdb_flush_desc *desc)
{
	return desc->vni || desc->port || desc->dst_ip.sa.sa_family;
}
static bool
vxlan_fdb_flush_remote_matches(const struct vxlan_fdb_flush_desc *desc,
			       const struct vxlan_rdst *rd)
{
	if (desc->vni && rd->remote_vni != desc->vni)
		return false;

	if (desc->port && rd->remote_port != desc->port)
		return false;

	if (desc->dst_ip.sa.sa_family &&
	    !vxlan_addr_equal(&rd->remote_ip, &desc->dst_ip))
		return false;

	return true;
}
static void
vxlan_fdb_flush_match_remotes(struct vxlan_fdb *f, struct vxlan_dev *vxlan,
			      const struct vxlan_fdb_flush_desc *desc,
			      bool *p_destroy_fdb)
{
	bool remotes_flushed = false;
	struct vxlan_rdst *rd, *tmp;

	list_for_each_entry_safe(rd, tmp, &f->remotes, list) {
		if (!vxlan_fdb_flush_remote_matches(desc, rd))
			continue;

		vxlan_fdb_dst_destroy(vxlan, f, rd, true);
		remotes_flushed = true;
	}

	*p_destroy_fdb = remotes_flushed && list_empty(&f->remotes);
}
/* Purge the forwarding table */
static void vxlan_flush(struct vxlan_dev *vxlan,
			const struct vxlan_fdb_flush_desc *desc)
{
	bool match_remotes = vxlan_fdb_flush_should_match_remotes(desc);
	unsigned int h;

	for (h = 0; h < FDB_HASH_SIZE; ++h) {
		struct hlist_node *p, *n;

		spin_lock_bh(&vxlan->hash_lock[h]);
		hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
			struct vxlan_fdb *f
				= container_of(p, struct vxlan_fdb, hlist);

			if (!vxlan_fdb_flush_matches(f, vxlan, desc))
				continue;

			if (match_remotes) {
				bool destroy_fdb = false;

				vxlan_fdb_flush_match_remotes(f, vxlan, desc,
							      &destroy_fdb);

				if (!destroy_fdb)
					continue;
			}

			vxlan_fdb_destroy(vxlan, f, true, true);
		}
		spin_unlock_bh(&vxlan->hash_lock[h]);
	}
}
static const struct nla_policy vxlan_del_bulk_policy[NDA_MAX + 1] = {
	[NDA_SRC_VNI]	= { .type = NLA_U32 },
	[NDA_NH_ID]	= { .type = NLA_U32 },
	[NDA_VNI]	= { .type = NLA_U32 },
	[NDA_PORT]	= { .type = NLA_U16 },
	[NDA_DST]	= NLA_POLICY_RANGE(NLA_BINARY, sizeof(struct in_addr),
					   sizeof(struct in6_addr)),
	[NDA_NDM_STATE_MASK]	= { .type = NLA_U16 },
	[NDA_NDM_FLAGS_MASK]	= { .type = NLA_U8 },
};
#define VXLAN_FDB_FLUSH_IGNORED_NDM_FLAGS (NTF_MASTER | NTF_SELF)
#define VXLAN_FDB_FLUSH_ALLOWED_NDM_STATES (NUD_PERMANENT | NUD_NOARP)
#define VXLAN_FDB_FLUSH_ALLOWED_NDM_FLAGS (NTF_EXT_LEARNED | NTF_OFFLOADED | \
					   NTF_ROUTER)
static int vxlan_fdb_delete_bulk(struct nlmsghdr *nlh, struct net_device *dev,
				 struct netlink_ext_ack *extack)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_fdb_flush_desc desc = {};
	struct ndmsg *ndm = nlmsg_data(nlh);
	struct nlattr *tb[NDA_MAX + 1];
	u8 ndm_flags;
	int err;

	ndm_flags = ndm->ndm_flags & ~VXLAN_FDB_FLUSH_IGNORED_NDM_FLAGS;

	err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, vxlan_del_bulk_policy,
			  extack);
	if (err)
		return err;

	if (ndm_flags & ~VXLAN_FDB_FLUSH_ALLOWED_NDM_FLAGS) {
		NL_SET_ERR_MSG(extack, "Unsupported fdb flush ndm flag bits set");
		return -EINVAL;
	}
	if (ndm->ndm_state & ~VXLAN_FDB_FLUSH_ALLOWED_NDM_STATES) {
		NL_SET_ERR_MSG(extack, "Unsupported fdb flush ndm state bits set");
		return -EINVAL;
	}

	desc.state = ndm->ndm_state;
	desc.flags = ndm_flags;

	if (tb[NDA_NDM_STATE_MASK])
		desc.state_mask = nla_get_u16(tb[NDA_NDM_STATE_MASK]);

	if (tb[NDA_NDM_FLAGS_MASK])
		desc.flags_mask = nla_get_u8(tb[NDA_NDM_FLAGS_MASK]);

	if (tb[NDA_SRC_VNI])
		desc.src_vni = cpu_to_be32(nla_get_u32(tb[NDA_SRC_VNI]));

	if (tb[NDA_NH_ID])
		desc.nhid = nla_get_u32(tb[NDA_NH_ID]);

	if (tb[NDA_VNI])
		desc.vni = cpu_to_be32(nla_get_u32(tb[NDA_VNI]));

	if (tb[NDA_PORT])
		desc.port = nla_get_be16(tb[NDA_PORT]);

	if (tb[NDA_DST]) {
		union vxlan_addr ip;

		err = vxlan_nla_get_addr(&ip, tb[NDA_DST]);
		if (err) {
			NL_SET_ERR_MSG_ATTR(extack, tb[NDA_DST],
					    "Unsupported address family");
			return err;
		}
		desc.dst_ip = ip;
	}

	vxlan_flush(vxlan, &desc);

	return 0;
}
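
/* Example (illustrative): the bulk delete above backs iproute2's
 * "bridge fdb flush" on a vxlan device, along the lines of:
 *
 *   # bridge fdb flush dev vxlan0 vni 42 dst 192.0.2.2 self
 *
 * which maps to NDA_VNI and NDA_DST, while the self/master bits are
 * the ones masked off by VXLAN_FDB_FLUSH_IGNORED_NDM_FLAGS.
 */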
/* Cleanup timer and forwarding table on shutdown */
static int vxlan_stop(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_fdb_flush_desc desc = {
		/* Default entry is deleted at vxlan_uninit. */
		.ignore_default_entry = true,
		.state = 0,
		.state_mask = NUD_PERMANENT | NUD_NOARP,
	};

	vxlan_multicast_leave(vxlan);

	del_timer_sync(&vxlan->age_timer);

	vxlan_flush(vxlan, &desc);
	vxlan_sock_release(vxlan);

	return 0;
}
/* Stub, nothing needs to be done. */
static void vxlan_set_multicast_list(struct net_device *dev)
{
}
static int vxlan_change_mtu(struct net_device *dev, int new_mtu)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_rdst *dst = &vxlan->default_dst;
	struct net_device *lowerdev = __dev_get_by_index(vxlan->net,
							 dst->remote_ifindex);

	/* This check is different from dev->max_mtu, because it looks at
	 * the lowerdev->mtu, rather than the static dev->max_mtu
	 */
	if (lowerdev) {
		int max_mtu = lowerdev->mtu - vxlan_headroom(vxlan->cfg.flags);

		if (new_mtu > max_mtu)
			return -EINVAL;
	}

	WRITE_ONCE(dev->mtu, new_mtu);
	return 0;
}
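
/* Example: with a 1500-byte lowerdev and default flags (IPv4, no GPE),
 * vxlan_headroom() is 50, so new_mtu may be at most 1450 here; an
 * IPv6-capable or metadata device loses 70 bytes instead.
 */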
static int vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct ip_tunnel_info *info = skb_tunnel_info(skb);
	__be16 sport, dport;

	sport = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min,
				  vxlan->cfg.port_max, true);
	dport = info->key.tp_dst ? : vxlan->cfg.dst_port;

	if (ip_tunnel_info_af(info) == AF_INET) {
		struct vxlan_sock *sock4 = rcu_dereference(vxlan->vn4_sock);
		struct rtable *rt;

		if (!sock4)
			return -EIO;

		rt = udp_tunnel_dst_lookup(skb, dev, vxlan->net, 0,
					   &info->key.u.ipv4.src,
					   &info->key,
					   sport, dport, info->key.tos,
					   &info->dst_cache);
		if (IS_ERR(rt))
			return PTR_ERR(rt);
		ip_rt_put(rt);
	} else {
#if IS_ENABLED(CONFIG_IPV6)
		struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock);
		struct dst_entry *ndst;

		if (!sock6)
			return -EIO;

		ndst = udp_tunnel6_dst_lookup(skb, dev, vxlan->net, sock6->sock,
					      0, &info->key.u.ipv6.src,
					      &info->key,
					      sport, dport, info->key.tos,
					      &info->dst_cache);
		if (IS_ERR(ndst))
			return PTR_ERR(ndst);
		dst_release(ndst);
#else /* !CONFIG_IPV6 */
		return -EPFNOSUPPORT;
#endif
	}
	info->key.tp_src = sport;
	info->key.tp_dst = dport;
	return 0;
}
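
/* Example (illustrative): ndo_fill_metadata_dst is exercised by
 * flow-based ("external") devices, where per-packet tunnel metadata
 * rather than device configuration drives the routing decision:
 *
 *   # ip link add vxlan0 type vxlan dstport 4789 external
 *
 * A tc or Open vSwitch rule then supplies the tunnel key, the outer
 * src/dst addresses and the VNI for each packet.
 */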
static const struct net_device_ops vxlan_netdev_ether_ops = {
	.ndo_init		= vxlan_init,
	.ndo_uninit		= vxlan_uninit,
	.ndo_open		= vxlan_open,
	.ndo_stop		= vxlan_stop,
	.ndo_start_xmit		= vxlan_xmit,
	.ndo_set_rx_mode	= vxlan_set_multicast_list,
	.ndo_change_mtu		= vxlan_change_mtu,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_set_mac_address	= eth_mac_addr,
	.ndo_fdb_add		= vxlan_fdb_add,
	.ndo_fdb_del		= vxlan_fdb_delete,
	.ndo_fdb_del_bulk	= vxlan_fdb_delete_bulk,
	.ndo_fdb_dump		= vxlan_fdb_dump,
	.ndo_fdb_get		= vxlan_fdb_get,
	.ndo_mdb_add		= vxlan_mdb_add,
	.ndo_mdb_del		= vxlan_mdb_del,
	.ndo_mdb_del_bulk	= vxlan_mdb_del_bulk,
	.ndo_mdb_dump		= vxlan_mdb_dump,
	.ndo_mdb_get		= vxlan_mdb_get,
	.ndo_fill_metadata_dst	= vxlan_fill_metadata_dst,
};
static const struct net_device_ops vxlan_netdev_raw_ops = {
	.ndo_init		= vxlan_init,
	.ndo_uninit		= vxlan_uninit,
	.ndo_open		= vxlan_open,
	.ndo_stop		= vxlan_stop,
	.ndo_start_xmit		= vxlan_xmit,
	.ndo_change_mtu		= vxlan_change_mtu,
	.ndo_fill_metadata_dst	= vxlan_fill_metadata_dst,
};
/* Info for udev, that this is a virtual tunnel endpoint */
static const struct device_type vxlan_type = {
	.name = "vxlan",
};
/* Calls the ndo_udp_tunnel_add of the caller in order to
 * supply the listening VXLAN udp ports. Callers are expected
 * to implement the ndo_udp_tunnel_add.
 */
static void vxlan_offload_rx_ports(struct net_device *dev, bool push)
{
	struct vxlan_sock *vs;
	struct net *net = dev_net(dev);
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	unsigned int i;

	spin_lock(&vn->sock_lock);
	for (i = 0; i < PORT_HASH_SIZE; ++i) {
		hlist_for_each_entry_rcu(vs, &vn->sock_list[i], hlist) {
			unsigned short type;

			if (vs->flags & VXLAN_F_GPE)
				type = UDP_TUNNEL_TYPE_VXLAN_GPE;
			else
				type = UDP_TUNNEL_TYPE_VXLAN;

			if (push)
				udp_tunnel_push_rx_port(dev, vs->sock, type);
			else
				udp_tunnel_drop_rx_port(dev, vs->sock, type);
		}
	}
	spin_unlock(&vn->sock_lock);
}
/* Initialize the device structure. */
static void vxlan_setup(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	unsigned int h;

	eth_hw_addr_random(dev);
	ether_setup(dev);

	dev->needs_free_netdev = true;
	SET_NETDEV_DEVTYPE(dev, &vxlan_type);

	dev->features	|= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST;
	dev->features	|= NETIF_F_RXCSUM;
	dev->features	|= NETIF_F_GSO_SOFTWARE;

	dev->vlan_features = dev->features;
	dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST;
	dev->hw_features |= NETIF_F_RXCSUM;
	dev->hw_features |= NETIF_F_GSO_SOFTWARE;
	netif_keep_dst(dev);
	dev->priv_flags |= IFF_NO_QUEUE;
	dev->change_proto_down = true;

	/* MTU range: 68 - 65535 */
	dev->min_mtu = ETH_MIN_MTU;
	dev->max_mtu = ETH_MAX_MTU;

	dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;
	INIT_LIST_HEAD(&vxlan->next);

	timer_setup(&vxlan->age_timer, vxlan_cleanup, TIMER_DEFERRABLE);

	vxlan->dev = dev;

	for (h = 0; h < FDB_HASH_SIZE; ++h) {
		spin_lock_init(&vxlan->hash_lock[h]);
		INIT_HLIST_HEAD(&vxlan->fdb_head[h]);
	}
}
static void vxlan_ether_setup(struct net_device *dev)
{
	ether_setup(dev);
	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
	dev->netdev_ops = &vxlan_netdev_ether_ops;
}
static void vxlan_raw_setup(struct net_device *dev)
{
	dev->header_ops = NULL;
	dev->type = ARPHRD_NONE;
	dev->hard_header_len = 0;
	dev->addr_len = 0;
	dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;

	dev->netdev_ops = &vxlan_netdev_raw_ops;
}
static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
	[IFLA_VXLAN_UNSPEC]	= { .strict_start_type = IFLA_VXLAN_LOCALBYPASS },
	[IFLA_VXLAN_ID]		= { .type = NLA_U32 },
	[IFLA_VXLAN_GROUP]	= { .len = sizeof_field(struct iphdr, daddr) },
	[IFLA_VXLAN_GROUP6]	= { .len = sizeof(struct in6_addr) },
	[IFLA_VXLAN_LINK]	= { .type = NLA_U32 },
	[IFLA_VXLAN_LOCAL]	= { .len = sizeof_field(struct iphdr, saddr) },
	[IFLA_VXLAN_LOCAL6]	= { .len = sizeof(struct in6_addr) },
	[IFLA_VXLAN_TOS]	= { .type = NLA_U8 },
	[IFLA_VXLAN_TTL]	= { .type = NLA_U8 },
	[IFLA_VXLAN_LABEL]	= { .type = NLA_U32 },
	[IFLA_VXLAN_LEARNING]	= { .type = NLA_U8 },
	[IFLA_VXLAN_AGEING]	= { .type = NLA_U32 },
	[IFLA_VXLAN_LIMIT]	= { .type = NLA_U32 },
	[IFLA_VXLAN_PORT_RANGE] = { .len = sizeof(struct ifla_vxlan_port_range) },
	[IFLA_VXLAN_PROXY]	= { .type = NLA_U8 },
	[IFLA_VXLAN_RSC]	= { .type = NLA_U8 },
	[IFLA_VXLAN_L2MISS]	= { .type = NLA_U8 },
	[IFLA_VXLAN_L3MISS]	= { .type = NLA_U8 },
	[IFLA_VXLAN_COLLECT_METADATA]	= { .type = NLA_U8 },
	[IFLA_VXLAN_PORT]	= { .type = NLA_U16 },
	[IFLA_VXLAN_UDP_CSUM]	= { .type = NLA_U8 },
	[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]	= { .type = NLA_U8 },
	[IFLA_VXLAN_UDP_ZERO_CSUM6_RX]	= { .type = NLA_U8 },
	[IFLA_VXLAN_REMCSUM_TX]	= { .type = NLA_U8 },
	[IFLA_VXLAN_REMCSUM_RX]	= { .type = NLA_U8 },
	[IFLA_VXLAN_GBP]	= { .type = NLA_FLAG, },
	[IFLA_VXLAN_GPE]	= { .type = NLA_FLAG, },
	[IFLA_VXLAN_REMCSUM_NOPARTIAL]	= { .type = NLA_FLAG },
	[IFLA_VXLAN_TTL_INHERIT]	= { .type = NLA_FLAG },
	[IFLA_VXLAN_DF]		= { .type = NLA_U8 },
	[IFLA_VXLAN_VNIFILTER]	= { .type = NLA_U8 },
	[IFLA_VXLAN_LOCALBYPASS]	= NLA_POLICY_MAX(NLA_U8, 1),
	[IFLA_VXLAN_LABEL_POLICY]	= NLA_POLICY_MAX(NLA_U32, VXLAN_LABEL_MAX),
};
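
/* Example (illustrative): a typical RTM_NEWLINK request carries a
 * subset of the attributes validated by the policy above. iproute2's
 *
 *   # ip link add vxlan0 type vxlan id 42 group 239.1.1.1 \
 *         dev eth0 dstport 4789 ttl 16
 *
 * maps to IFLA_VXLAN_ID, IFLA_VXLAN_GROUP, IFLA_VXLAN_LINK,
 * IFLA_VXLAN_PORT and IFLA_VXLAN_TTL.
 */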
static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[],
			  struct netlink_ext_ack *extack)
{
	if (tb[IFLA_ADDRESS]) {
		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_ADDRESS],
					    "Provided link layer address is not Ethernet");
			return -EINVAL;
		}

		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_ADDRESS],
					    "Provided Ethernet address is not unicast");
			return -EADDRNOTAVAIL;
		}
	}

	if (tb[IFLA_MTU]) {
		u32 mtu = nla_get_u32(tb[IFLA_MTU]);

		if (mtu < ETH_MIN_MTU || mtu > ETH_MAX_MTU) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_MTU],
					    "MTU must be between 68 and 65535");
			return -EINVAL;
		}
	}

	if (!data) {
		NL_SET_ERR_MSG(extack,
			       "Required attributes not provided to perform the operation");
		return -EINVAL;
	}

	if (data[IFLA_VXLAN_ID]) {
		u32 id = nla_get_u32(data[IFLA_VXLAN_ID]);

		if (id >= VXLAN_N_VID) {
			NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_ID],
					    "VXLAN ID must be lower than 16777216");
			return -ERANGE;
		}
	}

	if (data[IFLA_VXLAN_PORT_RANGE]) {
		const struct ifla_vxlan_port_range *p
			= nla_data(data[IFLA_VXLAN_PORT_RANGE]);

		if (ntohs(p->high) < ntohs(p->low)) {
			NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_PORT_RANGE],
					    "Invalid source port range");
			return -EINVAL;
		}
	}

	if (data[IFLA_VXLAN_DF]) {
		enum ifla_vxlan_df df = nla_get_u8(data[IFLA_VXLAN_DF]);

		if (df < 0 || df > VXLAN_DF_MAX) {
			NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_DF],
					    "Invalid DF attribute");
			return -EINVAL;
		}
	}

	return 0;
}
static void vxlan_get_drvinfo(struct net_device *netdev,
			      struct ethtool_drvinfo *drvinfo)
{
	strscpy(drvinfo->version, VXLAN_VERSION, sizeof(drvinfo->version));
	strscpy(drvinfo->driver, "vxlan", sizeof(drvinfo->driver));
}
static int vxlan_get_link_ksettings(struct net_device *dev,
				    struct ethtool_link_ksettings *cmd)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_rdst *dst = &vxlan->default_dst;
	struct net_device *lowerdev = __dev_get_by_index(vxlan->net,
							 dst->remote_ifindex);

	if (!lowerdev) {
		cmd->base.duplex = DUPLEX_UNKNOWN;
		cmd->base.port = PORT_OTHER;
		cmd->base.speed = SPEED_UNKNOWN;

		return 0;
	}

	return __ethtool_get_link_ksettings(lowerdev, cmd);
}
static const struct ethtool_ops vxlan_ethtool_ops = {
	.get_drvinfo		= vxlan_get_drvinfo,
	.get_link		= ethtool_op_get_link,
	.get_link_ksettings	= vxlan_get_link_ksettings,
};
static struct socket *vxlan_create_sock(struct net *net, bool ipv6,
					__be16 port, u32 flags, int ifindex)
{
	struct socket *sock;
	struct udp_port_cfg udp_conf;
	int err;

	memset(&udp_conf, 0, sizeof(udp_conf));

	if (ipv6) {
		udp_conf.family = AF_INET6;
		udp_conf.use_udp6_rx_checksums =
		    !(flags & VXLAN_F_UDP_ZERO_CSUM6_RX);
		udp_conf.ipv6_v6only = 1;
	} else {
		udp_conf.family = AF_INET;
	}

	udp_conf.local_udp_port = port;
	udp_conf.bind_ifindex = ifindex;

	/* Open UDP socket */
	err = udp_sock_create(net, &udp_conf, &sock);
	if (err < 0)
		return ERR_PTR(err);

	udp_allow_gso(sock->sk);
	return sock;
}
/* Create new listen socket if needed */
static struct vxlan_sock *vxlan_socket_create(struct net *net, bool ipv6,
					      __be16 port, u32 flags,
					      int ifindex)
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	struct vxlan_sock *vs;
	struct socket *sock;
	unsigned int h;
	struct udp_tunnel_sock_cfg tunnel_cfg;

	vs = kzalloc(sizeof(*vs), GFP_KERNEL);
	if (!vs)
		return ERR_PTR(-ENOMEM);

	for (h = 0; h < VNI_HASH_SIZE; ++h)
		INIT_HLIST_HEAD(&vs->vni_list[h]);

	sock = vxlan_create_sock(net, ipv6, port, flags, ifindex);
	if (IS_ERR(sock)) {
		kfree(vs);
		return ERR_CAST(sock);
	}

	vs->sock = sock;
	refcount_set(&vs->refcnt, 1);
	vs->flags = (flags & VXLAN_F_RCV_FLAGS);

	spin_lock(&vn->sock_lock);
	hlist_add_head_rcu(&vs->hlist, vs_head(net, port));
	udp_tunnel_notify_add_rx_port(sock,
				      (vs->flags & VXLAN_F_GPE) ?
				      UDP_TUNNEL_TYPE_VXLAN_GPE :
				      UDP_TUNNEL_TYPE_VXLAN);
	spin_unlock(&vn->sock_lock);

	/* Mark socket as an encapsulation socket. */
	memset(&tunnel_cfg, 0, sizeof(tunnel_cfg));
	tunnel_cfg.sk_user_data = vs;
	tunnel_cfg.encap_type = 1;
	tunnel_cfg.encap_rcv = vxlan_rcv;
	tunnel_cfg.encap_err_lookup = vxlan_err_lookup;
	tunnel_cfg.encap_destroy = NULL;
	if (vs->flags & VXLAN_F_GPE) {
		tunnel_cfg.gro_receive = vxlan_gpe_gro_receive;
		tunnel_cfg.gro_complete = vxlan_gpe_gro_complete;
	} else {
		tunnel_cfg.gro_receive = vxlan_gro_receive;
		tunnel_cfg.gro_complete = vxlan_gro_complete;
	}

	setup_udp_tunnel_sock(net, sock, &tunnel_cfg);

	return vs;
}
static int __vxlan_sock_add(struct vxlan_dev *vxlan, bool ipv6)
{
	struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
	bool metadata = vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA;
	struct vxlan_sock *vs = NULL;
	struct vxlan_dev_node *node;
	int l3mdev_index = 0;

	if (vxlan->cfg.remote_ifindex)
		l3mdev_index = l3mdev_master_upper_ifindex_by_index(
			vxlan->net, vxlan->cfg.remote_ifindex);

	if (!vxlan->cfg.no_share) {
		spin_lock(&vn->sock_lock);
		vs = vxlan_find_sock(vxlan->net, ipv6 ? AF_INET6 : AF_INET,
				     vxlan->cfg.dst_port, vxlan->cfg.flags,
				     l3mdev_index);
		if (vs && !refcount_inc_not_zero(&vs->refcnt)) {
			spin_unlock(&vn->sock_lock);
			return -EBUSY;
		}
		spin_unlock(&vn->sock_lock);
	}
	if (!vs)
		vs = vxlan_socket_create(vxlan->net, ipv6,
					 vxlan->cfg.dst_port, vxlan->cfg.flags,
					 l3mdev_index);
	if (IS_ERR(vs))
		return PTR_ERR(vs);
#if IS_ENABLED(CONFIG_IPV6)
	if (ipv6) {
		rcu_assign_pointer(vxlan->vn6_sock, vs);
		node = &vxlan->hlist6;
	} else
#endif
	{
		rcu_assign_pointer(vxlan->vn4_sock, vs);
		node = &vxlan->hlist4;
	}

	if (metadata && (vxlan->cfg.flags & VXLAN_F_VNIFILTER))
		vxlan_vs_add_vnigrp(vxlan, vs, ipv6);
	else
		vxlan_vs_add_dev(vs, vxlan, node);

	return 0;
}
static int vxlan_sock_add(struct vxlan_dev *vxlan)
{
	bool metadata = vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA;
	bool ipv6 = vxlan->cfg.flags & VXLAN_F_IPV6 || metadata;
	bool ipv4 = !ipv6 || metadata;
	int ret = 0;

	RCU_INIT_POINTER(vxlan->vn4_sock, NULL);
#if IS_ENABLED(CONFIG_IPV6)
	RCU_INIT_POINTER(vxlan->vn6_sock, NULL);
	if (ipv6) {
		ret = __vxlan_sock_add(vxlan, true);
		if (ret < 0 && ret != -EAFNOSUPPORT)
			ipv4 = false;
	}
#endif
	if (ipv4)
		ret = __vxlan_sock_add(vxlan, false);
	if (ret < 0)
		vxlan_sock_release(vxlan);
	return ret;
}
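
/* Note: for a metadata (collect_md) device both branches above run, so
 * the device ends up with an IPv4 and an IPv6 socket on the same port;
 * the -EAFNOSUPPORT check keeps IPv4-only kernels working.
 */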
int vxlan_vni_in_use(struct net *src_net, struct vxlan_dev *vxlan,
		     struct vxlan_config *conf, __be32 vni)
{
	struct vxlan_net *vn = net_generic(src_net, vxlan_net_id);
	struct vxlan_dev *tmp;

	list_for_each_entry(tmp, &vn->vxlan_list, next) {
		if (tmp == vxlan)
			continue;
		if (tmp->cfg.flags & VXLAN_F_VNIFILTER) {
			if (!vxlan_vnifilter_lookup(tmp, vni))
				continue;
		} else if (tmp->cfg.vni != vni) {
			continue;
		}
		if (tmp->cfg.dst_port != conf->dst_port)
			continue;
		if ((tmp->cfg.flags & (VXLAN_F_RCV_FLAGS | VXLAN_F_IPV6)) !=
		    (conf->flags & (VXLAN_F_RCV_FLAGS | VXLAN_F_IPV6)))
			continue;

		if ((conf->flags & VXLAN_F_IPV6_LINKLOCAL) &&
		    tmp->cfg.remote_ifindex != conf->remote_ifindex)
			continue;

		return -EEXIST;
	}

	return 0;
}
static int vxlan_config_validate(struct net *src_net, struct vxlan_config *conf,
				 struct net_device **lower,
				 struct vxlan_dev *old,
				 struct netlink_ext_ack *extack)
{
	bool use_ipv6 = false;

	if (conf->flags & VXLAN_F_GPE) {
		/* For now, allow GPE only together with
		 * COLLECT_METADATA. This can be relaxed later; in such
		 * case, the other side of the PtP link will have to be
		 * provided.
		 */
		if ((conf->flags & ~VXLAN_F_ALLOWED_GPE) ||
		    !(conf->flags & VXLAN_F_COLLECT_METADATA)) {
			NL_SET_ERR_MSG(extack,
				       "VXLAN GPE does not support this combination of attributes");
			return -EINVAL;
		}
	}

	if (!conf->remote_ip.sa.sa_family && !conf->saddr.sa.sa_family) {
		/* Unless IPv6 is explicitly requested, assume IPv4 */
		conf->remote_ip.sa.sa_family = AF_INET;
		conf->saddr.sa.sa_family = AF_INET;
	} else if (!conf->remote_ip.sa.sa_family) {
		conf->remote_ip.sa.sa_family = conf->saddr.sa.sa_family;
	} else if (!conf->saddr.sa.sa_family) {
		conf->saddr.sa.sa_family = conf->remote_ip.sa.sa_family;
	}

	if (conf->saddr.sa.sa_family != conf->remote_ip.sa.sa_family) {
		NL_SET_ERR_MSG(extack,
			       "Local and remote address must be from the same family");
		return -EINVAL;
	}

	if (vxlan_addr_multicast(&conf->saddr)) {
		NL_SET_ERR_MSG(extack, "Local address cannot be multicast");
		return -EINVAL;
	}

	if (conf->saddr.sa.sa_family == AF_INET6) {
		if (!IS_ENABLED(CONFIG_IPV6)) {
			NL_SET_ERR_MSG(extack,
				       "IPv6 support not enabled in the kernel");
			return -EPFNOSUPPORT;
		}
		use_ipv6 = true;
		conf->flags |= VXLAN_F_IPV6;

		if (!(conf->flags & VXLAN_F_COLLECT_METADATA)) {
			int local_type =
				ipv6_addr_type(&conf->saddr.sin6.sin6_addr);
			int remote_type =
				ipv6_addr_type(&conf->remote_ip.sin6.sin6_addr);

			if (local_type & IPV6_ADDR_LINKLOCAL) {
				if (!(remote_type & IPV6_ADDR_LINKLOCAL) &&
				    (remote_type != IPV6_ADDR_ANY)) {
					NL_SET_ERR_MSG(extack,
						       "Invalid combination of local and remote address scopes");
					return -EINVAL;
				}

				conf->flags |= VXLAN_F_IPV6_LINKLOCAL;
			} else {
				if (remote_type ==
				    (IPV6_ADDR_UNICAST | IPV6_ADDR_LINKLOCAL)) {
					NL_SET_ERR_MSG(extack,
						       "Invalid combination of local and remote address scopes");
					return -EINVAL;
				}

				conf->flags &= ~VXLAN_F_IPV6_LINKLOCAL;
			}
		}
	}

	if (conf->label && !use_ipv6) {
		NL_SET_ERR_MSG(extack,
			       "Label attribute only applies to IPv6 VXLAN devices");
		return -EINVAL;
	}

	if (conf->label_policy && !use_ipv6) {
		NL_SET_ERR_MSG(extack,
			       "Label policy only applies to IPv6 VXLAN devices");
		return -EINVAL;
	}

	if (conf->remote_ifindex) {
		struct net_device *lowerdev;

		lowerdev = __dev_get_by_index(src_net, conf->remote_ifindex);
		if (!lowerdev) {
			NL_SET_ERR_MSG(extack,
				       "Invalid local interface, device not found");
			return -ENODEV;
		}

#if IS_ENABLED(CONFIG_IPV6)
		if (use_ipv6) {
			struct inet6_dev *idev = __in6_dev_get(lowerdev);

			if (idev && idev->cnf.disable_ipv6) {
				NL_SET_ERR_MSG(extack,
					       "IPv6 support disabled by administrator");
				return -EPERM;
			}
		}
#endif

		*lower = lowerdev;
	} else {
		if (vxlan_addr_multicast(&conf->remote_ip)) {
			NL_SET_ERR_MSG(extack,
				       "Local interface required for multicast remote destination");
			return -EINVAL;
		}

#if IS_ENABLED(CONFIG_IPV6)
		if (conf->flags & VXLAN_F_IPV6_LINKLOCAL) {
			NL_SET_ERR_MSG(extack,
				       "Local interface required for link-local local/remote addresses");
			return -EINVAL;
		}
#endif

		*lower = NULL;
	}

	if (!conf->dst_port) {
		if (conf->flags & VXLAN_F_GPE)
			conf->dst_port = htons(IANA_VXLAN_GPE_UDP_PORT);
		else
			conf->dst_port = htons(vxlan_port);
	}

	if (!conf->age_interval)
		conf->age_interval = FDB_AGE_DEFAULT;

	if (vxlan_vni_in_use(src_net, old, conf, conf->vni)) {
		NL_SET_ERR_MSG(extack,
			       "A VXLAN device with the specified VNI already exists");
		return -EEXIST;
	}

	return 0;
}
static void vxlan_config_apply(struct net_device *dev,
			       struct vxlan_config *conf,
			       struct net_device *lowerdev,
			       struct net *src_net,
			       bool changelink)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_rdst *dst = &vxlan->default_dst;
	unsigned short needed_headroom = ETH_HLEN;
	int max_mtu = ETH_MAX_MTU;
	u32 flags = conf->flags;

	if (!changelink) {
		if (flags & VXLAN_F_GPE)
			vxlan_raw_setup(dev);
		else
			vxlan_ether_setup(dev);

		if (conf->mtu)
			dev->mtu = conf->mtu;

		vxlan->net = src_net;
	}

	dst->remote_vni = conf->vni;

	memcpy(&dst->remote_ip, &conf->remote_ip, sizeof(conf->remote_ip));

	if (lowerdev) {
		dst->remote_ifindex = conf->remote_ifindex;

		netif_inherit_tso_max(dev, lowerdev);

		needed_headroom = lowerdev->hard_header_len;
		needed_headroom += lowerdev->needed_headroom;

		dev->needed_tailroom = lowerdev->needed_tailroom;

		max_mtu = lowerdev->mtu - vxlan_headroom(flags);
		if (max_mtu < ETH_MIN_MTU)
			max_mtu = ETH_MIN_MTU;

		if (!changelink && !conf->mtu)
			dev->mtu = max_mtu;
	}

	if (dev->mtu > max_mtu)
		dev->mtu = max_mtu;

	if (flags & VXLAN_F_COLLECT_METADATA)
		flags |= VXLAN_F_IPV6;
	needed_headroom += vxlan_headroom(flags);
	dev->needed_headroom = needed_headroom;

	memcpy(&vxlan->cfg, conf, sizeof(*conf));
}
static int vxlan_dev_configure(struct net *src_net, struct net_device *dev,
			       struct vxlan_config *conf, bool changelink,
			       struct netlink_ext_ack *extack)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct net_device *lowerdev;
	int ret;

	ret = vxlan_config_validate(src_net, conf, &lowerdev, vxlan, extack);
	if (ret)
		return ret;

	vxlan_config_apply(dev, conf, lowerdev, src_net, changelink);

	return 0;
}
static int __vxlan_dev_create(struct net *net, struct net_device *dev,
			      struct vxlan_config *conf,
			      struct netlink_ext_ack *extack)
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct net_device *remote_dev = NULL;
	struct vxlan_fdb *f = NULL;
	bool unregister = false;
	struct vxlan_rdst *dst;
	int err;

	dst = &vxlan->default_dst;
	err = vxlan_dev_configure(net, dev, conf, false, extack);
	if (err)
		return err;

	dev->ethtool_ops = &vxlan_ethtool_ops;

	/* create an fdb entry for a valid default destination */
	if (!vxlan_addr_any(&dst->remote_ip)) {
		err = vxlan_fdb_create(vxlan, all_zeros_mac,
				       &dst->remote_ip,
				       NUD_REACHABLE | NUD_PERMANENT,
				       vxlan->cfg.dst_port,
				       dst->remote_vni,
				       dst->remote_vni,
				       dst->remote_ifindex,
				       NTF_SELF, 0, &f, extack);
		if (err)
			return err;
	}

	err = register_netdevice(dev);
	if (err)
		goto errout;
	unregister = true;

	if (dst->remote_ifindex) {
		remote_dev = __dev_get_by_index(net, dst->remote_ifindex);
		if (!remote_dev) {
			err = -ENODEV;
			goto errout;
		}

		err = netdev_upper_dev_link(remote_dev, dev, extack);
		if (err)
			goto errout;
	}

	err = rtnl_configure_link(dev, NULL, 0, NULL);
	if (err < 0)
		goto unlink;

	if (f) {
		vxlan_fdb_insert(vxlan, all_zeros_mac, dst->remote_vni, f);

		/* notify default fdb entry */
		err = vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f),
				       RTM_NEWNEIGH, true, extack);
		if (err) {
			vxlan_fdb_destroy(vxlan, f, false, false);
			f = NULL;
			goto unlink;
		}
	}

	list_add(&vxlan->next, &vn->vxlan_list);
	if (remote_dev)
		dst->remote_dev = remote_dev;
	return 0;
unlink:
	if (remote_dev)
		netdev_upper_dev_unlink(remote_dev, dev);
errout:
	/* unregister_netdevice() destroys the default FDB entry with deletion
	 * notification. But the addition notification was not sent yet, so
	 * destroy the entry by hand here.
	 */
	if (f)
		__vxlan_fdb_free(f);
	if (unregister)
		unregister_netdevice(dev);
	return err;
}
/* Set/clear flags based on attribute */
static int vxlan_nl2flag(struct vxlan_config *conf, struct nlattr *tb[],
			 int attrtype, unsigned long mask, bool changelink,
			 bool changelink_supported,
			 struct netlink_ext_ack *extack)
{
	unsigned long flags;

	if (!tb[attrtype])
		return 0;

	if (changelink && !changelink_supported) {
		vxlan_flag_attr_error(attrtype, extack);
		return -EOPNOTSUPP;
	}

	if (vxlan_policy[attrtype].type == NLA_FLAG)
		flags = conf->flags | mask;
	else if (nla_get_u8(tb[attrtype]))
		flags = conf->flags | mask;
	else
		flags = conf->flags & ~mask;

	conf->flags = flags;

	return 0;
}
static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
			 struct net_device *dev, struct vxlan_config *conf,
			 bool changelink, struct netlink_ext_ack *extack)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	int err = 0;

	memset(conf, 0, sizeof(*conf));

	/* if changelink operation, start with old existing cfg */
	if (changelink)
		memcpy(conf, &vxlan->cfg, sizeof(*conf));

	if (data[IFLA_VXLAN_ID]) {
		__be32 vni = cpu_to_be32(nla_get_u32(data[IFLA_VXLAN_ID]));

		if (changelink && (vni != conf->vni)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_ID], "Cannot change VNI");
			return -EOPNOTSUPP;
		}
		conf->vni = cpu_to_be32(nla_get_u32(data[IFLA_VXLAN_ID]));
	}

	if (data[IFLA_VXLAN_GROUP]) {
		if (changelink && (conf->remote_ip.sa.sa_family != AF_INET)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_GROUP], "New group address family does not match old group");
			return -EOPNOTSUPP;
		}

		conf->remote_ip.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_GROUP]);
		conf->remote_ip.sa.sa_family = AF_INET;
	} else if (data[IFLA_VXLAN_GROUP6]) {
		if (!IS_ENABLED(CONFIG_IPV6)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_GROUP6], "IPv6 support not enabled in the kernel");
			return -EPFNOSUPPORT;
		}

		if (changelink && (conf->remote_ip.sa.sa_family != AF_INET6)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_GROUP6], "New group address family does not match old group");
			return -EOPNOTSUPP;
		}

		conf->remote_ip.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_GROUP6]);
		conf->remote_ip.sa.sa_family = AF_INET6;
	}

	if (data[IFLA_VXLAN_LOCAL]) {
		if (changelink && (conf->saddr.sa.sa_family != AF_INET)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_LOCAL], "New local address family does not match old");
			return -EOPNOTSUPP;
		}

		conf->saddr.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_LOCAL]);
		conf->saddr.sa.sa_family = AF_INET;
	} else if (data[IFLA_VXLAN_LOCAL6]) {
		if (!IS_ENABLED(CONFIG_IPV6)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_LOCAL6], "IPv6 support not enabled in the kernel");
			return -EPFNOSUPPORT;
		}

		if (changelink && (conf->saddr.sa.sa_family != AF_INET6)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_LOCAL6], "New local address family does not match old");
			return -EOPNOTSUPP;
		}

		/* TODO: respect scope id */
		conf->saddr.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_LOCAL6]);
		conf->saddr.sa.sa_family = AF_INET6;
	}

	if (data[IFLA_VXLAN_LINK])
		conf->remote_ifindex = nla_get_u32(data[IFLA_VXLAN_LINK]);

	if (data[IFLA_VXLAN_TOS])
		conf->tos = nla_get_u8(data[IFLA_VXLAN_TOS]);

	if (data[IFLA_VXLAN_TTL])
		conf->ttl = nla_get_u8(data[IFLA_VXLAN_TTL]);

	if (data[IFLA_VXLAN_TTL_INHERIT]) {
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_TTL_INHERIT,
				    VXLAN_F_TTL_INHERIT, changelink, false,
				    extack);
		if (err)
			return err;
	}

	if (data[IFLA_VXLAN_LABEL])
		conf->label = nla_get_be32(data[IFLA_VXLAN_LABEL]) &
			     IPV6_FLOWLABEL_MASK;
	if (data[IFLA_VXLAN_LABEL_POLICY])
		conf->label_policy = nla_get_u32(data[IFLA_VXLAN_LABEL_POLICY]);

	if (data[IFLA_VXLAN_LEARNING]) {
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_LEARNING,
				    VXLAN_F_LEARN, changelink, true,
				    extack);
		if (err)
			return err;
	} else if (!changelink) {
		/* default to learn on a new device */
		conf->flags |= VXLAN_F_LEARN;
	}

	if (data[IFLA_VXLAN_AGEING])
		conf->age_interval = nla_get_u32(data[IFLA_VXLAN_AGEING]);

	if (data[IFLA_VXLAN_PROXY]) {
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_PROXY,
				    VXLAN_F_PROXY, changelink, false,
				    extack);
		if (err)
			return err;
	}

	if (data[IFLA_VXLAN_RSC]) {
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_RSC,
				    VXLAN_F_RSC, changelink, false,
				    extack);
		if (err)
			return err;
	}

	if (data[IFLA_VXLAN_L2MISS]) {
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_L2MISS,
				    VXLAN_F_L2MISS, changelink, false,
				    extack);
		if (err)
			return err;
	}

	if (data[IFLA_VXLAN_L3MISS]) {
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_L3MISS,
				    VXLAN_F_L3MISS, changelink, false,
				    extack);
		if (err)
			return err;
	}

	if (data[IFLA_VXLAN_LIMIT]) {
		if (changelink) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_LIMIT],
					    "Cannot change limit");
			return -EOPNOTSUPP;
		}
		conf->addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]);
	}

	if (data[IFLA_VXLAN_COLLECT_METADATA]) {
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_COLLECT_METADATA,
				    VXLAN_F_COLLECT_METADATA, changelink, false,
				    extack);
		if (err)
			return err;
	}

	if (data[IFLA_VXLAN_PORT_RANGE]) {
		if (!changelink) {
			const struct ifla_vxlan_port_range *p
				= nla_data(data[IFLA_VXLAN_PORT_RANGE]);
			conf->port_min = ntohs(p->low);
			conf->port_max = ntohs(p->high);
		} else {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_PORT_RANGE],
					    "Cannot change port range");
			return -EOPNOTSUPP;
		}
	}

	if (data[IFLA_VXLAN_PORT]) {
		if (changelink) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_PORT],
					    "Cannot change port");
			return -EOPNOTSUPP;
		}
		conf->dst_port = nla_get_be16(data[IFLA_VXLAN_PORT]);
	}

	if (data[IFLA_VXLAN_UDP_CSUM]) {
		if (changelink) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_UDP_CSUM],
					    "Cannot change UDP_CSUM flag");
			return -EOPNOTSUPP;
		}
		if (!nla_get_u8(data[IFLA_VXLAN_UDP_CSUM]))
			conf->flags |= VXLAN_F_UDP_ZERO_CSUM_TX;
	}

	if (data[IFLA_VXLAN_LOCALBYPASS]) {
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_LOCALBYPASS,
				    VXLAN_F_LOCALBYPASS, changelink,
				    true, extack);
		if (err)
			return err;
	} else if (!changelink) {
		/* default to local bypass on a new device */
		conf->flags |= VXLAN_F_LOCALBYPASS;
	}

	if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]) {
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_UDP_ZERO_CSUM6_TX,
				    VXLAN_F_UDP_ZERO_CSUM6_TX, changelink,
				    false, extack);
		if (err)
			return err;
	}

	if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX]) {
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_UDP_ZERO_CSUM6_RX,
				    VXLAN_F_UDP_ZERO_CSUM6_RX, changelink,
				    false, extack);
		if (err)
			return err;
	}

	if (data[IFLA_VXLAN_REMCSUM_TX]) {
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_REMCSUM_TX,
				    VXLAN_F_REMCSUM_TX, changelink, false,
				    extack);
		if (err)
			return err;
	}

	if (data[IFLA_VXLAN_REMCSUM_RX]) {
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_REMCSUM_RX,
				    VXLAN_F_REMCSUM_RX, changelink, false,
				    extack);
		if (err)
			return err;
	}

	if (data[IFLA_VXLAN_GBP]) {
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_GBP,
				    VXLAN_F_GBP, changelink, false, extack);
		if (err)
			return err;
	}

	if (data[IFLA_VXLAN_GPE]) {
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_GPE,
				    VXLAN_F_GPE, changelink, false,
				    extack);
		if (err)
			return err;
	}

	if (data[IFLA_VXLAN_REMCSUM_NOPARTIAL]) {
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_REMCSUM_NOPARTIAL,
				    VXLAN_F_REMCSUM_NOPARTIAL, changelink,
				    false, extack);
		if (err)
			return err;
	}

	if (tb[IFLA_MTU]) {
		if (changelink) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_MTU],
					    "Cannot change mtu");
			return -EOPNOTSUPP;
		}
		conf->mtu = nla_get_u32(tb[IFLA_MTU]);
	}

	if (data[IFLA_VXLAN_DF])
		conf->df = nla_get_u8(data[IFLA_VXLAN_DF]);

	if (data[IFLA_VXLAN_VNIFILTER]) {
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_VNIFILTER,
				    VXLAN_F_VNIFILTER, changelink, false,
				    extack);
		if (err)
			return err;

		if ((conf->flags & VXLAN_F_VNIFILTER) &&
		    !(conf->flags & VXLAN_F_COLLECT_METADATA)) {
			NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_VNIFILTER],
					    "vxlan vnifilter only valid in collect metadata mode");
			return -EINVAL;
		}
	}

	return 0;
}
static int vxlan_newlink(struct net *src_net, struct net_device *dev,
			 struct nlattr *tb[], struct nlattr *data[],
			 struct netlink_ext_ack *extack)
{
	struct vxlan_config conf;
	int err;

	err = vxlan_nl2conf(tb, data, dev, &conf, false, extack);
	if (err)
		return err;

	return __vxlan_dev_create(src_net, dev, &conf, extack);
}
static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[],
			    struct nlattr *data[],
			    struct netlink_ext_ack *extack)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct net_device *lowerdev;
	struct vxlan_config conf;
	struct vxlan_rdst *dst;
	int err;

	dst = &vxlan->default_dst;
	err = vxlan_nl2conf(tb, data, dev, &conf, true, extack);
	if (err)
		return err;

	err = vxlan_config_validate(vxlan->net, &conf, &lowerdev,
				    vxlan, extack);
	if (err)
		return err;

	if (dst->remote_dev == lowerdev)
		lowerdev = NULL;

	err = netdev_adjacent_change_prepare(dst->remote_dev, lowerdev, dev,
					     extack);
	if (err)
		return err;

	/* handle default dst entry */
	if (!vxlan_addr_equal(&conf.remote_ip, &dst->remote_ip)) {
		u32 hash_index = fdb_head_index(vxlan, all_zeros_mac, conf.vni);

		spin_lock_bh(&vxlan->hash_lock[hash_index]);
		if (!vxlan_addr_any(&conf.remote_ip)) {
			err = vxlan_fdb_update(vxlan, all_zeros_mac,
					       &conf.remote_ip,
					       NUD_REACHABLE | NUD_PERMANENT,
					       NLM_F_APPEND | NLM_F_CREATE,
					       vxlan->cfg.dst_port,
					       conf.vni, conf.vni,
					       conf.remote_ifindex,
					       NTF_SELF, 0, true, extack);
			if (err) {
				spin_unlock_bh(&vxlan->hash_lock[hash_index]);
				netdev_adjacent_change_abort(dst->remote_dev,
							     lowerdev, dev);
				return err;
			}
		}
		if (!vxlan_addr_any(&dst->remote_ip))
			__vxlan_fdb_delete(vxlan, all_zeros_mac,
					   dst->remote_ip,
					   vxlan->cfg.dst_port,
					   dst->remote_vni,
					   dst->remote_vni,
					   dst->remote_ifindex,
					   true);
		spin_unlock_bh(&vxlan->hash_lock[hash_index]);

		/* If vni filtering device, also update fdb entries of
		 * all vnis that were using default remote ip
		 */
		if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) {
			err = vxlan_vnilist_update_group(vxlan, &dst->remote_ip,
							 &conf.remote_ip, extack);
			if (err) {
				netdev_adjacent_change_abort(dst->remote_dev,
							     lowerdev, dev);
				return err;
			}
		}
	}

	if (conf.age_interval != vxlan->cfg.age_interval)
		mod_timer(&vxlan->age_timer, jiffies);

	netdev_adjacent_change_commit(dst->remote_dev, lowerdev, dev);
	if (lowerdev && lowerdev != dst->remote_dev)
		dst->remote_dev = lowerdev;
	vxlan_config_apply(dev, &conf, lowerdev, vxlan->net, true);
	return 0;
}
static void vxlan_dellink(struct net_device *dev, struct list_head *head)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_fdb_flush_desc desc = {
		/* Default entry is deleted at vxlan_uninit. */
		.ignore_default_entry = true,
	};

	vxlan_flush(vxlan, &desc);

	list_del(&vxlan->next);
	unregister_netdevice_queue(dev, head);
	if (vxlan->default_dst.remote_dev)
		netdev_upper_dev_unlink(vxlan->default_dst.remote_dev, dev);
}
static size_t vxlan_get_size(const struct net_device *dev)
{
	return nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_ID */
		nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_GROUP{6} */
		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_LINK */
		nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_LOCAL{6} */
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TTL */
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TTL_INHERIT */
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TOS */
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_DF */
		nla_total_size(sizeof(__be32)) + /* IFLA_VXLAN_LABEL */
		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_LABEL_POLICY */
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_LEARNING */
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_PROXY */
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_RSC */
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_L2MISS */
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_L3MISS */
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_COLLECT_METADATA */
		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_AGEING */
		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_LIMIT */
		nla_total_size(sizeof(__be16)) + /* IFLA_VXLAN_PORT */
		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_CSUM */
		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_TX */
		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_RX */
		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_TX */
		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_RX */
		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_LOCALBYPASS */
		/* IFLA_VXLAN_PORT_RANGE */
		nla_total_size(sizeof(struct ifla_vxlan_port_range)) +
		nla_total_size(0) + /* IFLA_VXLAN_GBP */
		nla_total_size(0) + /* IFLA_VXLAN_GPE */
		nla_total_size(0) + /* IFLA_VXLAN_REMCSUM_NOPARTIAL */
		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_VNIFILTER */
		0;
}
static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	const struct vxlan_dev *vxlan = netdev_priv(dev);
	const struct vxlan_rdst *dst = &vxlan->default_dst;
	struct ifla_vxlan_port_range ports = {
		.low =  htons(vxlan->cfg.port_min),
		.high = htons(vxlan->cfg.port_max),
	};

	if (nla_put_u32(skb, IFLA_VXLAN_ID, be32_to_cpu(dst->remote_vni)))
		goto nla_put_failure;

	if (!vxlan_addr_any(&dst->remote_ip)) {
		if (dst->remote_ip.sa.sa_family == AF_INET) {
			if (nla_put_in_addr(skb, IFLA_VXLAN_GROUP,
					    dst->remote_ip.sin.sin_addr.s_addr))
				goto nla_put_failure;
#if IS_ENABLED(CONFIG_IPV6)
		} else {
			if (nla_put_in6_addr(skb, IFLA_VXLAN_GROUP6,
					     &dst->remote_ip.sin6.sin6_addr))
				goto nla_put_failure;
#endif
		}
	}

	if (dst->remote_ifindex && nla_put_u32(skb, IFLA_VXLAN_LINK, dst->remote_ifindex))
		goto nla_put_failure;

	if (!vxlan_addr_any(&vxlan->cfg.saddr)) {
		if (vxlan->cfg.saddr.sa.sa_family == AF_INET) {
			if (nla_put_in_addr(skb, IFLA_VXLAN_LOCAL,
					    vxlan->cfg.saddr.sin.sin_addr.s_addr))
				goto nla_put_failure;
#if IS_ENABLED(CONFIG_IPV6)
		} else {
			if (nla_put_in6_addr(skb, IFLA_VXLAN_LOCAL6,
					     &vxlan->cfg.saddr.sin6.sin6_addr))
				goto nla_put_failure;
#endif
		}
	}

	if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->cfg.ttl) ||
	    nla_put_u8(skb, IFLA_VXLAN_TTL_INHERIT,
		       !!(vxlan->cfg.flags & VXLAN_F_TTL_INHERIT)) ||
	    nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->cfg.tos) ||
	    nla_put_u8(skb, IFLA_VXLAN_DF, vxlan->cfg.df) ||
	    nla_put_be32(skb, IFLA_VXLAN_LABEL, vxlan->cfg.label) ||
	    nla_put_u32(skb, IFLA_VXLAN_LABEL_POLICY, vxlan->cfg.label_policy) ||
	    nla_put_u8(skb, IFLA_VXLAN_LEARNING,
		       !!(vxlan->cfg.flags & VXLAN_F_LEARN)) ||
	    nla_put_u8(skb, IFLA_VXLAN_PROXY,
		       !!(vxlan->cfg.flags & VXLAN_F_PROXY)) ||
	    nla_put_u8(skb, IFLA_VXLAN_RSC,
		       !!(vxlan->cfg.flags & VXLAN_F_RSC)) ||
	    nla_put_u8(skb, IFLA_VXLAN_L2MISS,
		       !!(vxlan->cfg.flags & VXLAN_F_L2MISS)) ||
	    nla_put_u8(skb, IFLA_VXLAN_L3MISS,
		       !!(vxlan->cfg.flags & VXLAN_F_L3MISS)) ||
	    nla_put_u8(skb, IFLA_VXLAN_COLLECT_METADATA,
		       !!(vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA)) ||
	    nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->cfg.age_interval) ||
	    nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->cfg.addrmax) ||
	    nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->cfg.dst_port) ||
	    nla_put_u8(skb, IFLA_VXLAN_UDP_CSUM,
		       !(vxlan->cfg.flags & VXLAN_F_UDP_ZERO_CSUM_TX)) ||
	    nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_TX,
		       !!(vxlan->cfg.flags & VXLAN_F_UDP_ZERO_CSUM6_TX)) ||
	    nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_RX,
		       !!(vxlan->cfg.flags & VXLAN_F_UDP_ZERO_CSUM6_RX)) ||
	    nla_put_u8(skb, IFLA_VXLAN_REMCSUM_TX,
		       !!(vxlan->cfg.flags & VXLAN_F_REMCSUM_TX)) ||
	    nla_put_u8(skb, IFLA_VXLAN_REMCSUM_RX,
		       !!(vxlan->cfg.flags & VXLAN_F_REMCSUM_RX)) ||
	    nla_put_u8(skb, IFLA_VXLAN_LOCALBYPASS,
		       !!(vxlan->cfg.flags & VXLAN_F_LOCALBYPASS)))
		goto nla_put_failure;

	if (nla_put(skb, IFLA_VXLAN_PORT_RANGE, sizeof(ports), &ports))
		goto nla_put_failure;

	if (vxlan->cfg.flags & VXLAN_F_GBP &&
	    nla_put_flag(skb, IFLA_VXLAN_GBP))
		goto nla_put_failure;

	if (vxlan->cfg.flags & VXLAN_F_GPE &&
	    nla_put_flag(skb, IFLA_VXLAN_GPE))
		goto nla_put_failure;

	if (vxlan->cfg.flags & VXLAN_F_REMCSUM_NOPARTIAL &&
	    nla_put_flag(skb, IFLA_VXLAN_REMCSUM_NOPARTIAL))
		goto nla_put_failure;

	if (vxlan->cfg.flags & VXLAN_F_VNIFILTER &&
	    nla_put_u8(skb, IFLA_VXLAN_VNIFILTER,
		       !!(vxlan->cfg.flags & VXLAN_F_VNIFILTER)))
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
static struct net *vxlan_get_link_net(const struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);

	return READ_ONCE(vxlan->net);
}
static struct rtnl_link_ops vxlan_link_ops __read_mostly = {
	.kind		= "vxlan",
	.maxtype	= IFLA_VXLAN_MAX,
	.policy		= vxlan_policy,
	.priv_size	= sizeof(struct vxlan_dev),
	.setup		= vxlan_setup,
	.validate	= vxlan_validate,
	.newlink	= vxlan_newlink,
	.changelink	= vxlan_changelink,
	.dellink	= vxlan_dellink,
	.get_size	= vxlan_get_size,
	.fill_info	= vxlan_fill_info,
	.get_link_net	= vxlan_get_link_net,
};
struct net_device *vxlan_dev_create(struct net *net, const char *name,
				    u8 name_assign_type,
				    struct vxlan_config *conf)
{
	struct nlattr *tb[IFLA_MAX + 1];
	struct net_device *dev;
	int err;

	memset(&tb, 0, sizeof(tb));

	dev = rtnl_create_link(net, name, name_assign_type,
			       &vxlan_link_ops, tb, NULL);
	if (IS_ERR(dev))
		return dev;

	err = __vxlan_dev_create(net, dev, conf, NULL);
	if (err < 0) {
		free_netdev(dev);
		return ERR_PTR(err);
	}

	err = rtnl_configure_link(dev, NULL, 0, NULL);
	if (err < 0) {
		LIST_HEAD(list_kill);

		vxlan_dellink(dev, &list_kill);
		unregister_netdevice_many(&list_kill);
		return ERR_PTR(err);
	}

	return dev;
}
EXPORT_SYMBOL_GPL(vxlan_dev_create);
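
/* Sketch of an in-kernel caller (illustrative only; openvswitch uses
 * this export in a similar way): create a flow-based device with a
 * caller-chosen destination port. The device name is a placeholder.
 *
 *	struct vxlan_config conf = {
 *		.dst_port = htons(4789),
 *		.flags = VXLAN_F_COLLECT_METADATA,
 *	};
 *	struct net_device *dev;
 *
 *	dev = vxlan_dev_create(net, "vxlan_sys_4789",
 *			       NET_NAME_USER, &conf);
 *	if (IS_ERR(dev))
 *		return PTR_ERR(dev);
 */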
static void vxlan_handle_lowerdev_unregister(struct vxlan_net *vn,
					     struct net_device *dev)
{
	struct vxlan_dev *vxlan, *next;
	LIST_HEAD(list_kill);

	list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) {
		struct vxlan_rdst *dst = &vxlan->default_dst;

		/* In case we created vxlan device with carrier
		 * and we lose the carrier due to module unload
		 * we also need to remove vxlan device. In other
		 * cases, it's not necessary and remote_ifindex
		 * is 0 here, so no matches.
		 */
		if (dst->remote_ifindex == dev->ifindex)
			vxlan_dellink(vxlan->dev, &list_kill);
	}

	unregister_netdevice_many(&list_kill);
}
static int vxlan_netdevice_event(struct notifier_block *unused,
				 unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);

	if (event == NETDEV_UNREGISTER)
		vxlan_handle_lowerdev_unregister(vn, dev);
	else if (event == NETDEV_UDP_TUNNEL_PUSH_INFO)
		vxlan_offload_rx_ports(dev, true);
	else if (event == NETDEV_UDP_TUNNEL_DROP_INFO)
		vxlan_offload_rx_ports(dev, false);

	return NOTIFY_DONE;
}
static struct notifier_block vxlan_notifier_block __read_mostly = {
	.notifier_call = vxlan_netdevice_event,
};
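/* Mark the remote destination of an FDB entry as (not) offloaded to
 * hardware, in response to a switchdev notification from a driver.
 */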
static void
vxlan_fdb_offloaded_set(struct net_device *dev,
			struct switchdev_notifier_vxlan_fdb_info *fdb_info)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_rdst *rdst;
	struct vxlan_fdb *f;
	u32 hash_index;

	hash_index = fdb_head_index(vxlan, fdb_info->eth_addr, fdb_info->vni);

	spin_lock_bh(&vxlan->hash_lock[hash_index]);

	f = vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni);
	if (!f)
		goto out;

	rdst = vxlan_fdb_find_rdst(f, &fdb_info->remote_ip,
				   fdb_info->remote_port,
				   fdb_info->remote_vni,
				   fdb_info->remote_ifindex);
	if (!rdst)
		goto out;

	rdst->offloaded = fdb_info->offloaded;

out:
	spin_unlock_bh(&vxlan->hash_lock[hash_index]);
}
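/* Install an FDB entry learned externally (e.g. by a bridge on top of an
 * offloading switch ASIC). The entry is created or replaced under the
 * per-bucket hash lock and flagged NTF_EXT_LEARNED, which also exempts it
 * from software aging.
 */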
static int
vxlan_fdb_external_learn_add(struct net_device *dev,
			     struct switchdev_notifier_vxlan_fdb_info *fdb_info)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct netlink_ext_ack *extack;
	u32 hash_index;
	int err;

	hash_index = fdb_head_index(vxlan, fdb_info->eth_addr, fdb_info->vni);
	extack = switchdev_notifier_info_to_extack(&fdb_info->info);

	spin_lock_bh(&vxlan->hash_lock[hash_index]);
	err = vxlan_fdb_update(vxlan, fdb_info->eth_addr, &fdb_info->remote_ip,
			       NUD_REACHABLE,
			       NLM_F_CREATE | NLM_F_REPLACE,
			       fdb_info->remote_port,
			       fdb_info->vni,
			       fdb_info->remote_vni,
			       fdb_info->remote_ifindex,
			       NTF_USE | NTF_SELF | NTF_EXT_LEARNED,
			       0, false, extack);
	spin_unlock_bh(&vxlan->hash_lock[hash_index]);

	return err;
}
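/* Remove an externally learned FDB entry, but only while it is still
 * marked NTF_EXT_LEARNED; entries since taken over by user space or the
 * software datapath are left alone.
 */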
static int
vxlan_fdb_external_learn_del(struct net_device *dev,
			     struct switchdev_notifier_vxlan_fdb_info *fdb_info)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_fdb *f;
	u32 hash_index;
	int err = 0;

	hash_index = fdb_head_index(vxlan, fdb_info->eth_addr, fdb_info->vni);
	spin_lock_bh(&vxlan->hash_lock[hash_index]);

	f = vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni);
	if (!f)
		err = -ENOENT;
	else if (f->flags & NTF_EXT_LEARNED)
		err = __vxlan_fdb_delete(vxlan, fdb_info->eth_addr,
					 fdb_info->remote_ip,
					 fdb_info->remote_port,
					 fdb_info->vni,
					 fdb_info->remote_vni,
					 fdb_info->remote_ifindex,
					 false);

	spin_unlock_bh(&vxlan->hash_lock[hash_index]);

	return err;
}
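/* Switchdev notifier: dispatch FDB offload state changes and bridge
 * add/del requests to the helpers above. Errors are translated with
 * notifier_from_errno() so the notifier chain can propagate them.
 */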
static int vxlan_switchdev_event(struct notifier_block *unused,
				 unsigned long event, void *ptr)
{
	struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
	struct switchdev_notifier_vxlan_fdb_info *fdb_info;
	int err = 0;

	switch (event) {
	case SWITCHDEV_VXLAN_FDB_OFFLOADED:
		vxlan_fdb_offloaded_set(dev, ptr);
		break;
	case SWITCHDEV_VXLAN_FDB_ADD_TO_BRIDGE:
		fdb_info = ptr;
		err = vxlan_fdb_external_learn_add(dev, fdb_info);
		if (err) {
			err = notifier_from_errno(err);
			break;
		}
		fdb_info->offloaded = true;
		vxlan_fdb_offloaded_set(dev, fdb_info);
		break;
	case SWITCHDEV_VXLAN_FDB_DEL_TO_BRIDGE:
		fdb_info = ptr;
		err = vxlan_fdb_external_learn_del(dev, fdb_info);
		if (err) {
			err = notifier_from_errno(err);
			break;
		}
		fdb_info->offloaded = false;
		vxlan_fdb_offloaded_set(dev, fdb_info);
		break;
	}

	return err;
}
static struct notifier_block vxlan_switchdev_notifier_block __read_mostly = {
	.notifier_call = vxlan_switchdev_event,
};
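/* Flush every FDB entry that points at @nh: walk the nexthop's fdb_list
 * under RCU and destroy each entry under its hash bucket lock.
 */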
static void vxlan_fdb_nh_flush(struct nexthop *nh)
{
	struct vxlan_fdb *fdb;
	struct vxlan_dev *vxlan;
	u32 hash_index;

	rcu_read_lock();
	list_for_each_entry_rcu(fdb, &nh->fdb_list, nh_list) {
		vxlan = rcu_dereference(fdb->vdev);
		WARN_ON(!vxlan);
		hash_index = fdb_head_index(vxlan, fdb->eth_addr,
					    vxlan->default_dst.remote_vni);
		spin_lock_bh(&vxlan->hash_lock[hash_index]);
		if (!hlist_unhashed(&fdb->hlist))
			vxlan_fdb_destroy(vxlan, fdb, false, false);
		spin_unlock_bh(&vxlan->hash_lock[hash_index]);
	}
	rcu_read_unlock();
}
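/* Nexthop notifier: only NEXTHOP_EVENT_DEL is of interest, in which case
 * all FDB entries using the deleted nexthop must be flushed.
 */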
static int vxlan_nexthop_event(struct notifier_block *nb,
			       unsigned long event, void *ptr)
{
	struct nh_notifier_info *info = ptr;
	struct nexthop *nh;

	if (event != NEXTHOP_EVENT_DEL)
		return NOTIFY_DONE;

	nh = nexthop_find_by_id(info->net, info->id);
	if (!nh)
		return NOTIFY_DONE;

	vxlan_fdb_nh_flush(nh);

	return NOTIFY_DONE;
}
static __net_init int vxlan_init_net(struct net *net)
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	unsigned int h;

	INIT_LIST_HEAD(&vn->vxlan_list);
	spin_lock_init(&vn->sock_lock);
	vn->nexthop_notifier_block.notifier_call = vxlan_nexthop_event;

	for (h = 0; h < PORT_HASH_SIZE; ++h)
		INIT_HLIST_HEAD(&vn->sock_list[h]);

	return register_nexthop_notifier(net, &vn->nexthop_notifier_block,
					 NULL);
}
static void __net_exit vxlan_destroy_tunnels(struct vxlan_net *vn,
					     struct list_head *dev_to_kill)
{
	struct vxlan_dev *vxlan, *next;

	list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next)
		vxlan_dellink(vxlan->dev, dev_to_kill);
}
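/* Batched pernet exit, run under RTNL: unregister the per-netns nexthop
 * notifier and queue every remaining vxlan device on @dev_to_kill so the
 * core can unregister them all in one shot.
 */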
static void __net_exit vxlan_exit_batch_rtnl(struct list_head *net_list,
					     struct list_head *dev_to_kill)
{
	struct net *net;

	ASSERT_RTNL();
	list_for_each_entry(net, net_list, exit_list) {
		struct vxlan_net *vn = net_generic(net, vxlan_net_id);

		__unregister_nexthop_notifier(net, &vn->nexthop_notifier_block);

		vxlan_destroy_tunnels(vn, dev_to_kill);
	}
}
static void __net_exit vxlan_exit_net(struct net *net)
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	unsigned int h;

	for (h = 0; h < PORT_HASH_SIZE; ++h)
		WARN_ON_ONCE(!hlist_empty(&vn->sock_list[h]));
}
static struct pernet_operations vxlan_net_ops = {
	.init = vxlan_init_net,
	.exit_batch_rtnl = vxlan_exit_batch_rtnl,
	.exit = vxlan_exit_net,
	.id   = &vxlan_net_id,
	.size = sizeof(struct vxlan_net),
};
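/* Module init: seed the FDB hash salt, then register, in order, the pernet
 * ops, the netdevice and switchdev notifiers, the rtnl link ops and the
 * VNI filtering infrastructure. Each failure unwinds exactly the steps
 * that already succeeded, in reverse order.
 */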
static int __init vxlan_init_module(void)
{
	int rc;

	get_random_bytes(&vxlan_salt, sizeof(vxlan_salt));

	rc = register_pernet_subsys(&vxlan_net_ops);
	if (rc)
		goto out1;

	rc = register_netdevice_notifier(&vxlan_notifier_block);
	if (rc)
		goto out2;

	rc = register_switchdev_notifier(&vxlan_switchdev_notifier_block);
	if (rc)
		goto out3;

	rc = rtnl_link_register(&vxlan_link_ops);
	if (rc)
		goto out4;

	rc = vxlan_vnifilter_init();
	if (rc)
		goto out5;

	return 0;
out5:
	rtnl_link_unregister(&vxlan_link_ops);
out4:
	unregister_switchdev_notifier(&vxlan_switchdev_notifier_block);
out3:
	unregister_netdevice_notifier(&vxlan_notifier_block);
out2:
	unregister_pernet_subsys(&vxlan_net_ops);
out1:
	return rc;
}
late_initcall(vxlan_init_module);
static void __exit vxlan_cleanup_module(void)
{
	vxlan_vnifilter_uninit();
	rtnl_link_unregister(&vxlan_link_ops);
	unregister_switchdev_notifier(&vxlan_switchdev_notifier_block);
	unregister_netdevice_notifier(&vxlan_notifier_block);
	unregister_pernet_subsys(&vxlan_net_ops);
	/* rcu_barrier() is called by netns */
}
module_exit(vxlan_cleanup_module);
MODULE_LICENSE("GPL");
MODULE_VERSION(VXLAN_VERSION);
MODULE_AUTHOR("Stephen Hemminger <stephen@networkplumber.org>");
MODULE_DESCRIPTION("Driver for VXLAN encapsulated traffic");
MODULE_ALIAS_RTNL_LINK("vxlan");