// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2013 Nicira, Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
#include <linux/err.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/udp.h>
#include <net/dst_metadata.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif
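/* The per-net tunnel hash table is keyed on both the tunnel key and the
 * remote endpoint, so keyed and keyless tunnels toward different peers
 * spread across the buckets.
 */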
static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
	return hash_32((__force u32)key ^ (__force u32)remote,
			 IP_TNL_HASH_BITS);
}
static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
				__be16 flags, __be32 key)
{
	if (p->i_flags & TUNNEL_KEY) {
		if (flags & TUNNEL_KEY)
			return key == p->i_key;
		else
			/* key expected, none present */
			return false;
	} else
		return !(flags & TUNNEL_KEY);
}
/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched by a configured keyless tunnel,
   will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for an input packet.
*/
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;
	unsigned int hash;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	/* Pass 1: exact (saddr, daddr) match. */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else
			cand = t;
	}

	/* Pass 2: daddr match with a wildcard source. */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    t->parms.iph.saddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	/* Pass 3: wildcard destination; local unicast or multicast match. */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
			continue;

		if (!(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	if (flags & TUNNEL_NO_KEY)
		goto skip_key_lookup;

	/* Pass 4: key-only match with wildcard addresses. */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (t->parms.i_key != key ||
		    t->parms.iph.saddr != 0 ||
		    t->parms.iph.daddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

skip_key_lookup:
	if (cand)
		return cand;

	t = rcu_dereference(itn->collect_md_tun);
	if (t && t->dev->flags & IFF_UP)
		return t;

	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
		return netdev_priv(itn->fb_tunnel_dev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
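/* Note: ip_bucket() must mirror the hashing done in ip_tunnel_lookup():
 * multicast or unset destinations hash with remote == 0, and VTI tunnels
 * configured without TUNNEL_KEY hash with i_key == 0.
 */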
static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
				    struct ip_tunnel_parm *parms)
{
	unsigned int h;
	__be32 remote;
	__be32 i_key = parms->i_key;

	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
		remote = parms->iph.daddr;
	else
		remote = 0;

	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
		i_key = 0;

	h = ip_tunnel_hash(i_key, remote);
	return &itn->tunnels[h];
}
static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, t);
	hlist_add_head_rcu(&t->hash_node, head);
}
static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, NULL);
	hlist_del_init_rcu(&t->hash_node);
}
static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
					struct ip_tunnel_parm *parms,
					int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	__be16 flags = parms->i_flags;
	int link = parms->link;
	struct ip_tunnel *t = NULL;
	struct hlist_head *head = ip_bucket(itn, parms);

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    link == t->parms.link &&
		    type == t->dev->type &&
		    ip_tunnel_key_match(&t->parms, flags, key))
			break;
	}
	return t;
}
static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	err = -E2BIG;
	if (parms->name[0]) {
		if (!dev_valid_name(parms->name))
			goto failed;
		strlcpy(name, parms->name, IFNAMSIZ);
	} else {
		if (strlen(ops->kind) > (IFNAMSIZ - 3))
			goto failed;
		strcpy(name, ops->kind);
		strcat(name, "%d");
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}
static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
				    iph->saddr, tunnel->parms.o_key,
				    RT_TOS(iph->tos), tunnel->parms.link,
				    tunnel->fwmark, 0);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;

		dst_cache_reset(&tunnel->dst_cache);
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = min(tdev->mtu, IP_MAX_MTU);
	}

	dev->needed_headroom = t_hlen + hlen;
	mtu -= (dev->hard_header_len + t_hlen);

	if (mtu < IPV4_MIN_MTU)
		mtu = IPV4_MIN_MTU;

	return mtu;
}
static struct ip_tunnel *ip_tunnel_create(struct net *net,
					  struct ip_tunnel_net *itn,
					  struct ip_tunnel_parm *parms)
{
	struct ip_tunnel *nt;
	struct net_device *dev;
	int t_hlen;
	int mtu;
	int err;

	dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
	if (IS_ERR(dev))
		return ERR_CAST(dev);

	mtu = ip_tunnel_bind_dev(dev);
	err = dev_set_mtu(dev, mtu);
	if (err)
		goto err_dev_set_mtu;

	nt = netdev_priv(dev);
	t_hlen = nt->hlen + sizeof(struct iphdr);
	dev->min_mtu = ETH_MIN_MTU;
	dev->max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen;
	ip_tunnel_add(itn, nt);
	return nt;

err_dev_set_mtu:
	unregister_netdevice(dev);
	return ERR_PTR(err);
}
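/* Receive path: validate the checksum and sequence-number flags against
 * the tunnel configuration, undo ECN decapsulation, update stats and hand
 * the packet to the tunnel device's GRO cell.
 */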
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
		  bool log_ecn_error)
{
	struct pcpu_sw_netstats *tstats;
	const struct iphdr *iph = ip_hdr(skb);
	int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		tunnel->dev->stats.multicast++;
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	if ((!(tpi->flags&TUNNEL_CSUM) && (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
	    ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
		tunnel->dev->stats.rx_crc_errors++;
		tunnel->dev->stats.rx_errors++;
		goto drop;
	}

	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
		if (!(tpi->flags&TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			tunnel->dev->stats.rx_fifo_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	skb_reset_network_header(skb);

	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		if (err > 1) {
			++tunnel->dev->stats.rx_frame_errors;
			++tunnel->dev->stats.rx_errors;
			goto drop;
		}
	}

	tstats = this_cpu_ptr(tunnel->dev->tstats);
	u64_stats_update_begin(&tstats->syncp);
	tstats->rx_packets++;
	tstats->rx_bytes += skb->len;
	u64_stats_update_end(&tstats->syncp);

	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	if (tun_dst)
		skb_dst_set(skb, (struct dst_entry *)tun_dst);

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	if (tun_dst)
		dst_release((struct dst_entry *)tun_dst);
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
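/* Encap handlers (e.g. FOU/GUE) live in a fixed-size array; cmpxchg()
 * makes add/del atomic without taking a lock, and deletion waits for
 * in-flight receivers via synchronize_net().
 */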
int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	return !cmpxchg((const struct ip_tunnel_encap_ops **)
			&iptun_encaps[num],
			NULL, ops) ? 0 : -1;
}
EXPORT_SYMBOL(ip_tunnel_encap_add_ops);

int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	int ret;

	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
		       &iptun_encaps[num],
		       ops, NULL) == ops) ? 0 : -1;

	synchronize_net();

	return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
int ip_tunnel_encap_setup(struct ip_tunnel *t,
			  struct ip_tunnel_encap *ipencap)
{
	int hlen;

	memset(&t->encap, 0, sizeof(t->encap));

	hlen = ip_encap_hlen(ipencap);
	if (hlen < 0)
		return hlen;

	t->encap.type = ipencap->type;
	t->encap.sport = ipencap->sport;
	t->encap.dport = ipencap->dport;
	t->encap.flags = ipencap->flags;

	t->encap_hlen = hlen;
	t->hlen = t->encap_hlen + t->tun_hlen;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
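/* Check the packet against the path MTU and, when it does not fit, send
 * the matching "too big" error: ICMP_FRAG_NEEDED for IPv4 with DF set,
 * ICMPV6_PKT_TOOBIG for IPv6.
 */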
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			    struct rtable *rt, __be16 df,
			    const struct iphdr *inner_iph,
			    int tunnel_hlen, __be32 dst, bool md)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size;
	int mtu;

	tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
	pkt_size = skb->len - tunnel_hlen - dev->hard_header_len;

	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
					- sizeof(struct iphdr) - tunnel_hlen;
	else
		mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_valid_dst(skb))
		skb_dst_update_pmtu_no_confirm(skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (inner_iph->frag_off & htons(IP_DF)) &&
		    mtu < pkt_size) {
			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6;
		__be32 daddr;

		rt6 = skb_valid_dst(skb) ? (struct rt6_info *)skb_dst(skb) :
					   NULL;
		daddr = md ? dst : tunnel->parms.iph.daddr;

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((daddr && !ipv4_is_multicast(daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}
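/* Transmit path for metadata-mode (collect_md) tunnels: route and
 * encapsulate from the per-packet tunnel metadata attached to the skb
 * rather than from the device configuration.
 */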
void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		       u8 proto, int tunnel_hlen)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	u32 headroom = sizeof(struct iphdr);
	struct ip_tunnel_info *tun_info;
	const struct ip_tunnel_key *key;
	const struct iphdr *inner_iph;
	struct rtable *rt = NULL;
	struct flowi4 fl4;
	__be16 df = 0;
	u8 tos, ttl;
	bool use_cache;

	tun_info = skb_tunnel_info(skb);
	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
		     ip_tunnel_info_af(tun_info) != AF_INET))
		goto tx_error;
	key = &tun_info->key;
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	tos = key->tos;
	if (tos == 1) {
		if (skb->protocol == htons(ETH_P_IP))
			tos = inner_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
	}
	ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
			    tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
			    0, skb->mark, skb_get_hash(skb));
	if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
		goto tx_error;

	use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
	if (use_cache)
		rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr);
	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);
		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (use_cache)
			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
					  fl4.saddr);
	}
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
		df = htons(IP_DF);
	if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
			    key->u.ipv4.dst, true)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = key->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	if (!df && skb->protocol == htons(ETH_P_IP))
		df = inner_iph->frag_off & htons(IP_DF);

	headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
	if (headroom > dev->needed_headroom)
		dev->needed_headroom = headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		goto tx_dropped;
	}
	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;
tx_error:
	dev->stats.tx_errors++;
	goto kfree;
tx_dropped:
	dev->stats.tx_dropped++;
kfree:
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
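/* Transmit path for classically configured tunnels. For NBMA tunnels
 * (daddr == 0) the destination is derived per packet, from tunnel
 * metadata or from the inner IPv4/IPv6 headers.
 */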
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_info *tun_info = NULL;
	const struct iphdr *inner_iph;
	unsigned int max_headroom;	/* The extra header space needed */
	struct rtable *rt = NULL;	/* Route to the other host */
	bool use_cache = false;
	struct flowi4 fl4;
	bool md = false;
	bool connected;
	u8 tos, ttl;
	__be32 dst;
	__be16 df;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	connected = (tunnel->parms.iph.daddr != 0);

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel */

		if (!skb_dst(skb)) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		tun_info = skb_tunnel_info(skb);
		if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
		    ip_tunnel_info_af(tun_info) == AF_INET &&
		    tun_info->key.u.ipv4.dst) {
			dst = tun_info->key.u.ipv4.dst;
			md = true;
			connected = true;
		}
		else if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (!neigh)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		if (!md)
			connected = false;
	}

	tos = tnl_params->tos;
	if (tos & 0x1) {
		tos &= ~0x1;
		if (skb->protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
			    tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
			    tunnel->fwmark, skb_get_hash(skb));

	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
		goto tx_error;

	if (connected && md) {
		use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
		if (use_cache)
			rt = dst_cache_get_ip4(&tun_info->dst_cache,
					       &fl4.saddr);
	} else {
		rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache,
						&fl4.saddr) : NULL;
	}

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (use_cache)
			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
					  fl4.saddr);
		else if (!md && connected)
			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
					  fl4.saddr);
	}

	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph,
			    0, 0, false)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	df = tnl_params->frag_off;
	if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
		df |= (inner_iph->frag_off & htons(IP_DF));

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
	if (max_headroom > dev->needed_headroom)
		dev->needed_headroom = max_headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		dev->stats.tx_dropped++;
		kfree_skb(skb);
		return;
	}

	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
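/* Changing saddr/daddr or the key may move a tunnel to a different hash
 * bucket, so the update deletes and re-adds the entry around the rewrite.
 */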
static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu,
			     __u32 fwmark)
{
	ip_tunnel_del(itn, t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		memcpy(dev->dev_addr, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link || t->fwmark != fwmark) {
		int mtu;

		t->parms.link = p->link;
		t->fwmark = fwmark;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	dst_cache_reset(&t->dst_cache);
	netdev_state_change(dev);
}
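/* Legacy ioctl-based configuration (SIOCGETTUNNEL, SIOCADDTUNNEL,
 * SIOCCHGTUNNEL, SIOCDELTUNNEL); netlink-managed tunnels go through
 * ip_tunnel_newlink()/ip_tunnel_changelink() below.
 */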
int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		if (dev == itn->fb_tunnel_dev) {
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		if (!(p->i_flags & VTI_ISVTI)) {
			if (!(p->i_flags & TUNNEL_KEY))
				p->i_key = 0;
			if (!(p->o_flags & TUNNEL_KEY))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->type);

		if (cmd == SIOCADDTUNNEL) {
			if (!t) {
				t = ip_tunnel_create(net, itn, p);
				err = PTR_ERR_OR_ZERO(t);
				break;
			}

			err = -EEXIST;
			break;
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true, 0);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
	int max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen;

	if (new_mtu < ETH_MIN_MTU)
		return -EINVAL;

	if (new_mtu > max_mtu) {
		if (strict)
			return -EINVAL;

		new_mtu = max_mtu;
	}

	dev->mtu = new_mtu;
	return 0;
}
EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);

int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	return __ip_tunnel_change_mtu(dev, new_mtu, true);
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	dst_cache_destroy(&tunnel->dst_cache);
	free_percpu(dev->tstats);
}
void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_net *itn;

	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

	if (itn->fb_tunnel_dev != dev) {
		ip_tunnel_del(itn, netdev_priv(dev));
		unregister_netdevice_queue(dev, head);
	}
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
struct net *ip_tunnel_get_link_net(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->net;
}
EXPORT_SYMBOL(ip_tunnel_get_link_net);
int ip_tunnel_get_iflink(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->parms.link;
}
EXPORT_SYMBOL(ip_tunnel_get_iflink);
int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
		       struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	itn->rtnl_link_ops = ops;
	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	if (!ops || !net_has_fallback_tunnels(net)) {
		struct ip_tunnel_net *it_init_net;

		it_init_net = net_generic(&init_net, ip_tnl_net_id);
		itn->type = it_init_net->type;
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strlcpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* FB netdevice is special: we have one, and only one per netns.
	 * Allowing to move it to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
		itn->type = itn->fb_tunnel_dev->type;
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
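/* Unregister every tunnel using this ops in a dying netns: devices
 * registered in the netns itself first, then devices hashed here that
 * were moved into another netns.
 */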
static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
			      struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net_device *dev, *aux;
	int h;

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}
void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
			   struct rtnl_link_ops *ops)
{
	struct ip_tunnel_net *itn;
	struct net *net;
	LIST_HEAD(list);

	rtnl_lock();
	list_for_each_entry(net, net_list, exit_list) {
		itn = net_generic(net, id);
		ip_tunnel_destroy(net, itn, &list, ops);
	}
	unregister_netdevice_many(&list);
	rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);
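/* Netlink creation path: reject duplicates (at most one collect_md
 * tunnel per netns), register the device and pick an MTU that leaves
 * room for the encapsulation headers.
 */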
int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
		      struct ip_tunnel_parm *p, __u32 fwmark)
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ip_tunnel_net *itn;
	int mtu;
	int err;

	nt = netdev_priv(dev);
	itn = net_generic(net, nt->ip_tnl_net_id);

	if (nt->collect_md) {
		if (rtnl_dereference(itn->collect_md_tun))
			return -EEXIST;
	} else {
		if (ip_tunnel_find(itn, p, dev->type))
			return -EEXIST;
	}

	nt->net = net;
	nt->parms = *p;
	nt->fwmark = fwmark;
	err = register_netdevice(dev);
	if (err)
		goto err_register_netdevice;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ip_tunnel_bind_dev(dev);
	if (tb[IFLA_MTU]) {
		unsigned int max = IP_MAX_MTU - dev->hard_header_len - nt->hlen;

		mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU,
			    (unsigned int)(max - sizeof(struct iphdr)));
	}

	err = dev_set_mtu(dev, mtu);
	if (err)
		goto err_dev_set_mtu;

	ip_tunnel_add(itn, nt);
	return 0;

err_dev_set_mtu:
	unregister_netdevice(dev);
err_register_netdevice:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
			 struct ip_tunnel_parm *p, __u32 fwmark)
{
	struct ip_tunnel *t;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	if (dev == itn->fb_tunnel_dev)
		return -EINVAL;

	t = ip_tunnel_find(itn, p, dev->type);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = tunnel;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p->iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p->iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}
	}

	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
int ip_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	int err;

	dev->needs_free_netdev = true;
	dev->priv_destructor = ip_tunnel_dev_free;
	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	if (!dev->tstats)
		return -ENOMEM;

	err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
	if (err) {
		free_percpu(dev->tstats);
		return err;
	}

	err = gro_cells_init(&tunnel->gro_cells, dev);
	if (err) {
		dst_cache_destroy(&tunnel->dst_cache);
		free_percpu(dev->tstats);
		return err;
	}

	tunnel->dev = dev;
	tunnel->net = dev_net(dev);
	strcpy(tunnel->parms.name, dev->name);
	iph->version		= 4;
	iph->ihl		= 5;

	if (tunnel->collect_md) {
		dev->features |= NETIF_F_NETNS_LOCAL;
		netif_keep_dst(dev);
	}
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);
void ip_tunnel_uninit(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn;

	itn = net_generic(net, tunnel->ip_tnl_net_id);
	/* fb_tunnel_dev will be unregistered in the net-exit call. */
	if (itn->fb_tunnel_dev != dev)
		ip_tunnel_del(itn, netdev_priv(dev));

	dst_cache_reset(&tunnel->dst_cache);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
/* Do the least required initialization; the rest is done in the tunnel_init call */
void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);
MODULE_LICENSE("GPL");