// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2013 Nicira, Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
#include <linux/err.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/udp.h>
#include <net/dst_metadata.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif
static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
	return hash_32((__force u32)key ^ (__force u32)remote,
		       IP_TNL_HASH_BITS);
}
static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
				__be16 flags, __be32 key)
{
	if (p->i_flags & TUNNEL_KEY) {
		if (flags & TUNNEL_KEY)
			return key == p->i_key;
		else
			/* key expected, none present */
			return false;
	} else
		return !(flags & TUNNEL_KEY);
}
/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require exact key match i.e. if a key is present in packet
   it will match only tunnel with the same key; if it is not present,
   it will match only keyless tunnel.

   All keyless packets, if not matched to a configured keyless tunnel,
   will match the fallback tunnel.
   Given src, dst and key, find appropriate for input tunnel.
*/
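
/* The lookup below walks the candidates in order of preference:
 * fully specified (saddr, daddr) tunnels first, then daddr-only,
 * then saddr-only/multicast, then keyed wildcard tunnels.  A tunnel
 * bound to the ingress link is returned immediately; otherwise the
 * first match is remembered as a candidate, and the collect_md or
 * fallback device is used as a last resort.
 */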
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;
	unsigned int hash;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else
			cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    t->parms.iph.saddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
			continue;

		if (!(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) ||
		    t->parms.iph.saddr != 0 ||
		    t->parms.iph.daddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	if (cand)
		return cand;

	t = rcu_dereference(itn->collect_md_tun);
	if (t && t->dev->flags & IFF_UP)
		return t;

	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
		return netdev_priv(itn->fb_tunnel_dev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
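
/* Select the hash bucket for a tunnel from its input key and unicast
 * destination; multicast destinations and VTI tunnels without
 * TUNNEL_KEY hash as wildcards.
 */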
static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
				    struct ip_tunnel_parm *parms)
{
	unsigned int h;
	__be32 remote;
	__be32 i_key = parms->i_key;

	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
		remote = parms->iph.daddr;
	else
		remote = 0;

	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
		i_key = 0;

	h = ip_tunnel_hash(i_key, remote);
	return &itn->tunnels[h];
}
static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, t);
	hlist_add_head_rcu(&t->hash_node, head);
}

static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, NULL);
	hlist_del_init_rcu(&t->hash_node);
}
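
/* Exact-match lookup used for configuration, as opposed to the
 * packet-path ip_tunnel_lookup() above: addresses, link, device type
 * and key flags must all match.
 */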
static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
					struct ip_tunnel_parm *parms,
					int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	__be16 flags = parms->i_flags;
	int link = parms->link;
	struct ip_tunnel *t = NULL;
	struct hlist_head *head = ip_bucket(itn, parms);

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    link == t->parms.link &&
		    type == t->dev->type &&
		    ip_tunnel_key_match(&t->parms, flags, key))
			break;
	}

	return t;
}
static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	err = -E2BIG;
	if (parms->name[0]) {
		if (!dev_valid_name(parms->name))
			goto failed;
		strlcpy(name, parms->name, IFNAMSIZ);
	} else {
		if (strlen(ops->kind) > (IFNAMSIZ - 3))
			goto failed;
		strcpy(name, ops->kind);
		strcat(name, "%d");
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}
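
/* Resolve the route to the tunnel destination to find the underlying
 * output device, then size needed_headroom from it and return a usable
 * device MTU (never below IPV4_MIN_MTU).
 */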
static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
				    iph->saddr, tunnel->parms.o_key,
				    RT_TOS(iph->tos), tunnel->parms.link,
				    tunnel->fwmark, 0);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;

		dst_cache_reset(&tunnel->dst_cache);
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = min(tdev->mtu, IP_MAX_MTU);
	}

	dev->needed_headroom = t_hlen + hlen;
	mtu -= (dev->hard_header_len + t_hlen);

	if (mtu < IPV4_MIN_MTU)
		mtu = IPV4_MIN_MTU;

	return mtu;
}
static struct ip_tunnel *ip_tunnel_create(struct net *net,
					  struct ip_tunnel_net *itn,
					  struct ip_tunnel_parm *parms)
{
	struct ip_tunnel *nt;
	struct net_device *dev;
	int t_hlen;
	int mtu;
	int err;

	dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
	if (IS_ERR(dev))
		return ERR_CAST(dev);

	mtu = ip_tunnel_bind_dev(dev);
	err = dev_set_mtu(dev, mtu);
	if (err)
		goto err_dev_set_mtu;

	nt = netdev_priv(dev);
	t_hlen = nt->hlen + sizeof(struct iphdr);
	dev->min_mtu = ETH_MIN_MTU;
	dev->max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen;
	ip_tunnel_add(itn, nt);
	return nt;

err_dev_set_mtu:
	unregister_netdevice(dev);
	return ERR_PTR(err);
}
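
/* Common receive path for IP tunnels: validate the TUNNEL_CSUM and
 * TUNNEL_SEQ expectations against the parsed packet, decapsulate ECN,
 * update per-cpu stats and hand the inner packet to GRO.  The skb is
 * consumed in all cases.
 */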
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
		  bool log_ecn_error)
{
	struct pcpu_sw_netstats *tstats;
	const struct iphdr *iph = ip_hdr(skb);
	int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		tunnel->dev->stats.multicast++;
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	if ((!(tpi->flags&TUNNEL_CSUM) && (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
		tunnel->dev->stats.rx_crc_errors++;
		tunnel->dev->stats.rx_errors++;
		goto drop;
	}

	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
		if (!(tpi->flags&TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			tunnel->dev->stats.rx_fifo_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	skb_reset_network_header(skb);

	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		if (err > 1) {
			++tunnel->dev->stats.rx_frame_errors;
			++tunnel->dev->stats.rx_errors;
			goto drop;
		}
	}

	tstats = this_cpu_ptr(tunnel->dev->tstats);
	u64_stats_update_begin(&tstats->syncp);
	tstats->rx_packets++;
	tstats->rx_bytes += skb->len;
	u64_stats_update_end(&tstats->syncp);

	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	if (tun_dst)
		skb_dst_set(skb, (struct dst_entry *)tun_dst);

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	if (tun_dst)
		dst_release((struct dst_entry *)tun_dst);

	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	return !cmpxchg((const struct ip_tunnel_encap_ops **)
			&iptun_encaps[num],
			NULL, ops) ? 0 : -1;
}
EXPORT_SYMBOL(ip_tunnel_encap_add_ops);

int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	int ret;

	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
		       &iptun_encaps[num],
		       ops, NULL) == ops) ? 0 : -1;

	synchronize_net();

	return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
int ip_tunnel_encap_setup(struct ip_tunnel *t,
			  struct ip_tunnel_encap *ipencap)
{
	int hlen;

	memset(&t->encap, 0, sizeof(t->encap));

	hlen = ip_encap_hlen(ipencap);
	if (hlen < 0)
		return hlen;

	t->encap.type = ipencap->type;
	t->encap.sport = ipencap->sport;
	t->encap.dport = ipencap->dport;
	t->encap.flags = ipencap->flags;

	t->encap_hlen = hlen;
	t->hlen = t->encap_hlen + t->tun_hlen;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
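
/* Check the inner packet against the path MTU of the tunnel route and,
 * when it does not fit, update the cached PMTU and send the appropriate
 * ICMP/ICMPv6 "packet too big" error back towards the sender.  Returns
 * non-zero if the packet cannot be transmitted.
 */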
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			    struct rtable *rt, __be16 df,
			    const struct iphdr *inner_iph,
			    int tunnel_hlen, __be32 dst, bool md)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size;
	int mtu;

	tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
	pkt_size = skb->len - tunnel_hlen - dev->hard_header_len;

	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
					- sizeof(struct iphdr) - tunnel_hlen;
	else
		mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_valid_dst(skb))
		skb_dst_update_pmtu_no_confirm(skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (inner_iph->frag_off & htons(IP_DF)) &&
		    mtu < pkt_size) {
			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6;
		__be32 daddr;

		rt6 = skb_valid_dst(skb) ? (struct rt6_info *)skb_dst(skb) :
					   NULL;
		daddr = md ? dst : tunnel->parms.iph.daddr;

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((daddr && !ipv4_is_multicast(daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}
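
/* Transmit path for metadata-based (collect_md) tunnels: the outer
 * header parameters come from the skb's tunnel metadata rather than
 * from the tunnel device configuration.
 */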
void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		       u8 proto, int tunnel_hlen)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	u32 headroom = sizeof(struct iphdr);
	struct ip_tunnel_info *tun_info;
	const struct ip_tunnel_key *key;
	const struct iphdr *inner_iph;
	struct rtable *rt = NULL;
	struct flowi4 fl4;
	__be16 df = 0;
	u8 tos, ttl;
	bool use_cache;

	tun_info = skb_tunnel_info(skb);
	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
		     ip_tunnel_info_af(tun_info) != AF_INET))
		goto tx_error;

	key = &tun_info->key;
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	tos = key->tos;
	if (tos == 1) {
		if (skb->protocol == htons(ETH_P_IP))
			tos = inner_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
	}
	ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
			    tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
			    0, skb->mark, skb_get_hash(skb));
	if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
		goto tx_error;

	use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
	if (use_cache)
		rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr);
	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);
		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (use_cache)
			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
					  fl4.saddr);
	}
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
		df = htons(IP_DF);
	if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
			    key->u.ipv4.dst, true)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = key->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	if (!df && skb->protocol == htons(ETH_P_IP))
		df = inner_iph->frag_off & htons(IP_DF);

	headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
	if (headroom > dev->needed_headroom)
		dev->needed_headroom = headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		goto tx_dropped;
	}
	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;
tx_error:
	dev->stats.tx_errors++;
	goto kfree;
tx_dropped:
	dev->stats.tx_dropped++;
kfree:
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
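
/* Transmit path for classical, statically configured tunnels.  For an
 * NBMA tunnel (no configured destination) the outer destination is
 * taken from tunnel metadata, the inner IPv4 route or the IPv6
 * neighbour entry.
 */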
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_info *tun_info = NULL;
	const struct iphdr *inner_iph;
	unsigned int max_headroom;	/* The extra header space needed */
	struct rtable *rt = NULL;	/* Route to the other host */
	bool use_cache = false;
	struct flowi4 fl4;
	bool md = false;
	bool connected;
	u8 tos, ttl;
	__be32 dst;
	__be16 df;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	connected = (tunnel->parms.iph.daddr != 0);

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel */

		if (!skb_dst(skb)) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		tun_info = skb_tunnel_info(skb);
		if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
		    ip_tunnel_info_af(tun_info) == AF_INET &&
		    tun_info->key.u.ipv4.dst) {
			dst = tun_info->key.u.ipv4.dst;
			md = true;
			connected = true;
		}
		else if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (!neigh)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		if (!md)
			connected = false;
	}

	tos = tnl_params->tos;
	if (tos & 0x1) {
		tos &= ~0x1;
		if (skb->protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
			    tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
			    tunnel->fwmark, skb_get_hash(skb));

	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
		goto tx_error;

	if (connected && md) {
		use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
		if (use_cache)
			rt = dst_cache_get_ip4(&tun_info->dst_cache,
					       &fl4.saddr);
	} else {
		rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache,
						&fl4.saddr) : NULL;
	}

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (use_cache)
			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
					  fl4.saddr);
		else if (!md && connected)
			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
					  fl4.saddr);
	}

	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph,
			    0, 0, false)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	df = tnl_params->frag_off;
	if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
		df |= (inner_iph->frag_off&htons(IP_DF));

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
	if (max_headroom > dev->needed_headroom)
		dev->needed_headroom = max_headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		dev->stats.tx_dropped++;
		kfree_skb(skb);
		return;
	}

	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));

	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
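
/* Apply new parameters to an existing tunnel: rehash it under the new
 * addresses/key, refresh the link-layer addresses, rebind to the
 * underlying device when the link or fwmark changed, and invalidate
 * the cached route.
 */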
static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu,
			     __u32 fwmark)
{
	ip_tunnel_del(itn, t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		memcpy(dev->dev_addr, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link || t->fwmark != fwmark) {
		int mtu;

		t->parms.link = p->link;
		t->fwmark = fwmark;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	dst_cache_reset(&t->dst_cache);
	netdev_state_change(dev);
}
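
/* Legacy SIOCGETTUNNEL/SIOCADDTUNNEL/SIOCCHGTUNNEL/SIOCDELTUNNEL ioctl
 * handler shared by the IP tunnel drivers.
 */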
int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		if (dev == itn->fb_tunnel_dev) {
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		if (!(p->i_flags & VTI_ISVTI)) {
			if (!(p->i_flags & TUNNEL_KEY))
				p->i_key = 0;
			if (!(p->o_flags & TUNNEL_KEY))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->type);

		if (cmd == SIOCADDTUNNEL) {
			if (!t) {
				t = ip_tunnel_create(net, itn, p);
				err = PTR_ERR_OR_ZERO(t);
				break;
			}

			err = -EEXIST;
			break;
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true, 0);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
	int max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen;

	if (new_mtu < ETH_MIN_MTU)
		return -EINVAL;

	if (new_mtu > max_mtu) {
		if (strict)
			return -EINVAL;

		new_mtu = max_mtu;
	}

	dev->mtu = new_mtu;
	return 0;
}
EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);

int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	return __ip_tunnel_change_mtu(dev, new_mtu, true);
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	dst_cache_destroy(&tunnel->dst_cache);
	free_percpu(dev->tstats);
}
void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_net *itn;

	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

	if (itn->fb_tunnel_dev != dev) {
		ip_tunnel_del(itn, netdev_priv(dev));
		unregister_netdevice_queue(dev, head);
	}
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
struct net *ip_tunnel_get_link_net(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->net;
}
EXPORT_SYMBOL(ip_tunnel_get_link_net);
int ip_tunnel_get_iflink(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->parms.link;
}
EXPORT_SYMBOL(ip_tunnel_get_iflink);
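
/* Per-netns initialization: set up the tunnel hash table and, unless
 * fallback tunnels are disabled for this namespace, create and register
 * the fallback device.
 */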
int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
		       struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	itn->rtnl_link_ops = ops;
	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	if (!ops || !net_has_fallback_tunnels(net)) {
		struct ip_tunnel_net *it_init_net;

		it_init_net = net_generic(&init_net, ip_tnl_net_id);
		itn->type = it_init_net->type;
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strlcpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* FB netdevice is special: we have one, and only one per netns.
	 * Allowing to move it to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
		itn->type = itn->fb_tunnel_dev->type;
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
			      struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net_device *dev, *aux;
	int h;

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}
void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
			   struct rtnl_link_ops *ops)
{
	struct ip_tunnel_net *itn;
	struct net *net;
	LIST_HEAD(list);

	rtnl_lock();
	list_for_each_entry(net, net_list, exit_list) {
		itn = net_generic(net, id);
		ip_tunnel_destroy(net, itn, &list, ops);
	}
	unregister_netdevice_many(&list);
	rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);
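
/* rtnetlink link-creation helper shared by the tunnel drivers: rejects
 * duplicate tunnels, registers the device and sets its MTU, clamping a
 * user-supplied IFLA_MTU into the valid range.
 */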
int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
		      struct ip_tunnel_parm *p, __u32 fwmark)
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ip_tunnel_net *itn;
	int mtu;
	int err;

	nt = netdev_priv(dev);
	itn = net_generic(net, nt->ip_tnl_net_id);

	if (nt->collect_md) {
		if (rtnl_dereference(itn->collect_md_tun))
			return -EEXIST;
	} else {
		if (ip_tunnel_find(itn, p, dev->type))
			return -EEXIST;
	}

	nt->net = net;
	nt->parms = *p;
	nt->fwmark = fwmark;
	err = register_netdevice(dev);
	if (err)
		goto err_register_netdevice;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ip_tunnel_bind_dev(dev);
	if (tb[IFLA_MTU]) {
		unsigned int max = IP_MAX_MTU - dev->hard_header_len - nt->hlen;

		mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU,
			    (unsigned int)(max - sizeof(struct iphdr)));
	}

	err = dev_set_mtu(dev, mtu);
	if (err)
		goto err_dev_set_mtu;

	ip_tunnel_add(itn, nt);
	return 0;

err_dev_set_mtu:
	unregister_netdevice(dev);
err_register_netdevice:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
			 struct ip_tunnel_parm *p, __u32 fwmark)
{
	struct ip_tunnel *t;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	if (dev == itn->fb_tunnel_dev)
		return -EINVAL;

	t = ip_tunnel_find(itn, p, dev->type);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = tunnel;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p->iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p->iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}
	}

	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
int ip_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	int err;

	dev->needs_free_netdev = true;
	dev->priv_destructor = ip_tunnel_dev_free;
	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	if (!dev->tstats)
		return -ENOMEM;

	err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
	if (err) {
		free_percpu(dev->tstats);
		return err;
	}

	err = gro_cells_init(&tunnel->gro_cells, dev);
	if (err) {
		dst_cache_destroy(&tunnel->dst_cache);
		free_percpu(dev->tstats);
		return err;
	}

	tunnel->dev = dev;
	tunnel->net = dev_net(dev);
	strcpy(tunnel->parms.name, dev->name);
	iph->version		= 4;
	iph->ihl		= 5;

	if (tunnel->collect_md)
		netif_keep_dst(dev);

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);
void ip_tunnel_uninit(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn;

	itn = net_generic(net, tunnel->ip_tnl_net_id);
	/* fb_tunnel_dev will be unregistered in the net exit call. */
	if (itn->fb_tunnel_dev != dev)
		ip_tunnel_del(itn, netdev_priv(dev));

	dst_cache_reset(&tunnel->dst_cache);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
/* Do least required initialization, rest of init is done in tunnel_init call */
void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);

MODULE_LICENSE("GPL");