1 // SPDX-License-Identifier: GPL-2.0-only
3 * Copyright (c) 2013 Nicira, Inc.
6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
8 #include <linux/capability.h>
9 #include <linux/module.h>
10 #include <linux/types.h>
11 #include <linux/kernel.h>
12 #include <linux/slab.h>
13 #include <linux/uaccess.h>
14 #include <linux/skbuff.h>
15 #include <linux/netdevice.h>
17 #include <linux/tcp.h>
18 #include <linux/udp.h>
19 #include <linux/if_arp.h>
20 #include <linux/init.h>
21 #include <linux/in6.h>
22 #include <linux/inetdevice.h>
23 #include <linux/igmp.h>
24 #include <linux/netfilter_ipv4.h>
25 #include <linux/etherdevice.h>
26 #include <linux/if_ether.h>
27 #include <linux/if_vlan.h>
28 #include <linux/rculist.h>
29 #include <linux/err.h>
34 #include <net/protocol.h>
35 #include <net/ip_tunnels.h>
37 #include <net/checksum.h>
38 #include <net/dsfield.h>
39 #include <net/inet_ecn.h>
41 #include <net/net_namespace.h>
42 #include <net/netns/generic.h>
43 #include <net/rtnetlink.h>
45 #include <net/dst_metadata.h>
46 #include <net/inet_dscp.h>
48 #if IS_ENABLED(CONFIG_IPV6)
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
54 static unsigned int ip_tunnel_hash(__be32 key
, __be32 remote
)
56 return hash_32((__force u32
)key
^ (__force u32
)remote
,
60 static bool ip_tunnel_key_match(const struct ip_tunnel_parm_kern
*p
,
61 const unsigned long *flags
, __be32 key
)
63 if (!test_bit(IP_TUNNEL_KEY_BIT
, flags
))
64 return !test_bit(IP_TUNNEL_KEY_BIT
, p
->i_flags
);
66 return test_bit(IP_TUNNEL_KEY_BIT
, p
->i_flags
) && p
->i_key
== key
;
69 /* Fallback tunnel: no source, no destination, no key, no options
72 We require exact key match i.e. if a key is present in packet
73 it will match only tunnel with the same key; if it is not present,
74 it will match only keyless tunnel.
76 All keysless packets, if not matched configured keyless tunnels
77 will match fallback tunnel.
78 Given src, dst and key, find appropriate for input tunnel.
80 struct ip_tunnel
*ip_tunnel_lookup(struct ip_tunnel_net
*itn
,
81 int link
, const unsigned long *flags
,
82 __be32 remote
, __be32 local
,
85 struct ip_tunnel
*t
, *cand
= NULL
;
86 struct hlist_head
*head
;
87 struct net_device
*ndev
;
90 hash
= ip_tunnel_hash(key
, remote
);
91 head
= &itn
->tunnels
[hash
];
93 hlist_for_each_entry_rcu(t
, head
, hash_node
) {
94 if (local
!= t
->parms
.iph
.saddr
||
95 remote
!= t
->parms
.iph
.daddr
||
96 !(t
->dev
->flags
& IFF_UP
))
99 if (!ip_tunnel_key_match(&t
->parms
, flags
, key
))
102 if (READ_ONCE(t
->parms
.link
) == link
)
107 hlist_for_each_entry_rcu(t
, head
, hash_node
) {
108 if (remote
!= t
->parms
.iph
.daddr
||
109 t
->parms
.iph
.saddr
!= 0 ||
110 !(t
->dev
->flags
& IFF_UP
))
113 if (!ip_tunnel_key_match(&t
->parms
, flags
, key
))
116 if (READ_ONCE(t
->parms
.link
) == link
)
122 hash
= ip_tunnel_hash(key
, 0);
123 head
= &itn
->tunnels
[hash
];
125 hlist_for_each_entry_rcu(t
, head
, hash_node
) {
126 if ((local
!= t
->parms
.iph
.saddr
|| t
->parms
.iph
.daddr
!= 0) &&
127 (local
!= t
->parms
.iph
.daddr
|| !ipv4_is_multicast(local
)))
130 if (!(t
->dev
->flags
& IFF_UP
))
133 if (!ip_tunnel_key_match(&t
->parms
, flags
, key
))
136 if (READ_ONCE(t
->parms
.link
) == link
)
142 hlist_for_each_entry_rcu(t
, head
, hash_node
) {
143 if ((!test_bit(IP_TUNNEL_NO_KEY_BIT
, flags
) &&
144 t
->parms
.i_key
!= key
) ||
145 t
->parms
.iph
.saddr
!= 0 ||
146 t
->parms
.iph
.daddr
!= 0 ||
147 !(t
->dev
->flags
& IFF_UP
))
150 if (READ_ONCE(t
->parms
.link
) == link
)
159 t
= rcu_dereference(itn
->collect_md_tun
);
160 if (t
&& t
->dev
->flags
& IFF_UP
)
163 ndev
= READ_ONCE(itn
->fb_tunnel_dev
);
164 if (ndev
&& ndev
->flags
& IFF_UP
)
165 return netdev_priv(ndev
);
169 EXPORT_SYMBOL_GPL(ip_tunnel_lookup
);
171 static struct hlist_head
*ip_bucket(struct ip_tunnel_net
*itn
,
172 struct ip_tunnel_parm_kern
*parms
)
176 __be32 i_key
= parms
->i_key
;
178 if (parms
->iph
.daddr
&& !ipv4_is_multicast(parms
->iph
.daddr
))
179 remote
= parms
->iph
.daddr
;
183 if (!test_bit(IP_TUNNEL_KEY_BIT
, parms
->i_flags
) &&
184 test_bit(IP_TUNNEL_VTI_BIT
, parms
->i_flags
))
187 h
= ip_tunnel_hash(i_key
, remote
);
188 return &itn
->tunnels
[h
];
191 static void ip_tunnel_add(struct ip_tunnel_net
*itn
, struct ip_tunnel
*t
)
193 struct hlist_head
*head
= ip_bucket(itn
, &t
->parms
);
196 rcu_assign_pointer(itn
->collect_md_tun
, t
);
197 hlist_add_head_rcu(&t
->hash_node
, head
);
200 static void ip_tunnel_del(struct ip_tunnel_net
*itn
, struct ip_tunnel
*t
)
203 rcu_assign_pointer(itn
->collect_md_tun
, NULL
);
204 hlist_del_init_rcu(&t
->hash_node
);
207 static struct ip_tunnel
*ip_tunnel_find(struct ip_tunnel_net
*itn
,
208 struct ip_tunnel_parm_kern
*parms
,
211 __be32 remote
= parms
->iph
.daddr
;
212 __be32 local
= parms
->iph
.saddr
;
213 IP_TUNNEL_DECLARE_FLAGS(flags
);
214 __be32 key
= parms
->i_key
;
215 int link
= parms
->link
;
216 struct ip_tunnel
*t
= NULL
;
217 struct hlist_head
*head
= ip_bucket(itn
, parms
);
219 ip_tunnel_flags_copy(flags
, parms
->i_flags
);
221 hlist_for_each_entry_rcu(t
, head
, hash_node
, lockdep_rtnl_is_held()) {
222 if (local
== t
->parms
.iph
.saddr
&&
223 remote
== t
->parms
.iph
.daddr
&&
224 link
== READ_ONCE(t
->parms
.link
) &&
225 type
== t
->dev
->type
&&
226 ip_tunnel_key_match(&t
->parms
, flags
, key
))
232 static struct net_device
*__ip_tunnel_create(struct net
*net
,
233 const struct rtnl_link_ops
*ops
,
234 struct ip_tunnel_parm_kern
*parms
)
237 struct ip_tunnel
*tunnel
;
238 struct net_device
*dev
;
242 if (parms
->name
[0]) {
243 if (!dev_valid_name(parms
->name
))
245 strscpy(name
, parms
->name
, IFNAMSIZ
);
247 if (strlen(ops
->kind
) > (IFNAMSIZ
- 3))
249 strcpy(name
, ops
->kind
);
254 dev
= alloc_netdev(ops
->priv_size
, name
, NET_NAME_UNKNOWN
, ops
->setup
);
259 dev_net_set(dev
, net
);
261 dev
->rtnl_link_ops
= ops
;
263 tunnel
= netdev_priv(dev
);
264 tunnel
->parms
= *parms
;
267 err
= register_netdevice(dev
);
279 static int ip_tunnel_bind_dev(struct net_device
*dev
)
281 struct net_device
*tdev
= NULL
;
282 struct ip_tunnel
*tunnel
= netdev_priv(dev
);
283 const struct iphdr
*iph
;
284 int hlen
= LL_MAX_HEADER
;
285 int mtu
= ETH_DATA_LEN
;
286 int t_hlen
= tunnel
->hlen
+ sizeof(struct iphdr
);
288 iph
= &tunnel
->parms
.iph
;
290 /* Guess output device to choose reasonable mtu and needed_headroom */
295 ip_tunnel_init_flow(&fl4
, iph
->protocol
, iph
->daddr
,
296 iph
->saddr
, tunnel
->parms
.o_key
,
297 iph
->tos
& INET_DSCP_MASK
, dev_net(dev
),
298 tunnel
->parms
.link
, tunnel
->fwmark
, 0, 0);
299 rt
= ip_route_output_key(tunnel
->net
, &fl4
);
305 if (dev
->type
!= ARPHRD_ETHER
)
306 dev
->flags
|= IFF_POINTOPOINT
;
308 dst_cache_reset(&tunnel
->dst_cache
);
311 if (!tdev
&& tunnel
->parms
.link
)
312 tdev
= __dev_get_by_index(tunnel
->net
, tunnel
->parms
.link
);
315 hlen
= tdev
->hard_header_len
+ tdev
->needed_headroom
;
316 mtu
= min(tdev
->mtu
, IP_MAX_MTU
);
319 dev
->needed_headroom
= t_hlen
+ hlen
;
320 mtu
-= t_hlen
+ (dev
->type
== ARPHRD_ETHER
? dev
->hard_header_len
: 0);
322 if (mtu
< IPV4_MIN_MTU
)
328 static struct ip_tunnel
*ip_tunnel_create(struct net
*net
,
329 struct ip_tunnel_net
*itn
,
330 struct ip_tunnel_parm_kern
*parms
)
332 struct ip_tunnel
*nt
;
333 struct net_device
*dev
;
338 dev
= __ip_tunnel_create(net
, itn
->rtnl_link_ops
, parms
);
340 return ERR_CAST(dev
);
342 mtu
= ip_tunnel_bind_dev(dev
);
343 err
= dev_set_mtu(dev
, mtu
);
345 goto err_dev_set_mtu
;
347 nt
= netdev_priv(dev
);
348 t_hlen
= nt
->hlen
+ sizeof(struct iphdr
);
349 dev
->min_mtu
= ETH_MIN_MTU
;
350 dev
->max_mtu
= IP_MAX_MTU
- t_hlen
;
351 if (dev
->type
== ARPHRD_ETHER
)
352 dev
->max_mtu
-= dev
->hard_header_len
;
354 ip_tunnel_add(itn
, nt
);
358 unregister_netdevice(dev
);
362 void ip_tunnel_md_udp_encap(struct sk_buff
*skb
, struct ip_tunnel_info
*info
)
364 const struct iphdr
*iph
= ip_hdr(skb
);
365 const struct udphdr
*udph
;
367 if (iph
->protocol
!= IPPROTO_UDP
)
370 udph
= (struct udphdr
*)((__u8
*)iph
+ (iph
->ihl
<< 2));
371 info
->encap
.sport
= udph
->source
;
372 info
->encap
.dport
= udph
->dest
;
374 EXPORT_SYMBOL(ip_tunnel_md_udp_encap
);
376 int ip_tunnel_rcv(struct ip_tunnel
*tunnel
, struct sk_buff
*skb
,
377 const struct tnl_ptk_info
*tpi
, struct metadata_dst
*tun_dst
,
380 const struct iphdr
*iph
= ip_hdr(skb
);
383 #ifdef CONFIG_NET_IPGRE_BROADCAST
384 if (ipv4_is_multicast(iph
->daddr
)) {
385 DEV_STATS_INC(tunnel
->dev
, multicast
);
386 skb
->pkt_type
= PACKET_BROADCAST
;
390 if (test_bit(IP_TUNNEL_CSUM_BIT
, tunnel
->parms
.i_flags
) !=
391 test_bit(IP_TUNNEL_CSUM_BIT
, tpi
->flags
)) {
392 DEV_STATS_INC(tunnel
->dev
, rx_crc_errors
);
393 DEV_STATS_INC(tunnel
->dev
, rx_errors
);
397 if (test_bit(IP_TUNNEL_SEQ_BIT
, tunnel
->parms
.i_flags
)) {
398 if (!test_bit(IP_TUNNEL_SEQ_BIT
, tpi
->flags
) ||
399 (tunnel
->i_seqno
&& (s32
)(ntohl(tpi
->seq
) - tunnel
->i_seqno
) < 0)) {
400 DEV_STATS_INC(tunnel
->dev
, rx_fifo_errors
);
401 DEV_STATS_INC(tunnel
->dev
, rx_errors
);
404 tunnel
->i_seqno
= ntohl(tpi
->seq
) + 1;
407 /* Save offset of outer header relative to skb->head,
408 * because we are going to reset the network header to the inner header
409 * and might change skb->head.
411 nh
= skb_network_header(skb
) - skb
->head
;
413 skb_set_network_header(skb
, (tunnel
->dev
->type
== ARPHRD_ETHER
) ? ETH_HLEN
: 0);
415 if (!pskb_inet_may_pull(skb
)) {
416 DEV_STATS_INC(tunnel
->dev
, rx_length_errors
);
417 DEV_STATS_INC(tunnel
->dev
, rx_errors
);
420 iph
= (struct iphdr
*)(skb
->head
+ nh
);
422 err
= IP_ECN_decapsulate(iph
, skb
);
425 net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
426 &iph
->saddr
, iph
->tos
);
428 DEV_STATS_INC(tunnel
->dev
, rx_frame_errors
);
429 DEV_STATS_INC(tunnel
->dev
, rx_errors
);
434 dev_sw_netstats_rx_add(tunnel
->dev
, skb
->len
);
435 skb_scrub_packet(skb
, !net_eq(tunnel
->net
, dev_net(tunnel
->dev
)));
437 if (tunnel
->dev
->type
== ARPHRD_ETHER
) {
438 skb
->protocol
= eth_type_trans(skb
, tunnel
->dev
);
439 skb_postpull_rcsum(skb
, eth_hdr(skb
), ETH_HLEN
);
441 skb
->dev
= tunnel
->dev
;
445 skb_dst_set(skb
, (struct dst_entry
*)tun_dst
);
447 gro_cells_receive(&tunnel
->gro_cells
, skb
);
452 dst_release((struct dst_entry
*)tun_dst
);
456 EXPORT_SYMBOL_GPL(ip_tunnel_rcv
);
458 int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops
*ops
,
461 if (num
>= MAX_IPTUN_ENCAP_OPS
)
464 return !cmpxchg((const struct ip_tunnel_encap_ops
**)
468 EXPORT_SYMBOL(ip_tunnel_encap_add_ops
);
470 int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops
*ops
,
475 if (num
>= MAX_IPTUN_ENCAP_OPS
)
478 ret
= (cmpxchg((const struct ip_tunnel_encap_ops
**)
480 ops
, NULL
) == ops
) ? 0 : -1;
486 EXPORT_SYMBOL(ip_tunnel_encap_del_ops
);
488 int ip_tunnel_encap_setup(struct ip_tunnel
*t
,
489 struct ip_tunnel_encap
*ipencap
)
493 memset(&t
->encap
, 0, sizeof(t
->encap
));
495 hlen
= ip_encap_hlen(ipencap
);
499 t
->encap
.type
= ipencap
->type
;
500 t
->encap
.sport
= ipencap
->sport
;
501 t
->encap
.dport
= ipencap
->dport
;
502 t
->encap
.flags
= ipencap
->flags
;
504 t
->encap_hlen
= hlen
;
505 t
->hlen
= t
->encap_hlen
+ t
->tun_hlen
;
509 EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup
);
511 static int tnl_update_pmtu(struct net_device
*dev
, struct sk_buff
*skb
,
512 struct rtable
*rt
, __be16 df
,
513 const struct iphdr
*inner_iph
,
514 int tunnel_hlen
, __be32 dst
, bool md
)
516 struct ip_tunnel
*tunnel
= netdev_priv(dev
);
520 tunnel_hlen
= md
? tunnel_hlen
: tunnel
->hlen
;
521 pkt_size
= skb
->len
- tunnel_hlen
;
522 pkt_size
-= dev
->type
== ARPHRD_ETHER
? dev
->hard_header_len
: 0;
525 mtu
= dst_mtu(&rt
->dst
) - (sizeof(struct iphdr
) + tunnel_hlen
);
526 mtu
-= dev
->type
== ARPHRD_ETHER
? dev
->hard_header_len
: 0;
528 mtu
= skb_valid_dst(skb
) ? dst_mtu(skb_dst(skb
)) : dev
->mtu
;
531 if (skb_valid_dst(skb
))
532 skb_dst_update_pmtu_no_confirm(skb
, mtu
);
534 if (skb
->protocol
== htons(ETH_P_IP
)) {
535 if (!skb_is_gso(skb
) &&
536 (inner_iph
->frag_off
& htons(IP_DF
)) &&
538 icmp_ndo_send(skb
, ICMP_DEST_UNREACH
, ICMP_FRAG_NEEDED
, htonl(mtu
));
542 #if IS_ENABLED(CONFIG_IPV6)
543 else if (skb
->protocol
== htons(ETH_P_IPV6
)) {
544 struct rt6_info
*rt6
;
547 rt6
= skb_valid_dst(skb
) ? dst_rt6_info(skb_dst(skb
)) :
549 daddr
= md
? dst
: tunnel
->parms
.iph
.daddr
;
551 if (rt6
&& mtu
< dst_mtu(skb_dst(skb
)) &&
552 mtu
>= IPV6_MIN_MTU
) {
553 if ((daddr
&& !ipv4_is_multicast(daddr
)) ||
554 rt6
->rt6i_dst
.plen
== 128) {
555 rt6
->rt6i_flags
|= RTF_MODIFIED
;
556 dst_metric_set(skb_dst(skb
), RTAX_MTU
, mtu
);
560 if (!skb_is_gso(skb
) && mtu
>= IPV6_MIN_MTU
&&
562 icmpv6_ndo_send(skb
, ICMPV6_PKT_TOOBIG
, 0, mtu
);
570 static void ip_tunnel_adj_headroom(struct net_device
*dev
, unsigned int headroom
)
572 /* we must cap headroom to some upperlimit, else pskb_expand_head
573 * will overflow header offsets in skb_headers_offset_update().
575 static const unsigned int max_allowed
= 512;
577 if (headroom
> max_allowed
)
578 headroom
= max_allowed
;
580 if (headroom
> READ_ONCE(dev
->needed_headroom
))
581 WRITE_ONCE(dev
->needed_headroom
, headroom
);
584 void ip_md_tunnel_xmit(struct sk_buff
*skb
, struct net_device
*dev
,
585 u8 proto
, int tunnel_hlen
)
587 struct ip_tunnel
*tunnel
= netdev_priv(dev
);
588 u32 headroom
= sizeof(struct iphdr
);
589 struct ip_tunnel_info
*tun_info
;
590 const struct ip_tunnel_key
*key
;
591 const struct iphdr
*inner_iph
;
592 struct rtable
*rt
= NULL
;
598 tun_info
= skb_tunnel_info(skb
);
599 if (unlikely(!tun_info
|| !(tun_info
->mode
& IP_TUNNEL_INFO_TX
) ||
600 ip_tunnel_info_af(tun_info
) != AF_INET
))
602 key
= &tun_info
->key
;
603 memset(&(IPCB(skb
)->opt
), 0, sizeof(IPCB(skb
)->opt
));
604 inner_iph
= (const struct iphdr
*)skb_inner_network_header(skb
);
607 if (skb
->protocol
== htons(ETH_P_IP
))
608 tos
= inner_iph
->tos
;
609 else if (skb
->protocol
== htons(ETH_P_IPV6
))
610 tos
= ipv6_get_dsfield((const struct ipv6hdr
*)inner_iph
);
612 ip_tunnel_init_flow(&fl4
, proto
, key
->u
.ipv4
.dst
, key
->u
.ipv4
.src
,
613 tunnel_id_to_key32(key
->tun_id
),
614 tos
& INET_DSCP_MASK
, dev_net(dev
), 0, skb
->mark
,
615 skb_get_hash(skb
), key
->flow_flags
);
618 tunnel_hlen
= ip_encap_hlen(&tun_info
->encap
);
620 if (ip_tunnel_encap(skb
, &tun_info
->encap
, &proto
, &fl4
) < 0)
623 use_cache
= ip_tunnel_dst_cache_usable(skb
, tun_info
);
625 rt
= dst_cache_get_ip4(&tun_info
->dst_cache
, &fl4
.saddr
);
627 rt
= ip_route_output_key(tunnel
->net
, &fl4
);
629 DEV_STATS_INC(dev
, tx_carrier_errors
);
633 dst_cache_set_ip4(&tun_info
->dst_cache
, &rt
->dst
,
636 if (rt
->dst
.dev
== dev
) {
638 DEV_STATS_INC(dev
, collisions
);
642 if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT
, key
->tun_flags
))
644 if (tnl_update_pmtu(dev
, skb
, rt
, df
, inner_iph
, tunnel_hlen
,
645 key
->u
.ipv4
.dst
, true)) {
650 tos
= ip_tunnel_ecn_encap(tos
, inner_iph
, skb
);
653 if (skb
->protocol
== htons(ETH_P_IP
))
654 ttl
= inner_iph
->ttl
;
655 else if (skb
->protocol
== htons(ETH_P_IPV6
))
656 ttl
= ((const struct ipv6hdr
*)inner_iph
)->hop_limit
;
658 ttl
= ip4_dst_hoplimit(&rt
->dst
);
661 headroom
+= LL_RESERVED_SPACE(rt
->dst
.dev
) + rt
->dst
.header_len
;
662 if (skb_cow_head(skb
, headroom
)) {
667 ip_tunnel_adj_headroom(dev
, headroom
);
669 iptunnel_xmit(NULL
, rt
, skb
, fl4
.saddr
, fl4
.daddr
, proto
, tos
, ttl
,
670 df
, !net_eq(tunnel
->net
, dev_net(dev
)));
673 DEV_STATS_INC(dev
, tx_errors
);
676 DEV_STATS_INC(dev
, tx_dropped
);
680 EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit
);
682 void ip_tunnel_xmit(struct sk_buff
*skb
, struct net_device
*dev
,
683 const struct iphdr
*tnl_params
, u8 protocol
)
685 struct ip_tunnel
*tunnel
= netdev_priv(dev
);
686 struct ip_tunnel_info
*tun_info
= NULL
;
687 const struct iphdr
*inner_iph
;
688 unsigned int max_headroom
; /* The extra header space needed */
689 struct rtable
*rt
= NULL
; /* Route to the other host */
690 __be16 payload_protocol
;
691 bool use_cache
= false;
699 inner_iph
= (const struct iphdr
*)skb_inner_network_header(skb
);
700 connected
= (tunnel
->parms
.iph
.daddr
!= 0);
701 payload_protocol
= skb_protocol(skb
, true);
703 memset(&(IPCB(skb
)->opt
), 0, sizeof(IPCB(skb
)->opt
));
705 dst
= tnl_params
->daddr
;
710 DEV_STATS_INC(dev
, tx_fifo_errors
);
714 tun_info
= skb_tunnel_info(skb
);
715 if (tun_info
&& (tun_info
->mode
& IP_TUNNEL_INFO_TX
) &&
716 ip_tunnel_info_af(tun_info
) == AF_INET
&&
717 tun_info
->key
.u
.ipv4
.dst
) {
718 dst
= tun_info
->key
.u
.ipv4
.dst
;
721 } else if (payload_protocol
== htons(ETH_P_IP
)) {
722 rt
= skb_rtable(skb
);
723 dst
= rt_nexthop(rt
, inner_iph
->daddr
);
725 #if IS_ENABLED(CONFIG_IPV6)
726 else if (payload_protocol
== htons(ETH_P_IPV6
)) {
727 const struct in6_addr
*addr6
;
728 struct neighbour
*neigh
;
729 bool do_tx_error_icmp
;
732 neigh
= dst_neigh_lookup(skb_dst(skb
),
733 &ipv6_hdr(skb
)->daddr
);
737 addr6
= (const struct in6_addr
*)&neigh
->primary_key
;
738 addr_type
= ipv6_addr_type(addr6
);
740 if (addr_type
== IPV6_ADDR_ANY
) {
741 addr6
= &ipv6_hdr(skb
)->daddr
;
742 addr_type
= ipv6_addr_type(addr6
);
745 if ((addr_type
& IPV6_ADDR_COMPATv4
) == 0)
746 do_tx_error_icmp
= true;
748 do_tx_error_icmp
= false;
749 dst
= addr6
->s6_addr32
[3];
751 neigh_release(neigh
);
752 if (do_tx_error_icmp
)
763 tos
= tnl_params
->tos
;
766 if (payload_protocol
== htons(ETH_P_IP
)) {
767 tos
= inner_iph
->tos
;
769 } else if (payload_protocol
== htons(ETH_P_IPV6
)) {
770 tos
= ipv6_get_dsfield((const struct ipv6hdr
*)inner_iph
);
775 ip_tunnel_init_flow(&fl4
, protocol
, dst
, tnl_params
->saddr
,
776 tunnel
->parms
.o_key
, tos
& INET_DSCP_MASK
,
777 dev_net(dev
), READ_ONCE(tunnel
->parms
.link
),
778 tunnel
->fwmark
, skb_get_hash(skb
), 0);
780 if (ip_tunnel_encap(skb
, &tunnel
->encap
, &protocol
, &fl4
) < 0)
783 if (connected
&& md
) {
784 use_cache
= ip_tunnel_dst_cache_usable(skb
, tun_info
);
786 rt
= dst_cache_get_ip4(&tun_info
->dst_cache
,
789 rt
= connected
? dst_cache_get_ip4(&tunnel
->dst_cache
,
794 rt
= ip_route_output_key(tunnel
->net
, &fl4
);
797 DEV_STATS_INC(dev
, tx_carrier_errors
);
801 dst_cache_set_ip4(&tun_info
->dst_cache
, &rt
->dst
,
803 else if (!md
&& connected
)
804 dst_cache_set_ip4(&tunnel
->dst_cache
, &rt
->dst
,
808 if (rt
->dst
.dev
== dev
) {
810 DEV_STATS_INC(dev
, collisions
);
814 df
= tnl_params
->frag_off
;
815 if (payload_protocol
== htons(ETH_P_IP
) && !tunnel
->ignore_df
)
816 df
|= (inner_iph
->frag_off
& htons(IP_DF
));
818 if (tnl_update_pmtu(dev
, skb
, rt
, df
, inner_iph
, 0, 0, false)) {
823 if (tunnel
->err_count
> 0) {
824 if (time_before(jiffies
,
825 tunnel
->err_time
+ IPTUNNEL_ERR_TIMEO
)) {
828 dst_link_failure(skb
);
830 tunnel
->err_count
= 0;
833 tos
= ip_tunnel_ecn_encap(tos
, inner_iph
, skb
);
834 ttl
= tnl_params
->ttl
;
836 if (payload_protocol
== htons(ETH_P_IP
))
837 ttl
= inner_iph
->ttl
;
838 #if IS_ENABLED(CONFIG_IPV6)
839 else if (payload_protocol
== htons(ETH_P_IPV6
))
840 ttl
= ((const struct ipv6hdr
*)inner_iph
)->hop_limit
;
843 ttl
= ip4_dst_hoplimit(&rt
->dst
);
846 max_headroom
= LL_RESERVED_SPACE(rt
->dst
.dev
) + sizeof(struct iphdr
)
847 + rt
->dst
.header_len
+ ip_encap_hlen(&tunnel
->encap
);
849 if (skb_cow_head(skb
, max_headroom
)) {
851 DEV_STATS_INC(dev
, tx_dropped
);
856 ip_tunnel_adj_headroom(dev
, max_headroom
);
858 iptunnel_xmit(NULL
, rt
, skb
, fl4
.saddr
, fl4
.daddr
, protocol
, tos
, ttl
,
859 df
, !net_eq(tunnel
->net
, dev_net(dev
)));
862 #if IS_ENABLED(CONFIG_IPV6)
864 dst_link_failure(skb
);
867 DEV_STATS_INC(dev
, tx_errors
);
870 EXPORT_SYMBOL_GPL(ip_tunnel_xmit
);
872 static void ip_tunnel_update(struct ip_tunnel_net
*itn
,
874 struct net_device
*dev
,
875 struct ip_tunnel_parm_kern
*p
,
879 ip_tunnel_del(itn
, t
);
880 t
->parms
.iph
.saddr
= p
->iph
.saddr
;
881 t
->parms
.iph
.daddr
= p
->iph
.daddr
;
882 t
->parms
.i_key
= p
->i_key
;
883 t
->parms
.o_key
= p
->o_key
;
884 if (dev
->type
!= ARPHRD_ETHER
) {
885 __dev_addr_set(dev
, &p
->iph
.saddr
, 4);
886 memcpy(dev
->broadcast
, &p
->iph
.daddr
, 4);
888 ip_tunnel_add(itn
, t
);
890 t
->parms
.iph
.ttl
= p
->iph
.ttl
;
891 t
->parms
.iph
.tos
= p
->iph
.tos
;
892 t
->parms
.iph
.frag_off
= p
->iph
.frag_off
;
894 if (t
->parms
.link
!= p
->link
|| t
->fwmark
!= fwmark
) {
897 WRITE_ONCE(t
->parms
.link
, p
->link
);
899 mtu
= ip_tunnel_bind_dev(dev
);
901 WRITE_ONCE(dev
->mtu
, mtu
);
903 dst_cache_reset(&t
->dst_cache
);
904 netdev_state_change(dev
);
907 int ip_tunnel_ctl(struct net_device
*dev
, struct ip_tunnel_parm_kern
*p
,
911 struct ip_tunnel
*t
= netdev_priv(dev
);
912 struct net
*net
= t
->net
;
913 struct ip_tunnel_net
*itn
= net_generic(net
, t
->ip_tnl_net_id
);
917 if (dev
== itn
->fb_tunnel_dev
) {
918 t
= ip_tunnel_find(itn
, p
, itn
->fb_tunnel_dev
->type
);
920 t
= netdev_priv(dev
);
922 memcpy(p
, &t
->parms
, sizeof(*p
));
928 if (!ns_capable(net
->user_ns
, CAP_NET_ADMIN
))
931 p
->iph
.frag_off
|= htons(IP_DF
);
932 if (!test_bit(IP_TUNNEL_VTI_BIT
, p
->i_flags
)) {
933 if (!test_bit(IP_TUNNEL_KEY_BIT
, p
->i_flags
))
935 if (!test_bit(IP_TUNNEL_KEY_BIT
, p
->o_flags
))
939 t
= ip_tunnel_find(itn
, p
, itn
->type
);
941 if (cmd
== SIOCADDTUNNEL
) {
943 t
= ip_tunnel_create(net
, itn
, p
);
944 err
= PTR_ERR_OR_ZERO(t
);
951 if (dev
!= itn
->fb_tunnel_dev
&& cmd
== SIOCCHGTUNNEL
) {
958 unsigned int nflags
= 0;
960 if (ipv4_is_multicast(p
->iph
.daddr
))
961 nflags
= IFF_BROADCAST
;
962 else if (p
->iph
.daddr
)
963 nflags
= IFF_POINTOPOINT
;
965 if ((dev
->flags
^nflags
)&(IFF_POINTOPOINT
|IFF_BROADCAST
)) {
970 t
= netdev_priv(dev
);
976 ip_tunnel_update(itn
, t
, dev
, p
, true, 0);
984 if (!ns_capable(net
->user_ns
, CAP_NET_ADMIN
))
987 if (dev
== itn
->fb_tunnel_dev
) {
989 t
= ip_tunnel_find(itn
, p
, itn
->fb_tunnel_dev
->type
);
993 if (t
== netdev_priv(itn
->fb_tunnel_dev
))
997 unregister_netdevice(dev
);
1008 EXPORT_SYMBOL_GPL(ip_tunnel_ctl
);
1010 bool ip_tunnel_parm_from_user(struct ip_tunnel_parm_kern
*kp
,
1011 const void __user
*data
)
1013 struct ip_tunnel_parm p
;
1015 if (copy_from_user(&p
, data
, sizeof(p
)))
1018 strscpy(kp
->name
, p
.name
);
1020 ip_tunnel_flags_from_be16(kp
->i_flags
, p
.i_flags
);
1021 ip_tunnel_flags_from_be16(kp
->o_flags
, p
.o_flags
);
1022 kp
->i_key
= p
.i_key
;
1023 kp
->o_key
= p
.o_key
;
1024 memcpy(&kp
->iph
, &p
.iph
, min(sizeof(kp
->iph
), sizeof(p
.iph
)));
1028 EXPORT_SYMBOL_GPL(ip_tunnel_parm_from_user
);
1030 bool ip_tunnel_parm_to_user(void __user
*data
, struct ip_tunnel_parm_kern
*kp
)
1032 struct ip_tunnel_parm p
;
1034 if (!ip_tunnel_flags_is_be16_compat(kp
->i_flags
) ||
1035 !ip_tunnel_flags_is_be16_compat(kp
->o_flags
))
1038 memset(&p
, 0, sizeof(p
));
1040 strscpy(p
.name
, kp
->name
);
1042 p
.i_flags
= ip_tunnel_flags_to_be16(kp
->i_flags
);
1043 p
.o_flags
= ip_tunnel_flags_to_be16(kp
->o_flags
);
1044 p
.i_key
= kp
->i_key
;
1045 p
.o_key
= kp
->o_key
;
1046 memcpy(&p
.iph
, &kp
->iph
, min(sizeof(p
.iph
), sizeof(kp
->iph
)));
1048 return !copy_to_user(data
, &p
, sizeof(p
));
1050 EXPORT_SYMBOL_GPL(ip_tunnel_parm_to_user
);
1052 int ip_tunnel_siocdevprivate(struct net_device
*dev
, struct ifreq
*ifr
,
1053 void __user
*data
, int cmd
)
1055 struct ip_tunnel_parm_kern p
;
1058 if (!ip_tunnel_parm_from_user(&p
, data
))
1060 err
= dev
->netdev_ops
->ndo_tunnel_ctl(dev
, &p
, cmd
);
1061 if (!err
&& !ip_tunnel_parm_to_user(data
, &p
))
1065 EXPORT_SYMBOL_GPL(ip_tunnel_siocdevprivate
);
1067 int __ip_tunnel_change_mtu(struct net_device
*dev
, int new_mtu
, bool strict
)
1069 struct ip_tunnel
*tunnel
= netdev_priv(dev
);
1070 int t_hlen
= tunnel
->hlen
+ sizeof(struct iphdr
);
1071 int max_mtu
= IP_MAX_MTU
- t_hlen
;
1073 if (dev
->type
== ARPHRD_ETHER
)
1074 max_mtu
-= dev
->hard_header_len
;
1076 if (new_mtu
< ETH_MIN_MTU
)
1079 if (new_mtu
> max_mtu
) {
1086 WRITE_ONCE(dev
->mtu
, new_mtu
);
1089 EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu
);
1091 int ip_tunnel_change_mtu(struct net_device
*dev
, int new_mtu
)
1093 return __ip_tunnel_change_mtu(dev
, new_mtu
, true);
1095 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu
);
1097 static void ip_tunnel_dev_free(struct net_device
*dev
)
1099 struct ip_tunnel
*tunnel
= netdev_priv(dev
);
1101 gro_cells_destroy(&tunnel
->gro_cells
);
1102 dst_cache_destroy(&tunnel
->dst_cache
);
1105 void ip_tunnel_dellink(struct net_device
*dev
, struct list_head
*head
)
1107 struct ip_tunnel
*tunnel
= netdev_priv(dev
);
1108 struct ip_tunnel_net
*itn
;
1110 itn
= net_generic(tunnel
->net
, tunnel
->ip_tnl_net_id
);
1112 if (itn
->fb_tunnel_dev
!= dev
) {
1113 ip_tunnel_del(itn
, netdev_priv(dev
));
1114 unregister_netdevice_queue(dev
, head
);
1117 EXPORT_SYMBOL_GPL(ip_tunnel_dellink
);
1119 struct net
*ip_tunnel_get_link_net(const struct net_device
*dev
)
1121 struct ip_tunnel
*tunnel
= netdev_priv(dev
);
1123 return READ_ONCE(tunnel
->net
);
1125 EXPORT_SYMBOL(ip_tunnel_get_link_net
);
1127 int ip_tunnel_get_iflink(const struct net_device
*dev
)
1129 const struct ip_tunnel
*tunnel
= netdev_priv(dev
);
1131 return READ_ONCE(tunnel
->parms
.link
);
1133 EXPORT_SYMBOL(ip_tunnel_get_iflink
);
1135 int ip_tunnel_init_net(struct net
*net
, unsigned int ip_tnl_net_id
,
1136 struct rtnl_link_ops
*ops
, char *devname
)
1138 struct ip_tunnel_net
*itn
= net_generic(net
, ip_tnl_net_id
);
1139 struct ip_tunnel_parm_kern parms
;
1142 itn
->rtnl_link_ops
= ops
;
1143 for (i
= 0; i
< IP_TNL_HASH_SIZE
; i
++)
1144 INIT_HLIST_HEAD(&itn
->tunnels
[i
]);
1146 if (!ops
|| !net_has_fallback_tunnels(net
)) {
1147 struct ip_tunnel_net
*it_init_net
;
1149 it_init_net
= net_generic(&init_net
, ip_tnl_net_id
);
1150 itn
->type
= it_init_net
->type
;
1151 itn
->fb_tunnel_dev
= NULL
;
1155 memset(&parms
, 0, sizeof(parms
));
1157 strscpy(parms
.name
, devname
, IFNAMSIZ
);
1160 itn
->fb_tunnel_dev
= __ip_tunnel_create(net
, ops
, &parms
);
1161 /* FB netdevice is special: we have one, and only one per netns.
1162 * Allowing to move it to another netns is clearly unsafe.
1164 if (!IS_ERR(itn
->fb_tunnel_dev
)) {
1165 itn
->fb_tunnel_dev
->netns_local
= true;
1166 itn
->fb_tunnel_dev
->mtu
= ip_tunnel_bind_dev(itn
->fb_tunnel_dev
);
1167 ip_tunnel_add(itn
, netdev_priv(itn
->fb_tunnel_dev
));
1168 itn
->type
= itn
->fb_tunnel_dev
->type
;
1172 return PTR_ERR_OR_ZERO(itn
->fb_tunnel_dev
);
1174 EXPORT_SYMBOL_GPL(ip_tunnel_init_net
);
1176 static void ip_tunnel_destroy(struct net
*net
, struct ip_tunnel_net
*itn
,
1177 struct list_head
*head
,
1178 struct rtnl_link_ops
*ops
)
1180 struct net_device
*dev
, *aux
;
1183 for_each_netdev_safe(net
, dev
, aux
)
1184 if (dev
->rtnl_link_ops
== ops
)
1185 unregister_netdevice_queue(dev
, head
);
1187 for (h
= 0; h
< IP_TNL_HASH_SIZE
; h
++) {
1188 struct ip_tunnel
*t
;
1189 struct hlist_node
*n
;
1190 struct hlist_head
*thead
= &itn
->tunnels
[h
];
1192 hlist_for_each_entry_safe(t
, n
, thead
, hash_node
)
1193 /* If dev is in the same netns, it has already
1194 * been added to the list by the previous loop.
1196 if (!net_eq(dev_net(t
->dev
), net
))
1197 unregister_netdevice_queue(t
->dev
, head
);
1201 void ip_tunnel_delete_nets(struct list_head
*net_list
, unsigned int id
,
1202 struct rtnl_link_ops
*ops
,
1203 struct list_head
*dev_to_kill
)
1205 struct ip_tunnel_net
*itn
;
1209 list_for_each_entry(net
, net_list
, exit_list
) {
1210 itn
= net_generic(net
, id
);
1211 ip_tunnel_destroy(net
, itn
, dev_to_kill
, ops
);
1214 EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets
);
1216 int ip_tunnel_newlink(struct net_device
*dev
, struct nlattr
*tb
[],
1217 struct ip_tunnel_parm_kern
*p
, __u32 fwmark
)
1219 struct ip_tunnel
*nt
;
1220 struct net
*net
= dev_net(dev
);
1221 struct ip_tunnel_net
*itn
;
1225 nt
= netdev_priv(dev
);
1226 itn
= net_generic(net
, nt
->ip_tnl_net_id
);
1228 if (nt
->collect_md
) {
1229 if (rtnl_dereference(itn
->collect_md_tun
))
1232 if (ip_tunnel_find(itn
, p
, dev
->type
))
1238 nt
->fwmark
= fwmark
;
1239 err
= register_netdevice(dev
);
1241 goto err_register_netdevice
;
1243 if (dev
->type
== ARPHRD_ETHER
&& !tb
[IFLA_ADDRESS
])
1244 eth_hw_addr_random(dev
);
1246 mtu
= ip_tunnel_bind_dev(dev
);
1248 unsigned int max
= IP_MAX_MTU
- (nt
->hlen
+ sizeof(struct iphdr
));
1250 if (dev
->type
== ARPHRD_ETHER
)
1251 max
-= dev
->hard_header_len
;
1253 mtu
= clamp(dev
->mtu
, (unsigned int)ETH_MIN_MTU
, max
);
1256 err
= dev_set_mtu(dev
, mtu
);
1258 goto err_dev_set_mtu
;
1260 ip_tunnel_add(itn
, nt
);
1264 unregister_netdevice(dev
);
1265 err_register_netdevice
:
1268 EXPORT_SYMBOL_GPL(ip_tunnel_newlink
);
1270 int ip_tunnel_changelink(struct net_device
*dev
, struct nlattr
*tb
[],
1271 struct ip_tunnel_parm_kern
*p
, __u32 fwmark
)
1273 struct ip_tunnel
*t
;
1274 struct ip_tunnel
*tunnel
= netdev_priv(dev
);
1275 struct net
*net
= tunnel
->net
;
1276 struct ip_tunnel_net
*itn
= net_generic(net
, tunnel
->ip_tnl_net_id
);
1278 if (dev
== itn
->fb_tunnel_dev
)
1281 t
= ip_tunnel_find(itn
, p
, dev
->type
);
1289 if (dev
->type
!= ARPHRD_ETHER
) {
1290 unsigned int nflags
= 0;
1292 if (ipv4_is_multicast(p
->iph
.daddr
))
1293 nflags
= IFF_BROADCAST
;
1294 else if (p
->iph
.daddr
)
1295 nflags
= IFF_POINTOPOINT
;
1297 if ((dev
->flags
^ nflags
) &
1298 (IFF_POINTOPOINT
| IFF_BROADCAST
))
1303 ip_tunnel_update(itn
, t
, dev
, p
, !tb
[IFLA_MTU
], fwmark
);
1306 EXPORT_SYMBOL_GPL(ip_tunnel_changelink
);
1308 int ip_tunnel_init(struct net_device
*dev
)
1310 struct ip_tunnel
*tunnel
= netdev_priv(dev
);
1311 struct iphdr
*iph
= &tunnel
->parms
.iph
;
1314 dev
->needs_free_netdev
= true;
1315 dev
->priv_destructor
= ip_tunnel_dev_free
;
1316 dev
->pcpu_stat_type
= NETDEV_PCPU_STAT_TSTATS
;
1318 err
= dst_cache_init(&tunnel
->dst_cache
, GFP_KERNEL
);
1322 err
= gro_cells_init(&tunnel
->gro_cells
, dev
);
1324 dst_cache_destroy(&tunnel
->dst_cache
);
1329 tunnel
->net
= dev_net(dev
);
1330 strscpy(tunnel
->parms
.name
, dev
->name
);
1334 if (tunnel
->collect_md
)
1335 netif_keep_dst(dev
);
1336 netdev_lockdep_set_classes(dev
);
1339 EXPORT_SYMBOL_GPL(ip_tunnel_init
);
1341 void ip_tunnel_uninit(struct net_device
*dev
)
1343 struct ip_tunnel
*tunnel
= netdev_priv(dev
);
1344 struct net
*net
= tunnel
->net
;
1345 struct ip_tunnel_net
*itn
;
1347 itn
= net_generic(net
, tunnel
->ip_tnl_net_id
);
1348 ip_tunnel_del(itn
, netdev_priv(dev
));
1349 if (itn
->fb_tunnel_dev
== dev
)
1350 WRITE_ONCE(itn
->fb_tunnel_dev
, NULL
);
1352 dst_cache_reset(&tunnel
->dst_cache
);
1354 EXPORT_SYMBOL_GPL(ip_tunnel_uninit
);
1356 /* Do least required initialization, rest of init is done in tunnel_init call */
1357 void ip_tunnel_setup(struct net_device
*dev
, unsigned int net_id
)
1359 struct ip_tunnel
*tunnel
= netdev_priv(dev
);
1360 tunnel
->ip_tnl_net_id
= net_id
;
1362 EXPORT_SYMBOL_GPL(ip_tunnel_setup
);
1364 MODULE_DESCRIPTION("IPv4 tunnel implementation library");
1365 MODULE_LICENSE("GPL");