dm writecache: fix incorrect flush sequence when doing SSD mode commit
[linux/fpc-iii.git] / net / ipv4 / ip_tunnel.c
blob0fe2a5d3e258fb4220155690016d28761fdde14c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Copyright (c) 2013 Nicira, Inc.
4 */
6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
8 #include <linux/capability.h>
9 #include <linux/module.h>
10 #include <linux/types.h>
11 #include <linux/kernel.h>
12 #include <linux/slab.h>
13 #include <linux/uaccess.h>
14 #include <linux/skbuff.h>
15 #include <linux/netdevice.h>
16 #include <linux/in.h>
17 #include <linux/tcp.h>
18 #include <linux/udp.h>
19 #include <linux/if_arp.h>
20 #include <linux/init.h>
21 #include <linux/in6.h>
22 #include <linux/inetdevice.h>
23 #include <linux/igmp.h>
24 #include <linux/netfilter_ipv4.h>
25 #include <linux/etherdevice.h>
26 #include <linux/if_ether.h>
27 #include <linux/if_vlan.h>
28 #include <linux/rculist.h>
29 #include <linux/err.h>
31 #include <net/sock.h>
32 #include <net/ip.h>
33 #include <net/icmp.h>
34 #include <net/protocol.h>
35 #include <net/ip_tunnels.h>
36 #include <net/arp.h>
37 #include <net/checksum.h>
38 #include <net/dsfield.h>
39 #include <net/inet_ecn.h>
40 #include <net/xfrm.h>
41 #include <net/net_namespace.h>
42 #include <net/netns/generic.h>
43 #include <net/rtnetlink.h>
44 #include <net/udp.h>
45 #include <net/dst_metadata.h>
47 #if IS_ENABLED(CONFIG_IPV6)
48 #include <net/ipv6.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
51 #endif
53 static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
55 return hash_32((__force u32)key ^ (__force u32)remote,
56 IP_TNL_HASH_BITS);
59 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
60 __be16 flags, __be32 key)
62 if (p->i_flags & TUNNEL_KEY) {
63 if (flags & TUNNEL_KEY)
64 return key == p->i_key;
65 else
66 /* key expected, none present */
67 return false;
68 } else
69 return !(flags & TUNNEL_KEY);
72 /* Fallback tunnel: no source, no destination, no key, no options
74 Tunnel hash table:
75 We require exact key match i.e. if a key is present in packet
76 it will match only tunnel with the same key; if it is not present,
77 it will match only keyless tunnel.
79 All keysless packets, if not matched configured keyless tunnels
80 will match fallback tunnel.
81 Given src, dst and key, find appropriate for input tunnel.
83 struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
84 int link, __be16 flags,
85 __be32 remote, __be32 local,
86 __be32 key)
88 unsigned int hash;
89 struct ip_tunnel *t, *cand = NULL;
90 struct hlist_head *head;
92 hash = ip_tunnel_hash(key, remote);
93 head = &itn->tunnels[hash];
95 hlist_for_each_entry_rcu(t, head, hash_node) {
96 if (local != t->parms.iph.saddr ||
97 remote != t->parms.iph.daddr ||
98 !(t->dev->flags & IFF_UP))
99 continue;
101 if (!ip_tunnel_key_match(&t->parms, flags, key))
102 continue;
104 if (t->parms.link == link)
105 return t;
106 else
107 cand = t;
110 hlist_for_each_entry_rcu(t, head, hash_node) {
111 if (remote != t->parms.iph.daddr ||
112 t->parms.iph.saddr != 0 ||
113 !(t->dev->flags & IFF_UP))
114 continue;
116 if (!ip_tunnel_key_match(&t->parms, flags, key))
117 continue;
119 if (t->parms.link == link)
120 return t;
121 else if (!cand)
122 cand = t;
125 hash = ip_tunnel_hash(key, 0);
126 head = &itn->tunnels[hash];
128 hlist_for_each_entry_rcu(t, head, hash_node) {
129 if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
130 (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
131 continue;
133 if (!(t->dev->flags & IFF_UP))
134 continue;
136 if (!ip_tunnel_key_match(&t->parms, flags, key))
137 continue;
139 if (t->parms.link == link)
140 return t;
141 else if (!cand)
142 cand = t;
145 if (flags & TUNNEL_NO_KEY)
146 goto skip_key_lookup;
148 hlist_for_each_entry_rcu(t, head, hash_node) {
149 if (t->parms.i_key != key ||
150 t->parms.iph.saddr != 0 ||
151 t->parms.iph.daddr != 0 ||
152 !(t->dev->flags & IFF_UP))
153 continue;
155 if (t->parms.link == link)
156 return t;
157 else if (!cand)
158 cand = t;
161 skip_key_lookup:
162 if (cand)
163 return cand;
165 t = rcu_dereference(itn->collect_md_tun);
166 if (t && t->dev->flags & IFF_UP)
167 return t;
169 if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
170 return netdev_priv(itn->fb_tunnel_dev);
172 return NULL;
174 EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
176 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
177 struct ip_tunnel_parm *parms)
179 unsigned int h;
180 __be32 remote;
181 __be32 i_key = parms->i_key;
183 if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
184 remote = parms->iph.daddr;
185 else
186 remote = 0;
188 if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
189 i_key = 0;
191 h = ip_tunnel_hash(i_key, remote);
192 return &itn->tunnels[h];
195 static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
197 struct hlist_head *head = ip_bucket(itn, &t->parms);
199 if (t->collect_md)
200 rcu_assign_pointer(itn->collect_md_tun, t);
201 hlist_add_head_rcu(&t->hash_node, head);
204 static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
206 if (t->collect_md)
207 rcu_assign_pointer(itn->collect_md_tun, NULL);
208 hlist_del_init_rcu(&t->hash_node);
211 static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
212 struct ip_tunnel_parm *parms,
213 int type)
215 __be32 remote = parms->iph.daddr;
216 __be32 local = parms->iph.saddr;
217 __be32 key = parms->i_key;
218 __be16 flags = parms->i_flags;
219 int link = parms->link;
220 struct ip_tunnel *t = NULL;
221 struct hlist_head *head = ip_bucket(itn, parms);
223 hlist_for_each_entry_rcu(t, head, hash_node) {
224 if (local == t->parms.iph.saddr &&
225 remote == t->parms.iph.daddr &&
226 link == t->parms.link &&
227 type == t->dev->type &&
228 ip_tunnel_key_match(&t->parms, flags, key))
229 break;
231 return t;
234 static struct net_device *__ip_tunnel_create(struct net *net,
235 const struct rtnl_link_ops *ops,
236 struct ip_tunnel_parm *parms)
238 int err;
239 struct ip_tunnel *tunnel;
240 struct net_device *dev;
241 char name[IFNAMSIZ];
243 err = -E2BIG;
244 if (parms->name[0]) {
245 if (!dev_valid_name(parms->name))
246 goto failed;
247 strlcpy(name, parms->name, IFNAMSIZ);
248 } else {
249 if (strlen(ops->kind) > (IFNAMSIZ - 3))
250 goto failed;
251 strcpy(name, ops->kind);
252 strcat(name, "%d");
255 ASSERT_RTNL();
256 dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
257 if (!dev) {
258 err = -ENOMEM;
259 goto failed;
261 dev_net_set(dev, net);
263 dev->rtnl_link_ops = ops;
265 tunnel = netdev_priv(dev);
266 tunnel->parms = *parms;
267 tunnel->net = net;
269 err = register_netdevice(dev);
270 if (err)
271 goto failed_free;
273 return dev;
275 failed_free:
276 free_netdev(dev);
277 failed:
278 return ERR_PTR(err);
281 static int ip_tunnel_bind_dev(struct net_device *dev)
283 struct net_device *tdev = NULL;
284 struct ip_tunnel *tunnel = netdev_priv(dev);
285 const struct iphdr *iph;
286 int hlen = LL_MAX_HEADER;
287 int mtu = ETH_DATA_LEN;
288 int t_hlen = tunnel->hlen + sizeof(struct iphdr);
290 iph = &tunnel->parms.iph;
292 /* Guess output device to choose reasonable mtu and needed_headroom */
293 if (iph->daddr) {
294 struct flowi4 fl4;
295 struct rtable *rt;
297 ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
298 iph->saddr, tunnel->parms.o_key,
299 RT_TOS(iph->tos), tunnel->parms.link,
300 tunnel->fwmark, 0);
301 rt = ip_route_output_key(tunnel->net, &fl4);
303 if (!IS_ERR(rt)) {
304 tdev = rt->dst.dev;
305 ip_rt_put(rt);
307 if (dev->type != ARPHRD_ETHER)
308 dev->flags |= IFF_POINTOPOINT;
310 dst_cache_reset(&tunnel->dst_cache);
313 if (!tdev && tunnel->parms.link)
314 tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
316 if (tdev) {
317 hlen = tdev->hard_header_len + tdev->needed_headroom;
318 mtu = min(tdev->mtu, IP_MAX_MTU);
321 dev->needed_headroom = t_hlen + hlen;
322 mtu -= (dev->hard_header_len + t_hlen);
324 if (mtu < IPV4_MIN_MTU)
325 mtu = IPV4_MIN_MTU;
327 return mtu;
330 static struct ip_tunnel *ip_tunnel_create(struct net *net,
331 struct ip_tunnel_net *itn,
332 struct ip_tunnel_parm *parms)
334 struct ip_tunnel *nt;
335 struct net_device *dev;
336 int t_hlen;
337 int mtu;
338 int err;
340 dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
341 if (IS_ERR(dev))
342 return ERR_CAST(dev);
344 mtu = ip_tunnel_bind_dev(dev);
345 err = dev_set_mtu(dev, mtu);
346 if (err)
347 goto err_dev_set_mtu;
349 nt = netdev_priv(dev);
350 t_hlen = nt->hlen + sizeof(struct iphdr);
351 dev->min_mtu = ETH_MIN_MTU;
352 dev->max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen;
353 ip_tunnel_add(itn, nt);
354 return nt;
356 err_dev_set_mtu:
357 unregister_netdevice(dev);
358 return ERR_PTR(err);
/*
 * ip_tunnel_rcv - validate a decapsulated packet and deliver it to the tunnel device
 * @tunnel: tunnel the packet was matched to
 * @skb: received packet
 * @tpi: parsed tunnel header info; its flags and seq fields are checked here
 * @tun_dst: optional metadata dst; attached to @skb on success, released on drop
 * @log_ecn_error: when true, emit a rate-limited log line if ECN decapsulation
 *                 reports an error
 *
 * The packet is dropped, and the matching error counters are bumped, when
 * its checksum flag disagrees with the tunnel configuration, when a required
 * sequence number is missing or older than expected, or when
 * IP_ECN_decapsulate() returns a value greater than 1.
 *
 * Always returns 0; @skb is either handed to gro_cells_receive() or freed.
 */
361 int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
362 const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
363 bool log_ecn_error)
365 struct pcpu_sw_netstats *tstats;
366 const struct iphdr *iph = ip_hdr(skb);
367 int err;
369 #ifdef CONFIG_NET_IPGRE_BROADCAST
370 if (ipv4_is_multicast(iph->daddr)) {
371 tunnel->dev->stats.multicast++;
372 skb->pkt_type = PACKET_BROADCAST;
374 #endif
/* Checksum presence must be the same in the packet and in the tunnel config. */
376 if ((!(tpi->flags&TUNNEL_CSUM) && (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
377 ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
378 tunnel->dev->stats.rx_crc_errors++;
379 tunnel->dev->stats.rx_errors++;
380 goto drop;
/* Sequencing enabled: reject packets without a sequence number or behind i_seqno. */
383 if (tunnel->parms.i_flags&TUNNEL_SEQ) {
384 if (!(tpi->flags&TUNNEL_SEQ) ||
385 (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
386 tunnel->dev->stats.rx_fifo_errors++;
387 tunnel->dev->stats.rx_errors++;
388 goto drop;
/* Remember the next sequence number we expect. */
390 tunnel->i_seqno = ntohl(tpi->seq) + 1;
393 skb_reset_network_header(skb);
395 err = IP_ECN_decapsulate(iph, skb);
396 if (unlikely(err)) {
397 if (log_ecn_error)
398 net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
399 &iph->saddr, iph->tos);
/* Only error values above 1 cause a drop; a value of 1 is merely logged. */
400 if (err > 1) {
401 ++tunnel->dev->stats.rx_frame_errors;
402 ++tunnel->dev->stats.rx_errors;
403 goto drop;
/* Account the packet in this CPU's rx counters. */
407 tstats = this_cpu_ptr(tunnel->dev->tstats);
408 u64_stats_update_begin(&tstats->syncp);
409 tstats->rx_packets++;
410 tstats->rx_bytes += skb->len;
411 u64_stats_update_end(&tstats->syncp);
/* Second argument is true when the tunnel's netns differs from the device's netns. */
413 skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
415 if (tunnel->dev->type == ARPHRD_ETHER) {
416 skb->protocol = eth_type_trans(skb, tunnel->dev);
417 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
418 } else {
419 skb->dev = tunnel->dev;
422 if (tun_dst)
423 skb_dst_set(skb, (struct dst_entry *)tun_dst);
425 gro_cells_receive(&tunnel->gro_cells, skb);
426 return 0;
/* Drop path: release the metadata dst (if any) and free the packet. */
428 drop:
429 if (tun_dst)
430 dst_release((struct dst_entry *)tun_dst);
431 kfree_skb(skb);
432 return 0;
434 EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
436 int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
437 unsigned int num)
439 if (num >= MAX_IPTUN_ENCAP_OPS)
440 return -ERANGE;
442 return !cmpxchg((const struct ip_tunnel_encap_ops **)
443 &iptun_encaps[num],
444 NULL, ops) ? 0 : -1;
446 EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
448 int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
449 unsigned int num)
451 int ret;
453 if (num >= MAX_IPTUN_ENCAP_OPS)
454 return -ERANGE;
456 ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
457 &iptun_encaps[num],
458 ops, NULL) == ops) ? 0 : -1;
460 synchronize_net();
462 return ret;
464 EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
466 int ip_tunnel_encap_setup(struct ip_tunnel *t,
467 struct ip_tunnel_encap *ipencap)
469 int hlen;
471 memset(&t->encap, 0, sizeof(t->encap));
473 hlen = ip_encap_hlen(ipencap);
474 if (hlen < 0)
475 return hlen;
477 t->encap.type = ipencap->type;
478 t->encap.sport = ipencap->sport;
479 t->encap.dport = ipencap->dport;
480 t->encap.flags = ipencap->flags;
482 t->encap_hlen = hlen;
483 t->hlen = t->encap_hlen + t->tun_hlen;
485 return 0;
487 EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
/*
 * tnl_update_pmtu - compute the tunnel path MTU and signal oversized packets
 * @dev: tunnel device
 * @skb: packet about to be transmitted
 * @rt: route to the tunnel endpoint
 * @df: outer DF setting; when non-zero the MTU is derived from @rt
 * @inner_iph: inner IPv4 header (only consulted for ETH_P_IP packets)
 * @tunnel_hlen: tunnel header length; used only when @md is true
 * @dst: tunnel destination; used only when @md is true
 * @md: true for metadata based transmission; otherwise tunnel->hlen and
 *      the configured tunnel daddr are used instead of @tunnel_hlen/@dst
 *
 * If @skb has a valid dst, its path MTU is updated with the computed value.
 *
 * Returns 0, or -E2BIG after sending ICMP "fragmentation needed" (IPv4
 * with inner DF set) or ICMPv6 "packet too big" for a non-GSO packet that
 * exceeds the computed MTU.
 */
489 static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
490 struct rtable *rt, __be16 df,
491 const struct iphdr *inner_iph,
492 int tunnel_hlen, __be32 dst, bool md)
494 struct ip_tunnel *tunnel = netdev_priv(dev);
495 int pkt_size;
496 int mtu;
498 tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
/* Payload size: skb length without the tunnel and link-layer headers. */
499 pkt_size = skb->len - tunnel_hlen - dev->hard_header_len;
/* With DF the route MTU minus all encapsulation overhead applies. */
501 if (df)
502 mtu = dst_mtu(&rt->dst) - dev->hard_header_len
503 - sizeof(struct iphdr) - tunnel_hlen;
504 else
505 mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
507 if (skb_valid_dst(skb))
508 skb_dst_update_pmtu_no_confirm(skb, mtu);
510 if (skb->protocol == htons(ETH_P_IP)) {
511 if (!skb_is_gso(skb) &&
512 (inner_iph->frag_off & htons(IP_DF)) &&
513 mtu < pkt_size) {
/* Clear the IP control block before handing the skb to icmp_send(). */
514 memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
515 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
516 return -E2BIG;
519 #if IS_ENABLED(CONFIG_IPV6)
520 else if (skb->protocol == htons(ETH_P_IPV6)) {
521 struct rt6_info *rt6;
522 __be32 daddr;
524 rt6 = skb_valid_dst(skb) ? (struct rt6_info *)skb_dst(skb) :
525 NULL;
526 daddr = md ? dst : tunnel->parms.iph.daddr;
/* Record a smaller MTU on the inner IPv6 route when the tunnel has a unicast peer or the route is a /128. */
528 if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
529 mtu >= IPV6_MIN_MTU) {
530 if ((daddr && !ipv4_is_multicast(daddr)) ||
531 rt6->rt6i_dst.plen == 128) {
532 rt6->rt6i_flags |= RTF_MODIFIED;
533 dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
537 if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
538 mtu < pkt_size) {
539 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
540 return -E2BIG;
543 #endif
544 return 0;
/*
 * ip_md_tunnel_xmit - transmit @skb using the tunnel metadata attached to it
 * @skb: packet to send; must carry TX tunnel info for AF_INET
 * @dev: tunnel device
 * @proto: IP protocol number for the outer header
 * @tunnel_hlen: tunnel header length, forwarded to tnl_update_pmtu()
 *
 * Outer addresses, TOS, TTL and DF are taken from the tunnel key attached
 * to @skb.  A key TOS of 1 means "copy TOS from the inner header"; a key
 * TTL of 0 means "copy TTL/hop limit from the inner header", falling back
 * to the route's hop limit for other protocols.
 *
 * Tunnels with an encapsulation type other than TUNNEL_ENCAP_NONE are
 * rejected.  On any failure the packet is freed and tx_errors (or
 * tx_dropped when headroom cannot be made) is incremented.
 */
547 void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
548 u8 proto, int tunnel_hlen)
550 struct ip_tunnel *tunnel = netdev_priv(dev);
551 u32 headroom = sizeof(struct iphdr);
552 struct ip_tunnel_info *tun_info;
553 const struct ip_tunnel_key *key;
554 const struct iphdr *inner_iph;
555 struct rtable *rt = NULL;
556 struct flowi4 fl4;
557 __be16 df = 0;
558 u8 tos, ttl;
559 bool use_cache;
561 tun_info = skb_tunnel_info(skb);
562 if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
563 ip_tunnel_info_af(tun_info) != AF_INET))
564 goto tx_error;
565 key = &tun_info->key;
566 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
567 inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
568 tos = key->tos;
/* TOS value 1 requests inheriting the inner packet's TOS / DS field. */
569 if (tos == 1) {
570 if (skb->protocol == htons(ETH_P_IP))
571 tos = inner_iph->tos;
572 else if (skb->protocol == htons(ETH_P_IPV6))
573 tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
575 ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
576 tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
577 0, skb->mark, skb_get_hash(skb));
578 if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
579 goto tx_error;
/* Try the dst cache attached to the tunnel info before doing a route lookup. */
581 use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
582 if (use_cache)
583 rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr);
584 if (!rt) {
585 rt = ip_route_output_key(tunnel->net, &fl4);
586 if (IS_ERR(rt)) {
587 dev->stats.tx_carrier_errors++;
588 goto tx_error;
590 if (use_cache)
591 dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
592 fl4.saddr);
/* The route leads back to this tunnel device: count a collision and drop. */
594 if (rt->dst.dev == dev) {
595 ip_rt_put(rt);
596 dev->stats.collisions++;
597 goto tx_error;
600 if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
601 df = htons(IP_DF);
602 if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
603 key->u.ipv4.dst, true)) {
604 ip_rt_put(rt);
605 goto tx_error;
608 tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
609 ttl = key->ttl;
/* TTL 0 requests inheriting from the inner header, or the route default otherwise. */
610 if (ttl == 0) {
611 if (skb->protocol == htons(ETH_P_IP))
612 ttl = inner_iph->ttl;
613 else if (skb->protocol == htons(ETH_P_IPV6))
614 ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
615 else
616 ttl = ip4_dst_hoplimit(&rt->dst);
/* Without an explicit DF request, copy DF from an inner IPv4 header. */
619 if (!df && skb->protocol == htons(ETH_P_IP))
620 df = inner_iph->frag_off & htons(IP_DF);
/* needed_headroom only ever grows here. */
622 headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
623 if (headroom > dev->needed_headroom)
624 dev->needed_headroom = headroom;
626 if (skb_cow_head(skb, dev->needed_headroom)) {
627 ip_rt_put(rt);
628 goto tx_dropped;
630 iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
631 df, !net_eq(tunnel->net, dev_net(dev)));
632 return;
633 tx_error:
634 dev->stats.tx_errors++;
635 goto kfree;
636 tx_dropped:
637 dev->stats.tx_dropped++;
638 kfree:
639 kfree_skb(skb);
641 EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
/*
 * ip_tunnel_xmit - transmit @skb through a configured tunnel
 * @skb: packet to send
 * @dev: tunnel device
 * @tnl_params: outer IPv4 header template; daddr, saddr, tos, ttl and
 *              frag_off are read from it
 * @protocol: IP protocol number for the outer header
 *
 * When @tnl_params->daddr is zero (NBMA tunnel) the destination is taken,
 * in order of preference, from TX tunnel metadata attached to @skb, from
 * the next hop of the packet's IPv4 route, or from an IPv4-compatible IPv6
 * neighbour/destination address.
 *
 * On failure the packet is freed and tx_errors (or tx_dropped when the
 * headroom cannot be made) is incremented; nothing is returned.
 */
643 void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
644 const struct iphdr *tnl_params, u8 protocol)
646 struct ip_tunnel *tunnel = netdev_priv(dev);
647 struct ip_tunnel_info *tun_info = NULL;
648 const struct iphdr *inner_iph;
649 unsigned int max_headroom; /* The extra header space needed */
650 struct rtable *rt = NULL; /* Route to the other host */
651 bool use_cache = false;
652 struct flowi4 fl4;
653 bool md = false;
654 bool connected;
655 u8 tos, ttl;
656 __be32 dst;
657 __be16 df;
659 inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
/* 'connected' decides below whether a cached route may be used and stored. */
660 connected = (tunnel->parms.iph.daddr != 0);
662 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
664 dst = tnl_params->daddr;
665 if (dst == 0) {
666 /* NBMA tunnel */
668 if (!skb_dst(skb)) {
669 dev->stats.tx_fifo_errors++;
670 goto tx_error;
/* Prefer an explicit IPv4 destination from TX tunnel metadata. */
673 tun_info = skb_tunnel_info(skb);
674 if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
675 ip_tunnel_info_af(tun_info) == AF_INET &&
676 tun_info->key.u.ipv4.dst) {
677 dst = tun_info->key.u.ipv4.dst;
678 md = true;
679 connected = true;
/* Otherwise use the next hop of the inner IPv4 packet's route. */
681 else if (skb->protocol == htons(ETH_P_IP)) {
682 rt = skb_rtable(skb);
683 dst = rt_nexthop(rt, inner_iph->daddr);
685 #if IS_ENABLED(CONFIG_IPV6)
686 else if (skb->protocol == htons(ETH_P_IPV6)) {
687 const struct in6_addr *addr6;
688 struct neighbour *neigh;
689 bool do_tx_error_icmp;
690 int addr_type;
692 neigh = dst_neigh_lookup(skb_dst(skb),
693 &ipv6_hdr(skb)->daddr);
694 if (!neigh)
695 goto tx_error;
697 addr6 = (const struct in6_addr *)&neigh->primary_key;
698 addr_type = ipv6_addr_type(addr6);
/* Unspecified neighbour address: fall back to the packet's own destination. */
700 if (addr_type == IPV6_ADDR_ANY) {
701 addr6 = &ipv6_hdr(skb)->daddr;
702 addr_type = ipv6_addr_type(addr6);
/* Only IPv4-compatible addresses yield a tunnel destination (their last 32 bits). */
705 if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
706 do_tx_error_icmp = true;
707 else {
708 do_tx_error_icmp = false;
709 dst = addr6->s6_addr32[3];
711 neigh_release(neigh);
712 if (do_tx_error_icmp)
713 goto tx_error_icmp;
715 #endif
716 else
717 goto tx_error;
719 if (!md)
720 connected = false;
723 tos = tnl_params->tos;
/* Low TOS bit set: inherit TOS from the inner header, which also disables route caching. */
724 if (tos & 0x1) {
725 tos &= ~0x1;
726 if (skb->protocol == htons(ETH_P_IP)) {
727 tos = inner_iph->tos;
728 connected = false;
729 } else if (skb->protocol == htons(ETH_P_IPV6)) {
730 tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
731 connected = false;
735 ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
736 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
737 tunnel->fwmark, skb_get_hash(skb));
739 if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
740 goto tx_error;
/* Metadata destinations use the cache in tun_info; configured ones use the per-tunnel cache. */
742 if (connected && md) {
743 use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
744 if (use_cache)
745 rt = dst_cache_get_ip4(&tun_info->dst_cache,
746 &fl4.saddr);
747 } else {
748 rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache,
749 &fl4.saddr) : NULL;
752 if (!rt) {
753 rt = ip_route_output_key(tunnel->net, &fl4);
755 if (IS_ERR(rt)) {
756 dev->stats.tx_carrier_errors++;
757 goto tx_error;
759 if (use_cache)
760 dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
761 fl4.saddr);
762 else if (!md && connected)
763 dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
764 fl4.saddr);
/* The route leads back to this tunnel device: count a collision and drop. */
767 if (rt->dst.dev == dev) {
768 ip_rt_put(rt);
769 dev->stats.collisions++;
770 goto tx_error;
773 if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph,
774 0, 0, false)) {
775 ip_rt_put(rt);
776 goto tx_error;
/* While recent errors are pending (within IPTUNNEL_ERR_TIMEO), report link failure to the sender. */
779 if (tunnel->err_count > 0) {
780 if (time_before(jiffies,
781 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
782 tunnel->err_count--;
784 dst_link_failure(skb);
785 } else
786 tunnel->err_count = 0;
789 tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
790 ttl = tnl_params->ttl;
/* TTL 0 requests inheriting from the inner header, or the route default otherwise. */
791 if (ttl == 0) {
792 if (skb->protocol == htons(ETH_P_IP))
793 ttl = inner_iph->ttl;
794 #if IS_ENABLED(CONFIG_IPV6)
795 else if (skb->protocol == htons(ETH_P_IPV6))
796 ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
797 #endif
798 else
799 ttl = ip4_dst_hoplimit(&rt->dst);
/* Propagate the inner IPv4 DF bit unless the tunnel is set to ignore it. */
802 df = tnl_params->frag_off;
803 if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
804 df |= (inner_iph->frag_off&htons(IP_DF));
/* needed_headroom only ever grows here. */
806 max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
807 + rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
808 if (max_headroom > dev->needed_headroom)
809 dev->needed_headroom = max_headroom;
811 if (skb_cow_head(skb, dev->needed_headroom)) {
812 ip_rt_put(rt);
813 dev->stats.tx_dropped++;
814 kfree_skb(skb);
815 return;
818 iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
819 df, !net_eq(tunnel->net, dev_net(dev)));
820 return;
822 #if IS_ENABLED(CONFIG_IPV6)
823 tx_error_icmp:
824 dst_link_failure(skb);
825 #endif
826 tx_error:
827 dev->stats.tx_errors++;
828 kfree_skb(skb);
830 EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
832 static void ip_tunnel_update(struct ip_tunnel_net *itn,
833 struct ip_tunnel *t,
834 struct net_device *dev,
835 struct ip_tunnel_parm *p,
836 bool set_mtu,
837 __u32 fwmark)
839 ip_tunnel_del(itn, t);
840 t->parms.iph.saddr = p->iph.saddr;
841 t->parms.iph.daddr = p->iph.daddr;
842 t->parms.i_key = p->i_key;
843 t->parms.o_key = p->o_key;
844 if (dev->type != ARPHRD_ETHER) {
845 memcpy(dev->dev_addr, &p->iph.saddr, 4);
846 memcpy(dev->broadcast, &p->iph.daddr, 4);
848 ip_tunnel_add(itn, t);
850 t->parms.iph.ttl = p->iph.ttl;
851 t->parms.iph.tos = p->iph.tos;
852 t->parms.iph.frag_off = p->iph.frag_off;
854 if (t->parms.link != p->link || t->fwmark != fwmark) {
855 int mtu;
857 t->parms.link = p->link;
858 t->fwmark = fwmark;
859 mtu = ip_tunnel_bind_dev(dev);
860 if (set_mtu)
861 dev->mtu = mtu;
863 dst_cache_reset(&t->dst_cache);
864 netdev_state_change(dev);
/*
 * ip_tunnel_ioctl - handle the SIOC{GET,ADD,CHG,DEL}TUNNEL requests
 * @dev: device the request was issued on
 * @p: tunnel parameters; input for ADD/CHG/DEL (and for GET on the fallback
 *     device), overwritten with the tunnel's parameters by GET
 * @cmd: one of SIOCGETTUNNEL, SIOCADDTUNNEL, SIOCCHGTUNNEL, SIOCDELTUNNEL
 *
 * ADD, CHG and DEL require CAP_NET_ADMIN in the user namespace of the
 * tunnel's netns.
 *
 * Returns 0 on success or a negative errno: -EPERM (missing capability, or
 * an attempt to delete the fallback tunnel itself), -EEXIST (parameters
 * clash with another tunnel), -EINVAL (unknown @cmd, or a change that would
 * alter the device's point-to-point/broadcast nature), -ENOENT (no matching
 * tunnel), or the error from ip_tunnel_create().
 */
867 int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
869 int err = 0;
870 struct ip_tunnel *t = netdev_priv(dev);
871 struct net *net = t->net;
872 struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);
874 switch (cmd) {
875 case SIOCGETTUNNEL:
/* On the fallback device, GET looks up the tunnel described by @p; otherwise it reports @dev itself. */
876 if (dev == itn->fb_tunnel_dev) {
877 t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
878 if (!t)
879 t = netdev_priv(dev);
881 memcpy(p, &t->parms, sizeof(*p));
882 break;
884 case SIOCADDTUNNEL:
885 case SIOCCHGTUNNEL:
886 err = -EPERM;
887 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
888 goto done;
/* A fixed TTL forces the DF bit in the outer header template. */
889 if (p->iph.ttl)
890 p->iph.frag_off |= htons(IP_DF);
/* For non-VTI tunnels, ignore key values whose TUNNEL_KEY flag is not set. */
891 if (!(p->i_flags & VTI_ISVTI)) {
892 if (!(p->i_flags & TUNNEL_KEY))
893 p->i_key = 0;
894 if (!(p->o_flags & TUNNEL_KEY))
895 p->o_key = 0;
898 t = ip_tunnel_find(itn, p, itn->type);
900 if (cmd == SIOCADDTUNNEL) {
901 if (!t) {
902 t = ip_tunnel_create(net, itn, p);
903 err = PTR_ERR_OR_ZERO(t);
904 break;
907 err = -EEXIST;
908 break;
910 if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
911 if (t) {
/* The new parameters already belong to a different device. */
912 if (t->dev != dev) {
913 err = -EEXIST;
914 break;
916 } else {
917 unsigned int nflags = 0;
919 if (ipv4_is_multicast(p->iph.daddr))
920 nflags = IFF_BROADCAST;
921 else if (p->iph.daddr)
922 nflags = IFF_POINTOPOINT;
/* Refuse changes that would flip the device between point-to-point and broadcast. */
924 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
925 err = -EINVAL;
926 break;
929 t = netdev_priv(dev);
933 if (t) {
934 err = 0;
935 ip_tunnel_update(itn, t, dev, p, true, 0);
936 } else {
937 err = -ENOENT;
939 break;
941 case SIOCDELTUNNEL:
942 err = -EPERM;
943 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
944 goto done;
/* Via the fallback device, DEL removes the tunnel described by @p, but never the fallback tunnel itself. */
946 if (dev == itn->fb_tunnel_dev) {
947 err = -ENOENT;
948 t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
949 if (!t)
950 goto done;
951 err = -EPERM;
952 if (t == netdev_priv(itn->fb_tunnel_dev))
953 goto done;
954 dev = t->dev;
956 unregister_netdevice(dev);
957 err = 0;
958 break;
960 default:
961 err = -EINVAL;
964 done:
965 return err;
967 EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
969 int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
971 struct ip_tunnel *tunnel = netdev_priv(dev);
972 int t_hlen = tunnel->hlen + sizeof(struct iphdr);
973 int max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen;
975 if (new_mtu < ETH_MIN_MTU)
976 return -EINVAL;
978 if (new_mtu > max_mtu) {
979 if (strict)
980 return -EINVAL;
982 new_mtu = max_mtu;
985 dev->mtu = new_mtu;
986 return 0;
988 EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
990 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
992 return __ip_tunnel_change_mtu(dev, new_mtu, true);
994 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
996 static void ip_tunnel_dev_free(struct net_device *dev)
998 struct ip_tunnel *tunnel = netdev_priv(dev);
1000 gro_cells_destroy(&tunnel->gro_cells);
1001 dst_cache_destroy(&tunnel->dst_cache);
1002 free_percpu(dev->tstats);
1005 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
1007 struct ip_tunnel *tunnel = netdev_priv(dev);
1008 struct ip_tunnel_net *itn;
1010 itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
1012 if (itn->fb_tunnel_dev != dev) {
1013 ip_tunnel_del(itn, netdev_priv(dev));
1014 unregister_netdevice_queue(dev, head);
1017 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
1019 struct net *ip_tunnel_get_link_net(const struct net_device *dev)
1021 struct ip_tunnel *tunnel = netdev_priv(dev);
1023 return tunnel->net;
1025 EXPORT_SYMBOL(ip_tunnel_get_link_net);
1027 int ip_tunnel_get_iflink(const struct net_device *dev)
1029 struct ip_tunnel *tunnel = netdev_priv(dev);
1031 return tunnel->parms.link;
1033 EXPORT_SYMBOL(ip_tunnel_get_iflink);
1035 int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
1036 struct rtnl_link_ops *ops, char *devname)
1038 struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
1039 struct ip_tunnel_parm parms;
1040 unsigned int i;
1042 itn->rtnl_link_ops = ops;
1043 for (i = 0; i < IP_TNL_HASH_SIZE; i++)
1044 INIT_HLIST_HEAD(&itn->tunnels[i]);
1046 if (!ops || !net_has_fallback_tunnels(net)) {
1047 struct ip_tunnel_net *it_init_net;
1049 it_init_net = net_generic(&init_net, ip_tnl_net_id);
1050 itn->type = it_init_net->type;
1051 itn->fb_tunnel_dev = NULL;
1052 return 0;
1055 memset(&parms, 0, sizeof(parms));
1056 if (devname)
1057 strlcpy(parms.name, devname, IFNAMSIZ);
1059 rtnl_lock();
1060 itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
1061 /* FB netdevice is special: we have one, and only one per netns.
1062 * Allowing to move it to another netns is clearly unsafe.
1064 if (!IS_ERR(itn->fb_tunnel_dev)) {
1065 itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
1066 itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
1067 ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
1068 itn->type = itn->fb_tunnel_dev->type;
1070 rtnl_unlock();
1072 return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
1074 EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
1076 static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
1077 struct list_head *head,
1078 struct rtnl_link_ops *ops)
1080 struct net_device *dev, *aux;
1081 int h;
1083 for_each_netdev_safe(net, dev, aux)
1084 if (dev->rtnl_link_ops == ops)
1085 unregister_netdevice_queue(dev, head);
1087 for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
1088 struct ip_tunnel *t;
1089 struct hlist_node *n;
1090 struct hlist_head *thead = &itn->tunnels[h];
1092 hlist_for_each_entry_safe(t, n, thead, hash_node)
1093 /* If dev is in the same netns, it has already
1094 * been added to the list by the previous loop.
1096 if (!net_eq(dev_net(t->dev), net))
1097 unregister_netdevice_queue(t->dev, head);
1101 void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
1102 struct rtnl_link_ops *ops)
1104 struct ip_tunnel_net *itn;
1105 struct net *net;
1106 LIST_HEAD(list);
1108 rtnl_lock();
1109 list_for_each_entry(net, net_list, exit_list) {
1110 itn = net_generic(net, id);
1111 ip_tunnel_destroy(net, itn, &list, ops);
1113 unregister_netdevice_many(&list);
1114 rtnl_unlock();
1116 EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);
1118 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
1119 struct ip_tunnel_parm *p, __u32 fwmark)
1121 struct ip_tunnel *nt;
1122 struct net *net = dev_net(dev);
1123 struct ip_tunnel_net *itn;
1124 int mtu;
1125 int err;
1127 nt = netdev_priv(dev);
1128 itn = net_generic(net, nt->ip_tnl_net_id);
1130 if (nt->collect_md) {
1131 if (rtnl_dereference(itn->collect_md_tun))
1132 return -EEXIST;
1133 } else {
1134 if (ip_tunnel_find(itn, p, dev->type))
1135 return -EEXIST;
1138 nt->net = net;
1139 nt->parms = *p;
1140 nt->fwmark = fwmark;
1141 err = register_netdevice(dev);
1142 if (err)
1143 goto err_register_netdevice;
1145 if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1146 eth_hw_addr_random(dev);
1148 mtu = ip_tunnel_bind_dev(dev);
1149 if (tb[IFLA_MTU]) {
1150 unsigned int max = IP_MAX_MTU - dev->hard_header_len - nt->hlen;
1152 mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU,
1153 (unsigned int)(max - sizeof(struct iphdr)));
1156 err = dev_set_mtu(dev, mtu);
1157 if (err)
1158 goto err_dev_set_mtu;
1160 ip_tunnel_add(itn, nt);
1161 return 0;
1163 err_dev_set_mtu:
1164 unregister_netdevice(dev);
1165 err_register_netdevice:
1166 return err;
1168 EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
1170 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
1171 struct ip_tunnel_parm *p, __u32 fwmark)
1173 struct ip_tunnel *t;
1174 struct ip_tunnel *tunnel = netdev_priv(dev);
1175 struct net *net = tunnel->net;
1176 struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
1178 if (dev == itn->fb_tunnel_dev)
1179 return -EINVAL;
1181 t = ip_tunnel_find(itn, p, dev->type);
1183 if (t) {
1184 if (t->dev != dev)
1185 return -EEXIST;
1186 } else {
1187 t = tunnel;
1189 if (dev->type != ARPHRD_ETHER) {
1190 unsigned int nflags = 0;
1192 if (ipv4_is_multicast(p->iph.daddr))
1193 nflags = IFF_BROADCAST;
1194 else if (p->iph.daddr)
1195 nflags = IFF_POINTOPOINT;
1197 if ((dev->flags ^ nflags) &
1198 (IFF_POINTOPOINT | IFF_BROADCAST))
1199 return -EINVAL;
1203 ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
1204 return 0;
1206 EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1208 int ip_tunnel_init(struct net_device *dev)
1210 struct ip_tunnel *tunnel = netdev_priv(dev);
1211 struct iphdr *iph = &tunnel->parms.iph;
1212 int err;
1214 dev->needs_free_netdev = true;
1215 dev->priv_destructor = ip_tunnel_dev_free;
1216 dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1217 if (!dev->tstats)
1218 return -ENOMEM;
1220 err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
1221 if (err) {
1222 free_percpu(dev->tstats);
1223 return err;
1226 err = gro_cells_init(&tunnel->gro_cells, dev);
1227 if (err) {
1228 dst_cache_destroy(&tunnel->dst_cache);
1229 free_percpu(dev->tstats);
1230 return err;
1233 tunnel->dev = dev;
1234 tunnel->net = dev_net(dev);
1235 strcpy(tunnel->parms.name, dev->name);
1236 iph->version = 4;
1237 iph->ihl = 5;
1239 if (tunnel->collect_md) {
1240 dev->features |= NETIF_F_NETNS_LOCAL;
1241 netif_keep_dst(dev);
1243 return 0;
1245 EXPORT_SYMBOL_GPL(ip_tunnel_init);
1247 void ip_tunnel_uninit(struct net_device *dev)
1249 struct ip_tunnel *tunnel = netdev_priv(dev);
1250 struct net *net = tunnel->net;
1251 struct ip_tunnel_net *itn;
1253 itn = net_generic(net, tunnel->ip_tnl_net_id);
1254 /* fb_tunnel_dev will be unregisted in net-exit call. */
1255 if (itn->fb_tunnel_dev != dev)
1256 ip_tunnel_del(itn, netdev_priv(dev));
1258 dst_cache_reset(&tunnel->dst_cache);
1260 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1262 /* Do least required initialization, rest of init is done in tunnel_init call */
1263 void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
1265 struct ip_tunnel *tunnel = netdev_priv(dev);
1266 tunnel->ip_tnl_net_id = net_id;
1268 EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1270 MODULE_LICENSE("GPL");