gro: Allow tunnel stacking in the case of FOU/GUE
net/ipv4/ip_tunnel.c
/*
 * Copyright (c) 2013 Nicira, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
#include <linux/err.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/udp.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif
static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
	return hash_32((__force u32)key ^ (__force u32)remote,
			 IP_TNL_HASH_BITS);
}
static void __tunnel_dst_set(struct ip_tunnel_dst *idst,
			     struct dst_entry *dst, __be32 saddr)
{
	struct dst_entry *old_dst;

	dst_clone(dst);
	old_dst = xchg((__force struct dst_entry **)&idst->dst, dst);
	dst_release(old_dst);
	idst->saddr = saddr;
}

static noinline void tunnel_dst_set(struct ip_tunnel *t,
			   struct dst_entry *dst, __be32 saddr)
{
	__tunnel_dst_set(raw_cpu_ptr(t->dst_cache), dst, saddr);
}

static void tunnel_dst_reset(struct ip_tunnel *t)
{
	tunnel_dst_set(t, NULL, 0);
}

void ip_tunnel_dst_reset_all(struct ip_tunnel *t)
{
	int i;

	for_each_possible_cpu(i)
		__tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL, 0);
}
EXPORT_SYMBOL(ip_tunnel_dst_reset_all);
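
/* Fetch the per-CPU cached route for a connected tunnel. The cache is
 * populated by tunnel_dst_set() and torn down via ip_tunnel_dst_reset_all().
 * atomic_inc_not_zero() guards against racing with a concurrent release of
 * the dst; an entry that has gone obsolete and fails ->check() is released
 * and the cache slot reset so the caller falls back to a fresh route lookup.
 */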
static struct rtable *tunnel_rtable_get(struct ip_tunnel *t,
					u32 cookie, __be32 *saddr)
{
	struct ip_tunnel_dst *idst;
	struct dst_entry *dst;

	rcu_read_lock();
	idst = raw_cpu_ptr(t->dst_cache);
	dst = rcu_dereference(idst->dst);
	if (dst && !atomic_inc_not_zero(&dst->__refcnt))
		dst = NULL;
	if (dst) {
		if (!dst->obsolete || dst->ops->check(dst, cookie)) {
			*saddr = idst->saddr;
		} else {
			tunnel_dst_reset(t);
			dst_release(dst);
			dst = NULL;
		}
	}
	rcu_read_unlock();
	return (struct rtable *)dst;
}
static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
				__be16 flags, __be32 key)
{
	if (p->i_flags & TUNNEL_KEY) {
		if (flags & TUNNEL_KEY)
			return key == p->i_key;
		else
			/* key expected, none present */
			return false;
	} else
		return !(flags & TUNNEL_KEY);
}
/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if no key is present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched against a configured keyless tunnel,
   will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for an input packet.
*/
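/* The lookup below proceeds from most to least specific: (1) exact
 * saddr/daddr match, (2) daddr-only match with a wildcard source,
 * (3) local-address or multicast match hashed under a zero remote, and
 * (4) key-only wildcard tunnels. Within each pass an exact parms.link
 * match wins immediately; otherwise the first hit is remembered as a
 * candidate and returned at the end, falling back to the per-netns
 * fallback device if that is up.
 */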
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	unsigned int hash;
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else
			cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    t->parms.iph.saddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
			continue;

		if (!(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	if (flags & TUNNEL_NO_KEY)
		goto skip_key_lookup;

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (t->parms.i_key != key ||
		    t->parms.iph.saddr != 0 ||
		    t->parms.iph.daddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

skip_key_lookup:
	if (cand)
		return cand;

	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
		return netdev_priv(itn->fb_tunnel_dev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
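
/* Pick the hash bucket for a tunnel's parms. A multicast or unset
 * destination hashes under a zero remote so it lands in the wildcard
 * buckets that ip_tunnel_lookup() scans; VTI tunnels that do not set
 * TUNNEL_KEY hash with i_key treated as zero.
 */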
static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
				    struct ip_tunnel_parm *parms)
{
	unsigned int h;
	__be32 remote;
	__be32 i_key = parms->i_key;

	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
		remote = parms->iph.daddr;
	else
		remote = 0;

	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
		i_key = 0;

	h = ip_tunnel_hash(i_key, remote);
	return &itn->tunnels[h];
}
static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	hlist_add_head_rcu(&t->hash_node, head);
}

static void ip_tunnel_del(struct ip_tunnel *t)
{
	hlist_del_init_rcu(&t->hash_node);
}

static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
					struct ip_tunnel_parm *parms,
					int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	__be16 flags = parms->i_flags;
	int link = parms->link;
	struct ip_tunnel *t = NULL;
	struct hlist_head *head = ip_bucket(itn, parms);

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    link == t->parms.link &&
		    type == t->dev->type &&
		    ip_tunnel_key_match(&t->parms, flags, key))
			break;
	}
	return t;
}
static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else {
		if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
			err = -E2BIG;
			goto failed;
		}
		strlcpy(name, ops->kind, IFNAMSIZ);
		strncat(name, "%d", 2);
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}
static inline void init_tunnel_flow(struct flowi4 *fl4,
				    int proto,
				    __be32 daddr, __be32 saddr,
				    __be32 key, __u8 tos, int oif)
{
	memset(fl4, 0, sizeof(*fl4));
	fl4->flowi4_oif = oif;
	fl4->daddr = daddr;
	fl4->saddr = saddr;
	fl4->flowi4_tos = tos;
	fl4->flowi4_proto = proto;
	fl4->fl4_gre_key = key;
}
static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
				 iph->saddr, tunnel->parms.o_key,
				 RT_TOS(iph->tos), tunnel->parms.link);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}

	dev->needed_headroom = t_hlen + hlen;
	mtu -= (dev->hard_header_len + t_hlen);

	if (mtu < 68)
		mtu = 68;

	return mtu;
}
static struct ip_tunnel *ip_tunnel_create(struct net *net,
					  struct ip_tunnel_net *itn,
					  struct ip_tunnel_parm *parms)
{
	struct ip_tunnel *nt;
	struct net_device *dev;

	BUG_ON(!itn->fb_tunnel_dev);
	dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
	if (IS_ERR(dev))
		return ERR_CAST(dev);

	dev->mtu = ip_tunnel_bind_dev(dev);

	nt = netdev_priv(dev);
	ip_tunnel_add(itn, nt);
	return nt;
}
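
/* Receive path for a decapsulated packet: validate TUNNEL_CSUM and
 * TUNNEL_SEQ against the tunnel's configured i_flags, decapsulate ECN
 * (dropping on an invalid outer/inner combination), update the per-CPU
 * rx stats, scrub the skb when crossing netns boundaries, and hand the
 * packet to GRO via gro_cells_receive().
 */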
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, bool log_ecn_error)
{
	struct pcpu_sw_netstats *tstats;
	const struct iphdr *iph = ip_hdr(skb);
	int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		tunnel->dev->stats.multicast++;
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	if ((!(tpi->flags&TUNNEL_CSUM) && (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
		tunnel->dev->stats.rx_crc_errors++;
		tunnel->dev->stats.rx_errors++;
		goto drop;
	}

	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
		if (!(tpi->flags&TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			tunnel->dev->stats.rx_fifo_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	skb_reset_network_header(skb);

	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		if (err > 1) {
			++tunnel->dev->stats.rx_frame_errors;
			++tunnel->dev->stats.rx_errors;
			goto drop;
		}
	}

	tstats = this_cpu_ptr(tunnel->dev->tstats);
	u64_stats_update_begin(&tstats->syncp);
	tstats->rx_packets++;
	tstats->rx_bytes += skb->len;
	u64_stats_update_end(&tstats->syncp);

	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
static int ip_encap_hlen(struct ip_tunnel_encap *e)
{
	const struct ip_tunnel_encap_ops *ops;
	int hlen = -EINVAL;

	if (e->type == TUNNEL_ENCAP_NONE)
		return 0;

	if (e->type >= MAX_IPTUN_ENCAP_OPS)
		return -EINVAL;

	rcu_read_lock();
	ops = rcu_dereference(iptun_encaps[e->type]);
	if (likely(ops && ops->encap_hlen))
		hlen = ops->encap_hlen(e);
	rcu_read_unlock();

	return hlen;
}

const struct ip_tunnel_encap_ops __rcu *
		iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;

int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	return !cmpxchg((const struct ip_tunnel_encap_ops **)
			&iptun_encaps[num],
			NULL, ops) ? 0 : -1;
}
EXPORT_SYMBOL(ip_tunnel_encap_add_ops);

int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	int ret;

	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
		       &iptun_encaps[num],
		       ops, NULL) == ops) ? 0 : -1;

	synchronize_net();

	return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
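
/* Secondary encapsulations such as FOU and GUE plug in through the ops
 * table above. As an illustrative sketch (the actual callbacks live in
 * the provider, e.g. net/ipv4/fou.c), a provider registers roughly like:
 *
 *	static const struct ip_tunnel_encap_ops fou_iptun_ops = {
 *		.encap_hlen	= fou_encap_hlen,
 *		.build_header	= fou_build_header,
 *	};
 *
 *	ip_tunnel_encap_add_ops(&fou_iptun_ops, TUNNEL_ENCAP_FOU);
 *
 * after which ip_tunnel_encap() below calls ->build_header() on transmit
 * for any tunnel configured with that encap type.
 */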
int ip_tunnel_encap_setup(struct ip_tunnel *t,
			  struct ip_tunnel_encap *ipencap)
{
	int hlen;

	memset(&t->encap, 0, sizeof(t->encap));

	hlen = ip_encap_hlen(ipencap);
	if (hlen < 0)
		return hlen;

	t->encap.type = ipencap->type;
	t->encap.sport = ipencap->sport;
	t->encap.dport = ipencap->dport;
	t->encap.flags = ipencap->flags;

	t->encap_hlen = hlen;
	t->hlen = t->encap_hlen + t->tun_hlen;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);

int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t,
		    u8 *protocol, struct flowi4 *fl4)
{
	const struct ip_tunnel_encap_ops *ops;
	int ret = -EINVAL;

	if (t->encap.type == TUNNEL_ENCAP_NONE)
		return 0;

	if (t->encap.type >= MAX_IPTUN_ENCAP_OPS)
		return -EINVAL;

	rcu_read_lock();
	ops = rcu_dereference(iptun_encaps[t->encap.type]);
	if (likely(ops && ops->build_header))
		ret = ops->build_header(skb, &t->encap, protocol, fl4);
	rcu_read_unlock();

	return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap);
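
/* Recompute the path MTU as seen through the tunnel and propagate it.
 * With DF set, the usable MTU is the route MTU minus link, outer IP and
 * tunnel header overhead. Oversized IPv4 packets carrying DF get
 * ICMP_FRAG_NEEDED and oversized IPv6 packets get ICMPV6_PKT_TOOBIG,
 * both returning -E2BIG so the caller drops the packet.
 */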
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			    struct rtable *rt, __be16 df,
			    const struct iphdr *inner_iph)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
	int mtu;

	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
					- sizeof(struct iphdr) - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_dst(skb))
		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (inner_iph->frag_off & htons(IP_DF)) &&
		    mtu < pkt_size) {
			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
		    mtu < pkt_size) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}
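
/* Transmit path: resolve the outer route (using the per-CPU cached dst
 * for connected tunnels), let any configured secondary encapsulation
 * rewrite the flow and protocol via ip_tunnel_encap(), enforce PMTU,
 * inherit tos/ttl/df from the inner packet where the tunnel is configured
 * to do so, grow needed_headroom if the route demands it, and push the
 * outer IP header via iptunnel_xmit().
 */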
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *inner_iph;
	struct flowi4 fl4;
	u8     tos, ttl;
	__be16 df;
	struct rtable *rt;		/* Route to the other host */
	unsigned int max_headroom;	/* The extra header space needed */
	__be32 dst;
	int err;
	bool connected;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	connected = (tunnel->parms.iph.daddr != 0);

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel */

		if (!skb_dst(skb)) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (!neigh)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		connected = false;
	}

	tos = tnl_params->tos;
	if (tos & 0x1) {
		tos &= ~0x1;
		if (skb->protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
			 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);

	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
		goto tx_error;

	rt = connected ? tunnel_rtable_get(tunnel, 0, &fl4.saddr) : NULL;

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (connected)
			tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
	}

	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	df = tnl_params->frag_off;
	if (skb->protocol == htons(ETH_P_IP))
		df |= (inner_iph->frag_off&htons(IP_DF));

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
	if (max_headroom > dev->needed_headroom)
		dev->needed_headroom = max_headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		dev->stats.tx_dropped++;
		kfree_skb(skb);
		return;
	}

	err = iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol,
			    tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));
	iptunnel_xmit_stats(err, &dev->stats, dev->tstats);

	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu)
{
	ip_tunnel_del(t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		memcpy(dev->dev_addr, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link) {
		int mtu;

		t->parms.link = p->link;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	ip_tunnel_dst_reset_all(t);
	netdev_state_change(dev);
}
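
/* Legacy ioctl interface (SIOCGETTUNNEL/SIOCADDTUNNEL/SIOCCHGTUNNEL/
 * SIOCDELTUNNEL). Add, change and delete require CAP_NET_ADMIN in the
 * tunnel's user namespace. Operations issued on the per-netns fallback
 * device address whichever tunnel matches the passed parms, and the
 * fallback device itself can never be deleted.
 */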
int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	BUG_ON(!itn->fb_tunnel_dev);
	switch (cmd) {
	case SIOCGETTUNNEL:
		if (dev == itn->fb_tunnel_dev) {
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		if (!(p->i_flags & VTI_ISVTI)) {
			if (!(p->i_flags & TUNNEL_KEY))
				p->i_key = 0;
			if (!(p->o_flags & TUNNEL_KEY))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);

		if (cmd == SIOCADDTUNNEL) {
			if (!t) {
				t = ip_tunnel_create(net, itn, p);
				err = PTR_ERR_OR_ZERO(t);
				break;
			}

			err = -EEXIST;
			break;
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
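
/* Clamp an administratively requested MTU: at least 68 (the IPv4
 * minimum), and at most 0xFFF8 (the 64K IP total-length limit rounded
 * down to an 8-byte fragmentation boundary) minus the link header and
 * the tunnel's own encapsulation overhead.
 */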
int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	if (new_mtu < 68 ||
	    new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen)
		return -EINVAL;
	dev->mtu = new_mtu;
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);

static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	free_percpu(tunnel->dst_cache);
	free_percpu(dev->tstats);
	free_netdev(dev);
}
void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_net *itn;

	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

	if (itn->fb_tunnel_dev != dev) {
		ip_tunnel_del(netdev_priv(dev));
		unregister_netdevice_queue(dev, head);
	}
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);

struct net *ip_tunnel_get_link_net(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->net;
}
EXPORT_SYMBOL(ip_tunnel_get_link_net);

int ip_tunnel_get_iflink(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->parms.link;
}
EXPORT_SYMBOL(ip_tunnel_get_iflink);
int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
		       struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	if (!ops) {
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strlcpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* FB netdevice is special: we have one, and only one per netns.
	 * Allowing to move it to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net *net = dev_net(itn->fb_tunnel_dev);
	struct net_device *dev, *aux;
	int h;

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}

void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
{
	LIST_HEAD(list);

	rtnl_lock();
	ip_tunnel_destroy(itn, &list, ops);
	unregister_netdevice_many(&list);
	rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
		      struct ip_tunnel_parm *p)
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ip_tunnel_net *itn;
	int mtu;
	int err;

	nt = netdev_priv(dev);
	itn = net_generic(net, nt->ip_tnl_net_id);

	if (ip_tunnel_find(itn, p, dev->type))
		return -EEXIST;

	nt->net = net;
	nt->parms = *p;
	err = register_netdevice(dev);
	if (err)
		goto out;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ip_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	ip_tunnel_add(itn, nt);

out:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
			 struct ip_tunnel_parm *p)
{
	struct ip_tunnel *t;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	if (dev == itn->fb_tunnel_dev)
		return -EINVAL;

	t = ip_tunnel_find(itn, p, dev->type);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = tunnel;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p->iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p->iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}
	}

	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
int ip_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	int err;

	dev->destructor	= ip_tunnel_dev_free;
	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	if (!dev->tstats)
		return -ENOMEM;

	tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
	if (!tunnel->dst_cache) {
		free_percpu(dev->tstats);
		return -ENOMEM;
	}

	err = gro_cells_init(&tunnel->gro_cells, dev);
	if (err) {
		free_percpu(tunnel->dst_cache);
		free_percpu(dev->tstats);
		return err;
	}

	tunnel->dev = dev;
	tunnel->net = dev_net(dev);
	strcpy(tunnel->parms.name, dev->name);
	iph->version		= 4;
	iph->ihl		= 5;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);
void ip_tunnel_uninit(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn;

	itn = net_generic(net, tunnel->ip_tnl_net_id);
	/* fb_tunnel_dev will be unregistered in net-exit call. */
	if (itn->fb_tunnel_dev != dev)
		ip_tunnel_del(netdev_priv(dev));

	ip_tunnel_dst_reset_all(tunnel);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);

/* Do least required initialization, rest of init is done in tunnel_init call */
void ip_tunnel_setup(struct net_device *dev, int net_id)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);
MODULE_LICENSE("GPL");