Adding support for MOXA ART SoC. Testing port of linux-2.6.32.60-moxart.
[linux-3.6.7-moxart.git] / net / ipv4 / ip_gre.c
blobb062a98574f2e40e63d5b5eee8f1d7f2fda28425
1 /*
2 * Linux NET3: GRE over IP protocol decoder.
4 * Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
13 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
15 #include <linux/capability.h>
16 #include <linux/module.h>
17 #include <linux/types.h>
18 #include <linux/kernel.h>
19 #include <linux/slab.h>
20 #include <asm/uaccess.h>
21 #include <linux/skbuff.h>
22 #include <linux/netdevice.h>
23 #include <linux/in.h>
24 #include <linux/tcp.h>
25 #include <linux/udp.h>
26 #include <linux/if_arp.h>
27 #include <linux/mroute.h>
28 #include <linux/init.h>
29 #include <linux/in6.h>
30 #include <linux/inetdevice.h>
31 #include <linux/igmp.h>
32 #include <linux/netfilter_ipv4.h>
33 #include <linux/etherdevice.h>
34 #include <linux/if_ether.h>
36 #include <net/sock.h>
37 #include <net/ip.h>
38 #include <net/icmp.h>
39 #include <net/protocol.h>
40 #include <net/ipip.h>
41 #include <net/arp.h>
42 #include <net/checksum.h>
43 #include <net/dsfield.h>
44 #include <net/inet_ecn.h>
45 #include <net/xfrm.h>
46 #include <net/net_namespace.h>
47 #include <net/netns/generic.h>
48 #include <net/rtnetlink.h>
49 #include <net/gre.h>
51 #if IS_ENABLED(CONFIG_IPV6)
52 #include <net/ipv6.h>
53 #include <net/ip6_fib.h>
54 #include <net/ip6_route.h>
55 #endif
58 Problems & solutions
59 --------------------
61 1. The most important issue is detecting local dead loops.
62 They would cause complete host lockup in transmit, which
63 would be "resolved" by stack overflow or, if queueing is enabled,
64 with infinite looping in net_bh.
66 We cannot track such dead loops during route installation,
67 it is infeasible task. The most general solutions would be
68 to keep skb->encapsulation counter (sort of local ttl),
69 and silently drop packet when it expires. It is a good
70 solution, but it supposes maintaining new variable in ALL
71 skb, even if no tunneling is used.
73 Current solution: xmit_recursion breaks dead loops. This is a percpu
74 counter, since when we enter the first ndo_xmit(), cpu migration is
75 forbidden. We force an exit if this counter reaches RECURSION_LIMIT
77 2. Networking dead loops would not kill routers, but would really
78 kill network. IP hop limit plays role of "t->recursion" in this case,
79 if we copy it from packet being encapsulated to upper header.
80 It is very good solution, but it introduces two problems:
82 - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
83 do not work over tunnels.
84 - traceroute does not work. I planned to relay ICMP from tunnel,
85 so that this problem would be solved and traceroute output
86 would be even more informative. This idea appeared to be wrong:
87 only Linux complies to rfc1812 now (yes, guys, Linux is the only
88 true router now :-)), all routers (at least, in neighbourhood of mine)
89 return only 8 bytes of payload. It is the end.
91 Hence, if we want that OSPF worked or traceroute said something reasonable,
92 we should search for another solution.
94 One of them is to parse packet trying to detect inner encapsulation
95 made by our node. It is difficult or even impossible, especially,
96 taking into account fragmentation. To be short, ttl is not a solution at all.
98 Current solution: The solution was UNEXPECTEDLY SIMPLE.
99 We force DF flag on tunnels with preconfigured hop limit,
100 that is ALL. :-) Well, it does not remove the problem completely,
101 but exponential growth of network traffic is changed to linear
102 (branches, that exceed pmtu are pruned) and tunnel mtu
103 rapidly degrades to value <68, where looping stops.
104 Yes, it is not good if there exists a router in the loop,
105 which does not force DF, even when encapsulating packets have DF set.
106 But it is not our problem! Nobody could accuse us, we made
107 all that we could make. Even if it is your gated who injected
108 fatal route to network, even if it were you who configured
109 fatal static route: you are innocent. :-)
113 3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
114 practically identical code. It would be good to glue them
115 together, but it is not very evident, how to make them modular.
116 sit is integral part of IPv6, ipip and gre are naturally modular.
117 We could extract common parts (hash table, ioctl etc)
118 to a separate module (ip_tunnel.c).
120 Alexey Kuznetsov.
123 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
124 static int ipgre_tunnel_init(struct net_device *dev);
125 static void ipgre_tunnel_setup(struct net_device *dev);
126 static int ipgre_tunnel_bind_dev(struct net_device *dev);
128 /* Fallback tunnel: no source, no destination, no key, no options */
130 #define HASH_SIZE 16
132 static int ipgre_net_id __read_mostly;
133 struct ipgre_net {
134 struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];
136 struct net_device *fb_tunnel_dev;
139 /* Tunnel hash table */
142 4 hash tables:
144 3: (remote,local)
145 2: (remote,*)
146 1: (*,local)
147 0: (*,*)
149 We require exact key match i.e. if a key is present in packet
150 it will match only tunnel with the same key; if it is not present,
151 it will match only keyless tunnel.
153 All keyless packets, if not matched by a configured keyless tunnel,
154 will match the fallback tunnel.
/*
 * Fold a 32-bit address or key into a 4-bit bucket index (0..15).
 * The argument is parenthesized so that expressions such as HASH(a ^ b)
 * are hashed as a whole instead of being torn apart by operator precedence.
 */
#define HASH(addr) ((((__force u32)(addr)) ^ (((__force u32)(addr)) >> 4)) & 0xF)

/* Names for the four hash tables, indexed by which addresses are set. */
#define tunnels_r_l	tunnels[3]	/* (remote, local) */
#define tunnels_r	tunnels[2]	/* (remote, *)     */
#define tunnels_l	tunnels[1]	/* (*, local)      */
#define tunnels_wc	tunnels[0]	/* (*, *)          */

/*
 * Locking : hash tables are protected by RCU and RTNL
 */

/*
 * Walk one hash chain under RCU.  The iterator is a variable named 't'
 * (struct ip_tunnel *) that must be declared by the caller.
 */
#define for_each_ip_tunnel_rcu(start) \
	for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
170 /* often modified stats are per cpu, other are shared (netdev->stats) */
171 struct pcpu_tstats {
172 u64 rx_packets;
173 u64 rx_bytes;
174 u64 tx_packets;
175 u64 tx_bytes;
176 struct u64_stats_sync syncp;
179 static struct rtnl_link_stats64 *ipgre_get_stats64(struct net_device *dev,
180 struct rtnl_link_stats64 *tot)
182 int i;
184 for_each_possible_cpu(i) {
185 const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
186 u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
187 unsigned int start;
189 do {
190 start = u64_stats_fetch_begin_bh(&tstats->syncp);
191 rx_packets = tstats->rx_packets;
192 tx_packets = tstats->tx_packets;
193 rx_bytes = tstats->rx_bytes;
194 tx_bytes = tstats->tx_bytes;
195 } while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
197 tot->rx_packets += rx_packets;
198 tot->tx_packets += tx_packets;
199 tot->rx_bytes += rx_bytes;
200 tot->tx_bytes += tx_bytes;
203 tot->multicast = dev->stats.multicast;
204 tot->rx_crc_errors = dev->stats.rx_crc_errors;
205 tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
206 tot->rx_length_errors = dev->stats.rx_length_errors;
207 tot->rx_errors = dev->stats.rx_errors;
208 tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
209 tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
210 tot->tx_dropped = dev->stats.tx_dropped;
211 tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
212 tot->tx_errors = dev->stats.tx_errors;
214 return tot;
217 /* Given src, dst and key, find appropriate for input tunnel. */
219 static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
220 __be32 remote, __be32 local,
221 __be32 key, __be16 gre_proto)
223 struct net *net = dev_net(dev);
224 int link = dev->ifindex;
225 unsigned int h0 = HASH(remote);
226 unsigned int h1 = HASH(key);
227 struct ip_tunnel *t, *cand = NULL;
228 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
229 int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
230 ARPHRD_ETHER : ARPHRD_IPGRE;
231 int score, cand_score = 4;
233 for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
234 if (local != t->parms.iph.saddr ||
235 remote != t->parms.iph.daddr ||
236 key != t->parms.i_key ||
237 !(t->dev->flags & IFF_UP))
238 continue;
240 if (t->dev->type != ARPHRD_IPGRE &&
241 t->dev->type != dev_type)
242 continue;
244 score = 0;
245 if (t->parms.link != link)
246 score |= 1;
247 if (t->dev->type != dev_type)
248 score |= 2;
249 if (score == 0)
250 return t;
252 if (score < cand_score) {
253 cand = t;
254 cand_score = score;
258 for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
259 if (remote != t->parms.iph.daddr ||
260 key != t->parms.i_key ||
261 !(t->dev->flags & IFF_UP))
262 continue;
264 if (t->dev->type != ARPHRD_IPGRE &&
265 t->dev->type != dev_type)
266 continue;
268 score = 0;
269 if (t->parms.link != link)
270 score |= 1;
271 if (t->dev->type != dev_type)
272 score |= 2;
273 if (score == 0)
274 return t;
276 if (score < cand_score) {
277 cand = t;
278 cand_score = score;
282 for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
283 if ((local != t->parms.iph.saddr &&
284 (local != t->parms.iph.daddr ||
285 !ipv4_is_multicast(local))) ||
286 key != t->parms.i_key ||
287 !(t->dev->flags & IFF_UP))
288 continue;
290 if (t->dev->type != ARPHRD_IPGRE &&
291 t->dev->type != dev_type)
292 continue;
294 score = 0;
295 if (t->parms.link != link)
296 score |= 1;
297 if (t->dev->type != dev_type)
298 score |= 2;
299 if (score == 0)
300 return t;
302 if (score < cand_score) {
303 cand = t;
304 cand_score = score;
308 for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
309 if (t->parms.i_key != key ||
310 !(t->dev->flags & IFF_UP))
311 continue;
313 if (t->dev->type != ARPHRD_IPGRE &&
314 t->dev->type != dev_type)
315 continue;
317 score = 0;
318 if (t->parms.link != link)
319 score |= 1;
320 if (t->dev->type != dev_type)
321 score |= 2;
322 if (score == 0)
323 return t;
325 if (score < cand_score) {
326 cand = t;
327 cand_score = score;
331 if (cand != NULL)
332 return cand;
334 dev = ign->fb_tunnel_dev;
335 if (dev->flags & IFF_UP)
336 return netdev_priv(dev);
338 return NULL;
341 static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
342 struct ip_tunnel_parm *parms)
344 __be32 remote = parms->iph.daddr;
345 __be32 local = parms->iph.saddr;
346 __be32 key = parms->i_key;
347 unsigned int h = HASH(key);
348 int prio = 0;
350 if (local)
351 prio |= 1;
352 if (remote && !ipv4_is_multicast(remote)) {
353 prio |= 2;
354 h ^= HASH(remote);
357 return &ign->tunnels[prio][h];
360 static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
361 struct ip_tunnel *t)
363 return __ipgre_bucket(ign, &t->parms);
366 static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
368 struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);
370 rcu_assign_pointer(t->next, rtnl_dereference(*tp));
371 rcu_assign_pointer(*tp, t);
374 static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
376 struct ip_tunnel __rcu **tp;
377 struct ip_tunnel *iter;
379 for (tp = ipgre_bucket(ign, t);
380 (iter = rtnl_dereference(*tp)) != NULL;
381 tp = &iter->next) {
382 if (t == iter) {
383 rcu_assign_pointer(*tp, t->next);
384 break;
389 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
390 struct ip_tunnel_parm *parms,
391 int type)
393 __be32 remote = parms->iph.daddr;
394 __be32 local = parms->iph.saddr;
395 __be32 key = parms->i_key;
396 int link = parms->link;
397 struct ip_tunnel *t;
398 struct ip_tunnel __rcu **tp;
399 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
401 for (tp = __ipgre_bucket(ign, parms);
402 (t = rtnl_dereference(*tp)) != NULL;
403 tp = &t->next)
404 if (local == t->parms.iph.saddr &&
405 remote == t->parms.iph.daddr &&
406 key == t->parms.i_key &&
407 link == t->parms.link &&
408 type == t->dev->type)
409 break;
411 return t;
414 static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
415 struct ip_tunnel_parm *parms, int create)
417 struct ip_tunnel *t, *nt;
418 struct net_device *dev;
419 char name[IFNAMSIZ];
420 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
422 t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
423 if (t || !create)
424 return t;
426 if (parms->name[0])
427 strlcpy(name, parms->name, IFNAMSIZ);
428 else
429 strcpy(name, "gre%d");
431 dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
432 if (!dev)
433 return NULL;
435 dev_net_set(dev, net);
437 nt = netdev_priv(dev);
438 nt->parms = *parms;
439 dev->rtnl_link_ops = &ipgre_link_ops;
441 dev->mtu = ipgre_tunnel_bind_dev(dev);
443 if (register_netdevice(dev) < 0)
444 goto failed_free;
446 /* Can use a lockless transmit, unless we generate output sequences */
447 if (!(nt->parms.o_flags & GRE_SEQ))
448 dev->features |= NETIF_F_LLTX;
450 dev_hold(dev);
451 ipgre_tunnel_link(ign, nt);
452 return nt;
454 failed_free:
455 free_netdev(dev);
456 return NULL;
459 static void ipgre_tunnel_uninit(struct net_device *dev)
461 struct net *net = dev_net(dev);
462 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
464 ipgre_tunnel_unlink(ign, netdev_priv(dev));
465 dev_put(dev);
469 static void ipgre_err(struct sk_buff *skb, u32 info)
472 /* All the routers (except for Linux) return only
473 8 bytes of packet payload. It means, that precise relaying of
474 ICMP in the real Internet is absolutely infeasible.
476 Moreover, Cisco "wise men" put GRE key to the third word
477 in GRE header. It makes impossible maintaining even soft state for keyed
478 GRE tunnels with enabled checksum. Tell them "thank you".
480 Well, I wonder, rfc1812 was written by Cisco employee,
481 what the hell these idiots break standards established
482 by themselves???
485 const struct iphdr *iph = (const struct iphdr *)skb->data;
486 __be16 *p = (__be16 *)(skb->data+(iph->ihl<<2));
487 int grehlen = (iph->ihl<<2) + 4;
488 const int type = icmp_hdr(skb)->type;
489 const int code = icmp_hdr(skb)->code;
490 struct ip_tunnel *t;
491 __be16 flags;
493 flags = p[0];
494 if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
495 if (flags&(GRE_VERSION|GRE_ROUTING))
496 return;
497 if (flags&GRE_KEY) {
498 grehlen += 4;
499 if (flags&GRE_CSUM)
500 grehlen += 4;
504 /* If only 8 bytes returned, keyed message will be dropped here */
505 if (skb_headlen(skb) < grehlen)
506 return;
508 switch (type) {
509 default:
510 case ICMP_PARAMETERPROB:
511 return;
513 case ICMP_DEST_UNREACH:
514 switch (code) {
515 case ICMP_SR_FAILED:
516 case ICMP_PORT_UNREACH:
517 /* Impossible event. */
518 return;
519 default:
520 /* All others are translated to HOST_UNREACH.
521 rfc2003 contains "deep thoughts" about NET_UNREACH,
522 I believe they are just ether pollution. --ANK
524 break;
526 break;
527 case ICMP_TIME_EXCEEDED:
528 if (code != ICMP_EXC_TTL)
529 return;
530 break;
532 case ICMP_REDIRECT:
533 break;
536 rcu_read_lock();
537 t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
538 flags & GRE_KEY ?
539 *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
540 p[1]);
541 if (t == NULL)
542 goto out;
544 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
545 ipv4_update_pmtu(skb, dev_net(skb->dev), info,
546 t->parms.link, 0, IPPROTO_GRE, 0);
547 goto out;
549 if (type == ICMP_REDIRECT) {
550 ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,
551 IPPROTO_GRE, 0);
552 goto out;
554 if (t->parms.iph.daddr == 0 ||
555 ipv4_is_multicast(t->parms.iph.daddr))
556 goto out;
558 if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
559 goto out;
561 if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
562 t->err_count++;
563 else
564 t->err_count = 1;
565 t->err_time = jiffies;
566 out:
567 rcu_read_unlock();
570 static inline void ipgre_ecn_decapsulate(const struct iphdr *iph, struct sk_buff *skb)
572 if (INET_ECN_is_ce(iph->tos)) {
573 if (skb->protocol == htons(ETH_P_IP)) {
574 IP_ECN_set_ce(ip_hdr(skb));
575 } else if (skb->protocol == htons(ETH_P_IPV6)) {
576 IP6_ECN_set_ce(ipv6_hdr(skb));
581 static inline u8
582 ipgre_ecn_encapsulate(u8 tos, const struct iphdr *old_iph, struct sk_buff *skb)
584 u8 inner = 0;
585 if (skb->protocol == htons(ETH_P_IP))
586 inner = old_iph->tos;
587 else if (skb->protocol == htons(ETH_P_IPV6))
588 inner = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
589 return INET_ECN_encapsulate(tos, inner);
592 static int ipgre_rcv(struct sk_buff *skb)
594 const struct iphdr *iph;
595 u8 *h;
596 __be16 flags;
597 __sum16 csum = 0;
598 __be32 key = 0;
599 u32 seqno = 0;
600 struct ip_tunnel *tunnel;
601 int offset = 4;
602 __be16 gre_proto;
604 if (!pskb_may_pull(skb, 16))
605 goto drop_nolock;
607 iph = ip_hdr(skb);
608 h = skb->data;
609 flags = *(__be16 *)h;
611 if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
612 /* - Version must be 0.
613 - We do not support routing headers.
615 if (flags&(GRE_VERSION|GRE_ROUTING))
616 goto drop_nolock;
618 if (flags&GRE_CSUM) {
619 switch (skb->ip_summed) {
620 case CHECKSUM_COMPLETE:
621 csum = csum_fold(skb->csum);
622 if (!csum)
623 break;
624 /* fall through */
625 case CHECKSUM_NONE:
626 skb->csum = 0;
627 csum = __skb_checksum_complete(skb);
628 skb->ip_summed = CHECKSUM_COMPLETE;
630 offset += 4;
632 if (flags&GRE_KEY) {
633 key = *(__be32 *)(h + offset);
634 offset += 4;
636 if (flags&GRE_SEQ) {
637 seqno = ntohl(*(__be32 *)(h + offset));
638 offset += 4;
642 gre_proto = *(__be16 *)(h + 2);
644 rcu_read_lock();
645 if ((tunnel = ipgre_tunnel_lookup(skb->dev,
646 iph->saddr, iph->daddr, key,
647 gre_proto))) {
648 struct pcpu_tstats *tstats;
650 secpath_reset(skb);
652 skb->protocol = gre_proto;
653 /* WCCP version 1 and 2 protocol decoding.
654 * - Change protocol to IP
655 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
657 if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
658 skb->protocol = htons(ETH_P_IP);
659 if ((*(h + offset) & 0xF0) != 0x40)
660 offset += 4;
663 skb->mac_header = skb->network_header;
664 __pskb_pull(skb, offset);
665 skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
666 skb->pkt_type = PACKET_HOST;
667 #ifdef CONFIG_NET_IPGRE_BROADCAST
668 if (ipv4_is_multicast(iph->daddr)) {
669 /* Looped back packet, drop it! */
670 if (rt_is_output_route(skb_rtable(skb)))
671 goto drop;
672 tunnel->dev->stats.multicast++;
673 skb->pkt_type = PACKET_BROADCAST;
675 #endif
677 if (((flags&GRE_CSUM) && csum) ||
678 (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
679 tunnel->dev->stats.rx_crc_errors++;
680 tunnel->dev->stats.rx_errors++;
681 goto drop;
683 if (tunnel->parms.i_flags&GRE_SEQ) {
684 if (!(flags&GRE_SEQ) ||
685 (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
686 tunnel->dev->stats.rx_fifo_errors++;
687 tunnel->dev->stats.rx_errors++;
688 goto drop;
690 tunnel->i_seqno = seqno + 1;
693 /* Warning: All skb pointers will be invalidated! */
694 if (tunnel->dev->type == ARPHRD_ETHER) {
695 if (!pskb_may_pull(skb, ETH_HLEN)) {
696 tunnel->dev->stats.rx_length_errors++;
697 tunnel->dev->stats.rx_errors++;
698 goto drop;
701 iph = ip_hdr(skb);
702 skb->protocol = eth_type_trans(skb, tunnel->dev);
703 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
706 tstats = this_cpu_ptr(tunnel->dev->tstats);
707 u64_stats_update_begin(&tstats->syncp);
708 tstats->rx_packets++;
709 tstats->rx_bytes += skb->len;
710 u64_stats_update_end(&tstats->syncp);
712 __skb_tunnel_rx(skb, tunnel->dev);
714 skb_reset_network_header(skb);
715 ipgre_ecn_decapsulate(iph, skb);
717 netif_rx(skb);
719 rcu_read_unlock();
720 return 0;
722 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
724 drop:
725 rcu_read_unlock();
726 drop_nolock:
727 kfree_skb(skb);
728 return 0;
731 static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
733 struct ip_tunnel *tunnel = netdev_priv(dev);
734 struct pcpu_tstats *tstats;
735 const struct iphdr *old_iph = ip_hdr(skb);
736 const struct iphdr *tiph;
737 struct flowi4 fl4;
738 u8 tos;
739 __be16 df;
740 struct rtable *rt; /* Route to the other host */
741 struct net_device *tdev; /* Device to other host */
742 struct iphdr *iph; /* Our new IP header */
743 unsigned int max_headroom; /* The extra header space needed */
744 int gre_hlen;
745 __be32 dst;
746 int mtu;
748 if (dev->type == ARPHRD_ETHER)
749 IPCB(skb)->flags = 0;
751 if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
752 gre_hlen = 0;
753 tiph = (const struct iphdr *)skb->data;
754 } else {
755 gre_hlen = tunnel->hlen;
756 tiph = &tunnel->parms.iph;
759 if ((dst = tiph->daddr) == 0) {
760 /* NBMA tunnel */
762 if (skb_dst(skb) == NULL) {
763 dev->stats.tx_fifo_errors++;
764 goto tx_error;
767 if (skb->protocol == htons(ETH_P_IP)) {
768 rt = skb_rtable(skb);
769 dst = rt_nexthop(rt, old_iph->daddr);
771 #if IS_ENABLED(CONFIG_IPV6)
772 else if (skb->protocol == htons(ETH_P_IPV6)) {
773 const struct in6_addr *addr6;
774 struct neighbour *neigh;
775 bool do_tx_error_icmp;
776 int addr_type;
778 neigh = dst_neigh_lookup(skb_dst(skb), &ipv6_hdr(skb)->daddr);
779 if (neigh == NULL)
780 goto tx_error;
782 addr6 = (const struct in6_addr *)&neigh->primary_key;
783 addr_type = ipv6_addr_type(addr6);
785 if (addr_type == IPV6_ADDR_ANY) {
786 addr6 = &ipv6_hdr(skb)->daddr;
787 addr_type = ipv6_addr_type(addr6);
790 if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
791 do_tx_error_icmp = true;
792 else {
793 do_tx_error_icmp = false;
794 dst = addr6->s6_addr32[3];
796 neigh_release(neigh);
797 if (do_tx_error_icmp)
798 goto tx_error_icmp;
800 #endif
801 else
802 goto tx_error;
805 tos = tiph->tos;
806 if (tos == 1) {
807 tos = 0;
808 if (skb->protocol == htons(ETH_P_IP))
809 tos = old_iph->tos;
810 else if (skb->protocol == htons(ETH_P_IPV6))
811 tos = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
814 rt = ip_route_output_gre(dev_net(dev), &fl4, dst, tiph->saddr,
815 tunnel->parms.o_key, RT_TOS(tos),
816 tunnel->parms.link);
817 if (IS_ERR(rt)) {
818 dev->stats.tx_carrier_errors++;
819 goto tx_error;
821 tdev = rt->dst.dev;
823 if (tdev == dev) {
824 ip_rt_put(rt);
825 dev->stats.collisions++;
826 goto tx_error;
829 df = tiph->frag_off;
830 if (df)
831 mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
832 else
833 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
835 if (skb_dst(skb))
836 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
838 if (skb->protocol == htons(ETH_P_IP)) {
839 df |= (old_iph->frag_off&htons(IP_DF));
841 if ((old_iph->frag_off&htons(IP_DF)) &&
842 mtu < ntohs(old_iph->tot_len)) {
843 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
844 ip_rt_put(rt);
845 goto tx_error;
848 #if IS_ENABLED(CONFIG_IPV6)
849 else if (skb->protocol == htons(ETH_P_IPV6)) {
850 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
852 if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
853 if ((tunnel->parms.iph.daddr &&
854 !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
855 rt6->rt6i_dst.plen == 128) {
856 rt6->rt6i_flags |= RTF_MODIFIED;
857 dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
861 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
862 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
863 ip_rt_put(rt);
864 goto tx_error;
867 #endif
869 if (tunnel->err_count > 0) {
870 if (time_before(jiffies,
871 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
872 tunnel->err_count--;
874 dst_link_failure(skb);
875 } else
876 tunnel->err_count = 0;
879 max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;
881 if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
882 (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
883 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
884 if (max_headroom > dev->needed_headroom)
885 dev->needed_headroom = max_headroom;
886 if (!new_skb) {
887 ip_rt_put(rt);
888 dev->stats.tx_dropped++;
889 dev_kfree_skb(skb);
890 return NETDEV_TX_OK;
892 if (skb->sk)
893 skb_set_owner_w(new_skb, skb->sk);
894 dev_kfree_skb(skb);
895 skb = new_skb;
896 old_iph = ip_hdr(skb);
899 skb_reset_transport_header(skb);
900 skb_push(skb, gre_hlen);
901 skb_reset_network_header(skb);
902 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
903 IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
904 IPSKB_REROUTED);
905 skb_dst_drop(skb);
906 skb_dst_set(skb, &rt->dst);
909 * Push down and install the IPIP header.
912 iph = ip_hdr(skb);
913 iph->version = 4;
914 iph->ihl = sizeof(struct iphdr) >> 2;
915 iph->frag_off = df;
916 iph->protocol = IPPROTO_GRE;
917 iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb);
918 iph->daddr = fl4.daddr;
919 iph->saddr = fl4.saddr;
921 if ((iph->ttl = tiph->ttl) == 0) {
922 if (skb->protocol == htons(ETH_P_IP))
923 iph->ttl = old_iph->ttl;
924 #if IS_ENABLED(CONFIG_IPV6)
925 else if (skb->protocol == htons(ETH_P_IPV6))
926 iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit;
927 #endif
928 else
929 iph->ttl = ip4_dst_hoplimit(&rt->dst);
932 ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
933 ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
934 htons(ETH_P_TEB) : skb->protocol;
936 if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
937 __be32 *ptr = (__be32 *)(((u8 *)iph) + tunnel->hlen - 4);
939 if (tunnel->parms.o_flags&GRE_SEQ) {
940 ++tunnel->o_seqno;
941 *ptr = htonl(tunnel->o_seqno);
942 ptr--;
944 if (tunnel->parms.o_flags&GRE_KEY) {
945 *ptr = tunnel->parms.o_key;
946 ptr--;
948 if (tunnel->parms.o_flags&GRE_CSUM) {
949 *ptr = 0;
950 *(__sum16 *)ptr = ip_compute_csum((void *)(iph+1), skb->len - sizeof(struct iphdr));
954 nf_reset(skb);
955 tstats = this_cpu_ptr(dev->tstats);
956 __IPTUNNEL_XMIT(tstats, &dev->stats);
957 return NETDEV_TX_OK;
959 #if IS_ENABLED(CONFIG_IPV6)
960 tx_error_icmp:
961 dst_link_failure(skb);
962 #endif
963 tx_error:
964 dev->stats.tx_errors++;
965 dev_kfree_skb(skb);
966 return NETDEV_TX_OK;
969 static int ipgre_tunnel_bind_dev(struct net_device *dev)
971 struct net_device *tdev = NULL;
972 struct ip_tunnel *tunnel;
973 const struct iphdr *iph;
974 int hlen = LL_MAX_HEADER;
975 int mtu = ETH_DATA_LEN;
976 int addend = sizeof(struct iphdr) + 4;
978 tunnel = netdev_priv(dev);
979 iph = &tunnel->parms.iph;
981 /* Guess output device to choose reasonable mtu and needed_headroom */
983 if (iph->daddr) {
984 struct flowi4 fl4;
985 struct rtable *rt;
987 rt = ip_route_output_gre(dev_net(dev), &fl4,
988 iph->daddr, iph->saddr,
989 tunnel->parms.o_key,
990 RT_TOS(iph->tos),
991 tunnel->parms.link);
992 if (!IS_ERR(rt)) {
993 tdev = rt->dst.dev;
994 ip_rt_put(rt);
997 if (dev->type != ARPHRD_ETHER)
998 dev->flags |= IFF_POINTOPOINT;
1001 if (!tdev && tunnel->parms.link)
1002 tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
1004 if (tdev) {
1005 hlen = tdev->hard_header_len + tdev->needed_headroom;
1006 mtu = tdev->mtu;
1008 dev->iflink = tunnel->parms.link;
1010 /* Precalculate GRE options length */
1011 if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
1012 if (tunnel->parms.o_flags&GRE_CSUM)
1013 addend += 4;
1014 if (tunnel->parms.o_flags&GRE_KEY)
1015 addend += 4;
1016 if (tunnel->parms.o_flags&GRE_SEQ)
1017 addend += 4;
1019 dev->needed_headroom = addend + hlen;
1020 mtu -= dev->hard_header_len + addend;
1022 if (mtu < 68)
1023 mtu = 68;
1025 tunnel->hlen = addend;
1027 return mtu;
1030 static int
1031 ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
1033 int err = 0;
1034 struct ip_tunnel_parm p;
1035 struct ip_tunnel *t;
1036 struct net *net = dev_net(dev);
1037 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1039 switch (cmd) {
1040 case SIOCGETTUNNEL:
1041 t = NULL;
1042 if (dev == ign->fb_tunnel_dev) {
1043 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
1044 err = -EFAULT;
1045 break;
1047 t = ipgre_tunnel_locate(net, &p, 0);
1049 if (t == NULL)
1050 t = netdev_priv(dev);
1051 memcpy(&p, &t->parms, sizeof(p));
1052 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
1053 err = -EFAULT;
1054 break;
1056 case SIOCADDTUNNEL:
1057 case SIOCCHGTUNNEL:
1058 err = -EPERM;
1059 if (!capable(CAP_NET_ADMIN))
1060 goto done;
1062 err = -EFAULT;
1063 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1064 goto done;
1066 err = -EINVAL;
1067 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
1068 p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
1069 ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
1070 goto done;
1071 if (p.iph.ttl)
1072 p.iph.frag_off |= htons(IP_DF);
1074 if (!(p.i_flags&GRE_KEY))
1075 p.i_key = 0;
1076 if (!(p.o_flags&GRE_KEY))
1077 p.o_key = 0;
1079 t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
1081 if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
1082 if (t != NULL) {
1083 if (t->dev != dev) {
1084 err = -EEXIST;
1085 break;
1087 } else {
1088 unsigned int nflags = 0;
1090 t = netdev_priv(dev);
1092 if (ipv4_is_multicast(p.iph.daddr))
1093 nflags = IFF_BROADCAST;
1094 else if (p.iph.daddr)
1095 nflags = IFF_POINTOPOINT;
1097 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
1098 err = -EINVAL;
1099 break;
1101 ipgre_tunnel_unlink(ign, t);
1102 synchronize_net();
1103 t->parms.iph.saddr = p.iph.saddr;
1104 t->parms.iph.daddr = p.iph.daddr;
1105 t->parms.i_key = p.i_key;
1106 t->parms.o_key = p.o_key;
1107 memcpy(dev->dev_addr, &p.iph.saddr, 4);
1108 memcpy(dev->broadcast, &p.iph.daddr, 4);
1109 ipgre_tunnel_link(ign, t);
1110 netdev_state_change(dev);
1114 if (t) {
1115 err = 0;
1116 if (cmd == SIOCCHGTUNNEL) {
1117 t->parms.iph.ttl = p.iph.ttl;
1118 t->parms.iph.tos = p.iph.tos;
1119 t->parms.iph.frag_off = p.iph.frag_off;
1120 if (t->parms.link != p.link) {
1121 t->parms.link = p.link;
1122 dev->mtu = ipgre_tunnel_bind_dev(dev);
1123 netdev_state_change(dev);
1126 if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
1127 err = -EFAULT;
1128 } else
1129 err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
1130 break;
1132 case SIOCDELTUNNEL:
1133 err = -EPERM;
1134 if (!capable(CAP_NET_ADMIN))
1135 goto done;
1137 if (dev == ign->fb_tunnel_dev) {
1138 err = -EFAULT;
1139 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1140 goto done;
1141 err = -ENOENT;
1142 if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
1143 goto done;
1144 err = -EPERM;
1145 if (t == netdev_priv(ign->fb_tunnel_dev))
1146 goto done;
1147 dev = t->dev;
1149 unregister_netdevice(dev);
1150 err = 0;
1151 break;
1153 default:
1154 err = -EINVAL;
1157 done:
1158 return err;
1161 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1163 struct ip_tunnel *tunnel = netdev_priv(dev);
1164 if (new_mtu < 68 ||
1165 new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1166 return -EINVAL;
1167 dev->mtu = new_mtu;
1168 return 0;
1171 /* Nice toy. Unfortunately, useless in real life :-)
1172 It allows to construct virtual multiprotocol broadcast "LAN"
1173 over the Internet, provided multicast routing is tuned.
1176 I have no idea was this bicycle invented before me,
1177 so that I had to set ARPHRD_IPGRE to a random value.
1178 I have an impression, that Cisco could make something similar,
1179 but this feature is apparently missing in IOS<=11.2(8).
1181 I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1182 with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1184 ping -t 255 224.66.66.66
1186 If nobody answers, mbone does not work.
1188 ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1189 ip addr add 10.66.66.<somewhat>/24 dev Universe
1190 ifconfig Universe up
1191 ifconfig Universe add fe80::<Your_real_addr>/10
1192 ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1193 ftp 10.66.66.66
1195 ftp fec0:6666:6666::193.233.7.65
1200 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1201 unsigned short type,
1202 const void *daddr, const void *saddr, unsigned int len)
1204 struct ip_tunnel *t = netdev_priv(dev);
1205 struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1206 __be16 *p = (__be16 *)(iph+1);
1208 memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1209 p[0] = t->parms.o_flags;
1210 p[1] = htons(type);
1213 * Set the source hardware address.
1216 if (saddr)
1217 memcpy(&iph->saddr, saddr, 4);
1218 if (daddr)
1219 memcpy(&iph->daddr, daddr, 4);
1220 if (iph->daddr)
1221 return t->hlen;
1223 return -t->hlen;
1226 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1228 const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
1229 memcpy(haddr, &iph->saddr, 4);
1230 return 4;
1233 static const struct header_ops ipgre_header_ops = {
1234 .create = ipgre_header,
1235 .parse = ipgre_header_parse,
1238 #ifdef CONFIG_NET_IPGRE_BROADCAST
1239 static int ipgre_open(struct net_device *dev)
1241 struct ip_tunnel *t = netdev_priv(dev);
1243 if (ipv4_is_multicast(t->parms.iph.daddr)) {
1244 struct flowi4 fl4;
1245 struct rtable *rt;
1247 rt = ip_route_output_gre(dev_net(dev), &fl4,
1248 t->parms.iph.daddr,
1249 t->parms.iph.saddr,
1250 t->parms.o_key,
1251 RT_TOS(t->parms.iph.tos),
1252 t->parms.link);
1253 if (IS_ERR(rt))
1254 return -EADDRNOTAVAIL;
1255 dev = rt->dst.dev;
1256 ip_rt_put(rt);
1257 if (__in_dev_get_rtnl(dev) == NULL)
1258 return -EADDRNOTAVAIL;
1259 t->mlink = dev->ifindex;
1260 ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
1262 return 0;
1265 static int ipgre_close(struct net_device *dev)
1267 struct ip_tunnel *t = netdev_priv(dev);
1269 if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1270 struct in_device *in_dev;
1271 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1272 if (in_dev)
1273 ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1275 return 0;
1278 #endif
1280 static const struct net_device_ops ipgre_netdev_ops = {
1281 .ndo_init = ipgre_tunnel_init,
1282 .ndo_uninit = ipgre_tunnel_uninit,
1283 #ifdef CONFIG_NET_IPGRE_BROADCAST
1284 .ndo_open = ipgre_open,
1285 .ndo_stop = ipgre_close,
1286 #endif
1287 .ndo_start_xmit = ipgre_tunnel_xmit,
1288 .ndo_do_ioctl = ipgre_tunnel_ioctl,
1289 .ndo_change_mtu = ipgre_tunnel_change_mtu,
1290 .ndo_get_stats64 = ipgre_get_stats64,
1293 static void ipgre_dev_free(struct net_device *dev)
1295 free_percpu(dev->tstats);
1296 free_netdev(dev);
1299 static void ipgre_tunnel_setup(struct net_device *dev)
1301 dev->netdev_ops = &ipgre_netdev_ops;
1302 dev->destructor = ipgre_dev_free;
1304 dev->type = ARPHRD_IPGRE;
1305 dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1306 dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
1307 dev->flags = IFF_NOARP;
1308 dev->iflink = 0;
1309 dev->addr_len = 4;
1310 dev->features |= NETIF_F_NETNS_LOCAL;
1311 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
1314 static int ipgre_tunnel_init(struct net_device *dev)
1316 struct ip_tunnel *tunnel;
1317 struct iphdr *iph;
1319 tunnel = netdev_priv(dev);
1320 iph = &tunnel->parms.iph;
1322 tunnel->dev = dev;
1323 strcpy(tunnel->parms.name, dev->name);
1325 memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
1326 memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1328 if (iph->daddr) {
1329 #ifdef CONFIG_NET_IPGRE_BROADCAST
1330 if (ipv4_is_multicast(iph->daddr)) {
1331 if (!iph->saddr)
1332 return -EINVAL;
1333 dev->flags = IFF_BROADCAST;
1334 dev->header_ops = &ipgre_header_ops;
1336 #endif
1337 } else
1338 dev->header_ops = &ipgre_header_ops;
1340 dev->tstats = alloc_percpu(struct pcpu_tstats);
1341 if (!dev->tstats)
1342 return -ENOMEM;
1344 return 0;
1347 static void ipgre_fb_tunnel_init(struct net_device *dev)
1349 struct ip_tunnel *tunnel = netdev_priv(dev);
1350 struct iphdr *iph = &tunnel->parms.iph;
1352 tunnel->dev = dev;
1353 strcpy(tunnel->parms.name, dev->name);
1355 iph->version = 4;
1356 iph->protocol = IPPROTO_GRE;
1357 iph->ihl = 5;
1358 tunnel->hlen = sizeof(struct iphdr) + 4;
1360 dev_hold(dev);
1364 static const struct gre_protocol ipgre_protocol = {
1365 .handler = ipgre_rcv,
1366 .err_handler = ipgre_err,
1369 static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
1371 int prio;
1373 for (prio = 0; prio < 4; prio++) {
1374 int h;
1375 for (h = 0; h < HASH_SIZE; h++) {
1376 struct ip_tunnel *t;
1378 t = rtnl_dereference(ign->tunnels[prio][h]);
1380 while (t != NULL) {
1381 unregister_netdevice_queue(t->dev, head);
1382 t = rtnl_dereference(t->next);
1388 static int __net_init ipgre_init_net(struct net *net)
1390 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1391 int err;
1393 ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1394 ipgre_tunnel_setup);
1395 if (!ign->fb_tunnel_dev) {
1396 err = -ENOMEM;
1397 goto err_alloc_dev;
1399 dev_net_set(ign->fb_tunnel_dev, net);
1401 ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
1402 ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
1404 if ((err = register_netdev(ign->fb_tunnel_dev)))
1405 goto err_reg_dev;
1407 rcu_assign_pointer(ign->tunnels_wc[0],
1408 netdev_priv(ign->fb_tunnel_dev));
1409 return 0;
1411 err_reg_dev:
1412 ipgre_dev_free(ign->fb_tunnel_dev);
1413 err_alloc_dev:
1414 return err;
1417 static void __net_exit ipgre_exit_net(struct net *net)
1419 struct ipgre_net *ign;
1420 LIST_HEAD(list);
1422 ign = net_generic(net, ipgre_net_id);
1423 rtnl_lock();
1424 ipgre_destroy_tunnels(ign, &list);
1425 unregister_netdevice_many(&list);
1426 rtnl_unlock();
1429 static struct pernet_operations ipgre_net_ops = {
1430 .init = ipgre_init_net,
1431 .exit = ipgre_exit_net,
1432 .id = &ipgre_net_id,
1433 .size = sizeof(struct ipgre_net),
1436 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1438 __be16 flags;
1440 if (!data)
1441 return 0;
1443 flags = 0;
1444 if (data[IFLA_GRE_IFLAGS])
1445 flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1446 if (data[IFLA_GRE_OFLAGS])
1447 flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1448 if (flags & (GRE_VERSION|GRE_ROUTING))
1449 return -EINVAL;
1451 return 0;
1454 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1456 __be32 daddr;
1458 if (tb[IFLA_ADDRESS]) {
1459 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1460 return -EINVAL;
1461 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1462 return -EADDRNOTAVAIL;
1465 if (!data)
1466 goto out;
1468 if (data[IFLA_GRE_REMOTE]) {
1469 memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1470 if (!daddr)
1471 return -EINVAL;
1474 out:
1475 return ipgre_tunnel_validate(tb, data);
1478 static void ipgre_netlink_parms(struct nlattr *data[],
1479 struct ip_tunnel_parm *parms)
1481 memset(parms, 0, sizeof(*parms));
1483 parms->iph.protocol = IPPROTO_GRE;
1485 if (!data)
1486 return;
1488 if (data[IFLA_GRE_LINK])
1489 parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
1491 if (data[IFLA_GRE_IFLAGS])
1492 parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
1494 if (data[IFLA_GRE_OFLAGS])
1495 parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
1497 if (data[IFLA_GRE_IKEY])
1498 parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
1500 if (data[IFLA_GRE_OKEY])
1501 parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
1503 if (data[IFLA_GRE_LOCAL])
1504 parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);
1506 if (data[IFLA_GRE_REMOTE])
1507 parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);
1509 if (data[IFLA_GRE_TTL])
1510 parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
1512 if (data[IFLA_GRE_TOS])
1513 parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
1515 if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
1516 parms->iph.frag_off = htons(IP_DF);
1519 static int ipgre_tap_init(struct net_device *dev)
1521 struct ip_tunnel *tunnel;
1523 tunnel = netdev_priv(dev);
1525 tunnel->dev = dev;
1526 strcpy(tunnel->parms.name, dev->name);
1528 ipgre_tunnel_bind_dev(dev);
1530 dev->tstats = alloc_percpu(struct pcpu_tstats);
1531 if (!dev->tstats)
1532 return -ENOMEM;
1534 return 0;
1537 static const struct net_device_ops ipgre_tap_netdev_ops = {
1538 .ndo_init = ipgre_tap_init,
1539 .ndo_uninit = ipgre_tunnel_uninit,
1540 .ndo_start_xmit = ipgre_tunnel_xmit,
1541 .ndo_set_mac_address = eth_mac_addr,
1542 .ndo_validate_addr = eth_validate_addr,
1543 .ndo_change_mtu = ipgre_tunnel_change_mtu,
1544 .ndo_get_stats64 = ipgre_get_stats64,
1547 static void ipgre_tap_setup(struct net_device *dev)
1550 ether_setup(dev);
1552 dev->netdev_ops = &ipgre_tap_netdev_ops;
1553 dev->destructor = ipgre_dev_free;
1555 dev->iflink = 0;
1556 dev->features |= NETIF_F_NETNS_LOCAL;
1559 static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
1560 struct nlattr *data[])
1562 struct ip_tunnel *nt;
1563 struct net *net = dev_net(dev);
1564 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1565 int mtu;
1566 int err;
1568 nt = netdev_priv(dev);
1569 ipgre_netlink_parms(data, &nt->parms);
1571 if (ipgre_tunnel_find(net, &nt->parms, dev->type))
1572 return -EEXIST;
1574 if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1575 eth_hw_addr_random(dev);
1577 mtu = ipgre_tunnel_bind_dev(dev);
1578 if (!tb[IFLA_MTU])
1579 dev->mtu = mtu;
1581 /* Can use a lockless transmit, unless we generate output sequences */
1582 if (!(nt->parms.o_flags & GRE_SEQ))
1583 dev->features |= NETIF_F_LLTX;
1585 err = register_netdevice(dev);
1586 if (err)
1587 goto out;
1589 dev_hold(dev);
1590 ipgre_tunnel_link(ign, nt);
1592 out:
1593 return err;
1596 static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1597 struct nlattr *data[])
1599 struct ip_tunnel *t, *nt;
1600 struct net *net = dev_net(dev);
1601 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1602 struct ip_tunnel_parm p;
1603 int mtu;
1605 if (dev == ign->fb_tunnel_dev)
1606 return -EINVAL;
1608 nt = netdev_priv(dev);
1609 ipgre_netlink_parms(data, &p);
1611 t = ipgre_tunnel_locate(net, &p, 0);
1613 if (t) {
1614 if (t->dev != dev)
1615 return -EEXIST;
1616 } else {
1617 t = nt;
1619 if (dev->type != ARPHRD_ETHER) {
1620 unsigned int nflags = 0;
1622 if (ipv4_is_multicast(p.iph.daddr))
1623 nflags = IFF_BROADCAST;
1624 else if (p.iph.daddr)
1625 nflags = IFF_POINTOPOINT;
1627 if ((dev->flags ^ nflags) &
1628 (IFF_POINTOPOINT | IFF_BROADCAST))
1629 return -EINVAL;
1632 ipgre_tunnel_unlink(ign, t);
1633 t->parms.iph.saddr = p.iph.saddr;
1634 t->parms.iph.daddr = p.iph.daddr;
1635 t->parms.i_key = p.i_key;
1636 if (dev->type != ARPHRD_ETHER) {
1637 memcpy(dev->dev_addr, &p.iph.saddr, 4);
1638 memcpy(dev->broadcast, &p.iph.daddr, 4);
1640 ipgre_tunnel_link(ign, t);
1641 netdev_state_change(dev);
1644 t->parms.o_key = p.o_key;
1645 t->parms.iph.ttl = p.iph.ttl;
1646 t->parms.iph.tos = p.iph.tos;
1647 t->parms.iph.frag_off = p.iph.frag_off;
1649 if (t->parms.link != p.link) {
1650 t->parms.link = p.link;
1651 mtu = ipgre_tunnel_bind_dev(dev);
1652 if (!tb[IFLA_MTU])
1653 dev->mtu = mtu;
1654 netdev_state_change(dev);
1657 return 0;
1660 static size_t ipgre_get_size(const struct net_device *dev)
1662 return
1663 /* IFLA_GRE_LINK */
1664 nla_total_size(4) +
1665 /* IFLA_GRE_IFLAGS */
1666 nla_total_size(2) +
1667 /* IFLA_GRE_OFLAGS */
1668 nla_total_size(2) +
1669 /* IFLA_GRE_IKEY */
1670 nla_total_size(4) +
1671 /* IFLA_GRE_OKEY */
1672 nla_total_size(4) +
1673 /* IFLA_GRE_LOCAL */
1674 nla_total_size(4) +
1675 /* IFLA_GRE_REMOTE */
1676 nla_total_size(4) +
1677 /* IFLA_GRE_TTL */
1678 nla_total_size(1) +
1679 /* IFLA_GRE_TOS */
1680 nla_total_size(1) +
1681 /* IFLA_GRE_PMTUDISC */
1682 nla_total_size(1) +
1686 static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1688 struct ip_tunnel *t = netdev_priv(dev);
1689 struct ip_tunnel_parm *p = &t->parms;
1691 if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
1692 nla_put_be16(skb, IFLA_GRE_IFLAGS, p->i_flags) ||
1693 nla_put_be16(skb, IFLA_GRE_OFLAGS, p->o_flags) ||
1694 nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
1695 nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
1696 nla_put_be32(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
1697 nla_put_be32(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
1698 nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
1699 nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
1700 nla_put_u8(skb, IFLA_GRE_PMTUDISC,
1701 !!(p->iph.frag_off & htons(IP_DF))))
1702 goto nla_put_failure;
1703 return 0;
1705 nla_put_failure:
1706 return -EMSGSIZE;
1709 static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
1710 [IFLA_GRE_LINK] = { .type = NLA_U32 },
1711 [IFLA_GRE_IFLAGS] = { .type = NLA_U16 },
1712 [IFLA_GRE_OFLAGS] = { .type = NLA_U16 },
1713 [IFLA_GRE_IKEY] = { .type = NLA_U32 },
1714 [IFLA_GRE_OKEY] = { .type = NLA_U32 },
1715 [IFLA_GRE_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
1716 [IFLA_GRE_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
1717 [IFLA_GRE_TTL] = { .type = NLA_U8 },
1718 [IFLA_GRE_TOS] = { .type = NLA_U8 },
1719 [IFLA_GRE_PMTUDISC] = { .type = NLA_U8 },
1722 static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
1723 .kind = "gre",
1724 .maxtype = IFLA_GRE_MAX,
1725 .policy = ipgre_policy,
1726 .priv_size = sizeof(struct ip_tunnel),
1727 .setup = ipgre_tunnel_setup,
1728 .validate = ipgre_tunnel_validate,
1729 .newlink = ipgre_newlink,
1730 .changelink = ipgre_changelink,
1731 .get_size = ipgre_get_size,
1732 .fill_info = ipgre_fill_info,
1735 static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
1736 .kind = "gretap",
1737 .maxtype = IFLA_GRE_MAX,
1738 .policy = ipgre_policy,
1739 .priv_size = sizeof(struct ip_tunnel),
1740 .setup = ipgre_tap_setup,
1741 .validate = ipgre_tap_validate,
1742 .newlink = ipgre_newlink,
1743 .changelink = ipgre_changelink,
1744 .get_size = ipgre_get_size,
1745 .fill_info = ipgre_fill_info,
1749 * And now the modules code and kernel interface.
1752 static int __init ipgre_init(void)
1754 int err;
1756 pr_info("GRE over IPv4 tunneling driver\n");
1758 err = register_pernet_device(&ipgre_net_ops);
1759 if (err < 0)
1760 return err;
1762 err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
1763 if (err < 0) {
1764 pr_info("%s: can't add protocol\n", __func__);
1765 goto add_proto_failed;
1768 err = rtnl_link_register(&ipgre_link_ops);
1769 if (err < 0)
1770 goto rtnl_link_failed;
1772 err = rtnl_link_register(&ipgre_tap_ops);
1773 if (err < 0)
1774 goto tap_ops_failed;
1776 out:
1777 return err;
1779 tap_ops_failed:
1780 rtnl_link_unregister(&ipgre_link_ops);
1781 rtnl_link_failed:
1782 gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
1783 add_proto_failed:
1784 unregister_pernet_device(&ipgre_net_ops);
1785 goto out;
1788 static void __exit ipgre_fini(void)
1790 rtnl_link_unregister(&ipgre_tap_ops);
1791 rtnl_link_unregister(&ipgre_link_ops);
1792 if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
1793 pr_info("%s: can't remove protocol\n", __func__);
1794 unregister_pernet_device(&ipgre_net_ops);
1797 module_init(ipgre_init);
1798 module_exit(ipgre_fini);
1799 MODULE_LICENSE("GPL");
1800 MODULE_ALIAS_RTNL_LINK("gre");
1801 MODULE_ALIAS_RTNL_LINK("gretap");
1802 MODULE_ALIAS_NETDEV("gre0");