mm-only debug patch...
[mmotm.git] / net / ipv4 / ip_gre.c
blob89ff9d5b1500cf90c7b8bfbe801edec8ac303549
1 /*
2 * Linux NET3: GRE over IP protocol decoder.
4 * Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
13 #include <linux/capability.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <asm/uaccess.h>
18 #include <linux/skbuff.h>
19 #include <linux/netdevice.h>
20 #include <linux/in.h>
21 #include <linux/tcp.h>
22 #include <linux/udp.h>
23 #include <linux/if_arp.h>
24 #include <linux/mroute.h>
25 #include <linux/init.h>
26 #include <linux/in6.h>
27 #include <linux/inetdevice.h>
28 #include <linux/igmp.h>
29 #include <linux/netfilter_ipv4.h>
30 #include <linux/etherdevice.h>
31 #include <linux/if_ether.h>
33 #include <net/sock.h>
34 #include <net/ip.h>
35 #include <net/icmp.h>
36 #include <net/protocol.h>
37 #include <net/ipip.h>
38 #include <net/arp.h>
39 #include <net/checksum.h>
40 #include <net/dsfield.h>
41 #include <net/inet_ecn.h>
42 #include <net/xfrm.h>
43 #include <net/net_namespace.h>
44 #include <net/netns/generic.h>
45 #include <net/rtnetlink.h>
47 #ifdef CONFIG_IPV6
48 #include <net/ipv6.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
51 #endif
54 Problems & solutions
55 --------------------
57 1. The most important issue is detecting local dead loops.
58 They would cause complete host lockup in transmit, which
59 would be "resolved" by stack overflow or, if queueing is enabled,
60 with infinite looping in net_bh.
62 We cannot track such dead loops during route installation,
63 it is infeasible task. The most general solutions would be
64 to keep skb->encapsulation counter (sort of local ttl),
65 and silently drop packet when it expires. It is the best
66 solution, but it supposes maintaining a new variable in ALL
67 skb, even if no tunneling is used.
69 Current solution: HARD_TX_LOCK lock breaks dead loops.
73 2. Networking dead loops would not kill routers, but would really
74 kill network. IP hop limit plays role of "t->recursion" in this case,
75 if we copy it from packet being encapsulated to upper header.
76 It is very good solution, but it introduces two problems:
78 - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
79 do not work over tunnels.
80 - traceroute does not work. I planned to relay ICMP from tunnel,
81 so that this problem would be solved and traceroute output
82 would even more informative. This idea appeared to be wrong:
83 only Linux complies to rfc1812 now (yes, guys, Linux is the only
84 true router now :-)), all routers (at least, in neighbourhood of mine)
85 return only 8 bytes of payload. It is the end.
87 Hence, if we want that OSPF worked or traceroute said something reasonable,
88 we should search for another solution.
90 One of them is to parse packet trying to detect inner encapsulation
91 made by our node. It is difficult or even impossible, especially,
92 taking into account fragmentation. To be short, it is not a solution at all.
94 Current solution: The solution was UNEXPECTEDLY SIMPLE.
95 We force DF flag on tunnels with preconfigured hop limit,
96 that is ALL. :-) Well, it does not remove the problem completely,
97 but exponential growth of network traffic is changed to linear
98 (branches, that exceed pmtu are pruned) and tunnel mtu
99 fastly degrades to value <68, where looping stops.
100 Yes, it is not good if there exists a router in the loop,
101 which does not force DF, even when encapsulating packets have DF set.
102 But it is not our problem! Nobody could accuse us, we made
103 all that we could make. Even if it is your gated who injected
104 fatal route to network, even if it were you who configured
105 fatal static route: you are innocent. :-)
109 3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
110 practically identical code. It would be good to glue them
111 together, but it is not very evident, how to make them modular.
112 sit is integral part of IPv6, ipip and gre are naturally modular.
113 We could extract common parts (hash table, ioctl etc)
114 to a separate module (ip_tunnel.c).
116 Alexey Kuznetsov.
119 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
120 static int ipgre_tunnel_init(struct net_device *dev);
121 static void ipgre_tunnel_setup(struct net_device *dev);
122 static int ipgre_tunnel_bind_dev(struct net_device *dev);
124 /* Fallback tunnel: no source, no destination, no key, no options */
126 #define HASH_SIZE 16
128 static int ipgre_net_id;
129 struct ipgre_net {
130 struct ip_tunnel *tunnels[4][HASH_SIZE];
132 struct net_device *fb_tunnel_dev;
135 /* Tunnel hash table */
138 4 hash tables:
140 3: (remote,local)
141 2: (remote,*)
142 1: (*,local)
143 0: (*,*)
145 We require exact key match i.e. if a key is present in packet
146 it will match only tunnel with the same key; if it is not present,
147 it will match only keyless tunnel.
149 All keysless packets, if not matched configured keyless tunnels
150 will match fallback tunnel.
153 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
155 #define tunnels_r_l tunnels[3]
156 #define tunnels_r tunnels[2]
157 #define tunnels_l tunnels[1]
158 #define tunnels_wc tunnels[0]
160 static DEFINE_RWLOCK(ipgre_lock);
162 /* Given src, dst and key, find appropriate for input tunnel. */
164 static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
165 __be32 remote, __be32 local,
166 __be32 key, __be16 gre_proto)
168 struct net *net = dev_net(dev);
169 int link = dev->ifindex;
170 unsigned h0 = HASH(remote);
171 unsigned h1 = HASH(key);
172 struct ip_tunnel *t, *cand = NULL;
173 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
174 int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
175 ARPHRD_ETHER : ARPHRD_IPGRE;
176 int score, cand_score = 4;
178 for (t = ign->tunnels_r_l[h0^h1]; t; t = t->next) {
179 if (local != t->parms.iph.saddr ||
180 remote != t->parms.iph.daddr ||
181 key != t->parms.i_key ||
182 !(t->dev->flags & IFF_UP))
183 continue;
185 if (t->dev->type != ARPHRD_IPGRE &&
186 t->dev->type != dev_type)
187 continue;
189 score = 0;
190 if (t->parms.link != link)
191 score |= 1;
192 if (t->dev->type != dev_type)
193 score |= 2;
194 if (score == 0)
195 return t;
197 if (score < cand_score) {
198 cand = t;
199 cand_score = score;
203 for (t = ign->tunnels_r[h0^h1]; t; t = t->next) {
204 if (remote != t->parms.iph.daddr ||
205 key != t->parms.i_key ||
206 !(t->dev->flags & IFF_UP))
207 continue;
209 if (t->dev->type != ARPHRD_IPGRE &&
210 t->dev->type != dev_type)
211 continue;
213 score = 0;
214 if (t->parms.link != link)
215 score |= 1;
216 if (t->dev->type != dev_type)
217 score |= 2;
218 if (score == 0)
219 return t;
221 if (score < cand_score) {
222 cand = t;
223 cand_score = score;
227 for (t = ign->tunnels_l[h1]; t; t = t->next) {
228 if ((local != t->parms.iph.saddr &&
229 (local != t->parms.iph.daddr ||
230 !ipv4_is_multicast(local))) ||
231 key != t->parms.i_key ||
232 !(t->dev->flags & IFF_UP))
233 continue;
235 if (t->dev->type != ARPHRD_IPGRE &&
236 t->dev->type != dev_type)
237 continue;
239 score = 0;
240 if (t->parms.link != link)
241 score |= 1;
242 if (t->dev->type != dev_type)
243 score |= 2;
244 if (score == 0)
245 return t;
247 if (score < cand_score) {
248 cand = t;
249 cand_score = score;
253 for (t = ign->tunnels_wc[h1]; t; t = t->next) {
254 if (t->parms.i_key != key ||
255 !(t->dev->flags & IFF_UP))
256 continue;
258 if (t->dev->type != ARPHRD_IPGRE &&
259 t->dev->type != dev_type)
260 continue;
262 score = 0;
263 if (t->parms.link != link)
264 score |= 1;
265 if (t->dev->type != dev_type)
266 score |= 2;
267 if (score == 0)
268 return t;
270 if (score < cand_score) {
271 cand = t;
272 cand_score = score;
276 if (cand != NULL)
277 return cand;
279 if (ign->fb_tunnel_dev->flags & IFF_UP)
280 return netdev_priv(ign->fb_tunnel_dev);
282 return NULL;
285 static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
286 struct ip_tunnel_parm *parms)
288 __be32 remote = parms->iph.daddr;
289 __be32 local = parms->iph.saddr;
290 __be32 key = parms->i_key;
291 unsigned h = HASH(key);
292 int prio = 0;
294 if (local)
295 prio |= 1;
296 if (remote && !ipv4_is_multicast(remote)) {
297 prio |= 2;
298 h ^= HASH(remote);
301 return &ign->tunnels[prio][h];
304 static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
305 struct ip_tunnel *t)
307 return __ipgre_bucket(ign, &t->parms);
310 static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
312 struct ip_tunnel **tp = ipgre_bucket(ign, t);
314 t->next = *tp;
315 write_lock_bh(&ipgre_lock);
316 *tp = t;
317 write_unlock_bh(&ipgre_lock);
320 static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
322 struct ip_tunnel **tp;
324 for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
325 if (t == *tp) {
326 write_lock_bh(&ipgre_lock);
327 *tp = t->next;
328 write_unlock_bh(&ipgre_lock);
329 break;
334 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
335 struct ip_tunnel_parm *parms,
336 int type)
338 __be32 remote = parms->iph.daddr;
339 __be32 local = parms->iph.saddr;
340 __be32 key = parms->i_key;
341 int link = parms->link;
342 struct ip_tunnel *t, **tp;
343 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
345 for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next)
346 if (local == t->parms.iph.saddr &&
347 remote == t->parms.iph.daddr &&
348 key == t->parms.i_key &&
349 link == t->parms.link &&
350 type == t->dev->type)
351 break;
353 return t;
356 static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
357 struct ip_tunnel_parm *parms, int create)
359 struct ip_tunnel *t, *nt;
360 struct net_device *dev;
361 char name[IFNAMSIZ];
362 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
364 t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
365 if (t || !create)
366 return t;
368 if (parms->name[0])
369 strlcpy(name, parms->name, IFNAMSIZ);
370 else
371 sprintf(name, "gre%%d");
373 dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
374 if (!dev)
375 return NULL;
377 dev_net_set(dev, net);
379 if (strchr(name, '%')) {
380 if (dev_alloc_name(dev, name) < 0)
381 goto failed_free;
384 nt = netdev_priv(dev);
385 nt->parms = *parms;
386 dev->rtnl_link_ops = &ipgre_link_ops;
388 dev->mtu = ipgre_tunnel_bind_dev(dev);
390 if (register_netdevice(dev) < 0)
391 goto failed_free;
393 dev_hold(dev);
394 ipgre_tunnel_link(ign, nt);
395 return nt;
397 failed_free:
398 free_netdev(dev);
399 return NULL;
402 static void ipgre_tunnel_uninit(struct net_device *dev)
404 struct net *net = dev_net(dev);
405 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
407 ipgre_tunnel_unlink(ign, netdev_priv(dev));
408 dev_put(dev);
412 static void ipgre_err(struct sk_buff *skb, u32 info)
415 /* All the routers (except for Linux) return only
416 8 bytes of packet payload. It means, that precise relaying of
417 ICMP in the real Internet is absolutely infeasible.
419 Moreover, Cisco "wise men" put GRE key to the third word
420 in GRE header. It makes impossible maintaining even soft state for keyed
421 GRE tunnels with enabled checksum. Tell them "thank you".
423 Well, I wonder, rfc1812 was written by Cisco employee,
424 what the hell these idiots break standrads established
425 by themself???
428 struct iphdr *iph = (struct iphdr *)skb->data;
429 __be16 *p = (__be16*)(skb->data+(iph->ihl<<2));
430 int grehlen = (iph->ihl<<2) + 4;
431 const int type = icmp_hdr(skb)->type;
432 const int code = icmp_hdr(skb)->code;
433 struct ip_tunnel *t;
434 __be16 flags;
436 flags = p[0];
437 if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
438 if (flags&(GRE_VERSION|GRE_ROUTING))
439 return;
440 if (flags&GRE_KEY) {
441 grehlen += 4;
442 if (flags&GRE_CSUM)
443 grehlen += 4;
447 /* If only 8 bytes returned, keyed message will be dropped here */
448 if (skb_headlen(skb) < grehlen)
449 return;
451 switch (type) {
452 default:
453 case ICMP_PARAMETERPROB:
454 return;
456 case ICMP_DEST_UNREACH:
457 switch (code) {
458 case ICMP_SR_FAILED:
459 case ICMP_PORT_UNREACH:
460 /* Impossible event. */
461 return;
462 case ICMP_FRAG_NEEDED:
463 /* Soft state for pmtu is maintained by IP core. */
464 return;
465 default:
466 /* All others are translated to HOST_UNREACH.
467 rfc2003 contains "deep thoughts" about NET_UNREACH,
468 I believe they are just ether pollution. --ANK
470 break;
472 break;
473 case ICMP_TIME_EXCEEDED:
474 if (code != ICMP_EXC_TTL)
475 return;
476 break;
479 read_lock(&ipgre_lock);
480 t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
481 flags & GRE_KEY ?
482 *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
483 p[1]);
484 if (t == NULL || t->parms.iph.daddr == 0 ||
485 ipv4_is_multicast(t->parms.iph.daddr))
486 goto out;
488 if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
489 goto out;
491 if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
492 t->err_count++;
493 else
494 t->err_count = 1;
495 t->err_time = jiffies;
496 out:
497 read_unlock(&ipgre_lock);
498 return;
501 static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
503 if (INET_ECN_is_ce(iph->tos)) {
504 if (skb->protocol == htons(ETH_P_IP)) {
505 IP_ECN_set_ce(ip_hdr(skb));
506 } else if (skb->protocol == htons(ETH_P_IPV6)) {
507 IP6_ECN_set_ce(ipv6_hdr(skb));
512 static inline u8
513 ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
515 u8 inner = 0;
516 if (skb->protocol == htons(ETH_P_IP))
517 inner = old_iph->tos;
518 else if (skb->protocol == htons(ETH_P_IPV6))
519 inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
520 return INET_ECN_encapsulate(tos, inner);
523 static int ipgre_rcv(struct sk_buff *skb)
525 struct iphdr *iph;
526 u8 *h;
527 __be16 flags;
528 __sum16 csum = 0;
529 __be32 key = 0;
530 u32 seqno = 0;
531 struct ip_tunnel *tunnel;
532 int offset = 4;
533 __be16 gre_proto;
534 unsigned int len;
536 if (!pskb_may_pull(skb, 16))
537 goto drop_nolock;
539 iph = ip_hdr(skb);
540 h = skb->data;
541 flags = *(__be16*)h;
543 if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
544 /* - Version must be 0.
545 - We do not support routing headers.
547 if (flags&(GRE_VERSION|GRE_ROUTING))
548 goto drop_nolock;
550 if (flags&GRE_CSUM) {
551 switch (skb->ip_summed) {
552 case CHECKSUM_COMPLETE:
553 csum = csum_fold(skb->csum);
554 if (!csum)
555 break;
556 /* fall through */
557 case CHECKSUM_NONE:
558 skb->csum = 0;
559 csum = __skb_checksum_complete(skb);
560 skb->ip_summed = CHECKSUM_COMPLETE;
562 offset += 4;
564 if (flags&GRE_KEY) {
565 key = *(__be32*)(h + offset);
566 offset += 4;
568 if (flags&GRE_SEQ) {
569 seqno = ntohl(*(__be32*)(h + offset));
570 offset += 4;
574 gre_proto = *(__be16 *)(h + 2);
576 read_lock(&ipgre_lock);
577 if ((tunnel = ipgre_tunnel_lookup(skb->dev,
578 iph->saddr, iph->daddr, key,
579 gre_proto))) {
580 struct net_device_stats *stats = &tunnel->dev->stats;
582 secpath_reset(skb);
584 skb->protocol = gre_proto;
585 /* WCCP version 1 and 2 protocol decoding.
586 * - Change protocol to IP
587 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
589 if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
590 skb->protocol = htons(ETH_P_IP);
591 if ((*(h + offset) & 0xF0) != 0x40)
592 offset += 4;
595 skb->mac_header = skb->network_header;
596 __pskb_pull(skb, offset);
597 skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
598 skb->pkt_type = PACKET_HOST;
599 #ifdef CONFIG_NET_IPGRE_BROADCAST
600 if (ipv4_is_multicast(iph->daddr)) {
601 /* Looped back packet, drop it! */
602 if (skb_rtable(skb)->fl.iif == 0)
603 goto drop;
604 stats->multicast++;
605 skb->pkt_type = PACKET_BROADCAST;
607 #endif
609 if (((flags&GRE_CSUM) && csum) ||
610 (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
611 stats->rx_crc_errors++;
612 stats->rx_errors++;
613 goto drop;
615 if (tunnel->parms.i_flags&GRE_SEQ) {
616 if (!(flags&GRE_SEQ) ||
617 (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
618 stats->rx_fifo_errors++;
619 stats->rx_errors++;
620 goto drop;
622 tunnel->i_seqno = seqno + 1;
625 len = skb->len;
627 /* Warning: All skb pointers will be invalidated! */
628 if (tunnel->dev->type == ARPHRD_ETHER) {
629 if (!pskb_may_pull(skb, ETH_HLEN)) {
630 stats->rx_length_errors++;
631 stats->rx_errors++;
632 goto drop;
635 iph = ip_hdr(skb);
636 skb->protocol = eth_type_trans(skb, tunnel->dev);
637 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
640 stats->rx_packets++;
641 stats->rx_bytes += len;
642 skb->dev = tunnel->dev;
643 skb_dst_drop(skb);
644 nf_reset(skb);
646 skb_reset_network_header(skb);
647 ipgre_ecn_decapsulate(iph, skb);
649 netif_rx(skb);
650 read_unlock(&ipgre_lock);
651 return(0);
653 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
655 drop:
656 read_unlock(&ipgre_lock);
657 drop_nolock:
658 kfree_skb(skb);
659 return(0);
662 static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
664 struct ip_tunnel *tunnel = netdev_priv(dev);
665 struct net_device_stats *stats = &dev->stats;
666 struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
667 struct iphdr *old_iph = ip_hdr(skb);
668 struct iphdr *tiph;
669 u8 tos;
670 __be16 df;
671 struct rtable *rt; /* Route to the other host */
672 struct net_device *tdev; /* Device to other host */
673 struct iphdr *iph; /* Our new IP header */
674 unsigned int max_headroom; /* The extra header space needed */
675 int gre_hlen;
676 __be32 dst;
677 int mtu;
679 if (dev->type == ARPHRD_ETHER)
680 IPCB(skb)->flags = 0;
682 if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
683 gre_hlen = 0;
684 tiph = (struct iphdr *)skb->data;
685 } else {
686 gre_hlen = tunnel->hlen;
687 tiph = &tunnel->parms.iph;
690 if ((dst = tiph->daddr) == 0) {
691 /* NBMA tunnel */
693 if (skb_dst(skb) == NULL) {
694 stats->tx_fifo_errors++;
695 goto tx_error;
698 if (skb->protocol == htons(ETH_P_IP)) {
699 rt = skb_rtable(skb);
700 if ((dst = rt->rt_gateway) == 0)
701 goto tx_error_icmp;
703 #ifdef CONFIG_IPV6
704 else if (skb->protocol == htons(ETH_P_IPV6)) {
705 struct in6_addr *addr6;
706 int addr_type;
707 struct neighbour *neigh = skb_dst(skb)->neighbour;
709 if (neigh == NULL)
710 goto tx_error;
712 addr6 = (struct in6_addr *)&neigh->primary_key;
713 addr_type = ipv6_addr_type(addr6);
715 if (addr_type == IPV6_ADDR_ANY) {
716 addr6 = &ipv6_hdr(skb)->daddr;
717 addr_type = ipv6_addr_type(addr6);
720 if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
721 goto tx_error_icmp;
723 dst = addr6->s6_addr32[3];
725 #endif
726 else
727 goto tx_error;
730 tos = tiph->tos;
731 if (tos == 1) {
732 tos = 0;
733 if (skb->protocol == htons(ETH_P_IP))
734 tos = old_iph->tos;
738 struct flowi fl = { .oif = tunnel->parms.link,
739 .nl_u = { .ip4_u =
740 { .daddr = dst,
741 .saddr = tiph->saddr,
742 .tos = RT_TOS(tos) } },
743 .proto = IPPROTO_GRE };
744 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
745 stats->tx_carrier_errors++;
746 goto tx_error;
749 tdev = rt->u.dst.dev;
751 if (tdev == dev) {
752 ip_rt_put(rt);
753 stats->collisions++;
754 goto tx_error;
757 df = tiph->frag_off;
758 if (df)
759 mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen;
760 else
761 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
763 if (skb_dst(skb))
764 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
766 if (skb->protocol == htons(ETH_P_IP)) {
767 df |= (old_iph->frag_off&htons(IP_DF));
769 if ((old_iph->frag_off&htons(IP_DF)) &&
770 mtu < ntohs(old_iph->tot_len)) {
771 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
772 ip_rt_put(rt);
773 goto tx_error;
776 #ifdef CONFIG_IPV6
777 else if (skb->protocol == htons(ETH_P_IPV6)) {
778 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
780 if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
781 if ((tunnel->parms.iph.daddr &&
782 !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
783 rt6->rt6i_dst.plen == 128) {
784 rt6->rt6i_flags |= RTF_MODIFIED;
785 skb_dst(skb)->metrics[RTAX_MTU-1] = mtu;
789 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
790 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
791 ip_rt_put(rt);
792 goto tx_error;
795 #endif
797 if (tunnel->err_count > 0) {
798 if (time_before(jiffies,
799 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
800 tunnel->err_count--;
802 dst_link_failure(skb);
803 } else
804 tunnel->err_count = 0;
807 max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;
809 if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
810 (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
811 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
812 if (!new_skb) {
813 ip_rt_put(rt);
814 txq->tx_dropped++;
815 dev_kfree_skb(skb);
816 return NETDEV_TX_OK;
818 if (skb->sk)
819 skb_set_owner_w(new_skb, skb->sk);
820 dev_kfree_skb(skb);
821 skb = new_skb;
822 old_iph = ip_hdr(skb);
825 skb_reset_transport_header(skb);
826 skb_push(skb, gre_hlen);
827 skb_reset_network_header(skb);
828 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
829 IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
830 IPSKB_REROUTED);
831 skb_dst_drop(skb);
832 skb_dst_set(skb, &rt->u.dst);
835 * Push down and install the IPIP header.
838 iph = ip_hdr(skb);
839 iph->version = 4;
840 iph->ihl = sizeof(struct iphdr) >> 2;
841 iph->frag_off = df;
842 iph->protocol = IPPROTO_GRE;
843 iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb);
844 iph->daddr = rt->rt_dst;
845 iph->saddr = rt->rt_src;
847 if ((iph->ttl = tiph->ttl) == 0) {
848 if (skb->protocol == htons(ETH_P_IP))
849 iph->ttl = old_iph->ttl;
850 #ifdef CONFIG_IPV6
851 else if (skb->protocol == htons(ETH_P_IPV6))
852 iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
853 #endif
854 else
855 iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
858 ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
859 ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
860 htons(ETH_P_TEB) : skb->protocol;
862 if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
863 __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
865 if (tunnel->parms.o_flags&GRE_SEQ) {
866 ++tunnel->o_seqno;
867 *ptr = htonl(tunnel->o_seqno);
868 ptr--;
870 if (tunnel->parms.o_flags&GRE_KEY) {
871 *ptr = tunnel->parms.o_key;
872 ptr--;
874 if (tunnel->parms.o_flags&GRE_CSUM) {
875 *ptr = 0;
876 *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
880 nf_reset(skb);
882 IPTUNNEL_XMIT();
883 return NETDEV_TX_OK;
885 tx_error_icmp:
886 dst_link_failure(skb);
888 tx_error:
889 stats->tx_errors++;
890 dev_kfree_skb(skb);
891 return NETDEV_TX_OK;
894 static int ipgre_tunnel_bind_dev(struct net_device *dev)
896 struct net_device *tdev = NULL;
897 struct ip_tunnel *tunnel;
898 struct iphdr *iph;
899 int hlen = LL_MAX_HEADER;
900 int mtu = ETH_DATA_LEN;
901 int addend = sizeof(struct iphdr) + 4;
903 tunnel = netdev_priv(dev);
904 iph = &tunnel->parms.iph;
906 /* Guess output device to choose reasonable mtu and needed_headroom */
908 if (iph->daddr) {
909 struct flowi fl = { .oif = tunnel->parms.link,
910 .nl_u = { .ip4_u =
911 { .daddr = iph->daddr,
912 .saddr = iph->saddr,
913 .tos = RT_TOS(iph->tos) } },
914 .proto = IPPROTO_GRE };
915 struct rtable *rt;
916 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
917 tdev = rt->u.dst.dev;
918 ip_rt_put(rt);
921 if (dev->type != ARPHRD_ETHER)
922 dev->flags |= IFF_POINTOPOINT;
925 if (!tdev && tunnel->parms.link)
926 tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
928 if (tdev) {
929 hlen = tdev->hard_header_len + tdev->needed_headroom;
930 mtu = tdev->mtu;
932 dev->iflink = tunnel->parms.link;
934 /* Precalculate GRE options length */
935 if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
936 if (tunnel->parms.o_flags&GRE_CSUM)
937 addend += 4;
938 if (tunnel->parms.o_flags&GRE_KEY)
939 addend += 4;
940 if (tunnel->parms.o_flags&GRE_SEQ)
941 addend += 4;
943 dev->needed_headroom = addend + hlen;
944 mtu -= dev->hard_header_len + addend;
946 if (mtu < 68)
947 mtu = 68;
949 tunnel->hlen = addend;
951 return mtu;
954 static int
955 ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
957 int err = 0;
958 struct ip_tunnel_parm p;
959 struct ip_tunnel *t;
960 struct net *net = dev_net(dev);
961 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
963 switch (cmd) {
964 case SIOCGETTUNNEL:
965 t = NULL;
966 if (dev == ign->fb_tunnel_dev) {
967 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
968 err = -EFAULT;
969 break;
971 t = ipgre_tunnel_locate(net, &p, 0);
973 if (t == NULL)
974 t = netdev_priv(dev);
975 memcpy(&p, &t->parms, sizeof(p));
976 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
977 err = -EFAULT;
978 break;
980 case SIOCADDTUNNEL:
981 case SIOCCHGTUNNEL:
982 err = -EPERM;
983 if (!capable(CAP_NET_ADMIN))
984 goto done;
986 err = -EFAULT;
987 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
988 goto done;
990 err = -EINVAL;
991 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
992 p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
993 ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
994 goto done;
995 if (p.iph.ttl)
996 p.iph.frag_off |= htons(IP_DF);
998 if (!(p.i_flags&GRE_KEY))
999 p.i_key = 0;
1000 if (!(p.o_flags&GRE_KEY))
1001 p.o_key = 0;
1003 t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
1005 if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
1006 if (t != NULL) {
1007 if (t->dev != dev) {
1008 err = -EEXIST;
1009 break;
1011 } else {
1012 unsigned nflags = 0;
1014 t = netdev_priv(dev);
1016 if (ipv4_is_multicast(p.iph.daddr))
1017 nflags = IFF_BROADCAST;
1018 else if (p.iph.daddr)
1019 nflags = IFF_POINTOPOINT;
1021 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
1022 err = -EINVAL;
1023 break;
1025 ipgre_tunnel_unlink(ign, t);
1026 t->parms.iph.saddr = p.iph.saddr;
1027 t->parms.iph.daddr = p.iph.daddr;
1028 t->parms.i_key = p.i_key;
1029 t->parms.o_key = p.o_key;
1030 memcpy(dev->dev_addr, &p.iph.saddr, 4);
1031 memcpy(dev->broadcast, &p.iph.daddr, 4);
1032 ipgre_tunnel_link(ign, t);
1033 netdev_state_change(dev);
1037 if (t) {
1038 err = 0;
1039 if (cmd == SIOCCHGTUNNEL) {
1040 t->parms.iph.ttl = p.iph.ttl;
1041 t->parms.iph.tos = p.iph.tos;
1042 t->parms.iph.frag_off = p.iph.frag_off;
1043 if (t->parms.link != p.link) {
1044 t->parms.link = p.link;
1045 dev->mtu = ipgre_tunnel_bind_dev(dev);
1046 netdev_state_change(dev);
1049 if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
1050 err = -EFAULT;
1051 } else
1052 err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
1053 break;
1055 case SIOCDELTUNNEL:
1056 err = -EPERM;
1057 if (!capable(CAP_NET_ADMIN))
1058 goto done;
1060 if (dev == ign->fb_tunnel_dev) {
1061 err = -EFAULT;
1062 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1063 goto done;
1064 err = -ENOENT;
1065 if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
1066 goto done;
1067 err = -EPERM;
1068 if (t == netdev_priv(ign->fb_tunnel_dev))
1069 goto done;
1070 dev = t->dev;
1072 unregister_netdevice(dev);
1073 err = 0;
1074 break;
1076 default:
1077 err = -EINVAL;
1080 done:
1081 return err;
1084 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1086 struct ip_tunnel *tunnel = netdev_priv(dev);
1087 if (new_mtu < 68 ||
1088 new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1089 return -EINVAL;
1090 dev->mtu = new_mtu;
1091 return 0;
1094 /* Nice toy. Unfortunately, useless in real life :-)
1095 It allows to construct virtual multiprotocol broadcast "LAN"
1096 over the Internet, provided multicast routing is tuned.
1099 I have no idea was this bicycle invented before me,
1100 so that I had to set ARPHRD_IPGRE to a random value.
1101 I have an impression, that Cisco could make something similar,
1102 but this feature is apparently missing in IOS<=11.2(8).
1104 I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1105 with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1107 ping -t 255 224.66.66.66
1109 If nobody answers, mbone does not work.
1111 ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1112 ip addr add 10.66.66.<somewhat>/24 dev Universe
1113 ifconfig Universe up
1114 ifconfig Universe add fe80::<Your_real_addr>/10
1115 ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1116 ftp 10.66.66.66
1118 ftp fec0:6666:6666::193.233.7.65
1123 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1124 unsigned short type,
1125 const void *daddr, const void *saddr, unsigned len)
1127 struct ip_tunnel *t = netdev_priv(dev);
1128 struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1129 __be16 *p = (__be16*)(iph+1);
1131 memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1132 p[0] = t->parms.o_flags;
1133 p[1] = htons(type);
1136 * Set the source hardware address.
1139 if (saddr)
1140 memcpy(&iph->saddr, saddr, 4);
1142 if (daddr) {
1143 memcpy(&iph->daddr, daddr, 4);
1144 return t->hlen;
1146 if (iph->daddr && !ipv4_is_multicast(iph->daddr))
1147 return t->hlen;
1149 return -t->hlen;
1152 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1154 struct iphdr *iph = (struct iphdr *) skb_mac_header(skb);
1155 memcpy(haddr, &iph->saddr, 4);
1156 return 4;
1159 static const struct header_ops ipgre_header_ops = {
1160 .create = ipgre_header,
1161 .parse = ipgre_header_parse,
1164 #ifdef CONFIG_NET_IPGRE_BROADCAST
1165 static int ipgre_open(struct net_device *dev)
1167 struct ip_tunnel *t = netdev_priv(dev);
1169 if (ipv4_is_multicast(t->parms.iph.daddr)) {
1170 struct flowi fl = { .oif = t->parms.link,
1171 .nl_u = { .ip4_u =
1172 { .daddr = t->parms.iph.daddr,
1173 .saddr = t->parms.iph.saddr,
1174 .tos = RT_TOS(t->parms.iph.tos) } },
1175 .proto = IPPROTO_GRE };
1176 struct rtable *rt;
1177 if (ip_route_output_key(dev_net(dev), &rt, &fl))
1178 return -EADDRNOTAVAIL;
1179 dev = rt->u.dst.dev;
1180 ip_rt_put(rt);
1181 if (__in_dev_get_rtnl(dev) == NULL)
1182 return -EADDRNOTAVAIL;
1183 t->mlink = dev->ifindex;
1184 ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
1186 return 0;
1189 static int ipgre_close(struct net_device *dev)
1191 struct ip_tunnel *t = netdev_priv(dev);
1193 if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1194 struct in_device *in_dev;
1195 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1196 if (in_dev) {
1197 ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1198 in_dev_put(in_dev);
1201 return 0;
1204 #endif
1206 static const struct net_device_ops ipgre_netdev_ops = {
1207 .ndo_init = ipgre_tunnel_init,
1208 .ndo_uninit = ipgre_tunnel_uninit,
1209 #ifdef CONFIG_NET_IPGRE_BROADCAST
1210 .ndo_open = ipgre_open,
1211 .ndo_stop = ipgre_close,
1212 #endif
1213 .ndo_start_xmit = ipgre_tunnel_xmit,
1214 .ndo_do_ioctl = ipgre_tunnel_ioctl,
1215 .ndo_change_mtu = ipgre_tunnel_change_mtu,
1218 static void ipgre_tunnel_setup(struct net_device *dev)
1220 dev->netdev_ops = &ipgre_netdev_ops;
1221 dev->destructor = free_netdev;
1223 dev->type = ARPHRD_IPGRE;
1224 dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1225 dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
1226 dev->flags = IFF_NOARP;
1227 dev->iflink = 0;
1228 dev->addr_len = 4;
1229 dev->features |= NETIF_F_NETNS_LOCAL;
1230 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
1233 static int ipgre_tunnel_init(struct net_device *dev)
1235 struct ip_tunnel *tunnel;
1236 struct iphdr *iph;
1238 tunnel = netdev_priv(dev);
1239 iph = &tunnel->parms.iph;
1241 tunnel->dev = dev;
1242 strcpy(tunnel->parms.name, dev->name);
1244 memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
1245 memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1247 if (iph->daddr) {
1248 #ifdef CONFIG_NET_IPGRE_BROADCAST
1249 if (ipv4_is_multicast(iph->daddr)) {
1250 if (!iph->saddr)
1251 return -EINVAL;
1252 dev->flags = IFF_BROADCAST;
1253 dev->header_ops = &ipgre_header_ops;
1255 #endif
1256 } else
1257 dev->header_ops = &ipgre_header_ops;
1259 return 0;
1262 static void ipgre_fb_tunnel_init(struct net_device *dev)
1264 struct ip_tunnel *tunnel = netdev_priv(dev);
1265 struct iphdr *iph = &tunnel->parms.iph;
1266 struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);
1268 tunnel->dev = dev;
1269 strcpy(tunnel->parms.name, dev->name);
1271 iph->version = 4;
1272 iph->protocol = IPPROTO_GRE;
1273 iph->ihl = 5;
1274 tunnel->hlen = sizeof(struct iphdr) + 4;
1276 dev_hold(dev);
1277 ign->tunnels_wc[0] = tunnel;
1281 static const struct net_protocol ipgre_protocol = {
1282 .handler = ipgre_rcv,
1283 .err_handler = ipgre_err,
1284 .netns_ok = 1,
1287 static void ipgre_destroy_tunnels(struct ipgre_net *ign)
1289 int prio;
1291 for (prio = 0; prio < 4; prio++) {
1292 int h;
1293 for (h = 0; h < HASH_SIZE; h++) {
1294 struct ip_tunnel *t;
1295 while ((t = ign->tunnels[prio][h]) != NULL)
1296 unregister_netdevice(t->dev);
1301 static int ipgre_init_net(struct net *net)
1303 int err;
1304 struct ipgre_net *ign;
1306 err = -ENOMEM;
1307 ign = kzalloc(sizeof(struct ipgre_net), GFP_KERNEL);
1308 if (ign == NULL)
1309 goto err_alloc;
1311 err = net_assign_generic(net, ipgre_net_id, ign);
1312 if (err < 0)
1313 goto err_assign;
1315 ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1316 ipgre_tunnel_setup);
1317 if (!ign->fb_tunnel_dev) {
1318 err = -ENOMEM;
1319 goto err_alloc_dev;
1321 dev_net_set(ign->fb_tunnel_dev, net);
1323 ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
1324 ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
1326 if ((err = register_netdev(ign->fb_tunnel_dev)))
1327 goto err_reg_dev;
1329 return 0;
1331 err_reg_dev:
1332 free_netdev(ign->fb_tunnel_dev);
1333 err_alloc_dev:
1334 /* nothing */
1335 err_assign:
1336 kfree(ign);
1337 err_alloc:
1338 return err;
1341 static void ipgre_exit_net(struct net *net)
1343 struct ipgre_net *ign;
1345 ign = net_generic(net, ipgre_net_id);
1346 rtnl_lock();
1347 ipgre_destroy_tunnels(ign);
1348 rtnl_unlock();
1349 kfree(ign);
1352 static struct pernet_operations ipgre_net_ops = {
1353 .init = ipgre_init_net,
1354 .exit = ipgre_exit_net,
1357 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1359 __be16 flags;
1361 if (!data)
1362 return 0;
1364 flags = 0;
1365 if (data[IFLA_GRE_IFLAGS])
1366 flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1367 if (data[IFLA_GRE_OFLAGS])
1368 flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1369 if (flags & (GRE_VERSION|GRE_ROUTING))
1370 return -EINVAL;
1372 return 0;
1375 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1377 __be32 daddr;
1379 if (tb[IFLA_ADDRESS]) {
1380 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1381 return -EINVAL;
1382 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1383 return -EADDRNOTAVAIL;
1386 if (!data)
1387 goto out;
1389 if (data[IFLA_GRE_REMOTE]) {
1390 memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1391 if (!daddr)
1392 return -EINVAL;
1395 out:
1396 return ipgre_tunnel_validate(tb, data);
1399 static void ipgre_netlink_parms(struct nlattr *data[],
1400 struct ip_tunnel_parm *parms)
1402 memset(parms, 0, sizeof(*parms));
1404 parms->iph.protocol = IPPROTO_GRE;
1406 if (!data)
1407 return;
1409 if (data[IFLA_GRE_LINK])
1410 parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
1412 if (data[IFLA_GRE_IFLAGS])
1413 parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
1415 if (data[IFLA_GRE_OFLAGS])
1416 parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
1418 if (data[IFLA_GRE_IKEY])
1419 parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
1421 if (data[IFLA_GRE_OKEY])
1422 parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
1424 if (data[IFLA_GRE_LOCAL])
1425 parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);
1427 if (data[IFLA_GRE_REMOTE])
1428 parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);
1430 if (data[IFLA_GRE_TTL])
1431 parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
1433 if (data[IFLA_GRE_TOS])
1434 parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
1436 if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
1437 parms->iph.frag_off = htons(IP_DF);
1440 static int ipgre_tap_init(struct net_device *dev)
1442 struct ip_tunnel *tunnel;
1444 tunnel = netdev_priv(dev);
1446 tunnel->dev = dev;
1447 strcpy(tunnel->parms.name, dev->name);
1449 ipgre_tunnel_bind_dev(dev);
1451 return 0;
1454 static const struct net_device_ops ipgre_tap_netdev_ops = {
1455 .ndo_init = ipgre_tap_init,
1456 .ndo_uninit = ipgre_tunnel_uninit,
1457 .ndo_start_xmit = ipgre_tunnel_xmit,
1458 .ndo_set_mac_address = eth_mac_addr,
1459 .ndo_validate_addr = eth_validate_addr,
1460 .ndo_change_mtu = ipgre_tunnel_change_mtu,
1463 static void ipgre_tap_setup(struct net_device *dev)
1466 ether_setup(dev);
1468 dev->netdev_ops = &ipgre_netdev_ops;
1469 dev->destructor = free_netdev;
1471 dev->iflink = 0;
1472 dev->features |= NETIF_F_NETNS_LOCAL;
1475 static int ipgre_newlink(struct net_device *dev, struct nlattr *tb[],
1476 struct nlattr *data[])
1478 struct ip_tunnel *nt;
1479 struct net *net = dev_net(dev);
1480 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1481 int mtu;
1482 int err;
1484 nt = netdev_priv(dev);
1485 ipgre_netlink_parms(data, &nt->parms);
1487 if (ipgre_tunnel_find(net, &nt->parms, dev->type))
1488 return -EEXIST;
1490 if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1491 random_ether_addr(dev->dev_addr);
1493 mtu = ipgre_tunnel_bind_dev(dev);
1494 if (!tb[IFLA_MTU])
1495 dev->mtu = mtu;
1497 err = register_netdevice(dev);
1498 if (err)
1499 goto out;
1501 dev_hold(dev);
1502 ipgre_tunnel_link(ign, nt);
1504 out:
1505 return err;
1508 static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1509 struct nlattr *data[])
1511 struct ip_tunnel *t, *nt;
1512 struct net *net = dev_net(dev);
1513 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1514 struct ip_tunnel_parm p;
1515 int mtu;
1517 if (dev == ign->fb_tunnel_dev)
1518 return -EINVAL;
1520 nt = netdev_priv(dev);
1521 ipgre_netlink_parms(data, &p);
1523 t = ipgre_tunnel_locate(net, &p, 0);
1525 if (t) {
1526 if (t->dev != dev)
1527 return -EEXIST;
1528 } else {
1529 unsigned nflags = 0;
1531 t = nt;
1533 if (ipv4_is_multicast(p.iph.daddr))
1534 nflags = IFF_BROADCAST;
1535 else if (p.iph.daddr)
1536 nflags = IFF_POINTOPOINT;
1538 if ((dev->flags ^ nflags) &
1539 (IFF_POINTOPOINT | IFF_BROADCAST))
1540 return -EINVAL;
1542 ipgre_tunnel_unlink(ign, t);
1543 t->parms.iph.saddr = p.iph.saddr;
1544 t->parms.iph.daddr = p.iph.daddr;
1545 t->parms.i_key = p.i_key;
1546 memcpy(dev->dev_addr, &p.iph.saddr, 4);
1547 memcpy(dev->broadcast, &p.iph.daddr, 4);
1548 ipgre_tunnel_link(ign, t);
1549 netdev_state_change(dev);
1552 t->parms.o_key = p.o_key;
1553 t->parms.iph.ttl = p.iph.ttl;
1554 t->parms.iph.tos = p.iph.tos;
1555 t->parms.iph.frag_off = p.iph.frag_off;
1557 if (t->parms.link != p.link) {
1558 t->parms.link = p.link;
1559 mtu = ipgre_tunnel_bind_dev(dev);
1560 if (!tb[IFLA_MTU])
1561 dev->mtu = mtu;
1562 netdev_state_change(dev);
1565 return 0;
1568 static size_t ipgre_get_size(const struct net_device *dev)
1570 return
1571 /* IFLA_GRE_LINK */
1572 nla_total_size(4) +
1573 /* IFLA_GRE_IFLAGS */
1574 nla_total_size(2) +
1575 /* IFLA_GRE_OFLAGS */
1576 nla_total_size(2) +
1577 /* IFLA_GRE_IKEY */
1578 nla_total_size(4) +
1579 /* IFLA_GRE_OKEY */
1580 nla_total_size(4) +
1581 /* IFLA_GRE_LOCAL */
1582 nla_total_size(4) +
1583 /* IFLA_GRE_REMOTE */
1584 nla_total_size(4) +
1585 /* IFLA_GRE_TTL */
1586 nla_total_size(1) +
1587 /* IFLA_GRE_TOS */
1588 nla_total_size(1) +
1589 /* IFLA_GRE_PMTUDISC */
1590 nla_total_size(1) +
1594 static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1596 struct ip_tunnel *t = netdev_priv(dev);
1597 struct ip_tunnel_parm *p = &t->parms;
1599 NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
1600 NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
1601 NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
1602 NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
1603 NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
1604 NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
1605 NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
1606 NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
1607 NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
1608 NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));
1610 return 0;
1612 nla_put_failure:
1613 return -EMSGSIZE;
1616 static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
1617 [IFLA_GRE_LINK] = { .type = NLA_U32 },
1618 [IFLA_GRE_IFLAGS] = { .type = NLA_U16 },
1619 [IFLA_GRE_OFLAGS] = { .type = NLA_U16 },
1620 [IFLA_GRE_IKEY] = { .type = NLA_U32 },
1621 [IFLA_GRE_OKEY] = { .type = NLA_U32 },
1622 [IFLA_GRE_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
1623 [IFLA_GRE_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
1624 [IFLA_GRE_TTL] = { .type = NLA_U8 },
1625 [IFLA_GRE_TOS] = { .type = NLA_U8 },
1626 [IFLA_GRE_PMTUDISC] = { .type = NLA_U8 },
1629 static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
1630 .kind = "gre",
1631 .maxtype = IFLA_GRE_MAX,
1632 .policy = ipgre_policy,
1633 .priv_size = sizeof(struct ip_tunnel),
1634 .setup = ipgre_tunnel_setup,
1635 .validate = ipgre_tunnel_validate,
1636 .newlink = ipgre_newlink,
1637 .changelink = ipgre_changelink,
1638 .get_size = ipgre_get_size,
1639 .fill_info = ipgre_fill_info,
1642 static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
1643 .kind = "gretap",
1644 .maxtype = IFLA_GRE_MAX,
1645 .policy = ipgre_policy,
1646 .priv_size = sizeof(struct ip_tunnel),
1647 .setup = ipgre_tap_setup,
1648 .validate = ipgre_tap_validate,
1649 .newlink = ipgre_newlink,
1650 .changelink = ipgre_changelink,
1651 .get_size = ipgre_get_size,
1652 .fill_info = ipgre_fill_info,
 *	And now the module code and kernel interface.
1659 static int __init ipgre_init(void)
1661 int err;
1663 printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1665 if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
1666 printk(KERN_INFO "ipgre init: can't add protocol\n");
1667 return -EAGAIN;
1670 err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops);
1671 if (err < 0)
1672 goto gen_device_failed;
1674 err = rtnl_link_register(&ipgre_link_ops);
1675 if (err < 0)
1676 goto rtnl_link_failed;
1678 err = rtnl_link_register(&ipgre_tap_ops);
1679 if (err < 0)
1680 goto tap_ops_failed;
1682 out:
1683 return err;
1685 tap_ops_failed:
1686 rtnl_link_unregister(&ipgre_link_ops);
1687 rtnl_link_failed:
1688 unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
1689 gen_device_failed:
1690 inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
1691 goto out;
1694 static void __exit ipgre_fini(void)
1696 rtnl_link_unregister(&ipgre_tap_ops);
1697 rtnl_link_unregister(&ipgre_link_ops);
1698 unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
1699 if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
1700 printk(KERN_INFO "ipgre close: can't remove protocol\n");
1703 module_init(ipgre_init);
1704 module_exit(ipgre_fini);
1705 MODULE_LICENSE("GPL");
1706 MODULE_ALIAS_RTNL_LINK("gre");
1707 MODULE_ALIAS_RTNL_LINK("gretap");