/*
 *	Linux NET3:	GRE over IP protocol decoder.
 *
 *	Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 */
#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <asm/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/if_ether.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ipip.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>

#ifdef CONFIG_IPV6
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif
/*
   Problems & solutions
   --------------------

   1. The most important issue is detecting local dead loops.
   They would cause complete host lockup in transmit, which
   would be "resolved" by stack overflow or, if queueing is enabled,
   with infinite looping in net_bh.

   We cannot track such dead loops during route installation;
   it is an infeasible task. The most general solution would be
   to keep an skb->encapsulation counter (a sort of local ttl)
   and silently drop the packet when it expires. It is the best
   solution, but it requires maintaining a new variable in ALL
   skbs, even if no tunneling is used.

   Current solution: the t->recursion lock breaks dead loops. It looks
   like the dev->tbusy flag, but I preferred a new variable, because
   the semantics is different. One day, when hard_start_xmit
   becomes multithreaded, we will have to use skb->encapsulation.

   2. Networking dead loops would not kill routers, but they would really
   kill the network. The IP hop limit plays the role of "t->recursion"
   in this case, if we copy it from the packet being encapsulated to
   the upper header. It is a very good solution, but it introduces
   two problems:

   - Routing protocols that use packets with ttl=1 (OSPF, RIP2)
     do not work over tunnels.
   - traceroute does not work. I planned to relay ICMP from the tunnel,
     so that this problem would be solved and traceroute output
     would be even more informative. This idea appeared to be wrong:
     only Linux complies with rfc1812 now (yes, guys, Linux is the only
     true router now :-)); all routers (at least in the neighbourhood
     of mine) return only 8 bytes of payload. It is the end.

   Hence, if we want OSPF to work or traceroute to say something
   reasonable, we should search for another solution.

   One of them is to parse the packet, trying to detect an inner
   encapsulation made by our node. That is difficult or even impossible,
   especially taking fragmentation into account. To be short, it is not
   a solution at all.

   Current solution: The solution was UNEXPECTEDLY SIMPLE.
   We force the DF flag on tunnels with a preconfigured hop limit,
   that is ALL. :-) Well, it does not remove the problem completely,
   but exponential growth of network traffic is changed to linear
   (branches that exceed pmtu are pruned) and the tunnel mtu
   rapidly degrades to a value <68, where looping stops.
   Yes, it is not good if there exists a router in the loop
   which does not force DF, even when encapsulating packets have DF set.
   But it is not our problem! Nobody could accuse us; we did
   all that we could. Even if it is your gated that injected
   the fatal route to the network, even if it was you who configured
   the fatal static route: you are innocent. :-)

   3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
   practically identical code. It would be good to glue them
   together, but it is not very evident how to make them modular.
   sit is an integral part of IPv6, while ipip and gre are naturally
   modular. We could extract the common parts (hash table, ioctl etc.)
   to a separate module (ip_tunnel.c).

   Alexey Kuznetsov.
 */
static int ipgre_tunnel_init(struct net_device *dev);
static void ipgre_tunnel_setup(struct net_device *dev);

/* Fallback tunnel: no source, no destination, no key, no options */

static int ipgre_fb_tunnel_init(struct net_device *dev);

static struct net_device *ipgre_fb_tunnel_dev;
/* Tunnel hash table */

/*
   4 hash tables:

   3: (remote,local)
   2: (remote,*)
   1: (*,local)
   0: (*,*)

   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched by a configured keyless tunnel,
   will match the fallback tunnel.
 */
#define HASH_SIZE  16
#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)

static struct ip_tunnel *tunnels[4][HASH_SIZE];

#define tunnels_r_l	(tunnels[3])
#define tunnels_r	(tunnels[2])
#define tunnels_l	(tunnels[1])
#define tunnels_wc	(tunnels[0])
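/* HASH() folds the low byte of the (network-order) value by XORing its
   two nibbles into a 4-bit bucket index. A worked example: a keyed
   packet matching a configured (remote,local) tunnel is looked up in
   tunnels[3][HASH(remote)^HASH(key)]; the lookup below then falls back
   through the (remote,*), (*,local) and (*,*) chains, and finally to
   the fallback device. */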
static DEFINE_RWLOCK(ipgre_lock);
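/* Locking: the packet and error paths (ipgre_rcv, ipgre_err) take the
   read side of ipgre_lock; chain updates in ipgre_tunnel_link() and
   ipgre_tunnel_unlink(), reached from configuration paths, take the
   write side with BHs disabled. */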
/* Given src, dst and key, find appropriate for input tunnel. */

static struct ip_tunnel * ipgre_tunnel_lookup(__be32 remote, __be32 local, __be32 key)
{
	unsigned h0 = HASH(remote);
	unsigned h1 = HASH(key);
	struct ip_tunnel *t;

	for (t = tunnels_r_l[h0^h1]; t; t = t->next) {
		if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
			if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
				return t;
		}
	}
	for (t = tunnels_r[h0^h1]; t; t = t->next) {
		if (remote == t->parms.iph.daddr) {
			if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
				return t;
		}
	}
	for (t = tunnels_l[h1]; t; t = t->next) {
		if (local == t->parms.iph.saddr ||
		    (local == t->parms.iph.daddr &&
		     ipv4_is_multicast(local))) {
			if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
				return t;
		}
	}
	for (t = tunnels_wc[h1]; t; t = t->next) {
		if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
			return t;
	}

	if (ipgre_fb_tunnel_dev->flags&IFF_UP)
		return netdev_priv(ipgre_fb_tunnel_dev);
	return NULL;
}
static struct ip_tunnel **__ipgre_bucket(struct ip_tunnel_parm *parms)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	unsigned h = HASH(key);
	int prio = 0;

	if (local)
		prio |= 1;
	if (remote && !ipv4_is_multicast(remote)) {
		prio |= 2;
		h ^= HASH(remote);
	}

	return &tunnels[prio][h];
}
static inline struct ip_tunnel **ipgre_bucket(struct ip_tunnel *t)
{
	return __ipgre_bucket(&t->parms);
}
static void ipgre_tunnel_link(struct ip_tunnel *t)
{
	struct ip_tunnel **tp = ipgre_bucket(t);

	t->next = *tp;
	write_lock_bh(&ipgre_lock);
	*tp = t;
	write_unlock_bh(&ipgre_lock);
}
static void ipgre_tunnel_unlink(struct ip_tunnel *t)
{
	struct ip_tunnel **tp;

	for (tp = ipgre_bucket(t); *tp; tp = &(*tp)->next) {
		if (t == *tp) {
			write_lock_bh(&ipgre_lock);
			*tp = t->next;
			write_unlock_bh(&ipgre_lock);
			break;
		}
	}
}
static struct ip_tunnel * ipgre_tunnel_locate(struct ip_tunnel_parm *parms, int create)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	struct ip_tunnel *t, **tp, *nt;
	struct net_device *dev;
	char name[IFNAMSIZ];

	for (tp = __ipgre_bucket(parms); (t = *tp) != NULL; tp = &t->next) {
		if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
			if (key == t->parms.i_key)
				return t;
		}
	}
	if (!create)
		return NULL;

	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else
		sprintf(name, "gre%%d");

	dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
	if (!dev)
		return NULL;

	if (strchr(name, '%')) {
		if (dev_alloc_name(dev, name) < 0)
			goto failed_free;
	}

	dev->init = ipgre_tunnel_init;
	nt = netdev_priv(dev);
	nt->parms = *parms;

	if (register_netdevice(dev) < 0)
		goto failed_free;

	dev_hold(dev);
	ipgre_tunnel_link(nt);
	return nt;

failed_free:
	free_netdev(dev);
	return NULL;
}
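/* Note: when no name is supplied, the "gre%d" template is expanded by
   dev_alloc_name() to the first free index. Since the fallback device
   registers as "gre0" at module load, the first user-created tunnel
   will normally come up as "gre1". */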
static void ipgre_tunnel_uninit(struct net_device *dev)
{
	ipgre_tunnel_unlink(netdev_priv(dev));
	dev_put(dev);
}
static void ipgre_err(struct sk_buff *skb, u32 info)
{
#ifndef I_WISH_WORLD_WERE_PERFECT

/* It is not :-( All the routers (except for Linux) return only
   8 bytes of packet payload. It means, that precise relaying of
   ICMP in the real Internet is absolutely infeasible.

   Moreover, Cisco "wise men" put the GRE key to the third word
   of the GRE header. It makes it impossible to maintain even soft
   state for keyed GRE tunnels with enabled checksums. Tell them
   "thank you".

   Well, I wonder, rfc1812 was written by a Cisco employee,
   why the hell do these idiots break standards established
   by themselves???
 */

	struct iphdr *iph = (struct iphdr*)skb->data;
	__be16	     *p = (__be16*)(skb->data+(iph->ihl<<2));
	int grehlen = (iph->ihl<<2) + 4;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct ip_tunnel *t;
	__be16 flags;

	flags = p[0];
	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
		if (flags&(GRE_VERSION|GRE_ROUTING))
			return;
		if (flags&GRE_KEY) {
			grehlen += 4;
			if (flags&GRE_CSUM)
				grehlen += 4;
		}
	}

	/* If only 8 bytes returned, keyed message will be dropped here */
	if (skb_headlen(skb) < grehlen)
		return;

	switch (type) {
	default:
	case ICMP_PARAMETERPROB:
		return;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return;
		case ICMP_FRAG_NEEDED:
			/* Soft state for pmtu is maintained by IP core. */
			return;
		default:
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH,
			   I believe they are just ether pollution. --ANK
			 */
			break;
		}
		break;
	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return;
		break;
	}

	read_lock(&ipgre_lock);
	t = ipgre_tunnel_lookup(iph->daddr, iph->saddr, (flags&GRE_KEY) ? *(((__be32*)p) + (grehlen>>2) - 1) : 0);
	if (t == NULL || t->parms.iph.daddr == 0 ||
	    ipv4_is_multicast(t->parms.iph.daddr))
		goto out;

	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
		goto out;

	if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
		t->err_count++;
	else
		t->err_count = 1;
	t->err_time = jiffies;
out:
	read_unlock(&ipgre_lock);
	return;
#else
	/* NB: this branch is compiled out and no longer builds as-is:
	   it predates the current err_handler signature ("dp" and "len"
	   are no longer passed in). Kept for reference. */
	struct iphdr *iph = (struct iphdr*)dp;
	struct iphdr *eiph;
	__be16	     *p = (__be16*)(dp+(iph->ihl<<2));
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	int rel_type = 0;
	int rel_code = 0;
	__be32 rel_info = 0;
	__u32 n = 0;
	__be16 flags;
	int grehlen = (iph->ihl<<2) + 4;
	struct sk_buff *skb2;
	struct flowi fl;
	struct rtable *rt;

	if (p[1] != htons(ETH_P_IP))
		return;

	flags = p[0];
	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
		if (flags&(GRE_VERSION|GRE_ROUTING))
			return;
		if (flags&GRE_CSUM)
			grehlen += 4;
		if (flags&GRE_KEY)
			grehlen += 4;
		if (flags&GRE_SEQ)
			grehlen += 4;
	}
	if (len < grehlen + sizeof(struct iphdr))
		return;
	eiph = (struct iphdr*)(dp + grehlen);

	switch (type) {
	default:
		return;
	case ICMP_PARAMETERPROB:
		n = ntohl(icmp_hdr(skb)->un.gateway) >> 24;
		if (n < (iph->ihl<<2))
			return;

		/* So... This guy found something strange INSIDE encapsulated
		   packet. Well, he is fool, but what can we do ?
		 */
		rel_type = ICMP_PARAMETERPROB;
		n -= grehlen;
		rel_info = htonl(n << 24);
		break;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return;
		case ICMP_FRAG_NEEDED:
			/* And it is the only really necessary thing :-) */
			n = ntohs(icmp_hdr(skb)->un.frag.mtu);
			if (n < grehlen+68)
				return;
			n -= grehlen;
			/* BSD 4.2 MORE DOES NOT EXIST IN NATURE. */
			if (n > ntohs(eiph->tot_len))
				return;
			rel_info = htonl(n);
			break;
		default:
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH,
			   I believe, it is just ether pollution. --ANK
			 */
			rel_type = ICMP_DEST_UNREACH;
			rel_code = ICMP_HOST_UNREACH;
			break;
		}
		break;
	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return;
		break;
	}

	/* Prepare fake skb to feed it to icmp_send */
	skb2 = skb_clone(skb, GFP_ATOMIC);
	if (skb2 == NULL)
		return;
	dst_release(skb2->dst);
	skb2->dst = NULL;
	skb_pull(skb2, skb->data - (u8*)eiph);
	skb_reset_network_header(skb2);

	/* Try to guess incoming interface */
	memset(&fl, 0, sizeof(fl));
	fl.fl4_dst = eiph->saddr;
	fl.fl4_tos = RT_TOS(eiph->tos);
	fl.proto = IPPROTO_GRE;
	if (ip_route_output_key(&init_net, &rt, &fl)) {
		kfree_skb(skb2);
		return;
	}
	skb2->dev = rt->u.dst.dev;

	/* route "incoming" packet */
	if (rt->rt_flags&RTCF_LOCAL) {
		ip_rt_put(rt);
		rt = NULL;
		fl.fl4_dst = eiph->daddr;
		fl.fl4_src = eiph->saddr;
		fl.fl4_tos = eiph->tos;
		if (ip_route_output_key(&init_net, &rt, &fl) ||
		    rt->u.dst.dev->type != ARPHRD_IPGRE) {
			ip_rt_put(rt);
			kfree_skb(skb2);
			return;
		}
	} else {
		ip_rt_put(rt);
		if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) ||
		    skb2->dst->dev->type != ARPHRD_IPGRE) {
			kfree_skb(skb2);
			return;
		}
	}

	/* change mtu on this route */
	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
		if (n > dst_mtu(skb2->dst)) {
			kfree_skb(skb2);
			return;
		}
		skb2->dst->ops->update_pmtu(skb2->dst, n);
	} else if (type == ICMP_TIME_EXCEEDED) {
		struct ip_tunnel *t = netdev_priv(skb2->dev);
		if (t->parms.iph.ttl) {
			rel_type = ICMP_DEST_UNREACH;
			rel_code = ICMP_HOST_UNREACH;
		}
	}

	icmp_send(skb2, rel_type, rel_code, rel_info);
	kfree_skb(skb2);
#endif
}
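/* The err_count/err_time state recorded above is consumed by
   ipgre_tunnel_xmit(): while a tunnel has recent errors (within
   IPTUNNEL_ERR_TIMEO), each transmit decrements err_count and
   propagates dst_link_failure() back to the sending route. */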
static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
{
	if (INET_ECN_is_ce(iph->tos)) {
		if (skb->protocol == htons(ETH_P_IP)) {
			IP_ECN_set_ce(ip_hdr(skb));
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			IP6_ECN_set_ce(ipv6_hdr(skb));
		}
	}
}

static inline u8
ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
{
	u8 inner = 0;
	if (skb->protocol == htons(ETH_P_IP))
		inner = old_iph->tos;
	else if (skb->protocol == htons(ETH_P_IPV6))
		inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
	return INET_ECN_encapsulate(tos, inner);
}
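/* Together these two helpers preserve ECN across the tunnel: on input,
   a CE mark on the outer header is copied onto the inner IPv4/IPv6
   header; on output, INET_ECN_encapsulate() merges the configured
   outer TOS with the inner DSCP/ECN bits. */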
static int ipgre_rcv(struct sk_buff *skb)
{
	struct iphdr *iph;
	u8     *h;
	__be16    flags;
	__sum16   csum = 0;
	__be32 key = 0;
	u32    seqno = 0;
	struct ip_tunnel *tunnel;
	int    offset = 4;

	if (!pskb_may_pull(skb, 16))
		goto drop_nolock;

	iph = ip_hdr(skb);
	h = skb->data;
	flags = *(__be16*)h;

	if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
		/* - Version must be 0.
		   - We do not support routing headers.
		 */
		if (flags&(GRE_VERSION|GRE_ROUTING))
			goto drop_nolock;

		if (flags&GRE_CSUM) {
			switch (skb->ip_summed) {
			case CHECKSUM_COMPLETE:
				csum = csum_fold(skb->csum);
				if (!csum)
					break;
				/* fall through */
			case CHECKSUM_NONE:
				skb->csum = 0;
				csum = __skb_checksum_complete(skb);
				skb->ip_summed = CHECKSUM_COMPLETE;
			}
			offset += 4;
		}
		if (flags&GRE_KEY) {
			key = *(__be32*)(h + offset);
			offset += 4;
		}
		if (flags&GRE_SEQ) {
			seqno = ntohl(*(__be32*)(h + offset));
			offset += 4;
		}
	}

	read_lock(&ipgre_lock);
	if ((tunnel = ipgre_tunnel_lookup(iph->saddr, iph->daddr, key)) != NULL) {
		secpath_reset(skb);

		skb->protocol = *(__be16*)(h + 2);
		/* WCCP version 1 and 2 protocol decoding.
		 * - Change protocol to IP
		 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
		 */
		if (flags == 0 &&
		    skb->protocol == htons(ETH_P_WCCP)) {
			skb->protocol = htons(ETH_P_IP);
			if ((*(h + offset) & 0xF0) != 0x40)
				offset += 4;
		}

		skb->mac_header = skb->network_header;
		__pskb_pull(skb, offset);
		skb_reset_network_header(skb);
		skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
		skb->pkt_type = PACKET_HOST;
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			/* Looped back packet, drop it! */
			if (((struct rtable*)skb->dst)->fl.iif == 0)
				goto drop;
			tunnel->stat.multicast++;
			skb->pkt_type = PACKET_BROADCAST;
		}
#endif

		if (((flags&GRE_CSUM) && csum) ||
		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
			tunnel->stat.rx_crc_errors++;
			tunnel->stat.rx_errors++;
			goto drop;
		}
		if (tunnel->parms.i_flags&GRE_SEQ) {
			if (!(flags&GRE_SEQ) ||
			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
				tunnel->stat.rx_fifo_errors++;
				tunnel->stat.rx_errors++;
				goto drop;
			}
			tunnel->i_seqno = seqno + 1;
		}
		tunnel->stat.rx_packets++;
		tunnel->stat.rx_bytes += skb->len;
		skb->dev = tunnel->dev;
		dst_release(skb->dst);
		skb->dst = NULL;
		nf_reset(skb);
		ipgre_ecn_decapsulate(iph, skb);
		netif_rx(skb);
		read_unlock(&ipgre_lock);
		return(0);
	}
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

drop:
	read_unlock(&ipgre_lock);
drop_nolock:
	kfree_skb(skb);
	return(0);
}
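/* On-the-wire GRE layout assumed by both the parser above and the
   builder in ipgre_tunnel_xmit() below (RFC 1701 field ordering):

	2 bytes flags | 2 bytes protocol
	[ 4 bytes checksum  - if GRE_CSUM ]
	[ 4 bytes key       - if GRE_KEY  ]
	[ 4 bytes sequence  - if GRE_SEQ  ]

   The transmit path fills the optional words tail-first, walking a
   pointer backwards from the end of the header. */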
static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net_device_stats *stats = &tunnel->stat;
	struct iphdr  *old_iph = ip_hdr(skb);
	struct iphdr  *tiph;
	u8     tos;
	__be16 df;
	struct rtable *rt;			/* Route to the other host */
	struct net_device *tdev;		/* Device to other host */
	struct iphdr  *iph;			/* Our new IP header */
	unsigned int max_headroom;		/* The extra header space needed */
	int    gre_hlen;
	__be32 dst;
	int    mtu;

	if (tunnel->recursion++) {
		tunnel->stat.collisions++;
		goto tx_error;
	}

	if (dev->header_ops) {
		gre_hlen = 0;
		tiph = (struct iphdr*)skb->data;
	} else {
		gre_hlen = tunnel->hlen;
		tiph = &tunnel->parms.iph;
	}

	if ((dst = tiph->daddr) == 0) {
		/* NBMA tunnel */

		if (skb->dst == NULL) {
			tunnel->stat.tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = (struct rtable*)skb->dst;
			if ((dst = rt->rt_gateway) == 0)
				goto tx_error_icmp;
		}
#ifdef CONFIG_IPV6
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			struct in6_addr *addr6;
			int addr_type;
			struct neighbour *neigh = skb->dst->neighbour;

			if (neigh == NULL)
				goto tx_error;

			addr6 = (struct in6_addr*)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				goto tx_error_icmp;

			dst = addr6->s6_addr32[3];
		}
#endif
		else
			goto tx_error;
	}

	tos = tiph->tos;
	if (tos&1) {
		if (skb->protocol == htons(ETH_P_IP))
			tos = old_iph->tos;
		tos &= ~1;
	}

	{
		struct flowi fl = { .oif = tunnel->parms.link,
				    .nl_u = { .ip4_u =
					      { .daddr = dst,
						.saddr = tiph->saddr,
						.tos = RT_TOS(tos) } },
				    .proto = IPPROTO_GRE };
		if (ip_route_output_key(&init_net, &rt, &fl)) {
			tunnel->stat.tx_carrier_errors++;
			goto tx_error;
		}
	}
	tdev = rt->u.dst.dev;

	if (tdev == dev) {
		ip_rt_put(rt);
		tunnel->stat.collisions++;
		goto tx_error;
	}

	df = tiph->frag_off;
	if (df)
		mtu = dst_mtu(&rt->u.dst) - tunnel->hlen;
	else
		mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu;

	if (skb->dst)
		skb->dst->ops->update_pmtu(skb->dst, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		df |= (old_iph->frag_off&htons(IP_DF));

		if ((old_iph->frag_off&htons(IP_DF)) &&
		    mtu < ntohs(old_iph->tot_len)) {
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#ifdef CONFIG_IPV6
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info*)skb->dst;

		if (rt6 && mtu < dst_mtu(skb->dst) && mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			     !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				skb->dst->metrics[RTAX_MTU-1] = mtu;
			}
		}

		if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#endif

	if (tunnel->err_count > 0) {
		if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;

	if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
		if (!new_skb) {
			ip_rt_put(rt);
			stats->tx_dropped++;
			dev_kfree_skb(skb);
			tunnel->recursion--;
			return 0;
		}
		if (skb->sk)
			skb_set_owner_w(new_skb, skb->sk);
		dev_kfree_skb(skb);
		skb = new_skb;
		old_iph = ip_hdr(skb);
	}

	skb->transport_header = skb->network_header;
	skb_push(skb, gre_hlen);
	skb_reset_network_header(skb);
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
			      IPSKB_REROUTED);
	dst_release(skb->dst);
	skb->dst = &rt->u.dst;

	/*
	 *	Push down and install the IPIP header.
	 */

	iph			= ip_hdr(skb);
	iph->version		= 4;
	iph->ihl		= sizeof(struct iphdr) >> 2;
	iph->frag_off		= df;
	iph->protocol		= IPPROTO_GRE;
	iph->tos		= ipgre_ecn_encapsulate(tos, old_iph, skb);
	iph->daddr		= rt->rt_dst;
	iph->saddr		= rt->rt_src;

	if ((iph->ttl = tiph->ttl) == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			iph->ttl = old_iph->ttl;
#ifdef CONFIG_IPV6
		else if (skb->protocol == htons(ETH_P_IPV6))
			iph->ttl = ((struct ipv6hdr*)old_iph)->hop_limit;
#endif
		else
			iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
	}

	((__be16*)(iph+1))[0] = tunnel->parms.o_flags;
	((__be16*)(iph+1))[1] = skb->protocol;

	if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
		__be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);

		if (tunnel->parms.o_flags&GRE_SEQ) {
			++tunnel->o_seqno;
			*ptr = htonl(tunnel->o_seqno);
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_KEY) {
			*ptr = tunnel->parms.o_key;
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_CSUM) {
			*ptr = 0;
			*(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
		}
	}

	nf_reset(skb);

	IPTUNNEL_XMIT();
	tunnel->recursion--;
	return 0;

tx_error_icmp:
	dst_link_failure(skb);

tx_error:
	stats->tx_errors++;
	dev_kfree_skb(skb);
	tunnel->recursion--;
	return 0;
}
static void ipgre_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel;
	struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int addend = sizeof(struct iphdr) + 4;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and hard_header_len */

	if (iph->daddr) {
		struct flowi fl = { .oif = tunnel->parms.link,
				    .nl_u = { .ip4_u =
					      { .daddr = iph->daddr,
						.saddr = iph->saddr,
						.tos = RT_TOS(iph->tos) } },
				    .proto = IPPROTO_GRE };
		struct rtable *rt;
		if (!ip_route_output_key(&init_net, &rt, &fl)) {
			tdev = rt->u.dst.dev;
			ip_rt_put(rt);
		}
		dev->flags |= IFF_POINTOPOINT;
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(&init_net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	/* Precalculate GRE options length */
	if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
		if (tunnel->parms.o_flags&GRE_CSUM)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_KEY)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_SEQ)
			addend += 4;
	}
	dev->hard_header_len = hlen + addend;
	dev->mtu = mtu - addend;
	tunnel->hlen = addend;
}
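/* Example of the precalculation above (a hypothetical tunnel with
   checksums, a key and sequence numbers over an Ethernet device,
   mtu 1500): addend = 20 (outer IP) + 4 (GRE base) + 4 + 4 + 4 = 36,
   so dev->mtu = 1500 - 36 = 1464 and tunnel->hlen = 36. */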
static int
ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
{
	int err = 0;
	struct ip_tunnel_parm p;
	struct ip_tunnel *t;

	switch (cmd) {
	case SIOCGETTUNNEL:
		t = NULL;
		if (dev == ipgre_fb_tunnel_dev) {
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
				err = -EFAULT;
				break;
			}
			t = ipgre_tunnel_locate(&p, 0);
		}
		if (t == NULL)
			t = netdev_priv(dev);
		memcpy(&p, &t->parms, sizeof(p));
		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
			err = -EFAULT;
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		err = -EFAULT;
		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
			goto done;

		err = -EINVAL;
		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
			goto done;
		if (p.iph.ttl)
			p.iph.frag_off |= htons(IP_DF);

		if (!(p.i_flags&GRE_KEY))
			p.i_key = 0;
		if (!(p.o_flags&GRE_KEY))
			p.o_key = 0;

		t = ipgre_tunnel_locate(&p, cmd == SIOCADDTUNNEL);

		if (dev != ipgre_fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned nflags=0;

				t = netdev_priv(dev);

				if (ipv4_is_multicast(p.iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p.iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}
				ipgre_tunnel_unlink(t);
				t->parms.iph.saddr = p.iph.saddr;
				t->parms.iph.daddr = p.iph.daddr;
				t->parms.i_key = p.i_key;
				t->parms.o_key = p.o_key;
				memcpy(dev->dev_addr, &p.iph.saddr, 4);
				memcpy(dev->broadcast, &p.iph.daddr, 4);
				ipgre_tunnel_link(t);
				netdev_state_change(dev);
			}
		}

		if (t) {
			err = 0;
			if (cmd == SIOCCHGTUNNEL) {
				t->parms.iph.ttl = p.iph.ttl;
				t->parms.iph.tos = p.iph.tos;
				t->parms.iph.frag_off = p.iph.frag_off;
				if (t->parms.link != p.link) {
					t->parms.link = p.link;
					ipgre_tunnel_bind_dev(dev);
					netdev_state_change(dev);
				}
			}
			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
				err = -EFAULT;
		} else
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		if (dev == ipgre_fb_tunnel_dev) {
			err = -EFAULT;
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
				goto done;
			err = -ENOENT;
			if ((t = ipgre_tunnel_locate(&p, 0)) == NULL)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(ipgre_fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
static struct net_device_stats *ipgre_tunnel_get_stats(struct net_device *dev)
{
	return &(((struct ip_tunnel*)netdev_priv(dev))->stat);
}
static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	if (new_mtu < 68 || new_mtu > 0xFFF8 - tunnel->hlen)
		return -EINVAL;
	dev->mtu = new_mtu;
	return 0;
}
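/* 68 is the minimum IPv4 MTU required by RFC 791; the upper bound keeps
   the encapsulated datagram (payload plus tunnel->hlen of overhead)
   within the 16-bit IP total-length field. */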
/* Nice toy. Unfortunately, useless in real life :-)
   It allows one to construct a virtual multiprotocol broadcast "LAN"
   over the Internet, provided multicast routing is tuned.


   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
   I have an impression that Cisco could make something similar,
   but this feature is apparently missing in IOS<=11.2(8).

   I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
   with broadcast 224.66.66.66. If you have access to mbone, play with me :-)

   ping -t 255 224.66.66.66

   If nobody answers, mbone does not work.

   ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
   ip addr add 10.66.66.<somewhat>/24 dev Universe
   ifconfig Universe up
   ifconfig Universe add fe80::<Your_real_addr>/10
   ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
   ftp 10.66.66.66
   ...
   ftp fec0:6666:6666::193.233.7.65
   ...
 */
static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
			unsigned short type,
			const void *daddr, const void *saddr, unsigned len)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
	__be16 *p = (__be16*)(iph+1);

	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
	p[0]		= t->parms.o_flags;
	p[1]		= htons(type);

	/*
	 *	Set the source hardware address.
	 */

	if (saddr)
		memcpy(&iph->saddr, saddr, 4);

	if (daddr) {
		memcpy(&iph->daddr, daddr, 4);
		return t->hlen;
	}
	if (iph->daddr && !ipv4_is_multicast(iph->daddr))
		return t->hlen;

	return -t->hlen;
}
static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
{
	struct iphdr *iph = (struct iphdr*) skb_mac_header(skb);
	memcpy(haddr, &iph->saddr, 4);
	return 4;
}
static const struct header_ops ipgre_header_ops = {
	.create	= ipgre_header,
	.parse	= ipgre_header_parse,
};
#ifdef CONFIG_NET_IPGRE_BROADCAST
static int ipgre_open(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr)) {
		struct flowi fl = { .oif = t->parms.link,
				    .nl_u = { .ip4_u =
					      { .daddr = t->parms.iph.daddr,
						.saddr = t->parms.iph.saddr,
						.tos = RT_TOS(t->parms.iph.tos) } },
				    .proto = IPPROTO_GRE };
		struct rtable *rt;
		if (ip_route_output_key(&init_net, &rt, &fl))
			return -EADDRNOTAVAIL;
		dev = rt->u.dst.dev;
		ip_rt_put(rt);
		if (__in_dev_get_rtnl(dev) == NULL)
			return -EADDRNOTAVAIL;
		t->mlink = dev->ifindex;
		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
	}
	return 0;
}

static int ipgre_close(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);
	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
		struct in_device *in_dev;
		in_dev = inetdev_by_index(dev->nd_net, t->mlink);
		if (in_dev) {
			ip_mc_dec_group(in_dev, t->parms.iph.daddr);
			in_dev_put(in_dev);
		}
	}
	return 0;
}

#endif
static void ipgre_tunnel_setup(struct net_device *dev)
{
	dev->uninit		= ipgre_tunnel_uninit;
	dev->destructor		= free_netdev;
	dev->hard_start_xmit	= ipgre_tunnel_xmit;
	dev->get_stats		= ipgre_tunnel_get_stats;
	dev->do_ioctl		= ipgre_tunnel_ioctl;
	dev->change_mtu		= ipgre_tunnel_change_mtu;

	dev->type		= ARPHRD_IPGRE;
	dev->hard_header_len	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 4;
	dev->flags		= IFF_NOARP;
	dev->iflink		= 0;
	dev->addr_len		= 4;
}
static int ipgre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;
	struct iphdr *iph;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);

	ipgre_tunnel_bind_dev(dev);

	if (iph->daddr) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			if (!iph->saddr)
				return -EINVAL;
			dev->flags = IFF_BROADCAST;
			dev->header_ops = &ipgre_header_ops;
			dev->open = ipgre_open;
			dev->stop = ipgre_close;
		}
#endif
	} else
		dev->header_ops = &ipgre_header_ops;

	return 0;
}
static int __init ipgre_fb_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	iph->version		= 4;
	iph->protocol		= IPPROTO_GRE;
	iph->ihl		= 5;
	tunnel->hlen		= sizeof(struct iphdr) + 4;

	dev_hold(dev);
	tunnels_wc[0]		= tunnel;
	return 0;
}
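/* The fallback device is keyless and wildcard on both addresses, so it
   belongs in tunnels_wc[0] (HASH(0) == 0). Being checked last in
   ipgre_tunnel_lookup(), it catches any GRE packet that no explicitly
   configured tunnel claimed. */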
static struct net_protocol ipgre_protocol = {
	.handler	= ipgre_rcv,
	.err_handler	= ipgre_err,
};
/*
 *	And now the modules code and kernel interface.
 */
static int __init ipgre_init(void)
{
	int err;

	printk(KERN_INFO "GRE over IPv4 tunneling driver\n");

	if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
		printk(KERN_INFO "ipgre init: can't add protocol\n");
		return -EAGAIN;
	}

	ipgre_fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
					   ipgre_tunnel_setup);
	if (!ipgre_fb_tunnel_dev) {
		err = -ENOMEM;
		goto err1;
	}

	ipgre_fb_tunnel_dev->init = ipgre_fb_tunnel_init;

	if ((err = register_netdev(ipgre_fb_tunnel_dev)))
		goto err2;
out:
	return err;

err2:
	free_netdev(ipgre_fb_tunnel_dev);
err1:
	inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
	goto out;
}
static void __exit ipgre_destroy_tunnels(void)
{
	int prio;

	for (prio = 0; prio < 4; prio++) {
		int h;
		for (h = 0; h < HASH_SIZE; h++) {
			struct ip_tunnel *t;
			while ((t = tunnels[prio][h]) != NULL)
				unregister_netdevice(t->dev);
		}
	}
}
static void __exit ipgre_fini(void)
{
	if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
		printk(KERN_INFO "ipgre close: can't remove protocol\n");

	rtnl_lock();
	ipgre_destroy_tunnels();
	rtnl_unlock();
}
module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");