/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Version:	$Id: route.c,v 1.72 1999/08/30 10:17:12 davem Exp $
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window clamping.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD;
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree-based routing,
 *					routing caches and better behaviour.
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
#include <linux/config.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/ip_fib.h>
#include <linux/sysctl.h>
#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT	(300*HZ)

int ip_rt_min_delay		= 2*HZ;
int ip_rt_max_delay		= 10*HZ;
int ip_rt_gc_thresh		= RT_HASH_DIVISOR;
int ip_rt_max_size		= RT_HASH_DIVISOR*16;
int ip_rt_gc_timeout		= RT_GC_TIMEOUT;
int ip_rt_gc_interval		= 60*HZ;
int ip_rt_gc_min_interval	= 5*HZ;
int ip_rt_redirect_number	= 9;
int ip_rt_redirect_load		= HZ/50;
int ip_rt_redirect_silence	= ((HZ/50) << (9+1));
int ip_rt_error_cost		= HZ;
int ip_rt_error_burst		= 5*HZ;
int ip_rt_gc_elasticity		= 8;
int ip_rt_mtu_expires		= 10*60*HZ;
int ip_rt_min_pmtu		= 512+20+20;
int ip_rt_min_advmss		= 536;

static unsigned long rt_deadline = 0;

#define RTprint(a...)	printk(KERN_DEBUG a)
static void rt_run_flush(unsigned long dummy);

static struct timer_list rt_flush_timer =
	{ NULL, NULL, 0, 0L, rt_run_flush };
static struct timer_list rt_periodic_timer =
	{ NULL, NULL, 0, 0L, NULL };
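/* Note (added for clarity; assumes the 2.2/2.3-era layout of struct
 * timer_list): the positional initializers above fill the fields
 * next, prev, expires, data and function in that order, so both timers
 * start unlinked; rt_run_flush is the flush handler, while
 * rt_periodic_timer gets its handler (rt_check_expire) assigned later
 * in ip_rt_init().
 */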
/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32);
static struct dst_entry *ipv4_dst_reroute(struct dst_entry *dst,
					  struct sk_buff *skb);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *);
static void		 ipv4_link_failure(struct sk_buff *skb);
static int rt_garbage_collect(void);

struct dst_ops ipv4_dst_ops =
{
	AF_INET,
	__constant_htons(ETH_P_IP),
	/* ... */
	ipv4_negative_advice,
	/* ... */
	sizeof(struct rtable),
};

__u8 ip_tos2prio[16] = {
	/* ... */
	TC_PRIO_INTERACTIVE_BULK,
	/* ... */
	TC_PRIO_INTERACTIVE_BULK,
	/* ... */
};
/* The locking scheme is rather straightforward:
 *
 * 1) A BH protected rwlock protects the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */

static struct rtable	*rt_hash_table[RT_HASH_DIVISOR];
static rwlock_t		 rt_hash_lock = RW_LOCK_UNLOCKED;

static int rt_intern_hash(unsigned hash, struct rtable *rth, struct rtable **res);
static __inline__ unsigned rt_hash_code(u32 daddr, u32 saddr, u8 tos)
{
	unsigned hash = ((daddr&0xF0F0F0F0)>>4)|((daddr&0x0F0F0F0F)<<4);
	hash = hash^saddr^tos;
	hash = hash^(hash>>16);
	return (hash^(hash>>8)) & 0xFF;
}
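/* Worked example (added for illustration; the addresses are arbitrary,
 * not from the original source): daddr = 0xC0A80001 (192.168.0.1),
 * saddr = 0xC0A80002, tos = 0x10.  Swapping the nibbles of each daddr
 * byte gives 0x0C8A0010; XORing in saddr and tos gives 0xCC220002;
 * folding the upper half down gives 0xCC22CC20, and folding once more
 * by 8 bits leaves 0xEC, so this flow hashes to chain 0xEC (236) of
 * the 256 possible chains.
 */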
#ifdef CONFIG_PROC_FS

static int rt_cache_get_info(char *buffer, char **start, off_t offset,
			     int length, int dummy)
{
	sprintf(buffer, "%-127s\n",
		"Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\tMetric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\tHHUptod\tSpecDst");

	read_lock_bh(&rt_hash_lock);

	for (i = 0; i < RT_HASH_DIVISOR; i++) {
		for (r = rt_hash_table[i]; r; r = r->u.rt_next) {
			/*
			 *	Spin through entries until we are ready
			 */
			sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
				r->u.dst.dev ? r->u.dst.dev->name : "*",
				(unsigned long)r->rt_dst,
				(unsigned long)r->rt_gateway,
				/* ... */
				atomic_read(&r->u.dst.__refcnt),
				/* ... */
				(unsigned long)r->rt_src, (int)r->u.dst.advmss + 40,
				/* ... */
				(int)((r->u.dst.rtt >> 3) + r->u.dst.rttvar),
				/* ... */
				r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
				r->u.dst.hh ? (r->u.dst.hh->hh_output == dev_queue_xmit) : 0,
				/* ... remaining fields elided ... */);
			sprintf(buffer + len, "%-127s\n", temp);
			if (pos >= offset + length)
				goto done;
		}
	}

done:
	read_unlock_bh(&rt_hash_lock);

	*start = buffer + len - (pos - offset);
static __inline__ void rt_free(struct rtable *rt)
{
	dst_free(&rt->u.dst);
}

static __inline__ void rt_drop(struct rtable *rt)
{
	dst_free(&rt->u.dst);
}
static __inline__ int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return ((rth->rt_flags&(RTCF_BROADCAST|RTCF_MULTICAST))
		&& rth->key.iif && rth->u.rt_next);
}
static __inline__ int rt_valuable(struct rtable *rth)
{
	return ((rth->rt_flags&(RTCF_REDIRECTED|RTCF_NOTIFY))
		|| rth->u.dst.expires);
}
static __inline__ int rt_may_expire(struct rtable *rth, int tmo1, int tmo2)
{
	int age;

	if (atomic_read(&rth->u.dst.__refcnt))
		return 0;

	if (rth->u.dst.expires && (long)(rth->u.dst.expires - jiffies) <= 0)
		return 1;

	age = jiffies - rth->u.dst.lastuse;
	if (age <= tmo1 && !rt_fast_clean(rth))
		return 0;
	if (age <= tmo2 && rt_valuable(rth))
		return 0;
	return 1;
}
/* This runs via a timer and thus is always in BH context. */
static void rt_check_expire(unsigned long dummy)
{
	int i;
	static int rover;
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;

	for (i = 0; i < RT_HASH_DIVISOR/5; i++) {
		unsigned tmo = ip_rt_gc_timeout;

		rover = (rover + 1) & (RT_HASH_DIVISOR-1);
		rthp = &rt_hash_table[rover];

		write_lock(&rt_hash_lock);
		while ((rth = *rthp) != NULL) {
			if (rth->u.dst.expires) {
				/* Entry is expired even if it is in use */
				if ((long)(now - rth->u.dst.expires) <= 0) {
					tmo >>= 1;
					rthp = &rth->u.rt_next;
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
				tmo >>= 1;
				rthp = &rth->u.rt_next;
				continue;
			}

			/*
			 * Cleanup aged off entries.
			 */
			*rthp = rth->u.rt_next;
			rt_free(rth);
		}
		write_unlock(&rt_hash_lock);

		/* Fallback loop breaker. */
		if ((jiffies - now) > 0)
			break;
	}
	rt_periodic_timer.expires = now + ip_rt_gc_interval;
	add_timer(&rt_periodic_timer);
}
/* This can run from both BH and non-BH contexts, the latter
 * in the case of a forced flush event.
 */
static void rt_run_flush(unsigned long dummy)
{
	int i;
	struct rtable *rth, *next;

	rt_deadline = 0;

	for (i = 0; i < RT_HASH_DIVISOR; i++) {
		write_lock_bh(&rt_hash_lock);
		rth = rt_hash_table[i];
		rt_hash_table[i] = NULL;
		write_unlock_bh(&rt_hash_lock);

		for (; rth; rth = next) {
			next = rth->u.rt_next;
			rt_free(rth);
		}
	}
}
static spinlock_t rt_flush_lock = SPIN_LOCK_UNLOCKED;

void rt_cache_flush(int delay)
{
	unsigned long now = jiffies;
	int user_mode = !in_interrupt();

	if (delay < 0)
		delay = ip_rt_min_delay;

	spin_lock_bh(&rt_flush_lock);

	if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
		long tmo = (long)(rt_deadline - now);

		/* If flush timer is already running
		   and flush request is not immediate (delay > 0):

		   if deadline is not achieved, prolong timer to "delay",
		   otherwise fire it at deadline time.
		 */

		if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
			tmo = 0;

		if (delay > tmo)
			delay = tmo;
	}

	if (delay <= 0) {
		spin_unlock_bh(&rt_flush_lock);
		rt_run_flush(0);
		return;
	}

	if (rt_deadline == 0)
		rt_deadline = now + ip_rt_max_delay;

	rt_flush_timer.expires = now + delay;
	add_timer(&rt_flush_timer);
	spin_unlock_bh(&rt_flush_lock);
}
/*
   Short description of GC goals.

   We want to build an algorithm which keeps the routing cache at some
   equilibrium point, where the number of aged-off entries is kept
   approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that when the network is idle
   expire is large enough to keep enough warm entries, and when load
   increases it shrinks to limit the cache size.
 */
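/* Illustration (added; assumes HZ=100 and the default tunables above):
 * "expire" starts at RT_GC_TIMEOUT = 300*HZ = 30000 jiffies.  Every pass
 * of the loop below that still misses its goal halves it
 * (30000 -> 15000 -> 7500 -> ...), so a loaded cache is pruned ever more
 * aggressively; once the cache is back below ip_rt_max_size, expire is
 * grown again by ip_rt_gc_min_interval (5*HZ) per call, capped at
 * ip_rt_gc_timeout.
 */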
static int rt_garbage_collect(void)
{
	static unsigned expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;
	int goal;

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */
	if (now - last_gc < ip_rt_gc_min_interval &&
	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
		return 0;

	/* Calculate number of entries, which we want to expire now. */
	goal = atomic_read(&ipv4_dst_ops.entries) - RT_HASH_DIVISOR*ip_rt_gc_elasticity;
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		if (goal > 0) {
			equilibrium += min(goal/2, RT_HASH_DIVISOR);
			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		}
	} else {
		/* We are in dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max(goal/2, RT_HASH_DIVISOR);
		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	do {
		int i, k;

		/* The write lock is held during the entire hash
		 * traversal to ensure consistent state of the rover.
		 */
		write_lock_bh(&rt_hash_lock);
		for (i = 0, k = rover; i < RT_HASH_DIVISOR; i++) {
			unsigned tmo = expire;

			k = (k + 1) & (RT_HASH_DIVISOR-1);
			rthp = &rt_hash_table[k];
			while ((rth = *rthp) != NULL) {
				if (!rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->u.rt_next;
					continue;
				}
				*rthp = rth->u.rt_next;
				rt_free(rth);
			}
		}
		write_unlock_bh(&rt_hash_lock);

		/* Goal is not achieved. We stop the process if:

		   - expire has been reduced to zero; otherwise expire is halved.
		   - the table is not full.
		   - we are called from interrupt.
		   - the jiffies check is just a fallback/debug loop breaker;
		     we will not spin here for a long time in any case.
		 */

#if RT_CACHE_DEBUG >= 2
		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
		       atomic_read(&ipv4_dst_ops.entries), goal, i);
#endif

		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
			goto work_done;
	} while (!in_interrupt() && jiffies - now < 1);

	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
		goto work_done;
	printk("dst cache overflow\n");
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
	       atomic_read(&ipv4_dst_ops.entries), goal, rover);
#endif
	return 0;
}
static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
{
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;
	int attempts = !in_interrupt();

restart:
	rthp = &rt_hash_table[hash];

	write_lock_bh(&rt_hash_lock);
	while ((rth = *rthp) != NULL) {
		if (memcmp(&rth->key, &rt->key, sizeof(rt->key)) == 0) {
			/* Put it first */
			*rthp = rth->u.rt_next;
			rth->u.rt_next = rt_hash_table[hash];
			rt_hash_table[hash] = rth;

			dst_hold(&rth->u.dst);
			rth->u.dst.lastuse = now;
			write_unlock_bh(&rt_hash_lock);

			rt_drop(rt);
			*rp = rth;
			return 0;
		}

		rthp = &rth->u.rt_next;
	}

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt->key.iif == 0) {
		if (!arp_bind_neighbour(&rt->u.dst)) {
			write_unlock_bh(&rt_hash_lock);

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink route cache,
			   it is most likely it holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity = 1;
				ip_rt_gc_min_interval = 0;
				rt_garbage_collect();
				ip_rt_gc_min_interval = saved_int;
				ip_rt_gc_elasticity = saved_elasticity;
				goto restart;
			}

			if (net_ratelimit()) {
				if ((rt->u.dst.dev->flags&IFF_UP) &&
				    __in_dev_get(rt->u.dst.dev))
					printk("Neighbour table overflow.\n");
				else
					printk("Device %s is down.\n", rt->u.dst.dev->name);
			}
			rt_drop(rt);
			return -ENOBUFS;
		}
	}

	rt->u.rt_next = rt_hash_table[hash];
#if RT_CACHE_DEBUG >= 2
	if (rt->u.rt_next) {
		struct rtable *trt;
		printk("rt_cache @%02x: %08x", hash, rt->rt_dst);
		for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
			printk(" . %08x", trt->rt_dst);
		printk("\n");
	}
#endif
	rt_hash_table[hash] = rt;
	write_unlock_bh(&rt_hash_lock);
	*rp = rt;
	return 0;
}
static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable **rthp;

	write_lock_bh(&rt_hash_lock);
	for (rthp = &rt_hash_table[hash]; *rthp; rthp = &(*rthp)->u.rt_next) {
		if (*rthp == rt) {
			*rthp = rt->u.rt_next;
			rt_free(rt);
			break;
		}
	}
	write_unlock_bh(&rt_hash_lock);
}
void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
		    u32 saddr, u8 tos, struct net_device *dev)
{
	int i, k;
	struct in_device *in_dev = in_dev_get(dev);
	struct rtable *rth, **rthp;
	u32 skeys[2] = { saddr, 0 };
	int ikeys[2] = { dev->ifindex, 0 };

	tos &= IPTOS_TOS_MASK;

	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
	    || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (i = 0; i < 2; i++) {
		for (k = 0; k < 2; k++) {
			unsigned hash = rt_hash_code(daddr, skeys[i]^(ikeys[k]<<5), tos);

			rthp = &rt_hash_table[hash];

			read_lock(&rt_hash_lock);
			while ((rth = *rthp) != NULL) {
				struct rtable *rt;

				if (rth->key.dst != daddr ||
				    rth->key.src != skeys[i] ||
				    rth->key.tos != tos ||
				    rth->key.oif != ikeys[k] ||
				    rth->key.iif != 0) {
					rthp = &rth->u.rt_next;
					continue;
				}

				if (rth->rt_dst != daddr ||
				    rth->rt_src != saddr ||
				    rth->rt_gateway != old_gw ||
				    rth->u.dst.dev != dev)
					break;

				dst_clone(&rth->u.dst);
				read_unlock(&rt_hash_lock);

				rt = dst_alloc(&ipv4_dst_ops);
				if (rt == NULL) {
					ip_rt_put(rth);
					goto do_next;
				}

				/*
				 * Copy all the information.
				 */
				*rt = *rth;
				atomic_set(&rt->u.dst.__refcnt, 1);
				dev_hold(rt->u.dst.dev);
				rt->u.dst.lastuse = jiffies;
				rt->u.dst.neighbour = NULL;
				rt->u.dst.obsolete = 0;

				rt->rt_flags |= RTCF_REDIRECTED;

				/* Gateway is different ... */
				rt->rt_gateway = new_gw;

				/* Redirect received -> path was valid */
				dst_confirm(&rth->u.dst);

				if (!arp_bind_neighbour(&rt->u.dst) ||
				    !(rt->u.dst.neighbour->nud_state&NUD_VALID)) {
					if (rt->u.dst.neighbour)
						neigh_event_send(rt->u.dst.neighbour, NULL);
					ip_rt_put(rth);
					rt_drop(rt);
					goto do_next;
				}

				rt_del(hash, rth);
				if (!rt_intern_hash(hash, rt, &rt))
					ip_rt_put(rt);
				goto do_next;
			}
			read_unlock(&rt_hash_lock);
		do_next:
			;
		}
	}
	in_dev_put(in_dev);
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %lX/%s to %lX ignored. "
		       "Path = %lX -> %lX, tos %02x\n",
		       ntohl(old_gw), dev->name, ntohl(new_gw),
		       ntohl(saddr), ntohl(daddr), tos);
#endif
	in_dev_put(in_dev);
}
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;

	if ((rt->rt_flags&RTCF_REDIRECTED) || rt->u.dst.expires) {
		unsigned hash = rt_hash_code(rt->key.dst, rt->key.src^(rt->key.oif<<5),
					     rt->key.tos);
#if RT_CACHE_DEBUG >= 1
		printk(KERN_DEBUG "ip_rt_advice: redirect to %d.%d.%d.%d/%02x dropped\n",
		       NIPQUAD(rt->rt_dst), rt->key.tos);
#endif
		rt_del(hash, rt);
		return NULL;
	}
	return dst;
}
/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
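/* Worked example (added; assumes HZ=100 and the default tunables above):
 * ip_rt_redirect_load = HZ/50 = 2 jiffies, so after the k-th redirect has
 * been sent the next one is held back until 2<<k jiffies have elapsed
 * (2, 4, 8, ... up to 512 jiffies before the 9th).  Once
 * ip_rt_redirect_number (9) redirects have gone unanswered we stop
 * completely, and only resume after ip_rt_redirect_silence =
 * (HZ/50)<<10 = 2048 jiffies (about 20 seconds) pass without the
 * triggering traffic.
 */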
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = (struct rtable *)skb->dst;
	struct in_device *in_dev = in_dev_get(rt->u.dst.dev);

	if (!in_dev)
		return;

	if (!IN_DEV_TX_REDIRECTS(in_dev))
		goto out;

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (jiffies - rt->u.dst.rate_last > ip_rt_redirect_silence)
		rt->u.dst.rate_tokens = 0;

	/* Too many ignored redirects; do not send anything,
	 * set u.dst.rate_last to the last seen redirected packet.
	 */
	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
		rt->u.dst.rate_last = jiffies;
		goto out;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (jiffies - rt->u.dst.rate_last > (ip_rt_redirect_load<<rt->u.dst.rate_tokens)) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		rt->u.dst.rate_last = jiffies;
		++rt->u.dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (IN_DEV_LOG_MARTIANS(in_dev) &&
		    rt->u.dst.rate_tokens == ip_rt_redirect_number && net_ratelimit())
			printk(KERN_WARNING "host %08x/if%d ignores redirects for %08x to %08x.\n",
			       rt->rt_src, rt->rt_iif, rt->rt_dst, rt->rt_gateway);
#endif
	}
out:
	in_dev_put(in_dev);
}
static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = (struct rtable *)skb->dst;
	unsigned long now;
	int code;

	switch (rt->u.dst.error) {
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	now = jiffies;
	if ((rt->u.dst.rate_tokens += (now - rt->u.dst.rate_last)) > ip_rt_error_burst)
		rt->u.dst.rate_tokens = ip_rt_error_burst;
	rt->u.dst.rate_last = now;
	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
		rt->u.dst.rate_tokens -= ip_rt_error_cost;
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
	}
/*
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
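/* Example of the plateau search below (added for illustration): a router
 * that reports "fragmentation needed" without a next-hop MTU (old BSD
 * behaviour) only leaves us the length of the datagram that failed.  For
 * a 1500-byte datagram guess_mtu() returns the first plateau strictly
 * below it, 1492; a later failure at 1492 steps down to 576, and so on.
 */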
static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
	int i;

	for (i = 0; i < sizeof(mtu_plateau)/sizeof(mtu_plateau[0]); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}
unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
{
	int i;
	unsigned short old_mtu = ntohs(iph->tot_len);
	struct rtable *rth;
	u32 skeys[2] = { iph->saddr, 0, };
	u32 daddr = iph->daddr;
	u8 tos = iph->tos & IPTOS_TOS_MASK;
	unsigned short est_mtu = 0;

	if (ipv4_config.no_pmtu_disc)
		return 0;

	for (i = 0; i < 2; i++) {
		unsigned hash = rt_hash_code(daddr, skeys[i], tos);

		read_lock(&rt_hash_lock);
		for (rth = rt_hash_table[hash]; rth; rth = rth->u.rt_next) {
			if (rth->key.dst == daddr &&
			    rth->key.src == skeys[i] &&
			    rth->rt_dst == daddr &&
			    rth->rt_src == iph->saddr &&
			    rth->key.tos == tos &&
			    rth->key.iif == 0 &&
			    !(rth->u.dst.mxlock&(1<<RTAX_MTU))) {
				unsigned short mtu = new_mtu;

				if (new_mtu < 68 || new_mtu >= old_mtu) {
					/* BSD 4.2 compatibility hack :-( */
					if (mtu == 0 && old_mtu >= rth->u.dst.pmtu &&
					    old_mtu >= 68 + (iph->ihl<<2))
						old_mtu -= iph->ihl<<2;

					mtu = guess_mtu(old_mtu);
				}
				if (mtu <= rth->u.dst.pmtu) {
					if (mtu < rth->u.dst.pmtu) {
						dst_confirm(&rth->u.dst);
						if (mtu < ip_rt_min_pmtu) {
							mtu = ip_rt_min_pmtu;
							rth->u.dst.mxlock |= (1<<RTAX_MTU);
						}
						rth->u.dst.pmtu = mtu;
						dst_set_expires(&rth->u.dst, ip_rt_mtu_expires);
					}
					est_mtu = mtu;
				}
			}
		}
		read_unlock(&rt_hash_lock);
	}
	return est_mtu ? : new_mtu;
}
void ip_rt_update_pmtu(struct dst_entry *dst, unsigned mtu)
{
	if (dst->pmtu > mtu && mtu >= 68 &&
	    !(dst->mxlock&(1<<RTAX_MTU))) {
		if (mtu < ip_rt_min_pmtu) {
			mtu = ip_rt_min_pmtu;
			dst->mxlock |= (1<<RTAX_MTU);
		}
		dst->pmtu = mtu;
		dst_set_expires(dst, ip_rt_mtu_expires);
	}
}
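/* Example (added; the values follow from the defaults above): a route whose
 * path MTU is 1500 receives an ICMP "fragmentation needed" advertising
 * MTU 300.  Since 300 is below ip_rt_min_pmtu (512+20+20 = 552), the MTU
 * is clamped to 552 and RTAX_MTU is locked so that later ICMP cannot push
 * it lower; the learned value then expires after ip_rt_mtu_expires
 * (10 minutes) as usual.
 */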
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}

static struct dst_entry *ipv4_dst_reroute(struct dst_entry *dst,
					  struct sk_buff *skb)
{
	return NULL;
}

static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = (struct rtable *) skb->dst;
	if (rt)
		dst_set_expires(&rt->u.dst, 0);
}
static int ip_rt_bug(struct sk_buff *skb)
{
	printk(KERN_DEBUG "ip_rt_bug: %08x -> %08x, %s\n", skb->nh.iph->saddr,
	       skb->nh.iph->daddr, skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	return 0;
}
/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be not aligned.
 */

void ip_rt_get_source(u8 *addr, struct rtable *rt)
{
	u32 src;
	struct fib_result res;

	if (rt->key.iif == 0)
		src = rt->rt_src;
	else if (fib_lookup(&rt->key, &res) == 0) {
#ifdef CONFIG_IP_ROUTE_NAT
		if (res.type == RTN_NAT)
			src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway, RT_SCOPE_UNIVERSE);
		else
#endif
			src = FIB_RES_PREFSRC(res);
	} else
		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway, RT_SCOPE_UNIVERSE);
	memcpy(addr, &src, 4);
}
#ifdef CONFIG_NET_CLS_ROUTE
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->u.dst.tclassid&0xFFFF))
		rt->u.dst.tclassid |= tag&0xFFFF;
	if (!(rt->u.dst.tclassid&0xFFFF0000))
		rt->u.dst.tclassid |= tag&0xFFFF0000;
}
#endif
static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
{
	struct fib_info *fi = res->fi;

	if (fi) {
		if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		memcpy(&rt->u.dst.mxlock, fi->fib_metrics, sizeof(fi->fib_metrics));
		if (fi->fib_mtu == 0) {
			rt->u.dst.pmtu = rt->u.dst.dev->mtu;
			if (rt->u.dst.pmtu > IP_MAX_MTU)
				rt->u.dst.pmtu = IP_MAX_MTU;
			if (rt->u.dst.mxlock&(1<<RTAX_MTU) &&
			    rt->rt_gateway != rt->rt_dst &&
			    rt->u.dst.pmtu > 576)
				rt->u.dst.pmtu = 576;
		}
#ifdef CONFIG_NET_CLS_ROUTE
		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	} else {
		rt->u.dst.pmtu = rt->u.dst.dev->mtu;
		if (rt->u.dst.pmtu > IP_MAX_MTU)
			rt->u.dst.pmtu = IP_MAX_MTU;
	}

	if (rt->u.dst.advmss == 0)
		rt->u.dst.advmss = max(rt->u.dst.dev->mtu-40, ip_rt_min_advmss);
	if (rt->u.dst.advmss > 65535-40)
		rt->u.dst.advmss = 65535-40;

#ifdef CONFIG_NET_CLS_ROUTE
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif

	rt->rt_type = res->type;
}
static int
ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
		  u8 tos, struct net_device *dev, int our)
{
	unsigned hash;
	struct rtable *rth;
	u32 spec_dst;
	u32 itag = 0;
	struct in_device *in_dev = in_dev_get(dev);

	/* Primary sanity checks. */
	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
	    skb->protocol != __constant_htons(ETH_P_IP))
		goto e_inval;

	if (ZERONET(saddr)) {
		if (!LOCAL_MCAST(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else if (fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, &itag) < 0)
		goto e_inval;

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->u.dst.output = ip_rt_bug;

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->key.dst	= daddr;
	rth->rt_dst	= daddr;
#ifdef CONFIG_IP_ROUTE_FWMARK
	if (skb->nfreason == NF_REASON_FOR_ROUTING)
		rth->key.fwmark = skb->nfmark;
	else
		rth->key.fwmark = 0;
#endif
	rth->key.src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_IP_ROUTE_NAT
	rth->rt_dst_map	= daddr;
	rth->rt_src_map	= saddr;
#endif
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->key.iif	= dev->ifindex;
	rth->u.dst.dev	= &loopback_dev;
	dev_hold(rth->u.dst.dev);
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_type	= RTN_MULTICAST;
	rth->rt_flags	= RTCF_MULTICAST;
	if (our) {
		rth->u.dst.input = ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->u.dst.input = ip_mr_input;
#endif

	in_dev_put(in_dev);
	hash = rt_hash_code(daddr, saddr^(dev->ifindex<<5), tos);
	return rt_intern_hash(hash, rth, (struct rtable **)&skb->dst);

e_nobufs:
	in_dev_put(in_dev);
	return -ENOBUFS;

e_inval:
	in_dev_put(in_dev);
	return -EINVAL;
}
/*
 *	NOTE. We drop all packets that have a local source
 *	address, because every properly looped-back packet
 *	must have the correct destination already attached by the output routine.
 *
 *	This approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 */

int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
			u8 tos, struct net_device *dev)
{
	struct rt_key key;
	struct fib_result res;
	struct in_device *in_dev = in_dev_get(dev);
	struct in_device *out_dev = NULL;
	unsigned flags = 0;
	u32 itag = 0;
	struct rtable *rth;
	unsigned hash;
	u32 spec_dst;
	int err = -EINVAL;

	/*
	 *	IP on this device is disabled.
	 */
	if (!in_dev)
		return -EINVAL;

	key.dst = daddr;
	key.src = saddr;
	key.tos = tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
	if (skb->nfreason == NF_REASON_FOR_ROUTING)
		key.fwmark = skb->nfmark;
	else
		key.fwmark = 0;
#endif
	key.iif = dev->ifindex;
	key.oif = 0;
	key.scope = RT_SCOPE_UNIVERSE;

	hash = rt_hash_code(daddr, saddr^(key.iif<<5), tos);

	/* Check for the most weird martians, which can be not detected
	   by simple fib lookup.
	 */
	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
		goto martian_source;

	if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I even do not know to fix it or not. Waiting for complains :-)
	 */
	if (ZERONET(saddr))
		goto martian_source;

	if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
		goto martian_destination;

	/*
	 *	Now we are ready to route packet.
	 */
	if ((err = fib_lookup(&key, &res)) != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			goto e_inval;
		goto no_route;
	}

#ifdef CONFIG_IP_ROUTE_NAT
	/* Policy is applied before mapping destination,
	   but rerouting after map should be made with old source.
	 */
	if (1) {
		u32 src_map = saddr;
		if (res.r)
			src_map = fib_rules_policy(saddr, &res, &flags);

		if (res.type == RTN_NAT) {
			key.dst = fib_rules_map_destination(daddr, &res);
			if (fib_lookup(&key, &res))
				goto e_inval;
			if (res.type != RTN_UNICAST)
				goto e_inval;
			flags |= RTCF_DNAT;
		}
		key.src = src_map;
	}
#endif

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		int result;
		result = fib_validate_source(saddr, daddr, tos, loopback_dev.ifindex,
					     dev, &spec_dst, &itag);
		if (result < 0)
			goto martian_source;
		if (result)
			flags |= RTCF_DIRECTSRC;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_inval;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && key.oif == 0)
		fib_select_multipath(&key, &res);
#endif
	out_dev = in_dev_get(FIB_RES_DEV(res));
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input_slow(). Please, report\n");
		goto e_inval;
	}

	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(res), dev, &spec_dst, &itag);
	if (err < 0)
		goto martian_source;
	if (err)
		flags |= RTCF_DIRECTSRC;

	if (out_dev == in_dev && err && !(flags&(RTCF_NAT|RTCF_MASQ)) &&
	    (IN_DEV_SHARED_MEDIA(out_dev)
	     || inet_addr_onlink(out_dev, saddr, FIB_RES_GW(res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != __constant_htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 */
		if (out_dev == in_dev && !(flags&RTCF_DNAT))
			goto e_inval;
	}

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->key.dst	= daddr;
	rth->rt_dst	= daddr;
	rth->key.tos	= tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
	if (skb->nfreason == NF_REASON_FOR_ROUTING)
		rth->key.fwmark = skb->nfmark;
	else
		rth->key.fwmark = 0;
#endif
	rth->key.src	= saddr;
	rth->rt_src	= saddr;
	rth->rt_gateway	= daddr;
#ifdef CONFIG_IP_ROUTE_NAT
	rth->rt_src_map	= key.src;
	rth->rt_dst_map	= key.dst;
	if (flags&RTCF_DNAT)
		rth->rt_gateway	= key.dst;
#endif
	rth->key.iif	= dev->ifindex;
	rth->u.dst.dev	= out_dev->dev;
	dev_hold(rth->u.dst.dev);
	rth->key.oif	= 0;
	rth->rt_spec_dst= spec_dst;

	rth->u.dst.input = ip_forward;
	rth->u.dst.output = ip_output;

	rt_set_nexthop(rth, &res, itag);

	rth->rt_flags = flags;

#ifdef CONFIG_NET_FASTROUTE
	if (netdev_fastroute && !(flags&(RTCF_NAT|RTCF_MASQ|RTCF_DOREDIRECT))) {
		struct net_device *odev = rth->u.dst.dev;
		if (odev != dev &&
		    dev->accept_fastpath &&
		    odev->mtu >= dev->mtu &&
		    dev->accept_fastpath(dev, &rth->u.dst) == 0)
			rth->rt_flags |= RTCF_FAST;
	}
#endif

	err = rt_intern_hash(hash, rth, (struct rtable **)&skb->dst);

done:
	in_dev_put(in_dev);
	if (out_dev)
		in_dev_put(out_dev);
	return err;

brd_input:
	if (skb->protocol != __constant_htons(ETH_P_IP))
		goto e_inval;

	if (ZERONET(saddr)) {
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else {
		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, &itag);
		if (err < 0)
			goto martian_source;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;

local_input:
	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->u.dst.output = ip_rt_bug;

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->key.dst	= daddr;
	rth->rt_dst	= daddr;
	rth->key.tos	= tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
	if (skb->nfreason == NF_REASON_FOR_ROUTING)
		rth->key.fwmark = skb->nfmark;
	else
		rth->key.fwmark = 0;
#endif
	rth->key.src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_IP_ROUTE_NAT
	rth->rt_dst_map	= key.dst;
	rth->rt_src_map	= key.src;
#endif
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->key.iif	= dev->ifindex;
	rth->u.dst.dev	= &loopback_dev;
	dev_hold(rth->u.dst.dev);
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->u.dst.input = ip_local_deliver;
	rth->rt_flags	= flags|RTCF_LOCAL;
	if (res.type == RTN_UNREACHABLE) {
		rth->u.dst.input = ip_error;
		rth->u.dst.error = -err;
		rth->rt_flags	&= ~RTCF_LOCAL;
	}
	rth->rt_type	= res.type;
	err = rt_intern_hash(hash, rth, (struct rtable **)&skb->dst);
	goto done;

no_route:
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %08x from %08x, dev %s\n",
		       daddr, saddr, dev->name);
#endif
	err = -EINVAL;
	goto done;

e_inval:
	err = -EINVAL;
	goto done;

e_nobufs:
	err = -ENOBUFS;
	goto done;

martian_source:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation: if the source is martian,
		 *	the only hint is the MAC header.
		 */
		printk(KERN_WARNING "martian source %08x for %08x, dev %s\n",
		       saddr, daddr, dev->name);
		if (dev->hard_header_len) {
			int i;
			unsigned char *p = skb->mac.raw;
			printk(KERN_WARNING "ll header:");
			for (i = 0; i < dev->hard_header_len; i++, p++)
				printk(" %02x", *p);
			printk("\n");
		}
	}
#endif
	err = -EINVAL;
	goto done;
}
int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
		   u8 tos, struct net_device *dev)
{
	struct rtable *rth;
	unsigned hash;
	int iif = dev->ifindex;

	tos &= IPTOS_TOS_MASK;
	hash = rt_hash_code(daddr, saddr^(iif<<5), tos);

	read_lock_bh(&rt_hash_lock);
	for (rth = rt_hash_table[hash]; rth; rth = rth->u.rt_next) {
		if (rth->key.dst == daddr &&
		    rth->key.src == saddr &&
		    rth->key.iif == iif &&
		    rth->key.oif == 0 &&
#ifdef CONFIG_IP_ROUTE_FWMARK
		    rth->key.fwmark == (skb->nfreason == NF_REASON_FOR_ROUTING
					? skb->nfmark : 0) &&
#endif
		    rth->key.tos == tos) {
			rth->u.dst.lastuse = jiffies;
			dst_hold(&rth->u.dst);
			rth->u.dst.__use++;
			read_unlock_bh(&rt_hash_lock);
			skb->dst = (struct dst_entry *)rth;
			return 0;
		}
	}
	read_unlock_bh(&rt_hash_lock);

	/* Multicast recognition logic is moved from route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-(  As a result the host on a
	   multicasting network acquires a lot of useless route cache
	   entries, a sort of SDR messages from all over the world.  Now we
	   try to get rid of them.  Really, provided the software IP
	   multicast filter is organized reasonably (at least, hashed), it
	   does not result in a slowdown compared with route cache reject
	   entries.  Note that multicast routers are not affected, because a
	   route cache entry is created eventually.
	 */
	if (MULTICAST(daddr)) {
		struct in_device *in_dev;

		read_lock(&inetdev_lock);
		if ((in_dev = __in_dev_get(dev)) != NULL) {
			int our = ip_check_mc(in_dev, daddr);
			if (our
#ifdef CONFIG_IP_MROUTE
			    || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
#endif
			    ) {
				read_unlock(&inetdev_lock);
				return ip_route_input_mc(skb, daddr, saddr, tos, dev, our);
			}
		}
		read_unlock(&inetdev_lock);
		return -EINVAL;
	}
	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
}
/*
 *	Major route resolver routine.
 */

int ip_route_output_slow(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int oif)
{
	struct rt_key key;
	struct fib_result res;
	unsigned flags = 0;
	struct rtable *rth;
	struct net_device *dev_out = NULL;
	unsigned hash;
	int err;

	tos &= IPTOS_TOS_MASK|RTO_ONLINK;
	key.dst = daddr;
	key.src = saddr;
	key.tos = tos&IPTOS_TOS_MASK;
	key.iif = loopback_dev.ifindex;
	key.oif = oif;
	key.scope = (tos&RTO_ONLINK) ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r = NULL;
#endif

	if (saddr) {
		if (MULTICAST(saddr) || BADCLASS(saddr) || ZERONET(saddr))
			return -EINVAL;

		/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
		dev_out = ip_dev_find(saddr);
		if (dev_out == NULL)
			return -EINVAL;

		/* I removed check for oif == dev_out->oif here.
		   It was wrong for three reasons:
		   1. ip_dev_find(saddr) can return wrong iface, if saddr is
		      assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with saddr
		      of another iface. --ANK
		 */

		if (oif == 0 &&
		    (MULTICAST(daddr) || daddr == 0xFFFFFFFF)) {
			/* Special hack: user can direct multicasts
			   and limited broadcast via necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic, vat and friends to work.
			   They bind socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of routing cache they are broken,
			   because we are not allowed to build multicast path
			   with loopback source addr (look, routing cache
			   cannot know, that ttl is zero, so that packet
			   will not leave this host and route is valid).
			   Luckily, this hack is good workaround.
			 */

			key.oif = dev_out->ifindex;
			goto make_route;
		}
	}
	if (oif) {
		dev_out = dev_get_by_index(oif);
		if (dev_out == NULL)
			return -ENODEV;
		if (__in_dev_get(dev_out) == NULL) {
			dev_put(dev_out);
			return -ENODEV;	/* Wrong error code */
		}

		if (LOCAL_MCAST(daddr) || daddr == 0xFFFFFFFF) {
			if (!key.src)
				key.src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK);
			goto make_route;
		}
		if (!key.src) {
			if (MULTICAST(daddr))
				key.src = inet_select_addr(dev_out, 0, key.scope);
			else if (!daddr)
				key.src = inet_select_addr(dev_out, 0, RT_SCOPE_HOST);
		}
	}

	if (!key.dst) {
		key.dst = key.src;
		if (!key.dst)
			key.dst = key.src = htonl(INADDR_LOOPBACK);
		dev_out = &loopback_dev;
		dev_hold(dev_out);
		key.oif = loopback_dev.ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	if (fib_lookup(&key, &res)) {
		res.fi = NULL;
		if (oif) {
			/* Apparently, routing tables are wrong. Assume,
			   that the destination is on link.

			   Because we are allowed to send to iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, routing
			   tables are looked up with only one purpose:
			   to catch if destination is gatewayed, rather than
			   direct. Moreover, if MSG_DONTROUTE is set,
			   we send packet, ignoring both routing tables
			   and ifaddr state. --ANK

			   We could make it even if oif is unknown,
			   likely IPv6, but we do not.
			 */

			if (key.src == 0)
				key.src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		return -ENETUNREACH;
	}

	if (res.type == RTN_NAT)
		return -EINVAL;

	if (res.type == RTN_LOCAL) {
		if (!key.src)
			key.src = key.dst;
		dev_out = &loopback_dev;
		dev_hold(dev_out);
		key.oif = dev_out->ifindex;
		if (res.fi)
			fib_info_put(res.fi);
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && key.oif == 0)
		fib_select_multipath(&key, &res);
	else
#endif
	if (res.prefixlen == 0 && res.type == RTN_UNICAST && key.oif == 0)
		fib_select_default(&key, &res);

	if (!key.src)
		key.src = FIB_RES_PREFSRC(res);

	dev_out = FIB_RES_DEV(res);
	dev_hold(dev_out);
	key.oif = dev_out->ifindex;

make_route:
	if (LOOPBACK(key.src) && !(dev_out->flags&IFF_LOOPBACK))
		return -EINVAL;

	if (key.dst == 0xFFFFFFFF)
		res.type = RTN_BROADCAST;
	else if (MULTICAST(key.dst))
		res.type = RTN_MULTICAST;
	else if (BADCLASS(key.dst) || ZERONET(key.dst))
		return -EINVAL;

	if (dev_out->flags&IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	if (res.type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST|RTCF_LOCAL;
		if (res.fi) {
			fib_info_put(res.fi);
			res.fi = NULL;
		}
	} else if (res.type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST|RTCF_LOCAL;
		read_lock(&inetdev_lock);
		if (!__in_dev_get(dev_out) || !ip_check_mc(__in_dev_get(dev_out), daddr))
			flags &= ~RTCF_LOCAL;
		read_unlock(&inetdev_lock);
		/* If a multicast route does not exist, use
		   the default one, but do not gateway in this case.
		   Yes, it is a hack.
		 */
		if (res.fi && res.prefixlen < 4) {
			fib_info_put(res.fi);
			res.fi = NULL;
		}
	}

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		return -ENOBUFS;

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->key.dst	= daddr;
	rth->key.tos	= tos;
	rth->key.src	= saddr;
	rth->key.iif	= 0;
	rth->key.oif	= oif;
	rth->rt_dst	= key.dst;
	rth->rt_src	= key.src;
#ifdef CONFIG_IP_ROUTE_NAT
	rth->rt_dst_map	= key.dst;
	rth->rt_src_map	= key.src;
#endif
	rth->rt_iif	= oif ? : dev_out->ifindex;
	rth->u.dst.dev	= dev_out;
	dev_hold(dev_out);
	rth->rt_gateway = key.dst;
	rth->rt_spec_dst= key.src;

	rth->u.dst.output = ip_output;

	if (flags&RTCF_LOCAL) {
		rth->u.dst.input = ip_local_deliver;
		rth->rt_spec_dst = key.dst;
	}
	if (flags&(RTCF_BROADCAST|RTCF_MULTICAST)) {
		rth->rt_spec_dst = key.src;
		if (flags&RTCF_LOCAL && !(dev_out->flags&IFF_LOOPBACK))
			rth->u.dst.output = ip_mc_output;
#ifdef CONFIG_IP_MROUTE
		if (res.type == RTN_MULTICAST) {
			struct in_device *in_dev = in_dev_get(dev_out);
			if (in_dev) {
				if (IN_DEV_MFORWARD(in_dev) && !LOCAL_MCAST(daddr)) {
					rth->u.dst.input = ip_mr_input;
					rth->u.dst.output = ip_mc_output;
				}
				in_dev_put(in_dev);
			}
		}
#endif
	}

	rt_set_nexthop(rth, &res, 0);

	rth->rt_flags = flags;

	hash = rt_hash_code(daddr, saddr^(oif<<5), tos);
	err = rt_intern_hash(hash, rth, rp);
	return err;
}
int ip_route_output(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int oif)
{
	unsigned hash;
	struct rtable *rth;

	hash = rt_hash_code(daddr, saddr^(oif<<5), tos);

	read_lock_bh(&rt_hash_lock);
	for (rth = rt_hash_table[hash]; rth; rth = rth->u.rt_next) {
		if (rth->key.dst == daddr &&
		    rth->key.src == saddr &&
		    rth->key.iif == 0 &&
		    rth->key.oif == oif &&
		    !((rth->key.tos^tos)&(IPTOS_TOS_MASK|RTO_ONLINK)) &&
		    ((tos&RTO_TPROXY) || !(rth->rt_flags&RTCF_TPROXY))
		) {
			rth->u.dst.lastuse = jiffies;
			dst_hold(&rth->u.dst);
			rth->u.dst.__use++;
			read_unlock_bh(&rt_hash_lock);
			*rp = rth;
			return 0;
		}
	}
	read_unlock_bh(&rt_hash_lock);

	return ip_route_output_slow(rp, daddr, saddr, tos, oif);
}
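/* Typical caller pattern (added as an illustration, not from the original
 * file): resolve once, transmit via the cached dst, then drop the
 * reference taken by the lookup.
 *
 *	struct rtable *rt;
 *	if (ip_route_output(&rt, daddr, saddr, RT_TOS(tos), oif))
 *		return -EHOSTUNREACH;
 *	... send via rt->u.dst ...
 *	ip_rt_put(rt);
 */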
#ifdef CONFIG_RTNETLINK

static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, int nowait)
{
	struct rtable *rt = (struct rtable *)skb->dst;
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned char *b = skb->tail;
	struct rta_cacheinfo ci;
#ifdef CONFIG_IP_MROUTE
	struct rtattr *eptr;
#endif

	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
	r = NLMSG_DATA(nlh);
	nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	 = 32;
	r->rtm_src_len	 = 0;
	r->rtm_tos	 = rt->key.tos;
	r->rtm_table	 = RT_TABLE_MAIN;
	r->rtm_type	 = rt->rt_type;
	r->rtm_scope	 = RT_SCOPE_UNIVERSE;
	r->rtm_protocol	 = RTPROT_UNSPEC;
	r->rtm_flags	 = (rt->rt_flags&~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;
	RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
	if (rt->key.src) {
		r->rtm_src_len = 32;
		RTA_PUT(skb, RTA_SRC, 4, &rt->key.src);
	}
	if (rt->u.dst.dev)
		RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
#ifdef CONFIG_NET_CLS_ROUTE
	if (rt->u.dst.tclassid)
		RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
#endif
	if (rt->key.iif)
		RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
	else if (rt->rt_src != rt->key.src)
		RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
	if (rt->rt_dst != rt->rt_gateway)
		RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
	if (rtnetlink_put_metrics(skb, &rt->u.dst.mxlock) < 0)
		goto rtattr_failure;
	ci.rta_lastuse	= jiffies - rt->u.dst.lastuse;
	ci.rta_used	= rt->u.dst.__use;
	ci.rta_clntref	= atomic_read(&rt->u.dst.__refcnt);
	if (rt->u.dst.expires)
		ci.rta_expires = rt->u.dst.expires - jiffies;
	else
		ci.rta_expires = 0;
	ci.rta_error	= rt->u.dst.error;
#ifdef CONFIG_IP_MROUTE
	eptr = (struct rtattr *)skb->tail;
#endif
	RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
	if (rt->key.iif) {
#ifdef CONFIG_IP_MROUTE
		u32 dst = rt->rt_dst;

		if (MULTICAST(dst) && !LOCAL_MCAST(dst) && ipv4_devconf.mc_forwarding) {
			int err = ipmr_get_route(skb, r, nowait);
			if (err <= 0) {
				if (err == -EMSGSIZE)
					goto nlmsg_failure;
				((struct rta_cacheinfo *)RTA_DATA(eptr))->rta_error = err;
			}
		} else
#endif
			RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->key.iif);
	}

	nlh->nlmsg_len = skb->tail - b;
	return skb->len;

nlmsg_failure:
rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}
int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
{
	struct rtattr **rta = arg;
	struct rtmsg *rtm = NLMSG_DATA(nlh);
	struct rtable *rt = NULL;
	u32 dst = 0;
	u32 src = 0;
	int iif = 0;
	int err;
	struct sk_buff *skb;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	/* Reserve room for dummy headers, this skb can pass
	   through good chunk of routing engine.
	 */
	skb->mac.raw = skb->data;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	if (rta[RTA_SRC-1])
		memcpy(&src, RTA_DATA(rta[RTA_SRC-1]), 4);
	if (rta[RTA_DST-1])
		memcpy(&dst, RTA_DATA(rta[RTA_DST-1]), 4);
	if (rta[RTA_IIF-1])
		memcpy(&iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int));

	if (iif) {
		struct net_device *dev;
		dev = __dev_get_by_index(iif);
		if (!dev)
			return -ENODEV;
		skb->protocol = __constant_htons(ETH_P_IP);
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		rt = (struct rtable *)skb->dst;
		if (!err && rt->u.dst.error)
			err = -rt->u.dst.error;
	} else {
		int oif = 0;
		if (rta[RTA_OIF-1])
			memcpy(&oif, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
		err = ip_route_output(&rt, dst, src, rtm->rtm_tos, oif);
	}

	skb->dst = &rt->u.dst;
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;

	err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, RTM_NEWROUTE, 0);

	err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
	return err;
}
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;

	s_h = cb->args[0];
	s_idx = idx = cb->args[1];
	for (h = 0; h < RT_HASH_DIVISOR; h++) {
		if (h < s_h)
			continue;
		read_lock_bh(&rt_hash_lock);
		for (rt = rt_hash_table[h], idx = 0; rt; rt = rt->u.rt_next, idx++) {
			if (idx < s_idx)
				continue;
			skb->dst = dst_clone(&rt->u.dst);
			if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE, 1) <= 0) {
				dst_release(xchg(&skb->dst, NULL));
				read_unlock_bh(&rt_hash_lock);
				goto done;
			}
			dst_release(xchg(&skb->dst, NULL));
		}
		read_unlock_bh(&rt_hash_lock);
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}

#endif /* CONFIG_RTNETLINK */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(0);
}
#ifdef CONFIG_SYSCTL

static int flush_delay;

static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file *filp,
				     void *buffer, size_t *lenp)
{
	if (write) {
		proc_dointvec(ctl, write, filp, buffer, lenp);
		rt_cache_flush(flush_delay);
		return 0;
	}
	return -EINVAL;
}
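/* Usage note (added): this handler backs /proc/sys/net/ipv4/route/flush,
 * registered via ipv4_route_table below.  Writing an integer to it, e.g.
 *
 *	echo 0 > /proc/sys/net/ipv4/route/flush
 *
 * stores the value in flush_delay and calls rt_cache_flush() with it:
 * zero flushes the routing cache immediately, a negative value uses
 * ip_rt_min_delay, and a positive value schedules the flush that many
 * jiffies ahead.
 */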
static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, int *name, int nlen,
					      void *oldval, size_t *oldlenp,
					      void *newval, size_t newlen,
					      void **context)
{
	int delay;

	if (newlen != sizeof(int))
		return -EINVAL;
	if (get_user(delay, (int *)newval))
		return -EFAULT;
	rt_cache_flush(delay);
	return 0;
}
ctl_table ipv4_route_table[] = {
	{NET_IPV4_ROUTE_FLUSH, "flush",
	 &flush_delay, sizeof(int), 0644, NULL,
	 &ipv4_sysctl_rtcache_flush, &ipv4_sysctl_rtcache_flush_strategy },
	{NET_IPV4_ROUTE_MIN_DELAY, "min_delay",
	 &ip_rt_min_delay, sizeof(int), 0644, NULL,
	 &proc_dointvec_jiffies, &sysctl_jiffies },
	{NET_IPV4_ROUTE_MAX_DELAY, "max_delay",
	 &ip_rt_max_delay, sizeof(int), 0644, NULL,
	 &proc_dointvec_jiffies, &sysctl_jiffies },
	{NET_IPV4_ROUTE_GC_THRESH, "gc_thresh",
	 &ipv4_dst_ops.gc_thresh, sizeof(int), 0644, NULL,
	 &proc_dointvec },
	{NET_IPV4_ROUTE_MAX_SIZE, "max_size",
	 &ip_rt_max_size, sizeof(int), 0644, NULL,
	 &proc_dointvec },
	{NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval",
	 &ip_rt_gc_min_interval, sizeof(int), 0644, NULL,
	 &proc_dointvec_jiffies, &sysctl_jiffies },
	{NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout",
	 &ip_rt_gc_timeout, sizeof(int), 0644, NULL,
	 &proc_dointvec_jiffies, &sysctl_jiffies },
	{NET_IPV4_ROUTE_GC_INTERVAL, "gc_interval",
	 &ip_rt_gc_interval, sizeof(int), 0644, NULL,
	 &proc_dointvec_jiffies, &sysctl_jiffies },
	{NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load",
	 &ip_rt_redirect_load, sizeof(int), 0644, NULL,
	 &proc_dointvec },
	{NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number",
	 &ip_rt_redirect_number, sizeof(int), 0644, NULL,
	 &proc_dointvec },
	{NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence",
	 &ip_rt_redirect_silence, sizeof(int), 0644, NULL,
	 &proc_dointvec },
	{NET_IPV4_ROUTE_ERROR_COST, "error_cost",
	 &ip_rt_error_cost, sizeof(int), 0644, NULL,
	 &proc_dointvec },
	{NET_IPV4_ROUTE_ERROR_BURST, "error_burst",
	 &ip_rt_error_burst, sizeof(int), 0644, NULL,
	 &proc_dointvec },
	{NET_IPV4_ROUTE_GC_ELASTICITY, "gc_elasticity",
	 &ip_rt_gc_elasticity, sizeof(int), 0644, NULL,
	 &proc_dointvec },
	{NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires",
	 &ip_rt_mtu_expires, sizeof(int), 0644, NULL,
	 &proc_dointvec_jiffies, &sysctl_jiffies },
	{NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu",
	 &ip_rt_min_pmtu, sizeof(int), 0644, NULL,
	 &proc_dointvec },
	{NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss",
	 &ip_rt_min_advmss, sizeof(int), 0644, NULL,
	 &proc_dointvec },
	{0}
};
#endif
#ifdef CONFIG_NET_CLS_ROUTE
struct ip_rt_acct ip_rt_acct[256];
rwlock_t ip_rt_acct_lock = RW_LOCK_UNLOCKED;

#ifdef CONFIG_PROC_FS
static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
			   int length, int *eof, void *data)
{
	if (offset + length > sizeof(ip_rt_acct)) {
		length = sizeof(ip_rt_acct) - offset;
		*eof = 1;
	}
	if (length > 0) {
		read_lock_bh(&ip_rt_acct_lock);
		memcpy(buffer, ((u8 *)&ip_rt_acct) + offset, length);
		read_unlock_bh(&ip_rt_acct_lock);
		return length;
	}
	return 0;
}
#endif
#endif
void __init ip_rt_init(void)
{
#ifdef CONFIG_PROC_FS
#ifdef CONFIG_NET_CLS_ROUTE
	struct proc_dir_entry *ent;
#endif
#endif

	ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
						     sizeof(struct rtable),
						     0, SLAB_HWCACHE_ALIGN,
						     NULL, NULL);

	rt_periodic_timer.function = rt_check_expire;
	/* All the timers, started at system startup tend
	   to synchronize. Perturb it a bit.
	 */
	rt_periodic_timer.expires = jiffies + net_random()%ip_rt_gc_interval
				    + ip_rt_gc_interval;
	add_timer(&rt_periodic_timer);

#ifdef CONFIG_PROC_FS
	proc_net_register(&(struct proc_dir_entry) {
		PROC_NET_RTCACHE, 8, "rt_cache",
		S_IFREG | S_IRUGO, 1, 0, 0,
		0, &proc_net_inode_operations,
		rt_cache_get_info
	});
#ifdef CONFIG_NET_CLS_ROUTE
	ent = create_proc_entry("net/rt_acct", 0, 0);
	ent->read_proc = ip_rt_acct_read;
#endif
#endif
}