[mascara-docs.git] / i386 / linux / linux-2.3.21 / net / ipv4 / route.c
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * ROUTE - implementation of the IP router.
8 * Version: $Id: route.c,v 1.72 1999/08/30 10:17:12 davem Exp $
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Alan Cox, <gw4pts@gw4pts.ampr.org>
13 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
16 * Fixes:
17 * Alan Cox : Verify area fixes.
18 * Alan Cox : cli() protects routing changes
19 * Rui Oliveira : ICMP routing table updates
20 * (rco@di.uminho.pt) Routing table insertion and update
21 * Linus Torvalds : Rewrote bits to be sensible
22 * Alan Cox : Added BSD route gw semantics
23 * Alan Cox : Super /proc >4K
24 * Alan Cox : MTU in route table
25 * Alan Cox : MSS actually. Also added the window
26 * clamper.
27 * Sam Lantinga : Fixed route matching in rt_del()
28 * Alan Cox : Routing cache support.
29 * Alan Cox : Removed compatibility cruft.
30 * Alan Cox : RTF_REJECT support.
31 * Alan Cox : TCP irtt support.
32 * Jonathan Naylor : Added Metric support.
33 * Miquel van Smoorenburg : BSD API fixes.
34 * Miquel van Smoorenburg : Metrics.
35 * Alan Cox : Use __u32 properly
36 * Alan Cox : Aligned routing errors more closely with BSD
37 * though our system is still very different.
38 * Alan Cox : Faster /proc handling
39 * Alexey Kuznetsov : Massive rework to support tree based routing,
40 * routing caches and better behaviour.
42 * Olaf Erb : irtt wasn't being copied right.
43 * Bjorn Ekwall : Kerneld route support.
44 * Alan Cox : Multicast fixed (I hope)
45 * Pavel Krauz : Limited broadcast fixed
46 * Mike McLagan : Routing by source
47 * Alexey Kuznetsov : End of old history. Split to fib.c and
48 * route.c and rewritten from scratch.
49 * Andi Kleen : Load-limit warning messages.
50 * Vitaly E. Lavrov : Transparent proxy revived after a year-long coma.
51 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
52 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
53 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
54 * Marc Boucher : routing by fwmark
56 * This program is free software; you can redistribute it and/or
57 * modify it under the terms of the GNU General Public License
58 * as published by the Free Software Foundation; either version
59 * 2 of the License, or (at your option) any later version.
62 #include <linux/config.h>
63 #include <asm/uaccess.h>
64 #include <asm/system.h>
65 #include <asm/bitops.h>
66 #include <linux/types.h>
67 #include <linux/kernel.h>
68 #include <linux/sched.h>
69 #include <linux/mm.h>
70 #include <linux/string.h>
71 #include <linux/socket.h>
72 #include <linux/sockios.h>
73 #include <linux/errno.h>
74 #include <linux/in.h>
75 #include <linux/inet.h>
76 #include <linux/netdevice.h>
77 #include <linux/proc_fs.h>
78 #include <linux/init.h>
79 #include <linux/skbuff.h>
80 #include <linux/rtnetlink.h>
81 #include <linux/inetdevice.h>
82 #include <linux/igmp.h>
83 #include <linux/pkt_sched.h>
84 #include <linux/mroute.h>
85 #include <linux/netfilter_ipv4.h>
86 #include <net/protocol.h>
87 #include <net/ip.h>
88 #include <net/route.h>
89 #include <net/sock.h>
90 #include <net/ip_fib.h>
91 #include <net/arp.h>
92 #include <net/tcp.h>
93 #include <net/icmp.h>
94 #ifdef CONFIG_SYSCTL
95 #include <linux/sysctl.h>
96 #endif
98 #define IP_MAX_MTU 0xFFF0
100 #define RT_GC_TIMEOUT (300*HZ)
102 int ip_rt_min_delay = 2*HZ;
103 int ip_rt_max_delay = 10*HZ;
104 int ip_rt_gc_thresh = RT_HASH_DIVISOR;
105 int ip_rt_max_size = RT_HASH_DIVISOR*16;
106 int ip_rt_gc_timeout = RT_GC_TIMEOUT;
107 int ip_rt_gc_interval = 60*HZ;
108 int ip_rt_gc_min_interval = 5*HZ;
109 int ip_rt_redirect_number = 9;
110 int ip_rt_redirect_load = HZ/50;
111 int ip_rt_redirect_silence = ((HZ/50) << (9+1));
112 int ip_rt_error_cost = HZ;
113 int ip_rt_error_burst = 5*HZ;
114 int ip_rt_gc_elasticity = 8;
115 int ip_rt_mtu_expires = 10*60*HZ;
116 int ip_rt_min_pmtu = 512+20+20;
117 int ip_rt_min_advmss = 536;
119 static unsigned long rt_deadline = 0;
121 #define RTprint(a...) printk(KERN_DEBUG a)
123 static void rt_run_flush(unsigned long dummy);
125 static struct timer_list rt_flush_timer =
126 { NULL, NULL, 0, 0L, rt_run_flush };
127 static struct timer_list rt_periodic_timer =
128 { NULL, NULL, 0, 0L, NULL };
131 * Interface to generic destination cache.
134 static struct dst_entry * ipv4_dst_check(struct dst_entry * dst, u32);
135 static struct dst_entry * ipv4_dst_reroute(struct dst_entry * dst,
136 struct sk_buff *);
137 static struct dst_entry * ipv4_negative_advice(struct dst_entry *);
138 static void ipv4_link_failure(struct sk_buff *skb);
139 static int rt_garbage_collect(void);
142 struct dst_ops ipv4_dst_ops =
144 AF_INET,
145 __constant_htons(ETH_P_IP),
146 RT_HASH_DIVISOR,
148 rt_garbage_collect,
149 ipv4_dst_check,
150 ipv4_dst_reroute,
151 NULL,
152 ipv4_negative_advice,
153 ipv4_link_failure,
154 sizeof(struct rtable),
157 __u8 ip_tos2prio[16] = {
158 TC_PRIO_BESTEFFORT,
159 TC_PRIO_FILLER,
160 TC_PRIO_BESTEFFORT,
161 TC_PRIO_FILLER,
162 TC_PRIO_BULK,
163 TC_PRIO_FILLER,
164 TC_PRIO_BULK,
165 TC_PRIO_FILLER,
166 TC_PRIO_INTERACTIVE,
167 TC_PRIO_FILLER,
168 TC_PRIO_INTERACTIVE,
169 TC_PRIO_FILLER,
170 TC_PRIO_INTERACTIVE_BULK,
171 TC_PRIO_FILLER,
172 TC_PRIO_INTERACTIVE_BULK,
173 TC_PRIO_FILLER
178 * Route cache.
181 /* The locking scheme is rather straightforward:
183 * 1) A BH protected rwlock protects the central route hash.
184 * 2) Only writers remove entries, and they hold the lock
185 * as they look at rtable reference counts.
186 * 3) Only readers acquire references to rtable entries,
187 * they do so with atomic increments and with the
188 * lock held.
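/* A minimal sketch of the reader-side pattern described above, as it is
 * used by ip_route_input() and ip_route_output() further down:
 *
 *	read_lock_bh(&rt_hash_lock);
 *	for (rth = rt_hash_table[hash]; rth; rth = rth->u.rt_next)
 *		if (keys match) {
 *			dst_hold(&rth->u.dst);	(atomic refcount increment)
 *			break;
 *		}
 *	read_unlock_bh(&rt_hash_lock);
 *
 * Writers (rt_intern_hash, rt_del, the garbage collector) take the
 * write lock and are the only ones that unlink entries, so a reader
 * never sees an entry freed while it holds the lock.
 */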
191 static struct rtable *rt_hash_table[RT_HASH_DIVISOR];
192 static rwlock_t rt_hash_lock = RW_LOCK_UNLOCKED;
194 static int rt_intern_hash(unsigned hash, struct rtable * rth, struct rtable ** res);
196 static __inline__ unsigned rt_hash_code(u32 daddr, u32 saddr, u8 tos)
198 unsigned hash = ((daddr&0xF0F0F0F0)>>4)|((daddr&0x0F0F0F0F)<<4);
199 hash = hash^saddr^tos;
200 hash = hash^(hash>>16);
201 return (hash^(hash>>8)) & 0xFF;
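/* A worked example of the hash above, with saddr = 0, tos = 0 and
 * daddr = 0x12345678:
 *	nibble swap:  (0x10305070>>4) | (0x02040608<<4) = 0x01030507 | 0x20406080
 *	              = 0x21436587
 *	fold 16 bits: 0x21436587 ^ 0x00002143 = 0x214344C4
 *	fold 8 bits:  (0xC4 ^ 0x44) & 0xFF = 0x80
 * so the entry lands in bucket 128.  The final "& 0xFF" assumes the
 * table has 256 buckets, i.e. that RT_HASH_DIVISOR is 256.
 */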
204 #ifdef CONFIG_PROC_FS
206 static int rt_cache_get_info(char *buffer, char **start, off_t offset, int length, int dummy)
208 int len=0;
209 off_t pos=0;
210 char temp[129];
211 struct rtable *r;
212 int i;
214 pos = 128;
216 if (offset<128) {
217 sprintf(buffer,"%-127s\n", "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\tMetric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\tHHUptod\tSpecDst");
218 len = 128;
222 read_lock_bh(&rt_hash_lock);
224 for (i = 0; i<RT_HASH_DIVISOR; i++) {
225 for (r = rt_hash_table[i]; r; r = r->u.rt_next) {
227 * Spin through entries until we are ready
229 pos += 128;
231 if (pos <= offset) {
232 len = 0;
233 continue;
235 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
236 r->u.dst.dev ? r->u.dst.dev->name : "*",
237 (unsigned long)r->rt_dst,
238 (unsigned long)r->rt_gateway,
239 r->rt_flags,
240 atomic_read(&r->u.dst.__refcnt),
241 r->u.dst.__use,
243 (unsigned long)r->rt_src, (int)r->u.dst.advmss + 40,
244 r->u.dst.window,
245 (int)((r->u.dst.rtt>>3) + r->u.dst.rttvar),
246 r->key.tos,
247 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
248 r->u.dst.hh ? (r->u.dst.hh->hh_output == dev_queue_xmit) : 0,
249 r->rt_spec_dst);
250 sprintf(buffer+len,"%-127s\n",temp);
251 len += 128;
252 if (pos >= offset+length)
253 goto done;
257 done:
258 read_unlock_bh(&rt_hash_lock);
260 *start = buffer+len-(pos-offset);
261 len = pos-offset;
262 if (len>length)
263 len = length;
264 return len;
266 #endif
268 static __inline__ void rt_free(struct rtable *rt)
270 dst_free(&rt->u.dst);
273 static __inline__ void rt_drop(struct rtable *rt)
275 ip_rt_put(rt);
276 dst_free(&rt->u.dst);
279 static __inline__ int rt_fast_clean(struct rtable *rth)
281 /* Kill broadcast/multicast entries very aggressively, if they
282 collide in hash table with more useful entries */
283 return ((rth->rt_flags&(RTCF_BROADCAST|RTCF_MULTICAST))
284 && rth->key.iif && rth->u.rt_next);
287 static __inline__ int rt_valuable(struct rtable *rth)
289 return ((rth->rt_flags&(RTCF_REDIRECTED|RTCF_NOTIFY))
290 || rth->u.dst.expires);
293 static __inline__ int rt_may_expire(struct rtable *rth, int tmo1, int tmo2)
295 int age;
297 if (atomic_read(&rth->u.dst.__refcnt))
298 return 0;
300 if (rth->u.dst.expires && (long)(rth->u.dst.expires - jiffies) <= 0)
301 return 1;
303 age = jiffies - rth->u.dst.lastuse;
304 if (age <= tmo1 && !rt_fast_clean(rth))
305 return 0;
306 if (age <= tmo2 && rt_valuable(rth))
307 return 0;
308 return 1;
311 /* This runs via a timer and thus is always in BH context. */
312 static void rt_check_expire(unsigned long dummy)
314 int i;
315 static int rover;
316 struct rtable *rth, **rthp;
317 unsigned long now = jiffies;
319 for (i=0; i<RT_HASH_DIVISOR/5; i++) {
320 unsigned tmo = ip_rt_gc_timeout;
322 rover = (rover + 1) & (RT_HASH_DIVISOR-1);
323 rthp = &rt_hash_table[rover];
325 write_lock(&rt_hash_lock);
326 while ((rth = *rthp) != NULL) {
327 if (rth->u.dst.expires) {
328 /* Entry is expired even if it is in use */
329 if ((long)(now - rth->u.dst.expires) <= 0) {
330 tmo >>= 1;
331 rthp = &rth->u.rt_next;
332 continue;
334 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
335 tmo >>= 1;
336 rthp = &rth->u.rt_next;
337 continue;
341 * Clean up aged-off entries.
343 *rthp = rth->u.rt_next;
344 rt_free(rth);
346 write_unlock(&rt_hash_lock);
348 /* Fallback loop breaker. */
349 if ((jiffies - now) > 0)
350 break;
352 rt_periodic_timer.expires = now + ip_rt_gc_interval;
353 add_timer(&rt_periodic_timer);
356 /* This can run from both BH and non-BH contexts, the latter
357 * in the case of a forced flush event.
359 static void rt_run_flush(unsigned long dummy)
361 int i;
362 struct rtable * rth, * next;
364 rt_deadline = 0;
366 for (i=0; i<RT_HASH_DIVISOR; i++) {
367 write_lock_bh(&rt_hash_lock);
368 rth = rt_hash_table[i];
369 rt_hash_table[i] = NULL;
370 write_unlock_bh(&rt_hash_lock);
372 for (; rth; rth=next) {
373 next = rth->u.rt_next;
374 rt_free(rth);
379 static spinlock_t rt_flush_lock = SPIN_LOCK_UNLOCKED;
381 void rt_cache_flush(int delay)
383 unsigned long now = jiffies;
384 int user_mode = !in_interrupt();
386 if (delay < 0)
387 delay = ip_rt_min_delay;
389 spin_lock_bh(&rt_flush_lock);
391 if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
392 long tmo = (long)(rt_deadline - now);
394 /* If the flush timer is already running
395 and the flush request is not immediate (delay > 0):
397 if the deadline has not been reached, prolong the timer to "delay",
398 otherwise fire it at the deadline.
401 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
402 tmo = 0;
404 if (delay > tmo)
405 delay = tmo;
408 if (delay <= 0) {
409 spin_unlock_bh(&rt_flush_lock);
410 rt_run_flush(0);
411 return;
414 if (rt_deadline == 0)
415 rt_deadline = now + ip_rt_max_delay;
417 rt_flush_timer.expires = now + delay;
418 add_timer(&rt_flush_timer);
419 spin_unlock_bh(&rt_flush_lock);
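/* Rough behaviour of the logic above, assuming the default sysctls
 * ip_rt_min_delay = 2*HZ and ip_rt_max_delay = 10*HZ:
 * rt_cache_flush(-1) arms the flush timer two seconds out,
 * rt_cache_flush(0) (or any delay <= 0) flushes synchronously via
 * rt_run_flush(), and repeated delayed requests can never push the
 * flush past the "now + 10 s" deadline recorded in rt_deadline when
 * the first timer was armed.
 */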
423 Short description of GC goals.
425 We want to build an algorithm which keeps the routing cache
426 at an equilibrium point, where the number of aged-off entries
427 stays approximately equal to the number of newly generated ones.
429 The current expiration strength is the variable "expire".
430 We try to adjust it dynamically, so that when the network
431 is idle, expire is large enough to keep plenty of warm entries,
432 and when load increases it shrinks to limit the cache size.
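/* In numbers, assuming RT_HASH_DIVISOR is 256: with the default
 * ip_rt_gc_elasticity of 8 the collector below only starts trimming
 * once the cache exceeds about 256*8 = 2048 entries (goal > 0), and it
 * refuses to run more often than ip_rt_gc_min_interval (5 s) unless
 * the cache has already reached ip_rt_max_size (256*16 = 4096 entries).
 */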
435 static int rt_garbage_collect(void)
437 static unsigned expire = RT_GC_TIMEOUT;
438 static unsigned long last_gc;
439 static int rover;
440 static int equilibrium;
441 struct rtable *rth, **rthp;
442 unsigned long now = jiffies;
443 int goal;
446 * Garbage collection is pretty expensive,
447 * do not run it too frequently.
449 if (now - last_gc < ip_rt_gc_min_interval &&
450 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
451 return 0;
453 /* Calculate the number of entries we want to expire now. */
454 goal = atomic_read(&ipv4_dst_ops.entries) - RT_HASH_DIVISOR*ip_rt_gc_elasticity;
455 if (goal <= 0) {
456 if (equilibrium < ipv4_dst_ops.gc_thresh)
457 equilibrium = ipv4_dst_ops.gc_thresh;
458 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
459 if (goal > 0) {
460 equilibrium += min(goal/2, RT_HASH_DIVISOR);
461 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
463 } else {
464 /* We are in a dangerous area. Try to reduce the cache really
465 * aggressively.
467 goal = max(goal/2, RT_HASH_DIVISOR);
468 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
471 if (now - last_gc >= ip_rt_gc_min_interval)
472 last_gc = now;
474 if (goal <= 0) {
475 equilibrium += goal;
476 goto work_done;
479 do {
480 int i, k;
482 /* The write lock is held during the entire hash
483 * traversal to ensure consistent state of the rover.
485 write_lock_bh(&rt_hash_lock);
486 for (i=0, k=rover; i<RT_HASH_DIVISOR; i++) {
487 unsigned tmo = expire;
489 k = (k + 1) & (RT_HASH_DIVISOR-1);
490 rthp = &rt_hash_table[k];
491 while ((rth = *rthp) != NULL) {
492 if (!rt_may_expire(rth, tmo, expire)) {
493 tmo >>= 1;
494 rthp = &rth->u.rt_next;
495 continue;
497 *rthp = rth->u.rt_next;
498 rt_free(rth);
499 goal--;
501 if (goal <= 0)
502 break;
504 rover = k;
505 write_unlock_bh(&rt_hash_lock);
507 if (goal <= 0)
508 goto work_done;
510 /* Goal is not achieved. We stop the process if:
512 - expire has been reduced to zero (otherwise expire is halved),
513 - the table is not full,
514 - we are called from interrupt context,
515 - the jiffies check is just a fallback/debug loop breaker.
516 We will not spin here for a long time in any case.
519 if (expire == 0)
520 break;
522 expire >>= 1;
523 #if RT_CACHE_DEBUG >= 2
524 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire, atomic_read(&ipv4_dst_ops.entries), goal, i);
525 #endif
527 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
528 return 0;
529 } while (!in_interrupt() && jiffies - now < 1);
531 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
532 return 0;
533 if (net_ratelimit())
534 printk("dst cache overflow\n");
535 return 1;
537 work_done:
538 expire += ip_rt_gc_min_interval;
539 if (expire > ip_rt_gc_timeout ||
540 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
541 expire = ip_rt_gc_timeout;
542 #if RT_CACHE_DEBUG >= 2
543 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire, atomic_read(&ipv4_dst_ops.entries), goal, rover);
544 #endif
545 return 0;
548 static int rt_intern_hash(unsigned hash, struct rtable * rt, struct rtable ** rp)
550 struct rtable *rth, **rthp;
551 unsigned long now = jiffies;
552 int attempts = !in_interrupt();
554 restart:
555 rthp = &rt_hash_table[hash];
557 write_lock_bh(&rt_hash_lock);
558 while ((rth = *rthp) != NULL) {
559 if (memcmp(&rth->key, &rt->key, sizeof(rt->key)) == 0) {
560 /* Put it first */
561 *rthp = rth->u.rt_next;
562 rth->u.rt_next = rt_hash_table[hash];
563 rt_hash_table[hash] = rth;
565 rth->u.dst.__use++;
566 dst_hold(&rth->u.dst);
567 rth->u.dst.lastuse = now;
568 write_unlock_bh(&rt_hash_lock);
570 rt_drop(rt);
571 *rp = rth;
572 return 0;
575 rthp = &rth->u.rt_next;
578 /* Try to bind the route to ARP only if it is an output
579 route or on the unicast forwarding path.
581 if (rt->rt_type == RTN_UNICAST || rt->key.iif == 0) {
582 if (!arp_bind_neighbour(&rt->u.dst)) {
583 write_unlock_bh(&rt_hash_lock);
585 /* Neighbour tables are full and nothing
586 can be released. Try to shrink the route cache;
587 most likely it holds some neighbour records.
589 if (attempts-- > 0) {
590 int saved_elasticity = ip_rt_gc_elasticity;
591 int saved_int = ip_rt_gc_min_interval;
592 ip_rt_gc_elasticity = 1;
593 ip_rt_gc_min_interval = 0;
594 rt_garbage_collect();
595 ip_rt_gc_min_interval = saved_int;
596 ip_rt_gc_elasticity = saved_elasticity;
597 goto restart;
600 if (net_ratelimit()) {
601 if ((rt->u.dst.dev->flags&IFF_UP) &&
602 __in_dev_get(rt->u.dst.dev))
603 printk("Neighbour table overflow.\n");
604 else
605 printk("Device %s is down.\n", rt->u.dst.dev->name);
607 rt_drop(rt);
608 return -ENOBUFS;
612 rt->u.rt_next = rt_hash_table[hash];
613 #if RT_CACHE_DEBUG >= 2
614 if (rt->u.rt_next) {
615 struct rtable * trt;
616 printk("rt_cache @%02x: %08x", hash, rt->rt_dst);
617 for (trt=rt->u.rt_next; trt; trt=trt->u.rt_next)
618 printk(" . %08x", trt->rt_dst);
619 printk("\n");
621 #endif
622 rt_hash_table[hash] = rt;
623 write_unlock_bh(&rt_hash_lock);
624 *rp = rt;
625 return 0;
628 static void rt_del(unsigned hash, struct rtable *rt)
630 struct rtable **rthp;
632 write_lock_bh(&rt_hash_lock);
633 ip_rt_put(rt);
634 for (rthp = &rt_hash_table[hash]; *rthp; rthp = &(*rthp)->u.rt_next) {
635 if (*rthp == rt) {
636 *rthp = rt->u.rt_next;
637 rt_free(rt);
638 break;
641 write_unlock_bh(&rt_hash_lock);
644 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
645 u32 saddr, u8 tos, struct net_device *dev)
647 int i, k;
648 struct in_device *in_dev = in_dev_get(dev);
649 struct rtable *rth, **rthp;
650 u32 skeys[2] = { saddr, 0 };
651 int ikeys[2] = { dev->ifindex, 0 };
653 tos &= IPTOS_TOS_MASK;
655 if (!in_dev)
656 return;
658 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
659 || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
660 goto reject_redirect;
662 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
663 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
664 goto reject_redirect;
665 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
666 goto reject_redirect;
667 } else {
668 if (inet_addr_type(new_gw) != RTN_UNICAST)
669 goto reject_redirect;
672 for (i=0; i<2; i++) {
673 for (k=0; k<2; k++) {
674 unsigned hash = rt_hash_code(daddr, skeys[i]^(ikeys[k]<<5), tos);
676 rthp=&rt_hash_table[hash];
678 read_lock(&rt_hash_lock);
679 while ( (rth = *rthp) != NULL) {
680 struct rtable *rt;
682 if (rth->key.dst != daddr ||
683 rth->key.src != skeys[i] ||
684 rth->key.tos != tos ||
685 rth->key.oif != ikeys[k] ||
686 rth->key.iif != 0) {
687 rthp = &rth->u.rt_next;
688 continue;
691 if (rth->rt_dst != daddr ||
692 rth->rt_src != saddr ||
693 rth->u.dst.error ||
694 rth->rt_gateway != old_gw ||
695 rth->u.dst.dev != dev)
696 break;
698 dst_clone(&rth->u.dst);
699 read_unlock(&rt_hash_lock);
701 rt = dst_alloc(&ipv4_dst_ops);
702 if (rt == NULL) {
703 ip_rt_put(rth);
704 in_dev_put(in_dev);
705 return;
709 * Copy all the information.
711 *rt = *rth;
712 rt->u.dst.__use = 1;
713 atomic_set(&rt->u.dst.__refcnt, 1);
714 if (rt->u.dst.dev)
715 dev_hold(rt->u.dst.dev);
716 rt->u.dst.lastuse = jiffies;
717 rt->u.dst.neighbour = NULL;
718 rt->u.dst.hh = NULL;
719 rt->u.dst.obsolete = 0;
721 rt->rt_flags |= RTCF_REDIRECTED;
723 /* Gateway is different ... */
724 rt->rt_gateway = new_gw;
726 /* Redirect received -> path was valid */
727 dst_confirm(&rth->u.dst);
729 if (!arp_bind_neighbour(&rt->u.dst) ||
730 !(rt->u.dst.neighbour->nud_state&NUD_VALID)) {
731 if (rt->u.dst.neighbour)
732 neigh_event_send(rt->u.dst.neighbour, NULL);
733 ip_rt_put(rth);
734 rt_drop(rt);
735 goto do_next;
738 rt_del(hash, rt);
739 if (!rt_intern_hash(hash, rt, &rt))
740 ip_rt_put(rt);
741 goto do_next;
743 read_unlock(&rt_hash_lock);
744 do_next:
748 in_dev_put(in_dev);
749 return;
751 reject_redirect:
752 #ifdef CONFIG_IP_ROUTE_VERBOSE
753 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
754 printk(KERN_INFO "Redirect from %lX/%s to %lX ignored. "
755 "Path = %lX -> %lX, tos %02x\n",
756 ntohl(old_gw), dev->name, ntohl(new_gw),
757 ntohl(saddr), ntohl(daddr), tos);
758 #endif
759 in_dev_put(in_dev);
762 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
764 struct rtable *rt = (struct rtable*)dst;
766 if (rt != NULL) {
767 if (dst->obsolete) {
768 ip_rt_put(rt);
769 return NULL;
771 if ((rt->rt_flags&RTCF_REDIRECTED) || rt->u.dst.expires) {
772 unsigned hash = rt_hash_code(rt->key.dst, rt->key.src^(rt->key.oif<<5), rt->key.tos);
773 #if RT_CACHE_DEBUG >= 1
774 printk(KERN_DEBUG "ip_rt_advice: redirect to %d.%d.%d.%d/%02x dropped\n", NIPQUAD(rt->rt_dst), rt->key.tos);
775 #endif
776 rt_del(hash, rt);
777 return NULL;
780 return dst;
784 * Algorithm:
785 * 1. The first ip_rt_redirect_number redirects are sent
786 * with exponential backoff, then we stop sending them at all,
787 * assuming that the host ignores our redirects.
788 * 2. If we did not see packets requiring redirects
789 * during ip_rt_redirect_silence, we assume that the host
790 * forgot the redirected route and we start sending redirects again.
792 * This algorithm is much cheaper and more intelligent than dumb load limiting
793 * in icmp.c.
795 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
796 * and "frag. need" (breaks PMTU discovery) in icmp.c.
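/* With the defaults above (and assuming HZ = 100),
 * ip_rt_redirect_load = HZ/50 is 20 ms, and each successive redirect
 * requires the gap since the previous one to double: 20 ms, 40 ms,
 * 80 ms, ... roughly 5 s before the ninth and last one, after which
 * rate_tokens reaches ip_rt_redirect_number and we go silent.
 * ip_rt_redirect_silence = (HZ/50) << 10 is about 20 s of quiet, after
 * which rate_tokens is reset and redirects resume.
 */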
799 void ip_rt_send_redirect(struct sk_buff *skb)
801 struct rtable *rt = (struct rtable*)skb->dst;
802 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
804 if (!in_dev)
805 return;
807 if (!IN_DEV_TX_REDIRECTS(in_dev))
808 goto out;
810 /* No redirected packets during ip_rt_redirect_silence;
811 * reset the algorithm.
813 if (jiffies - rt->u.dst.rate_last > ip_rt_redirect_silence)
814 rt->u.dst.rate_tokens = 0;
816 /* Too many ignored redirects; do not send anything,
817 * just set u.dst.rate_last to the last seen redirected packet.
819 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
820 rt->u.dst.rate_last = jiffies;
821 goto out;
824 /* Check for load limit; set rate_last to the latest sent
825 * redirect.
827 if (jiffies - rt->u.dst.rate_last > (ip_rt_redirect_load<<rt->u.dst.rate_tokens)) {
828 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
829 rt->u.dst.rate_last = jiffies;
830 ++rt->u.dst.rate_tokens;
831 #ifdef CONFIG_IP_ROUTE_VERBOSE
832 if (IN_DEV_LOG_MARTIANS(in_dev) &&
833 rt->u.dst.rate_tokens == ip_rt_redirect_number && net_ratelimit())
834 printk(KERN_WARNING "host %08x/if%d ignores redirects for %08x to %08x.\n",
835 rt->rt_src, rt->rt_iif, rt->rt_dst, rt->rt_gateway);
836 #endif
838 out:
839 in_dev_put(in_dev);
842 static int ip_error(struct sk_buff *skb)
844 struct rtable *rt = (struct rtable*)skb->dst;
845 unsigned long now;
846 int code;
848 switch (rt->u.dst.error) {
849 case EINVAL:
850 default:
851 kfree_skb(skb);
852 return 0;
853 case EHOSTUNREACH:
854 code = ICMP_HOST_UNREACH;
855 break;
856 case ENETUNREACH:
857 code = ICMP_NET_UNREACH;
858 break;
859 case EACCES:
860 code = ICMP_PKT_FILTERED;
861 break;
864 now = jiffies;
865 if ((rt->u.dst.rate_tokens += (now - rt->u.dst.rate_last)) > ip_rt_error_burst)
866 rt->u.dst.rate_tokens = ip_rt_error_burst;
867 rt->u.dst.rate_last = now;
868 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
869 rt->u.dst.rate_tokens -= ip_rt_error_cost;
870 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
873 kfree_skb(skb);
874 return 0;
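/* The rate limiting above is a token bucket measured in jiffies:
 * rate_tokens accumulates the time elapsed since the previous error,
 * capped at ip_rt_error_burst (5*HZ), and each ICMP_DEST_UNREACH sent
 * costs ip_rt_error_cost (HZ).  With the defaults that allows a burst
 * of about five errors and a sustained rate of roughly one per second
 * per cached destination.
 */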
878 * The last two values are not from the RFC but
879 * are needed for AMPRnet AX.25 paths.
882 static unsigned short mtu_plateau[] =
883 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
885 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
887 int i;
889 for (i = 0; i < sizeof(mtu_plateau)/sizeof(mtu_plateau[0]); i++)
890 if (old_mtu > mtu_plateau[i])
891 return mtu_plateau[i];
892 return 68;
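/* A worked example of the plateau search above: an old_mtu of 1500
 * maps to the next lower plateau 1492 (IEEE 802.3), 1492 maps to 576,
 * and anything at or below 128 falls through to the minimum of 68.
 */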
895 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
897 int i;
898 unsigned short old_mtu = ntohs(iph->tot_len);
899 struct rtable *rth;
900 u32 skeys[2] = { iph->saddr, 0, };
901 u32 daddr = iph->daddr;
902 u8 tos = iph->tos & IPTOS_TOS_MASK;
903 unsigned short est_mtu = 0;
905 if (ipv4_config.no_pmtu_disc)
906 return 0;
908 for (i=0; i<2; i++) {
909 unsigned hash = rt_hash_code(daddr, skeys[i], tos);
911 read_lock(&rt_hash_lock);
912 for (rth = rt_hash_table[hash]; rth; rth = rth->u.rt_next) {
913 if (rth->key.dst == daddr &&
914 rth->key.src == skeys[i] &&
915 rth->rt_dst == daddr &&
916 rth->rt_src == iph->saddr &&
917 rth->key.tos == tos &&
918 rth->key.iif == 0 &&
919 !(rth->u.dst.mxlock&(1<<RTAX_MTU))) {
920 unsigned short mtu = new_mtu;
922 if (new_mtu < 68 || new_mtu >= old_mtu) {
924 /* BSD 4.2 compatibility hack :-( */
925 if (mtu == 0 && old_mtu >= rth->u.dst.pmtu &&
926 old_mtu >= 68 + (iph->ihl<<2))
927 old_mtu -= iph->ihl<<2;
929 mtu = guess_mtu(old_mtu);
931 if (mtu <= rth->u.dst.pmtu) {
932 if (mtu < rth->u.dst.pmtu) {
933 dst_confirm(&rth->u.dst);
934 if (mtu < ip_rt_min_pmtu) {
935 mtu = ip_rt_min_pmtu;
936 rth->u.dst.mxlock |= (1<<RTAX_MTU);
938 rth->u.dst.pmtu = mtu;
939 dst_set_expires(&rth->u.dst, ip_rt_mtu_expires);
941 est_mtu = mtu;
945 read_unlock(&rt_hash_lock);
947 return est_mtu ? : new_mtu;
950 void ip_rt_update_pmtu(struct dst_entry *dst, unsigned mtu)
952 if (dst->pmtu > mtu && mtu >= 68 &&
953 !(dst->mxlock&(1<<RTAX_MTU))) {
954 if (mtu < ip_rt_min_pmtu) {
955 mtu = ip_rt_min_pmtu;
956 dst->mxlock |= (1<<RTAX_MTU);
958 dst->pmtu = mtu;
959 dst_set_expires(dst, ip_rt_mtu_expires);
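/* Both paths above clamp a learned path MTU to ip_rt_min_pmtu
 * (512+20+20 = 552 by default) and lock the MTU metric when they do,
 * so a forged or bogus "fragmentation needed" cannot push the PMTU
 * below that floor.  The learned value is also given a lifetime of
 * ip_rt_mtu_expires (10 minutes); once the entry expires, a fresh
 * lookup picks up the device MTU again.
 */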
963 static struct dst_entry * ipv4_dst_check(struct dst_entry * dst, u32 cookie)
965 dst_release(dst);
966 return NULL;
969 static struct dst_entry * ipv4_dst_reroute(struct dst_entry * dst,
970 struct sk_buff *skb)
972 return NULL;
975 static void ipv4_link_failure(struct sk_buff *skb)
977 struct rtable *rt;
979 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
981 rt = (struct rtable *) skb->dst;
982 if (rt)
983 dst_set_expires(&rt->u.dst, 0);
986 static int ip_rt_bug(struct sk_buff *skb)
988 printk(KERN_DEBUG "ip_rt_bug: %08x -> %08x, %s\n", skb->nh.iph->saddr,
989 skb->nh.iph->daddr, skb->dev ? skb->dev->name : "?");
990 kfree_skb(skb);
991 return 0;
995 We do not cache the source address of the outgoing interface,
996 because it is used only by the IP RR, TS and SRR options,
997 so it is out of the fast path.
999 BTW remember: "addr" is allowed to be unaligned
1000 in IP options!
1003 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1005 u32 src;
1006 struct fib_result res;
1008 if (rt->key.iif == 0)
1009 src = rt->rt_src;
1010 else if (fib_lookup(&rt->key, &res) == 0) {
1011 #ifdef CONFIG_IP_ROUTE_NAT
1012 if (res.type == RTN_NAT)
1013 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway, RT_SCOPE_UNIVERSE);
1014 else
1015 #endif
1016 src = FIB_RES_PREFSRC(res);
1017 fib_res_put(&res);
1018 } else
1019 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway, RT_SCOPE_UNIVERSE);
1020 memcpy(addr, &src, 4);
1023 #ifdef CONFIG_NET_CLS_ROUTE
1024 static void set_class_tag(struct rtable *rt, u32 tag)
1026 if (!(rt->u.dst.tclassid&0xFFFF))
1027 rt->u.dst.tclassid |= tag&0xFFFF;
1028 if (!(rt->u.dst.tclassid&0xFFFF0000))
1029 rt->u.dst.tclassid |= tag&0xFFFF0000;
1031 #endif
1033 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1035 struct fib_info *fi = res->fi;
1037 if (fi) {
1038 if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1039 rt->rt_gateway = FIB_RES_GW(*res);
1040 memcpy(&rt->u.dst.mxlock, fi->fib_metrics, sizeof(fi->fib_metrics));
1041 if (fi->fib_mtu == 0) {
1042 rt->u.dst.pmtu = rt->u.dst.dev->mtu;
1043 if (rt->u.dst.pmtu > IP_MAX_MTU)
1044 rt->u.dst.pmtu = IP_MAX_MTU;
1045 if (rt->u.dst.mxlock&(1<<RTAX_MTU) &&
1046 rt->rt_gateway != rt->rt_dst &&
1047 rt->u.dst.pmtu > 576)
1048 rt->u.dst.pmtu = 576;
1050 #ifdef CONFIG_NET_CLS_ROUTE
1051 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1052 #endif
1053 } else {
1054 rt->u.dst.pmtu = rt->u.dst.dev->mtu;
1055 if (rt->u.dst.pmtu > IP_MAX_MTU)
1056 rt->u.dst.pmtu = IP_MAX_MTU;
1058 if (rt->u.dst.advmss == 0)
1059 rt->u.dst.advmss = max(rt->u.dst.dev->mtu-40, ip_rt_min_advmss);
1060 if (rt->u.dst.advmss > 65535-40)
1061 rt->u.dst.advmss = 65535-40;
1063 #ifdef CONFIG_NET_CLS_ROUTE
1064 #ifdef CONFIG_IP_MULTIPLE_TABLES
1065 set_class_tag(rt, fib_rules_tclass(res));
1066 #endif
1067 set_class_tag(rt, itag);
1068 #endif
1069 rt->rt_type = res->type;
1072 static int
1073 ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
1074 u8 tos, struct net_device *dev, int our)
1076 unsigned hash;
1077 struct rtable *rth;
1078 u32 spec_dst;
1079 struct in_device *in_dev = in_dev_get(dev);
1080 u32 itag = 0;
1082 /* Primary sanity checks. */
1084 if (in_dev == NULL)
1085 return -EINVAL;
1087 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1088 skb->protocol != __constant_htons(ETH_P_IP))
1089 goto e_inval;
1091 if (ZERONET(saddr)) {
1092 if (!LOCAL_MCAST(daddr))
1093 goto e_inval;
1094 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1095 } else if (fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, &itag) < 0)
1096 goto e_inval;
1098 rth = dst_alloc(&ipv4_dst_ops);
1099 if (!rth)
1100 goto e_nobufs;
1102 rth->u.dst.output= ip_rt_bug;
1104 atomic_set(&rth->u.dst.__refcnt, 1);
1105 rth->key.dst = daddr;
1106 rth->rt_dst = daddr;
1107 rth->key.tos = tos;
1108 #ifdef CONFIG_IP_ROUTE_FWMARK
1109 if (skb->nfreason == NF_REASON_FOR_ROUTING)
1110 rth->key.fwmark = skb->nfmark;
1111 else
1112 rth->key.fwmark = 0;
1113 #endif
1114 rth->key.src = saddr;
1115 rth->rt_src = saddr;
1116 #ifdef CONFIG_IP_ROUTE_NAT
1117 rth->rt_dst_map = daddr;
1118 rth->rt_src_map = saddr;
1119 #endif
1120 #ifdef CONFIG_NET_CLS_ROUTE
1121 rth->u.dst.tclassid = itag;
1122 #endif
1123 rth->rt_iif =
1124 rth->key.iif = dev->ifindex;
1125 rth->u.dst.dev = &loopback_dev;
1126 dev_hold(rth->u.dst.dev);
1127 rth->key.oif = 0;
1128 rth->rt_gateway = daddr;
1129 rth->rt_spec_dst= spec_dst;
1130 rth->rt_type = RTN_MULTICAST;
1131 rth->rt_flags = RTCF_MULTICAST;
1132 if (our) {
1133 rth->u.dst.input= ip_local_deliver;
1134 rth->rt_flags |= RTCF_LOCAL;
1137 #ifdef CONFIG_IP_MROUTE
1138 if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1139 rth->u.dst.input = ip_mr_input;
1140 #endif
1142 in_dev_put(in_dev);
1143 hash = rt_hash_code(daddr, saddr^(dev->ifindex<<5), tos);
1144 return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1146 e_nobufs:
1147 in_dev_put(in_dev);
1148 return -ENOBUFS;
1150 e_inval:
1151 in_dev_put(in_dev);
1152 return -EINVAL;
1156 * NOTE. We drop all packets that have local source
1157 * addresses, because every properly looped-back packet
1158 * must already have the correct destination attached by the output routine.
1160 * This approach solves two big problems:
1161 * 1. Non-simplex devices are handled properly.
1162 * 2. IP spoofing attempts are filtered with a 100% guarantee.
1165 int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1166 u8 tos, struct net_device *dev)
1168 struct rt_key key;
1169 struct fib_result res;
1170 struct in_device *in_dev = in_dev_get(dev);
1171 struct in_device *out_dev = NULL;
1172 unsigned flags = 0;
1173 u32 itag = 0;
1174 struct rtable * rth;
1175 unsigned hash;
1176 u32 spec_dst;
1177 int err = -EINVAL;
1178 int free_res = 0;
1181 * IP on this device is disabled.
1184 if (!in_dev)
1185 return -EINVAL;
1187 key.dst = daddr;
1188 key.src = saddr;
1189 key.tos = tos;
1190 #ifdef CONFIG_IP_ROUTE_FWMARK
1191 if (skb->nfreason == NF_REASON_FOR_ROUTING)
1192 key.fwmark = skb->nfmark;
1193 else
1194 key.fwmark = 0;
1195 #endif
1196 key.iif = dev->ifindex;
1197 key.oif = 0;
1198 key.scope = RT_SCOPE_UNIVERSE;
1200 hash = rt_hash_code(daddr, saddr^(key.iif<<5), tos);
1202 /* Check for the weirdest martians, which cannot be detected
1203 by fib_lookup.
1206 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1207 goto martian_source;
1209 if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
1210 goto brd_input;
1212 /* Accept zero addresses only for the limited broadcast;
1213 * I do not even know whether to fix this or not. Waiting for complaints :-)
1215 if (ZERONET(saddr))
1216 goto martian_source;
1218 if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1219 goto martian_destination;
1222 * Now we are ready to route packet.
1224 if ((err = fib_lookup(&key, &res)) != 0) {
1225 if (!IN_DEV_FORWARD(in_dev))
1226 goto e_inval;
1227 goto no_route;
1229 free_res = 1;
1231 #ifdef CONFIG_IP_ROUTE_NAT
1232 /* Policy is applied before mapping the destination,
1233 but rerouting after the map should be done with the old source.
1236 if (1) {
1237 u32 src_map = saddr;
1238 if (res.r)
1239 src_map = fib_rules_policy(saddr, &res, &flags);
1241 if (res.type == RTN_NAT) {
1242 key.dst = fib_rules_map_destination(daddr, &res);
1243 fib_res_put(&res);
1244 free_res = 0;
1245 if (fib_lookup(&key, &res))
1246 goto e_inval;
1247 free_res = 1;
1248 if (res.type != RTN_UNICAST)
1249 goto e_inval;
1250 flags |= RTCF_DNAT;
1252 key.src = src_map;
1254 #endif
1256 if (res.type == RTN_BROADCAST)
1257 goto brd_input;
1259 if (res.type == RTN_LOCAL) {
1260 int result;
1261 result = fib_validate_source(saddr, daddr, tos, loopback_dev.ifindex,
1262 dev, &spec_dst, &itag);
1263 if (result < 0)
1264 goto martian_source;
1265 if (result)
1266 flags |= RTCF_DIRECTSRC;
1267 spec_dst = daddr;
1268 goto local_input;
1271 if (!IN_DEV_FORWARD(in_dev))
1272 goto e_inval;
1273 if (res.type != RTN_UNICAST)
1274 goto martian_destination;
1276 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1277 if (res.fi->fib_nhs > 1 && key.oif == 0)
1278 fib_select_multipath(&key, &res);
1279 #endif
1280 out_dev = in_dev_get(FIB_RES_DEV(res));
1281 if (out_dev == NULL) {
1282 if (net_ratelimit())
1283 printk(KERN_CRIT "Bug in ip_route_input_slow(). Please, report\n");
1284 goto e_inval;
1287 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(res), dev, &spec_dst, &itag);
1288 if (err < 0)
1289 goto martian_source;
1291 if (err)
1292 flags |= RTCF_DIRECTSRC;
1294 if (out_dev == in_dev && err && !(flags&(RTCF_NAT|RTCF_MASQ)) &&
1295 (IN_DEV_SHARED_MEDIA(out_dev)
1296 || inet_addr_onlink(out_dev, saddr, FIB_RES_GW(res))))
1297 flags |= RTCF_DOREDIRECT;
1299 if (skb->protocol != __constant_htons(ETH_P_IP)) {
1300 /* Not IP (i.e. ARP). Do not create a route if it is
1301 * invalid for proxy ARP. DNAT routes are always valid.
1303 if (out_dev == in_dev && !(flags&RTCF_DNAT))
1304 goto e_inval;
1307 rth = dst_alloc(&ipv4_dst_ops);
1308 if (!rth)
1309 goto e_nobufs;
1311 atomic_set(&rth->u.dst.__refcnt, 1);
1312 rth->key.dst = daddr;
1313 rth->rt_dst = daddr;
1314 rth->key.tos = tos;
1315 #ifdef CONFIG_IP_ROUTE_FWMARK
1316 if (skb->nfreason == NF_REASON_FOR_ROUTING)
1317 rth->key.fwmark = skb->nfmark;
1318 else
1319 rth->key.fwmark = 0;
1320 #endif
1321 rth->key.src = saddr;
1322 rth->rt_src = saddr;
1323 rth->rt_gateway = daddr;
1324 #ifdef CONFIG_IP_ROUTE_NAT
1325 rth->rt_src_map = key.src;
1326 rth->rt_dst_map = key.dst;
1327 if (flags&RTCF_DNAT)
1328 rth->rt_gateway = key.dst;
1329 #endif
1330 rth->rt_iif =
1331 rth->key.iif = dev->ifindex;
1332 rth->u.dst.dev = out_dev->dev;
1333 dev_hold(rth->u.dst.dev);
1334 rth->key.oif = 0;
1335 rth->rt_spec_dst= spec_dst;
1337 rth->u.dst.input = ip_forward;
1338 rth->u.dst.output = ip_output;
1340 rt_set_nexthop(rth, &res, itag);
1342 rth->rt_flags = flags;
1344 #ifdef CONFIG_NET_FASTROUTE
1345 if (netdev_fastroute && !(flags&(RTCF_NAT|RTCF_MASQ|RTCF_DOREDIRECT))) {
1346 struct net_device *odev = rth->u.dst.dev;
1347 if (odev != dev &&
1348 dev->accept_fastpath &&
1349 odev->mtu >= dev->mtu &&
1350 dev->accept_fastpath(dev, &rth->u.dst) == 0)
1351 rth->rt_flags |= RTCF_FAST;
1353 #endif
1355 intern:
1356 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1357 done:
1358 in_dev_put(in_dev);
1359 if (out_dev)
1360 in_dev_put(out_dev);
1361 if (free_res)
1362 fib_res_put(&res);
1363 return err;
1365 brd_input:
1366 if (skb->protocol != __constant_htons(ETH_P_IP))
1367 goto e_inval;
1369 if (ZERONET(saddr)) {
1370 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1371 } else {
1372 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, &itag);
1373 if (err < 0)
1374 goto martian_source;
1375 if (err)
1376 flags |= RTCF_DIRECTSRC;
1378 flags |= RTCF_BROADCAST;
1379 res.type = RTN_BROADCAST;
1381 local_input:
1382 rth = dst_alloc(&ipv4_dst_ops);
1383 if (!rth)
1384 goto e_nobufs;
1386 rth->u.dst.output= ip_rt_bug;
1388 atomic_set(&rth->u.dst.__refcnt, 1);
1389 rth->key.dst = daddr;
1390 rth->rt_dst = daddr;
1391 rth->key.tos = tos;
1392 #ifdef CONFIG_IP_ROUTE_FWMARK
1393 if (skb->nfreason == NF_REASON_FOR_ROUTING)
1394 rth->key.fwmark = skb->nfmark;
1395 else
1396 rth->key.fwmark = 0;
1397 #endif
1398 rth->key.src = saddr;
1399 rth->rt_src = saddr;
1400 #ifdef CONFIG_IP_ROUTE_NAT
1401 rth->rt_dst_map = key.dst;
1402 rth->rt_src_map = key.src;
1403 #endif
1404 #ifdef CONFIG_NET_CLS_ROUTE
1405 rth->u.dst.tclassid = itag;
1406 #endif
1407 rth->rt_iif =
1408 rth->key.iif = dev->ifindex;
1409 rth->u.dst.dev = &loopback_dev;
1410 dev_hold(rth->u.dst.dev);
1411 rth->key.oif = 0;
1412 rth->rt_gateway = daddr;
1413 rth->rt_spec_dst= spec_dst;
1414 rth->u.dst.input= ip_local_deliver;
1415 rth->rt_flags = flags|RTCF_LOCAL;
1416 if (res.type == RTN_UNREACHABLE) {
1417 rth->u.dst.input= ip_error;
1418 rth->u.dst.error= -err;
1419 rth->rt_flags &= ~RTCF_LOCAL;
1421 rth->rt_type = res.type;
1422 goto intern;
1424 no_route:
1425 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
1426 res.type = RTN_UNREACHABLE;
1427 goto local_input;
1430 * Do not cache martian addresses: they should be logged (RFC1812)
1432 martian_destination:
1433 #ifdef CONFIG_IP_ROUTE_VERBOSE
1434 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1435 printk(KERN_WARNING "martian destination %08x from %08x, dev %s\n", daddr, saddr, dev->name);
1436 #endif
1437 e_inval:
1438 err = -EINVAL;
1439 goto done;
1441 e_nobufs:
1442 err = -ENOBUFS;
1443 goto done;
1445 martian_source:
1446 #ifdef CONFIG_IP_ROUTE_VERBOSE
1447 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1449 * RFC1812 recommendation: if the source is martian,
1450 * the only hint is the MAC header.
1452 printk(KERN_WARNING "martian source %08x for %08x, dev %s\n", saddr, daddr, dev->name);
1453 if (dev->hard_header_len) {
1454 int i;
1455 unsigned char *p = skb->mac.raw;
1456 printk(KERN_WARNING "ll header:");
1457 for (i=0; i<dev->hard_header_len; i++, p++)
1458 printk(" %02x", *p);
1459 printk("\n");
1462 #endif
1463 goto e_inval;
1466 int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
1467 u8 tos, struct net_device *dev)
1469 struct rtable * rth;
1470 unsigned hash;
1471 int iif = dev->ifindex;
1473 tos &= IPTOS_TOS_MASK;
1474 hash = rt_hash_code(daddr, saddr^(iif<<5), tos);
1476 read_lock_bh(&rt_hash_lock);
1477 for (rth=rt_hash_table[hash]; rth; rth=rth->u.rt_next) {
1478 if (rth->key.dst == daddr &&
1479 rth->key.src == saddr &&
1480 rth->key.iif == iif &&
1481 rth->key.oif == 0 &&
1482 #ifdef CONFIG_IP_ROUTE_FWMARK
1483 rth->key.fwmark
1484 == (skb->nfreason == NF_REASON_FOR_ROUTING
1485 ? skb->nfmark : 0) &&
1486 #endif
1487 rth->key.tos == tos) {
1488 rth->u.dst.lastuse = jiffies;
1489 dst_hold(&rth->u.dst);
1490 rth->u.dst.__use++;
1491 read_unlock_bh(&rt_hash_lock);
1492 skb->dst = (struct dst_entry*)rth;
1493 return 0;
1496 read_unlock_bh(&rt_hash_lock);
1498 /* Multicast recognition logic was moved from the route cache to here.
1499 The problem was that too many Ethernet cards have broken/missing
1500 hardware multicast filters :-( As a result, a host on a multicast
1501 network acquires a lot of useless route cache entries, e.g. from
1502 SDR messages from all over the world. Now we try to get rid of them.
1503 Really, provided the software IP multicast filter is organized
1504 reasonably (at least, hashed), it does not result in a slowdown
1505 compared with route cache reject entries.
1506 Note that multicast routers are not affected, because
1507 a route cache entry is created for them eventually.
1509 if (MULTICAST(daddr)) {
1510 struct in_device *in_dev;
1512 read_lock(&inetdev_lock);
1513 if ((in_dev = __in_dev_get(dev)) != NULL) {
1514 int our = ip_check_mc(in_dev, daddr);
1515 if (our
1516 #ifdef CONFIG_IP_MROUTE
1517 || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1518 #endif
1520 read_unlock(&inetdev_lock);
1521 return ip_route_input_mc(skb, daddr, saddr, tos, dev, our);
1524 read_unlock(&inetdev_lock);
1525 return -EINVAL;
1527 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
1531 * Major route resolver routine.
1534 int ip_route_output_slow(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int oif)
1536 struct rt_key key;
1537 struct fib_result res;
1538 unsigned flags = 0;
1539 struct rtable *rth;
1540 struct net_device *dev_out = NULL;
1541 unsigned hash;
1542 int free_res = 0;
1543 int err;
1545 tos &= IPTOS_TOS_MASK|RTO_ONLINK;
1546 key.dst = daddr;
1547 key.src = saddr;
1548 key.tos = tos&IPTOS_TOS_MASK;
1549 key.iif = loopback_dev.ifindex;
1550 key.oif = oif;
1551 key.scope = (tos&RTO_ONLINK) ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE;
1552 res.fi = NULL;
1553 #ifdef CONFIG_IP_MULTIPLE_TABLES
1554 res.r = NULL;
1555 #endif
1557 if (saddr) {
1558 if (MULTICAST(saddr) || BADCLASS(saddr) || ZERONET(saddr))
1559 return -EINVAL;
1561 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1562 dev_out = ip_dev_find(saddr);
1563 if (dev_out == NULL)
1564 return -EINVAL;
1566 /* I removed the check for oif == dev_out->oif here.
1567 It was wrong for two reasons:
1568 1. ip_dev_find(saddr) can return the wrong iface if saddr is
1569 assigned to multiple interfaces.
1570 2. Moreover, we are allowed to send packets with the saddr
1571 of another iface. --ANK
1574 if (oif == 0 &&
1575 (MULTICAST(daddr) || daddr == 0xFFFFFFFF)) {
1576 /* Special hack: the user can direct multicasts
1577 and limited broadcasts via the necessary interface
1578 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
1579 This hack is not just for fun, it allows
1580 vic, vat and friends to work.
1581 They bind the socket to loopback, set the ttl to zero
1582 and expect that it will work.
1583 From the viewpoint of the routing cache they are broken,
1584 because we are not allowed to build a multicast path
1585 with a loopback source addr (look, the routing cache
1586 cannot know that the ttl is zero, so that the packet
1587 will not leave this host and the route is in fact valid).
1588 Luckily, this hack is a good workaround.
1591 key.oif = dev_out->ifindex;
1592 goto make_route;
1594 if (dev_out)
1595 dev_put(dev_out);
1596 dev_out = NULL;
1598 if (oif) {
1599 dev_out = dev_get_by_index(oif);
1600 if (dev_out == NULL)
1601 return -ENODEV;
1602 if (__in_dev_get(dev_out) == NULL) {
1603 dev_put(dev_out);
1604 return -ENODEV; /* Wrong error code */
1607 if (LOCAL_MCAST(daddr) || daddr == 0xFFFFFFFF) {
1608 if (!key.src)
1609 key.src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK);
1610 goto make_route;
1612 if (!key.src) {
1613 if (MULTICAST(daddr))
1614 key.src = inet_select_addr(dev_out, 0, key.scope);
1615 else if (!daddr)
1616 key.src = inet_select_addr(dev_out, 0, RT_SCOPE_HOST);
1620 if (!key.dst) {
1621 key.dst = key.src;
1622 if (!key.dst)
1623 key.dst = key.src = htonl(INADDR_LOOPBACK);
1624 if (dev_out)
1625 dev_put(dev_out);
1626 dev_out = &loopback_dev;
1627 dev_hold(dev_out);
1628 key.oif = loopback_dev.ifindex;
1629 res.type = RTN_LOCAL;
1630 flags |= RTCF_LOCAL;
1631 goto make_route;
1634 if (fib_lookup(&key, &res)) {
1635 res.fi = NULL;
1636 if (oif) {
1637 /* Apparently, the routing tables are wrong. Assume
1638 that the destination is on link.
1640 WHY? DW.
1641 Because we are allowed to send to an iface
1642 even if it has NO routes and NO assigned
1643 addresses. When oif is specified, the routing
1644 tables are looked up with only one purpose:
1645 to check whether the destination is gatewayed, rather than
1646 direct. Moreover, if MSG_DONTROUTE is set,
1647 we send the packet, ignoring both routing tables
1648 and ifaddr state. --ANK
1651 We could do this even if oif is unknown
1652 (as IPv6 likely does), but we do not.
1655 if (key.src == 0)
1656 key.src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK);
1657 res.type = RTN_UNICAST;
1658 goto make_route;
1660 if (dev_out)
1661 dev_put(dev_out);
1662 return -ENETUNREACH;
1664 free_res = 1;
1666 if (res.type == RTN_NAT)
1667 goto e_inval;
1669 if (res.type == RTN_LOCAL) {
1670 if (!key.src)
1671 key.src = key.dst;
1672 if (dev_out)
1673 dev_put(dev_out);
1674 dev_out = &loopback_dev;
1675 dev_hold(dev_out);
1676 key.oif = dev_out->ifindex;
1677 if (res.fi)
1678 fib_info_put(res.fi);
1679 res.fi = NULL;
1680 flags |= RTCF_LOCAL;
1681 goto make_route;
1684 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1685 if (res.fi->fib_nhs > 1 && key.oif == 0)
1686 fib_select_multipath(&key, &res);
1687 else
1688 #endif
1689 if (res.prefixlen==0 && res.type == RTN_UNICAST && key.oif == 0)
1690 fib_select_default(&key, &res);
1692 if (!key.src)
1693 key.src = FIB_RES_PREFSRC(res);
1695 if (dev_out)
1696 dev_put(dev_out);
1697 dev_out = FIB_RES_DEV(res);
1698 dev_hold(dev_out);
1699 key.oif = dev_out->ifindex;
1701 make_route:
1702 if (LOOPBACK(key.src) && !(dev_out->flags&IFF_LOOPBACK))
1703 goto e_inval;
1705 if (key.dst == 0xFFFFFFFF)
1706 res.type = RTN_BROADCAST;
1707 else if (MULTICAST(key.dst))
1708 res.type = RTN_MULTICAST;
1709 else if (BADCLASS(key.dst) || ZERONET(key.dst))
1710 goto e_inval;
1712 if (dev_out->flags&IFF_LOOPBACK)
1713 flags |= RTCF_LOCAL;
1715 if (res.type == RTN_BROADCAST) {
1716 flags |= RTCF_BROADCAST|RTCF_LOCAL;
1717 if (res.fi) {
1718 fib_info_put(res.fi);
1719 res.fi = NULL;
1721 } else if (res.type == RTN_MULTICAST) {
1722 flags |= RTCF_MULTICAST|RTCF_LOCAL;
1723 read_lock(&inetdev_lock);
1724 if (!__in_dev_get(dev_out) || !ip_check_mc(__in_dev_get(dev_out), daddr))
1725 flags &= ~RTCF_LOCAL;
1726 read_unlock(&inetdev_lock);
1727 /* If a multicast route does not exist, use the
1728 default one, but do not gateway in this case.
1729 Yes, it is a hack.
1731 if (res.fi && res.prefixlen < 4) {
1732 fib_info_put(res.fi);
1733 res.fi = NULL;
1737 rth = dst_alloc(&ipv4_dst_ops);
1738 if (!rth)
1739 goto e_nobufs;
1741 atomic_set(&rth->u.dst.__refcnt, 1);
1742 rth->key.dst = daddr;
1743 rth->key.tos = tos;
1744 rth->key.src = saddr;
1745 rth->key.iif = 0;
1746 rth->key.oif = oif;
1747 rth->rt_dst = key.dst;
1748 rth->rt_src = key.src;
1749 #ifdef CONFIG_IP_ROUTE_NAT
1750 rth->rt_dst_map = key.dst;
1751 rth->rt_src_map = key.src;
1752 #endif
1753 rth->rt_iif = oif ? : dev_out->ifindex;
1754 rth->u.dst.dev = dev_out;
1755 dev_hold(dev_out);
1756 rth->rt_gateway = key.dst;
1757 rth->rt_spec_dst= key.src;
1759 rth->u.dst.output=ip_output;
1761 if (flags&RTCF_LOCAL) {
1762 rth->u.dst.input = ip_local_deliver;
1763 rth->rt_spec_dst = key.dst;
1765 if (flags&(RTCF_BROADCAST|RTCF_MULTICAST)) {
1766 rth->rt_spec_dst = key.src;
1767 if (flags&RTCF_LOCAL && !(dev_out->flags&IFF_LOOPBACK))
1768 rth->u.dst.output = ip_mc_output;
1769 #ifdef CONFIG_IP_MROUTE
1770 if (res.type == RTN_MULTICAST) {
1771 struct in_device *in_dev = in_dev_get(dev_out);
1772 if (in_dev) {
1773 if (IN_DEV_MFORWARD(in_dev) && !LOCAL_MCAST(daddr)) {
1774 rth->u.dst.input = ip_mr_input;
1775 rth->u.dst.output = ip_mc_output;
1777 in_dev_put(in_dev);
1780 #endif
1783 rt_set_nexthop(rth, &res, 0);
1785 rth->rt_flags = flags;
1787 hash = rt_hash_code(daddr, saddr^(oif<<5), tos);
1788 err = rt_intern_hash(hash, rth, rp);
1789 done:
1790 if (free_res)
1791 fib_res_put(&res);
1792 if (dev_out)
1793 dev_put(dev_out);
1794 return err;
1796 e_inval:
1797 err = -EINVAL;
1798 goto done;
1799 e_nobufs:
1800 err = -ENOBUFS;
1801 goto done;
1804 int ip_route_output(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int oif)
1806 unsigned hash;
1807 struct rtable *rth;
1809 hash = rt_hash_code(daddr, saddr^(oif<<5), tos);
1811 read_lock_bh(&rt_hash_lock);
1812 for (rth=rt_hash_table[hash]; rth; rth=rth->u.rt_next) {
1813 if (rth->key.dst == daddr &&
1814 rth->key.src == saddr &&
1815 rth->key.iif == 0 &&
1816 rth->key.oif == oif &&
1817 !((rth->key.tos^tos)&(IPTOS_TOS_MASK|RTO_ONLINK)) &&
1818 ((tos&RTO_TPROXY) || !(rth->rt_flags&RTCF_TPROXY))
1820 rth->u.dst.lastuse = jiffies;
1821 dst_hold(&rth->u.dst);
1822 rth->u.dst.__use++;
1823 read_unlock_bh(&rt_hash_lock);
1824 *rp = rth;
1825 return 0;
1828 read_unlock_bh(&rt_hash_lock);
1830 return ip_route_output_slow(rp, daddr, saddr, tos, oif);
1833 #ifdef CONFIG_RTNETLINK
1835 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, int nowait)
1837 struct rtable *rt = (struct rtable*)skb->dst;
1838 struct rtmsg *r;
1839 struct nlmsghdr *nlh;
1840 unsigned char *b = skb->tail;
1841 struct rta_cacheinfo ci;
1842 #ifdef CONFIG_IP_MROUTE
1843 struct rtattr *eptr;
1844 #endif
1846 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
1847 r = NLMSG_DATA(nlh);
1848 nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
1849 r->rtm_family = AF_INET;
1850 r->rtm_dst_len = 32;
1851 r->rtm_src_len = 0;
1852 r->rtm_tos = rt->key.tos;
1853 r->rtm_table = RT_TABLE_MAIN;
1854 r->rtm_type = rt->rt_type;
1855 r->rtm_scope = RT_SCOPE_UNIVERSE;
1856 r->rtm_protocol = RTPROT_UNSPEC;
1857 r->rtm_flags = (rt->rt_flags&~0xFFFF) | RTM_F_CLONED;
1858 if (rt->rt_flags & RTCF_NOTIFY)
1859 r->rtm_flags |= RTM_F_NOTIFY;
1860 RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
1861 if (rt->key.src) {
1862 r->rtm_src_len = 32;
1863 RTA_PUT(skb, RTA_SRC, 4, &rt->key.src);
1865 if (rt->u.dst.dev)
1866 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
1867 #ifdef CONFIG_NET_CLS_ROUTE
1868 if (rt->u.dst.tclassid)
1869 RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
1870 #endif
1871 if (rt->key.iif)
1872 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
1873 else if (rt->rt_src != rt->key.src)
1874 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
1875 if (rt->rt_dst != rt->rt_gateway)
1876 RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
1877 if (rtnetlink_put_metrics(skb, &rt->u.dst.mxlock) < 0)
1878 goto rtattr_failure;
1879 ci.rta_lastuse = jiffies - rt->u.dst.lastuse;
1880 ci.rta_used = rt->u.dst.__use;
1881 ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
1882 if (rt->u.dst.expires)
1883 ci.rta_expires = rt->u.dst.expires - jiffies;
1884 else
1885 ci.rta_expires = 0;
1886 ci.rta_error = rt->u.dst.error;
1887 #ifdef CONFIG_IP_MROUTE
1888 eptr = (struct rtattr*)skb->tail;
1889 #endif
1890 RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
1891 if (rt->key.iif) {
1892 #ifdef CONFIG_IP_MROUTE
1893 u32 dst = rt->rt_dst;
1895 if (MULTICAST(dst) && !LOCAL_MCAST(dst) && ipv4_devconf.mc_forwarding) {
1896 int err = ipmr_get_route(skb, r, nowait);
1897 if (err <= 0) {
1898 if (!nowait) {
1899 if (err == 0)
1900 return 0;
1901 goto nlmsg_failure;
1902 } else {
1903 if (err == -EMSGSIZE)
1904 goto nlmsg_failure;
1905 ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
1908 } else
1909 #endif
1911 RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->key.iif);
1915 nlh->nlmsg_len = skb->tail - b;
1916 return skb->len;
1918 nlmsg_failure:
1919 rtattr_failure:
1920 skb_trim(skb, b - skb->data);
1921 return -1;
1924 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
1926 struct rtattr **rta = arg;
1927 struct rtmsg *rtm = NLMSG_DATA(nlh);
1928 struct rtable *rt = NULL;
1929 u32 dst = 0;
1930 u32 src = 0;
1931 int iif = 0;
1932 int err;
1933 struct sk_buff *skb;
1935 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1936 if (skb == NULL)
1937 return -ENOBUFS;
1939 /* Reserve room for dummy headers; this skb can pass
1940 through a good chunk of the routing engine.
1942 skb->mac.raw = skb->data;
1943 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
1945 if (rta[RTA_SRC-1])
1946 memcpy(&src, RTA_DATA(rta[RTA_SRC-1]), 4);
1947 if (rta[RTA_DST-1])
1948 memcpy(&dst, RTA_DATA(rta[RTA_DST-1]), 4);
1949 if (rta[RTA_IIF-1])
1950 memcpy(&iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int));
1952 if (iif) {
1953 struct net_device *dev;
1954 dev = __dev_get_by_index(iif);
1955 if (!dev)
1956 return -ENODEV;
1957 skb->protocol = __constant_htons(ETH_P_IP);
1958 skb->dev = dev;
1959 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
1960 rt = (struct rtable*)skb->dst;
1961 if (!err && rt->u.dst.error)
1962 err = -rt->u.dst.error;
1963 } else {
1964 int oif = 0;
1965 if (rta[RTA_OIF-1])
1966 memcpy(&oif, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1967 err = ip_route_output(&rt, dst, src, rtm->rtm_tos, oif);
1969 if (err) {
1970 kfree_skb(skb);
1971 return err;
1974 skb->dst = &rt->u.dst;
1975 if (rtm->rtm_flags & RTM_F_NOTIFY)
1976 rt->rt_flags |= RTCF_NOTIFY;
1978 NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
1980 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, RTM_NEWROUTE, 0);
1981 if (err == 0)
1982 return 0;
1983 if (err < 0)
1984 return -EMSGSIZE;
1986 err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
1987 if (err < 0)
1988 return err;
1989 return 0;
1993 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
1995 struct rtable *rt;
1996 int h, s_h;
1997 int idx, s_idx;
1999 s_h = cb->args[0];
2000 s_idx = idx = cb->args[1];
2001 for (h=0; h < RT_HASH_DIVISOR; h++) {
2002 if (h < s_h) continue;
2003 if (h > s_h)
2004 s_idx = 0;
2005 read_lock_bh(&rt_hash_lock);
2006 for (rt = rt_hash_table[h], idx = 0; rt; rt = rt->u.rt_next, idx++) {
2007 if (idx < s_idx)
2008 continue;
2009 skb->dst = dst_clone(&rt->u.dst);
2010 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2011 cb->nlh->nlmsg_seq, RTM_NEWROUTE, 1) <= 0) {
2012 dst_release(xchg(&skb->dst, NULL));
2013 read_unlock_bh(&rt_hash_lock);
2014 goto done;
2016 dst_release(xchg(&skb->dst, NULL));
2018 read_unlock_bh(&rt_hash_lock);
2021 done:
2022 cb->args[0] = h;
2023 cb->args[1] = idx;
2024 return skb->len;
2027 #endif /* CONFIG_RTNETLINK */
2029 void ip_rt_multicast_event(struct in_device *in_dev)
2031 rt_cache_flush(0);
2036 #ifdef CONFIG_SYSCTL
2038 static int flush_delay;
2040 static
2041 int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2042 void *buffer, size_t *lenp)
2044 if (write) {
2045 proc_dointvec(ctl, write, filp, buffer, lenp);
2046 rt_cache_flush(flush_delay);
2047 return 0;
2048 } else
2049 return -EINVAL;
2052 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, int *name, int nlen,
2053 void *oldval, size_t *oldlenp,
2054 void *newval, size_t newlen,
2055 void **context)
2057 int delay;
2058 if (newlen != sizeof(int))
2059 return -EINVAL;
2060 if (get_user(delay,(int *)newval))
2061 return -EFAULT;
2062 rt_cache_flush(delay);
2063 return 0;
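/* The two handlers above back the "flush" entry in the table below:
 * writing an integer to it (it should appear as
 * /proc/sys/net/ipv4/route/flush) calls rt_cache_flush() with that
 * value as the delay, a negative value meaning "use ip_rt_min_delay";
 * reading it returns EINVAL.  The remaining entries are plain integer
 * or jiffies-valued tuning knobs handled by proc_dointvec and
 * proc_dointvec_jiffies.
 */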
2066 ctl_table ipv4_route_table[] = {
2067 {NET_IPV4_ROUTE_FLUSH, "flush",
2068 &flush_delay, sizeof(int), 0644, NULL,
2069 &ipv4_sysctl_rtcache_flush, &ipv4_sysctl_rtcache_flush_strategy },
2070 {NET_IPV4_ROUTE_MIN_DELAY, "min_delay",
2071 &ip_rt_min_delay, sizeof(int), 0644, NULL,
2072 &proc_dointvec_jiffies, &sysctl_jiffies},
2073 {NET_IPV4_ROUTE_MAX_DELAY, "max_delay",
2074 &ip_rt_max_delay, sizeof(int), 0644, NULL,
2075 &proc_dointvec_jiffies, &sysctl_jiffies},
2076 {NET_IPV4_ROUTE_GC_THRESH, "gc_thresh",
2077 &ipv4_dst_ops.gc_thresh, sizeof(int), 0644, NULL,
2078 &proc_dointvec},
2079 {NET_IPV4_ROUTE_MAX_SIZE, "max_size",
2080 &ip_rt_max_size, sizeof(int), 0644, NULL,
2081 &proc_dointvec},
2082 {NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval",
2083 &ip_rt_gc_min_interval, sizeof(int), 0644, NULL,
2084 &proc_dointvec_jiffies, &sysctl_jiffies},
2085 {NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout",
2086 &ip_rt_gc_timeout, sizeof(int), 0644, NULL,
2087 &proc_dointvec_jiffies, &sysctl_jiffies},
2088 {NET_IPV4_ROUTE_GC_INTERVAL, "gc_interval",
2089 &ip_rt_gc_interval, sizeof(int), 0644, NULL,
2090 &proc_dointvec_jiffies, &sysctl_jiffies},
2091 {NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load",
2092 &ip_rt_redirect_load, sizeof(int), 0644, NULL,
2093 &proc_dointvec},
2094 {NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number",
2095 &ip_rt_redirect_number, sizeof(int), 0644, NULL,
2096 &proc_dointvec},
2097 {NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence",
2098 &ip_rt_redirect_silence, sizeof(int), 0644, NULL,
2099 &proc_dointvec},
2100 {NET_IPV4_ROUTE_ERROR_COST, "error_cost",
2101 &ip_rt_error_cost, sizeof(int), 0644, NULL,
2102 &proc_dointvec},
2103 {NET_IPV4_ROUTE_ERROR_BURST, "error_burst",
2104 &ip_rt_error_burst, sizeof(int), 0644, NULL,
2105 &proc_dointvec},
2106 {NET_IPV4_ROUTE_GC_ELASTICITY, "gc_elasticity",
2107 &ip_rt_gc_elasticity, sizeof(int), 0644, NULL,
2108 &proc_dointvec},
2109 {NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires",
2110 &ip_rt_mtu_expires, sizeof(int), 0644, NULL,
2111 &proc_dointvec_jiffies, &sysctl_jiffies},
2112 {NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu",
2113 &ip_rt_min_pmtu, sizeof(int), 0644, NULL,
2114 &proc_dointvec},
2115 {NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss",
2116 &ip_rt_min_advmss, sizeof(int), 0644, NULL,
2117 &proc_dointvec},
2120 #endif
2122 #ifdef CONFIG_NET_CLS_ROUTE
2123 struct ip_rt_acct ip_rt_acct[256];
2124 rwlock_t ip_rt_acct_lock = RW_LOCK_UNLOCKED;
2126 #ifdef CONFIG_PROC_FS
2127 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
2128 int length, int *eof, void *data)
2130 *start=buffer;
2132 if (offset + length > sizeof(ip_rt_acct)) {
2133 length = sizeof(ip_rt_acct) - offset;
2134 *eof = 1;
2136 if (length > 0) {
2137 read_lock_bh(&ip_rt_acct_lock);
2138 memcpy(buffer, ((u8*)&ip_rt_acct)+offset, length);
2139 read_unlock_bh(&ip_rt_acct_lock);
2140 return length;
2142 return 0;
2144 #endif
2145 #endif
2148 void __init ip_rt_init(void)
2150 #ifdef CONFIG_PROC_FS
2151 #ifdef CONFIG_NET_CLS_ROUTE
2152 struct proc_dir_entry *ent;
2153 #endif
2154 #endif
2155 ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
2156 sizeof(struct rtable),
2157 0, SLAB_HWCACHE_ALIGN,
2158 NULL, NULL);
2160 devinet_init();
2161 ip_fib_init();
2162 rt_periodic_timer.function = rt_check_expire;
2163 /* All the timers started at system startup tend
2164 to synchronize. Perturb this one a bit.
2166 rt_periodic_timer.expires = jiffies + net_random()%ip_rt_gc_interval
2167 + ip_rt_gc_interval;
2168 add_timer(&rt_periodic_timer);
2170 #ifdef CONFIG_PROC_FS
2171 proc_net_register(&(struct proc_dir_entry) {
2172 PROC_NET_RTCACHE, 8, "rt_cache",
2173 S_IFREG | S_IRUGO, 1, 0, 0,
2174 0, &proc_net_inode_operations,
2175 rt_cache_get_info
2177 #ifdef CONFIG_NET_CLS_ROUTE
2178 ent = create_proc_entry("net/rt_acct", 0, 0);
2179 ent->read_proc = ip_rt_acct_read;
2180 #endif
2181 #endif