[PATCH] v850: call init_page_count() instead of set_page_count()
[linux-2.6/verdex.git] / net / ipv4 / route.c
blob2dc6dbb284678916db25257405da673cfea06f72
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * ROUTE - implementation of the IP router.
8 * Version: $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Alan Cox, <gw4pts@gw4pts.ampr.org>
13 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
16 * Fixes:
17 * Alan Cox : Verify area fixes.
18 * Alan Cox : cli() protects routing changes
19 * Rui Oliveira : ICMP routing table updates
20 * (rco@di.uminho.pt) Routing table insertion and update
21 * Linus Torvalds : Rewrote bits to be sensible
22 * Alan Cox : Added BSD route gw semantics
23 * Alan Cox : Super /proc >4K
24 * Alan Cox : MTU in route table
25 * Alan Cox : MSS actually. Also added the window
26 * clamper.
27 * Sam Lantinga : Fixed route matching in rt_del()
28 * Alan Cox : Routing cache support.
29 * Alan Cox : Removed compatibility cruft.
30 * Alan Cox : RTF_REJECT support.
31 * Alan Cox : TCP irtt support.
32 * Jonathan Naylor : Added Metric support.
33 * Miquel van Smoorenburg : BSD API fixes.
34 * Miquel van Smoorenburg : Metrics.
35 * Alan Cox : Use __u32 properly
36 * Alan Cox : Aligned routing errors more closely with BSD
37 * our system is still very different.
38 * Alan Cox : Faster /proc handling
39 * Alexey Kuznetsov : Massive rework to support tree based routing,
40 * routing caches and better behaviour.
42 * Olaf Erb : irtt wasn't being copied right.
43 * Bjorn Ekwall : Kerneld route support.
44 * Alan Cox : Multicast fixed (I hope)
45 * Pavel Krauz : Limited broadcast fixed
46 * Mike McLagan : Routing by source
47 * Alexey Kuznetsov : End of old history. Split to fib.c and
48 * route.c and rewritten from scratch.
49 * Andi Kleen : Load-limit warning messages.
50 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
51 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
52 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
53 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
54 * Marc Boucher : routing by fwmark
55 * Robert Olsson : Added rt_cache statistics
56 * Arnaldo C. Melo : Convert proc stuff to seq_file
57 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
58 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
59 * Ilia Sotnikov : Removed TOS from hash calculations
61 * This program is free software; you can redistribute it and/or
62 * modify it under the terms of the GNU General Public License
63 * as published by the Free Software Foundation; either version
64 * 2 of the License, or (at your option) any later version.
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <asm/system.h>
70 #include <linux/bitops.h>
71 #include <linux/types.h>
72 #include <linux/kernel.h>
73 #include <linux/sched.h>
74 #include <linux/mm.h>
75 #include <linux/bootmem.h>
76 #include <linux/string.h>
77 #include <linux/socket.h>
78 #include <linux/sockios.h>
79 #include <linux/errno.h>
80 #include <linux/in.h>
81 #include <linux/inet.h>
82 #include <linux/netdevice.h>
83 #include <linux/proc_fs.h>
84 #include <linux/init.h>
85 #include <linux/skbuff.h>
86 #include <linux/rtnetlink.h>
87 #include <linux/inetdevice.h>
88 #include <linux/igmp.h>
89 #include <linux/pkt_sched.h>
90 #include <linux/mroute.h>
91 #include <linux/netfilter_ipv4.h>
92 #include <linux/random.h>
93 #include <linux/jhash.h>
94 #include <linux/rcupdate.h>
95 #include <linux/times.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/ip_mp_alg.h>
107 #ifdef CONFIG_SYSCTL
108 #include <linux/sysctl.h>
109 #endif
/* Masks a flow key's fl4_tos with IPTOS_RT_MASK | RTO_ONLINK and casts to u32. */
111 #define RT_FL_TOS(oldflp) \
112 ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
114 #define IP_MAX_MTU 0xFFF0
/* Initial value of ip_rt_gc_timeout and of the adaptive "expire" age in rt_garbage_collect(). */
116 #define RT_GC_TIMEOUT (300*HZ)
/* Delay substituted by rt_cache_flush() when it is called with delay < 0. */
118 static int ip_rt_min_delay = 2 * HZ;
/* rt_cache_flush() sets rt_deadline to now + this value when no deadline is pending. */
119 static int ip_rt_max_delay = 10 * HZ;
/* rt_garbage_collect() compares the dst entry count against this limit. */
120 static int ip_rt_max_size;
/* Base age limit used by rt_check_expire(); also the upper bound for the GC's "expire". */
121 static int ip_rt_gc_timeout = RT_GC_TIMEOUT;
/* Period with which rt_check_expire() re-arms rt_periodic_timer. */
122 static int ip_rt_gc_interval = 60 * HZ;
/* Minimum spacing between GC runs while the cache is below ip_rt_max_size. */
123 static int ip_rt_gc_min_interval = HZ / 2;
124 static int ip_rt_redirect_number = 9;
125 static int ip_rt_redirect_load = HZ / 50;
126 static int ip_rt_redirect_silence = ((HZ / 50) << (9 + 1));
127 static int ip_rt_error_cost = HZ;
128 static int ip_rt_error_burst = 5 * HZ;
/* Shifted by rt_hash_log to form the GC goal; also the chain-length limit in rt_intern_hash(). */
129 static int ip_rt_gc_elasticity = 8;
130 static int ip_rt_mtu_expires = 10 * 60 * HZ;
131 static int ip_rt_min_pmtu = 512 + 20 + 20;
132 static int ip_rt_min_advmss = 256;
/* Period with which rt_secret_rebuild() re-arms rt_secret_timer. */
133 static int ip_rt_secret_interval = 10 * 60 * HZ;
/* Deadline for a pending delayed flush; 0 means none (reset by rt_run_flush()). */
134 static unsigned long rt_deadline;
136 #define RTprint(a...) printk(KERN_DEBUG a)
/* Timers used by rt_cache_flush(), rt_check_expire() and rt_secret_rebuild() respectively. */
138 static struct timer_list rt_flush_timer;
139 static struct timer_list rt_periodic_timer;
140 static struct timer_list rt_secret_timer;
143 * Interface to generic destination cache.
146 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
147 static void ipv4_dst_destroy(struct dst_entry *dst);
148 static void ipv4_dst_ifdown(struct dst_entry *dst,
149 struct net_device *dev, int how);
150 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
151 static void ipv4_link_failure(struct sk_buff *skb);
152 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
153 static int rt_garbage_collect(void);
156 static struct dst_ops ipv4_dst_ops = {
157 .family = AF_INET,
158 .protocol = __constant_htons(ETH_P_IP),
159 .gc = rt_garbage_collect,
160 .check = ipv4_dst_check,
161 .destroy = ipv4_dst_destroy,
162 .ifdown = ipv4_dst_ifdown,
163 .negative_advice = ipv4_negative_advice,
164 .link_failure = ipv4_link_failure,
165 .update_pmtu = ip_rt_update_pmtu,
166 .entry_size = sizeof(struct rtable),
169 #define ECN_OR_COST(class) TC_PRIO_##class
171 __u8 ip_tos2prio[16] = {
172 TC_PRIO_BESTEFFORT,
173 ECN_OR_COST(FILLER),
174 TC_PRIO_BESTEFFORT,
175 ECN_OR_COST(BESTEFFORT),
176 TC_PRIO_BULK,
177 ECN_OR_COST(BULK),
178 TC_PRIO_BULK,
179 ECN_OR_COST(BULK),
180 TC_PRIO_INTERACTIVE,
181 ECN_OR_COST(INTERACTIVE),
182 TC_PRIO_INTERACTIVE,
183 ECN_OR_COST(INTERACTIVE),
184 TC_PRIO_INTERACTIVE_BULK,
185 ECN_OR_COST(INTERACTIVE_BULK),
186 TC_PRIO_INTERACTIVE_BULK,
187 ECN_OR_COST(INTERACTIVE_BULK)
192 * Route cache.
195 /* The locking scheme is rather straight forward:
197 * 1) Read-Copy Update protects the buckets of the central route hash.
198 * 2) Only writers remove entries, and they hold the lock
199 * as they look at rtable reference counts.
200 * 3) Only readers acquire references to rtable entries,
201 * they do so with atomic increments and with the
202 * lock held.
/* One slot of the route hash table: head of a chain linked through u.rt_next. */
struct rt_hash_bucket {
	struct rtable	*chain;
};
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of
 * spinlocks.  The size of this table is a power of two and depends on the
 * number of CPUS.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;

/*
 * Address of the spinlock guarding hash bucket @slot.  Fully parenthesized
 * so the expansion stays a single operand wherever it is used.
 */
# define rt_hash_lock_addr(slot) \
	(&rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)])

/*
 * Allocate and initialise the lock table; panics if the allocation fails.
 * Wrapped in do { } while (0) so that "rt_hash_lock_init();" is one
 * statement and is safe inside an unbraced if/else.
 */
# define rt_hash_lock_init()	do { \
		int i; \
		rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
		if (!rt_hash_locks) \
			panic("IP: failed to allocate rt_hash_locks\n"); \
		for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
			spin_lock_init(&rt_hash_locks[i]); \
	} while (0)
#else
/* No lock table in this configuration: the address is NULL and init is a no-op. */
# define rt_hash_lock_addr(slot) NULL
# define rt_hash_lock_init()
#endif
/* The route cache hash table; buckets are indexed by rt_hash_code(). */
245 static struct rt_hash_bucket *rt_hash_table;
/* Mask applied to hash values; rt_hash_mask + 1 is used as the bucket count. */
246 static unsigned rt_hash_mask;
/* Shift count used by the GC code when scaling per-bucket quantities. */
247 static int rt_hash_log;
/* Seed passed to jhash_2words(); re-randomised by rt_run_flush(). */
248 static unsigned int rt_hash_rnd;
/* Per-CPU cache statistics, bumped through RT_CACHE_STAT_INC(). */
250 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
251 #define RT_CACHE_STAT_INC(field) \
252 (__raw_get_cpu_var(rt_cache_stat).field++)
254 static int rt_intern_hash(unsigned hash, struct rtable *rth,
255 struct rtable **res);
257 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
259 return (jhash_2words(daddr, saddr, rt_hash_rnd)
260 & rt_hash_mask);
263 #ifdef CONFIG_PROC_FS
/* Per-open iterator state for the route cache seq_file: current bucket index. */
struct rt_cache_iter_state {
	int bucket;
};
268 static struct rtable *rt_cache_get_first(struct seq_file *seq)
270 struct rtable *r = NULL;
271 struct rt_cache_iter_state *st = seq->private;
273 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
274 rcu_read_lock_bh();
275 r = rt_hash_table[st->bucket].chain;
276 if (r)
277 break;
278 rcu_read_unlock_bh();
280 return r;
283 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
285 struct rt_cache_iter_state *st = rcu_dereference(seq->private);
287 r = r->u.rt_next;
288 while (!r) {
289 rcu_read_unlock_bh();
290 if (--st->bucket < 0)
291 break;
292 rcu_read_lock_bh();
293 r = rt_hash_table[st->bucket].chain;
295 return r;
298 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
300 struct rtable *r = rt_cache_get_first(seq);
302 if (r)
303 while (pos && (r = rt_cache_get_next(seq, r)))
304 --pos;
305 return pos ? NULL : r;
308 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
310 return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
313 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
315 struct rtable *r = NULL;
317 if (v == SEQ_START_TOKEN)
318 r = rt_cache_get_first(seq);
319 else
320 r = rt_cache_get_next(seq, v);
321 ++*pos;
322 return r;
325 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
327 if (v && v != SEQ_START_TOKEN)
328 rcu_read_unlock_bh();
331 static int rt_cache_seq_show(struct seq_file *seq, void *v)
333 if (v == SEQ_START_TOKEN)
334 seq_printf(seq, "%-127s\n",
335 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
336 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
337 "HHUptod\tSpecDst");
338 else {
339 struct rtable *r = v;
340 char temp[256];
342 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
343 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
344 r->u.dst.dev ? r->u.dst.dev->name : "*",
345 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
346 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
347 r->u.dst.__use, 0, (unsigned long)r->rt_src,
348 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
349 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
350 dst_metric(&r->u.dst, RTAX_WINDOW),
351 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
352 dst_metric(&r->u.dst, RTAX_RTTVAR)),
353 r->fl.fl4_tos,
354 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
355 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
356 dev_queue_xmit) : 0,
357 r->rt_spec_dst);
358 seq_printf(seq, "%-127s\n", temp);
360 return 0;
363 static struct seq_operations rt_cache_seq_ops = {
364 .start = rt_cache_seq_start,
365 .next = rt_cache_seq_next,
366 .stop = rt_cache_seq_stop,
367 .show = rt_cache_seq_show,
370 static int rt_cache_seq_open(struct inode *inode, struct file *file)
372 struct seq_file *seq;
373 int rc = -ENOMEM;
374 struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
376 if (!s)
377 goto out;
378 rc = seq_open(file, &rt_cache_seq_ops);
379 if (rc)
380 goto out_kfree;
381 seq = file->private_data;
382 seq->private = s;
383 memset(s, 0, sizeof(*s));
384 out:
385 return rc;
386 out_kfree:
387 kfree(s);
388 goto out;
391 static struct file_operations rt_cache_seq_fops = {
392 .owner = THIS_MODULE,
393 .open = rt_cache_seq_open,
394 .read = seq_read,
395 .llseek = seq_lseek,
396 .release = seq_release_private,
400 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
402 int cpu;
404 if (*pos == 0)
405 return SEQ_START_TOKEN;
407 for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
408 if (!cpu_possible(cpu))
409 continue;
410 *pos = cpu+1;
411 return &per_cpu(rt_cache_stat, cpu);
413 return NULL;
416 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
418 int cpu;
420 for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
421 if (!cpu_possible(cpu))
422 continue;
423 *pos = cpu+1;
424 return &per_cpu(rt_cache_stat, cpu);
426 return NULL;
/* seq_file ->stop: nothing to undo, start/next take no locks. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
435 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
437 struct rt_cache_stat *st = v;
439 if (v == SEQ_START_TOKEN) {
440 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
441 return 0;
444 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
445 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
446 atomic_read(&ipv4_dst_ops.entries),
447 st->in_hit,
448 st->in_slow_tot,
449 st->in_slow_mc,
450 st->in_no_route,
451 st->in_brd,
452 st->in_martian_dst,
453 st->in_martian_src,
455 st->out_hit,
456 st->out_slow_tot,
457 st->out_slow_mc,
459 st->gc_total,
460 st->gc_ignored,
461 st->gc_goal_miss,
462 st->gc_dst_overflow,
463 st->in_hlist_search,
464 st->out_hlist_search
466 return 0;
469 static struct seq_operations rt_cpu_seq_ops = {
470 .start = rt_cpu_seq_start,
471 .next = rt_cpu_seq_next,
472 .stop = rt_cpu_seq_stop,
473 .show = rt_cpu_seq_show,
477 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
479 return seq_open(file, &rt_cpu_seq_ops);
482 static struct file_operations rt_cpu_seq_fops = {
483 .owner = THIS_MODULE,
484 .open = rt_cpu_seq_open,
485 .read = seq_read,
486 .llseek = seq_lseek,
487 .release = seq_release,
490 #endif /* CONFIG_PROC_FS */
492 static __inline__ void rt_free(struct rtable *rt)
494 multipath_remove(rt);
495 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
498 static __inline__ void rt_drop(struct rtable *rt)
500 multipath_remove(rt);
501 ip_rt_put(rt);
502 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
505 static __inline__ int rt_fast_clean(struct rtable *rth)
507 /* Kill broadcast/multicast entries very aggresively, if they
508 collide in hash table with more useful entries */
509 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
510 rth->fl.iif && rth->u.rt_next;
513 static __inline__ int rt_valuable(struct rtable *rth)
515 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
516 rth->u.dst.expires;
519 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
521 unsigned long age;
522 int ret = 0;
524 if (atomic_read(&rth->u.dst.__refcnt))
525 goto out;
527 ret = 1;
528 if (rth->u.dst.expires &&
529 time_after_eq(jiffies, rth->u.dst.expires))
530 goto out;
532 age = jiffies - rth->u.dst.lastuse;
533 ret = 0;
534 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
535 (age <= tmo2 && rt_valuable(rth)))
536 goto out;
537 ret = 1;
538 out: return ret;
541 /* Bits of score are:
542 * 31: very valuable
543 * 30: not quite useless
544 * 29..0: usage counter
546 static inline u32 rt_score(struct rtable *rt)
548 u32 score = jiffies - rt->u.dst.lastuse;
550 score = ~score & ~(3<<30);
552 if (rt_valuable(rt))
553 score |= (1<<31);
555 if (!rt->fl.iif ||
556 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
557 score |= (1<<30);
559 return score;
562 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
564 return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
565 fl1->oif == fl2->oif &&
566 fl1->iif == fl2->iif;
569 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
/*
 * Remove @expentry and every other DST_BALANCED entry with the same flow
 * key (per compare_keys()) from the chain starting at @chain_head.
 *
 * @chain_head:    address of the bucket's chain pointer
 * @expentry:      the expired balanced entry that triggered the removal
 * @removed_count: optional; if non-NULL it is reset to 0 and then
 *                 incremented once per freed entry, including @expentry
 *
 * Matching entries other than @expentry are unlinked and freed while
 * walking; @expentry is only unlinked in the loop and freed at the end.
 *
 * Returns the address of the rt_next link of the first non-balanced entry
 * seen once @expentry has been passed, or NULL if there is none.  Both
 * callers visible in this file treat NULL as "stop scanning this chain".
 * NOTE(review): the callers invoke this with the bucket lock held; the
 * function itself takes no lock.
 */
570 static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
571 struct rtable *expentry,
572 int *removed_count)
574 int passedexpired = 0;
575 struct rtable **nextstep = NULL;
576 struct rtable **rthp = chain_head;
577 struct rtable *rth;
579 if (removed_count)
580 *removed_count = 0;
582 while ((rth = *rthp) != NULL) {
583 if (rth == expentry)
584 passedexpired = 1;
586 if (((*rthp)->u.dst.flags & DST_BALANCED) != 0 &&
587 compare_keys(&(*rthp)->fl, &expentry->fl)) {
588 if (*rthp == expentry) {
589 *rthp = rth->u.rt_next;
590 continue;
591 } else {
592 *rthp = rth->u.rt_next;
593 rt_free(rth);
594 if (removed_count)
595 ++(*removed_count);
597 } else {
598 if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
599 passedexpired && !nextstep)
600 nextstep = &rth->u.rt_next;
602 rthp = &rth->u.rt_next;
606 rt_free(expentry);
607 if (removed_count)
608 ++(*removed_count);
610 return nextstep;
612 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
/*
 * Periodic expiry scan of the route cache.
 *
 * Each run visits "goal" buckets, where goal is
 * (ip_rt_gc_interval << rt_hash_log) / ip_rt_gc_timeout, capped at the
 * bucket count (rt_hash_mask + 1).  The division is skipped when
 * ip_rt_gc_timeout <= 1.  The scan resumes after the bucket remembered in
 * the static "rover".
 *
 * Within a chain, an entry with an explicit expiry is kept until that
 * time has passed; any other entry is kept while rt_may_expire() says no.
 * Every kept entry halves the age limit applied to the entries behind it.
 * Entries that fail the test are unlinked and freed under the bucket lock.
 *
 * The scan stops early once jiffies has advanced past the start time, and
 * the function finally re-arms rt_periodic_timer for ip_rt_gc_interval.
 *
 * @dummy: unused timer argument.
 */
615 /* This runs via a timer and thus is always in BH context. */
616 static void rt_check_expire(unsigned long dummy)
618 static unsigned int rover;
619 unsigned int i = rover, goal;
620 struct rtable *rth, **rthp;
621 unsigned long now = jiffies;
622 u64 mult;
624 mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
625 if (ip_rt_gc_timeout > 1)
626 do_div(mult, ip_rt_gc_timeout);
627 goal = (unsigned int)mult;
628 if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
629 for (; goal > 0; goal--) {
630 unsigned long tmo = ip_rt_gc_timeout;
632 i = (i + 1) & rt_hash_mask;
633 rthp = &rt_hash_table[i].chain;
635 if (*rthp == 0)
636 continue;
637 spin_lock(rt_hash_lock_addr(i));
638 while ((rth = *rthp) != NULL) {
639 if (rth->u.dst.expires) {
640 /* Entry is expired even if it is in use */
641 if (time_before_eq(now, rth->u.dst.expires)) {
642 tmo >>= 1;
643 rthp = &rth->u.rt_next;
644 continue;
646 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
647 tmo >>= 1;
648 rthp = &rth->u.rt_next;
649 continue;
652 /* Cleanup aged off entries. */
653 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
654 /* remove all related balanced entries if necessary */
655 if (rth->u.dst.flags & DST_BALANCED) {
656 rthp = rt_remove_balanced_route(
657 &rt_hash_table[i].chain,
658 rth, NULL);
659 if (!rthp)
660 break;
661 } else {
662 *rthp = rth->u.rt_next;
663 rt_free(rth);
665 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
666 *rthp = rth->u.rt_next;
667 rt_free(rth);
668 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
670 spin_unlock(rt_hash_lock_addr(i));
672 /* Fallback loop breaker. */
673 if (time_after(jiffies, now))
674 break;
676 rover = i;
677 mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
680 /* This can run from both BH and non-BH contexts, the latter
681 * in the case of a forced flush event.
683 static void rt_run_flush(unsigned long dummy)
685 int i;
686 struct rtable *rth, *next;
688 rt_deadline = 0;
690 get_random_bytes(&rt_hash_rnd, 4);
692 for (i = rt_hash_mask; i >= 0; i--) {
693 spin_lock_bh(rt_hash_lock_addr(i));
694 rth = rt_hash_table[i].chain;
695 if (rth)
696 rt_hash_table[i].chain = NULL;
697 spin_unlock_bh(rt_hash_lock_addr(i));
699 for (; rth; rth = next) {
700 next = rth->u.rt_next;
701 rt_free(rth);
706 static DEFINE_SPINLOCK(rt_flush_lock);
708 void rt_cache_flush(int delay)
710 unsigned long now = jiffies;
711 int user_mode = !in_softirq();
713 if (delay < 0)
714 delay = ip_rt_min_delay;
716 /* flush existing multipath state*/
717 multipath_flush();
719 spin_lock_bh(&rt_flush_lock);
721 if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
722 long tmo = (long)(rt_deadline - now);
724 /* If flush timer is already running
725 and flush request is not immediate (delay > 0):
727 if deadline is not achieved, prolongate timer to "delay",
728 otherwise fire it at deadline time.
731 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
732 tmo = 0;
734 if (delay > tmo)
735 delay = tmo;
738 if (delay <= 0) {
739 spin_unlock_bh(&rt_flush_lock);
740 rt_run_flush(0);
741 return;
744 if (rt_deadline == 0)
745 rt_deadline = now + ip_rt_max_delay;
747 mod_timer(&rt_flush_timer, now+delay);
748 spin_unlock_bh(&rt_flush_lock);
751 static void rt_secret_rebuild(unsigned long dummy)
753 unsigned long now = jiffies;
755 rt_cache_flush(0);
756 mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
760 Short description of GC goals.
762 We want an algorithm that keeps the routing cache
763 at an equilibrium point, where the number of aged-off entries
764 is kept approximately equal to the number of newly generated ones.
766 The current expiration strength is the variable "expire".
767 We try to adjust it dynamically, so that when networking
768 is idle "expire" is large enough to keep enough warm entries,
769 and when load increases it shrinks to limit the cache size.
/*
 * Garbage-collect the route cache; installed as ipv4_dst_ops.gc.
 *
 * Returns 0 in every case except one: 1 is returned (after a rate-limited
 * "dst cache overflow" warning) when the scan ends without reaching its
 * goal and the entry count is still at or above ip_rt_max_size.
 *
 * Outline:
 *  - Skip the run if the previous one was less than ip_rt_gc_min_interval
 *    ago and the cache is below ip_rt_max_size.
 *  - Compute "goal", the number of entries to remove, from the entry
 *    count, ip_rt_gc_elasticity << rt_hash_log and the static
 *    "equilibrium".
 *  - Walk all buckets starting after "rover", removing entries for which
 *    rt_may_expire() agrees; each kept entry halves the age limit for the
 *    rest of its chain.
 *  - If the goal is missed, halve "expire" and retry, but only outside
 *    softirq context and while jiffies has not advanced.
 *  - On success raise "expire" again, bounded by ip_rt_gc_timeout.
 *
 * NOTE(review): "expire" is declared unsigned long but printed with %u in
 * both RT_CACHE_DEBUG printk calls; %lu would match the type.
 */
772 static int rt_garbage_collect(void)
774 static unsigned long expire = RT_GC_TIMEOUT;
775 static unsigned long last_gc;
776 static int rover;
777 static int equilibrium;
778 struct rtable *rth, **rthp;
779 unsigned long now = jiffies;
780 int goal;
783 * Garbage collection is pretty expensive,
784 * do not make it too frequently.
787 RT_CACHE_STAT_INC(gc_total);
789 if (now - last_gc < ip_rt_gc_min_interval &&
790 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
791 RT_CACHE_STAT_INC(gc_ignored);
792 goto out;
795 /* Calculate number of entries, which we want to expire now. */
796 goal = atomic_read(&ipv4_dst_ops.entries) -
797 (ip_rt_gc_elasticity << rt_hash_log);
798 if (goal <= 0) {
799 if (equilibrium < ipv4_dst_ops.gc_thresh)
800 equilibrium = ipv4_dst_ops.gc_thresh;
801 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
802 if (goal > 0) {
803 equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
804 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
806 } else {
807 /* We are in dangerous area. Try to reduce cache really
808 * aggressively.
810 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
811 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
814 if (now - last_gc >= ip_rt_gc_min_interval)
815 last_gc = now;
817 if (goal <= 0) {
818 equilibrium += goal;
819 goto work_done;
822 do {
823 int i, k;
825 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
826 unsigned long tmo = expire;
828 k = (k + 1) & rt_hash_mask;
829 rthp = &rt_hash_table[k].chain;
830 spin_lock_bh(rt_hash_lock_addr(k));
831 while ((rth = *rthp) != NULL) {
832 if (!rt_may_expire(rth, tmo, expire)) {
833 tmo >>= 1;
834 rthp = &rth->u.rt_next;
835 continue;
837 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
838 /* remove all related balanced entries
839 * if necessary
841 if (rth->u.dst.flags & DST_BALANCED) {
842 int r;
844 rthp = rt_remove_balanced_route(
845 &rt_hash_table[k].chain,
846 rth,
847 &r);
848 goal -= r;
849 if (!rthp)
850 break;
851 } else {
852 *rthp = rth->u.rt_next;
853 rt_free(rth);
854 goal--;
856 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
857 *rthp = rth->u.rt_next;
858 rt_free(rth);
859 goal--;
860 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
862 spin_unlock_bh(rt_hash_lock_addr(k));
863 if (goal <= 0)
864 break;
866 rover = k;
868 if (goal <= 0)
869 goto work_done;
871 /* Goal is not achieved. We stop process if:
873 - if expire reduced to zero. Otherwise, expire is halfed.
874 - if table is not full.
875 - if we are called from interrupt.
876 - jiffies check is just fallback/debug loop breaker.
877 We will not spin here for long time in any case.
880 RT_CACHE_STAT_INC(gc_goal_miss);
882 if (expire == 0)
883 break;
885 expire >>= 1;
886 #if RT_CACHE_DEBUG >= 2
887 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
888 atomic_read(&ipv4_dst_ops.entries), goal, i);
889 #endif
891 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
892 goto out;
893 } while (!in_softirq() && time_before_eq(jiffies, now));
895 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
896 goto out;
897 if (net_ratelimit())
898 printk(KERN_WARNING "dst cache overflow\n");
899 RT_CACHE_STAT_INC(gc_dst_overflow);
900 return 1;
902 work_done:
903 expire += ip_rt_gc_min_interval;
904 if (expire > ip_rt_gc_timeout ||
905 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
906 expire = ip_rt_gc_timeout;
907 #if RT_CACHE_DEBUG >= 2
908 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
909 atomic_read(&ipv4_dst_ops.entries), goal, rover);
910 #endif
911 out: return 0;
/*
 * Insert route @rt into hash chain @hash, or reuse an equivalent entry.
 *
 * @hash: bucket index (as produced by rt_hash_code())
 * @rt:   newly built route; consumed by this function on every path
 * @rp:   on success receives the entry the caller should use
 *
 * Returns 0 on success, or a negative errno from arp_bind_neighbour()
 * (-ENOBUFS once the retry below is exhausted), in which case @rt has
 * been dropped.
 *
 * Behaviour, all under the bucket lock:
 *  - If an entry with the same flow key already exists, it is moved to
 *    the head of the chain, its use count and reference count are
 *    bumped, @rt is dropped and the existing entry is returned via *rp.
 *  - While scanning, the unreferenced entry with the lowest rt_score() is
 *    remembered; it is evicted if chain_length exceeds
 *    ip_rt_gc_elasticity.
 *  - For unicast routes and output routes (fl.iif == 0) a neighbour is
 *    bound.  On -ENOBUFS, and only when not called from softirq context
 *    (attempts starts as !in_softirq()), one forced GC is run with
 *    elasticity 1 and no minimum interval, then the insertion restarts.
 *  - Otherwise @rt is linked at the head of the chain.
 *
 * NOTE(review): the forced-GC path temporarily overwrites the global
 * tunables ip_rt_gc_elasticity and ip_rt_gc_min_interval without holding
 * a lock.
 */
914 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
916 struct rtable *rth, **rthp;
917 unsigned long now;
918 struct rtable *cand, **candp;
919 u32 min_score;
920 int chain_length;
921 int attempts = !in_softirq();
923 restart:
924 chain_length = 0;
925 min_score = ~(u32)0;
926 cand = NULL;
927 candp = NULL;
928 now = jiffies;
930 rthp = &rt_hash_table[hash].chain;
932 spin_lock_bh(rt_hash_lock_addr(hash));
933 while ((rth = *rthp) != NULL) {
934 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
935 if (!(rth->u.dst.flags & DST_BALANCED) &&
936 compare_keys(&rth->fl, &rt->fl)) {
937 #else
938 if (compare_keys(&rth->fl, &rt->fl)) {
939 #endif
940 /* Put it first */
941 *rthp = rth->u.rt_next;
943 * Since lookup is lockfree, the deletion
944 * must be visible to another weakly ordered CPU before
945 * the insertion at the start of the hash chain.
947 rcu_assign_pointer(rth->u.rt_next,
948 rt_hash_table[hash].chain);
950 * Since lookup is lockfree, the update writes
951 * must be ordered for consistency on SMP.
953 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
955 rth->u.dst.__use++;
956 dst_hold(&rth->u.dst);
957 rth->u.dst.lastuse = now;
958 spin_unlock_bh(rt_hash_lock_addr(hash));
960 rt_drop(rt);
961 *rp = rth;
962 return 0;
965 if (!atomic_read(&rth->u.dst.__refcnt)) {
966 u32 score = rt_score(rth);
968 if (score <= min_score) {
969 cand = rth;
970 candp = rthp;
971 min_score = score;
975 chain_length++;
977 rthp = &rth->u.rt_next;
980 if (cand) {
981 /* ip_rt_gc_elasticity used to be average length of chain
982 * length, when exceeded gc becomes really aggressive.
984 * The second limit is less certain. At the moment it allows
985 * only 2 entries per bucket. We will see.
987 if (chain_length > ip_rt_gc_elasticity) {
988 *candp = cand->u.rt_next;
989 rt_free(cand);
993 /* Try to bind route to arp only if it is output
994 route or unicast forwarding path.
996 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
997 int err = arp_bind_neighbour(&rt->u.dst);
998 if (err) {
999 spin_unlock_bh(rt_hash_lock_addr(hash));
1001 if (err != -ENOBUFS) {
1002 rt_drop(rt);
1003 return err;
1006 /* Neighbour tables are full and nothing
1007 can be released. Try to shrink route cache,
1008 it is most likely it holds some neighbour records.
1010 if (attempts-- > 0) {
1011 int saved_elasticity = ip_rt_gc_elasticity;
1012 int saved_int = ip_rt_gc_min_interval;
1013 ip_rt_gc_elasticity = 1;
1014 ip_rt_gc_min_interval = 0;
1015 rt_garbage_collect();
1016 ip_rt_gc_min_interval = saved_int;
1017 ip_rt_gc_elasticity = saved_elasticity;
1018 goto restart;
1021 if (net_ratelimit())
1022 printk(KERN_WARNING "Neighbour table overflow.\n");
1023 rt_drop(rt);
1024 return -ENOBUFS;
1028 rt->u.rt_next = rt_hash_table[hash].chain;
1029 #if RT_CACHE_DEBUG >= 2
1030 if (rt->u.rt_next) {
1031 struct rtable *trt;
1032 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1033 NIPQUAD(rt->rt_dst));
1034 for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
1035 printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1036 printk("\n");
1038 #endif
1039 rt_hash_table[hash].chain = rt;
1040 spin_unlock_bh(rt_hash_lock_addr(hash));
1041 *rp = rt;
1042 return 0;
1045 void rt_bind_peer(struct rtable *rt, int create)
1047 static DEFINE_SPINLOCK(rt_peer_lock);
1048 struct inet_peer *peer;
1050 peer = inet_getpeer(rt->rt_dst, create);
1052 spin_lock_bh(&rt_peer_lock);
1053 if (rt->peer == NULL) {
1054 rt->peer = peer;
1055 peer = NULL;
1057 spin_unlock_bh(&rt_peer_lock);
1058 if (peer)
1059 inet_putpeer(peer);
1063 * Peer allocation may fail only in serious out-of-memory conditions. However
1064 * we still can generate some output.
1065 * Random ID selection looks a bit dangerous because we have no chances to
1066 * select ID being unique in a reasonable period of time.
1067 * But broken packet identifier may be better than no packet at all.
1069 static void ip_select_fb_ident(struct iphdr *iph)
1071 static DEFINE_SPINLOCK(ip_fb_id_lock);
1072 static u32 ip_fallback_id;
1073 u32 salt;
1075 spin_lock_bh(&ip_fb_id_lock);
1076 salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
1077 iph->id = htons(salt & 0xFFFF);
1078 ip_fallback_id = salt;
1079 spin_unlock_bh(&ip_fb_id_lock);
1082 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1084 struct rtable *rt = (struct rtable *) dst;
1086 if (rt) {
1087 if (rt->peer == NULL)
1088 rt_bind_peer(rt, 1);
1090 /* If peer is attached to destination, it is never detached,
1091 so that we need not to grab a lock to dereference it.
1093 if (rt->peer) {
1094 iph->id = htons(inet_getid(rt->peer, more));
1095 return;
1097 } else
1098 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1099 __builtin_return_address(0));
1101 ip_select_fb_ident(iph);
1104 static void rt_del(unsigned hash, struct rtable *rt)
1106 struct rtable **rthp;
1108 spin_lock_bh(rt_hash_lock_addr(hash));
1109 ip_rt_put(rt);
1110 for (rthp = &rt_hash_table[hash].chain; *rthp;
1111 rthp = &(*rthp)->u.rt_next)
1112 if (*rthp == rt) {
1113 *rthp = rt->u.rt_next;
1114 rt_free(rt);
1115 break;
1117 spin_unlock_bh(rt_hash_lock_addr(hash));
/*
 * Apply a redirect: traffic to @daddr that currently goes through gateway
 * @old_gw should use @new_gw instead.
 *
 * @old_gw: gateway the cached route must currently use
 * @daddr:  destination the redirect is about
 * @new_gw: advised new gateway
 * @saddr:  source address of the affected flow
 * @dev:    device the cached route must use
 *
 * The redirect is rejected (optionally logged under
 * CONFIG_IP_ROUTE_VERBOSE) when new_gw equals old_gw, redirects are
 * disabled on the device, new_gw is multicast/bad-class/zeronet, or it
 * fails the on-link / unicast checks for the device's media mode.
 *
 * Otherwise the cache is searched for the four combinations of source key
 * {saddr, 0} and output interface {dev->ifindex, 0}.  For a matching
 * output route (fl.iif == 0) a copy is allocated with RTCF_REDIRECTED set
 * and rt_gateway = new_gw.  If a neighbour can be bound and is NUD_VALID,
 * the old entry is removed with rt_del() and the copy inserted with
 * rt_intern_hash(); otherwise the copy is dropped.
 *
 * The reference on in_dev taken at entry is released on every exit path.
 * NOTE(review): the chain walk uses rcu_read_lock() while other readers
 * in this file use rcu_read_lock_bh() - confirm this is intended.
 */
1120 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
1121 u32 saddr, struct net_device *dev)
1123 int i, k;
1124 struct in_device *in_dev = in_dev_get(dev);
1125 struct rtable *rth, **rthp;
1126 u32 skeys[2] = { saddr, 0 };
1127 int ikeys[2] = { dev->ifindex, 0 };
1129 if (!in_dev)
1130 return;
1132 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1133 || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1134 goto reject_redirect;
1136 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1137 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1138 goto reject_redirect;
1139 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1140 goto reject_redirect;
1141 } else {
1142 if (inet_addr_type(new_gw) != RTN_UNICAST)
1143 goto reject_redirect;
1146 for (i = 0; i < 2; i++) {
1147 for (k = 0; k < 2; k++) {
1148 unsigned hash = rt_hash_code(daddr,
1149 skeys[i] ^ (ikeys[k] << 5));
1151 rthp=&rt_hash_table[hash].chain;
1153 rcu_read_lock();
1154 while ((rth = rcu_dereference(*rthp)) != NULL) {
1155 struct rtable *rt;
1157 if (rth->fl.fl4_dst != daddr ||
1158 rth->fl.fl4_src != skeys[i] ||
1159 rth->fl.oif != ikeys[k] ||
1160 rth->fl.iif != 0) {
1161 rthp = &rth->u.rt_next;
1162 continue;
1165 if (rth->rt_dst != daddr ||
1166 rth->rt_src != saddr ||
1167 rth->u.dst.error ||
1168 rth->rt_gateway != old_gw ||
1169 rth->u.dst.dev != dev)
1170 break;
1172 dst_hold(&rth->u.dst);
1173 rcu_read_unlock();
1175 rt = dst_alloc(&ipv4_dst_ops);
1176 if (rt == NULL) {
1177 ip_rt_put(rth);
1178 in_dev_put(in_dev);
1179 return;
1182 /* Copy all the information. */
1183 *rt = *rth;
1184 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1185 rt->u.dst.__use = 1;
1186 atomic_set(&rt->u.dst.__refcnt, 1);
1187 rt->u.dst.child = NULL;
1188 if (rt->u.dst.dev)
1189 dev_hold(rt->u.dst.dev);
1190 if (rt->idev)
1191 in_dev_hold(rt->idev);
1192 rt->u.dst.obsolete = 0;
1193 rt->u.dst.lastuse = jiffies;
1194 rt->u.dst.path = &rt->u.dst;
1195 rt->u.dst.neighbour = NULL;
1196 rt->u.dst.hh = NULL;
1197 rt->u.dst.xfrm = NULL;
1199 rt->rt_flags |= RTCF_REDIRECTED;
1201 /* Gateway is different ... */
1202 rt->rt_gateway = new_gw;
1204 /* Redirect received -> path was valid */
1205 dst_confirm(&rth->u.dst);
1207 if (rt->peer)
1208 atomic_inc(&rt->peer->refcnt);
1210 if (arp_bind_neighbour(&rt->u.dst) ||
1211 !(rt->u.dst.neighbour->nud_state &
1212 NUD_VALID)) {
1213 if (rt->u.dst.neighbour)
1214 neigh_event_send(rt->u.dst.neighbour, NULL);
1215 ip_rt_put(rth);
1216 rt_drop(rt);
1217 goto do_next;
1220 rt_del(hash, rth);
1221 if (!rt_intern_hash(hash, rt, &rt))
1222 ip_rt_put(rt);
1223 goto do_next;
1225 rcu_read_unlock();
1226 do_next:
1230 in_dev_put(in_dev);
1231 return;
1233 reject_redirect:
1234 #ifdef CONFIG_IP_ROUTE_VERBOSE
1235 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1236 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1237 "%u.%u.%u.%u ignored.\n"
1238 " Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1239 NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1240 NIPQUAD(saddr), NIPQUAD(daddr));
1241 #endif
1242 in_dev_put(in_dev);
1245 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1247 struct rtable *rt = (struct rtable*)dst;
1248 struct dst_entry *ret = dst;
1250 if (rt) {
1251 if (dst->obsolete) {
1252 ip_rt_put(rt);
1253 ret = NULL;
1254 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1255 rt->u.dst.expires) {
1256 unsigned hash = rt_hash_code(rt->fl.fl4_dst,
1257 rt->fl.fl4_src ^
1258 (rt->fl.oif << 5));
1259 #if RT_CACHE_DEBUG >= 1
1260 printk(KERN_DEBUG "ip_rt_advice: redirect to "
1261 "%u.%u.%u.%u/%02x dropped\n",
1262 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1263 #endif
1264 rt_del(hash, rt);
1265 ret = NULL;
1268 return ret;
1272 * Algorithm:
1273 * 1. The first ip_rt_redirect_number redirects are sent
1274 * with exponential backoff, then we stop sending them at all,
1275 * assuming that the host ignores our redirects.
1276 * 2. If we did not see packets requiring redirects
1277 * during ip_rt_redirect_silence, we assume that the host
1278 * forgot redirected route and start to send redirects again.
1280 * This algorithm is much cheaper and more intelligent than dumb load limiting
1281 * in icmp.c.
1283 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1284 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1287 void ip_rt_send_redirect(struct sk_buff *skb)
1289 struct rtable *rt = (struct rtable*)skb->dst;
1290 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1292 if (!in_dev)
1293 return;
1295 if (!IN_DEV_TX_REDIRECTS(in_dev))
1296 goto out;
1298 /* No redirected packets during ip_rt_redirect_silence;
1299 * reset the algorithm.
1301 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1302 rt->u.dst.rate_tokens = 0;
1304 /* Too many ignored redirects; do not send anything
1305 * set u.dst.rate_last to the last seen redirected packet.
1307 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1308 rt->u.dst.rate_last = jiffies;
1309 goto out;
1312 /* Check for load limit; set rate_last to the latest sent
1313 * redirect.
1315 if (time_after(jiffies,
1316 (rt->u.dst.rate_last +
1317 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1318 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1319 rt->u.dst.rate_last = jiffies;
1320 ++rt->u.dst.rate_tokens;
1321 #ifdef CONFIG_IP_ROUTE_VERBOSE
1322 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1323 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1324 net_ratelimit())
1325 printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1326 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1327 NIPQUAD(rt->rt_src), rt->rt_iif,
1328 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1329 #endif
1331 out:
1332 in_dev_put(in_dev);
1335 static int ip_error(struct sk_buff *skb)
1337 struct rtable *rt = (struct rtable*)skb->dst;
1338 unsigned long now;
1339 int code;
1341 switch (rt->u.dst.error) {
1342 case EINVAL:
1343 default:
1344 goto out;
1345 case EHOSTUNREACH:
1346 code = ICMP_HOST_UNREACH;
1347 break;
1348 case ENETUNREACH:
1349 code = ICMP_NET_UNREACH;
1350 break;
1351 case EACCES:
1352 code = ICMP_PKT_FILTERED;
1353 break;
1356 now = jiffies;
1357 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1358 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1359 rt->u.dst.rate_tokens = ip_rt_error_burst;
1360 rt->u.dst.rate_last = now;
1361 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1362 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1363 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1366 out: kfree_skb(skb);
1367 return 0;
/*
 *	MTU plateau table, ordered from largest to smallest.
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

/*
 * guess_mtu - pick the next lower plateau for a datagram that did not fit.
 * @old_mtu: size that was too large.
 *
 * Returns the first (largest) table entry strictly below @old_mtu, or 68
 * when @old_mtu is at or below the smallest plateau.  68 is the same floor
 * the PMTU code in this file applies to reported MTU values.
 */
static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	unsigned int i;

	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++) {
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	}
	return 68;
}
1388 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1390 int i;
1391 unsigned short old_mtu = ntohs(iph->tot_len);
1392 struct rtable *rth;
1393 u32 skeys[2] = { iph->saddr, 0, };
1394 u32 daddr = iph->daddr;
1395 unsigned short est_mtu = 0;
1397 if (ipv4_config.no_pmtu_disc)
1398 return 0;
1400 for (i = 0; i < 2; i++) {
1401 unsigned hash = rt_hash_code(daddr, skeys[i]);
1403 rcu_read_lock();
1404 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1405 rth = rcu_dereference(rth->u.rt_next)) {
1406 if (rth->fl.fl4_dst == daddr &&
1407 rth->fl.fl4_src == skeys[i] &&
1408 rth->rt_dst == daddr &&
1409 rth->rt_src == iph->saddr &&
1410 rth->fl.iif == 0 &&
1411 !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1412 unsigned short mtu = new_mtu;
1414 if (new_mtu < 68 || new_mtu >= old_mtu) {
1416 /* BSD 4.2 compatibility hack :-( */
1417 if (mtu == 0 &&
1418 old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1419 old_mtu >= 68 + (iph->ihl << 2))
1420 old_mtu -= iph->ihl << 2;
1422 mtu = guess_mtu(old_mtu);
1424 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1425 if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1426 dst_confirm(&rth->u.dst);
1427 if (mtu < ip_rt_min_pmtu) {
1428 mtu = ip_rt_min_pmtu;
1429 rth->u.dst.metrics[RTAX_LOCK-1] |=
1430 (1 << RTAX_MTU);
1432 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1433 dst_set_expires(&rth->u.dst,
1434 ip_rt_mtu_expires);
1436 est_mtu = mtu;
1440 rcu_read_unlock();
1442 return est_mtu ? : new_mtu;
1445 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1447 if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1448 !(dst_metric_locked(dst, RTAX_MTU))) {
1449 if (mtu < ip_rt_min_pmtu) {
1450 mtu = ip_rt_min_pmtu;
1451 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1453 dst->metrics[RTAX_MTU-1] = mtu;
1454 dst_set_expires(dst, ip_rt_mtu_expires);
1458 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1460 return NULL;
1463 static void ipv4_dst_destroy(struct dst_entry *dst)
1465 struct rtable *rt = (struct rtable *) dst;
1466 struct inet_peer *peer = rt->peer;
1467 struct in_device *idev = rt->idev;
1469 if (peer) {
1470 rt->peer = NULL;
1471 inet_putpeer(peer);
1474 if (idev) {
1475 rt->idev = NULL;
1476 in_dev_put(idev);
1480 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1481 int how)
1483 struct rtable *rt = (struct rtable *) dst;
1484 struct in_device *idev = rt->idev;
1485 if (dev != &loopback_dev && idev && idev->dev == dev) {
1486 struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1487 if (loopback_idev) {
1488 rt->idev = loopback_idev;
1489 in_dev_put(idev);
1494 static void ipv4_link_failure(struct sk_buff *skb)
1496 struct rtable *rt;
1498 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1500 rt = (struct rtable *) skb->dst;
1501 if (rt)
1502 dst_set_expires(&rt->u.dst, 0);
1505 static int ip_rt_bug(struct sk_buff *skb)
1507 printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1508 NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1509 skb->dev ? skb->dev->name : "?");
1510 kfree_skb(skb);
1511 return 0;
1515 We do not cache source address of outgoing interface,
1516 because it is used only by IP RR, TS and SRR options,
1517 so that it out of fast path.
1519 BTW remember: "addr" is allowed to be not aligned
1520 in IP options!
1523 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1525 u32 src;
1526 struct fib_result res;
1528 if (rt->fl.iif == 0)
1529 src = rt->rt_src;
1530 else if (fib_lookup(&rt->fl, &res) == 0) {
1531 src = FIB_RES_PREFSRC(res);
1532 fib_res_put(&res);
1533 } else
1534 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1535 RT_SCOPE_UNIVERSE);
1536 memcpy(addr, &src, 4);
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Fill in the halves of the route's tclassid that are still zero from
 * @tag.  The low and high 16 bits are handled independently; a half that
 * is already non-zero is left alone.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	struct dst_entry *dst = &rt->u.dst;

	if ((dst->tclassid & 0xFFFF) == 0)
		dst->tclassid |= tag & 0xFFFF;
	if ((dst->tclassid & 0xFFFF0000) == 0)
		dst->tclassid |= tag & 0xFFFF0000;
}
#endif
1549 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1551 struct fib_info *fi = res->fi;
1553 if (fi) {
1554 if (FIB_RES_GW(*res) &&
1555 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1556 rt->rt_gateway = FIB_RES_GW(*res);
1557 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1558 sizeof(rt->u.dst.metrics));
1559 if (fi->fib_mtu == 0) {
1560 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1561 if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1562 rt->rt_gateway != rt->rt_dst &&
1563 rt->u.dst.dev->mtu > 576)
1564 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1566 #ifdef CONFIG_NET_CLS_ROUTE
1567 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1568 #endif
1569 } else
1570 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1572 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1573 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1574 if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1575 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1576 if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1577 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1578 ip_rt_min_advmss);
1579 if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1580 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1582 #ifdef CONFIG_NET_CLS_ROUTE
1583 #ifdef CONFIG_IP_MULTIPLE_TABLES
1584 set_class_tag(rt, fib_rules_tclass(res));
1585 #endif
1586 set_class_tag(rt, itag);
1587 #endif
1588 rt->rt_type = res->type;
1591 static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
1592 u8 tos, struct net_device *dev, int our)
1594 unsigned hash;
1595 struct rtable *rth;
1596 u32 spec_dst;
1597 struct in_device *in_dev = in_dev_get(dev);
1598 u32 itag = 0;
1600 /* Primary sanity checks. */
1602 if (in_dev == NULL)
1603 return -EINVAL;
1605 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1606 skb->protocol != htons(ETH_P_IP))
1607 goto e_inval;
1609 if (ZERONET(saddr)) {
1610 if (!LOCAL_MCAST(daddr))
1611 goto e_inval;
1612 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1613 } else if (fib_validate_source(saddr, 0, tos, 0,
1614 dev, &spec_dst, &itag) < 0)
1615 goto e_inval;
1617 rth = dst_alloc(&ipv4_dst_ops);
1618 if (!rth)
1619 goto e_nobufs;
1621 rth->u.dst.output= ip_rt_bug;
1623 atomic_set(&rth->u.dst.__refcnt, 1);
1624 rth->u.dst.flags= DST_HOST;
1625 if (in_dev->cnf.no_policy)
1626 rth->u.dst.flags |= DST_NOPOLICY;
1627 rth->fl.fl4_dst = daddr;
1628 rth->rt_dst = daddr;
1629 rth->fl.fl4_tos = tos;
1630 #ifdef CONFIG_IP_ROUTE_FWMARK
1631 rth->fl.fl4_fwmark= skb->nfmark;
1632 #endif
1633 rth->fl.fl4_src = saddr;
1634 rth->rt_src = saddr;
1635 #ifdef CONFIG_NET_CLS_ROUTE
1636 rth->u.dst.tclassid = itag;
1637 #endif
1638 rth->rt_iif =
1639 rth->fl.iif = dev->ifindex;
1640 rth->u.dst.dev = &loopback_dev;
1641 dev_hold(rth->u.dst.dev);
1642 rth->idev = in_dev_get(rth->u.dst.dev);
1643 rth->fl.oif = 0;
1644 rth->rt_gateway = daddr;
1645 rth->rt_spec_dst= spec_dst;
1646 rth->rt_type = RTN_MULTICAST;
1647 rth->rt_flags = RTCF_MULTICAST;
1648 if (our) {
1649 rth->u.dst.input= ip_local_deliver;
1650 rth->rt_flags |= RTCF_LOCAL;
1653 #ifdef CONFIG_IP_MROUTE
1654 if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1655 rth->u.dst.input = ip_mr_input;
1656 #endif
1657 RT_CACHE_STAT_INC(in_slow_mc);
1659 in_dev_put(in_dev);
1660 hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5));
1661 return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1663 e_nobufs:
1664 in_dev_put(in_dev);
1665 return -ENOBUFS;
1667 e_inval:
1668 in_dev_put(in_dev);
1669 return -EINVAL;
1673 static void ip_handle_martian_source(struct net_device *dev,
1674 struct in_device *in_dev,
1675 struct sk_buff *skb,
1676 u32 daddr,
1677 u32 saddr)
1679 RT_CACHE_STAT_INC(in_martian_src);
1680 #ifdef CONFIG_IP_ROUTE_VERBOSE
1681 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1683 * RFC1812 recommendation, if source is martian,
1684 * the only hint is MAC header.
1686 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1687 "%u.%u.%u.%u, on dev %s\n",
1688 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1689 if (dev->hard_header_len && skb->mac.raw) {
1690 int i;
1691 unsigned char *p = skb->mac.raw;
1692 printk(KERN_WARNING "ll header: ");
1693 for (i = 0; i < dev->hard_header_len; i++, p++) {
1694 printk("%02x", *p);
1695 if (i < (dev->hard_header_len - 1))
1696 printk(":");
1698 printk("\n");
1701 #endif
1704 static inline int __mkroute_input(struct sk_buff *skb,
1705 struct fib_result* res,
1706 struct in_device *in_dev,
1707 u32 daddr, u32 saddr, u32 tos,
1708 struct rtable **result)
1711 struct rtable *rth;
1712 int err;
1713 struct in_device *out_dev;
1714 unsigned flags = 0;
1715 u32 spec_dst, itag;
1717 /* get a working reference to the output device */
1718 out_dev = in_dev_get(FIB_RES_DEV(*res));
1719 if (out_dev == NULL) {
1720 if (net_ratelimit())
1721 printk(KERN_CRIT "Bug in ip_route_input" \
1722 "_slow(). Please, report\n");
1723 return -EINVAL;
1727 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1728 in_dev->dev, &spec_dst, &itag);
1729 if (err < 0) {
1730 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1731 saddr);
1733 err = -EINVAL;
1734 goto cleanup;
1737 if (err)
1738 flags |= RTCF_DIRECTSRC;
1740 if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1741 (IN_DEV_SHARED_MEDIA(out_dev) ||
1742 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1743 flags |= RTCF_DOREDIRECT;
1745 if (skb->protocol != htons(ETH_P_IP)) {
1746 /* Not IP (i.e. ARP). Do not create route, if it is
1747 * invalid for proxy arp. DNAT routes are always valid.
1749 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1750 err = -EINVAL;
1751 goto cleanup;
1756 rth = dst_alloc(&ipv4_dst_ops);
1757 if (!rth) {
1758 err = -ENOBUFS;
1759 goto cleanup;
1762 atomic_set(&rth->u.dst.__refcnt, 1);
1763 rth->u.dst.flags= DST_HOST;
1764 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1765 if (res->fi->fib_nhs > 1)
1766 rth->u.dst.flags |= DST_BALANCED;
1767 #endif
1768 if (in_dev->cnf.no_policy)
1769 rth->u.dst.flags |= DST_NOPOLICY;
1770 if (in_dev->cnf.no_xfrm)
1771 rth->u.dst.flags |= DST_NOXFRM;
1772 rth->fl.fl4_dst = daddr;
1773 rth->rt_dst = daddr;
1774 rth->fl.fl4_tos = tos;
1775 #ifdef CONFIG_IP_ROUTE_FWMARK
1776 rth->fl.fl4_fwmark= skb->nfmark;
1777 #endif
1778 rth->fl.fl4_src = saddr;
1779 rth->rt_src = saddr;
1780 rth->rt_gateway = daddr;
1781 rth->rt_iif =
1782 rth->fl.iif = in_dev->dev->ifindex;
1783 rth->u.dst.dev = (out_dev)->dev;
1784 dev_hold(rth->u.dst.dev);
1785 rth->idev = in_dev_get(rth->u.dst.dev);
1786 rth->fl.oif = 0;
1787 rth->rt_spec_dst= spec_dst;
1789 rth->u.dst.input = ip_forward;
1790 rth->u.dst.output = ip_output;
1792 rt_set_nexthop(rth, res, itag);
1794 rth->rt_flags = flags;
1796 *result = rth;
1797 err = 0;
1798 cleanup:
1799 /* release the working reference to the output device */
1800 in_dev_put(out_dev);
1801 return err;
1804 static inline int ip_mkroute_input_def(struct sk_buff *skb,
1805 struct fib_result* res,
1806 const struct flowi *fl,
1807 struct in_device *in_dev,
1808 u32 daddr, u32 saddr, u32 tos)
1810 struct rtable* rth = NULL;
1811 int err;
1812 unsigned hash;
1814 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1815 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1816 fib_select_multipath(fl, res);
1817 #endif
1819 /* create a routing cache entry */
1820 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1821 if (err)
1822 return err;
1824 /* put it into the cache */
1825 hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5));
1826 return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1829 static inline int ip_mkroute_input(struct sk_buff *skb,
1830 struct fib_result* res,
1831 const struct flowi *fl,
1832 struct in_device *in_dev,
1833 u32 daddr, u32 saddr, u32 tos)
1835 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1836 struct rtable* rth = NULL, *rtres;
1837 unsigned char hop, hopcount;
1838 int err = -EINVAL;
1839 unsigned int hash;
1841 if (res->fi)
1842 hopcount = res->fi->fib_nhs;
1843 else
1844 hopcount = 1;
1846 /* distinguish between multipath and singlepath */
1847 if (hopcount < 2)
1848 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
1849 saddr, tos);
1851 /* add all alternatives to the routing cache */
1852 for (hop = 0; hop < hopcount; hop++) {
1853 res->nh_sel = hop;
1855 /* put reference to previous result */
1856 if (hop)
1857 ip_rt_put(rtres);
1859 /* create a routing cache entry */
1860 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
1861 &rth);
1862 if (err)
1863 return err;
1865 /* put it into the cache */
1866 hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5));
1867 err = rt_intern_hash(hash, rth, &rtres);
1868 if (err)
1869 return err;
1871 /* forward hop information to multipath impl. */
1872 multipath_set_nhinfo(rth,
1873 FIB_RES_NETWORK(*res),
1874 FIB_RES_NETMASK(*res),
1875 res->prefixlen,
1876 &FIB_RES_NH(*res));
1878 skb->dst = &rtres->u.dst;
1879 return err;
1880 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
1881 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
1882 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
1887 * NOTE. We drop all the packets that has local source
1888 * addresses, because every properly looped back packet
1889 * must have correct destination already attached by output routine.
1891 * Such approach solves two big problems:
1892 * 1. Not simplex devices are handled properly.
1893 * 2. IP spoofing attempts are filtered with 100% of guarantee.
1896 static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1897 u8 tos, struct net_device *dev)
1899 struct fib_result res;
1900 struct in_device *in_dev = in_dev_get(dev);
1901 struct flowi fl = { .nl_u = { .ip4_u =
1902 { .daddr = daddr,
1903 .saddr = saddr,
1904 .tos = tos,
1905 .scope = RT_SCOPE_UNIVERSE,
1906 #ifdef CONFIG_IP_ROUTE_FWMARK
1907 .fwmark = skb->nfmark
1908 #endif
1909 } },
1910 .iif = dev->ifindex };
1911 unsigned flags = 0;
1912 u32 itag = 0;
1913 struct rtable * rth;
1914 unsigned hash;
1915 u32 spec_dst;
1916 int err = -EINVAL;
1917 int free_res = 0;
1919 /* IP on this device is disabled. */
1921 if (!in_dev)
1922 goto out;
1924 /* Check for the most weird martians, which can be not detected
1925 by fib_lookup.
1928 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1929 goto martian_source;
1931 if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
1932 goto brd_input;
1934 /* Accept zero addresses only to limited broadcast;
1935 * I even do not know to fix it or not. Waiting for complains :-)
1937 if (ZERONET(saddr))
1938 goto martian_source;
1940 if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1941 goto martian_destination;
1944 * Now we are ready to route packet.
1946 if ((err = fib_lookup(&fl, &res)) != 0) {
1947 if (!IN_DEV_FORWARD(in_dev))
1948 goto e_hostunreach;
1949 goto no_route;
1951 free_res = 1;
1953 RT_CACHE_STAT_INC(in_slow_tot);
1955 if (res.type == RTN_BROADCAST)
1956 goto brd_input;
1958 if (res.type == RTN_LOCAL) {
1959 int result;
1960 result = fib_validate_source(saddr, daddr, tos,
1961 loopback_dev.ifindex,
1962 dev, &spec_dst, &itag);
1963 if (result < 0)
1964 goto martian_source;
1965 if (result)
1966 flags |= RTCF_DIRECTSRC;
1967 spec_dst = daddr;
1968 goto local_input;
1971 if (!IN_DEV_FORWARD(in_dev))
1972 goto e_hostunreach;
1973 if (res.type != RTN_UNICAST)
1974 goto martian_destination;
1976 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1977 if (err == -ENOBUFS)
1978 goto e_nobufs;
1979 if (err == -EINVAL)
1980 goto e_inval;
1982 done:
1983 in_dev_put(in_dev);
1984 if (free_res)
1985 fib_res_put(&res);
1986 out: return err;
1988 brd_input:
1989 if (skb->protocol != htons(ETH_P_IP))
1990 goto e_inval;
1992 if (ZERONET(saddr))
1993 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1994 else {
1995 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1996 &itag);
1997 if (err < 0)
1998 goto martian_source;
1999 if (err)
2000 flags |= RTCF_DIRECTSRC;
2002 flags |= RTCF_BROADCAST;
2003 res.type = RTN_BROADCAST;
2004 RT_CACHE_STAT_INC(in_brd);
2006 local_input:
2007 rth = dst_alloc(&ipv4_dst_ops);
2008 if (!rth)
2009 goto e_nobufs;
2011 rth->u.dst.output= ip_rt_bug;
2013 atomic_set(&rth->u.dst.__refcnt, 1);
2014 rth->u.dst.flags= DST_HOST;
2015 if (in_dev->cnf.no_policy)
2016 rth->u.dst.flags |= DST_NOPOLICY;
2017 rth->fl.fl4_dst = daddr;
2018 rth->rt_dst = daddr;
2019 rth->fl.fl4_tos = tos;
2020 #ifdef CONFIG_IP_ROUTE_FWMARK
2021 rth->fl.fl4_fwmark= skb->nfmark;
2022 #endif
2023 rth->fl.fl4_src = saddr;
2024 rth->rt_src = saddr;
2025 #ifdef CONFIG_NET_CLS_ROUTE
2026 rth->u.dst.tclassid = itag;
2027 #endif
2028 rth->rt_iif =
2029 rth->fl.iif = dev->ifindex;
2030 rth->u.dst.dev = &loopback_dev;
2031 dev_hold(rth->u.dst.dev);
2032 rth->idev = in_dev_get(rth->u.dst.dev);
2033 rth->rt_gateway = daddr;
2034 rth->rt_spec_dst= spec_dst;
2035 rth->u.dst.input= ip_local_deliver;
2036 rth->rt_flags = flags|RTCF_LOCAL;
2037 if (res.type == RTN_UNREACHABLE) {
2038 rth->u.dst.input= ip_error;
2039 rth->u.dst.error= -err;
2040 rth->rt_flags &= ~RTCF_LOCAL;
2042 rth->rt_type = res.type;
2043 hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5));
2044 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2045 goto done;
2047 no_route:
2048 RT_CACHE_STAT_INC(in_no_route);
2049 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2050 res.type = RTN_UNREACHABLE;
2051 goto local_input;
2054 * Do not cache martian addresses: they should be logged (RFC1812)
2056 martian_destination:
2057 RT_CACHE_STAT_INC(in_martian_dst);
2058 #ifdef CONFIG_IP_ROUTE_VERBOSE
2059 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2060 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2061 "%u.%u.%u.%u, dev %s\n",
2062 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2063 #endif
2065 e_hostunreach:
2066 err = -EHOSTUNREACH;
2067 goto done;
2069 e_inval:
2070 err = -EINVAL;
2071 goto done;
2073 e_nobufs:
2074 err = -ENOBUFS;
2075 goto done;
2077 martian_source:
2078 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2079 goto e_inval;
2082 int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
2083 u8 tos, struct net_device *dev)
2085 struct rtable * rth;
2086 unsigned hash;
2087 int iif = dev->ifindex;
2089 tos &= IPTOS_RT_MASK;
2090 hash = rt_hash_code(daddr, saddr ^ (iif << 5));
2092 rcu_read_lock();
2093 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2094 rth = rcu_dereference(rth->u.rt_next)) {
2095 if (rth->fl.fl4_dst == daddr &&
2096 rth->fl.fl4_src == saddr &&
2097 rth->fl.iif == iif &&
2098 rth->fl.oif == 0 &&
2099 #ifdef CONFIG_IP_ROUTE_FWMARK
2100 rth->fl.fl4_fwmark == skb->nfmark &&
2101 #endif
2102 rth->fl.fl4_tos == tos) {
2103 rth->u.dst.lastuse = jiffies;
2104 dst_hold(&rth->u.dst);
2105 rth->u.dst.__use++;
2106 RT_CACHE_STAT_INC(in_hit);
2107 rcu_read_unlock();
2108 skb->dst = (struct dst_entry*)rth;
2109 return 0;
2111 RT_CACHE_STAT_INC(in_hlist_search);
2113 rcu_read_unlock();
2115 /* Multicast recognition logic is moved from route cache to here.
2116 The problem was that too many Ethernet cards have broken/missing
2117 hardware multicast filters :-( As result the host on multicasting
2118 network acquires a lot of useless route cache entries, sort of
2119 SDR messages from all the world. Now we try to get rid of them.
2120 Really, provided software IP multicast filter is organized
2121 reasonably (at least, hashed), it does not result in a slowdown
2122 comparing with route cache reject entries.
2123 Note, that multicast routers are not affected, because
2124 route cache entry is created eventually.
2126 if (MULTICAST(daddr)) {
2127 struct in_device *in_dev;
2129 rcu_read_lock();
2130 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2131 int our = ip_check_mc(in_dev, daddr, saddr,
2132 skb->nh.iph->protocol);
2133 if (our
2134 #ifdef CONFIG_IP_MROUTE
2135 || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2136 #endif
2138 rcu_read_unlock();
2139 return ip_route_input_mc(skb, daddr, saddr,
2140 tos, dev, our);
2143 rcu_read_unlock();
2144 return -EINVAL;
2146 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2149 static inline int __mkroute_output(struct rtable **result,
2150 struct fib_result* res,
2151 const struct flowi *fl,
2152 const struct flowi *oldflp,
2153 struct net_device *dev_out,
2154 unsigned flags)
2156 struct rtable *rth;
2157 struct in_device *in_dev;
2158 u32 tos = RT_FL_TOS(oldflp);
2159 int err = 0;
2161 if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2162 return -EINVAL;
2164 if (fl->fl4_dst == 0xFFFFFFFF)
2165 res->type = RTN_BROADCAST;
2166 else if (MULTICAST(fl->fl4_dst))
2167 res->type = RTN_MULTICAST;
2168 else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2169 return -EINVAL;
2171 if (dev_out->flags & IFF_LOOPBACK)
2172 flags |= RTCF_LOCAL;
2174 /* get work reference to inet device */
2175 in_dev = in_dev_get(dev_out);
2176 if (!in_dev)
2177 return -EINVAL;
2179 if (res->type == RTN_BROADCAST) {
2180 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2181 if (res->fi) {
2182 fib_info_put(res->fi);
2183 res->fi = NULL;
2185 } else if (res->type == RTN_MULTICAST) {
2186 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2187 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2188 oldflp->proto))
2189 flags &= ~RTCF_LOCAL;
2190 /* If multicast route do not exist use
2191 default one, but do not gateway in this case.
2192 Yes, it is hack.
2194 if (res->fi && res->prefixlen < 4) {
2195 fib_info_put(res->fi);
2196 res->fi = NULL;
2201 rth = dst_alloc(&ipv4_dst_ops);
2202 if (!rth) {
2203 err = -ENOBUFS;
2204 goto cleanup;
2207 atomic_set(&rth->u.dst.__refcnt, 1);
2208 rth->u.dst.flags= DST_HOST;
2209 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2210 if (res->fi) {
2211 rth->rt_multipath_alg = res->fi->fib_mp_alg;
2212 if (res->fi->fib_nhs > 1)
2213 rth->u.dst.flags |= DST_BALANCED;
2215 #endif
2216 if (in_dev->cnf.no_xfrm)
2217 rth->u.dst.flags |= DST_NOXFRM;
2218 if (in_dev->cnf.no_policy)
2219 rth->u.dst.flags |= DST_NOPOLICY;
2221 rth->fl.fl4_dst = oldflp->fl4_dst;
2222 rth->fl.fl4_tos = tos;
2223 rth->fl.fl4_src = oldflp->fl4_src;
2224 rth->fl.oif = oldflp->oif;
2225 #ifdef CONFIG_IP_ROUTE_FWMARK
2226 rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
2227 #endif
2228 rth->rt_dst = fl->fl4_dst;
2229 rth->rt_src = fl->fl4_src;
2230 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
2231 /* get references to the devices that are to be hold by the routing
2232 cache entry */
2233 rth->u.dst.dev = dev_out;
2234 dev_hold(dev_out);
2235 rth->idev = in_dev_get(dev_out);
2236 rth->rt_gateway = fl->fl4_dst;
2237 rth->rt_spec_dst= fl->fl4_src;
2239 rth->u.dst.output=ip_output;
2241 RT_CACHE_STAT_INC(out_slow_tot);
2243 if (flags & RTCF_LOCAL) {
2244 rth->u.dst.input = ip_local_deliver;
2245 rth->rt_spec_dst = fl->fl4_dst;
2247 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2248 rth->rt_spec_dst = fl->fl4_src;
2249 if (flags & RTCF_LOCAL &&
2250 !(dev_out->flags & IFF_LOOPBACK)) {
2251 rth->u.dst.output = ip_mc_output;
2252 RT_CACHE_STAT_INC(out_slow_mc);
2254 #ifdef CONFIG_IP_MROUTE
2255 if (res->type == RTN_MULTICAST) {
2256 if (IN_DEV_MFORWARD(in_dev) &&
2257 !LOCAL_MCAST(oldflp->fl4_dst)) {
2258 rth->u.dst.input = ip_mr_input;
2259 rth->u.dst.output = ip_mc_output;
2262 #endif
2265 rt_set_nexthop(rth, res, 0);
2267 rth->rt_flags = flags;
2269 *result = rth;
2270 cleanup:
2271 /* release work reference to inet device */
2272 in_dev_put(in_dev);
2274 return err;
2277 static inline int ip_mkroute_output_def(struct rtable **rp,
2278 struct fib_result* res,
2279 const struct flowi *fl,
2280 const struct flowi *oldflp,
2281 struct net_device *dev_out,
2282 unsigned flags)
2284 struct rtable *rth = NULL;
2285 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2286 unsigned hash;
2287 if (err == 0) {
2288 hash = rt_hash_code(oldflp->fl4_dst,
2289 oldflp->fl4_src ^ (oldflp->oif << 5));
2290 err = rt_intern_hash(hash, rth, rp);
2293 return err;
/*
 * Create the output route(s) for a resolved flow.
 *
 * Without CONFIG_IP_ROUTE_MULTIPATH_CACHED, or when the FIB result has a
 * single next hop, this is ip_mkroute_output_def().  Otherwise one cache
 * entry is created and hashed per next hop, each successfully hashed
 * entry is announced to the multipath implementation, and *@rp is left
 * pointing at the entry returned for the last hop.
 *
 * Returns 0 on success or the first error encountered.
 */
static inline int ip_mkroute_output(struct rtable **rp,
				    struct fib_result *res,
				    const struct flowi *fl,
				    const struct flowi *oldflp,
				    struct net_device *dev_out,
				    unsigned flags)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	unsigned char hop;
	unsigned hash;
	int err = -EINVAL;
	struct rtable *rth = NULL;

	if (res->fi && res->fi->fib_nhs > 1) {
		unsigned char hopcount = res->fi->fib_nhs;

		for (hop = 0; hop < hopcount; hop++) {
			struct net_device *dev2nexthop;

			res->nh_sel = hop;

			/* hold a work reference to the output device */
			dev2nexthop = FIB_RES_DEV(*res);
			dev_hold(dev2nexthop);

			/* put reference to previous result */
			if (hop)
				ip_rt_put(*rp);

			err = __mkroute_output(&rth, res, fl, oldflp,
					       dev2nexthop, flags);

			if (err != 0)
				goto cleanup;

			hash = rt_hash_code(oldflp->fl4_dst,
					    oldflp->fl4_src ^
					    (oldflp->oif << 5));
			err = rt_intern_hash(hash, rth, rp);

			/* Do not advertise an entry that failed to hash.
			 * Callers in this file never release the entry
			 * after rt_intern_hash() fails, so rth is treated
			 * as no longer ours; the input-side multipath
			 * path checks the error at the same point.
			 */
			if (err != 0)
				goto cleanup;

			/* forward hop information to multipath impl. */
			multipath_set_nhinfo(rth,
					     FIB_RES_NETWORK(*res),
					     FIB_RES_NETMASK(*res),
					     res->prefixlen,
					     &FIB_RES_NH(*res));
		cleanup:
			/* release work reference to output device */
			dev_put(dev2nexthop);

			if (err != 0)
				return err;
		}
		return err;
	} else {
		return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
					     flags);
	}
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
	return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
#endif
}
2360 * Major route resolver routine.
2363 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2365 u32 tos = RT_FL_TOS(oldflp);
2366 struct flowi fl = { .nl_u = { .ip4_u =
2367 { .daddr = oldflp->fl4_dst,
2368 .saddr = oldflp->fl4_src,
2369 .tos = tos & IPTOS_RT_MASK,
2370 .scope = ((tos & RTO_ONLINK) ?
2371 RT_SCOPE_LINK :
2372 RT_SCOPE_UNIVERSE),
2373 #ifdef CONFIG_IP_ROUTE_FWMARK
2374 .fwmark = oldflp->fl4_fwmark
2375 #endif
2376 } },
2377 .iif = loopback_dev.ifindex,
2378 .oif = oldflp->oif };
2379 struct fib_result res;
2380 unsigned flags = 0;
2381 struct net_device *dev_out = NULL;
2382 int free_res = 0;
2383 int err;
2386 res.fi = NULL;
2387 #ifdef CONFIG_IP_MULTIPLE_TABLES
2388 res.r = NULL;
2389 #endif
2391 if (oldflp->fl4_src) {
2392 err = -EINVAL;
2393 if (MULTICAST(oldflp->fl4_src) ||
2394 BADCLASS(oldflp->fl4_src) ||
2395 ZERONET(oldflp->fl4_src))
2396 goto out;
2398 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2399 dev_out = ip_dev_find(oldflp->fl4_src);
2400 if (dev_out == NULL)
2401 goto out;
2403 /* I removed check for oif == dev_out->oif here.
2404 It was wrong for two reasons:
2405 1. ip_dev_find(saddr) can return wrong iface, if saddr is
2406 assigned to multiple interfaces.
2407 2. Moreover, we are allowed to send packets with saddr
2408 of another iface. --ANK
2411 if (oldflp->oif == 0
2412 && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
2413 /* Special hack: user can direct multicasts
2414 and limited broadcast via necessary interface
2415 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2416 This hack is not just for fun, it allows
2417 vic,vat and friends to work.
2418 They bind socket to loopback, set ttl to zero
2419 and expect that it will work.
2420 From the viewpoint of routing cache they are broken,
2421 because we are not allowed to build multicast path
2422 with loopback source addr (look, routing cache
2423 cannot know, that ttl is zero, so that packet
2424 will not leave this host and route is valid).
2425 Luckily, this hack is good workaround.
2428 fl.oif = dev_out->ifindex;
2429 goto make_route;
2431 if (dev_out)
2432 dev_put(dev_out);
2433 dev_out = NULL;
2437 if (oldflp->oif) {
2438 dev_out = dev_get_by_index(oldflp->oif);
2439 err = -ENODEV;
2440 if (dev_out == NULL)
2441 goto out;
2443 /* RACE: Check return value of inet_select_addr instead. */
2444 if (__in_dev_get_rtnl(dev_out) == NULL) {
2445 dev_put(dev_out);
2446 goto out; /* Wrong error code */
2449 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
2450 if (!fl.fl4_src)
2451 fl.fl4_src = inet_select_addr(dev_out, 0,
2452 RT_SCOPE_LINK);
2453 goto make_route;
2455 if (!fl.fl4_src) {
2456 if (MULTICAST(oldflp->fl4_dst))
2457 fl.fl4_src = inet_select_addr(dev_out, 0,
2458 fl.fl4_scope);
2459 else if (!oldflp->fl4_dst)
2460 fl.fl4_src = inet_select_addr(dev_out, 0,
2461 RT_SCOPE_HOST);
2465 if (!fl.fl4_dst) {
2466 fl.fl4_dst = fl.fl4_src;
2467 if (!fl.fl4_dst)
2468 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2469 if (dev_out)
2470 dev_put(dev_out);
2471 dev_out = &loopback_dev;
2472 dev_hold(dev_out);
2473 fl.oif = loopback_dev.ifindex;
2474 res.type = RTN_LOCAL;
2475 flags |= RTCF_LOCAL;
2476 goto make_route;
2479 if (fib_lookup(&fl, &res)) {
2480 res.fi = NULL;
2481 if (oldflp->oif) {
2482 /* Apparently, routing tables are wrong. Assume,
2483 that the destination is on link.
2485 WHY? DW.
2486 Because we are allowed to send to iface
2487 even if it has NO routes and NO assigned
2488 addresses. When oif is specified, routing
2489 tables are looked up with only one purpose:
2490 to catch if destination is gatewayed, rather than
2491 direct. Moreover, if MSG_DONTROUTE is set,
2492 we send packet, ignoring both routing tables
2493 and ifaddr state. --ANK
2496 We could make it even if oif is unknown,
2497 likely IPv6, but we do not.
2500 if (fl.fl4_src == 0)
2501 fl.fl4_src = inet_select_addr(dev_out, 0,
2502 RT_SCOPE_LINK);
2503 res.type = RTN_UNICAST;
2504 goto make_route;
2506 if (dev_out)
2507 dev_put(dev_out);
2508 err = -ENETUNREACH;
2509 goto out;
2511 free_res = 1;
2513 if (res.type == RTN_LOCAL) {
2514 if (!fl.fl4_src)
2515 fl.fl4_src = fl.fl4_dst;
2516 if (dev_out)
2517 dev_put(dev_out);
2518 dev_out = &loopback_dev;
2519 dev_hold(dev_out);
2520 fl.oif = dev_out->ifindex;
2521 if (res.fi)
2522 fib_info_put(res.fi);
2523 res.fi = NULL;
2524 flags |= RTCF_LOCAL;
2525 goto make_route;
2528 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2529 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2530 fib_select_multipath(&fl, &res);
2531 else
2532 #endif
2533 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2534 fib_select_default(&fl, &res);
2536 if (!fl.fl4_src)
2537 fl.fl4_src = FIB_RES_PREFSRC(res);
2539 if (dev_out)
2540 dev_put(dev_out);
2541 dev_out = FIB_RES_DEV(res);
2542 dev_hold(dev_out);
2543 fl.oif = dev_out->ifindex;
2546 make_route:
2547 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2550 if (free_res)
2551 fib_res_put(&res);
2552 if (dev_out)
2553 dev_put(dev_out);
2554 out: return err;
2557 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2559 unsigned hash;
2560 struct rtable *rth;
2562 hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5));
2564 rcu_read_lock_bh();
2565 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2566 rth = rcu_dereference(rth->u.rt_next)) {
2567 if (rth->fl.fl4_dst == flp->fl4_dst &&
2568 rth->fl.fl4_src == flp->fl4_src &&
2569 rth->fl.iif == 0 &&
2570 rth->fl.oif == flp->oif &&
2571 #ifdef CONFIG_IP_ROUTE_FWMARK
2572 rth->fl.fl4_fwmark == flp->fl4_fwmark &&
2573 #endif
2574 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2575 (IPTOS_RT_MASK | RTO_ONLINK))) {
2577 /* check for multipath routes and choose one if
2578 * necessary
2580 if (multipath_select_route(flp, rth, rp)) {
2581 dst_hold(&(*rp)->u.dst);
2582 RT_CACHE_STAT_INC(out_hit);
2583 rcu_read_unlock_bh();
2584 return 0;
2587 rth->u.dst.lastuse = jiffies;
2588 dst_hold(&rth->u.dst);
2589 rth->u.dst.__use++;
2590 RT_CACHE_STAT_INC(out_hit);
2591 rcu_read_unlock_bh();
2592 *rp = rth;
2593 return 0;
2595 RT_CACHE_STAT_INC(out_hlist_search);
2597 rcu_read_unlock_bh();
2599 return ip_route_output_slow(rp, flp);
2602 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2604 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2606 int err;
2608 if ((err = __ip_route_output_key(rp, flp)) != 0)
2609 return err;
2611 if (flp->proto) {
2612 if (!flp->fl4_src)
2613 flp->fl4_src = (*rp)->rt_src;
2614 if (!flp->fl4_dst)
2615 flp->fl4_dst = (*rp)->rt_dst;
2616 return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2619 return 0;
2622 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2624 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2626 return ip_route_output_flow(rp, flp, NULL, 0);
2629 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2630 int nowait, unsigned int flags)
2632 struct rtable *rt = (struct rtable*)skb->dst;
2633 struct rtmsg *r;
2634 struct nlmsghdr *nlh;
2635 unsigned char *b = skb->tail;
2636 struct rta_cacheinfo ci;
2637 #ifdef CONFIG_IP_MROUTE
2638 struct rtattr *eptr;
2639 #endif
2640 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
2641 r = NLMSG_DATA(nlh);
2642 r->rtm_family = AF_INET;
2643 r->rtm_dst_len = 32;
2644 r->rtm_src_len = 0;
2645 r->rtm_tos = rt->fl.fl4_tos;
2646 r->rtm_table = RT_TABLE_MAIN;
2647 r->rtm_type = rt->rt_type;
2648 r->rtm_scope = RT_SCOPE_UNIVERSE;
2649 r->rtm_protocol = RTPROT_UNSPEC;
2650 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2651 if (rt->rt_flags & RTCF_NOTIFY)
2652 r->rtm_flags |= RTM_F_NOTIFY;
2653 RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
2654 if (rt->fl.fl4_src) {
2655 r->rtm_src_len = 32;
2656 RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
2658 if (rt->u.dst.dev)
2659 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
2660 #ifdef CONFIG_NET_CLS_ROUTE
2661 if (rt->u.dst.tclassid)
2662 RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
2663 #endif
2664 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2665 if (rt->rt_multipath_alg != IP_MP_ALG_NONE) {
2666 __u32 alg = rt->rt_multipath_alg;
2668 RTA_PUT(skb, RTA_MP_ALGO, 4, &alg);
2670 #endif
2671 if (rt->fl.iif)
2672 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
2673 else if (rt->rt_src != rt->fl.fl4_src)
2674 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
2675 if (rt->rt_dst != rt->rt_gateway)
2676 RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
2677 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2678 goto rtattr_failure;
2679 ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2680 ci.rta_used = rt->u.dst.__use;
2681 ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
2682 if (rt->u.dst.expires)
2683 ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
2684 else
2685 ci.rta_expires = 0;
2686 ci.rta_error = rt->u.dst.error;
2687 ci.rta_id = ci.rta_ts = ci.rta_tsage = 0;
2688 if (rt->peer) {
2689 ci.rta_id = rt->peer->ip_id_count;
2690 if (rt->peer->tcp_ts_stamp) {
2691 ci.rta_ts = rt->peer->tcp_ts;
2692 ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2695 #ifdef CONFIG_IP_MROUTE
2696 eptr = (struct rtattr*)skb->tail;
2697 #endif
2698 RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2699 if (rt->fl.iif) {
2700 #ifdef CONFIG_IP_MROUTE
2701 u32 dst = rt->rt_dst;
2703 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2704 ipv4_devconf.mc_forwarding) {
2705 int err = ipmr_get_route(skb, r, nowait);
2706 if (err <= 0) {
2707 if (!nowait) {
2708 if (err == 0)
2709 return 0;
2710 goto nlmsg_failure;
2711 } else {
2712 if (err == -EMSGSIZE)
2713 goto nlmsg_failure;
2714 ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
2717 } else
2718 #endif
2719 RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
2722 nlh->nlmsg_len = skb->tail - b;
2723 return skb->len;
2725 nlmsg_failure:
2726 rtattr_failure:
2727 skb_trim(skb, b - skb->data);
2728 return -1;
2731 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2733 struct rtattr **rta = arg;
2734 struct rtmsg *rtm = NLMSG_DATA(nlh);
2735 struct rtable *rt = NULL;
2736 u32 dst = 0;
2737 u32 src = 0;
2738 int iif = 0;
2739 int err = -ENOBUFS;
2740 struct sk_buff *skb;
2742 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2743 if (!skb)
2744 goto out;
2746 /* Reserve room for dummy headers, this skb can pass
2747 through good chunk of routing engine.
2749 skb->mac.raw = skb->nh.raw = skb->data;
2751 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2752 skb->nh.iph->protocol = IPPROTO_ICMP;
2753 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2755 if (rta[RTA_SRC - 1])
2756 memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
2757 if (rta[RTA_DST - 1])
2758 memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
2759 if (rta[RTA_IIF - 1])
2760 memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
2762 if (iif) {
2763 struct net_device *dev = __dev_get_by_index(iif);
2764 err = -ENODEV;
2765 if (!dev)
2766 goto out_free;
2767 skb->protocol = htons(ETH_P_IP);
2768 skb->dev = dev;
2769 local_bh_disable();
2770 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2771 local_bh_enable();
2772 rt = (struct rtable*)skb->dst;
2773 if (!err && rt->u.dst.error)
2774 err = -rt->u.dst.error;
2775 } else {
2776 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
2777 .saddr = src,
2778 .tos = rtm->rtm_tos } } };
2779 int oif = 0;
2780 if (rta[RTA_OIF - 1])
2781 memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
2782 fl.oif = oif;
2783 err = ip_route_output_key(&rt, &fl);
2785 if (err)
2786 goto out_free;
2788 skb->dst = &rt->u.dst;
2789 if (rtm->rtm_flags & RTM_F_NOTIFY)
2790 rt->rt_flags |= RTCF_NOTIFY;
2792 NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2794 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2795 RTM_NEWROUTE, 0, 0);
2796 if (!err)
2797 goto out_free;
2798 if (err < 0) {
2799 err = -EMSGSIZE;
2800 goto out_free;
2803 err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
2804 if (err > 0)
2805 err = 0;
2806 out: return err;
2808 out_free:
2809 kfree_skb(skb);
2810 goto out;
2813 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2815 struct rtable *rt;
2816 int h, s_h;
2817 int idx, s_idx;
2819 s_h = cb->args[0];
2820 s_idx = idx = cb->args[1];
2821 for (h = 0; h <= rt_hash_mask; h++) {
2822 if (h < s_h) continue;
2823 if (h > s_h)
2824 s_idx = 0;
2825 rcu_read_lock_bh();
2826 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2827 rt = rcu_dereference(rt->u.rt_next), idx++) {
2828 if (idx < s_idx)
2829 continue;
2830 skb->dst = dst_clone(&rt->u.dst);
2831 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2832 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2833 1, NLM_F_MULTI) <= 0) {
2834 dst_release(xchg(&skb->dst, NULL));
2835 rcu_read_unlock_bh();
2836 goto done;
2838 dst_release(xchg(&skb->dst, NULL));
2840 rcu_read_unlock_bh();
2843 done:
2844 cb->args[0] = h;
2845 cb->args[1] = idx;
2846 return skb->len;
/*
 * Multicast event hook: flushes the route cache with a delay argument of 0.
 * The in_dev argument is not used.
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	const int delay = 0;

	rt_cache_flush(delay);
}
2854 #ifdef CONFIG_SYSCTL
2855 static int flush_delay;
2857 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2858 struct file *filp, void __user *buffer,
2859 size_t *lenp, loff_t *ppos)
2861 if (write) {
2862 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2863 rt_cache_flush(flush_delay);
2864 return 0;
2867 return -EINVAL;
2870 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2871 int __user *name,
2872 int nlen,
2873 void __user *oldval,
2874 size_t __user *oldlenp,
2875 void __user *newval,
2876 size_t newlen,
2877 void **context)
2879 int delay;
2880 if (newlen != sizeof(int))
2881 return -EINVAL;
2882 if (get_user(delay, (int __user *)newval))
2883 return -EFAULT;
2884 rt_cache_flush(delay);
2885 return 0;
2888 ctl_table ipv4_route_table[] = {
2890 .ctl_name = NET_IPV4_ROUTE_FLUSH,
2891 .procname = "flush",
2892 .data = &flush_delay,
2893 .maxlen = sizeof(int),
2894 .mode = 0200,
2895 .proc_handler = &ipv4_sysctl_rtcache_flush,
2896 .strategy = &ipv4_sysctl_rtcache_flush_strategy,
2899 .ctl_name = NET_IPV4_ROUTE_MIN_DELAY,
2900 .procname = "min_delay",
2901 .data = &ip_rt_min_delay,
2902 .maxlen = sizeof(int),
2903 .mode = 0644,
2904 .proc_handler = &proc_dointvec_jiffies,
2905 .strategy = &sysctl_jiffies,
2908 .ctl_name = NET_IPV4_ROUTE_MAX_DELAY,
2909 .procname = "max_delay",
2910 .data = &ip_rt_max_delay,
2911 .maxlen = sizeof(int),
2912 .mode = 0644,
2913 .proc_handler = &proc_dointvec_jiffies,
2914 .strategy = &sysctl_jiffies,
2917 .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
2918 .procname = "gc_thresh",
2919 .data = &ipv4_dst_ops.gc_thresh,
2920 .maxlen = sizeof(int),
2921 .mode = 0644,
2922 .proc_handler = &proc_dointvec,
2925 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
2926 .procname = "max_size",
2927 .data = &ip_rt_max_size,
2928 .maxlen = sizeof(int),
2929 .mode = 0644,
2930 .proc_handler = &proc_dointvec,
2933 /* Deprecated. Use gc_min_interval_ms */
2935 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2936 .procname = "gc_min_interval",
2937 .data = &ip_rt_gc_min_interval,
2938 .maxlen = sizeof(int),
2939 .mode = 0644,
2940 .proc_handler = &proc_dointvec_jiffies,
2941 .strategy = &sysctl_jiffies,
2944 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2945 .procname = "gc_min_interval_ms",
2946 .data = &ip_rt_gc_min_interval,
2947 .maxlen = sizeof(int),
2948 .mode = 0644,
2949 .proc_handler = &proc_dointvec_ms_jiffies,
2950 .strategy = &sysctl_ms_jiffies,
2953 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
2954 .procname = "gc_timeout",
2955 .data = &ip_rt_gc_timeout,
2956 .maxlen = sizeof(int),
2957 .mode = 0644,
2958 .proc_handler = &proc_dointvec_jiffies,
2959 .strategy = &sysctl_jiffies,
2962 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
2963 .procname = "gc_interval",
2964 .data = &ip_rt_gc_interval,
2965 .maxlen = sizeof(int),
2966 .mode = 0644,
2967 .proc_handler = &proc_dointvec_jiffies,
2968 .strategy = &sysctl_jiffies,
2971 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
2972 .procname = "redirect_load",
2973 .data = &ip_rt_redirect_load,
2974 .maxlen = sizeof(int),
2975 .mode = 0644,
2976 .proc_handler = &proc_dointvec,
2979 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2980 .procname = "redirect_number",
2981 .data = &ip_rt_redirect_number,
2982 .maxlen = sizeof(int),
2983 .mode = 0644,
2984 .proc_handler = &proc_dointvec,
2987 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2988 .procname = "redirect_silence",
2989 .data = &ip_rt_redirect_silence,
2990 .maxlen = sizeof(int),
2991 .mode = 0644,
2992 .proc_handler = &proc_dointvec,
2995 .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
2996 .procname = "error_cost",
2997 .data = &ip_rt_error_cost,
2998 .maxlen = sizeof(int),
2999 .mode = 0644,
3000 .proc_handler = &proc_dointvec,
3003 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
3004 .procname = "error_burst",
3005 .data = &ip_rt_error_burst,
3006 .maxlen = sizeof(int),
3007 .mode = 0644,
3008 .proc_handler = &proc_dointvec,
3011 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
3012 .procname = "gc_elasticity",
3013 .data = &ip_rt_gc_elasticity,
3014 .maxlen = sizeof(int),
3015 .mode = 0644,
3016 .proc_handler = &proc_dointvec,
3019 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
3020 .procname = "mtu_expires",
3021 .data = &ip_rt_mtu_expires,
3022 .maxlen = sizeof(int),
3023 .mode = 0644,
3024 .proc_handler = &proc_dointvec_jiffies,
3025 .strategy = &sysctl_jiffies,
3028 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
3029 .procname = "min_pmtu",
3030 .data = &ip_rt_min_pmtu,
3031 .maxlen = sizeof(int),
3032 .mode = 0644,
3033 .proc_handler = &proc_dointvec,
3036 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
3037 .procname = "min_adv_mss",
3038 .data = &ip_rt_min_advmss,
3039 .maxlen = sizeof(int),
3040 .mode = 0644,
3041 .proc_handler = &proc_dointvec,
3044 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
3045 .procname = "secret_interval",
3046 .data = &ip_rt_secret_interval,
3047 .maxlen = sizeof(int),
3048 .mode = 0644,
3049 .proc_handler = &proc_dointvec_jiffies,
3050 .strategy = &sysctl_jiffies,
3052 { .ctl_name = 0 }
3054 #endif
3056 #ifdef CONFIG_NET_CLS_ROUTE
/* Base of the route accounting counters; ip_rt_init() allocates room for
 * 256 entries per CPU. */
struct ip_rt_acct *ip_rt_acct;

/* This code sucks.  But you should have seen it before! --RR */

/*
 * IP route accounting ptr for this logical cpu number.
 * The argument is parenthesized so that an expression such as "cpu + 1"
 * is scaled as a whole instead of only its last term.
 */
#define IP_RT_ACCT_CPU(i) (ip_rt_acct + (i) * 256)
3064 #ifdef CONFIG_PROC_FS
3065 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
3066 int length, int *eof, void *data)
3068 unsigned int i;
3070 if ((offset & 3) || (length & 3))
3071 return -EIO;
3073 if (offset >= sizeof(struct ip_rt_acct) * 256) {
3074 *eof = 1;
3075 return 0;
3078 if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
3079 length = sizeof(struct ip_rt_acct) * 256 - offset;
3080 *eof = 1;
3083 offset /= sizeof(u32);
3085 if (length > 0) {
3086 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
3087 u32 *dst = (u32 *) buffer;
3089 /* Copy first cpu. */
3090 *start = buffer;
3091 memcpy(dst, src, length);
3093 /* Add the other cpus in, one int at a time */
3094 for_each_possible_cpu(i) {
3095 unsigned int j;
3097 src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
3099 for (j = 0; j < length/4; j++)
3100 dst[j] += src[j];
3103 return length;
3105 #endif /* CONFIG_PROC_FS */
3106 #endif /* CONFIG_NET_CLS_ROUTE */
3108 static __initdata unsigned long rhash_entries;
3109 static int __init set_rhash_entries(char *str)
3111 if (!str)
3112 return 0;
3113 rhash_entries = simple_strtoul(str, &str, 0);
3114 return 1;
3116 __setup("rhash_entries=", set_rhash_entries);
3118 int __init ip_rt_init(void)
3120 int rc = 0;
3122 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3123 (jiffies ^ (jiffies >> 7)));
3125 #ifdef CONFIG_NET_CLS_ROUTE
3127 int order;
3128 for (order = 0;
3129 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3130 /* NOTHING */;
3131 ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
3132 if (!ip_rt_acct)
3133 panic("IP: failed to allocate ip_rt_acct\n");
3134 memset(ip_rt_acct, 0, PAGE_SIZE << order);
3136 #endif
3138 ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
3139 sizeof(struct rtable),
3140 0, SLAB_HWCACHE_ALIGN,
3141 NULL, NULL);
3143 if (!ipv4_dst_ops.kmem_cachep)
3144 panic("IP: failed to allocate ip_dst_cache\n");
3146 rt_hash_table = (struct rt_hash_bucket *)
3147 alloc_large_system_hash("IP route cache",
3148 sizeof(struct rt_hash_bucket),
3149 rhash_entries,
3150 (num_physpages >= 128 * 1024) ?
3151 15 : 17,
3152 HASH_HIGHMEM,
3153 &rt_hash_log,
3154 &rt_hash_mask,
3156 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3157 rt_hash_lock_init();
3159 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3160 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3162 devinet_init();
3163 ip_fib_init();
3165 init_timer(&rt_flush_timer);
3166 rt_flush_timer.function = rt_run_flush;
3167 init_timer(&rt_periodic_timer);
3168 rt_periodic_timer.function = rt_check_expire;
3169 init_timer(&rt_secret_timer);
3170 rt_secret_timer.function = rt_secret_rebuild;
3172 /* All the timers, started at system startup tend
3173 to synchronize. Perturb it a bit.
3175 rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3176 ip_rt_gc_interval;
3177 add_timer(&rt_periodic_timer);
3179 rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3180 ip_rt_secret_interval;
3181 add_timer(&rt_secret_timer);
3183 #ifdef CONFIG_PROC_FS
3185 struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3186 if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3187 !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
3188 proc_net_stat))) {
3189 return -ENOMEM;
3191 rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3193 #ifdef CONFIG_NET_CLS_ROUTE
3194 create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
3195 #endif
3196 #endif
3197 #ifdef CONFIG_XFRM
3198 xfrm_init();
3199 xfrm4_init();
3200 #endif
3201 return rc;
/* Symbols exported to modules. */
EXPORT_SYMBOL(__ip_select_ident);
EXPORT_SYMBOL(ip_route_input);
EXPORT_SYMBOL(ip_route_output_key);