2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * IPv4 Forwarding Information Base: semantics.
8 * Version: $Id: fib_semantics.c,v 1.19 2002/01/12 07:54:56 davem Exp $
10 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version.
18 #include <linux/config.h>
19 #include <asm/uaccess.h>
20 #include <asm/system.h>
21 #include <linux/bitops.h>
22 #include <linux/types.h>
23 #include <linux/kernel.h>
24 #include <linux/jiffies.h>
26 #include <linux/string.h>
27 #include <linux/socket.h>
28 #include <linux/sockios.h>
29 #include <linux/errno.h>
31 #include <linux/inet.h>
32 #include <linux/netdevice.h>
33 #include <linux/if_arp.h>
34 #include <linux/proc_fs.h>
35 #include <linux/skbuff.h>
36 #include <linux/netlink.h>
37 #include <linux/init.h>
40 #include <net/protocol.h>
41 #include <net/route.h>
44 #include <net/ip_fib.h>
45 #include <net/ip_mp_alg.h>
47 #include "fib_lookup.h"
49 #define FSprintk(a...)
51 static DEFINE_RWLOCK(fib_info_lock
);
52 static struct hlist_head
*fib_info_hash
;
53 static struct hlist_head
*fib_info_laddrhash
;
54 static unsigned int fib_hash_size
;
55 static unsigned int fib_info_cnt
;
57 #define DEVINDEX_HASHBITS 8
58 #define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
59 static struct hlist_head fib_info_devhash
[DEVINDEX_HASHSIZE
];
/*
 * Nexthop iteration helpers.
 *
 * for_nexthops(fi) and change_nexthops(fi) each open a block that declares
 * two locals visible to the loop body:
 *   nhsel - index of the current nexthop
 *   nh    - pointer to the current nexthop (const for for_nexthops,
 *           writable for change_nexthops, which casts the const away)
 * The block must be closed with endfor_nexthops(fi).
 */
#ifdef CONFIG_IP_ROUTE_MULTIPATH

/* Taken with spin_lock_bh() around updates of fib_power / nh_power. */
static DEFINE_SPINLOCK(fib_multipath_lock);

#define for_nexthops(fi) {						\
	int nhsel = 0;							\
	const struct fib_nh *nh = (fi)->fib_nh;				\
	for (; nhsel < (fi)->fib_nhs; nh++, nhsel++)

#define change_nexthops(fi) {						\
	int nhsel = 0;							\
	struct fib_nh *nh = (struct fib_nh *)((fi)->fib_nh);		\
	for (; nhsel < (fi)->fib_nhs; nh++, nhsel++)

#else /* CONFIG_IP_ROUTE_MULTIPATH */

/*
 * Without multipath there is exactly one nexthop; the loop runs once.
 * Hopefully gcc optimizes the dummy loop away.
 */

#define for_nexthops(fi) {						\
	int nhsel = 0;							\
	const struct fib_nh *nh = (fi)->fib_nh;				\
	for (; nhsel < 1; nhsel++)

#define change_nexthops(fi) {						\
	int nhsel = 0;							\
	struct fib_nh *nh = (struct fib_nh *)((fi)->fib_nh);		\
	for (; nhsel < 1; nhsel++)

#endif /* CONFIG_IP_ROUTE_MULTIPATH */

/* Closes the block opened by for_nexthops() / change_nexthops(). */
#define endfor_nexthops(fi) }
90 } fib_props
[RTA_MAX
+ 1] = {
93 .scope
= RT_SCOPE_NOWHERE
,
97 .scope
= RT_SCOPE_UNIVERSE
,
101 .scope
= RT_SCOPE_HOST
,
105 .scope
= RT_SCOPE_LINK
,
106 }, /* RTN_BROADCAST */
109 .scope
= RT_SCOPE_LINK
,
113 .scope
= RT_SCOPE_UNIVERSE
,
114 }, /* RTN_MULTICAST */
117 .scope
= RT_SCOPE_UNIVERSE
,
118 }, /* RTN_BLACKHOLE */
120 .error
= -EHOSTUNREACH
,
121 .scope
= RT_SCOPE_UNIVERSE
,
122 }, /* RTN_UNREACHABLE */
125 .scope
= RT_SCOPE_UNIVERSE
,
126 }, /* RTN_PROHIBIT */
129 .scope
= RT_SCOPE_UNIVERSE
,
133 .scope
= RT_SCOPE_NOWHERE
,
137 .scope
= RT_SCOPE_NOWHERE
,
138 }, /* RTN_XRESOLVE */
142 /* Release a nexthop info record */
144 void free_fib_info(struct fib_info
*fi
)
146 if (fi
->fib_dead
== 0) {
147 printk("Freeing alive fib_info %p\n", fi
);
150 change_nexthops(fi
) {
154 } endfor_nexthops(fi
);
159 void fib_release_info(struct fib_info
*fi
)
161 write_lock(&fib_info_lock
);
162 if (fi
&& --fi
->fib_treeref
== 0) {
163 hlist_del(&fi
->fib_hash
);
165 hlist_del(&fi
->fib_lhash
);
166 change_nexthops(fi
) {
169 hlist_del(&nh
->nh_hash
);
170 } endfor_nexthops(fi
)
174 write_unlock(&fib_info_lock
);
177 static __inline__
int nh_comp(const struct fib_info
*fi
, const struct fib_info
*ofi
)
179 const struct fib_nh
*onh
= ofi
->fib_nh
;
182 if (nh
->nh_oif
!= onh
->nh_oif
||
183 nh
->nh_gw
!= onh
->nh_gw
||
184 nh
->nh_scope
!= onh
->nh_scope
||
185 #ifdef CONFIG_IP_ROUTE_MULTIPATH
186 nh
->nh_weight
!= onh
->nh_weight
||
188 #ifdef CONFIG_NET_CLS_ROUTE
189 nh
->nh_tclassid
!= onh
->nh_tclassid
||
191 ((nh
->nh_flags
^onh
->nh_flags
)&~RTNH_F_DEAD
))
194 } endfor_nexthops(fi
);
198 static inline unsigned int fib_info_hashfn(const struct fib_info
*fi
)
200 unsigned int mask
= (fib_hash_size
- 1);
201 unsigned int val
= fi
->fib_nhs
;
203 val
^= fi
->fib_protocol
;
204 val
^= fi
->fib_prefsrc
;
205 val
^= fi
->fib_priority
;
207 return (val
^ (val
>> 7) ^ (val
>> 12)) & mask
;
210 static struct fib_info
*fib_find_info(const struct fib_info
*nfi
)
212 struct hlist_head
*head
;
213 struct hlist_node
*node
;
217 hash
= fib_info_hashfn(nfi
);
218 head
= &fib_info_hash
[hash
];
220 hlist_for_each_entry(fi
, node
, head
, fib_hash
) {
221 if (fi
->fib_nhs
!= nfi
->fib_nhs
)
223 if (nfi
->fib_protocol
== fi
->fib_protocol
&&
224 nfi
->fib_prefsrc
== fi
->fib_prefsrc
&&
225 nfi
->fib_priority
== fi
->fib_priority
&&
226 memcmp(nfi
->fib_metrics
, fi
->fib_metrics
,
227 sizeof(fi
->fib_metrics
)) == 0 &&
228 ((nfi
->fib_flags
^fi
->fib_flags
)&~RTNH_F_DEAD
) == 0 &&
229 (nfi
->fib_nhs
== 0 || nh_comp(fi
, nfi
) == 0))
236 static inline unsigned int fib_devindex_hashfn(unsigned int val
)
238 unsigned int mask
= DEVINDEX_HASHSIZE
- 1;
241 (val
>> DEVINDEX_HASHBITS
) ^
242 (val
>> (DEVINDEX_HASHBITS
* 2))) & mask
;
/* Check that the gateway is already configured.
   Used only by the redirect accept routine.
 */
249 int ip_fib_check_default(u32 gw
, struct net_device
*dev
)
251 struct hlist_head
*head
;
252 struct hlist_node
*node
;
256 read_lock(&fib_info_lock
);
258 hash
= fib_devindex_hashfn(dev
->ifindex
);
259 head
= &fib_info_devhash
[hash
];
260 hlist_for_each_entry(nh
, node
, head
, nh_hash
) {
261 if (nh
->nh_dev
== dev
&&
263 !(nh
->nh_flags
&RTNH_F_DEAD
)) {
264 read_unlock(&fib_info_lock
);
269 read_unlock(&fib_info_lock
);
274 void rtmsg_fib(int event
, u32 key
, struct fib_alias
*fa
,
276 struct nlmsghdr
*n
, struct netlink_skb_parms
*req
)
279 u32 pid
= req
? req
->pid
: n
->nlmsg_pid
;
280 int size
= NLMSG_SPACE(sizeof(struct rtmsg
)+256);
282 skb
= alloc_skb(size
, GFP_KERNEL
);
286 if (fib_dump_info(skb
, pid
, n
->nlmsg_seq
, event
, tb_id
,
287 fa
->fa_type
, fa
->fa_scope
, &key
, z
,
289 fa
->fa_info
, 0) < 0) {
293 NETLINK_CB(skb
).dst_groups
= RTMGRP_IPV4_ROUTE
;
294 if (n
->nlmsg_flags
&NLM_F_ECHO
)
295 atomic_inc(&skb
->users
);
296 netlink_broadcast(rtnl
, skb
, pid
, RTMGRP_IPV4_ROUTE
, GFP_KERNEL
);
297 if (n
->nlmsg_flags
&NLM_F_ECHO
)
298 netlink_unicast(rtnl
, skb
, pid
, MSG_DONTWAIT
);
301 /* Return the first fib alias matching TOS with
302 * priority less than or equal to PRIO.
304 struct fib_alias
*fib_find_alias(struct list_head
*fah
, u8 tos
, u32 prio
)
307 struct fib_alias
*fa
;
308 list_for_each_entry(fa
, fah
, fa_list
) {
309 if (fa
->fa_tos
> tos
)
311 if (fa
->fa_info
->fib_priority
>= prio
||
319 int fib_detect_death(struct fib_info
*fi
, int order
,
320 struct fib_info
**last_resort
, int *last_idx
, int *dflt
)
323 int state
= NUD_NONE
;
325 n
= neigh_lookup(&arp_tbl
, &fi
->fib_nh
[0].nh_gw
, fi
->fib_dev
);
327 state
= n
->nud_state
;
330 if (state
==NUD_REACHABLE
)
332 if ((state
&NUD_VALID
) && order
!= *dflt
)
334 if ((state
&NUD_VALID
) ||
335 (*last_idx
<0 && order
> *dflt
)) {
342 #ifdef CONFIG_IP_ROUTE_MULTIPATH
344 static u32
fib_get_attr32(struct rtattr
*attr
, int attrlen
, int type
)
346 while (RTA_OK(attr
,attrlen
)) {
347 if (attr
->rta_type
== type
)
348 return *(u32
*)RTA_DATA(attr
);
349 attr
= RTA_NEXT(attr
, attrlen
);
355 fib_count_nexthops(struct rtattr
*rta
)
358 struct rtnexthop
*nhp
= RTA_DATA(rta
);
359 int nhlen
= RTA_PAYLOAD(rta
);
361 while (nhlen
>= (int)sizeof(struct rtnexthop
)) {
362 if ((nhlen
-= nhp
->rtnh_len
) < 0)
365 nhp
= RTNH_NEXT(nhp
);
371 fib_get_nhs(struct fib_info
*fi
, const struct rtattr
*rta
, const struct rtmsg
*r
)
373 struct rtnexthop
*nhp
= RTA_DATA(rta
);
374 int nhlen
= RTA_PAYLOAD(rta
);
376 change_nexthops(fi
) {
377 int attrlen
= nhlen
- sizeof(struct rtnexthop
);
378 if (attrlen
< 0 || (nhlen
-= nhp
->rtnh_len
) < 0)
380 nh
->nh_flags
= (r
->rtm_flags
&~0xFF) | nhp
->rtnh_flags
;
381 nh
->nh_oif
= nhp
->rtnh_ifindex
;
382 nh
->nh_weight
= nhp
->rtnh_hops
+ 1;
384 nh
->nh_gw
= fib_get_attr32(RTNH_DATA(nhp
), attrlen
, RTA_GATEWAY
);
385 #ifdef CONFIG_NET_CLS_ROUTE
386 nh
->nh_tclassid
= fib_get_attr32(RTNH_DATA(nhp
), attrlen
, RTA_FLOW
);
389 nhp
= RTNH_NEXT(nhp
);
390 } endfor_nexthops(fi
);
396 int fib_nh_match(struct rtmsg
*r
, struct nlmsghdr
*nlh
, struct kern_rta
*rta
,
399 #ifdef CONFIG_IP_ROUTE_MULTIPATH
400 struct rtnexthop
*nhp
;
404 if (rta
->rta_priority
&&
405 *rta
->rta_priority
!= fi
->fib_priority
)
408 if (rta
->rta_oif
|| rta
->rta_gw
) {
409 if ((!rta
->rta_oif
|| *rta
->rta_oif
== fi
->fib_nh
->nh_oif
) &&
410 (!rta
->rta_gw
|| memcmp(rta
->rta_gw
, &fi
->fib_nh
->nh_gw
, 4) == 0))
415 #ifdef CONFIG_IP_ROUTE_MULTIPATH
416 if (rta
->rta_mp
== NULL
)
418 nhp
= RTA_DATA(rta
->rta_mp
);
419 nhlen
= RTA_PAYLOAD(rta
->rta_mp
);
422 int attrlen
= nhlen
- sizeof(struct rtnexthop
);
425 if (attrlen
< 0 || (nhlen
-= nhp
->rtnh_len
) < 0)
427 if (nhp
->rtnh_ifindex
&& nhp
->rtnh_ifindex
!= nh
->nh_oif
)
430 gw
= fib_get_attr32(RTNH_DATA(nhp
), attrlen
, RTA_GATEWAY
);
431 if (gw
&& gw
!= nh
->nh_gw
)
433 #ifdef CONFIG_NET_CLS_ROUTE
434 gw
= fib_get_attr32(RTNH_DATA(nhp
), attrlen
, RTA_FLOW
);
435 if (gw
&& gw
!= nh
->nh_tclassid
)
439 nhp
= RTNH_NEXT(nhp
);
440 } endfor_nexthops(fi
);
/*
   Semantics of nexthop is very messy for historical reasons.
   We have to take into account that:
   a) the gateway can actually be a local interface address,
      so that a gatewayed route is direct.
   b) the gateway must be an on-link address, possibly
      described not by an ifaddr, but also by a direct route.
   c) If both gateway and interface are specified, they should not
      contradict.
   d) If we use tunnel routes, the gateway could be not on-link.

   An attempt to reconcile all of these (alas, self-contradictory) conditions
   results in pretty ugly and hairy code with obscure logic.

   I chose to generalize it instead, so that the size
   of the code does not increase practically, but it becomes
   much more general.
   Every prefix is assigned a "scope" value: "host" is a local address,
   "link" is a direct route,
   [ ... "site" ... "interior" ... ]
   and "universe" is a true gateway route with global meaning.

   Every prefix refers to a set of "nexthop"s (gw, oif),
   where gw must have narrower scope. This recursion stops
   when gw has LOCAL scope or if the "nexthop" is declared ONLINK,
   which means that gw is forced to be on link.

   The code is still hairy, but now it is apparently logically
   consistent and very flexible. For example, as a by-product it allows
   independent exterior and interior routing processes
   to co-exist in peace.

   Normally it looks as follows.

   {universe prefix}  -> (gw, oif) [scope link]
                          |
                          |-> {link prefix} -> (gw, oif) [scope local]
                                                |
                                                |-> {local prefix} (terminal node)
 */
490 static int fib_check_nh(const struct rtmsg
*r
, struct fib_info
*fi
, struct fib_nh
*nh
)
495 struct fib_result res
;
497 #ifdef CONFIG_IP_ROUTE_PERVASIVE
498 if (nh
->nh_flags
&RTNH_F_PERVASIVE
)
501 if (nh
->nh_flags
&RTNH_F_ONLINK
) {
502 struct net_device
*dev
;
504 if (r
->rtm_scope
>= RT_SCOPE_LINK
)
506 if (inet_addr_type(nh
->nh_gw
) != RTN_UNICAST
)
508 if ((dev
= __dev_get_by_index(nh
->nh_oif
)) == NULL
)
510 if (!(dev
->flags
&IFF_UP
))
514 nh
->nh_scope
= RT_SCOPE_LINK
;
518 struct flowi fl
= { .nl_u
= { .ip4_u
=
519 { .daddr
= nh
->nh_gw
,
520 .scope
= r
->rtm_scope
+ 1 } },
523 /* It is not necessary, but requires a bit of thinking */
524 if (fl
.fl4_scope
< RT_SCOPE_LINK
)
525 fl
.fl4_scope
= RT_SCOPE_LINK
;
526 if ((err
= fib_lookup(&fl
, &res
)) != 0)
530 if (res
.type
!= RTN_UNICAST
&& res
.type
!= RTN_LOCAL
)
532 nh
->nh_scope
= res
.scope
;
533 nh
->nh_oif
= FIB_RES_OIF(res
);
534 if ((nh
->nh_dev
= FIB_RES_DEV(res
)) == NULL
)
536 dev_hold(nh
->nh_dev
);
538 if (!(nh
->nh_dev
->flags
& IFF_UP
))
545 struct in_device
*in_dev
;
547 if (nh
->nh_flags
&(RTNH_F_PERVASIVE
|RTNH_F_ONLINK
))
550 in_dev
= inetdev_by_index(nh
->nh_oif
);
553 if (!(in_dev
->dev
->flags
&IFF_UP
)) {
557 nh
->nh_dev
= in_dev
->dev
;
558 dev_hold(nh
->nh_dev
);
559 nh
->nh_scope
= RT_SCOPE_HOST
;
565 static inline unsigned int fib_laddr_hashfn(u32 val
)
567 unsigned int mask
= (fib_hash_size
- 1);
569 return (val
^ (val
>> 7) ^ (val
>> 14)) & mask
;
572 static struct hlist_head
*fib_hash_alloc(int bytes
)
574 if (bytes
<= PAGE_SIZE
)
575 return kmalloc(bytes
, GFP_KERNEL
);
577 return (struct hlist_head
*)
578 __get_free_pages(GFP_KERNEL
, get_order(bytes
));
581 static void fib_hash_free(struct hlist_head
*hash
, int bytes
)
586 if (bytes
<= PAGE_SIZE
)
589 free_pages((unsigned long) hash
, get_order(bytes
));
592 static void fib_hash_move(struct hlist_head
*new_info_hash
,
593 struct hlist_head
*new_laddrhash
,
594 unsigned int new_size
)
596 unsigned int old_size
= fib_hash_size
;
599 write_lock(&fib_info_lock
);
600 fib_hash_size
= new_size
;
602 for (i
= 0; i
< old_size
; i
++) {
603 struct hlist_head
*head
= &fib_info_hash
[i
];
604 struct hlist_node
*node
, *n
;
607 hlist_for_each_entry_safe(fi
, node
, n
, head
, fib_hash
) {
608 struct hlist_head
*dest
;
609 unsigned int new_hash
;
611 hlist_del(&fi
->fib_hash
);
613 new_hash
= fib_info_hashfn(fi
);
614 dest
= &new_info_hash
[new_hash
];
615 hlist_add_head(&fi
->fib_hash
, dest
);
618 fib_info_hash
= new_info_hash
;
620 for (i
= 0; i
< old_size
; i
++) {
621 struct hlist_head
*lhead
= &fib_info_laddrhash
[i
];
622 struct hlist_node
*node
, *n
;
625 hlist_for_each_entry_safe(fi
, node
, n
, lhead
, fib_lhash
) {
626 struct hlist_head
*ldest
;
627 unsigned int new_hash
;
629 hlist_del(&fi
->fib_lhash
);
631 new_hash
= fib_laddr_hashfn(fi
->fib_prefsrc
);
632 ldest
= &new_laddrhash
[new_hash
];
633 hlist_add_head(&fi
->fib_lhash
, ldest
);
636 fib_info_laddrhash
= new_laddrhash
;
638 write_unlock(&fib_info_lock
);
642 fib_create_info(const struct rtmsg
*r
, struct kern_rta
*rta
,
643 const struct nlmsghdr
*nlh
, int *errp
)
646 struct fib_info
*fi
= NULL
;
647 struct fib_info
*ofi
;
648 #ifdef CONFIG_IP_ROUTE_MULTIPATH
653 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
654 u32 mp_alg
= IP_MP_ALG_NONE
;
657 /* Fast check to catch the most weird cases */
658 if (fib_props
[r
->rtm_type
].scope
> r
->rtm_scope
)
661 #ifdef CONFIG_IP_ROUTE_MULTIPATH
663 nhs
= fib_count_nexthops(rta
->rta_mp
);
668 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
669 if (rta
->rta_mp_alg
) {
670 mp_alg
= *rta
->rta_mp_alg
;
672 if (mp_alg
< IP_MP_ALG_NONE
||
673 mp_alg
> IP_MP_ALG_MAX
)
679 if (fib_info_cnt
>= fib_hash_size
) {
680 unsigned int new_size
= fib_hash_size
<< 1;
681 struct hlist_head
*new_info_hash
;
682 struct hlist_head
*new_laddrhash
;
687 bytes
= new_size
* sizeof(struct hlist_head
*);
688 new_info_hash
= fib_hash_alloc(bytes
);
689 new_laddrhash
= fib_hash_alloc(bytes
);
690 if (!new_info_hash
|| !new_laddrhash
) {
691 fib_hash_free(new_info_hash
, bytes
);
692 fib_hash_free(new_laddrhash
, bytes
);
694 memset(new_info_hash
, 0, bytes
);
695 memset(new_laddrhash
, 0, bytes
);
697 fib_hash_move(new_info_hash
, new_laddrhash
, new_size
);
704 fi
= kmalloc(sizeof(*fi
)+nhs
*sizeof(struct fib_nh
), GFP_KERNEL
);
708 memset(fi
, 0, sizeof(*fi
)+nhs
*sizeof(struct fib_nh
));
710 fi
->fib_protocol
= r
->rtm_protocol
;
713 change_nexthops(fi
) {
715 } endfor_nexthops(fi
)
717 fi
->fib_flags
= r
->rtm_flags
;
718 if (rta
->rta_priority
)
719 fi
->fib_priority
= *rta
->rta_priority
;
721 int attrlen
= RTA_PAYLOAD(rta
->rta_mx
);
722 struct rtattr
*attr
= RTA_DATA(rta
->rta_mx
);
724 while (RTA_OK(attr
, attrlen
)) {
725 unsigned flavor
= attr
->rta_type
;
727 if (flavor
> RTAX_MAX
)
729 fi
->fib_metrics
[flavor
-1] = *(unsigned*)RTA_DATA(attr
);
731 attr
= RTA_NEXT(attr
, attrlen
);
734 if (rta
->rta_prefsrc
)
735 memcpy(&fi
->fib_prefsrc
, rta
->rta_prefsrc
, 4);
738 #ifdef CONFIG_IP_ROUTE_MULTIPATH
739 if ((err
= fib_get_nhs(fi
, rta
->rta_mp
, r
)) != 0)
741 if (rta
->rta_oif
&& fi
->fib_nh
->nh_oif
!= *rta
->rta_oif
)
743 if (rta
->rta_gw
&& memcmp(&fi
->fib_nh
->nh_gw
, rta
->rta_gw
, 4))
745 #ifdef CONFIG_NET_CLS_ROUTE
746 if (rta
->rta_flow
&& memcmp(&fi
->fib_nh
->nh_tclassid
, rta
->rta_flow
, 4))
753 struct fib_nh
*nh
= fi
->fib_nh
;
755 nh
->nh_oif
= *rta
->rta_oif
;
757 memcpy(&nh
->nh_gw
, rta
->rta_gw
, 4);
758 #ifdef CONFIG_NET_CLS_ROUTE
760 memcpy(&nh
->nh_tclassid
, rta
->rta_flow
, 4);
762 nh
->nh_flags
= r
->rtm_flags
;
763 #ifdef CONFIG_IP_ROUTE_MULTIPATH
768 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
769 fi
->fib_mp_alg
= mp_alg
;
772 if (fib_props
[r
->rtm_type
].error
) {
773 if (rta
->rta_gw
|| rta
->rta_oif
|| rta
->rta_mp
)
778 if (r
->rtm_scope
> RT_SCOPE_HOST
)
781 if (r
->rtm_scope
== RT_SCOPE_HOST
) {
782 struct fib_nh
*nh
= fi
->fib_nh
;
784 /* Local address is added. */
785 if (nhs
!= 1 || nh
->nh_gw
)
787 nh
->nh_scope
= RT_SCOPE_NOWHERE
;
788 nh
->nh_dev
= dev_get_by_index(fi
->fib_nh
->nh_oif
);
790 if (nh
->nh_dev
== NULL
)
793 change_nexthops(fi
) {
794 if ((err
= fib_check_nh(r
, fi
, nh
)) != 0)
796 } endfor_nexthops(fi
)
799 if (fi
->fib_prefsrc
) {
800 if (r
->rtm_type
!= RTN_LOCAL
|| rta
->rta_dst
== NULL
||
801 memcmp(&fi
->fib_prefsrc
, rta
->rta_dst
, 4))
802 if (inet_addr_type(fi
->fib_prefsrc
) != RTN_LOCAL
)
807 if ((ofi
= fib_find_info(fi
)) != NULL
) {
815 atomic_inc(&fi
->fib_clntref
);
816 write_lock(&fib_info_lock
);
817 hlist_add_head(&fi
->fib_hash
,
818 &fib_info_hash
[fib_info_hashfn(fi
)]);
819 if (fi
->fib_prefsrc
) {
820 struct hlist_head
*head
;
822 head
= &fib_info_laddrhash
[fib_laddr_hashfn(fi
->fib_prefsrc
)];
823 hlist_add_head(&fi
->fib_lhash
, head
);
825 change_nexthops(fi
) {
826 struct hlist_head
*head
;
831 hash
= fib_devindex_hashfn(nh
->nh_dev
->ifindex
);
832 head
= &fib_info_devhash
[hash
];
833 hlist_add_head(&nh
->nh_hash
, head
);
834 } endfor_nexthops(fi
)
835 write_unlock(&fib_info_lock
);
850 int fib_semantic_match(struct list_head
*head
, const struct flowi
*flp
,
851 struct fib_result
*res
, __u32 zone
, __u32 mask
,
854 struct fib_alias
*fa
;
857 list_for_each_entry(fa
, head
, fa_list
) {
861 fa
->fa_tos
!= flp
->fl4_tos
)
864 if (fa
->fa_scope
< flp
->fl4_scope
)
867 fa
->fa_state
|= FA_S_ACCESSED
;
869 err
= fib_props
[fa
->fa_type
].error
;
871 struct fib_info
*fi
= fa
->fa_info
;
873 if (fi
->fib_flags
& RTNH_F_DEAD
)
876 switch (fa
->fa_type
) {
883 if (nh
->nh_flags
&RTNH_F_DEAD
)
885 if (!flp
->oif
|| flp
->oif
== nh
->nh_oif
)
888 #ifdef CONFIG_IP_ROUTE_MULTIPATH
889 if (nhsel
< fi
->fib_nhs
) {
902 printk(KERN_DEBUG
"impossible 102\n");
911 res
->prefixlen
= prefixlen
;
912 res
->nh_sel
= nh_sel
;
913 res
->type
= fa
->fa_type
;
914 res
->scope
= fa
->fa_scope
;
915 res
->fi
= fa
->fa_info
;
916 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
918 res
->network
= zone
&
919 (0xFFFFFFFF >> (32 - prefixlen
));
921 atomic_inc(&res
->fi
->fib_clntref
);
925 /* Find appropriate source address to this destination */
927 u32
__fib_res_prefsrc(struct fib_result
*res
)
929 return inet_select_addr(FIB_RES_DEV(*res
), FIB_RES_GW(*res
), res
->scope
);
933 fib_dump_info(struct sk_buff
*skb
, u32 pid
, u32 seq
, int event
,
934 u8 tb_id
, u8 type
, u8 scope
, void *dst
, int dst_len
, u8 tos
,
935 struct fib_info
*fi
, unsigned int flags
)
938 struct nlmsghdr
*nlh
;
939 unsigned char *b
= skb
->tail
;
941 nlh
= NLMSG_NEW(skb
, pid
, seq
, event
, sizeof(*rtm
), flags
);
942 rtm
= NLMSG_DATA(nlh
);
943 rtm
->rtm_family
= AF_INET
;
944 rtm
->rtm_dst_len
= dst_len
;
945 rtm
->rtm_src_len
= 0;
947 rtm
->rtm_table
= tb_id
;
948 rtm
->rtm_type
= type
;
949 rtm
->rtm_flags
= fi
->fib_flags
;
950 rtm
->rtm_scope
= scope
;
951 if (rtm
->rtm_dst_len
)
952 RTA_PUT(skb
, RTA_DST
, 4, dst
);
953 rtm
->rtm_protocol
= fi
->fib_protocol
;
954 if (fi
->fib_priority
)
955 RTA_PUT(skb
, RTA_PRIORITY
, 4, &fi
->fib_priority
);
956 #ifdef CONFIG_NET_CLS_ROUTE
957 if (fi
->fib_nh
[0].nh_tclassid
)
958 RTA_PUT(skb
, RTA_FLOW
, 4, &fi
->fib_nh
[0].nh_tclassid
);
960 if (rtnetlink_put_metrics(skb
, fi
->fib_metrics
) < 0)
963 RTA_PUT(skb
, RTA_PREFSRC
, 4, &fi
->fib_prefsrc
);
964 if (fi
->fib_nhs
== 1) {
965 if (fi
->fib_nh
->nh_gw
)
966 RTA_PUT(skb
, RTA_GATEWAY
, 4, &fi
->fib_nh
->nh_gw
);
967 if (fi
->fib_nh
->nh_oif
)
968 RTA_PUT(skb
, RTA_OIF
, sizeof(int), &fi
->fib_nh
->nh_oif
);
970 #ifdef CONFIG_IP_ROUTE_MULTIPATH
971 if (fi
->fib_nhs
> 1) {
972 struct rtnexthop
*nhp
;
973 struct rtattr
*mp_head
;
974 if (skb_tailroom(skb
) <= RTA_SPACE(0))
976 mp_head
= (struct rtattr
*)skb_put(skb
, RTA_SPACE(0));
979 if (skb_tailroom(skb
) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp
)) + 4))
981 nhp
= (struct rtnexthop
*)skb_put(skb
, RTA_ALIGN(sizeof(*nhp
)));
982 nhp
->rtnh_flags
= nh
->nh_flags
& 0xFF;
983 nhp
->rtnh_hops
= nh
->nh_weight
-1;
984 nhp
->rtnh_ifindex
= nh
->nh_oif
;
986 RTA_PUT(skb
, RTA_GATEWAY
, 4, &nh
->nh_gw
);
987 nhp
->rtnh_len
= skb
->tail
- (unsigned char*)nhp
;
988 } endfor_nexthops(fi
);
989 mp_head
->rta_type
= RTA_MULTIPATH
;
990 mp_head
->rta_len
= skb
->tail
- (u8
*)mp_head
;
993 nlh
->nlmsg_len
= skb
->tail
- b
;
998 skb_trim(skb
, b
- skb
->data
);
1002 #ifndef CONFIG_IP_NOSIOCRT
1005 fib_convert_rtentry(int cmd
, struct nlmsghdr
*nl
, struct rtmsg
*rtm
,
1006 struct kern_rta
*rta
, struct rtentry
*r
)
1011 memset(rtm
, 0, sizeof(*rtm
));
1012 memset(rta
, 0, sizeof(*rta
));
1014 if (r
->rt_dst
.sa_family
!= AF_INET
)
1015 return -EAFNOSUPPORT
;
1017 /* Check mask for validity:
1018 a) it must be contiguous.
1019 b) destination must have all host bits clear.
1020 c) if application forgot to set correct family (AF_INET),
1021 reject request unless it is absolutely clear i.e.
1022 both family and mask are zero.
1025 ptr
= &((struct sockaddr_in
*)&r
->rt_dst
)->sin_addr
.s_addr
;
1026 if (!(r
->rt_flags
&RTF_HOST
)) {
1027 u32 mask
= ((struct sockaddr_in
*)&r
->rt_genmask
)->sin_addr
.s_addr
;
1028 if (r
->rt_genmask
.sa_family
!= AF_INET
) {
1029 if (mask
|| r
->rt_genmask
.sa_family
)
1030 return -EAFNOSUPPORT
;
1032 if (bad_mask(mask
, *ptr
))
1034 plen
= inet_mask_len(mask
);
1037 nl
->nlmsg_flags
= NLM_F_REQUEST
;
1038 nl
->nlmsg_pid
= current
->pid
;
1040 nl
->nlmsg_len
= NLMSG_LENGTH(sizeof(*rtm
));
1041 if (cmd
== SIOCDELRT
) {
1042 nl
->nlmsg_type
= RTM_DELROUTE
;
1043 nl
->nlmsg_flags
= 0;
1045 nl
->nlmsg_type
= RTM_NEWROUTE
;
1046 nl
->nlmsg_flags
= NLM_F_REQUEST
|NLM_F_CREATE
;
1047 rtm
->rtm_protocol
= RTPROT_BOOT
;
1050 rtm
->rtm_dst_len
= plen
;
1054 *(u32
*)&r
->rt_pad3
= r
->rt_metric
- 1;
1055 rta
->rta_priority
= (u32
*)&r
->rt_pad3
;
1057 if (r
->rt_flags
&RTF_REJECT
) {
1058 rtm
->rtm_scope
= RT_SCOPE_HOST
;
1059 rtm
->rtm_type
= RTN_UNREACHABLE
;
1062 rtm
->rtm_scope
= RT_SCOPE_NOWHERE
;
1063 rtm
->rtm_type
= RTN_UNICAST
;
1067 struct net_device
*dev
;
1068 char devname
[IFNAMSIZ
];
1070 if (copy_from_user(devname
, r
->rt_dev
, IFNAMSIZ
-1))
1072 devname
[IFNAMSIZ
-1] = 0;
1073 colon
= strchr(devname
, ':');
1076 dev
= __dev_get_by_name(devname
);
1079 rta
->rta_oif
= &dev
->ifindex
;
1081 struct in_ifaddr
*ifa
;
1082 struct in_device
*in_dev
= __in_dev_get(dev
);
1086 for (ifa
= in_dev
->ifa_list
; ifa
; ifa
= ifa
->ifa_next
)
1087 if (strcmp(ifa
->ifa_label
, devname
) == 0)
1091 rta
->rta_prefsrc
= &ifa
->ifa_local
;
1095 ptr
= &((struct sockaddr_in
*)&r
->rt_gateway
)->sin_addr
.s_addr
;
1096 if (r
->rt_gateway
.sa_family
== AF_INET
&& *ptr
) {
1098 if (r
->rt_flags
&RTF_GATEWAY
&& inet_addr_type(*ptr
) == RTN_UNICAST
)
1099 rtm
->rtm_scope
= RT_SCOPE_UNIVERSE
;
1102 if (cmd
== SIOCDELRT
)
1105 if (r
->rt_flags
&RTF_GATEWAY
&& rta
->rta_gw
== NULL
)
1108 if (rtm
->rtm_scope
== RT_SCOPE_NOWHERE
)
1109 rtm
->rtm_scope
= RT_SCOPE_LINK
;
1111 if (r
->rt_flags
&(RTF_MTU
|RTF_WINDOW
|RTF_IRTT
)) {
1113 struct rtattr
*mx
= kmalloc(RTA_LENGTH(3*RTA_LENGTH(4)), GFP_KERNEL
);
1117 mx
->rta_type
= RTA_METRICS
;
1118 mx
->rta_len
= RTA_LENGTH(0);
1119 if (r
->rt_flags
&RTF_MTU
) {
1120 rec
= (void*)((char*)mx
+ RTA_ALIGN(mx
->rta_len
));
1121 rec
->rta_type
= RTAX_ADVMSS
;
1122 rec
->rta_len
= RTA_LENGTH(4);
1123 mx
->rta_len
+= RTA_LENGTH(4);
1124 *(u32
*)RTA_DATA(rec
) = r
->rt_mtu
- 40;
1126 if (r
->rt_flags
&RTF_WINDOW
) {
1127 rec
= (void*)((char*)mx
+ RTA_ALIGN(mx
->rta_len
));
1128 rec
->rta_type
= RTAX_WINDOW
;
1129 rec
->rta_len
= RTA_LENGTH(4);
1130 mx
->rta_len
+= RTA_LENGTH(4);
1131 *(u32
*)RTA_DATA(rec
) = r
->rt_window
;
1133 if (r
->rt_flags
&RTF_IRTT
) {
1134 rec
= (void*)((char*)mx
+ RTA_ALIGN(mx
->rta_len
));
1135 rec
->rta_type
= RTAX_RTT
;
1136 rec
->rta_len
= RTA_LENGTH(4);
1137 mx
->rta_len
+= RTA_LENGTH(4);
1138 *(u32
*)RTA_DATA(rec
) = r
->rt_irtt
<<3;
/*
   Update FIB if:
   - local address disappeared -> we must delete all the entries
     referring to it.
   - device went down -> we must shut down all nexthops going via it.
 */
1153 int fib_sync_down(u32 local
, struct net_device
*dev
, int force
)
1156 int scope
= RT_SCOPE_NOWHERE
;
1161 if (local
&& fib_info_laddrhash
) {
1162 unsigned int hash
= fib_laddr_hashfn(local
);
1163 struct hlist_head
*head
= &fib_info_laddrhash
[hash
];
1164 struct hlist_node
*node
;
1165 struct fib_info
*fi
;
1167 hlist_for_each_entry(fi
, node
, head
, fib_lhash
) {
1168 if (fi
->fib_prefsrc
== local
) {
1169 fi
->fib_flags
|= RTNH_F_DEAD
;
1176 struct fib_info
*prev_fi
= NULL
;
1177 unsigned int hash
= fib_devindex_hashfn(dev
->ifindex
);
1178 struct hlist_head
*head
= &fib_info_devhash
[hash
];
1179 struct hlist_node
*node
;
1182 hlist_for_each_entry(nh
, node
, head
, nh_hash
) {
1183 struct fib_info
*fi
= nh
->nh_parent
;
1186 BUG_ON(!fi
->fib_nhs
);
1187 if (nh
->nh_dev
!= dev
|| fi
== prev_fi
)
1191 change_nexthops(fi
) {
1192 if (nh
->nh_flags
&RTNH_F_DEAD
)
1194 else if (nh
->nh_dev
== dev
&&
1195 nh
->nh_scope
!= scope
) {
1196 nh
->nh_flags
|= RTNH_F_DEAD
;
1197 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1198 spin_lock_bh(&fib_multipath_lock
);
1199 fi
->fib_power
-= nh
->nh_power
;
1201 spin_unlock_bh(&fib_multipath_lock
);
1205 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1206 if (force
> 1 && nh
->nh_dev
== dev
) {
1211 } endfor_nexthops(fi
)
1212 if (dead
== fi
->fib_nhs
) {
1213 fi
->fib_flags
|= RTNH_F_DEAD
;
1222 #ifdef CONFIG_IP_ROUTE_MULTIPATH
/*
   Dead device goes up. We wake up dead nexthops.
   It makes sense only on multipath routes.
 */
1229 int fib_sync_up(struct net_device
*dev
)
1231 struct fib_info
*prev_fi
;
1233 struct hlist_head
*head
;
1234 struct hlist_node
*node
;
1238 if (!(dev
->flags
&IFF_UP
))
1242 hash
= fib_devindex_hashfn(dev
->ifindex
);
1243 head
= &fib_info_devhash
[hash
];
1246 hlist_for_each_entry(nh
, node
, head
, nh_hash
) {
1247 struct fib_info
*fi
= nh
->nh_parent
;
1250 BUG_ON(!fi
->fib_nhs
);
1251 if (nh
->nh_dev
!= dev
|| fi
== prev_fi
)
1256 change_nexthops(fi
) {
1257 if (!(nh
->nh_flags
&RTNH_F_DEAD
)) {
1261 if (nh
->nh_dev
== NULL
|| !(nh
->nh_dev
->flags
&IFF_UP
))
1263 if (nh
->nh_dev
!= dev
|| __in_dev_get(dev
) == NULL
)
1266 spin_lock_bh(&fib_multipath_lock
);
1268 nh
->nh_flags
&= ~RTNH_F_DEAD
;
1269 spin_unlock_bh(&fib_multipath_lock
);
1270 } endfor_nexthops(fi
)
1273 fi
->fib_flags
&= ~RTNH_F_DEAD
;
1282 The algorithm is suboptimal, but it provides really
1283 fair weighted route distribution.
1286 void fib_select_multipath(const struct flowi
*flp
, struct fib_result
*res
)
1288 struct fib_info
*fi
= res
->fi
;
1291 spin_lock_bh(&fib_multipath_lock
);
1292 if (fi
->fib_power
<= 0) {
1294 change_nexthops(fi
) {
1295 if (!(nh
->nh_flags
&RTNH_F_DEAD
)) {
1296 power
+= nh
->nh_weight
;
1297 nh
->nh_power
= nh
->nh_weight
;
1299 } endfor_nexthops(fi
);
1300 fi
->fib_power
= power
;
1302 spin_unlock_bh(&fib_multipath_lock
);
1303 /* Race condition: route has just become dead. */
1310 /* w should be random number [0..fi->fib_power-1],
1311 it is pretty bad approximation.
1314 w
= jiffies
% fi
->fib_power
;
1316 change_nexthops(fi
) {
1317 if (!(nh
->nh_flags
&RTNH_F_DEAD
) && nh
->nh_power
) {
1318 if ((w
-= nh
->nh_power
) <= 0) {
1321 res
->nh_sel
= nhsel
;
1322 spin_unlock_bh(&fib_multipath_lock
);
1326 } endfor_nexthops(fi
);
1328 /* Race condition: route has just become dead. */
1330 spin_unlock_bh(&fib_multipath_lock
);