/*	$NetBSD: ip_flow.c,v 1.57 2009/02/01 17:04:11 pooka Exp $	*/

/*-
 * Copyright (c) 1998 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by the 3am Software Foundry ("3am").  It was developed by Matt Thomas.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ip_flow.c,v 1.57 2009/02/01 17:04:11 pooka Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/pool.h>
#include <sys/sysctl.h>

#include <net/if.h>
#include <net/if_dl.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#include <netinet/ip_private.h>
/*
 * Similar code is very well commented in netinet6/ip6_flow.c
 */

struct ipflow {
	LIST_ENTRY(ipflow) ipf_list;	/* next in active list */
	LIST_ENTRY(ipflow) ipf_hash;	/* next ipflow in bucket */
	struct in_addr ipf_dst;		/* destination address */
	struct in_addr ipf_src;		/* source address */
	uint8_t ipf_tos;		/* type-of-service */
	struct route ipf_ro;		/* associated route entry */
	u_long ipf_uses;		/* number of uses in this period */
	u_long ipf_last_uses;		/* number of uses in last period */
	u_long ipf_dropped;		/* ENOBUFS returned by if_output */
	u_long ipf_errors;		/* other errors returned by if_output */
	u_int ipf_timer;		/* lifetime timer */
	time_t ipf_start;		/* creation time */
};
#define	IPFLOW_HASHBITS		6	/* should not be a multiple of 8 */

static struct pool ipflow_pool;

LIST_HEAD(ipflowhead, ipflow);

#define	IPFLOW_TIMER		(5 * PR_SLOWHZ)
#define	IPFLOW_DEFAULT_HASHSIZE	(1 << IPFLOW_HASHBITS)

static struct ipflowhead *ipflowtable = NULL;
static struct ipflowhead ipflowlist;
static int ipflow_inuse;

#define	IPFLOW_INSERT(bucket, ipf) \
do { \
	LIST_INSERT_HEAD((bucket), (ipf), ipf_hash); \
	LIST_INSERT_HEAD(&ipflowlist, (ipf), ipf_list); \
} while (/*CONSTCOND*/ 0)

#define	IPFLOW_REMOVE(ipf) \
do { \
	LIST_REMOVE((ipf), ipf_hash); \
	LIST_REMOVE((ipf), ipf_list); \
} while (/*CONSTCOND*/ 0)

#ifndef IPFLOW_MAX
#define	IPFLOW_MAX		256
#endif
int ip_maxflows = IPFLOW_MAX;
int ip_hashsize = IPFLOW_DEFAULT_HASHSIZE;
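
/*
 * Map a packet to a hash bucket by folding the source and destination
 * addresses and the TOS byte down IPFLOW_HASHBITS bits at a time, then
 * masking with the table size.
 */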
static size_t
ipflow_hash(const struct ip *ip)
{
	size_t hash = ip->ip_tos;
	size_t idx;

	for (idx = 0; idx < 32; idx += IPFLOW_HASHBITS) {
		hash += (ip->ip_dst.s_addr >> (32 - idx)) +
		    (ip->ip_src.s_addr >> idx);
	}

	return hash & (ip_hashsize - 1);
}
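
/*
 * Find the cached flow that matches this packet's source address,
 * destination address and TOS; returns NULL if there is none.
 */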
static struct ipflow *
ipflow_lookup(const struct ip *ip)
{
	size_t hash;
	struct ipflow *ipf;

	hash = ipflow_hash(ip);

	LIST_FOREACH(ipf, &ipflowtable[hash], ipf_hash) {
		if (ip->ip_dst.s_addr == ipf->ipf_dst.s_addr
		    && ip->ip_src.s_addr == ipf->ipf_src.s_addr
		    && ip->ip_tos == ipf->ipf_tos)
			break;
	}
	return ipf;
}
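
/*
 * Set up the pool from which ipflow entries are allocated.
 */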
static void
ipflow_poolinit(void)
{

	pool_init(&ipflow_pool, sizeof(struct ipflow), 0, 0, 0, "ipflowpl",
	    NULL, IPL_NET);
}
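
/*
 * (Re)allocate the hash table with table_size buckets and reset the
 * list of active flows.  Returns non-zero if the allocation fails.
 */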
static int
ipflow_init(int table_size)
{
	struct ipflowhead *new_table;
	size_t i;

	new_table = (struct ipflowhead *)malloc(sizeof(struct ipflowhead) *
	    table_size, M_RTABLE, M_NOWAIT);

	if (new_table == NULL)
		return 1;

	if (ipflowtable != NULL)
		free(ipflowtable, M_RTABLE);

	ipflowtable = new_table;
	ip_hashsize = table_size;

	LIST_INIT(&ipflowlist);
	for (i = 0; i < ip_hashsize; i++)
		LIST_INIT(&ipflowtable[i]);

	return 0;
}
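
/*
 * Try to forward a packet using a cached flow, bypassing the normal
 * ip_forward() path.  Returns non-zero if the packet was handled here
 * and zero if the caller should fall back to the slow path.
 */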
int
ipflow_fastforward(struct mbuf *m)
{
	struct ip *ip;
	struct ip ip_store;
	struct ipflow *ipf;
	struct rtentry *rt;
	const struct sockaddr *dst;
	int error;
	int iplen;

	/*
	 * Are we forwarding packets?  Big enough for an IP packet?
	 */
	if (!ipforwarding || ipflow_inuse == 0 || m->m_len < sizeof(struct ip))
		return 0;

	/*
	 * Was packet received as a link-level multicast or broadcast?
	 * If so, don't try to fast forward.
	 */
	if ((m->m_flags & (M_BCAST|M_MCAST)) != 0)
		return 0;

	/*
	 * IP header with no option and valid version and length
	 */
	if (IP_HDR_ALIGNED_P(mtod(m, const void *)))
		ip = mtod(m, struct ip *);
	else {
		memcpy(&ip_store, mtod(m, const void *), sizeof(ip_store));
		ip = &ip_store;
	}
	iplen = ntohs(ip->ip_len);
	if (ip->ip_v != IPVERSION || ip->ip_hl != (sizeof(struct ip) >> 2) ||
	    iplen < sizeof(struct ip) || iplen > m->m_pkthdr.len)
		return 0;

	/*
	 * Find a flow.
	 */
	if ((ipf = ipflow_lookup(ip)) == NULL)
		return 0;

	/*
	 * Verify the IP header checksum.
	 */
	switch (m->m_pkthdr.csum_flags &
		((m->m_pkthdr.rcvif->if_csum_flags_rx & M_CSUM_IPv4) |
		 M_CSUM_IPv4_BAD)) {
	case M_CSUM_IPv4|M_CSUM_IPv4_BAD:
		return 0;

	case M_CSUM_IPv4:
		/* Checksum was okay. */
		break;

	default:
		/* Must compute it ourselves. */
		if (in_cksum(m, sizeof(struct ip)) != 0)
			return 0;
		break;
	}

	/*
	 * Route and interface still up?
	 */
	if ((rt = rtcache_validate(&ipf->ipf_ro)) == NULL ||
	    (rt->rt_ifp->if_flags & IFF_UP) == 0)
		return 0;

	/*
	 * Packet size OK?  TTL?
	 */
	if (m->m_pkthdr.len > rt->rt_ifp->if_mtu || ip->ip_ttl <= IPTTLDEC)
		return 0;

	/*
	 * Clear any in-bound checksum flags for this packet.
	 */
	m->m_pkthdr.csum_flags = 0;

	/*
	 * Everything checks out and so we can forward this packet.
	 * Modify the TTL and incrementally change the checksum.
	 *
	 * This method of adding the checksum works on either endian CPU.
	 * If htons() is inlined, all the arithmetic is folded; otherwise
	 * the htons()s are combined by CSE due to the const attribute.
	 *
	 * Don't bother using HW checksumming here -- the incremental
	 * update is pretty fast.
	 */
	ip->ip_ttl -= IPTTLDEC;
	if (ip->ip_sum >= (u_int16_t) ~htons(IPTTLDEC << 8))
		ip->ip_sum -= ~htons(IPTTLDEC << 8);
	else
		ip->ip_sum += htons(IPTTLDEC << 8);

	/*
	 * Done modifying the header; copy it back, if necessary.
	 *
	 * XXX Use m_copyback_cow(9) here? --dyoung
	 */
	if (IP_HDR_ALIGNED_P(mtod(m, void *)) == 0)
		memcpy(mtod(m, void *), &ip_store, sizeof(ip_store));

	/*
	 * Trim the packet in case it's too long.
	 */
	if (m->m_pkthdr.len > iplen) {
		if (m->m_len == m->m_pkthdr.len) {
			m->m_len = iplen;
			m->m_pkthdr.len = iplen;
		} else
			m_adj(m, iplen - m->m_pkthdr.len);
	}

	/*
	 * Send the packet on its way.  All we can get back is ENOBUFS
	 */
	ipf->ipf_uses++;
	PRT_SLOW_ARM(ipf->ipf_timer, IPFLOW_TIMER);

	if (rt->rt_flags & RTF_GATEWAY)
		dst = rt->rt_gateway;
	else
		dst = rtcache_getdst(&ipf->ipf_ro);

	if ((error = (*rt->rt_ifp->if_output)(rt->rt_ifp, m, dst, rt)) != 0) {
		if (error == ENOBUFS)
			ipf->ipf_dropped++;
		else
			ipf->ipf_errors++;
	}
	return 1;
}
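
/*
 * Fold this flow's accumulated use and error counters into the cached
 * route and the global IP statistics.
 */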
static void
ipflow_addstats(struct ipflow *ipf)
{
	struct rtentry *rt;
	uint64_t *ips;

	if ((rt = rtcache_validate(&ipf->ipf_ro)) != NULL)
		rt->rt_use += ipf->ipf_uses;

	ips = IP_STAT_GETREF();
	ips[IP_STAT_CANTFORWARD] += ipf->ipf_errors + ipf->ipf_dropped;
	ips[IP_STAT_TOTAL] += ipf->ipf_uses;
	ips[IP_STAT_FORWARD] += ipf->ipf_uses;
	ips[IP_STAT_FASTFORWARD] += ipf->ipf_uses;
	IP_STAT_PUTREF();
}
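
/*
 * Unlink a flow from the hash and active lists, account for its
 * statistics, release its cached route and return it to the pool.
 */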
void
ipflow_free(struct ipflow *ipf)
{
	int s;

	/*
	 * Remove the flow from the hash table (at elevated IPL).
	 * Once it's off the list, we can deal with it at normal
	 * network IPL.
	 */
	s = splnet();
	IPFLOW_REMOVE(ipf);
	splx(s);
	ipflow_addstats(ipf);
	rtcache_free(&ipf->ipf_ro);
	ipflow_inuse--;
	pool_put(&ipflow_pool, ipf);
}
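
/*
 * Reclaim flow entries.  Flows whose cached route is no longer valid go
 * first; otherwise the victim is the entry closest to expiry with the
 * fewest recent uses.  With just_one set, a single reclaimed entry is
 * returned to the caller instead of being freed; otherwise entries are
 * freed until we are back under ip_maxflows.
 */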
static struct ipflow *
ipflow_reap(bool just_one)
{
	while (just_one || ipflow_inuse > ip_maxflows) {
		struct ipflow *ipf, *maybe_ipf = NULL;
		int s;

		ipf = LIST_FIRST(&ipflowlist);
		while (ipf != NULL) {
			/*
			 * If this no longer points to a valid route
			 * reclaim it.
			 */
			if (rtcache_validate(&ipf->ipf_ro) == NULL)
				goto done;
			/*
			 * choose the one that's been least recently
			 * used or has had the least uses in the
			 * last 1.5 intervals.
			 */
			if (maybe_ipf == NULL ||
			    ipf->ipf_timer < maybe_ipf->ipf_timer ||
			    (ipf->ipf_timer == maybe_ipf->ipf_timer &&
			     ipf->ipf_last_uses + ipf->ipf_uses <
			     maybe_ipf->ipf_last_uses + maybe_ipf->ipf_uses))
				maybe_ipf = ipf;
			ipf = LIST_NEXT(ipf, ipf_list);
		}
		ipf = maybe_ipf;
	    done:
		/*
		 * Remove the entry from the flow table.
		 */
		s = splnet();
		IPFLOW_REMOVE(ipf);
		splx(s);
		ipflow_addstats(ipf);
		rtcache_free(&ipf->ipf_ro);
		if (just_one)
			return ipf;
		pool_put(&ipflow_pool, ipf);
		ipflow_inuse--;
	}
	return NULL;
}

/*
 * Reclaim cached flows until we are back under the ip_maxflows limit.
 */
void
ipflow_prune(void)
{

	(void) ipflow_reap(false);
}
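
/*
 * Periodic (slow timeout) housekeeping: free flows whose timer has
 * expired or whose route has gone away, and roll the per-period use
 * counters of the survivors into the IP statistics.
 */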
void
ipflow_slowtimo(void)
{
	struct rtentry *rt;
	struct ipflow *ipf, *next_ipf;
	uint64_t *ips;

	mutex_enter(softnet_lock);
	KERNEL_LOCK(1, NULL);
	for (ipf = LIST_FIRST(&ipflowlist); ipf != NULL; ipf = next_ipf) {
		next_ipf = LIST_NEXT(ipf, ipf_list);
		if (PRT_SLOW_ISEXPIRED(ipf->ipf_timer) ||
		    (rt = rtcache_validate(&ipf->ipf_ro)) == NULL) {
			ipflow_free(ipf);
		} else {
			ipf->ipf_last_uses = ipf->ipf_uses;
			rt->rt_use += ipf->ipf_uses;
			ips = IP_STAT_GETREF();
			ips[IP_STAT_TOTAL] += ipf->ipf_uses;
			ips[IP_STAT_FORWARD] += ipf->ipf_uses;
			ips[IP_STAT_FASTFORWARD] += ipf->ipf_uses;
			IP_STAT_PUTREF();
			ipf->ipf_uses = 0;
		}
	}
	KERNEL_UNLOCK_ONE(NULL);
	mutex_exit(softnet_lock);
}
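
/*
 * Create or refresh the flow entry for a packet that has just been
 * forwarded along the route in *ro, caching the route and the header
 * fields used for matching so later packets can be fast-forwarded.
 */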
void
ipflow_create(const struct route *ro, struct mbuf *m)
{
	const struct ip *const ip = mtod(m, const struct ip *);
	struct ipflow *ipf;
	size_t hash;
	int s;

	/*
	 * Don't create cache entries for ICMP messages.
	 */
	if (ip_maxflows == 0 || ip->ip_p == IPPROTO_ICMP)
		return;

	/*
	 * See if an existing flow struct exists.  If so remove it from its
	 * list and free the old route.  If not, try to malloc a new one
	 * (if we aren't at our limit).
	 */
	ipf = ipflow_lookup(ip);
	if (ipf == NULL) {
		if (ipflow_inuse >= ip_maxflows) {
			ipf = ipflow_reap(true);
		} else {
			s = splnet();
			ipf = pool_get(&ipflow_pool, PR_NOWAIT);
			splx(s);
			if (ipf == NULL)
				return;
			ipflow_inuse++;
		}
		memset(ipf, 0, sizeof(*ipf));
	} else {
		s = splnet();
		IPFLOW_REMOVE(ipf);
		splx(s);
		ipflow_addstats(ipf);
		rtcache_free(&ipf->ipf_ro);
		ipf->ipf_uses = ipf->ipf_last_uses = 0;
		ipf->ipf_errors = ipf->ipf_dropped = 0;
	}

	/*
	 * Fill in the updated information.
	 */
	rtcache_copy(&ipf->ipf_ro, ro);
	ipf->ipf_dst = ip->ip_dst;
	ipf->ipf_src = ip->ip_src;
	ipf->ipf_tos = ip->ip_tos;
	PRT_SLOW_ARM(ipf->ipf_timer, IPFLOW_TIMER);
	ipf->ipf_start = time_uptime;

	/*
	 * Insert into the appropriate bucket of the flow table.
	 */
	hash = ipflow_hash(ip);
	s = splnet();
	IPFLOW_INSERT(&ipflowtable[hash], ipf);
	splx(s);
}
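
/*
 * Free every cached flow and, if new_size is non-zero, reinitialize the
 * hash table with new_size buckets.
 */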
int
ipflow_invalidate_all(int new_size)
{
	struct ipflow *ipf, *next_ipf;
	int s, error;

	error = 0;
	s = splnet();
	for (ipf = LIST_FIRST(&ipflowlist); ipf != NULL; ipf = next_ipf) {
		next_ipf = LIST_NEXT(ipf, ipf_list);
		ipflow_free(ipf);
	}

	if (new_size)
		error = ipflow_init(new_size);
	splx(s);

	return error;
}