1 /* $NetBSD: tcp_vtw.h,v 1.6 2012/11/23 14:48:31 joerg Exp $ */
3 * Copyright (c) 2011 The NetBSD Foundation, Inc.
6 * This code is derived from software contributed to The NetBSD Foundation
7 * by Coyote Point Systems, Inc.
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
18 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
19 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
20 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
21 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
22 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 * POSSIBILITY OF SUCH DAMAGE.
32 * Vestigial time-wait.
34 * This implementation uses cache-efficient techniques, which will
35 * appear somewhat peculiar. The main philosophy is to optimise the
36 * amount of information available within a cache line. Cache miss is
37 * expensive. So we employ ad-hoc techniques to pull a series of
38 * linked-list follows into a cache line. One cache line, multiple
39 * linked-list equivalents.
41 * One such ad-hoc technique is fat pointers. Additional degrees of
42 * ad-hoqueness result from having to hand tune it for pointer size
43 * and for cache line size.
45 * The 'fat pointer' approach aggregates, for x86_32, 15 linked-list
46 * data structures into one cache line. The additional 32 bits in the
47 * cache line are used for linking fat pointers, and for
48 * allocation/bookkeeping.
50 * The 15 32-bit tags encode the pointers to the linked list elements,
51 * and also encode the results of a search comparison.
53 * First, some more assumptions/restrictions.
55 * All the fat pointers are from a contiguous allocation arena. Thus,
56 * we can refer to them by offset from a base, not as full pointers.
58 * All the linked list data elements are also from a contiguous
59 * allocation arena, again so that we can refer to them as offset from
62 * In order to add a data element to a fat pointer, a key value is
63 * computed, based on unique data within the data element. It is the
64 * linear searching of the linked lists of these elements based on
65 * these unique data that are being optimised here.
67 * Lets call the function that computes the key k(e), where e is the
68 * data element. In this example, k(e) returns 32-bits.
70 * Consider a set E (say of order 15) of data elements. Let K be
71 * the set of the k(e) for e in E.
73 * Let O be the set of the offsets from the base of the data elements in E.
75 * For each x in K, for each matching o in O, let t be x ^ o. These
76 * are the tags. (More or less).
78 * In order to search all the data elements in E, we compute the
79 * search key, and one at a time, XOR the key into the tags. If any
80 * result is a valid data element index, we have a possible match. If
81 * not, there is no match.
83 * The no-match cases mean we do not have to de-reference the pointer
84 * to the data element in question. We save cache miss penalty and
85 * cache load decreases. Only in the case of a valid looking data
86 * element index, do we have to look closer.
88 * Thus, in the absence of false positives, 15 data elements can be
89 * searched with one cache line fill, as opposed to 15 cache line
90 * fills for the usual implementation.
92 * The vestigial time waits (vtw_t), the data elements in the above, are
93 * searched by faddr, fport, laddr, lport. The key is a function of
96 * We hash these keys into the traditional hash chains to reduce the
97 * search time, and use fat pointers to reduce the cache impacts of
100 * The vtw_t are, per requirement, in a contiguous chunk. Allocation
101 * is done with a clock hand, and all vtw_t within one allocation
102 * domain have the same lifetime, so they will always be sorted by
105 * A vtw_t will be allocated, timestamped, and have a fixed future
106 * expiration. It will be added to a hash bucket implemented with fat
107 * pointers, which means that a cache line will be allocated in the
108 * hash bucket, placed at the head (more recent in time) and the vtw_t
109 * will be added to this. As more entries are added, the fat pointer
110 * cache line will fill, requiring additional cache lines for fat
111 * pointers to be allocated. These will be added at the head, and the
112 * aged entries will hang down, tapeworm like. As the vtw_t entries
113 * expire, the corresponding slot in the fat pointer will be
114 * reclaimed, and eventually the cache line will completely empty and
115 * be re-cycled, if not at the head of the chain.
117 * At times, a time-wait timer is restarted. This corresponds to
118 * deleting the current entry and re-adding it.
120 * Most of the time, they are just placed here to die.
122 #ifndef _NETINET_TCP_VTW_H
123 #define _NETINET_TCP_VTW_H
125 #include <sys/types.h>
126 #include <sys/socket.h>
127 #include <sys/sysctl.h>
129 #include <net/route.h>
130 #include <netinet/in.h>
131 #include <netinet/in_systm.h>
132 #include <netinet/ip.h>
133 #include <netinet/in_pcb.h>
134 #include <netinet/in_var.h>
135 #include <netinet/ip_var.h>
136 #include <netinet/in.h>
137 #include <netinet/tcp.h>
138 #include <netinet/tcp_timer.h>
139 #include <netinet/tcp_var.h>
140 #include <netinet6/in6.h>
141 #include <netinet/ip6.h>
142 #include <netinet6/ip6_var.h>
143 #include <netinet6/in6_pcb.h>
144 #include <netinet6/ip6_var.h>
145 #include <netinet6/in6_var.h>
146 #include <netinet/icmp6.h>
147 #include <netinet6/nd6.h>
149 #define VTW_NCLASS (1+3) /* # different classes */
156 typedef uint32_t fatp_word_t
;
158 typedef struct fatp_mi fatp_t
;
160 /* Supported cacheline sizes: 32 64 128 bytes. See fatp_key(),
161 * fatp_slot_from_key(), fatp_xtra[].
163 #define FATP_NTAGS (CACHE_LINE_SIZE / sizeof(fatp_word_t) - 1)
164 #define FATP_NXT_WIDTH (sizeof(fatp_word_t) * NBBY - FATP_NTAGS)
166 #define FATP_MAX (1 << FATP_NXT_WIDTH)
168 /* Worked example: ULP32 with 64-byte cacheline (32-bit x86):
169 * 15 tags per cacheline. At most 2^17 fat pointers per fatp_ctl_t.
170 * The comments on the fatp_mi members, below, correspond to the worked
174 fatp_word_t inuse
: FATP_NTAGS
; /* (1+15)*4 == CL_SIZE */
175 fatp_word_t nxt
: FATP_NXT_WIDTH
;/* at most 2^17 fat pointers */
176 fatp_word_t tag
[FATP_NTAGS
]; /* 15 tags per CL */
186 fatp_full(fatp_t
*fp
)
190 full
.inuse
= (1U << FATP_NTAGS
) - 1U;
192 return (fp
->inuse
== full
.inuse
);
200 /*!\brief common to all vtw
202 typedef struct vtw_common
{
203 struct timeval expire
; /* date of birth+msl */
204 uint32_t key
; /* hash key: full hash */
205 uint32_t port_key
; /* hash key: local port hash */
209 uint32_t snd_scale
: 8; /* window scaling for send win */
210 uint32_t msl_class
: 2; /* TCP MSL class {0,1,2,3} */
211 uint32_t reuse_port
: 1;
212 uint32_t reuse_addr
: 1;
214 uint32_t hashed
: 1; /* reachable via FATP */
218 /*!\brief vestigial timewait for IPv4
220 typedef struct vtw_v4
{
221 vtw_t common
; /* must be first */
228 /*!\brief vestigial timewait for IPv6
230 typedef struct vtw_v6
{
231 vtw_t common
; /* must be first */
234 struct in6_addr laddr
;
235 struct in6_addr faddr
;
239 typedef struct vtw_ctl vtw_ctl_t
;
240 typedef struct fatp_ctl fatp_ctl_t
;
243 * The vestigial time waits are kept in a contiguous chunk.
244 * Allocation and free pointers run as clock hands thru this array.
247 fatp_ctl_t
*fat
; /* collection of fatp to use */
248 vtw_ctl_t
*ctl
; /* <! controller's controller */
250 vtw_t
*v
; /* common */
251 struct vtw_v4
*v4
; /* IPv4 resources */
252 struct vtw_v6
*v6
; /* IPv6 resources */
253 } base
, /* base of vtw_t array */
254 /**/ lim
, /* extent of vtw_t array */
255 /**/ alloc
, /* allocation pointer */
256 /**/ oldest
; /* ^ to oldest */
257 uint32_t nfree
; /* # free */
258 uint32_t nalloc
; /* # allocated */
259 uint32_t idx_mask
; /* mask capturing all index bits*/
262 uint32_t idx_bits
: 6;
263 uint32_t clidx
: 3; /* <! class index */
266 /*!\brief Collections of fat pointers.
269 vtw_ctl_t
*vtw
; /* associated VTWs */
270 fatp_t
*base
; /* base of fatp_t array */
271 fatp_t
*lim
; /* extent of fatp_t array */
272 fatp_t
*free
; /* free list */
273 uint32_t mask
; /* hash mask */
274 uint32_t nfree
; /* # free */
275 uint32_t nalloc
; /* # allocated */
276 fatp_t
**hash
; /* hash anchors */
277 fatp_t
**port
; /* port hash anchors */
283 uint64_t ins
; /* <! inserts */
284 uint64_t del
; /* <! deleted */
285 uint64_t kill
; /* <! assassination */
286 uint64_t look
[2]; /* <! lookup: full hash, port hash */
287 uint64_t hit
[2]; /* <! lookups that hit */
288 uint64_t miss
[2]; /* <! lookups that miss */
289 uint64_t probe
[2]; /* <! hits+miss */
290 uint64_t losing
[2]; /* <! misses requiring dereference */
291 uint64_t max_chain
[2]; /* <! max fatp chain traversed */
292 uint64_t max_probe
[2]; /* <! max probes in any one chain */
293 uint64_t max_loss
[2]; /* <! max losing probes in any one
298 typedef struct vtw_stats vtw_stats_t
;
300 /*!\brief follow fatp next 'pointer'
302 static inline fatp_t
*
303 fatp_next(fatp_ctl_t
*fat
, fatp_t
*fp
)
305 return fp
->nxt
? fat
->base
+ fp
->nxt
-1 : 0;
308 /*!\brief determine a collection-relative fat pointer index.
310 static inline uint32_t
311 fatp_index(fatp_ctl_t
*fat
, fatp_t
*fp
)
313 return fp
? 1 + (fp
- fat
->base
) : 0;
317 static inline uint32_t
318 v4_tag(uint32_t faddr
, uint32_t fport
, uint32_t laddr
, uint32_t lport
)
320 return (ntohl(faddr
) + ntohs(fport
)
321 + ntohl(laddr
) + ntohs(lport
));
324 static inline uint32_t
325 v6_tag(const struct in6_addr
*faddr
, uint16_t fport
,
326 const struct in6_addr
*laddr
, uint16_t lport
)
329 return IN6_HASH(faddr
, fport
, laddr
, lport
);
335 static inline uint32_t
336 v4_port_tag(uint16_t lport
)
338 uint32_t tag
= lport
^ (lport
<< 11);
350 static inline uint32_t
351 v6_port_tag(uint16_t lport
)
353 return v4_port_tag(lport
);
359 int vtw_add(int, struct tcpcb
*);
360 void vtw_del(vtw_ctl_t
*, vtw_t
*);
361 int vtw_lookup_v4(const struct ip
*ip
, const struct tcphdr
*th
,
362 uint32_t faddr
, uint16_t fport
,
363 uint32_t laddr
, uint16_t lport
);
367 int vtw_lookup_v6(const struct ip6_hdr
*ip
, const struct tcphdr
*th
,
368 const struct in6_addr
*faddr
, uint16_t fport
,
369 const struct in6_addr
*laddr
, uint16_t lport
);
371 typedef struct vestigial_inpcb
{
376 uint16_t fport
, lport
;
379 uint32_t reuse_addr
: 1;
380 uint32_t reuse_port
: 1;
382 uint32_t more_tbd
: 1;
387 struct vtw_common
*vtw
;
392 void vtw_restart(vestigial_inpcb_t
*);
393 int vtw_earlyinit(void);
394 int sysctl_tcp_vtw_enable(SYSCTLFN_PROTO
);
398 typedef struct sin_either
{
408 int vtw_debug_add(int af
, sin_either_t
*, sin_either_t
*, int, int);
410 typedef struct vtw_sysargs
{
416 #endif /* VTW_DEBUG */
418 #endif /* _NETINET_TCP_VTW_H */