/*-
 * Copyright (c) 2015 Gleb Smirnoff <glebius@FreeBSD.org>
 * Copyright (c) 2015 Adrian Chadd <adrian@FreeBSD.org>
 * Copyright (c) 1982, 1986, 1988, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>

#include "opt_rss.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/hash.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysctl.h>
#include <sys/socket.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/if_private.h>
#include <net/rss_config.h>
#include <net/netisr.h>
#include <net/vnet.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/in_rss.h>
#ifdef MAC
#include <security/mac/mac_framework.h>
#endif

SYSCTL_DECL(_net_inet_ip);

/*
 * Reassembly headers are stored in hash buckets.
 */
#define	IPREASS_NHASH_LOG2	10
#define	IPREASS_NHASH		(1 << IPREASS_NHASH_LOG2)
#define	IPREASS_HMASK		(V_ipq_hashsize - 1)

struct ipqbucket {
	TAILQ_HEAD(ipqhead, ipq) head;
	struct mtx		 lock;
	struct callout		 timer;
#ifdef VIMAGE
	struct vnet		 *vnet;
#endif
	int			 count;
};

VNET_DEFINE_STATIC(struct ipqbucket *, ipq);
#define	V_ipq		VNET(ipq)
VNET_DEFINE_STATIC(uint32_t, ipq_hashseed);
#define	V_ipq_hashseed	VNET(ipq_hashseed)
VNET_DEFINE_STATIC(uint32_t, ipq_hashsize);
#define	V_ipq_hashsize	VNET(ipq_hashsize)

#define	IPQ_LOCK(i)	mtx_lock(&V_ipq[i].lock)
#define	IPQ_TRYLOCK(i)	mtx_trylock(&V_ipq[i].lock)
#define	IPQ_UNLOCK(i)	mtx_unlock(&V_ipq[i].lock)
#define	IPQ_LOCK_ASSERT(i)	mtx_assert(&V_ipq[i].lock, MA_OWNED)
#define	IPQ_BUCKET_LOCK_ASSERT(b)	mtx_assert(&(b)->lock, MA_OWNED)

VNET_DEFINE_STATIC(int, ipreass_maxbucketsize);
#define	V_ipreass_maxbucketsize	VNET(ipreass_maxbucketsize)

void		ipreass_init(void);
void		ipreass_vnet_init(void);
#ifdef VIMAGE
void		ipreass_destroy(void);
#endif
static int	sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS);
static int	sysctl_maxfragbucketsize(SYSCTL_HANDLER_ARGS);
static int	sysctl_fragttl(SYSCTL_HANDLER_ARGS);
static void	ipreass_zone_change(void *);
static void	ipreass_drain_tomax(void);
static void	ipq_free(struct ipqbucket *, struct ipq *);
static struct ipq *ipq_reuse(int);
static void	ipreass_callout(void *);
static void	ipreass_reschedule(struct ipqbucket *);

static void
ipq_timeout(struct ipqbucket *bucket, struct ipq *fp)
{

	IPSTAT_ADD(ips_fragtimeout, fp->ipq_nfrags);
	ipq_free(bucket, fp);
}

static void
ipq_drop(struct ipqbucket *bucket, struct ipq *fp)
{

	IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags);
	ipq_free(bucket, fp);
	ipreass_reschedule(bucket);
}

/*
 * By default, limit the number of IP fragments across all reassembly
 * queues to 1/32 of the total number of mbuf clusters.
 *
 * Limit the total number of reassembly queues per VNET to the
 * IP fragment limit, but ensure the limit will not allow any bucket
 * to grow above 100 items.  (The bucket limit is
 * IP_MAXFRAGPACKETS / (V_ipq_hashsize / 2), so the 50 is the correct
 * multiplier to reach a 100-item limit.)
 * The 100-item limit was chosen as brief testing seems to show that
 * this produces "reasonable" performance on some subset of systems
 * under DoS attack.
 */
#define	IP_MAXFRAGS		(nmbclusters / 32)
#define	IP_MAXFRAGPACKETS	(imin(IP_MAXFRAGS, V_ipq_hashsize * 50))
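
/*
 * Worked example of the arithmetic above (illustrative, assuming the
 * default hash size of IPREASS_NHASH = 1024 buckets): IP_MAXFRAGPACKETS
 * is then at most 1024 * 50 = 51200 queue entries, and the per-bucket
 * cap computed later as imax(max / (V_ipq_hashsize / 2), 1) becomes
 * 51200 / 512 = 100 entries.  A smaller nmbclusters pool lowers
 * IP_MAXFRAGS and shrinks both limits proportionally.
 */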

static int		maxfrags;
static u_int __exclusive_cache_line	nfrags;
SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfrags, CTLFLAG_RW,
    &maxfrags, 0,
    "Maximum number of IPv4 fragments allowed across all reassembly queues");
SYSCTL_UINT(_net_inet_ip, OID_AUTO, curfrags, CTLFLAG_RD,
    &nfrags, 0,
    "Current number of IPv4 fragments across all reassembly queues");

VNET_DEFINE_STATIC(uma_zone_t, ipq_zone);
#define	V_ipq_zone	VNET(ipq_zone)

SYSCTL_UINT(_net_inet_ip, OID_AUTO, reass_hashsize,
    CTLFLAG_VNET | CTLFLAG_RDTUN, &VNET_NAME(ipq_hashsize), 0,
    "Size of IP fragment reassembly hashtable");

SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragpackets,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    NULL, 0, sysctl_maxfragpackets, "I",
    "Maximum number of IPv4 fragment reassembly queue entries");
SYSCTL_UMA_CUR(_net_inet_ip, OID_AUTO, fragpackets, CTLFLAG_VNET,
    &VNET_NAME(ipq_zone),
    "Current number of IPv4 fragment reassembly queue entries");

VNET_DEFINE_STATIC(int, noreass);
#define	V_noreass	VNET(noreass)

VNET_DEFINE_STATIC(int, maxfragsperpacket);
#define	V_maxfragsperpacket	VNET(maxfragsperpacket)
SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragsperpacket, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(maxfragsperpacket), 0,
    "Maximum number of IPv4 fragments allowed per packet");
SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragbucketsize,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0,
    sysctl_maxfragbucketsize, "I",
    "Maximum number of IPv4 fragment reassembly queue entries per bucket");

VNET_DEFINE_STATIC(u_int, ipfragttl) = 30;
#define	V_ipfragttl	VNET(ipfragttl)
SYSCTL_PROC(_net_inet_ip, OID_AUTO, fragttl, CTLTYPE_INT | CTLFLAG_RW |
    CTLFLAG_MPSAFE | CTLFLAG_VNET, NULL, 0, sysctl_fragttl, "IU",
    "IP fragment life time on reassembly queue (seconds)");

/*
 * Take incoming datagram fragment and try to reassemble it into
 * whole datagram.  If the argument is the first fragment or one
 * in between, the function will return NULL and store the mbuf
 * in the fragment chain.  If the argument is the last fragment,
 * the packet will be reassembled and the pointer to the new
 * mbuf returned for further processing.  Only m_tags attached
 * to the first packet/fragment are preserved.
 * The IP header is *NOT* adjusted out of iplen.
 */
#define	M_IP_FRAG	M_PROTO9
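
/*
 * Minimal sketch of the intended calling convention (illustrative only;
 * the real caller is ip_input() and may differ in detail):
 *
 *	if (ip->ip_off & htons(IP_MF | IP_OFFMASK)) {
 *		m = ip_reass(m);
 *		if (m == NULL)
 *			return;		(fragment queued or dropped)
 *		ip = mtod(m, struct ip *);
 *	}
 */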

struct mbuf *
ip_reass(struct mbuf *m)
{
	struct ip *ip;
	struct mbuf *p, *q, *nq, *t;
	struct ipq *fp;
	struct ifnet *srcifp;
	struct ipqhead *head;
	int i, hlen, next, tmpmax;
	uint8_t ecn, ecn0;
	uint32_t hash, hashkey[3];
#ifdef	RSS
	uint32_t rss_hash, rss_type;
#endif

	/*
	 * If reassembly is disabled or maxfragsperpacket is 0,
	 * never accept fragments.
	 * Also, drop packet if it would exceed the maximum
	 * number of fragments.
	 */
	tmpmax = maxfrags;
	if (V_noreass == 1 || V_maxfragsperpacket == 0 ||
	    (tmpmax >= 0 && atomic_load_int(&nfrags) >= (u_int)tmpmax)) {
		IPSTAT_INC(ips_fragments);
		IPSTAT_INC(ips_fragdropped);
		m_freem(m);
		return (NULL);
	}

	ip = mtod(m, struct ip *);
	hlen = ip->ip_hl << 2;

	/*
	 * Adjust ip_len to not reflect header,
	 * convert offset of this to bytes.
	 */
	ip->ip_len = htons(ntohs(ip->ip_len) - hlen);

	/*
	 * Make sure that fragments have a data length
	 * that's a non-zero multiple of 8 bytes, unless
	 * this is the last fragment.
	 */
	if (ip->ip_len == htons(0) ||
	    ((ip->ip_off & htons(IP_MF)) && (ntohs(ip->ip_len) & 0x7) != 0)) {
		IPSTAT_INC(ips_toosmall); /* XXX */
		IPSTAT_INC(ips_fragdropped);
		m_freem(m);
		return (NULL);
	}
	if (ip->ip_off & htons(IP_MF))
		m->m_flags |= M_IP_FRAG;
	else
		m->m_flags &= ~M_IP_FRAG;
	ip->ip_off = htons(ntohs(ip->ip_off) << 3);
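
	/*
	 * Note on units (illustrative): ip_off on the wire counts 8-byte
	 * blocks, so the shift above converts it to a byte offset.  For
	 * example, an on-wire offset of 185 means the fragment's payload
	 * starts at byte 185 * 8 = 1480 of the original datagram.
	 */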

	/*
	 * Make sure the fragment lies within a packet of valid size.
	 */
	if (ntohs(ip->ip_len) + ntohs(ip->ip_off) > IP_MAXPACKET) {
		IPSTAT_INC(ips_toolong);
		IPSTAT_INC(ips_fragdropped);
		m_freem(m);
		return (NULL);
	}

	/*
	 * Store receive network interface pointer for later.
	 */
	srcifp = m->m_pkthdr.rcvif;

	/*
	 * Attempt reassembly; if it succeeds, proceed.
	 * ip_reass() will return a different mbuf.
	 */
	IPSTAT_INC(ips_fragments);
	m->m_pkthdr.PH_loc.ptr = ip;

	/*
	 * Presence of header sizes in mbufs
	 * would confuse code below.
	 */
	m->m_data += hlen;
	m->m_len -= hlen;
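
	/*
	 * Illustrative note on the key built below: hashkey[0] and
	 * hashkey[1] carry the source and destination addresses, and
	 * hashkey[2] packs the protocol into the upper 16 bits plus the
	 * IP ID, i.e. exactly the fields the lookup further down compares.
	 * The per-VNET random seed keeps the bucket choice unpredictable
	 * to a remote sender.
	 */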
	hashkey[0] = ip->ip_src.s_addr;
	hashkey[1] = ip->ip_dst.s_addr;
	hashkey[2] = (uint32_t)ip->ip_p << 16;
	hashkey[2] += ip->ip_id;
	hash = jenkins_hash32(hashkey, nitems(hashkey), V_ipq_hashseed);
	hash &= IPREASS_HMASK;
	head = &V_ipq[hash].head;
	IPQ_LOCK(hash);

	/*
	 * Look for queue of fragments
	 * of this datagram.
	 */
	TAILQ_FOREACH(fp, head, ipq_list)
		if (ip->ip_id == fp->ipq_id &&
		    ip->ip_src.s_addr == fp->ipq_src.s_addr &&
		    ip->ip_dst.s_addr == fp->ipq_dst.s_addr &&
#ifdef MAC
		    mac_ipq_match(m, fp) &&
#endif
		    ip->ip_p == fp->ipq_p)
			break;

	/*
	 * If first fragment to arrive, create a reassembly queue.
	 */
	if (fp == NULL) {
		if (V_ipq[hash].count < V_ipreass_maxbucketsize)
			fp = uma_zalloc(V_ipq_zone, M_NOWAIT);
		if (fp == NULL)
			fp = ipq_reuse(hash);
		if (fp == NULL)
			goto dropfrag;
#ifdef MAC
		if (mac_ipq_init(fp, M_NOWAIT) != 0) {
			uma_zfree(V_ipq_zone, fp);
			fp = NULL;
			goto dropfrag;
		}
		mac_ipq_create(m, fp);
#endif
		TAILQ_INSERT_HEAD(head, fp, ipq_list);
		fp->ipq_nfrags = 1;
		atomic_add_int(&nfrags, 1);
		fp->ipq_expire = time_uptime + V_ipfragttl;
		fp->ipq_p = ip->ip_p;
		fp->ipq_id = ip->ip_id;
		fp->ipq_src = ip->ip_src;
		fp->ipq_dst = ip->ip_dst;
		fp->ipq_frags = m;
		if (m->m_flags & M_IP_FRAG)
			fp->ipq_maxoff = -1;
		else
			fp->ipq_maxoff = ntohs(ip->ip_off) + ntohs(ip->ip_len);
		V_ipq[hash].count++;
		if (fp == TAILQ_LAST(head, ipqhead))
			callout_reset_sbt(&V_ipq[hash].timer,
			    SBT_1S * V_ipfragttl, SBT_1S, ipreass_callout,
			    &V_ipq[hash], 0);
		else
			MPASS(callout_active(&V_ipq[hash].timer));
		goto done;
	} else {
		/*
		 * If we already saw the last fragment, make sure
		 * this fragment's offset looks sane.  Otherwise, if
		 * this is the last fragment, record its endpoint.
		 */
		if (fp->ipq_maxoff > 0) {
			i = ntohs(ip->ip_off) + ntohs(ip->ip_len);
			if (((m->m_flags & M_IP_FRAG) && i >= fp->ipq_maxoff) ||
			    ((m->m_flags & M_IP_FRAG) == 0 &&
			    i != fp->ipq_maxoff)) {
				fp = NULL;
				goto dropfrag;
			}
		} else if ((m->m_flags & M_IP_FRAG) == 0)
			fp->ipq_maxoff = ntohs(ip->ip_off) + ntohs(ip->ip_len);
		fp->ipq_nfrags++;
		atomic_add_int(&nfrags, 1);
#ifdef MAC
		mac_ipq_update(m, fp);
#endif
	}

#define	GETIP(m)	((struct ip*)((m)->m_pkthdr.PH_loc.ptr))

	/*
	 * Handle ECN by comparing this segment with the first one;
	 * if CE is set, do not lose CE.
	 * Drop if CE and not-ECT are mixed for the same packet.
	 */
	ecn = ip->ip_tos & IPTOS_ECN_MASK;
	ecn0 = GETIP(fp->ipq_frags)->ip_tos & IPTOS_ECN_MASK;
	if (ecn == IPTOS_ECN_CE) {
		if (ecn0 == IPTOS_ECN_NOTECT)
			goto dropfrag;
		if (ecn0 != IPTOS_ECN_CE)
			GETIP(fp->ipq_frags)->ip_tos |= IPTOS_ECN_CE;
	}
	if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT)
		goto dropfrag;

	/*
	 * Find a segment which begins after this one does.
	 */
	for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt)
		if (ntohs(GETIP(q)->ip_off) > ntohs(ip->ip_off))
			break;

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us, otherwise
	 * stick new segment in the proper place.
	 *
	 * If some of the data is dropped from the preceding
	 * segment, then its checksum is invalidated.
	 */
	if (p) {
		i = ntohs(GETIP(p)->ip_off) + ntohs(GETIP(p)->ip_len) -
		    ntohs(ip->ip_off);
		if (i > 0) {
			if (i >= ntohs(ip->ip_len))
				goto dropfrag;
			m_adj(m, i);
			m->m_pkthdr.csum_flags = 0;
			ip->ip_off = htons(ntohs(ip->ip_off) + i);
			ip->ip_len = htons(ntohs(ip->ip_len) - i);
		}
		m->m_nextpkt = p->m_nextpkt;
		p->m_nextpkt = m;
	} else {
		m->m_nextpkt = fp->ipq_frags;
		fp->ipq_frags = m;
	}

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
	for (; q != NULL && ntohs(ip->ip_off) + ntohs(ip->ip_len) >
	    ntohs(GETIP(q)->ip_off); q = nq) {
		i = (ntohs(ip->ip_off) + ntohs(ip->ip_len)) -
		    ntohs(GETIP(q)->ip_off);
		if (i < ntohs(GETIP(q)->ip_len)) {
			GETIP(q)->ip_len = htons(ntohs(GETIP(q)->ip_len) - i);
			GETIP(q)->ip_off = htons(ntohs(GETIP(q)->ip_off) + i);
			m_adj(q, i);
			q->m_pkthdr.csum_flags = 0;
			break;
		}
		nq = q->m_nextpkt;
		m->m_nextpkt = nq;
		IPSTAT_INC(ips_fragdropped);
		fp->ipq_nfrags--;
		atomic_subtract_int(&nfrags, 1);
		m_freem(q);
	}
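
	/*
	 * Worked example of the trimming above (illustrative): if the new
	 * fragment covers bytes [0, 1480) and a queued one starts at 740
	 * with 1480 bytes of data, the overlap i is 740, so the queued
	 * fragment is trimmed to cover [1480, 2220).  Had the queued
	 * fragment held only 740 bytes, i would equal its length and it
	 * would be dequeued and freed instead.
	 */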

	/*
	 * Check for complete reassembly and perform frag per packet
	 * limiting.
	 *
	 * Frag limiting is performed here so that the nth frag has
	 * a chance to complete the packet before we drop the packet.
	 * As a result, n+1 frags are actually allowed per packet, but
	 * only n will ever be stored. (n = maxfragsperpacket.)
	 */
	next = 0;
	for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) {
		if (ntohs(GETIP(q)->ip_off) != next) {
			if (fp->ipq_nfrags > V_maxfragsperpacket)
				ipq_drop(&V_ipq[hash], fp);
			goto done;
		}
		next += ntohs(GETIP(q)->ip_len);
	}
	/* Make sure the last packet didn't have the IP_MF flag */
	if (p->m_flags & M_IP_FRAG) {
		if (fp->ipq_nfrags > V_maxfragsperpacket)
			ipq_drop(&V_ipq[hash], fp);
		goto done;
	}

	/*
	 * Reassembly is complete.  Make sure the packet is a sane size.
	 */
	q = fp->ipq_frags;
	ip = GETIP(q);
	if (next + (ip->ip_hl << 2) > IP_MAXPACKET) {
		IPSTAT_INC(ips_toolong);
		ipq_drop(&V_ipq[hash], fp);
		goto done;
	}

	/*
	 * Concatenate fragments.
	 */
	m = q;
	t = m->m_next;
	m->m_next = NULL;
	m_cat(m, t);
	nq = q->m_nextpkt;
	q->m_nextpkt = NULL;
	for (q = nq; q != NULL; q = nq) {
		nq = q->m_nextpkt;
		q->m_nextpkt = NULL;
		m->m_pkthdr.csum_flags &= q->m_pkthdr.csum_flags;
		m->m_pkthdr.csum_data += q->m_pkthdr.csum_data;
		m_demote_pkthdr(q);
		m_cat(m, q);
	}

	/*
	 * In order to do checksumming faster we do 'end-around carry' here
	 * (and not in for{} loop), though it implies we are not going to
	 * reassemble more than 64k fragments.
	 */
	while (m->m_pkthdr.csum_data & 0xffff0000)
		m->m_pkthdr.csum_data = (m->m_pkthdr.csum_data & 0xffff) +
		    (m->m_pkthdr.csum_data >> 16);
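
	/*
	 * Worked example of the fold above (illustrative): if the summed
	 * partial checksums leave csum_data at 0x2fffd, one pass computes
	 * 0xfffd + 0x2 = 0xffff, which already fits in 16 bits, so the
	 * loop terminates; it rarely runs more than twice.
	 */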
	atomic_subtract_int(&nfrags, fp->ipq_nfrags);
#ifdef MAC
	mac_ipq_reassemble(fp, m);
	mac_ipq_destroy(fp);
#endif

	/*
	 * Create header for new ip packet by modifying header of first
	 * packet; dequeue and discard fragment reassembly header.
	 * Make header visible.
	 */
	ip->ip_len = htons((ip->ip_hl << 2) + next);
	ip->ip_src = fp->ipq_src;
	ip->ip_dst = fp->ipq_dst;
	TAILQ_REMOVE(head, fp, ipq_list);
	V_ipq[hash].count--;
	uma_zfree(V_ipq_zone, fp);
	m->m_len += (ip->ip_hl << 2);
	m->m_data -= (ip->ip_hl << 2);
	/* some debugging cruft by sklower, below, will go away soon */
	if (m->m_flags & M_PKTHDR) {	/* XXX this should be done elsewhere */
		m_fixhdr(m);
		/* set valid receive interface pointer */
		m->m_pkthdr.rcvif = srcifp;
	}
	IPSTAT_INC(ips_reassembled);
	ipreass_reschedule(&V_ipq[hash]);
	IPQ_UNLOCK(hash);

#ifdef	RSS
	/*
	 * Query the RSS layer for the flowid / flowtype for the
	 * mbuf payload.
	 *
	 * For now, just assume we have to calculate a new one.
	 * Later on we should check to see if the assigned flowid matches
	 * what RSS wants for the given IP protocol and if so, just keep it.
	 *
	 * We then queue into the relevant netisr so it can be dispatched
	 * to the correct CPU.
	 *
	 * Note - this may return 1, which means the flowid in the mbuf
	 * is correct for the configured RSS hash types and can be used.
	 */
	if (rss_mbuf_software_hash_v4(m, 0, &rss_hash, &rss_type) == 0) {
		m->m_pkthdr.flowid = rss_hash;
		M_HASHTYPE_SET(m, rss_type);
	}

	/*
	 * Queue/dispatch for reprocessing.
	 *
	 * Note: this is much slower than just handling the frame in the
	 * current receive context.  It's likely worth investigating
	 * why this is.
	 */
	netisr_dispatch(NETISR_IP_DIRECT, m);
	return (NULL);
#endif

	/* Handle in-line */
	return (m);

dropfrag:
	IPSTAT_INC(ips_fragdropped);
	if (fp != NULL) {
		fp->ipq_nfrags--;
		atomic_subtract_int(&nfrags, 1);
	}
	m_freem(m);
done:
	IPQ_UNLOCK(hash);
	return (NULL);
}

/*
 * Timer expired on a bucket.
 * There should be at least one ipq to be timed out.
 */
static void
ipreass_callout(void *arg)
{
	struct ipqbucket *bucket = arg;
	struct ipq *fp;

	IPQ_BUCKET_LOCK_ASSERT(bucket);
	MPASS(atomic_load_int(&nfrags) > 0);

	CURVNET_SET(bucket->vnet);
	fp = TAILQ_LAST(&bucket->head, ipqhead);
	KASSERT(fp != NULL && fp->ipq_expire <= time_uptime,
	    ("%s: stray callout on bucket %p, %ju < %ju", __func__, bucket,
	    fp ? (uintmax_t)fp->ipq_expire : 0, (uintmax_t)time_uptime));

	while (fp != NULL && fp->ipq_expire <= time_uptime) {
		ipq_timeout(bucket, fp);
		fp = TAILQ_LAST(&bucket->head, ipqhead);
	}
	ipreass_reschedule(bucket);
	CURVNET_RESTORE();
}

static void
ipreass_reschedule(struct ipqbucket *bucket)
{
	struct ipq *fp;

	IPQ_BUCKET_LOCK_ASSERT(bucket);

	if ((fp = TAILQ_LAST(&bucket->head, ipqhead)) != NULL) {
		time_t t;

		/* Protect against time_uptime tick. */
		t = fp->ipq_expire - time_uptime;
		t = imax(t, 1);
		callout_reset_sbt(&bucket->timer, SBT_1S * t, SBT_1S,
		    ipreass_callout, bucket, 0);
	} else
		callout_stop(&bucket->timer);
}

static void
ipreass_drain_vnet(void)
{
	u_int dropped = 0;

	for (int i = 0; i < V_ipq_hashsize; i++) {
		bool resched;

		IPQ_LOCK(i);
		resched = !TAILQ_EMPTY(&V_ipq[i].head);
		while (!TAILQ_EMPTY(&V_ipq[i].head)) {
			struct ipq *fp = TAILQ_FIRST(&V_ipq[i].head);

			dropped += fp->ipq_nfrags;
			ipq_free(&V_ipq[i], fp);
		}
		if (resched)
			ipreass_reschedule(&V_ipq[i]);
		KASSERT(V_ipq[i].count == 0,
		    ("%s: V_ipq[%d] count %d (V_ipq=%p)", __func__, i,
		    V_ipq[i].count, V_ipq));
		IPQ_UNLOCK(i);
	}
	IPSTAT_ADD(ips_fragdropped, dropped);
}

/*
 * Drain off all datagram fragments.
 */
static void
ipreass_drain(void)
{
	VNET_ITERATOR_DECL(vnet_iter);

	VNET_LIST_RLOCK();
	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		ipreass_drain_vnet();
		CURVNET_RESTORE();
	}
	VNET_LIST_RUNLOCK();
}

/*
 * Initialize IP reassembly structures.
 */
MALLOC_DEFINE(M_IPREASS_HASH, "IP reass", "IP packet reassembly hash headers");

void
ipreass_vnet_init(void)
{
	int max;

	V_ipq_hashsize = IPREASS_NHASH;
	TUNABLE_INT_FETCH("net.inet.ip.reass_hashsize", &V_ipq_hashsize);
	V_ipq = malloc(sizeof(struct ipqbucket) * V_ipq_hashsize,
	    M_IPREASS_HASH, M_WAITOK);

	for (int i = 0; i < V_ipq_hashsize; i++) {
		TAILQ_INIT(&V_ipq[i].head);
		mtx_init(&V_ipq[i].lock, "IP reassembly", NULL,
		    MTX_DEF | MTX_DUPOK | MTX_NEW);
		callout_init_mtx(&V_ipq[i].timer, &V_ipq[i].lock, 0);
		V_ipq[i].count = 0;
#ifdef VIMAGE
		V_ipq[i].vnet = curvnet;
#endif
	}
	V_ipq_hashseed = arc4random();
	V_maxfragsperpacket = 16;
	V_ipq_zone = uma_zcreate("ipq", sizeof(struct ipq), NULL, NULL, NULL,
	    NULL, UMA_ALIGN_PTR, 0);
	max = IP_MAXFRAGPACKETS;
	max = uma_zone_set_max(V_ipq_zone, max);
	V_ipreass_maxbucketsize = imax(max / (V_ipq_hashsize / 2), 1);
}

void
ipreass_init(void)
{

	maxfrags = IP_MAXFRAGS;
	EVENTHANDLER_REGISTER(nmbclusters_change, ipreass_zone_change,
	    NULL, EVENTHANDLER_PRI_ANY);
	EVENTHANDLER_REGISTER(vm_lowmem, ipreass_drain, NULL,
	    LOWMEM_PRI_DEFAULT);
	EVENTHANDLER_REGISTER(mbuf_lowmem, ipreass_drain, NULL,
	    LOWMEM_PRI_DEFAULT);
}

/*
 * Drain off all datagram fragments belonging to
 * the given network interface.
 */
static void
ipreass_cleanup(void *arg __unused, struct ifnet *ifp)
{
	struct ipq *fp, *temp;
	struct mbuf *m;
	int i;

	KASSERT(ifp != NULL, ("%s: ifp is NULL", __func__));

	CURVNET_SET_QUIET(ifp->if_vnet);

	/*
	 * Skip processing if IPv4 reassembly is not initialised or
	 * torn down by ipreass_destroy().
	 */
	if (V_ipq_zone == NULL) {
		CURVNET_RESTORE();
		return;
	}

	for (i = 0; i < V_ipq_hashsize; i++) {
		IPQ_LOCK(i);
		/* Scan fragment list. */
		TAILQ_FOREACH_SAFE(fp, &V_ipq[i].head, ipq_list, temp) {
			for (m = fp->ipq_frags; m != NULL; m = m->m_nextpkt) {
				/* clear no longer valid rcvif pointer */
				if (m->m_pkthdr.rcvif == ifp)
					m->m_pkthdr.rcvif = NULL;
			}
		}
		IPQ_UNLOCK(i);
	}
	CURVNET_RESTORE();
}
EVENTHANDLER_DEFINE(ifnet_departure_event, ipreass_cleanup, NULL, 0);

#ifdef VIMAGE
/*
 * Destroy IP reassembly structures.
 */
void
ipreass_destroy(void)
{

	ipreass_drain_vnet();
	uma_zdestroy(V_ipq_zone);
	V_ipq_zone = NULL;
	for (int i = 0; i < V_ipq_hashsize; i++)
		mtx_destroy(&V_ipq[i].lock);
	free(V_ipq, M_IPREASS_HASH);
}
#endif

/*
 * After maxnipq has been updated, propagate the change to UMA.  The UMA zone
 * max has slightly different semantics than the sysctl, for historical
 * reasons.
 */
static void
ipreass_drain_tomax(void)
{
	struct ipq *fp;
	int target;

	/*
	 * Make sure each bucket is under the new limit.  If
	 * necessary, drop enough of the oldest elements from
	 * each bucket to get under the new limit.
	 */
	for (int i = 0; i < V_ipq_hashsize; i++) {
		IPQ_LOCK(i);
		while (V_ipq[i].count > V_ipreass_maxbucketsize &&
		    (fp = TAILQ_LAST(&V_ipq[i].head, ipqhead)) != NULL)
			ipq_timeout(&V_ipq[i], fp);
		ipreass_reschedule(&V_ipq[i]);
		IPQ_UNLOCK(i);
	}

	/*
	 * If we are over the maximum number of fragments,
	 * drain off enough to get down to the new limit,
	 * stripping off last elements on queues.  Every
	 * run we strip the oldest element from each bucket.
	 */
	target = uma_zone_get_max(V_ipq_zone);
	while (uma_zone_get_cur(V_ipq_zone) > target) {
		for (int i = 0; i < V_ipq_hashsize; i++) {
			IPQ_LOCK(i);
			fp = TAILQ_LAST(&V_ipq[i].head, ipqhead);
			if (fp != NULL) {
				ipq_timeout(&V_ipq[i], fp);
				ipreass_reschedule(&V_ipq[i]);
			}
			IPQ_UNLOCK(i);
		}
	}
}

static void
ipreass_zone_change(void *tag)
{
	VNET_ITERATOR_DECL(vnet_iter);
	int max;

	maxfrags = IP_MAXFRAGS;
	max = IP_MAXFRAGPACKETS;
	VNET_LIST_RLOCK_NOSLEEP();
	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		max = uma_zone_set_max(V_ipq_zone, max);
		V_ipreass_maxbucketsize = imax(max / (V_ipq_hashsize / 2), 1);
		ipreass_drain_tomax();
		CURVNET_RESTORE();
	}
	VNET_LIST_RUNLOCK_NOSLEEP();
}

/*
 * Change the limit on the UMA zone, or disable the fragment allocation
 * at all.  Since 0 and -1 are special values here, we need our own handler,
 * instead of sysctl_handle_uma_zone_max().
 */
static int
sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS)
{
	int error, max;

	if (V_noreass == 0) {
		max = uma_zone_get_max(V_ipq_zone);
		if (max == 0)
			max = -1;
	} else
		max = 0;
	error = sysctl_handle_int(oidp, &max, 0, req);
	if (error || !req->newptr)
		return (error);
	if (max > 0) {
		/*
		 * XXXRW: Might be a good idea to sanity check the argument
		 * and place an extreme upper bound.
		 */
		max = uma_zone_set_max(V_ipq_zone, max);
		V_ipreass_maxbucketsize = imax(max / (V_ipq_hashsize / 2), 1);
		ipreass_drain_tomax();
		V_noreass = 0;
	} else if (max == 0) {
		V_noreass = 1;
		ipreass_drain();
	} else if (max == -1) {
		V_noreass = 0;
		uma_zone_set_max(V_ipq_zone, 0);
		V_ipreass_maxbucketsize = INT_MAX;
	} else
		return (EINVAL);
	return (0);
}
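
/*
 * Illustrative usage (not part of the original source):
 * "sysctl net.inet.ip.maxfragpackets=0" disables reassembly entirely,
 * "sysctl net.inet.ip.maxfragpackets=-1" removes the zone limit (the
 * per-bucket limit then becomes INT_MAX), and any positive value caps
 * the number of reassembly queue entries.
 */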

/*
 * Seek for old fragment queue header that can be reused.  Try to
 * reuse a header from currently locked hash bucket.
 */
static struct ipq *
ipq_reuse(int start)
{
	struct ipq *fp;
	int bucket, i;

	IPQ_LOCK_ASSERT(start);

	for (i = 0; i < V_ipq_hashsize; i++) {
		bucket = (start + i) % V_ipq_hashsize;
		if (bucket != start && IPQ_TRYLOCK(bucket) == 0)
			continue;
		fp = TAILQ_LAST(&V_ipq[bucket].head, ipqhead);
		if (fp) {
			struct mbuf *m;

			IPSTAT_ADD(ips_fragtimeout, fp->ipq_nfrags);
			atomic_subtract_int(&nfrags, fp->ipq_nfrags);
			while (fp->ipq_frags) {
				m = fp->ipq_frags;
				fp->ipq_frags = m->m_nextpkt;
				m_freem(m);
			}
			TAILQ_REMOVE(&V_ipq[bucket].head, fp, ipq_list);
			V_ipq[bucket].count--;
			ipreass_reschedule(&V_ipq[bucket]);
			if (bucket != start)
				IPQ_UNLOCK(bucket);
			break;
		}
		if (bucket != start)
			IPQ_UNLOCK(bucket);
	}
	IPQ_LOCK_ASSERT(start);
	return (fp);
}

/*
 * Free a fragment reassembly header and all associated datagrams.
 */
static void
ipq_free(struct ipqbucket *bucket, struct ipq *fp)
{
	struct mbuf *q;

	atomic_subtract_int(&nfrags, fp->ipq_nfrags);
	while (fp->ipq_frags) {
		q = fp->ipq_frags;
		fp->ipq_frags = q->m_nextpkt;
		m_freem(q);
	}
	TAILQ_REMOVE(&bucket->head, fp, ipq_list);
	bucket->count--;
	uma_zfree(V_ipq_zone, fp);
}

/*
 * Get or set the maximum number of reassembly queues per bucket.
 */
static int
sysctl_maxfragbucketsize(SYSCTL_HANDLER_ARGS)
{
	int error, max;

	max = V_ipreass_maxbucketsize;
	error = sysctl_handle_int(oidp, &max, 0, req);
	if (error || !req->newptr)
		return (error);
	if (max <= 0)
		return (EINVAL);
	V_ipreass_maxbucketsize = max;
	ipreass_drain_tomax();
	return (0);
}

/*
 * Get or set the IP fragment time to live.
 */
static int
sysctl_fragttl(SYSCTL_HANDLER_ARGS)
{
	u_int ttl;
	int error;

	ttl = V_ipfragttl;
	error = sysctl_handle_int(oidp, &ttl, 0, req);
	if (error || !req->newptr)
		return (error);

	if (ttl < 1 || ttl > MAXTTL)
		return (EINVAL);

	atomic_store_int(&V_ipfragttl, ttl);
	return (0);
}