/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2001 McAfee, Inc.
 * Copyright (c) 2006,2013 Andre Oppermann, Internet Business Solutions AG
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Jonathan Lemon
 * and McAfee Research, the Security Research Division of McAfee, Inc. under
 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
 * DARPA CHATS research program. [2001 McAfee, Inc.]
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/cdefs.h>
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/hash.h>
#include <sys/refcount.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/proc.h>		/* for proc0 declaration */
#include <sys/random.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syslog.h>
#include <sys/ucred.h>

#include <crypto/siphash/siphash.h>

#include <vm/uma.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>
#include <net/vnet.h>

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_var.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/ip_options.h>
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet6/nd6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#endif
#include <netinet/tcp.h>
#include <netinet/tcp_fastopen.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_syncache.h>
#include <netinet/tcp_ecn.h>

#include <netinet/tcp_log_buf.h>

#ifdef TCP_OFFLOAD
#include <netinet/toecore.h>
#endif
#include <netinet/udp.h>

#include <netipsec/ipsec_support.h>

#include <machine/in_cksum.h>

#include <security/mac/mac_framework.h>
VNET_DEFINE_STATIC(int, tcp_syncookies) = 1;
#define V_tcp_syncookies		VNET(tcp_syncookies)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, syncookies, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_syncookies), 0,
    "Use TCP SYN cookies if the syncache overflows");

VNET_DEFINE_STATIC(int, tcp_syncookiesonly) = 0;
#define V_tcp_syncookiesonly		VNET(tcp_syncookiesonly)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, syncookies_only, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_syncookiesonly), 0,
    "Use only TCP SYN cookies");
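/*
 * A syncache entry created at the request of a TCP offload device keeps a
 * pointer to that device in sc_tod; a NULL sc_tod means the entry was
 * created by the software stack itself.
 */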
#ifdef TCP_OFFLOAD
#define ADDED_BY_TOE(sc) ((sc)->sc_tod != NULL)
#endif
static void	 syncache_drop(struct syncache *, struct syncache_head *);
static void	 syncache_free(struct syncache *);
static void	 syncache_insert(struct syncache *, struct syncache_head *);
static int	 syncache_respond(struct syncache *, const struct mbuf *, int);
static struct socket *syncache_socket(struct syncache *, struct socket *,
		    struct mbuf *);
static void	 syncache_timeout(struct syncache *sc, struct syncache_head *sch,
		    int docallout);
static void	 syncache_timer(void *);

static uint32_t	 syncookie_mac(struct in_conninfo *, tcp_seq, uint8_t,
		    uint8_t *, uintptr_t);
static tcp_seq	 syncookie_generate(struct syncache_head *, struct syncache *);
static struct syncache
		*syncookie_lookup(struct in_conninfo *, struct syncache_head *,
		    struct syncache *, struct tcphdr *, struct tcpopt *,
		    struct socket *, uint16_t);
static void	 syncache_pause(struct in_conninfo *);
static void	 syncache_unpause(void *);
static void	 syncookie_reseed(void *);
#ifdef INVARIANTS
static int	 syncookie_cmp(struct in_conninfo *inc, struct syncache_head *sch,
		    struct syncache *sc, struct tcphdr *th, struct tcpopt *to,
		    struct socket *lso, uint16_t port);
#endif
/*
 * Transmit the SYN,ACK fewer times than TCP_MAXRXTSHIFT specifies.
 * 3 retransmits corresponds to a timeout with default values of
 * tcp_rexmit_initial * (             1 +
 *                      tcp_backoff[1] +
 *                      tcp_backoff[2] +
 *                      tcp_backoff[3]) + 3 * tcp_rexmit_slop,
 * 1000 ms * (1 + 2 + 4 + 8) + 3 * 200 ms = 15600 ms,
 * the odds are that the user has given up attempting to connect by then.
 */
#define SYNCACHE_MAXREXMTS		3
/* Arbitrary values */
#define TCP_SYNCACHE_HASHSIZE		512
#define TCP_SYNCACHE_BUCKETLIMIT	30

VNET_DEFINE_STATIC(struct tcp_syncache, tcp_syncache);
#define V_tcp_syncache			VNET(tcp_syncache)

static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, syncache,
    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "TCP SYN cache");

SYSCTL_UINT(_net_inet_tcp_syncache, OID_AUTO, bucketlimit,
    CTLFLAG_VNET | CTLFLAG_RDTUN,
    &VNET_NAME(tcp_syncache.bucket_limit), 0,
    "Per-bucket hash limit for syncache");

SYSCTL_UINT(_net_inet_tcp_syncache, OID_AUTO, cachelimit,
    CTLFLAG_VNET | CTLFLAG_RDTUN,
    &VNET_NAME(tcp_syncache.cache_limit), 0,
    "Overall entry limit for syncache");

SYSCTL_UMA_CUR(_net_inet_tcp_syncache, OID_AUTO, count, CTLFLAG_VNET,
    &VNET_NAME(tcp_syncache.zone), "Current number of entries in syncache");

SYSCTL_UINT(_net_inet_tcp_syncache, OID_AUTO, hashsize,
    CTLFLAG_VNET | CTLFLAG_RDTUN,
    &VNET_NAME(tcp_syncache.hashsize), 0,
    "Size of TCP syncache hashtable");

SYSCTL_BOOL(_net_inet_tcp_syncache, OID_AUTO, see_other, CTLFLAG_VNET |
    CTLFLAG_RW, &VNET_NAME(tcp_syncache.see_other), 0,
    "All syncache(4) entries are visible, ignoring UID/GID, jail(2) "
    "and mac(4) checks");
static int
sysctl_net_inet_tcp_syncache_rexmtlimit_check(SYSCTL_HANDLER_ARGS)
{
    int error;
    u_int new;

    new = V_tcp_syncache.rexmt_limit;
    error = sysctl_handle_int(oidp, &new, 0, req);
    if ((error == 0) && (req->newptr != NULL)) {
        if (new > TCP_MAXRXTSHIFT)
            error = EINVAL;
        else
            V_tcp_syncache.rexmt_limit = new;
    }
    return (error);
}

SYSCTL_PROC(_net_inet_tcp_syncache, OID_AUTO, rexmtlimit,
    CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &VNET_NAME(tcp_syncache.rexmt_limit), 0,
    sysctl_net_inet_tcp_syncache_rexmtlimit_check, "IU",
    "Limit on SYN/ACK retransmissions");
VNET_DEFINE(int, tcp_sc_rst_sock_fail) = 1;
SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, rst_on_sock_fail,
    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_sc_rst_sock_fail), 0,
    "Send reset on socket allocation failure");

static MALLOC_DEFINE(M_SYNCACHE, "syncache", "TCP syncache");

#define SCH_LOCK(sch)		mtx_lock(&(sch)->sch_mtx)
#define SCH_UNLOCK(sch)		mtx_unlock(&(sch)->sch_mtx)
#define SCH_LOCK_ASSERT(sch)	mtx_assert(&(sch)->sch_mtx, MA_OWNED)
/*
 * Requires the syncache entry to be already removed from the bucket list.
 */
static void
syncache_free(struct syncache *sc)
{

    if (sc->sc_ipopts)
        (void)m_free(sc->sc_ipopts);
    if (sc->sc_cred)
        crfree(sc->sc_cred);
#ifdef MAC
    mac_syncache_destroy(&sc->sc_label);
#endif

    uma_zfree(V_tcp_syncache.zone, sc);
}
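/*
 * Per-vnet initialization: apply defaults and loader tunables, allocate the
 * hash table and the entry zone, and arm the syncookie reseed and pause
 * callouts.
 */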
void
syncache_init(void)
{
    int i;

    V_tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE;
    V_tcp_syncache.bucket_limit = TCP_SYNCACHE_BUCKETLIMIT;
    V_tcp_syncache.rexmt_limit = SYNCACHE_MAXREXMTS;
    V_tcp_syncache.hash_secret = arc4random();

    TUNABLE_INT_FETCH("net.inet.tcp.syncache.hashsize",
        &V_tcp_syncache.hashsize);
    TUNABLE_INT_FETCH("net.inet.tcp.syncache.bucketlimit",
        &V_tcp_syncache.bucket_limit);
    if (!powerof2(V_tcp_syncache.hashsize) ||
        V_tcp_syncache.hashsize == 0) {
        printf("WARNING: syncache hash size is not a power of 2.\n");
        V_tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE;
    }
    V_tcp_syncache.hashmask = V_tcp_syncache.hashsize - 1;

    /* Set limits. */
    V_tcp_syncache.cache_limit =
        V_tcp_syncache.hashsize * V_tcp_syncache.bucket_limit;
    TUNABLE_INT_FETCH("net.inet.tcp.syncache.cachelimit",
        &V_tcp_syncache.cache_limit);

    /* Allocate the hash table. */
    V_tcp_syncache.hashbase = malloc(V_tcp_syncache.hashsize *
        sizeof(struct syncache_head), M_SYNCACHE, M_WAITOK | M_ZERO);

#ifdef VIMAGE
    V_tcp_syncache.vnet = curvnet;
#endif

    /* Initialize the hash buckets. */
    for (i = 0; i < V_tcp_syncache.hashsize; i++) {
        TAILQ_INIT(&V_tcp_syncache.hashbase[i].sch_bucket);
        mtx_init(&V_tcp_syncache.hashbase[i].sch_mtx, "tcp_sc_head",
            NULL, MTX_DEF);
        callout_init_mtx(&V_tcp_syncache.hashbase[i].sch_timer,
            &V_tcp_syncache.hashbase[i].sch_mtx, 0);
        V_tcp_syncache.hashbase[i].sch_length = 0;
        V_tcp_syncache.hashbase[i].sch_sc = &V_tcp_syncache;
        V_tcp_syncache.hashbase[i].sch_last_overflow =
            -(SYNCOOKIE_LIFETIME + 1);
    }

    /* Create the syncache entry zone. */
    V_tcp_syncache.zone = uma_zcreate("syncache", sizeof(struct syncache),
        NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
    V_tcp_syncache.cache_limit = uma_zone_set_max(V_tcp_syncache.zone,
        V_tcp_syncache.cache_limit);

    /* Start the SYN cookie reseeder callout. */
    callout_init(&V_tcp_syncache.secret.reseed, 1);
    arc4rand(V_tcp_syncache.secret.key[0], SYNCOOKIE_SECRET_SIZE, 0);
    arc4rand(V_tcp_syncache.secret.key[1], SYNCOOKIE_SECRET_SIZE, 0);
    callout_reset(&V_tcp_syncache.secret.reseed, SYNCOOKIE_LIFETIME * hz,
        syncookie_reseed, &V_tcp_syncache);

    /* Initialize the pause machinery. */
    mtx_init(&V_tcp_syncache.pause_mtx, "tcp_sc_pause", NULL, MTX_DEF);
    callout_init_mtx(&V_tcp_syncache.pause_co, &V_tcp_syncache.pause_mtx,
        0);
    V_tcp_syncache.pause_until = time_uptime - TCP_SYNCACHE_PAUSE_TIME;
    V_tcp_syncache.pause_backoff = 0;
    V_tcp_syncache.paused = false;
}
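/*
 * Per-vnet teardown: drain all callouts, flush and destroy every hash
 * bucket, and release the entry zone and the hash table.
 */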
#ifdef VIMAGE
void
syncache_destroy(void)
{
    struct syncache_head *sch;
    struct syncache *sc, *nsc;
    int i;

    /*
     * Stop the re-seed timer before freeing resources.  No need to
     * possibly schedule it another time.
     */
    callout_drain(&V_tcp_syncache.secret.reseed);

    /* Stop the SYN cache pause callout. */
    mtx_lock(&V_tcp_syncache.pause_mtx);
    if (callout_stop(&V_tcp_syncache.pause_co) == 0) {
        /* callout_drain() acquires the lock itself. */
        mtx_unlock(&V_tcp_syncache.pause_mtx);
        callout_drain(&V_tcp_syncache.pause_co);
    } else
        mtx_unlock(&V_tcp_syncache.pause_mtx);

    /* Cleanup hash buckets: stop timers, free entries, destroy locks. */
    for (i = 0; i < V_tcp_syncache.hashsize; i++) {
        sch = &V_tcp_syncache.hashbase[i];
        callout_drain(&sch->sch_timer);

        SCH_LOCK(sch);
        TAILQ_FOREACH_SAFE(sc, &sch->sch_bucket, sc_hash, nsc)
            syncache_drop(sc, sch);
        SCH_UNLOCK(sch);
        KASSERT(TAILQ_EMPTY(&sch->sch_bucket),
            ("%s: sch->sch_bucket not empty", __func__));
        KASSERT(sch->sch_length == 0, ("%s: sch->sch_length %d not 0",
            __func__, sch->sch_length));
        mtx_destroy(&sch->sch_mtx);
    }

    KASSERT(uma_zone_get_cur(V_tcp_syncache.zone) == 0,
        ("%s: cache_count not 0", __func__));

    /* Free the allocated global resources. */
    uma_zdestroy(V_tcp_syncache.zone);
    free(V_tcp_syncache.hashbase, M_SYNCACHE);
    mtx_destroy(&V_tcp_syncache.pause_mtx);
}
#endif
/*
 * Inserts a syncache entry into the specified bucket row.
 * Locks and unlocks the syncache_head autonomously.
 */
static void
syncache_insert(struct syncache *sc, struct syncache_head *sch)
{
    struct syncache *sc2;

    SCH_LOCK(sch);

    /*
     * Make sure that we don't overflow the per-bucket limit.
     * If the bucket is full, toss the oldest element.
     */
    if (sch->sch_length >= V_tcp_syncache.bucket_limit) {
        KASSERT(!TAILQ_EMPTY(&sch->sch_bucket),
            ("sch->sch_length incorrect"));
        syncache_pause(&sc->sc_inc);
        sc2 = TAILQ_LAST(&sch->sch_bucket, sch_head);
        sch->sch_last_overflow = time_uptime;
        syncache_drop(sc2, sch);
    }

    /* Put it into the bucket. */
    TAILQ_INSERT_HEAD(&sch->sch_bucket, sc, sc_hash);
    sch->sch_length++;

#ifdef TCP_OFFLOAD
    if (ADDED_BY_TOE(sc)) {
        struct toedev *tod = sc->sc_tod;

        tod->tod_syncache_added(tod, sc->sc_todctx);
    }
#endif

    /* Reinitialize the bucket row's timer. */
    if (sch->sch_length == 1)
        sch->sch_nextc = ticks + INT_MAX;
    syncache_timeout(sc, sch, 1);

    SCH_UNLOCK(sch);

    TCPSTATES_INC(TCPS_SYN_RECEIVED);
    TCPSTAT_INC(tcps_sc_added);
}
/*
 * Remove and free entry from syncache bucket row.
 * Expects locked syncache head.
 */
static void
syncache_drop(struct syncache *sc, struct syncache_head *sch)
{
    SCH_LOCK_ASSERT(sch);

    TCPSTATES_DEC(TCPS_SYN_RECEIVED);
    TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash);
    sch->sch_length--;

#ifdef TCP_OFFLOAD
    if (ADDED_BY_TOE(sc)) {
        struct toedev *tod = sc->sc_tod;

        tod->tod_syncache_removed(tod, sc->sc_todctx);
    }
#endif

    syncache_free(sc);
}
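/*
 * The timeout below follows the usual TCP retransmit backoff: with the
 * default tcp_rexmit_initial of 1000 ms and tcp_backoff[] starting
 * { 1, 2, 4, 8, ... }, SYN|ACK retransmissions fire after roughly 1, 2, 4
 * and 8 seconds, clamped to [tcp_rexmit_min, TCPTV_REXMTMAX].
 */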
/*
 * Engage/reengage time on bucket row.
 */
static void
syncache_timeout(struct syncache *sc, struct syncache_head *sch, int docallout)
{
    int rexmt;

    if (sc->sc_rxmits == 0)
        rexmt = tcp_rexmit_initial;
    else
        TCPT_RANGESET(rexmt,
            tcp_rexmit_initial * tcp_backoff[sc->sc_rxmits],
            tcp_rexmit_min, TCPTV_REXMTMAX);
    sc->sc_rxttime = ticks + rexmt;
    sc->sc_rxmits++;
    if (TSTMP_LT(sc->sc_rxttime, sch->sch_nextc)) {
        sch->sch_nextc = sc->sc_rxttime;
        if (docallout)
            callout_reset(&sch->sch_timer, sch->sch_nextc - ticks,
                syncache_timer, (void *)sch);
    }
}
/*
 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted.
 * If we have retransmitted an entry the maximum number of times, expire it.
 * One separate timer for each bucket row.
 */
static void
syncache_timer(void *xsch)
{
    struct syncache_head *sch = (struct syncache_head *)xsch;
    struct syncache *sc, *nsc;
    struct epoch_tracker et;
    int tick = ticks;
    char *s;
    bool paused;

    CURVNET_SET(sch->sch_sc->vnet);

    /* NB: syncache_head has already been locked by the callout. */
    SCH_LOCK_ASSERT(sch);

    /*
     * In the following cycle we may remove some entries and/or
     * advance some timeouts, so re-initialize the bucket timer.
     */
    sch->sch_nextc = tick + INT_MAX;

    /*
     * If we have paused processing, unconditionally remove
     * all syncache entries.
     */
    mtx_lock(&V_tcp_syncache.pause_mtx);
    paused = V_tcp_syncache.paused;
    mtx_unlock(&V_tcp_syncache.pause_mtx);

    TAILQ_FOREACH_SAFE(sc, &sch->sch_bucket, sc_hash, nsc) {
        if (paused) {
            syncache_drop(sc, sch);
            continue;
        }
        /*
         * We do not check if the listen socket still exists
         * and accept the case where the listen socket may be
         * gone by the time we resend the SYN/ACK.  We do
         * not expect this to happen often.  If it does,
         * then the RST will be sent by the time the remote
         * host does the SYN/ACK->ACK.
         */
        if (TSTMP_GT(sc->sc_rxttime, tick)) {
            if (TSTMP_LT(sc->sc_rxttime, sch->sch_nextc))
                sch->sch_nextc = sc->sc_rxttime;
            continue;
        }
        if (sc->sc_rxmits > V_tcp_ecn_maxretries) {
            sc->sc_flags &= ~SCF_ECN_MASK;
        }
        if (sc->sc_rxmits > V_tcp_syncache.rexmt_limit) {
            if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) {
                log(LOG_DEBUG, "%s; %s: Retransmits exhausted, "
                    "giving up and removing syncache entry\n",
                    s, __func__);
                free(s, M_TCPLOG);
            }
            syncache_drop(sc, sch);
            TCPSTAT_INC(tcps_sc_stale);
            continue;
        }
        if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) {
            log(LOG_DEBUG, "%s; %s: Response timeout, "
                "retransmitting (%u) SYN|ACK\n",
                s, __func__, sc->sc_rxmits);
            free(s, M_TCPLOG);
        }

        NET_EPOCH_ENTER(et);
        if (syncache_respond(sc, NULL, TH_SYN|TH_ACK) == 0) {
            syncache_timeout(sc, sch, 0);
            TCPSTAT_INC(tcps_sndacks);
            TCPSTAT_INC(tcps_sndtotal);
            TCPSTAT_INC(tcps_sc_retransmitted);
        } else {
            syncache_drop(sc, sch);
            TCPSTAT_INC(tcps_sc_dropped);
        }
        NET_EPOCH_EXIT(et);
    }
    if (!TAILQ_EMPTY(&(sch)->sch_bucket))
        callout_reset(&(sch)->sch_timer, (sch)->sch_nextc - tick,
            syncache_timer, (void *)(sch));
    CURVNET_RESTORE();
}
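/*
 * Note on the pause machinery used above: after repeated bucket overflows
 * syncache_pause() switches the host to syncookie-only operation for a
 * while, and the timer walk then unconditionally flushes all remaining
 * cache entries.
 */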
/*
 * Returns true if the system is only using cookies at the moment.
 * This could be due to a sysadmin decision to only use cookies, or it
 * could be due to the system detecting an attack.
 */
static bool
syncache_cookiesonly(void)
{

    return (V_tcp_syncookies && (V_tcp_syncache.paused ||
        V_tcp_syncookiesonly));
}
/*
 * Find the hash bucket for the given connection.
 */
static struct syncache_head *
syncache_hashbucket(struct in_conninfo *inc)
{
    uint32_t hash;

    /*
     * The hash is built on foreign port + local port + foreign address.
     * We rely on the fact that struct in_conninfo starts with 16 bits
     * of foreign port, then 16 bits of local port, followed by 128 bits
     * of foreign address.  In case of an IPv4 address, the first 3
     * 32-bit words of the address always are zeroes.
     */
    hash = jenkins_hash32((uint32_t *)&inc->inc_ie, 5,
        V_tcp_syncache.hash_secret) & V_tcp_syncache.hashmask;

    return (&V_tcp_syncache.hashbase[hash]);
}
/*
 * Find an entry in the syncache.
 * Always returns with a locked syncache_head plus a matching entry or NULL.
 */
static struct syncache *
syncache_lookup(struct in_conninfo *inc, struct syncache_head **schp)
{
    struct syncache *sc;
    struct syncache_head *sch;

    *schp = sch = syncache_hashbucket(inc);
    SCH_LOCK(sch);

    /* Circle through bucket row to find matching entry. */
    TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash)
        if (bcmp(&inc->inc_ie, &sc->sc_inc.inc_ie,
            sizeof(struct in_endpoints)) == 0)
            break;

    return (sc);	/* Always returns with locked sch. */
}
/*
 * This function is called when we get a RST for a
 * non-existent connection, so that we can see if the
 * connection is in the syn cache.  If it is, zap it.
 * If required send a challenge ACK.
 */
void
syncache_chkrst(struct in_conninfo *inc, struct tcphdr *th, struct mbuf *m,
    uint16_t port)
{
    struct syncache *sc;
    struct syncache_head *sch;
    char *s = NULL;

    if (syncache_cookiesonly())
        return;
    sc = syncache_lookup(inc, &sch);	/* returns locked sch */
    SCH_LOCK_ASSERT(sch);

    /*
     * No corresponding connection was found in syncache.
     * If syncookies are enabled and possibly exclusively
     * used, or we are under memory pressure, a valid RST
     * may not find a syncache entry.  In that case we're
     * done and no SYN|ACK retransmissions will happen.
     * Otherwise the RST was misdirected or spoofed.
     */
    if (sc == NULL) {
        if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
            log(LOG_DEBUG, "%s; %s: Spurious RST without matching "
                "syncache entry (possibly syncookie only), "
                "segment ignored\n", s, __func__);
        TCPSTAT_INC(tcps_badrst);
        goto done;
    }

    /* The remote UDP encaps port does not match. */
    if (sc->sc_port != port) {
        if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
            log(LOG_DEBUG, "%s; %s: Spurious RST with matching "
                "syncache entry but non-matching UDP encaps port, "
                "segment ignored\n", s, __func__);
        TCPSTAT_INC(tcps_badrst);
        goto done;
    }

    /*
     * If the RST bit is set, check the sequence number to see
     * if this is a valid reset segment.
     *
     * RFC 793 page 37:
     *   In all states except SYN-SENT, all reset (RST) segments
     *   are validated by checking their SEQ-fields.  A reset is
     *   valid if its sequence number is in the window.
     *
     * RFC 793 page 69:
     *   There are four cases for the acceptability test for an incoming
     *   segment:
     *
     *   Segment Receive  Test
     *   Length  Window
     *   ------- -------  -------------------------------------------
     *      0       0     SEG.SEQ = RCV.NXT
     *      0      >0     RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND
     *     >0       0     not acceptable
     *     >0      >0     RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND
     *                    or RCV.NXT =< SEG.SEQ+SEG.LEN-1 < RCV.NXT+RCV.WND
     *
     * Note that when receiving a SYN segment in the LISTEN state,
     * IRS is set to SEG.SEQ and RCV.NXT is set to SEG.SEQ+1, as
     * described in RFC 793, page 66.
     */
    if ((SEQ_GEQ(th->th_seq, sc->sc_irs + 1) &&
        SEQ_LT(th->th_seq, sc->sc_irs + 1 + sc->sc_wnd)) ||
        (sc->sc_wnd == 0 && th->th_seq == sc->sc_irs + 1)) {
        if (V_tcp_insecure_rst ||
            th->th_seq == sc->sc_irs + 1) {
            syncache_drop(sc, sch);
            if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
                log(LOG_DEBUG,
                    "%s; %s: Our SYN|ACK was rejected, "
                    "connection attempt aborted by remote "
                    "endpoint\n", s, __func__);
            TCPSTAT_INC(tcps_sc_reset);
        } else {
            TCPSTAT_INC(tcps_badrst);
            /* Send challenge ACK. */
            if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
                log(LOG_DEBUG, "%s; %s: RST with invalid "
                    "SEQ %u != NXT %u (+WND %u), "
                    "sending challenge ACK\n",
                    s, __func__,
                    th->th_seq, sc->sc_irs + 1, sc->sc_wnd);
            if (syncache_respond(sc, m, TH_ACK) == 0) {
                TCPSTAT_INC(tcps_sndacks);
                TCPSTAT_INC(tcps_sndtotal);
            } else {
                syncache_drop(sc, sch);
                TCPSTAT_INC(tcps_sc_dropped);
            }
        }
    } else {
        if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
            log(LOG_DEBUG, "%s; %s: RST with invalid SEQ %u != "
                "NXT %u (+WND %u), segment ignored\n",
                s, __func__,
                th->th_seq, sc->sc_irs + 1, sc->sc_wnd);
        TCPSTAT_INC(tcps_badrst);
    }

done:
    if (s != NULL)
        free(s, M_TCPLOG);
    SCH_UNLOCK(sch);
}
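/*
 * An ACK arrived that cannot belong to an established connection; if it
 * matches a syncache entry (including the UDP encapsulation port), give
 * up on that embryonic connection and drop the entry.
 */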
void
syncache_badack(struct in_conninfo *inc, uint16_t port)
{
    struct syncache *sc;
    struct syncache_head *sch;

    if (syncache_cookiesonly())
        return;
    sc = syncache_lookup(inc, &sch);	/* returns locked sch */
    SCH_LOCK_ASSERT(sch);
    if ((sc != NULL) && (sc->sc_port == port)) {
        syncache_drop(sc, sch);
        TCPSTAT_INC(tcps_sc_badack);
    }
    SCH_UNLOCK(sch);
}
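/*
 * Handle an ICMP unreachable notification that may refer to an embryonic
 * connection; the embedded port and sequence number are validated before
 * the error is acted upon.
 */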
void
syncache_unreach(struct in_conninfo *inc, tcp_seq th_seq, uint16_t port)
{
    struct syncache *sc;
    struct syncache_head *sch;

    if (syncache_cookiesonly())
        return;
    sc = syncache_lookup(inc, &sch);	/* returns locked sch */
    SCH_LOCK_ASSERT(sch);
    if (sc == NULL)
        goto done;

    /* If the port != sc_port, then it's a bogus ICMP msg */
    if (port != sc->sc_port)
        goto done;

    /* If the sequence number != sc_iss, then it's a bogus ICMP msg */
    if (ntohl(th_seq) != sc->sc_iss)
        goto done;

    /*
     * If we've retransmitted 3 times and this is our second error,
     * we remove the entry.  Otherwise, we allow it to continue on.
     * This prevents us from incorrectly nuking an entry during a
     * spurious network outage.
     *
     * See tcp_notify().
     */
    if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxmits < 3 + 1) {
        sc->sc_flags |= SCF_UNREACH;
        goto done;
    }
    syncache_drop(sc, sch);
    TCPSTAT_INC(tcps_sc_unreach);
done:
    SCH_UNLOCK(sch);
}
/*
 * Build a new TCP socket structure from a syncache entry.
 *
 * On success return the newly created socket with its underlying inp locked.
 */
static struct socket *
syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
{
    struct inpcb *inp = NULL;
    struct socket *so;
    struct tcpcb *tp;
    int error;
    char *s;

    NET_EPOCH_ASSERT();

    /*
     * Ok, create the full blown connection, and set things up
     * as they would have been set up if we had created the
     * connection when the SYN arrived.
     */
    if ((so = solisten_clone(lso)) == NULL)
        goto allocfail;
#ifdef MAC
    mac_socketpeer_set_from_mbuf(m, so);
#endif
    error = in_pcballoc(so, &V_tcbinfo);
    if (error) {
        sodealloc(so);
        goto allocfail;
    }
    inp = sotoinpcb(so);
    if ((tp = tcp_newtcpcb(inp, sototcpcb(lso))) == NULL) {
        in_pcbfree(inp);
        sodealloc(so);
        goto allocfail;
    }
    inp->inp_inc.inc_flags = sc->sc_inc.inc_flags;
#ifdef INET6
    if (sc->sc_inc.inc_flags & INC_ISIPV6) {
        inp->inp_vflag &= ~INP_IPV4;
        inp->inp_vflag |= INP_IPV6;
        inp->in6p_laddr = sc->sc_inc.inc6_laddr;
    } else {
        inp->inp_vflag &= ~INP_IPV6;
        inp->inp_vflag |= INP_IPV4;
#endif
        inp->inp_ip_ttl = sc->sc_ip_ttl;
        inp->inp_ip_tos = sc->sc_ip_tos;
        inp->inp_laddr = sc->sc_inc.inc_laddr;
#ifdef INET6
    }
#endif

    /*
     * If there's an mbuf and it has a flowid, then let's initialise the
     * inp with that particular flowid.
     */
    if (m != NULL && M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
        inp->inp_flowid = m->m_pkthdr.flowid;
        inp->inp_flowtype = M_HASHTYPE_GET(m);
#ifdef NUMA
        inp->inp_numa_domain = m->m_pkthdr.numa_domain;
#endif
    }

    inp->inp_lport = sc->sc_inc.inc_lport;
#ifdef INET6
    if (inp->inp_vflag & INP_IPV6PROTO) {
        struct inpcb *oinp = sotoinpcb(lso);

        /*
         * Inherit socket options from the listening socket.
         * Note that in6p_inputopts are not (and should not be)
         * copied, since it stores previously received options and is
         * used to detect if each new option is different than the
         * previous one and hence should be passed to a user.
         * If we copied in6p_inputopts, a user would not be able to
         * receive options just after calling the accept system call.
         */
        inp->inp_flags |= oinp->inp_flags & INP_CONTROLOPTS;
        if (oinp->in6p_outputopts)
            inp->in6p_outputopts =
                ip6_copypktopts(oinp->in6p_outputopts, M_NOWAIT);
        inp->in6p_hops = oinp->in6p_hops;
    }

    if (sc->sc_inc.inc_flags & INC_ISIPV6) {
        struct sockaddr_in6 sin6;

        sin6.sin6_family = AF_INET6;
        sin6.sin6_len = sizeof(sin6);
        sin6.sin6_addr = sc->sc_inc.inc6_faddr;
        sin6.sin6_port = sc->sc_inc.inc_fport;
        sin6.sin6_flowinfo = sin6.sin6_scope_id = 0;
        INP_HASH_WLOCK(&V_tcbinfo);
        error = in6_pcbconnect(inp, &sin6, thread0.td_ucred, false);
        INP_HASH_WUNLOCK(&V_tcbinfo);
        if (error != 0)
            goto abort;
        /* Override flowlabel from in6_pcbconnect. */
        inp->inp_flow &= ~IPV6_FLOWLABEL_MASK;
        inp->inp_flow |= sc->sc_flowlabel;
    }
#endif /* INET6 */
#if defined(INET) && defined(INET6)
    else
#endif
#ifdef INET
    {
        struct sockaddr_in sin;

        inp->inp_options = (m) ? ip_srcroute(m) : NULL;

        if (inp->inp_options == NULL) {
            inp->inp_options = sc->sc_ipopts;
            sc->sc_ipopts = NULL;
        }

        sin.sin_family = AF_INET;
        sin.sin_len = sizeof(sin);
        sin.sin_addr = sc->sc_inc.inc_faddr;
        sin.sin_port = sc->sc_inc.inc_fport;
        bzero((caddr_t)sin.sin_zero, sizeof(sin.sin_zero));
        INP_HASH_WLOCK(&V_tcbinfo);
        error = in_pcbconnect(inp, &sin, thread0.td_ucred);
        INP_HASH_WUNLOCK(&V_tcbinfo);
        if (error != 0)
            goto abort;
    }
#endif /* INET */
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
    /* Copy old policy into new socket's. */
    if (ipsec_copy_pcbpolicy(sotoinpcb(lso), inp) != 0)
        printf("syncache_socket: could not copy policy\n");
#endif
    tp->t_state = TCPS_SYN_RECEIVED;
    tp->iss = sc->sc_iss;
    tp->irs = sc->sc_irs;
    tp->t_port = sc->sc_port;
    tcp_rcvseqinit(tp);
    tcp_sendseqinit(tp);
    tp->snd_wl1 = sc->sc_irs;
    tp->snd_max = tp->iss + 1;
    tp->snd_nxt = tp->iss + 1;
    tp->rcv_up = sc->sc_irs + 1;
    tp->rcv_wnd = sc->sc_wnd;
    tp->rcv_adv += tp->rcv_wnd;
    tp->last_ack_sent = tp->rcv_nxt;

    tp->t_flags = sototcpcb(lso)->t_flags &
        (TF_LRD|TF_NOPUSH|TF_NODELAY);
    if (sc->sc_flags & SCF_NOOPT)
        tp->t_flags |= TF_NOOPT;
    else {
        if (sc->sc_flags & SCF_WINSCALE) {
            tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
            tp->snd_scale = sc->sc_requested_s_scale;
            tp->request_r_scale = sc->sc_requested_r_scale;
        }
        if (sc->sc_flags & SCF_TIMESTAMP) {
            tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;
            tp->ts_recent = sc->sc_tsreflect;
            tp->ts_recent_age = tcp_ts_getticks();
            tp->ts_offset = sc->sc_tsoff;
        }
#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
        if (sc->sc_flags & SCF_SIGNATURE)
            tp->t_flags |= TF_SIGNATURE;
#endif
        if (sc->sc_flags & SCF_SACK)
            tp->t_flags |= TF_SACK_PERMIT;
    }
    tcp_ecn_syncache_socket(tp, sc);

    /*
     * Set up MSS and get cached values from tcp_hostcache.
     * This might overwrite some of the defaults we just set.
     */
    tcp_mss(tp, sc->sc_peer_mss);

    /*
     * If the SYN,ACK was retransmitted, indicate that CWND to be
     * limited to one segment in cc_conn_init().
     * NB: sc_rxmits counts all SYN,ACK transmits, not just retransmits.
     */
    if (sc->sc_rxmits > 1)
        tp->snd_cwnd = 1;

#ifdef TCP_OFFLOAD
    /*
     * Allow a TOE driver to install its hooks.  Note that we hold the
     * pcbinfo lock too and that prevents tcp_usr_accept from accepting a
     * new connection before the TOE driver has done its thing.
     */
    if (ADDED_BY_TOE(sc)) {
        struct toedev *tod = sc->sc_tod;

        tod->tod_offload_socket(tod, sc->sc_todctx, so);
    }
#endif
    /*
     * Inherit the log state from the listening socket, if
     * - the log state of the listening socket is not off and
     * - the listening socket was not auto selected from all sessions and
     * - a log id is not set on the listening socket.
     * This avoids inheriting a log state which was automatically set.
     */
    if ((tcp_get_bblog_state(sototcpcb(lso)) != TCP_LOG_STATE_OFF) &&
        ((sototcpcb(lso)->t_flags2 & TF2_LOG_AUTO) == 0) &&
        (sototcpcb(lso)->t_lib == NULL)) {
        tcp_log_state_change(tp, tcp_get_bblog_state(sototcpcb(lso)));
    }

    /*
     * Copy and activate timers.
     */
    tp->t_maxunacktime = sototcpcb(lso)->t_maxunacktime;
    tp->t_keepinit = sototcpcb(lso)->t_keepinit;
    tp->t_keepidle = sototcpcb(lso)->t_keepidle;
    tp->t_keepintvl = sototcpcb(lso)->t_keepintvl;
    tp->t_keepcnt = sototcpcb(lso)->t_keepcnt;
    tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));

    TCPSTAT_INC(tcps_accepts);
    TCP_PROBE6(state__change, NULL, tp, NULL, tp, NULL, TCPS_LISTEN);

    if (!solisten_enqueue(so, SS_ISCONNECTED))
        tp->t_flags |= TF_SONOTCONN;
    /* Can we inherit anything from the listener? */
    if (tp->t_fb->tfb_inherit != NULL) {
        (*tp->t_fb->tfb_inherit)(tp, sotoinpcb(lso));
    }

    return (so);

allocfail:
    /*
     * Drop the connection; we will either send a RST or have the peer
     * retransmit its SYN again after its RTO and try again.
     */
    if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) {
        log(LOG_DEBUG, "%s; %s: Socket create failed "
            "due to limits or memory shortage\n",
            s, __func__);
        free(s, M_TCPLOG);
    }
    TCPSTAT_INC(tcps_listendrop);
    return (NULL);

abort:
    tcp_discardcb(tp);
    in_pcbfree(inp);
    sodealloc(so);
    if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) {
        log(LOG_DEBUG, "%s; %s: in%s_pcbconnect failed with error %i\n",
            s, __func__, (sc->sc_inc.inc_flags & INC_ISIPV6) ? "6" : "",
            error);
        free(s, M_TCPLOG);
    }
    TCPSTAT_INC(tcps_listendrop);
    return (NULL);
}
/*
 * This function gets called when we receive an ACK for a
 * socket in the LISTEN state.  We look up the connection
 * in the syncache, and if it's there, we pull it out of
 * the cache and turn it into a full-blown connection in
 * the SYN-RECEIVED state.
 *
 * On syncache_socket() success the newly created socket
 * has its underlying inp locked.
 */
int
syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
    struct socket **lsop, struct mbuf *m, uint16_t port)
{
    struct syncache *sc;
    struct syncache_head *sch;
    struct syncache scs;
    char *s;
    bool locked;

    NET_EPOCH_ASSERT();
    KASSERT((tcp_get_flags(th) & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK,
        ("%s: can handle only ACK", __func__));

    if (syncache_cookiesonly()) {
        sc = NULL;
        sch = syncache_hashbucket(inc);
        locked = false;
    } else {
        sc = syncache_lookup(inc, &sch);	/* returns locked sch */
        locked = true;
        SCH_LOCK_ASSERT(sch);
    }

#ifdef INVARIANTS
    /*
     * Test code for syncookies comparing the syncache stored
     * values with the reconstructed values from the cookie.
     */
    if (sc != NULL)
        syncookie_cmp(inc, sch, sc, th, to, *lsop, port);
#endif

    if (sc == NULL) {
        /*
         * There is no syncache entry, so see if this ACK is
         * a returning syncookie.  To do this, first:
         *  A. Check if syncookies are used in case of syncache
         *     overflows
         *  B. See if this socket has had a syncache entry dropped in
         *     the recent past.  We don't want to accept a bogus
         *     syncookie if we've never received a SYN or accept it
         *     twice.
         *  C. check that the syncookie is valid.  If it is, then
         *     cobble up a fake syncache entry, and return.
         */
        if (locked && !V_tcp_syncookies) {
            SCH_UNLOCK(sch);
            if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
                log(LOG_DEBUG, "%s; %s: Spurious ACK, "
                    "segment rejected (syncookies disabled)\n",
                    s, __func__);
            goto failed;
        }
        if (locked && !V_tcp_syncookiesonly &&
            sch->sch_last_overflow < time_uptime - SYNCOOKIE_LIFETIME) {
            SCH_UNLOCK(sch);
            if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
                log(LOG_DEBUG, "%s; %s: Spurious ACK, "
                    "segment rejected (no syncache entry)\n",
                    s, __func__);
            goto failed;
        }
        bzero(&scs, sizeof(scs));
        sc = syncookie_lookup(inc, sch, &scs, th, to, *lsop, port);
        if (locked)
            SCH_UNLOCK(sch);
        if (sc == NULL) {
            if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
                log(LOG_DEBUG, "%s; %s: Segment failed "
                    "SYNCOOKIE authentication, segment rejected "
                    "(probably spoofed)\n", s, __func__);
            goto failed;
        }
#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
        /* If received ACK has MD5 signature, check it. */
        if ((to->to_flags & TOF_SIGNATURE) != 0 &&
            (!TCPMD5_ENABLED() ||
            TCPMD5_INPUT(m, th, to->to_signature) != 0)) {
            /* Drop the ACK. */
            if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
                log(LOG_DEBUG, "%s; %s: Segment rejected, "
                    "MD5 signature doesn't match.\n",
                    s, __func__);
                free(s, M_TCPLOG);
            }
            TCPSTAT_INC(tcps_sig_err_sigopt);
            return (-1); /* Do not send RST */
        }
#endif /* TCP_SIGNATURE */
        TCPSTATES_INC(TCPS_SYN_RECEIVED);
    } else {
        if (sc->sc_port != port) {
            SCH_UNLOCK(sch);
            return (0);
        }
#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
        /*
         * If listening socket requested TCP digests, check that
         * received ACK has signature and it is correct.
         * If not, drop the ACK and leave sc entry in the cache,
         * because SYN was received with correct signature.
         */
        if (sc->sc_flags & SCF_SIGNATURE) {
            if ((to->to_flags & TOF_SIGNATURE) == 0) {
                /* No signature */
                TCPSTAT_INC(tcps_sig_err_nosigopt);
                SCH_UNLOCK(sch);
                if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
                    log(LOG_DEBUG, "%s; %s: Segment "
                        "rejected, MD5 signature wasn't "
                        "provided.\n", s, __func__);
                    free(s, M_TCPLOG);
                }
                return (-1); /* Do not send RST */
            }
            if (!TCPMD5_ENABLED() ||
                TCPMD5_INPUT(m, th, to->to_signature) != 0) {
                /* Doesn't match or no SA */
                SCH_UNLOCK(sch);
                if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
                    log(LOG_DEBUG, "%s; %s: Segment "
                        "rejected, MD5 signature doesn't "
                        "match.\n", s, __func__);
                    free(s, M_TCPLOG);
                }
                return (-1); /* Do not send RST */
            }
        }
#endif /* TCP_SIGNATURE */

        /*
         * RFC 7323 PAWS: If we have a timestamp on this segment and
         * it's less than ts_recent, drop it.
         * XXXMT: RFC 7323 also requires to send an ACK.
         *        In tcp_input.c this is only done for TCP segments
         *        with user data, so be consistent here and just drop
         *        the ACK segment.
         */
        if (sc->sc_flags & SCF_TIMESTAMP && to->to_flags & TOF_TS &&
            TSTMP_LT(to->to_tsval, sc->sc_tsreflect)) {
            SCH_UNLOCK(sch);
            if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
                log(LOG_DEBUG,
                    "%s; %s: SEG.TSval %u < TS.Recent %u, "
                    "segment dropped\n", s, __func__,
                    to->to_tsval, sc->sc_tsreflect);
                free(s, M_TCPLOG);
            }
            return (-1); /* Do not send RST */
        }

        /*
         * If timestamps were not negotiated during SYN/ACK and a
         * segment with a timestamp is received, ignore the
         * timestamp and process the packet normally.
         * See section 3.2 of RFC 7323.
         */
        if (!(sc->sc_flags & SCF_TIMESTAMP) &&
            (to->to_flags & TOF_TS)) {
            if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
                log(LOG_DEBUG, "%s; %s: Timestamp not "
                    "expected, segment processed normally\n",
                    s, __func__);
                free(s, M_TCPLOG);
            }
        }

        /*
         * If timestamps were negotiated during SYN/ACK and a
         * segment without a timestamp is received, silently drop
         * the segment, unless the missing timestamps are tolerated.
         * See section 3.2 of RFC 7323.
         */
        if ((sc->sc_flags & SCF_TIMESTAMP) &&
            !(to->to_flags & TOF_TS)) {
            if (V_tcp_tolerate_missing_ts) {
                if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
                    log(LOG_DEBUG,
                        "%s; %s: Timestamp missing, "
                        "segment processed normally\n",
                        s, __func__);
                    free(s, M_TCPLOG);
                }
            } else {
                SCH_UNLOCK(sch);
                if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
                    log(LOG_DEBUG,
                        "%s; %s: Timestamp missing, "
                        "segment silently dropped\n",
                        s, __func__);
                    free(s, M_TCPLOG);
                }
                return (-1); /* Do not send RST */
            }
        }

        /* Pull out the entry to unlock the bucket row. */
        TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash);
        sch->sch_length--;
#ifdef TCP_OFFLOAD
        if (ADDED_BY_TOE(sc)) {
            struct toedev *tod = sc->sc_tod;

            tod->tod_syncache_removed(tod, sc->sc_todctx);
        }
#endif
        SCH_UNLOCK(sch);
    }

    /*
     * Segment validation:
     * ACK must match our initial sequence number + 1 (the SYN|ACK).
     */
    if (th->th_ack != sc->sc_iss + 1) {
        if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
            log(LOG_DEBUG, "%s; %s: ACK %u != ISS+1 %u, segment "
                "rejected\n", s, __func__, th->th_ack, sc->sc_iss);
        goto failed;
    }

    /*
     * The SEQ must fall in the window starting at the received
     * initial receive sequence number + 1 (the SYN).
     */
    if (SEQ_LEQ(th->th_seq, sc->sc_irs) ||
        SEQ_GT(th->th_seq, sc->sc_irs + sc->sc_wnd)) {
        if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
            log(LOG_DEBUG, "%s; %s: SEQ %u != IRS+1 %u, segment "
                "rejected\n", s, __func__, th->th_seq, sc->sc_irs);
        goto failed;
    }

    *lsop = syncache_socket(sc, *lsop, m);

    if (__predict_false(*lsop == NULL)) {
        TCPSTAT_INC(tcps_sc_aborted);
        TCPSTATES_DEC(TCPS_SYN_RECEIVED);
    } else
        TCPSTAT_INC(tcps_sc_completed);

/* how do we find the inp for the new socket? */
    if (sc != &scs)
        syncache_free(sc);
    return (1);
failed:
    if (sc != NULL) {
        TCPSTATES_DEC(TCPS_SYN_RECEIVED);
        if (sc != &scs)
            syncache_free(sc);
    }
    return (0);
}
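/*
 * Expand a syncache entry for a SYN carrying a valid TCP Fast Open cookie:
 * the socket is created immediately, and snd_nxt/snd_max are rewound to
 * the ISS because the SYN|ACK still has to be sent.
 */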
static struct socket *
syncache_tfo_expand(struct syncache *sc, struct socket *lso, struct mbuf *m,
    uint64_t response_cookie)
{
    struct inpcb *inp;
    struct tcpcb *tp;
    unsigned int *pending_counter;
    struct socket *so;

    NET_EPOCH_ASSERT();

    pending_counter = intotcpcb(sotoinpcb(lso))->t_tfo_pending;
    so = syncache_socket(sc, lso, m);
    if (so == NULL) {
        TCPSTAT_INC(tcps_sc_aborted);
        atomic_subtract_int(pending_counter, 1);
    } else {
        inp = sotoinpcb(so);
        tp = intotcpcb(inp);
        tp->t_flags |= TF_FASTOPEN;
        tp->t_tfo_cookie.server = response_cookie;
        tp->snd_max = tp->iss;
        tp->snd_nxt = tp->iss;
        tp->t_tfo_pending = pending_counter;
        TCPSTATES_INC(TCPS_SYN_RECEIVED);
        TCPSTAT_INC(tcps_sc_completed);
    }

    return (so);
}
/*
 * Given a LISTEN socket and an inbound SYN request, add
 * this to the syn cache, and send back a segment:
 *	<SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
 * to the source.
 *
 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
 * Doing so would require that we hold onto the data and deliver it
 * to the application.  However, if we are the target of a SYN-flood
 * DoS attack, an attacker could send data which would eventually
 * consume all available buffer space if it were ACKed.  By not ACKing
 * the data, we avoid this DoS scenario.
 *
 * The exception to the above is when a SYN with a valid TCP Fast Open (TFO)
 * cookie is processed and a new socket is created.  In this case, any data
 * accompanying the SYN will be queued to the socket by tcp_input() and will
 * be ACKed either when the application sends response data or the delayed
 * ACK timer expires, whichever comes first.
 */
struct socket *
syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
    struct inpcb *inp, struct socket *so, struct mbuf *m, void *tod,
    void *todctx, uint8_t iptos, uint16_t port)
{
    struct tcpcb *tp;
    struct socket *rv = NULL;
    struct syncache *sc = NULL;
    struct syncache_head *sch;
    struct mbuf *ipopts = NULL;
    u_int ltflags;
    int win, ip_ttl, ip_tos;
    char *s;
#ifdef INET6
    int autoflowlabel = 0;
#endif
#ifdef MAC
    struct label *maclabel = NULL;
#endif
    struct syncache scs;
    uint64_t tfo_response_cookie;
    unsigned int *tfo_pending = NULL;
    int tfo_cookie_valid = 0;
    int tfo_response_cookie_valid = 0;
    bool locked;

    INP_RLOCK_ASSERT(inp);			/* listen socket */
    KASSERT((tcp_get_flags(th) & (TH_RST|TH_ACK|TH_SYN)) == TH_SYN,
        ("%s: unexpected tcp flags", __func__));

    /*
     * Combine all so/tp operations very early to drop the INP lock as
     * soon as possible.
     */
    KASSERT(SOLISTENING(so), ("%s: %p not listening", __func__, so));
    tp = sototcpcb(so);

#ifdef INET6
    if (inc->inc_flags & INC_ISIPV6) {
        if (inp->inp_flags & IN6P_AUTOFLOWLABEL) {
            autoflowlabel = 1;
        }
        ip_ttl = in6_selecthlim(inp, NULL);
        if ((inp->in6p_outputopts == NULL) ||
            (inp->in6p_outputopts->ip6po_tclass == -1)) {
            ip_tos = 0;
        } else {
            ip_tos = inp->in6p_outputopts->ip6po_tclass;
        }
    }
#endif
#if defined(INET6) && defined(INET)
    else
#endif
#ifdef INET
    {
        ip_ttl = inp->inp_ip_ttl;
        ip_tos = inp->inp_ip_tos;
    }
#endif
    win = so->sol_sbrcv_hiwat;
    ltflags = (tp->t_flags & (TF_NOOPT | TF_SIGNATURE));

    if (V_tcp_fastopen_server_enable && (tp->t_flags & TF_FASTOPEN) &&
        (tp->t_tfo_pending != NULL) &&
        (to->to_flags & TOF_FASTOPEN)) {
        /*
         * Limit the number of pending TFO connections to
         * approximately half of the queue limit.  This prevents TFO
         * SYN floods from starving the service by filling the
         * listen queue with bogus TFO connections.
         */
        if (atomic_fetchadd_int(tp->t_tfo_pending, 1) <=
            (so->sol_qlimit / 2)) {
            int result;

            result = tcp_fastopen_check_cookie(inc,
                to->to_tfo_cookie, to->to_tfo_len,
                &tfo_response_cookie);
            tfo_cookie_valid = (result > 0);
            tfo_response_cookie_valid = (result >= 0);
        }

        /*
         * Remember the TFO pending counter as it will have to be
         * decremented below if we don't make it to syncache_tfo_expand().
         */
        tfo_pending = tp->t_tfo_pending;
    }

#ifdef MAC
    if (mac_syncache_init(&maclabel) != 0) {
        INP_RUNLOCK(inp);
        goto done;
    } else
        mac_syncache_create(maclabel, inp);
#endif
    if (!tfo_cookie_valid)
        INP_RUNLOCK(inp);

    /*
     * Remember the IP options, if any.
     */
#ifdef INET6
    if (!(inc->inc_flags & INC_ISIPV6))
#endif
#ifdef INET
        ipopts = (m) ? ip_srcroute(m) : NULL;
#else
        ipopts = NULL;
#endif

#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
    /*
     * When the socket is TCP-MD5 enabled check that,
     *  - a signed packet is valid
     *  - a non-signed packet does not have a security association
     *
     * If a signed packet fails validation or a non-signed packet has a
     * security association, the packet will be dropped.
     */
    if (ltflags & TF_SIGNATURE) {
        if (to->to_flags & TOF_SIGNATURE) {
            if (!TCPMD5_ENABLED() ||
                TCPMD5_INPUT(m, th, to->to_signature) != 0)
                goto done;
        } else {
            if (TCPMD5_ENABLED() &&
                TCPMD5_INPUT(m, NULL, NULL) != ENOENT)
                goto done;
        }
    } else if (to->to_flags & TOF_SIGNATURE)
        goto done;
#endif /* TCP_SIGNATURE */
    /*
     * See if we already have an entry for this connection.
     * If we do, resend the SYN,ACK, and reset the retransmit timer.
     *
     * XXX: should the syncache be re-initialized with the contents
     * of the new SYN here (which may have different options?)
     *
     * XXX: We do not check the sequence number to see if this is a
     * real retransmit or a new connection attempt.  The question is
     * how to handle such a case; either ignore it as spoofed, or
     * drop the current entry and create a new one?
     */
    if (syncache_cookiesonly()) {
        sc = NULL;
        sch = syncache_hashbucket(inc);
        locked = false;
    } else {
        sc = syncache_lookup(inc, &sch);	/* returns locked sch */
        locked = true;
        SCH_LOCK_ASSERT(sch);
    }
    if (sc != NULL) {
        if (tfo_cookie_valid)
            INP_RUNLOCK(inp);
        TCPSTAT_INC(tcps_sc_dupsyn);
        if (ipopts) {
            /*
             * If we were remembering a previous source route,
             * forget it and use the new one we've been given.
             */
            if (sc->sc_ipopts)
                (void)m_free(sc->sc_ipopts);
            sc->sc_ipopts = ipopts;
        }
        /*
         * Update timestamp if present.
         */
        if ((sc->sc_flags & SCF_TIMESTAMP) && (to->to_flags & TOF_TS))
            sc->sc_tsreflect = to->to_tsval;
        else
            sc->sc_flags &= ~SCF_TIMESTAMP;
        /*
         * Adjust ECN response if needed, e.g. different
         * IP ECN field, or a fallback by the remote host.
         */
        if (sc->sc_flags & SCF_ECN_MASK) {
            sc->sc_flags &= ~SCF_ECN_MASK;
            sc->sc_flags |= tcp_ecn_syncache_add(tcp_get_flags(th), iptos);
        }
#ifdef MAC
        /*
         * Since we have already unconditionally allocated label
         * storage, free it up.  The syncache entry will already
         * have an initialized label we can use.
         */
        mac_syncache_destroy(&maclabel);
#endif
        TCP_PROBE5(receive, NULL, NULL, m, NULL, th);
        /* Retransmit SYN|ACK and reset retransmit count. */
        if ((s = tcp_log_addrs(&sc->sc_inc, th, NULL, NULL))) {
            log(LOG_DEBUG, "%s; %s: Received duplicate SYN, "
                "resetting timer and retransmitting SYN|ACK\n",
                s, __func__);
            free(s, M_TCPLOG);
        }
        if (syncache_respond(sc, m, TH_SYN|TH_ACK) == 0) {
            sc->sc_rxmits = 0;
            syncache_timeout(sc, sch, 1);
            TCPSTAT_INC(tcps_sndacks);
            TCPSTAT_INC(tcps_sndtotal);
        } else {
            syncache_drop(sc, sch);
            TCPSTAT_INC(tcps_sc_dropped);
        }
        SCH_UNLOCK(sch);
        goto donenoprobe;
    }

    KASSERT(sc == NULL, ("sc(%p) != NULL", sc));
    /*
     * Skip allocating a syncache entry if we are just going to discard
     * it later.
     */
    if (!locked || tfo_cookie_valid) {
        bzero(&scs, sizeof(scs));
        sc = &scs;
    } else {
        sc = uma_zalloc(V_tcp_syncache.zone, M_NOWAIT | M_ZERO);
        if (sc == NULL) {
            /*
             * The zone allocator couldn't provide more entries.
             * Treat this as if the cache was full; drop the oldest
             * entry and insert the new one.
             */
            TCPSTAT_INC(tcps_sc_zonefail);
            sc = TAILQ_LAST(&sch->sch_bucket, sch_head);
            if (sc != NULL) {
                sch->sch_last_overflow = time_uptime;
                syncache_drop(sc, sch);
                syncache_pause(inc);
            }
            sc = uma_zalloc(V_tcp_syncache.zone, M_NOWAIT | M_ZERO);
            if (sc == NULL) {
                if (V_tcp_syncookies) {
                    bzero(&scs, sizeof(scs));
                    sc = &scs;
                } else {
                    KASSERT(locked,
                        ("%s: bucket unexpectedly unlocked",
                        __func__));
                    SCH_UNLOCK(sch);
                    if (ipopts)
                        (void)m_free(ipopts);
                    goto done;
                }
            }
        }
    }

    KASSERT(sc != NULL, ("sc == NULL"));
    if (!tfo_cookie_valid && tfo_response_cookie_valid)
        sc->sc_tfo_cookie = &tfo_response_cookie;

    /*
     * Fill in the syncache values.
     */
#ifdef MAC
    sc->sc_label = maclabel;
#endif
    /*
     * sc_cred is only used in syncache_pcblist() to list TCP endpoints in
     * TCPS_SYN_RECEIVED state when V_tcp_syncache.see_other is false.
     * Therefore, store the credentials and take a reference count only
     * when needed:
     * - sc is allocated from the zone and not using the on stack instance.
     * - the sysctl variable net.inet.tcp.syncache.see_other is false.
     * The reference count is decremented when a zone allocated sc is
     * freed in syncache_free().
     */
    if (sc != &scs && !V_tcp_syncache.see_other)
        sc->sc_cred = crhold(so->so_cred);
    else
        sc->sc_cred = NULL;
    sc->sc_port = port;
    sc->sc_ipopts = ipopts;
    bcopy(inc, &sc->sc_inc, sizeof(struct in_conninfo));
    sc->sc_ip_tos = ip_tos;
    sc->sc_ip_ttl = ip_ttl;
#ifdef TCP_OFFLOAD
    sc->sc_tod = tod;
    sc->sc_todctx = todctx;
#endif
    sc->sc_irs = th->th_seq;
    sc->sc_flags = 0;
    sc->sc_flowlabel = 0;

    /*
     * Initial receive window: clip sbspace to [0 .. TCP_MAXWIN].
     * win was derived from socket earlier in the function.
     */
    win = imax(win, 0);
    win = imin(win, TCP_MAXWIN);
    sc->sc_wnd = win;

    if (V_tcp_do_rfc1323 &&
        !(ltflags & TF_NOOPT)) {
        /*
         * A timestamp received in a SYN makes
         * it ok to send timestamp requests and replies.
         */
        if ((to->to_flags & TOF_TS) && (V_tcp_do_rfc1323 != 2)) {
            sc->sc_tsreflect = to->to_tsval;
            sc->sc_flags |= SCF_TIMESTAMP;
            sc->sc_tsoff = tcp_new_ts_offset(inc);
        }
        if ((to->to_flags & TOF_SCALE) && (V_tcp_do_rfc1323 != 3)) {
            int wscale = 0;

            /*
             * Pick the smallest possible scaling factor that
             * will still allow us to scale up to sb_max, aka
             * kern.ipc.maxsockbuf.
             *
             * We do this because there are broken firewalls that
             * will corrupt the window scale option, leading to
             * the other endpoint believing that our advertised
             * window is unscaled.  At scale factors larger than
             * 5 the unscaled window will drop below 1500 bytes,
             * leading to serious problems when traversing these
             * broken firewalls.
             *
             * With the default maxsockbuf of 256K, a scale factor
             * of 3 will be chosen by this algorithm.  Those who
             * choose a larger maxsockbuf should watch out
             * for the compatibility problems mentioned above.
             *
             * RFC1323: The Window field in a SYN (i.e., a <SYN>
             * or <SYN,ACK>) segment itself is never scaled.
             */
            while (wscale < TCP_MAX_WINSHIFT &&
                (TCP_MAXWIN << wscale) < sb_max)
                wscale++;
            sc->sc_requested_r_scale = wscale;
            sc->sc_requested_s_scale = to->to_wscale;
            sc->sc_flags |= SCF_WINSCALE;
        }
    }
#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
    /*
     * If incoming packet has an MD5 signature, flag this in the
     * syncache so that syncache_respond() will do the right thing
     * with the SYN+ACK.
     */
    if (to->to_flags & TOF_SIGNATURE)
        sc->sc_flags |= SCF_SIGNATURE;
#endif /* TCP_SIGNATURE */
    if (to->to_flags & TOF_SACKPERM)
        sc->sc_flags |= SCF_SACK;
    if (to->to_flags & TOF_MSS)
        sc->sc_peer_mss = to->to_mss;	/* peer mss may be zero */
    if (ltflags & TF_NOOPT)
        sc->sc_flags |= SCF_NOOPT;

    if (V_tcp_do_ecn && (tp->t_flags2 & TF2_CANNOT_DO_ECN) == 0)
        sc->sc_flags |= tcp_ecn_syncache_add(tcp_get_flags(th), iptos);

    if (V_tcp_syncookies)
        sc->sc_iss = syncookie_generate(sch, sc);
    else
        sc->sc_iss = arc4random();
#ifdef INET6
    if (autoflowlabel) {
        if (V_tcp_syncookies)
            sc->sc_flowlabel = sc->sc_iss;
        else
            sc->sc_flowlabel = ip6_randomflowlabel();
        sc->sc_flowlabel = htonl(sc->sc_flowlabel) & IPV6_FLOWLABEL_MASK;
    }
#endif
    if (locked)
        SCH_UNLOCK(sch);

    if (tfo_cookie_valid) {
        rv = syncache_tfo_expand(sc, so, m, tfo_response_cookie);
        /* INP_RUNLOCK(inp) will be performed by the caller */
        goto tfo_expanded;
    }

    TCP_PROBE5(receive, NULL, NULL, m, NULL, th);
    /*
     * Do a standard 3-way handshake.
     */
    if (syncache_respond(sc, m, TH_SYN|TH_ACK) == 0) {
        if (sc != &scs)
            syncache_insert(sc, sch);	/* locks and unlocks sch */
        TCPSTAT_INC(tcps_sndacks);
        TCPSTAT_INC(tcps_sndtotal);
    } else {
        if (sc != &scs)
            syncache_free(sc);
        TCPSTAT_INC(tcps_sc_dropped);
    }
    goto donenoprobe;

done:
    TCP_PROBE5(receive, NULL, NULL, m, NULL, th);
donenoprobe:
    if (m)
        m_freem(m);
    /*
     * If tfo_pending is not NULL here, then a TFO SYN that did not
     * result in a new socket was processed and the associated pending
     * counter has not yet been decremented.  All such TFO processing paths
     * transit this point.
     */
    if (tfo_pending != NULL)
        tcp_fastopen_decrement_counter(tfo_pending);

tfo_expanded:
    if (sc == NULL || sc == &scs) {
#ifdef MAC
        mac_syncache_destroy(&maclabel);
#endif
        if (ipopts)
            (void)m_free(ipopts);
    }
    return (rv);
}
/*
 * Send SYN|ACK or ACK to the peer.  Either in response to a peer's segment,
 * i.e. m0 != NULL, or upon 3WHS ACK timeout, i.e. m0 == NULL.
 */
static int
syncache_respond(struct syncache *sc, const struct mbuf *m0, int flags)
{
    struct ip *ip = NULL;
    struct mbuf *m;
    struct tcphdr *th = NULL;
    struct udphdr *udp = NULL;
    int optlen, error = 0;	/* Make compiler happy */
    u_int16_t hlen, tlen, mssopt, ulen;
    struct tcpopt to;
#ifdef INET6
    struct ip6_hdr *ip6 = NULL;
#endif

    NET_EPOCH_ASSERT();

    hlen =
#ifdef INET6
        (sc->sc_inc.inc_flags & INC_ISIPV6) ? sizeof(struct ip6_hdr) :
#endif
        sizeof(struct ip);
    tlen = hlen + sizeof(struct tcphdr);
    if (sc->sc_port) {
        tlen += sizeof(struct udphdr);
    }
    /* Determine MSS we advertise to other end of connection. */
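    /*
     * For UDP-encapsulated connections the advertised MSS is reduced by
     * the tunneling overhead so that encapsulated segments still fit
     * into the path MTU.
     */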
    mssopt = tcp_mssopt(&sc->sc_inc);
    if (sc->sc_port)
        mssopt -= V_tcp_udp_tunneling_overhead;
    mssopt = max(mssopt, V_tcp_minmss);

    /* XXX: Assume that the entire packet will fit in a header mbuf. */
    KASSERT(max_linkhdr + tlen + TCP_MAXOLEN <= MHLEN,
        ("syncache: mbuf too small: hlen %u, sc_port %u, max_linkhdr %d + "
        "tlen %d + TCP_MAXOLEN %ju <= MHLEN %d", hlen, sc->sc_port,
        max_linkhdr, tlen, (uintmax_t)TCP_MAXOLEN, MHLEN));

    /* Create the IP+TCP header from scratch. */
    m = m_gethdr(M_NOWAIT, MT_DATA);
    if (m == NULL)
        return (ENOBUFS);
#ifdef MAC
    mac_syncache_create_mbuf(sc->sc_label, m);
#endif
    m->m_data += max_linkhdr;
    m->m_len = tlen;
    m->m_pkthdr.len = tlen;
    m->m_pkthdr.rcvif = NULL;

#ifdef INET6
    if (sc->sc_inc.inc_flags & INC_ISIPV6) {
        ip6 = mtod(m, struct ip6_hdr *);
        ip6->ip6_vfc = IPV6_VERSION;
        ip6->ip6_src = sc->sc_inc.inc6_laddr;
        ip6->ip6_dst = sc->sc_inc.inc6_faddr;
        ip6->ip6_plen = htons(tlen - hlen);
        /* ip6_hlim is set after checksum */
        /* Zero out traffic class and flow label. */
        ip6->ip6_flow &= ~IPV6_FLOWINFO_MASK;
        ip6->ip6_flow |= sc->sc_flowlabel;
        if (sc->sc_port != 0) {
            ip6->ip6_nxt = IPPROTO_UDP;
            udp = (struct udphdr *)(ip6 + 1);
            udp->uh_sport = htons(V_tcp_udp_tunneling_port);
            udp->uh_dport = sc->sc_port;
            ulen = (tlen - sizeof(struct ip6_hdr));
            th = (struct tcphdr *)(udp + 1);
        } else {
            ip6->ip6_nxt = IPPROTO_TCP;
            th = (struct tcphdr *)(ip6 + 1);
        }
        ip6->ip6_flow |= htonl(sc->sc_ip_tos << IPV6_FLOWLABEL_LEN);
    }
#endif
#if defined(INET6) && defined(INET)
    else
#endif
#ifdef INET
    {
        ip = mtod(m, struct ip *);
        ip->ip_v = IPVERSION;
        ip->ip_hl = sizeof(struct ip) >> 2;
        ip->ip_len = htons(tlen);
        ip->ip_id = 0;
        ip->ip_off = 0;
        ip->ip_sum = 0;
        ip->ip_src = sc->sc_inc.inc_laddr;
        ip->ip_dst = sc->sc_inc.inc_faddr;
        ip->ip_ttl = sc->sc_ip_ttl;
        ip->ip_tos = sc->sc_ip_tos;

        /*
         * See if we should do MTU discovery.  Route lookups are
         * expensive, so we will only unset the DF bit if:
         *
         *	1) path_mtu_discovery is disabled
         *	2) the SCF_UNREACH flag has been set
         */
        if (V_path_mtu_discovery && ((sc->sc_flags & SCF_UNREACH) == 0))
            ip->ip_off |= htons(IP_DF);
        if (sc->sc_port == 0) {
            ip->ip_p = IPPROTO_TCP;
            th = (struct tcphdr *)(ip + 1);
        } else {
            ip->ip_p = IPPROTO_UDP;
            udp = (struct udphdr *)(ip + 1);
            udp->uh_sport = htons(V_tcp_udp_tunneling_port);
            udp->uh_dport = sc->sc_port;
            ulen = (tlen - sizeof(struct ip));
            th = (struct tcphdr *)(udp + 1);
        }
    }
#endif /* INET */
    th->th_sport = sc->sc_inc.inc_lport;
    th->th_dport = sc->sc_inc.inc_fport;
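    /*
     * A SYN|ACK consumes one sequence number and therefore carries the
     * ISS itself; a bare (challenge) ACK is sent with ISS + 1 because
     * the SYN has already been accounted for.
     */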
    if (flags & TH_SYN)
        th->th_seq = htonl(sc->sc_iss);
    else
        th->th_seq = htonl(sc->sc_iss + 1);
    th->th_ack = htonl(sc->sc_irs + 1);
    th->th_off = sizeof(struct tcphdr) >> 2;
    th->th_win = htons(sc->sc_wnd);
    th->th_urp = 0;

    flags = tcp_ecn_syncache_respond(flags, sc);
    tcp_set_flags(th, flags);

    /* Tack on the TCP options. */
    if ((sc->sc_flags & SCF_NOOPT) == 0) {
        to.to_flags = 0;

        if (flags & TH_SYN) {
            to.to_mss = mssopt;
            to.to_flags = TOF_MSS;
            if (sc->sc_flags & SCF_WINSCALE) {
                to.to_wscale = sc->sc_requested_r_scale;
                to.to_flags |= TOF_SCALE;
            }
            if (sc->sc_flags & SCF_SACK)
                to.to_flags |= TOF_SACKPERM;
#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
            if (sc->sc_flags & SCF_SIGNATURE)
                to.to_flags |= TOF_SIGNATURE;
#endif
            if (sc->sc_tfo_cookie) {
                to.to_flags |= TOF_FASTOPEN;
                to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN;
                to.to_tfo_cookie = sc->sc_tfo_cookie;
                /* don't send cookie again when retransmitting response */
                sc->sc_tfo_cookie = NULL;
            }
        }
        if (sc->sc_flags & SCF_TIMESTAMP) {
            to.to_tsval = sc->sc_tsoff + tcp_ts_getticks();
            to.to_tsecr = sc->sc_tsreflect;
            to.to_flags |= TOF_TS;
        }
        optlen = tcp_addoptions(&to, (u_char *)(th + 1));

        /* Adjust headers by option size. */
        th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
        m->m_len += optlen;
        m->m_pkthdr.len += optlen;
#ifdef INET6
        if (sc->sc_inc.inc_flags & INC_ISIPV6)
            ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) + optlen);
        else
#endif
            ip->ip_len = htons(ntohs(ip->ip_len) + optlen);
#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
        if (sc->sc_flags & SCF_SIGNATURE) {
            KASSERT(to.to_flags & TOF_SIGNATURE,
                ("tcp_addoptions() didn't set tcp_signature"));

            /* NOTE: to.to_signature is inside of mbuf */
            if (!TCPMD5_ENABLED() ||
                TCPMD5_OUTPUT(m, th, to.to_signature) != 0) {
                m_freem(m);
                return (EACCES);
            }
        }
#endif
    } else
        optlen = 0;

    if (udp) {
        ulen += optlen;
        udp->uh_ulen = htons(ulen);
    }
    M_SETFIB(m, sc->sc_inc.inc_fibnum);
    /*
     * If we have peer's SYN and it has a flowid, then let's assign it to
     * our SYN|ACK.  ip6_output() and ip_output() will not assign flowid
     * to SYN|ACK due to lack of inp here.
     */
    if (m0 != NULL && M_HASHTYPE_GET(m0) != M_HASHTYPE_NONE) {
        m->m_pkthdr.flowid = m0->m_pkthdr.flowid;
        M_HASHTYPE_SET(m, M_HASHTYPE_GET(m0));
    }
#ifdef INET6
    if (sc->sc_inc.inc_flags & INC_ISIPV6) {
        if (sc->sc_port) {
            m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
            m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
            udp->uh_sum = in6_cksum_pseudo(ip6, ulen,
                IPPROTO_UDP, 0);
            th->th_sum = htons(0);
        } else {
            m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
            m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
            th->th_sum = in6_cksum_pseudo(ip6, tlen + optlen - hlen,
                IPPROTO_TCP, 0);
        }
        ip6->ip6_hlim = sc->sc_ip_ttl;
#ifdef TCP_OFFLOAD
        if (ADDED_BY_TOE(sc)) {
            struct toedev *tod = sc->sc_tod;

            error = tod->tod_syncache_respond(tod, sc->sc_todctx, m);

            return (error);
        }
#endif
        TCP_PROBE5(send, NULL, NULL, ip6, NULL, th);
        error = ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL);
    }
#endif
#if defined(INET6) && defined(INET)
    else
#endif
#ifdef INET
    {
        if (sc->sc_port) {
            m->m_pkthdr.csum_flags = CSUM_UDP;
            m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
            udp->uh_sum = in_pseudo(ip->ip_src.s_addr,
                ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
            th->th_sum = htons(0);
        } else {
            m->m_pkthdr.csum_flags = CSUM_TCP;
            m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
            th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
                htons(tlen + optlen - hlen + IPPROTO_TCP));
        }
#ifdef TCP_OFFLOAD
        if (ADDED_BY_TOE(sc)) {
            struct toedev *tod = sc->sc_tod;

            error = tod->tod_syncache_respond(tod, sc->sc_todctx, m);

            return (error);
        }
#endif
        TCP_PROBE5(send, NULL, NULL, ip, NULL, th);
        error = ip_output(m, sc->sc_ipopts, NULL, 0, NULL, NULL);
    }
#endif
    return (error);
}
/*
 * The purpose of syncookies is to handle spoofed SYN flooding DoS attacks
 * that exceed the capacity of the syncache by avoiding the storage of any
 * of the SYNs we receive.  Syncookies defend against blind SYN flooding
 * attacks where the attacker does not have access to our responses.
 *
 * Syncookies encode and include all necessary information about the
 * connection setup within the SYN|ACK that we send back.  That way we
 * can avoid keeping any local state until the ACK to our SYN|ACK returns
 * (if ever).  Normally the syncache and syncookies are running in parallel
 * with the latter taking over when the former is exhausted.  When a matching
 * syncache entry is found the syncookie is ignored.
 *
 * The only reliable information persisting through the 3WHS is our 32-bit
 * initial sequence number ISS.  Syncookies embed a cryptographically
 * strong hash (MAC) value and a few bits of TCP SYN options in the ISS
 * of our SYN|ACK.  The MAC can be recomputed when the ACK to our SYN|ACK
 * returns and signifies a legitimate connection if it matches the ACK.
 *
 * The available space of 32 bits to store the hash and to encode the SYN
 * option information is very tight and we should have at least 24 bits for
 * the MAC to keep the number of guesses by blind spoofing reasonably high.
 *
 * SYN option information we have to encode to fully restore a connection:
 * MSS: is important to choose an optimal segment size to avoid IP level
 *   fragmentation along the path.  The common MSS values can be encoded
 *   in a 3-bit table.  Uncommon values are captured by the next lower value
 *   in the table leading to a slight increase in packetization overhead.
 * WSCALE: is necessary to allow large windows to be used for high delay-
 *   bandwidth product links.  Not scaling the window when it was initially
 *   negotiated is bad for performance as lack of scaling further decreases
 *   the apparent available send window.  We only need to encode the WSCALE
 *   we received from the remote end.  Our end can be recalculated at any
 *   time.  The common WSCALE values can be encoded in a 3-bit table.
 *   Uncommon values are captured by the next lower value in the table
 *   making us under-estimate the available window size, halving our
 *   theoretically possible maximum throughput for that connection.
 * SACK: Greatly assists in packet loss recovery and requires 1 bit.
 * TIMESTAMP and SIGNATURE are not encoded because they are permanent options
 *   that are included in all segments on a connection.  We enable them when
 *   the ACK has them.
 *
 * Security of syncookies and attack vectors:
 *
 * The MAC is computed over (faddr||laddr||fport||lport||irs||flags||secmod)
 * together with the global secret to make it unique per connection attempt.
 * Thus any change of any of those parameters results in a different MAC output
 * in an unpredictable way unless a collision is encountered.  24 bits of the
 * MAC are embedded into the ISS.
 *
 * To prevent replay attacks two rotating global secrets are updated with a
 * new random value every 15 seconds.  The life-time of a syncookie is thus
 * 15-30 seconds.
 *
 * Vector 1: Attacking the secret.  This requires finding a weakness in the
 * MAC itself or the way it is used here.  The attacker can do a chosen plain
 * text attack by varying and testing all the parameters under his control.
 * The strength depends on the size and randomness of the secret, and the
 * cryptographic security of the MAC function.  Due to the constant updating
 * of the secret the attacker has at most 29.999 seconds to find the secret
 * and launch spoofed connections.  After that he has to start all over again.
 *
 * Vector 2: Collision attack on the MAC of a single ACK.  With a 24 bit MAC
 * size an average of 4,823 attempts are required for a 50% chance of success
 * to spoof a single syncookie (birthday collision paradox).  However the
 * attacker is blind and doesn't know if one of his attempts succeeded unless
 * he has a side channel to infer success from.  A single connection setup
 * success average of 90% requires 8,790 packets, 99.99% requires 17,578
 * packets.  This many attempts are required for each blind spoofed
 * connection.  For every additional spoofed connection he has to launch
 * another N attempts.  Thus for a sustained rate of 100 spoofed connections
 * per second approximately 1,800,000 packets per second would have to be
 * sent.
 *
 * NB: The MAC function should be fast so that it doesn't become a CPU
 * exhaustion attack vector itself.
 *
 * References:
 *  RFC 4987, "TCP SYN Flooding Attacks and Common Mitigations"
 *  SYN cookies were first proposed by cryptographer Dan J. Bernstein in 1996
 *   http://cr.yp.to/syncookies.html    (overview)
 *   http://cr.yp.to/syncookies/archive (details)
 *
 * Schematic construction of a syncookie enabled Initial Sequence Number:
 *  0        1         2         3
 *  12345678901234567890123456789012
 * |xxxxxxxxxxxxxxxxxxxxxxxxWWWMMMSP|
 *
 *  x 24 MAC (truncated)
 *  W  3 Send Window Scale index
 *  M  3 MSS index
 *  S  1 SACK permitted
 *  P  1 Odd/even secret
 */
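
/*
 * To make the layout above concrete, the 8 low "cookie" bits are accessed
 * through a bitfield union equivalent to the sketch below.  This mirrors
 * the real "union syncookie" declared in tcp_syncache.h; the exact bitfield
 * order shown here is an assumption for illustration (bitfield layout is
 * compiler-dependent) and the authoritative definition is in the header.
 * (Illustrative only, not compiled.)
 */
#if 0
#include <stdint.h>

union syncookie_demo {
	uint8_t cookie;
	struct {
		uint8_t odd_even:1,	/* P: odd or even secret */
			sack_ok:1,	/* S: SACK permitted */
			wscale_idx:3,	/* W: send window scale index */
			mss_idx:3;	/* M: MSS index */
	} flags;
};
#endif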
/*
 * Distribution and probability of certain MSS values.  Those in between are
 * rounded down to the next lower one.
 * [An Analysis of TCP Maximum Segment Sizes, S. Alcock and R. Nelson, 2011]
 *                            .2%  .3%   5%    7%    7%    20%   15%   45%
 */
static int tcp_sc_msstab[] = { 216, 536, 1200, 1360, 1400, 1440, 1452, 1460 };
/*
 * Distribution and probability of certain WSCALE values.  We have to map the
 * (send) window scale (shift) option with a range of 0-14 from 4 bits into 3
 * bits based on prevalence of certain values.  Values for which we don't
 * have an exact match are rounded down to the next lower one, letting us
 * under-estimate the true available window.  At the moment this would happen
 * only for the very uncommon values 3, 5 and those above 8 (more than 16MB
 * socket buffer and window size).  The absence of the WSCALE option (no
 * scaling in either direction) is encoded with index zero.
 * [WSCALE values histograms, Allman, 2012]
 *                            X 10 10 35  5  6 14 10%   by host
 *                            X 11  4  5  5 18 49  3%   by connections
 */
static int tcp_sc_wstab[] = { 0, 0, 1, 2, 4, 6, 7, 8 };
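
/*
 * Both tables are searched the same way: start at the highest index and
 * walk down until the table entry no longer exceeds the observed value,
 * which implements the "round down to the next lower value" rule described
 * above.  A minimal userland sketch (illustrative only, not compiled):
 */
#if 0
static int demo_msstab[] = { 216, 536, 1200, 1360, 1400, 1440, 1452, 1460 };

/* E.g. an MSS of 1380 maps to index 3 (1360); 100 maps to index 0 (216). */
static unsigned int
mss_to_idx(int mss)
{
	unsigned int i;

	for (i = sizeof(demo_msstab) / sizeof(demo_msstab[0]) - 1;
	     demo_msstab[i] > mss && i > 0; i--)
		;
	return (i);
}
#endif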
/*
 * Compute the MAC for the SYN cookie.  SIPHASH-2-4 is chosen for its speed
 * and good cryptographic properties.
 */
static uint32_t
syncookie_mac(struct in_conninfo *inc, tcp_seq irs, uint8_t flags,
    uint8_t *secbits, uintptr_t secmod)
{
	SIPHASH_CTX ctx;
	uint32_t siphash[2];

	SipHash24_Init(&ctx);
	SipHash_SetKey(&ctx, secbits);
	switch (inc->inc_flags & INC_ISIPV6) {
#ifdef INET
	case 0:
		SipHash_Update(&ctx, &inc->inc_faddr, sizeof(inc->inc_faddr));
		SipHash_Update(&ctx, &inc->inc_laddr, sizeof(inc->inc_laddr));
		break;
#endif
#ifdef INET6
	case INC_ISIPV6:
		SipHash_Update(&ctx, &inc->inc6_faddr,
		    sizeof(inc->inc6_faddr));
		SipHash_Update(&ctx, &inc->inc6_laddr,
		    sizeof(inc->inc6_laddr));
		break;
#endif
	}
	SipHash_Update(&ctx, &inc->inc_fport, sizeof(inc->inc_fport));
	SipHash_Update(&ctx, &inc->inc_lport, sizeof(inc->inc_lport));
	SipHash_Update(&ctx, &irs, sizeof(irs));
	SipHash_Update(&ctx, &flags, sizeof(flags));
	SipHash_Update(&ctx, &secmod, sizeof(secmod));
	SipHash_Final((u_int8_t *)&siphash, &ctx);

	return (siphash[0] ^ siphash[1]);
}
static tcp_seq
syncookie_generate(struct syncache_head *sch, struct syncache *sc)
{
	u_int i, secbit, wscale;
	uint32_t iss, hash;
	uint8_t *secbits;
	union syncookie cookie;

	cookie.cookie = 0;

	/* Map our computed MSS into the 3-bit index. */
	for (i = nitems(tcp_sc_msstab) - 1;
	     tcp_sc_msstab[i] > sc->sc_peer_mss && i > 0;
	     i--)
		;
	cookie.flags.mss_idx = i;

	/*
	 * Map the send window scale into the 3-bit index but only if
	 * the wscale option was received.
	 */
	if (sc->sc_flags & SCF_WINSCALE) {
		wscale = sc->sc_requested_s_scale;
		for (i = nitems(tcp_sc_wstab) - 1;
		     tcp_sc_wstab[i] > wscale && i > 0;
		     i--)
			;
		cookie.flags.wscale_idx = i;
	}

	/* Can we do SACK? */
	if (sc->sc_flags & SCF_SACK)
		cookie.flags.sack_ok = 1;

	/* Which of the two secrets to use. */
	secbit = V_tcp_syncache.secret.oddeven & 0x1;
	cookie.flags.odd_even = secbit;

	secbits = V_tcp_syncache.secret.key[secbit];
	hash = syncookie_mac(&sc->sc_inc, sc->sc_irs, cookie.cookie, secbits,
	    (uintptr_t)sch);

	/*
	 * Put the flags into the hash and XOR them to get better ISS number
	 * variance.  This doesn't enhance the cryptographic strength and is
	 * done to prevent the 8 cookie bits from showing up directly on the
	 * wire.
	 */
	iss = hash & ~0xff;
	iss |= cookie.cookie ^ (hash >> 24);

	TCPSTAT_INC(tcps_sc_sendcookie);
	return (iss);
}
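
/*
 * The XOR obfuscation above is trivially reversible on the receiving side:
 * because the top byte of the ISS is untouched MAC, the (hash >> 24) terms
 * cancel when syncookie_lookup() computes (ack & 0xff) ^ (ack >> 24).  A
 * minimal round-trip check (illustrative only, not compiled):
 */
#if 0
#include <assert.h>
#include <stdint.h>

static void
cookie_roundtrip(uint32_t hash, uint8_t cookie)
{
	uint32_t iss;

	iss = hash & ~0xff;		/* 24 MAC bits, low byte cleared */
	iss |= cookie ^ (hash >> 24);	/* obfuscated flag byte */

	/* Unpacking recovers the original 8 cookie bits. */
	assert((uint8_t)((iss & 0xff) ^ (iss >> 24)) == cookie);
}
#endif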
static struct syncache *
syncookie_lookup(struct in_conninfo *inc, struct syncache_head *sch,
    struct syncache *sc, struct tcphdr *th, struct tcpopt *to,
    struct socket *lso, uint16_t port)
{
	uint32_t hash;
	uint8_t *secbits;
	tcp_seq ack, seq;
	int wnd, wscale = 0;
	union syncookie cookie;

	/*
	 * Pull information out of SYN-ACK/ACK and revert the sequence number
	 * advances.
	 */
	ack = th->th_ack - 1;
	seq = th->th_seq - 1;

	/*
	 * Unpack the flags containing enough information to restore the
	 * connection.
	 */
	cookie.cookie = (ack & 0xff) ^ (ack >> 24);

	/* Which of the two secrets to use. */
	secbits = V_tcp_syncache.secret.key[cookie.flags.odd_even];

	hash = syncookie_mac(inc, seq, cookie.cookie, secbits, (uintptr_t)sch);

	/* The recomputed hash matches the ACK if this was a genuine cookie. */
	if ((ack & ~0xff) != (hash & ~0xff))
		return (NULL);

	/* Fill in the syncache values. */
	sc->sc_flags = 0;
	bcopy(inc, &sc->sc_inc, sizeof(struct in_conninfo));
	sc->sc_ipopts = NULL;

	sc->sc_irs = seq;
	sc->sc_iss = ack;

	switch (inc->inc_flags & INC_ISIPV6) {
#ifdef INET
	case 0:
		sc->sc_ip_ttl = sotoinpcb(lso)->inp_ip_ttl;
		sc->sc_ip_tos = sotoinpcb(lso)->inp_ip_tos;
		break;
#endif
#ifdef INET6
	case INC_ISIPV6:
		if (sotoinpcb(lso)->inp_flags & IN6P_AUTOFLOWLABEL)
			sc->sc_flowlabel =
			    htonl(sc->sc_iss) & IPV6_FLOWLABEL_MASK;
		break;
#endif
	}

	sc->sc_peer_mss = tcp_sc_msstab[cookie.flags.mss_idx];

	/* We can simply recompute the receive window scale we sent earlier. */
	while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < sb_max)
		wscale++;

	/* Only use wscale if it was enabled in the original SYN. */
	if (cookie.flags.wscale_idx > 0) {
		sc->sc_requested_r_scale = wscale;
		sc->sc_requested_s_scale =
		    tcp_sc_wstab[cookie.flags.wscale_idx];
		sc->sc_flags |= SCF_WINSCALE;
	}

	wnd = lso->sol_sbrcv_hiwat;
	wnd = imin(wnd, TCP_MAXWIN);
	sc->sc_wnd = wnd;

	if (cookie.flags.sack_ok)
		sc->sc_flags |= SCF_SACK;

	if (to->to_flags & TOF_TS) {
		sc->sc_flags |= SCF_TIMESTAMP;
		sc->sc_tsreflect = to->to_tsval;
		sc->sc_tsoff = tcp_new_ts_offset(inc);
	}

	if (to->to_flags & TOF_SIGNATURE)
		sc->sc_flags |= SCF_SIGNATURE;

	sc->sc_port = port;

	TCPSTAT_INC(tcps_sc_recvcookie);
	return (sc);
}
#ifdef INVARIANTS
static int
syncookie_cmp(struct in_conninfo *inc, struct syncache_head *sch,
    struct syncache *sc, struct tcphdr *th, struct tcpopt *to,
    struct socket *lso, uint16_t port)
{
	struct syncache scs, *scx;
	char *s;

	bzero(&scs, sizeof(scs));
	scx = syncookie_lookup(inc, sch, &scs, th, to, lso, port);

	if ((s = tcp_log_addrs(inc, th, NULL, NULL)) == NULL)
		return (0);

	if (scx != NULL) {
		if (sc->sc_peer_mss != scx->sc_peer_mss)
			log(LOG_DEBUG, "%s; %s: mss different %i vs %i\n",
			    s, __func__, sc->sc_peer_mss, scx->sc_peer_mss);

		if (sc->sc_requested_r_scale != scx->sc_requested_r_scale)
			log(LOG_DEBUG, "%s; %s: rwscale different %i vs %i\n",
			    s, __func__, sc->sc_requested_r_scale,
			    scx->sc_requested_r_scale);

		if (sc->sc_requested_s_scale != scx->sc_requested_s_scale)
			log(LOG_DEBUG, "%s; %s: swscale different %i vs %i\n",
			    s, __func__, sc->sc_requested_s_scale,
			    scx->sc_requested_s_scale);

		if ((sc->sc_flags & SCF_SACK) != (scx->sc_flags & SCF_SACK))
			log(LOG_DEBUG, "%s; %s: SACK different\n",
			    s, __func__);
	}

	free(s, M_TCPLOG);
	return (0);
}
#endif /* INVARIANTS */
static void
syncookie_reseed(void *arg)
{
	struct tcp_syncache *sc = arg;
	uint8_t *secbits;
	int secbit;

	/*
	 * Reseeding the secret doesn't have to be protected by a lock.
	 * It only must be ensured that the new random values are visible
	 * to all CPUs in an SMP environment.  The atomic with release
	 * semantics ensures that.
	 */
	secbit = (sc->secret.oddeven & 0x1) ? 0 : 1;
	secbits = sc->secret.key[secbit];
	arc4rand(secbits, SYNCOOKIE_SECRET_SIZE, 0);
	atomic_add_rel_int(&sc->secret.oddeven, 1);

	/* Reschedule ourselves. */
	callout_schedule(&sc->secret.reseed, SYNCOOKIE_LIFETIME * hz);
}
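
/*
 * The two-slot secret rotation above is what gives a cookie its 15-30
 * second lifetime: a cookie minted t seconds into a secret's generation
 * stays verifiable until that slot is overwritten one full interval after
 * the next reseed.  A sketch of the arithmetic, assuming SYNCOOKIE_LIFETIME
 * is the 15-second interval described in the comment block further above
 * (illustrative only, not compiled):
 */
#if 0
#define	DEMO_LIFETIME	15	/* seconds between reseeds, per the comment */

/*
 * A cookie issued t seconds (0 <= t < DEMO_LIFETIME) after its secret was
 * generated remains checkable for (2 * DEMO_LIFETIME - t) seconds, i.e.
 * between 15 and 30 seconds.
 */
static int
cookie_validity(int t)
{
	return (2 * DEMO_LIFETIME - t);
}
#endif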
/*
 * We have overflowed a bucket.  Let's pause dealing with the syncache.
 * This function will increment the bucketoverflow statistics appropriately
 * (once per pause when pausing is enabled; otherwise, once per overflow).
 */
static void
syncache_pause(struct in_conninfo *inc)
{
	time_t delta;
	const char *s;

	/*
	 * 2. Add sysctl read here so we don't get the benefit of this
	 *    change without the new sysctl.
	 */

	/*
	 * Try an unlocked read.  If we already know that another thread
	 * has activated the feature, there is no need to proceed.
	 */
	if (V_tcp_syncache.paused)
		return;

	/* Are cookies enabled?  If not, we can't pause. */
	if (!V_tcp_syncookies) {
		TCPSTAT_INC(tcps_sc_bucketoverflow);
		return;
	}

	/*
	 * We may be the first thread to find an overflow.  Get the lock
	 * and evaluate if we need to take action.
	 */
	mtx_lock(&V_tcp_syncache.pause_mtx);
	if (V_tcp_syncache.paused) {
		mtx_unlock(&V_tcp_syncache.pause_mtx);
		return;
	}

	/* Activate protection. */
	V_tcp_syncache.paused = true;
	TCPSTAT_INC(tcps_sc_bucketoverflow);

	/*
	 * Determine the last backoff time.  If we are seeing a renewed
	 * attack within that same time after last reactivating the syncache,
	 * consider it an extension of the same attack.
	 */
	delta = TCP_SYNCACHE_PAUSE_TIME << V_tcp_syncache.pause_backoff;
	if (V_tcp_syncache.pause_until + delta - time_uptime > 0) {
		if (V_tcp_syncache.pause_backoff < TCP_SYNCACHE_MAX_BACKOFF) {
			delta <<= 1;
			V_tcp_syncache.pause_backoff++;
		}
	} else {
		delta = TCP_SYNCACHE_PAUSE_TIME;
		V_tcp_syncache.pause_backoff = 0;
	}

	/* Log a warning, including IP addresses, if able. */
	if (inc != NULL)
		s = tcp_log_addrs(inc, NULL, NULL, NULL);
	else
		s = (const char *)NULL;
	log(LOG_WARNING, "TCP syncache overflow detected; using syncookies for "
	    "the next %lld seconds%s%s%s\n", (long long)delta,
	    (s != NULL) ? " (last SYN: " : "", (s != NULL) ? s : "",
	    (s != NULL) ? ")" : "");
	free(__DECONST(void *, s), M_TCPLOG);

	/* Use the calculated delta to set a new pause time. */
	V_tcp_syncache.pause_until = time_uptime + delta;
	callout_reset(&V_tcp_syncache.pause_co, delta * hz, syncache_unpause,
	    NULL);
	mtx_unlock(&V_tcp_syncache.pause_mtx);
}
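
/*
 * Worked example of the backoff logic above, assuming for illustration a
 * TCP_SYNCACHE_PAUSE_TIME of 60 seconds and a TCP_SYNCACHE_MAX_BACKOFF of 6
 * (the authoritative values live in tcp_syncache.h): each renewed attack
 * inside the previous window doubles the pause, 60 -> 120 -> 240 -> 480 ->
 * 960 -> 1920 -> 3840 seconds, then clamps; an attack arriving after the
 * window fully expires resets to 60 seconds.  (Illustrative only, not
 * compiled.)
 */
#if 0
#define	DEMO_PAUSE_TIME		60
#define	DEMO_MAX_BACKOFF	6

/* Next pause duration for a renewed attack; advances the backoff exponent. */
static long
next_pause(int *backoff)
{
	long delta = (long)DEMO_PAUSE_TIME << *backoff;

	if (*backoff < DEMO_MAX_BACKOFF) {
		delta <<= 1;
		(*backoff)++;
	}
	return (delta);
}
#endif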
/* Evaluate whether we need to unpause. */
static void
syncache_unpause(void *arg)
{
	struct tcp_syncache *sc;
	time_t delta;

	sc = arg;
	mtx_assert(&sc->pause_mtx, MA_OWNED | MA_NOTRECURSED);
	callout_deactivate(&sc->pause_co);

	/*
	 * Check to make sure we are not running early.  If the pause
	 * time has expired, then deactivate the protection.
	 */
	if ((delta = sc->pause_until - time_uptime) > 0)
		callout_schedule(&sc->pause_co, delta * hz);
	else
		sc->paused = false;
}
/*
 * Exports the syncache entries to userland so that netstat can display
 * them alongside the other sockets.  This function is intended to be
 * called only from tcp_pcblist.
 *
 * Due to concurrency on an active system, the number of pcbs exported
 * may have no relation to max_pcbs.  max_pcbs merely indicates the
 * amount of space the caller allocated for this function to use.
 */
int
syncache_pcblist(struct sysctl_req *req)
{
	struct xtcpcb xt;
	struct syncache *sc;
	struct syncache_head *sch;
	int error, i;

	bzero(&xt, sizeof(xt));
	xt.xt_len = sizeof(xt);
	xt.t_state = TCPS_SYN_RECEIVED;
	xt.xt_inp.xi_socket.xso_protocol = IPPROTO_TCP;
	xt.xt_inp.xi_socket.xso_len = sizeof(struct xsocket);
	xt.xt_inp.xi_socket.so_type = SOCK_STREAM;
	xt.xt_inp.xi_socket.so_state = SS_ISCONNECTING;

	for (i = 0; i < V_tcp_syncache.hashsize; i++) {
		sch = &V_tcp_syncache.hashbase[i];
		SCH_LOCK(sch);
		TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) {
			if (sc->sc_cred != NULL &&
			    cr_cansee(req->td->td_ucred, sc->sc_cred) != 0)
				continue;
			if (sc->sc_inc.inc_flags & INC_ISIPV6)
				xt.xt_inp.inp_vflag = INP_IPV6;
			else
				xt.xt_inp.inp_vflag = INP_IPV4;
			xt.xt_encaps_port = sc->sc_port;
			bcopy(&sc->sc_inc, &xt.xt_inp.inp_inc,
			    sizeof(struct in_conninfo));
			error = SYSCTL_OUT(req, &xt, sizeof(xt));