1 /* $NetBSD: tcp_input.c,v 1.298 2009/07/18 23:09:53 minskim Exp $ */
4 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 * 3. Neither the name of the project nor the names of its contributors
16 * may be used to endorse or promote products derived from this software
17 * without specific prior written permission.
19 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995
35 * NRL grants permission for redistribution and use in source and binary
36 * forms, with or without modification, of the software and documentation
37 * created at NRL provided that the following conditions are met:
39 * 1. Redistributions of source code must retain the above copyright
40 * notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright
42 * notice, this list of conditions and the following disclaimer in the
43 * documentation and/or other materials provided with the distribution.
44 * 3. All advertising materials mentioning features or use of this software
45 * must display the following acknowledgements:
46 * This product includes software developed by the University of
47 * California, Berkeley and its contributors.
48 * This product includes software developed at the Information
49 * Technology Division, US Naval Research Laboratory.
50 * 4. Neither the name of the NRL nor the names of its contributors
51 * may be used to endorse or promote products derived from this software
52 * without specific prior written permission.
54 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
55 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
56 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
57 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
58 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
59 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
60 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
61 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
62 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
63 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
64 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
66 * The views and conclusions contained in the software and documentation
67 * are those of the authors and should not be interpreted as representing
68 * official policies, either expressed or implied, of the US Naval
69 * Research Laboratory (NRL).
73 * Copyright (c) 1997, 1998, 1999, 2001, 2005, 2006 The NetBSD Foundation, Inc.
74 * All rights reserved.
76 * This code is derived from software contributed to The NetBSD Foundation
77 * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
78 * Facility, NASA Ames Research Center.
79 * This code is derived from software contributed to The NetBSD Foundation
80 * by Charles M. Hannum.
81 * This code is derived from software contributed to The NetBSD Foundation
84 * Redistribution and use in source and binary forms, with or without
85 * modification, are permitted provided that the following conditions
87 * 1. Redistributions of source code must retain the above copyright
88 * notice, this list of conditions and the following disclaimer.
89 * 2. Redistributions in binary form must reproduce the above copyright
90 * notice, this list of conditions and the following disclaimer in the
91 * documentation and/or other materials provided with the distribution.
93 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
94 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
95 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
96 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
97 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
98 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
99 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
100 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
101 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
102 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
103 * POSSIBILITY OF SUCH DAMAGE.
107 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
108 * The Regents of the University of California. All rights reserved.
110 * Redistribution and use in source and binary forms, with or without
111 * modification, are permitted provided that the following conditions
113 * 1. Redistributions of source code must retain the above copyright
114 * notice, this list of conditions and the following disclaimer.
115 * 2. Redistributions in binary form must reproduce the above copyright
116 * notice, this list of conditions and the following disclaimer in the
117 * documentation and/or other materials provided with the distribution.
118 * 3. Neither the name of the University nor the names of its contributors
119 * may be used to endorse or promote products derived from this software
120 * without specific prior written permission.
122 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
123 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
124 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
125 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
126 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
127 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
128 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
129 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
130 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
131 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
134 * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
138 * TODO list for SYN cache stuff:
140 * Find room for a "state" field, which is needed to keep a
141 * compressed state for TIME_WAIT TCBs. It's been noted already
142 * that this is fairly important for very high-volume web and
143 * mail servers, which use a large number of short-lived
147 #include <sys/cdefs.h>
148 __KERNEL_RCSID(0, "$NetBSD: tcp_input.c,v 1.298 2009/07/18 23:09:53 minskim Exp $");
150 #include "opt_inet.h"
151 #include "opt_ipsec.h"
152 #include "opt_inet_csum.h"
153 #include "opt_tcp_debug.h"
155 #include <sys/param.h>
156 #include <sys/systm.h>
157 #include <sys/malloc.h>
158 #include <sys/mbuf.h>
159 #include <sys/protosw.h>
160 #include <sys/socket.h>
161 #include <sys/socketvar.h>
162 #include <sys/errno.h>
163 #include <sys/syslog.h>
164 #include <sys/pool.h>
165 #include <sys/domain.h>
166 #include <sys/kernel.h>
170 #include <sys/lwp.h> /* for lwp0 */
173 #include <net/route.h>
174 #include <net/if_types.h>
176 #include <netinet/in.h>
177 #include <netinet/in_systm.h>
178 #include <netinet/ip.h>
179 #include <netinet/in_pcb.h>
180 #include <netinet/in_var.h>
181 #include <netinet/ip_var.h>
182 #include <netinet/in_offload.h>
186 #include <netinet/in.h>
188 #include <netinet/ip6.h>
189 #include <netinet6/ip6_var.h>
190 #include <netinet6/in6_pcb.h>
191 #include <netinet6/ip6_var.h>
192 #include <netinet6/in6_var.h>
193 #include <netinet/icmp6.h>
194 #include <netinet6/nd6.h>
196 #include <netinet6/scope6_var.h>
201 /* always need ip6.h for IP6_EXTHDR_GET */
202 #include <netinet/ip6.h>
205 #include <netinet/tcp.h>
206 #include <netinet/tcp_fsm.h>
207 #include <netinet/tcp_seq.h>
208 #include <netinet/tcp_timer.h>
209 #include <netinet/tcp_var.h>
210 #include <netinet/tcp_private.h>
211 #include <netinet/tcpip.h>
212 #include <netinet/tcp_congctl.h>
213 #include <netinet/tcp_debug.h>
215 #include <machine/stdarg.h>
218 #include <netinet6/ipsec.h>
219 #include <netinet6/ipsec_private.h>
220 #include <netkey/key.h>
224 #if defined(NFAITH) && NFAITH > 0
225 #include <net/if_faith.h>
230 #include <netipsec/ipsec.h>
231 #include <netipsec/ipsec_var.h>
232 #include <netipsec/ipsec_private.h>
233 #include <netipsec/key.h>
235 #include <netipsec/ipsec6.h>
237 #endif /* FAST_IPSEC*/
/*
 * File-scope state and its initial values.
 * The plain `int` objects have external linkage; the *_ppslim_* counters
 * and timestamps are `static` and therefore private to this file.
 * NOTE(review): the numbers at the start of these lines are line numbers
 * from the listing this text was extracted from, not part of the C source.
 */
239 int tcprexmtthresh
= 3;
242 int tcp_do_autorcvbuf
= 0;
/* 16 * 1024 == 16384 (presumably bytes -- confirm against users). */
243 int tcp_autorcvbuf_inc
= 16 * 1024;
/* 256 * 1024 == 262144 (presumably bytes -- confirm against users). */
244 int tcp_autorcvbuf_max
= 256 * 1024;
/* Initialised from the TCPTV_MSL constant divided by PR_SLOWHZ. */
245 int tcp_msl
= (TCPTV_MSL
/ PR_SLOWHZ
);
/* Counter/timestamp pairs; the timevals are left zero-initialised. */
247 static int tcp_rst_ppslim_count
= 0;
248 static struct timeval tcp_rst_ppslim_last
;
249 static int tcp_ackdrop_ppslim_count
= 0;
250 static struct timeval tcp_ackdrop_ppslim_last
;
/*
 * 24 * 24 * 60 * 60 is the number of seconds in 24 days; the product is
 * scaled by PR_SLOWHZ (presumably slow-timer ticks per second -- confirm).
 * The 24U operand makes the whole expression unsigned.
 */
252 #define TCP_PAWS_IDLE (24U * 24 * 60 * 60 * PR_SLOWHZ)
254 /* for modulo comparisons of timestamps */
/*
 * Both macros order a and b by the sign of (a)-(b) after casting the
 * difference to int: TSTMP_LT is true when it is negative, TSTMP_GEQ
 * when it is zero or positive.  Each argument is evaluated exactly once.
 */
255 #define TSTMP_LT(a,b) ((int)((a)-(b)) < 0)
256 #define TSTMP_GEQ(a,b) ((int)((a)-(b)) >= 0)
259 * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint.
/*
 * nd6_hint(tp): pass the connection's cached IPv6 route to nd6_nud_hint().
 *
 * The call is made only when all of the following hold:
 *   - tp is non-NULL,
 *   - tp->t_in6pcb is non-NULL,
 *   - tp->t_family is AF_INET6,
 *   - rtcache_validate() on tp->t_in6pcb->in6p_route returns non-NULL.
 * Otherwise nothing is done.
 *
 * NOTE(review): the return type, braces and the declaration of `rt` are
 * not present in this listing.
 */
263 nd6_hint(struct tcpcb
*tp
)
267 if (tp
!= NULL
&& tp
->t_in6pcb
!= NULL
&& tp
->t_family
== AF_INET6
&&
268 (rt
= rtcache_validate(&tp
->t_in6pcb
->in6p_route
)) != NULL
)
269 nd6_nud_hint(rt
, NULL
, 0);
273 nd6_hint(struct tcpcb
*tp
)
279 * Compute ACK transmission behavior. Delay the ACK unless
280 * we have already delayed an ACK (must send an ACK every two segments).
281 * We also ACK immediately if we received a PUSH and the ACK-on-PUSH
/*
 * tcp_setup_ack(tp, th): decide whether an ACK must go out right away.
 *
 * Sets TF_ACKNOW in tp->t_flags when either
 *   - TF_DELACK is already set in tp->t_flags, or
 *   - tcp_ack_on_push is non-zero and the segment header has TH_PUSH set.
 *
 * th is only read.
 * NOTE(review): only the "ACK now" branch is visible in this listing; what
 * happens when the condition is false cannot be confirmed from here.
 */
285 tcp_setup_ack(struct tcpcb
*tp
, const struct tcphdr
*th
)
288 if (tp
->t_flags
& TF_DELACK
||
289 (tcp_ack_on_push
&& th
->th_flags
& TH_PUSH
))
290 tp
->t_flags
|= TF_ACKNOW
;
/*
 * icmp_check(tp, th, acked): update path-MTU bookkeeping in tp.
 *
 *   tp    - control block whose t_flags and t_pmtud_mss_acked are updated.
 *   th    - header of the incoming segment; only th_ack is read.
 *   acked - amount of data acknowledged by this segment.
 *
 * Two independent steps:
 *   1. If TF_PMTUD_PEND is set and th->th_ack is past tp->t_pmtud_th_seq
 *      (SEQ_GT), clear TF_PMTUD_PEND.
 *   2. Raise tp->t_pmtud_mss_acked to `acked` when `acked` is larger.
 */
296 icmp_check(struct tcpcb
*tp
, const struct tcphdr
*th
, int acked
)
300 * If we had a pending ICMP message that refers to data that have
301 * just been acknowledged, disregard the recorded ICMP message.
303 if ((tp
->t_flags
& TF_PMTUD_PEND
) &&
304 SEQ_GT(th
->th_ack
, tp
->t_pmtud_th_seq
))
305 tp
->t_flags
&= ~TF_PMTUD_PEND
;
308 * Keep track of the largest chunk of data
309 * acknowledged since last PMTU update
311 if (tp
->t_pmtud_mss_acked
< acked
)
312 tp
->t_pmtud_mss_acked
= acked
;
316 * Convert TCP protocol fields to host order for easier processing.
319 tcp_fields_to_host(struct tcphdr
*th
)
329 * ... and reverse the above.
332 tcp_fields_to_net(struct tcphdr
*th
)
341 #ifdef TCP_CSUM_COUNTERS
342 #include <sys/device.h>
345 extern struct evcnt tcp_hwcsum_ok
;
346 extern struct evcnt tcp_hwcsum_bad
;
347 extern struct evcnt tcp_hwcsum_data
;
348 extern struct evcnt tcp_swcsum
;
349 #endif /* defined(INET) */
351 extern struct evcnt tcp6_hwcsum_ok
;
352 extern struct evcnt tcp6_hwcsum_bad
;
353 extern struct evcnt tcp6_hwcsum_data
;
354 extern struct evcnt tcp6_swcsum
;
355 #endif /* defined(INET6) */
357 #define TCP_CSUM_COUNTER_INCR(ev) (ev)->ev_count++
361 #define TCP_CSUM_COUNTER_INCR(ev) /* nothing */
363 #endif /* TCP_CSUM_COUNTERS */
365 #ifdef TCP_REASS_COUNTERS
366 #include <sys/device.h>
368 extern struct evcnt tcp_reass_
;
369 extern struct evcnt tcp_reass_empty
;
370 extern struct evcnt tcp_reass_iteration
[8];
371 extern struct evcnt tcp_reass_prependfirst
;
372 extern struct evcnt tcp_reass_prepend
;
373 extern struct evcnt tcp_reass_insert
;
374 extern struct evcnt tcp_reass_inserttail
;
375 extern struct evcnt tcp_reass_append
;
376 extern struct evcnt tcp_reass_appendtail
;
377 extern struct evcnt tcp_reass_overlaptail
;
378 extern struct evcnt tcp_reass_overlapfront
;
379 extern struct evcnt tcp_reass_segdup
;
380 extern struct evcnt tcp_reass_fragdup
;
382 #define TCP_REASS_COUNTER_INCR(ev) (ev)->ev_count++
386 #define TCP_REASS_COUNTER_INCR(ev) /* nothing */
388 #endif /* TCP_REASS_COUNTERS */
390 static int tcp_reass(struct tcpcb
*, const struct tcphdr
*, struct mbuf
*,
392 static int tcp_dooptions(struct tcpcb
*, const u_char
*, int,
393 struct tcphdr
*, struct mbuf
*, int, struct tcp_opt_info
*);
396 static void tcp4_log_refused(const struct ip
*, const struct tcphdr
*);
399 static void tcp6_log_refused(const struct ip6_hdr
*, const struct tcphdr
*);
/*
 * TRAVERSE(x): advance x along its m_next links until x->m_next is null,
 * leaving x at the last element of the chain.
 * x is assigned to and evaluated repeatedly, so it must be a modifiable
 * lvalue without side effects.  The expansion is a bare `while` statement
 * (no do { } while (0) wrapper); the caller supplies the terminating `;`.
 */
402 #define TRAVERSE(x) while ((x)->m_next) (x) = (x)->m_next
404 #if defined(MBUFTRACE)
405 struct mowner tcp_reass_mowner
= MOWNER_INIT("tcp", "reass");
406 #endif /* defined(MBUFTRACE) */
408 static struct pool tcpipqent_pool
;
414 pool_init(&tcpipqent_pool
, sizeof(struct ipqent
), 0, 0, 0, "tcpipqepl",
/*
 * tcpipqent_alloc(): obtain one entry from tcpipqent_pool via pool_get()
 * with the PR_NOWAIT flag.
 * NOTE(review): PR_NOWAIT presumably means the call does not sleep and may
 * yield NULL -- confirm against pool(9).  The return type, the declaration
 * of `ipqe` and the return statement are not present in this listing.
 */
419 tcpipqent_alloc(void)
425 ipqe
= pool_get(&tcpipqent_pool
, PR_NOWAIT
);
/*
 * tcpipqent_free(ipqe): hand the entry `ipqe` back to tcpipqent_pool with
 * pool_put().
 * NOTE(review): any other statements of this function are not present in
 * this listing.
 */
432 tcpipqent_free(struct ipqent
*ipqe
)
437 pool_put(&tcpipqent_pool
, ipqe
);
442 tcp_reass(struct tcpcb
*tp
, const struct tcphdr
*th
, struct mbuf
*m
, int *tlen
)
444 struct ipqent
*p
, *q
, *nq
, *tiqe
= NULL
;
445 struct socket
*so
= NULL
;
449 u_long rcvpartdupbyte
= 0;
451 #ifdef TCP_REASS_COUNTERS
457 so
= tp
->t_inpcb
->inp_socket
;
459 else if (tp
->t_in6pcb
)
460 so
= tp
->t_in6pcb
->in6p_socket
;
463 TCP_REASS_LOCK_CHECK(tp
);
466 * Call with th==0 after becoming established to
467 * force pre-ESTABLISHED data up to user socket.
472 m_claimm(m
, &tcp_reass_mowner
);
476 * Copy these to local variables because the tcpiphdr
477 * gets munged while we are collapsing mbufs.
479 pkt_seq
= th
->th_seq
;
481 pkt_flags
= th
->th_flags
;
483 TCP_REASS_COUNTER_INCR(&tcp_reass_
);
485 if ((p
= TAILQ_LAST(&tp
->segq
, ipqehead
)) != NULL
) {
487 * When we miss a packet, the vast majority of time we get
488 * packets that follow it in order. So optimize for that.
490 if (pkt_seq
== p
->ipqe_seq
+ p
->ipqe_len
) {
491 p
->ipqe_len
+= pkt_len
;
492 p
->ipqe_flags
|= pkt_flags
;
493 m_cat(p
->ipre_mlast
, m
);
494 TRAVERSE(p
->ipre_mlast
);
497 TAILQ_REMOVE(&tp
->timeq
, p
, ipqe_timeq
);
498 TCP_REASS_COUNTER_INCR(&tcp_reass_appendtail
);
499 goto skip_replacement
;
502 * While we're here, if the pkt is completely beyond
503 * anything we have, just insert it at the tail.
505 if (SEQ_GT(pkt_seq
, p
->ipqe_seq
+ p
->ipqe_len
)) {
506 TCP_REASS_COUNTER_INCR(&tcp_reass_inserttail
);
511 q
= TAILQ_FIRST(&tp
->segq
);
515 * If this segment immediately precedes the first out-of-order
516 * block, simply slap the segment in front of it and (mostly)
517 * skip the complicated logic.
519 if (pkt_seq
+ pkt_len
== q
->ipqe_seq
) {
520 q
->ipqe_seq
= pkt_seq
;
521 q
->ipqe_len
+= pkt_len
;
522 q
->ipqe_flags
|= pkt_flags
;
525 q
->ipre_mlast
= m
; /* last mbuf may have changed */
526 TRAVERSE(q
->ipre_mlast
);
528 TAILQ_REMOVE(&tp
->timeq
, q
, ipqe_timeq
);
529 TCP_REASS_COUNTER_INCR(&tcp_reass_prependfirst
);
530 goto skip_replacement
;
533 TCP_REASS_COUNTER_INCR(&tcp_reass_empty
);
537 * Find a segment which begins after this one does.
539 for (p
= NULL
; q
!= NULL
; q
= nq
) {
540 nq
= TAILQ_NEXT(q
, ipqe_q
);
541 #ifdef TCP_REASS_COUNTERS
545 * If the received segment is just right after this
546 * fragment, merge the two together and then check
547 * for further overlaps.
549 if (q
->ipqe_seq
+ q
->ipqe_len
== pkt_seq
) {
550 #ifdef TCPREASS_DEBUG
551 printf("tcp_reass[%p]: concat %u:%u(%u) to %u:%u(%u)\n",
552 tp
, pkt_seq
, pkt_seq
+ pkt_len
, pkt_len
,
553 q
->ipqe_seq
, q
->ipqe_seq
+ q
->ipqe_len
, q
->ipqe_len
);
555 pkt_len
+= q
->ipqe_len
;
556 pkt_flags
|= q
->ipqe_flags
;
557 pkt_seq
= q
->ipqe_seq
;
558 m_cat(q
->ipre_mlast
, m
);
559 TRAVERSE(q
->ipre_mlast
);
561 TCP_REASS_COUNTER_INCR(&tcp_reass_append
);
565 * If the received segment is completely past this
566 * fragment, we need to go the next fragment.
568 if (SEQ_LT(q
->ipqe_seq
+ q
->ipqe_len
, pkt_seq
)) {
573 * If the fragment is past the received segment,
574 * it (or any following) can't be concatenated.
576 if (SEQ_GT(q
->ipqe_seq
, pkt_seq
+ pkt_len
)) {
577 TCP_REASS_COUNTER_INCR(&tcp_reass_insert
);
582 * We've received all the data in this segment before.
583 * mark it as a duplicate and return.
585 if (SEQ_LEQ(q
->ipqe_seq
, pkt_seq
) &&
586 SEQ_GEQ(q
->ipqe_seq
+ q
->ipqe_len
, pkt_seq
+ pkt_len
)) {
587 tcps
= TCP_STAT_GETREF();
588 tcps
[TCP_STAT_RCVDUPPACK
]++;
589 tcps
[TCP_STAT_RCVDUPBYTE
] += pkt_len
;
591 tcp_new_dsack(tp
, pkt_seq
, pkt_len
);
594 tcpipqent_free(tiqe
);
596 TCP_REASS_COUNTER_INCR(&tcp_reass_segdup
);
600 * Received segment completely overlaps this fragment
601 * so we drop the fragment (this keeps the temporal
602 * ordering of segments correct).
604 if (SEQ_GEQ(q
->ipqe_seq
, pkt_seq
) &&
605 SEQ_LEQ(q
->ipqe_seq
+ q
->ipqe_len
, pkt_seq
+ pkt_len
)) {
606 rcvpartdupbyte
+= q
->ipqe_len
;
608 TCP_REASS_COUNTER_INCR(&tcp_reass_fragdup
);
612 * RX'ed segment extends past the end of the
613 * fragment. Drop the overlapping bytes. Then
614 * merge the fragment and segment then treat as
615 * a longer received packet.
617 if (SEQ_LT(q
->ipqe_seq
, pkt_seq
) &&
618 SEQ_GT(q
->ipqe_seq
+ q
->ipqe_len
, pkt_seq
)) {
619 int overlap
= q
->ipqe_seq
+ q
->ipqe_len
- pkt_seq
;
620 #ifdef TCPREASS_DEBUG
621 printf("tcp_reass[%p]: trim starting %d bytes of %u:%u(%u)\n",
623 pkt_seq
, pkt_seq
+ pkt_len
, pkt_len
);
626 rcvpartdupbyte
+= overlap
;
627 m_cat(q
->ipre_mlast
, m
);
628 TRAVERSE(q
->ipre_mlast
);
630 pkt_seq
= q
->ipqe_seq
;
631 pkt_len
+= q
->ipqe_len
- overlap
;
632 rcvoobyte
-= overlap
;
633 TCP_REASS_COUNTER_INCR(&tcp_reass_overlaptail
);
637 * RX'ed segment extends past the front of the
638 * fragment. Drop the overlapping bytes on the
639 * received packet. The packet will then be
640 * concatenated with this fragment a bit later.
642 if (SEQ_GT(q
->ipqe_seq
, pkt_seq
) &&
643 SEQ_LT(q
->ipqe_seq
, pkt_seq
+ pkt_len
)) {
644 int overlap
= pkt_seq
+ pkt_len
- q
->ipqe_seq
;
645 #ifdef TCPREASS_DEBUG
646 printf("tcp_reass[%p]: trim trailing %d bytes of %u:%u(%u)\n",
648 pkt_seq
, pkt_seq
+ pkt_len
, pkt_len
);
652 rcvpartdupbyte
+= overlap
;
653 TCP_REASS_COUNTER_INCR(&tcp_reass_overlapfront
);
654 rcvoobyte
-= overlap
;
657 * If the received segment immediately precedes this
658 * fragment then tack the fragment onto this segment
659 * and reinsert the data.
661 if (q
->ipqe_seq
== pkt_seq
+ pkt_len
) {
662 #ifdef TCPREASS_DEBUG
663 printf("tcp_reass[%p]: append %u:%u(%u) to %u:%u(%u)\n",
664 tp
, q
->ipqe_seq
, q
->ipqe_seq
+ q
->ipqe_len
, q
->ipqe_len
,
665 pkt_seq
, pkt_seq
+ pkt_len
, pkt_len
);
667 pkt_len
+= q
->ipqe_len
;
668 pkt_flags
|= q
->ipqe_flags
;
670 TAILQ_REMOVE(&tp
->segq
, q
, ipqe_q
);
671 TAILQ_REMOVE(&tp
->timeq
, q
, ipqe_timeq
);
673 KASSERT(tp
->t_segqlen
>= 0);
674 KASSERT(tp
->t_segqlen
!= 0 ||
675 (TAILQ_EMPTY(&tp
->segq
) &&
676 TAILQ_EMPTY(&tp
->timeq
)));
682 TCP_REASS_COUNTER_INCR(&tcp_reass_prepend
);
686 * If the fragment is before the segment, remember it.
687 * When this loop is terminated, p will contain the
688 * pointer to fragment that is right before the received
691 if (SEQ_LEQ(q
->ipqe_seq
, pkt_seq
))
697 * This is a common operation. It also will allow
698 * to save doing a malloc/free in most instances.
701 TAILQ_REMOVE(&tp
->segq
, q
, ipqe_q
);
702 TAILQ_REMOVE(&tp
->timeq
, q
, ipqe_timeq
);
704 KASSERT(tp
->t_segqlen
>= 0);
705 KASSERT(tp
->t_segqlen
!= 0 ||
706 (TAILQ_EMPTY(&tp
->segq
) && TAILQ_EMPTY(&tp
->timeq
)));
714 #ifdef TCP_REASS_COUNTERS
716 TCP_REASS_COUNTER_INCR(&tcp_reass_iteration
[0]);
718 TCP_REASS_COUNTER_INCR(&tcp_reass_iteration
[count
]);
724 * Allocate a new queue entry since the received segment did not
725 * collapse onto any other out-of-order block; thus we are allocating
726 * a new block. If it had collapsed, tiqe would not be NULL and
727 * we would be reusing it.
728 * XXX If we can't, just drop the packet. XXX
731 tiqe
= tcpipqent_alloc();
733 TCP_STATINC(TCP_STAT_RCVMEMDROP
);
740 * Update the counters.
742 tcps
= TCP_STAT_GETREF();
743 tcps
[TCP_STAT_RCVOOPACK
]++;
744 tcps
[TCP_STAT_RCVOOBYTE
] += rcvoobyte
;
745 if (rcvpartdupbyte
) {
746 tcps
[TCP_STAT_RCVPARTDUPPACK
]++;
747 tcps
[TCP_STAT_RCVPARTDUPBYTE
] += rcvpartdupbyte
;
752 * Insert the new fragment queue entry into both queues.
755 tiqe
->ipre_mlast
= m
;
756 tiqe
->ipqe_seq
= pkt_seq
;
757 tiqe
->ipqe_len
= pkt_len
;
758 tiqe
->ipqe_flags
= pkt_flags
;
760 TAILQ_INSERT_HEAD(&tp
->segq
, tiqe
, ipqe_q
);
761 #ifdef TCPREASS_DEBUG
762 if (tiqe
->ipqe_seq
!= tp
->rcv_nxt
)
763 printf("tcp_reass[%p]: insert %u:%u(%u) at front\n",
764 tp
, pkt_seq
, pkt_seq
+ pkt_len
, pkt_len
);
767 TAILQ_INSERT_AFTER(&tp
->segq
, p
, tiqe
, ipqe_q
);
768 #ifdef TCPREASS_DEBUG
769 printf("tcp_reass[%p]: insert %u:%u(%u) after %u:%u(%u)\n",
770 tp
, pkt_seq
, pkt_seq
+ pkt_len
, pkt_len
,
771 p
->ipqe_seq
, p
->ipqe_seq
+ p
->ipqe_len
, p
->ipqe_len
);
778 TAILQ_INSERT_HEAD(&tp
->timeq
, tiqe
, ipqe_timeq
);
782 * Present data to user, advancing rcv_nxt through
783 * completed sequence space.
785 if (TCPS_HAVEESTABLISHED(tp
->t_state
) == 0)
787 q
= TAILQ_FIRST(&tp
->segq
);
788 if (q
== NULL
|| q
->ipqe_seq
!= tp
->rcv_nxt
)
790 if (tp
->t_state
== TCPS_SYN_RECEIVED
&& q
->ipqe_len
)
793 tp
->rcv_nxt
+= q
->ipqe_len
;
794 pkt_flags
= q
->ipqe_flags
& TH_FIN
;
797 TAILQ_REMOVE(&tp
->segq
, q
, ipqe_q
);
798 TAILQ_REMOVE(&tp
->timeq
, q
, ipqe_timeq
);
800 KASSERT(tp
->t_segqlen
>= 0);
801 KASSERT(tp
->t_segqlen
!= 0 ||
802 (TAILQ_EMPTY(&tp
->segq
) && TAILQ_EMPTY(&tp
->timeq
)));
803 if (so
->so_state
& SS_CANTRCVMORE
)
806 sbappendstream(&so
->so_rcv
, q
->ipqe_m
);
/*
 * tcp6_input(mp, offp, proto): IPv6 entry point that forwards to tcp_input().
 *
 *   mp    - in: *mp is the received mbuf chain.
 *   offp  - in: *offp is passed through to tcp_input() as its second argument.
 *   proto - passed through to tcp_input() as its third argument.
 *
 * When the mbuf carries M_ANYCAST6 the visible code makes sure at least
 * sizeof(struct ip6_hdr) bytes are contiguous (m_pullup(), counting
 * TCP_STAT_RCVSHORT if that fails) and calls icmp6_error() with
 * ICMP6_DST_UNREACH / ICMP6_DST_UNREACH_ADDR.  The last argument is the byte
 * offset of ip6_dst within the IPv6 header.
 *
 * NOTE(review): the return statements and closing braces are not present in
 * this listing, so whether tcp_input() is still reached on the anycast path
 * cannot be confirmed from here.
 */
814 tcp6_input(struct mbuf
**mp
, int *offp
, int proto
)
816 struct mbuf
*m
= *mp
;
819 * draft-itojun-ipv6-tcp-to-anycast
820 * better place to put this in?
822 if (m
->m_flags
& M_ANYCAST6
) {
824 if (m
->m_len
< sizeof(struct ip6_hdr
)) {
825 if ((m
= m_pullup(m
, sizeof(struct ip6_hdr
))) == NULL
) {
826 TCP_STATINC(TCP_STAT_RCVSHORT
);
830 ip6
= mtod(m
, struct ip6_hdr
*);
831 icmp6_error(m
, ICMP6_DST_UNREACH
, ICMP6_DST_UNREACH_ADDR
,
832 (char *)&ip6
->ip6_dst
- (char *)ip6
);
836 tcp_input(m
, *offp
, proto
);
/*
 * tcp4_log_refused(ip, th): format a "connection attempt" message for an
 * IPv4 segment.
 *
 *   ip - IPv4 header; ip_src and ip_dst are rendered with inet_ntoa().
 *   th - TCP header; th_dport and th_sport are converted with ntohs().
 *
 * src and dst are each 4 * sizeof "123" == 16 bytes, which holds the longest
 * dotted quad ("255.255.255.255") plus the terminating NUL.  strlcpy() is
 * bounded by sizeof(src)/sizeof(dst), so the copies cannot overrun.
 *
 * NOTE(review): the condition that selects between the address strings and
 * the "(unknown)" fallback, and the logging call that consumes the format
 * string, are not present in this listing.
 */
843 tcp4_log_refused(const struct ip
*ip
, const struct tcphdr
*th
)
845 char src
[4*sizeof "123"];
846 char dst
[4*sizeof "123"];
849 strlcpy(src
, inet_ntoa(ip
->ip_src
), sizeof(src
));
850 strlcpy(dst
, inet_ntoa(ip
->ip_dst
), sizeof(dst
));
853 strlcpy(src
, "(unknown)", sizeof(src
));
854 strlcpy(dst
, "(unknown)", sizeof(dst
));
/* Destination address/port first, then source address/port. */
857 "Connection attempt to TCP %s:%d from %s:%d\n",
858 dst
, ntohs(th
->th_dport
),
859 src
, ntohs(th
->th_sport
));
/*
 * tcp6_log_refused(ip6, th): format a "connection attempt" message for an
 * IPv6 segment.
 *
 *   ip6 - IPv6 header; ip6_src and ip6_dst are rendered with ip6_sprintf().
 *   th  - TCP header; th_dport and th_sport are converted with ntohs().
 *
 * src and dst are INET6_ADDRSTRLEN bytes each and are filled with strlcpy()
 * bounded by their own size.  Addresses are printed inside square brackets
 * so the port separator is unambiguous.
 *
 * NOTE(review): the condition that selects between the address strings and
 * the "(unknown v6)" fallback, and the logging call that consumes the format
 * string, are not present in this listing.
 */
865 tcp6_log_refused(const struct ip6_hdr
*ip6
, const struct tcphdr
*th
)
867 char src
[INET6_ADDRSTRLEN
];
868 char dst
[INET6_ADDRSTRLEN
];
871 strlcpy(src
, ip6_sprintf(&ip6
->ip6_src
), sizeof(src
));
872 strlcpy(dst
, ip6_sprintf(&ip6
->ip6_dst
), sizeof(dst
));
875 strlcpy(src
, "(unknown v6)", sizeof(src
));
876 strlcpy(dst
, "(unknown v6)", sizeof(dst
));
/* Destination address/port first, then source address/port. */
879 "Connection attempt to TCP [%s]:%d from [%s]:%d\n",
880 dst
, ntohs(th
->th_dport
),
881 src
, ntohs(th
->th_sport
));
886 * Checksum extended TCP header and data.
889 tcp_input_checksum(int af
, struct mbuf
*m
, const struct tcphdr
*th
,
890 int toff
, int off
, int tlen
)
894 * XXX it's better to record and check if this mbuf is
901 switch (m
->m_pkthdr
.csum_flags
&
902 ((m
->m_pkthdr
.rcvif
->if_csum_flags_rx
& M_CSUM_TCPv4
) |
903 M_CSUM_TCP_UDP_BAD
| M_CSUM_DATA
)) {
904 case M_CSUM_TCPv4
|M_CSUM_TCP_UDP_BAD
:
905 TCP_CSUM_COUNTER_INCR(&tcp_hwcsum_bad
);
908 case M_CSUM_TCPv4
|M_CSUM_DATA
: {
909 u_int32_t hw_csum
= m
->m_pkthdr
.csum_data
;
911 TCP_CSUM_COUNTER_INCR(&tcp_hwcsum_data
);
912 if (m
->m_pkthdr
.csum_flags
& M_CSUM_NO_PSEUDOHDR
) {
913 const struct ip
*ip
=
914 mtod(m
, const struct ip
*);
916 hw_csum
= in_cksum_phdr(ip
->ip_src
.s_addr
,
918 htons(hw_csum
+ tlen
+ off
+ IPPROTO_TCP
));
920 if ((hw_csum
^ 0xffff) != 0)
926 /* Checksum was okay. */
927 TCP_CSUM_COUNTER_INCR(&tcp_hwcsum_ok
);
932 * Must compute it ourselves. Maybe skip checksum
933 * on loopback interfaces.
935 if (__predict_true(!(m
->m_pkthdr
.rcvif
->if_flags
&
937 tcp_do_loopback_cksum
)) {
938 TCP_CSUM_COUNTER_INCR(&tcp_swcsum
);
939 if (in4_cksum(m
, IPPROTO_TCP
, toff
,
950 switch (m
->m_pkthdr
.csum_flags
&
951 ((m
->m_pkthdr
.rcvif
->if_csum_flags_rx
& M_CSUM_TCPv6
) |
952 M_CSUM_TCP_UDP_BAD
| M_CSUM_DATA
)) {
953 case M_CSUM_TCPv6
|M_CSUM_TCP_UDP_BAD
:
954 TCP_CSUM_COUNTER_INCR(&tcp6_hwcsum_bad
);
958 case M_CSUM_TCPv6
|M_CSUM_DATA
:
962 /* Checksum was okay. */
963 TCP_CSUM_COUNTER_INCR(&tcp6_hwcsum_ok
);
968 * Must compute it ourselves. Maybe skip checksum
969 * on loopback interfaces.
971 if (__predict_true((m
->m_flags
& M_LOOP
) == 0 ||
972 tcp_do_loopback_cksum
)) {
973 TCP_CSUM_COUNTER_INCR(&tcp6_swcsum
);
974 if (in6_cksum(m
, IPPROTO_TCP
, toff
,
986 TCP_STATINC(TCP_STAT_RCVBADSUM
);
991 * TCP input routine, follows pages 65-76 of RFC 793 very closely.
994 tcp_input(struct mbuf
*m
, ...)
1000 struct ip6_hdr
*ip6
;
1001 struct in6pcb
*in6p
;
1003 u_int8_t
*optp
= NULL
;
1005 int len
, tlen
, toff
, hdroptlen
= 0;
1006 struct tcpcb
*tp
= 0;
1008 struct socket
*so
= NULL
;
1009 int todrop
, dupseg
, acked
, ourfinisacked
, needoutput
= 0;
1014 struct tcp_opt_info opti
;
1017 int af
; /* af on the wire */
1018 struct mbuf
*tcp_saveti
= NULL
;
1023 MCLAIM(m
, &tcp_rx_mowner
);
1025 toff
= va_arg(ap
, int);
1026 (void)va_arg(ap
, int); /* ignore value, advance ap */
1029 TCP_STATINC(TCP_STAT_RCVTOTAL
);
1031 memset(&opti
, 0, sizeof(opti
));
1032 opti
.ts_present
= 0;
1036 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN.
1038 * TCP is, by definition, unicast, so we reject all
1039 * multicast outright.
1041 * Note, there are additional src/dst address checks in
1042 * the AF-specific code below.
1044 if (m
->m_flags
& (M_BCAST
|M_MCAST
)) {
1049 if (m
->m_flags
& M_ANYCAST6
) {
1056 * Get IP and TCP header.
1057 * Note: IP leaves IP header in first mbuf.
1059 ip
= mtod(m
, struct ip
*);
1067 iphlen
= sizeof(struct ip
);
1068 ip
= mtod(m
, struct ip
*);
1069 IP6_EXTHDR_GET(th
, struct tcphdr
*, m
, toff
,
1070 sizeof(struct tcphdr
));
1072 TCP_STATINC(TCP_STAT_RCVSHORT
);
1075 /* We do the checksum after PCB lookup... */
1076 len
= ntohs(ip
->ip_len
);
1084 iphlen
= sizeof(struct ip6_hdr
);
1086 ip6
= mtod(m
, struct ip6_hdr
*);
1087 IP6_EXTHDR_GET(th
, struct tcphdr
*, m
, toff
,
1088 sizeof(struct tcphdr
));
1090 TCP_STATINC(TCP_STAT_RCVSHORT
);
1094 /* Be proactive about malicious use of IPv4 mapped address */
1095 if (IN6_IS_ADDR_V4MAPPED(&ip6
->ip6_src
) ||
1096 IN6_IS_ADDR_V4MAPPED(&ip6
->ip6_dst
)) {
1102 * Be proactive about unspecified IPv6 address in source.
1103 * As we use all-zero to indicate unbounded/unconnected pcb,
1104 * unspecified IPv6 address can be used to confuse us.
1106 * Note that packets with unspecified IPv6 destination are
1107 * already dropped in ip6_input.
1109 if (IN6_IS_ADDR_UNSPECIFIED(&ip6
->ip6_src
)) {
1115 * Make sure destination address is not multicast.
1116 * Source address checked in ip6_input().
1118 if (IN6_IS_ADDR_MULTICAST(&ip6
->ip6_dst
)) {
1123 /* We do the checksum after PCB lookup... */
1124 len
= m
->m_pkthdr
.len
;
1126 iptos
= (ntohl(ip6
->ip6_flow
) >> 20) & 0xff;
1134 KASSERT(TCP_HDR_ALIGNED_P(th
));
1137 * Check that TCP offset makes sense,
1138 * pull out TCP options and adjust length. XXX
1140 off
= th
->th_off
<< 2;
1141 if (off
< sizeof (struct tcphdr
) || off
> tlen
) {
1142 TCP_STATINC(TCP_STAT_RCVBADOFF
);
1148 * tcp_input() has been modified to use tlen to mean the TCP data
1149 * length throughout the function. Other functions can use
1150 * m->m_pkthdr.len as the basis for calculating the TCP data length.
1154 if (off
> sizeof (struct tcphdr
)) {
1155 IP6_EXTHDR_GET(th
, struct tcphdr
*, m
, toff
, off
);
1157 TCP_STATINC(TCP_STAT_RCVSHORT
);
1161 * NOTE: ip/ip6 will not be affected by m_pulldown()
1162 * (as they're before toff) and we don't need to update those.
1164 KASSERT(TCP_HDR_ALIGNED_P(th
));
1165 optlen
= off
- sizeof (struct tcphdr
);
1166 optp
= ((u_int8_t
*)th
) + sizeof(struct tcphdr
);
1168 * Do quick retrieval of timestamp options ("options
1169 * prediction?"). If timestamp is the only option and it's
1170 * formatted as recommended in RFC 1323 appendix A, we
1171 * quickly get the values now and not bother calling
1172 * tcp_dooptions(), etc.
1174 if ((optlen
== TCPOLEN_TSTAMP_APPA
||
1175 (optlen
> TCPOLEN_TSTAMP_APPA
&&
1176 optp
[TCPOLEN_TSTAMP_APPA
] == TCPOPT_EOL
)) &&
1177 *(u_int32_t
*)optp
== htonl(TCPOPT_TSTAMP_HDR
) &&
1178 (th
->th_flags
& TH_SYN
) == 0) {
1179 opti
.ts_present
= 1;
1180 opti
.ts_val
= ntohl(*(u_int32_t
*)(optp
+ 4));
1181 opti
.ts_ecr
= ntohl(*(u_int32_t
*)(optp
+ 8));
1182 optp
= NULL
; /* we've parsed the options */
1185 tiflags
= th
->th_flags
;
1188 * Locate pcb for segment.
1198 inp
= in_pcblookup_connect(&tcbtable
, ip
->ip_src
, th
->th_sport
,
1199 ip
->ip_dst
, th
->th_dport
);
1201 TCP_STATINC(TCP_STAT_PCBHASHMISS
);
1202 inp
= in_pcblookup_bind(&tcbtable
, ip
->ip_dst
, th
->th_dport
);
1206 struct in6_addr s
, d
;
1208 /* mapped addr case */
1209 memset(&s
, 0, sizeof(s
));
1210 s
.s6_addr16
[5] = htons(0xffff);
1211 bcopy(&ip
->ip_src
, &s
.s6_addr32
[3], sizeof(ip
->ip_src
));
1212 memset(&d
, 0, sizeof(d
));
1213 d
.s6_addr16
[5] = htons(0xffff);
1214 bcopy(&ip
->ip_dst
, &d
.s6_addr32
[3], sizeof(ip
->ip_dst
));
1215 in6p
= in6_pcblookup_connect(&tcbtable
, &s
,
1216 th
->th_sport
, &d
, th
->th_dport
, 0);
1218 TCP_STATINC(TCP_STAT_PCBHASHMISS
);
1219 in6p
= in6_pcblookup_bind(&tcbtable
, &d
,
1227 if (inp
== 0 && in6p
== 0)
1230 TCP_STATINC(TCP_STAT_NOPORT
);
1231 if (tcp_log_refused
&&
1232 (tiflags
& (TH_RST
|TH_ACK
|TH_SYN
)) == TH_SYN
) {
1233 tcp4_log_refused(ip
, th
);
1235 tcp_fields_to_host(th
);
1236 goto dropwithreset_ratelim
;
1238 #if defined(IPSEC) || defined(FAST_IPSEC)
1239 if (inp
&& (inp
->inp_socket
->so_options
& SO_ACCEPTCONN
) == 0 &&
1240 ipsec4_in_reject(m
, inp
)) {
1241 IPSEC_STATINC(IPSEC_STAT_IN_POLVIO
);
1246 (in6p
->in6p_socket
->so_options
& SO_ACCEPTCONN
) == 0 &&
1247 ipsec6_in_reject_so(m
, in6p
->in6p_socket
)) {
1248 IPSEC_STATINC(IPSEC_STAT_IN_POLVIO
);
1260 #if defined(NFAITH) && NFAITH > 0
1261 faith
= faithprefix(&ip6
->ip6_dst
);
1265 in6p
= in6_pcblookup_connect(&tcbtable
, &ip6
->ip6_src
,
1266 th
->th_sport
, &ip6
->ip6_dst
, th
->th_dport
, faith
);
1268 TCP_STATINC(TCP_STAT_PCBHASHMISS
);
1269 in6p
= in6_pcblookup_bind(&tcbtable
, &ip6
->ip6_dst
,
1270 th
->th_dport
, faith
);
1273 TCP_STATINC(TCP_STAT_NOPORT
);
1274 if (tcp_log_refused
&&
1275 (tiflags
& (TH_RST
|TH_ACK
|TH_SYN
)) == TH_SYN
) {
1276 tcp6_log_refused(ip6
, th
);
1278 tcp_fields_to_host(th
);
1279 goto dropwithreset_ratelim
;
1281 #if defined(IPSEC) || defined(FAST_IPSEC)
1282 if ((in6p
->in6p_socket
->so_options
& SO_ACCEPTCONN
) == 0 &&
1283 ipsec6_in_reject(m
, in6p
)) {
1284 IPSEC6_STATINC(IPSEC_STAT_IN_POLVIO
);
1294 * If the state is CLOSED (i.e., TCB does not exist) then
1295 * all data in the incoming segment is discarded.
1296 * If the TCB exists but is in CLOSED state, it is embryonic,
1297 * but should either do a listen or a connect soon.
1302 /* Check the minimum TTL for socket. */
1303 if (ip
->ip_ttl
< inp
->inp_ip_minttl
)
1306 tp
= intotcpcb(inp
);
1307 so
= inp
->inp_socket
;
1311 tp
= in6totcpcb(in6p
);
1312 so
= in6p
->in6p_socket
;
1316 tcp_fields_to_host(th
);
1317 goto dropwithreset_ratelim
;
1319 if (tp
->t_state
== TCPS_CLOSED
)
1322 KASSERT(so
->so_lock
== softnet_lock
);
1323 KASSERT(solocked(so
));
1326 * Checksum extended TCP header and data.
1328 if (tcp_input_checksum(af
, m
, th
, toff
, off
, tlen
))
1331 tcp_fields_to_host(th
);
1333 /* Unscale the window into a 32-bit value. */
1334 if ((tiflags
& TH_SYN
) == 0)
1335 tiwin
= th
->th_win
<< tp
->snd_scale
;
1340 /* save packet options if user wanted */
1341 if (in6p
&& (in6p
->in6p_flags
& IN6P_CONTROLOPTS
)) {
1342 if (in6p
->in6p_options
) {
1343 m_freem(in6p
->in6p_options
);
1344 in6p
->in6p_options
= 0;
1346 KASSERT(ip6
!= NULL
);
1347 ip6_savecontrol(in6p
, &in6p
->in6p_options
, ip6
, m
);
1351 if (so
->so_options
& (SO_DEBUG
|SO_ACCEPTCONN
)) {
1352 union syn_cache_sa src
;
1353 union syn_cache_sa dst
;
1355 memset(&src
, 0, sizeof(src
));
1356 memset(&dst
, 0, sizeof(dst
));
1360 src
.sin
.sin_len
= sizeof(struct sockaddr_in
);
1361 src
.sin
.sin_family
= AF_INET
;
1362 src
.sin
.sin_addr
= ip
->ip_src
;
1363 src
.sin
.sin_port
= th
->th_sport
;
1365 dst
.sin
.sin_len
= sizeof(struct sockaddr_in
);
1366 dst
.sin
.sin_family
= AF_INET
;
1367 dst
.sin
.sin_addr
= ip
->ip_dst
;
1368 dst
.sin
.sin_port
= th
->th_dport
;
1373 src
.sin6
.sin6_len
= sizeof(struct sockaddr_in6
);
1374 src
.sin6
.sin6_family
= AF_INET6
;
1375 src
.sin6
.sin6_addr
= ip6
->ip6_src
;
1376 src
.sin6
.sin6_port
= th
->th_sport
;
1378 dst
.sin6
.sin6_len
= sizeof(struct sockaddr_in6
);
1379 dst
.sin6
.sin6_family
= AF_INET6
;
1380 dst
.sin6
.sin6_addr
= ip6
->ip6_dst
;
1381 dst
.sin6
.sin6_port
= th
->th_dport
;
1385 goto badsyn
; /*sanity*/
1388 if (so
->so_options
& SO_DEBUG
) {
1390 ostate
= tp
->t_state
;
1394 if (iphlen
+ sizeof(struct tcphdr
) > MHLEN
)
1397 if (m
->m_len
> iphlen
&& (m
->m_flags
& M_EXT
) == 0) {
1398 tcp_saveti
= m_copym(m
, 0, iphlen
, M_DONTWAIT
);
1402 MGETHDR(tcp_saveti
, M_DONTWAIT
, MT_HEADER
);
1405 MCLAIM(m
, &tcp_mowner
);
1406 tcp_saveti
->m_len
= iphlen
;
1407 m_copydata(m
, 0, iphlen
,
1408 mtod(tcp_saveti
, void *));
1411 if (M_TRAILINGSPACE(tcp_saveti
) < sizeof(struct tcphdr
)) {
1412 m_freem(tcp_saveti
);
1415 tcp_saveti
->m_len
+= sizeof(struct tcphdr
);
1416 memcpy(mtod(tcp_saveti
, char *) + iphlen
, th
,
1417 sizeof(struct tcphdr
));
1421 if (so
->so_options
& SO_ACCEPTCONN
) {
1422 if ((tiflags
& (TH_RST
|TH_ACK
|TH_SYN
)) != TH_SYN
) {
1423 if (tiflags
& TH_RST
) {
1424 syn_cache_reset(&src
.sa
, &dst
.sa
, th
);
1425 } else if ((tiflags
& (TH_ACK
|TH_SYN
)) ==
1428 * Received a SYN,ACK. This should
1429 * never happen while we are in
1430 * LISTEN. Send an RST.
1433 } else if (tiflags
& TH_ACK
) {
1434 so
= syn_cache_get(&src
.sa
, &dst
.sa
,
1435 th
, toff
, tlen
, so
, m
);
1438 * We don't have a SYN for
1439 * this ACK; send an RST.
1443 (struct socket
*)(-1)) {
1445 * We were unable to create
1446 * the connection. If the
1447 * 3-way handshake was
1448 * completed, an RST has
1449 * been sent to the peer.
1450 * Since the mbuf might be
1451 * in use for the reply,
1458 * full-blown connection.
1465 switch (so
->so_proto
->pr_domain
->dom_family
) {
1468 inp
= sotoinpcb(so
);
1469 tp
= intotcpcb(inp
);
1474 in6p
= sotoin6pcb(so
);
1475 tp
= in6totcpcb(in6p
);
1480 goto badsyn
; /*XXX*/
1481 tiwin
<<= tp
->snd_scale
;
1486 * None of RST, SYN or ACK was set.
1487 * This is an invalid packet for a
1488 * TCB in LISTEN state. Send a RST.
1496 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
1498 if (m
->m_flags
& (M_BCAST
|M_MCAST
))
1504 if (IN6_IS_ADDR_MULTICAST(&ip6
->ip6_dst
))
1509 if (IN_MULTICAST(ip
->ip_dst
.s_addr
) ||
1510 in_broadcast(ip
->ip_dst
, m
->m_pkthdr
.rcvif
))
1517 * If deprecated address is forbidden, we do
1518 * not accept SYN to deprecated interface
1519 * address to prevent any new inbound
1520 * connection from getting established.
1521 * When we do not accept SYN, we send a TCP
1522 * RST, with deprecated source address (instead
1523 * of dropping it). We compromise it as it is
1524 * much better for peer to send a RST, and
1525 * RST will be the final packet for the
1528 * If we do not forbid deprecated addresses, we
1529 * accept the SYN packet. RFC2462 does not
1530 * suggest dropping SYN in this case.
1531 * If we decipher RFC2462 5.5.4, it says like
1533 * 1. use of deprecated addr with existing
1534 * communication is okay - "SHOULD continue
1536 * 2. use of it with new communication:
1537 * (2a) "SHOULD NOT be used if alternate
1538 * address with sufficient scope is
1540 * (2b) nothing mentioned otherwise.
1541 * Here we fall into (2b) case as we have no
1542 * choice in our source address selection - we
1543 * must obey the peer.
1545 * The wording in RFC2462 is confusing, and
1546 * there are multiple description text for
1547 * deprecated address handling - worse, they
1548 * are not exactly the same. I believe 5.5.4
1549 * is the best one, so we follow 5.5.4.
1551 if (af
== AF_INET6
&& !ip6_use_deprecated
) {
1552 struct in6_ifaddr
*ia6
;
1553 if ((ia6
= in6ifa_ifpwithaddr(m
->m_pkthdr
.rcvif
,
1555 (ia6
->ia6_flags
& IN6_IFF_DEPRECATED
)) {
1562 #if defined(IPSEC) || defined(FAST_IPSEC)
1566 if (ipsec4_in_reject_so(m
, so
)) {
1567 IPSEC_STATINC(IPSEC_STAT_IN_POLVIO
);
1575 if (ipsec6_in_reject_so(m
, so
)) {
1576 IPSEC6_STATINC(IPSEC_STAT_IN_POLVIO
);
1586 * LISTEN socket received a SYN
1587 * from itself? This can't possibly
1588 * be valid; drop the packet.
1590 if (th
->th_sport
== th
->th_dport
) {
1596 i
= in_hosteq(ip
->ip_src
, ip
->ip_dst
);
1601 i
= IN6_ARE_ADDR_EQUAL(&ip6
->ip6_src
, &ip6
->ip6_dst
);
1608 TCP_STATINC(TCP_STAT_BADSYN
);
1614 * SYN looks ok; create compressed TCP
1617 if (so
->so_qlen
<= so
->so_qlimit
&&
1618 syn_cache_add(&src
.sa
, &dst
.sa
, th
, tlen
,
1619 so
, m
, optp
, optlen
, &opti
))
1629 * Should not happen now that all embryonic connections
1630 * are handled with compressed state.
1632 if (tp
->t_state
== TCPS_LISTEN
)
1633 panic("tcp_input: TCPS_LISTEN");
1637 * Segment received on connection.
1638 * Reset idle time and keep-alive timer.
1640 tp
->t_rcvtime
= tcp_now
;
1641 if (TCPS_HAVEESTABLISHED(tp
->t_state
))
1642 TCP_TIMER_ARM(tp
, TCPT_KEEP
, tp
->t_keepidle
);
1647 #ifdef TCP_SIGNATURE
1648 if (optp
|| (tp
->t_flags
& TF_SIGNATURE
))
1652 if (tcp_dooptions(tp
, optp
, optlen
, th
, m
, toff
, &opti
) < 0)
1655 if (TCP_SACK_ENABLED(tp
)) {
1656 tcp_del_sackholes(tp
, th
);
1659 if (TCP_ECN_ALLOWED(tp
)) {
1660 switch (iptos
& IPTOS_ECN_MASK
) {
1662 tp
->t_flags
|= TF_ECN_SND_ECE
;
1663 TCP_STATINC(TCP_STAT_ECN_CE
);
1665 case IPTOS_ECN_ECT0
:
1666 TCP_STATINC(TCP_STAT_ECN_ECT
);
1668 case IPTOS_ECN_ECT1
:
1669 /* XXX: ignore for now -- rpaulo */
1673 if (tiflags
& TH_CWR
)
1674 tp
->t_flags
&= ~TF_ECN_SND_ECE
;
1677 * Congestion experienced.
1678 * Ignore if we are already trying to recover.
1680 if ((tiflags
& TH_ECE
) && SEQ_GEQ(tp
->snd_una
, tp
->snd_recover
))
1681 tp
->t_congctl
->cong_exp(tp
);
1684 if (opti
.ts_present
&& opti
.ts_ecr
) {
1686 * Calculate the RTT from the returned time stamp and the
1687 * connection's time base. If the time stamp is later than
1688 * the current time, or is extremely old, fall back to non-1323
1689 * RTT calculation. Since ts_ecr is unsigned, we can test both
1692 ts_rtt
= TCP_TIMESTAMP(tp
) - opti
.ts_ecr
+ 1;
1693 if (ts_rtt
> TCP_PAWS_IDLE
)
1700 * Header prediction: check for the two common cases
1701 * of a uni-directional data xfer. If the packet has
1702 * no control flags, is in-sequence, the window didn't
1703 * change and we're not retransmitting, it's a
1704 * candidate. If the length is zero and the ack moved
1705 * forward, we're the sender side of the xfer. Just
1706 * free the data acked & wake any higher level process
1707 * that was blocked waiting for space. If the length
1708 * is non-zero and the ack didn't move, we're the
1709 * receiver side. If we're getting packets in-order
1710 * (the reassembly queue is empty), add the data to
1711 * the socket buffer and note that we need a delayed ack.
1713 if (tp
->t_state
== TCPS_ESTABLISHED
&&
1714 (tiflags
& (TH_SYN
|TH_FIN
|TH_RST
|TH_URG
|TH_ECE
|TH_CWR
|TH_ACK
))
1716 (!opti
.ts_present
|| TSTMP_GEQ(opti
.ts_val
, tp
->ts_recent
)) &&
1717 th
->th_seq
== tp
->rcv_nxt
&&
1718 tiwin
&& tiwin
== tp
->snd_wnd
&&
1719 tp
->snd_nxt
== tp
->snd_max
) {
1722 * If last ACK falls within this segment's sequence numbers,
1723 * record the timestamp.
1724 * NOTE that the test is modified according to the latest
1725 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
1727 * note that we already know
1728 * TSTMP_GEQ(opti.ts_val, tp->ts_recent)
1730 if (opti
.ts_present
&&
1731 SEQ_LEQ(th
->th_seq
, tp
->last_ack_sent
)) {
1732 tp
->ts_recent_age
= tcp_now
;
1733 tp
->ts_recent
= opti
.ts_val
;
1737 /* Ack prediction. */
1738 if (SEQ_GT(th
->th_ack
, tp
->snd_una
) &&
1739 SEQ_LEQ(th
->th_ack
, tp
->snd_max
) &&
1740 tp
->snd_cwnd
>= tp
->snd_wnd
&&
1741 tp
->t_partialacks
< 0) {
1743 * this is a pure ack for outstanding data.
1746 tcp_xmit_timer(tp
, ts_rtt
);
1747 else if (tp
->t_rtttime
&&
1748 SEQ_GT(th
->th_ack
, tp
->t_rtseq
))
1750 tcp_now
- tp
->t_rtttime
);
1751 acked
= th
->th_ack
- tp
->snd_una
;
1752 tcps
= TCP_STAT_GETREF();
1753 tcps
[TCP_STAT_PREDACK
]++;
1754 tcps
[TCP_STAT_RCVACKPACK
]++;
1755 tcps
[TCP_STAT_RCVACKBYTE
] += acked
;
1759 if (acked
> (tp
->t_lastoff
- tp
->t_inoff
))
1761 sbdrop(&so
->so_snd
, acked
);
1762 tp
->t_lastoff
-= acked
;
1764 icmp_check(tp
, th
, acked
);
1766 tp
->snd_una
= th
->th_ack
;
1767 tp
->snd_fack
= tp
->snd_una
;
1768 if (SEQ_LT(tp
->snd_high
, tp
->snd_una
))
1769 tp
->snd_high
= tp
->snd_una
;
1773 * If all outstanding data are acked, stop
1774 * retransmit timer, otherwise restart timer
1775 * using current (possibly backed-off) value.
1776 * If process is waiting for space,
1777 * wakeup/selnotify/signal. If data
1778 * are ready to send, let tcp_output
1779 * decide between more output or persist.
1781 if (tp
->snd_una
== tp
->snd_max
)
1782 TCP_TIMER_DISARM(tp
, TCPT_REXMT
);
1783 else if (TCP_TIMER_ISARMED(tp
,
1785 TCP_TIMER_ARM(tp
, TCPT_REXMT
,
1789 if (so
->so_snd
.sb_cc
)
1790 (void) tcp_output(tp
);
1792 m_freem(tcp_saveti
);
1795 } else if (th
->th_ack
== tp
->snd_una
&&
1796 TAILQ_FIRST(&tp
->segq
) == NULL
&&
1797 tlen
<= sbspace(&so
->so_rcv
)) {
1798 int newsize
= 0; /* automatic sockbuf scaling */
1801 * this is a pure, in-sequence data packet
1802 * with nothing on the reassembly queue and
1803 * we have enough buffer space to take it.
1805 tp
->rcv_nxt
+= tlen
;
1806 tcps
= TCP_STAT_GETREF();
1807 tcps
[TCP_STAT_PREDDAT
]++;
1808 tcps
[TCP_STAT_RCVPACK
]++;
1809 tcps
[TCP_STAT_RCVBYTE
] += tlen
;
1814 * Automatic sizing enables the performance of large buffers
1815 * and most of the efficiency of small ones by only allocating
1816 * space when it is needed.
1818 * On the receive side the socket buffer memory is only rarely
1819 * used to any significant extent. This allows us to be much
1820 * more aggressive in scaling the receive socket buffer. For
1821 * the case that the buffer space is actually used to a large
1822 * extent and we run out of kernel memory we can simply drop
1823 * the new segments; TCP on the sender will just retransmit it
1824 * later. Setting the buffer size too big may only consume too
1825 * much kernel memory if the application doesn't read() from
1826 * the socket or packet loss or reordering makes use of the
1829 * The criteria to step up the receive buffer one notch are:
1830 * 1. the number of bytes received during the time it takes
1831 * one timestamp to be reflected back to us (the RTT);
1832 * 2. received bytes per RTT is within seven eighth of the
1833 * current socket buffer size;
1834 * 3. receive buffer size has not hit maximal automatic size;
1836 * This algorithm does one step per RTT at most and only if
1837 * we receive a bulk stream w/o packet losses or reorderings.
1838 * Shrinking the buffer during idle times is not necessary as
1839 * it doesn't consume any memory when idle.
1841 * TODO: Only step up if the application is actually serving
1842 * the buffer to better manage the socket buffer resources.
1844 if (tcp_do_autorcvbuf
&&
1846 (so
->so_rcv
.sb_flags
& SB_AUTOSIZE
)) {
1847 if (opti
.ts_ecr
> tp
->rfbuf_ts
&&
1848 opti
.ts_ecr
- tp
->rfbuf_ts
< PR_SLOWHZ
) {
1850 (so
->so_rcv
.sb_hiwat
/ 8 * 7) &&
1851 so
->so_rcv
.sb_hiwat
<
1852 tcp_autorcvbuf_max
) {
1854 min(so
->so_rcv
.sb_hiwat
+
1856 tcp_autorcvbuf_max
);
1858 /* Start over with next RTT. */
1862 tp
->rfbuf_cnt
+= tlen
; /* add up */
1866 * Drop TCP, IP headers and TCP options then add data
1869 if (so
->so_state
& SS_CANTRCVMORE
)
1873 * Set new socket buffer size.
1874 * Give up when limit is reached.
1877 if (!sbreserve(&so
->so_rcv
,
1879 so
->so_rcv
.sb_flags
&= ~SB_AUTOSIZE
;
1880 m_adj(m
, toff
+ off
);
1881 sbappendstream(&so
->so_rcv
, m
);
1884 tcp_setup_ack(tp
, th
);
1885 if (tp
->t_flags
& TF_ACKNOW
)
1886 (void) tcp_output(tp
);
1888 m_freem(tcp_saveti
);
1894 * Compute mbuf offset to TCP data segment.
1896 hdroptlen
= toff
+ off
;
1899 * Calculate amount of space in receive window,
1900 * and then do TCP input processing.
1901 * Receive window is amount of space in rcv queue,
1902 * but not less than advertised window.
1906 win
= sbspace(&so
->so_rcv
);
1909 tp
->rcv_wnd
= imax(win
, (int)(tp
->rcv_adv
- tp
->rcv_nxt
));
1912 /* Reset receive buffer auto scaling when not in bulk receive mode. */
1916 switch (tp
->t_state
) {
1918 * If the state is SYN_SENT:
1919 * if seg contains an ACK, but not for our SYN, drop the input.
1920 * if seg contains a RST, then drop the connection.
1921 * if seg does not contain SYN, then drop it.
1922 * Otherwise this is an acceptable SYN segment
1923 * initialize tp->rcv_nxt and tp->irs
1924 * if seg contains ack then advance tp->snd_una
1925 * if seg contains a ECE and ECN support is enabled, the stream
1927 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state
1928 * arrange for segment to be acked (eventually)
1929 * continue processing rest of data/controls, beginning with URG
1932 if ((tiflags
& TH_ACK
) &&
1933 (SEQ_LEQ(th
->th_ack
, tp
->iss
) ||
1934 SEQ_GT(th
->th_ack
, tp
->snd_max
)))
1936 if (tiflags
& TH_RST
) {
1937 if (tiflags
& TH_ACK
)
1938 tp
= tcp_drop(tp
, ECONNREFUSED
);
1941 if ((tiflags
& TH_SYN
) == 0)
1943 if (tiflags
& TH_ACK
) {
1944 tp
->snd_una
= th
->th_ack
;
1945 if (SEQ_LT(tp
->snd_nxt
, tp
->snd_una
))
1946 tp
->snd_nxt
= tp
->snd_una
;
1947 if (SEQ_LT(tp
->snd_high
, tp
->snd_una
))
1948 tp
->snd_high
= tp
->snd_una
;
1949 TCP_TIMER_DISARM(tp
, TCPT_REXMT
);
1951 if ((tiflags
& TH_ECE
) && tcp_do_ecn
) {
1952 tp
->t_flags
|= TF_ECN_PERMIT
;
1953 TCP_STATINC(TCP_STAT_ECN_SHS
);
1957 tp
->irs
= th
->th_seq
;
1959 tp
->t_flags
|= TF_ACKNOW
;
1960 tcp_mss_from_peer(tp
, opti
.maxseg
);
1963 * Initialize the initial congestion window. If we
1964 * had to retransmit the SYN, we must initialize cwnd
1965 * to 1 segment (i.e. the Loss Window).
1967 if (tp
->t_flags
& TF_SYN_REXMT
)
1968 tp
->snd_cwnd
= tp
->t_peermss
;
1970 int ss
= tcp_init_win
;
1972 if (inp
!= NULL
&& in_localaddr(inp
->inp_faddr
))
1973 ss
= tcp_init_win_local
;
1976 if (in6p
!= NULL
&& in6_localaddr(&in6p
->in6p_faddr
))
1977 ss
= tcp_init_win_local
;
1979 tp
->snd_cwnd
= TCP_INITIAL_WINDOW(ss
, tp
->t_peermss
);
1983 if (tiflags
& TH_ACK
) {
1984 TCP_STATINC(TCP_STAT_CONNECTS
);
1986 tcp_established(tp
);
1987 /* Do window scaling on this connection? */
1988 if ((tp
->t_flags
& (TF_RCVD_SCALE
|TF_REQ_SCALE
)) ==
1989 (TF_RCVD_SCALE
|TF_REQ_SCALE
)) {
1990 tp
->snd_scale
= tp
->requested_s_scale
;
1991 tp
->rcv_scale
= tp
->request_r_scale
;
1994 (void) tcp_reass(tp
, NULL
, (struct mbuf
*)0, &tlen
);
1995 TCP_REASS_UNLOCK(tp
);
1997 * if we didn't have to retransmit the SYN,
1998 * use its rtt as our initial srtt & rtt var.
2001 tcp_xmit_timer(tp
, tcp_now
- tp
->t_rtttime
);
2003 tp
->t_state
= TCPS_SYN_RECEIVED
;
2006 * Advance th->th_seq to correspond to first data byte.
2007 * If data, trim to stay within window,
2008 * dropping FIN if necessary.
2011 if (tlen
> tp
->rcv_wnd
) {
2012 todrop
= tlen
- tp
->rcv_wnd
;
2016 tcps
= TCP_STAT_GETREF();
2017 tcps
[TCP_STAT_RCVPACKAFTERWIN
]++;
2018 tcps
[TCP_STAT_RCVBYTEAFTERWIN
] += todrop
;
2021 tp
->snd_wl1
= th
->th_seq
- 1;
2022 tp
->rcv_up
= th
->th_seq
;
2026 * If the state is SYN_RECEIVED:
2027 * If seg contains an ACK, but not for our SYN, drop the input
2028 * and generate an RST. See page 36, rfc793
2030 case TCPS_SYN_RECEIVED
:
2031 if ((tiflags
& TH_ACK
) &&
2032 (SEQ_LEQ(th
->th_ack
, tp
->iss
) ||
2033 SEQ_GT(th
->th_ack
, tp
->snd_max
)))
2039 * States other than LISTEN or SYN_SENT.
2040 * First check timestamp, if present.
2041 * Then check that at least some bytes of segment are within
2042 * receive window. If segment begins before rcv_nxt,
2043 * drop leading data (and SYN); if nothing left, just ack.
2045 * RFC 1323 PAWS: If we have a timestamp reply on this segment
2046 * and it's less than ts_recent, drop it.
2048 if (opti
.ts_present
&& (tiflags
& TH_RST
) == 0 && tp
->ts_recent
&&
2049 TSTMP_LT(opti
.ts_val
, tp
->ts_recent
)) {
2051 /* Check to see if ts_recent is over 24 days old. */
2052 if (tcp_now
- tp
->ts_recent_age
> TCP_PAWS_IDLE
) {
2054 * Invalidate ts_recent. If this segment updates
2055 * ts_recent, the age will be reset later and ts_recent
2056 * will get a valid value. If it does not, setting
2057 * ts_recent to zero will at least satisfy the
2058 * requirement that zero be placed in the timestamp
2059 * echo reply when ts_recent isn't valid. The
2060 * age isn't reset until we get a valid ts_recent
2061 * because we don't want out-of-order segments to be
2062 * dropped when ts_recent is old.
2066 tcps
= TCP_STAT_GETREF();
2067 tcps
[TCP_STAT_RCVDUPPACK
]++;
2068 tcps
[TCP_STAT_RCVDUPBYTE
] += tlen
;
2069 tcps
[TCP_STAT_PAWSDROP
]++;
2071 tcp_new_dsack(tp
, th
->th_seq
, tlen
);
2076 todrop
= tp
->rcv_nxt
- th
->th_seq
;
2079 if (tiflags
& TH_SYN
) {
2090 if (todrop
> tlen
||
2091 (todrop
== tlen
&& (tiflags
& TH_FIN
) == 0)) {
2093 * Any valid FIN or RST must be to the left of the
2094 * window. At this point the FIN or RST must be a
2095 * duplicate or out of sequence; drop it.
2097 if (tiflags
& TH_RST
)
2099 tiflags
&= ~(TH_FIN
|TH_RST
);
2101 * Send an ACK to resynchronize and drop any data.
2102 * But keep on processing for RST or ACK.
2104 tp
->t_flags
|= TF_ACKNOW
;
2107 tcps
= TCP_STAT_GETREF();
2108 tcps
[TCP_STAT_RCVDUPPACK
]++;
2109 tcps
[TCP_STAT_RCVDUPBYTE
] += todrop
;
2111 } else if ((tiflags
& TH_RST
) &&
2112 th
->th_seq
!= tp
->rcv_nxt
) {
2114 * Test for reset before adjusting the sequence
2115 * number for overlapping data.
2117 goto dropafterack_ratelim
;
2119 tcps
= TCP_STAT_GETREF();
2120 tcps
[TCP_STAT_RCVPARTDUPPACK
]++;
2121 tcps
[TCP_STAT_RCVPARTDUPBYTE
] += todrop
;
2124 tcp_new_dsack(tp
, th
->th_seq
, todrop
);
2125 hdroptlen
+= todrop
; /*drop from head afterwards*/
2126 th
->th_seq
+= todrop
;
2128 if (th
->th_urp
> todrop
)
2129 th
->th_urp
-= todrop
;
2137 * If new data are received on a connection after the
2138 * user processes are gone, then RST the other end.
2140 if ((so
->so_state
& SS_NOFDREF
) &&
2141 tp
->t_state
> TCPS_CLOSE_WAIT
&& tlen
) {
2143 TCP_STATINC(TCP_STAT_RCVAFTERCLOSE
);
2148 * If segment ends after window, drop trailing data
2149 * (and PUSH and FIN); if nothing left, just ACK.
2151 todrop
= (th
->th_seq
+ tlen
) - (tp
->rcv_nxt
+tp
->rcv_wnd
);
2153 TCP_STATINC(TCP_STAT_RCVPACKAFTERWIN
);
2154 if (todrop
>= tlen
) {
2156 * The segment actually starts after the window.
2157 * th->th_seq + tlen - tp->rcv_nxt - tp->rcv_wnd >= tlen
2158 * th->th_seq - tp->rcv_nxt - tp->rcv_wnd >= 0
2159 * th->th_seq >= tp->rcv_nxt + tp->rcv_wnd
2161 TCP_STATADD(TCP_STAT_RCVBYTEAFTERWIN
, tlen
);
2163 * If a new connection request is received
2164 * while in TIME_WAIT, drop the old connection
2165 * and start over if the sequence numbers
2166 * are above the previous ones.
2168 * NOTE: We will checksum the packet again, and
2169 * so we need to put the header fields back into
2171 * XXX This kind of sucks, but we don't expect
2172 * XXX this to happen very often, so maybe it
2173 * XXX doesn't matter so much.
2175 if (tiflags
& TH_SYN
&&
2176 tp
->t_state
== TCPS_TIME_WAIT
&&
2177 SEQ_GT(th
->th_seq
, tp
->rcv_nxt
)) {
2179 tcp_fields_to_net(th
);
2183 * If window is closed can only take segments at
2184 * window edge, and have to drop data and PUSH from
2185 * incoming segments. Continue processing, but
2186 * remember to ack. Otherwise, drop segment
2187 * and (if not RST) ack.
2189 if (tp
->rcv_wnd
== 0 && th
->th_seq
== tp
->rcv_nxt
) {
2190 tp
->t_flags
|= TF_ACKNOW
;
2191 TCP_STATINC(TCP_STAT_RCVWINPROBE
);
2195 TCP_STATADD(TCP_STAT_RCVBYTEAFTERWIN
, todrop
);
2198 tiflags
&= ~(TH_PUSH
|TH_FIN
);
2202 * If last ACK falls within this segment's sequence numbers,
2203 * record the timestamp.
2205 * 1) That the test incorporates suggestions from the latest
2206 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
2207 * 2) That updating only on newer timestamps interferes with
2208 * our earlier PAWS tests, so this check should be solely
2209 * predicated on the sequence space of this segment.
2210 * 3) That we modify the segment boundary check to be
2211 * Last.ACK.Sent <= SEG.SEQ + SEG.Len
2212 * instead of RFC1323's
2213 * Last.ACK.Sent < SEG.SEQ + SEG.Len,
2214 * This modified check allows us to overcome RFC1323's
2215 * limitations as described in Stevens TCP/IP Illustrated
2216 * Vol. 2 p.869. In such cases, we can still calculate the
2217 * RTT correctly when RCV.NXT == Last.ACK.Sent.
2219 if (opti
.ts_present
&&
2220 SEQ_LEQ(th
->th_seq
, tp
->last_ack_sent
) &&
2221 SEQ_LEQ(tp
->last_ack_sent
, th
->th_seq
+ tlen
+
2222 ((tiflags
& (TH_SYN
|TH_FIN
)) != 0))) {
2223 tp
->ts_recent_age
= tcp_now
;
2224 tp
->ts_recent
= opti
.ts_val
;
2228 * If the RST bit is set examine the state:
2229 * SYN_RECEIVED STATE:
2230 * If passive open, return to LISTEN state.
2231 * If active open, inform user that connection was refused.
2232 * ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
2233 * Inform user that connection was reset, and close tcb.
2234 * CLOSING, LAST_ACK, TIME_WAIT STATES
2237 if (tiflags
& TH_RST
) {
2238 if (th
->th_seq
!= tp
->rcv_nxt
)
2239 goto dropafterack_ratelim
;
2241 switch (tp
->t_state
) {
2242 case TCPS_SYN_RECEIVED
:
2243 so
->so_error
= ECONNREFUSED
;
2246 case TCPS_ESTABLISHED
:
2247 case TCPS_FIN_WAIT_1
:
2248 case TCPS_FIN_WAIT_2
:
2249 case TCPS_CLOSE_WAIT
:
2250 so
->so_error
= ECONNRESET
;
2252 tp
->t_state
= TCPS_CLOSED
;
2253 TCP_STATINC(TCP_STAT_DROPS
);
2259 case TCPS_TIME_WAIT
:
2266 * Since we've covered the SYN-SENT and SYN-RECEIVED states above
2267 * we must be in a synchronized state. RFC793 states (under RST
2268 * generation) that any unacceptable segment (an out-of-order SYN
2269 * qualifies) received in a synchronized state must elicit only an
2270 * empty acknowledgment segment ... and the connection remains in
2273 if (tiflags
& TH_SYN
) {
2274 if (tp
->rcv_nxt
== th
->th_seq
) {
2275 tcp_respond(tp
, m
, m
, th
, (tcp_seq
)0, th
->th_ack
- 1,
2278 m_freem(tcp_saveti
);
2282 goto dropafterack_ratelim
;
2286 * If the ACK bit is off we drop the segment and return.
2288 if ((tiflags
& TH_ACK
) == 0) {
2289 if (tp
->t_flags
& TF_ACKNOW
)
2298 switch (tp
->t_state
) {
2301 * In SYN_RECEIVED state if the ack ACKs our SYN then enter
2302 * ESTABLISHED state and continue processing, otherwise
2305 case TCPS_SYN_RECEIVED
:
2306 if (SEQ_GT(tp
->snd_una
, th
->th_ack
) ||
2307 SEQ_GT(th
->th_ack
, tp
->snd_max
))
2309 TCP_STATINC(TCP_STAT_CONNECTS
);
2311 tcp_established(tp
);
2312 /* Do window scaling? */
2313 if ((tp
->t_flags
& (TF_RCVD_SCALE
|TF_REQ_SCALE
)) ==
2314 (TF_RCVD_SCALE
|TF_REQ_SCALE
)) {
2315 tp
->snd_scale
= tp
->requested_s_scale
;
2316 tp
->rcv_scale
= tp
->request_r_scale
;
2319 (void) tcp_reass(tp
, NULL
, (struct mbuf
*)0, &tlen
);
2320 TCP_REASS_UNLOCK(tp
);
2321 tp
->snd_wl1
= th
->th_seq
- 1;
2325 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
2326 * ACKs. If the ack is in the range
2327 * tp->snd_una < th->th_ack <= tp->snd_max
2328 * then advance tp->snd_una to th->th_ack and drop
2329 * data from the retransmission queue. If this ACK reflects
2330 * more up to date window information we update our window information.
2332 case TCPS_ESTABLISHED
:
2333 case TCPS_FIN_WAIT_1
:
2334 case TCPS_FIN_WAIT_2
:
2335 case TCPS_CLOSE_WAIT
:
2338 case TCPS_TIME_WAIT
:
2340 if (SEQ_LEQ(th
->th_ack
, tp
->snd_una
)) {
2341 if (tlen
== 0 && !dupseg
&& tiwin
== tp
->snd_wnd
) {
2342 TCP_STATINC(TCP_STAT_RCVDUPPACK
);
2344 * If we have outstanding data (other than
2345 * a window probe), this is a completely
2346 * duplicate ack (ie, window info didn't
2347 * change), the ack is the biggest we've
2348 * seen and we've seen exactly our rexmt
2349 * threshold of them, assume a packet
2350 * has been dropped and retransmit it.
2351 * Kludge snd_nxt & the congestion
2352 * window so we send only this one
2355 if (TCP_TIMER_ISARMED(tp
, TCPT_REXMT
) == 0 ||
2356 th
->th_ack
!= tp
->snd_una
)
2358 else if (tp
->t_partialacks
< 0 &&
2359 (++tp
->t_dupacks
== tcprexmtthresh
||
2360 TCP_FACK_FASTRECOV(tp
))) {
2362 * Do the fast retransmit, and adjust
2363 * congestion control parameters.
2365 if (tp
->t_congctl
->fast_retransmit(tp
, th
)) {
2366 /* False fast retransmit */
2370 } else if (tp
->t_dupacks
> tcprexmtthresh
) {
2371 tp
->snd_cwnd
+= tp
->t_segsz
;
2372 (void) tcp_output(tp
);
2377 * If the ack appears to be very old, only
2378 * allow data that is in-sequence. This
2379 * makes it somewhat more difficult to insert
2380 * forged data by guessing sequence numbers.
2381 * Send an ack to try to update the send
2382 * sequence number on the other side.
2384 if (tlen
&& th
->th_seq
!= tp
->rcv_nxt
&&
2386 tp
->snd_una
- tp
->max_sndwnd
))
2392 * If the congestion window was inflated to account
2393 * for the other side's cached packets, retract it.
2395 /* XXX: make SACK have its own congestion control
2396 * struct -- rpaulo */
2397 if (TCP_SACK_ENABLED(tp
))
2398 tcp_sack_newack(tp
, th
);
2400 tp
->t_congctl
->fast_retransmit_newack(tp
, th
);
2401 if (SEQ_GT(th
->th_ack
, tp
->snd_max
)) {
2402 TCP_STATINC(TCP_STAT_RCVACKTOOMUCH
);
2405 acked
= th
->th_ack
- tp
->snd_una
;
2406 tcps
= TCP_STAT_GETREF();
2407 tcps
[TCP_STAT_RCVACKPACK
]++;
2408 tcps
[TCP_STAT_RCVACKBYTE
] += acked
;
2412 * If we have a timestamp reply, update smoothed
2413 * round trip time. If no timestamp is present but
2414 * transmit timer is running and timed sequence
2415 * number was acked, update smoothed round trip time.
2416 * Since we now have an rtt measurement, cancel the
2417 * timer backoff (cf., Phil Karn's retransmit alg.).
2418 * Recompute the initial retransmit timer.
2421 tcp_xmit_timer(tp
, ts_rtt
);
2422 else if (tp
->t_rtttime
&& SEQ_GT(th
->th_ack
, tp
->t_rtseq
))
2423 tcp_xmit_timer(tp
, tcp_now
- tp
->t_rtttime
);
2426 * If all outstanding data is acked, stop retransmit
2427 * timer and remember to restart (more output or persist).
2428 * If there is more data to be acked, restart retransmit
2429 * timer, using current (possibly backed-off) value.
2431 if (th
->th_ack
== tp
->snd_max
) {
2432 TCP_TIMER_DISARM(tp
, TCPT_REXMT
);
2434 } else if (TCP_TIMER_ISARMED(tp
, TCPT_PERSIST
) == 0)
2435 TCP_TIMER_ARM(tp
, TCPT_REXMT
, tp
->t_rxtcur
);
2438 * New data has been acked, adjust the congestion window.
2440 tp
->t_congctl
->newack(tp
, th
);
2443 if (acked
> so
->so_snd
.sb_cc
) {
2444 tp
->snd_wnd
-= so
->so_snd
.sb_cc
;
2445 sbdrop(&so
->so_snd
, (int)so
->so_snd
.sb_cc
);
2448 if (acked
> (tp
->t_lastoff
- tp
->t_inoff
))
2450 sbdrop(&so
->so_snd
, acked
);
2451 tp
->t_lastoff
-= acked
;
2452 tp
->snd_wnd
-= acked
;
2457 icmp_check(tp
, th
, acked
);
2459 tp
->snd_una
= th
->th_ack
;
2460 if (SEQ_GT(tp
->snd_una
, tp
->snd_fack
))
2461 tp
->snd_fack
= tp
->snd_una
;
2462 if (SEQ_LT(tp
->snd_nxt
, tp
->snd_una
))
2463 tp
->snd_nxt
= tp
->snd_una
;
2464 if (SEQ_LT(tp
->snd_high
, tp
->snd_una
))
2465 tp
->snd_high
= tp
->snd_una
;
2467 switch (tp
->t_state
) {
2470 * In FIN_WAIT_1 STATE in addition to the processing
2471 * for the ESTABLISHED state if our FIN is now acknowledged
2472 * then enter FIN_WAIT_2.
2474 case TCPS_FIN_WAIT_1
:
2475 if (ourfinisacked
) {
2477 * If we can't receive any more
2478 * data, then closing user can proceed.
2479 * Starting the timer is contrary to the
2480 * specification, but if we don't get a FIN
2481 * we'll hang forever.
2483 if (so
->so_state
& SS_CANTRCVMORE
) {
2484 soisdisconnected(so
);
2485 if (tp
->t_maxidle
> 0)
2486 TCP_TIMER_ARM(tp
, TCPT_2MSL
,
2489 tp
->t_state
= TCPS_FIN_WAIT_2
;
2494 * In CLOSING STATE in addition to the processing for
2495 * the ESTABLISHED state if the ACK acknowledges our FIN
2496 * then enter the TIME-WAIT state, otherwise ignore
2500 if (ourfinisacked
) {
2501 tp
->t_state
= TCPS_TIME_WAIT
;
2502 tcp_canceltimers(tp
);
2503 TCP_TIMER_ARM(tp
, TCPT_2MSL
,
2504 2 * PR_SLOWHZ
* tcp_msl
);
2505 soisdisconnected(so
);
2510 * In LAST_ACK, we may still be waiting for data to drain
2511 * and/or to be acked, as well as for the ack of our FIN.
2512 * If our FIN is now acknowledged, delete the TCB,
2513 * enter the closed state and return.
2516 if (ourfinisacked
) {
2523 * In TIME_WAIT state the only thing that should arrive
2524 * is a retransmission of the remote FIN. Acknowledge
2525 * it and restart the finack timer.
2527 case TCPS_TIME_WAIT
:
2528 TCP_TIMER_ARM(tp
, TCPT_2MSL
, 2 * PR_SLOWHZ
* tcp_msl
);
2535 * Update window information.
2536 * Don't look at window if no ACK: TAC's send garbage on first SYN.
2538 if ((tiflags
& TH_ACK
) && (SEQ_LT(tp
->snd_wl1
, th
->th_seq
) ||
2539 (tp
->snd_wl1
== th
->th_seq
&& (SEQ_LT(tp
->snd_wl2
, th
->th_ack
) ||
2540 (tp
->snd_wl2
== th
->th_ack
&& tiwin
> tp
->snd_wnd
))))) {
2541 /* keep track of pure window updates */
2543 tp
->snd_wl2
== th
->th_ack
&& tiwin
> tp
->snd_wnd
)
2544 TCP_STATINC(TCP_STAT_RCVWINUPD
);
2545 tp
->snd_wnd
= tiwin
;
2546 tp
->snd_wl1
= th
->th_seq
;
2547 tp
->snd_wl2
= th
->th_ack
;
2548 if (tp
->snd_wnd
> tp
->max_sndwnd
)
2549 tp
->max_sndwnd
= tp
->snd_wnd
;
2554 * Process segments with URG.
2556 if ((tiflags
& TH_URG
) && th
->th_urp
&&
2557 TCPS_HAVERCVDFIN(tp
->t_state
) == 0) {
2559 * This is a kludge, but if we receive and accept
2560 * random urgent pointers, we'll crash in
2561 * soreceive. It's hard to imagine someone
2562 * actually wanting to send this much urgent data.
2564 if (th
->th_urp
+ so
->so_rcv
.sb_cc
> sb_max
) {
2565 th
->th_urp
= 0; /* XXX */
2566 tiflags
&= ~TH_URG
; /* XXX */
2567 goto dodata
; /* XXX */
2570 * If this segment advances the known urgent pointer,
2571 * then mark the data stream. This should not happen
2572 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
2573 * a FIN has been received from the remote side.
2574 * In these states we ignore the URG.
2576 * According to RFC961 (Assigned Protocols),
2577 * the urgent pointer points to the last octet
2578 * of urgent data. We continue, however,
2579 * to consider it to indicate the first octet
2580 * of data past the urgent section as the original
2581 * spec states (in one of two places).
2583 if (SEQ_GT(th
->th_seq
+th
->th_urp
, tp
->rcv_up
)) {
2584 tp
->rcv_up
= th
->th_seq
+ th
->th_urp
;
2585 so
->so_oobmark
= so
->so_rcv
.sb_cc
+
2586 (tp
->rcv_up
- tp
->rcv_nxt
) - 1;
2587 if (so
->so_oobmark
== 0)
2588 so
->so_state
|= SS_RCVATMARK
;
2590 tp
->t_oobflags
&= ~(TCPOOB_HAVEDATA
| TCPOOB_HADDATA
);
2593 * Remove out of band data so doesn't get presented to user.
2594 * This can happen independent of advancing the URG pointer,
2595 * but if two URG's are pending at once, some out-of-band
2596 * data may creep in... ick.
2598 if (th
->th_urp
<= (u_int16_t
) tlen
2600 && (so
->so_options
& SO_OOBINLINE
) == 0
2603 tcp_pulloutofband(so
, th
, m
, hdroptlen
);
2606 * If no out of band data is expected,
2607 * pull receive urgent pointer along
2608 * with the receive window.
2610 if (SEQ_GT(tp
->rcv_nxt
, tp
->rcv_up
))
2611 tp
->rcv_up
= tp
->rcv_nxt
;
2615 * Process the segment text, merging it into the TCP sequencing queue,
2616 * and arranging for acknowledgement of receipt if necessary.
2617 * This process logically involves adjusting tp->rcv_wnd as data
2618 * is presented to the user (this happens in tcp_usrreq.c,
2619 * case PRU_RCVD). If a FIN has already been received on this
2620 * connection then we just ignore the text.
2622 if ((tlen
|| (tiflags
& TH_FIN
)) &&
2623 TCPS_HAVERCVDFIN(tp
->t_state
) == 0) {
2625 * Insert segment ti into reassembly queue of tcp with
2626 * control block tp. Return TH_FIN if reassembly now includes
2627 * a segment with FIN. The macro form does the common case
2628 * inline (segment is the next to be received on an
2629 * established connection, and the queue is empty),
2630 * avoiding linkage into and removal from the queue and
2631 * repetition of various conversions.
2632 * Set DELACK for segments received in order, but ack
2633 * immediately when segments are out of order
2634 * (so fast retransmit can work).
2636 /* NOTE: this was TCP_REASS() macro, but used only once */
2638 if (th
->th_seq
== tp
->rcv_nxt
&&
2639 TAILQ_FIRST(&tp
->segq
) == NULL
&&
2640 tp
->t_state
== TCPS_ESTABLISHED
) {
2641 tcp_setup_ack(tp
, th
);
2642 tp
->rcv_nxt
+= tlen
;
2643 tiflags
= th
->th_flags
& TH_FIN
;
2644 tcps
= TCP_STAT_GETREF();
2645 tcps
[TCP_STAT_RCVPACK
]++;
2646 tcps
[TCP_STAT_RCVBYTE
] += tlen
;
2649 if (so
->so_state
& SS_CANTRCVMORE
)
2652 m_adj(m
, hdroptlen
);
2653 sbappendstream(&(so
)->so_rcv
, m
);
2655 TCP_REASS_UNLOCK(tp
);
2658 m_adj(m
, hdroptlen
);
2659 tiflags
= tcp_reass(tp
, th
, m
, &tlen
);
2660 tp
->t_flags
|= TF_ACKNOW
;
2661 TCP_REASS_UNLOCK(tp
);
2665 * Note the amount of data that peer has sent into
2666 * our window, in order to estimate the sender's
2669 len
= so
->so_rcv
.sb_hiwat
- (tp
->rcv_adv
- tp
->rcv_nxt
);
2677 * If FIN is received ACK the FIN and let the user know
2678 * that the connection is closing. Ignore a FIN received before
2679 * the connection is fully established.
2681 if ((tiflags
& TH_FIN
) && TCPS_HAVEESTABLISHED(tp
->t_state
)) {
2682 if (TCPS_HAVERCVDFIN(tp
->t_state
) == 0) {
2684 tp
->t_flags
|= TF_ACKNOW
;
2687 switch (tp
->t_state
) {
2690 * In ESTABLISHED STATE enter the CLOSE_WAIT state.
2692 case TCPS_ESTABLISHED
:
2693 tp
->t_state
= TCPS_CLOSE_WAIT
;
2697 * If still in FIN_WAIT_1 STATE FIN has not been acked so
2698 * enter the CLOSING state.
2700 case TCPS_FIN_WAIT_1
:
2701 tp
->t_state
= TCPS_CLOSING
;
2705 * In FIN_WAIT_2 state enter the TIME_WAIT state,
2706 * starting the time-wait timer, turning off the other
2709 case TCPS_FIN_WAIT_2
:
2710 tp
->t_state
= TCPS_TIME_WAIT
;
2711 tcp_canceltimers(tp
);
2712 TCP_TIMER_ARM(tp
, TCPT_2MSL
, 2 * PR_SLOWHZ
* tcp_msl
);
2713 soisdisconnected(so
);
2717 * In TIME_WAIT state restart the 2 MSL time_wait timer.
2719 case TCPS_TIME_WAIT
:
2720 TCP_TIMER_ARM(tp
, TCPT_2MSL
, 2 * PR_SLOWHZ
* tcp_msl
);
2725 if (so
->so_options
& SO_DEBUG
)
2726 tcp_trace(TA_INPUT
, ostate
, tp
, tcp_saveti
, 0);
2730 * Return any desired output.
2732 if (needoutput
|| (tp
->t_flags
& TF_ACKNOW
)) {
2733 (void) tcp_output(tp
);
2736 m_freem(tcp_saveti
);
2741 * Received a bad SYN. Increment counters and dropwithreset.
2743 TCP_STATINC(TCP_STAT_BADSYN
);
2749 * Generate an ACK dropping incoming segment if it occupies
2750 * sequence space, where the ACK reflects our state.
2752 if (tiflags
& TH_RST
)
2756 dropafterack_ratelim
:
2758 * We may want to rate-limit ACKs against SYN/RST attack.
2760 if (ppsratecheck(&tcp_ackdrop_ppslim_last
, &tcp_ackdrop_ppslim_count
,
2761 tcp_ackdrop_ppslim
) == 0) {
2765 /* ...fall into dropafterack2... */
2769 tp
->t_flags
|= TF_ACKNOW
;
2770 (void) tcp_output(tp
);
2772 m_freem(tcp_saveti
);
2775 dropwithreset_ratelim
:
2777 * We may want to rate-limit RSTs in certain situations,
2778 * particularly if we are sending an RST in response to
2779 * an attempt to connect to or otherwise communicate with
2780 * a port for which we have no socket.
2782 if (ppsratecheck(&tcp_rst_ppslim_last
, &tcp_rst_ppslim_count
,
2783 tcp_rst_ppslim
) == 0) {
2787 /* ...fall into dropwithreset... */
2791 * Generate a RST, dropping incoming segment.
2792 * Make ACK acceptable to originator of segment.
2794 if (tiflags
& TH_RST
)
2800 /* For following calls to tcp_respond */
2801 if (IN6_IS_ADDR_MULTICAST(&ip6
->ip6_dst
))
2806 if (IN_MULTICAST(ip
->ip_dst
.s_addr
) ||
2807 in_broadcast(ip
->ip_dst
, m
->m_pkthdr
.rcvif
))
2811 if (tiflags
& TH_ACK
)
2812 (void)tcp_respond(tp
, m
, m
, th
, (tcp_seq
)0, th
->th_ack
, TH_RST
);
2814 if (tiflags
& TH_SYN
)
2816 (void)tcp_respond(tp
, m
, m
, th
, th
->th_seq
+ tlen
, (tcp_seq
)0,
2820 m_freem(tcp_saveti
);
2826 * Drop space held by incoming segment and return.
2830 so
= tp
->t_inpcb
->inp_socket
;
2832 else if (tp
->t_in6pcb
)
2833 so
= tp
->t_in6pcb
->in6p_socket
;
2838 if (so
&& (so
->so_options
& SO_DEBUG
) != 0)
2839 tcp_trace(TA_DROP
, ostate
, tp
, tcp_saveti
, 0);
2843 m_freem(tcp_saveti
);
2848 #ifdef TCP_SIGNATURE
2850 tcp_signature_apply(void *fstate
, void *data
, u_int len
)
2853 MD5Update(fstate
, (u_char
*)data
, len
);
2858 tcp_signature_getsav(struct mbuf
*m
, struct tcphdr
*th
)
2860 struct secasvar
*sav
;
2862 union sockaddr_union dst
;
2865 struct ip6_hdr
*ip6
;
2867 ip
= mtod(m
, struct ip
*);
2870 ip
= mtod(m
, struct ip
*);
2875 ip6
= mtod(m
, struct ip6_hdr
*);
2882 /* Extract the destination from the IP header in the mbuf. */
2883 memset(&dst
, 0, sizeof(union sockaddr_union
));
2885 dst
.sa
.sa_len
= sizeof(struct sockaddr_in
);
2886 dst
.sa
.sa_family
= AF_INET
;
2887 dst
.sin
.sin_addr
= ip
->ip_dst
;
2889 dst
.sa
.sa_len
= sizeof(struct sockaddr_in6
);
2890 dst
.sa
.sa_family
= AF_INET6
;
2891 dst
.sin6
.sin6_addr
= ip6
->ip6_dst
;
2895 * Look up an SADB entry which matches the address of the peer.
2897 sav
= KEY_ALLOCSA(&dst
, IPPROTO_TCP
, htonl(TCP_SIG_SPI
));
2900 sav
= key_allocsa(AF_INET
, (void *)&ip
->ip_src
,
2901 (void *)&ip
->ip_dst
, IPPROTO_TCP
,
2902 htonl(TCP_SIG_SPI
), 0, 0);
2904 sav
= key_allocsa(AF_INET6
, (void *)&ip6
->ip6_src
,
2905 (void *)&ip6
->ip6_dst
, IPPROTO_TCP
,
2906 htonl(TCP_SIG_SPI
), 0, 0);
2909 return (sav
); /* freesav must be performed by caller */
2913 tcp_signature(struct mbuf
*m
, struct tcphdr
*th
, int thoff
,
2914 struct secasvar
*sav
, char *sig
)
2918 struct ipovly
*ipovly
;
2919 struct ip6_hdr
*ip6
;
2920 struct ippseudo ippseudo
;
2921 struct ip6_hdr_pseudo ip6pseudo
;
2928 tcphdrlen
= th
->th_off
* 4;
2930 switch (mtod(m
, struct ip
*)->ip_v
) {
2932 ip
= mtod(m
, struct ip
*);
2937 ip6
= mtod(m
, struct ip6_hdr
*);
2946 memset(&ippseudo
, 0, sizeof(ippseudo
));
2947 ipovly
= (struct ipovly
*)ip
;
2948 ippseudo
.ippseudo_src
= ipovly
->ih_src
;
2949 ippseudo
.ippseudo_dst
= ipovly
->ih_dst
;
2950 ippseudo
.ippseudo_pad
= 0;
2951 ippseudo
.ippseudo_p
= IPPROTO_TCP
;
2952 ippseudo
.ippseudo_len
= htons(m
->m_pkthdr
.len
- thoff
);
2953 MD5Update(&ctx
, (char *)&ippseudo
, sizeof(ippseudo
));
2955 memset(&ip6pseudo
, 0, sizeof(ip6pseudo
));
2956 ip6pseudo
.ip6ph_src
= ip6
->ip6_src
;
2957 in6_clearscope(&ip6pseudo
.ip6ph_src
);
2958 ip6pseudo
.ip6ph_dst
= ip6
->ip6_dst
;
2959 in6_clearscope(&ip6pseudo
.ip6ph_dst
);
2960 ip6pseudo
.ip6ph_len
= htons(m
->m_pkthdr
.len
- thoff
);
2961 ip6pseudo
.ip6ph_nxt
= IPPROTO_TCP
;
2962 MD5Update(&ctx
, (char *)&ip6pseudo
, sizeof(ip6pseudo
));
2967 MD5Update(&ctx
, (char *)&th0
, sizeof(th0
));
2969 l
= m
->m_pkthdr
.len
- thoff
- tcphdrlen
;
2971 m_apply(m
, thoff
+ tcphdrlen
,
2972 m
->m_pkthdr
.len
- thoff
- tcphdrlen
,
2973 tcp_signature_apply
, &ctx
);
2975 MD5Update(&ctx
, _KEYBUF(sav
->key_auth
), _KEYLEN(sav
->key_auth
));
2976 MD5Final(sig
, &ctx
);
2983 tcp_dooptions(struct tcpcb
*tp
, const u_char
*cp
, int cnt
,
2985 struct mbuf
*m
, int toff
, struct tcp_opt_info
*oi
)
2988 int opt
, optlen
= 0;
2989 #ifdef TCP_SIGNATURE
2991 char sigbuf
[TCP_SIGLEN
];
2992 struct secasvar
*sav
= NULL
;
2995 for (; cp
&& cnt
> 0; cnt
-= optlen
, cp
+= optlen
) {
2997 if (opt
== TCPOPT_EOL
)
2999 if (opt
== TCPOPT_NOP
)
3005 if (optlen
< 2 || optlen
> cnt
)
3014 if (optlen
!= TCPOLEN_MAXSEG
)
3016 if (!(th
->th_flags
& TH_SYN
))
3018 if (TCPS_HAVERCVDSYN(tp
->t_state
))
3020 bcopy(cp
+ 2, &mss
, sizeof(mss
));
3021 oi
->maxseg
= ntohs(mss
);
3025 if (optlen
!= TCPOLEN_WINDOW
)
3027 if (!(th
->th_flags
& TH_SYN
))
3029 if (TCPS_HAVERCVDSYN(tp
->t_state
))
3031 tp
->t_flags
|= TF_RCVD_SCALE
;
3032 tp
->requested_s_scale
= cp
[2];
3033 if (tp
->requested_s_scale
> TCP_MAX_WINSHIFT
) {
3038 p
= ntohl(ip
->ip_src
);
3041 p
= ip6_sprintf(&ip6
->ip6_src
);
3045 log(LOG_ERR
, "TCP: invalid wscale %d from %s, "
3047 tp
->requested_s_scale
, p
,
3050 log(LOG_ERR
, "TCP: invalid wscale %d, "
3052 tp
->requested_s_scale
,
3055 tp
->requested_s_scale
= TCP_MAX_WINSHIFT
;
3059 case TCPOPT_TIMESTAMP
:
3060 if (optlen
!= TCPOLEN_TIMESTAMP
)
3063 bcopy(cp
+ 2, &oi
->ts_val
, sizeof(oi
->ts_val
));
3065 bcopy(cp
+ 6, &oi
->ts_ecr
, sizeof(oi
->ts_ecr
));
3068 if (!(th
->th_flags
& TH_SYN
))
3070 if (TCPS_HAVERCVDSYN(tp
->t_state
))
3073 * A timestamp received in a SYN makes
3074 * it ok to send timestamp requests and replies.
3076 tp
->t_flags
|= TF_RCVD_TSTMP
;
3077 tp
->ts_recent
= oi
->ts_val
;
3078 tp
->ts_recent_age
= tcp_now
;
3081 case TCPOPT_SACK_PERMITTED
:
3082 if (optlen
!= TCPOLEN_SACK_PERMITTED
)
3084 if (!(th
->th_flags
& TH_SYN
))
3086 if (TCPS_HAVERCVDSYN(tp
->t_state
))
3089 tp
->t_flags
|= TF_SACK_PERMIT
;
3090 tp
->t_flags
|= TF_WILL_SACK
;
3095 tcp_sack_option(tp
, th
, cp
, optlen
);
3097 #ifdef TCP_SIGNATURE
3098 case TCPOPT_SIGNATURE
:
3099 if (optlen
!= TCPOLEN_SIGNATURE
)
3101 if (sigp
&& memcmp(sigp
, cp
+ 2, TCP_SIGLEN
))
3105 memcpy(sigbuf
, cp
+ 2, TCP_SIGLEN
);
3106 tp
->t_flags
|= TF_SIGNATURE
;
3112 #ifdef TCP_SIGNATURE
3113 if (tp
->t_flags
& TF_SIGNATURE
) {
3115 sav
= tcp_signature_getsav(m
, th
);
3117 if (sav
== NULL
&& tp
->t_state
== TCPS_LISTEN
)
3121 if ((sigp
? TF_SIGNATURE
: 0) ^ (tp
->t_flags
& TF_SIGNATURE
)) {
3133 char sig
[TCP_SIGLEN
];
3135 tcp_fields_to_net(th
);
3136 if (tcp_signature(m
, th
, toff
, sav
, sig
) < 0) {
3137 tcp_fields_to_host(th
);
3147 tcp_fields_to_host(th
);
3149 if (memcmp(sig
, sigp
, TCP_SIGLEN
)) {
3150 TCP_STATINC(TCP_STAT_BADSIG
);
3160 TCP_STATINC(TCP_STAT_GOODSIG
);
3162 key_sa_recordxfer(sav
, m
);
3175 * Pull out of band byte out of a segment so
3176 * it doesn't appear in the user's data queue.
3177 * It is still reflected in the segment length for
3178 * sequencing purposes.
3181 tcp_pulloutofband(struct socket
*so
, struct tcphdr
*th
,
3182 struct mbuf
*m
, int off
)
3184 int cnt
= off
+ th
->th_urp
- 1;
3187 if (m
->m_len
> cnt
) {
3188 char *cp
= mtod(m
, char *) + cnt
;
3189 struct tcpcb
*tp
= sototcpcb(so
);
3192 tp
->t_oobflags
|= TCPOOB_HAVEDATA
;
3193 bcopy(cp
+1, cp
, (unsigned)(m
->m_len
- cnt
- 1));
3202 panic("tcp_pulloutofband");
3206 * Collect new round-trip time estimate
3207 * and update averages and current timeout.
3210 tcp_xmit_timer(struct tcpcb
*tp
, uint32_t rtt
)
3214 TCP_STATINC(TCP_STAT_RTTUPDATED
);
3215 if (tp
->t_srtt
!= 0) {
3217 * srtt is stored as fixed point with 3 bits after the
3218 * binary point (i.e., scaled by 8). The following magic
3219 * is equivalent to the smoothing algorithm in rfc793 with
3220 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
3221 * point). Adjust rtt to origin 0.
3223 delta
= (rtt
<< 2) - (tp
->t_srtt
>> TCP_RTT_SHIFT
);
3224 if ((tp
->t_srtt
+= delta
) <= 0)
3225 tp
->t_srtt
= 1 << 2;
3227 * We accumulate a smoothed rtt variance (actually, a
3228 * smoothed mean difference), then set the retransmit
3229 * timer to smoothed rtt + 4 times the smoothed variance.
3230 * rttvar is stored as fixed point with 2 bits after the
3231 * binary point (scaled by 4). The following is
3232 * equivalent to rfc793 smoothing with an alpha of .75
3233 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces
3234 * rfc793's wired-in beta.
3238 delta
-= (tp
->t_rttvar
>> TCP_RTTVAR_SHIFT
);
3239 if ((tp
->t_rttvar
+= delta
) <= 0)
3240 tp
->t_rttvar
= 1 << 2;
3243 * No rtt measurement yet - use the unsmoothed rtt.
3244 * Set the variance to half the rtt (so our first
3245 * retransmit happens at 3*rtt).
3247 tp
->t_srtt
= rtt
<< (TCP_RTT_SHIFT
+ 2);
3248 tp
->t_rttvar
= rtt
<< (TCP_RTTVAR_SHIFT
+ 2 - 1);
3254 * the retransmit should happen at rtt + 4 * rttvar.
3255 * Because of the way we do the smoothing, srtt and rttvar
3256 * will each average +1/2 tick of bias. When we compute
3257 * the retransmit timer, we want 1/2 tick of rounding and
3258 * 1 extra tick because of +-1/2 tick uncertainty in the
3259 * firing of the timer. The bias will give us exactly the
3260 * 1.5 tick we need. But, because the bias is
3261 * statistical, we have to test that we don't drop below
3262 * the minimum feasible timer (which is 2 ticks).
3264 TCPT_RANGESET(tp
->t_rxtcur
, TCP_REXMTVAL(tp
),
3265 max(tp
->t_rttmin
, rtt
+ 2), TCPTV_REXMTMAX
);
3268 * We received an ack for a packet that wasn't retransmitted;
3269 * it is probably safe to discard any error indications we've
3270 * received recently. This isn't quite right, but close enough
3271 * for now (a route might have failed after we sent a segment,
3272 * and the return path might not be symmetrical).
3274 tp
->t_softerror
= 0;
3279 * TCP compressed state engine. Currently used to hold compressed
3280 * state for SYN_RECEIVED.
3283 u_long syn_cache_count
;
3284 u_int32_t syn_hash1
, syn_hash2
;
3286 #define SYN_HASH(sa, sp, dp) \
3287 ((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \
3288 ((u_int32_t)(sp)))^syn_hash2)))
3290 #define SYN_HASHALL(hash, src, dst) \
3292 hash = SYN_HASH(&((const struct sockaddr_in *)(src))->sin_addr, \
3293 ((const struct sockaddr_in *)(src))->sin_port, \
3294 ((const struct sockaddr_in *)(dst))->sin_port); \
3295 } while (/*CONSTCOND*/ 0)
3297 #define SYN_HASH6(sa, sp, dp) \
3298 ((((sa)->s6_addr32[0] ^ (sa)->s6_addr32[3] ^ syn_hash1) * \
3299 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp)))^syn_hash2)) \
3302 #define SYN_HASHALL(hash, src, dst) \
3304 switch ((src)->sa_family) { \
3306 hash = SYN_HASH(&((const struct sockaddr_in *)(src))->sin_addr, \
3307 ((const struct sockaddr_in *)(src))->sin_port, \
3308 ((const struct sockaddr_in *)(dst))->sin_port); \
3311 hash = SYN_HASH6(&((const struct sockaddr_in6 *)(src))->sin6_addr, \
3312 ((const struct sockaddr_in6 *)(src))->sin6_port, \
3313 ((const struct sockaddr_in6 *)(dst))->sin6_port); \
3318 } while (/*CONSTCOND*/0)
3321 static struct pool syn_cache_pool
;
3324 * We don't estimate RTT with SYNs, so each packet starts with the default
3325 * RTT and each timer step has a fixed timeout value.
3327 #define SYN_CACHE_TIMER_ARM(sc) \
3329 TCPT_RANGESET((sc)->sc_rxtcur, \
3330 TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN, \
3332 callout_reset(&(sc)->sc_timer, \
3333 (sc)->sc_rxtcur * (hz / PR_SLOWHZ), syn_cache_timer, (sc)); \
3334 } while (/*CONSTCOND*/0)
3336 #define SYN_CACHE_TIMESTAMP(sc) (tcp_now - (sc)->sc_timebase)
3339 syn_cache_rm(struct syn_cache
*sc
)
3341 TAILQ_REMOVE(&tcp_syn_cache
[sc
->sc_bucketidx
].sch_bucket
,
3344 LIST_REMOVE(sc
, sc_tpq
);
3345 tcp_syn_cache
[sc
->sc_bucketidx
].sch_length
--;
3346 callout_stop(&sc
->sc_timer
);
3351 syn_cache_put(struct syn_cache
*sc
)
3354 (void) m_free(sc
->sc_ipopts
);
3355 rtcache_free(&sc
->sc_route
);
3356 if (callout_invoking(&sc
->sc_timer
))
3357 sc
->sc_flags
|= SCF_DEAD
;
3359 callout_destroy(&sc
->sc_timer
);
3360 pool_put(&syn_cache_pool
, sc
);
3365 syn_cache_init(void)
3369 pool_init(&syn_cache_pool
, sizeof(struct syn_cache
), 0, 0, 0,
3370 "synpl", NULL
, IPL_SOFTNET
);
3372 /* Initialize the hash buckets. */
3373 for (i
= 0; i
< tcp_syn_cache_size
; i
++)
3374 TAILQ_INIT(&tcp_syn_cache
[i
].sch_bucket
);
3378 syn_cache_insert(struct syn_cache
*sc
, struct tcpcb
*tp
)
3380 struct syn_cache_head
*scp
;
3381 struct syn_cache
*sc2
;
3385 * If there are no entries in the hash table, reinitialize
3388 if (syn_cache_count
== 0) {
3389 syn_hash1
= arc4random();
3390 syn_hash2
= arc4random();
3393 SYN_HASHALL(sc
->sc_hash
, &sc
->sc_src
.sa
, &sc
->sc_dst
.sa
);
3394 sc
->sc_bucketidx
= sc
->sc_hash
% tcp_syn_cache_size
;
3395 scp
= &tcp_syn_cache
[sc
->sc_bucketidx
];
3398 * Make sure that we don't overflow the per-bucket
3399 * limit or the total cache size limit.
3402 if (scp
->sch_length
>= tcp_syn_bucket_limit
) {
3403 TCP_STATINC(TCP_STAT_SC_BUCKETOVERFLOW
);
3405 * The bucket is full. Toss the oldest element in the
3406 * bucket. This will be the first entry in the bucket.
3408 sc2
= TAILQ_FIRST(&scp
->sch_bucket
);
3411 * This should never happen; we should always find an
3412 * entry in our bucket.
3415 panic("syn_cache_insert: bucketoverflow: impossible");
3418 syn_cache_put(sc2
); /* calls pool_put but see spl above */
3419 } else if (syn_cache_count
>= tcp_syn_cache_limit
) {
3420 struct syn_cache_head
*scp2
, *sce
;
3422 TCP_STATINC(TCP_STAT_SC_OVERFLOWED
);
3424 * The cache is full. Toss the oldest entry in the
3425 * first non-empty bucket we can find.
3427 * XXX We would really like to toss the oldest
3428 * entry in the cache, but we hope that this
3429 * condition doesn't happen very often.
3432 if (TAILQ_EMPTY(&scp2
->sch_bucket
)) {
3433 sce
= &tcp_syn_cache
[tcp_syn_cache_size
];
3434 for (++scp2
; scp2
!= scp
; scp2
++) {
3436 scp2
= &tcp_syn_cache
[0];
3437 if (! TAILQ_EMPTY(&scp2
->sch_bucket
))
3442 * This should never happen; we should always find a
3446 panic("syn_cache_insert: cacheoverflow: "
3450 sc2
= TAILQ_FIRST(&scp2
->sch_bucket
);
3452 syn_cache_put(sc2
); /* calls pool_put but see spl above */
3456 * Initialize the entry's timer.
3459 sc
->sc_rxtshift
= 0;
3460 SYN_CACHE_TIMER_ARM(sc
);
3462 /* Link it from tcpcb entry */
3463 LIST_INSERT_HEAD(&tp
->t_sc
, sc
, sc_tpq
);
3465 /* Put it into the bucket. */
3466 TAILQ_INSERT_TAIL(&scp
->sch_bucket
, sc
, sc_bucketq
);
3470 TCP_STATINC(TCP_STAT_SC_ADDED
);
3475 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted.
3476 * If we have retransmitted an entry the maximum number of times, expire
3480 syn_cache_timer(void *arg
)
3482 struct syn_cache
*sc
= arg
;
3484 mutex_enter(softnet_lock
);
3485 KERNEL_LOCK(1, NULL
);
3486 callout_ack(&sc
->sc_timer
);
3488 if (__predict_false(sc
->sc_flags
& SCF_DEAD
)) {
3489 TCP_STATINC(TCP_STAT_SC_DELAYED_FREE
);
3490 callout_destroy(&sc
->sc_timer
);
3491 pool_put(&syn_cache_pool
, sc
);
3492 KERNEL_UNLOCK_ONE(NULL
);
3493 mutex_exit(softnet_lock
);
3497 if (__predict_false(sc
->sc_rxtshift
== TCP_MAXRXTSHIFT
)) {
3498 /* Drop it -- too many retransmissions. */
3503 * Compute the total amount of time this entry has
3504 * been on a queue. If this entry has been on longer
3505 * than the keep alive timer would allow, expire it.
3507 sc
->sc_rxttot
+= sc
->sc_rxtcur
;
3508 if (sc
->sc_rxttot
>= tcp_keepinit
)
3511 TCP_STATINC(TCP_STAT_SC_RETRANSMITTED
);
3512 (void) syn_cache_respond(sc
, NULL
);
3514 /* Advance the timer back-off. */
3516 SYN_CACHE_TIMER_ARM(sc
);
3518 KERNEL_UNLOCK_ONE(NULL
);
3519 mutex_exit(softnet_lock
);
3523 TCP_STATINC(TCP_STAT_SC_TIMED_OUT
);
3525 syn_cache_put(sc
); /* calls pool_put but see spl above */
3526 KERNEL_UNLOCK_ONE(NULL
);
3527 mutex_exit(softnet_lock
);
3531 * Remove the syn cache entries created by the specified tcb entry,
3532 * because it does not make sense to keep them
3533 * (if there's no tcb entry, the syn cache entries will never be used)
3536 syn_cache_cleanup(struct tcpcb
*tp
)
3538 struct syn_cache
*sc
, *nsc
;
3543 for (sc
= LIST_FIRST(&tp
->t_sc
); sc
!= NULL
; sc
= nsc
) {
3544 nsc
= LIST_NEXT(sc
, sc_tpq
);
3547 if (sc
->sc_tp
!= tp
)
3548 panic("invalid sc_tp in syn_cache_cleanup");
3551 syn_cache_put(sc
); /* calls pool_put but see spl above */
3553 /* just for safety */
3554 LIST_INIT(&tp
->t_sc
);
3560 * Find an entry in the syn cache.
3563 syn_cache_lookup(const struct sockaddr
*src
, const struct sockaddr
*dst
,
3564 struct syn_cache_head
**headp
)
3566 struct syn_cache
*sc
;
3567 struct syn_cache_head
*scp
;
3571 SYN_HASHALL(hash
, src
, dst
);
3573 scp
= &tcp_syn_cache
[hash
% tcp_syn_cache_size
];
3576 for (sc
= TAILQ_FIRST(&scp
->sch_bucket
); sc
!= NULL
;
3577 sc
= TAILQ_NEXT(sc
, sc_bucketq
)) {
3578 if (sc
->sc_hash
!= hash
)
3580 if (!memcmp(&sc
->sc_src
, src
, src
->sa_len
) &&
3581 !memcmp(&sc
->sc_dst
, dst
, dst
->sa_len
)) {
3591 * This function gets called when we receive an ACK for a
3592 * socket in the LISTEN state. We look up the connection
3593 * in the syn cache, and if it's there, we pull it out of
3594 * the cache and turn it into a full-blown connection in
3595 * the SYN-RECEIVED state.
3597 * The return values may not be immediately obvious, and their effects
3598 * can be subtle, so here they are:
3600 * NULL SYN was not found in cache; caller should drop the
3601 * packet and send an RST.
3603 * -1 We were unable to create the new connection, and are
3604 * aborting it. An ACK,RST is being sent to the peer
3605 * (unless we got screwy sequence numbers; see below),
3606 * because the 3-way handshake has been completed. Caller
3607 * should not free the mbuf, since we may be using it. If
3608 * we are not, we will free it.
3610 * Otherwise, the return value is a pointer to the new socket
3611 * associated with the connection.
3614 syn_cache_get(struct sockaddr
*src
, struct sockaddr
*dst
,
3615 struct tcphdr
*th
, unsigned int hlen
, unsigned int tlen
,
3616 struct socket
*so
, struct mbuf
*m
)
3618 struct syn_cache
*sc
;
3619 struct syn_cache_head
*scp
;
3620 struct inpcb
*inp
= NULL
;
3622 struct in6pcb
*in6p
= NULL
;
3624 struct tcpcb
*tp
= 0;
3630 if ((sc
= syn_cache_lookup(src
, dst
, &scp
)) == NULL
) {
3636 * Verify the sequence and ack numbers. Try getting the correct
3639 if ((th
->th_ack
!= sc
->sc_iss
+ 1) ||
3640 SEQ_LEQ(th
->th_seq
, sc
->sc_irs
) ||
3641 SEQ_GT(th
->th_seq
, sc
->sc_irs
+ 1 + sc
->sc_win
)) {
3642 (void) syn_cache_respond(sc
, m
);
3644 return ((struct socket
*)(-1));
3647 /* Remove this cache entry */
3652 * Ok, create the full blown connection, and set things up
3653 * as they would have been set up if we had created the
3654 * connection when the SYN arrived. If we can't create
3655 * the connection, abort it.
3658 * inp still has the OLD in_pcb stuff, set the
3659 * v6-related flags on the new guy, too. This is
3660 * done particularly for the case where an AF_INET6
3661 * socket is bound only to a port, and a v4 connection
3662 * comes in on that port.
3663 * we also copy the flowinfo from the original pcb
3667 so
= sonewconn(so
, SS_ISCONNECTED
);
3671 switch (so
->so_proto
->pr_domain
->dom_family
) {
3674 inp
= sotoinpcb(so
);
3679 in6p
= sotoin6pcb(so
);
3683 switch (src
->sa_family
) {
3687 inp
->inp_laddr
= ((struct sockaddr_in
*)dst
)->sin_addr
;
3688 inp
->inp_lport
= ((struct sockaddr_in
*)dst
)->sin_port
;
3689 inp
->inp_options
= ip_srcroute();
3690 in_pcbstate(inp
, INP_BOUND
);
3691 if (inp
->inp_options
== NULL
) {
3692 inp
->inp_options
= sc
->sc_ipopts
;
3693 sc
->sc_ipopts
= NULL
;
3698 /* IPv4 packet to AF_INET6 socket */
3699 memset(&in6p
->in6p_laddr
, 0, sizeof(in6p
->in6p_laddr
));
3700 in6p
->in6p_laddr
.s6_addr16
[5] = htons(0xffff);
3701 bcopy(&((struct sockaddr_in
*)dst
)->sin_addr
,
3702 &in6p
->in6p_laddr
.s6_addr32
[3],
3703 sizeof(((struct sockaddr_in
*)dst
)->sin_addr
));
3704 in6p
->in6p_lport
= ((struct sockaddr_in
*)dst
)->sin_port
;
3705 in6totcpcb(in6p
)->t_family
= AF_INET
;
3706 if (sotoin6pcb(oso
)->in6p_flags
& IN6P_IPV6_V6ONLY
)
3707 in6p
->in6p_flags
|= IN6P_IPV6_V6ONLY
;
3709 in6p
->in6p_flags
&= ~IN6P_IPV6_V6ONLY
;
3710 in6_pcbstate(in6p
, IN6P_BOUND
);
3718 in6p
->in6p_laddr
= ((struct sockaddr_in6
*)dst
)->sin6_addr
;
3719 in6p
->in6p_lport
= ((struct sockaddr_in6
*)dst
)->sin6_port
;
3720 in6_pcbstate(in6p
, IN6P_BOUND
);
3726 if (in6p
&& in6totcpcb(in6p
)->t_family
== AF_INET6
&& sotoinpcb(oso
)) {
3727 struct in6pcb
*oin6p
= sotoin6pcb(oso
);
3728 /* inherit socket options from the listening socket */
3729 in6p
->in6p_flags
|= (oin6p
->in6p_flags
& IN6P_CONTROLOPTS
);
3730 if (in6p
->in6p_flags
& IN6P_CONTROLOPTS
) {
3731 m_freem(in6p
->in6p_options
);
3732 in6p
->in6p_options
= 0;
3734 ip6_savecontrol(in6p
, &in6p
->in6p_options
,
3735 mtod(m
, struct ip6_hdr
*), m
);
3739 #if defined(IPSEC) || defined(FAST_IPSEC)
3741 * we make a copy of policy, instead of sharing the policy,
3742 * for better behavior in terms of SA lookup and dead SA removal.
3745 /* copy old policy into new socket's */
3746 if (ipsec_copy_pcbpolicy(sotoinpcb(oso
)->inp_sp
, inp
->inp_sp
))
3747 printf("tcp_input: could not copy policy\n");
3751 /* copy old policy into new socket's */
3752 if (ipsec_copy_pcbpolicy(sotoin6pcb(oso
)->in6p_sp
,
3754 printf("tcp_input: could not copy policy\n");
3760 * Give the new socket our cached route reference.
3763 rtcache_copy(&inp
->inp_route
, &sc
->sc_route
);
3764 rtcache_free(&sc
->sc_route
);
3768 rtcache_copy(&in6p
->in6p_route
, &sc
->sc_route
);
3769 rtcache_free(&sc
->sc_route
);
3773 am
= m_get(M_DONTWAIT
, MT_SONAME
); /* XXX */
3776 MCLAIM(am
, &tcp_mowner
);
3777 am
->m_len
= src
->sa_len
;
3778 bcopy(src
, mtod(am
, void *), src
->sa_len
);
3780 if (in_pcbconnect(inp
, am
, &lwp0
)) {
3787 if (src
->sa_family
== AF_INET
) {
3788 /* IPv4 packet to AF_INET6 socket */
3789 struct sockaddr_in6
*sin6
;
3790 sin6
= mtod(am
, struct sockaddr_in6
*);
3791 am
->m_len
= sizeof(*sin6
);
3792 memset(sin6
, 0, sizeof(*sin6
));
3793 sin6
->sin6_family
= AF_INET6
;
3794 sin6
->sin6_len
= sizeof(*sin6
);
3795 sin6
->sin6_port
= ((struct sockaddr_in
*)src
)->sin_port
;
3796 sin6
->sin6_addr
.s6_addr16
[5] = htons(0xffff);
3797 bcopy(&((struct sockaddr_in
*)src
)->sin_addr
,
3798 &sin6
->sin6_addr
.s6_addr32
[3],
3799 sizeof(sin6
->sin6_addr
.s6_addr32
[3]));
3801 if (in6_pcbconnect(in6p
, am
, NULL
)) {
3814 tp
= intotcpcb(inp
);
3817 tp
= in6totcpcb(in6p
);
3821 tp
->t_flags
= sototcpcb(oso
)->t_flags
& TF_NODELAY
;
3822 if (sc
->sc_request_r_scale
!= 15) {
3823 tp
->requested_s_scale
= sc
->sc_requested_s_scale
;
3824 tp
->request_r_scale
= sc
->sc_request_r_scale
;
3825 tp
->snd_scale
= sc
->sc_requested_s_scale
;
3826 tp
->rcv_scale
= sc
->sc_request_r_scale
;
3827 tp
->t_flags
|= TF_REQ_SCALE
|TF_RCVD_SCALE
;
3829 if (sc
->sc_flags
& SCF_TIMESTAMP
)
3830 tp
->t_flags
|= TF_REQ_TSTMP
|TF_RCVD_TSTMP
;
3831 tp
->ts_timebase
= sc
->sc_timebase
;
3833 tp
->t_template
= tcp_template(tp
);
3834 if (tp
->t_template
== 0) {
3835 tp
= tcp_drop(tp
, ENOBUFS
); /* destroys socket */
3841 tp
->iss
= sc
->sc_iss
;
3842 tp
->irs
= sc
->sc_irs
;
3843 tcp_sendseqinit(tp
);
3845 tp
->t_state
= TCPS_SYN_RECEIVED
;
3846 TCP_TIMER_ARM(tp
, TCPT_KEEP
, tp
->t_keepinit
);
3847 TCP_STATINC(TCP_STAT_ACCEPTS
);
3849 if ((sc
->sc_flags
& SCF_SACK_PERMIT
) && tcp_do_sack
)
3850 tp
->t_flags
|= TF_WILL_SACK
;
3852 if ((sc
->sc_flags
& SCF_ECN_PERMIT
) && tcp_do_ecn
)
3853 tp
->t_flags
|= TF_ECN_PERMIT
;
3855 #ifdef TCP_SIGNATURE
3856 if (sc
->sc_flags
& SCF_SIGNATURE
)
3857 tp
->t_flags
|= TF_SIGNATURE
;
3860 /* Initialize tp->t_ourmss before we deal with the peer's! */
3861 tp
->t_ourmss
= sc
->sc_ourmaxseg
;
3862 tcp_mss_from_peer(tp
, sc
->sc_peermaxseg
);
3865 * Initialize the initial congestion window. If we
3866 * had to retransmit the SYN,ACK, we must initialize cwnd
3867 * to 1 segment (i.e. the Loss Window).
3869 if (sc
->sc_rxtshift
)
3870 tp
->snd_cwnd
= tp
->t_peermss
;
3872 int ss
= tcp_init_win
;
3874 if (inp
!= NULL
&& in_localaddr(inp
->inp_faddr
))
3875 ss
= tcp_init_win_local
;
3878 if (in6p
!= NULL
&& in6_localaddr(&in6p
->in6p_faddr
))
3879 ss
= tcp_init_win_local
;
3881 tp
->snd_cwnd
= TCP_INITIAL_WINDOW(ss
, tp
->t_peermss
);
3885 tp
->snd_wl1
= sc
->sc_irs
;
3886 tp
->rcv_up
= sc
->sc_irs
+ 1;
3889 * This is what would have happened in tcp_output() when
3890 * the SYN,ACK was sent.
3892 tp
->snd_up
= tp
->snd_una
;
3893 tp
->snd_max
= tp
->snd_nxt
= tp
->iss
+1;
3894 TCP_TIMER_ARM(tp
, TCPT_REXMT
, tp
->t_rxtcur
);
3895 if (sc
->sc_win
> 0 && SEQ_GT(tp
->rcv_nxt
+ sc
->sc_win
, tp
->rcv_adv
))
3896 tp
->rcv_adv
= tp
->rcv_nxt
+ sc
->sc_win
;
3897 tp
->last_ack_sent
= tp
->rcv_nxt
;
3898 tp
->t_partialacks
= -1;
3901 TCP_STATINC(TCP_STAT_SC_COMPLETED
);
3908 (void)tcp_respond(NULL
, m
, m
, th
, (tcp_seq
)0, th
->th_ack
, TH_RST
);
3911 (void) soqremque(so
, 1);
3913 mutex_enter(softnet_lock
);
3918 TCP_STATINC(TCP_STAT_SC_ABORTED
);
3919 return ((struct socket
*)(-1));
3923 * This function is called when we get a RST for a
3924 * non-existent connection, so that we can see if the
3925 * connection is in the syn cache. If it is, zap it.
3929 syn_cache_reset(struct sockaddr
*src
, struct sockaddr
*dst
, struct tcphdr
*th
)
3931 struct syn_cache
*sc
;
3932 struct syn_cache_head
*scp
;
3933 int s
= splsoftnet();
3935 if ((sc
= syn_cache_lookup(src
, dst
, &scp
)) == NULL
) {
3939 if (SEQ_LT(th
->th_seq
, sc
->sc_irs
) ||
3940 SEQ_GT(th
->th_seq
, sc
->sc_irs
+1)) {
3945 TCP_STATINC(TCP_STAT_SC_RESET
);
3946 syn_cache_put(sc
); /* calls pool_put but see spl above */
3951 syn_cache_unreach(const struct sockaddr
*src
, const struct sockaddr
*dst
,
3954 struct syn_cache
*sc
;
3955 struct syn_cache_head
*scp
;
3959 if ((sc
= syn_cache_lookup(src
, dst
, &scp
)) == NULL
) {
3963 /* If the sequence number != sc_iss, then it's a bogus ICMP msg */
3964 if (ntohl (th
->th_seq
) != sc
->sc_iss
) {
3970 * If we've retransmitted 3 times and this is our second error,
3971 * we remove the entry. Otherwise, we allow it to continue on.
3972 * This prevents us from incorrectly nuking an entry during a
3973 * spurious network outage.
3977 if ((sc
->sc_flags
& SCF_UNREACH
) == 0 || sc
->sc_rxtshift
< 3) {
3978 sc
->sc_flags
|= SCF_UNREACH
;
3984 TCP_STATINC(TCP_STAT_SC_UNREACH
);
3985 syn_cache_put(sc
); /* calls pool_put but see spl above */
3990 * Given a LISTEN socket and an inbound SYN request, add
3991 * this to the syn cache, and send back a segment:
3992 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
3995 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
3996 * Doing so would require that we hold onto the data and deliver it
3997 * to the application. However, if we are the target of a SYN-flood
3998 * DoS attack, an attacker could send data which would eventually
3999 * consume all available buffer space if it were ACKed. By not ACKing
4000 * the data, we avoid this DoS scenario.
/*
 * syn_cache_add: record an inbound SYN in the syn cache and answer it
 * through syn_cache_respond().
 *
 *	src, dst	addresses of the segment; src->sa_family selects the
 *			per-family handling, and both are copied into the
 *			cache entry (sc_src / sc_dst).
 *	th		TCP header of the SYN; th_seq is saved as sc_irs and
 *			th_flags is tested for ECE/CWR.
 *	hlen, optlen	subtracted, together with sizeof(struct tcphdr), from
 *			m->m_pkthdr.len to form the length handed to
 *			tcp_dooptions().
 *	so		socket whose receive buffer (sbspace) seeds win.
 *	m		mbuf carrying the SYN; also passed on to
 *			syn_cache_respond().
 *	optp		TCP options from the SYN, parsed by tcp_dooptions().
 *	oi		passed to tcp_dooptions(); oi->maxseg is later stored
 *			as sc_peermaxseg.
 *
 * NOTE(review): the return statements are not visible in this listing, so
 * the return-value contract is not documented here -- confirm against the
 * full file.
 */
4004 syn_cache_add(struct sockaddr
*src
, struct sockaddr
*dst
, struct tcphdr
*th
,
4005 unsigned int hlen
, struct socket
*so
, struct mbuf
*m
, u_char
*optp
,
4006 int optlen
, struct tcp_opt_info
*oi
)
/*
 * tb is a scratch tcpcb on the stack: it is seeded below, handed to
 * tcp_dooptions(), and then read back (t_flags, ts_recent,
 * requested_s_scale) when the cache entry is filled in.
 * NOTE(review): the initialisation of tp is not visible in this listing.
 */
4008 struct tcpcb tb
, *tp
;
4010 struct syn_cache
*sc
;
4011 struct syn_cache_head
*scp
;
4012 struct mbuf
*ipopts
;
4013 struct tcp_opt_info opti
;
/* opti is zeroed here; no later use of it is visible in this listing. */
4018 memset(&opti
, 0, sizeof(opti
));
4021 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
4023 * Note this check is performed in tcp_input() very early on.
4027 * Initialize some local state.
/*
 * win starts from sbspace() of the socket's receive buffer.
 * NOTE(review): the statement guarded by the TCP_MAXWIN test is not
 * visible; presumably it caps win at TCP_MAXWIN -- confirm.
 */
4029 win
= sbspace(&so
->so_rcv
);
4030 if (win
> TCP_MAXWIN
)
/* Per-address-family setup; the case labels are not visible here. */
4033 switch (src
->sa_family
) {
4037 * Remember the IP options, if any.
4039 ipopts
= ip_srcroute();
/*
 * Parse the SYN's TCP options into tb.  With TCP_SIGNATURE the test below
 * is also true when tp has TF_SIGNATURE set, even if optp is NULL.
 */
#ifdef TCP_SIGNATURE
4047 if (optp
|| (tp
->t_flags
& TF_SIGNATURE
))
/* Request window scaling and timestamps only if tcp_do_rfc1323 is set. */
4052 tb
.t_flags
= tcp_do_rfc1323
? (TF_REQ_SCALE
|TF_REQ_TSTMP
) : 0;
#ifdef TCP_SIGNATURE
4054 tb
.t_flags
|= (tp
->t_flags
& TF_SIGNATURE
);
4056 tb
.t_state
= TCPS_LISTEN
;
/*
 * The length argument is the packet length minus the TCP header, the
 * options and hlen.  A negative return selects a failure path whose body
 * is not visible in this listing.
 */
4057 if (tcp_dooptions(&tb
, optp
, optlen
, th
, m
, m
->m_pkthdr
.len
-
4058 sizeof(struct tcphdr
) - optlen
- hlen
, oi
) < 0)
4064 * See if we already have an entry for this connection.
4065 * If we do, resend the SYN,ACK. We do not count this
4066 * as a retransmission (XXX though maybe we should).
4068 if ((sc
= syn_cache_lookup(src
, dst
, &scp
)) != NULL
) {
4069 TCP_STATINC(TCP_STAT_SC_DUPESYN
);
4072 * If we were remembering a previous source route,
4073 * forget it and use the new one we've been given.
4076 (void) m_free(sc
->sc_ipopts
);
4077 sc
->sc_ipopts
= ipopts
;
/*
 * Take the timestamp value parsed from this SYN.
 * NOTE(review): no zeroing of tb is visible; if the option parse above was
 * skipped, tb.ts_recent may be read uninitialised here -- confirm against
 * the full file.
 */
4079 sc
->sc_timestamp
= tb
.ts_recent
;
/* A 0 return from syn_cache_respond() is counted as a sent ACK/segment. */
4080 if (syn_cache_respond(sc
, m
) == 0) {
4081 uint64_t *tcps
= TCP_STAT_GETREF();
4082 tcps
[TCP_STAT_SNDACKS
]++;
4083 tcps
[TCP_STAT_SNDTOTAL
]++;
/* No existing entry: get a fresh one from syn_cache_pool with PR_NOWAIT. */
4090 sc
= pool_get(&syn_cache_pool
, PR_NOWAIT
);
/*
 * NOTE(review): the conditions guarding this m_free() are not visible;
 * presumably it is the allocation-failure path releasing the saved IP
 * options -- confirm.
 */
4094 (void) m_free(ipopts
);
4099 * Fill in the cache, and put the necessary IP and TCP
4100 * options into the reply.
4102 memset(sc
, 0, sizeof(struct syn_cache
));
4103 callout_init(&sc
->sc_timer
, CALLOUT_MPSAFE
);
/* Each address is copied using its own sa_len. */
4104 bcopy(src
, &sc
->sc_src
, src
->sa_len
);
4105 bcopy(dst
, &sc
->sc_dst
, dst
->sa_len
);
4107 sc
->sc_ipopts
= ipopts
;
4108 sc
->sc_irs
= th
->th_seq
;
/*
 * Derive sc_iss with tcp_new_iss1() from the dst/src addresses and ports;
 * the IPv4 and IPv6 branches differ only in the sockaddr type used.
 */
4109 switch (src
->sa_family
) {
4113 struct sockaddr_in
*srcin
= (void *) src
;
4114 struct sockaddr_in
*dstin
= (void *) dst
;
4116 sc
->sc_iss
= tcp_new_iss1(&dstin
->sin_addr
,
4117 &srcin
->sin_addr
, dstin
->sin_port
,
4118 srcin
->sin_port
, sizeof(dstin
->sin_addr
), 0);
4125 struct sockaddr_in6
*srcin6
= (void *) src
;
4126 struct sockaddr_in6
*dstin6
= (void *) dst
;
4128 sc
->sc_iss
= tcp_new_iss1(&dstin6
->sin6_addr
,
4129 &srcin6
->sin6_addr
, dstin6
->sin6_port
,
4130 srcin6
->sin6_port
, sizeof(dstin6
->sin6_addr
), 0);
/*
 * Store the peer's MSS from the parsed options and the MSS we will
 * advertise; rcvif is consulted only when m carries a packet header,
 * otherwise NULL is passed.
 */
4135 sc
->sc_peermaxseg
= oi
->maxseg
;
4136 sc
->sc_ourmaxseg
= tcp_mss_to_advertise(m
->m_flags
& M_PKTHDR
?
4137 m
->m_pkthdr
.rcvif
: NULL
,
4138 sc
->sc_src
.sa
.sa_family
);
4140 sc
->sc_timebase
= tcp_now
- 1; /* see tcp_newtcpcb() */
4141 sc
->sc_timestamp
= tb
.ts_recent
;
/* SCF_TIMESTAMP is set only when both the REQ and RCVD flags are present. */
4142 if ((tb
.t_flags
& (TF_REQ_TSTMP
|TF_RCVD_TSTMP
)) ==
4143 (TF_REQ_TSTMP
|TF_RCVD_TSTMP
))
4144 sc
->sc_flags
|= SCF_TIMESTAMP
;
/* Window scaling likewise requires both the RCVD and REQ flags. */
4145 if ((tb
.t_flags
& (TF_RCVD_SCALE
|TF_REQ_SCALE
)) ==
4146 (TF_RCVD_SCALE
|TF_REQ_SCALE
)) {
4147 sc
->sc_requested_s_scale
= tb
.requested_s_scale
;
4148 sc
->sc_request_r_scale
= 0;
4150 * Pick the smallest possible scaling factor that
4151 * will still allow us to scale up to sb_max.
4153 * We do this because there are broken firewalls that
4154 * will corrupt the window scale option, leading to
4155 * the other endpoint believing that our advertised
4156 * window is unscaled. At scale factors larger than
4157 * 5 the unscaled window will drop below 1500 bytes,
4158 * leading to serious problems when traversing these
4161 * With the default sbmax of 256K, a scale factor
4162 * of 3 will be chosen by this algorithm. Those who
4163 * choose a larger sbmax should watch out
4164 * for the compatiblity problems mentioned above.
4166 * RFC1323: The Window field in a SYN (i.e., a <SYN>
4167 * or <SYN,ACK>) segment itself is never scaled.
4169 while (sc
->sc_request_r_scale
< TCP_MAX_WINSHIFT
&&
4170 (TCP_MAXWIN
<< sc
->sc_request_r_scale
) < sb_max
)
4171 sc
->sc_request_r_scale
++;
/*
 * NOTE(review): the branch keyword for the two assignments below is not
 * visible; they appear to be the no-window-scaling case.
 * syn_cache_respond() omits the window scale option when
 * sc_request_r_scale is 15.
 */
4173 sc
->sc_requested_s_scale
= 15;
4174 sc
->sc_request_r_scale
= 15;
/* SACK is recorded only if tb has TF_SACK_PERMIT and tcp_do_sack is set. */
4176 if ((tb
.t_flags
& TF_SACK_PERMIT
) && tcp_do_sack
)
4177 sc
->sc_flags
|= SCF_SACK_PERMIT
;
4180 * ECN setup packet recieved.
4182 if ((th
->th_flags
& (TH_ECE
|TH_CWR
)) && tcp_do_ecn
)
4183 sc
->sc_flags
|= SCF_ECN_PERMIT
;
#ifdef TCP_SIGNATURE
4186 if (tb
.t_flags
& TF_SIGNATURE
)
4187 sc
->sc_flags
|= SCF_SIGNATURE
;
/*
 * Send the SYN,ACK.  A 0 return is counted in the send statistics;
 * presumably syn_cache_insert() belongs to the same success branch (the
 * closing braces are not visible) -- confirm.
 */
4190 if (syn_cache_respond(sc
, m
) == 0) {
4191 uint64_t *tcps
= TCP_STAT_GETREF();
4192 tcps
[TCP_STAT_SNDACKS
]++;
4193 tcps
[TCP_STAT_SNDTOTAL
]++;
4195 syn_cache_insert(sc
, tp
);
/*
 * NOTE(review): presumably the failure branch of the respond call above;
 * only the dropped-entry counter is visible here.
 */
4200 TCP_STATINC(TCP_STAT_SC_DROPPED
);
4206 syn_cache_respond(struct syn_cache
*sc
, struct mbuf
*m
)
4215 struct ip
*ip
= NULL
;
4217 struct ip6_hdr
*ip6
= NULL
;
4219 struct tcpcb
*tp
= NULL
;
4225 switch (sc
->sc_src
.sa
.sa_family
) {
4227 hlen
= sizeof(struct ip
);
4231 hlen
= sizeof(struct ip6_hdr
);
4237 return (EAFNOSUPPORT
);
4240 /* Compute the size of the TCP options. */
4241 optlen
= 4 + (sc
->sc_request_r_scale
!= 15 ? 4 : 0) +
4242 ((sc
->sc_flags
& SCF_SACK_PERMIT
) ? (TCPOLEN_SACK_PERMITTED
+ 2) : 0) +
4243 #ifdef TCP_SIGNATURE
4244 ((sc
->sc_flags
& SCF_SIGNATURE
) ? (TCPOLEN_SIGNATURE
+ 2) : 0) +
4246 ((sc
->sc_flags
& SCF_TIMESTAMP
) ? TCPOLEN_TSTAMP_APPA
: 0);
4248 tlen
= hlen
+ sizeof(struct tcphdr
) + optlen
;
4251 * Create the IP+TCP header from scratch.
4256 if (max_linkhdr
+ tlen
> MCLBYTES
)
4259 MGETHDR(m
, M_DONTWAIT
, MT_DATA
);
4260 if (m
&& tlen
> MHLEN
) {
4261 MCLGET(m
, M_DONTWAIT
);
4262 if ((m
->m_flags
& M_EXT
) == 0) {
4269 MCLAIM(m
, &tcp_tx_mowner
);
4271 /* Fixup the mbuf. */
4272 m
->m_data
+= max_linkhdr
;
4273 m
->m_len
= m
->m_pkthdr
.len
= tlen
;
4277 so
= tp
->t_inpcb
->inp_socket
;
4279 else if (tp
->t_in6pcb
)
4280 so
= tp
->t_in6pcb
->in6p_socket
;
4286 m
->m_pkthdr
.rcvif
= NULL
;
4287 memset(mtod(m
, u_char
*), 0, tlen
);
4289 switch (sc
->sc_src
.sa
.sa_family
) {
4291 ip
= mtod(m
, struct ip
*);
4293 ip
->ip_dst
= sc
->sc_src
.sin
.sin_addr
;
4294 ip
->ip_src
= sc
->sc_dst
.sin
.sin_addr
;
4295 ip
->ip_p
= IPPROTO_TCP
;
4296 th
= (struct tcphdr
*)(ip
+ 1);
4297 th
->th_dport
= sc
->sc_src
.sin
.sin_port
;
4298 th
->th_sport
= sc
->sc_dst
.sin
.sin_port
;
4302 ip6
= mtod(m
, struct ip6_hdr
*);
4303 ip6
->ip6_vfc
= IPV6_VERSION
;
4304 ip6
->ip6_dst
= sc
->sc_src
.sin6
.sin6_addr
;
4305 ip6
->ip6_src
= sc
->sc_dst
.sin6
.sin6_addr
;
4306 ip6
->ip6_nxt
= IPPROTO_TCP
;
4307 /* ip6_plen will be updated in ip6_output() */
4308 th
= (struct tcphdr
*)(ip6
+ 1);
4309 th
->th_dport
= sc
->sc_src
.sin6
.sin6_port
;
4310 th
->th_sport
= sc
->sc_dst
.sin6
.sin6_port
;
4317 th
->th_seq
= htonl(sc
->sc_iss
);
4318 th
->th_ack
= htonl(sc
->sc_irs
+ 1);
4319 th
->th_off
= (sizeof(struct tcphdr
) + optlen
) >> 2;
4320 th
->th_flags
= TH_SYN
|TH_ACK
;
4321 th
->th_win
= htons(sc
->sc_win
);
4322 /* th_sum already 0 */
4323 /* th_urp already 0 */
4325 /* Tack on the TCP options. */
4326 optp
= (u_int8_t
*)(th
+ 1);
4327 *optp
++ = TCPOPT_MAXSEG
;
4329 *optp
++ = (sc
->sc_ourmaxseg
>> 8) & 0xff;
4330 *optp
++ = sc
->sc_ourmaxseg
& 0xff;
4332 if (sc
->sc_request_r_scale
!= 15) {
4333 *((u_int32_t
*)optp
) = htonl(TCPOPT_NOP
<< 24 |
4334 TCPOPT_WINDOW
<< 16 | TCPOLEN_WINDOW
<< 8 |
4335 sc
->sc_request_r_scale
);
4339 if (sc
->sc_flags
& SCF_TIMESTAMP
) {
4340 u_int32_t
*lp
= (u_int32_t
*)(optp
);
4341 /* Form timestamp option as shown in appendix A of RFC 1323. */
4342 *lp
++ = htonl(TCPOPT_TSTAMP_HDR
);
4343 *lp
++ = htonl(SYN_CACHE_TIMESTAMP(sc
));
4344 *lp
= htonl(sc
->sc_timestamp
);
4345 optp
+= TCPOLEN_TSTAMP_APPA
;
4348 if (sc
->sc_flags
& SCF_SACK_PERMIT
) {
4351 /* Let the peer know that we will SACK. */
4352 p
[0] = TCPOPT_SACK_PERMITTED
;
4360 * Send ECN SYN-ACK setup packet.
4361 * Routes can be asymmetric, so, even if we receive a packet
4362 * with ECE and CWR set, we must not assume no one will block
4363 * the ECE packet we are about to send.
4365 if ((sc
->sc_flags
& SCF_ECN_PERMIT
) && tp
&&
4366 SEQ_GEQ(tp
->snd_nxt
, tp
->snd_max
)) {
4367 th
->th_flags
|= TH_ECE
;
4368 TCP_STATINC(TCP_STAT_ECN_SHS
);
4371 * draft-ietf-tcpm-ecnsyn-00.txt
4373 * "[...] a TCP node MAY respond to an ECN-setup
4374 * SYN packet by setting ECT in the responding
4375 * ECN-setup SYN/ACK packet, indicating to routers
4376 * that the SYN/ACK packet is ECN-Capable.
4377 * This allows a congested router along the path
4378 * to mark the packet instead of dropping the
4379 * packet as an indication of congestion."
4381 * "[...] There can be a great benefit in setting
4382 * an ECN-capable codepoint in SYN/ACK packets [...]
4383 * Congestion is most likely to occur in
4384 * the server-to-client direction. As a result,
4385 * setting an ECN-capable codepoint in SYN/ACK
4386 * packets can reduce the occurrence of three-second
4387 * retransmit timeouts resulting from the drop
4388 * of SYN/ACK packets."
4390 * Page 4 and 6, January 2006.
4393 switch (sc
->sc_src
.sa
.sa_family
) {
4396 ip
->ip_tos
|= IPTOS_ECN_ECT0
;
4401 ip6
->ip6_flow
|= htonl(IPTOS_ECN_ECT0
<< 20);
4405 TCP_STATINC(TCP_STAT_ECN_ECT
);
4408 #ifdef TCP_SIGNATURE
4409 if (sc
->sc_flags
& SCF_SIGNATURE
) {
4410 struct secasvar
*sav
;
4413 sav
= tcp_signature_getsav(m
, th
);
4421 *optp
++ = TCPOPT_SIGNATURE
;
4422 *optp
++ = TCPOLEN_SIGNATURE
;
4424 memset(optp
, 0, TCP_SIGLEN
);
4426 *optp
++ = TCPOPT_NOP
;
4427 *optp
++ = TCPOPT_EOL
;
4429 (void)tcp_signature(m
, th
, hlen
, sav
, sigp
);
4431 key_sa_recordxfer(sav
, m
);
4440 /* Compute the packet's checksum. */
4441 switch (sc
->sc_src
.sa
.sa_family
) {
4443 ip
->ip_len
= htons(tlen
- hlen
);
4445 th
->th_sum
= in4_cksum(m
, IPPROTO_TCP
, hlen
, tlen
- hlen
);
4449 ip6
->ip6_plen
= htons(tlen
- hlen
);
4451 th
->th_sum
= in6_cksum(m
, IPPROTO_TCP
, hlen
, tlen
- hlen
);
4457 * Fill in some straggling IP bits. Note the stack expects
4458 * ip_len to be in host order, for convenience.
4460 switch (sc
->sc_src
.sa
.sa_family
) {
4463 ip
->ip_len
= htons(tlen
);
4464 ip
->ip_ttl
= ip_defttl
;
4470 ip6
->ip6_vfc
&= ~IPV6_VERSION_MASK
;
4471 ip6
->ip6_vfc
|= IPV6_VERSION
;
4472 ip6
->ip6_plen
= htons(tlen
- hlen
);
4473 /* ip6_hlim will be initialized afterwards */
4474 /* XXX flowlabel? */
4479 /* XXX use IPsec policy on listening socket, on SYN ACK */
4482 switch (sc
->sc_src
.sa
.sa_family
) {
4485 error
= ip_output(m
, sc
->sc_ipopts
, ro
,
4486 (ip_mtudisc
? IP_MTUDISC
: 0),
4487 (struct ip_moptions
*)NULL
, so
);
4492 ip6
->ip6_hlim
= in6_selecthlim(NULL
,
4493 (rt
= rtcache_validate(ro
)) != NULL
? rt
->rt_ifp
4496 error
= ip6_output(m
, NULL
/*XXX*/, ro
, 0, NULL
, so
, NULL
);
4500 error
= EAFNOSUPPORT
;