sys/netinet/tcp_input.c

   1 /*      $NetBSD: tcp_input.c,v 1.298 2009/07/18 23:09:53 minskim Exp $  */
   2
   3 /*
   4  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
   5  * All rights reserved.
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  * 1. Redistributions of source code must retain the above copyright
  11  *    notice, this list of conditions and the following disclaimer.
  12  * 2. Redistributions in binary form must reproduce the above copyright
  13  *    notice, this list of conditions and the following disclaimer in the
  14  *    documentation and/or other materials provided with the distribution.
  15  * 3. Neither the name of the project nor the names of its contributors
  16  *    may be used to endorse or promote products derived from this software
  17  *    without specific prior written permission.
  18  *
  19  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  29  * SUCH DAMAGE.
  30  */
  31
  32 /*
  33  *      @(#)COPYRIGHT   1.1 (NRL) 17 January 1995
  34  *
  35  * NRL grants permission for redistribution and use in source and binary
  36  * forms, with or without modification, of the software and documentation
  37  * created at NRL provided that the following conditions are met:
  38  *
  39  * 1. Redistributions of source code must retain the above copyright
  40  *    notice, this list of conditions and the following disclaimer.
  41  * 2. Redistributions in binary form must reproduce the above copyright
  42  *    notice, this list of conditions and the following disclaimer in the
  43  *    documentation and/or other materials provided with the distribution.
  44  * 3. All advertising materials mentioning features or use of this software
  45  *    must display the following acknowledgements:
  46  *      This product includes software developed by the University of
  47  *      California, Berkeley and its contributors.
  48  *      This product includes software developed at the Information
  49  *      Technology Division, US Naval Research Laboratory.
  50  * 4. Neither the name of the NRL nor the names of its contributors
  51  *    may be used to endorse or promote products derived from this software
  52  *    without specific prior written permission.
  53  *
  54  * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
  55  * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  56  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
  57  * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
  58  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  59  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  60  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  61  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  62  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  63  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  64  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  65  *
  66  * The views and conclusions contained in the software and documentation
  67  * are those of the authors and should not be interpreted as representing
  68  * official policies, either expressed or implied, of the US Naval
  69  * Research Laboratory (NRL).
  70  */
  71
  72 /*-
  73  * Copyright (c) 1997, 1998, 1999, 2001, 2005, 2006 The NetBSD Foundation, Inc.
  74  * All rights reserved.
  75  *
  76  * This code is derived from software contributed to The NetBSD Foundation
  77  * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
  78  * Facility, NASA Ames Research Center.
  79  * This code is derived from software contributed to The NetBSD Foundation
  80  * by Charles M. Hannum.
  81  * This code is derived from software contributed to The NetBSD Foundation
  82  * by Rui Paulo.
  83  *
  84  * Redistribution and use in source and binary forms, with or without
  85  * modification, are permitted provided that the following conditions
  86  * are met:
  87  * 1. Redistributions of source code must retain the above copyright
  88  *    notice, this list of conditions and the following disclaimer.
  89  * 2. Redistributions in binary form must reproduce the above copyright
  90  *    notice, this list of conditions and the following disclaimer in the
  91  *    documentation and/or other materials provided with the distribution.
  92  *
  93  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  94  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  95  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  96  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  97  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  98  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  99  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 100  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 101  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 102  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 103  * POSSIBILITY OF SUCH DAMAGE.
 104  */
 105
 106 /*
 107  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
 108  *      The Regents of the University of California.  All rights reserved.
 109  *
 110  * Redistribution and use in source and binary forms, with or without
 111  * modification, are permitted provided that the following conditions
 112  * are met:
 113  * 1. Redistributions of source code must retain the above copyright
 114  *    notice, this list of conditions and the following disclaimer.
 115  * 2. Redistributions in binary form must reproduce the above copyright
 116  *    notice, this list of conditions and the following disclaimer in the
 117  *    documentation and/or other materials provided with the distribution.
 118  * 3. Neither the name of the University nor the names of its contributors
 119  *    may be used to endorse or promote products derived from this software
 120  *    without specific prior written permission.
 121  *
 122  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 123  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 124  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 125  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 126  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 127  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 128  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 129  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 130  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 131  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 132  * SUCH DAMAGE.
 133  *
 134  *      @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
 135  */
 136
 137 /*
 138  *      TODO list for SYN cache stuff:
 139  *
 140  *      Find room for a "state" field, which is needed to keep a
 141  *      compressed state for TIME_WAIT TCBs.  It's been noted already
 142  *      that this is fairly important for very high-volume web and
 143  *      mail servers, which use a large number of short-lived
 144  *      connections.
 145  */
 146
 147 #include <sys/cdefs.h>
 148 __KERNEL_RCSID(0, "$NetBSD: tcp_input.c,v 1.298 2009/07/18 23:09:53 minskim Exp $");
 149
 150 #include "opt_inet.h"
 151 #include "opt_ipsec.h"
 152 #include "opt_inet_csum.h"
 153 #include "opt_tcp_debug.h"
 154
 155 #include <sys/param.h>
 156 #include <sys/systm.h>
 157 #include <sys/malloc.h>
 158 #include <sys/mbuf.h>
 159 #include <sys/protosw.h>
 160 #include <sys/socket.h>
 161 #include <sys/socketvar.h>
 162 #include <sys/errno.h>
 163 #include <sys/syslog.h>
 164 #include <sys/pool.h>
 165 #include <sys/domain.h>
 166 #include <sys/kernel.h>
 167 #ifdef TCP_SIGNATURE
 168 #include <sys/md5.h>
 169 #endif
 170 #include <sys/lwp.h> /* for lwp0 */
 171
 172 #include <net/if.h>
 173 #include <net/route.h>
 174 #include <net/if_types.h>
 175
 176 #include <netinet/in.h>
 177 #include <netinet/in_systm.h>
 178 #include <netinet/ip.h>
 179 #include <netinet/in_pcb.h>
 180 #include <netinet/in_var.h>
 181 #include <netinet/ip_var.h>
 182 #include <netinet/in_offload.h>
 183
 184 #ifdef INET6
 185 #ifndef INET
 186 #include <netinet/in.h>
 187 #endif
 188 #include <netinet/ip6.h>
 189 #include <netinet6/ip6_var.h>
 190 #include <netinet6/in6_pcb.h>
 191 #include <netinet6/ip6_var.h>
 192 #include <netinet6/in6_var.h>
 193 #include <netinet/icmp6.h>
 194 #include <netinet6/nd6.h>
 195 #ifdef TCP_SIGNATURE
 196 #include <netinet6/scope6_var.h>
 197 #endif
 198 #endif
 199
 200 #ifndef INET6
 201 /* always need ip6.h for IP6_EXTHDR_GET */
 202 #include <netinet/ip6.h>
 203 #endif
 204
 205 #include <netinet/tcp.h>
 206 #include <netinet/tcp_fsm.h>
 207 #include <netinet/tcp_seq.h>
 208 #include <netinet/tcp_timer.h>
 209 #include <netinet/tcp_var.h>
 210 #include <netinet/tcp_private.h>
 211 #include <netinet/tcpip.h>
 212 #include <netinet/tcp_congctl.h>
 213 #include <netinet/tcp_debug.h>
 214
 215 #include <machine/stdarg.h>
 216
 217 #ifdef IPSEC
 218 #include <netinet6/ipsec.h>
 219 #include <netinet6/ipsec_private.h>
 220 #include <netkey/key.h>
 221 #endif /*IPSEC*/
 222 #ifdef INET6
 223 #include "faith.h"
 224 #if defined(NFAITH) && NFAITH > 0
 225 #include <net/if_faith.h>
 226 #endif
 227 #endif  /* INET6 */
 228
 229 #ifdef FAST_IPSEC
 230 #include <netipsec/ipsec.h>
 231 #include <netipsec/ipsec_var.h>
 232 #include <netipsec/ipsec_private.h>
 233 #include <netipsec/key.h>
 234 #ifdef INET6
 235 #include <netipsec/ipsec6.h>
 236 #endif
 237 #endif  /* FAST_IPSEC*/
 238
 /*
  * NOTE(review): presumably the duplicate-ACK threshold for fast
  * retransmit and a "log refused connections" switch; neither is used in
  * the part of the file visible here -- confirm against their users.
  */
 239 int     tcprexmtthresh = 3;
 240 int     tcp_log_refused;
 241
 /*
  * Receive-buffer auto-sizing settings; off by default (0).
  * NOTE(review): the inc/max values look like byte counts (16 KiB step,
  * 256 KiB ceiling) -- confirm.  tcp_msl is TCPTV_MSL scaled down by
  * PR_SLOWHZ.
  */
 242 int     tcp_do_autorcvbuf = 0;
 243 int     tcp_autorcvbuf_inc = 16 * 1024;
 244 int     tcp_autorcvbuf_max = 256 * 1024;
 245 int     tcp_msl = (TCPTV_MSL / PR_SLOWHZ);
 246
 /*
  * Counter/timestamp pairs, presumably state for packets-per-second rate
  * limiting of RST generation and of ACK drops (their users are outside
  * this view).
  */
 247 static int tcp_rst_ppslim_count = 0;
 248 static struct timeval tcp_rst_ppslim_last;
 249 static int tcp_ackdrop_ppslim_count = 0;
 250 static struct timeval tcp_ackdrop_ppslim_last;
 251
 /*
  * 24 days expressed in seconds (24 * 24*60*60) and multiplied by
  * PR_SLOWHZ.  NOTE(review): presumably the PAWS idle limit in slow-timer
  * ticks -- confirm.
  */
 252 #define TCP_PAWS_IDLE   (24U * 24 * 60 * 60 * PR_SLOWHZ)
 253
 254 /* modulo comparisons of timestamps: test the sign of the difference */
 255 #define TSTMP_LT(a,b)   ((int)((a)-(b)) < 0)
 256 #define TSTMP_GEQ(a,b)  ((int)((a)-(b)) >= 0)
 257
 /*
  * Upper-layer reachability hint for IPv6 Neighbor Discovery / Neighbor
  * Unreachability Detection.
  *
  * With INET6: when tp is non-NULL, has an IPv6 PCB, is of family
  * AF_INET6 and its cached route validates, that route is passed to
  * nd6_nud_hint().  Without INET6 the function body is empty.
  */
 static inline void
 nd6_hint(struct tcpcb *tp)
 {
 #ifdef INET6
         struct rtentry *route;

         /* Nothing to hint unless this is a usable IPv6 connection. */
         if (tp == NULL || tp->t_in6pcb == NULL || tp->t_family != AF_INET6)
                 return;

         route = rtcache_validate(&tp->t_in6pcb->in6p_route);
         if (route != NULL)
                 nd6_nud_hint(route, NULL, 0);
 #endif
 }
 277
 278 /*
 279  * Compute ACK transmission behavior.  Delay the ACK unless
 280  * we have already delayed an ACK (must send an ACK every two segments).
 281  * We also ACK immediately if we received a PUSH and the ACK-on-PUSH
 282  * option is enabled.
 283  */
 284 static void
 285 tcp_setup_ack(struct tcpcb *tp, const struct tcphdr *th)
 286 {
 287
 288         if (tp->t_flags & TF_DELACK ||
 289             (tcp_ack_on_push && th->th_flags & TH_PUSH))
 290                 tp->t_flags |= TF_ACKNOW;
 291         else
 292                 TCP_SET_DELACK(tp);
 293 }
 294
 295 static void
 296 icmp_check(struct tcpcb *tp, const struct tcphdr *th, int acked)
 297 {
 298
 299         /*
 300          * If we had a pending ICMP message that refers to data that have
 301          * just been acknowledged, disregard the recorded ICMP message.
 302          */
 303         if ((tp->t_flags & TF_PMTUD_PEND) &&
 304             SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
 305                 tp->t_flags &= ~TF_PMTUD_PEND;
 306
 307         /*
 308          * Keep track of the largest chunk of data
 309          * acknowledged since last PMTU update
 310          */
 311         if (tp->t_pmtud_mss_acked < acked)
 312                 tp->t_pmtud_mss_acked = acked;
 313 }
 314
 315 /*
 316  * Convert TCP protocol fields to host order for easier processing.
 317  */
 318 static void
 319 tcp_fields_to_host(struct tcphdr *th)
 320 {
 321
 322         NTOHL(th->th_seq);
 323         NTOHL(th->th_ack);
 324         NTOHS(th->th_win);
 325         NTOHS(th->th_urp);
 326 }
 327
 328 /*
 329  * ... and reverse the above.
 330  */
 331 static void
 332 tcp_fields_to_net(struct tcphdr *th)
 333 {
 334
 335         HTONL(th->th_seq);
 336         HTONL(th->th_ack);
 337         HTONS(th->th_win);
 338         HTONS(th->th_urp);
 339 }
 340
 /*
  * Optional checksum event counters.  When TCP_CSUM_COUNTERS is defined,
  * TCP_CSUM_COUNTER_INCR(ev) increments ev->ev_count on one of the evcnt
  * objects declared below (their definitions live outside this file);
  * otherwise the macro expands to nothing.
  */
 341 #ifdef TCP_CSUM_COUNTERS
 342 #include <sys/device.h>
 343
 344 #if defined(INET)
 345 extern struct evcnt tcp_hwcsum_ok;
 346 extern struct evcnt tcp_hwcsum_bad;
 347 extern struct evcnt tcp_hwcsum_data;
 348 extern struct evcnt tcp_swcsum;
 349 #endif /* defined(INET) */
 350 #if defined(INET6)
 351 extern struct evcnt tcp6_hwcsum_ok;
 352 extern struct evcnt tcp6_hwcsum_bad;
 353 extern struct evcnt tcp6_hwcsum_data;
 354 extern struct evcnt tcp6_swcsum;
 355 #endif /* defined(INET6) */
 356
 357 #define TCP_CSUM_COUNTER_INCR(ev)       (ev)->ev_count++
 358
 359 #else
 360
 361 #define TCP_CSUM_COUNTER_INCR(ev)       /* nothing */
 362
 363 #endif /* TCP_CSUM_COUNTERS */
 364
 /*
  * Optional reassembly event counters, bumped from tcp_reass() through
  * TCP_REASS_COUNTER_INCR(ev), which increments ev->ev_count.  Without
  * TCP_REASS_COUNTERS the macro expands to nothing.  The evcnt objects
  * are only declared here; tcp_reass_iteration has 8 buckets indexed by
  * the number of queue entries examined (see tcp_reass()).
  */
 365 #ifdef TCP_REASS_COUNTERS
 366 #include <sys/device.h>
 367
 368 extern struct evcnt tcp_reass_;
 369 extern struct evcnt tcp_reass_empty;
 370 extern struct evcnt tcp_reass_iteration[8];
 371 extern struct evcnt tcp_reass_prependfirst;
 372 extern struct evcnt tcp_reass_prepend;
 373 extern struct evcnt tcp_reass_insert;
 374 extern struct evcnt tcp_reass_inserttail;
 375 extern struct evcnt tcp_reass_append;
 376 extern struct evcnt tcp_reass_appendtail;
 377 extern struct evcnt tcp_reass_overlaptail;
 378 extern struct evcnt tcp_reass_overlapfront;
 379 extern struct evcnt tcp_reass_segdup;
 380 extern struct evcnt tcp_reass_fragdup;
 381
 382 #define TCP_REASS_COUNTER_INCR(ev)      (ev)->ev_count++
 383
 384 #else
 385
 386 #define TCP_REASS_COUNTER_INCR(ev)      /* nothing */
 387
 388 #endif /* TCP_REASS_COUNTERS */
 389
 390 static int tcp_reass(struct tcpcb *, const struct tcphdr *, struct mbuf *,
 391     int *);
 392 static int tcp_dooptions(struct tcpcb *, const u_char *, int,
 393     struct tcphdr *, struct mbuf *, int, struct tcp_opt_info *);
 394
 395 #ifdef INET
 396 static void tcp4_log_refused(const struct ip *, const struct tcphdr *);
 397 #endif
 398 #ifdef INET6
 399 static void tcp6_log_refused(const struct ip6_hdr *, const struct tcphdr *);
 400 #endif
 401
 /*
  * Advance the pointer x along m_next links until it refers to the last
  * element of the chain.  x is both evaluated repeatedly and assigned to,
  * so pass a plain lvalue without side effects.  The expansion is a bare
  * while statement; the caller supplies the terminating semicolon.
  */
 402 #define TRAVERSE(x) while ((x)->m_next) (x) = (x)->m_next
 403
 /* mbuf owner handed to m_claimm() in tcp_reass(); MBUFTRACE builds only. */
 404 #if defined(MBUFTRACE)
 405 struct mowner tcp_reass_mowner = MOWNER_INIT("tcp", "reass");
 406 #endif /* defined(MBUFTRACE) */
 407
 /* Pool backing struct ipqent allocations (tcpipqent_init/alloc/free). */
 408 static struct pool tcpipqent_pool;
 409
 410 void
 411 tcpipqent_init(void)
 412 {
 413
 414         pool_init(&tcpipqent_pool, sizeof(struct ipqent), 0, 0, 0, "tcpipqepl",
 415             NULL, IPL_VM);
 416 }
 417
 418 struct ipqent *
 419 tcpipqent_alloc(void)
 420 {
 421         struct ipqent *ipqe;
 422         int s;
 423
 424         s = splvm();
 425         ipqe = pool_get(&tcpipqent_pool, PR_NOWAIT);
 426         splx(s);
 427
 428         return ipqe;
 429 }
 430
 431 void
 432 tcpipqent_free(struct ipqent *ipqe)
 433 {
 434         int s;
 435
 436         s = splvm();
 437         pool_put(&tcpipqent_pool, ipqe);
 438         splx(s);
 439 }
 440
 /*
  * tcp_reass: insert a received segment into the connection's reassembly
  * queue and, if the head of the queue has become in-sequence, hand that
  * block to the socket.
  *
  * tp    connection whose queues are updated: tp->segq is kept in
  *       sequence order, tp->timeq gets the touched entry at its head.
  * th    header of the received segment; only th_seq and th_flags are
  *       read.  Passing 0 skips queueing and only attempts delivery.
  * m     mbuf chain holding the segment data.  When th is non-zero the
  *       chain is always consumed: queued, merged into a queued chain,
  *       or freed.
  * tlen  points at the segment's data length; it is read, never written.
  *
  * Returns the TH_FIN bit of the block that was delivered, or 0 when
  * nothing was delivered.  At most one queue entry is delivered per call.
  */
 441 static int
 442 tcp_reass(struct tcpcb *tp, const struct tcphdr *th, struct mbuf *m, int *tlen)
 443 {
             /*
              * p: entry after which the new block is inserted (NULL = head).
              * q/nq: cursor and its successor while scanning segq.
              * tiqe: entry that will describe the resulting block; it is
              * either recycled from a merged entry or freshly allocated.
              */
 444         struct ipqent *p, *q, *nq, *tiqe = NULL;
 445         struct socket *so = NULL;
 446         int pkt_flags;
 447         tcp_seq pkt_seq;
 448         unsigned pkt_len;
 449         u_long rcvpartdupbyte = 0;
 450         u_long rcvoobyte;
 451 #ifdef TCP_REASS_COUNTERS
 452         u_int count = 0;
 453 #endif
 454         uint64_t *tcps;
 455
             /*
              * NOTE(review): so stays NULL if tp has neither PCB; the
              * delivery path below dereferences it unconditionally.
              */
 456         if (tp->t_inpcb)
 457                 so = tp->t_inpcb->inp_socket;
 458 #ifdef INET6
 459         else if (tp->t_in6pcb)
 460                 so = tp->t_in6pcb->in6p_socket;
 461 #endif
 462
 463         TCP_REASS_LOCK_CHECK(tp);
 464
 465         /*
 466          * Called with th == 0 once the connection is established, to
 467          * push data queued before ESTABLISHED up to the user socket.
 468          */
 469         if (th == 0)
 470                 goto present;
 471
 472         m_claimm(m, &tcp_reass_mowner);
 473
 474         rcvoobyte = *tlen;
 475         /*
 476          * Copy these to local variables because the tcpiphdr
 477          * gets munged while we are collapsing mbufs.
 478          */
 479         pkt_seq = th->th_seq;
 480         pkt_len = *tlen;
 481         pkt_flags = th->th_flags;
 482
 483         TCP_REASS_COUNTER_INCR(&tcp_reass_);
 484
 485         if ((p = TAILQ_LAST(&tp->segq, ipqehead)) != NULL) {
 486                 /*
 487                  * When we miss a packet, the vast majority of time we get
 488                  * packets that follow it in order.  So optimize for that.
 489                  */
 490                 if (pkt_seq == p->ipqe_seq + p->ipqe_len) {
 491                         p->ipqe_len += pkt_len;
 492                         p->ipqe_flags |= pkt_flags;
 493                         m_cat(p->ipre_mlast, m);
 494                         TRAVERSE(p->ipre_mlast);
 495                         m = NULL;
 496                         tiqe = p;
                             /* p stays on segq; only its timeq slot moves. */
 497                         TAILQ_REMOVE(&tp->timeq, p, ipqe_timeq);
 498                         TCP_REASS_COUNTER_INCR(&tcp_reass_appendtail);
 499                         goto skip_replacement;
 500                 }
 501                 /*
 502                  * While we're here, if the pkt is completely beyond
 503                  * anything we have, just insert it at the tail.
 504                  */
 505                 if (SEQ_GT(pkt_seq, p->ipqe_seq + p->ipqe_len)) {
 506                         TCP_REASS_COUNTER_INCR(&tcp_reass_inserttail);
 507                         goto insert_it;
 508                 }
 509         }
 510
 511         q = TAILQ_FIRST(&tp->segq);
 512
 513         if (q != NULL) {
 514                 /*
 515                  * If this segment immediately precedes the first out-of-order
 516                  * block, simply slap the segment in front of it and (mostly)
 517                  * skip the complicated logic.
 518                  */
 519                 if (pkt_seq + pkt_len == q->ipqe_seq) {
 520                         q->ipqe_seq = pkt_seq;
 521                         q->ipqe_len += pkt_len;
 522                         q->ipqe_flags |= pkt_flags;
 523                         m_cat(m, q->ipqe_m);
 524                         q->ipqe_m = m;
 525                         q->ipre_mlast = m; /* last mbuf may have changed */
 526                         TRAVERSE(q->ipre_mlast);
 527                         tiqe = q;
 528                         TAILQ_REMOVE(&tp->timeq, q, ipqe_timeq);
 529                         TCP_REASS_COUNTER_INCR(&tcp_reass_prependfirst);
 530                         goto skip_replacement;
 531                 }
 532         } else {
 533                 TCP_REASS_COUNTER_INCR(&tcp_reass_empty);
 534         }
 535
 536         /*
 537          * Find a segment which begins after this one does.
 538          */
 539         for (p = NULL; q != NULL; q = nq) {
                     /* Fetch the successor first: q may be unlinked below. */
 540                 nq = TAILQ_NEXT(q, ipqe_q);
 541 #ifdef TCP_REASS_COUNTERS
 542                 count++;
 543 #endif
 544                 /*
 545                  * If the received segment is just right after this
 546                  * fragment, merge the two together and then check
 547                  * for further overlaps.
 548                  */
 549                 if (q->ipqe_seq + q->ipqe_len == pkt_seq) {
 550 #ifdef TCPREASS_DEBUG
 551                         printf("tcp_reass[%p]: concat %u:%u(%u) to %u:%u(%u)\n",
 552                                tp, pkt_seq, pkt_seq + pkt_len, pkt_len,
 553                                q->ipqe_seq, q->ipqe_seq + q->ipqe_len, q->ipqe_len);
 554 #endif
 555                         pkt_len += q->ipqe_len;
 556                         pkt_flags |= q->ipqe_flags;
 557                         pkt_seq = q->ipqe_seq;
 558                         m_cat(q->ipre_mlast, m);
 559                         TRAVERSE(q->ipre_mlast);
 560                         m = q->ipqe_m;
 561                         TCP_REASS_COUNTER_INCR(&tcp_reass_append);
 562                         goto free_ipqe;
 563                 }
 564                 /*
 565                  * If the received segment is completely past this
 566                  * fragment, we need to go to the next fragment.
 567                  */
 568                 if (SEQ_LT(q->ipqe_seq + q->ipqe_len, pkt_seq)) {
 569                         p = q;
 570                         continue;
 571                 }
 572                 /*
 573                  * If the fragment is past the received segment,
 574                  * it (or any following) can't be concatenated.
 575                  */
 576                 if (SEQ_GT(q->ipqe_seq, pkt_seq + pkt_len)) {
 577                         TCP_REASS_COUNTER_INCR(&tcp_reass_insert);
 578                         break;
 579                 }
 580
 581                 /*
 582                  * We've received all the data in this segment before.
 583                  * Mark it as a duplicate and return.
 584                  */
 585                 if (SEQ_LEQ(q->ipqe_seq, pkt_seq) &&
 586                     SEQ_GEQ(q->ipqe_seq + q->ipqe_len, pkt_seq + pkt_len)) {
 587                         tcps = TCP_STAT_GETREF();
 588                         tcps[TCP_STAT_RCVDUPPACK]++;
 589                         tcps[TCP_STAT_RCVDUPBYTE] += pkt_len;
 590                         TCP_STAT_PUTREF();
 591                         tcp_new_dsack(tp, pkt_seq, pkt_len);
 592                         m_freem(m);
                             /* Release an entry recycled earlier in the scan. */
 593                         if (tiqe != NULL) {
 594                                 tcpipqent_free(tiqe);
 595                         }
 596                         TCP_REASS_COUNTER_INCR(&tcp_reass_segdup);
 597                         return (0);
 598                 }
 599                 /*
 600                  * Received segment completely overlaps this fragment
 601                  * so we drop the fragment (this keeps the temporal
 602                  * ordering of segments correct).
 603                  */
 604                 if (SEQ_GEQ(q->ipqe_seq, pkt_seq) &&
 605                     SEQ_LEQ(q->ipqe_seq + q->ipqe_len, pkt_seq + pkt_len)) {
 606                         rcvpartdupbyte += q->ipqe_len;
 607                         m_freem(q->ipqe_m);
 608                         TCP_REASS_COUNTER_INCR(&tcp_reass_fragdup);
 609                         goto free_ipqe;
 610                 }
 611                 /*
 612                  * RX'ed segment extends past the end of the
 613                  * fragment.  Drop the overlapping bytes.  Then
 614                  * merge the fragment and segment then treat as
 615                  * a longer received packet.
 616                  */
 617                 if (SEQ_LT(q->ipqe_seq, pkt_seq) &&
 618                     SEQ_GT(q->ipqe_seq + q->ipqe_len, pkt_seq))  {
 619                         int overlap = q->ipqe_seq + q->ipqe_len - pkt_seq;
 620 #ifdef TCPREASS_DEBUG
 621                         printf("tcp_reass[%p]: trim starting %d bytes of %u:%u(%u)\n",
 622                                tp, overlap,
 623                                pkt_seq, pkt_seq + pkt_len, pkt_len);
 624 #endif
                             /* Positive count: trim from the front of m. */
 625                         m_adj(m, overlap);
 626                         rcvpartdupbyte += overlap;
 627                         m_cat(q->ipre_mlast, m);
 628                         TRAVERSE(q->ipre_mlast);
 629                         m = q->ipqe_m;
 630                         pkt_seq = q->ipqe_seq;
 631                         pkt_len += q->ipqe_len - overlap;
 632                         rcvoobyte -= overlap;
 633                         TCP_REASS_COUNTER_INCR(&tcp_reass_overlaptail);
 634                         goto free_ipqe;
 635                 }
 636                 /*
 637                  * RX'ed segment extends past the front of the
 638                  * fragment.  Drop the overlapping bytes on the
 639                  * received packet.  The packet will then be
 640                  * concatenated with this fragment a bit later.
 641                  */
 642                 if (SEQ_GT(q->ipqe_seq, pkt_seq) &&
 643                     SEQ_LT(q->ipqe_seq, pkt_seq + pkt_len))  {
 644                         int overlap = pkt_seq + pkt_len - q->ipqe_seq;
 645 #ifdef TCPREASS_DEBUG
 646                         printf("tcp_reass[%p]: trim trailing %d bytes of %u:%u(%u)\n",
 647                                tp, overlap,
 648                                pkt_seq, pkt_seq + pkt_len, pkt_len);
 649 #endif
                             /* Negative count: trim from the tail of m. */
 650                         m_adj(m, -overlap);
 651                         pkt_len -= overlap;
 652                         rcvpartdupbyte += overlap;
 653                         TCP_REASS_COUNTER_INCR(&tcp_reass_overlapfront);
 654                         rcvoobyte -= overlap;
 655                 }
 656                 /*
 657                  * If the received segment immediately precedes this
 658                  * fragment then tack the fragment onto this segment
 659                  * and reinsert the data.
 660                  */
 661                 if (q->ipqe_seq == pkt_seq + pkt_len) {
 662 #ifdef TCPREASS_DEBUG
 663                         printf("tcp_reass[%p]: append %u:%u(%u) to %u:%u(%u)\n",
 664                                tp, q->ipqe_seq, q->ipqe_seq + q->ipqe_len, q->ipqe_len,
 665                                pkt_seq, pkt_seq + pkt_len, pkt_len);
 666 #endif
 667                         pkt_len += q->ipqe_len;
 668                         pkt_flags |= q->ipqe_flags;
 669                         m_cat(m, q->ipqe_m);
 670                         TAILQ_REMOVE(&tp->segq, q, ipqe_q);
 671                         TAILQ_REMOVE(&tp->timeq, q, ipqe_timeq);
 672                         tp->t_segqlen--;
 673                         KASSERT(tp->t_segqlen >= 0);
 674                         KASSERT(tp->t_segqlen != 0 ||
 675                             (TAILQ_EMPTY(&tp->segq) &&
 676                             TAILQ_EMPTY(&tp->timeq)));
 677                         if (tiqe == NULL) {
 678                                 tiqe = q;
 679                         } else {
 680                                 tcpipqent_free(q);
 681                         }
 682                         TCP_REASS_COUNTER_INCR(&tcp_reass_prepend);
 683                         break;
 684                 }
 685                 /*
 686                  * If the fragment is before the segment, remember it.
 687                  * When this loop is terminated, p will contain the
 688                  * pointer to fragment that is right before the received
 689                  * segment.
 690                  */
 691                 if (SEQ_LEQ(q->ipqe_seq, pkt_seq))
 692                         p = q;
 693
                     /* The code after this point is reached only via goto. */
 694                 continue;
 695
 696                 /*
 697                  * Common tail of the merge cases: unlink q from both queues.
 698                  * Keeping the first such entry in tiqe usually saves an alloc/free.
 699                  */
 700           free_ipqe:
 701                 TAILQ_REMOVE(&tp->segq, q, ipqe_q);
 702                 TAILQ_REMOVE(&tp->timeq, q, ipqe_timeq);
 703                 tp->t_segqlen--;
 704                 KASSERT(tp->t_segqlen >= 0);
 705                 KASSERT(tp->t_segqlen != 0 ||
 706                     (TAILQ_EMPTY(&tp->segq) && TAILQ_EMPTY(&tp->timeq)));
 707                 if (tiqe == NULL) {
 708                         tiqe = q;
 709                 } else {
 710                         tcpipqent_free(q);
 711                 }
 712         }
 713
             /*
              * Scan-length histogram: buckets 1-7 count scans of exactly
              * that many entries, bucket 0 counts scans of more than 7.
              */
 714 #ifdef TCP_REASS_COUNTERS
 715         if (count > 7)
 716                 TCP_REASS_COUNTER_INCR(&tcp_reass_iteration[0]);
 717         else if (count > 0)
 718                 TCP_REASS_COUNTER_INCR(&tcp_reass_iteration[count]);
 719 #endif
 720
 721     insert_it:
 722
 723         /*
 724          * Allocate a new queue entry since the received segment did not
 725          * collapse onto any other out-of-order block; thus we are allocating
 726          * a new block.  If it had collapsed, tiqe would not be NULL and
 727          * we would be reusing it.
 728          * XXX If we can't, just drop the packet.  XXX
 729          */
 730         if (tiqe == NULL) {
 731                 tiqe = tcpipqent_alloc();
 732                 if (tiqe == NULL) {
 733                         TCP_STATINC(TCP_STAT_RCVMEMDROP);
 734                         m_freem(m);
 735                         return (0);
 736                 }
 737         }
 738
 739         /*
 740          * Update the counters.
 741          */
 742         tcps = TCP_STAT_GETREF();
 743         tcps[TCP_STAT_RCVOOPACK]++;
 744         tcps[TCP_STAT_RCVOOBYTE] += rcvoobyte;
 745         if (rcvpartdupbyte) {
 746             tcps[TCP_STAT_RCVPARTDUPPACK]++;
 747             tcps[TCP_STAT_RCVPARTDUPBYTE] += rcvpartdupbyte;
 748         }
 749         TCP_STAT_PUTREF();
 750
 751         /*
 752          * Insert the new fragment queue entry into both queues.
 753          */
             /*
              * NOTE(review): ipre_mlast is set to the head of m here and is
              * not walked to the end of the chain, unlike the fast paths.
              */
 754         tiqe->ipqe_m = m;
 755         tiqe->ipre_mlast = m;
 756         tiqe->ipqe_seq = pkt_seq;
 757         tiqe->ipqe_len = pkt_len;
 758         tiqe->ipqe_flags = pkt_flags;
 759         if (p == NULL) {
 760                 TAILQ_INSERT_HEAD(&tp->segq, tiqe, ipqe_q);
 761 #ifdef TCPREASS_DEBUG
 762                 if (tiqe->ipqe_seq != tp->rcv_nxt)
 763                         printf("tcp_reass[%p]: insert %u:%u(%u) at front\n",
 764                                tp, pkt_seq, pkt_seq + pkt_len, pkt_len);
 765 #endif
 766         } else {
 767                 TAILQ_INSERT_AFTER(&tp->segq, p, tiqe, ipqe_q);
 768 #ifdef TCPREASS_DEBUG
 769                 printf("tcp_reass[%p]: insert %u:%u(%u) after %u:%u(%u)\n",
 770                        tp, pkt_seq, pkt_seq + pkt_len, pkt_len,
 771                        p->ipqe_seq, p->ipqe_seq + p->ipqe_len, p->ipqe_len);
 772 #endif
 773         }
 774         tp->t_segqlen++;
 775
             /*
              * The fast paths jump here with tiqe still linked on segq and
              * already removed from timeq; all paths put it at the head of
              * timeq.
              */
 776 skip_replacement:
 777
 778         TAILQ_INSERT_HEAD(&tp->timeq, tiqe, ipqe_timeq);
 779
 780 present:
 781         /*
 782          * Present data to user, advancing rcv_nxt through
 783          * completed sequence space.
 784          */
 785         if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
 786                 return (0);
             /* Deliver only if the first queued block starts at rcv_nxt. */
 787         q = TAILQ_FIRST(&tp->segq);
 788         if (q == NULL || q->ipqe_seq != tp->rcv_nxt)
 789                 return (0);
             /* In SYN_RECEIVED a block that carries data stays queued. */
 790         if (tp->t_state == TCPS_SYN_RECEIVED && q->ipqe_len)
 791                 return (0);
 792
 793         tp->rcv_nxt += q->ipqe_len;
 794         pkt_flags = q->ipqe_flags & TH_FIN;
 795         nd6_hint(tp);
 796
 797         TAILQ_REMOVE(&tp->segq, q, ipqe_q);
 798         TAILQ_REMOVE(&tp->timeq, q, ipqe_timeq);
 799         tp->t_segqlen--;
 800         KASSERT(tp->t_segqlen >= 0);
 801         KASSERT(tp->t_segqlen != 0 ||
 802             (TAILQ_EMPTY(&tp->segq) && TAILQ_EMPTY(&tp->timeq)));
             /* Socket flagged SS_CANTRCVMORE: drop the data, don't queue it. */
 803         if (so->so_state & SS_CANTRCVMORE)
 804                 m_freem(q->ipqe_m);
 805         else
 806                 sbappendstream(&so->so_rcv, q->ipqe_m);
 807         tcpipqent_free(q);
 808         sorwakeup(so);
 809         return (pkt_flags);
 810 }
 811
 812 #ifdef INET6
 813 int
 814 tcp6_input(struct mbuf **mp, int *offp, int proto)
 815 {
 816         struct mbuf *m = *mp;
 817
 818         /*
 819          * draft-itojun-ipv6-tcp-to-anycast
 820          * better place to put this in?
 821          */
 822         if (m->m_flags & M_ANYCAST6) {
 823                 struct ip6_hdr *ip6;
 824                 if (m->m_len < sizeof(struct ip6_hdr)) {
 825                         if ((m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) {
 826                                 TCP_STATINC(TCP_STAT_RCVSHORT);
 827                                 return IPPROTO_DONE;
 828                         }
 829                 }
 830                 ip6 = mtod(m, struct ip6_hdr *);
 831                 icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR,
 832                     (char *)&ip6->ip6_dst - (char *)ip6);
 833                 return IPPROTO_DONE;
 834         }
 835
 836         tcp_input(m, *offp, proto);
 837         return IPPROTO_DONE;
 838 }
 839 #endif
 840
 841 #ifdef INET
 842 static void
 843 tcp4_log_refused(const struct ip *ip, const struct tcphdr *th)
 844 {
 845         char src[4*sizeof "123"];
 846         char dst[4*sizeof "123"];
 847
 848         if (ip) {
 849                 strlcpy(src, inet_ntoa(ip->ip_src), sizeof(src));
 850                 strlcpy(dst, inet_ntoa(ip->ip_dst), sizeof(dst));
 851         }
 852         else {
 853                 strlcpy(src, "(unknown)", sizeof(src));
 854                 strlcpy(dst, "(unknown)", sizeof(dst));
 855         }
 856         log(LOG_INFO,
 857             "Connection attempt to TCP %s:%d from %s:%d\n",
 858             dst, ntohs(th->th_dport),
 859             src, ntohs(th->th_sport));
 860 }
 861 #endif
 862
 863 #ifdef INET6
 864 static void
 865 tcp6_log_refused(const struct ip6_hdr *ip6, const struct tcphdr *th)
 866 {
 867         char src[INET6_ADDRSTRLEN];
 868         char dst[INET6_ADDRSTRLEN];
 869
 870         if (ip6) {
 871                 strlcpy(src, ip6_sprintf(&ip6->ip6_src), sizeof(src));
 872                 strlcpy(dst, ip6_sprintf(&ip6->ip6_dst), sizeof(dst));
 873         }
 874         else {
 875                 strlcpy(src, "(unknown v6)", sizeof(src));
 876                 strlcpy(dst, "(unknown v6)", sizeof(dst));
 877         }
 878         log(LOG_INFO,
 879             "Connection attempt to TCP [%s]:%d from [%s]:%d\n",
 880             dst, ntohs(th->th_dport),
 881             src, ntohs(th->th_sport));
 882 }
 883 #endif
 884
 885 /*
 886  * Checksum extended TCP header and data.
 887  */
 888 int
 889 tcp_input_checksum(int af, struct mbuf *m, const struct tcphdr *th,
 890     int toff, int off, int tlen)
 891 {
 892
 893         /*
 894          * XXX it's better to record and check if this mbuf is
 895          * already checked.
 896          */
 897
 898         switch (af) {
 899 #ifdef INET
 900         case AF_INET:
 901                 switch (m->m_pkthdr.csum_flags &
 902                         ((m->m_pkthdr.rcvif->if_csum_flags_rx & M_CSUM_TCPv4) |
 903                          M_CSUM_TCP_UDP_BAD | M_CSUM_DATA)) {
 904                 case M_CSUM_TCPv4|M_CSUM_TCP_UDP_BAD:
 905                         TCP_CSUM_COUNTER_INCR(&tcp_hwcsum_bad);
 906                         goto badcsum;
 907
 908                 case M_CSUM_TCPv4|M_CSUM_DATA: {
 909                         u_int32_t hw_csum = m->m_pkthdr.csum_data;
 910
 911                         TCP_CSUM_COUNTER_INCR(&tcp_hwcsum_data);
 912                         if (m->m_pkthdr.csum_flags & M_CSUM_NO_PSEUDOHDR) {
 913                                 const struct ip *ip =
 914                                     mtod(m, const struct ip *);
 915
 916                                 hw_csum = in_cksum_phdr(ip->ip_src.s_addr,
 917                                     ip->ip_dst.s_addr,
 918                                     htons(hw_csum + tlen + off + IPPROTO_TCP));
 919                         }
 920                         if ((hw_csum ^ 0xffff) != 0)
 921                                 goto badcsum;
 922                         break;
 923                 }
 924
 925                 case M_CSUM_TCPv4:
 926                         /* Checksum was okay. */
 927                         TCP_CSUM_COUNTER_INCR(&tcp_hwcsum_ok);
 928                         break;
 929
 930                 default:
 931                         /*
 932                          * Must compute it ourselves.  Maybe skip checksum
 933                          * on loopback interfaces.
 934                          */
 935                         if (__predict_true(!(m->m_pkthdr.rcvif->if_flags &
 936                                              IFF_LOOPBACK) ||
 937                                            tcp_do_loopback_cksum)) {
 938                                 TCP_CSUM_COUNTER_INCR(&tcp_swcsum);
 939                                 if (in4_cksum(m, IPPROTO_TCP, toff,
 940                                               tlen + off) != 0)
 941                                         goto badcsum;
 942                         }
 943                         break;
 944                 }
 945                 break;
 946 #endif /* INET4 */
 947
 948 #ifdef INET6
 949         case AF_INET6:
 950                 switch (m->m_pkthdr.csum_flags &
 951                         ((m->m_pkthdr.rcvif->if_csum_flags_rx & M_CSUM_TCPv6) |
 952                          M_CSUM_TCP_UDP_BAD | M_CSUM_DATA)) {
 953                 case M_CSUM_TCPv6|M_CSUM_TCP_UDP_BAD:
 954                         TCP_CSUM_COUNTER_INCR(&tcp6_hwcsum_bad);
 955                         goto badcsum;
 956
 957 #if 0 /* notyet */
 958                 case M_CSUM_TCPv6|M_CSUM_DATA:
 959 #endif
 960
 961                 case M_CSUM_TCPv6:
 962                         /* Checksum was okay. */
 963                         TCP_CSUM_COUNTER_INCR(&tcp6_hwcsum_ok);
 964                         break;
 965
 966                 default:
 967                         /*
 968                          * Must compute it ourselves.  Maybe skip checksum
 969                          * on loopback interfaces.
 970                          */
 971                         if (__predict_true((m->m_flags & M_LOOP) == 0 ||
 972                             tcp_do_loopback_cksum)) {
 973                                 TCP_CSUM_COUNTER_INCR(&tcp6_swcsum);
 974                                 if (in6_cksum(m, IPPROTO_TCP, toff,
 975                                     tlen + off) != 0)
 976                                         goto badcsum;
 977                         }
 978                 }
 979                 break;
 980 #endif /* INET6 */
 981         }
 982
 983         return 0;
 984
 985 badcsum:
 986         TCP_STATINC(TCP_STAT_RCVBADSUM);
 987         return -1;
 988 }
 989
 990 /*
 991  * TCP input routine, follows pages 65-76 of RFC 793 very closely.
 992  */
 993 void
 994 tcp_input(struct mbuf *m, ...)
 995 {
 996         struct tcphdr *th;
 997         struct ip *ip;
 998         struct inpcb *inp;
 999 #ifdef INET6
1000         struct ip6_hdr *ip6;
1001         struct in6pcb *in6p;
1002 #endif
1003         u_int8_t *optp = NULL;
1004         int optlen = 0;
1005         int len, tlen, toff, hdroptlen = 0;
1006         struct tcpcb *tp = 0;
1007         int tiflags;
1008         struct socket *so = NULL;
1009         int todrop, dupseg, acked, ourfinisacked, needoutput = 0;
1010 #ifdef TCP_DEBUG
1011         short ostate = 0;
1012 #endif
1013         u_long tiwin;
1014         struct tcp_opt_info opti;
1015         int off, iphlen;
1016         va_list ap;
1017         int af;         /* af on the wire */
1018         struct mbuf *tcp_saveti = NULL;
1019         uint32_t ts_rtt;
1020         uint8_t iptos;
1021         uint64_t *tcps;
1022
1023         MCLAIM(m, &tcp_rx_mowner);
1024         va_start(ap, m);
1025         toff = va_arg(ap, int);
1026         (void)va_arg(ap, int);          /* ignore value, advance ap */
1027         va_end(ap);
1028
1029         TCP_STATINC(TCP_STAT_RCVTOTAL);
1030
1031         memset(&opti, 0, sizeof(opti));
1032         opti.ts_present = 0;
1033         opti.maxseg = 0;
1034
1035         /*
1036          * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN.
1037          *
1038          * TCP is, by definition, unicast, so we reject all
1039          * multicast outright.
1040          *
1041          * Note, there are additional src/dst address checks in
1042          * the AF-specific code below.
1043          */
1044         if (m->m_flags & (M_BCAST|M_MCAST)) {
1045                 /* XXX stat */
1046                 goto drop;
1047         }
1048 #ifdef INET6
1049         if (m->m_flags & M_ANYCAST6) {
1050                 /* XXX stat */
1051                 goto drop;
1052         }
1053 #endif
1054
1055         /*
1056          * Get IP and TCP header.
1057          * Note: IP leaves IP header in first mbuf.
1058          */
1059         ip = mtod(m, struct ip *);
1060 #ifdef INET6
1061         ip6 = NULL;
1062 #endif
1063         switch (ip->ip_v) {
1064 #ifdef INET
1065         case 4:
1066                 af = AF_INET;
1067                 iphlen = sizeof(struct ip);
1068                 ip = mtod(m, struct ip *);
1069                 IP6_EXTHDR_GET(th, struct tcphdr *, m, toff,
1070                         sizeof(struct tcphdr));
1071                 if (th == NULL) {
1072                         TCP_STATINC(TCP_STAT_RCVSHORT);
1073                         return;
1074                 }
1075                 /* We do the checksum after PCB lookup... */
1076                 len = ntohs(ip->ip_len);
1077                 tlen = len - toff;
1078                 iptos = ip->ip_tos;
1079                 break;
1080 #endif
1081 #ifdef INET6
1082         case 6:
1083                 ip = NULL;
1084                 iphlen = sizeof(struct ip6_hdr);
1085                 af = AF_INET6;
1086                 ip6 = mtod(m, struct ip6_hdr *);
1087                 IP6_EXTHDR_GET(th, struct tcphdr *, m, toff,
1088                         sizeof(struct tcphdr));
1089                 if (th == NULL) {
1090                         TCP_STATINC(TCP_STAT_RCVSHORT);
1091                         return;
1092                 }
1093
1094                 /* Be proactive about malicious use of IPv4 mapped address */
1095                 if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) ||
1096                     IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) {
1097                         /* XXX stat */
1098                         goto drop;
1099                 }
1100
1101                 /*
1102                  * Be proactive about unspecified IPv6 address in source.
1103                  * As we use all-zero to indicate unbounded/unconnected pcb,
1104                  * unspecified IPv6 address can be used to confuse us.
1105                  *
1106                  * Note that packets with unspecified IPv6 destination are
1107                  * already dropped in ip6_input.
1108                  */
1109                 if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
1110                         /* XXX stat */
1111                         goto drop;
1112                 }
1113
1114                 /*
1115                  * Make sure destination address is not multicast.
1116                  * Source address checked in ip6_input().
1117                  */
1118                 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
1119                         /* XXX stat */
1120                         goto drop;
1121                 }
1122
1123                 /* We do the checksum after PCB lookup... */
1124                 len = m->m_pkthdr.len;
1125                 tlen = len - toff;
1126                 iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
1127                 break;
1128 #endif
1129         default:
1130                 m_freem(m);
1131                 return;
1132         }
1133
1134         KASSERT(TCP_HDR_ALIGNED_P(th));
1135
1136         /*
1137          * Check that TCP offset makes sense,
1138          * pull out TCP options and adjust length.              XXX
1139          */
1140         off = th->th_off << 2;
1141         if (off < sizeof (struct tcphdr) || off > tlen) {
1142                 TCP_STATINC(TCP_STAT_RCVBADOFF);
1143                 goto drop;
1144         }
1145         tlen -= off;
1146
1147         /*
1148          * tcp_input() has been modified to use tlen to mean the TCP data
1149          * length throughout the function.  Other functions can use
1150          * m->m_pkthdr.len as the basis for calculating the TCP data length.
1151          * rja
1152          */
1153
1154         if (off > sizeof (struct tcphdr)) {
1155                 IP6_EXTHDR_GET(th, struct tcphdr *, m, toff, off);
1156                 if (th == NULL) {
1157                         TCP_STATINC(TCP_STAT_RCVSHORT);
1158                         return;
1159                 }
1160                 /*
1161                  * NOTE: ip/ip6 will not be affected by m_pulldown()
1162                  * (as they're before toff) and we don't need to update those.
1163                  */
1164                 KASSERT(TCP_HDR_ALIGNED_P(th));
1165                 optlen = off - sizeof (struct tcphdr);
1166                 optp = ((u_int8_t *)th) + sizeof(struct tcphdr);
1167                 /*
1168                  * Do quick retrieval of timestamp options ("options
1169                  * prediction?").  If timestamp is the only option and it's
1170                  * formatted as recommended in RFC 1323 appendix A, we
1171                  * quickly get the values now and not bother calling
1172                  * tcp_dooptions(), etc.
1173                  */
1174                 if ((optlen == TCPOLEN_TSTAMP_APPA ||
1175                      (optlen > TCPOLEN_TSTAMP_APPA &&
1176                         optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
1177                      *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
1178                      (th->th_flags & TH_SYN) == 0) {
1179                         opti.ts_present = 1;
1180                         opti.ts_val = ntohl(*(u_int32_t *)(optp + 4));
1181                         opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8));
1182                         optp = NULL;    /* we've parsed the options */
1183                 }
1184         }
1185         tiflags = th->th_flags;
1186
1187         /*
1188          * Locate pcb for segment.
1189          */
1190 findpcb:
1191         inp = NULL;
1192 #ifdef INET6
1193         in6p = NULL;
1194 #endif
1195         switch (af) {
1196 #ifdef INET
1197         case AF_INET:
1198                 inp = in_pcblookup_connect(&tcbtable, ip->ip_src, th->th_sport,
1199                     ip->ip_dst, th->th_dport);
1200                 if (inp == 0) {
1201                         TCP_STATINC(TCP_STAT_PCBHASHMISS);
1202                         inp = in_pcblookup_bind(&tcbtable, ip->ip_dst, th->th_dport);
1203                 }
1204 #ifdef INET6
1205                 if (inp == 0) {
1206                         struct in6_addr s, d;
1207
1208                         /* mapped addr case */
1209                         memset(&s, 0, sizeof(s));
1210                         s.s6_addr16[5] = htons(0xffff);
1211                         bcopy(&ip->ip_src, &s.s6_addr32[3], sizeof(ip->ip_src));
1212                         memset(&d, 0, sizeof(d));
1213                         d.s6_addr16[5] = htons(0xffff);
1214                         bcopy(&ip->ip_dst, &d.s6_addr32[3], sizeof(ip->ip_dst));
1215                         in6p = in6_pcblookup_connect(&tcbtable, &s,
1216                             th->th_sport, &d, th->th_dport, 0);
1217                         if (in6p == 0) {
1218                                 TCP_STATINC(TCP_STAT_PCBHASHMISS);
1219                                 in6p = in6_pcblookup_bind(&tcbtable, &d,
1220                                     th->th_dport, 0);
1221                         }
1222                 }
1223 #endif
1224 #ifndef INET6
1225                 if (inp == 0)
1226 #else
1227                 if (inp == 0 && in6p == 0)
1228 #endif
1229                 {
1230                         TCP_STATINC(TCP_STAT_NOPORT);
1231                         if (tcp_log_refused &&
1232                             (tiflags & (TH_RST|TH_ACK|TH_SYN)) == TH_SYN) {
1233                                 tcp4_log_refused(ip, th);
1234                         }
1235                         tcp_fields_to_host(th);
1236                         goto dropwithreset_ratelim;
1237                 }
1238 #if defined(IPSEC) || defined(FAST_IPSEC)
1239                 if (inp && (inp->inp_socket->so_options & SO_ACCEPTCONN) == 0 &&
1240                     ipsec4_in_reject(m, inp)) {
1241                         IPSEC_STATINC(IPSEC_STAT_IN_POLVIO);
1242                         goto drop;
1243                 }
1244 #ifdef INET6
1245                 else if (in6p &&
1246                     (in6p->in6p_socket->so_options & SO_ACCEPTCONN) == 0 &&
1247                     ipsec6_in_reject_so(m, in6p->in6p_socket)) {
1248                         IPSEC_STATINC(IPSEC_STAT_IN_POLVIO);
1249                         goto drop;
1250                 }
1251 #endif
1252 #endif /*IPSEC*/
1253                 break;
1254 #endif /*INET*/
1255 #ifdef INET6
1256         case AF_INET6:
1257             {
1258                 int faith;
1259
1260 #if defined(NFAITH) && NFAITH > 0
1261                 faith = faithprefix(&ip6->ip6_dst);
1262 #else
1263                 faith = 0;
1264 #endif
1265                 in6p = in6_pcblookup_connect(&tcbtable, &ip6->ip6_src,
1266                     th->th_sport, &ip6->ip6_dst, th->th_dport, faith);
1267                 if (in6p == NULL) {
1268                         TCP_STATINC(TCP_STAT_PCBHASHMISS);
1269                         in6p = in6_pcblookup_bind(&tcbtable, &ip6->ip6_dst,
1270                                 th->th_dport, faith);
1271                 }
1272                 if (in6p == NULL) {
1273                         TCP_STATINC(TCP_STAT_NOPORT);
1274                         if (tcp_log_refused &&
1275                             (tiflags & (TH_RST|TH_ACK|TH_SYN)) == TH_SYN) {
1276                                 tcp6_log_refused(ip6, th);
1277                         }
1278                         tcp_fields_to_host(th);
1279                         goto dropwithreset_ratelim;
1280                 }
1281 #if defined(IPSEC) || defined(FAST_IPSEC)
1282                 if ((in6p->in6p_socket->so_options & SO_ACCEPTCONN) == 0 &&
1283                     ipsec6_in_reject(m, in6p)) {
1284                         IPSEC6_STATINC(IPSEC_STAT_IN_POLVIO);
1285                         goto drop;
1286                 }
1287 #endif /*IPSEC*/
1288                 break;
1289             }
1290 #endif
1291         }
1292
1293         /*
1294          * If the state is CLOSED (i.e., TCB does not exist) then
1295          * all data in the incoming segment is discarded.
1296          * If the TCB exists but is in CLOSED state, it is embryonic,
1297          * but should either do a listen or a connect soon.
1298          */
1299         tp = NULL;
1300         so = NULL;
1301         if (inp) {
1302                 /* Check the minimum TTL for socket. */
1303                 if (ip->ip_ttl < inp->inp_ip_minttl)
1304                         goto drop;
1305
1306                 tp = intotcpcb(inp);
1307                 so = inp->inp_socket;
1308         }
1309 #ifdef INET6
1310         else if (in6p) {
1311                 tp = in6totcpcb(in6p);
1312                 so = in6p->in6p_socket;
1313         }
1314 #endif
1315         if (tp == 0) {
1316                 tcp_fields_to_host(th);
1317                 goto dropwithreset_ratelim;
1318         }
1319         if (tp->t_state == TCPS_CLOSED)
1320                 goto drop;
1321
1322         KASSERT(so->so_lock == softnet_lock);
1323         KASSERT(solocked(so));
1324
1325         /*
1326          * Checksum extended TCP header and data.
1327          */
1328         if (tcp_input_checksum(af, m, th, toff, off, tlen))
1329                 goto badcsum;
1330
1331         tcp_fields_to_host(th);
1332
1333         /* Unscale the window into a 32-bit value. */
1334         if ((tiflags & TH_SYN) == 0)
1335                 tiwin = th->th_win << tp->snd_scale;
1336         else
1337                 tiwin = th->th_win;
1338
1339 #ifdef INET6
1340         /* save packet options if user wanted */
1341         if (in6p && (in6p->in6p_flags & IN6P_CONTROLOPTS)) {
1342                 if (in6p->in6p_options) {
1343                         m_freem(in6p->in6p_options);
1344                         in6p->in6p_options = 0;
1345                 }
1346                 KASSERT(ip6 != NULL);
1347                 ip6_savecontrol(in6p, &in6p->in6p_options, ip6, m);
1348         }
1349 #endif
1350
1351         if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
1352                 union syn_cache_sa src;
1353                 union syn_cache_sa dst;
1354
1355                 memset(&src, 0, sizeof(src));
1356                 memset(&dst, 0, sizeof(dst));
1357                 switch (af) {
1358 #ifdef INET
1359                 case AF_INET:
1360                         src.sin.sin_len = sizeof(struct sockaddr_in);
1361                         src.sin.sin_family = AF_INET;
1362                         src.sin.sin_addr = ip->ip_src;
1363                         src.sin.sin_port = th->th_sport;
1364
1365                         dst.sin.sin_len = sizeof(struct sockaddr_in);
1366                         dst.sin.sin_family = AF_INET;
1367                         dst.sin.sin_addr = ip->ip_dst;
1368                         dst.sin.sin_port = th->th_dport;
1369                         break;
1370 #endif
1371 #ifdef INET6
1372                 case AF_INET6:
1373                         src.sin6.sin6_len = sizeof(struct sockaddr_in6);
1374                         src.sin6.sin6_family = AF_INET6;
1375                         src.sin6.sin6_addr = ip6->ip6_src;
1376                         src.sin6.sin6_port = th->th_sport;
1377
1378                         dst.sin6.sin6_len = sizeof(struct sockaddr_in6);
1379                         dst.sin6.sin6_family = AF_INET6;
1380                         dst.sin6.sin6_addr = ip6->ip6_dst;
1381                         dst.sin6.sin6_port = th->th_dport;
1382                         break;
1383 #endif /* INET6 */
1384                 default:
1385                         goto badsyn;    /*sanity*/
1386                 }
1387
1388                 if (so->so_options & SO_DEBUG) {
1389 #ifdef TCP_DEBUG
1390                         ostate = tp->t_state;
1391 #endif
1392
1393                         tcp_saveti = NULL;
1394                         if (iphlen + sizeof(struct tcphdr) > MHLEN)
1395                                 goto nosave;
1396
1397                         if (m->m_len > iphlen && (m->m_flags & M_EXT) == 0) {
1398                                 tcp_saveti = m_copym(m, 0, iphlen, M_DONTWAIT);
1399                                 if (!tcp_saveti)
1400                                         goto nosave;
1401                         } else {
1402                                 MGETHDR(tcp_saveti, M_DONTWAIT, MT_HEADER);
1403                                 if (!tcp_saveti)
1404                                         goto nosave;
1405                                 MCLAIM(m, &tcp_mowner);
1406                                 tcp_saveti->m_len = iphlen;
1407                                 m_copydata(m, 0, iphlen,
1408                                     mtod(tcp_saveti, void *));
1409                         }
1410
1411                         if (M_TRAILINGSPACE(tcp_saveti) < sizeof(struct tcphdr)) {
1412                                 m_freem(tcp_saveti);
1413                                 tcp_saveti = NULL;
1414                         } else {
1415                                 tcp_saveti->m_len += sizeof(struct tcphdr);
1416                                 memcpy(mtod(tcp_saveti, char *) + iphlen, th,
1417                                     sizeof(struct tcphdr));
1418                         }
1419         nosave:;
1420                 }
1421                 if (so->so_options & SO_ACCEPTCONN) {
1422                         if ((tiflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) {
1423                                 if (tiflags & TH_RST) {
1424                                         syn_cache_reset(&src.sa, &dst.sa, th);
1425                                 } else if ((tiflags & (TH_ACK|TH_SYN)) ==
1426                                     (TH_ACK|TH_SYN)) {
1427                                         /*
1428                                          * Received a SYN,ACK.  This should
1429                                          * never happen while we are in
1430                                          * LISTEN.  Send an RST.
1431                                          */
1432                                         goto badsyn;
1433                                 } else if (tiflags & TH_ACK) {
1434                                         so = syn_cache_get(&src.sa, &dst.sa,
1435                                                 th, toff, tlen, so, m);
1436                                         if (so == NULL) {
1437                                                 /*
1438                                                  * We don't have a SYN for
1439                                                  * this ACK; send an RST.
1440                                                  */
1441                                                 goto badsyn;
1442                                         } else if (so ==
1443                                             (struct socket *)(-1)) {
1444                                                 /*
1445                                                  * We were unable to create
1446                                                  * the connection.  If the
1447                                                  * 3-way handshake was
1448                                                  * completed, an RST has
1449                                                  * been sent to the peer.
1450                                                  * Since the mbuf might be
1451                                                  * in use for the reply,
1452                                                  * do not free it.
1453                                                  */
1454                                                 m = NULL;
1455                                         } else {
1456                                                 /*
1457                                                  * We have created a
1458                                                  * full-blown connection.
1459                                                  */
1460                                                 tp = NULL;
1461                                                 inp = NULL;
1462 #ifdef INET6
1463                                                 in6p = NULL;
1464 #endif
1465                                                 switch (so->so_proto->pr_domain->dom_family) {
1466 #ifdef INET
1467                                                 case AF_INET:
1468                                                         inp = sotoinpcb(so);
1469                                                         tp = intotcpcb(inp);
1470                                                         break;
1471 #endif
1472 #ifdef INET6
1473                                                 case AF_INET6:
1474                                                         in6p = sotoin6pcb(so);
1475                                                         tp = in6totcpcb(in6p);
1476                                                         break;
1477 #endif
1478                                                 }
1479                                                 if (tp == NULL)
1480                                                         goto badsyn;    /*XXX*/
1481                                                 tiwin <<= tp->snd_scale;
1482                                                 goto after_listen;
1483                                         }
1484                                 } else {
1485                                         /*
1486                                          * None of RST, SYN or ACK was set.
1487                                          * This is an invalid packet for a
1488                                          * TCB in LISTEN state.  Send a RST.
1489                                          */
1490                                         goto badsyn;
1491                                 }
1492                         } else {
1493                                 /*
1494                                  * Received a SYN.
1495                                  *
1496                                  * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
1497                                  */
1498                                 if (m->m_flags & (M_BCAST|M_MCAST))
1499                                         goto drop;
1500
1501                                 switch (af) {
1502 #ifdef INET6
1503                                 case AF_INET6:
1504                                         if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst))
1505                                                 goto drop;
1506                                         break;
1507 #endif /* INET6 */
1508                                 case AF_INET:
1509                                         if (IN_MULTICAST(ip->ip_dst.s_addr) ||
1510                                             in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
1511                                                 goto drop;
1512                                 break;
1513                                 }
1514
1515 #ifdef INET6
1516                                 /*
1517                                  * If deprecated address is forbidden, we do
1518                                  * not accept SYN to deprecated interface
1519                                  * address to prevent any new inbound
1520                                  * connection from getting established.
1521                                  * When we do not accept SYN, we send a TCP
1522                                  * RST, with deprecated source address (instead
1523                                  * of dropping it).  We compromise it as it is
1524                                  * much better for peer to send a RST, and
1525                                  * RST will be the final packet for the
1526                                  * exchange.
1527                                  *
1528                                  * If we do not forbid deprecated addresses, we
1529                                  * accept the SYN packet.  RFC2462 does not
1530                                  * suggest dropping SYN in this case.
1531                                  * If we decipher RFC2462 5.5.4, it says like
1532                                  * this:
1533                                  * 1. use of deprecated addr with existing
1534                                  *    communication is okay - "SHOULD continue
1535                                  *    to be used"
1536                                  * 2. use of it with new communication:
1537                                  *   (2a) "SHOULD NOT be used if alternate
1538                                  *        address with sufficient scope is
1539                                  *        available"
1540                                  *   (2b) nothing mentioned otherwise.
1541                                  * Here we fall into (2b) case as we have no
1542                                  * choice in our source address selection - we
1543                                  * must obey the peer.
1544                                  *
1545                                  * The wording in RFC2462 is confusing, and
1546                                  * there are multiple descriptions of
1547                                  * deprecated address handling - worse, they
1548                                  * are not exactly the same.  I believe 5.5.4
1549                                  * is the best one, so we follow 5.5.4.
1550                                  */
1551                                 if (af == AF_INET6 && !ip6_use_deprecated) {
1552                                         struct in6_ifaddr *ia6;
1553                                         if ((ia6 = in6ifa_ifpwithaddr(m->m_pkthdr.rcvif,
1554                                             &ip6->ip6_dst)) &&
1555                                             (ia6->ia6_flags & IN6_IFF_DEPRECATED)) {
1556                                                 tp = NULL;
1557                                                 goto dropwithreset;
1558                                         }
1559                                 }
1560 #endif
1561
1562 #if defined(IPSEC) || defined(FAST_IPSEC)
1563                                 switch (af) {
1564 #ifdef INET
1565                                 case AF_INET:
1566                                         if (ipsec4_in_reject_so(m, so)) {
1567                                                 IPSEC_STATINC(IPSEC_STAT_IN_POLVIO);
1568                                                 tp = NULL;
1569                                                 goto dropwithreset;
1570                                         }
1571                                         break;
1572 #endif
1573 #ifdef INET6
1574                                 case AF_INET6:
1575                                         if (ipsec6_in_reject_so(m, so)) {
1576                                                 IPSEC6_STATINC(IPSEC_STAT_IN_POLVIO);
1577                                                 tp = NULL;
1578                                                 goto dropwithreset;
1579                                         }
1580                                         break;
1581 #endif /*INET6*/
1582                                 }
1583 #endif /*IPSEC*/
1584
1585                                 /*
1586                                  * LISTEN socket received a SYN
1587                                  * from itself?  This can't possibly
1588                                  * be valid; drop the packet.
1589                                  */
1590                                 if (th->th_sport == th->th_dport) {
1591                                         int i;
1592
1593                                         switch (af) {
1594 #ifdef INET
1595                                         case AF_INET:
1596                                                 i = in_hosteq(ip->ip_src, ip->ip_dst);
1597                                                 break;
1598 #endif
1599 #ifdef INET6
1600                                         case AF_INET6:
1601                                                 i = IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, &ip6->ip6_dst);
1602                                                 break;
1603 #endif
1604                                         default:
1605                                                 i = 1;
1606                                         }
1607                                         if (i) {
1608                                                 TCP_STATINC(TCP_STAT_BADSYN);
1609                                                 goto drop;
1610                                         }
1611                                 }
1612
1613                                 /*
1614                                  * SYN looks ok; create compressed TCP
1615                                  * state for it.
1616                                  */
1617                                 if (so->so_qlen <= so->so_qlimit &&
1618                                     syn_cache_add(&src.sa, &dst.sa, th, tlen,
1619                                                 so, m, optp, optlen, &opti))
1620                                         m = NULL;
1621                         }
1622                         goto drop;
1623                 }
1624         }
1625
1626 after_listen:
1627 #ifdef DIAGNOSTIC
1628         /*
1629          * Should not happen now that all embryonic connections
1630          * are handled with compressed state.
1631          */
1632         if (tp->t_state == TCPS_LISTEN)
1633                 panic("tcp_input: TCPS_LISTEN");
1634 #endif
1635
1636         /*
1637          * Segment received on connection.
1638          * Reset idle time and keep-alive timer.
1639          */
1640         tp->t_rcvtime = tcp_now;
1641         if (TCPS_HAVEESTABLISHED(tp->t_state))
1642                 TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepidle);
1643
1644         /*
1645          * Process options.
1646          */
1647 #ifdef TCP_SIGNATURE
1648         if (optp || (tp->t_flags & TF_SIGNATURE))
1649 #else
1650         if (optp)
1651 #endif
1652                 if (tcp_dooptions(tp, optp, optlen, th, m, toff, &opti) < 0)
1653                         goto drop;
1654
1655         if (TCP_SACK_ENABLED(tp)) {
1656                 tcp_del_sackholes(tp, th);
1657         }
1658
1659         if (TCP_ECN_ALLOWED(tp)) {
1660                 switch (iptos & IPTOS_ECN_MASK) {
1661                 case IPTOS_ECN_CE:
1662                         tp->t_flags |= TF_ECN_SND_ECE;
1663                         TCP_STATINC(TCP_STAT_ECN_CE);
1664                         break;
1665                 case IPTOS_ECN_ECT0:
1666                         TCP_STATINC(TCP_STAT_ECN_ECT);
1667                         break;
1668                 case IPTOS_ECN_ECT1:
1669                         /* XXX: ignore for now -- rpaulo */
1670                         break;
1671                 }
1672
1673                 if (tiflags & TH_CWR)
1674                         tp->t_flags &= ~TF_ECN_SND_ECE;
1675
1676                 /*
1677                  * Congestion experienced.
1678                  * Ignore if we are already trying to recover.
1679                  */
1680                 if ((tiflags & TH_ECE) && SEQ_GEQ(tp->snd_una, tp->snd_recover))
1681                         tp->t_congctl->cong_exp(tp);
1682         }
1683
1684         if (opti.ts_present && opti.ts_ecr) {
1685                 /*
1686                  * Calculate the RTT from the returned time stamp and the
1687                  * connection's time base.  If the time stamp is later than
1688                  * the current time, or is extremely old, fall back to non-1323
1689                  * RTT calculation.  Since ts_ecr is unsigned, we can test both
1690                  * at the same time.
1691                  */
1692                 ts_rtt = TCP_TIMESTAMP(tp) - opti.ts_ecr + 1;
1693                 if (ts_rtt > TCP_PAWS_IDLE)
1694                         ts_rtt = 0;
1695         } else {
1696                 ts_rtt = 0;
1697         }
1698
1699         /*
1700          * Header prediction: check for the two common cases
1701          * of a uni-directional data xfer.  If the packet has
1702          * no control flags, is in-sequence, the window didn't
1703          * change and we're not retransmitting, it's a
1704          * candidate.  If the length is zero and the ack moved
1705          * forward, we're the sender side of the xfer.  Just
1706          * free the data acked & wake any higher level process
1707          * that was blocked waiting for space.  If the length
1708          * is non-zero and the ack didn't move, we're the
1709          * receiver side.  If we're getting packets in-order
1710          * (the reassembly queue is empty), add the data to
1711          * the socket buffer and note that we need a delayed ack.
1712          */
1713         if (tp->t_state == TCPS_ESTABLISHED &&
1714             (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ECE|TH_CWR|TH_ACK))
1715                 == TH_ACK &&
1716             (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) &&
1717             th->th_seq == tp->rcv_nxt &&
1718             tiwin && tiwin == tp->snd_wnd &&
1719             tp->snd_nxt == tp->snd_max) {
1720
1721                 /*
1722                  * If last ACK falls within this segment's sequence numbers,
1723                  * record the timestamp.
1724                  * NOTE that the test is modified according to the latest
1725                  * proposal of the tcplw@cray.com list (Braden 1993/04/26).
1726                  *
1727                  * note that we already know
1728                  *      TSTMP_GEQ(opti.ts_val, tp->ts_recent)
1729                  */
1730                 if (opti.ts_present &&
1731                     SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
1732                         tp->ts_recent_age = tcp_now;
1733                         tp->ts_recent = opti.ts_val;
1734                 }
1735
1736                 if (tlen == 0) {
1737                         /* Ack prediction. */
1738                         if (SEQ_GT(th->th_ack, tp->snd_una) &&
1739                             SEQ_LEQ(th->th_ack, tp->snd_max) &&
1740                             tp->snd_cwnd >= tp->snd_wnd &&
1741                             tp->t_partialacks < 0) {
1742                                 /*
1743                                  * this is a pure ack for outstanding data.
1744                                  */
1745                                 if (ts_rtt)
1746                                         tcp_xmit_timer(tp, ts_rtt);
1747                                 else if (tp->t_rtttime &&
1748                                     SEQ_GT(th->th_ack, tp->t_rtseq))
1749                                         tcp_xmit_timer(tp,
1750                                           tcp_now - tp->t_rtttime);
1751                                 acked = th->th_ack - tp->snd_una;
1752                                 tcps = TCP_STAT_GETREF();
1753                                 tcps[TCP_STAT_PREDACK]++;
1754                                 tcps[TCP_STAT_RCVACKPACK]++;
1755                                 tcps[TCP_STAT_RCVACKBYTE] += acked;
1756                                 TCP_STAT_PUTREF();
1757                                 nd6_hint(tp);
1758
1759                                 if (acked > (tp->t_lastoff - tp->t_inoff))
1760                                         tp->t_lastm = NULL;
1761                                 sbdrop(&so->so_snd, acked);
1762                                 tp->t_lastoff -= acked;
1763
1764                                 icmp_check(tp, th, acked);
1765
1766                                 tp->snd_una = th->th_ack;
1767                                 tp->snd_fack = tp->snd_una;
1768                                 if (SEQ_LT(tp->snd_high, tp->snd_una))
1769                                         tp->snd_high = tp->snd_una;
1770                                 m_freem(m);
1771
1772                                 /*
1773                                  * If all outstanding data are acked, stop
1774                                  * retransmit timer, otherwise restart timer
1775                                  * using current (possibly backed-off) value.
1776                                  * If process is waiting for space,
1777                                  * wakeup/selnotify/signal.  If data
1778                                  * are ready to send, let tcp_output
1779                                  * decide between more output or persist.
1780                                  */
1781                                 if (tp->snd_una == tp->snd_max)
1782                                         TCP_TIMER_DISARM(tp, TCPT_REXMT);
1783                                 else if (TCP_TIMER_ISARMED(tp,
1784                                     TCPT_PERSIST) == 0)
1785                                         TCP_TIMER_ARM(tp, TCPT_REXMT,
1786                                             tp->t_rxtcur);
1787
1788                                 sowwakeup(so);
1789                                 if (so->so_snd.sb_cc)
1790                                         (void) tcp_output(tp);
1791                                 if (tcp_saveti)
1792                                         m_freem(tcp_saveti);
1793                                 return;
1794                         }
1795                 } else if (th->th_ack == tp->snd_una &&
1796                     TAILQ_FIRST(&tp->segq) == NULL &&
1797                     tlen <= sbspace(&so->so_rcv)) {
1798                         int newsize = 0;        /* automatic sockbuf scaling */
1799
1800                         /*
1801                          * this is a pure, in-sequence data packet
1802                          * with nothing on the reassembly queue and
1803                          * we have enough buffer space to take it.
1804                          */
1805                         tp->rcv_nxt += tlen;
1806                         tcps = TCP_STAT_GETREF();
1807                         tcps[TCP_STAT_PREDDAT]++;
1808                         tcps[TCP_STAT_RCVPACK]++;
1809                         tcps[TCP_STAT_RCVBYTE] += tlen;
1810                         TCP_STAT_PUTREF();
1811                         nd6_hint(tp);
1812
1813                 /*
1814                  * Automatic sizing enables the performance of large buffers
1815                  * and most of the efficiency of small ones by only allocating
1816                  * space when it is needed.
1817                  *
1818                  * On the receive side the socket buffer memory is only rarely
1819                  * used to any significant extent.  This allows us to be much
1820                  * more aggressive in scaling the receive socket buffer.  For
1821                  * the case that the buffer space is actually used to a large
1822                  * extent and we run out of kernel memory we can simply drop
1823                  * the new segments; TCP on the sender will just retransmit it
1824                  * later.  Setting the buffer size too big may only consume too
1825                  * much kernel memory if the application doesn't read() from
1826                  * the socket or packet loss or reordering makes use of the
1827                  * reassembly queue.
1828                  *
1829                  * The criteria to step up the receive buffer one notch are:
1830                  *  1. the number of bytes received during the time it takes
1831                  *     one timestamp to be reflected back to us (the RTT);
1832                  *  2. received bytes per RTT exceed seven eighths of the
1833                  *     current socket buffer size;
1834                  *  3. receive buffer size has not hit maximal automatic size;
1835                  *
1836                  * This algorithm does one step per RTT at most and only if
1837                  * we receive a bulk stream w/o packet losses or reorderings.
1838                  * Shrinking the buffer during idle times is not necessary as
1839                  * it doesn't consume any memory when idle.
1840                  *
1841                  * TODO: Only step up if the application is actually serving
1842                  * the buffer to better manage the socket buffer resources.
1843                  */
1844                         if (tcp_do_autorcvbuf &&
1845                             opti.ts_ecr &&
1846                             (so->so_rcv.sb_flags & SB_AUTOSIZE)) {
1847                                 if (opti.ts_ecr > tp->rfbuf_ts &&
1848                                     opti.ts_ecr - tp->rfbuf_ts < PR_SLOWHZ) {
1849                                         if (tp->rfbuf_cnt >
1850                                             (so->so_rcv.sb_hiwat / 8 * 7) &&
1851                                             so->so_rcv.sb_hiwat <
1852                                             tcp_autorcvbuf_max) {
1853                                                 newsize =
1854                                                     min(so->so_rcv.sb_hiwat +
1855                                                     tcp_autorcvbuf_inc,
1856                                                     tcp_autorcvbuf_max);
1857                                         }
1858                                         /* Start over with next RTT. */
1859                                         tp->rfbuf_ts = 0;
1860                                         tp->rfbuf_cnt = 0;
1861                                 } else
1862                                         tp->rfbuf_cnt += tlen;  /* add up */
1863                         }
1864
1865                         /*
1866                          * Drop TCP, IP headers and TCP options then add data
1867                          * to socket buffer.
1868                          */
1869                         if (so->so_state & SS_CANTRCVMORE)
1870                                 m_freem(m);
1871                         else {
1872                                 /*
1873                                  * Set new socket buffer size.
1874                                  * Give up when limit is reached.
1875                                  */
1876                                 if (newsize)
1877                                         if (!sbreserve(&so->so_rcv,
1878                                             newsize, so))
1879                                                 so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
1880                                 m_adj(m, toff + off);
1881                                 sbappendstream(&so->so_rcv, m);
1882                         }
1883                         sorwakeup(so);
1884                         tcp_setup_ack(tp, th);
1885                         if (tp->t_flags & TF_ACKNOW)
1886                                 (void) tcp_output(tp);
1887                         if (tcp_saveti)
1888                                 m_freem(tcp_saveti);
1889                         return;
1890                 }
1891         }
1892
1893         /*
1894          * Compute mbuf offset to TCP data segment.
1895          */
1896         hdroptlen = toff + off;
1897
1898         /*
1899          * Calculate amount of space in receive window,
1900          * and then do TCP input processing.
1901          * Receive window is amount of space in rcv queue,
1902          * but not less than advertised window.
1903          */
1904         { int win;
1905
1906         win = sbspace(&so->so_rcv);
1907         if (win < 0)
1908                 win = 0;
1909         tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
1910         }
1911
1912         /* Reset receive buffer auto scaling when not in bulk receive mode. */
1913         tp->rfbuf_ts = 0;
1914         tp->rfbuf_cnt = 0;
1915
1916         switch (tp->t_state) {
1917         /*
1918          * If the state is SYN_SENT:
1919          *      if seg contains an ACK, but not for our SYN, drop the input.
1920          *      if seg contains a RST, then drop the connection.
1921          *      if seg does not contain SYN, then drop it.
1922          * Otherwise this is an acceptable SYN segment
1923          *      initialize tp->rcv_nxt and tp->irs
1924          *      if seg contains ack then advance tp->snd_una
1925          *      if seg contains an ECE and ECN support is enabled, the stream
1926          *          is ECN capable.
1927          *      if SYN has been acked change to ESTABLISHED else SYN_RCVD state
1928          *      arrange for segment to be acked (eventually)
1929          *      continue processing rest of data/controls, beginning with URG
1930          */
1931         case TCPS_SYN_SENT:
1932                 if ((tiflags & TH_ACK) &&
1933                     (SEQ_LEQ(th->th_ack, tp->iss) ||
1934                      SEQ_GT(th->th_ack, tp->snd_max)))
1935                         goto dropwithreset;
1936                 if (tiflags & TH_RST) {
1937                         if (tiflags & TH_ACK)
1938                                 tp = tcp_drop(tp, ECONNREFUSED);
1939                         goto drop;
1940                 }
1941                 if ((tiflags & TH_SYN) == 0)
1942                         goto drop;
1943                 if (tiflags & TH_ACK) {
1944                         tp->snd_una = th->th_ack;
1945                         if (SEQ_LT(tp->snd_nxt, tp->snd_una))
1946                                 tp->snd_nxt = tp->snd_una;
1947                         if (SEQ_LT(tp->snd_high, tp->snd_una))
1948                                 tp->snd_high = tp->snd_una;
1949                         TCP_TIMER_DISARM(tp, TCPT_REXMT);
1950
1951                         if ((tiflags & TH_ECE) && tcp_do_ecn) {
1952                                 tp->t_flags |= TF_ECN_PERMIT;
1953                                 TCP_STATINC(TCP_STAT_ECN_SHS);
1954                         }
1955
1956                 }
1957                 tp->irs = th->th_seq;
1958                 tcp_rcvseqinit(tp);
1959                 tp->t_flags |= TF_ACKNOW;
1960                 tcp_mss_from_peer(tp, opti.maxseg);
1961
1962                 /*
1963                  * Initialize the initial congestion window.  If we
1964                  * had to retransmit the SYN, we must initialize cwnd
1965                  * to 1 segment (i.e. the Loss Window).
1966                  */
1967                 if (tp->t_flags & TF_SYN_REXMT)
1968                         tp->snd_cwnd = tp->t_peermss;
1969                 else {
1970                         int ss = tcp_init_win;
1971 #ifdef INET
1972                         if (inp != NULL && in_localaddr(inp->inp_faddr))
1973                                 ss = tcp_init_win_local;
1974 #endif
1975 #ifdef INET6
1976                         if (in6p != NULL && in6_localaddr(&in6p->in6p_faddr))
1977                                 ss = tcp_init_win_local;
1978 #endif
1979                         tp->snd_cwnd = TCP_INITIAL_WINDOW(ss, tp->t_peermss);
1980                 }
1981
1982                 tcp_rmx_rtt(tp);
1983                 if (tiflags & TH_ACK) {
1984                         TCP_STATINC(TCP_STAT_CONNECTS);
1985                         soisconnected(so);
1986                         tcp_established(tp);
1987                         /* Do window scaling on this connection? */
1988                         if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
1989                             (TF_RCVD_SCALE|TF_REQ_SCALE)) {
1990                                 tp->snd_scale = tp->requested_s_scale;
1991                                 tp->rcv_scale = tp->request_r_scale;
1992                         }
1993                         TCP_REASS_LOCK(tp);
1994                         (void) tcp_reass(tp, NULL, (struct mbuf *)0, &tlen);
1995                         TCP_REASS_UNLOCK(tp);
1996                         /*
1997                          * if we didn't have to retransmit the SYN,
1998                          * use its rtt as our initial srtt & rtt var.
1999                          */
2000                         if (tp->t_rtttime)
2001                                 tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
2002                 } else
2003                         tp->t_state = TCPS_SYN_RECEIVED;
2004
2005                 /*
2006                  * Advance th->th_seq to correspond to first data byte.
2007                  * If data, trim to stay within window,
2008                  * dropping FIN if necessary.
2009                  */
2010                 th->th_seq++;
2011                 if (tlen > tp->rcv_wnd) {
2012                         todrop = tlen - tp->rcv_wnd;
2013                         m_adj(m, -todrop);
2014                         tlen = tp->rcv_wnd;
2015                         tiflags &= ~TH_FIN;
2016                         tcps = TCP_STAT_GETREF();
2017                         tcps[TCP_STAT_RCVPACKAFTERWIN]++;
2018                         tcps[TCP_STAT_RCVBYTEAFTERWIN] += todrop;
2019                         TCP_STAT_PUTREF();
2020                 }
2021                 tp->snd_wl1 = th->th_seq - 1;
2022                 tp->rcv_up = th->th_seq;
2023                 goto step6;
2024
2025         /*
2026          * If the state is SYN_RECEIVED:
2027          *      If seg contains an ACK, but not for our SYN, drop the input
2028          *      and generate an RST.  See page 36, rfc793
2029          */
2030         case TCPS_SYN_RECEIVED:
2031                 if ((tiflags & TH_ACK) &&
2032                     (SEQ_LEQ(th->th_ack, tp->iss) ||
2033                      SEQ_GT(th->th_ack, tp->snd_max)))
2034                         goto dropwithreset;
2035                 break;
2036         }
2037
2038         /*
2039          * States other than LISTEN or SYN_SENT.
2040          * First check timestamp, if present.
2041          * Then check that at least some bytes of segment are within
2042          * receive window.  If segment begins before rcv_nxt,
2043          * drop leading data (and SYN); if nothing left, just ack.
2044          *
2045          * RFC 1323 PAWS: If this segment carries a timestamp whose value
2046          * (ts_val) is less than ts_recent, drop it.
2047          */
2048         if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent &&
2049             TSTMP_LT(opti.ts_val, tp->ts_recent)) {
2050
2051                 /* Check to see if ts_recent is over 24 days old.  */
2052                 if (tcp_now - tp->ts_recent_age > TCP_PAWS_IDLE) {
2053                         /*
2054                          * Invalidate ts_recent.  If this segment updates
2055                          * ts_recent, the age will be reset later and ts_recent
2056                          * will get a valid value.  If it does not, setting
2057                          * ts_recent to zero will at least satisfy the
2058                          * requirement that zero be placed in the timestamp
2059                          * echo reply when ts_recent isn't valid.  The
2060                          * age isn't reset until we get a valid ts_recent
2061                          * because we don't want out-of-order segments to be
2062                          * dropped when ts_recent is old.
2063                          */
2064                         tp->ts_recent = 0;
2065                 } else {
2066                         tcps = TCP_STAT_GETREF();
2067                         tcps[TCP_STAT_RCVDUPPACK]++;
2068                         tcps[TCP_STAT_RCVDUPBYTE] += tlen;
2069                         tcps[TCP_STAT_PAWSDROP]++;
2070                         TCP_STAT_PUTREF();
2071                         tcp_new_dsack(tp, th->th_seq, tlen);
2072                         goto dropafterack;
2073                 }
2074         }
2075
2076         todrop = tp->rcv_nxt - th->th_seq;
2077         dupseg = false;
2078         if (todrop > 0) {
2079                 if (tiflags & TH_SYN) {
2080                         tiflags &= ~TH_SYN;
2081                         th->th_seq++;
2082                         if (th->th_urp > 1)
2083                                 th->th_urp--;
2084                         else {
2085                                 tiflags &= ~TH_URG;
2086                                 th->th_urp = 0;
2087                         }
2088                         todrop--;
2089                 }
2090                 if (todrop > tlen ||
2091                     (todrop == tlen && (tiflags & TH_FIN) == 0)) {
2092                         /*
2093                          * Any valid FIN or RST must be to the left of the
2094                          * window.  At this point the FIN or RST must be a
2095                          * duplicate or out of sequence; drop it.
2096                          */
2097                         if (tiflags & TH_RST)
2098                                 goto drop;
2099                         tiflags &= ~(TH_FIN|TH_RST);
2100                         /*
2101                          * Send an ACK to resynchronize and drop any data.
2102                          * But keep on processing for RST or ACK.
2103                          */
2104                         tp->t_flags |= TF_ACKNOW;
2105                         todrop = tlen;
2106                         dupseg = true;
2107                         tcps = TCP_STAT_GETREF();
2108                         tcps[TCP_STAT_RCVDUPPACK]++;
2109                         tcps[TCP_STAT_RCVDUPBYTE] += todrop;
2110                         TCP_STAT_PUTREF();
2111                 } else if ((tiflags & TH_RST) &&
2112                            th->th_seq != tp->rcv_nxt) {
2113                         /*
2114                          * Test for reset before adjusting the sequence
2115                          * number for overlapping data.
2116                          */
2117                         goto dropafterack_ratelim;
2118                 } else {
2119                         tcps = TCP_STAT_GETREF();
2120                         tcps[TCP_STAT_RCVPARTDUPPACK]++;
2121                         tcps[TCP_STAT_RCVPARTDUPBYTE] += todrop;
2122                         TCP_STAT_PUTREF();
2123                 }
2124                 tcp_new_dsack(tp, th->th_seq, todrop);
2125                 hdroptlen += todrop;    /*drop from head afterwards*/
2126                 th->th_seq += todrop;
2127                 tlen -= todrop;
2128                 if (th->th_urp > todrop)
2129                         th->th_urp -= todrop;
2130                 else {
2131                         tiflags &= ~TH_URG;
2132                         th->th_urp = 0;
2133                 }
2134         }
2135
2136         /*
2137          * If new data are received on a connection after the
2138          * user processes are gone, then RST the other end.
2139          */
2140         if ((so->so_state & SS_NOFDREF) &&
2141             tp->t_state > TCPS_CLOSE_WAIT && tlen) {
2142                 tp = tcp_close(tp);
2143                 TCP_STATINC(TCP_STAT_RCVAFTERCLOSE);
2144                 goto dropwithreset;
2145         }
2146
2147         /*
2148          * If segment ends after window, drop trailing data
2149          * (and PUSH and FIN); if nothing left, just ACK.
2150          */
2151         todrop = (th->th_seq + tlen) - (tp->rcv_nxt+tp->rcv_wnd);
2152         if (todrop > 0) {
2153                 TCP_STATINC(TCP_STAT_RCVPACKAFTERWIN);
2154                 if (todrop >= tlen) {
2155                         /*
2156                          * The segment actually starts after the window.
2157                          * th->th_seq + tlen - tp->rcv_nxt - tp->rcv_wnd >= tlen
2158                          * th->th_seq - tp->rcv_nxt - tp->rcv_wnd >= 0
2159                          * th->th_seq >= tp->rcv_nxt + tp->rcv_wnd
2160                          */
2161                         TCP_STATADD(TCP_STAT_RCVBYTEAFTERWIN, tlen);
2162                         /*
2163                          * If a new connection request is received
2164                          * while in TIME_WAIT, drop the old connection
2165                          * and start over if the sequence numbers
2166                          * are above the previous ones.
2167                          *
2168                          * NOTE: We will checksum the packet again, and
2169                          * so we need to put the header fields back into
2170                          * network order!
2171                          * XXX This kind of sucks, but we don't expect
2172                          * XXX this to happen very often, so maybe it
2173                          * XXX doesn't matter so much.
2174                          */
2175                         if (tiflags & TH_SYN &&
2176                             tp->t_state == TCPS_TIME_WAIT &&
2177                             SEQ_GT(th->th_seq, tp->rcv_nxt)) {
2178                                 tp = tcp_close(tp);
2179                                 tcp_fields_to_net(th);
2180                                 goto findpcb;
2181                         }
2182                         /*
2183                          * If window is closed can only take segments at
2184                          * window edge, and have to drop data and PUSH from
2185                          * incoming segments.  Continue processing, but
2186                          * remember to ack.  Otherwise, drop segment
2187                          * and (if not RST) ack.
2188                          */
2189                         if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
2190                                 tp->t_flags |= TF_ACKNOW;
2191                                 TCP_STATINC(TCP_STAT_RCVWINPROBE);
2192                         } else
2193                                 goto dropafterack;
2194                 } else
2195                         TCP_STATADD(TCP_STAT_RCVBYTEAFTERWIN, todrop);
2196                 m_adj(m, -todrop);
2197                 tlen -= todrop;
2198                 tiflags &= ~(TH_PUSH|TH_FIN);
2199         }
2200
2201         /*
2202          * If last ACK falls within this segment's sequence numbers,
2203          *  record the timestamp.
2204          * NOTE:
2205          * 1) That the test incorporates suggestions from the latest
2206          *    proposal of the tcplw@cray.com list (Braden 1993/04/26).
2207          * 2) That updating only on newer timestamps interferes with
2208          *    our earlier PAWS tests, so this check should be solely
2209          *    predicated on the sequence space of this segment.
2210          * 3) That we modify the segment boundary check to be
2211          *        Last.ACK.Sent <= SEG.SEQ + SEG.Len
2212          *    instead of RFC1323's
2213          *        Last.ACK.Sent < SEG.SEQ + SEG.Len,
2214          *    This modified check allows us to overcome RFC1323's
2215          *    limitations as described in Stevens TCP/IP Illustrated
2216          *    Vol. 2 p.869. In such cases, we can still calculate the
2217          *    RTT correctly when RCV.NXT == Last.ACK.Sent.
2218          */
2219         if (opti.ts_present &&
2220             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
2221             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
2222                     ((tiflags & (TH_SYN|TH_FIN)) != 0))) {
2223                 tp->ts_recent_age = tcp_now;
2224                 tp->ts_recent = opti.ts_val;
2225         }
2226
2227         /*
2228          * If the RST bit is set examine the state:
2229          *    SYN_RECEIVED STATE:
2230          *      If passive open, return to LISTEN state.
2231          *      If active open, inform user that connection was refused.
2232          *    ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
2233          *      Inform user that connection was reset, and close tcb.
2234          *    CLOSING, LAST_ACK, TIME_WAIT STATES
2235          *      Close the tcb.
2236          */
2237         if (tiflags & TH_RST) {
2238                 if (th->th_seq != tp->rcv_nxt)
2239                         goto dropafterack_ratelim;
2240
2241                 switch (tp->t_state) {
2242                 case TCPS_SYN_RECEIVED:
2243                         so->so_error = ECONNREFUSED;
2244                         goto close;
2245
2246                 case TCPS_ESTABLISHED:
2247                 case TCPS_FIN_WAIT_1:
2248                 case TCPS_FIN_WAIT_2:
2249                 case TCPS_CLOSE_WAIT:
2250                         so->so_error = ECONNRESET;
2251                 close:
2252                         tp->t_state = TCPS_CLOSED;
2253                         TCP_STATINC(TCP_STAT_DROPS);
2254                         tp = tcp_close(tp);
2255                         goto drop;
2256
2257                 case TCPS_CLOSING:
2258                 case TCPS_LAST_ACK:
2259                 case TCPS_TIME_WAIT:
2260                         tp = tcp_close(tp);
2261                         goto drop;
2262                 }
2263         }
2264
2265         /*
2266          * Since we've covered the SYN-SENT and SYN-RECEIVED states above
2267          * we must be in a synchronized state.  RFC793 states (under RST
2268          * generation) that any unacceptable segment (an out-of-order SYN
2269          * qualifies) received in a synchronized state must elicit only an
2270          * empty acknowledgment segment ... and the connection remains in
2271          * the same state.
2272          */
2273         if (tiflags & TH_SYN) {
2274                 if (tp->rcv_nxt == th->th_seq) {
2275                         tcp_respond(tp, m, m, th, (tcp_seq)0, th->th_ack - 1,
2276                             TH_ACK);
2277                         if (tcp_saveti)
2278                                 m_freem(tcp_saveti);
2279                         return;
2280                 }
2281
2282                 goto dropafterack_ratelim;
2283         }
2284
2285         /*
2286          * If the ACK bit is off we drop the segment and return.
2287          */
2288         if ((tiflags & TH_ACK) == 0) {
2289                 if (tp->t_flags & TF_ACKNOW)
2290                         goto dropafterack;
2291                 else
2292                         goto drop;
2293         }
2294
2295         /*
2296          * Ack processing.
2297          */
2298         switch (tp->t_state) {
2299
2300         /*
2301          * In SYN_RECEIVED state if the ack ACKs our SYN then enter
2302          * ESTABLISHED state and continue processing, otherwise
2303          * send an RST.
2304          */
2305         case TCPS_SYN_RECEIVED:
2306                 if (SEQ_GT(tp->snd_una, th->th_ack) ||
2307                     SEQ_GT(th->th_ack, tp->snd_max))
2308                         goto dropwithreset;
2309                 TCP_STATINC(TCP_STAT_CONNECTS);
2310                 soisconnected(so);
2311                 tcp_established(tp);
2312                 /* Do window scaling? */
2313                 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
2314                     (TF_RCVD_SCALE|TF_REQ_SCALE)) {
2315                         tp->snd_scale = tp->requested_s_scale;
2316                         tp->rcv_scale = tp->request_r_scale;
2317                 }
2318                 TCP_REASS_LOCK(tp);
2319                 (void) tcp_reass(tp, NULL, (struct mbuf *)0, &tlen);
2320                 TCP_REASS_UNLOCK(tp);
2321                 tp->snd_wl1 = th->th_seq - 1;
2322                 /* fall into ... */
2323
2324         /*
2325          * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
2326          * ACKs.  If the ack is in the range
2327          *      tp->snd_una < th->th_ack <= tp->snd_max
2328          * then advance tp->snd_una to th->th_ack and drop
2329          * data from the retransmission queue.  If this ACK reflects
2330          * more up to date window information we update our window information.
2331          */
2332         case TCPS_ESTABLISHED:
2333         case TCPS_FIN_WAIT_1:
2334         case TCPS_FIN_WAIT_2:
2335         case TCPS_CLOSE_WAIT:
2336         case TCPS_CLOSING:
2337         case TCPS_LAST_ACK:
2338         case TCPS_TIME_WAIT:
2339
2340                 if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
2341                         if (tlen == 0 && !dupseg && tiwin == tp->snd_wnd) {
2342                                 TCP_STATINC(TCP_STAT_RCVDUPPACK);
2343                                 /*
2344                                  * If we have outstanding data (other than
2345                                  * a window probe), this is a completely
2346                                  * duplicate ack (ie, window info didn't
2347                                  * change), the ack is the biggest we've
2348                                  * seen and we've seen exactly our rexmt
2349                                  * threshold of them, assume a packet
2350                                  * has been dropped and retransmit it.
2351                                  * Kludge snd_nxt & the congestion
2352                                  * window so we send only this one
2353                                  * packet.
2354                                  */
2355                                 if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 ||
2356                                     th->th_ack != tp->snd_una)
2357                                         tp->t_dupacks = 0;
2358                                 else if (tp->t_partialacks < 0 &&
2359                                          (++tp->t_dupacks == tcprexmtthresh ||
2360                                          TCP_FACK_FASTRECOV(tp))) {
2361                                         /*
2362                                          * Do the fast retransmit, and adjust
2363                                          * congestion control parameters.
2364                                          */
2365                                         if (tp->t_congctl->fast_retransmit(tp, th)) {
2366                                                 /* False fast retransmit */
2367                                                 break;
2368                                         } else
2369                                                 goto drop;
2370                                 } else if (tp->t_dupacks > tcprexmtthresh) {
2371                                         tp->snd_cwnd += tp->t_segsz;
2372                                         (void) tcp_output(tp);
2373                                         goto drop;
2374                                 }
2375                         } else {
2376                                 /*
2377                                  * If the ack appears to be very old, only
2378                                  * allow data that is in-sequence.  This
2379                                  * makes it somewhat more difficult to insert
2380                                  * forged data by guessing sequence numbers.
2381                                  * Send an ack to try to update the send
2382                                  * sequence number on the other side.
2383                                  */
2384                                 if (tlen && th->th_seq != tp->rcv_nxt &&
2385                                     SEQ_LT(th->th_ack,
2386                                     tp->snd_una - tp->max_sndwnd))
2387                                         goto dropafterack;
2388                         }
2389                         break;
2390                 }
2391                 /*
2392                  * If the congestion window was inflated to account
2393                  * for the other side's cached packets, retract it.
2394                  */
2395                 /* XXX: make SACK have his own congestion control
2396                  * struct -- rpaulo */
2397                 if (TCP_SACK_ENABLED(tp))
2398                         tcp_sack_newack(tp, th);
2399                 else
2400                         tp->t_congctl->fast_retransmit_newack(tp, th);
2401                 if (SEQ_GT(th->th_ack, tp->snd_max)) {
2402                         TCP_STATINC(TCP_STAT_RCVACKTOOMUCH);
2403                         goto dropafterack;
2404                 }
2405                 acked = th->th_ack - tp->snd_una;
2406                 tcps = TCP_STAT_GETREF();
2407                 tcps[TCP_STAT_RCVACKPACK]++;
2408                 tcps[TCP_STAT_RCVACKBYTE] += acked;
2409                 TCP_STAT_PUTREF();
2410
2411                 /*
2412                  * If we have a timestamp reply, update smoothed
2413                  * round trip time.  If no timestamp is present but
2414                  * transmit timer is running and timed sequence
2415                  * number was acked, update smoothed round trip time.
2416                  * Since we now have an rtt measurement, cancel the
2417                  * timer backoff (cf., Phil Karn's retransmit alg.).
2418                  * Recompute the initial retransmit timer.
2419                  */
2420                 if (ts_rtt)
2421                         tcp_xmit_timer(tp, ts_rtt);
2422                 else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
2423                         tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
2424
2425                 /*
2426                  * If all outstanding data is acked, stop retransmit
2427                  * timer and remember to restart (more output or persist).
2428                  * If there is more data to be acked, restart retransmit
2429                  * timer, using current (possibly backed-off) value.
2430                  */
2431                 if (th->th_ack == tp->snd_max) {
2432                         TCP_TIMER_DISARM(tp, TCPT_REXMT);
2433                         needoutput = 1;
2434                 } else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
2435                         TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
2436
2437                 /*
2438                  * New data has been acked, adjust the congestion window.
2439                  */
2440                 tp->t_congctl->newack(tp, th);
2441
2442                 nd6_hint(tp);
2443                 if (acked > so->so_snd.sb_cc) {
2444                         tp->snd_wnd -= so->so_snd.sb_cc;
2445                         sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
2446                         ourfinisacked = 1;
2447                 } else {
2448                         if (acked > (tp->t_lastoff - tp->t_inoff))
2449                                 tp->t_lastm = NULL;
2450                         sbdrop(&so->so_snd, acked);
2451                         tp->t_lastoff -= acked;
2452                         tp->snd_wnd -= acked;
2453                         ourfinisacked = 0;
2454                 }
2455                 sowwakeup(so);
2456
2457                 icmp_check(tp, th, acked);
2458
2459                 tp->snd_una = th->th_ack;
2460                 if (SEQ_GT(tp->snd_una, tp->snd_fack))
2461                         tp->snd_fack = tp->snd_una;
2462                 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
2463                         tp->snd_nxt = tp->snd_una;
2464                 if (SEQ_LT(tp->snd_high, tp->snd_una))
2465                         tp->snd_high = tp->snd_una;
2466
2467                 switch (tp->t_state) {
2468
2469                 /*
2470                  * In FIN_WAIT_1 STATE in addition to the processing
2471                  * for the ESTABLISHED state if our FIN is now acknowledged
2472                  * then enter FIN_WAIT_2.
2473                  */
2474                 case TCPS_FIN_WAIT_1:
2475                         if (ourfinisacked) {
2476                                 /*
2477                                  * If we can't receive any more
2478                                  * data, then closing user can proceed.
2479                                  * Starting the timer is contrary to the
2480                                  * specification, but if we don't get a FIN
2481                                  * we'll hang forever.
2482                                  */
2483                                 if (so->so_state & SS_CANTRCVMORE) {
2484                                         soisdisconnected(so);
2485                                         if (tp->t_maxidle > 0)
2486                                                 TCP_TIMER_ARM(tp, TCPT_2MSL,
2487                                                     tp->t_maxidle);
2488                                 }
2489                                 tp->t_state = TCPS_FIN_WAIT_2;
2490                         }
2491                         break;
2492
2493                 /*
2494                  * In CLOSING STATE in addition to the processing for
2495                  * the ESTABLISHED state if the ACK acknowledges our FIN
2496                  * then enter the TIME-WAIT state, otherwise ignore
2497                  * the segment.
2498                  */
2499                 case TCPS_CLOSING:
2500                         if (ourfinisacked) {
2501                                 tp->t_state = TCPS_TIME_WAIT;
2502                                 tcp_canceltimers(tp);
2503                                 TCP_TIMER_ARM(tp, TCPT_2MSL,
2504                                                 2 * PR_SLOWHZ * tcp_msl);
2505                                 soisdisconnected(so);
2506                         }
2507                         break;
2508
2509                 /*
2510                  * In LAST_ACK, we may still be waiting for data to drain
2511                  * and/or to be acked, as well as for the ack of our FIN.
2512                  * If our FIN is now acknowledged, delete the TCB,
2513                  * enter the closed state and return.
2514                  */
2515                 case TCPS_LAST_ACK:
2516                         if (ourfinisacked) {
2517                                 tp = tcp_close(tp);
2518                                 goto drop;
2519                         }
2520                         break;
2521
2522                 /*
2523                  * In TIME_WAIT state the only thing that should arrive
2524                  * is a retransmission of the remote FIN.  Acknowledge
2525                  * it and restart the finack timer.
2526                  */
2527                 case TCPS_TIME_WAIT:
2528                         TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * PR_SLOWHZ * tcp_msl);
2529                         goto dropafterack;
2530                 }
2531         }
2532
2533 step6:
2534         /*
2535          * Update window information.
2536          * Don't look at window if no ACK: TAC's send garbage on first SYN.
2537          */
2538         if ((tiflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, th->th_seq) ||
2539             (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
2540             (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
2541                 /* keep track of pure window updates */
2542                 if (tlen == 0 &&
2543                     tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
2544                         TCP_STATINC(TCP_STAT_RCVWINUPD);
2545                 tp->snd_wnd = tiwin;
2546                 tp->snd_wl1 = th->th_seq;
2547                 tp->snd_wl2 = th->th_ack;
2548                 if (tp->snd_wnd > tp->max_sndwnd)
2549                         tp->max_sndwnd = tp->snd_wnd;
2550                 needoutput = 1;
2551         }
2552
2553         /*
2554          * Process segments with URG.
2555          */
2556         if ((tiflags & TH_URG) && th->th_urp &&
2557             TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2558                 /*
2559                  * This is a kludge, but if we receive and accept
2560                  * random urgent pointers, we'll crash in
2561                  * soreceive.  It's hard to imagine someone
2562                  * actually wanting to send this much urgent data.
2563                  */
2564                 if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
2565                         th->th_urp = 0;                 /* XXX */
2566                         tiflags &= ~TH_URG;             /* XXX */
2567                         goto dodata;                    /* XXX */
2568                 }
2569                 /*
2570                  * If this segment advances the known urgent pointer,
2571                  * then mark the data stream.  This should not happen
2572                  * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
2573                  * a FIN has been received from the remote side.
2574                  * In these states we ignore the URG.
2575                  *
2576                  * According to RFC961 (Assigned Protocols),
2577                  * the urgent pointer points to the last octet
2578                  * of urgent data.  We continue, however,
2579                  * to consider it to indicate the first octet
2580                  * of data past the urgent section as the original
2581                  * spec states (in one of two places).
2582                  */
2583                 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
2584                         tp->rcv_up = th->th_seq + th->th_urp;
2585                         so->so_oobmark = so->so_rcv.sb_cc +
2586                             (tp->rcv_up - tp->rcv_nxt) - 1;
2587                         if (so->so_oobmark == 0)
2588                                 so->so_state |= SS_RCVATMARK;
2589                         sohasoutofband(so);
2590                         tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
2591                 }
2592                 /*
2593                  * Remove out of band data so doesn't get presented to user.
2594                  * This can happen independent of advancing the URG pointer,
2595                  * but if two URG's are pending at once, some out-of-band
2596                  * data may creep in... ick.
2597                  */
2598                 if (th->th_urp <= (u_int16_t) tlen
2599 #ifdef SO_OOBINLINE
2600                      && (so->so_options & SO_OOBINLINE) == 0
2601 #endif
2602                      )
2603                         tcp_pulloutofband(so, th, m, hdroptlen);
2604         } else
2605                 /*
2606                  * If no out of band data is expected,
2607                  * pull receive urgent pointer along
2608                  * with the receive window.
2609                  */
2610                 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
2611                         tp->rcv_up = tp->rcv_nxt;
2612 dodata:                                                 /* XXX */
2613
2614         /*
2615          * Process the segment text, merging it into the TCP sequencing queue,
2616          * and arranging for acknowledgement of receipt if necessary.
2617          * This process logically involves adjusting tp->rcv_wnd as data
2618          * is presented to the user (this happens in tcp_usrreq.c,
2619          * case PRU_RCVD).  If a FIN has already been received on this
2620          * connection then we just ignore the text.
2621          */
2622         if ((tlen || (tiflags & TH_FIN)) &&
2623             TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2624                 /*
2625                  * Insert segment ti into reassembly queue of tcp with
2626                  * control block tp.  Return TH_FIN if reassembly now includes
2627                  * a segment with FIN.  The macro form does the common case
2628                  * inline (segment is the next to be received on an
2629                  * established connection, and the queue is empty),
2630                  * avoiding linkage into and removal from the queue and
2631                  * repetition of various conversions.
2632                  * Set DELACK for segments received in order, but ack
2633                  * immediately when segments are out of order
2634                  * (so fast retransmit can work).
2635                  */
2636                 /* NOTE: this was TCP_REASS() macro, but used only once */
2637                 TCP_REASS_LOCK(tp);
2638                 if (th->th_seq == tp->rcv_nxt &&
2639                     TAILQ_FIRST(&tp->segq) == NULL &&
2640                     tp->t_state == TCPS_ESTABLISHED) {
2641                         tcp_setup_ack(tp, th);
2642                         tp->rcv_nxt += tlen;
2643                         tiflags = th->th_flags & TH_FIN;
2644                         tcps = TCP_STAT_GETREF();
2645                         tcps[TCP_STAT_RCVPACK]++;
2646                         tcps[TCP_STAT_RCVBYTE] += tlen;
2647                         TCP_STAT_PUTREF();
2648                         nd6_hint(tp);
2649                         if (so->so_state & SS_CANTRCVMORE)
2650                                 m_freem(m);
2651                         else {
2652                                 m_adj(m, hdroptlen);
2653                                 sbappendstream(&(so)->so_rcv, m);
2654                         }
2655                         TCP_REASS_UNLOCK(tp);
2656                         sorwakeup(so);
2657                 } else {
2658                         m_adj(m, hdroptlen);
2659                         tiflags = tcp_reass(tp, th, m, &tlen);
2660                         tp->t_flags |= TF_ACKNOW;
2661                         TCP_REASS_UNLOCK(tp);
2662                 }
2663
2664                 /*
2665                  * Note the amount of data that peer has sent into
2666                  * our window, in order to estimate the sender's
2667                  * buffer size.
2668                  */
2669                 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
2670         } else {
2671                 m_freem(m);
2672                 m = NULL;
2673                 tiflags &= ~TH_FIN;
2674         }
2675
2676         /*
2677          * If FIN is received ACK the FIN and let the user know
2678          * that the connection is closing.  Ignore a FIN received before
2679          * the connection is fully established.
2680          */
2681         if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) {
2682                 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2683                         socantrcvmore(so);
2684                         tp->t_flags |= TF_ACKNOW;
2685                         tp->rcv_nxt++;
2686                 }
2687                 switch (tp->t_state) {
2688
2689                 /*
2690                  * In ESTABLISHED STATE enter the CLOSE_WAIT state.
2691                  */
2692                 case TCPS_ESTABLISHED:
2693                         tp->t_state = TCPS_CLOSE_WAIT;
2694                         break;
2695
2696                 /*
2697                  * If still in FIN_WAIT_1 STATE FIN has not been acked so
2698                  * enter the CLOSING state.
2699                  */
2700                 case TCPS_FIN_WAIT_1:
2701                         tp->t_state = TCPS_CLOSING;
2702                         break;
2703
2704                 /*
2705                  * In FIN_WAIT_2 state enter the TIME_WAIT state,
2706                  * starting the time-wait timer, turning off the other
2707                  * standard timers.
2708                  */
2709                 case TCPS_FIN_WAIT_2:
2710                         tp->t_state = TCPS_TIME_WAIT;
2711                         tcp_canceltimers(tp);
2712                         TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * PR_SLOWHZ * tcp_msl);
2713                         soisdisconnected(so);
2714                         break;
2715
2716                 /*
2717                  * In TIME_WAIT state restart the 2 MSL time_wait timer.
2718                  */
2719                 case TCPS_TIME_WAIT:
2720                         TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * PR_SLOWHZ * tcp_msl);
2721                         break;
2722                 }
2723         }
2724 #ifdef TCP_DEBUG
2725         if (so->so_options & SO_DEBUG)
2726                 tcp_trace(TA_INPUT, ostate, tp, tcp_saveti, 0);
2727 #endif
2728
2729         /*
2730          * Return any desired output.
2731          */
2732         if (needoutput || (tp->t_flags & TF_ACKNOW)) {
2733                 (void) tcp_output(tp);
2734         }
2735         if (tcp_saveti)
2736                 m_freem(tcp_saveti);
2737         return;
2738
2739 badsyn:
2740         /*
2741          * Received a bad SYN.  Increment counters and dropwithreset.
2742          */
2743         TCP_STATINC(TCP_STAT_BADSYN);
2744         tp = NULL;
2745         goto dropwithreset;
2746
2747 dropafterack:
2748         /*
2749          * Generate an ACK dropping incoming segment if it occupies
2750          * sequence space, where the ACK reflects our state.
2751          */
2752         if (tiflags & TH_RST)
2753                 goto drop;
2754         goto dropafterack2;
2755
2756 dropafterack_ratelim:
2757         /*
2758          * We may want to rate-limit ACKs against SYN/RST attack.
2759          */
2760         if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count,
2761             tcp_ackdrop_ppslim) == 0) {
2762                 /* XXX stat */
2763                 goto drop;
2764         }
2765         /* ...fall into dropafterack2... */
2766
2767 dropafterack2:
2768         m_freem(m);
2769         tp->t_flags |= TF_ACKNOW;
2770         (void) tcp_output(tp);
2771         if (tcp_saveti)
2772                 m_freem(tcp_saveti);
2773         return;
2774
2775 dropwithreset_ratelim:
2776         /*
2777          * We may want to rate-limit RSTs in certain situations,
2778          * particularly if we are sending an RST in response to
2779          * an attempt to connect to or otherwise communicate with
2780          * a port for which we have no socket.
2781          */
2782         if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count,
2783             tcp_rst_ppslim) == 0) {
2784                 /* XXX stat */
2785                 goto drop;
2786         }
2787         /* ...fall into dropwithreset... */
2788
2789 dropwithreset:
2790         /*
2791          * Generate a RST, dropping incoming segment.
2792          * Make ACK acceptable to originator of segment.
2793          */
2794         if (tiflags & TH_RST)
2795                 goto drop;
2796
2797         switch (af) {
2798 #ifdef INET6
2799         case AF_INET6:
2800                 /* For following calls to tcp_respond */
2801                 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst))
2802                         goto drop;
2803                 break;
2804 #endif /* INET6 */
2805         case AF_INET:
2806                 if (IN_MULTICAST(ip->ip_dst.s_addr) ||
2807                     in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
2808                         goto drop;
2809         }
2810
2811         if (tiflags & TH_ACK)
2812                 (void)tcp_respond(tp, m, m, th, (tcp_seq)0, th->th_ack, TH_RST);
2813         else {
2814                 if (tiflags & TH_SYN)
2815                         tlen++;
2816                 (void)tcp_respond(tp, m, m, th, th->th_seq + tlen, (tcp_seq)0,
2817                     TH_RST|TH_ACK);
2818         }
2819         if (tcp_saveti)
2820                 m_freem(tcp_saveti);
2821         return;
2822
2823 badcsum:
2824 drop:
2825         /*
2826          * Drop space held by incoming segment and return.
2827          */
2828         if (tp) {
2829                 if (tp->t_inpcb)
2830                         so = tp->t_inpcb->inp_socket;
2831 #ifdef INET6
2832                 else if (tp->t_in6pcb)
2833                         so = tp->t_in6pcb->in6p_socket;
2834 #endif
2835                 else
2836                         so = NULL;
2837 #ifdef TCP_DEBUG
2838                 if (so && (so->so_options & SO_DEBUG) != 0)
2839                         tcp_trace(TA_DROP, ostate, tp, tcp_saveti, 0);
2840 #endif
2841         }
2842         if (tcp_saveti)
2843                 m_freem(tcp_saveti);
2844         m_freem(m);
2845         return;
2846 }
2847
2848 #ifdef TCP_SIGNATURE
2849 int
2850 tcp_signature_apply(void *fstate, void *data, u_int len)
2851 {
2852
2853         MD5Update(fstate, (u_char *)data, len);
2854         return (0);
2855 }
2856
2857 struct secasvar *
2858 tcp_signature_getsav(struct mbuf *m, struct tcphdr *th)
2859 {
2860         struct secasvar *sav;
2861 #ifdef FAST_IPSEC
2862         union sockaddr_union dst;
2863 #endif
2864         struct ip *ip;
2865         struct ip6_hdr *ip6;
2866
2867         ip = mtod(m, struct ip *);
2868         switch (ip->ip_v) {
2869         case 4:
2870                 ip = mtod(m, struct ip *);
2871                 ip6 = NULL;
2872                 break;
2873         case 6:
2874                 ip = NULL;
2875                 ip6 = mtod(m, struct ip6_hdr *);
2876                 break;
2877         default:
2878                 return (NULL);
2879         }
2880
2881 #ifdef FAST_IPSEC
2882         /* Extract the destination from the IP header in the mbuf. */
2883         memset(&dst, 0, sizeof(union sockaddr_union));
2884         if (ip !=NULL) {
2885                 dst.sa.sa_len = sizeof(struct sockaddr_in);
2886                 dst.sa.sa_family = AF_INET;
2887                 dst.sin.sin_addr = ip->ip_dst;
2888         } else {
2889                 dst.sa.sa_len = sizeof(struct sockaddr_in6);
2890                 dst.sa.sa_family = AF_INET6;
2891                 dst.sin6.sin6_addr = ip6->ip6_dst;
2892         }
2893
2894         /*
2895          * Look up an SADB entry which matches the address of the peer.
2896          */
2897         sav = KEY_ALLOCSA(&dst, IPPROTO_TCP, htonl(TCP_SIG_SPI));
2898 #else
2899         if (ip)
2900                 sav = key_allocsa(AF_INET, (void *)&ip->ip_src,
2901                     (void *)&ip->ip_dst, IPPROTO_TCP,
2902                     htonl(TCP_SIG_SPI), 0, 0);
2903         else
2904                 sav = key_allocsa(AF_INET6, (void *)&ip6->ip6_src,
2905                     (void *)&ip6->ip6_dst, IPPROTO_TCP,
2906                     htonl(TCP_SIG_SPI), 0, 0);
2907 #endif
2908
2909         return (sav);   /* freesav must be performed by caller */
2910 }
2911
2912 int
2913 tcp_signature(struct mbuf *m, struct tcphdr *th, int thoff,
2914     struct secasvar *sav, char *sig)
2915 {
2916         MD5_CTX ctx;
2917         struct ip *ip;
2918         struct ipovly *ipovly;
2919         struct ip6_hdr *ip6;
2920         struct ippseudo ippseudo;
2921         struct ip6_hdr_pseudo ip6pseudo;
2922         struct tcphdr th0;
2923         int l, tcphdrlen;
2924
2925         if (sav == NULL)
2926                 return (-1);
2927
2928         tcphdrlen = th->th_off * 4;
2929
2930         switch (mtod(m, struct ip *)->ip_v) {
2931         case 4:
2932                 ip = mtod(m, struct ip *);
2933                 ip6 = NULL;
2934                 break;
2935         case 6:
2936                 ip = NULL;
2937                 ip6 = mtod(m, struct ip6_hdr *);
2938                 break;
2939         default:
2940                 return (-1);
2941         }
2942
2943         MD5Init(&ctx);
2944
2945         if (ip) {
2946                 memset(&ippseudo, 0, sizeof(ippseudo));
2947                 ipovly = (struct ipovly *)ip;
2948                 ippseudo.ippseudo_src = ipovly->ih_src;
2949                 ippseudo.ippseudo_dst = ipovly->ih_dst;
2950                 ippseudo.ippseudo_pad = 0;
2951                 ippseudo.ippseudo_p = IPPROTO_TCP;
2952                 ippseudo.ippseudo_len = htons(m->m_pkthdr.len - thoff);
2953                 MD5Update(&ctx, (char *)&ippseudo, sizeof(ippseudo));
2954         } else {
2955                 memset(&ip6pseudo, 0, sizeof(ip6pseudo));
2956                 ip6pseudo.ip6ph_src = ip6->ip6_src;
2957                 in6_clearscope(&ip6pseudo.ip6ph_src);
2958                 ip6pseudo.ip6ph_dst = ip6->ip6_dst;
2959                 in6_clearscope(&ip6pseudo.ip6ph_dst);
2960                 ip6pseudo.ip6ph_len = htons(m->m_pkthdr.len - thoff);
2961                 ip6pseudo.ip6ph_nxt = IPPROTO_TCP;
2962                 MD5Update(&ctx, (char *)&ip6pseudo, sizeof(ip6pseudo));
2963         }
2964
2965         th0 = *th;
2966         th0.th_sum = 0;
2967         MD5Update(&ctx, (char *)&th0, sizeof(th0));
2968
2969         l = m->m_pkthdr.len - thoff - tcphdrlen;
2970         if (l > 0)
2971                 m_apply(m, thoff + tcphdrlen,
2972                     m->m_pkthdr.len - thoff - tcphdrlen,
2973                     tcp_signature_apply, &ctx);
2974
2975         MD5Update(&ctx, _KEYBUF(sav->key_auth), _KEYLEN(sav->key_auth));
2976         MD5Final(sig, &ctx);
2977
2978         return (0);
2979 }
2980 #endif
2981
2982 static int
2983 tcp_dooptions(struct tcpcb *tp, const u_char *cp, int cnt,
2984     struct tcphdr *th,
2985     struct mbuf *m, int toff, struct tcp_opt_info *oi)
2986 {
2987         u_int16_t mss;
2988         int opt, optlen = 0;
2989 #ifdef TCP_SIGNATURE
2990         void *sigp = NULL;
2991         char sigbuf[TCP_SIGLEN];
2992         struct secasvar *sav = NULL;
2993 #endif
2994
2995         for (; cp && cnt > 0; cnt -= optlen, cp += optlen) {
2996                 opt = cp[0];
2997                 if (opt == TCPOPT_EOL)
2998                         break;
2999                 if (opt == TCPOPT_NOP)
3000                         optlen = 1;
3001                 else {
3002                         if (cnt < 2)
3003                                 break;
3004                         optlen = cp[1];
3005                         if (optlen < 2 || optlen > cnt)
3006                                 break;
3007                 }
3008                 switch (opt) {
3009
3010                 default:
3011                         continue;
3012
3013                 case TCPOPT_MAXSEG:
3014                         if (optlen != TCPOLEN_MAXSEG)
3015                                 continue;
3016                         if (!(th->th_flags & TH_SYN))
3017                                 continue;
3018                         if (TCPS_HAVERCVDSYN(tp->t_state))
3019                                 continue;
3020                         bcopy(cp + 2, &mss, sizeof(mss));
3021                         oi->maxseg = ntohs(mss);
3022                         break;
3023
3024                 case TCPOPT_WINDOW:
3025                         if (optlen != TCPOLEN_WINDOW)
3026                                 continue;
3027                         if (!(th->th_flags & TH_SYN))
3028                                 continue;
3029                         if (TCPS_HAVERCVDSYN(tp->t_state))
3030                                 continue;
3031                         tp->t_flags |= TF_RCVD_SCALE;
3032                         tp->requested_s_scale = cp[2];
3033                         if (tp->requested_s_scale > TCP_MAX_WINSHIFT) {
3034 #if 0   /*XXX*/
3035                                 char *p;
3036
3037                                 if (ip)
3038                                         p = ntohl(ip->ip_src);
3039 #ifdef INET6
3040                                 else if (ip6)
3041                                         p = ip6_sprintf(&ip6->ip6_src);
3042 #endif
3043                                 else
3044                                         p = "(unknown)";
3045                                 log(LOG_ERR, "TCP: invalid wscale %d from %s, "
3046                                     "assuming %d\n",
3047                                     tp->requested_s_scale, p,
3048                                     TCP_MAX_WINSHIFT);
3049 #else
3050                                 log(LOG_ERR, "TCP: invalid wscale %d, "
3051                                     "assuming %d\n",
3052                                     tp->requested_s_scale,
3053                                     TCP_MAX_WINSHIFT);
3054 #endif
3055                                 tp->requested_s_scale = TCP_MAX_WINSHIFT;
3056                         }
3057                         break;
3058
3059                 case TCPOPT_TIMESTAMP:
3060                         if (optlen != TCPOLEN_TIMESTAMP)
3061                                 continue;
3062                         oi->ts_present = 1;
3063                         bcopy(cp + 2, &oi->ts_val, sizeof(oi->ts_val));
3064                         NTOHL(oi->ts_val);
3065                         bcopy(cp + 6, &oi->ts_ecr, sizeof(oi->ts_ecr));
3066                         NTOHL(oi->ts_ecr);
3067
3068                         if (!(th->th_flags & TH_SYN))
3069                                 continue;
3070                         if (TCPS_HAVERCVDSYN(tp->t_state))
3071                                 continue;
3072                         /*
3073                          * A timestamp received in a SYN makes
3074                          * it ok to send timestamp requests and replies.
3075                          */
3076                         tp->t_flags |= TF_RCVD_TSTMP;
3077                         tp->ts_recent = oi->ts_val;
3078                         tp->ts_recent_age = tcp_now;
3079                         break;
3080
3081                 case TCPOPT_SACK_PERMITTED:
3082                         if (optlen != TCPOLEN_SACK_PERMITTED)
3083                                 continue;
3084                         if (!(th->th_flags & TH_SYN))
3085                                 continue;
3086                         if (TCPS_HAVERCVDSYN(tp->t_state))
3087                                 continue;
3088                         if (tcp_do_sack) {
3089                                 tp->t_flags |= TF_SACK_PERMIT;
3090                                 tp->t_flags |= TF_WILL_SACK;
3091                         }
3092                         break;
3093
3094                 case TCPOPT_SACK:
3095                         tcp_sack_option(tp, th, cp, optlen);
3096                         break;
3097 #ifdef TCP_SIGNATURE
3098                 case TCPOPT_SIGNATURE:
3099                         if (optlen != TCPOLEN_SIGNATURE)
3100                                 continue;
3101                         if (sigp && memcmp(sigp, cp + 2, TCP_SIGLEN))
3102                                 return (-1);
3103
3104                         sigp = sigbuf;
3105                         memcpy(sigbuf, cp + 2, TCP_SIGLEN);
3106                         tp->t_flags |= TF_SIGNATURE;
3107                         break;
3108 #endif
3109                 }
3110         }
3111
3112 #ifdef TCP_SIGNATURE
3113         if (tp->t_flags & TF_SIGNATURE) {
3114
3115                 sav = tcp_signature_getsav(m, th);
3116
3117                 if (sav == NULL && tp->t_state == TCPS_LISTEN)
3118                         return (-1);
3119         }
3120
3121         if ((sigp ? TF_SIGNATURE : 0) ^ (tp->t_flags & TF_SIGNATURE)) {
3122                 if (sav == NULL)
3123                         return (-1);
3124 #ifdef FAST_IPSEC
3125                 KEY_FREESAV(&sav);
3126 #else
3127                 key_freesav(sav);
3128 #endif
3129                 return (-1);
3130         }
3131
3132         if (sigp) {
3133                 char sig[TCP_SIGLEN];
3134
3135                 tcp_fields_to_net(th);
3136                 if (tcp_signature(m, th, toff, sav, sig) < 0) {
3137                         tcp_fields_to_host(th);
3138                         if (sav == NULL)
3139                                 return (-1);
3140 #ifdef FAST_IPSEC
3141                         KEY_FREESAV(&sav);
3142 #else
3143                         key_freesav(sav);
3144 #endif
3145                         return (-1);
3146                 }
3147                 tcp_fields_to_host(th);
3148
3149                 if (memcmp(sig, sigp, TCP_SIGLEN)) {
3150                         TCP_STATINC(TCP_STAT_BADSIG);
3151                         if (sav == NULL)
3152                                 return (-1);
3153 #ifdef FAST_IPSEC
3154                         KEY_FREESAV(&sav);
3155 #else
3156                         key_freesav(sav);
3157 #endif
3158                         return (-1);
3159                 } else
3160                         TCP_STATINC(TCP_STAT_GOODSIG);
3161
3162                 key_sa_recordxfer(sav, m);
3163 #ifdef FAST_IPSEC
3164                 KEY_FREESAV(&sav);
3165 #else
3166                 key_freesav(sav);
3167 #endif
3168         }
3169 #endif
3170
3171         return (0);
3172 }
3173
3174 /*
3175  * Pull out of band byte out of a segment so
3176  * it doesn't appear in the user's data queue.
3177  * It is still reflected in the segment length for
3178  * sequencing purposes.
3179  */
3180 void
3181 tcp_pulloutofband(struct socket *so, struct tcphdr *th,
3182     struct mbuf *m, int off)
3183 {
3184         int cnt = off + th->th_urp - 1;
3185
3186         while (cnt >= 0) {
3187                 if (m->m_len > cnt) {
3188                         char *cp = mtod(m, char *) + cnt;
3189                         struct tcpcb *tp = sototcpcb(so);
3190
3191                         tp->t_iobc = *cp;
3192                         tp->t_oobflags |= TCPOOB_HAVEDATA;
3193                         bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
3194                         m->m_len--;
3195                         return;
3196                 }
3197                 cnt -= m->m_len;
3198                 m = m->m_next;
3199                 if (m == 0)
3200                         break;
3201         }
3202         panic("tcp_pulloutofband");
3203 }
3204
3205 /*
3206  * Collect new round-trip time estimate
3207  * and update averages and current timeout.
3208  */
3209 void
3210 tcp_xmit_timer(struct tcpcb *tp, uint32_t rtt)
3211 {
3212         int32_t delta;
3213
3214         TCP_STATINC(TCP_STAT_RTTUPDATED);
3215         if (tp->t_srtt != 0) {
3216                 /*
3217                  * srtt is stored as fixed point with 3 bits after the
3218                  * binary point (i.e., scaled by 8).  The following magic
3219                  * is equivalent to the smoothing algorithm in rfc793 with
3220                  * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
3221                  * point).  Adjust rtt to origin 0.
3222                  */
3223                 delta = (rtt << 2) - (tp->t_srtt >> TCP_RTT_SHIFT);
3224                 if ((tp->t_srtt += delta) <= 0)
3225                         tp->t_srtt = 1 << 2;
3226                 /*
3227                  * We accumulate a smoothed rtt variance (actually, a
3228                  * smoothed mean difference), then set the retransmit
3229                  * timer to smoothed rtt + 4 times the smoothed variance.
3230                  * rttvar is stored as fixed point with 2 bits after the
3231                  * binary point (scaled by 4).  The following is
3232                  * equivalent to rfc793 smoothing with an alpha of .75
3233                  * (rttvar = rttvar*3/4 + |delta| / 4).  This replaces
3234                  * rfc793's wired-in beta.
3235                  */
3236                 if (delta < 0)
3237                         delta = -delta;
3238                 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT);
3239                 if ((tp->t_rttvar += delta) <= 0)
3240                         tp->t_rttvar = 1 << 2;
3241         } else {
3242                 /*
3243                  * No rtt measurement yet - use the unsmoothed rtt.
3244                  * Set the variance to half the rtt (so our first
3245                  * retransmit happens at 3*rtt).
3246                  */
3247                 tp->t_srtt = rtt << (TCP_RTT_SHIFT + 2);
3248                 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT + 2 - 1);
3249         }
3250         tp->t_rtttime = 0;
3251         tp->t_rxtshift = 0;
3252
3253         /*
3254          * the retransmit should happen at rtt + 4 * rttvar.
3255          * Because of the way we do the smoothing, srtt and rttvar
3256          * will each average +1/2 tick of bias.  When we compute
3257          * the retransmit timer, we want 1/2 tick of rounding and
3258          * 1 extra tick because of +-1/2 tick uncertainty in the
3259          * firing of the timer.  The bias will give us exactly the
3260          * 1.5 tick we need.  But, because the bias is
3261          * statistical, we have to test that we don't drop below
3262          * the minimum feasible timer (which is 2 ticks).
3263          */
3264         TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
3265             max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX);
3266
3267         /*
3268          * We received an ack for a packet that wasn't retransmitted;
3269          * it is probably safe to discard any error indications we've
3270          * received recently.  This isn't quite right, but close enough
3271          * for now (a route might have failed after we sent a segment,
3272          * and the return path might not be symmetrical).
3273          */
3274         tp->t_softerror = 0;
3275 }
3276
3277
3278 /*
3279  * TCP compressed state engine.  Currently used to hold compressed
3280  * state for SYN_RECEIVED.
3281  */
3282
3283 u_long  syn_cache_count;
3284 u_int32_t syn_hash1, syn_hash2;
3285
3286 #define SYN_HASH(sa, sp, dp) \
3287         ((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \
3288                                      ((u_int32_t)(sp)))^syn_hash2)))
3289 #ifndef INET6
3290 #define SYN_HASHALL(hash, src, dst) \
3291 do {                                                                    \
3292         hash = SYN_HASH(&((const struct sockaddr_in *)(src))->sin_addr, \
3293                 ((const struct sockaddr_in *)(src))->sin_port,          \
3294                 ((const struct sockaddr_in *)(dst))->sin_port);         \
3295 } while (/*CONSTCOND*/ 0)
3296 #else
3297 #define SYN_HASH6(sa, sp, dp) \
3298         ((((sa)->s6_addr32[0] ^ (sa)->s6_addr32[3] ^ syn_hash1) * \
3299           (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp)))^syn_hash2)) \
3300          & 0x7fffffff)
3301
3302 #define SYN_HASHALL(hash, src, dst) \
3303 do {                                                                    \
3304         switch ((src)->sa_family) {                                     \
3305         case AF_INET:                                                   \
3306                 hash = SYN_HASH(&((const struct sockaddr_in *)(src))->sin_addr, \
3307                         ((const struct sockaddr_in *)(src))->sin_port,  \
3308                         ((const struct sockaddr_in *)(dst))->sin_port); \
3309                 break;                                                  \
3310         case AF_INET6:                                                  \
3311                 hash = SYN_HASH6(&((const struct sockaddr_in6 *)(src))->sin6_addr, \
3312                         ((const struct sockaddr_in6 *)(src))->sin6_port,        \
3313                         ((const struct sockaddr_in6 *)(dst))->sin6_port);       \
3314                 break;                                                  \
3315         default:                                                        \
3316                 hash = 0;                                               \
3317         }                                                               \
3318 } while (/*CONSTCOND*/0)
3319 #endif /* INET6 */
3320
3321 static struct pool syn_cache_pool;
3322
3323 /*
3324  * We don't estimate RTT with SYNs, so each packet starts with the default
3325  * RTT and each timer step has a fixed timeout value.
3326  */
3327 #define SYN_CACHE_TIMER_ARM(sc)                                         \
3328 do {                                                                    \
3329         TCPT_RANGESET((sc)->sc_rxtcur,                                  \
3330             TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN, \
3331             TCPTV_REXMTMAX);                                            \
3332         callout_reset(&(sc)->sc_timer,                                  \
3333             (sc)->sc_rxtcur * (hz / PR_SLOWHZ), syn_cache_timer, (sc)); \
3334 } while (/*CONSTCOND*/0)
3335
3336 #define SYN_CACHE_TIMESTAMP(sc) (tcp_now - (sc)->sc_timebase)
3337
3338 static inline void
3339 syn_cache_rm(struct syn_cache *sc)
3340 {
3341         TAILQ_REMOVE(&tcp_syn_cache[sc->sc_bucketidx].sch_bucket,
3342             sc, sc_bucketq);
3343         sc->sc_tp = NULL;
3344         LIST_REMOVE(sc, sc_tpq);
3345         tcp_syn_cache[sc->sc_bucketidx].sch_length--;
3346         callout_stop(&sc->sc_timer);
3347         syn_cache_count--;
3348 }
3349
3350 static inline void
3351 syn_cache_put(struct syn_cache *sc)
3352 {
3353         if (sc->sc_ipopts)
3354                 (void) m_free(sc->sc_ipopts);
3355         rtcache_free(&sc->sc_route);
3356         if (callout_invoking(&sc->sc_timer))
3357                 sc->sc_flags |= SCF_DEAD;
3358         else {
3359                 callout_destroy(&sc->sc_timer);
3360                 pool_put(&syn_cache_pool, sc);
3361         }
3362 }
3363
3364 void
3365 syn_cache_init(void)
3366 {
3367         int i;
3368
3369         pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0,
3370             "synpl", NULL, IPL_SOFTNET);
3371
3372         /* Initialize the hash buckets. */
3373         for (i = 0; i < tcp_syn_cache_size; i++)
3374                 TAILQ_INIT(&tcp_syn_cache[i].sch_bucket);
3375 }
3376
3377 void
3378 syn_cache_insert(struct syn_cache *sc, struct tcpcb *tp)
3379 {
3380         struct syn_cache_head *scp;
3381         struct syn_cache *sc2;
3382         int s;
3383
3384         /*
3385          * If there are no entries in the hash table, reinitialize
3386          * the hash secrets.
3387          */
3388         if (syn_cache_count == 0) {
3389                 syn_hash1 = arc4random();
3390                 syn_hash2 = arc4random();
3391         }
3392
3393         SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa);
3394         sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size;
3395         scp = &tcp_syn_cache[sc->sc_bucketidx];
3396
3397         /*
3398          * Make sure that we don't overflow the per-bucket
3399          * limit or the total cache size limit.
3400          */
3401         s = splsoftnet();
3402         if (scp->sch_length >= tcp_syn_bucket_limit) {
3403                 TCP_STATINC(TCP_STAT_SC_BUCKETOVERFLOW);
3404                 /*
3405                  * The bucket is full.  Toss the oldest element in the
3406                  * bucket.  This will be the first entry in the bucket.
3407                  */
3408                 sc2 = TAILQ_FIRST(&scp->sch_bucket);
3409 #ifdef DIAGNOSTIC
3410                 /*
3411                  * This should never happen; we should always find an
3412                  * entry in our bucket.
3413                  */
3414                 if (sc2 == NULL)
3415                         panic("syn_cache_insert: bucketoverflow: impossible");
3416 #endif
3417                 syn_cache_rm(sc2);
3418                 syn_cache_put(sc2);     /* calls pool_put but see spl above */
3419         } else if (syn_cache_count >= tcp_syn_cache_limit) {
3420                 struct syn_cache_head *scp2, *sce;
3421
3422                 TCP_STATINC(TCP_STAT_SC_OVERFLOWED);
3423                 /*
3424                  * The cache is full.  Toss the oldest entry in the
3425                  * first non-empty bucket we can find.
3426                  *
3427                  * XXX We would really like to toss the oldest
3428                  * entry in the cache, but we hope that this
3429                  * condition doesn't happen very often.
3430                  */
3431                 scp2 = scp;
3432                 if (TAILQ_EMPTY(&scp2->sch_bucket)) {
3433                         sce = &tcp_syn_cache[tcp_syn_cache_size];
3434                         for (++scp2; scp2 != scp; scp2++) {
3435                                 if (scp2 >= sce)
3436                                         scp2 = &tcp_syn_cache[0];
3437                                 if (! TAILQ_EMPTY(&scp2->sch_bucket))
3438                                         break;
3439                         }
3440 #ifdef DIAGNOSTIC
3441                         /*
3442                          * This should never happen; we should always find a
3443                          * non-empty bucket.
3444                          */
3445                         if (scp2 == scp)
3446                                 panic("syn_cache_insert: cacheoverflow: "
3447                                     "impossible");
3448 #endif
3449                 }
3450                 sc2 = TAILQ_FIRST(&scp2->sch_bucket);
3451                 syn_cache_rm(sc2);
3452                 syn_cache_put(sc2);     /* calls pool_put but see spl above */
3453         }
3454
3455         /*
3456          * Initialize the entry's timer.
3457          */
3458         sc->sc_rxttot = 0;
3459         sc->sc_rxtshift = 0;
3460         SYN_CACHE_TIMER_ARM(sc);
3461
3462         /* Link it from tcpcb entry */
3463         LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq);
3464
3465         /* Put it into the bucket. */
3466         TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq);
3467         scp->sch_length++;
3468         syn_cache_count++;
3469
3470         TCP_STATINC(TCP_STAT_SC_ADDED);
3471         splx(s);
3472 }
3473
3474 /*
3475  * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted.
3476  * If we have retransmitted an entry the maximum number of times, expire
3477  * that entry.
3478  */
3479 void
3480 syn_cache_timer(void *arg)
3481 {
3482         struct syn_cache *sc = arg;
3483
3484         mutex_enter(softnet_lock);
3485         KERNEL_LOCK(1, NULL);
3486         callout_ack(&sc->sc_timer);
3487
3488         if (__predict_false(sc->sc_flags & SCF_DEAD)) {
3489                 TCP_STATINC(TCP_STAT_SC_DELAYED_FREE);
3490                 callout_destroy(&sc->sc_timer);
3491                 pool_put(&syn_cache_pool, sc);
3492                 KERNEL_UNLOCK_ONE(NULL);
3493                 mutex_exit(softnet_lock);
3494                 return;
3495         }
3496
3497         if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) {
3498                 /* Drop it -- too many retransmissions. */
3499                 goto dropit;
3500         }
3501
3502         /*
3503          * Compute the total amount of time this entry has
3504          * been on a queue.  If this entry has been on longer
3505          * than the keep alive timer would allow, expire it.
3506          */
3507         sc->sc_rxttot += sc->sc_rxtcur;
3508         if (sc->sc_rxttot >= tcp_keepinit)
3509                 goto dropit;
3510
3511         TCP_STATINC(TCP_STAT_SC_RETRANSMITTED);
3512         (void) syn_cache_respond(sc, NULL);
3513
3514         /* Advance the timer back-off. */
3515         sc->sc_rxtshift++;
3516         SYN_CACHE_TIMER_ARM(sc);
3517
3518         KERNEL_UNLOCK_ONE(NULL);
3519         mutex_exit(softnet_lock);
3520         return;
3521
3522  dropit:
3523         TCP_STATINC(TCP_STAT_SC_TIMED_OUT);
3524         syn_cache_rm(sc);
3525         syn_cache_put(sc);      /* calls pool_put but see spl above */
3526         KERNEL_UNLOCK_ONE(NULL);
3527         mutex_exit(softnet_lock);
3528 }
3529
3530 /*
3531  * Remove syn cache created by the specified tcb entry,
3532  * because this does not make sense to keep them
3533  * (if there's no tcb entry, syn cache entry will never be used)
3534  */
3535 void
3536 syn_cache_cleanup(struct tcpcb *tp)
3537 {
3538         struct syn_cache *sc, *nsc;
3539         int s;
3540
3541         s = splsoftnet();
3542
3543         for (sc = LIST_FIRST(&tp->t_sc); sc != NULL; sc = nsc) {
3544                 nsc = LIST_NEXT(sc, sc_tpq);
3545
3546 #ifdef DIAGNOSTIC
3547                 if (sc->sc_tp != tp)
3548                         panic("invalid sc_tp in syn_cache_cleanup");
3549 #endif
3550                 syn_cache_rm(sc);
3551                 syn_cache_put(sc);      /* calls pool_put but see spl above */
3552         }
3553         /* just for safety */
3554         LIST_INIT(&tp->t_sc);
3555
3556         splx(s);
3557 }
3558
3559 /*
3560  * Find an entry in the syn cache.
3561  */
3562 struct syn_cache *
3563 syn_cache_lookup(const struct sockaddr *src, const struct sockaddr *dst,
3564     struct syn_cache_head **headp)
3565 {
3566         struct syn_cache *sc;
3567         struct syn_cache_head *scp;
3568         u_int32_t hash;
3569         int s;
3570
3571         SYN_HASHALL(hash, src, dst);
3572
3573         scp = &tcp_syn_cache[hash % tcp_syn_cache_size];
3574         *headp = scp;
3575         s = splsoftnet();
3576         for (sc = TAILQ_FIRST(&scp->sch_bucket); sc != NULL;
3577              sc = TAILQ_NEXT(sc, sc_bucketq)) {
3578                 if (sc->sc_hash != hash)
3579                         continue;
3580                 if (!memcmp(&sc->sc_src, src, src->sa_len) &&
3581                     !memcmp(&sc->sc_dst, dst, dst->sa_len)) {
3582                         splx(s);
3583                         return (sc);
3584                 }
3585         }
3586         splx(s);
3587         return (NULL);
3588 }
3589
3590 /*
 * This function gets called when we receive an ACK for a
 * socket in the LISTEN state.  We look up the connection
 * in the syn cache, and if it's there, we pull it out of
 * the cache and turn it into a full-blown connection in
 * the SYN-RECEIVED state.
3596  *
3597  * The return values may not be immediately obvious, and their effects
3598  * can be subtle, so here they are:
3599  *
3600  *      NULL    SYN was not found in cache; caller should drop the
3601  *              packet and send an RST.
3602  *
 *      -1      We were unable to create the new connection, and are
 *              aborting it.  An ACK,RST is being sent to the peer
 *              (unless we got screwy sequence numbers; see below),
 *              because the 3-way handshake has been completed.  Caller
 *              should not free the mbuf, since we may be using it.  If
 *              we are not, we will free it.
3609  *
3610  *      Otherwise, the return value is a pointer to the new socket
3611  *      associated with the connection.
3612  */
3613 struct socket *
3614 syn_cache_get(struct sockaddr *src, struct sockaddr *dst,
3615     struct tcphdr *th, unsigned int hlen, unsigned int tlen,
3616     struct socket *so, struct mbuf *m)
3617 {
3618         struct syn_cache *sc;
3619         struct syn_cache_head *scp;
3620         struct inpcb *inp = NULL;
3621 #ifdef INET6
3622         struct in6pcb *in6p = NULL;
3623 #endif
3624         struct tcpcb *tp = 0;
3625         struct mbuf *am;
3626         int s;
3627         struct socket *oso;     /* the listening socket we were given */
3628
3629         s = splsoftnet();
3630         if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
3631                 splx(s);
3632                 return (NULL);
3633         }
3634
3635         /*
3636          * Verify the sequence and ack numbers.  On a mismatch, resend
3637          * our SYN,ACK (syn_cache_respond() frees m) and return -1.
3638          */
3639         if ((th->th_ack != sc->sc_iss + 1) ||
3640             SEQ_LEQ(th->th_seq, sc->sc_irs) ||
3641             SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) {
3642                 (void) syn_cache_respond(sc, m);
3643                 splx(s);
3644                 return ((struct socket *)(-1));
3645         }
3646
3647         /* Remove this cache entry; sc is released by syn_cache_put() below */
3648         syn_cache_rm(sc);
3649         splx(s);
3650
3651         /*
3652          * Ok, create the full blown connection, and set things up
3653          * as they would have been set up if we had created the
3654          * connection when the SYN arrived.  If we can't create
3655          * the connection, abort it.
3656          */
3657         /*
3658          * inp still has the OLD in_pcb stuff, set the
3659          * v6-related flags on the new guy, too.   This is
3660          * done particularly for the case where an AF_INET6
3661          * socket is bound only to a port, and a v4 connection
3662          * comes in on that port.
3663          * We also copy the flowinfo from the original pcb
3664          * to the new one.
3665          */
3666         oso = so;
3667         so = sonewconn(so, SS_ISCONNECTED);
3668         if (so == NULL)
3669                 goto resetandabort;
3670
3671         switch (so->so_proto->pr_domain->dom_family) {
3672 #ifdef INET
3673         case AF_INET:
3674                 inp = sotoinpcb(so);
3675                 break;
3676 #endif
3677 #ifdef INET6
3678         case AF_INET6:
3679                 in6p = sotoin6pcb(so);
3680                 break;
3681 #endif
3682         }
3683         switch (src->sa_family) {       /* bind the new pcb to dst (local side) */
3684 #ifdef INET
3685         case AF_INET:
3686                 if (inp) {
3687                         inp->inp_laddr = ((struct sockaddr_in *)dst)->sin_addr;
3688                         inp->inp_lport = ((struct sockaddr_in *)dst)->sin_port;
3689                         inp->inp_options = ip_srcroute();
3690                         in_pcbstate(inp, INP_BOUND);
3691                         if (inp->inp_options == NULL) {
3692                                 inp->inp_options = sc->sc_ipopts;
3693                                 sc->sc_ipopts = NULL;   /* ownership moved to inp */
3694                         }
3695                 }
3696 #ifdef INET6
3697                 else if (in6p) {
3698                         /* IPv4 packet to AF_INET6 socket */
3699                         memset(&in6p->in6p_laddr, 0, sizeof(in6p->in6p_laddr));
3700                         in6p->in6p_laddr.s6_addr16[5] = htons(0xffff);
3701                         bcopy(&((struct sockaddr_in *)dst)->sin_addr,
3702                                 &in6p->in6p_laddr.s6_addr32[3],
3703                                 sizeof(((struct sockaddr_in *)dst)->sin_addr));
3704                         in6p->in6p_lport = ((struct sockaddr_in *)dst)->sin_port;
3705                         in6totcpcb(in6p)->t_family = AF_INET;
3706                         if (sotoin6pcb(oso)->in6p_flags & IN6P_IPV6_V6ONLY)
3707                                 in6p->in6p_flags |= IN6P_IPV6_V6ONLY;
3708                         else
3709                                 in6p->in6p_flags &= ~IN6P_IPV6_V6ONLY;
3710                         in6_pcbstate(in6p, IN6P_BOUND);
3711                 }
3712 #endif
3713                 break;
3714 #endif
3715 #ifdef INET6
3716         case AF_INET6:
3717                 if (in6p) {
3718                         in6p->in6p_laddr = ((struct sockaddr_in6 *)dst)->sin6_addr;
3719                         in6p->in6p_lport = ((struct sockaddr_in6 *)dst)->sin6_port;
3720                         in6_pcbstate(in6p, IN6P_BOUND);
3721                 }
3722                 break;
3723 #endif
3724         }
3725 #ifdef INET6
3726         if (in6p && in6totcpcb(in6p)->t_family == AF_INET6 && sotoinpcb(oso)) {
3727                 struct in6pcb *oin6p = sotoin6pcb(oso);
3728                 /* inherit socket options from the listening socket */
3729                 in6p->in6p_flags |= (oin6p->in6p_flags & IN6P_CONTROLOPTS);
3730                 if (in6p->in6p_flags & IN6P_CONTROLOPTS) {
3731                         m_freem(in6p->in6p_options);
3732                         in6p->in6p_options = 0;
3733                 }
3734                 ip6_savecontrol(in6p, &in6p->in6p_options,
3735                         mtod(m, struct ip6_hdr *), m);
3736         }
3737 #endif
3738
3739 #if defined(IPSEC) || defined(FAST_IPSEC)
3740         /*
3741          * we make a copy of policy, instead of sharing the policy,
3742          * for better behavior in terms of SA lookup and dead SA removal.
3743          */
3744         if (inp) {
3745                 /* copy old policy into new socket's */
3746                 if (ipsec_copy_pcbpolicy(sotoinpcb(oso)->inp_sp, inp->inp_sp))
3747                         printf("tcp_input: could not copy policy\n");
3748         }
3749 #ifdef INET6
3750         else if (in6p) {
3751                 /* copy old policy into new socket's */
3752                 if (ipsec_copy_pcbpolicy(sotoin6pcb(oso)->in6p_sp,
3753                     in6p->in6p_sp))
3754                         printf("tcp_input: could not copy policy\n");
3755         }
3756 #endif
3757 #endif
3758
3759         /*
3760          * Give the new socket our cached route reference.
3761          */
3762         if (inp) {
3763                 rtcache_copy(&inp->inp_route, &sc->sc_route);
3764                 rtcache_free(&sc->sc_route);
3765         }
3766 #ifdef INET6
3767         else {  /* NOTE(review): assumes in6p is set whenever inp is not */
3768                 rtcache_copy(&in6p->in6p_route, &sc->sc_route);
3769                 rtcache_free(&sc->sc_route);
3770         }
3771 #endif
3772
3773         am = m_get(M_DONTWAIT, MT_SONAME);      /* XXX */
3774         if (am == NULL)
3775                 goto resetandabort;
3776         MCLAIM(am, &tcp_mowner);
3777         am->m_len = src->sa_len;
3778         bcopy(src, mtod(am, void *), src->sa_len);      /* am holds the peer address */
3779         if (inp) {
3780                 if (in_pcbconnect(inp, am, &lwp0)) {
3781                         (void) m_free(am);
3782                         goto resetandabort;
3783                 }
3784         }
3785 #ifdef INET6
3786         else if (in6p) {
3787                 if (src->sa_family == AF_INET) {
3788                         /* IPv4 packet to AF_INET6 socket */
3789                         struct sockaddr_in6 *sin6;
3790                         sin6 = mtod(am, struct sockaddr_in6 *);
3791                         am->m_len = sizeof(*sin6);
3792                         memset(sin6, 0, sizeof(*sin6));
3793                         sin6->sin6_family = AF_INET6;
3794                         sin6->sin6_len = sizeof(*sin6);
3795                         sin6->sin6_port = ((struct sockaddr_in *)src)->sin_port;
3796                         sin6->sin6_addr.s6_addr16[5] = htons(0xffff);
3797                         bcopy(&((struct sockaddr_in *)src)->sin_addr,
3798                                 &sin6->sin6_addr.s6_addr32[3],
3799                                 sizeof(sin6->sin6_addr.s6_addr32[3]));
3800                 }
3801                 if (in6_pcbconnect(in6p, am, NULL)) {
3802                         (void) m_free(am);
3803                         goto resetandabort;
3804                 }
3805         }
3806 #endif
3807         else {  /* neither pcb kind was found: give up on this connection */
3808                 (void) m_free(am);
3809                 goto resetandabort;
3810         }
3811         (void) m_free(am);
3812
3813         if (inp)
3814                 tp = intotcpcb(inp);
3815 #ifdef INET6
3816         else if (in6p)
3817                 tp = in6totcpcb(in6p);
3818 #endif
3819         else
3820                 tp = NULL;      /* not reached: that case went to resetandabort above */
3821         tp->t_flags = sototcpcb(oso)->t_flags & TF_NODELAY;
3822         if (sc->sc_request_r_scale != 15) {     /* 15 is stored when scaling is off */
3823                 tp->requested_s_scale = sc->sc_requested_s_scale;
3824                 tp->request_r_scale = sc->sc_request_r_scale;
3825                 tp->snd_scale = sc->sc_requested_s_scale;
3826                 tp->rcv_scale = sc->sc_request_r_scale;
3827                 tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
3828         }
3829         if (sc->sc_flags & SCF_TIMESTAMP)
3830                 tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;
3831         tp->ts_timebase = sc->sc_timebase;
3832
3833         tp->t_template = tcp_template(tp);
3834         if (tp->t_template == 0) {
3835                 tp = tcp_drop(tp, ENOBUFS);     /* destroys socket */
3836                 so = NULL;      /* so the abort path does not touch it again */
3837                 m_freem(m);
3838                 goto abort;
3839         }
3840
3841         tp->iss = sc->sc_iss;
3842         tp->irs = sc->sc_irs;
3843         tcp_sendseqinit(tp);
3844         tcp_rcvseqinit(tp);
3845         tp->t_state = TCPS_SYN_RECEIVED;
3846         TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepinit);
3847         TCP_STATINC(TCP_STAT_ACCEPTS);
3848
3849         if ((sc->sc_flags & SCF_SACK_PERMIT) && tcp_do_sack)
3850                 tp->t_flags |= TF_WILL_SACK;
3851
3852         if ((sc->sc_flags & SCF_ECN_PERMIT) && tcp_do_ecn)
3853                 tp->t_flags |= TF_ECN_PERMIT;
3854
3855 #ifdef TCP_SIGNATURE
3856         if (sc->sc_flags & SCF_SIGNATURE)
3857                 tp->t_flags |= TF_SIGNATURE;
3858 #endif
3859
3860         /* Initialize tp->t_ourmss before we deal with the peer's! */
3861         tp->t_ourmss = sc->sc_ourmaxseg;
3862         tcp_mss_from_peer(tp, sc->sc_peermaxseg);
3863
3864         /*
3865          * Initialize the initial congestion window.  If we
3866          * had to retransmit the SYN,ACK, we must initialize cwnd
3867          * to 1 segment (i.e. the Loss Window).
3868          */
3869         if (sc->sc_rxtshift)
3870                 tp->snd_cwnd = tp->t_peermss;
3871         else {
3872                 int ss = tcp_init_win;
3873 #ifdef INET
3874                 if (inp != NULL && in_localaddr(inp->inp_faddr))
3875                         ss = tcp_init_win_local;
3876 #endif
3877 #ifdef INET6
3878                 if (in6p != NULL && in6_localaddr(&in6p->in6p_faddr))
3879                         ss = tcp_init_win_local;
3880 #endif
3881                 tp->snd_cwnd = TCP_INITIAL_WINDOW(ss, tp->t_peermss);
3882         }
3883
3884         tcp_rmx_rtt(tp);
3885         tp->snd_wl1 = sc->sc_irs;
3886         tp->rcv_up = sc->sc_irs + 1;
3887
3888         /*
3889          * This is what would have happened in tcp_output() when
3890          * the SYN,ACK was sent.
3891          */
3892         tp->snd_up = tp->snd_una;
3893         tp->snd_max = tp->snd_nxt = tp->iss+1;
3894         TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
3895         if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv))
3896                 tp->rcv_adv = tp->rcv_nxt + sc->sc_win;
3897         tp->last_ack_sent = tp->rcv_nxt;
3898         tp->t_partialacks = -1;
3899         tp->t_dupacks = 0;
3900
3901         TCP_STATINC(TCP_STAT_SC_COMPLETED);
3902         s = splsoftnet();
3903         syn_cache_put(sc);
3904         splx(s);
3905         return (so);
3906
3907 resetandabort:  /* answer the peer with a RST, then fall through to abort */
3908         (void)tcp_respond(NULL, m, m, th, (tcp_seq)0, th->th_ack, TH_RST);
3909 abort:
3910         if (so != NULL) {
3911                 (void) soqremque(so, 1);
3912                 (void) soabort(so);
3913                 mutex_enter(softnet_lock);      /* NOTE(review): presumably soabort() dropped it - confirm */
3914         }
3915         s = splsoftnet();
3916         syn_cache_put(sc);
3917         splx(s);
3918         TCP_STATINC(TCP_STAT_SC_ABORTED);
3919         return ((struct socket *)(-1));
3920 }
3921
3922 /*
3923  * Called when a RST arrives for a non-existent connection.  If the
3924  * connection is in the syn cache and the RST's sequence number is
3925  * sc_irs or sc_irs + 1, the entry is removed and released.
3926  */
3927
3928 void
3929 syn_cache_reset(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th)
3930 {
3931         struct syn_cache *sc;
3932         struct syn_cache_head *scp;
3933         int s = splsoftnet();
3934
3935         if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
3936                 splx(s);
3937                 return;
3938         }
3939         if (SEQ_LT(th->th_seq, sc->sc_irs) ||
3940             SEQ_GT(th->th_seq, sc->sc_irs+1)) { /* sequence out of range: ignore */
3941                 splx(s);
3942                 return;
3943         }
3944         syn_cache_rm(sc);
3945         TCP_STATINC(TCP_STAT_SC_RESET);
3946         syn_cache_put(sc);      /* calls pool_put but see spl above */
3947         splx(s);
3948 }
3949 /* Process an unreachable error for a connection that may be in the syn cache. */
3950 void
3951 syn_cache_unreach(const struct sockaddr *src, const struct sockaddr *dst,
3952     struct tcphdr *th)
3953 {
3954         struct syn_cache *sc;
3955         struct syn_cache_head *scp;
3956         int s;
3957
3958         s = splsoftnet();
3959         if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
3960                 splx(s);
3961                 return;
3962         }
3963         /* If the sequence number != sc_iss, then it's a bogus ICMP msg */
3964         if (ntohl (th->th_seq) != sc->sc_iss) {
3965                 splx(s);
3966                 return;
3967         }
3968
3969         /*
3970          * Remove the entry only if this is at least our second error
3971          * (SCF_UNREACH already set) and we've retransmitted 3 or more
3972          * times.  Otherwise just record the error and carry on; this
3973          * keeps a spurious network outage from nuking a valid entry.
3974          *
3975          * See tcp_notify().
3976          */
3977         if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) {
3978                 sc->sc_flags |= SCF_UNREACH;
3979                 splx(s);
3980                 return;
3981         }
3982
3983         syn_cache_rm(sc);
3984         TCP_STATINC(TCP_STAT_SC_UNREACH);
3985         syn_cache_put(sc);      /* calls pool_put but see spl above */
3986         splx(s);
3987 }
3988
3989 /*
3990  * Given a LISTEN socket and an inbound SYN request, add
3991  * this to the syn cache, and send back a segment:
3992  *      <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
3993  * to the source.
3994  *
3995  * Returns 0 if the TCP options could not be processed or no cache
3996  * entry could be allocated; returns 1 otherwise.
3997  *
3998  * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
3999  * Doing so would mean holding onto the data for the application, and
4000  * under a SYN-flood DoS attack such data could eventually consume
4001  * all available buffer space.  By not ACKing it, we avoid this.
4002  */
4003 int
4004 syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
4005     unsigned int hlen, struct socket *so, struct mbuf *m, u_char *optp,
4006     int optlen, struct tcp_opt_info *oi)
4007 {
4008         struct tcpcb tb, *tp;   /* tb: scratch for tcp_dooptions(); tp: so's tcpcb */
4009         long win;
4010         struct syn_cache *sc;
4011         struct syn_cache_head *scp;
4012         struct mbuf *ipopts;
4013         struct tcp_opt_info opti;       /* only zeroed below; otherwise unused */
4014         int s;
4015
4016         tp = sototcpcb(so);
4017
4018         memset(&opti, 0, sizeof(opti));
4019
4020         /*
4021          * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
4022          *
4023          * Note this check is performed in tcp_input() very early on.
4024          */
4025
4026         /*
4027          * Initialize some local state.
4028          */
4029         win = sbspace(&so->so_rcv);
4030         if (win > TCP_MAXWIN)
4031                 win = TCP_MAXWIN;
4032
4033         switch (src->sa_family) {
4034 #ifdef INET
4035         case AF_INET:
4036                 /*
4037                  * Remember the IP options, if any.
4038                  */
4039                 ipopts = ip_srcroute();
4040                 break;
4041 #endif
4042         default:
4043                 ipopts = NULL;
4044         }
4045
4046 #ifdef TCP_SIGNATURE
4047         if (optp || (tp->t_flags & TF_SIGNATURE))
4048 #else
4049         if (optp)
4050 #endif
4051         {
4052                 tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0;
4053 #ifdef TCP_SIGNATURE
4054                 tb.t_flags |= (tp->t_flags & TF_SIGNATURE);
4055 #endif
4056                 tb.t_state = TCPS_LISTEN;
4057                 if (tcp_dooptions(&tb, optp, optlen, th, m, m->m_pkthdr.len -
4058                     sizeof(struct tcphdr) - optlen - hlen, oi) < 0)
4059                         return (0);     /* NOTE(review): ipopts is not freed on this path */
4060         } else
4061                 tb.t_flags = 0; /* NOTE(review): other tb fields (ts_recent) stay unset */
4062
4063         /*
4064          * See if we already have an entry for this connection.
4065          * If we do, resend the SYN,ACK.  We do not count this
4066          * as a retransmission (XXX though maybe we should).
4067          */
4068         if ((sc = syn_cache_lookup(src, dst, &scp)) != NULL) {
4069                 TCP_STATINC(TCP_STAT_SC_DUPESYN);
4070                 if (ipopts) {
4071                         /*
4072                          * If we were remembering a previous source route,
4073                          * forget it and use the new one we've been given.
4074                          */
4075                         if (sc->sc_ipopts)
4076                                 (void) m_free(sc->sc_ipopts);
4077                         sc->sc_ipopts = ipopts;
4078                 }
4079                 sc->sc_timestamp = tb.ts_recent;
4080                 if (syn_cache_respond(sc, m) == 0) {
4081                         uint64_t *tcps = TCP_STAT_GETREF();
4082                         tcps[TCP_STAT_SNDACKS]++;
4083                         tcps[TCP_STAT_SNDTOTAL]++;
4084                         TCP_STAT_PUTREF();
4085                 }
4086                 return (1);
4087         }
4088
4089         s = splsoftnet();
4090         sc = pool_get(&syn_cache_pool, PR_NOWAIT);
4091         splx(s);
4092         if (sc == NULL) {
4093                 if (ipopts)
4094                         (void) m_free(ipopts);
4095                 return (0);
4096         }
4097
4098         /*
4099          * Fill in the cache, and put the necessary IP and TCP
4100          * options into the reply.
4101          */
4102         memset(sc, 0, sizeof(struct syn_cache));
4103         callout_init(&sc->sc_timer, CALLOUT_MPSAFE);
4104         bcopy(src, &sc->sc_src, src->sa_len);
4105         bcopy(dst, &sc->sc_dst, dst->sa_len);
4106         sc->sc_flags = 0;
4107         sc->sc_ipopts = ipopts;
4108         sc->sc_irs = th->th_seq;
4109         switch (src->sa_family) {       /* pick our initial send sequence number */
4110 #ifdef INET
4111         case AF_INET:
4112             {
4113                 struct sockaddr_in *srcin = (void *) src;
4114                 struct sockaddr_in *dstin = (void *) dst;
4115
4116                 sc->sc_iss = tcp_new_iss1(&dstin->sin_addr,
4117                     &srcin->sin_addr, dstin->sin_port,
4118                     srcin->sin_port, sizeof(dstin->sin_addr), 0);
4119                 break;
4120             }
4121 #endif /* INET */
4122 #ifdef INET6
4123         case AF_INET6:
4124             {
4125                 struct sockaddr_in6 *srcin6 = (void *) src;
4126                 struct sockaddr_in6 *dstin6 = (void *) dst;
4127
4128                 sc->sc_iss = tcp_new_iss1(&dstin6->sin6_addr,
4129                     &srcin6->sin6_addr, dstin6->sin6_port,
4130                     srcin6->sin6_port, sizeof(dstin6->sin6_addr), 0);
4131                 break;
4132             }
4133 #endif /* INET6 */
4134         }
4135         sc->sc_peermaxseg = oi->maxseg;
4136         sc->sc_ourmaxseg = tcp_mss_to_advertise(m->m_flags & M_PKTHDR ?
4137                                                 m->m_pkthdr.rcvif : NULL,
4138                                                 sc->sc_src.sa.sa_family);
4139         sc->sc_win = win;
4140         sc->sc_timebase = tcp_now - 1;  /* see tcp_newtcpcb() */
4141         sc->sc_timestamp = tb.ts_recent;
4142         if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) ==
4143             (TF_REQ_TSTMP|TF_RCVD_TSTMP))
4144                 sc->sc_flags |= SCF_TIMESTAMP;
4145         if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
4146             (TF_RCVD_SCALE|TF_REQ_SCALE)) {
4147                 sc->sc_requested_s_scale = tb.requested_s_scale;
4148                 sc->sc_request_r_scale = 0;
4149                 /*
4150                  * Pick the smallest possible scaling factor that
4151                  * will still allow us to scale up to sb_max.
4152                  *
4153                  * We do this because there are broken firewalls that
4154                  * will corrupt the window scale option, leading to
4155                  * the other endpoint believing that our advertised
4156                  * window is unscaled.  At scale factors larger than
4157                  * 5 the unscaled window will drop below 1500 bytes,
4158                  * leading to serious problems when traversing these
4159                  * broken firewalls.
4160                  *
4161                  * With the default sbmax of 256K, a scale factor
4162                  * of 3 will be chosen by this algorithm.  Those who
4163                  * choose a larger sbmax should watch out
4164                  * for the compatibility problems mentioned above.
4165                  *
4166                  * RFC1323: The Window field in a SYN (i.e., a <SYN>
4167                  * or <SYN,ACK>) segment itself is never scaled.
4168                  */
4169                 while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT &&
4170                     (TCP_MAXWIN << sc->sc_request_r_scale) < sb_max)
4171                         sc->sc_request_r_scale++;
4172         } else {
4173                 sc->sc_requested_s_scale = 15;  /* 15: window scaling not in use */
4174                 sc->sc_request_r_scale = 15;
4175         }
4176         if ((tb.t_flags & TF_SACK_PERMIT) && tcp_do_sack)
4177                 sc->sc_flags |= SCF_SACK_PERMIT;
4178
4179         /*
4180          * ECN setup packet received.
4181          */
4182         if ((th->th_flags & (TH_ECE|TH_CWR)) && tcp_do_ecn)
4183                 sc->sc_flags |= SCF_ECN_PERMIT;
4184
4185 #ifdef TCP_SIGNATURE
4186         if (tb.t_flags & TF_SIGNATURE)
4187                 sc->sc_flags |= SCF_SIGNATURE;
4188 #endif
4189         sc->sc_tp = tp;
4190         if (syn_cache_respond(sc, m) == 0) {
4191                 uint64_t *tcps = TCP_STAT_GETREF();
4192                 tcps[TCP_STAT_SNDACKS]++;
4193                 tcps[TCP_STAT_SNDTOTAL]++;
4194                 TCP_STAT_PUTREF();
4195                 syn_cache_insert(sc, tp);
4196         } else {        /* SYN,ACK could not be sent: drop the new entry */
4197                 s = splsoftnet();
4198                 syn_cache_put(sc);
4199                 splx(s);
4200                 TCP_STATINC(TCP_STAT_SC_DROPPED);
4201         }
4202         return (1);
4203 }
4204 /* Build and send the SYN,ACK for sc; m is always freed.  Returns an error number. */
4205 int
4206 syn_cache_respond(struct syn_cache *sc, struct mbuf *m)
4207 {
4208 #ifdef INET6
4209         struct rtentry *rt;
4210 #endif
4211         struct route *ro;
4212         u_int8_t *optp;
4213         int optlen, error;
4214         u_int16_t tlen;
4215         struct ip *ip = NULL;
4216 #ifdef INET6
4217         struct ip6_hdr *ip6 = NULL;
4218 #endif
4219         struct tcpcb *tp = NULL;
4220         struct tcphdr *th;
4221         u_int hlen;
4222         struct socket *so;
4223
4224         ro = &sc->sc_route;
4225         switch (sc->sc_src.sa.sa_family) {
4226         case AF_INET:
4227                 hlen = sizeof(struct ip);
4228                 break;
4229 #ifdef INET6
4230         case AF_INET6:
4231                 hlen = sizeof(struct ip6_hdr);
4232                 break;
4233 #endif
4234         default:
4235                 if (m)
4236                         m_freem(m);
4237                 return (EAFNOSUPPORT);
4238         }
4239
4240         /* Compute the size of the TCP options. */
4241         optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) +   /* MSS, window scale */
4242             ((sc->sc_flags & SCF_SACK_PERMIT) ? (TCPOLEN_SACK_PERMITTED + 2) : 0) +
4243 #ifdef TCP_SIGNATURE
4244             ((sc->sc_flags & SCF_SIGNATURE) ? (TCPOLEN_SIGNATURE + 2) : 0) +
4245 #endif
4246             ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0);
4247
4248         tlen = hlen + sizeof(struct tcphdr) + optlen;
4249
4250         /*
4251          * Create the IP+TCP header from scratch.
4252          */
4253         if (m)
4254                 m_freem(m);     /* the caller's mbuf is not reused */
4255 #ifdef DIAGNOSTIC
4256         if (max_linkhdr + tlen > MCLBYTES)
4257                 return (ENOBUFS);
4258 #endif
4259         MGETHDR(m, M_DONTWAIT, MT_DATA);
4260         if (m && tlen > MHLEN) {
4261                 MCLGET(m, M_DONTWAIT);
4262                 if ((m->m_flags & M_EXT) == 0) {
4263                         m_freem(m);
4264                         m = NULL;
4265                 }
4266         }
4267         if (m == NULL)
4268                 return (ENOBUFS);
4269         MCLAIM(m, &tcp_tx_mowner);
4270
4271         /* Fixup the mbuf. */
4272         m->m_data += max_linkhdr;
4273         m->m_len = m->m_pkthdr.len = tlen;
4274         if (sc->sc_tp) {        /* find the socket behind sc_tp, if any */
4275                 tp = sc->sc_tp;
4276                 if (tp->t_inpcb)
4277                         so = tp->t_inpcb->inp_socket;
4278 #ifdef INET6
4279                 else if (tp->t_in6pcb)
4280                         so = tp->t_in6pcb->in6p_socket;
4281 #endif
4282                 else
4283                         so = NULL;
4284         } else
4285                 so = NULL;
4286         m->m_pkthdr.rcvif = NULL;
4287         memset(mtod(m, u_char *), 0, tlen);
4288
4289         switch (sc->sc_src.sa.sa_family) {      /* reply goes from sc_dst to sc_src */
4290         case AF_INET:
4291                 ip = mtod(m, struct ip *);
4292                 ip->ip_v = 4;
4293                 ip->ip_dst = sc->sc_src.sin.sin_addr;
4294                 ip->ip_src = sc->sc_dst.sin.sin_addr;
4295                 ip->ip_p = IPPROTO_TCP;
4296                 th = (struct tcphdr *)(ip + 1);
4297                 th->th_dport = sc->sc_src.sin.sin_port;
4298                 th->th_sport = sc->sc_dst.sin.sin_port;
4299                 break;
4300 #ifdef INET6
4301         case AF_INET6:
4302                 ip6 = mtod(m, struct ip6_hdr *);
4303                 ip6->ip6_vfc = IPV6_VERSION;
4304                 ip6->ip6_dst = sc->sc_src.sin6.sin6_addr;
4305                 ip6->ip6_src = sc->sc_dst.sin6.sin6_addr;
4306                 ip6->ip6_nxt = IPPROTO_TCP;
4307                 /* ip6_plen will be updated in ip6_output() */
4308                 th = (struct tcphdr *)(ip6 + 1);
4309                 th->th_dport = sc->sc_src.sin6.sin6_port;
4310                 th->th_sport = sc->sc_dst.sin6.sin6_port;
4311                 break;
4312 #endif
4313         default:
4314                 th = NULL;      /* not reached: other families returned above */
4315         }
4316
4317         th->th_seq = htonl(sc->sc_iss);
4318         th->th_ack = htonl(sc->sc_irs + 1);
4319         th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
4320         th->th_flags = TH_SYN|TH_ACK;
4321         th->th_win = htons(sc->sc_win);
4322         /* th_sum already 0 */
4323         /* th_urp already 0 */
4324
4325         /* Tack on the TCP options. */
4326         optp = (u_int8_t *)(th + 1);
4327         *optp++ = TCPOPT_MAXSEG;
4328         *optp++ = 4;
4329         *optp++ = (sc->sc_ourmaxseg >> 8) & 0xff;
4330         *optp++ = sc->sc_ourmaxseg & 0xff;
4331
4332         if (sc->sc_request_r_scale != 15) {
4333                 *((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
4334                     TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
4335                     sc->sc_request_r_scale);
4336                 optp += 4;
4337         }
4338
4339         if (sc->sc_flags & SCF_TIMESTAMP) {
4340                 u_int32_t *lp = (u_int32_t *)(optp);
4341                 /* Form timestamp option as shown in appendix A of RFC 1323. */
4342                 *lp++ = htonl(TCPOPT_TSTAMP_HDR);
4343                 *lp++ = htonl(SYN_CACHE_TIMESTAMP(sc));
4344                 *lp   = htonl(sc->sc_timestamp);
4345                 optp += TCPOLEN_TSTAMP_APPA;
4346         }
4347
4348         if (sc->sc_flags & SCF_SACK_PERMIT) {
4349                 u_int8_t *p = optp;
4350
4351                 /* Let the peer know that we will SACK. */
4352                 p[0] = TCPOPT_SACK_PERMITTED;
4353                 p[1] = 2;
4354                 p[2] = TCPOPT_NOP;
4355                 p[3] = TCPOPT_NOP;
4356                 optp += 4;
4357         }
4358
4359         /*
4360          * Send ECN SYN-ACK setup packet.
4361          * Routes can be asymmetric, so, even if we receive a packet
4362          * with ECE and CWR set, we must not assume no one will block
4363          * the ECE packet we are about to send.
4364          */
4365         if ((sc->sc_flags & SCF_ECN_PERMIT) && tp &&
4366             SEQ_GEQ(tp->snd_nxt, tp->snd_max)) {
4367                 th->th_flags |= TH_ECE;
4368                 TCP_STATINC(TCP_STAT_ECN_SHS);
4369
4370                 /*
4371                  * draft-ietf-tcpm-ecnsyn-00.txt
4372                  *
4373                  * "[...] a TCP node MAY respond to an ECN-setup
4374                  * SYN packet by setting ECT in the responding
4375                  * ECN-setup SYN/ACK packet, indicating to routers
4376                  * that the SYN/ACK packet is ECN-Capable.
4377                  * This allows a congested router along the path
4378                  * to mark the packet instead of dropping the
4379                  * packet as an indication of congestion."
4380                  *
4381                  * "[...] There can be a great benefit in setting
4382                  * an ECN-capable codepoint in SYN/ACK packets [...]
4383                  * Congestion is  most likely to occur in
4384                  * the server-to-client direction.  As a result,
4385                  * setting an ECN-capable codepoint in SYN/ACK
4386                  * packets can reduce the occurence of three-second
4387                  * retransmit timeouts resulting from the drop
4388                  * of SYN/ACK packets."
4389                  *
4390                  * Page 4 and 6, January 2006.
4391                  */
4392
4393                 switch (sc->sc_src.sa.sa_family) {
4394 #ifdef INET
4395                 case AF_INET:
4396                         ip->ip_tos |= IPTOS_ECN_ECT0;
4397                         break;
4398 #endif
4399 #ifdef INET6
4400                 case AF_INET6:
4401                         ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
4402                         break;
4403 #endif
4404                 }
4405                 TCP_STATINC(TCP_STAT_ECN_ECT);
4406         }
4407
4408 #ifdef TCP_SIGNATURE
4409         if (sc->sc_flags & SCF_SIGNATURE) {
4410                 struct secasvar *sav;
4411                 u_int8_t *sigp;
4412
4413                 sav = tcp_signature_getsav(m, th);
4414
4415                 if (sav == NULL) {
4416                         if (m)
4417                                 m_freem(m);
4418                         return (EPERM);
4419                 }
4420
4421                 *optp++ = TCPOPT_SIGNATURE;
4422                 *optp++ = TCPOLEN_SIGNATURE;
4423                 sigp = optp;
4424                 memset(optp, 0, TCP_SIGLEN);
4425                 optp += TCP_SIGLEN;
4426                 *optp++ = TCPOPT_NOP;
4427                 *optp++ = TCPOPT_EOL;
4428
4429                 (void)tcp_signature(m, th, hlen, sav, sigp);
4430
4431                 key_sa_recordxfer(sav, m);
4432 #ifdef FAST_IPSEC
4433                 KEY_FREESAV(&sav);
4434 #else
4435                 key_freesav(sav);
4436 #endif
4437         }
4438 #endif
4439
4440         /* Compute the packet's checksum. */
4441         switch (sc->sc_src.sa.sa_family) {
4442         case AF_INET:
4443                 ip->ip_len = htons(tlen - hlen);        /* temporary; full length set below */
4444                 th->th_sum = 0;
4445                 th->th_sum = in4_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
4446                 break;
4447 #ifdef INET6
4448         case AF_INET6:
4449                 ip6->ip6_plen = htons(tlen - hlen);
4450                 th->th_sum = 0;
4451                 th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
4452                 break;
4453 #endif
4454         }
4455
4456         /*
4457          * Fill in some straggling IP bits.  ip_len is stored with
4458          * htons() here, i.e. as the total length in network byte order.
4459          */
4460         switch (sc->sc_src.sa.sa_family) {
4461 #ifdef INET
4462         case AF_INET:
4463                 ip->ip_len = htons(tlen);
4464                 ip->ip_ttl = ip_defttl;
4465                 /* XXX tos? */
4466                 break;
4467 #endif
4468 #ifdef INET6
4469         case AF_INET6:
4470                 ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
4471                 ip6->ip6_vfc |= IPV6_VERSION;
4472                 ip6->ip6_plen = htons(tlen - hlen);
4473                 /* ip6_hlim will be initialized afterwards */
4474                 /* XXX flowlabel? */
4475                 break;
4476 #endif
4477         }
4478
4479         /* XXX use IPsec policy on listening socket, on SYN ACK */
4480         tp = sc->sc_tp;
4481
4482         switch (sc->sc_src.sa.sa_family) {      /* hand the packet to the IP layer */
4483 #ifdef INET
4484         case AF_INET:
4485                 error = ip_output(m, sc->sc_ipopts, ro,
4486                     (ip_mtudisc ? IP_MTUDISC : 0),
4487                     (struct ip_moptions *)NULL, so);
4488                 break;
4489 #endif
4490 #ifdef INET6
4491         case AF_INET6:
4492                 ip6->ip6_hlim = in6_selecthlim(NULL,
4493                                 (rt = rtcache_validate(ro)) != NULL ? rt->rt_ifp
4494                                                                     : NULL);
4495
4496                 error = ip6_output(m, NULL /*XXX*/, ro, 0, NULL, so, NULL);
4497                 break;
4498 #endif
4499         default:
4500                 error = EAFNOSUPPORT;
4501                 break;
4502         }
4503         return (error);
4504 }