i386/linux-2.3.21/net/ipv4/tcp_input.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     $Id: tcp_input.c,v 1.173 1999/09/07 02:31:27 davem Exp $
   9  *
  10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  14  *              Florian La Roche, <flla@stud.uni-sb.de>
  15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
  19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  20  *              Jorge Cwik, <jorge@laser.satlink.net>
  21  */
  22
  23 /*
  24  * Changes:
  25  *              Pedro Roque     :       Fast Retransmit/Recovery.
  26  *                                      Two receive queues.
  27  *                                      Retransmit queue handled by TCP.
  28  *                                      Better retransmit timer handling.
  29  *                                      New congestion avoidance.
  30  *                                      Header prediction.
  31  *                                      Variable renaming.
  32  *
  33  *              Eric            :       Fast Retransmit.
  34  *              Randy Scott     :       MSS option defines.
  35  *              Eric Schenk     :       Fixes to slow start algorithm.
  36  *              Eric Schenk     :       Yet another double ACK bug.
  37  *              Eric Schenk     :       Delayed ACK bug fixes.
  38  *              Eric Schenk     :       Floyd style fast retrans war avoidance.
  39  *              David S. Miller :       Don't allow zero congestion window.
  40  *              Eric Schenk     :       Fix retransmitter so that it sends
  41  *                                      next packet on ack of previous packet.
  42  *              Andi Kleen      :       Moved open_request checking here
  43  *                                      and process RSTs for open_requests.
  44  *              Andi Kleen      :       Better prune_queue, and other fixes.
  45  *              Andrey Savochkin:       Fix RTT measurements in the presnce of
  46  *                                      timestamps.
  47  *              Andrey Savochkin:       Check sequence numbers correctly when
  48  *                                      removing SACKs due to in sequence incoming
  49  *                                      data segments.
  50  *              Andi Kleen:             Make sure we never ack data there is not
  51  *                                      enough room for. Also make this condition
  52  *                                      a fatal error if it might still happen.
  53  *              Andi Kleen:             Add tcp_measure_rcv_mss to make
  54  *                                      connections with MSS<min(MTU,ann. MSS)
  55  *                                      work without delayed acks.
  56  *              Andi Kleen:             Process packets with PSH set in the
  57  *                                      fast path.
  58  */
  59
  60 #include <linux/config.h>
  61 #include <linux/mm.h>
  62 #include <linux/sysctl.h>
  63 #include <net/tcp.h>
  64 #include <net/inet_common.h>
  65 #include <linux/ipsec.h>
  66
  67 #ifdef CONFIG_SYSCTL
  68 #define SYNC_INIT 0 /* let the user enable it */
  69 #else
  70 #define SYNC_INIT 1
  71 #endif
  72
  73 extern int sysctl_tcp_fin_timeout;
  74 extern int sysctl_tcp_keepalive_time;
  75
  76 /* These are on by default so the code paths get tested.
  77  * For the final 2.2 this may be undone at our discretion. -DaveM
  78  */
  79 int sysctl_tcp_timestamps = 1;
  80 int sysctl_tcp_window_scaling = 1;
  81 int sysctl_tcp_sack = 1;
  82
  83 int sysctl_tcp_syncookies = SYNC_INIT;
  84 int sysctl_tcp_stdurg;
  85 int sysctl_tcp_rfc1337;
  86 int sysctl_tcp_tw_recycle;
  87
  88 static int prune_queue(struct sock *sk);
  89
  90 /* There is something which you must keep in mind when you analyze the
  91  * behavior of the tp->ato delayed ack timeout interval.  When a
  92  * connection starts up, we want to ack as quickly as possible.  The
  93  * problem is that "good" TCP's do slow start at the beginning of data
  94  * transmission.  The means that until we send the first few ACK's the
  95  * sender will sit on his end and only queue most of his data, because
  96  * he can only send snd_cwnd unacked packets at any given time.  For
  97  * each ACK we send, he increments snd_cwnd and transmits more of his
  98  * queue.  -DaveM
  99  */
 100 static void tcp_delack_estimator(struct tcp_opt *tp)
 101 {
 102         if(tp->ato == 0) {
 103                 tp->lrcvtime = tcp_time_stamp;
 104
 105                 /* Help sender leave slow start quickly,
 106                  * and also makes sure we do not take this
 107                  * branch ever again for this connection.
 108                  */
 109                 tp->ato = 1;
 110                 tcp_enter_quickack_mode(tp);
 111         } else {
 112                 int m = tcp_time_stamp - tp->lrcvtime;
 113
 114                 tp->lrcvtime = tcp_time_stamp;
 115                 if(m <= 0)
 116                         m = 1;
 117                 if(m > tp->rto)
 118                         tp->ato = tp->rto;
 119                 else {
 120                         /* This funny shift makes sure we
 121                          * clear the "quick ack mode" bit.
 122                          */
 123                         tp->ato = ((tp->ato << 1) >> 2) + m;
 124                 }
 125         }
 126 }
 127
 128 /*
 129  * Remember to send an ACK later.
 130  */
 131 static __inline__ void tcp_remember_ack(struct tcp_opt *tp, struct tcphdr *th,
 132                                         struct sk_buff *skb)
 133 {
 134         tp->delayed_acks++;
 135
 136         /* Tiny-grams with PSH set artifically deflate our
 137          * ato measurement, but with a lower bound.
 138          */
 139         if(th->psh && (skb->len < (tp->rcv_mss >> 1))) {
 140                 /* Preserve the quickack state. */
 141                 if((tp->ato & 0x7fffffff) > HZ/50)
 142                         tp->ato = ((tp->ato & 0x80000000) |
 143                                    (HZ/50));
 144         }
 145 }
 146
 147 /* Called to compute a smoothed rtt estimate. The data fed to this
 148  * routine either comes from timestamps, or from segments that were
 149  * known _not_ to have been retransmitted [see Karn/Partridge
 150  * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
 151  * piece by Van Jacobson.
 152  * NOTE: the next three routines used to be one big routine.
 153  * To save cycles in the RFC 1323 implementation it was better to break
 154  * it up into three procedures. -- erics
 155  */
 156
 157 static __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt)
 158 {
 159         long m = mrtt; /* RTT */
 160
 161         /*      The following amusing code comes from Jacobson's
 162          *      article in SIGCOMM '88.  Note that rtt and mdev
 163          *      are scaled versions of rtt and mean deviation.
 164          *      This is designed to be as fast as possible
 165          *      m stands for "measurement".
 166          *
 167          *      On a 1990 paper the rto value is changed to:
 168          *      RTO = rtt + 4 * mdev
 169          */
 170         if(m == 0)
 171                 m = 1;
 172         if (tp->srtt != 0) {
 173                 m -= (tp->srtt >> 3);   /* m is now error in rtt est */
 174                 tp->srtt += m;          /* rtt = 7/8 rtt + 1/8 new */
 175                 if (m < 0)
 176                         m = -m;         /* m is now abs(error) */
 177                 m -= (tp->mdev >> 2);   /* similar update on mdev */
 178                 tp->mdev += m;          /* mdev = 3/4 mdev + 1/4 new */
 179         } else {
 180                 /* no previous measure. */
 181                 tp->srtt = m<<3;        /* take the measured time to be rtt */
 182                 tp->mdev = m<<2;        /* make sure rto = 3*rtt */
 183         }
 184 }
 185
 186 /* Calculate rto without backoff.  This is the second half of Van Jacobson's
 187  * routine referred to above.
 188  */
 189
 190 static __inline__ void tcp_set_rto(struct tcp_opt *tp)
 191 {
 192         tp->rto = (tp->srtt >> 3) + tp->mdev;
 193         /* I am not enough educated to understand this magic.
 194          * However, it smells bad. snd_cwnd>31 is common case.
 195          */
 196         tp->rto += (tp->rto >> 2) + (tp->rto >> (tp->snd_cwnd-1));
 197 }
 198
 199
 200 /* Keep the rto between HZ/5 and 120*HZ. 120*HZ is the upper bound
 201  * on packet lifetime in the internet. We need the HZ/5 lower
 202  * bound to behave correctly against BSD stacks with a fixed
 203  * delayed ack.
 204  * FIXME: It's not entirely clear this lower bound is the best
 205  * way to avoid the problem. Is it possible to drop the lower
 206  * bound and still avoid trouble with BSD stacks? Perhaps
 207  * some modification to the RTO calculation that takes delayed
 208  * ack bias into account? This needs serious thought. -- erics
 209  */
 210 static __inline__ void tcp_bound_rto(struct tcp_opt *tp)
 211 {
 212         if (tp->rto > 120*HZ)
 213                 tp->rto = 120*HZ;
 214         if (tp->rto < HZ/5)
 215                 tp->rto = HZ/5;
 216 }
 217
 218 /* Save metrics learned by this TCP session.
 219    This function is called only, when TCP finishes sucessfully
 220    i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE.
 221  */
 222 static void tcp_update_metrics(struct sock *sk)
 223 {
 224         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 225         struct dst_entry *dst = __sk_dst_get(sk);
 226
 227         if (dst) {
 228                 int m;
 229
 230                 if (tp->backoff || !tp->srtt) {
 231                         /* This session failed to estimate rtt. Why?
 232                          * Probably, no packets returned in time.
 233                          * Reset our results.
 234                          */
 235                         if (!(dst->mxlock&(1<<RTAX_RTT)))
 236                                 dst->rtt = 0;
 237                         return;
 238                 }
 239
 240                 dst_confirm(dst);
 241
 242                 m = dst->rtt - tp->srtt;
 243
 244                 /* If newly calculated rtt larger than stored one,
 245                  * store new one. Otherwise, use EWMA. Remember,
 246                  * rtt overestimation is always better than underestimation.
 247                  */
 248                 if (!(dst->mxlock&(1<<RTAX_RTT))) {
 249                         if (m <= 0)
 250                                 dst->rtt = tp->srtt;
 251                         else
 252                                 dst->rtt -= (m>>3);
 253                 }
 254
 255                 if (!(dst->mxlock&(1<<RTAX_RTTVAR))) {
 256                         if (m < 0)
 257                                 m = -m;
 258
 259                         /* Scale deviation to rttvar fixed point */
 260                         m >>= 1;
 261                         if (m < tp->mdev)
 262                                 m = tp->mdev;
 263
 264                         if (m >= dst->rttvar)
 265                                 dst->rttvar = m;
 266                         else
 267                                 dst->rttvar -= (dst->rttvar - m)>>2;
 268                 }
 269
 270                 if (tp->snd_ssthresh == 0x7FFFFFFF) {
 271                         /* Slow start still did not finish. */
 272                         if (dst->ssthresh &&
 273                             !(dst->mxlock&(1<<RTAX_SSTHRESH)) &&
 274                             tp->snd_cwnd > dst->ssthresh)
 275                                 dst->ssthresh = tp->snd_cwnd;
 276                         if (!(dst->mxlock&(1<<RTAX_CWND)) &&
 277                             tp->snd_cwnd > dst->cwnd)
 278                                 dst->cwnd = tp->snd_cwnd;
 279                 } else if (tp->snd_cwnd >= tp->snd_ssthresh && !tp->high_seq) {
 280                         /* Cong. avoidance phase, cwnd is reliable. */
 281                         if (!(dst->mxlock&(1<<RTAX_SSTHRESH)))
 282                                 dst->ssthresh = tp->snd_cwnd;
 283                         if (!(dst->mxlock&(1<<RTAX_CWND)))
 284                                 dst->cwnd = (dst->cwnd + tp->snd_cwnd)>>1;
 285                 } else {
 286                         /* Else slow start did not finish, cwnd is non-sense,
 287                            ssthresh may be also invalid.
 288                          */
 289                         if (!(dst->mxlock&(1<<RTAX_CWND)))
 290                                 dst->cwnd = (dst->cwnd + tp->snd_ssthresh)>>1;
 291                         if (dst->ssthresh &&
 292                             !(dst->mxlock&(1<<RTAX_SSTHRESH)) &&
 293                             tp->snd_ssthresh > dst->ssthresh)
 294                                 dst->ssthresh = tp->snd_ssthresh;
 295                 }
 296         }
 297 }
 298
 299 /* Initialize metrics on socket. */
 300
 301 static void tcp_init_metrics(struct sock *sk)
 302 {
 303         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 304         struct dst_entry *dst = __sk_dst_get(sk);
 305
 306         if (dst == NULL)
 307                 goto reset;
 308
 309         dst_confirm(dst);
 310
 311         if (dst->rtt == 0)
 312                 goto reset;
 313
 314         if (!tp->srtt || !tp->saw_tstamp)
 315                 goto reset;
 316
 317         /* Initial rtt is determined from SYN,SYN-ACK.
 318          * The segment is small and rtt may appear much
 319          * less than real one. Use per-dst memory
 320          * to make it more realistic.
 321          *
 322          * A bit of theory. RTT is time passed after "normal" sized packet
 323          * is sent until it is ACKed. In normal curcumstances sending small
 324          * packets force peer to delay ACKs and calculation is correct too.
 325          * The algorithm is adaptive and, provided we follow specs, it
 326          * NEVER underestimate RTT. BUT! If peer tries to make some clever
 327          * tricks sort of "quick acks" for time long enough to decrease RTT
 328          * to low value, and then abruptly stops to do it and starts to delay
 329          * ACKs, wait for troubles.
 330          */
 331         if (dst->rtt > tp->srtt)
 332                 tp->srtt = dst->rtt;
 333         if (dst->rttvar > tp->mdev)
 334                 tp->mdev = dst->rttvar;
 335         tcp_set_rto(tp);
 336         tcp_bound_rto(tp);
 337
 338         if (dst->mxlock&(1<<RTAX_CWND))
 339                 tp->snd_cwnd_clamp = dst->cwnd;
 340         if (dst->ssthresh) {
 341                 tp->snd_ssthresh = dst->ssthresh;
 342                 if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
 343                         tp->snd_ssthresh = tp->snd_cwnd_clamp;
 344         }
 345         return;
 346
 347
 348 reset:
 349         /* Play conservative. If timestamps are not
 350          * supported, TCP will fail to recalculate correct
 351          * rtt, if initial rto is too small. FORGET ALL AND RESET!
 352          */
 353         if (!tp->saw_tstamp && tp->srtt) {
 354                 tp->srtt = 0;
 355                 tp->mdev = TCP_TIMEOUT_INIT;
 356                 tp->rto = TCP_TIMEOUT_INIT;
 357         }
 358 }
 359
 360 #define PAWS_24DAYS     (60 * 60 * 24 * 24)
 361
 362
 363 /* WARNING: this must not be called if tp->saw_tstamp was false. */
 364 extern __inline__ void
 365 tcp_replace_ts_recent(struct sock *sk, struct tcp_opt *tp, u32 seq)
 366 {
 367         if (!after(seq, tp->last_ack_sent)) {
 368                 /* PAWS bug workaround wrt. ACK frames, the PAWS discard
 369                  * extra check below makes sure this can only happen
 370                  * for pure ACK frames.  -DaveM
 371                  *
 372                  * Not only, also it occurs for expired timestamps
 373                  * and RSTs with bad timestamp option. --ANK
 374                  */
 375
 376                 if((s32)(tp->rcv_tsval - tp->ts_recent) >= 0 ||
 377                    xtime.tv_sec >= tp->ts_recent_stamp + PAWS_24DAYS) {
 378                         tp->ts_recent = tp->rcv_tsval;
 379                         tp->ts_recent_stamp = xtime.tv_sec;
 380                 }
 381         }
 382 }
 383
 384 extern __inline__ int tcp_paws_discard(struct tcp_opt *tp, struct sk_buff *skb)
 385 {
 386         return ((s32)(tp->rcv_tsval - tp->ts_recent) < 0 &&
 387                 xtime.tv_sec < tp->ts_recent_stamp + PAWS_24DAYS
 388
 389                  /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM
 390
 391                     I cannot see quitely as all the idea behind PAWS
 392                     is destroyed 8)
 393
 394                     The problem is only in reordering duplicate ACKs.
 395                     Hence, we can check this rare case more carefully.
 396
 397                     1. Check that it is really duplicate ACK (ack==snd_una)
 398                     2. Give it some small "replay" window (~RTO)
 399
 400                     We do not know units of foreign ts values, but make conservative
 401                     assumption that they are >=1ms. It solves problem
 402                     noted in Dave's mail to tcpimpl and does not harm PAWS. --ANK
 403                   */
 404                  && (TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq ||
 405                      TCP_SKB_CB(skb)->ack_seq != tp->snd_una ||
 406                      !skb->h.th->ack ||
 407                      (s32)(tp->ts_recent - tp->rcv_tsval) > (tp->rto*1024)/HZ));
 408 }
 409
 410
 411 static int __tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq)
 412 {
 413         u32 end_window = tp->rcv_wup + tp->rcv_wnd;
 414
 415         if (tp->rcv_wnd &&
 416             after(end_seq, tp->rcv_nxt) &&
 417             before(seq, end_window))
 418                 return 1;
 419         if (seq != end_window)
 420                 return 0;
 421         return (seq == end_seq);
 422 }
 423
 424 /* This functions checks to see if the tcp header is actually acceptable. */
 425 extern __inline__ int tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq)
 426 {
 427         if (seq == tp->rcv_nxt)
 428                 return (tp->rcv_wnd || (end_seq == seq));
 429
 430         return __tcp_sequence(tp, seq, end_seq);
 431 }
 432
 433 /* When we get a reset we do this. */
 434 static void tcp_reset(struct sock *sk)
 435 {
 436         sk->zapped = 1;
 437
 438         /* We want the right error as BSD sees it (and indeed as we do). */
 439         switch (sk->state) {
 440                 case TCP_SYN_SENT:
 441                         sk->err = ECONNREFUSED;
 442                         break;
 443                 case TCP_CLOSE_WAIT:
 444                         sk->err = EPIPE;
 445                         break;
 446                 case TCP_CLOSE:
 447                         return;
 448                 default:
 449                         sk->err = ECONNRESET;
 450         };
 451         tcp_set_state(sk, TCP_CLOSE);
 452         tcp_clear_xmit_timers(sk);
 453         tcp_done(sk);
 454 }
 455
 456 /* This tags the retransmission queue when SACKs arrive. */
 457 static void tcp_sacktag_write_queue(struct sock *sk, struct tcp_sack_block *sp, int nsacks)
 458 {
 459         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 460         int i = nsacks;
 461
 462         while(i--) {
 463                 struct sk_buff *skb = skb_peek(&sk->write_queue);
 464                 __u32 start_seq = ntohl(sp->start_seq);
 465                 __u32 end_seq = ntohl(sp->end_seq);
 466                 int fack_count = 0;
 467
 468                 while((skb != NULL) &&
 469                       (skb != tp->send_head) &&
 470                       (skb != (struct sk_buff *)&sk->write_queue)) {
 471                         /* The retransmission queue is always in order, so
 472                          * we can short-circuit the walk early.
 473                          */
 474                         if(after(TCP_SKB_CB(skb)->seq, end_seq))
 475                                 break;
 476
 477                         /* We play conservative, we don't allow SACKS to partially
 478                          * tag a sequence space.
 479                          */
 480                         fack_count++;
 481                         if(!after(start_seq, TCP_SKB_CB(skb)->seq) &&
 482                            !before(end_seq, TCP_SKB_CB(skb)->end_seq)) {
 483                                 /* If this was a retransmitted frame, account for it. */
 484                                 if((TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) &&
 485                                    tp->retrans_out)
 486                                         tp->retrans_out--;
 487                                 TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED;
 488
 489                                 /* RULE: All new SACKs will either decrease retrans_out
 490                                  *       or advance fackets_out.
 491                                  */
 492                                 if(fack_count > tp->fackets_out)
 493                                         tp->fackets_out = fack_count;
 494                         }
 495                         skb = skb->next;
 496                 }
 497                 sp++; /* Move on to the next SACK block. */
 498         }
 499 }
 500
 501 /* Look for tcp options. Normally only called on SYN and SYNACK packets.
 502  * But, this can also be called on packets in the established flow when
 503  * the fast version below fails.
 504  */
 505 void tcp_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp, int no_fancy)
 506 {
 507         unsigned char *ptr;
 508         int length=(th->doff*4)-sizeof(struct tcphdr);
 509
 510         ptr = (unsigned char *)(th + 1);
 511         tp->saw_tstamp = 0;
 512
 513         while(length>0) {
 514                 int opcode=*ptr++;
 515                 int opsize;
 516
 517                 switch (opcode) {
 518                         case TCPOPT_EOL:
 519                                 return;
 520                         case TCPOPT_NOP:        /* Ref: RFC 793 section 3.1 */
 521                                 length--;
 522                                 continue;
 523                         default:
 524                                 opsize=*ptr++;
 525                                 if (opsize < 2) /* "silly options" */
 526                                         return;
 527                                 if (opsize > length)
 528                                         break;  /* don't parse partial options */
 529                                 switch(opcode) {
 530                                 case TCPOPT_MSS:
 531                                         if(opsize==TCPOLEN_MSS && th->syn) {
 532                                                 u16 in_mss = ntohs(*(__u16 *)ptr);
 533                                                 if (in_mss) {
 534                                                         if (tp->user_mss && tp->user_mss < in_mss)
 535                                                                 in_mss = tp->user_mss;
 536                                                         tp->mss_clamp = in_mss;
 537                                                 }
 538                                         }
 539                                         break;
 540                                 case TCPOPT_WINDOW:
 541                                         if(opsize==TCPOLEN_WINDOW && th->syn)
 542                                                 if (!no_fancy && sysctl_tcp_window_scaling) {
 543                                                         tp->wscale_ok = 1;
 544                                                         tp->snd_wscale = *(__u8 *)ptr;
 545                                                         if(tp->snd_wscale > 14) {
 546                                                                 if(net_ratelimit())
 547                                                                         printk("tcp_parse_options: Illegal window "
 548                                                                                "scaling value %d >14 received.",
 549                                                                                tp->snd_wscale);
 550                                                                 tp->snd_wscale = 14;
 551                                                         }
 552                                                 }
 553                                         break;
 554                                 case TCPOPT_TIMESTAMP:
 555                                         if(opsize==TCPOLEN_TIMESTAMP) {
 556                                                 if (sysctl_tcp_timestamps && !no_fancy) {
 557                                                         tp->tstamp_ok = 1;
 558                                                         tp->saw_tstamp = 1;
 559                                                         tp->rcv_tsval = ntohl(*(__u32 *)ptr);
 560                                                         tp->rcv_tsecr = ntohl(*(__u32 *)(ptr+4));
 561                                                 }
 562                                         }
 563                                         break;
 564                                 case TCPOPT_SACK_PERM:
 565                                         if(opsize==TCPOLEN_SACK_PERM && th->syn) {
 566                                                 if (sysctl_tcp_sack && !no_fancy) {
 567                                                         tp->sack_ok = 1;
 568                                                         tp->num_sacks = 0;
 569                                                 }
 570                                         }
 571                                         break;
 572
 573                                 case TCPOPT_SACK:
 574                                         if((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
 575                                            sysctl_tcp_sack && (sk != NULL) && !th->syn) {
 576                                                 int sack_bytes = opsize - TCPOLEN_SACK_BASE;
 577
 578                                                 if(!(sack_bytes % TCPOLEN_SACK_PERBLOCK)) {
 579                                                         int num_sacks = sack_bytes >> 3;
 580                                                         struct tcp_sack_block *sackp;
 581
 582                                                         sackp = (struct tcp_sack_block *)ptr;
 583                                                         tcp_sacktag_write_queue(sk, sackp, num_sacks);
 584                                                 }
 585                                         }
 586                                 };
 587                                 ptr+=opsize-2;
 588                                 length-=opsize;
 589                 };
 590         }
 591 }
 592
 593 /* Fast parse options. This hopes to only see timestamps.
 594  * If it is wrong it falls back on tcp_parse_options().
 595  */
 596 static __inline__ int tcp_fast_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp)
 597 {
 598         /* If we didn't send out any options ignore them all. */
 599         if (tp->tcp_header_len == sizeof(struct tcphdr))
 600                 return 0;
 601         if (th->doff == sizeof(struct tcphdr)>>2) {
 602                 tp->saw_tstamp = 0;
 603                 return 0;
 604         } else if (th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) {
 605                 __u32 *ptr = (__u32 *)(th + 1);
 606                 if (*ptr == __constant_ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
 607                                              | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
 608                         tp->saw_tstamp = 1;
 609                         ++ptr;
 610                         tp->rcv_tsval = ntohl(*ptr);
 611                         ++ptr;
 612                         tp->rcv_tsecr = ntohl(*ptr);
 613                         return 1;
 614                 }
 615         }
 616         tcp_parse_options(sk, th, tp, 0);
 617         return 1;
 618 }
 619
 620 #define FLAG_DATA               0x01 /* Incoming frame contained data.          */
 621 #define FLAG_WIN_UPDATE         0x02 /* Incoming ACK was a window update.       */
 622 #define FLAG_DATA_ACKED         0x04 /* This ACK acknowledged new data.         */
 623 #define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted.  */
 624 #define FLAG_SYN_ACKED          0x10 /* This ACK acknowledged new data.         */
 625
 626 static __inline__ void clear_fast_retransmit(struct tcp_opt *tp)
 627 {
 628         if (tp->dup_acks > 3)
 629                 tp->snd_cwnd = (tp->snd_ssthresh);
 630
 631         tp->dup_acks = 0;
 632 }
 633
 634 /* NOTE: This code assumes that tp->dup_acks gets cleared when a
 635  * retransmit timer fires.
 636  */
 637 static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup)
 638 {
 639         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 640
 641         /* Note: If not_dup is set this implies we got a
 642          * data carrying packet or a window update.
 643          * This carries no new information about possible
 644          * lost packets, so we have to ignore it for the purposes
 645          * of counting duplicate acks. Ideally this does not imply we
 646          * should stop our fast retransmit phase, more acks may come
 647          * later without data to help us. Unfortunately this would make
 648          * the code below much more complex. For now if I see such
 649          * a packet I clear the fast retransmit phase.
 650          */
 651         if (ack == tp->snd_una && tp->packets_out && (not_dup == 0)) {
 652                 /* This is the standard reno style fast retransmit branch. */
 653
 654                 /* 1. When the third duplicate ack is received, set ssthresh
 655                  * to one half the current congestion window, but no less
 656                  * than two segments. Retransmit the missing segment.
 657                  */
 658                 if (tp->high_seq == 0 || after(ack, tp->high_seq)) {
 659                         tp->dup_acks++;
 660                         if ((tp->fackets_out > 3) || (tp->dup_acks == 3)) {
 661                                 tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
 662                                 if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
 663                                         tp->snd_ssthresh = tp->snd_cwnd_clamp;
 664                                 tp->snd_cwnd = (tp->snd_ssthresh + 3);
 665                                 tp->high_seq = tp->snd_nxt;
 666                                 if(!tp->fackets_out)
 667                                         tcp_retransmit_skb(sk,
 668                                                            skb_peek(&sk->write_queue));
 669                                 else
 670                                         tcp_fack_retransmit(sk);
 671                                 tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
 672                         }
 673                 } else if (++tp->dup_acks > 3) {
 674                         /* 2. Each time another duplicate ACK arrives, increment
 675                          * cwnd by the segment size. [...] Transmit a packet...
 676                          *
 677                          * Packet transmission will be done on normal flow processing
 678                          * since we're not in "retransmit mode".  We do not use
 679                          * duplicate ACKs to artificially inflate the congestion
 680                          * window when doing FACK.
 681                          */
 682                         if(!tp->fackets_out) {
 683                                 tp->snd_cwnd++;
 684                         } else {
 685                                 /* Fill any further holes which may have
 686                                  * appeared.
 687                                  *
 688                                  * We may want to change this to run every
 689                                  * further multiple-of-3 dup ack increments,
 690                                  * to be more robust against out-of-order
 691                                  * packet delivery.  -DaveM
 692                                  */
 693                                 tcp_fack_retransmit(sk);
 694                         }
 695                 }
 696         } else if (tp->high_seq != 0) {
 697                 /* In this branch we deal with clearing the Floyd style
 698                  * block on duplicate fast retransmits, and if requested
 699                  * we do Hoe style secondary fast retransmits.
 700                  */
 701                 if (!before(ack, tp->high_seq) || (not_dup & FLAG_DATA) != 0) {
 702                         /* Once we have acked all the packets up to high_seq
 703                          * we are done this fast retransmit phase.
 704                          * Alternatively data arrived. In this case we
 705                          * Have to abort the fast retransmit attempt.
 706                          * Note that we do want to accept a window
 707                          * update since this is expected with Hoe's algorithm.
 708                          */
 709                         clear_fast_retransmit(tp);
 710
 711                         /* After we have cleared up to high_seq we can
 712                          * clear the Floyd style block.
 713                          */
 714                         if (!before(ack, tp->high_seq)) {
 715                                 tp->high_seq = 0;
 716                                 tp->fackets_out = 0;
 717                         }
 718                 } else if (tp->dup_acks >= 3) {
 719                         if (!tp->fackets_out) {
 720                                 /* Hoe Style. We didn't ack the whole
 721                                  * window. Take this as a cue that
 722                                  * another packet was lost and retransmit it.
 723                                  * Don't muck with the congestion window here.
 724                                  * Note that we have to be careful not to
 725                                  * act if this was a window update and it
 726                                  * didn't ack new data, since this does
 727                                  * not indicate a packet left the system.
 728                                  * We can test this by just checking
 729                                  * if ack changed from snd_una, since
 730                                  * the only way to get here without advancing
 731                                  * from snd_una is if this was a window update.
 732                                  */
 733                                 if (ack != tp->snd_una && before(ack, tp->high_seq)) {
 734                                         tcp_retransmit_skb(sk,
 735                                                            skb_peek(&sk->write_queue));
 736                                         tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
 737                                 }
 738                         } else {
 739                                 /* FACK style, fill any remaining holes in
 740                                  * receiver's queue.
 741                                  */
 742                                 tcp_fack_retransmit(sk);
 743                         }
 744                 }
 745         }
 746 }
 747
 748 /* This is Jacobson's slow start and congestion avoidance.
 749  * SIGCOMM '88, p. 328.
 750  */
 751 static __inline__ void tcp_cong_avoid(struct tcp_opt *tp)
 752 {
 753         if (tp->snd_cwnd <= tp->snd_ssthresh) {
 754                 /* In "safe" area, increase. */
 755                 tp->snd_cwnd++;
 756         } else {
 757                 /* In dangerous area, increase slowly.
 758                  * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
 759                  */
 760                 if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
 761                         if (tp->snd_cwnd < tp->snd_cwnd_clamp)
 762                                 tp->snd_cwnd++;
 763                         tp->snd_cwnd_cnt=0;
 764                 } else
 765                         tp->snd_cwnd_cnt++;
 766         }
 767 }
 768
 769 /* Remove acknowledged frames from the retransmission queue. */
 770 static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack,
 771                                __u32 *seq, __u32 *seq_rtt)
 772 {
 773         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 774         struct sk_buff *skb;
 775         __u32 now = tcp_time_stamp;
 776         int acked = 0;
 777
 778         /* If we are retransmitting, and this ACK clears up to
 779          * the retransmit head, or further, then clear our state.
 780          */
 781         if (tp->retrans_head != NULL &&
 782             !before(ack, TCP_SKB_CB(tp->retrans_head)->end_seq))
 783                 tp->retrans_head = NULL;
 784
 785         while((skb=skb_peek(&sk->write_queue)) && (skb != tp->send_head)) {
 786                 struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
 787                 __u8 sacked = scb->sacked;
 788
 789                 /* If our packet is before the ack sequence we can
 790                  * discard it as it's confirmed to have arrived at
 791                  * the other end.
 792                  */
 793                 if (after(scb->end_seq, ack))
 794                         break;
 795
 796                 /* Initial outgoing SYN's get put onto the write_queue
 797                  * just like anything else we transmit.  It is not
 798                  * true data, and if we misinform our callers that
 799                  * this ACK acks real data, we will erroneously exit
 800                  * connection startup slow start one packet too
 801                  * quickly.  This is severely frowned upon behavior.
 802                  */
 803                 if((sacked & TCPCB_SACKED_RETRANS) && tp->retrans_out)
 804                         tp->retrans_out--;
 805                 if(!(scb->flags & TCPCB_FLAG_SYN)) {
 806                         acked |= FLAG_DATA_ACKED;
 807                         if(sacked & TCPCB_SACKED_RETRANS)
 808                                 acked |= FLAG_RETRANS_DATA_ACKED;
 809                         if(tp->fackets_out)
 810                                 tp->fackets_out--;
 811                 } else {
 812                         acked |= FLAG_SYN_ACKED;
 813                         /* This is pure paranoia. */
 814                         tp->retrans_head = NULL;
 815                 }
 816                 tp->packets_out--;
 817                 *seq = scb->seq;
 818                 *seq_rtt = now - scb->when;
 819                 __skb_unlink(skb, skb->list);
 820                 kfree_skb(skb);
 821         }
 822         return acked;
 823 }
 824
 825 static void tcp_ack_probe(struct sock *sk, __u32 ack)
 826 {
 827         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 828
 829         /* Our probe was answered. */
 830         tp->probes_out = 0;
 831
 832         /* Was it a usable window open? */
 833
 834         /* should always be non-null */
 835         if (tp->send_head != NULL &&
 836             !before (ack + tp->snd_wnd, TCP_SKB_CB(tp->send_head)->end_seq)) {
 837                 tp->backoff = 0;
 838                 tp->pending = 0;
 839                 tcp_clear_xmit_timer(sk, TIME_PROBE0);
 840         } else {
 841                 tcp_reset_xmit_timer(sk, TIME_PROBE0,
 842                                      min(tp->rto << tp->backoff, 120*HZ));
 843         }
 844 }
 845
 846 /* Should we open up the congestion window? */
 847 static __inline__ int should_advance_cwnd(struct tcp_opt *tp, int flag)
 848 {
 849         /* Data must have been acked. */
 850         if ((flag & FLAG_DATA_ACKED) == 0)
 851                 return 0;
 852
 853         /* Some of the data acked was retransmitted somehow? */
 854         if ((flag & FLAG_RETRANS_DATA_ACKED) != 0) {
 855                 /* We advance in all cases except during
 856                  * non-FACK fast retransmit/recovery.
 857                  */
 858                 if (tp->fackets_out != 0 ||
 859                     tp->retransmits != 0)
 860                         return 1;
 861
 862                 /* Non-FACK fast retransmit does it's own
 863                  * congestion window management, don't get
 864                  * in the way.
 865                  */
 866                 return 0;
 867         }
 868
 869         /* New non-retransmitted data acked, always advance.  */
 870         return 1;
 871 }
 872
 873 /* Read draft-ietf-tcplw-high-performance before mucking
 874  * with this code. (Superceeds RFC1323)
 875  */
 876 static void tcp_ack_saw_tstamp(struct sock *sk, struct tcp_opt *tp,
 877                                u32 seq, u32 ack, int flag)
 878 {
 879         __u32 seq_rtt;
 880
 881         /* RTTM Rule: A TSecr value received in a segment is used to
 882          * update the averaged RTT measurement only if the segment
 883          * acknowledges some new data, i.e., only if it advances the
 884          * left edge of the send window.
 885          *
 886          * See draft-ietf-tcplw-high-performance-00, section 3.3.
 887          * 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
 888          */
 889         if (!(flag & (FLAG_DATA_ACKED|FLAG_SYN_ACKED)))
 890                 return;
 891
 892         seq_rtt = tcp_time_stamp - tp->rcv_tsecr;
 893         tcp_rtt_estimator(tp, seq_rtt);
 894         if (tp->retransmits) {
 895                 if (tp->packets_out == 0) {
 896                         tp->retransmits = 0;
 897                         tp->fackets_out = 0;
 898                         tp->retrans_out = 0;
 899                         tp->backoff = 0;
 900                         tcp_set_rto(tp);
 901                 } else {
 902                         /* Still retransmitting, use backoff */
 903                         tcp_set_rto(tp);
 904                         tp->rto = tp->rto << tp->backoff;
 905                 }
 906         } else {
 907                 tcp_set_rto(tp);
 908         }
 909
 910         tcp_bound_rto(tp);
 911 }
 912
 913 static __inline__ void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp)
 914 {
 915         struct sk_buff *skb = skb_peek(&sk->write_queue);
 916
 917         /* Some data was ACK'd, if still retransmitting (due to a
 918          * timeout), resend more of the retransmit queue.  The
 919          * congestion window is handled properly by that code.
 920          */
 921         if (tp->retransmits) {
 922                 tcp_xmit_retransmit_queue(sk);
 923                 tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
 924         } else {
 925                 __u32 when = tp->rto - (tcp_time_stamp - TCP_SKB_CB(skb)->when);
 926                 if ((__s32)when < 0)
 927                         when = 1;
 928                 tcp_reset_xmit_timer(sk, TIME_RETRANS, when);
 929         }
 930 }
 931
 932 /* This routine deals with incoming acks, but not outgoing ones. */
 933 static int tcp_ack(struct sock *sk, struct tcphdr *th,
 934                    u32 ack_seq, u32 ack, int len)
 935 {
 936         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 937         int flag = 0;
 938         u32 seq = 0;
 939         u32 seq_rtt = 0;
 940
 941         if(sk->zapped)
 942                 return(1);      /* Dead, can't ack any more so why bother */
 943
 944         if (tp->pending == TIME_KEEPOPEN)
 945                 tp->probes_out = 0;
 946
 947         tp->rcv_tstamp = tcp_time_stamp;
 948
 949         /* If the ack is newer than sent or older than previous acks
 950          * then we can probably ignore it.
 951          */
 952         if (after(ack, tp->snd_nxt) || before(ack, tp->snd_una))
 953                 goto uninteresting_ack;
 954
 955         /* If there is data set flag 1 */
 956         if (len != th->doff*4) {
 957                 flag |= FLAG_DATA;
 958                 tcp_delack_estimator(tp);
 959         }
 960
 961         /* Update our send window. */
 962
 963         /* This is the window update code as per RFC 793
 964          * snd_wl{1,2} are used to prevent unordered
 965          * segments from shrinking the window
 966          */
 967         if (before(tp->snd_wl1, ack_seq) ||
 968             (tp->snd_wl1 == ack_seq && !after(tp->snd_wl2, ack))) {
 969                 u32 nwin = ntohs(th->window) << tp->snd_wscale;
 970
 971                 if ((tp->snd_wl2 != ack) || (nwin > tp->snd_wnd)) {
 972                         flag |= FLAG_WIN_UPDATE;
 973                         tp->snd_wnd = nwin;
 974
 975                         tp->snd_wl1 = ack_seq;
 976                         tp->snd_wl2 = ack;
 977
 978                         if (nwin > tp->max_window)
 979                                 tp->max_window = nwin;
 980                 }
 981         }
 982
 983         /* We passed data and got it acked, remove any soft error
 984          * log. Something worked...
 985          */
 986         sk->err_soft = 0;
 987
 988         /* If this ack opens up a zero window, clear backoff.  It was
 989          * being used to time the probes, and is probably far higher than
 990          * it needs to be for normal retransmission.
 991          */
 992         if (tp->pending == TIME_PROBE0)
 993                 tcp_ack_probe(sk, ack);
 994
 995         /* See if we can take anything off of the retransmit queue. */
 996         flag |= tcp_clean_rtx_queue(sk, ack, &seq, &seq_rtt);
 997
 998         /* We must do this here, before code below clears out important
 999          * state contained in tp->fackets_out and tp->retransmits.  -DaveM
1000          */
1001         if (should_advance_cwnd(tp, flag))
1002                 tcp_cong_avoid(tp);
1003
1004         /* If we have a timestamp, we always do rtt estimates. */
1005         if (tp->saw_tstamp) {
1006                 tcp_ack_saw_tstamp(sk, tp, seq, ack, flag);
1007         } else {
1008                 /* If we were retransmiting don't count rtt estimate. */
1009                 if (tp->retransmits) {
1010                         if (tp->packets_out == 0) {
1011                                 tp->retransmits = 0;
1012                                 tp->fackets_out = 0;
1013                                 tp->retrans_out = 0;
1014                         }
1015                 } else {
1016                         /* We don't have a timestamp. Can only use
1017                          * packets that are not retransmitted to determine
1018                          * rtt estimates. Also, we must not reset the
1019                          * backoff for rto until we get a non-retransmitted
1020                          * packet. This allows us to deal with a situation
1021                          * where the network delay has increased suddenly.
1022                          * I.e. Karn's algorithm. (SIGCOMM '87, p5.)
1023                          */
1024                         if (flag & (FLAG_DATA_ACKED|FLAG_SYN_ACKED)) {
1025                                 if(!(flag & FLAG_RETRANS_DATA_ACKED)) {
1026                                         tp->backoff = 0;
1027                                         tcp_rtt_estimator(tp, seq_rtt);
1028                                         tcp_set_rto(tp);
1029                                         tcp_bound_rto(tp);
1030                                 }
1031                         }
1032                 }
1033         }
1034
1035         if (tp->packets_out) {
1036                 if (flag & FLAG_DATA_ACKED)
1037                         tcp_ack_packets_out(sk, tp);
1038         } else {
1039                 tcp_clear_xmit_timer(sk, TIME_RETRANS);
1040         }
1041
1042         flag &= (FLAG_DATA | FLAG_WIN_UPDATE);
1043         if ((ack == tp->snd_una && tp->packets_out && flag == 0) ||
1044             (tp->high_seq != 0)) {
1045                 tcp_fast_retrans(sk, ack, flag);
1046         } else {
1047                 /* Clear any aborted fast retransmit starts. */
1048                 tp->dup_acks = 0;
1049         }
1050         /* It is not a brain fart, I thought a bit now. 8)
1051          *
1052          * Forward progress is indicated, if:
1053          *   1. the ack acknowledges new data.
1054          *   2. or the ack is duplicate, but it is caused by new segment
1055          *      arrival. This case is filtered by:
1056          *      - it contains no data, syn or fin.
1057          *      - it does not update window.
1058          *   3. or new SACK. It is difficult to check, so that we ignore it.
1059          *
1060          * Forward progress is also indicated by arrival new data,
1061          * which was caused by window open from our side. This case is more
1062          * difficult and it is made (alas, incorrectly) in tcp_data_queue().
1063          *                                              --ANK (990513)
1064          */
1065         if (ack != tp->snd_una || (flag == 0 && !th->fin))
1066                 dst_confirm(sk->dst_cache);
1067
1068         /* Remember the highest ack received. */
1069         tp->snd_una = ack;
1070         return 1;
1071
1072 uninteresting_ack:
1073         SOCK_DEBUG(sk, "Ack ignored %u %u\n", ack, tp->snd_nxt);
1074         return 0;
1075 }
1076
1077 /* New-style handling of TIME_WAIT sockets. */
1078
1079 /* Must be called only from BH context. */
1080 void tcp_timewait_kill(struct tcp_tw_bucket *tw)
1081 {
1082         struct tcp_ehash_bucket *ehead;
1083         struct tcp_bind_hashbucket *bhead;
1084         struct tcp_bind_bucket *tb;
1085
1086         /* Unlink from established hashes. */
1087         ehead = &tcp_ehash[tw->hashent];
1088         write_lock(&ehead->lock);
1089         if (!tw->pprev) {
1090                 write_unlock(&ehead->lock);
1091                 return;
1092         }
1093         if(tw->next)
1094                 tw->next->pprev = tw->pprev;
1095         *(tw->pprev) = tw->next;
1096         tw->pprev = NULL;
1097         write_unlock(&ehead->lock);
1098
1099         /* Disassociate with bind bucket. */
1100         bhead = &tcp_bhash[tcp_bhashfn(tw->num)];
1101         spin_lock(&bhead->lock);
1102         if ((tb = tw->tb) != NULL) {
1103                 if(tw->bind_next)
1104                         tw->bind_next->bind_pprev = tw->bind_pprev;
1105                 *(tw->bind_pprev) = tw->bind_next;
1106                 tw->tb = NULL;
1107                 if (tb->owners == NULL) {
1108                         if (tb->next)
1109                                 tb->next->pprev = tb->pprev;
1110                         *(tb->pprev) = tb->next;
1111                         kmem_cache_free(tcp_bucket_cachep, tb);
1112                 }
1113         }
1114         spin_unlock(&bhead->lock);
1115
1116 #ifdef INET_REFCNT_DEBUG
1117         if (atomic_read(&tw->refcnt) != 1) {
1118                 printk(KERN_DEBUG "tw_bucket %p refcnt=%d\n", tw, atomic_read(&tw->refcnt));
1119         }
1120 #endif
1121         tcp_tw_put(tw);
1122 }
1123
1124 /* We come here as a special case from the AF specific TCP input processing,
1125  * and the SKB has no owner.  Essentially handling this is very simple,
1126  * we just keep silently eating rx'd packets until none show up for the
1127  * entire timeout period.  The only special cases are for BSD TIME_WAIT
1128  * reconnects and SYN/RST bits being set in the TCP header.
1129  */
1130
1131 /*
1132  * * Main purpose of TIME-WAIT state is to close connection gracefully,
1133  *   when one of ends sits in LAST-ACK or CLOSING retransmitting FIN
1134  *   (and, probably, tail of data) and one or more our ACKs are lost.
1135  * * What is TIME-WAIT timeout? It is associated with maximal packet
1136  *   lifetime in the internet, which results in wrong conclusion, that
1137  *   it is set to catch "old duplicate segments" wandering out of their path.
1138  *   It is not quite correct. This timeout is calculated so that it exceeds
1139  *   maximal retransmision timeout enough to allow to lose one (or more)
1140  *   segments sent by peer and our ACKs. This time may be calculated from RTO.
1141  * * When TIME-WAIT socket receives RST, it means that another end
1142  *   finally closed and we are allowed to kill TIME-WAIT too.
1143  * * Second purpose of TIME-WAIT is catching old duplicate segments.
1144  *   Well, certainly it is pure paranoia, but if we load TIME-WAIT
1145  *   with this semantics, we MUST NOT kill TIME-WAIT state with RSTs.
1146  * * If we invented some more clever way to catch duplicates
1147  *   (f.e. based on PAWS), we could truncate TIME-WAIT to several RTOs.
1148  *
1149  * The algorithm below is based on FORMAL INTERPRETATION of RFCs.
1150  * When you compare it to RFCs, please, read section SEGMENT ARRIVES
1151  * from the very beginning.
1152  */
1153 enum tcp_tw_status
1154 tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
1155                            struct tcphdr *th, unsigned len)
1156 {
1157         struct tcp_opt tp;
1158         int paws_reject = 0;
1159
1160         /*      RFC 1122:
1161          *      "When a connection is [...] on TIME-WAIT state [...]
1162          *      [a TCP] MAY accept a new SYN from the remote TCP to
1163          *      reopen the connection directly, if it:
1164          *
1165          *      (1)  assigns its initial sequence number for the new
1166          *      connection to be larger than the largest sequence
1167          *      number it used on the previous connection incarnation,
1168          *      and
1169          *
1170          *      (2)  returns to TIME-WAIT state if the SYN turns out
1171          *      to be an old duplicate".
1172          */
1173
1174         tp.saw_tstamp = 0;
1175         if (th->doff > (sizeof(struct tcphdr)>>2) && tw->ts_recent_stamp) {
1176                 tcp_parse_options(NULL, th, &tp, 0);
1177
1178                 paws_reject = tp.saw_tstamp &&
1179                         ((s32)(tp.rcv_tsval - tw->ts_recent) < 0 &&
1180                          xtime.tv_sec < tw->ts_recent_stamp + PAWS_24DAYS);
1181         }
1182
1183         if (!paws_reject &&
1184             (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq &&
1185              TCP_SKB_CB(skb)->seq == tw->rcv_nxt)) {
1186                 /* In window segment, it may be only reset or bare ack. */
1187
1188                 if (th->rst) {
1189 #ifdef CONFIG_TCP_TW_RECYCLE
1190                         /* When recycling, always follow rfc1337,
1191                          * but mark bucket as ready to recycling immediately.
1192                          */
1193                         if (sysctl_tcp_tw_recycle) {
1194                                 /* May kill it now. */
1195                                 tw->rto = 0;
1196                                 tw->ttd = jiffies;
1197                         } else
1198 #endif
1199                         /* This is TIME_WAIT assasination, in two flavors.
1200                          * Oh well... nobody has a sufficient solution to this
1201                          * protocol bug yet.
1202                          */
1203                         if(sysctl_tcp_rfc1337 == 0) {
1204                                 tcp_tw_deschedule(tw);
1205                                 tcp_timewait_kill(tw);
1206                         }
1207                 } else {
1208                         tcp_tw_reschedule(tw);
1209                 }
1210
1211                 if (tp.saw_tstamp) {
1212                         tw->ts_recent = tp.rcv_tsval;
1213                         tw->ts_recent_stamp = xtime.tv_sec;
1214                 }
1215                 tcp_tw_put(tw);
1216                 return TCP_TW_SUCCESS;
1217         }
1218
1219         /* Out of window segment.
1220
1221            All the segments are ACKed immediately.
1222
1223            The only exception is new SYN. We accept it, if it is
1224            not old duplicate and we are not in danger to be killed
1225            by delayed old duplicates. RFC check is that it has
1226            newer sequence number works at rates <40Mbit/sec.
1227            However, if paws works, it is reliable AND even more,
1228            we even may relax silly seq space cutoff.
1229
1230            RED-PEN: we violate main RFC requirement, if this SYN will appear
1231            old duplicate (i.e. we receive RST in reply to SYN-ACK),
1232            we must return socket to time-wait state. It is not good,
1233            but not fatal yet.
1234          */
1235
1236         if (th->syn && !th->rst && !th->ack && !paws_reject &&
1237             (after(TCP_SKB_CB(skb)->seq, tw->rcv_nxt) ||
1238              (tp.saw_tstamp && tw->ts_recent != tp.rcv_tsval))) {
1239                 u32 isn = tw->snd_nxt + 2;
1240                 if (isn == 0)
1241                         isn++;
1242                 TCP_SKB_CB(skb)->when = isn;
1243                 return TCP_TW_SYN;
1244         }
1245
1246         if(!th->rst) {
1247                 /* In this case we must reset the TIMEWAIT timer.
1248
1249                    If it is ACKless SYN it may be both old duplicate
1250                    and new good SYN with random sequence number <rcv_nxt.
1251                    Do not reschedule in the last case.
1252                  */
1253                 if (paws_reject || th->ack) {
1254                         tcp_tw_reschedule(tw);
1255 #ifdef CONFIG_TCP_TW_RECYCLE
1256                         tw->rto = min(120*HZ, tw->rto<<1);
1257                         tw->ttd = jiffies + tw->rto;
1258 #endif
1259                 }
1260
1261                 /* Send ACK. Note, we do not put the bucket,
1262                  * it will be released by caller.
1263                  */
1264                 return TCP_TW_ACK;
1265         }
1266         tcp_tw_put(tw);
1267         return TCP_TW_SUCCESS;
1268 }
1269
1270 /* Enter the time wait state.  This is always called from BH
1271  * context.  Essentially we whip up a timewait bucket, copy the
1272  * relevant info into it from the SK, and mess with hash chains
1273  * and list linkage.
1274  */
1275 static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw)
1276 {
1277         struct tcp_ehash_bucket *ehead = &tcp_ehash[sk->hashent];
1278         struct tcp_bind_hashbucket *bhead;
1279         struct sock **head, *sktw;
1280
1281         write_lock(&ehead->lock);
1282
1283         /* Step 1: Remove SK from established hash. */
1284         if (sk->pprev) {
1285                 if(sk->next)
1286                         sk->next->pprev = sk->pprev;
1287                 *sk->pprev = sk->next;
1288                 sk->pprev = NULL;
1289         }
1290
1291         /* Step 2: Hash TW into TIMEWAIT half of established hash table. */
1292         head = &(ehead + tcp_ehash_size)->chain;
1293         sktw = (struct sock *)tw;
1294         if((sktw->next = *head) != NULL)
1295                 (*head)->pprev = &sktw->next;
1296         *head = sktw;
1297         sktw->pprev = head;
1298         atomic_inc(&tw->refcnt);
1299
1300         write_unlock(&ehead->lock);
1301
1302         /* Step 3: Put TW into bind hash. Original socket stays there too.
1303            Note, that any socket with sk->num!=0 MUST be bound in binding
1304            cache, even if it is closed.
1305          */
1306         bhead = &tcp_bhash[tcp_bhashfn(sk->num)];
1307         spin_lock(&bhead->lock);
1308         tw->tb = (struct tcp_bind_bucket *)sk->prev;
1309         BUG_TRAP(sk->prev!=NULL);
1310         if ((tw->bind_next = tw->tb->owners) != NULL)
1311                 tw->tb->owners->bind_pprev = &tw->bind_next;
1312         tw->tb->owners = (struct sock*)tw;
1313         tw->bind_pprev = &tw->tb->owners;
1314         spin_unlock(&bhead->lock);
1315
1316         /* Step 4: Un-charge protocol socket in-use count. */
1317         sk->prot->inuse--;
1318 }
1319
1320 /*
1321  * Move a socket to time-wait.
1322  */
1323 void tcp_time_wait(struct sock *sk)
1324 {
1325         struct tcp_tw_bucket *tw;
1326
1327         tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC);
1328         if(tw != NULL) {
1329                 /* Give us an identity. */
1330                 tw->daddr       = sk->daddr;
1331                 tw->rcv_saddr   = sk->rcv_saddr;
1332                 tw->bound_dev_if= sk->bound_dev_if;
1333                 tw->num         = sk->num;
1334                 tw->state       = TCP_TIME_WAIT;
1335                 tw->sport       = sk->sport;
1336                 tw->dport       = sk->dport;
1337                 tw->family      = sk->family;
1338                 tw->reuse       = sk->reuse;
1339                 tw->hashent     = sk->hashent;
1340                 tw->rcv_nxt     = sk->tp_pinfo.af_tcp.rcv_nxt;
1341                 tw->snd_nxt     = sk->tp_pinfo.af_tcp.snd_nxt;
1342                 tw->ts_recent   = sk->tp_pinfo.af_tcp.ts_recent;
1343                 tw->ts_recent_stamp= sk->tp_pinfo.af_tcp.ts_recent_stamp;
1344 #ifdef CONFIG_TCP_TW_RECYCLE
1345                 tw->rto         = sk->tp_pinfo.af_tcp.rto;
1346                 tw->ttd         = jiffies + 2*tw->rto;
1347 #endif
1348                 atomic_set(&tw->refcnt, 0);
1349
1350 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1351                 if(tw->family == PF_INET6) {
1352                         memcpy(&tw->v6_daddr,
1353                                &sk->net_pinfo.af_inet6.daddr,
1354                                sizeof(struct in6_addr));
1355                         memcpy(&tw->v6_rcv_saddr,
1356                                &sk->net_pinfo.af_inet6.rcv_saddr,
1357                                sizeof(struct in6_addr));
1358                 }
1359 #endif
1360                 /* Linkage updates. */
1361                 __tcp_tw_hashdance(sk, tw);
1362
1363                 /* Get the TIME_WAIT timeout firing. */
1364                 tcp_tw_schedule(tw);
1365
1366                 /* CLOSE the SK. */
1367                 if(sk->state == TCP_ESTABLISHED)
1368                         tcp_statistics.TcpCurrEstab--;
1369                 sk->state = TCP_CLOSE;
1370         } else {
1371                 /* Sorry, we're out of memory, just CLOSE this
1372                  * socket up.  We've got bigger problems than
1373                  * non-graceful socket closings.
1374                  */
1375                 tcp_set_state(sk, TCP_CLOSE);
1376         }
1377
1378         tcp_update_metrics(sk);
1379         tcp_clear_xmit_timers(sk);
1380         tcp_done(sk);
1381 }
1382
1383 /*
1384  *      Process the FIN bit. This now behaves as it is supposed to work
1385  *      and the FIN takes effect when it is validly part of sequence
1386  *      space. Not before when we get holes.
1387  *
1388  *      If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
1389  *      (and thence onto LAST-ACK and finally, CLOSE, we never enter
1390  *      TIME-WAIT)
1391  *
1392  *      If we are in FINWAIT-1, a received FIN indicates simultaneous
1393  *      close and we go into CLOSING (and later onto TIME-WAIT)
1394  *
1395  *      If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
1396  */
1397
1398 static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
1399 {
1400         sk->tp_pinfo.af_tcp.fin_seq = TCP_SKB_CB(skb)->end_seq;
1401
1402         tcp_send_ack(sk);
1403
1404         if (!sk->dead) {
1405                 wake_up_interruptible(sk->sleep);
1406                 sock_wake_async(sk->socket, 1, POLL_HUP);
1407         }
1408
1409         switch(sk->state) {
1410                 case TCP_SYN_RECV:
1411                 case TCP_ESTABLISHED:
1412                         /* Move to CLOSE_WAIT */
1413                         tcp_set_state(sk, TCP_CLOSE_WAIT);
1414                         break;
1415
1416                 case TCP_CLOSE_WAIT:
1417                 case TCP_CLOSING:
1418                         /* Received a retransmission of the FIN, do
1419                          * nothing.
1420                          */
1421                         break;
1422                 case TCP_LAST_ACK:
1423                         /* RFC793: Remain in the LAST-ACK state. */
1424                         break;
1425
1426                 case TCP_FIN_WAIT1:
1427                         /* This case occurs when a simultaneous close
1428                          * happens, we must ack the received FIN and
1429                          * enter the CLOSING state.
1430                          */
1431                         tcp_set_state(sk, TCP_CLOSING);
1432                         break;
1433                 case TCP_FIN_WAIT2:
1434                         /* Received a FIN -- send ACK and enter TIME_WAIT. */
1435                         tcp_time_wait(sk);
1436                         break;
1437                 default:
1438                         /* Only TCP_LISTEN and TCP_CLOSE are left, in these
1439                          * cases we should never reach this piece of code.
1440                          */
1441                         printk("tcp_fin: Impossible, sk->state=%d\n", sk->state);
1442                         break;
1443         };
1444 }
1445
1446 /* These routines update the SACK block as out-of-order packets arrive or
1447  * in-order packets close up the sequence space.
1448  */
1449 static void tcp_sack_maybe_coalesce(struct tcp_opt *tp, struct tcp_sack_block *sp)
1450 {
1451         int this_sack, num_sacks = tp->num_sacks;
1452         struct tcp_sack_block *swalk = &tp->selective_acks[0];
1453
1454         /* If more than one SACK block, see if the recent change to SP eats into
1455          * or hits the sequence space of other SACK blocks, if so coalesce.
1456          */
1457         if(num_sacks != 1) {
1458                 for(this_sack = 0; this_sack < num_sacks; this_sack++, swalk++) {
1459                         if(swalk == sp)
1460                                 continue;
1461
1462                         /* First case, bottom of SP moves into top of the
1463                          * sequence space of SWALK.
1464                          */
1465                         if(between(sp->start_seq, swalk->start_seq, swalk->end_seq)) {
1466                                 sp->start_seq = swalk->start_seq;
1467                                 goto coalesce;
1468                         }
1469                         /* Second case, top of SP moves into bottom of the
1470                          * sequence space of SWALK.
1471                          */
1472                         if(between(sp->end_seq, swalk->start_seq, swalk->end_seq)) {
1473                                 sp->end_seq = swalk->end_seq;
1474                                 goto coalesce;
1475                         }
1476                 }
1477         }
1478         /* SP is the only SACK, or no coalescing cases found. */
1479         return;
1480
1481 coalesce:
1482         /* Zap SWALK, by moving every further SACK up by one slot.
1483          * Decrease num_sacks.
1484          */
1485         for(; this_sack < num_sacks-1; this_sack++, swalk++) {
1486                 struct tcp_sack_block *next = (swalk + 1);
1487                 swalk->start_seq = next->start_seq;
1488                 swalk->end_seq = next->end_seq;
1489         }
1490         tp->num_sacks--;
1491 }
1492
1493 static __inline__ void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2)
1494 {
1495         __u32 tmp;
1496
1497         tmp = sack1->start_seq;
1498         sack1->start_seq = sack2->start_seq;
1499         sack2->start_seq = tmp;
1500
1501         tmp = sack1->end_seq;
1502         sack1->end_seq = sack2->end_seq;
1503         sack2->end_seq = tmp;
1504 }
1505
1506 static void tcp_sack_new_ofo_skb(struct sock *sk, struct sk_buff *skb)
1507 {
1508         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1509         struct tcp_sack_block *sp = &tp->selective_acks[0];
1510         int cur_sacks = tp->num_sacks;
1511
1512         if (!cur_sacks)
1513                 goto new_sack;
1514
1515         /* Optimize for the common case, new ofo frames arrive
1516          * "in order". ;-)  This also satisfies the requirements
1517          * of RFC2018 about ordering of SACKs.
1518          */
1519         if(sp->end_seq == TCP_SKB_CB(skb)->seq) {
1520                 sp->end_seq = TCP_SKB_CB(skb)->end_seq;
1521                 tcp_sack_maybe_coalesce(tp, sp);
1522         } else if(sp->start_seq == TCP_SKB_CB(skb)->end_seq) {
1523                 /* Re-ordered arrival, in this case, can be optimized
1524                  * as well.
1525                  */
1526                 sp->start_seq = TCP_SKB_CB(skb)->seq;
1527                 tcp_sack_maybe_coalesce(tp, sp);
1528         } else {
1529                 struct tcp_sack_block *swap = sp + 1;
1530                 int this_sack, max_sacks = (tp->tstamp_ok ? 3 : 4);
1531
1532                 /* Oh well, we have to move things around.
1533                  * Try to find a SACK we can tack this onto.
1534                  */
1535
1536                 for(this_sack = 1; this_sack < cur_sacks; this_sack++, swap++) {
1537                         if((swap->end_seq == TCP_SKB_CB(skb)->seq) ||
1538                            (swap->start_seq == TCP_SKB_CB(skb)->end_seq)) {
1539                                 if(swap->end_seq == TCP_SKB_CB(skb)->seq)
1540                                         swap->end_seq = TCP_SKB_CB(skb)->end_seq;
1541                                 else
1542                                         swap->start_seq = TCP_SKB_CB(skb)->seq;
1543                                 tcp_sack_swap(sp, swap);
1544                                 tcp_sack_maybe_coalesce(tp, sp);
1545                                 return;
1546                         }
1547                 }
1548
1549                 /* Could not find an adjacent existing SACK, build a new one,
1550                  * put it at the front, and shift everyone else down.  We
1551                  * always know there is at least one SACK present already here.
1552                  *
1553                  * If the sack array is full, forget about the last one.
1554                  */
1555                 if (cur_sacks >= max_sacks) {
1556                         cur_sacks--;
1557                         tp->num_sacks--;
1558                 }
1559                 while(cur_sacks >= 1) {
1560                         struct tcp_sack_block *this = &tp->selective_acks[cur_sacks];
1561                         struct tcp_sack_block *prev = (this - 1);
1562                         this->start_seq = prev->start_seq;
1563                         this->end_seq = prev->end_seq;
1564                         cur_sacks--;
1565                 }
1566
1567         new_sack:
1568                 /* Build the new head SACK, and we're done. */
1569                 sp->start_seq = TCP_SKB_CB(skb)->seq;
1570                 sp->end_seq = TCP_SKB_CB(skb)->end_seq;
1571                 tp->num_sacks++;
1572         }
1573 }
1574
1575 static void tcp_sack_remove_skb(struct tcp_opt *tp, struct sk_buff *skb)
1576 {
1577         struct tcp_sack_block *sp = &tp->selective_acks[0];
1578         int num_sacks = tp->num_sacks;
1579         int this_sack;
1580
1581         /* This is an in order data segment _or_ an out-of-order SKB being
1582          * moved to the receive queue, so we know this removed SKB will eat
1583          * from the front of a SACK.
1584          */
1585         for(this_sack = 0; this_sack < num_sacks; this_sack++, sp++) {
1586                 /* Check if the start of the sack is covered by skb. */
1587                 if(!before(sp->start_seq, TCP_SKB_CB(skb)->seq) &&
1588                    before(sp->start_seq, TCP_SKB_CB(skb)->end_seq))
1589                         break;
1590         }
1591
1592         /* This should only happen if so many SACKs get built that some get
1593          * pushed out before we get here, or we eat some in sequence packets
1594          * which are before the first SACK block.
1595          */
1596         if(this_sack >= num_sacks)
1597                 return;
1598
1599         sp->start_seq = TCP_SKB_CB(skb)->end_seq;
1600         if(!before(sp->start_seq, sp->end_seq)) {
1601                 /* Zap this SACK, by moving forward any other SACKS. */
1602                 for(this_sack += 1; this_sack < num_sacks; this_sack++, sp++) {
1603                         struct tcp_sack_block *next = (sp + 1);
1604                         sp->start_seq = next->start_seq;
1605                         sp->end_seq = next->end_seq;
1606                 }
1607                 tp->num_sacks--;
1608         }
1609 }
1610
1611 static void tcp_sack_extend(struct tcp_opt *tp, struct sk_buff *old_skb, struct sk_buff *new_skb)
1612 {
1613         struct tcp_sack_block *sp = &tp->selective_acks[0];
1614         int num_sacks = tp->num_sacks;
1615         int this_sack;
1616
1617         for(this_sack = 0; this_sack < num_sacks; this_sack++, sp++) {
1618                 if(sp->end_seq == TCP_SKB_CB(old_skb)->end_seq)
1619                         break;
1620         }
1621         if(this_sack >= num_sacks)
1622                 return;
1623         sp->end_seq = TCP_SKB_CB(new_skb)->end_seq;
1624 }
1625
1626 /* This one checks to see if we can put data from the
1627  * out_of_order queue into the receive_queue.
1628  */
1629 static void tcp_ofo_queue(struct sock *sk)
1630 {
1631         struct sk_buff *skb;
1632         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1633
1634         while ((skb = skb_peek(&tp->out_of_order_queue))) {
1635                 if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
1636                         break;
1637
1638                 if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
1639                         SOCK_DEBUG(sk, "ofo packet was already received \n");
1640                         __skb_unlink(skb, skb->list);
1641                         kfree_skb(skb);
1642                         continue;
1643                 }
1644                 SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
1645                            tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
1646                            TCP_SKB_CB(skb)->end_seq);
1647
1648                 if(tp->sack_ok)
1649                         tcp_sack_remove_skb(tp, skb);
1650                 __skb_unlink(skb, skb->list);
1651                 __skb_queue_tail(&sk->receive_queue, skb);
1652                 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1653                 if(skb->h.th->fin)
1654                         tcp_fin(skb, sk, skb->h.th);
1655         }
1656 }
1657
1658 static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
1659 {
1660         struct sk_buff *skb1;
1661         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1662
1663         /*  Queue data for delivery to the user.
1664          *  Packets in sequence go to the receive queue.
1665          *  Out of sequence packets to the out_of_order_queue.
1666          */
1667         if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
1668                 /* Ok. In sequence. */
1669         queue_and_out:
1670                 dst_confirm(sk->dst_cache);
1671                 __skb_queue_tail(&sk->receive_queue, skb);
1672                 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1673                 if(skb->h.th->fin) {
1674                         tcp_fin(skb, sk, skb->h.th);
1675                 } else {
1676                         tcp_remember_ack(tp, skb->h.th, skb);
1677                 }
1678                 /* This may have eaten into a SACK block. */
1679                 if(tp->sack_ok && tp->num_sacks)
1680                         tcp_sack_remove_skb(tp, skb);
1681                 tcp_ofo_queue(sk);
1682
1683                 /* Turn on fast path. */
1684                 if (skb_queue_len(&tp->out_of_order_queue) == 0)
1685                         tp->pred_flags = htonl(((tp->tcp_header_len >> 2) << 28) |
1686                                                ntohl(TCP_FLAG_ACK) |
1687                                                tp->snd_wnd);
1688                 return;
1689         }
1690
1691         /* An old packet, either a retransmit or some packet got lost. */
1692         if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
1693                 /* A retransmit, 2nd most common case.  Force an imediate ack. */
1694                 SOCK_DEBUG(sk, "retransmit received: seq %X\n", TCP_SKB_CB(skb)->seq);
1695                 tcp_enter_quickack_mode(tp);
1696                 kfree_skb(skb);
1697                 return;
1698         }
1699
1700         if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
1701                 /* Partial packet, seq < rcv_next < end_seq */
1702                 SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
1703                            tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
1704                            TCP_SKB_CB(skb)->end_seq);
1705
1706                 goto queue_and_out;
1707         }
1708
1709         /* Ok. This is an out_of_order segment, force an ack. */
1710         tp->delayed_acks++;
1711         tcp_enter_quickack_mode(tp);
1712
1713         /* Disable header prediction. */
1714         tp->pred_flags = 0;
1715
1716         SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
1717                    tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
1718
1719         if (skb_peek(&tp->out_of_order_queue) == NULL) {
1720                 /* Initial out of order segment, build 1 SACK. */
1721                 if(tp->sack_ok) {
1722                         tp->num_sacks = 1;
1723                         tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
1724                         tp->selective_acks[0].end_seq = TCP_SKB_CB(skb)->end_seq;
1725                 }
1726                 __skb_queue_head(&tp->out_of_order_queue,skb);
1727         } else {
1728                 for(skb1=tp->out_of_order_queue.prev; ; skb1 = skb1->prev) {
1729                         /* Already there. */
1730                         if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb1)->seq) {
1731                                 if (skb->len >= skb1->len) {
1732                                         if(tp->sack_ok)
1733                                                 tcp_sack_extend(tp, skb1, skb);
1734                                         __skb_append(skb1, skb);
1735                                         __skb_unlink(skb1, skb1->list);
1736                                         kfree_skb(skb1);
1737                                 } else {
1738                                         /* A duplicate, smaller than what is in the
1739                                          * out-of-order queue right now, toss it.
1740                                          */
1741                                         kfree_skb(skb);
1742                                 }
1743                                 break;
1744                         }
1745
1746                         if (after(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) {
1747                                 __skb_append(skb1, skb);
1748                                 if(tp->sack_ok)
1749                                         tcp_sack_new_ofo_skb(sk, skb);
1750                                 break;
1751                         }
1752
1753                         /* See if we've hit the start. If so insert. */
1754                         if (skb1 == skb_peek(&tp->out_of_order_queue)) {
1755                                 __skb_queue_head(&tp->out_of_order_queue,skb);
1756                                 if(tp->sack_ok)
1757                                         tcp_sack_new_ofo_skb(sk, skb);
1758                                 break;
1759                         }
1760                 }
1761         }
1762 }
1763
1764
1765 /*
1766  *      This routine handles the data.  If there is room in the buffer,
1767  *      it will be have already been moved into it.  If there is no
1768  *      room, then we will just have to discard the packet.
1769  */
1770
1771 static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len)
1772 {
1773         struct tcphdr *th;
1774         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1775
1776         th = skb->h.th;
1777         skb_pull(skb, th->doff*4);
1778         skb_trim(skb, len - (th->doff*4));
1779
1780         if (skb->len == 0 && !th->fin)
1781                 return(0);
1782
1783         /*
1784          *      If our receive queue has grown past its limits shrink it.
1785          *      Make sure to do this before moving snd_nxt, otherwise
1786          *      data might be acked for that we don't have enough room.
1787          */
1788         if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) {
1789                 if (prune_queue(sk) < 0) {
1790                         /* Still not enough room. That can happen when
1791                          * skb->true_size differs significantly from skb->len.
1792                          */
1793                         return 0;
1794                 }
1795         }
1796
1797         tcp_data_queue(sk, skb);
1798
1799         if (before(tp->rcv_nxt, tp->copied_seq)) {
1800                 printk(KERN_DEBUG "*** tcp.c:tcp_data bug acked < copied\n");
1801                 tp->rcv_nxt = tp->copied_seq;
1802         }
1803
1804         /* Above, tcp_data_queue() increments delayed_acks appropriately.
1805          * Now tell the user we may have some data.
1806          */
1807         if (!sk->dead) {
1808                 wake_up_interruptible(sk->sleep);
1809                 sock_wake_async(sk->socket,1, POLL_IN);
1810         }
1811         return(1);
1812 }
1813
1814 static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb)
1815 {
1816         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1817
1818         if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) &&
1819             tcp_packets_in_flight(tp) < tp->snd_cwnd) {
1820                 /* Put more data onto the wire. */
1821                 tcp_write_xmit(sk);
1822         } else if (tp->packets_out == 0 && !tp->pending) {
1823                 /* Start probing the receivers window. */
1824                 tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto);
1825         }
1826 }
1827
1828 static __inline__ void tcp_data_snd_check(struct sock *sk)
1829 {
1830         struct sk_buff *skb = sk->tp_pinfo.af_tcp.send_head;
1831
1832         if (skb != NULL)
1833                 __tcp_data_snd_check(sk, skb);
1834 }
1835
1836 /*
1837  * Adapt the MSS value used to make delayed ack decision to the
1838  * real world.
1839  *
1840  * The constant 536 hasn't any good meaning.  In IPv4 world
1841  * MTU may be smaller, though it contradicts to RFC1122, which
1842  * states that MSS must be at least 536.
1843  * We use the constant to do not ACK each second
1844  * packet in a stream of tiny size packets.
1845  * It means that super-low mtu links will be aggressively delacked.
1846  * Seems, it is even good. If they have so low mtu, they are weirdly
1847  * slow.
1848  *
1849  * AK: BTW it may be useful to add an option to lock the rcv_mss.
1850  *     this way the beowulf people wouldn't need ugly patches to get the
1851  *     ack frequencies they want and it would be an elegant way to tune delack.
1852  */
1853 static __inline__ void tcp_measure_rcv_mss(struct sock *sk, struct sk_buff *skb)
1854 {
1855         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1856         unsigned int len, lss;
1857
1858         lss = tp->last_seg_size;
1859         tp->last_seg_size = 0;
1860
1861         /* skb->len may jitter because of SACKs, even if peer
1862          * sends good full-sized frames.
1863          */
1864         len = skb->len;
1865         if (len >= tp->rcv_mss) {
1866                 tp->rcv_mss = len;
1867         } else {
1868                 /* Otherwise, we make more careful check taking into account,
1869                  * that SACKs block is variable.
1870                  *
1871                  * "len" is invariant segment length, including TCP header.
1872                  */
1873                 len = skb->tail - skb->h.raw;
1874                 if (len >= 536 + sizeof(struct tcphdr)) {
1875                         /* Subtract also invariant (if peer is RFC compliant),
1876                          * tcp header plus fixed timestamp option length.
1877                          * Resulting "len" is MSS free of SACK jitter.
1878                          */
1879                         len -= tp->tcp_header_len;
1880                         if (len == lss)
1881                                 tp->rcv_mss = len;
1882                         tp->last_seg_size = len;
1883                 }
1884         }
1885 }
1886
1887 /*
1888  * Check if sending an ack is needed.
1889  */
1890 static __inline__ void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
1891 {
1892         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1893
1894         /* This also takes care of updating the window.
1895          * This if statement needs to be simplified.
1896          *
1897          * Rules for delaying an ack:
1898          *      - delay time <= 0.5 HZ
1899          *      - we don't have a window update to send
1900          *      - must send at least every 2 full sized packets
1901          *      - must send an ACK if we have any out of order data
1902          *
1903          * With an extra heuristic to handle loss of packet
1904          * situations and also helping the sender leave slow
1905          * start in an expediant manner.
1906          */
1907
1908             /* Two full frames received or... */
1909         if (((tp->rcv_nxt - tp->rcv_wup) >= tp->rcv_mss * MAX_DELAY_ACK) ||
1910             /* We will update the window "significantly" or... */
1911             tcp_raise_window(sk) ||
1912             /* We entered "quick ACK" mode or... */
1913             tcp_in_quickack_mode(tp) ||
1914             /* We have out of order data */
1915             (ofo_possible && (skb_peek(&tp->out_of_order_queue) != NULL))) {
1916                 /* Then ack it now */
1917                 tcp_send_ack(sk);
1918         } else {
1919                 /* Else, send delayed ack. */
1920                 tcp_send_delayed_ack(sk, HZ/2);
1921         }
1922 }
1923
1924 static __inline__ void tcp_ack_snd_check(struct sock *sk)
1925 {
1926         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1927         if (tp->delayed_acks == 0) {
1928                 /* We sent a data segment already. */
1929                 return;
1930         }
1931         __tcp_ack_snd_check(sk, 1);
1932 }
1933
1934
1935 /*
1936  *      This routine is only called when we have urgent data
1937  *      signalled. Its the 'slow' part of tcp_urg. It could be
1938  *      moved inline now as tcp_urg is only called from one
1939  *      place. We handle URGent data wrong. We have to - as
1940  *      BSD still doesn't use the correction from RFC961.
1941  *      For 1003.1g we should support a new option TCP_STDURG to permit
1942  *      either form (or just set the sysctl tcp_stdurg).
1943  */
1944
1945 static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
1946 {
1947         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1948         u32 ptr = ntohs(th->urg_ptr);
1949
1950         if (ptr && !sysctl_tcp_stdurg)
1951                 ptr--;
1952         ptr += ntohl(th->seq);
1953
1954         /* Ignore urgent data that we've already seen and read. */
1955         if (after(tp->copied_seq, ptr))
1956                 return;
1957
1958         /* Do we already have a newer (or duplicate) urgent pointer? */
1959         if (tp->urg_data && !after(ptr, tp->urg_seq))
1960                 return;
1961
1962         /* Tell the world about our new urgent pointer. */
1963         if (sk->proc != 0) {
1964                 if (sk->proc > 0)
1965                         kill_proc(sk->proc, SIGURG, 1);
1966                 else
1967                         kill_pg(-sk->proc, SIGURG, 1);
1968                 sock_wake_async(sk->socket, 3, POLL_PRI);
1969         }
1970
1971         /* We may be adding urgent data when the last byte read was
1972          * urgent. To do this requires some care. We cannot just ignore
1973          * tp->copied_seq since we would read the last urgent byte again
1974          * as data, nor can we alter copied_seq until this data arrives
1975          * or we break the sematics of SIOCATMARK (and thus sockatmark())
1976          */
1977         if (tp->urg_seq == tp->copied_seq)
1978                 tp->copied_seq++;       /* Move the copied sequence on correctly */
1979         tp->urg_data = URG_NOTYET;
1980         tp->urg_seq = ptr;
1981
1982         /* Disable header prediction. */
1983         tp->pred_flags = 0;
1984 }
1985
1986 /* This is the 'fast' part of urgent handling. */
1987 static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len)
1988 {
1989         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1990
1991         /* Check if we get a new urgent pointer - normally not. */
1992         if (th->urg)
1993                 tcp_check_urg(sk,th);
1994
1995         /* Do we wait for any urgent data? - normally not... */
1996         if (tp->urg_data == URG_NOTYET) {
1997                 u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff*4);
1998
1999                 /* Is the urgent pointer pointing into this packet? */
2000                 if (ptr < len) {
2001                         tp->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
2002                         if (!sk->dead)
2003                                 sk->data_ready(sk,0);
2004                 }
2005         }
2006 }
2007
2008 /* Clean the out_of_order queue if we can, trying to get
2009  * the socket within its memory limits again.
2010  *
2011  * Return less than zero if we should start dropping frames
2012  * until the socket owning process reads some of the data
2013  * to stabilize the situation.
2014  */
2015 static int prune_queue(struct sock *sk)
2016 {
2017         struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
2018         struct sk_buff * skb;
2019
2020         SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq);
2021
2022         net_statistics.PruneCalled++;
2023
2024         /* First, purge the out_of_order queue. */
2025         skb = __skb_dequeue_tail(&tp->out_of_order_queue);
2026         if(skb != NULL) {
2027                 /* Free it all. */
2028                 do {    net_statistics.OfoPruned += skb->len;
2029                         kfree_skb(skb);
2030                         skb = __skb_dequeue_tail(&tp->out_of_order_queue);
2031                 } while(skb != NULL);
2032
2033                 /* Reset SACK state.  A conforming SACK implementation will
2034                  * do the same at a timeout based retransmit.  When a connection
2035                  * is in a sad state like this, we care only about integrity
2036                  * of the connection not performance.
2037                  */
2038                 if(tp->sack_ok)
2039                         tp->num_sacks = 0;
2040         }
2041
2042         /* If we are really being abused, tell the caller to silently
2043          * drop receive data on the floor.  It will get retransmitted
2044          * and hopefully then we'll have sufficient space.
2045          *
2046          * We used to try to purge the in-order packets too, but that
2047          * turns out to be deadly and fraught with races.  Consider:
2048          *
2049          * 1) If we acked the data, we absolutely cannot drop the
2050          *    packet.  This data would then never be retransmitted.
2051          * 2) It is possible, with a proper sequence of events involving
2052          *    delayed acks and backlog queue handling, to have the user
2053          *    read the data before it gets acked.  The previous code
2054          *    here got this wrong, and it lead to data corruption.
2055          * 3) Too much state changes happen when the FIN arrives, so once
2056          *    we've seen that we can't remove any in-order data safely.
2057          *
2058          * The net result is that removing in-order receive data is too
2059          * complex for anyones sanity.  So we don't do it anymore.  But
2060          * if we are really having our buffer space abused we stop accepting
2061          * new receive data.
2062          *
2063          * FIXME: it should recompute SACK state and only remove enough
2064          *        buffers to get into bounds again. The current scheme loses
2065      *        badly sometimes on links with large RTT, especially when
2066      *        the driver has high overhead per skb.
2067      *        (increasing the rcvbuf is not enough because it inflates the
2068      *         the window too, disabling flow control effectively) -AK
2069          */
2070         if(atomic_read(&sk->rmem_alloc) < (sk->rcvbuf << 1))
2071                 return 0;
2072
2073         /* Massive buffer overcommit. */
2074         return -1;
2075 }
2076
2077 /*
2078  *      TCP receive function for the ESTABLISHED state.
2079  *
2080  *      It is split into a fast path and a slow path. The fast path is
2081  *      disabled when:
2082  *      - A zero window was announced from us - zero window probing
2083  *        is only handled properly in the slow path.
2084  *  - Out of order segments arrived.
2085  *      - Urgent data is expected.
2086  *      - There is no buffer space left
2087  *      - Unexpected TCP flags/window values/header lengths are received
2088  *        (detected by checking the TCP header against pred_flags)
2089  *      - Data is sent in both directions. Fast path only supports pure senders
2090  *        or pure receivers (this means either the sequence number or the ack
2091  *        value must stay constant)
2092  *  - Unexpected TCP option.
2093  *
2094  *      When these conditions are not satisfied it drops into a standard
2095  *      receive procedure patterned after RFC793 to handle all cases.
2096  *      The first three cases are guaranteed by proper pred_flags setting,
2097  *      the rest is checked inline. Fast processing is turned on in
2098  *      tcp_data_queue when everything is OK.
2099  */
2100 int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
2101                         struct tcphdr *th, unsigned len)
2102 {
2103         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2104
2105         /*
2106          *      Header prediction.
2107          *      The code losely follows the one in the famous
2108          *      "30 instruction TCP receive" Van Jacobson mail.
2109          *
2110          *      Van's trick is to deposit buffers into socket queue
2111          *      on a device interrupt, to call tcp_recv function
2112          *      on the receive process context and checksum and copy
2113          *      the buffer to user space. smart...
2114          *
2115          *      Our current scheme is not silly either but we take the
2116          *      extra cost of the net_bh soft interrupt processing...
2117          *      We do checksum and copy also but from device to kernel.
2118          */
2119
2120
2121         /* RED-PEN. Using static variables to pass function arguments
2122          * cannot be good idea...
2123          */
2124         tp->saw_tstamp = 0;
2125
2126         /*      pred_flags is 0xS?10 << 16 + snd_wnd
2127          *      if header_predition is to be made
2128          *      'S' will always be tp->tcp_header_len >> 2
2129          *      '?' will be 0 for the fast path, otherwise pred_flags is 0 to
2130          *  turn it off (when there are holes in the receive
2131          *       space for instance)
2132          *      PSH flag is ignored.
2133          */
2134
2135         if ((tcp_flag_word(th) & ~(TCP_RESERVED_BITS|TCP_FLAG_PSH)) == tp->pred_flags &&
2136                 TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
2137                 int tcp_header_len = th->doff*4;
2138
2139                 /* Timestamp header prediction */
2140
2141                 /* Non-standard header f.e. SACKs -> slow path */
2142                 if (tcp_header_len != tp->tcp_header_len)
2143                         goto slow_path;
2144
2145                 /* Check timestamp */
2146                 if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
2147                         __u32 *ptr = (__u32 *)(th + 1);
2148
2149                         /* No? Slow path! */
2150                         if (*ptr != __constant_ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
2151                                                      | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP))
2152                                 goto slow_path;
2153
2154                         tp->saw_tstamp = 1;
2155                         ++ptr;
2156                         tp->rcv_tsval = ntohl(*ptr);
2157                         ++ptr;
2158                         tp->rcv_tsecr = ntohl(*ptr);
2159
2160                         /* If PAWS failed, check it more carefully in slow path */
2161                         if ((s32)(tp->rcv_tsval - tp->ts_recent) < 0)
2162                                 goto slow_path;
2163
2164                         /* Predicted packet is in window by definition.
2165                            seq == rcv_nxt and last_ack_sent <= rcv_nxt.
2166                            Hence, check seq<=last_ack_sent reduces to:
2167                          */
2168                         if (tp->rcv_nxt == tp->last_ack_sent) {
2169                                 tp->ts_recent = tp->rcv_tsval;
2170                                 tp->ts_recent_stamp = xtime.tv_sec;
2171                         }
2172                 }
2173
2174                 if (len <= tcp_header_len) {
2175                         /* Bulk data transfer: sender */
2176                         if (len == tcp_header_len) {
2177                                 tcp_ack(sk, th, TCP_SKB_CB(skb)->seq,
2178                                         TCP_SKB_CB(skb)->ack_seq, len);
2179                                 kfree_skb(skb);
2180                                 tcp_data_snd_check(sk);
2181                                 return 0;
2182                         } else { /* Header too small */
2183                                 tcp_statistics.TcpInErrs++;
2184                                 goto discard;
2185                         }
2186                 } else if (TCP_SKB_CB(skb)->ack_seq == tp->snd_una &&
2187                            atomic_read(&sk->rmem_alloc) <= sk->rcvbuf) {
2188                         /* Bulk data transfer: receiver */
2189                         __skb_pull(skb,tcp_header_len);
2190
2191                         /* Is it possible to simplify this? */
2192                         tcp_measure_rcv_mss(sk, skb);
2193
2194                         /* DO NOT notify forward progress here.
2195                          * It saves dozen of CPU instructions in fast path. --ANK
2196                          * And where is it signaled then ? -AK
2197                          */
2198                         __skb_queue_tail(&sk->receive_queue, skb);
2199                         tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
2200
2201                         /* FIN bit check is not done since if FIN is set in
2202                          * this frame, the pred_flags won't match up. -DaveM
2203                          */
2204                         wake_up_interruptible(sk->sleep);
2205                         sock_wake_async(sk->socket,1, POLL_IN);
2206                         tcp_delack_estimator(tp);
2207
2208                         tcp_remember_ack(tp, th, skb);
2209
2210                         __tcp_ack_snd_check(sk, 0);
2211                         return 0;
2212                 }
2213                 /* Packet is in sequence, flags are trivial;
2214                  * only ACK is strange or we are tough on memory.
2215                  * Jump to step 5.
2216                  */
2217                 goto step5;
2218         }
2219
2220 slow_path:
2221         /*
2222          * RFC1323: H1. Apply PAWS check first.
2223          */
2224         if (tcp_fast_parse_options(sk, th, tp) && tp->saw_tstamp &&
2225             tcp_paws_discard(tp, skb)) {
2226                 if (!th->rst) {
2227                         tcp_send_ack(sk);
2228                         goto discard;
2229                 }
2230                 /* Resets are accepted even if PAWS failed.
2231
2232                    ts_recent update must be made after we are sure
2233                    that the packet is in window.
2234                  */
2235         }
2236
2237         /*
2238          *      Standard slow path.
2239          */
2240
2241         if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
2242                 /* RFC793, page 37: "In all states except SYN-SENT, all reset
2243                  * (RST) segments are validated by checking their SEQ-fields."
2244                  * And page 69: "If an incoming segment is not acceptable,
2245                  * an acknowledgment should be sent in reply (unless the RST bit
2246                  * is set, if so drop the segment and return)".
2247                  */
2248                 if (th->rst)
2249                         goto discard;
2250                 if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
2251                         SOCK_DEBUG(sk, "seq:%d end:%d wup:%d wnd:%d\n",
2252                                    TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
2253                                    tp->rcv_wup, tp->rcv_wnd);
2254                 }
2255                 tcp_send_ack(sk);
2256                 goto discard;
2257         }
2258
2259         if(th->rst) {
2260                 tcp_reset(sk);
2261                 goto discard;
2262         }
2263
2264         if (tp->saw_tstamp) {
2265                 tcp_replace_ts_recent(sk, tp,
2266                                       TCP_SKB_CB(skb)->seq);
2267         }
2268
2269         if(th->syn && TCP_SKB_CB(skb)->seq != tp->syn_seq) {
2270                 SOCK_DEBUG(sk, "syn in established state\n");
2271                 tcp_statistics.TcpInErrs++;
2272                 tcp_reset(sk);
2273                 return 1;
2274         }
2275
2276 step5:
2277         if(th->ack)
2278                 tcp_ack(sk, th, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->ack_seq, len);
2279
2280         /* Process urgent data. */
2281         tcp_urg(sk, th, len);
2282
2283         {
2284         /* step 7: process the segment text */
2285         int queued = tcp_data(skb, sk, len);
2286
2287         tcp_measure_rcv_mss(sk, skb);
2288
2289         /* Be careful, tcp_data() may have put this into TIME_WAIT. */
2290         if(sk->state != TCP_CLOSE) {
2291                 tcp_data_snd_check(sk);
2292                 tcp_ack_snd_check(sk);
2293         }
2294
2295         if (!queued) {
2296         discard:
2297                 kfree_skb(skb);
2298         }
2299         }
2300
2301         return 0;
2302 }
2303
2304
2305 /* This is not only more efficient than what we used to do, it eliminates
2306  * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
2307  *
2308  * Actually, we could lots of memory writes here. tp of listening
2309  * socket contains all necessary default parameters.
2310  */
2311 struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb)
2312 {
2313         struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, 0);
2314
2315         if(newsk != NULL) {
2316                 struct tcp_opt *newtp;
2317 #ifdef CONFIG_FILTER
2318                 struct sk_filter *filter;
2319 #endif
2320
2321                 memcpy(newsk, sk, sizeof(*newsk));
2322                 newsk->state = TCP_SYN_RECV;
2323
2324                 /* SANITY */
2325                 newsk->pprev = NULL;
2326                 newsk->prev = NULL;
2327
2328                 /* Clone the TCP header template */
2329                 newsk->dport = req->rmt_port;
2330
2331                 sock_lock_init(newsk);
2332
2333                 atomic_set(&newsk->rmem_alloc, 0);
2334                 skb_queue_head_init(&newsk->receive_queue);
2335                 atomic_set(&newsk->wmem_alloc, 0);
2336                 skb_queue_head_init(&newsk->write_queue);
2337                 atomic_set(&newsk->omem_alloc, 0);
2338
2339                 newsk->done = 0;
2340                 newsk->proc = 0;
2341                 newsk->backlog.head = newsk->backlog.tail = NULL;
2342                 skb_queue_head_init(&newsk->error_queue);
2343                 newsk->write_space = tcp_write_space;
2344 #ifdef CONFIG_FILTER
2345                 if ((filter = newsk->filter) != NULL)
2346                         sk_filter_charge(newsk, filter);
2347 #endif
2348
2349                 /* Now setup tcp_opt */
2350                 newtp = &(newsk->tp_pinfo.af_tcp);
2351                 newtp->pred_flags = 0;
2352                 newtp->rcv_nxt = req->rcv_isn + 1;
2353                 newtp->snd_nxt = req->snt_isn + 1;
2354                 newtp->snd_una = req->snt_isn + 1;
2355                 newtp->srtt = 0;
2356                 newtp->ato = 0;
2357                 newtp->snd_wl1 = req->rcv_isn;
2358                 newtp->snd_wl2 = req->snt_isn;
2359
2360                 /* RFC1323: The window in SYN & SYN/ACK segments
2361                  * is never scaled.
2362                  */
2363                 newtp->snd_wnd = ntohs(skb->h.th->window);
2364
2365                 newtp->max_window = newtp->snd_wnd;
2366                 newtp->pending = 0;
2367                 newtp->retransmits = 0;
2368                 newtp->last_ack_sent = req->rcv_isn + 1;
2369                 newtp->backoff = 0;
2370                 newtp->mdev = TCP_TIMEOUT_INIT;
2371
2372                 /* So many TCP implementations out there (incorrectly) count the
2373                  * initial SYN frame in their delayed-ACK and congestion control
2374                  * algorithms that we must have the following bandaid to talk
2375                  * efficiently to them.  -DaveM
2376                  */
2377                 newtp->snd_cwnd = 2;
2378
2379                 newtp->rto = TCP_TIMEOUT_INIT;
2380                 newtp->packets_out = 0;
2381                 newtp->fackets_out = 0;
2382                 newtp->retrans_out = 0;
2383                 newtp->high_seq = 0;
2384                 newtp->snd_ssthresh = 0x7fffffff;
2385                 newtp->snd_cwnd_cnt = 0;
2386                 newtp->dup_acks = 0;
2387                 newtp->delayed_acks = 0;
2388                 init_timer(&newtp->retransmit_timer);
2389                 newtp->retransmit_timer.function = &tcp_retransmit_timer;
2390                 newtp->retransmit_timer.data = (unsigned long) newsk;
2391                 init_timer(&newtp->delack_timer);
2392                 newtp->delack_timer.function = &tcp_delack_timer;
2393                 newtp->delack_timer.data = (unsigned long) newsk;
2394                 skb_queue_head_init(&newtp->out_of_order_queue);
2395                 newtp->send_head = newtp->retrans_head = NULL;
2396                 newtp->rcv_wup = req->rcv_isn + 1;
2397                 newtp->write_seq = req->snt_isn + 1;
2398                 newtp->copied_seq = req->rcv_isn + 1;
2399
2400                 newtp->saw_tstamp = 0;
2401
2402                 init_timer(&newtp->probe_timer);
2403                 newtp->probe_timer.function = &tcp_probe_timer;
2404                 newtp->probe_timer.data = (unsigned long) newsk;
2405                 newtp->probes_out = 0;
2406                 newtp->syn_seq = req->rcv_isn;
2407                 newtp->fin_seq = req->rcv_isn;
2408                 newtp->urg_data = 0;
2409                 tcp_synq_init(newtp);
2410                 newtp->syn_backlog = 0;
2411                 if (skb->len >= 536)
2412                         newtp->last_seg_size = skb->len;
2413
2414                 /* Back to base struct sock members. */
2415                 newsk->err = 0;
2416                 newsk->ack_backlog = 0;
2417                 newsk->max_ack_backlog = SOMAXCONN;
2418                 newsk->priority = 0;
2419                 atomic_set(&newsk->refcnt, 1);
2420                 atomic_inc(&inet_sock_nr);
2421
2422                 spin_lock_init(&sk->timer_lock);
2423                 init_timer(&newsk->timer);
2424                 newsk->timer.function = &tcp_keepalive_timer;
2425                 newsk->timer.data = (unsigned long) newsk;
2426                 if (newsk->keepopen)
2427                         tcp_reset_keepalive_timer(newsk, keepalive_time_when(newtp));
2428                 newsk->socket = NULL;
2429                 newsk->sleep = NULL;
2430
2431                 newtp->tstamp_ok = req->tstamp_ok;
2432                 if((newtp->sack_ok = req->sack_ok) != 0)
2433                         newtp->num_sacks = 0;
2434                 newtp->window_clamp = req->window_clamp;
2435                 newtp->rcv_wnd = req->rcv_wnd;
2436                 newtp->wscale_ok = req->wscale_ok;
2437                 if (newtp->wscale_ok) {
2438                         newtp->snd_wscale = req->snd_wscale;
2439                         newtp->rcv_wscale = req->rcv_wscale;
2440                 } else {
2441                         newtp->snd_wscale = newtp->rcv_wscale = 0;
2442                         newtp->window_clamp = min(newtp->window_clamp,65535);
2443                 }
2444                 if (newtp->tstamp_ok) {
2445                         newtp->ts_recent = req->ts_recent;
2446                         newtp->ts_recent_stamp = xtime.tv_sec;
2447                         newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
2448                 } else {
2449                         newtp->ts_recent_stamp = 0;
2450                         newtp->tcp_header_len = sizeof(struct tcphdr);
2451                 }
2452                 newtp->mss_clamp = req->mss;
2453         }
2454         return newsk;
2455 }
2456
2457 static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
2458 {
2459         if (seq == s_win)
2460                 return 1;
2461         if (after(end_seq, s_win) && before(seq, e_win))
2462                 return 1;
2463         return (seq == e_win && seq == end_seq);
2464 }
2465
2466
2467 /*
2468  *      Process an incoming packet for SYN_RECV sockets represented
2469  *      as an open_request.
2470  */
2471
2472 struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
2473                            struct open_request *req,
2474                            struct open_request *prev)
2475 {
2476         struct tcphdr *th = skb->h.th;
2477         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2478         u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
2479         int paws_reject = 0;
2480         struct tcp_opt ttp;
2481
2482         /* If socket has already been created, process
2483            packet in its context.
2484
2485            We fall here only due to race, when packets were enqueued
2486            to backlog of listening socket.
2487          */
2488         if (req->sk)
2489                 return req->sk;
2490
2491         ttp.saw_tstamp = 0;
2492         if (th->doff > (sizeof(struct tcphdr)>>2)) {
2493
2494                 tcp_parse_options(NULL, th, &ttp, 0);
2495
2496                 paws_reject = ttp.saw_tstamp &&
2497                         (s32)(ttp.rcv_tsval - req->ts_recent) < 0;
2498         }
2499
2500         /* Check for pure retransmited SYN. */
2501         if (TCP_SKB_CB(skb)->seq == req->rcv_isn &&
2502             flg == TCP_FLAG_SYN &&
2503             !paws_reject) {
2504                 /*
2505                  * RFC793 draws (Incorrectly! It was fixed in RFC1122)
2506                  * this case on figure 6 and figure 8, but formal
2507                  * protocol description says NOTHING.
2508                  * To be more exact, it says that we should send ACK,
2509                  * because this segment (at least, if it has no data)
2510                  * is out of window.
2511                  *
2512                  *  CONCLUSION: RFC793 (even with RFC1122) DOES NOT
2513                  *  describe SYN-RECV state. All the description
2514                  *  is wrong, we cannot believe to it and should
2515                  *  rely only on common sense and implementation
2516                  *  experience.
2517                  *
2518                  * Enforce "SYN-ACK" according to figure 8, figure 6
2519                  * of RFC793, fixed by RFC1122.
2520                  */
2521                 req->class->rtx_syn_ack(sk, req);
2522                 return NULL;
2523         }
2524
2525         /* Further reproduces section "SEGMENT ARRIVES"
2526            for state SYN-RECEIVED of RFC793.
2527            It is broken, however, it does not work only
2528            when SYNs are crossed, which is impossible in our
2529            case.
2530
2531            But generally, we should (RFC lies!) to accept ACK
2532            from SYNACK both here and in tcp_rcv_state_process().
2533            tcp_rcv_state_process() does not, hence, we do not too.
2534
2535            Note that the case is absolutely generic:
2536            we cannot optimize anything here without
2537            violating protocol. All the checks must be made
2538            before attempt to create socket.
2539          */
2540
2541         /* RFC793: "first check sequence number". */
2542
2543         if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
2544                                           req->rcv_isn+1, req->rcv_isn+1+req->rcv_wnd)) {
2545                 /* Out of window: send ACK and drop. */
2546                 if (!(flg & TCP_FLAG_RST))
2547                         req->class->send_ack(skb, req);
2548                 return NULL;
2549         }
2550
2551         /* In sequence, PAWS is OK. */
2552
2553         if (ttp.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, req->rcv_isn+1))
2554                 req->ts_recent = ttp.rcv_tsval;
2555
2556         if (TCP_SKB_CB(skb)->seq == req->rcv_isn) {
2557                 /* Truncate SYN, it is out of window starting
2558                    at req->rcv_isn+1. */
2559                 flg &= ~TCP_FLAG_SYN;
2560         }
2561
2562         /* RFC793: "second check the RST bit" and
2563          *         "fourth, check the SYN bit"
2564          */
2565         if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN))
2566                 goto embryonic_reset;
2567
2568         /* RFC793: "fifth check the ACK field" */
2569
2570         if (!(flg & TCP_FLAG_ACK))
2571                 return NULL;
2572
2573         /* Invalid ACK: reset will be sent by listening socket */
2574         if (TCP_SKB_CB(skb)->ack_seq != req->snt_isn+1)
2575                 return sk;
2576
2577         /* OK, ACK is valid, create big socket and
2578            feed this segment to it. It will repeat all
2579            the tests. THIS SEGMENT MUST MOVE SOCKET TO
2580            ESTABLISHED STATE. If it will be dropped after
2581            socket is created, wait for troubles.
2582          */
2583         sk = tp->af_specific->syn_recv_sock(sk, skb, req, NULL);
2584         if (sk == NULL)
2585                 return NULL;
2586
2587         tcp_dec_slow_timer(TCP_SLT_SYNACK);
2588         req->sk = sk;
2589         return sk;
2590
2591 embryonic_reset:
2592         tcp_synq_unlink(tp, req, prev);
2593         tp->syn_backlog--;
2594         tcp_dec_slow_timer(TCP_SLT_SYNACK);
2595
2596         net_statistics.EmbryonicRsts++;
2597         if (!(flg & TCP_FLAG_RST))
2598                 req->class->send_reset(skb);
2599
2600         req->class->destructor(req);
2601         tcp_openreq_free(req);
2602         return NULL;
2603 }
2604
2605 static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
2606                                          struct tcphdr *th, unsigned len)
2607 {
2608         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2609
2610         tcp_parse_options(sk, th, tp, 0);
2611
2612 #ifdef CONFIG_TCP_TW_RECYCLE
2613         if (tp->ts_recent_stamp && tp->saw_tstamp && !th->rst &&
2614             (s32)(tp->rcv_tsval - tp->ts_recent) < 0 &&
2615             xtime.tv_sec < tp->ts_recent_stamp + PAWS_24DAYS) {
2616                 /* Old duplicate segment. We remember last
2617                    ts_recent from this host in timewait bucket.
2618
2619                    Actually, we could implement per host cache
2620                    to truncate timewait state after RTO. Paranoidal arguments
2621                    of rfc1337 are not enough to close this nice possibility.
2622                 */
2623                 if (net_ratelimit())
2624                         printk(KERN_DEBUG "TCP: tw recycle, PAWS worked. Good.\n");
2625                 if (th->ack)
2626                         return 1;
2627                 goto discard;
2628         }
2629 #endif
2630
2631         if (th->ack) {
2632                 /* rfc793:
2633                  * "If the state is SYN-SENT then
2634                  *    first check the ACK bit
2635                  *      If the ACK bit is set
2636                  *        If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
2637                  *        a reset (unless the RST bit is set, if so drop
2638                  *        the segment and return)"
2639                  *
2640                  *  I cite this place to emphasize one essential
2641                  *  detail, this check is different of one
2642                  *  in established state: SND.UNA <= SEG.ACK <= SND.NXT.
2643                  *  SEG_ACK == SND.UNA == ISS is invalid in SYN-SENT,
2644                  *  because we have no previous data sent before SYN.
2645                  *                                        --ANK(990513)
2646                  *
2647                  *  We do not send data with SYN, so that RFC-correct
2648                  *  test reduces to:
2649                  */
2650                 if (sk->zapped ||
2651                     TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt)
2652                         return 1;
2653
2654                 /* Now ACK is acceptable.
2655                  *
2656                  * "If the RST bit is set
2657                  *    If the ACK was acceptable then signal the user "error:
2658                  *    connection reset", drop the segment, enter CLOSED state,
2659                  *    delete TCB, and return."
2660                  */
2661
2662                 if (th->rst) {
2663                         tcp_reset(sk);
2664                         goto discard;
2665                 }
2666
2667                 /* rfc793:
2668                  *   "fifth, if neither of the SYN or RST bits is set then
2669                  *    drop the segment and return."
2670                  *
2671                  *    See note below!
2672                  *                                        --ANK(990513)
2673                  */
2674                 if (!th->syn)
2675                         goto discard;
2676
2677                 /* rfc793:
2678                  *   "If the SYN bit is on ...
2679                  *    are acceptable then ...
2680                  *    (our SYN has been ACKed), change the connection
2681                  *    state to ESTABLISHED..."
2682                  *
2683                  * Do you see? SYN-less ACKs in SYN-SENT state are
2684                  * completely ignored.
2685                  *
2686                  * The bug causing stalled SYN-SENT sockets
2687                  * was here: tcp_ack advanced snd_una and canceled
2688                  * retransmit timer, so that bare ACK received
2689                  * in SYN-SENT state (even with invalid ack==ISS,
2690                  * because tcp_ack check is too weak for SYN-SENT)
2691                  * causes moving socket to invalid semi-SYN-SENT,
2692                  * semi-ESTABLISHED state and connection hangs.
2693                  *
2694                  * There exist buggy stacks, which really send
2695                  * such ACKs: f.e. 202.226.91.94 (okigate.oki.co.jp)
2696                  * Actually, if this host did not try to get something
2697                  * from ftp.inr.ac.ru I'd never find this bug 8)
2698                  *
2699                  *                                     --ANK (990514)
2700                  *
2701                  * I was wrong, I apologize. Bare ACK is valid.
2702                  * Actually, RFC793 requires to send such ACK
2703                  * in reply to any out of window packet.
2704                  * It is wrong, but Linux also does it sometimes.
2705                  *                                     --ANK (990724)
2706                  */
2707
2708                 tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
2709                 tcp_ack(sk,th, TCP_SKB_CB(skb)->seq,
2710                         TCP_SKB_CB(skb)->ack_seq, len);
2711
2712                 /* Ok.. it's good. Set up sequence numbers and
2713                  * move to established.
2714                  */
2715                 tp->rcv_nxt = TCP_SKB_CB(skb)->seq+1;
2716                 tp->rcv_wup = TCP_SKB_CB(skb)->seq+1;
2717
2718                 /* RFC1323: The window in SYN & SYN/ACK segments is
2719                  * never scaled.
2720                  */
2721                 tp->snd_wnd = htons(th->window);
2722                 tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
2723                 tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq;
2724                 tp->fin_seq = TCP_SKB_CB(skb)->seq;
2725
2726                 tcp_set_state(sk, TCP_ESTABLISHED);
2727
2728                 if (tp->wscale_ok == 0) {
2729                         tp->snd_wscale = tp->rcv_wscale = 0;
2730                         tp->window_clamp = min(tp->window_clamp,65535);
2731                 }
2732
2733                 if (tp->tstamp_ok) {
2734                         tp->tcp_header_len =
2735                                 sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
2736                 } else
2737                         tp->tcp_header_len = sizeof(struct tcphdr);
2738                 if (tp->saw_tstamp) {
2739                         tp->ts_recent = tp->rcv_tsval;
2740                         tp->ts_recent_stamp = xtime.tv_sec;
2741                 }
2742                 tcp_sync_mss(sk, tp->pmtu_cookie);
2743                 tcp_initialize_rcv_mss(sk);
2744                 tcp_init_metrics(sk);
2745
2746                 if (tp->write_pending) {
2747                         /* Save one ACK. Data will be ready after
2748                          * several ticks, if write_pending is set.
2749                          *
2750                          * How to make this correctly?
2751                          */
2752                         tp->delayed_acks++;
2753                         if (tp->ato == 0)
2754                                 tp->ato = tp->rto;
2755                         tcp_send_delayed_ack(sk, tp->rto);
2756                 } else {
2757                         tcp_send_ack(sk);
2758                 }
2759
2760                 tp->copied_seq = tp->rcv_nxt;
2761
2762                 if(!sk->dead) {
2763                         wake_up_interruptible(sk->sleep);
2764                         sock_wake_async(sk->socket, 0, POLL_IN);
2765                 }
2766                 return -1;
2767         }
2768
2769         /* No ACK in the segment */
2770
2771         if (th->rst) {
2772                 /* rfc793:
2773                  * "If the RST bit is set
2774                  *
2775                  *      Otherwise (no ACK) drop the segment and return."
2776                  */
2777
2778                 goto discard;
2779         }
2780
2781         if (th->syn) {
2782                 /* We see SYN without ACK. It is attempt of
2783                  *  simultaneous connect with crossed SYNs.
2784                  *
2785                  * The previous version of the code
2786                  * checked for "connecting to self"
2787                  * here. that check is done now in
2788                  * tcp_connect.
2789                  *
2790                  * RED-PEN: BTW, it does not. 8)
2791                  */
2792                 tcp_set_state(sk, TCP_SYN_RECV);
2793                 if (tp->saw_tstamp) {
2794                         tp->ts_recent = tp->rcv_tsval;
2795                         tp->ts_recent_stamp = xtime.tv_sec;
2796                 }
2797
2798                 tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
2799                 tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
2800
2801                 /* RFC1323: The window in SYN & SYN/ACK segments is
2802                  * never scaled.
2803                  */
2804                 tp->snd_wnd = htons(th->window);
2805                 tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
2806
2807                 tcp_sync_mss(sk, tp->pmtu_cookie);
2808                 tcp_initialize_rcv_mss(sk);
2809
2810                 tcp_send_synack(sk);
2811 #if 0
2812                 /* Note, we could accept data and URG from this segment.
2813                  * There are no obstacles to make this.
2814                  *
2815                  * However, if we ignore data in ACKless segments sometimes,
2816                  * we have no reasons to accept it sometimes.
2817                  * Also, seems the code doing it in step6 of tcp_rcv_state_process
2818                  * is not flawless. So, discard packet for sanity.
2819                  * Uncomment this return to process the data.
2820                  */
2821                 return -1;
2822 #endif
2823         }
2824         /* "fifth, if neither of the SYN or RST bits is set then
2825          * drop the segment and return."
2826          */
2827
2828 discard:
2829         kfree_skb(skb);
2830         return 0;
2831 }
2832
2833
2834 /*
2835  *      This function implements the receiving procedure of RFC 793 for
2836  *      all states except ESTABLISHED and TIME_WAIT.
2837  *      It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
2838  *      address independent.
2839  */
2840
2841 int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
2842                           struct tcphdr *th, unsigned len)
2843 {
2844         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2845         int queued = 0;
2846
2847         tp->saw_tstamp = 0;
2848
2849         switch (sk->state) {
2850         case TCP_CLOSE:
2851                 /* When state == CLOSED, hash lookup always fails.
2852                  *
2853                  * But, there is a back door, the backlog queue.
2854                  * If we have a sequence of packets in the backlog
2855                  * during __release_sock() which have a sequence such
2856                  * that:
2857                  *      packet X        causes entry to TCP_CLOSE state
2858                  *      ...
2859                  *      packet X + N    has FIN bit set
2860                  *
2861                  * We report a (luckily) harmless error in this case.
2862                  * The issue is that backlog queue processing bypasses
2863                  * any hash lookups (we know which socket packets are for).
2864                  * The correct behavior here is what 2.0.x did, since
2865                  * a TCP_CLOSE socket does not exist.  Drop the frame
2866                  * and send a RST back to the other end.
2867                  */
2868
2869                 /* 1. The socket may be moved to TIME-WAIT state.
2870                    2. While this socket was locked, another socket
2871                       with the same identity could be created.
2872                    3. To continue?
2873
2874                    CONCLUSION: discard and only discard!
2875
2876                    Alternative would be relookup and recurse into tcp_v?_rcv
2877                    (not *_do_rcv) to work with timewait and listen states
2878                    correctly.
2879                  */
2880                 goto discard;
2881
2882         case TCP_LISTEN:
2883                 if(th->ack)
2884                         return 1;
2885
2886                 if(th->syn) {
2887                         if(tp->af_specific->conn_request(sk, skb) < 0)
2888                                 return 1;
2889
2890                         /* Now we have several options: In theory there is
2891                          * nothing else in the frame. KA9Q has an option to
2892                          * send data with the syn, BSD accepts data with the
2893                          * syn up to the [to be] advertised window and
2894                          * Solaris 2.1 gives you a protocol error. For now
2895                          * we just ignore it, that fits the spec precisely
2896                          * and avoids incompatibilities. It would be nice in
2897                          * future to drop through and process the data.
2898                          *
2899                          * Now that TTCP is starting to be used we ought to
2900                          * queue this data.
2901                          * But, this leaves one open to an easy denial of
2902                          * service attack, and SYN cookies can't defend
2903                          * against this problem. So, we drop the data
2904                          * in the interest of security over speed.
2905                          */
2906                         goto discard;
2907                 }
2908                 goto discard;
2909
2910         case TCP_SYN_SENT:
2911                 queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
2912                 if (queued >= 0)
2913                         return queued;
2914                 queued = 0;
2915                 goto step6;
2916         }
2917
2918         /*   Parse the tcp_options present on this header.
2919          *   By this point we really only expect timestamps.
2920          *   Note that this really has to be here and not later for PAWS
2921          *   (RFC1323) to work.
2922          */
2923         if (tcp_fast_parse_options(sk, th, tp) && tp->saw_tstamp &&
2924             tcp_paws_discard(tp, skb)) {
2925                 if (!th->rst) {
2926                         tcp_send_ack(sk);
2927                         goto discard;
2928                 }
2929                 /* Reset is accepted even if it did not pass PAWS. */
2930         }
2931
2932         /* The silly FIN test here is necessary to see an advancing ACK in
2933          * retransmitted FIN frames properly.  Consider the following sequence:
2934          *
2935          *      host1 --> host2         FIN XSEQ:XSEQ(0) ack YSEQ
2936          *      host2 --> host1         FIN YSEQ:YSEQ(0) ack XSEQ
2937          *      host1 --> host2         XSEQ:XSEQ(0) ack YSEQ+1
2938          *      host2 --> host1         FIN YSEQ:YSEQ(0) ack XSEQ+1     (fails tcp_sequence test)
2939          *
2940          * At this point the connection will deadlock with host1 believing
2941          * that his FIN is never ACK'd, and thus it will retransmit it's FIN
2942          * forever.  The following fix is from Taral (taral@taral.net).
2943          *
2944          * RED-PEN. Seems, the above is not true.
2945          * If at least one end is RFC compliant, it will send ACK to
2946          * out of window FIN and, hence, move peer to TIME-WAIT.
2947          * I comment out this line. --ANK
2948          *
2949          * RED-PEN. DANGER! tcp_sequence check rejects also SYN-ACKs
2950          * received in SYN-RECV. The problem is that description of
2951          * segment processing in SYN-RECV state in RFC792 is WRONG.
2952          * Correct check would accept ACK from this SYN-ACK, see
2953          * figures 6 and 8 (fixed by RFC1122). Compare this
2954          * to problem with FIN, they smell similarly. --ANK
2955          */
2956
2957         /* step 1: check sequence number */
2958         if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)
2959 #if 0
2960             && !(th->fin && TCP_SKB_CB(skb)->end_seq == tp->rcv_nxt)
2961 #endif
2962             ) {
2963                 if (!th->rst) {
2964                         tcp_send_ack(sk);
2965                 }
2966                 goto discard;
2967         }
2968
2969         /* step 2: check RST bit */
2970         if(th->rst) {
2971                 tcp_reset(sk);
2972                 goto discard;
2973         }
2974
2975         if (tp->saw_tstamp) {
2976                 tcp_replace_ts_recent(sk, tp,
2977                                       TCP_SKB_CB(skb)->seq);
2978         }
2979
2980         /* step 3: check security and precedence [ignored] */
2981
2982         /*      step 4:
2983          *
2984          *      Check for a SYN, and ensure it matches the SYN we were
2985          *      first sent. We have to handle the rather unusual (but valid)
2986          *      sequence that KA9Q derived products may generate of
2987          *
2988          *      SYN
2989          *                              SYN|ACK Data
2990          *      ACK     (lost)
2991          *                              SYN|ACK Data + More Data
2992          *      .. we must ACK not RST...
2993          *
2994          *      We keep syn_seq as the sequence space occupied by the
2995          *      original syn.
2996          */
2997
2998         if (th->syn && TCP_SKB_CB(skb)->seq != tp->syn_seq) {
2999                 tcp_reset(sk);
3000                 return 1;
3001         }
3002
3003         /* step 5: check the ACK field */
3004         if (th->ack) {
3005                 int acceptable = tcp_ack(sk, th, TCP_SKB_CB(skb)->seq,
3006                                          TCP_SKB_CB(skb)->ack_seq, len);
3007
3008                 switch(sk->state) {
3009                 case TCP_SYN_RECV:
3010                         if (acceptable) {
3011                                 tcp_set_state(sk, TCP_ESTABLISHED);
3012                                 tp->copied_seq = tp->rcv_nxt;
3013
3014                                 /* Note, that this wakeup is only for marginal
3015                                    crossed SYN case. Passively open sockets
3016                                    are not waked up, because sk->sleep == NULL
3017                                    and sk->socket == NULL.
3018                                  */
3019                                 if (!sk->dead && sk->sleep) {
3020                                         wake_up_interruptible(sk->sleep);
3021                                         sock_wake_async(sk->socket,0,POLL_OUT);
3022                                 }
3023
3024                                 tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
3025                                 tp->snd_wnd = htons(th->window) << tp->snd_wscale;
3026                                 tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
3027                                 tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq;
3028
3029                                 /* tcp_ack considers this ACK as duplicate
3030                                  * and does not calculate rtt. It is wrong.
3031                                  * Fix it at least with timestamps.
3032                                  */
3033                                 if (tp->saw_tstamp && !tp->srtt)
3034                                         tcp_ack_saw_tstamp(sk, tp, 0, 0, FLAG_SYN_ACKED);
3035
3036                                 tcp_init_metrics(sk);
3037                         } else {
3038                                 SOCK_DEBUG(sk, "bad ack\n");
3039                                 return 1;
3040                         }
3041                         break;
3042
3043                 case TCP_FIN_WAIT1:
3044                         if (tp->snd_una == tp->write_seq) {
3045                                 sk->shutdown |= SEND_SHUTDOWN;
3046                                 tcp_set_state(sk, TCP_FIN_WAIT2);
3047                                 if (!sk->dead)
3048                                         sk->state_change(sk);
3049                                 else
3050                                         tcp_reset_keepalive_timer(sk, sysctl_tcp_fin_timeout);
3051                                 dst_confirm(sk->dst_cache);
3052                         }
3053                         break;
3054
3055                 case TCP_CLOSING:
3056                         if (tp->snd_una == tp->write_seq) {
3057                                 tcp_time_wait(sk);
3058                                 goto discard;
3059                         }
3060                         break;
3061
3062                 case TCP_LAST_ACK:
3063                         if (tp->snd_una == tp->write_seq) {
3064                                 tcp_set_state(sk,TCP_CLOSE);
3065                                 tcp_update_metrics(sk);
3066                                 tcp_done(sk);
3067                                 goto discard;
3068                         }
3069                         break;
3070                 }
3071         } else
3072                 goto discard;
3073
3074 step6:
3075         /* step 6: check the URG bit */
3076         tcp_urg(sk, th, len);
3077
3078         /* step 7: process the segment text */
3079         switch (sk->state) {
3080         case TCP_CLOSE_WAIT:
3081         case TCP_CLOSING:
3082                 if (!before(TCP_SKB_CB(skb)->seq, tp->fin_seq))
3083                         break;
3084
3085         case TCP_FIN_WAIT1:
3086         case TCP_FIN_WAIT2:
3087                 /* RFC 793 says to queue data in these states,
3088                  * RFC 1122 says we MUST send a reset.
3089                  * BSD 4.4 also does reset.
3090                  */
3091                 if ((sk->shutdown & RCV_SHUTDOWN) && sk->dead) {
3092                         if (after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
3093                                 tcp_reset(sk);
3094                                 return 1;
3095                         }
3096                 }
3097
3098         case TCP_ESTABLISHED:
3099                 queued = tcp_data(skb, sk, len);
3100
3101                 /* This must be after tcp_data() does the skb_pull() to
3102                  * remove the header size from skb->len.
3103                  */
3104                 tcp_measure_rcv_mss(sk, skb);
3105                 break;
3106         }
3107
3108         /* tcp_data could move socket to TIME-WAIT */
3109         if (sk->state != TCP_CLOSE) {
3110                 tcp_data_snd_check(sk);
3111                 tcp_ack_snd_check(sk);
3112         }
3113
3114         if (!queued) {
3115 discard:
3116                 kfree_skb(skb);
3117         }
3118         return 0;
3119 }