[mascara-docs.git] / i386 / linux-2.3.21 / net / ipv4 / tcp_input.c
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * Version: $Id: tcp_input.c,v 1.173 1999/09/07 02:31:27 davem Exp $
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 * Linus Torvalds, <torvalds@cs.helsinki.fi>
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Matthew Dillon, <dillon@apollo.west.oic.com>
19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 * Jorge Cwik, <jorge@laser.satlink.net>
24 * Changes:
25 * Pedro Roque : Fast Retransmit/Recovery.
26 * Two receive queues.
27 * Retransmit queue handled by TCP.
28 * Better retransmit timer handling.
29 * New congestion avoidance.
30 * Header prediction.
31 * Variable renaming.
33 * Eric : Fast Retransmit.
34 * Randy Scott : MSS option defines.
35 * Eric Schenk : Fixes to slow start algorithm.
36 * Eric Schenk : Yet another double ACK bug.
37 * Eric Schenk : Delayed ACK bug fixes.
38 * Eric Schenk : Floyd style fast retrans war avoidance.
39 * David S. Miller : Don't allow zero congestion window.
40 * Eric Schenk : Fix retransmitter so that it sends
41 * next packet on ack of previous packet.
42 * Andi Kleen : Moved open_request checking here
43 * and process RSTs for open_requests.
44 * Andi Kleen : Better prune_queue, and other fixes.
45 * Andrey Savochkin: Fix RTT measurements in the presence of
46 * timestamps.
47 * Andrey Savochkin: Check sequence numbers correctly when
48 * removing SACKs due to in sequence incoming
49 * data segments.
50 * Andi Kleen: Make sure we never ack data there is not
51 * enough room for. Also make this condition
52 * a fatal error if it might still happen.
53 * Andi Kleen: Add tcp_measure_rcv_mss to make
54 * connections with MSS<min(MTU,ann. MSS)
55 * work without delayed acks.
56 * Andi Kleen: Process packets with PSH set in the
57 * fast path.
60 #include <linux/config.h>
61 #include <linux/mm.h>
62 #include <linux/sysctl.h>
63 #include <net/tcp.h>
64 #include <net/inet_common.h>
65 #include <linux/ipsec.h>
67 #ifdef CONFIG_SYSCTL
68 #define SYNC_INIT 0 /* let the user enable it */
69 #else
70 #define SYNC_INIT 1
71 #endif
73 extern int sysctl_tcp_fin_timeout;
74 extern int sysctl_tcp_keepalive_time;
76 /* These are on by default so the code paths get tested.
77 * For the final 2.2 this may be undone at our discretion. -DaveM
79 int sysctl_tcp_timestamps = 1;
80 int sysctl_tcp_window_scaling = 1;
81 int sysctl_tcp_sack = 1;
83 int sysctl_tcp_syncookies = SYNC_INIT;
84 int sysctl_tcp_stdurg;
85 int sysctl_tcp_rfc1337;
86 int sysctl_tcp_tw_recycle;
88 static int prune_queue(struct sock *sk);
90 /* There is something which you must keep in mind when you analyze the
91 * behavior of the tp->ato delayed ack timeout interval. When a
92 * connection starts up, we want to ack as quickly as possible. The
93 * problem is that "good" TCP's do slow start at the beginning of data
94 * transmission. This means that until we send the first few ACKs the
95 * sender will sit on his end and only queue most of his data, because
96 * he can only send snd_cwnd unacked packets at any given time. For
97 * each ACK we send, he increments snd_cwnd and transmits more of his
98 * queue. -DaveM
100 static void tcp_delack_estimator(struct tcp_opt *tp)
102 if(tp->ato == 0) {
103 tp->lrcvtime = tcp_time_stamp;
105 /* Help the sender leave slow start quickly,
106 * and also make sure we do not take this
107 * branch ever again for this connection.
109 tp->ato = 1;
110 tcp_enter_quickack_mode(tp);
111 } else {
112 int m = tcp_time_stamp - tp->lrcvtime;
114 tp->lrcvtime = tcp_time_stamp;
115 if(m <= 0)
116 m = 1;
117 if(m > tp->rto)
118 tp->ato = tp->rto;
119 else {
120 /* This funny shift makes sure we
121 * clear the "quick ack mode" bit.
123 tp->ato = ((tp->ato << 1) >> 2) + m;
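/* Assuming tp->ato is an unsigned 32-bit field (as the masks in
 * tcp_remember_ack below suggest), the shift pair above is equivalent to
 * "ato = (ato & 0x7fffffff)/2 + m": the << 1 drops the top bit, which is
 * used as the "quick ack mode" flag, and the >> 2 then halves what is left
 * before the new inter-arrival measurement is added in.
 */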
129 * Remember to send an ACK later.
131 static __inline__ void tcp_remember_ack(struct tcp_opt *tp, struct tcphdr *th,
132 struct sk_buff *skb)
134 tp->delayed_acks++;
136 /* Tiny-grams with PSH set artificially deflate our
137 * ato measurement, but with a lower bound.
139 if(th->psh && (skb->len < (tp->rcv_mss >> 1))) {
140 /* Preserve the quickack state. */
141 if((tp->ato & 0x7fffffff) > HZ/50)
142 tp->ato = ((tp->ato & 0x80000000) |
143 (HZ/50));
147 /* Called to compute a smoothed rtt estimate. The data fed to this
148 * routine either comes from timestamps, or from segments that were
149 * known _not_ to have been retransmitted [see Karn/Partridge
150 * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
151 * piece by Van Jacobson.
152 * NOTE: the next three routines used to be one big routine.
153 * To save cycles in the RFC 1323 implementation it was better to break
154 * it up into three procedures. -- erics
157 static __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt)
159 long m = mrtt; /* RTT */
161 /* The following amusing code comes from Jacobson's
162 * article in SIGCOMM '88. Note that rtt and mdev
163 * are scaled versions of rtt and mean deviation.
164 * This is designed to be as fast as possible
165 * m stands for "measurement".
167 * In a 1990 paper the rto value is changed to:
168 * RTO = rtt + 4 * mdev
170 if(m == 0)
171 m = 1;
172 if (tp->srtt != 0) {
173 m -= (tp->srtt >> 3); /* m is now error in rtt est */
174 tp->srtt += m; /* rtt = 7/8 rtt + 1/8 new */
175 if (m < 0)
176 m = -m; /* m is now abs(error) */
177 m -= (tp->mdev >> 2); /* similar update on mdev */
178 tp->mdev += m; /* mdev = 3/4 mdev + 1/4 new */
179 } else {
180 /* no previous measure. */
181 tp->srtt = m<<3; /* take the measured time to be rtt */
182 tp->mdev = m<<2; /* make sure rto = 3*rtt */
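/* A minimal, self-contained sketch of the fixed-point smoothing above
 * (hypothetical names, not part of this file): srtt is kept scaled by 8
 * and mdev by 4, so the later "rto = (srtt >> 3) + mdev" needs only
 * shifts and adds.
 */
struct rtt_sketch { long srtt; long mdev; };	/* scaled by 8 and 4 */

static void rtt_sketch_sample(struct rtt_sketch *e, long m)
{
	if (m <= 0)
		m = 1;				/* measured RTT, in jiffies */
	if (e->srtt != 0) {
		m -= (e->srtt >> 3);		/* error against current estimate */
		e->srtt += m;			/* srtt = 7/8 srtt + 1/8 sample */
		if (m < 0)
			m = -m;
		m -= (e->mdev >> 2);
		e->mdev += m;			/* mdev = 3/4 mdev + 1/4 |error| */
	} else {
		e->srtt = m << 3;		/* first sample primes both */
		e->mdev = m << 2;
	}
}
/* With samples of 100, 100 and 200 jiffies this yields srtt == 900 and
 * mdev == 325, so (srtt >> 3) + mdev == 437 jiffies before the extra terms
 * and clamping applied by tcp_set_rto()/tcp_bound_rto() below.
 */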
186 /* Calculate rto without backoff. This is the second half of Van Jacobson's
187 * routine referred to above.
190 static __inline__ void tcp_set_rto(struct tcp_opt *tp)
192 tp->rto = (tp->srtt >> 3) + tp->mdev;
193 /* I am not educated enough to understand this magic.
194 * However, it smells bad: snd_cwnd>31 is a common case.
196 tp->rto += (tp->rto >> 2) + (tp->rto >> (tp->snd_cwnd-1));
200 /* Keep the rto between HZ/5 and 120*HZ. 120*HZ is the upper bound
201 * on packet lifetime in the internet. We need the HZ/5 lower
202 * bound to behave correctly against BSD stacks with a fixed
203 * delayed ack.
204 * FIXME: It's not entirely clear this lower bound is the best
205 * way to avoid the problem. Is it possible to drop the lower
206 * bound and still avoid trouble with BSD stacks? Perhaps
207 * some modification to the RTO calculation that takes delayed
208 * ack bias into account? This needs serious thought. -- erics
210 static __inline__ void tcp_bound_rto(struct tcp_opt *tp)
212 if (tp->rto > 120*HZ)
213 tp->rto = 120*HZ;
214 if (tp->rto < HZ/5)
215 tp->rto = HZ/5;
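/* Worked example with assumed numbers for the two routines above: with
 * srtt == 900, mdev == 325 and snd_cwnd == 4, tcp_set_rto() computes
 * rto = 112 + 325 = 437, then rto += (437 >> 2) + (437 >> 3) = 600.
 * For HZ == 100 that already lies inside the [HZ/5, 120*HZ] clamp, so
 * tcp_bound_rto() leaves it untouched.
 */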
218 /* Save metrics learned by this TCP session.
219 This function is called only when TCP finishes successfully,
220 i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE.
222 static void tcp_update_metrics(struct sock *sk)
224 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
225 struct dst_entry *dst = __sk_dst_get(sk);
227 if (dst) {
228 int m;
230 if (tp->backoff || !tp->srtt) {
231 /* This session failed to estimate rtt. Why?
232 * Probably, no packets returned in time.
233 * Reset our results.
235 if (!(dst->mxlock&(1<<RTAX_RTT)))
236 dst->rtt = 0;
237 return;
240 dst_confirm(dst);
242 m = dst->rtt - tp->srtt;
244 /* If the newly calculated rtt is larger than the stored one,
245 * store the new one. Otherwise, use EWMA. Remember,
246 * rtt overestimation is always better than underestimation.
248 if (!(dst->mxlock&(1<<RTAX_RTT))) {
249 if (m <= 0)
250 dst->rtt = tp->srtt;
251 else
252 dst->rtt -= (m>>3);
255 if (!(dst->mxlock&(1<<RTAX_RTTVAR))) {
256 if (m < 0)
257 m = -m;
259 /* Scale deviation to rttvar fixed point */
260 m >>= 1;
261 if (m < tp->mdev)
262 m = tp->mdev;
264 if (m >= dst->rttvar)
265 dst->rttvar = m;
266 else
267 dst->rttvar -= (dst->rttvar - m)>>2;
270 if (tp->snd_ssthresh == 0x7FFFFFFF) {
271 /* Slow start still did not finish. */
272 if (dst->ssthresh &&
273 !(dst->mxlock&(1<<RTAX_SSTHRESH)) &&
274 tp->snd_cwnd > dst->ssthresh)
275 dst->ssthresh = tp->snd_cwnd;
276 if (!(dst->mxlock&(1<<RTAX_CWND)) &&
277 tp->snd_cwnd > dst->cwnd)
278 dst->cwnd = tp->snd_cwnd;
279 } else if (tp->snd_cwnd >= tp->snd_ssthresh && !tp->high_seq) {
280 /* Cong. avoidance phase, cwnd is reliable. */
281 if (!(dst->mxlock&(1<<RTAX_SSTHRESH)))
282 dst->ssthresh = tp->snd_cwnd;
283 if (!(dst->mxlock&(1<<RTAX_CWND)))
284 dst->cwnd = (dst->cwnd + tp->snd_cwnd)>>1;
285 } else {
286 /* Else slow start did not finish, cwnd is nonsense,
287 and ssthresh may also be invalid.
289 if (!(dst->mxlock&(1<<RTAX_CWND)))
290 dst->cwnd = (dst->cwnd + tp->snd_ssthresh)>>1;
291 if (dst->ssthresh &&
292 !(dst->mxlock&(1<<RTAX_SSTHRESH)) &&
293 tp->snd_ssthresh > dst->ssthresh)
294 dst->ssthresh = tp->snd_ssthresh;
299 /* Initialize metrics on socket. */
301 static void tcp_init_metrics(struct sock *sk)
303 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
304 struct dst_entry *dst = __sk_dst_get(sk);
306 if (dst == NULL)
307 goto reset;
309 dst_confirm(dst);
311 if (dst->rtt == 0)
312 goto reset;
314 if (!tp->srtt || !tp->saw_tstamp)
315 goto reset;
317 /* Initial rtt is determined from SYN,SYN-ACK.
318 * The segment is small and rtt may appear much
319 * less than the real one. Use per-dst memory
320 * to make it more realistic.
322 * A bit of theory. RTT is the time that passes after a "normal" sized packet
323 * is sent until it is ACKed. In normal circumstances sending small
324 * packets forces the peer to delay ACKs, so the calculation is correct too.
325 * The algorithm is adaptive and, provided we follow the specs, it
326 * NEVER underestimates RTT. BUT! If the peer plays clever
327 * tricks, sort of "quick acks", for long enough to decrease RTT
328 * to a low value, and then abruptly stops doing it and starts to delay
329 * ACKs, expect trouble.
331 if (dst->rtt > tp->srtt)
332 tp->srtt = dst->rtt;
333 if (dst->rttvar > tp->mdev)
334 tp->mdev = dst->rttvar;
335 tcp_set_rto(tp);
336 tcp_bound_rto(tp);
338 if (dst->mxlock&(1<<RTAX_CWND))
339 tp->snd_cwnd_clamp = dst->cwnd;
340 if (dst->ssthresh) {
341 tp->snd_ssthresh = dst->ssthresh;
342 if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
343 tp->snd_ssthresh = tp->snd_cwnd_clamp;
345 return;
348 reset:
349 /* Play conservative. If timestamps are not
350 * supported, TCP will fail to recalculate correct
351 * rtt, if initial rto is too small. FORGET ALL AND RESET!
353 if (!tp->saw_tstamp && tp->srtt) {
354 tp->srtt = 0;
355 tp->mdev = TCP_TIMEOUT_INIT;
356 tp->rto = TCP_TIMEOUT_INIT;
360 #define PAWS_24DAYS (60 * 60 * 24 * 24)
363 /* WARNING: this must not be called if tp->saw_tstamp was false. */
364 extern __inline__ void
365 tcp_replace_ts_recent(struct sock *sk, struct tcp_opt *tp, u32 seq)
367 if (!after(seq, tp->last_ack_sent)) {
368 /* PAWS bug workaround wrt. ACK frames, the PAWS discard
369 * extra check below makes sure this can only happen
370 * for pure ACK frames. -DaveM
372 * Not only that, it also occurs for expired timestamps
373 * and RSTs with a bad timestamp option. --ANK
376 if((s32)(tp->rcv_tsval - tp->ts_recent) >= 0 ||
377 xtime.tv_sec >= tp->ts_recent_stamp + PAWS_24DAYS) {
378 tp->ts_recent = tp->rcv_tsval;
379 tp->ts_recent_stamp = xtime.tv_sec;
384 extern __inline__ int tcp_paws_discard(struct tcp_opt *tp, struct sk_buff *skb)
386 return ((s32)(tp->rcv_tsval - tp->ts_recent) < 0 &&
387 xtime.tv_sec < tp->ts_recent_stamp + PAWS_24DAYS
389 /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM
391 I cannot quietly watch as the whole idea behind PAWS
392 is destroyed 8)
394 The problem is only in reordering duplicate ACKs.
395 Hence, we can check this rare case more carefully.
397 1. Check that it is really a duplicate ACK (ack==snd_una)
398 2. Give it some small "replay" window (~RTO)
400 We do not know the units of foreign ts values, but make the conservative
401 assumption that they are >=1ms. That solves the problem
402 noted in Dave's mail to tcpimpl and does not harm PAWS. --ANK
404 && (TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq ||
405 TCP_SKB_CB(skb)->ack_seq != tp->snd_una ||
406 !skb->h.th->ack ||
407 (s32)(tp->ts_recent - tp->rcv_tsval) > (tp->rto*1024)/HZ));
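/* A minimal sketch of the signed-difference trick used in the PAWS tests
 * above (hypothetical helper, not part of this file): 32-bit timestamps
 * wrap, so "a is older than b" is decided by casting the difference to a
 * signed 32-bit value rather than comparing a and b directly.
 */
static __inline__ int ts_older(__u32 a, __u32 b)
{
	return (s32)(a - b) < 0;	/* true iff a precedes b modulo 2^32 */
}
/* e.g. ts_older(0xfffffff0, 0x00000010) is true, although numerically
 * 0xfffffff0 > 0x10; the comparison is only meaningful within half the
 * 32-bit space.
 */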
411 static int __tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq)
413 u32 end_window = tp->rcv_wup + tp->rcv_wnd;
415 if (tp->rcv_wnd &&
416 after(end_seq, tp->rcv_nxt) &&
417 before(seq, end_window))
418 return 1;
419 if (seq != end_window)
420 return 0;
421 return (seq == end_seq);
424 /* This function checks to see if the tcp header is actually acceptable. */
425 extern __inline__ int tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq)
427 if (seq == tp->rcv_nxt)
428 return (tp->rcv_wnd || (end_seq == seq));
430 return __tcp_sequence(tp, seq, end_seq);
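/* Worked example with assumed numbers for the two checks above: with
 * rcv_nxt == rcv_wup == 1000 and rcv_wnd == 500 (so end_window == 1500),
 * a segment covering 900..1100 is accepted because it still reaches into
 * the window, while one covering 1600..1700 is rejected because it starts
 * beyond end_window; a zero-length segment with seq == rcv_nxt is always
 * acceptable.
 */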
433 /* When we get a reset we do this. */
434 static void tcp_reset(struct sock *sk)
436 sk->zapped = 1;
438 /* We want the right error as BSD sees it (and indeed as we do). */
439 switch (sk->state) {
440 case TCP_SYN_SENT:
441 sk->err = ECONNREFUSED;
442 break;
443 case TCP_CLOSE_WAIT:
444 sk->err = EPIPE;
445 break;
446 case TCP_CLOSE:
447 return;
448 default:
449 sk->err = ECONNRESET;
451 tcp_set_state(sk, TCP_CLOSE);
452 tcp_clear_xmit_timers(sk);
453 tcp_done(sk);
456 /* This tags the retransmission queue when SACKs arrive. */
457 static void tcp_sacktag_write_queue(struct sock *sk, struct tcp_sack_block *sp, int nsacks)
459 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
460 int i = nsacks;
462 while(i--) {
463 struct sk_buff *skb = skb_peek(&sk->write_queue);
464 __u32 start_seq = ntohl(sp->start_seq);
465 __u32 end_seq = ntohl(sp->end_seq);
466 int fack_count = 0;
468 while((skb != NULL) &&
469 (skb != tp->send_head) &&
470 (skb != (struct sk_buff *)&sk->write_queue)) {
471 /* The retransmission queue is always in order, so
472 * we can short-circuit the walk early.
474 if(after(TCP_SKB_CB(skb)->seq, end_seq))
475 break;
477 /* We play conservative, we don't allow SACKS to partially
478 * tag a sequence space.
480 fack_count++;
481 if(!after(start_seq, TCP_SKB_CB(skb)->seq) &&
482 !before(end_seq, TCP_SKB_CB(skb)->end_seq)) {
483 /* If this was a retransmitted frame, account for it. */
484 if((TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) &&
485 tp->retrans_out)
486 tp->retrans_out--;
487 TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED;
489 /* RULE: All new SACKs will either decrease retrans_out
490 * or advance fackets_out.
492 if(fack_count > tp->fackets_out)
493 tp->fackets_out = fack_count;
495 skb = skb->next;
497 sp++; /* Move on to the next SACK block. */
501 /* Look for tcp options. Normally only called on SYN and SYNACK packets.
502 * But, this can also be called on packets in the established flow when
503 * the fast version below fails.
505 void tcp_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp, int no_fancy)
507 unsigned char *ptr;
508 int length=(th->doff*4)-sizeof(struct tcphdr);
510 ptr = (unsigned char *)(th + 1);
511 tp->saw_tstamp = 0;
513 while(length>0) {
514 int opcode=*ptr++;
515 int opsize;
517 switch (opcode) {
518 case TCPOPT_EOL:
519 return;
520 case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
521 length--;
522 continue;
523 default:
524 opsize=*ptr++;
525 if (opsize < 2) /* "silly options" */
526 return;
527 if (opsize > length)
528 break; /* don't parse partial options */
529 switch(opcode) {
530 case TCPOPT_MSS:
531 if(opsize==TCPOLEN_MSS && th->syn) {
532 u16 in_mss = ntohs(*(__u16 *)ptr);
533 if (in_mss) {
534 if (tp->user_mss && tp->user_mss < in_mss)
535 in_mss = tp->user_mss;
536 tp->mss_clamp = in_mss;
539 break;
540 case TCPOPT_WINDOW:
541 if(opsize==TCPOLEN_WINDOW && th->syn)
542 if (!no_fancy && sysctl_tcp_window_scaling) {
543 tp->wscale_ok = 1;
544 tp->snd_wscale = *(__u8 *)ptr;
545 if(tp->snd_wscale > 14) {
546 if(net_ratelimit())
547 printk("tcp_parse_options: Illegal window "
548 "scaling value %d >14 received.",
549 tp->snd_wscale);
550 tp->snd_wscale = 14;
553 break;
554 case TCPOPT_TIMESTAMP:
555 if(opsize==TCPOLEN_TIMESTAMP) {
556 if (sysctl_tcp_timestamps && !no_fancy) {
557 tp->tstamp_ok = 1;
558 tp->saw_tstamp = 1;
559 tp->rcv_tsval = ntohl(*(__u32 *)ptr);
560 tp->rcv_tsecr = ntohl(*(__u32 *)(ptr+4));
563 break;
564 case TCPOPT_SACK_PERM:
565 if(opsize==TCPOLEN_SACK_PERM && th->syn) {
566 if (sysctl_tcp_sack && !no_fancy) {
567 tp->sack_ok = 1;
568 tp->num_sacks = 0;
571 break;
573 case TCPOPT_SACK:
574 if((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
575 sysctl_tcp_sack && (sk != NULL) && !th->syn) {
576 int sack_bytes = opsize - TCPOLEN_SACK_BASE;
578 if(!(sack_bytes % TCPOLEN_SACK_PERBLOCK)) {
579 int num_sacks = sack_bytes >> 3;
580 struct tcp_sack_block *sackp;
582 sackp = (struct tcp_sack_block *)ptr;
583 tcp_sacktag_write_queue(sk, sackp, num_sacks);
587 ptr+=opsize-2;
588 length-=opsize;
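/* A self-contained sketch of the option walk above (hypothetical function
 * and buffer, not kernel code): options are (kind, length, data) triples,
 * except that EOL (0) and NOP (1) occupy a single byte, which is why the
 * loop only advances by opsize in the default case.
 */
static int sketch_find_mss(const unsigned char *opt, int length,
			   unsigned short *mss)
{
	while (length > 0) {
		int kind = *opt++;
		int opsize;

		if (kind == 0)				/* TCPOPT_EOL */
			return -1;
		if (kind == 1) {			/* TCPOPT_NOP */
			length--;
			continue;
		}
		if (length < 2)				/* truncated option */
			return -1;
		opsize = *opt++;
		if (opsize < 2 || opsize > length)	/* silly or partial option */
			return -1;
		if (kind == 2 && opsize == 4) {		/* TCPOPT_MSS */
			*mss = (opt[0] << 8) | opt[1];
			return 0;
		}
		opt += opsize - 2;
		length -= opsize;
	}
	return -1;
}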
593 /* Fast parse options. This hopes to only see timestamps.
594 * If it is wrong it falls back on tcp_parse_options().
596 static __inline__ int tcp_fast_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp)
598 /* If we didn't send out any options ignore them all. */
599 if (tp->tcp_header_len == sizeof(struct tcphdr))
600 return 0;
601 if (th->doff == sizeof(struct tcphdr)>>2) {
602 tp->saw_tstamp = 0;
603 return 0;
604 } else if (th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) {
605 __u32 *ptr = (__u32 *)(th + 1);
606 if (*ptr == __constant_ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
607 | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
608 tp->saw_tstamp = 1;
609 ++ptr;
610 tp->rcv_tsval = ntohl(*ptr);
611 ++ptr;
612 tp->rcv_tsecr = ntohl(*ptr);
613 return 1;
616 tcp_parse_options(sk, th, tp, 0);
617 return 1;
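/* The 32-bit word tested in tcp_fast_parse_options() above corresponds to
 * the wire bytes 0x01 0x01 0x08 0x0a, i.e. NOP, NOP, TCPOPT_TIMESTAMP,
 * TCPOLEN_TIMESTAMP: a segment whose option block is exactly the aligned
 * timestamp option takes this fast path and skips the full parser.
 */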
620 #define FLAG_DATA 0x01 /* Incoming frame contained data. */
621 #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
622 #define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
623 #define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */
624 #define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged new data. */
626 static __inline__ void clear_fast_retransmit(struct tcp_opt *tp)
628 if (tp->dup_acks > 3)
629 tp->snd_cwnd = (tp->snd_ssthresh);
631 tp->dup_acks = 0;
634 /* NOTE: This code assumes that tp->dup_acks gets cleared when a
635 * retransmit timer fires.
637 static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup)
639 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
641 /* Note: If not_dup is set this implies we got a
642 * data carrying packet or a window update.
643 * This carries no new information about possible
644 * lost packets, so we have to ignore it for the purposes
645 * of counting duplicate acks. Ideally this does not imply we
646 * should stop our fast retransmit phase, more acks may come
647 * later without data to help us. Unfortunately this would make
648 * the code below much more complex. For now if I see such
649 * a packet I clear the fast retransmit phase.
651 if (ack == tp->snd_una && tp->packets_out && (not_dup == 0)) {
652 /* This is the standard reno style fast retransmit branch. */
654 /* 1. When the third duplicate ack is received, set ssthresh
655 * to one half the current congestion window, but no less
656 * than two segments. Retransmit the missing segment.
658 if (tp->high_seq == 0 || after(ack, tp->high_seq)) {
659 tp->dup_acks++;
660 if ((tp->fackets_out > 3) || (tp->dup_acks == 3)) {
661 tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
662 if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
663 tp->snd_ssthresh = tp->snd_cwnd_clamp;
664 tp->snd_cwnd = (tp->snd_ssthresh + 3);
665 tp->high_seq = tp->snd_nxt;
666 if(!tp->fackets_out)
667 tcp_retransmit_skb(sk,
668 skb_peek(&sk->write_queue));
669 else
670 tcp_fack_retransmit(sk);
671 tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
673 } else if (++tp->dup_acks > 3) {
674 /* 2. Each time another duplicate ACK arrives, increment
675 * cwnd by the segment size. [...] Transmit a packet...
677 * Packet transmission will be done on normal flow processing
678 * since we're not in "retransmit mode". We do not use
679 * duplicate ACKs to artificially inflate the congestion
680 * window when doing FACK.
682 if(!tp->fackets_out) {
683 tp->snd_cwnd++;
684 } else {
685 /* Fill any further holes which may have
686 * appeared.
688 * We may want to change this to run every
689 * further multiple-of-3 dup ack increments,
690 * to be more robust against out-of-order
691 * packet delivery. -DaveM
693 tcp_fack_retransmit(sk);
696 } else if (tp->high_seq != 0) {
697 /* In this branch we deal with clearing the Floyd style
698 * block on duplicate fast retransmits, and if requested
699 * we do Hoe style secondary fast retransmits.
701 if (!before(ack, tp->high_seq) || (not_dup & FLAG_DATA) != 0) {
702 /* Once we have acked all the packets up to high_seq
703 * we are done with this fast retransmit phase.
704 * Alternatively, data arrived. In that case we
705 * have to abort the fast retransmit attempt.
706 * Note that we do want to accept a window
707 * update since this is expected with Hoe's algorithm.
709 clear_fast_retransmit(tp);
711 /* After we have cleared up to high_seq we can
712 * clear the Floyd style block.
714 if (!before(ack, tp->high_seq)) {
715 tp->high_seq = 0;
716 tp->fackets_out = 0;
718 } else if (tp->dup_acks >= 3) {
719 if (!tp->fackets_out) {
720 /* Hoe Style. We didn't ack the whole
721 * window. Take this as a cue that
722 * another packet was lost and retransmit it.
723 * Don't muck with the congestion window here.
724 * Note that we have to be careful not to
725 * act if this was a window update and it
726 * didn't ack new data, since this does
727 * not indicate a packet left the system.
728 * We can test this by just checking
729 * if ack changed from snd_una, since
730 * the only way to get here without advancing
731 * from snd_una is if this was a window update.
733 if (ack != tp->snd_una && before(ack, tp->high_seq)) {
734 tcp_retransmit_skb(sk,
735 skb_peek(&sk->write_queue));
736 tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
738 } else {
739 /* FACK style, fill any remaining holes in
740 * receiver's queue.
742 tcp_fack_retransmit(sk);
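/* Worked example with assumed numbers for the Reno entry above: if
 * snd_cwnd == 10 when the third duplicate ACK arrives, ssthresh becomes
 * half of it (5, per the "one half ... no less than two segments" rule
 * described above) and snd_cwnd becomes ssthresh + 3 == 8, the "+ 3"
 * standing for the three segments the duplicate ACKs show to have left
 * the network.  In the non-FACK case each further duplicate ACK inflates
 * snd_cwnd by one until the hole is filled, after which
 * clear_fast_retransmit() deflates snd_cwnd back to ssthresh.
 */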
748 /* This is Jacobson's slow start and congestion avoidance.
749 * SIGCOMM '88, p. 328.
751 static __inline__ void tcp_cong_avoid(struct tcp_opt *tp)
753 if (tp->snd_cwnd <= tp->snd_ssthresh) {
754 /* In "safe" area, increase. */
755 tp->snd_cwnd++;
756 } else {
757 /* In dangerous area, increase slowly.
758 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
760 if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
761 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
762 tp->snd_cwnd++;
763 tp->snd_cwnd_cnt=0;
764 } else
765 tp->snd_cwnd_cnt++;
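/* A minimal sketch of the growth pattern implemented above (hypothetical
 * helper, not part of this file): below ssthresh the window grows by one
 * segment per ACK (exponential per RTT); above it, a counter accumulates
 * ACKs so the window grows by one segment per window's worth of ACKs,
 * i.e. roughly linearly, and never past the clamp.
 */
static void cong_avoid_sketch(unsigned int *cwnd, unsigned int *cnt,
			      unsigned int ssthresh, unsigned int clamp)
{
	if (*cwnd <= ssthresh) {
		(*cwnd)++;			/* slow start */
	} else if (*cnt >= *cwnd) {		/* congestion avoidance */
		if (*cwnd < clamp)
			(*cwnd)++;
		*cnt = 0;
	} else {
		(*cnt)++;
	}
}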
769 /* Remove acknowledged frames from the retransmission queue. */
770 static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack,
771 __u32 *seq, __u32 *seq_rtt)
773 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
774 struct sk_buff *skb;
775 __u32 now = tcp_time_stamp;
776 int acked = 0;
778 /* If we are retransmitting, and this ACK clears up to
779 * the retransmit head, or further, then clear our state.
781 if (tp->retrans_head != NULL &&
782 !before(ack, TCP_SKB_CB(tp->retrans_head)->end_seq))
783 tp->retrans_head = NULL;
785 while((skb=skb_peek(&sk->write_queue)) && (skb != tp->send_head)) {
786 struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
787 __u8 sacked = scb->sacked;
789 /* If our packet is before the ack sequence we can
790 * discard it as it's confirmed to have arrived at
791 * the other end.
793 if (after(scb->end_seq, ack))
794 break;
796 /* Initial outgoing SYN's get put onto the write_queue
797 * just like anything else we transmit. It is not
798 * true data, and if we misinform our callers that
799 * this ACK acks real data, we will erroneously exit
800 * connection startup slow start one packet too
801 * quickly. This is severely frowned upon behavior.
803 if((sacked & TCPCB_SACKED_RETRANS) && tp->retrans_out)
804 tp->retrans_out--;
805 if(!(scb->flags & TCPCB_FLAG_SYN)) {
806 acked |= FLAG_DATA_ACKED;
807 if(sacked & TCPCB_SACKED_RETRANS)
808 acked |= FLAG_RETRANS_DATA_ACKED;
809 if(tp->fackets_out)
810 tp->fackets_out--;
811 } else {
812 acked |= FLAG_SYN_ACKED;
813 /* This is pure paranoia. */
814 tp->retrans_head = NULL;
816 tp->packets_out--;
817 *seq = scb->seq;
818 *seq_rtt = now - scb->when;
819 __skb_unlink(skb, skb->list);
820 kfree_skb(skb);
822 return acked;
825 static void tcp_ack_probe(struct sock *sk, __u32 ack)
827 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
829 /* Our probe was answered. */
830 tp->probes_out = 0;
832 /* Was it a usable window open? */
834 /* should always be non-null */
835 if (tp->send_head != NULL &&
836 !before (ack + tp->snd_wnd, TCP_SKB_CB(tp->send_head)->end_seq)) {
837 tp->backoff = 0;
838 tp->pending = 0;
839 tcp_clear_xmit_timer(sk, TIME_PROBE0);
840 } else {
841 tcp_reset_xmit_timer(sk, TIME_PROBE0,
842 min(tp->rto << tp->backoff, 120*HZ));
846 /* Should we open up the congestion window? */
847 static __inline__ int should_advance_cwnd(struct tcp_opt *tp, int flag)
849 /* Data must have been acked. */
850 if ((flag & FLAG_DATA_ACKED) == 0)
851 return 0;
853 /* Some of the data acked was retransmitted somehow? */
854 if ((flag & FLAG_RETRANS_DATA_ACKED) != 0) {
855 /* We advance in all cases except during
856 * non-FACK fast retransmit/recovery.
858 if (tp->fackets_out != 0 ||
859 tp->retransmits != 0)
860 return 1;
862 /* Non-FACK fast retransmit does its own
863 * congestion window management, don't get
864 * in the way.
866 return 0;
869 /* New non-retransmitted data acked, always advance. */
870 return 1;
873 /* Read draft-ietf-tcplw-high-performance before mucking
874 * with this code. (Supersedes RFC1323)
876 static void tcp_ack_saw_tstamp(struct sock *sk, struct tcp_opt *tp,
877 u32 seq, u32 ack, int flag)
879 __u32 seq_rtt;
881 /* RTTM Rule: A TSecr value received in a segment is used to
882 * update the averaged RTT measurement only if the segment
883 * acknowledges some new data, i.e., only if it advances the
884 * left edge of the send window.
886 * See draft-ietf-tcplw-high-performance-00, section 3.3.
887 * 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
889 if (!(flag & (FLAG_DATA_ACKED|FLAG_SYN_ACKED)))
890 return;
892 seq_rtt = tcp_time_stamp - tp->rcv_tsecr;
893 tcp_rtt_estimator(tp, seq_rtt);
894 if (tp->retransmits) {
895 if (tp->packets_out == 0) {
896 tp->retransmits = 0;
897 tp->fackets_out = 0;
898 tp->retrans_out = 0;
899 tp->backoff = 0;
900 tcp_set_rto(tp);
901 } else {
902 /* Still retransmitting, use backoff */
903 tcp_set_rto(tp);
904 tp->rto = tp->rto << tp->backoff;
906 } else {
907 tcp_set_rto(tp);
910 tcp_bound_rto(tp);
913 static __inline__ void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp)
915 struct sk_buff *skb = skb_peek(&sk->write_queue);
917 /* Some data was ACK'd, if still retransmitting (due to a
918 * timeout), resend more of the retransmit queue. The
919 * congestion window is handled properly by that code.
921 if (tp->retransmits) {
922 tcp_xmit_retransmit_queue(sk);
923 tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
924 } else {
925 __u32 when = tp->rto - (tcp_time_stamp - TCP_SKB_CB(skb)->when);
926 if ((__s32)when < 0)
927 when = 1;
928 tcp_reset_xmit_timer(sk, TIME_RETRANS, when);
932 /* This routine deals with incoming acks, but not outgoing ones. */
933 static int tcp_ack(struct sock *sk, struct tcphdr *th,
934 u32 ack_seq, u32 ack, int len)
936 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
937 int flag = 0;
938 u32 seq = 0;
939 u32 seq_rtt = 0;
941 if(sk->zapped)
942 return(1); /* Dead, can't ack any more so why bother */
944 if (tp->pending == TIME_KEEPOPEN)
945 tp->probes_out = 0;
947 tp->rcv_tstamp = tcp_time_stamp;
949 /* If the ack is newer than sent or older than previous acks
950 * then we can probably ignore it.
952 if (after(ack, tp->snd_nxt) || before(ack, tp->snd_una))
953 goto uninteresting_ack;
955 /* If the segment carries data, set FLAG_DATA. */
956 if (len != th->doff*4) {
957 flag |= FLAG_DATA;
958 tcp_delack_estimator(tp);
961 /* Update our send window. */
963 /* This is the window update code as per RFC 793
964 * snd_wl{1,2} are used to prevent unordered
965 * segments from shrinking the window
967 if (before(tp->snd_wl1, ack_seq) ||
968 (tp->snd_wl1 == ack_seq && !after(tp->snd_wl2, ack))) {
969 u32 nwin = ntohs(th->window) << tp->snd_wscale;
971 if ((tp->snd_wl2 != ack) || (nwin > tp->snd_wnd)) {
972 flag |= FLAG_WIN_UPDATE;
973 tp->snd_wnd = nwin;
975 tp->snd_wl1 = ack_seq;
976 tp->snd_wl2 = ack;
978 if (nwin > tp->max_window)
979 tp->max_window = nwin;
983 /* We passed data and got it acked, remove any soft error
984 * log. Something worked...
986 sk->err_soft = 0;
988 /* If this ack opens up a zero window, clear backoff. It was
989 * being used to time the probes, and is probably far higher than
990 * it needs to be for normal retransmission.
992 if (tp->pending == TIME_PROBE0)
993 tcp_ack_probe(sk, ack);
995 /* See if we can take anything off of the retransmit queue. */
996 flag |= tcp_clean_rtx_queue(sk, ack, &seq, &seq_rtt);
998 /* We must do this here, before code below clears out important
999 * state contained in tp->fackets_out and tp->retransmits. -DaveM
1001 if (should_advance_cwnd(tp, flag))
1002 tcp_cong_avoid(tp);
1004 /* If we have a timestamp, we always do rtt estimates. */
1005 if (tp->saw_tstamp) {
1006 tcp_ack_saw_tstamp(sk, tp, seq, ack, flag);
1007 } else {
1008 /* If we were retransmitting, don't count the rtt estimate. */
1009 if (tp->retransmits) {
1010 if (tp->packets_out == 0) {
1011 tp->retransmits = 0;
1012 tp->fackets_out = 0;
1013 tp->retrans_out = 0;
1015 } else {
1016 /* We don't have a timestamp. Can only use
1017 * packets that are not retransmitted to determine
1018 * rtt estimates. Also, we must not reset the
1019 * backoff for rto until we get a non-retransmitted
1020 * packet. This allows us to deal with a situation
1021 * where the network delay has increased suddenly.
1022 * I.e. Karn's algorithm. (SIGCOMM '87, p5.)
1024 if (flag & (FLAG_DATA_ACKED|FLAG_SYN_ACKED)) {
1025 if(!(flag & FLAG_RETRANS_DATA_ACKED)) {
1026 tp->backoff = 0;
1027 tcp_rtt_estimator(tp, seq_rtt);
1028 tcp_set_rto(tp);
1029 tcp_bound_rto(tp);
1035 if (tp->packets_out) {
1036 if (flag & FLAG_DATA_ACKED)
1037 tcp_ack_packets_out(sk, tp);
1038 } else {
1039 tcp_clear_xmit_timer(sk, TIME_RETRANS);
1042 flag &= (FLAG_DATA | FLAG_WIN_UPDATE);
1043 if ((ack == tp->snd_una && tp->packets_out && flag == 0) ||
1044 (tp->high_seq != 0)) {
1045 tcp_fast_retrans(sk, ack, flag);
1046 } else {
1047 /* Clear any aborted fast retransmit starts. */
1048 tp->dup_acks = 0;
1050 /* It is not a brain fart, I thought a bit now. 8)
1052 * Forward progress is indicated, if:
1053 * 1. the ack acknowledges new data.
1054 * 2. or the ack is duplicate, but it is caused by new segment
1055 * arrival. This case is filtered by:
1056 * - it contains no data, syn or fin.
1057 * - it does not update window.
1058 * 3. or new SACK. It is difficult to check, so that we ignore it.
1060 * Forward progress is also indicated by the arrival of new data,
1061 * which was caused by window open from our side. This case is more
1062 * difficult and it is made (alas, incorrectly) in tcp_data_queue().
1063 * --ANK (990513)
1065 if (ack != tp->snd_una || (flag == 0 && !th->fin))
1066 dst_confirm(sk->dst_cache);
1068 /* Remember the highest ack received. */
1069 tp->snd_una = ack;
1070 return 1;
1072 uninteresting_ack:
1073 SOCK_DEBUG(sk, "Ack ignored %u %u\n", ack, tp->snd_nxt);
1074 return 0;
1077 /* New-style handling of TIME_WAIT sockets. */
1079 /* Must be called only from BH context. */
1080 void tcp_timewait_kill(struct tcp_tw_bucket *tw)
1082 struct tcp_ehash_bucket *ehead;
1083 struct tcp_bind_hashbucket *bhead;
1084 struct tcp_bind_bucket *tb;
1086 /* Unlink from established hashes. */
1087 ehead = &tcp_ehash[tw->hashent];
1088 write_lock(&ehead->lock);
1089 if (!tw->pprev) {
1090 write_unlock(&ehead->lock);
1091 return;
1093 if(tw->next)
1094 tw->next->pprev = tw->pprev;
1095 *(tw->pprev) = tw->next;
1096 tw->pprev = NULL;
1097 write_unlock(&ehead->lock);
1099 /* Disassociate with bind bucket. */
1100 bhead = &tcp_bhash[tcp_bhashfn(tw->num)];
1101 spin_lock(&bhead->lock);
1102 if ((tb = tw->tb) != NULL) {
1103 if(tw->bind_next)
1104 tw->bind_next->bind_pprev = tw->bind_pprev;
1105 *(tw->bind_pprev) = tw->bind_next;
1106 tw->tb = NULL;
1107 if (tb->owners == NULL) {
1108 if (tb->next)
1109 tb->next->pprev = tb->pprev;
1110 *(tb->pprev) = tb->next;
1111 kmem_cache_free(tcp_bucket_cachep, tb);
1114 spin_unlock(&bhead->lock);
1116 #ifdef INET_REFCNT_DEBUG
1117 if (atomic_read(&tw->refcnt) != 1) {
1118 printk(KERN_DEBUG "tw_bucket %p refcnt=%d\n", tw, atomic_read(&tw->refcnt));
1120 #endif
1121 tcp_tw_put(tw);
1124 /* We come here as a special case from the AF specific TCP input processing,
1125 * and the SKB has no owner. Essentially handling this is very simple,
1126 * we just keep silently eating rx'd packets until none show up for the
1127 * entire timeout period. The only special cases are for BSD TIME_WAIT
1128 * reconnects and SYN/RST bits being set in the TCP header.
1132 * * Main purpose of TIME-WAIT state is to close connection gracefully,
1133 * when one of the ends sits in LAST-ACK or CLOSING retransmitting a FIN
1134 * (and, probably, a tail of data) and one or more of our ACKs are lost.
1135 * * What is the TIME-WAIT timeout? It is associated with maximal packet
1136 * lifetime in the internet, which leads to the wrong conclusion that
1137 * it is set to catch "old duplicate segments" wandering out of their path.
1138 * That is not quite correct. This timeout is calculated so that it exceeds
1139 * the maximal retransmission timeout by enough to allow one (or more)
1140 * segments sent by the peer, and our ACKs, to be lost. This time may be calculated from RTO.
1141 * * When TIME-WAIT socket receives RST, it means that another end
1142 * finally closed and we are allowed to kill TIME-WAIT too.
1143 * * Second purpose of TIME-WAIT is catching old duplicate segments.
1144 * Well, certainly it is pure paranoia, but if we load TIME-WAIT
1145 * with this semantics, we MUST NOT kill TIME-WAIT state with RSTs.
1146 * * If we invented some more clever way to catch duplicates
1147 * (f.e. based on PAWS), we could truncate TIME-WAIT to several RTOs.
1149 * The algorithm below is based on FORMAL INTERPRETATION of RFCs.
1150 * When you compare it to RFCs, please, read section SEGMENT ARRIVES
1151 * from the very beginning.
1153 enum tcp_tw_status
1154 tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
1155 struct tcphdr *th, unsigned len)
1157 struct tcp_opt tp;
1158 int paws_reject = 0;
1160 /* RFC 1122:
1161 * "When a connection is [...] on TIME-WAIT state [...]
1162 * [a TCP] MAY accept a new SYN from the remote TCP to
1163 * reopen the connection directly, if it:
1165 * (1) assigns its initial sequence number for the new
1166 * connection to be larger than the largest sequence
1167 * number it used on the previous connection incarnation,
1168 * and
1170 * (2) returns to TIME-WAIT state if the SYN turns out
1171 * to be an old duplicate".
1174 tp.saw_tstamp = 0;
1175 if (th->doff > (sizeof(struct tcphdr)>>2) && tw->ts_recent_stamp) {
1176 tcp_parse_options(NULL, th, &tp, 0);
1178 paws_reject = tp.saw_tstamp &&
1179 ((s32)(tp.rcv_tsval - tw->ts_recent) < 0 &&
1180 xtime.tv_sec < tw->ts_recent_stamp + PAWS_24DAYS);
1183 if (!paws_reject &&
1184 (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq &&
1185 TCP_SKB_CB(skb)->seq == tw->rcv_nxt)) {
1186 /* In window segment, it may be only reset or bare ack. */
1188 if (th->rst) {
1189 #ifdef CONFIG_TCP_TW_RECYCLE
1190 /* When recycling, always follow rfc1337,
1191 * but mark the bucket as ready for recycling immediately.
1193 if (sysctl_tcp_tw_recycle) {
1194 /* May kill it now. */
1195 tw->rto = 0;
1196 tw->ttd = jiffies;
1197 } else
1198 #endif
1199 /* This is TIME_WAIT assassination, in two flavors.
1200 * Oh well... nobody has a sufficient solution to this
1201 * protocol bug yet.
1203 if(sysctl_tcp_rfc1337 == 0) {
1204 tcp_tw_deschedule(tw);
1205 tcp_timewait_kill(tw);
1207 } else {
1208 tcp_tw_reschedule(tw);
1211 if (tp.saw_tstamp) {
1212 tw->ts_recent = tp.rcv_tsval;
1213 tw->ts_recent_stamp = xtime.tv_sec;
1215 tcp_tw_put(tw);
1216 return TCP_TW_SUCCESS;
1219 /* Out of window segment.
1221 All the segments are ACKed immediately.
1223 The only exception is a new SYN. We accept it, if it is
1224 not an old duplicate and we are not in danger of being killed
1225 by delayed old duplicates. The RFC check (that it has a
1226 newer sequence number) works at rates <40Mbit/sec.
1227 However, if PAWS works, it is reliable AND, even more,
1228 we may even relax the silly seq space cutoff.
1230 RED-PEN: we violate the main RFC requirement: if this SYN turns out to be
1231 an old duplicate (i.e. we receive an RST in reply to the SYN-ACK),
1232 we must return the socket to time-wait state. That is not good,
1233 but not fatal yet.
1236 if (th->syn && !th->rst && !th->ack && !paws_reject &&
1237 (after(TCP_SKB_CB(skb)->seq, tw->rcv_nxt) ||
1238 (tp.saw_tstamp && tw->ts_recent != tp.rcv_tsval))) {
1239 u32 isn = tw->snd_nxt + 2;
1240 if (isn == 0)
1241 isn++;
1242 TCP_SKB_CB(skb)->when = isn;
1243 return TCP_TW_SYN;
1246 if(!th->rst) {
1247 /* In this case we must reset the TIMEWAIT timer.
1249 If it is an ACKless SYN it may be both an old duplicate
1250 and a new good SYN with a random sequence number <rcv_nxt.
1251 Do not reschedule in the latter case.
1253 if (paws_reject || th->ack) {
1254 tcp_tw_reschedule(tw);
1255 #ifdef CONFIG_TCP_TW_RECYCLE
1256 tw->rto = min(120*HZ, tw->rto<<1);
1257 tw->ttd = jiffies + tw->rto;
1258 #endif
1261 /* Send ACK. Note, we do not put the bucket,
1262 * it will be released by caller.
1264 return TCP_TW_ACK;
1266 tcp_tw_put(tw);
1267 return TCP_TW_SUCCESS;
1270 /* Enter the time wait state. This is always called from BH
1271 * context. Essentially we whip up a timewait bucket, copy the
1272 * relevant info into it from the SK, and mess with hash chains
1273 * and list linkage.
1275 static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw)
1277 struct tcp_ehash_bucket *ehead = &tcp_ehash[sk->hashent];
1278 struct tcp_bind_hashbucket *bhead;
1279 struct sock **head, *sktw;
1281 write_lock(&ehead->lock);
1283 /* Step 1: Remove SK from established hash. */
1284 if (sk->pprev) {
1285 if(sk->next)
1286 sk->next->pprev = sk->pprev;
1287 *sk->pprev = sk->next;
1288 sk->pprev = NULL;
1291 /* Step 2: Hash TW into TIMEWAIT half of established hash table. */
1292 head = &(ehead + tcp_ehash_size)->chain;
1293 sktw = (struct sock *)tw;
1294 if((sktw->next = *head) != NULL)
1295 (*head)->pprev = &sktw->next;
1296 *head = sktw;
1297 sktw->pprev = head;
1298 atomic_inc(&tw->refcnt);
1300 write_unlock(&ehead->lock);
1302 /* Step 3: Put TW into bind hash. Original socket stays there too.
1303 Note, that any socket with sk->num!=0 MUST be bound in binding
1304 cache, even if it is closed.
1306 bhead = &tcp_bhash[tcp_bhashfn(sk->num)];
1307 spin_lock(&bhead->lock);
1308 tw->tb = (struct tcp_bind_bucket *)sk->prev;
1309 BUG_TRAP(sk->prev!=NULL);
1310 if ((tw->bind_next = tw->tb->owners) != NULL)
1311 tw->tb->owners->bind_pprev = &tw->bind_next;
1312 tw->tb->owners = (struct sock*)tw;
1313 tw->bind_pprev = &tw->tb->owners;
1314 spin_unlock(&bhead->lock);
1316 /* Step 4: Un-charge protocol socket in-use count. */
1317 sk->prot->inuse--;
1321 * Move a socket to time-wait.
1323 void tcp_time_wait(struct sock *sk)
1325 struct tcp_tw_bucket *tw;
1327 tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC);
1328 if(tw != NULL) {
1329 /* Give us an identity. */
1330 tw->daddr = sk->daddr;
1331 tw->rcv_saddr = sk->rcv_saddr;
1332 tw->bound_dev_if= sk->bound_dev_if;
1333 tw->num = sk->num;
1334 tw->state = TCP_TIME_WAIT;
1335 tw->sport = sk->sport;
1336 tw->dport = sk->dport;
1337 tw->family = sk->family;
1338 tw->reuse = sk->reuse;
1339 tw->hashent = sk->hashent;
1340 tw->rcv_nxt = sk->tp_pinfo.af_tcp.rcv_nxt;
1341 tw->snd_nxt = sk->tp_pinfo.af_tcp.snd_nxt;
1342 tw->ts_recent = sk->tp_pinfo.af_tcp.ts_recent;
1343 tw->ts_recent_stamp= sk->tp_pinfo.af_tcp.ts_recent_stamp;
1344 #ifdef CONFIG_TCP_TW_RECYCLE
1345 tw->rto = sk->tp_pinfo.af_tcp.rto;
1346 tw->ttd = jiffies + 2*tw->rto;
1347 #endif
1348 atomic_set(&tw->refcnt, 0);
1350 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1351 if(tw->family == PF_INET6) {
1352 memcpy(&tw->v6_daddr,
1353 &sk->net_pinfo.af_inet6.daddr,
1354 sizeof(struct in6_addr));
1355 memcpy(&tw->v6_rcv_saddr,
1356 &sk->net_pinfo.af_inet6.rcv_saddr,
1357 sizeof(struct in6_addr));
1359 #endif
1360 /* Linkage updates. */
1361 __tcp_tw_hashdance(sk, tw);
1363 /* Get the TIME_WAIT timeout firing. */
1364 tcp_tw_schedule(tw);
1366 /* CLOSE the SK. */
1367 if(sk->state == TCP_ESTABLISHED)
1368 tcp_statistics.TcpCurrEstab--;
1369 sk->state = TCP_CLOSE;
1370 } else {
1371 /* Sorry, we're out of memory, just CLOSE this
1372 * socket up. We've got bigger problems than
1373 * non-graceful socket closings.
1375 tcp_set_state(sk, TCP_CLOSE);
1378 tcp_update_metrics(sk);
1379 tcp_clear_xmit_timers(sk);
1380 tcp_done(sk);
1384 * Process the FIN bit. This now behaves as it is supposed to work
1385 * and the FIN takes effect only when it is validly part of the sequence
1386 * space, not earlier, while there are still holes.
1388 * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
1389 * (and thence onto LAST-ACK and finally, CLOSE, we never enter
1390 * TIME-WAIT)
1392 * If we are in FINWAIT-1, a received FIN indicates simultaneous
1393 * close and we go into CLOSING (and later onto TIME-WAIT)
1395 * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
1398 static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
1400 sk->tp_pinfo.af_tcp.fin_seq = TCP_SKB_CB(skb)->end_seq;
1402 tcp_send_ack(sk);
1404 if (!sk->dead) {
1405 wake_up_interruptible(sk->sleep);
1406 sock_wake_async(sk->socket, 1, POLL_HUP);
1409 switch(sk->state) {
1410 case TCP_SYN_RECV:
1411 case TCP_ESTABLISHED:
1412 /* Move to CLOSE_WAIT */
1413 tcp_set_state(sk, TCP_CLOSE_WAIT);
1414 break;
1416 case TCP_CLOSE_WAIT:
1417 case TCP_CLOSING:
1418 /* Received a retransmission of the FIN, do
1419 * nothing.
1421 break;
1422 case TCP_LAST_ACK:
1423 /* RFC793: Remain in the LAST-ACK state. */
1424 break;
1426 case TCP_FIN_WAIT1:
1427 /* This case occurs when a simultaneous close
1428 * happens, we must ack the received FIN and
1429 * enter the CLOSING state.
1431 tcp_set_state(sk, TCP_CLOSING);
1432 break;
1433 case TCP_FIN_WAIT2:
1434 /* Received a FIN -- send ACK and enter TIME_WAIT. */
1435 tcp_time_wait(sk);
1436 break;
1437 default:
1438 /* Only TCP_LISTEN and TCP_CLOSE are left, in these
1439 * cases we should never reach this piece of code.
1441 printk("tcp_fin: Impossible, sk->state=%d\n", sk->state);
1442 break;
1446 /* These routines update the SACK block as out-of-order packets arrive or
1447 * in-order packets close up the sequence space.
1449 static void tcp_sack_maybe_coalesce(struct tcp_opt *tp, struct tcp_sack_block *sp)
1451 int this_sack, num_sacks = tp->num_sacks;
1452 struct tcp_sack_block *swalk = &tp->selective_acks[0];
1454 /* If more than one SACK block, see if the recent change to SP eats into
1455 * or hits the sequence space of other SACK blocks, if so coalesce.
1457 if(num_sacks != 1) {
1458 for(this_sack = 0; this_sack < num_sacks; this_sack++, swalk++) {
1459 if(swalk == sp)
1460 continue;
1462 /* First case, bottom of SP moves into top of the
1463 * sequence space of SWALK.
1465 if(between(sp->start_seq, swalk->start_seq, swalk->end_seq)) {
1466 sp->start_seq = swalk->start_seq;
1467 goto coalesce;
1469 /* Second case, top of SP moves into bottom of the
1470 * sequence space of SWALK.
1472 if(between(sp->end_seq, swalk->start_seq, swalk->end_seq)) {
1473 sp->end_seq = swalk->end_seq;
1474 goto coalesce;
1478 /* SP is the only SACK, or no coalescing cases found. */
1479 return;
1481 coalesce:
1482 /* Zap SWALK, by moving every further SACK up by one slot.
1483 * Decrease num_sacks.
1485 for(; this_sack < num_sacks-1; this_sack++, swalk++) {
1486 struct tcp_sack_block *next = (swalk + 1);
1487 swalk->start_seq = next->start_seq;
1488 swalk->end_seq = next->end_seq;
1490 tp->num_sacks--;
1493 static __inline__ void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2)
1495 __u32 tmp;
1497 tmp = sack1->start_seq;
1498 sack1->start_seq = sack2->start_seq;
1499 sack2->start_seq = tmp;
1501 tmp = sack1->end_seq;
1502 sack1->end_seq = sack2->end_seq;
1503 sack2->end_seq = tmp;
1506 static void tcp_sack_new_ofo_skb(struct sock *sk, struct sk_buff *skb)
1508 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1509 struct tcp_sack_block *sp = &tp->selective_acks[0];
1510 int cur_sacks = tp->num_sacks;
1512 if (!cur_sacks)
1513 goto new_sack;
1515 /* Optimize for the common case, new ofo frames arrive
1516 * "in order". ;-) This also satisfies the requirements
1517 * of RFC2018 about ordering of SACKs.
1519 if(sp->end_seq == TCP_SKB_CB(skb)->seq) {
1520 sp->end_seq = TCP_SKB_CB(skb)->end_seq;
1521 tcp_sack_maybe_coalesce(tp, sp);
1522 } else if(sp->start_seq == TCP_SKB_CB(skb)->end_seq) {
1523 /* Re-ordered arrival, in this case, can be optimized
1524 * as well.
1526 sp->start_seq = TCP_SKB_CB(skb)->seq;
1527 tcp_sack_maybe_coalesce(tp, sp);
1528 } else {
1529 struct tcp_sack_block *swap = sp + 1;
1530 int this_sack, max_sacks = (tp->tstamp_ok ? 3 : 4);
1532 /* Oh well, we have to move things around.
1533 * Try to find a SACK we can tack this onto.
1536 for(this_sack = 1; this_sack < cur_sacks; this_sack++, swap++) {
1537 if((swap->end_seq == TCP_SKB_CB(skb)->seq) ||
1538 (swap->start_seq == TCP_SKB_CB(skb)->end_seq)) {
1539 if(swap->end_seq == TCP_SKB_CB(skb)->seq)
1540 swap->end_seq = TCP_SKB_CB(skb)->end_seq;
1541 else
1542 swap->start_seq = TCP_SKB_CB(skb)->seq;
1543 tcp_sack_swap(sp, swap);
1544 tcp_sack_maybe_coalesce(tp, sp);
1545 return;
1549 /* Could not find an adjacent existing SACK, build a new one,
1550 * put it at the front, and shift everyone else down. We
1551 * always know there is at least one SACK present already here.
1553 * If the sack array is full, forget about the last one.
1555 if (cur_sacks >= max_sacks) {
1556 cur_sacks--;
1557 tp->num_sacks--;
1559 while(cur_sacks >= 1) {
1560 struct tcp_sack_block *this = &tp->selective_acks[cur_sacks];
1561 struct tcp_sack_block *prev = (this - 1);
1562 this->start_seq = prev->start_seq;
1563 this->end_seq = prev->end_seq;
1564 cur_sacks--;
1567 new_sack:
1568 /* Build the new head SACK, and we're done. */
1569 sp->start_seq = TCP_SKB_CB(skb)->seq;
1570 sp->end_seq = TCP_SKB_CB(skb)->end_seq;
1571 tp->num_sacks++;
1575 static void tcp_sack_remove_skb(struct tcp_opt *tp, struct sk_buff *skb)
1577 struct tcp_sack_block *sp = &tp->selective_acks[0];
1578 int num_sacks = tp->num_sacks;
1579 int this_sack;
1581 /* This is an in order data segment _or_ an out-of-order SKB being
1582 * moved to the receive queue, so we know this removed SKB will eat
1583 * from the front of a SACK.
1585 for(this_sack = 0; this_sack < num_sacks; this_sack++, sp++) {
1586 /* Check if the start of the sack is covered by skb. */
1587 if(!before(sp->start_seq, TCP_SKB_CB(skb)->seq) &&
1588 before(sp->start_seq, TCP_SKB_CB(skb)->end_seq))
1589 break;
1592 /* This should only happen if so many SACKs get built that some get
1593 * pushed out before we get here, or we eat some in sequence packets
1594 * which are before the first SACK block.
1596 if(this_sack >= num_sacks)
1597 return;
1599 sp->start_seq = TCP_SKB_CB(skb)->end_seq;
1600 if(!before(sp->start_seq, sp->end_seq)) {
1601 /* Zap this SACK, by moving forward any other SACKS. */
1602 for(this_sack += 1; this_sack < num_sacks; this_sack++, sp++) {
1603 struct tcp_sack_block *next = (sp + 1);
1604 sp->start_seq = next->start_seq;
1605 sp->end_seq = next->end_seq;
1607 tp->num_sacks--;
1611 static void tcp_sack_extend(struct tcp_opt *tp, struct sk_buff *old_skb, struct sk_buff *new_skb)
1613 struct tcp_sack_block *sp = &tp->selective_acks[0];
1614 int num_sacks = tp->num_sacks;
1615 int this_sack;
1617 for(this_sack = 0; this_sack < num_sacks; this_sack++, sp++) {
1618 if(sp->end_seq == TCP_SKB_CB(old_skb)->end_seq)
1619 break;
1621 if(this_sack >= num_sacks)
1622 return;
1623 sp->end_seq = TCP_SKB_CB(new_skb)->end_seq;
1626 /* This one checks to see if we can put data from the
1627 * out_of_order queue into the receive_queue.
1629 static void tcp_ofo_queue(struct sock *sk)
1631 struct sk_buff *skb;
1632 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1634 while ((skb = skb_peek(&tp->out_of_order_queue))) {
1635 if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
1636 break;
1638 if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
1639 SOCK_DEBUG(sk, "ofo packet was already received \n");
1640 __skb_unlink(skb, skb->list);
1641 kfree_skb(skb);
1642 continue;
1644 SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
1645 tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
1646 TCP_SKB_CB(skb)->end_seq);
1648 if(tp->sack_ok)
1649 tcp_sack_remove_skb(tp, skb);
1650 __skb_unlink(skb, skb->list);
1651 __skb_queue_tail(&sk->receive_queue, skb);
1652 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1653 if(skb->h.th->fin)
1654 tcp_fin(skb, sk, skb->h.th);
1658 static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
1660 struct sk_buff *skb1;
1661 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1663 /* Queue data for delivery to the user.
1664 * Packets in sequence go to the receive queue.
1665 * Out of sequence packets to the out_of_order_queue.
1667 if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
1668 /* Ok. In sequence. */
1669 queue_and_out:
1670 dst_confirm(sk->dst_cache);
1671 __skb_queue_tail(&sk->receive_queue, skb);
1672 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1673 if(skb->h.th->fin) {
1674 tcp_fin(skb, sk, skb->h.th);
1675 } else {
1676 tcp_remember_ack(tp, skb->h.th, skb);
1678 /* This may have eaten into a SACK block. */
1679 if(tp->sack_ok && tp->num_sacks)
1680 tcp_sack_remove_skb(tp, skb);
1681 tcp_ofo_queue(sk);
1683 /* Turn on fast path. */
1684 if (skb_queue_len(&tp->out_of_order_queue) == 0)
1685 tp->pred_flags = htonl(((tp->tcp_header_len >> 2) << 28) |
1686 ntohl(TCP_FLAG_ACK) |
1687 tp->snd_wnd);
1688 return;
1691 /* An old packet, either a retransmit or some packet got lost. */
1692 if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
1693 /* A retransmit, 2nd most common case. Force an immediate ack. */
1694 SOCK_DEBUG(sk, "retransmit received: seq %X\n", TCP_SKB_CB(skb)->seq);
1695 tcp_enter_quickack_mode(tp);
1696 kfree_skb(skb);
1697 return;
1700 if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
1701 /* Partial packet, seq < rcv_next < end_seq */
1702 SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
1703 tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
1704 TCP_SKB_CB(skb)->end_seq);
1706 goto queue_and_out;
1709 /* Ok. This is an out_of_order segment, force an ack. */
1710 tp->delayed_acks++;
1711 tcp_enter_quickack_mode(tp);
1713 /* Disable header prediction. */
1714 tp->pred_flags = 0;
1716 SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
1717 tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
1719 if (skb_peek(&tp->out_of_order_queue) == NULL) {
1720 /* Initial out of order segment, build 1 SACK. */
1721 if(tp->sack_ok) {
1722 tp->num_sacks = 1;
1723 tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
1724 tp->selective_acks[0].end_seq = TCP_SKB_CB(skb)->end_seq;
1726 __skb_queue_head(&tp->out_of_order_queue,skb);
1727 } else {
1728 for(skb1=tp->out_of_order_queue.prev; ; skb1 = skb1->prev) {
1729 /* Already there. */
1730 if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb1)->seq) {
1731 if (skb->len >= skb1->len) {
1732 if(tp->sack_ok)
1733 tcp_sack_extend(tp, skb1, skb);
1734 __skb_append(skb1, skb);
1735 __skb_unlink(skb1, skb1->list);
1736 kfree_skb(skb1);
1737 } else {
1738 /* A duplicate, smaller than what is in the
1739 * out-of-order queue right now, toss it.
1741 kfree_skb(skb);
1743 break;
1746 if (after(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) {
1747 __skb_append(skb1, skb);
1748 if(tp->sack_ok)
1749 tcp_sack_new_ofo_skb(sk, skb);
1750 break;
1753 /* See if we've hit the start. If so insert. */
1754 if (skb1 == skb_peek(&tp->out_of_order_queue)) {
1755 __skb_queue_head(&tp->out_of_order_queue,skb);
1756 if(tp->sack_ok)
1757 tcp_sack_new_ofo_skb(sk, skb);
1758 break;
1766 * This routine handles the data. If there is room in the buffer,
1767 * it will have already been moved into it. If there is no
1768 * room, then we will just have to discard the packet.
1771 static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len)
1773 struct tcphdr *th;
1774 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1776 th = skb->h.th;
1777 skb_pull(skb, th->doff*4);
1778 skb_trim(skb, len - (th->doff*4));
1780 if (skb->len == 0 && !th->fin)
1781 return(0);
1784 * If our receive queue has grown past its limits, shrink it.
1785 * Make sure to do this before moving rcv_nxt, otherwise
1786 * data might be acked that we don't have enough room for.
1788 if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) {
1789 if (prune_queue(sk) < 0) {
1790 /* Still not enough room. That can happen when
1791 * skb->true_size differs significantly from skb->len.
1793 return 0;
1797 tcp_data_queue(sk, skb);
1799 if (before(tp->rcv_nxt, tp->copied_seq)) {
1800 printk(KERN_DEBUG "*** tcp.c:tcp_data bug acked < copied\n");
1801 tp->rcv_nxt = tp->copied_seq;
1804 /* Above, tcp_data_queue() increments delayed_acks appropriately.
1805 * Now tell the user we may have some data.
1807 if (!sk->dead) {
1808 wake_up_interruptible(sk->sleep);
1809 sock_wake_async(sk->socket,1, POLL_IN);
1811 return(1);
1814 static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb)
1816 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1818 if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) &&
1819 tcp_packets_in_flight(tp) < tp->snd_cwnd) {
1820 /* Put more data onto the wire. */
1821 tcp_write_xmit(sk);
1822 } else if (tp->packets_out == 0 && !tp->pending) {
1823 /* Start probing the receivers window. */
1824 tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto);
1828 static __inline__ void tcp_data_snd_check(struct sock *sk)
1830 struct sk_buff *skb = sk->tp_pinfo.af_tcp.send_head;
1832 if (skb != NULL)
1833 __tcp_data_snd_check(sk, skb);
1837 * Adapt the MSS value used to make delayed ack decision to the
1838 * real world.
1840 * The constant 536 has no particularly good meaning. In the IPv4 world
1841 * the MTU may be smaller, though that contradicts RFC1122, which
1842 * states that MSS must be at least 536.
1843 * We use the constant so that we do not ACK every second
1844 * packet in a stream of tiny packets.
1845 * It means that super-low mtu links will be aggressively delacked.
1846 * That even seems to be good: if they have such a low mtu, they are
1847 * weirdly slow.
1849 * AK: BTW it may be useful to add an option to lock the rcv_mss.
1850 * This way the beowulf people wouldn't need ugly patches to get the
1851 * ack frequencies they want and it would be an elegant way to tune delack.
1853 static __inline__ void tcp_measure_rcv_mss(struct sock *sk, struct sk_buff *skb)
1855 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1856 unsigned int len, lss;
1858 lss = tp->last_seg_size;
1859 tp->last_seg_size = 0;
1861 /* skb->len may jitter because of SACKs, even if peer
1862 * sends good full-sized frames.
1864 len = skb->len;
1865 if (len >= tp->rcv_mss) {
1866 tp->rcv_mss = len;
1867 } else {
1868 /* Otherwise, we make a more careful check, taking into account
1869 * that the SACK block is variable.
1871 * "len" is invariant segment length, including TCP header.
1873 len = skb->tail - skb->h.raw;
1874 if (len >= 536 + sizeof(struct tcphdr)) {
1875 /* Subtract also invariant (if peer is RFC compliant),
1876 * tcp header plus fixed timestamp option length.
1877 * Resulting "len" is MSS free of SACK jitter.
1879 len -= tp->tcp_header_len;
1880 if (len == lss)
1881 tp->rcv_mss = len;
1882 tp->last_seg_size = len;
1888 * Check if sending an ack is needed.
1890 static __inline__ void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
1892 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1894 /* This also takes care of updating the window.
1895 * This if statement needs to be simplified.
1897 * Rules for delaying an ack:
1898 * - delay time <= 0.5 HZ
1899 * - we don't have a window update to send
1900 * - must send at least every 2 full sized packets
1901 * - must send an ACK if we have any out of order data
1903 * With an extra heuristic to handle packet loss
1904 * situations and also to help the sender leave slow
1905 * start in an expedient manner.
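/* As a concrete illustration (assuming MAX_DELAY_ACK is 2): with a
 * measured rcv_mss of 1460, an ACK is forced as soon as rcv_nxt has
 * advanced 2920 or more bytes past rcv_wup, or if a significant window
 * update is pending, quick-ACK mode is on, or out-of-order data is
 * queued; otherwise a delayed ACK of at most HZ/2 is scheduled.
 */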
1908 /* Two full frames received or... */
1909 if (((tp->rcv_nxt - tp->rcv_wup) >= tp->rcv_mss * MAX_DELAY_ACK) ||
1910 /* We will update the window "significantly" or... */
1911 tcp_raise_window(sk) ||
1912 /* We entered "quick ACK" mode or... */
1913 tcp_in_quickack_mode(tp) ||
1914 /* We have out of order data */
1915 (ofo_possible && (skb_peek(&tp->out_of_order_queue) != NULL))) {
1916 /* Then ack it now */
1917 tcp_send_ack(sk);
1918 } else {
1919 /* Else, send delayed ack. */
1920 tcp_send_delayed_ack(sk, HZ/2);
1924 static __inline__ void tcp_ack_snd_check(struct sock *sk)
1926 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1927 if (tp->delayed_acks == 0) {
1928 /* We sent a data segment already. */
1929 return;
1931 __tcp_ack_snd_check(sk, 1);
1936 * This routine is only called when we have urgent data
1937 * signalled. It's the 'slow' part of tcp_urg. It could be
1938 * moved inline now as tcp_urg is only called from one
1939 * place. We handle URGent data wrong. We have to - as
1940 * BSD still doesn't use the correction from RFC961.
1941 * For 1003.1g we should support a new option TCP_STDURG to permit
1942 * either form (or just set the sysctl tcp_stdurg).
1945 static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
1947 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1948 u32 ptr = ntohs(th->urg_ptr);
1950 if (ptr && !sysctl_tcp_stdurg)
1951 ptr--;
1952 ptr += ntohl(th->seq);
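/* Example of the BSD interpretation used here (sysctl_tcp_stdurg off):
 * an urgent pointer of N counts one byte past the urgent byte, so after
 * the ptr-- above, ptr is the sequence number of the urgent byte itself,
 * i.e. seq + N - 1.
 */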
1954 /* Ignore urgent data that we've already seen and read. */
1955 if (after(tp->copied_seq, ptr))
1956 return;
1958 /* Do we already have a newer (or duplicate) urgent pointer? */
1959 if (tp->urg_data && !after(ptr, tp->urg_seq))
1960 return;
1962 /* Tell the world about our new urgent pointer. */
1963 if (sk->proc != 0) {
1964 if (sk->proc > 0)
1965 kill_proc(sk->proc, SIGURG, 1);
1966 else
1967 kill_pg(-sk->proc, SIGURG, 1);
1968 sock_wake_async(sk->socket, 3, POLL_PRI);
1971 /* We may be adding urgent data when the last byte read was
1972 * urgent. To do this requires some care. We cannot just ignore
1973 * tp->copied_seq since we would read the last urgent byte again
1974 * as data, nor can we alter copied_seq until this data arrives
1975 * or we break the semantics of SIOCATMARK (and thus sockatmark())
1977 if (tp->urg_seq == tp->copied_seq)
1978 tp->copied_seq++; /* Move the copied sequence on correctly */
1979 tp->urg_data = URG_NOTYET;
1980 tp->urg_seq = ptr;
1982 /* Disable header prediction. */
1983 tp->pred_flags = 0;
1986 /* This is the 'fast' part of urgent handling. */
1987 static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len)
1989 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1991 /* Check if we get a new urgent pointer - normally not. */
1992 if (th->urg)
1993 tcp_check_urg(sk,th);
1995 /* Do we wait for any urgent data? - normally not... */
1996 if (tp->urg_data == URG_NOTYET) {
1997 u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff*4);
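/* tp->urg_seq - seq is the offset of the urgent byte within this
 * segment's data; adding the header length (th->doff*4) turns it into
 * an offset from the start of the TCP header, which is why it can be
 * read directly via (unsigned char *)th below.
 */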
1999 /* Is the urgent pointer pointing into this packet? */
2000 if (ptr < len) {
2001 tp->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
2002 if (!sk->dead)
2003 sk->data_ready(sk,0);
2008 /* Clean the out_of_order queue if we can, trying to get
2009 * the socket within its memory limits again.
2011 * Return less than zero if we should start dropping frames
2012 * until the socket owning process reads some of the data
2013 * to stabilize the situation.
2015 static int prune_queue(struct sock *sk)
2017 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
2018 struct sk_buff * skb;
2020 SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq);
2022 net_statistics.PruneCalled++;
2024 /* First, purge the out_of_order queue. */
2025 skb = __skb_dequeue_tail(&tp->out_of_order_queue);
2026 if(skb != NULL) {
2027 /* Free it all. */
2028 do { net_statistics.OfoPruned += skb->len;
2029 kfree_skb(skb);
2030 skb = __skb_dequeue_tail(&tp->out_of_order_queue);
2031 } while(skb != NULL);
2033 /* Reset SACK state. A conforming SACK implementation will
2034 * do the same at a timeout based retransmit. When a connection
2035 * is in a sad state like this, we care only about integrity
2036 * of the connection not performance.
2038 if(tp->sack_ok)
2039 tp->num_sacks = 0;
2042 /* If we are really being abused, tell the caller to silently
2043 * drop receive data on the floor. It will get retransmitted
2044 * and hopefully then we'll have sufficient space.
2046 * We used to try to purge the in-order packets too, but that
2047 * turns out to be deadly and fraught with races. Consider:
2049 * 1) If we acked the data, we absolutely cannot drop the
2050 * packet. This data would then never be retransmitted.
2051 * 2) It is possible, with a proper sequence of events involving
2052 * delayed acks and backlog queue handling, to have the user
2053 * read the data before it gets acked. The previous code
2054 * here got this wrong, and it led to data corruption.
2055 * 3) Too many state changes happen when the FIN arrives, so once
2056 * we've seen that we can't remove any in-order data safely.
2058 * The net result is that removing in-order receive data is too
2059 * complex for anyone's sanity. So we don't do it anymore. But
2060 * if we are really having our buffer space abused we stop accepting
2061 * new receive data.
2063 * FIXME: it should recompute SACK state and only remove enough
2064 * buffers to get into bounds again. The current scheme loses
2065 * badly sometimes on links with large RTT, especially when
2066 * the driver has high overhead per skb.
2067 * (increasing the rcvbuf is not enough because it inflates
2068 * the window too, effectively disabling flow control) -AK
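/* In other words: after purging the out-of-order queue we tolerate up to
 * twice sk->rcvbuf of allocated receive memory; only beyond that do we
 * tell the caller (via -1) to start dropping in-order data on the floor.
 */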
2070 if(atomic_read(&sk->rmem_alloc) < (sk->rcvbuf << 1))
2071 return 0;
2073 /* Massive buffer overcommit. */
2074 return -1;
2078 * TCP receive function for the ESTABLISHED state.
2080 * It is split into a fast path and a slow path. The fast path is
2081 * disabled when:
2082 * - A zero window was announced from us - zero window probing
2083 * is only handled properly in the slow path.
2084 * - Out of order segments arrived.
2085 * - Urgent data is expected.
2086 * - There is no buffer space left
2087 * - Unexpected TCP flags/window values/header lengths are received
2088 * (detected by checking the TCP header against pred_flags)
2089 * - Data is sent in both directions. Fast path only supports pure senders
2090 * or pure receivers (this means either the sequence number or the ack
2091 * value must stay constant)
2092 * - Unexpected TCP option.
2094 * When these conditions are not satisfied it drops into a standard
2095 * receive procedure patterned after RFC793 to handle all cases.
2096 * The first three cases are guaranteed by proper pred_flags setting,
2097 * the rest is checked inline. Fast processing is turned on in
2098 * tcp_data_queue when everything is OK.
2100 int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
2101 struct tcphdr *th, unsigned len)
2103 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2106 * Header prediction.
2107 * The code loosely follows the one in the famous
2108 * "30 instruction TCP receive" Van Jacobson mail.
2110 * Van's trick is to deposit buffers into socket queue
2111 * on a device interrupt, to call the tcp_recv function
2112 * in the receive process context, and to checksum and copy
2113 * the buffer to user space. Smart...
2115 * Our current scheme is not silly either, but we take the
2116 * extra cost of the net_bh soft interrupt processing...
2117 * We do checksum and copy also but from device to kernel.
2121 /* RED-PEN. Using static variables to pass function arguments
2122 * cannot be a good idea...
2124 tp->saw_tstamp = 0;
2126 /* pred_flags is 0xS?10 << 16 + snd_wnd
2127 * if header prediction is to be made
2128 * 'S' will always be tp->tcp_header_len >> 2
2129 * '?' will be 0 for the fast path, otherwise pred_flags is 0 to
2130 * turn it off (when there are holes in the receive
2131 * space for instance)
2132 * PSH flag is ignored.
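/* In practice a segment takes the fast path only if its data offset
 * matches our expected header length, the only flag set (apart from PSH,
 * which is ignored) is ACK, the advertised window equals the value cached
 * in pred_flags, and its sequence number is exactly rcv_nxt.
 */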
2135 if ((tcp_flag_word(th) & ~(TCP_RESERVED_BITS|TCP_FLAG_PSH)) == tp->pred_flags &&
2136 TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
2137 int tcp_header_len = th->doff*4;
2139 /* Timestamp header prediction */
2141 /* Non-standard header f.e. SACKs -> slow path */
2142 if (tcp_header_len != tp->tcp_header_len)
2143 goto slow_path;
2145 /* Check timestamp */
2146 if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
2147 __u32 *ptr = (__u32 *)(th + 1);
2149 /* No? Slow path! */
2150 if (*ptr != __constant_ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
2151 | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP))
2152 goto slow_path;
2154 tp->saw_tstamp = 1;
2155 ++ptr;
2156 tp->rcv_tsval = ntohl(*ptr);
2157 ++ptr;
2158 tp->rcv_tsecr = ntohl(*ptr);
2160 /* If PAWS failed, check it more carefully in slow path */
2161 if ((s32)(tp->rcv_tsval - tp->ts_recent) < 0)
2162 goto slow_path;
2164 /* Predicted packet is in window by definition.
2165 seq == rcv_nxt and last_ack_sent <= rcv_nxt.
2166 Hence, check seq<=last_ack_sent reduces to:
2168 if (tp->rcv_nxt == tp->last_ack_sent) {
2169 tp->ts_recent = tp->rcv_tsval;
2170 tp->ts_recent_stamp = xtime.tv_sec;
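/* ts_recent is only updated when this segment starts exactly at the
 * last ACK we sent; combined with the PAWS check above, this keeps the
 * recorded timestamp tied to in-window data (RFC1323).
 */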
2174 if (len <= tcp_header_len) {
2175 /* Bulk data transfer: sender */
2176 if (len == tcp_header_len) {
2177 tcp_ack(sk, th, TCP_SKB_CB(skb)->seq,
2178 TCP_SKB_CB(skb)->ack_seq, len);
2179 kfree_skb(skb);
2180 tcp_data_snd_check(sk);
2181 return 0;
2182 } else { /* Header too small */
2183 tcp_statistics.TcpInErrs++;
2184 goto discard;
2186 } else if (TCP_SKB_CB(skb)->ack_seq == tp->snd_una &&
2187 atomic_read(&sk->rmem_alloc) <= sk->rcvbuf) {
2188 /* Bulk data transfer: receiver */
2189 __skb_pull(skb,tcp_header_len);
2191 /* Is it possible to simplify this? */
2192 tcp_measure_rcv_mss(sk, skb);
2194 /* DO NOT notify forward progress here.
2195 * It saves a dozen CPU instructions in the fast path. --ANK
2196 * And where is it signaled then? -AK
2198 __skb_queue_tail(&sk->receive_queue, skb);
2199 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
2201 /* FIN bit check is not done since if FIN is set in
2202 * this frame, the pred_flags won't match up. -DaveM
2204 wake_up_interruptible(sk->sleep);
2205 sock_wake_async(sk->socket,1, POLL_IN);
2206 tcp_delack_estimator(tp);
2208 tcp_remember_ack(tp, th, skb);
2210 __tcp_ack_snd_check(sk, 0);
2211 return 0;
2213 /* Packet is in sequence, flags are trivial;
2214 * only ACK is strange or we are tough on memory.
2215 * Jump to step 5.
2217 goto step5;
2220 slow_path:
2222 * RFC1323: H1. Apply PAWS check first.
2224 if (tcp_fast_parse_options(sk, th, tp) && tp->saw_tstamp &&
2225 tcp_paws_discard(tp, skb)) {
2226 if (!th->rst) {
2227 tcp_send_ack(sk);
2228 goto discard;
2230 /* Resets are accepted even if PAWS failed.
2232 ts_recent update must be made after we are sure
2233 that the packet is in window.
2238 * Standard slow path.
2241 if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
2242 /* RFC793, page 37: "In all states except SYN-SENT, all reset
2243 * (RST) segments are validated by checking their SEQ-fields."
2244 * And page 69: "If an incoming segment is not acceptable,
2245 * an acknowledgment should be sent in reply (unless the RST bit
2246 * is set, if so drop the segment and return)".
2248 if (th->rst)
2249 goto discard;
2250 if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
2251 SOCK_DEBUG(sk, "seq:%d end:%d wup:%d wnd:%d\n",
2252 TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
2253 tp->rcv_wup, tp->rcv_wnd);
2255 tcp_send_ack(sk);
2256 goto discard;
2259 if(th->rst) {
2260 tcp_reset(sk);
2261 goto discard;
2264 if (tp->saw_tstamp) {
2265 tcp_replace_ts_recent(sk, tp,
2266 TCP_SKB_CB(skb)->seq);
2269 if(th->syn && TCP_SKB_CB(skb)->seq != tp->syn_seq) {
2270 SOCK_DEBUG(sk, "syn in established state\n");
2271 tcp_statistics.TcpInErrs++;
2272 tcp_reset(sk);
2273 return 1;
2276 step5:
2277 if(th->ack)
2278 tcp_ack(sk, th, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->ack_seq, len);
2280 /* Process urgent data. */
2281 tcp_urg(sk, th, len);
2284 /* step 7: process the segment text */
2285 int queued = tcp_data(skb, sk, len);
2287 tcp_measure_rcv_mss(sk, skb);
2289 /* Be careful, tcp_data() may have put this into TIME_WAIT. */
2290 if(sk->state != TCP_CLOSE) {
2291 tcp_data_snd_check(sk);
2292 tcp_ack_snd_check(sk);
2295 if (!queued) {
2296 discard:
2297 kfree_skb(skb);
2301 return 0;
2305 /* This is not only more efficient than what we used to do, it eliminates
2306 * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
2308 * Actually, we could avoid a lot of memory writes here. The tp of the
2309 * listening socket already contains all necessary default parameters.
2311 struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb)
2313 struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, 0);
2315 if(newsk != NULL) {
2316 struct tcp_opt *newtp;
2317 #ifdef CONFIG_FILTER
2318 struct sk_filter *filter;
2319 #endif
2321 memcpy(newsk, sk, sizeof(*newsk));
2322 newsk->state = TCP_SYN_RECV;
2324 /* SANITY */
2325 newsk->pprev = NULL;
2326 newsk->prev = NULL;
2328 /* Clone the TCP header template */
2329 newsk->dport = req->rmt_port;
2331 sock_lock_init(newsk);
2333 atomic_set(&newsk->rmem_alloc, 0);
2334 skb_queue_head_init(&newsk->receive_queue);
2335 atomic_set(&newsk->wmem_alloc, 0);
2336 skb_queue_head_init(&newsk->write_queue);
2337 atomic_set(&newsk->omem_alloc, 0);
2339 newsk->done = 0;
2340 newsk->proc = 0;
2341 newsk->backlog.head = newsk->backlog.tail = NULL;
2342 skb_queue_head_init(&newsk->error_queue);
2343 newsk->write_space = tcp_write_space;
2344 #ifdef CONFIG_FILTER
2345 if ((filter = newsk->filter) != NULL)
2346 sk_filter_charge(newsk, filter);
2347 #endif
2349 /* Now setup tcp_opt */
2350 newtp = &(newsk->tp_pinfo.af_tcp);
2351 newtp->pred_flags = 0;
2352 newtp->rcv_nxt = req->rcv_isn + 1;
2353 newtp->snd_nxt = req->snt_isn + 1;
2354 newtp->snd_una = req->snt_isn + 1;
2355 newtp->srtt = 0;
2356 newtp->ato = 0;
2357 newtp->snd_wl1 = req->rcv_isn;
2358 newtp->snd_wl2 = req->snt_isn;
2360 /* RFC1323: The window in SYN & SYN/ACK segments
2361 * is never scaled.
2363 newtp->snd_wnd = ntohs(skb->h.th->window);
2365 newtp->max_window = newtp->snd_wnd;
2366 newtp->pending = 0;
2367 newtp->retransmits = 0;
2368 newtp->last_ack_sent = req->rcv_isn + 1;
2369 newtp->backoff = 0;
2370 newtp->mdev = TCP_TIMEOUT_INIT;
2372 /* So many TCP implementations out there (incorrectly) count the
2373 * initial SYN frame in their delayed-ACK and congestion control
2374 * algorithms that we must have the following bandaid to talk
2375 * efficiently to them. -DaveM
2377 newtp->snd_cwnd = 2;
2379 newtp->rto = TCP_TIMEOUT_INIT;
2380 newtp->packets_out = 0;
2381 newtp->fackets_out = 0;
2382 newtp->retrans_out = 0;
2383 newtp->high_seq = 0;
2384 newtp->snd_ssthresh = 0x7fffffff;
2385 newtp->snd_cwnd_cnt = 0;
2386 newtp->dup_acks = 0;
2387 newtp->delayed_acks = 0;
2388 init_timer(&newtp->retransmit_timer);
2389 newtp->retransmit_timer.function = &tcp_retransmit_timer;
2390 newtp->retransmit_timer.data = (unsigned long) newsk;
2391 init_timer(&newtp->delack_timer);
2392 newtp->delack_timer.function = &tcp_delack_timer;
2393 newtp->delack_timer.data = (unsigned long) newsk;
2394 skb_queue_head_init(&newtp->out_of_order_queue);
2395 newtp->send_head = newtp->retrans_head = NULL;
2396 newtp->rcv_wup = req->rcv_isn + 1;
2397 newtp->write_seq = req->snt_isn + 1;
2398 newtp->copied_seq = req->rcv_isn + 1;
2400 newtp->saw_tstamp = 0;
2402 init_timer(&newtp->probe_timer);
2403 newtp->probe_timer.function = &tcp_probe_timer;
2404 newtp->probe_timer.data = (unsigned long) newsk;
2405 newtp->probes_out = 0;
2406 newtp->syn_seq = req->rcv_isn;
2407 newtp->fin_seq = req->rcv_isn;
2408 newtp->urg_data = 0;
2409 tcp_synq_init(newtp);
2410 newtp->syn_backlog = 0;
2411 if (skb->len >= 536)
2412 newtp->last_seg_size = skb->len;
2414 /* Back to base struct sock members. */
2415 newsk->err = 0;
2416 newsk->ack_backlog = 0;
2417 newsk->max_ack_backlog = SOMAXCONN;
2418 newsk->priority = 0;
2419 atomic_set(&newsk->refcnt, 1);
2420 atomic_inc(&inet_sock_nr);
2422 spin_lock_init(&sk->timer_lock);
2423 init_timer(&newsk->timer);
2424 newsk->timer.function = &tcp_keepalive_timer;
2425 newsk->timer.data = (unsigned long) newsk;
2426 if (newsk->keepopen)
2427 tcp_reset_keepalive_timer(newsk, keepalive_time_when(newtp));
2428 newsk->socket = NULL;
2429 newsk->sleep = NULL;
2431 newtp->tstamp_ok = req->tstamp_ok;
2432 if((newtp->sack_ok = req->sack_ok) != 0)
2433 newtp->num_sacks = 0;
2434 newtp->window_clamp = req->window_clamp;
2435 newtp->rcv_wnd = req->rcv_wnd;
2436 newtp->wscale_ok = req->wscale_ok;
2437 if (newtp->wscale_ok) {
2438 newtp->snd_wscale = req->snd_wscale;
2439 newtp->rcv_wscale = req->rcv_wscale;
2440 } else {
2441 newtp->snd_wscale = newtp->rcv_wscale = 0;
2442 newtp->window_clamp = min(newtp->window_clamp,65535);
2444 if (newtp->tstamp_ok) {
2445 newtp->ts_recent = req->ts_recent;
2446 newtp->ts_recent_stamp = xtime.tv_sec;
2447 newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
2448 } else {
2449 newtp->ts_recent_stamp = 0;
2450 newtp->tcp_header_len = sizeof(struct tcphdr);
2452 newtp->mss_clamp = req->mss;
2454 return newsk;
2457 static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
2459 if (seq == s_win)
2460 return 1;
2461 if (after(end_seq, s_win) && before(seq, e_win))
2462 return 1;
2463 return (seq == e_win && seq == end_seq);
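/* Put differently: a segment is acceptable if it overlaps the window
 * (end_seq > s_win and seq < e_win), with two special cases: any segment
 * starting exactly at s_win, and a zero-length segment sitting exactly
 * at e_win.
 */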
2468 * Process an incoming packet for SYN_RECV sockets represented
2469 * as an open_request.
2472 struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
2473 struct open_request *req,
2474 struct open_request *prev)
2476 struct tcphdr *th = skb->h.th;
2477 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2478 u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
2479 int paws_reject = 0;
2480 struct tcp_opt ttp;
2482 /* If socket has already been created, process
2483 packet in its context.
2485 We fall here only due to race, when packets were enqueued
2486 to backlog of listening socket.
2488 if (req->sk)
2489 return req->sk;
2491 ttp.saw_tstamp = 0;
2492 if (th->doff > (sizeof(struct tcphdr)>>2)) {
2494 tcp_parse_options(NULL, th, &ttp, 0);
2496 paws_reject = ttp.saw_tstamp &&
2497 (s32)(ttp.rcv_tsval - req->ts_recent) < 0;
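/* paws_reject is set when the segment carries a timestamp strictly older
 * than the one remembered from the original SYN (req->ts_recent), i.e.
 * it looks like an old duplicate in the sense of PAWS (RFC1323).
 */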
2500 /* Check for a pure retransmitted SYN. */
2501 if (TCP_SKB_CB(skb)->seq == req->rcv_isn &&
2502 flg == TCP_FLAG_SYN &&
2503 !paws_reject) {
2505 * RFC793 draws (Incorrectly! It was fixed in RFC1122)
2506 * this case on figure 6 and figure 8, but formal
2507 * protocol description says NOTHING.
2508 * To be more exact, it says that we should send ACK,
2509 * because this segment (at least, if it has no data)
2510 * is out of window.
2512 * CONCLUSION: RFC793 (even with RFC1122) DOES NOT
2513 * describe SYN-RECV state. All the description
2514 * is wrong; we cannot believe it and should
2515 * rely only on common sense and implementation
2516 * experience.
2518 * Enforce "SYN-ACK" according to figure 8, figure 6
2519 * of RFC793, fixed by RFC1122.
2521 req->class->rtx_syn_ack(sk, req);
2522 return NULL;
2525 /* This further reproduces the "SEGMENT ARRIVES" section
2526 for the SYN-RECEIVED state of RFC793.
2527 It is broken; however, it fails only
2528 when SYNs are crossed, which is impossible in our
2529 case.
2531 But generally, we should (the RFC lies!) accept the ACK
2532 of our SYNACK both here and in tcp_rcv_state_process().
2533 tcp_rcv_state_process() does not, hence we do not either.
2535 Note that the case is absolutely generic:
2536 we cannot optimize anything here without
2537 violating protocol. All the checks must be made
2538 before attempting to create the socket.
2541 /* RFC793: "first check sequence number". */
2543 if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
2544 req->rcv_isn+1, req->rcv_isn+1+req->rcv_wnd)) {
2545 /* Out of window: send ACK and drop. */
2546 if (!(flg & TCP_FLAG_RST))
2547 req->class->send_ack(skb, req);
2548 return NULL;
2551 /* In sequence, PAWS is OK. */
2553 if (ttp.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, req->rcv_isn+1))
2554 req->ts_recent = ttp.rcv_tsval;
2556 if (TCP_SKB_CB(skb)->seq == req->rcv_isn) {
2557 /* Truncate SYN, it is out of window starting
2558 at req->rcv_isn+1. */
2559 flg &= ~TCP_FLAG_SYN;
2562 /* RFC793: "second check the RST bit" and
2563 * "fourth, check the SYN bit"
2565 if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN))
2566 goto embryonic_reset;
2568 /* RFC793: "fifth check the ACK field" */
2570 if (!(flg & TCP_FLAG_ACK))
2571 return NULL;
2573 /* Invalid ACK: reset will be sent by listening socket */
2574 if (TCP_SKB_CB(skb)->ack_seq != req->snt_isn+1)
2575 return sk;
2577 /* OK, ACK is valid, create big socket and
2578 feed this segment to it. It will repeat all
2579 the tests. THIS SEGMENT MUST MOVE SOCKET TO
2580 ESTABLISHED STATE. If it is dropped after
2581 the socket is created, expect trouble.
2583 sk = tp->af_specific->syn_recv_sock(sk, skb, req, NULL);
2584 if (sk == NULL)
2585 return NULL;
2587 tcp_dec_slow_timer(TCP_SLT_SYNACK);
2588 req->sk = sk;
2589 return sk;
2591 embryonic_reset:
2592 tcp_synq_unlink(tp, req, prev);
2593 tp->syn_backlog--;
2594 tcp_dec_slow_timer(TCP_SLT_SYNACK);
2596 net_statistics.EmbryonicRsts++;
2597 if (!(flg & TCP_FLAG_RST))
2598 req->class->send_reset(skb);
2600 req->class->destructor(req);
2601 tcp_openreq_free(req);
2602 return NULL;
2605 static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
2606 struct tcphdr *th, unsigned len)
2608 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2610 tcp_parse_options(sk, th, tp, 0);
2612 #ifdef CONFIG_TCP_TW_RECYCLE
2613 if (tp->ts_recent_stamp && tp->saw_tstamp && !th->rst &&
2614 (s32)(tp->rcv_tsval - tp->ts_recent) < 0 &&
2615 xtime.tv_sec < tp->ts_recent_stamp + PAWS_24DAYS) {
2616 /* Old duplicate segment. We remember the last
2617 ts_recent from this host in the timewait bucket.
2619 Actually, we could implement a per-host cache
2620 to truncate timewait state after the RTO. The paranoid arguments
2621 of rfc1337 are not enough to close off this nice possibility.
2623 if (net_ratelimit())
2624 printk(KERN_DEBUG "TCP: tw recycle, PAWS worked. Good.\n");
2625 if (th->ack)
2626 return 1;
2627 goto discard;
2629 #endif
2631 if (th->ack) {
2632 /* rfc793:
2633 * "If the state is SYN-SENT then
2634 * first check the ACK bit
2635 * If the ACK bit is set
2636 * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
2637 * a reset (unless the RST bit is set, if so drop
2638 * the segment and return)"
2640 * I cite this place to emphasize one essential
2641 * detail: this check is different from the one
2642 * in established state: SND.UNA <= SEG.ACK <= SND.NXT.
2643 * SEG_ACK == SND.UNA == ISS is invalid in SYN-SENT,
2644 * because we have no previous data sent before SYN.
2645 * --ANK(990513)
2647 * We do not send data with SYN, so that RFC-correct
2648 * test reduces to:
2650 if (sk->zapped ||
2651 TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt)
2652 return 1;
2654 /* Now ACK is acceptable.
2656 * "If the RST bit is set
2657 * If the ACK was acceptable then signal the user "error:
2658 * connection reset", drop the segment, enter CLOSED state,
2659 * delete TCB, and return."
2662 if (th->rst) {
2663 tcp_reset(sk);
2664 goto discard;
2667 /* rfc793:
2668 * "fifth, if neither of the SYN or RST bits is set then
2669 * drop the segment and return."
2671 * See note below!
2672 * --ANK(990513)
2674 if (!th->syn)
2675 goto discard;
2677 /* rfc793:
2678 * "If the SYN bit is on ...
2679 * are acceptable then ...
2680 * (our SYN has been ACKed), change the connection
2681 * state to ESTABLISHED..."
2683 * Do you see? SYN-less ACKs in SYN-SENT state are
2684 * completely ignored.
2686 * The bug causing stalled SYN-SENT sockets
2687 * was here: tcp_ack advanced snd_una and canceled
2688 * retransmit timer, so that bare ACK received
2689 * in SYN-SENT state (even with invalid ack==ISS,
2690 * because tcp_ack check is too weak for SYN-SENT)
2691 * causes moving socket to invalid semi-SYN-SENT,
2692 * semi-ESTABLISHED state and connection hangs.
2694 * There exist buggy stacks, which really send
2695 * such ACKs: f.e. 202.226.91.94 (okigate.oki.co.jp)
2696 * Actually, if this host did not try to get something
2697 * from ftp.inr.ac.ru I'd never find this bug 8)
2699 * --ANK (990514)
2701 * I was wrong, I apologize. Bare ACK is valid.
2702 * Actually, RFC793 requires sending such an ACK
2703 * in reply to any out of window packet.
2704 * It is wrong, but Linux also does it sometimes.
2705 * --ANK (990724)
2708 tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
2709 tcp_ack(sk,th, TCP_SKB_CB(skb)->seq,
2710 TCP_SKB_CB(skb)->ack_seq, len);
2712 /* Ok.. it's good. Set up sequence numbers and
2713 * move to established.
2715 tp->rcv_nxt = TCP_SKB_CB(skb)->seq+1;
2716 tp->rcv_wup = TCP_SKB_CB(skb)->seq+1;
2718 /* RFC1323: The window in SYN & SYN/ACK segments is
2719 * never scaled.
2721 tp->snd_wnd = htons(th->window);
2722 tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
2723 tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq;
2724 tp->fin_seq = TCP_SKB_CB(skb)->seq;
2726 tcp_set_state(sk, TCP_ESTABLISHED);
2728 if (tp->wscale_ok == 0) {
2729 tp->snd_wscale = tp->rcv_wscale = 0;
2730 tp->window_clamp = min(tp->window_clamp,65535);
2733 if (tp->tstamp_ok) {
2734 tp->tcp_header_len =
2735 sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
2736 } else
2737 tp->tcp_header_len = sizeof(struct tcphdr);
2738 if (tp->saw_tstamp) {
2739 tp->ts_recent = tp->rcv_tsval;
2740 tp->ts_recent_stamp = xtime.tv_sec;
2742 tcp_sync_mss(sk, tp->pmtu_cookie);
2743 tcp_initialize_rcv_mss(sk);
2744 tcp_init_metrics(sk);
2746 if (tp->write_pending) {
2747 /* Save one ACK. Data will be ready after
2748 * several ticks, if write_pending is set.
2750 * How do we do this correctly?
2752 tp->delayed_acks++;
2753 if (tp->ato == 0)
2754 tp->ato = tp->rto;
2755 tcp_send_delayed_ack(sk, tp->rto);
2756 } else {
2757 tcp_send_ack(sk);
2760 tp->copied_seq = tp->rcv_nxt;
2762 if(!sk->dead) {
2763 wake_up_interruptible(sk->sleep);
2764 sock_wake_async(sk->socket, 0, POLL_IN);
2766 return -1;
2769 /* No ACK in the segment */
2771 if (th->rst) {
2772 /* rfc793:
2773 * "If the RST bit is set
2775 * Otherwise (no ACK) drop the segment and return."
2778 goto discard;
2781 if (th->syn) {
2782 /* We see a SYN without an ACK. It is an attempt at
2783 * simultaneous connect with crossed SYNs.
2785 * The previous version of the code
2786 * checked for "connecting to self"
2787 * here. That check is now done in
2788 * tcp_connect.
2790 * RED-PEN: BTW, it does not. 8)
2792 tcp_set_state(sk, TCP_SYN_RECV);
2793 if (tp->saw_tstamp) {
2794 tp->ts_recent = tp->rcv_tsval;
2795 tp->ts_recent_stamp = xtime.tv_sec;
2798 tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
2799 tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
2801 /* RFC1323: The window in SYN & SYN/ACK segments is
2802 * never scaled.
2804 tp->snd_wnd = htons(th->window);
2805 tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
2807 tcp_sync_mss(sk, tp->pmtu_cookie);
2808 tcp_initialize_rcv_mss(sk);
2810 tcp_send_synack(sk);
2811 #if 0
2812 /* Note, we could accept data and URG from this segment.
2813 * There are no obstacles to doing this.
2815 * However, if we sometimes ignore data in ACKless segments,
2816 * we have no reason to accept it at other times.
2817 * Also, it seems the code doing this in step6 of tcp_rcv_state_process
2818 * is not flawless. So, discard the packet for sanity.
2819 * Uncomment this return to process the data.
2821 return -1;
2822 #endif
2824 /* "fifth, if neither of the SYN or RST bits is set then
2825 * drop the segment and return."
2828 discard:
2829 kfree_skb(skb);
2830 return 0;
2835 * This function implements the receiving procedure of RFC 793 for
2836 * all states except ESTABLISHED and TIME_WAIT.
2837 * It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
2838 * address independent.
2841 int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
2842 struct tcphdr *th, unsigned len)
2844 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2845 int queued = 0;
2847 tp->saw_tstamp = 0;
2849 switch (sk->state) {
2850 case TCP_CLOSE:
2851 /* When state == CLOSED, hash lookup always fails.
2853 * But, there is a back door, the backlog queue.
2854 * If we have a sequence of packets in the backlog
2855 * during __release_sock() which have a sequence such
2856 * that:
2857 * packet X causes entry to TCP_CLOSE state
2858 * ...
2859 * packet X + N has FIN bit set
2861 * We report a (luckily) harmless error in this case.
2862 * The issue is that backlog queue processing bypasses
2863 * any hash lookups (we know which socket packets are for).
2864 * The correct behavior here is what 2.0.x did, since
2865 * a TCP_CLOSE socket does not exist. Drop the frame
2866 * and send a RST back to the other end.
2869 /* 1. The socket may be moved to TIME-WAIT state.
2870 2. While this socket was locked, another socket
2871 with the same identity could be created.
2872 3. To continue?
2874 CONCLUSION: discard and only discard!
2876 Alternative would be relookup and recurse into tcp_v?_rcv
2877 (not *_do_rcv) to work with timewait and listen states
2878 correctly.
2880 goto discard;
2882 case TCP_LISTEN:
2883 if(th->ack)
2884 return 1;
2886 if(th->syn) {
2887 if(tp->af_specific->conn_request(sk, skb) < 0)
2888 return 1;
2890 /* Now we have several options: In theory there is
2891 * nothing else in the frame. KA9Q has an option to
2892 * send data with the syn, BSD accepts data with the
2893 * syn up to the [to be] advertised window and
2894 * Solaris 2.1 gives you a protocol error. For now
2895 * we just ignore it, that fits the spec precisely
2896 * and avoids incompatibilities. It would be nice in
2897 * future to drop through and process the data.
2899 * Now that TTCP is starting to be used we ought to
2900 * queue this data.
2901 * But, this leaves one open to an easy denial of
2902 * service attack, and SYN cookies can't defend
2903 * against this problem. So, we drop the data
2904 * in the interest of security over speed.
2906 goto discard;
2908 goto discard;
2910 case TCP_SYN_SENT:
2911 queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
2912 if (queued >= 0)
2913 return queued;
2914 queued = 0;
2915 goto step6;
2918 /* Parse the tcp_options present on this header.
2919 * By this point we really only expect timestamps.
2920 * Note that this really has to be here and not later for PAWS
2921 * (RFC1323) to work.
2923 if (tcp_fast_parse_options(sk, th, tp) && tp->saw_tstamp &&
2924 tcp_paws_discard(tp, skb)) {
2925 if (!th->rst) {
2926 tcp_send_ack(sk);
2927 goto discard;
2929 /* Reset is accepted even if it did not pass PAWS. */
2932 /* The silly FIN test here is necessary to see an advancing ACK in
2933 * retransmitted FIN frames properly. Consider the following sequence:
2935 * host1 --> host2 FIN XSEQ:XSEQ(0) ack YSEQ
2936 * host2 --> host1 FIN YSEQ:YSEQ(0) ack XSEQ
2937 * host1 --> host2 XSEQ:XSEQ(0) ack YSEQ+1
2938 * host2 --> host1 FIN YSEQ:YSEQ(0) ack XSEQ+1 (fails tcp_sequence test)
2940 * At this point the connection will deadlock with host1 believing
2941 * that its FIN is never ACK'd, and thus it will retransmit its FIN
2942 * forever. The following fix is from Taral (taral@taral.net).
2944 * RED-PEN. Seems, the above is not true.
2945 * If at least one end is RFC compliant, it will send ACK to
2946 * out of window FIN and, hence, move peer to TIME-WAIT.
2947 * I comment out this line. --ANK
2949 * RED-PEN. DANGER! The tcp_sequence check also rejects SYN-ACKs
2950 * received in SYN-RECV. The problem is that the description of
2951 * segment processing in the SYN-RECV state in RFC793 is WRONG.
2952 * Correct check would accept ACK from this SYN-ACK, see
2953 * figures 6 and 8 (fixed by RFC1122). Compare this
2954 * to problem with FIN, they smell similarly. --ANK
2957 /* step 1: check sequence number */
2958 if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)
2959 #if 0
2960 && !(th->fin && TCP_SKB_CB(skb)->end_seq == tp->rcv_nxt)
2961 #endif
2963 if (!th->rst) {
2964 tcp_send_ack(sk);
2966 goto discard;
2969 /* step 2: check RST bit */
2970 if(th->rst) {
2971 tcp_reset(sk);
2972 goto discard;
2975 if (tp->saw_tstamp) {
2976 tcp_replace_ts_recent(sk, tp,
2977 TCP_SKB_CB(skb)->seq);
2980 /* step 3: check security and precedence [ignored] */
2982 /* step 4:
2984 * Check for a SYN, and ensure it matches the SYN we were
2985 * first sent. We have to handle the rather unusual (but valid)
2986 * sequence that KA9Q derived products may generate of
2988 * SYN
2989 * SYN|ACK Data
2990 * ACK (lost)
2991 * SYN|ACK Data + More Data
2992 * .. we must ACK not RST...
2994 * We keep syn_seq as the sequence space occupied by the
2995 * original syn.
2998 if (th->syn && TCP_SKB_CB(skb)->seq != tp->syn_seq) {
2999 tcp_reset(sk);
3000 return 1;
3003 /* step 5: check the ACK field */
3004 if (th->ack) {
3005 int acceptable = tcp_ack(sk, th, TCP_SKB_CB(skb)->seq,
3006 TCP_SKB_CB(skb)->ack_seq, len);
3008 switch(sk->state) {
3009 case TCP_SYN_RECV:
3010 if (acceptable) {
3011 tcp_set_state(sk, TCP_ESTABLISHED);
3012 tp->copied_seq = tp->rcv_nxt;
3014 /* Note that this wakeup is only for the marginal
3015 crossed SYN case. Passively opened sockets
3016 are not woken up, because sk->sleep == NULL
3017 and sk->socket == NULL.
3019 if (!sk->dead && sk->sleep) {
3020 wake_up_interruptible(sk->sleep);
3021 sock_wake_async(sk->socket,0,POLL_OUT);
3024 tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
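/* Note: unlike the window in the SYN and SYN-ACK, the window in this
 * ACK is subject to scaling, hence the shift by snd_wscale below.
 */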
3025 tp->snd_wnd = htons(th->window) << tp->snd_wscale;
3026 tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
3027 tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq;
3029 /* tcp_ack considers this ACK as duplicate
3030 * and does not calculate rtt. It is wrong.
3031 * Fix it at least with timestamps.
3033 if (tp->saw_tstamp && !tp->srtt)
3034 tcp_ack_saw_tstamp(sk, tp, 0, 0, FLAG_SYN_ACKED);
3036 tcp_init_metrics(sk);
3037 } else {
3038 SOCK_DEBUG(sk, "bad ack\n");
3039 return 1;
3041 break;
3043 case TCP_FIN_WAIT1:
3044 if (tp->snd_una == tp->write_seq) {
3045 sk->shutdown |= SEND_SHUTDOWN;
3046 tcp_set_state(sk, TCP_FIN_WAIT2);
3047 if (!sk->dead)
3048 sk->state_change(sk);
3049 else
3050 tcp_reset_keepalive_timer(sk, sysctl_tcp_fin_timeout);
3051 dst_confirm(sk->dst_cache);
3053 break;
3055 case TCP_CLOSING:
3056 if (tp->snd_una == tp->write_seq) {
3057 tcp_time_wait(sk);
3058 goto discard;
3060 break;
3062 case TCP_LAST_ACK:
3063 if (tp->snd_una == tp->write_seq) {
3064 tcp_set_state(sk,TCP_CLOSE);
3065 tcp_update_metrics(sk);
3066 tcp_done(sk);
3067 goto discard;
3069 break;
3071 } else
3072 goto discard;
3074 step6:
3075 /* step 6: check the URG bit */
3076 tcp_urg(sk, th, len);
3078 /* step 7: process the segment text */
3079 switch (sk->state) {
3080 case TCP_CLOSE_WAIT:
3081 case TCP_CLOSING:
3082 if (!before(TCP_SKB_CB(skb)->seq, tp->fin_seq))
3083 break;
3085 case TCP_FIN_WAIT1:
3086 case TCP_FIN_WAIT2:
3087 /* RFC 793 says to queue data in these states,
3088 * RFC 1122 says we MUST send a reset.
3089 * BSD 4.4 also does reset.
3091 if ((sk->shutdown & RCV_SHUTDOWN) && sk->dead) {
3092 if (after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
3093 tcp_reset(sk);
3094 return 1;
3098 case TCP_ESTABLISHED:
3099 queued = tcp_data(skb, sk, len);
3101 /* This must be after tcp_data() does the skb_pull() to
3102 * remove the header size from skb->len.
3104 tcp_measure_rcv_mss(sk, skb);
3105 break;
3108 /* tcp_data could move socket to TIME-WAIT */
3109 if (sk->state != TCP_CLOSE) {
3110 tcp_data_snd_check(sk);
3111 tcp_ack_snd_check(sk);
3114 if (!queued) {
3115 discard:
3116 kfree_skb(skb);
3118 return 0;