2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * Version: $Id: tcp_timer.c,v 1.88 2002/02/01 22:01:04 davem Exp $
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 * Linus Torvalds, <torvalds@cs.helsinki.fi>
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Matthew Dillon, <dillon@apollo.west.oic.com>
19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 * Jorge Cwik, <jorge@laser.satlink.net>
23 #include <linux/module.h>
/*
 * Tunable parameters used by the TCP timers in this file.  Each variable
 * except the last is initialised from a TCP_ default constant.
 * sysctl_tcp_orphan_retries has no initialiser, so as a file-scope object
 * it starts at zero; tcp_orphan_retries() explicitly allows that value
 * ("May be zero").
 * NOTE(review): the sysctl_ prefix suggests these are exposed through
 * sysctl; the registration is not visible in this listing -- confirm.
 */
26 int sysctl_tcp_syn_retries
= TCP_SYN_RETRIES
;
27 int sysctl_tcp_synack_retries
= TCP_SYNACK_RETRIES
;
28 int sysctl_tcp_keepalive_time
= TCP_KEEPALIVE_TIME
;
29 int sysctl_tcp_keepalive_probes
= TCP_KEEPALIVE_PROBES
;
30 int sysctl_tcp_keepalive_intvl
= TCP_KEEPALIVE_INTVL
;
31 int sysctl_tcp_retries1
= TCP_RETR1
;
32 int sysctl_tcp_retries2
= TCP_RETR2
;
33 int sysctl_tcp_orphan_retries
;
/*
 * Forward declarations of the three timer callbacks.  They are needed
 * because tcp_init_xmit_timers() takes the address of each handler before
 * the handlers are defined further down in the file.
 */
35 static void tcp_write_timer(unsigned long);
36 static void tcp_delack_timer(unsigned long);
37 static void tcp_keepalive_timer (unsigned long data
);
/*
 * KERN_DEBUG message text for an unknown timer value, made available to
 * other code through EXPORT_SYMBOL.
 * NOTE(review): no user of this string is visible in this listing.
 */
40 const char tcp_timer_bug_msg
[] = KERN_DEBUG
"tcpbug: unknown timer value\n";
41 EXPORT_SYMBOL(tcp_timer_bug_msg
);
45 * Using different timers for retransmit, delayed acks and probes.
46 * We may wish to use just one timer maintaining a list of expire jiffies
/*
 * tcp_init_xmit_timers - set up the three per-socket TCP timers.
 * @sk: socket whose timers are initialised.
 *
 * Each timer is passed to init_timer() and then given its handler and the
 * socket pointer, cast to unsigned long, as callback data:
 *   tp->retransmit_timer  handled by tcp_write_timer
 *   tp->delack_timer      handled by tcp_delack_timer
 *   sk->sk_timer          handled by tcp_keepalive_timer
 * The handlers cast the data value back to a struct sock pointer.
 *
 * NOTE(review): this listing is a lossy extraction -- braces and the
 * original lines 57 and 62 are absent.  Compare with the upstream file
 * before relying on it.
 */
50 void tcp_init_xmit_timers(struct sock
*sk
)
52 struct tcp_sock
*tp
= tcp_sk(sk
);
54 init_timer(&tp
->retransmit_timer
);
55 tp
->retransmit_timer
.function
=&tcp_write_timer
;
56 tp
->retransmit_timer
.data
= (unsigned long) sk
;
59 init_timer(&tp
->delack_timer
);
60 tp
->delack_timer
.function
=&tcp_delack_timer
;
61 tp
->delack_timer
.data
= (unsigned long) sk
;
64 init_timer(&sk
->sk_timer
);
65 sk
->sk_timer
.function
= &tcp_keepalive_timer
;
66 sk
->sk_timer
.data
= (unsigned long)sk
;
/*
 * tcp_clear_xmit_timers - stop all three TCP timers of a socket.
 * @sk: socket whose timers are stopped.
 *
 * Calls sk_stop_timer() on tp->retransmit_timer, tp->delack_timer and
 * sk->sk_timer, in that order.
 *
 * NOTE(review): original lines 73 and 76-77 are absent from this listing,
 * so further statements may exist between the calls shown here.
 */
69 void tcp_clear_xmit_timers(struct sock
*sk
)
71 struct tcp_sock
*tp
= tcp_sk(sk
);
74 sk_stop_timer(sk
, &tp
->retransmit_timer
);
78 sk_stop_timer(sk
, &tp
->delack_timer
);
80 sk_stop_timer(sk
, &sk
->sk_timer
);
/*
 * tcp_write_err - record and report a write timeout on a socket.
 * @sk: socket that timed out.
 *
 * sk->sk_err is set to sk->sk_err_soft when that is non-zero and to
 * ETIMEDOUT otherwise (GNU "?:" with the middle operand omitted).  The
 * socket's sk_error_report() hook is then invoked and the
 * LINUX_MIB_TCPABORTONTIMEOUT counter is incremented.
 *
 * NOTE(review): original line 88 is absent from this listing; whatever
 * happens between the error report and the counter update is not shown.
 */
83 static void tcp_write_err(struct sock
*sk
)
85 sk
->sk_err
= sk
->sk_err_soft
? : ETIMEDOUT
;
86 sk
->sk_error_report(sk
);
89 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT
);
92 /* Do not allow orphaned sockets to eat all our resources.
93 * This is a direct violation of the TCP specs, but it is required
94 * to prevent DoS attacks. It is called when a retransmission timeout
95 * or zero probe timeout occurs on an orphaned socket.
97 * The criterion is still not confirmed experimentally and may change.
98 * We kill the socket if:
99 * 1. The number of orphaned sockets exceeds an administratively configured
100 * limit.
101 * 2. We have strong memory pressure. */
/*
 * tcp_out_of_resources - resource check for an orphaned socket.
 * @sk:       socket being examined.
 * @do_reset: caller-supplied flag; when it is zero the first ("penalize")
 *            test below is true regardless of how long the socket was idle.
 *
 * What this listing shows:
 *  - the current orphan count is read from tcp_orphan_count;
 *  - a penalty test fires when more than 2*TCP_RTO_MAX has passed since
 *    tp->lsndtime, or when do_reset is zero (the penalty itself, original
 *    lines 111-112 and 114-116, is missing);
 *  - the main test fires when orphans >= sysctl_tcp_max_orphans, or when
 *    sk->sk_wmem_queued > SOCK_MIN_SNDBUF and tcp_memory_allocated exceeds
 *    sysctl_tcp_mem[2].  Inside it "Out of socket memory" is logged, a
 *    test for "last segment sent within TCP_TIMEWAIT_LEN, or window closed
 *    with nothing in flight" is evaluated, tcp_send_active_reset() appears,
 *    and LINUX_MIB_TCPABORTONMEMORY is incremented.
 *
 * NOTE(review): original lines 128-129 (between the "exceptional cases"
 * test and the reset call) and the return statements are absent, so the
 * exact condition guarding the reset and the returned values cannot be
 * read from this listing -- verify against the upstream file.
 */
103 static int tcp_out_of_resources(struct sock
*sk
, int do_reset
)
105 struct tcp_sock
*tp
= tcp_sk(sk
);
106 int orphans
= atomic_read(&tcp_orphan_count
);
108 /* If peer does not open window for long time, or did not transmit
109 * anything for long time, penalize it. */
110 if ((s32
)(tcp_time_stamp
- tp
->lsndtime
) > 2*TCP_RTO_MAX
|| !do_reset
)
113 /* If some dubious ICMP arrived, penalize even more. */
117 if (orphans
>= sysctl_tcp_max_orphans
||
118 (sk
->sk_wmem_queued
> SOCK_MIN_SNDBUF
&&
119 atomic_read(&tcp_memory_allocated
) > sysctl_tcp_mem
[2])) {
121 printk(KERN_INFO
"Out of socket memory\n");
123 /* Catch exceptional cases, when connection requires reset.
124 * 1. Last segment was sent recently. */
125 if ((s32
)(tcp_time_stamp
- tp
->lsndtime
) <= TCP_TIMEWAIT_LEN
||
126 /* 2. Window is closed. */
127 (!tp
->snd_wnd
&& !tp
->packets_out
))
130 tcp_send_active_reset(sk
, GFP_ATOMIC
);
132 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY
);
138 /* Calculate the maximal number of retries on an orphaned socket.
 *
 * sk:    the orphaned socket.
 * alive: non-zero when the caller considers the connection still live;
 *        the callers in this file derive it from the RTO being below
 *        TCP_RTO_MAX.
 *
 * The starting value is sysctl_tcp_orphan_retries.  Two adjustments are
 * tested for: a pending soft error on a socket that is not alive, and a
 * zero setting on a socket that is alive.
 *
 * NOTE(review): the statements controlled by both tests and the return
 * statement (original lines 145, 151 and following) are absent from this
 * listing; only the conditions are visible. */
139 static int tcp_orphan_retries(struct sock
*sk
, int alive
)
141 int retries
= sysctl_tcp_orphan_retries
; /* May be zero. */
143 /* We know from an ICMP that something is wrong. */
144 if (sk
->sk_err_soft
&& !alive
)
147 /* However, if socket sent something recently, select some safe
148 * number of retries. 8 corresponds to >100 seconds with minimal
150 if (retries
== 0 && alive
)
155 /* A write timeout has occurred. Process the after effects.
 *
 * What this listing shows:
 *  - in SYN_SENT / SYN_RECV, dst_negative_advice() appears and retry_until
 *    becomes tp->syn_retries if non-zero, else sysctl_tcp_syn_retries;
 *  - otherwise, once tp->retransmits >= sysctl_tcp_retries1,
 *    dst_negative_advice() is called, and retry_until starts as
 *    sysctl_tcp_retries2;
 *  - for a SOCK_DEAD socket, alive = (tp->rto < TCP_RTO_MAX), retry_until
 *    comes from tcp_orphan_retries(), and tcp_out_of_resources() is
 *    consulted;
 *  - finally tp->retransmits is compared with retry_until.
 *
 * NOTE(review): original lines 162, 165, 197-200 and 203 onwards are
 * absent, so any guard on the first dst_negative_advice() call, the
 * actions taken on the last two tests and the return values cannot be
 * read from this listing. */
156 static int tcp_write_timeout(struct sock
*sk
)
158 struct tcp_sock
*tp
= tcp_sk(sk
);
161 if ((1 << sk
->sk_state
) & (TCPF_SYN_SENT
| TCPF_SYN_RECV
)) {
163 dst_negative_advice(&sk
->sk_dst_cache
);
164 retry_until
= tp
->syn_retries
? : sysctl_tcp_syn_retries
;
166 if (tp
->retransmits
>= sysctl_tcp_retries1
) {
167 /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black
170 It is place to make it. It is not made. I do not want
171 to make it. It is disgusting. It does not work in any
172 case. Let me to cite the same draft, which requires for
173 us to implement this:
175 "The one security concern raised by this memo is that ICMP black holes
176 are often caused by over-zealous security administrators who block
177 all ICMP messages. It is vitally important that those who design and
178 deploy security systems understand the impact of strict filtering on
179 upper-layer protocols. The safest web site in the world is worthless
180 if most TCP implementations cannot transfer data from it. It would
181 be far nicer to have all of the black holes fixed rather than fixing
182 all of the TCP implementations."
187 dst_negative_advice(&sk
->sk_dst_cache
);
190 retry_until
= sysctl_tcp_retries2
;
191 if (sock_flag(sk
, SOCK_DEAD
)) {
192 int alive
= (tp
->rto
< TCP_RTO_MAX
);
194 retry_until
= tcp_orphan_retries(sk
, alive
);
196 if (tcp_out_of_resources(sk
, alive
|| tp
->retransmits
< retry_until
))
201 if (tp
->retransmits
>= retry_until
) {
202 /* Has it gone just too far? */
/*
 * tcp_delack_timer - handler of tp->delack_timer (delayed ACK).
 * @data: the struct sock pointer stored by tcp_init_xmit_timers(), cast
 *        to unsigned long.
 *
 * What this listing shows, in order:
 *  - if the socket is owned by user context, DELAYEDACKLOCKED is counted
 *    and the timer is re-armed for jiffies + TCP_DELACK_MIN;
 *  - sk_stream_mem_reclaim() is called;
 *  - a test for "state is TCP_CLOSE or TCP_ACK_TIMER not pending";
 *  - if tp->ack.timeout is still in the future the timer is re-armed for
 *    that time;
 *  - TCP_ACK_TIMER is cleared from tp->ack.pending;
 *  - a non-empty prequeue is counted in TCPSCHEDULERFAILED, every skb is
 *    dequeued and handed to sk->sk_backlog_rcv(), and tp->ucopy.memory is
 *    zeroed;
 *  - when an ACK is scheduled: outside pingpong mode ato is doubled but
 *    capped at tp->rto; the other pair of assignments (pingpong = 0,
 *    ato = TCP_ATO_MIN) follows the "leave pingpong mode" comment;
 *    DELAYEDACKS is counted;
 *  - under tcp_memory_pressure, sk_stream_mem_reclaim() is called again.
 *
 * NOTE(review): locking, the early exits, the branch separator at original
 * line 250 and the statement(s) at lines 256-257 are absent from this
 * listing -- verify against the upstream file.
 */
209 static void tcp_delack_timer(unsigned long data
)
211 struct sock
*sk
= (struct sock
*)data
;
212 struct tcp_sock
*tp
= tcp_sk(sk
);
215 if (sock_owned_by_user(sk
)) {
216 /* Try again later. */
218 NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED
);
219 sk_reset_timer(sk
, &tp
->delack_timer
, jiffies
+ TCP_DELACK_MIN
);
223 sk_stream_mem_reclaim(sk
);
225 if (sk
->sk_state
== TCP_CLOSE
|| !(tp
->ack
.pending
& TCP_ACK_TIMER
))
228 if (time_after(tp
->ack
.timeout
, jiffies
)) {
229 sk_reset_timer(sk
, &tp
->delack_timer
, tp
->ack
.timeout
);
232 tp
->ack
.pending
&= ~TCP_ACK_TIMER
;
234 if (skb_queue_len(&tp
->ucopy
.prequeue
)) {
237 NET_ADD_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED
,
238 skb_queue_len(&tp
->ucopy
.prequeue
));
240 while ((skb
= __skb_dequeue(&tp
->ucopy
.prequeue
)) != NULL
)
241 sk
->sk_backlog_rcv(sk
, skb
);
243 tp
->ucopy
.memory
= 0;
246 if (tcp_ack_scheduled(tp
)) {
247 if (!tp
->ack
.pingpong
) {
248 /* Delayed ACK missed: inflate ATO. */
249 tp
->ack
.ato
= min(tp
->ack
.ato
<< 1, tp
->rto
);
251 /* Delayed ACK missed: leave pingpong mode and
254 tp
->ack
.pingpong
= 0;
255 tp
->ack
.ato
= TCP_ATO_MIN
;
258 NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKS
);
263 if (tcp_memory_pressure
)
264 sk_stream_mem_reclaim(sk
);
/*
 * tcp_probe_timer - zero-window probe timeout processing for a socket.
 * @sk: socket whose probe timer expired.
 *
 * What this listing shows:
 *  - an initial test for "packets in flight or nothing to send";
 *  - max_probes starts as sysctl_tcp_retries2;
 *  - for a SOCK_DEAD socket, alive = ((tp->rto << tp->backoff) <
 *    TCP_RTO_MAX), max_probes is replaced by tcp_orphan_retries(), and
 *    tcp_out_of_resources() is consulted with
 *    (alive || tp->probes_out <= max_probes);
 *  - finally tp->probes_out is compared with max_probes.
 *
 * NOTE(review): the statements controlled by the first, third and fourth
 * tests and the probe transmission itself (original lines 276-279, 303-305
 * and 307 onwards) are absent from this listing.
 */
270 static void tcp_probe_timer(struct sock
*sk
)
272 struct tcp_sock
*tp
= tcp_sk(sk
);
275 if (tp
->packets_out
|| !sk
->sk_send_head
) {
280 /* *WARNING* RFC 1122 forbids this
282 * It doesn't AFAIK, because we kill the retransmit timer -AK
284 * FIXME: We ought not to do it, Solaris 2.5 actually has fixing
285 * this behaviour in Solaris down as a bug fix. [AC]
287 * Let me to explain. probes_out is zeroed by incoming ACKs
288 * even if they advertise zero window. Hence, connection is killed only
289 * if we received no ACKs for normal connection timeout. It is not killed
290 * only because window stays zero for some time, window may be zero
291 * until armageddon and even later. We are in full accordance
292 * with RFCs, only probe timer combines both retransmission timeout
293 * and probe timeout in one bottle. --ANK
295 max_probes
= sysctl_tcp_retries2
;
297 if (sock_flag(sk
, SOCK_DEAD
)) {
298 int alive
= ((tp
->rto
<<tp
->backoff
) < TCP_RTO_MAX
);
300 max_probes
= tcp_orphan_retries(sk
, alive
);
302 if (tcp_out_of_resources(sk
, alive
|| tp
->probes_out
<= max_probes
))
306 if (tp
->probes_out
> max_probes
) {
309 /* Only send another probe if we didn't close things up. */
315 * The TCP retransmit timer.
/*
 * tcp_retransmit_timer - retransmission timeout processing for a socket.
 * @sk: socket whose retransmit timer expired.
 *
 * What this listing shows, in order:
 *  - a test for "no packets in flight", then BUG_TRAP asserting that the
 *    write queue is not empty;
 *  - zero-window case (snd_wnd == 0, socket not SOCK_DEAD, not in
 *    SYN_SENT / SYN_RECV): a rate-limited KERN_DEBUG message about the
 *    peer shrinking its window is printed, tp->rcv_tstamp is compared with
 *    TCP_RTO_MAX, then tcp_enter_loss() and tcp_retransmit_skb() on the
 *    head of the write queue appear, followed by goto out_reset_timer;
 *  - tcp_write_timeout() is consulted;
 *  - on the first retransmission (tp->retransmits == 0) one of the
 *    SACK / Reno / Loss / Timeouts MIB counters is incremented according
 *    to tp->ca_state and tp->rx_opt.sack_ok;
 *  - tcp_use_frto() is tested and tcp_enter_loss() appears;
 *  - if tcp_retransmit_skb() returns > 0 the timer is re-armed with
 *    min(tp->rto, TCP_RESOURCE_PROBE_INTERVAL);
 *  - tp->rto is doubled, capped at TCP_RTO_MAX, and the timer is re-armed
 *    with the new value;
 *  - tp->retransmits is compared with sysctl_tcp_retries1.
 *
 * NOTE(review): many original lines are absent (early exits, else
 * branches, the out_reset_timer label, counter updates of
 * tp->retransmits/backoff if any), so the exact branch structure cannot
 * be read from this listing -- verify against the upstream file.
 */
318 static void tcp_retransmit_timer(struct sock
*sk
)
320 struct tcp_sock
*tp
= tcp_sk(sk
);
322 if (!tp
->packets_out
)
325 BUG_TRAP(!skb_queue_empty(&sk
->sk_write_queue
));
327 if (!tp
->snd_wnd
&& !sock_flag(sk
, SOCK_DEAD
) &&
328 !((1 << sk
->sk_state
) & (TCPF_SYN_SENT
| TCPF_SYN_RECV
))) {
329 /* Receiver dastardly shrinks window. Our retransmits
330 * become zero probes, but we should not timeout this
331 * connection. If the socket is an orphan, time it out,
332 * we cannot allow such beasts to hang infinitely.
335 if (net_ratelimit()) {
336 struct inet_sock
*inet
= inet_sk(sk
);
337 printk(KERN_DEBUG
"TCP: Treason uncloaked! Peer %u.%u.%u.%u:%u/%u shrinks window %u:%u. Repaired.\n",
338 NIPQUAD(inet
->daddr
), htons(inet
->dport
),
339 inet
->num
, tp
->snd_una
, tp
->snd_nxt
);
342 if (tcp_time_stamp
- tp
->rcv_tstamp
> TCP_RTO_MAX
) {
346 tcp_enter_loss(sk
, 0);
347 tcp_retransmit_skb(sk
, skb_peek(&sk
->sk_write_queue
));
349 goto out_reset_timer
;
352 if (tcp_write_timeout(sk
))
355 if (tp
->retransmits
== 0) {
356 if (tp
->ca_state
== TCP_CA_Disorder
|| tp
->ca_state
== TCP_CA_Recovery
) {
357 if (tp
->rx_opt
.sack_ok
) {
358 if (tp
->ca_state
== TCP_CA_Recovery
)
359 NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERYFAIL
);
361 NET_INC_STATS_BH(LINUX_MIB_TCPSACKFAILURES
);
363 if (tp
->ca_state
== TCP_CA_Recovery
)
364 NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERYFAIL
);
366 NET_INC_STATS_BH(LINUX_MIB_TCPRENOFAILURES
);
368 } else if (tp
->ca_state
== TCP_CA_Loss
) {
369 NET_INC_STATS_BH(LINUX_MIB_TCPLOSSFAILURES
);
371 NET_INC_STATS_BH(LINUX_MIB_TCPTIMEOUTS
);
375 if (tcp_use_frto(sk
)) {
378 tcp_enter_loss(sk
, 0);
381 if (tcp_retransmit_skb(sk
, skb_peek(&sk
->sk_write_queue
)) > 0) {
382 /* Retransmission failed because of local congestion,
385 if (!tp
->retransmits
)
387 tcp_reset_xmit_timer(sk
, TCP_TIME_RETRANS
,
388 min(tp
->rto
, TCP_RESOURCE_PROBE_INTERVAL
));
392 /* Increase the timeout each time we retransmit. Note that
393 * we do not increase the rtt estimate. rto is initialized
394 * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests
395 * that doubling rto each time is the least we can get away with.
396 * In KA9Q, Karn uses this for the first few times, and then
397 * goes to quadratic. netBSD doubles, but only goes up to *64,
398 * and clamps at 1 to 64 sec afterwards. Note that 120 sec is
399 * defined in the protocol as the maximum possible RTT. I guess
400 * we'll have to use something other than TCP to talk to the
401 * University of Mars.
403 * PAWS allows us longer timeouts and large windows, so once
404 * implemented ftp to mars will work nicely. We will have to fix
405 * the 120 second clamps though!
411 tp
->rto
= min(tp
->rto
<< 1, TCP_RTO_MAX
);
412 tcp_reset_xmit_timer(sk
, TCP_TIME_RETRANS
, tp
->rto
);
413 if (tp
->retransmits
> sysctl_tcp_retries1
)
/*
 * tcp_write_timer - handler of tp->retransmit_timer.
 * @data: the struct sock pointer stored by tcp_init_xmit_timers(), cast
 *        to unsigned long.
 *
 * What this listing shows:
 *  - if the socket is owned by user context the timer is re-armed for
 *    jiffies + HZ/20;
 *  - a test for "state is TCP_CLOSE or nothing pending";
 *  - if tp->timeout is still in the future the timer is re-armed for that
 *    time;
 *  - a dispatch with cases TCP_TIME_RETRANS (calls tcp_retransmit_timer())
 *    and TCP_TIME_PROBE0 (body not shown);
 *  - sk_stream_mem_reclaim() is called.
 *
 * NOTE(review): locking, early exits, the switch header and the
 * TCP_TIME_PROBE0 body are absent from this listing.
 */
419 static void tcp_write_timer(unsigned long data
)
421 struct sock
*sk
= (struct sock
*)data
;
422 struct tcp_sock
*tp
= tcp_sk(sk
);
426 if (sock_owned_by_user(sk
)) {
427 /* Try again later */
428 sk_reset_timer(sk
, &tp
->retransmit_timer
, jiffies
+ (HZ
/ 20));
432 if (sk
->sk_state
== TCP_CLOSE
|| !tp
->pending
)
435 if (time_after(tp
->timeout
, jiffies
)) {
436 sk_reset_timer(sk
, &tp
->retransmit_timer
, tp
->timeout
);
444 case TCP_TIME_RETRANS
:
445 tcp_retransmit_timer(sk
);
447 case TCP_TIME_PROBE0
:
454 sk_stream_mem_reclaim(sk
);
461 * Timer for listening sockets
/*
 * tcp_synack_timer - periodic scan of a listening socket's SYN queue.
 * @sk: listening socket (called from tcp_keepalive_timer() when the state
 *      is TCP_LISTEN).
 *
 * What this listing shows:
 *  - max_retries is tp->syn_retries if non-zero, else
 *    sysctl_tcp_synack_retries; thresh starts equal to it;
 *  - a test for "no listen_opt or empty queue";
 *  - when qlen >> (max_qlen_log - 1) is non-zero, young = 2 * qlen_young
 *    is computed and compared with qlen (the threshold adjustment itself
 *    is not shown);
 *  - a non-zero tp->defer_accept replaces max_retries;
 *  - budget = 2 * (TCP_SYNQ_HSIZE / (TCP_TIMEOUT_INIT / TCP_SYNQ_INTERVAL))
 *    and scanning starts at lopt->clock_hand;
 *  - for each expired request in a bucket: if retrans < thresh, or the
 *    request was acked and retrans < max_retries, and rtx_syn_ack()
 *    returns zero, retrans is incremented and expires is moved to
 *    now + timeo; the drop path unlinks the request under
 *    tp->syn_wait_lock and frees it with tcp_openreq_free();
 *  - the bucket index advances modulo TCP_SYNQ_HSIZE while --budget > 0,
 *    clock_hand is stored back, and the timer is re-armed with
 *    TCP_SYNQ_INTERVAL.
 *
 * NOTE(review): several original lines are absent (declarations of
 * budget/i/timeo, the second argument of min() for timeo, queue length
 * accounting, the "do" keyword and the branch separators) -- verify
 * against the upstream file.
 */
464 static void tcp_synack_timer(struct sock
*sk
)
466 struct tcp_sock
*tp
= tcp_sk(sk
);
467 struct tcp_listen_opt
*lopt
= tp
->listen_opt
;
468 int max_retries
= tp
->syn_retries
? : sysctl_tcp_synack_retries
;
469 int thresh
= max_retries
;
470 unsigned long now
= jiffies
;
471 struct open_request
**reqp
, *req
;
474 if (lopt
== NULL
|| lopt
->qlen
== 0)
477 /* Normally all the openreqs are young and become mature
478 * (i.e. converted to established socket) for first timeout.
479 * If synack was not acknowledged for 3 seconds, it means
480 * one of the following things: synack was lost, ack was lost,
481 * rtt is high or nobody planned to ack (i.e. synflood).
482 * When server is a bit loaded, queue is populated with old
483 * open requests, reducing effective size of queue.
484 * When server is well loaded, queue size reduces to zero
485 * after several minutes of work. It is not synflood,
486 * it is normal operation. The solution is pruning
487 * too old entries overriding normal timeout, when
488 * situation becomes dangerous.
490 * Essentially, we reserve half of room for young
491 * embryos; and abort old ones without pity, if old
492 * ones are about to clog our table.
494 if (lopt
->qlen
>>(lopt
->max_qlen_log
-1)) {
495 int young
= (lopt
->qlen_young
<<1);
498 if (lopt
->qlen
< young
)
505 if (tp
->defer_accept
)
506 max_retries
= tp
->defer_accept
;
508 budget
= 2*(TCP_SYNQ_HSIZE
/(TCP_TIMEOUT_INIT
/TCP_SYNQ_INTERVAL
));
509 i
= lopt
->clock_hand
;
512 reqp
=&lopt
->syn_table
[i
];
513 while ((req
= *reqp
) != NULL
) {
514 if (time_after_eq(now
, req
->expires
)) {
515 if ((req
->retrans
< thresh
||
516 (req
->acked
&& req
->retrans
< max_retries
))
517 && !req
->class->rtx_syn_ack(sk
, req
, NULL
)) {
520 if (req
->retrans
++ == 0)
522 timeo
= min((TCP_TIMEOUT_INIT
<< req
->retrans
),
524 req
->expires
= now
+ timeo
;
525 reqp
= &req
->dl_next
;
529 /* Drop this request */
530 write_lock(&tp
->syn_wait_lock
);
531 *reqp
= req
->dl_next
;
532 write_unlock(&tp
->syn_wait_lock
);
534 if (req
->retrans
== 0)
536 tcp_openreq_free(req
);
539 reqp
= &req
->dl_next
;
542 i
= (i
+1)&(TCP_SYNQ_HSIZE
-1);
544 } while (--budget
> 0);
546 lopt
->clock_hand
= i
;
549 tcp_reset_keepalive_timer(sk
, TCP_SYNQ_INTERVAL
);
/*
 * tcp_delete_keepalive_timer - stop the socket's sk_timer.
 * @sk: socket whose timer is stopped via sk_stop_timer().
 *
 * sk_timer is the timer that tcp_init_xmit_timers() binds to
 * tcp_keepalive_timer().
 */
552 void tcp_delete_keepalive_timer (struct sock
*sk
)
554 sk_stop_timer(sk
, &sk
->sk_timer
);
/*
 * tcp_reset_keepalive_timer - (re)arm the socket's sk_timer.
 * @sk:  socket whose timer is armed.
 * @len: delay added to the current jiffies value to form the expiry time
 *       passed to sk_reset_timer().
 */
557 void tcp_reset_keepalive_timer (struct sock
*sk
, unsigned long len
)
559 sk_reset_timer(sk
, &sk
->sk_timer
, jiffies
+ len
);
/*
 * tcp_set_keepalive - react to keepalive being switched on or off.
 * @sk:  socket being configured.
 * @val: requested setting; non-zero asks for keepalive.
 *
 * What this listing shows:
 *  - a test for the socket being in TCP_CLOSE or TCP_LISTEN;
 *  - when val is non-zero and SOCK_KEEPOPEN is not yet set, the keepalive
 *    timer is armed with keepalive_time_when();
 *  - tcp_delete_keepalive_timer() appears after that.
 *
 * NOTE(review): original lines 565-566 and 569 are absent, so the action
 * taken for CLOSE/LISTEN sockets and the condition guarding the timer
 * deletion cannot be read from this listing.
 */
562 void tcp_set_keepalive(struct sock
*sk
, int val
)
564 if ((1 << sk
->sk_state
) & (TCPF_CLOSE
| TCPF_LISTEN
))
567 if (val
&& !sock_flag(sk
, SOCK_KEEPOPEN
))
568 tcp_reset_keepalive_timer(sk
, keepalive_time_when(tcp_sk(sk
)));
570 tcp_delete_keepalive_timer(sk
);
/*
 * tcp_keepalive_timer - handler of sk->sk_timer.
 * @data: the struct sock pointer stored by tcp_init_xmit_timers(), cast
 *        to unsigned long.
 *
 * What this listing shows, in order:
 *  - if the socket is owned by user context the timer is re-armed for
 *    HZ/20;
 *  - a TCP_LISTEN socket is handed to tcp_synack_timer();
 *  - for a SOCK_DEAD socket in TCP_FIN_WAIT2 with linger2 >= 0,
 *    tmo = tcp_fin_time(tp) - TCP_TIMEWAIT_LEN is computed and
 *    tcp_time_wait(sk, TCP_FIN_WAIT2, tmo) appears; tcp_send_active_reset()
 *    appears after that;
 *  - a test for "SOCK_KEEPOPEN not set or state is TCP_CLOSE";
 *  - elapsed starts as keepalive_time_when(tp); a test for data in flight
 *    or queued follows; elapsed then becomes tcp_time_stamp -
 *    tp->rcv_tstamp;
 *  - once elapsed >= keepalive_time_when(tp): if probes_out has reached
 *    tp->keepalive_probes (or sysctl_tcp_keepalive_probes when the
 *    per-socket value is zero) an active reset is sent; otherwise
 *    tcp_write_wakeup() is tried and elapsed becomes
 *    keepalive_intvl_when(tp) when it returns <= 0, with
 *    TCP_RESOURCE_PROBE_INTERVAL as the alternative value;
 *  - otherwise elapsed becomes keepalive_time_when(tp) - elapsed;
 *  - sk_stream_mem_reclaim() is called and the timer is re-armed with
 *    elapsed.
 *
 * NOTE(review): locking, gotos/labels, else keywords and several
 * statements (for example original lines 596-597, 599-601 and 621-623)
 * are absent from this listing -- verify against the upstream file.
 */
574 static void tcp_keepalive_timer (unsigned long data
)
576 struct sock
*sk
= (struct sock
*) data
;
577 struct tcp_sock
*tp
= tcp_sk(sk
);
580 /* Only process if socket is not in use. */
582 if (sock_owned_by_user(sk
)) {
583 /* Try again later. */
584 tcp_reset_keepalive_timer (sk
, HZ
/20);
588 if (sk
->sk_state
== TCP_LISTEN
) {
589 tcp_synack_timer(sk
);
593 if (sk
->sk_state
== TCP_FIN_WAIT2
&& sock_flag(sk
, SOCK_DEAD
)) {
594 if (tp
->linger2
>= 0) {
595 int tmo
= tcp_fin_time(tp
) - TCP_TIMEWAIT_LEN
;
598 tcp_time_wait(sk
, TCP_FIN_WAIT2
, tmo
);
602 tcp_send_active_reset(sk
, GFP_ATOMIC
);
606 if (!sock_flag(sk
, SOCK_KEEPOPEN
) || sk
->sk_state
== TCP_CLOSE
)
609 elapsed
= keepalive_time_when(tp
);
611 /* It is alive without keepalive 8) */
612 if (tp
->packets_out
|| sk
->sk_send_head
)
615 elapsed
= tcp_time_stamp
- tp
->rcv_tstamp
;
617 if (elapsed
>= keepalive_time_when(tp
)) {
618 if ((!tp
->keepalive_probes
&& tp
->probes_out
>= sysctl_tcp_keepalive_probes
) ||
619 (tp
->keepalive_probes
&& tp
->probes_out
>= tp
->keepalive_probes
)) {
620 tcp_send_active_reset(sk
, GFP_ATOMIC
);
624 if (tcp_write_wakeup(sk
) <= 0) {
626 elapsed
= keepalive_intvl_when(tp
);
628 /* If keepalive was lost due to local congestion,
631 elapsed
= TCP_RESOURCE_PROBE_INTERVAL
;
634 /* It is tp->rcv_tstamp + keepalive_time_when(tp) */
635 elapsed
= keepalive_time_when(tp
) - elapsed
;
639 sk_stream_mem_reclaim(sk
);
642 tcp_reset_keepalive_timer (sk
, elapsed
);
/*
 * Symbols from this file exported with EXPORT_SYMBOL: the timer
 * setup/teardown helpers and the keepalive timer helpers defined above.
 */
653 EXPORT_SYMBOL(tcp_clear_xmit_timers
);
654 EXPORT_SYMBOL(tcp_delete_keepalive_timer
);
655 EXPORT_SYMBOL(tcp_init_xmit_timers
);
656 EXPORT_SYMBOL(tcp_reset_keepalive_timer
);