 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol (TCP).
 *
 * Version:	$Id: tcp.c,v 1.151 1999/09/07 02:31:21 davem Exp $
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *		Alan Cox	:	Numerous verify_area() calls
 *		Alan Cox	:	Set the ACK bit on a reset
 *		Alan Cox	:	Stopped it crashing if it closed while
 *					sk->inuse=1 and was trying to connect
 *		Alan Cox	:	All icmp error handling was broken;
 *					pointers passed were wrong and the
 *					socket was looked up backwards. Nobody
 *					tested any icmp error code obviously.
 *		Alan Cox	:	tcp_err() now handled properly. It
 *					wakes people on errors. poll
 *					behaves and the icmp error race
 *					has gone by moving it into sock.c
 *		Alan Cox	:	tcp_send_reset() fixed to work for
 *					everything not just packets for
 *		Alan Cox	:	tcp option processing.
 *		Alan Cox	:	Reset tweaked (still not 100%) [Had
 *		Herp Rosmanith	:	More reset fixes
 *		Alan Cox	:	No longer acks invalid rst frames.
 *					Acking any kind of RST is right out.
 *		Alan Cox	:	Sets an ignore me flag on an rst
 *					receive otherwise odd bits of prattle
 *		Alan Cox	:	Fixed another acking RST frame bug.
 *					Should stop LAN workplace lockups.
 *		Alan Cox	:	Some tidyups using the new skb list
 *		Alan Cox	:	sk->keepopen now seems to work
 *		Alan Cox	:	Pulls options out correctly on accepts
 *		Alan Cox	:	Fixed assorted sk->rqueue->next errors
 *		Alan Cox	:	PSH doesn't end a TCP read. Switched a
 *		Alan Cox	:	Tidied tcp_data to avoid a potential
 *		Alan Cox	:	Added some better commenting, as the
 *					tcp is hard to follow
 *		Alan Cox	:	Removed incorrect check for 20 * psh
 *		Michael O'Reilly:	ack < copied bug fix.
 *		Johannes Stille	:	Misc tcp fixes (not all in yet).
 *		Alan Cox	:	FIN with no memory -> CRASH
 *		Alan Cox	:	Added socket option proto entries.
 *					Also added awareness of them to accept.
 *		Alan Cox	:	Added TCP options (SOL_TCP)
 *		Alan Cox	:	Switched wakeup calls to callbacks,
 *					so the kernel can layer network
 *		Alan Cox	:	Use ip_tos/ip_ttl settings.
 *		Alan Cox	:	Handle FIN (more) properly (we hope).
 *		Alan Cox	:	RST frames sent on unsynchronised
 *		Alan Cox	:	Put in missing check for SYN bit.
 *		Alan Cox	:	Added tcp_select_window() aka NET2E
 *					window non shrink trick.
 *		Alan Cox	:	Added a couple of small NET2E timer
 *		Charles Hedrick	:	TCP fixes
 *		Toomas Tamm	:	TCP window fixes
 *		Alan Cox	:	Small URG fix to rlogin ^C ack fight
 *		Charles Hedrick	:	Rewrote most of it to actually work
 *		Linus		:	Rewrote tcp_read() and URG handling
 *		Gerhard Koerting:	Fixed some missing timer handling
 *		Matthew Dillon	:	Reworked TCP machine states as per RFC
 *		Gerhard Koerting:	PC/TCP workarounds
 *		Adam Caldwell	:	Assorted timer/timing errors
 *		Matthew Dillon	:	Fixed another RST bug
 *		Alan Cox	:	Move to kernel side addressing changes.
 *		Alan Cox	:	Beginning work on TCP fastpathing
 *		Arnt Gulbrandsen:	Turbocharged tcp_check() routine.
 *		Alan Cox	:	TCP fast path debugging
 *		Alan Cox	:	Window clamping
 *		Michael Riepe	:	Bug in tcp_check()
 *		Matt Dillon	:	More TCP improvements and RST bug fixes
 *		Matt Dillon	:	Yet more small nasties removed from the
 *					TCP code (Be very nice to this man if
 *					tcp finally works 100%) 8)
 *		Alan Cox	:	BSD accept semantics.
 *		Alan Cox	:	Reset on closedown bug.
 *		Peter De Schrijver:	ENOTCONN check missing in tcp_sendto().
 *		Michael Pall	:	Handle poll() after URG properly in
 *		Michael Pall	:	Undo the last fix in tcp_read_urg()
 *					(multi URG PUSH broke rlogin).
 *		Michael Pall	:	Fix the multi URG PUSH problem in
 *					tcp_readable(), poll() after URG
 *		Michael Pall	:	recv(...,MSG_OOB) never blocks in the
 *		Alan Cox	:	Changed the semantics of sk->socket to
 *					fix a race and a signal problem with
 *					accept() and async I/O.
 *		Alan Cox	:	Relaxed the rules on tcp_sendto().
 *		Yury Shevchuk	:	Really fixed accept() blocking problem.
 *		Craig I. Hagan	:	Allow for BSD compatible TIME_WAIT for
 *					clients/servers which listen in on
 *		Alan Cox	:	Cleaned the above up and shrank it to
 *					a sensible code size.
 *		Alan Cox	:	Self connect lockup fix.
 *		Alan Cox	:	No connect to multicast.
 *		Ross Biro	:	Close unaccepted children on master
 *		Alan Cox	:	Reset tracing code.
 *		Alan Cox	:	Spurious resets on shutdown.
 *		Alan Cox	:	Giant 15 minute/60 second timer error
 *		Alan Cox	:	Small whoops in polling before an
 *		Alan Cox	:	Kept the state trace facility since
 *					it's handy for debugging.
 *		Alan Cox	:	More reset handler fixes.
 *		Alan Cox	:	Started rewriting the code based on
 *					the RFC's for other useful protocol
 *					references see: Comer, KA9Q NOS, and
 *					for a reference on the difference
 *					between specifications and how BSD
 *					works see the 4.4lite source.
 *		A.N.Kuznetsov	:	Don't time wait on completion of tidy
 *		Linus Torvalds	:	Fin/Shutdown & copied_seq changes.
 *		Linus Torvalds	:	Fixed BSD port reuse to work first syn
 *		Alan Cox	:	Reimplemented timers as per the RFC
 *					and using multiple timers for sanity.
 *		Alan Cox	:	Small bug fixes, and a lot of new
 *		Alan Cox	:	Fixed dual reader crash by locking
 *					the buffers (much like datagram.c)
 *		Alan Cox	:	Fixed stuck sockets in probe. A probe
 *					now gets fed up of retrying without
 *					(even a no space) answer.
 *		Alan Cox	:	Extracted closing code better
 *		Alan Cox	:	Fixed the closing state machine to
 *		Alan Cox	:	More 'per spec' fixes.
 *		Jorge Cwik	:	Even faster checksumming.
 *		Alan Cox	:	tcp_data() doesn't ack illegal PSH
 *					only frames. At least one pc tcp stack
 *		Alan Cox	:	Cache last socket.
 *		Alan Cox	:	Per route irtt.
 *		Matt Day	:	poll()->select() match BSD precisely on error
 *		Alan Cox	:	New buffers
 *		Marc Tamsky	:	Various sk->prot->retransmits and
 *					sk->retransmits misupdating fixed.
 *					Fixed tcp_write_timeout: stuck close,
 *					and TCP syn retries gets used now.
 *		Mark Yarvis	:	In tcp_read_wakeup(), don't send an
 *					ack if state is TCP_CLOSED.
 *		Alan Cox	:	Look up device on a retransmit - routes may
 *					change. Doesn't yet cope with MSS shrink right
 *		Marc Tamsky	:	Closing in closing fixes.
 *		Mike Shaver	:	RFC1122 verifications.
 *		Alan Cox	:	rcv_saddr errors.
 *		Alan Cox	:	Block double connect().
 *		Alan Cox	:	Small hooks for enSKIP.
 *		Alexey Kuznetsov:	Path MTU discovery.
 *		Alan Cox	:	Support soft errors.
 *		Alan Cox	:	Fix MTU discovery pathological case
 *					when the remote claims no mtu!
 *		Marc Tamsky	:	TCP_CLOSE fix.
 *		Colin (G3TNE)	:	Send a reset on syn ack replies in
 *					window but wrong (fixes NT lpd problems)
 *		Pedro Roque	:	Better TCP window handling, delayed ack.
 *		Joerg Reuter	:	No modification of locked buffers in
 *					tcp_do_retransmit()
 *		Eric Schenk	:	Changed receiver side silly window
 *					avoidance algorithm to BSD style
 *					algorithm. This doubles throughput
 *					against machines running Solaris,
 *					and seems to result in general
 *		Stefan Magdalinski:	Adjusted tcp_readable() to fix FIONREAD
 *		Willy Konynenberg:	Transparent proxying support.
 *		Mike McLagan	:	Routing by source
 *		Keith Owens	:	Do proper merging with partial SKB's in
 *					tcp_do_sendmsg to avoid burstiness.
 *		Eric Schenk	:	Fix fast close down bug with
 *					shutdown() followed by close().
 *		Andi Kleen	:	Make poll agree with SIGIO
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 * Description of States:
 *
 *	TCP_SYN_SENT		sent a connection request, waiting for ack
 *
 *	TCP_SYN_RECV		received a connection request, sent ack,
 *				waiting for final ack in three-way handshake.
 *
 *	TCP_ESTABLISHED		connection established
 *
 *	TCP_FIN_WAIT1		our side has shutdown, waiting to complete
 *				transmission of remaining buffered data
 *
 *	TCP_FIN_WAIT2		all buffered data sent, waiting for remote
 *
 *	TCP_CLOSING		both sides have shutdown but we still have
 *				data we have to finish sending
 *
 *	TCP_TIME_WAIT		timeout to catch resent junk before entering
 *				closed, can only be entered from FIN_WAIT2
 *				or CLOSING.  Required because the other end
 *				may not have gotten our last ACK causing it
 *				to retransmit the data packet (which we ignore)
 *
 *	TCP_CLOSE_WAIT		remote side has shutdown and is waiting for
 *				us to finish writing our data and to shutdown
 *				(we have to close() to move on to LAST_ACK)
 *
 *	TCP_LAST_ACK		our side has shutdown after remote has
 *				shutdown.  There may still be data in our
 *				buffer that we have to finish sending
 *
 *	TCP_CLOSE		socket is finished
 * NOTE: I'm not going to be doing comments in the code for this one except
 * for violations and the like.  tcp.c is just too big...  If I say something
 * "does?" or "doesn't?", it means I'm not sure, and will have to hash it out
 * with Alan. -- MS 950903
 * [Note: Most of the TCP code has been rewritten/redesigned since this
 *  RFC1122 check.  It is probably not correct anymore.  It should be redone
 *	Use of PSH (4.2.2.2)
 *	   MAY aggregate data sent without the PSH flag. (does)
 *	   MAY queue data received without the PSH flag. (does)
 *	   SHOULD collapse successive PSH flags when it packetizes data. (doesn't)
 *	   MAY implement PSH on send calls. (doesn't, thus:)
 *	     MUST NOT buffer data indefinitely (doesn't [1 second])
 *	     MUST set PSH on last segment (does)
 *	   MAY pass received PSH to application layer (doesn't)
 *	   SHOULD send maximum-sized segment whenever possible. (almost always does)
 *
 *	Window Size (4.2.2.3, 4.2.2.16)
 *	   MUST treat window size as an unsigned number (does)
 *	   SHOULD treat window size as a 32-bit number (does not)
 *	   MUST NOT shrink window once it is offered (does not normally)
 *
 *	Urgent Pointer (4.2.2.4)
 *	   **MUST point urgent pointer to last byte of urgent data (not right
 *	     after). (doesn't, to be like BSD. That's configurable, but defaults
 *	   MUST inform application layer asynchronously of incoming urgent
 *	   MUST provide application with means of determining the amount of
 *	     urgent data pending. (does)
 *	   **MUST support urgent data sequence of arbitrary length. (doesn't, but
 *	     it's sort of tricky to fix, as urg_ptr is a 16-bit quantity)
 *	     [Follows BSD 1 byte of urgent data]
 *
 *	TCP Options (4.2.2.5)
 *	   MUST be able to receive TCP options in any segment. (does)
 *	   MUST ignore unsupported options (does)
 *
 *	Maximum Segment Size Option (4.2.2.6)
 *	   MUST implement both sending and receiving MSS. (does, but currently
 *	     only uses the smaller of both of them)
 *	   SHOULD send an MSS with every SYN where receive MSS != 536 (MAY send
 *	     it always). (does, even when MSS == 536, which is legal)
 *	   MUST assume MSS == 536 if no MSS received at connection setup (does)
 *	   MUST calculate "effective send MSS" correctly:
 *	     min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts)
 *	     (does - but allows operator override)
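 *	     (Illustrative worked example, not part of the RFC text: with a
 *	     1500 byte Ethernet MTU, a received remote MSS of 1460, a plain
 *	     20 byte TCP header and no IP options, the rule above gives
 *	     min(1500, 1460+20) - 20 - 0 = 1460 bytes of payload per segment.)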
 *	TCP Checksum (4.2.2.7)
 *	   MUST generate and check TCP checksum. (does)
 *
 *	Initial Sequence Number Selection (4.2.2.8)
 *	   MUST use the RFC 793 clock selection mechanism. (doesn't, but it's
 *	     OK: RFC 793 specifies a 250KHz clock, while we use 1MHz, which is
 *	     necessary for 10Mbps networks - and harder than BSD to spoof!
 *	     With syncookies we don't)
 *
 *	Simultaneous Open Attempts (4.2.2.10)
 *	   MUST support simultaneous open attempts (does)
 *
 *	Recovery from Old Duplicate SYN (4.2.2.11)
 *	   MUST keep track of active vs. passive open (does)
 *
 *	RST segment (4.2.2.12)
 *	   SHOULD allow an RST segment to contain data (does, but doesn't do
 *	     anything with it, which is standard)
 *
 *	Closing a Connection (4.2.2.13)
 *	   MUST inform application of whether connection was closed by RST or
 *	     normal close. (does)
 *	   MAY allow "half-duplex" close (treat connection as closed for the
 *	     local app, even before handshake is done). (does)
 *	   MUST linger in TIME_WAIT for 2 * MSL (does)
 *
 *	Retransmission Timeout (4.2.2.15)
 *	   MUST implement Jacobson's slow start and congestion avoidance
 *
 *	Probing Zero Windows (4.2.2.17)
 *	   MUST support probing of zero windows. (does)
 *	   MAY keep offered window closed indefinitely. (does)
 *	   MUST allow remote window to stay closed indefinitely. (does)
 *
 *	Passive Open Calls (4.2.2.18)
 *	   MUST NOT let new passive open affect other connections. (doesn't)
 *	   MUST support passive opens (LISTENs) concurrently. (does)
 *
 *	Time to Live (4.2.2.19)
 *	   MUST make TCP TTL configurable. (does - IP_TTL option)
 *
 *	Event Processing (4.2.2.20)
 *	   SHOULD queue out-of-order segments. (does)
 *	   MUST aggregate ACK segments whenever possible. (does but badly)
 *
 *	Retransmission Timeout Calculation (4.2.3.1)
 *	   MUST implement Karn's algorithm and Jacobson's algorithm for RTO
 *	     calculation. (does, or at least explains them in the comments 8*b)
 *	   SHOULD initialize RTO to 0 and RTT to 3. (does)
 *
 *	When to Send an ACK Segment (4.2.3.2)
 *	   SHOULD implement delayed ACK. (does)
 *	   MUST keep ACK delay < 0.5 sec. (does)
 *
 *	When to Send a Window Update (4.2.3.3)
 *	   MUST implement receiver-side SWS. (does)
 *
 *	When to Send Data (4.2.3.4)
 *	   MUST implement sender-side SWS. (does)
 *	   SHOULD implement Nagle algorithm. (does)
 *
 *	TCP Connection Failures (4.2.3.5)
 *	   MUST handle excessive retransmissions "properly" (see the RFC). (does)
 *	   SHOULD inform application layer of soft errors. (does)
 *
 *	TCP Keep-Alives (4.2.3.6)
 *	   MAY provide keep-alives. (does)
 *	   MUST make keep-alives configurable on a per-connection basis. (does)
 *	   MUST default to no keep-alives. (does)
 *	   MUST make keep-alive interval configurable. (does)
 *	   MUST make default keep-alive interval > 2 hours. (does)
 *	   MUST NOT interpret failure to ACK keep-alive packet as dead
 *	     connection. (doesn't)
 *	   SHOULD send keep-alive with no data. (does)
 *
 *	TCP Multihoming (4.2.3.7)
 *	   MUST get source address from IP layer before sending first
 *	   MUST use same local address for all segments of a connection. (does)
 *
 *	IP Options (4.2.3.8)
 *	   MUST ignore unsupported IP options. (does)
 *	   MAY support Time Stamp and Record Route. (does)
 *	   MUST allow application to specify a source route. (does)
 *	   MUST allow received Source Route option to set route for all future
 *	     segments on this connection. (does not (security issues))
 *
 *	ICMP messages (4.2.3.9)
 *	   MUST act on ICMP errors. (does)
 *	   MUST slow transmission upon receipt of a Source Quench. (doesn't anymore
 *	     because that is deprecated now by the IETF, can be turned on)
 *	   MUST NOT abort connection upon receipt of soft Destination
 *	     Unreachables (0, 1, 5), Time Exceededs and Parameter
 *	     Problems. (doesn't)
 *	   SHOULD report soft Destination Unreachables etc. to the
 *	     application. (does, except during SYN_RECV and may drop messages
 *	     in some rare cases before accept() - ICMP is unreliable)
 *	   SHOULD abort connection upon receipt of hard Destination Unreachable
 *	     messages (2, 3, 4). (does, but see above)
 *
 *	Remote Address Validation (4.2.3.10)
 *	   MUST reject as an error OPEN for invalid remote IP address. (does)
 *	   MUST ignore SYN with invalid source address. (does)
 *	   MUST silently discard incoming SYN for broadcast/multicast
 *
 *	Asynchronous Reports (4.2.4.1)
 *	   MUST provide mechanism for reporting soft errors to application
 *
 *	Type of Service (4.2.4.2)
 *	   MUST allow application layer to set Type of Service. (does IP_TOS)
 *
 *	(Whew. -- MS 950903)
 *	(Updated by AK, but not complete yet.)
#include <linux/config.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/poll.h>
#include <linux/init.h>
#include <linux/smp_lock.h>

#include <net/icmp.h>

#include <asm/uaccess.h>
int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;

struct tcp_mib	tcp_statistics;

kmem_cache_t *tcp_openreq_cachep;
kmem_cache_t *tcp_bucket_cachep;
kmem_cache_t *tcp_timewait_cachep;
 *	Find someone to 'accept'.  Must be called with
 *	the listening socket locked.
static struct open_request *tcp_find_established(struct tcp_opt *tp,
						 struct open_request **prevp)
	struct open_request *req = tp->syn_wait_queue;
	struct open_request *prev = (struct open_request *)&tp->syn_wait_queue;

		if ((1 << req->sk->state) &
		    ~(TCPF_SYN_SENT|TCPF_SYN_RECV))
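		/* TCPF_FOO is (1 << TCP_FOO), so the test above reads "the
		 * child socket is in any state other than SYN_SENT or
		 * SYN_RECV", i.e. its three-way handshake has completed.
		 */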
 *	Walk down the receive queue counting readable data.
 *
 *	Must be called with the socket lock held.
static int tcp_readable(struct sock *sk)
	unsigned long counted;
	unsigned long amount;

	SOCK_DEBUG(sk, "tcp_readable: %p - ", sk);
	skb = skb_peek(&sk->receive_queue);
		SOCK_DEBUG(sk, "empty\n");
	counted = sk->tp_pinfo.af_tcp.copied_seq;	/* Where we are at the moment */

	/* Do until a push or until we are out of data. */
		/* Found a hole so stops here. */
		if (before(counted, TCP_SKB_CB(skb)->seq))	/* should not happen */
		/* Length - header but start from where we are up to
		sum = skb->len - (counted - TCP_SKB_CB(skb)->seq);
		/* Add it up, move on. */
			/* Don't count urg data ... but do it in the right place!
			 * Consider: "old_data (ptr is here) URG PUSH data"
			 * The old code would stop at the first push because
			 * it counted the urg (amount==1) and then does amount--
			 * *after* the loop.  This means tcp_readable() always
			 * returned zero if any URG PUSH was in the queue, even
			 * though there was normal data available. If we subtract
			 * the urg data right here, we even get it to work for more
			 * than one URG PUSH skb without normal data.
			 * This means that poll() finally works now with urg data
			 * in the queue.  Note that rlogin was never affected
			 * because it doesn't use poll(); it uses two processes
			 * and a blocking read().  And the queue scan in tcp_read()
			 * was correct.  Mike <pall@rz.uni-karlsruhe.de>
				/* Don't count urg data. */
		if (amount && skb->h.th->psh) break;
	} while (skb != (struct sk_buff *)&sk->receive_queue);

	SOCK_DEBUG(sk, "got %lu bytes.\n", amount);
 * LISTEN is a special case for poll..
static unsigned int tcp_listen_poll(struct sock *sk, poll_table *wait)
	struct open_request *req, *dummy;

	req = tcp_find_established(&sk->tp_pinfo.af_tcp, &dummy);
		return POLLIN | POLLRDNORM;
 * Compute minimal free write space needed to queue new packets.
#define tcp_min_write_space(__sk) \
	(atomic_read(&(__sk)->wmem_alloc) / 2)
 *	Wait for a TCP event.
 *
 *	Note that we don't need to lock the socket, as the upper poll layers
 *	take care of normal races (between the test and the event) and we don't
 *	go look at any of the socket buffers directly.
unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait)
	struct sock *sk = sock->sk;
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	poll_wait(file, sk->sleep, wait);
	if (sk->state == TCP_LISTEN)
		return tcp_listen_poll(sk, wait);

	/* Socket is not locked.  We are protected from async events
	   by poll logic and correct handling of state changes
	   made by other threads is impossible in any case.

	 * POLLHUP is certainly not done right.  But poll() doesn't
	 * have a notion of HUP in just one direction, and for a
	 * socket the read side is more interesting.
	 *
	 * Some poll() documentation says that POLLHUP is incompatible
	 * with the POLLOUT/POLLWR flags, so somebody should check this
	 * all.  But careful, it tends to be safer to return too many
	 * bits than too few, and you can easily break real applications
	 * if you don't tell them that something has hung up!
	if (sk->shutdown & RCV_SHUTDOWN)

	if ((1 << sk->state) & ~(TCPF_SYN_SENT|TCPF_SYN_RECV)) {
		if ((tp->rcv_nxt != tp->copied_seq) &&
		    (tp->urg_seq != tp->copied_seq ||
		     tp->rcv_nxt != tp->copied_seq+1 ||
		     sk->urginline || !tp->urg_data))
			mask |= POLLIN | POLLRDNORM;

		if (!(sk->shutdown & SEND_SHUTDOWN)) {
			if (sock_wspace(sk) >= tcp_min_write_space(sk)) {
				mask |= POLLOUT | POLLWRNORM;
			} else {  /* send SIGIO later */
				sk->socket->flags |= SO_NOSPACE;

		if (tp->urg_data & URG_VALID)
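/*
 * Illustrative user level sketch (an assumption for documentation only, not
 * part of this file): how the readiness bits reported by tcp_poll() above
 * are typically consumed through the standard poll(2) interface.
 */
#if 0	/* user level example only, never compiled here */
#include <poll.h>

static int wait_for_tcp_event(int fd)
{
	struct pollfd pfd;

	pfd.fd = fd;
	pfd.events = POLLIN | POLLOUT;
	if (poll(&pfd, 1, 5000) <= 0)		/* error or 5 second timeout */
		return -1;
	if (pfd.revents & (POLLERR | POLLHUP))
		return -1;			/* error or hangup reported */
	return pfd.revents;			/* POLLIN and/or POLLOUT set */
}
#endif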
 * Socket write_space callback.
 * This (or rather the sock_wake_async) should agree with poll.
 *
 * WARNING.  This callback is called from any context (process,
 * bh or irq).  Do not try to make it any smarter.
void tcp_write_space(struct sock *sk)
	read_lock(&sk->callback_lock);

		/* Why??!! Does it really not overschedule? --ANK */
		wake_up_interruptible(sk->sleep);

	if (sock_wspace(sk) >= tcp_min_write_space(sk))
		sock_wake_async(sk->socket, 2, POLL_OUT);

	read_unlock(&sk->callback_lock);
int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
#ifdef FIXME	/* FIXME: */
			if (sk->state == TCP_LISTEN)
			answ = tcp_readable(sk);

			struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
			answ = tp->urg_data && tp->urg_seq == tp->copied_seq;

			if (sk->state == TCP_LISTEN)
			answ = sock_wspace(sk);

			return(-ENOIOCTLCMD);

	return put_user(answ, (int *)arg);
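/*
 * Illustrative user level sketch (assumption, not part of this file): the
 * ioctls answered above.  FIONREAD/TIOCINQ returns the amount of queued
 * readable data, SIOCATMARK reports whether the read pointer sits at the
 * urgent mark, and TIOCOUTQ reports free transmit buffer space here.
 */
#if 0	/* user level example only */
#include <sys/ioctl.h>
#include <sys/socket.h>

static int bytes_readable(int fd)
{
	int n = 0;

	if (ioctl(fd, FIONREAD, &n) < 0)	/* same as TIOCINQ on sockets */
		return -1;
	return n;
}

static int at_urgent_mark(int fd)
{
	int at_mark = 0;

	if (ioctl(fd, SIOCATMARK, &at_mark) < 0)
		return -1;
	return at_mark;				/* 1 if next byte is the OOB mark */
}
#endif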
 *	Wait for a socket to get into the connected state
 *
 *	Note: Must be called with the socket locked.
static int wait_for_tcp_connect(struct sock * sk, int flags)
	struct task_struct *tsk = current;
	DECLARE_WAITQUEUE(wait, tsk);

	while ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
			return sock_error(sk);
		if ((1 << sk->state) &
		    ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
			if (sk->keepopen && !(flags&MSG_NOSIGNAL))
				send_sig(SIGPIPE, tsk, 0);
		if (flags & MSG_DONTWAIT)
		if (signal_pending(tsk))

		__set_task_state(tsk, TASK_INTERRUPTIBLE);
		add_wait_queue(sk->sleep, &wait);
		sk->tp_pinfo.af_tcp.write_pending++;

		__set_task_state(tsk, TASK_RUNNING);
		remove_wait_queue(sk->sleep, &wait);
		sk->tp_pinfo.af_tcp.write_pending--;
static inline int tcp_memory_free(struct sock *sk)
	return atomic_read(&sk->wmem_alloc) < sk->sndbuf;

 *	Wait for more memory for a socket
static void wait_for_tcp_memory(struct sock * sk)
	if (!tcp_memory_free(sk)) {
		DECLARE_WAITQUEUE(wait, current);

		sk->socket->flags &= ~SO_NOSPACE;
		add_wait_queue(sk->sleep, &wait);
			set_current_state(TASK_INTERRUPTIBLE);

			if (signal_pending(current))
			if (tcp_memory_free(sk))
			if (sk->shutdown & SEND_SHUTDOWN)
			if (!tcp_memory_free(sk))

		current->state = TASK_RUNNING;
		remove_wait_queue(sk->sleep, &wait);
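/*
 * Minimal sketch of the sleep/wake pattern used by wait_for_tcp_memory()
 * above (an illustration under assumed names, not a routine used by this
 * file): publish a wait queue entry, mark the task sleeping, re-test the
 * condition, schedule away, and undo everything on the way out.
 */
#if 0	/* example only */
static void example_wait_for(struct sock *sk, int (*cond)(struct sock *))
{
	DECLARE_WAITQUEUE(wait, current);

	add_wait_queue(sk->sleep, &wait);		/* become visible to wakers */
	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);	/* sleep unless woken/signalled */
		if (cond(sk) || signal_pending(current))
			break;
		schedule();				/* give up the CPU until woken */
	}
	set_current_state(TASK_RUNNING);
	remove_wait_queue(sk->sleep, &wait);
}
#endif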
/* When all user supplied data has been queued set the PSH bit */
#define PSH_NEEDED (seglen == 0 && iovlen == 0)

 *	This routine copies from a user buffer into a socket,
 *	and starts the transmit system.
 *
 *	Note: must be called with the socket locked.
int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg)
	tp = &(sk->tp_pinfo.af_tcp);

	/* Wait for a connection to finish. */
	flags = msg->msg_flags;
	if ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
		if ((err = wait_for_tcp_connect(sk, flags)) != 0)

	/* This should be in poll */
	sk->socket->flags &= ~SO_NOSPACE;	/* clear SIGIO XXX */

	mss_now = tcp_current_mss(sk);

	/* Ok commence sending. */
	iovlen = msg->msg_iovlen;
	while (--iovlen >= 0) {
		int seglen = iov->iov_len;
		unsigned char * from = iov->iov_base;
			int copy, tmp, queue_it, psh;

			/* Stop on errors. */
			/* Make sure that we are established. */
			if (sk->shutdown & SEND_SHUTDOWN)

			/* Now we need to check if we have a half
			 * built packet we can tack some data onto.
			if (tp->send_head && !(flags & MSG_OOB)) {
				skb = sk->write_queue.prev;

				/* If the remote does SWS avoidance we should
				 * queue the best we can if not we should in
				 * fact send multiple packets...
				 * A method for detecting this would be most
				if (skb_tailroom(skb) > 0 &&
				    (mss_now - copy) > 0 &&
				    tp->snd_nxt < TCP_SKB_CB(skb)->end_seq) {
					int last_byte_was_odd = (copy % 4);

					copy = mss_now - copy;
					if (copy > skb_tailroom(skb))
						copy = skb_tailroom(skb);
					if (last_byte_was_odd) {
						if (copy_from_user(skb_put(skb, copy),
						skb->csum = csum_partial(skb->data,
							csum_and_copy_from_user(
							from, skb_put(skb, copy),
							copy, skb->csum, &err);
					/* FIXME: the *_user functions should
					 *	  return how much data was
					 *	  copied before the fault
					 *	  occurred and then a partial
					 *	  packet with this data should
					 *	  be sent.  Unfortunately
					 *	  csum_and_copy_from_user doesn't
					 *	  return this information.
					 *	  ATM it might send partly zeroed
					tp->write_seq += copy;
					TCP_SKB_CB(skb)->end_seq += copy;
						TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;

			/* We also need to worry about the window.  If
			 * window < 1/2 the maximum window we've seen
			 * from this host, don't use it.  This is
			 * sender side silly window prevention, as
			 * specified in RFC1122.  (Note that this is
			 * different than earlier versions of SWS
			 * prevention, e.g. RFC813.).  What we
			 * actually do is use the whole MSS.  Since
			 * this results in the right edge of the packet
			 * being outside the window, it will be queued
			 * for later rather than sent.
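			 *
			 * Worked example (for illustration only): suppose the
			 * largest window the peer has advertised is 32KB, the
			 * MSS is 1460 bytes and only 500 bytes of window are
			 * currently usable.  500 < 16KB, so instead of emitting
			 * a silly 500 byte segment we size the segment at the
			 * full 1460 bytes; its right edge falls outside the
			 * offered window, so it waits in the queue until the
			 * window opens.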
			copy = tp->snd_wnd - (tp->snd_nxt - tp->snd_una);
			if (copy > (tp->max_window >> 1)) {
				copy = min(copy, mss_now);

			/* Determine how large of a buffer to allocate. */
			tmp = MAX_HEADER + sk->prot->max_header;
			if (copy < min(mss_now, tp->max_window >> 1) &&
			    !(flags & MSG_OOB)) {
				tmp += min(mss_now, tp->max_window);

				/* What is happening here is that we want to
				 * tack on later members of the users iovec
				 * if possible into a single frame.  When we
				 * leave this loop our caller checks to see if
				 * we can send queued frames onto the wire.
				 * See tcp_v[46]_sendmsg() for this.
			skb = sock_wmalloc(sk, tmp, 0, GFP_KERNEL);

			/* If we didn't get any memory, we need to sleep. */
				sk->socket->flags |= SO_NOSPACE;
				if (flags&MSG_DONTWAIT) {
				if (signal_pending(current)) {
				tcp_push_pending_frames(sk, tp);
				wait_for_tcp_memory(sk);

				/* If SACK's were formed or PMTU events happened,
				 * we must find out about it.
				mss_now = tcp_current_mss(sk);

			/* Prepare control bits for TCP header creation engine. */
			TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK |
						  ((PSH_NEEDED || psh) ?
						   TCPCB_FLAG_PSH : 0));
			TCP_SKB_CB(skb)->sacked = 0;
			if (flags & MSG_OOB) {
				TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_URG;
				TCP_SKB_CB(skb)->urg_ptr = copy;
				TCP_SKB_CB(skb)->urg_ptr = 0;

			/* TCP data bytes are SKB_PUT() on top, later
			 * TCP+IP+DEV headers are SKB_PUSH()'d beneath.
			 * Reserve header space and checksum the data.
			skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
			skb->csum = csum_and_copy_from_user(from,
					skb_put(skb, copy), copy, 0, &err);

			TCP_SKB_CB(skb)->seq = tp->write_seq;
			TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + copy;

			/* This advances tp->write_seq for us. */
			tcp_send_skb(sk, skb, queue_it);

	err = sock_error(sk);
	if (!(flags&MSG_NOSIGNAL))
		send_sig(SIGPIPE, current, 0);
	tcp_push_pending_frames(sk, tp);
 *	Send an ack if one is backlogged at this point.  Ought to merge
 *	this with tcp_send_ack().
 *	This is called for delayed acks also.
void tcp_read_wakeup(struct sock *sk)
	/* If we're closed, don't send an ack, or we'll get a RST
	 * from the closed destination.
	if (sk->state != TCP_CLOSE)
 *	Handle reading urgent data.  BSD has very simple semantics for
 *	this, no blocking and very strange errors 8)
static int tcp_recv_urg(struct sock * sk, int nonblock,
			struct msghdr *msg, int len, int flags,
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	/* No URG data to read. */
	if (sk->urginline || !tp->urg_data || tp->urg_data == URG_READ)
		return -EINVAL;	/* Yes this is right ! */

	if (sk->state == TCP_CLOSE || (sk->shutdown & RCV_SHUTDOWN)) {

	if (tp->urg_data & URG_VALID) {
		char c = tp->urg_data;

		if (!(flags & MSG_PEEK))
			tp->urg_data = URG_READ;

			tp->af_specific->addr2sockaddr(sk, (struct sockaddr *)
			*addr_len = tp->af_specific->sockaddr_len;

		/* Read urgent data. */
		msg->msg_flags |= MSG_OOB;
			err = memcpy_toiovec(msg->msg_iov, &c, 1);
			msg->msg_flags |= MSG_TRUNC;
		return err ? -EFAULT : len;

	/* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
	 * the available implementations agree in this case:
	 * this call should never block, independent of the
	 * blocking state of the socket.
	 * Mike <pall@rz.uni-karlsruhe.de>
 *	Release a skb if it is no longer needed.  This routine
 *	must be called with interrupts disabled or with the
 *	socket locked so that the sk_buff queue operation is ok.
static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb)
	__skb_unlink(skb, &sk->receive_queue);
/* Clean up the receive buffer for full frames taken by the user,
 * then send an ACK if necessary.  COPIED is the number of bytes
 * tcp_recvmsg has given to the user so far, it speeds up the
 * calculation of whether or not we must ACK for the sake of
static void cleanup_rbuf(struct sock *sk, int copied)
	struct sk_buff *skb;

	/* NOTE! The socket must be locked, so that we don't get
	 * a messed-up receive queue.
	while ((skb = skb_peek(&sk->receive_queue)) != NULL) {
		if (!skb->used || atomic_read(&skb->users) > 1)
		tcp_eat_skb(sk, skb);

	/* We send an ACK if we can now advertise a non-zero window
	 * which has been raised "significantly".
		struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
		__u32 rcv_window_now = tcp_receive_window(tp);
		__u32 new_window = __tcp_select_window(sk);

		/* We won't be raising the window any further than
		 * the window-clamp allows.  Our window selection
		 * also keeps things a nice multiple of MSS.  These
		 * checks are necessary to prevent spurious ACKs
		 * which don't advertise a larger window.
		if ((new_window && (new_window >= rcv_window_now * 2)) &&
		    ((rcv_window_now + tp->mss_cache) <= tp->window_clamp))
			tcp_read_wakeup(sk);
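	/* Concrete illustration of the test above (comment only): with an
	 * 8KB window currently advertised, a freshly computed window of
	 * 20KB passes "new_window >= 2 * rcv_window_now" and, provided it
	 * still fits under the window clamp, earns a window-update ACK;
	 * growing from 8KB to merely 10KB would not be worth an ACK.
	 */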
/* Now socket state including sk->err is changed only under lock,
   hence we should check only pending signals.
static void tcp_data_wait(struct sock *sk)
	DECLARE_WAITQUEUE(wait, current);

	add_wait_queue(sk->sleep, &wait);
	__set_current_state(TASK_INTERRUPTIBLE);
	sk->socket->flags |= SO_WAITDATA;

	if (skb_queue_empty(&sk->receive_queue))

	sk->socket->flags &= ~SO_WAITDATA;
	remove_wait_queue(sk->sleep, &wait);
	__set_current_state(TASK_RUNNING);
 *	This routine copies from a sock struct into the user buffer.
int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
		int len, int nonblock, int flags, int *addr_len)
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	volatile u32 *seq;	/* So gcc doesn't overoptimise */
	int target = 1;		/* Read at least this many bytes */

	if (sk->state == TCP_LISTEN)

	/* Urgent data needs to be handled specially. */
	if (flags & MSG_OOB)

	/* Copying sequence to update.  This is volatile to handle
	 * the multi-reader case neatly (memcpy_to/fromfs might be
	 * inline and thus not flush cached variables otherwise).
	peek_seq = tp->copied_seq;
	seq = &tp->copied_seq;
	if (flags & MSG_PEEK)

	/* Handle the POSIX bogosity MSG_WAITALL. */
	if (flags & MSG_WAITALL)

	 *	This violates 1003.1g compliance.  We must wait for
	 *	data to exist even if we read none!
		struct sk_buff * skb;

		/* Are we at urgent data?  Stop if we have read anything. */
		if (copied && tp->urg_data && tp->urg_seq == *seq)

		/* We need to check signals first, to get correct SIGURG
		 * handling.  FIXME: Need to check this doesn't impact 1003.1g
		 * and move it down to the bottom of the loop
		if (signal_pending(current)) {
			copied = -ERESTARTSYS;
		/* Next get a buffer. */
		skb = skb_peek(&sk->receive_queue);

			/* Now that we have two receive queues this
			if (before(*seq, TCP_SKB_CB(skb)->seq)) {
				printk(KERN_INFO "recvmsg bug: copied %X seq %X\n",
				       *seq, TCP_SKB_CB(skb)->seq);
			offset = *seq - TCP_SKB_CB(skb)->seq;
			if (offset < skb->len)
			if (!(flags & MSG_PEEK))
		} while (skb != (struct sk_buff *)&sk->receive_queue);

		if (copied >= target)

		if (sk->err && !(flags&MSG_PEEK)) {
			copied = sock_error(sk);
		if (sk->shutdown & RCV_SHUTDOWN) {
		if (sk->state == TCP_CLOSE) {

		cleanup_rbuf(sk, copied);

		/* Lock the buffer.  We can be fairly relaxed as
		 * an interrupt will never steal a buffer we are
		 * using unless I've missed something serious in
		atomic_inc(&skb->users);

		/* Ok so how much can we use? */
		used = skb->len - offset;

		/* Do we have urgent data here? */
			u32 urg_offset = tp->urg_seq - *seq;
			if (urg_offset < used) {
				if (!sk->urginline) {

		/* Copy it - We _MUST_ update *seq first so that we
		 * don't ever double read when we have dual readers

		/* This memcpy_toiovec can sleep.  If it sleeps and we
		 * do a second read it relies on the skb->users to avoid
		 * a crash when cleanup_rbuf() gets called.
		err = memcpy_toiovec(msg->msg_iov, ((unsigned char *)skb->h.th) + skb->h.th->doff*4 + offset, used);
			/* Exception.  Bailout! */
			atomic_dec(&skb->users);

		/* We now will not sleep again until we are finished
		 * with skb.  Sorry if you are doing the SMP port
		 * but you'll just have to fix it neatly ;)
		 *
		 * Very funny Alan... -DaveM
		atomic_dec(&skb->users);

		if (after(tp->copied_seq, tp->urg_seq))
		if (used + offset < skb->len)

		/* Process the FIN.  We may also need to handle PSH
		 * here and make it break out of MSG_WAITALL.
		if (flags & MSG_PEEK)
		if (atomic_read(&skb->users) == 1)
			tcp_eat_skb(sk, skb);

		if (flags & MSG_PEEK)
			sk->shutdown |= RCV_SHUTDOWN;

	if (copied >= 0 && msg->msg_name)
		tp->af_specific->addr2sockaddr(sk, (struct sockaddr *)
		*addr_len = tp->af_specific->sockaddr_len;

	/* Clean up data we have read: This will do ACK frames. */
	cleanup_rbuf(sk, copied);
	err = sock_error(sk);

	err = tcp_recv_urg(sk, nonblock, msg, len, flags, addr_len);
 *	Check whether to renew the timer.
static inline void tcp_check_fin_timer(struct sock *sk)
	if (sk->state == TCP_FIN_WAIT2)
		tcp_reset_keepalive_timer(sk, sysctl_tcp_fin_timeout);
 *	State processing on a close.  This implements the state shift for
 *	sending our FIN frame.  Note that we only send a FIN for some
 *	states.  A shutdown() may have already sent the FIN, or we may be
static unsigned char new_state[16] = {
  /* current state:        new state:         action:	*/
  /* (Invalid)		*/ TCP_CLOSE,
  /* TCP_ESTABLISHED	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
  /* TCP_SYN_SENT	*/ TCP_CLOSE,
  /* TCP_SYN_RECV	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
  /* TCP_FIN_WAIT1	*/ TCP_FIN_WAIT1,
  /* TCP_FIN_WAIT2	*/ TCP_FIN_WAIT2,
  /* TCP_TIME_WAIT	*/ TCP_CLOSE,
  /* TCP_CLOSE		*/ TCP_CLOSE,
  /* TCP_CLOSE_WAIT	*/ TCP_LAST_ACK | TCP_ACTION_FIN,
  /* TCP_LAST_ACK	*/ TCP_LAST_ACK,
  /* TCP_LISTEN		*/ TCP_CLOSE,
  /* TCP_CLOSING	*/ TCP_CLOSING,
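/* Reading the table: each entry packs the successor state in the low bits
 * (TCP_STATE_MASK) plus an optional TCP_ACTION_FIN flag.  For example, an
 * ESTABLISHED socket that is closed moves to FIN_WAIT1 and must transmit a
 * FIN, while a SYN_SENT socket simply drops to CLOSE with nothing to send.
 */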
static int tcp_close_state(struct sock *sk, int dead)
	int next = (int) new_state[sk->state];
	int ns = (next & TCP_STATE_MASK);

	tcp_set_state(sk, ns);

	/* This is a (useful) BSD violation of the RFC.  There is a
	 * problem with TCP as specified in that the other end could
	 * keep a socket open forever with no application left this end.
	 * We use a 3 minute timeout (about the same as BSD) then kill
	 * our end.  If they send after that then tough - BUT: long enough
	 * that we won't make the old 4*rto = almost no time - whoops
		tcp_check_fin_timer(sk);

	return (next & TCP_ACTION_FIN);
 *	Shutdown the sending side of a connection.  Much like close except
 *	that we don't receive shut down or set sk->dead.
void tcp_shutdown(struct sock *sk, int how)
	/* We need to grab some memory, and put together a FIN,
	 * and then put it into the queue to be sent.
	 * Tim MacKenzie (tym@dibbler.cs.monash.edu.au) 4 Dec '92.
	if (!(how & SEND_SHUTDOWN))

	/* If we've already sent a FIN, or it's a closed state, skip this. */
	if ((1 << sk->state) &
	    (TCPF_ESTABLISHED|TCPF_SYN_SENT|TCPF_SYN_RECV|TCPF_CLOSE_WAIT)) {
		/* Clear out any half completed packets.  FIN if needed. */
		if (tcp_close_state(sk,0))
 *	Return 1 if we still have things to send in our buffers.
static inline int closing(struct sock * sk)
	return ((1 << sk->state) & (TCPF_FIN_WAIT1|TCPF_CLOSING|TCPF_LAST_ACK));
 *	This routine closes sockets which have been at least partially
 *	opened, but not yet accepted.  Currently it is only called by
static void tcp_close_pending (struct sock *sk)
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct open_request *req = tp->syn_wait_queue;

		struct open_request *iter;

			tcp_close(req->sk, 0);

		tcp_dec_slow_timer(TCP_SLT_SYNACK);
		(*iter->class->destructor)(iter);
		tcp_openreq_free(iter);

	BUG_TRAP(tp->syn_backlog == 0);
	BUG_TRAP(sk->ack_backlog == 0);
static __inline__ void tcp_kill_sk_queues(struct sock *sk)
	/* First the read buffer. */
	skb_queue_purge(&sk->receive_queue);

	/* Next, the error queue. */
	skb_queue_purge(&sk->error_queue);

	/* Next, the write queue. */
	BUG_TRAP(skb_queue_empty(&sk->write_queue));

	/* It is _impossible_ for the backlog to contain anything
	 * when we get here.  All user references to this socket
	 * have gone away; only the net layer can still touch it.
/* At this point, there should be no process reference to this
 * socket, and thus no user references at all.  Therefore we
 * can assume the socket waitqueue is inactive and nobody will
 * try to jump onto it.
void tcp_destroy_sock(struct sock *sk)
	BUG_TRAP(sk->state == TCP_CLOSE);

	/* It cannot be in hash table! */
	BUG_TRAP(sk->pprev == NULL);

	/* If it has a non-zero sk->num, it must be bound */
	BUG_TRAP(!sk->num || sk->prev != NULL);

	sk->prot->destroy(sk);

	tcp_kill_sk_queues(sk);

#ifdef INET_REFCNT_DEBUG
	if (atomic_read(&sk->refcnt) != 1) {
		printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n", sk, atomic_read(&sk->refcnt));
void tcp_close(struct sock *sk, long timeout)
	struct sk_buff *skb;
	int data_was_unread = 0;

	if (sk->state == TCP_LISTEN) {
		tcp_set_state(sk, TCP_CLOSE);

		tcp_close_pending(sk);

		goto adjudge_to_death;

	sk->shutdown = SHUTDOWN_MASK;

	/*  We need to flush the recv. buffs.  We do this only on the
	 *  descriptor close, not protocol-sourced closes, because the
	 *  reader process may not have drained the data yet!
	while ((skb = __skb_dequeue(&sk->receive_queue)) != NULL) {
		u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq - skb->h.th->fin;
		data_was_unread += len;

	/* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
	 * 3.10, we send a RST here because data was lost.  To
	 * witness the awful effects of the old behavior of always
	 * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
	 * a bulk GET in an FTP client, suspend the process, wait
	 * for the client to advertise a zero window, then kill -9
	 * the FTP client, wheee...  Note: timeout is always zero
	if (data_was_unread != 0) {
		/* Unread data was tossed, zap the connection. */
		tcp_set_state(sk, TCP_CLOSE);
		tcp_send_active_reset(sk, GFP_KERNEL);
	} else if (tcp_close_state(sk,1)) {
		/* We FIN if the application ate all the data before
		 * zapping the connection.

		struct task_struct *tsk = current;
		DECLARE_WAITQUEUE(wait, current);

		add_wait_queue(sk->sleep, &wait);
			set_current_state(TASK_INTERRUPTIBLE);
			timeout = schedule_timeout(timeout);
			if (!signal_pending(tsk) || timeout)

		tsk->state = TASK_RUNNING;
		remove_wait_queue(sk->sleep, &wait);

	/* Now that the socket is dead, if we are in the FIN_WAIT2 state
	 * we may need to set up a timer.
	tcp_check_fin_timer(sk);

	/* It is the last release_sock in its life.  It will remove backlog. */

	/* Now socket is owned by kernel and we acquire BH lock
	   to finish close.  No need to check for user refs.
	BUG_TRAP(sk->lock.users == 0);

	/* Announce socket dead, detach it from wait queue and inode. */
	write_lock_irq(&sk->callback_lock);
	write_unlock_irq(&sk->callback_lock);

	if (sk->state == TCP_CLOSE)
		tcp_destroy_sock(sk);
	/* Otherwise, socket is reprieved until protocol close. */
int tcp_disconnect(struct sock *sk, int flags)
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

	old_state = sk->state;
	if (old_state != TCP_CLOSE)
		tcp_set_state(sk, TCP_CLOSE);

	/* ABORT function of RFC793 */
	if (old_state == TCP_LISTEN) {
		tcp_close_pending(sk);
	} else if (tcp_connected(old_state)) {
		tcp_send_active_reset(sk, GFP_KERNEL);
		sk->err = ECONNRESET;
	} else if (old_state == TCP_SYN_SENT)
		sk->err = ECONNRESET;

	tcp_clear_xmit_timers(sk);
	__skb_queue_purge(&sk->receive_queue);
	__skb_queue_purge(&sk->write_queue);
	__skb_queue_purge(&tp->out_of_order_queue);

#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
		memset(&sk->net_pinfo.af_inet6.saddr, 0, 16);
		memset(&sk->net_pinfo.af_inet6.rcv_saddr, 0, 16);

	sk->write_space = tcp_write_space;
#ifdef CONFIG_TCP_TW_RECYCLE
		if ((tp->write_seq += 2) == 0)
	tp->snd_ssthresh = 0x7fffffff;
	tp->snd_cwnd_cnt = 0;
	tp->delayed_acks = 0;
	tp->send_head = tp->retrans_head = NULL;

	BUG_TRAP(!sk->num || sk->prev);

	sk->error_report(sk);
 *	Wait for an incoming connection, avoid race
 *	conditions.  This must be called with the socket locked,
 *	and without the kernel lock held.
static struct open_request * wait_for_connect(struct sock * sk,
					      struct open_request **pprev)
	DECLARE_WAITQUEUE(wait, current);
	struct open_request *req;

	 * True wake-one mechanism for incoming connections: only
	 * one process gets woken up, not the 'whole herd'.
	 * Since we do not 'race & poll' for established sockets
	 * anymore, the common case will execute the loop only once.
	 *
	 * Subtle issue: "add_wait_queue_exclusive()" will be added
	 * after any current non-exclusive waiters, and we know that
	 * it will always _stay_ after any new non-exclusive waiters
	 * because all non-exclusive waiters are added at the
	 * beginning of the wait-queue.  As such, it's ok to "drop"
	 * our exclusiveness temporarily when we get woken up without
	 * having to remove and re-insert us on the wait queue.
	add_wait_queue_exclusive(sk->sleep, &wait);
		current->state = TASK_EXCLUSIVE | TASK_INTERRUPTIBLE;
		req = tcp_find_established(&(sk->tp_pinfo.af_tcp), pprev);
		if (signal_pending(current))
	current->state = TASK_RUNNING;
	remove_wait_queue(sk->sleep, &wait);
 *	This will accept the next outstanding connection.
 *
 *	Be careful about race conditions here - this is subtle.
struct sock *tcp_accept(struct sock *sk, int flags, int *err)
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
	struct open_request *req, *prev;

	/* We need to make sure that this socket is listening,
	 * and that it has something pending.
	if (sk->state != TCP_LISTEN)

	/* Find already established connection */
	req = tcp_find_established(tp, &prev);

		/* If this is a non blocking socket don't sleep */
		if (flags & O_NONBLOCK)

		error = -ERESTARTSYS;
		req = wait_for_connect(sk, &prev);

	tcp_synq_unlink(tp, req, prev);
	req->class->destructor(req);
	tcp_openreq_free(req);
 *	Socket option code for TCP.
int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval,
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	if (level != SOL_TCP)
		return tp->af_specific->setsockopt(sk, level, optname,

	if (optlen < sizeof(int))
	if (get_user(val, (int *)optval))

		/* values greater than interface MTU won't take effect.  however at
		 * the point when this call is done we typically don't yet know
		 * which interface is going to be used
		if (val < 1 || val > MAX_WINDOW) {

		/* You cannot try to use this and TCP_CORK in
		 * tandem, so let the user know.
		if (sk->nonagle == 2) {
		sk->nonagle = (val == 0) ? 0 : 1;

		/* When set indicates to always queue non-full frames.
		 * Later the user clears this option and we transmit
		 * any pending partial frames in the queue.  This is
		 * meant to be used alongside sendfile() to get properly
		 * filled frames when the user (for example) must write
		 * out headers with a write() call first and then use
		 * sendfile to send out the data parts.
		 *
		 * You cannot try to use TCP_NODELAY and this mechanism
		 * at the same time, so let the user know.
		if (sk->nonagle == 1) {
			tcp_push_pending_frames(sk, tp);

		if (val < 1 || val > MAX_TCP_KEEPIDLE)
		tp->keepalive_time = val * HZ;
			__u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
			if (tp->keepalive_time > elapsed)
				elapsed = tp->keepalive_time - elapsed;
			tcp_reset_keepalive_timer(sk, elapsed);

		if (val < 1 || val > MAX_TCP_KEEPINTVL)
		tp->keepalive_intvl = val * HZ;

		if (val < 1 || val > MAX_TCP_KEEPCNT)
		tp->keepalive_probes = val;

		if (val < 1 || val > MAX_TCP_SYNCNT)
		tp->syn_retries = val;
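/*
 * Illustrative user level sketch (assumption, not part of this file): how
 * the per-connection keepalive knobs handled above are set from an
 * application.  SO_KEEPALIVE enables the mechanism; TCP_KEEPIDLE,
 * TCP_KEEPINTVL and TCP_KEEPCNT tune idle time, probe interval and probe
 * count for this one socket.
 */
#if 0	/* user level example only */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static int enable_keepalive(int fd)
{
	int on = 1, idle = 600, intvl = 60, cnt = 5;

	if (setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on)) < 0)
		return -1;
	/* seconds of idle time before the first probe */
	setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle));
	/* seconds between unanswered probes */
	setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl));
	/* probes sent before the connection is considered dead */
	setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt));
	return 0;
}
#endif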
int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval,
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	if (level != SOL_TCP)
		return tp->af_specific->getsockopt(sk, level, optname,

	if (get_user(len, optlen))

	len = min(len, sizeof(int));

		val = (sk->nonagle == 1);
		val = (sk->nonagle == 2);
		if (tp->keepalive_time)
			val = tp->keepalive_time / HZ;
			val = sysctl_tcp_keepalive_time / HZ;
		if (tp->keepalive_intvl)
			val = tp->keepalive_intvl / HZ;
			val = sysctl_tcp_keepalive_intvl / HZ;
		if (tp->keepalive_probes)
			val = tp->keepalive_probes;
			val = sysctl_tcp_keepalive_probes;
		if (tp->syn_retries)
			val = tp->syn_retries;
			val = sysctl_tcp_syn_retries;
		return -ENOPROTOOPT;

	if (put_user(len, optlen))
	if (copy_to_user(optval, &val, len))
extern void __skb_cb_too_small_for_tcp(int, int);

void __init tcp_init(void)
	struct sk_buff *skb = NULL;

	if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
		__skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),

	tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
					       sizeof(struct open_request),
					       0, SLAB_HWCACHE_ALIGN,
	if (!tcp_openreq_cachep)
		panic("tcp_init: Cannot alloc open_request cache.");

	tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
					      sizeof(struct tcp_bind_bucket),
					      0, SLAB_HWCACHE_ALIGN,
	if (!tcp_bucket_cachep)
		panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");

	tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
						sizeof(struct tcp_tw_bucket),
						0, SLAB_HWCACHE_ALIGN,
	if (!tcp_timewait_cachep)
		panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");

	/* Size and allocate the main established and bind bucket
	 * The methodology is similar to that of the buffer cache.
	goal = num_physpages >> (23 - PAGE_SHIFT);

	for (order = 0; (1UL << order) < goal; order++)
	tcp_ehash_size = (1UL << order) * PAGE_SIZE /
		sizeof(struct tcp_ehash_bucket);
	tcp_ehash_size >>= 1;
	while (tcp_ehash_size & (tcp_ehash_size-1))
		tcp_ehash = (struct tcp_ehash_bucket *)
			__get_free_pages(GFP_ATOMIC, order);
	} while (tcp_ehash == NULL && --order > 0);
		panic("Failed to allocate TCP established hash table\n");
	for (i = 0; i < (tcp_ehash_size<<1); i++) {
		tcp_ehash[i].lock = RW_LOCK_UNLOCKED;
		tcp_ehash[i].chain = NULL;

	tcp_bhash_size = (1UL << order) * PAGE_SIZE /
		sizeof(struct tcp_bind_hashbucket);
	if ((tcp_bhash_size > (64 * 1024)) && order > 0)
		tcp_bhash = (struct tcp_bind_hashbucket *)
			__get_free_pages(GFP_ATOMIC, order);
	} while (tcp_bhash == NULL && --order >= 0);
		panic("Failed to allocate TCP bind hash table\n");
	for (i = 0; i < tcp_bhash_size; i++) {
		tcp_bhash[i].lock = SPIN_LOCK_UNLOCKED;
		tcp_bhash[i].chain = NULL;

		sysctl_local_port_range[0] = 32768;
		sysctl_local_port_range[1] = 61000;
	} else if (order < 3) {
		sysctl_local_port_range[0] = 1024*(3-order);
	tcp_port_rover = sysctl_local_port_range[0] - 1;

	printk("TCP: Hash tables configured (established %d bind %d)\n",
	       tcp_ehash_size<<1, tcp_bhash_size);