2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * Version: @(#)tcp.c 1.0.16 05/25/93
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
17 * Alan Cox : Numerous verify_area() calls
18 * Alan Cox : Set the ACK bit on a reset
19 * Alan Cox : Stopped it crashing if it closed while sk->inuse=1
20 * and was trying to connect (tcp_err()).
21 * Alan Cox : All icmp error handling was broken
22 * pointers passed where wrong and the
23 * socket was looked up backwards. Nobody
24 * tested any icmp error code obviously.
25 * Alan Cox : tcp_err() now handled properly. It wakes people
26 * on errors. select behaves and the icmp error race
27 * has gone by moving it into sock.c
28 * Alan Cox : tcp_reset() fixed to work for everything not just
29 * packets for unknown sockets.
30 * Alan Cox : tcp option processing.
31 * Alan Cox : Reset tweaked (still not 100%) [Had syn rule wrong]
32 * Herp Rosmanith : More reset fixes
33 * Alan Cox : No longer acks invalid rst frames. Acking
34 * any kind of RST is right out.
35 * Alan Cox : Sets an ignore me flag on an rst receive
36 * otherwise odd bits of prattle escape still
37 * Alan Cox : Fixed another acking RST frame bug. Should stop
38 * LAN workplace lockups.
39 * Alan Cox : Some tidyups using the new skb list facilities
40 * Alan Cox : sk->keepopen now seems to work
41 * Alan Cox : Pulls options out correctly on accepts
42 * Alan Cox : Fixed assorted sk->rqueue->next errors
43 * Alan Cox : PSH doesn't end a TCP read. Switched a bit to skb ops.
44 * Alan Cox : Tidied tcp_data to avoid a potential nasty.
45 * Alan Cox : Added some better commenting, as the tcp is hard to follow
46 * Alan Cox : Removed incorrect check for 20 * psh
47 * Michael O'Reilly : ack < copied bug fix.
48 * Johannes Stille : Misc tcp fixes (not all in yet).
49 * Alan Cox : FIN with no memory -> CRASH
50 * Alan Cox : Added socket option proto entries. Also added awareness of them to accept.
51 * Alan Cox : Added TCP options (SOL_TCP)
52 * Alan Cox : Switched wakeup calls to callbacks, so the kernel can layer network sockets.
53 * Alan Cox : Use ip_tos/ip_ttl settings.
54 * Alan Cox : Handle FIN (more) properly (we hope).
55 * Alan Cox : RST frames sent on unsynchronised state ack error/
56 * Alan Cox : Put in missing check for SYN bit.
57 * Alan Cox : Added tcp_select_window() aka NET2E
58 * window non shrink trick.
59 * Alan Cox : Added a couple of small NET2E timer fixes
60 * Charles Hedrick : TCP fixes
61 * Toomas Tamm : TCP window fixes
65 * Possibly a problem with accept(). BSD accept never fails after
66 * it causes a select. Linux can - given the official select semantics I
67 * feel that _really_ its the BSD network programs that are bust (notably
68 * inetd, which hangs occasionally because of this).
69 * Add VJ Fastrecovery algorithm ?
70 * Protocol closedown badly messed up.
71 * Incompatibility with spider ports (tcp hangs on that
72 * socket occasionally).
73 * MSG_PEEK and read on same socket at once can cause crashes.
75 * This program is free software; you can redistribute it and/or
76 * modify it under the terms of the GNU General Public License
77 * as published by the Free Software Foundation; either version
78 * 2 of the License, or(at your option) any later version.
80 #include <linux/types.h>
81 #include <linux/sched.h>
83 #include <linux/string.h>
84 #include <linux/socket.h>
85 #include <linux/sockios.h>
86 #include <linux/termios.h>
88 #include <linux/fcntl.h>
98 #include <linux/errno.h>
99 #include <linux/timer.h>
100 #include <asm/system.h>
101 #include <asm/segment.h>
102 #include <linux/mm.h>
105 unsigned long seq_offset
;
106 #define SUBNETSARELOCAL
/*
 * Return the smaller of two unsigned values.  TCP uses this for
 * clamping window and segment sizes.
 *
 * NOTE(review): the extracted fragment was garbled and the a >= b
 * return path was missing entirely, so the function fell off the end;
 * restored here so both branches return a value.
 */
static __inline__ int
min(unsigned int a, unsigned int b)
{
	if (a < b)
		return(a);
	return(b);
}
117 print_th(struct tcphdr
*th
)
121 if (inet_debug
!= DBG_TCP
) return;
123 printk("TCP header:\n");
124 ptr
=(unsigned char *)(th
+ 1);
125 printk(" source=%d, dest=%d, seq =%ld, ack_seq = %ld\n",
126 ntohs(th
->source
), ntohs(th
->dest
),
127 ntohl(th
->seq
), ntohl(th
->ack_seq
));
128 printk(" fin=%d, syn=%d, rst=%d, psh=%d, ack=%d, urg=%d res1=%d res2=%d\n",
129 th
->fin
, th
->syn
, th
->rst
, th
->psh
, th
->ack
,
130 th
->urg
, th
->res1
, th
->res2
);
131 printk(" window = %d, check = %d urg_ptr = %d\n",
132 ntohs(th
->window
), ntohs(th
->check
), ntohs(th
->urg_ptr
));
133 printk(" doff = %d\n", th
->doff
);
134 printk(" options = %d %d %d %d\n", ptr
[0], ptr
[1], ptr
[2], ptr
[3]);
139 /* This routine grabs the first thing off of a rcv queue. */
140 static struct sk_buff
*
141 get_firstr(struct sock
*sk
)
143 return skb_dequeue(&sk
->rqueue
);
147 * Difference between two values in tcp ack terms.
151 diff(unsigned long seq1
, unsigned long seq2
)
156 if (d
> 0) return(d
);
158 /* I hope this returns what I want. */
162 /* This routine picks a TCP windows for a socket based on
163 the following constraints
165 1. The window can never be shrunk once it is offered (RFC 793)
166 2. We limit memory per socket
168 For now we use NET2E3's heuristic of offering half the memory
169 we have handy. All is not as bad as this seems however because
170 of two things. Firstly we will bin packets even within the window
171 in order to get the data we are waiting for into the memory limit.
172 Secondly we bin common duplicate forms at receive time
174 Better heuristics welcome
177 static int tcp_select_window(struct sock
*sk
)
179 int new_window
= sk
->prot
->rspace(sk
);
182 * two things are going on here. First, we don't ever offer a
183 * window less than min(sk->mss, MAX_WINDOW/2). This is the
184 * receiver side of SWS as specified in RFC1122.
185 * Second, we always give them at least the window they
186 * had before, in order to avoid retracting window. This
187 * is technically allowed, but RFC1122 advises against it and
188 * in practice it causes trouble.
190 if (new_window
< min(sk
->mss
, MAX_WINDOW
/2) ||
191 new_window
< sk
->window
)
196 /* Enter the time wait state. */
198 static void tcp_time_wait(struct sock
*sk
)
200 sk
->state
= TCP_TIME_WAIT
;
201 sk
->shutdown
= SHUTDOWN_MASK
;
203 sk
->state_change(sk
);
204 reset_timer(sk
, TIME_CLOSE
, TCP_TIMEWAIT_LEN
);
208 * A timer event has trigger a tcp retransmit timeout. The
209 * socket xmit queue is ready and set up to send. Because
210 * the ack receive code keeps the queue straight we do
211 * nothing clever here.
215 tcp_retransmit(struct sock
*sk
, int all
)
218 ip_retransmit(sk
, all
);
222 sk
->ssthresh
= sk
->cong_window
>> 1; /* remember window where we lost */
223 /* sk->ssthresh in theory can be zero. I guess that's OK */
228 /* Do the actual retransmit. */
229 ip_retransmit(sk
, all
);
234 * This routine is called by the ICMP module when it gets some
235 * sort of error condition. If err < 0 then the socket should
236 * be closed and the error returned to the user. If err > 0
237 * it's just the icmp type << 8 | icmp code. After adjustment
238 * header points to the first 8 bytes of the tcp header. We need
239 * to find the appropriate port.
242 tcp_err(int err
, unsigned char *header
, unsigned long daddr
,
243 unsigned long saddr
, struct inet_protocol
*protocol
)
247 struct iphdr
*iph
=(struct iphdr
*)header
;
251 DPRINTF((DBG_TCP
, "TCP: tcp_err(%d, hdr=%X, daddr=%X saddr=%X, protocol=%X)\n",
252 err
, header
, daddr
, saddr
, protocol
));
254 th
=(struct tcphdr
*)header
;
255 sk
= get_sock(&tcp_prot
, th
->source
/*dest*/, daddr
, th
->dest
/*source*/, saddr
);
258 if (sk
== NULL
) return;
263 sk
->error_report(sk
);
267 if ((err
& 0xff00) == (ICMP_SOURCE_QUENCH
<< 8)) {
270 * For now we will just trigger a linear backoff.
271 * The slow start code should cause a real backoff here.
273 if (sk
->cong_window
> 4) sk
->cong_window
--;
277 DPRINTF((DBG_TCP
, "TCP: icmp_err got error\n"));
278 sk
->err
= icmp_err_convert
[err
& 0xff].errno
;
281 * If we've already connected we will keep trying
282 * until we time out, or the user gives up.
284 if (icmp_err_convert
[err
& 0xff].fatal
) {
285 if (sk
->state
== TCP_SYN_SENT
) {
286 sk
->state
= TCP_CLOSE
;
287 sk
->error_report(sk
); /* Wake people up to see the error (see connect in sock.c) */
295 * Walk down the receive queue counting readable data until we hit the end or we find a gap
296 * in the received data queue (ie a frame missing that needs sending to us)
300 tcp_readable(struct sock
*sk
)
302 unsigned long counted
;
303 unsigned long amount
;
309 DPRINTF((DBG_TCP
, "tcp_readable(sk=%X)\n", sk
));
311 printk("tcp_readable: %p - ",sk
);
313 if (sk
== NULL
|| skb_peek(&sk
->rqueue
) == NULL
) /* Empty sockets are easy! */
320 counted
= sk
->copied_seq
+1; /* Where we are at the moment */
323 save_flags(flags
); /* So nobody adds things at the wrong moment */
325 skb
=(struct sk_buff
*)sk
->rqueue
;
327 /* Do until a push or until we are out of data. */
331 /* This is wrong: It breaks Chameleon amongst other stacks */
333 restore_flags(flags
);
334 DPRINTF((DBG_TCP
, "tcp_readable, more than 20 packets without a psh\n"));
335 printk("tcp_read: possible read_queue corruption.\n");
339 if (before(counted
, skb
->h
.th
->seq
)) /* Found a hole so stops here */
341 sum
= skb
->len
-(counted
- skb
->h
.th
->seq
); /* Length - header but start from where we are up to (avoid overlaps) */
342 if (skb
->h
.th
->syn
) sum
++;
343 if (skb
->h
.th
->urg
) {
344 sum
-= ntohs(skb
->h
.th
->urg_ptr
); /* Dont count urg data */
346 if (sum
>= 0) { /* Add it up, move on */
348 if (skb
->h
.th
->syn
) amount
--;
351 if (amount
&& skb
->h
.th
->psh
) break;
352 skb
=(struct sk_buff
*)skb
->next
; /* Move along */
353 } while(skb
!= sk
->rqueue
);
354 restore_flags(flags
);
355 DPRINTF((DBG_TCP
, "tcp readable returning %d bytes\n", amount
));
357 printk("got %lu bytes.\n",amount
);
363 * Wait for a TCP event. Note the oddity with SEL_IN and reading. The
364 * listening socket has a receive queue of sockets to accept.
368 tcp_select(struct sock
*sk
, int sel_type
, select_table
*wait
)
370 DPRINTF((DBG_TCP
, "tcp_select(sk=%X, sel_type = %d, wait = %X)\n",
371 sk
, sel_type
, wait
));
378 select_wait(sk
->sleep
, wait
);
380 printk("-select out");
381 if (skb_peek(&sk
->rqueue
) != NULL
) {
382 if (sk
->state
== TCP_LISTEN
|| tcp_readable(sk
)) {
385 printk("-select ok data\n");
389 if (sk
->err
!= 0) /* Receiver error */
393 printk("-select ok error");
396 if (sk
->shutdown
& RCV_SHUTDOWN
) {
399 printk("-select ok down\n");
404 printk("-select fail\n");
408 select_wait(sk
->sleep
, wait
);
409 if (sk
->shutdown
& SEND_SHUTDOWN
) {
411 "write select on shutdown socket.\n"));
413 /* FIXME: should this return an error? */
420 * Hack so it will probably be able to write
421 * something if it says it's ok to write.
423 if (sk
->prot
->wspace(sk
) >= sk
->mss
) {
425 /* This should cause connect to work ok. */
426 if (sk
->state
== TCP_SYN_RECV
||
427 sk
->state
== TCP_SYN_SENT
) return(0);
431 "tcp_select: sleeping on write sk->wmem_alloc = %d, "
432 "sk->packets_out = %d\n"
433 "sk->wback = %X, sk->wfront = %X\n"
434 "sk->send_seq = %u, sk->window_seq=%u\n",
435 sk
->wmem_alloc
, sk
->packets_out
,
436 sk
->wback
, sk
->wfront
,
437 sk
->send_seq
, sk
->window_seq
));
442 select_wait(sk
->sleep
,wait
);
457 tcp_ioctl(struct sock
*sk
, int cmd
, unsigned long arg
)
460 DPRINTF((DBG_TCP
, "tcp_ioctl(sk=%X, cmd = %d, arg=%X)\n", sk
, cmd
, arg
));
463 return(dbg_ioctl((void *) arg
, DBG_TCP
));
466 #ifdef FIXME /* FIXME: */
470 unsigned long amount
;
472 if (sk
->state
== TCP_LISTEN
) return(-EINVAL
);
475 amount
= tcp_readable(sk
);
477 DPRINTF((DBG_TCP
, "returning %d\n", amount
));
478 err
=verify_area(VERIFY_WRITE
,(void *)arg
,
479 sizeof(unsigned long));
482 put_fs_long(amount
,(unsigned long *)arg
);
491 * Try to figure out if we need to read
495 if ((skb
=skb_peek(&sk
->rqueue
)) != NULL
)
497 if (sk
->copied_seq
+1 == skb
->h
.th
->seq
&& skb
->h
.th
->urg
)
501 err
=verify_area(VERIFY_WRITE
,(void *) arg
,
502 sizeof(unsigned long));
505 put_fs_long(answ
,(int *) arg
);
510 unsigned long amount
;
512 if (sk
->state
== TCP_LISTEN
) return(-EINVAL
);
513 amount
= sk
->prot
->wspace(sk
);
514 err
=verify_area(VERIFY_WRITE
,(void *)arg
,
515 sizeof(unsigned long));
518 put_fs_long(amount
,(unsigned long *)arg
);
527 /* This routine computes a TCP checksum. */
529 tcp_check(struct tcphdr
*th
, int len
,
530 unsigned long saddr
, unsigned long daddr
)
534 if (saddr
== 0) saddr
= my_addr();
536 __asm__("\t addl %%ecx,%%ebx\n"
537 "\t adcl %%edx,%%ebx\n"
538 "\t adcl $0, %%ebx\n"
540 : "0"(daddr
), "c"(saddr
), "d"((ntohs(len
) << 16) + IPPROTO_TCP
*256)
547 "\t adcl %%eax, %%ebx\n"
549 "\t adcl $0, %%ebx\n"
550 : "=b"(sum
) , "=S"(th
)
551 : "0"(sum
), "c"(len
/4) ,"1"(th
)
552 : "ax", "cx", "bx", "si" );
555 /* Convert from 32 bits to 16 bits. */
556 __asm__("\t movl %%ebx, %%ecx\n"
557 "\t shrl $16,%%ecx\n"
558 "\t addw %%cx, %%bx\n"
564 /* Check for an extra word. */
565 if ((len
& 2) != 0) {
567 "\t addw %%ax,%%bx\n"
569 : "=b"(sum
), "=S"(th
)
574 /* Now check for the extra byte. */
575 if ((len
& 1) != 0) {
578 "\t addw %%ax,%%bx\n"
585 /* We only want the bottom 16 bits, but we never cleared the top 16. */
586 return((~sum
) & 0xffff);
590 void tcp_send_check(struct tcphdr
*th
, unsigned long saddr
,
591 unsigned long daddr
, int len
, struct sock
*sk
)
594 th
->check
= tcp_check(th
, len
, saddr
, daddr
);
598 static void tcp_send_skb(struct sock
*sk
, struct sk_buff
*skb
)
602 /* length of packet (not counting length of pre-tcp headers) */
603 size
= skb
->len
- ((unsigned char *) skb
->h
.th
- skb
->data
);
605 /* sanity check it.. */
606 if (size
< sizeof(struct tcphdr
) || size
> skb
->len
) {
607 printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
608 skb
, skb
->data
, skb
->h
.th
, skb
->len
);
609 kfree_skb(skb
, FREE_WRITE
);
613 /* If we have queued a header size packet.. */
614 if (size
== sizeof(struct tcphdr
)) {
615 /* If its got a syn or fin its notionally included in the size..*/
616 if(!skb
->h
.th
->syn
&& !skb
->h
.th
->fin
) {
617 printk("tcp_send_skb: attempt to queue a bogon.\n");
618 kfree_skb(skb
,FREE_WRITE
);
623 /* We need to complete and send the packet. */
624 tcp_send_check(skb
->h
.th
, sk
->saddr
, sk
->daddr
, size
, sk
);
626 skb
->h
.seq
= sk
->send_seq
;
627 if (after(sk
->send_seq
, sk
->window_seq
) ||
628 (sk
->retransmits
&& sk
->timeout
== TIME_WRITE
) ||
629 sk
->packets_out
>= sk
->cong_window
) {
630 DPRINTF((DBG_TCP
, "sk->cong_window = %d, sk->packets_out = %d\n",
631 sk
->cong_window
, sk
->packets_out
));
632 DPRINTF((DBG_TCP
, "sk->send_seq = %d, sk->window_seq = %d\n",
633 sk
->send_seq
, sk
->window_seq
));
635 skb
->magic
= TCP_WRITE_QUEUE_MAGIC
;
636 if (sk
->wback
== NULL
) {
639 sk
->wback
->next
= skb
;
642 if (before(sk
->window_seq
, sk
->wfront
->h
.seq
) &&
643 sk
->send_head
== NULL
&&
644 sk
->ack_backlog
== 0)
645 reset_timer(sk
, TIME_PROBE0
, sk
->rto
);
647 sk
->prot
->queue_xmit(sk
, skb
->dev
, skb
, 0);
651 struct sk_buff
* tcp_dequeue_partial(struct sock
* sk
)
653 struct sk_buff
* skb
;
661 del_timer(&sk
->partial_timer
);
663 restore_flags(flags
);
667 static void tcp_send_partial(struct sock
*sk
)
673 while ((skb
= tcp_dequeue_partial(sk
)) != NULL
)
674 tcp_send_skb(sk
, skb
);
677 void tcp_enqueue_partial(struct sk_buff
* skb
, struct sock
* sk
)
679 struct sk_buff
* tmp
;
686 del_timer(&sk
->partial_timer
);
688 sk
->partial_timer
.expires
= HZ
;
689 sk
->partial_timer
.function
= (void (*)(unsigned long)) tcp_send_partial
;
690 sk
->partial_timer
.data
= (unsigned long) sk
;
691 add_timer(&sk
->partial_timer
);
692 restore_flags(flags
);
694 tcp_send_skb(sk
, tmp
);
698 /* This routine sends an ack and also updates the window. */
700 tcp_send_ack(unsigned long sequence
, unsigned long ack
,
702 struct tcphdr
*th
, unsigned long daddr
)
704 struct sk_buff
*buff
;
706 struct device
*dev
= NULL
;
710 return; /* We have been reset, we may not send again */
712 * We need to grab some memory, and put together an ack,
713 * and then put it into the queue to be sent.
715 buff
= sk
->prot
->wmalloc(sk
, MAX_ACK_SIZE
, 1, GFP_ATOMIC
);
717 /* Force it to send an ack. */
719 if (sk
->timeout
!= TIME_WRITE
&& tcp_connected(sk
->state
)) {
720 reset_timer(sk
, TIME_WRITE
, 10);
722 if (inet_debug
== DBG_SLIP
) printk("\rtcp_ack: malloc failed\n");
726 buff
->mem_addr
= buff
;
727 buff
->mem_len
= MAX_ACK_SIZE
;
728 buff
->len
= sizeof(struct tcphdr
);
730 t1
=(struct tcphdr
*) buff
->data
;
732 /* Put in the IP header and routing stuff. */
733 tmp
= sk
->prot
->build_header(buff
, sk
->saddr
, daddr
, &dev
,
734 IPPROTO_TCP
, sk
->opt
, MAX_ACK_SIZE
,sk
->ip_tos
,sk
->ip_ttl
);
737 sk
->prot
->wfree(sk
, buff
->mem_addr
, buff
->mem_len
);
738 if (inet_debug
== DBG_SLIP
) printk("\rtcp_ack: build_header failed\n");
742 t1
=(struct tcphdr
*)((char *)t1
+tmp
);
745 memcpy(t1
, th
, sizeof(*t1
)); /* this should probably be removed */
747 /* swap the send and the receive. */
748 t1
->dest
= th
->source
;
749 t1
->source
= th
->dest
;
750 t1
->seq
= ntohl(sequence
);
752 sk
->window
= tcp_select_window(sk
);/*sk->prot->rspace(sk);*/
753 t1
->window
= ntohs(sk
->window
);
761 if (ack
== sk
->acked_seq
) {
765 if (sk
->send_head
== NULL
&& sk
->wfront
== NULL
&& sk
->timeout
== TIME_WRITE
)
768 reset_timer(sk
,TIME_KEEPOPEN
,TCP_TIMEOUT_LEN
);
773 t1
->ack_seq
= ntohl(ack
);
774 t1
->doff
= sizeof(*t1
)/4;
775 tcp_send_check(t1
, sk
->saddr
, daddr
, sizeof(*t1
), sk
);
777 printk("\rtcp_ack: seq %lx ack %lx\n", sequence
, ack
);
778 sk
->prot
->queue_xmit(sk
, dev
, buff
, 1);
782 /* This routine builds a generic TCP header. */
784 tcp_build_header(struct tcphdr
*th
, struct sock
*sk
, int push
)
787 /* FIXME: want to get rid of this. */
788 memcpy(th
,(void *) &(sk
->dummy_th
), sizeof(*th
));
789 th
->seq
= htonl(sk
->send_seq
);
790 th
->psh
=(push
== 0) ? 1 : 0;
791 th
->doff
= sizeof(*th
)/4;
797 th
->ack_seq
= htonl(sk
->acked_seq
);
798 sk
->window
= tcp_select_window(sk
)/*sk->prot->rspace(sk)*/;
799 th
->window
= htons(sk
->window
);
805 * This routine copies from a user buffer into a socket,
806 * and starts the transmit system.
809 tcp_write(struct sock
*sk
, unsigned char *from
,
810 int len
, int nonblock
, unsigned flags
)
816 struct sk_buff
*send_tmp
;
819 struct device
*dev
= NULL
;
821 DPRINTF((DBG_TCP
, "tcp_write(sk=%X, from=%X, len=%d, nonblock=%d, flags=%X)\n",
822 sk
, from
, len
, nonblock
, flags
));
827 if (sk
->err
) { /* Stop on an error */
829 if (copied
) return(copied
);
835 /* First thing we do is make sure that we are established. */
836 if (sk
->shutdown
& SEND_SHUTDOWN
) {
839 if (copied
) return(copied
);
845 /* Wait for a connection to finish. */
847 while(sk
->state
!= TCP_ESTABLISHED
&& sk
->state
!= TCP_CLOSE_WAIT
) {
850 if (copied
) return(copied
);
856 if (sk
->state
!= TCP_SYN_SENT
&& sk
->state
!= TCP_SYN_RECV
) {
858 DPRINTF((DBG_TCP
, "tcp_write: return 1\n"));
859 if (copied
) return(copied
);
868 send_sig(SIGPIPE
, current
, 0);
873 if (nonblock
|| copied
) {
875 DPRINTF((DBG_TCP
, "tcp_write: return 2\n"));
876 if (copied
) return(copied
);
882 if (sk
->state
!= TCP_ESTABLISHED
&&
883 sk
->state
!= TCP_CLOSE_WAIT
&& sk
->err
== 0) {
884 interruptible_sleep_on(sk
->sleep
);
885 if (current
->signal
& ~current
->blocked
) {
887 DPRINTF((DBG_TCP
, "tcp_write: return 3\n"));
888 if (copied
) return(copied
);
889 return(-ERESTARTSYS
);
897 * The following code can result in copy <= if sk->mss is ever
898 * decreased. It shouldn't be. sk->mss is min(sk->mtu, sk->max_window).
899 * sk->mtu is constant once SYN processing is finished. I.e. we
900 * had better not get here until we've seen his SYN and at least one
901 * valid ack. (The SYN sets sk->mtu and the ack sets sk->max_window.)
902 * But ESTABLISHED should guarantee that. sk->max_window is by definition
903 * non-decreasing. Note that any ioctl to set user_mss must be done
904 * before the exchange of SYN's. If the initial ack from the other
905 * end has a window of 0, max_window and thus mss will both be 0.
908 /* Now we need to check if we have a half built packet. */
909 if ((skb
= tcp_dequeue_partial(sk
)) != NULL
) {
912 /* IP header + TCP header */
913 hdrlen
= ((unsigned long)skb
->h
.th
- (unsigned long)skb
->data
)
914 + sizeof(struct tcphdr
);
916 /* Add more stuff to the end of skb->len */
917 if (!(flags
& MSG_OOB
)) {
918 copy
= min(sk
->mss
- (skb
->len
- hdrlen
), len
);
919 /* FIXME: this is really a bug. */
921 printk("TCP: **bug**: \"copy\" <= 0!!\n");
925 memcpy_fromfs(skb
->data
+ skb
->len
, from
, copy
);
930 sk
->send_seq
+= copy
;
932 if ((skb
->len
- hdrlen
) >= sk
->mss
||
935 tcp_send_skb(sk
, skb
);
937 tcp_enqueue_partial(skb
, sk
);
942 * We also need to worry about the window.
943 * If window < 1/2 the maximum window we've seen from this
944 * host, don't use it. This is sender side
945 * silly window prevention, as specified in RFC1122.
946 * (Note that this is different from earlier versions of
947 * SWS prevention, e.g. RFC813.). What we actually do is
948 * use the whole MSS. Since the results in the right
949 * edge of the packet being outside the window, it will
950 * be queued for later rather than sent.
953 copy
= diff(sk
->window_seq
, sk
->send_seq
);
954 /* what if max_window == 1? In that case max_window >> 1 is 0.
955 * however in that case copy == max_window, so it's OK to use
957 if (copy
< (sk
->max_window
>> 1))
959 copy
= min(copy
, sk
->mss
);
960 copy
= min(copy
, len
);
962 /* We should really check the window here also. */
964 if (copy
< sk
->mss
&& !(flags
& MSG_OOB
)) {
965 /* We will release the socket incase we sleep here. */
967 /* NB: following must be mtu, because mss can be increased.
968 * mss is always <= mtu */
969 skb
= prot
->wmalloc(sk
, sk
->mtu
+ 128 + prot
->max_header
+ sizeof(*skb
), 0, GFP_KERNEL
);
973 /* We will release the socket incase we sleep here. */
975 skb
= prot
->wmalloc(sk
, copy
+ prot
->max_header
+ sizeof(*skb
), 0, GFP_KERNEL
);
979 /* If we didn't get any memory, we need to sleep. */
981 if (nonblock
/* || copied */) {
983 DPRINTF((DBG_TCP
, "tcp_write: return 4\n"));
984 if (copied
) return(copied
);
988 /* FIXME: here is another race condition. */
989 tmp
= sk
->wmem_alloc
;
992 /* Again we will try to avoid it. */
993 if (tmp
<= sk
->wmem_alloc
&&
994 (sk
->state
== TCP_ESTABLISHED
||sk
->state
== TCP_CLOSE_WAIT
)
996 interruptible_sleep_on(sk
->sleep
);
997 if (current
->signal
& ~current
->blocked
) {
999 DPRINTF((DBG_TCP
, "tcp_write: return 5\n"));
1000 if (copied
) return(copied
);
1001 return(-ERESTARTSYS
);
1016 * FIXME: we need to optimize this.
1017 * Perhaps some hints here would be good.
1019 tmp
= prot
->build_header(skb
, sk
->saddr
, sk
->daddr
, &dev
,
1020 IPPROTO_TCP
, sk
->opt
, skb
->mem_len
,sk
->ip_tos
,sk
->ip_ttl
);
1022 prot
->wfree(sk
, skb
->mem_addr
, skb
->mem_len
);
1024 DPRINTF((DBG_TCP
, "tcp_write: return 6\n"));
1025 if (copied
) return(copied
);
1031 skb
->h
.th
=(struct tcphdr
*) buff
;
1032 tmp
= tcp_build_header((struct tcphdr
*)buff
, sk
, len
-copy
);
1034 prot
->wfree(sk
, skb
->mem_addr
, skb
->mem_len
);
1036 DPRINTF((DBG_TCP
, "tcp_write: return 7\n"));
1037 if (copied
) return(copied
);
1041 if (flags
& MSG_OOB
) {
1042 ((struct tcphdr
*)buff
)->urg
= 1;
1043 ((struct tcphdr
*)buff
)->urg_ptr
= ntohs(copy
);
1046 memcpy_fromfs(buff
+tmp
, from
, copy
);
1053 sk
->send_seq
+= copy
;
1055 if (send_tmp
!= NULL
&& sk
->packets_out
) {
1056 tcp_enqueue_partial(send_tmp
, sk
);
1059 tcp_send_skb(sk
, skb
);
1064 * Nagles rule. Turn Nagle off with TCP_NODELAY for highly
1065 * interactive fast network servers. It's meant to be on and
1066 * it really improves the throughput though not the echo time
1067 * on my slow slip link - Alan
1070 /* Avoid possible race on send_tmp - c/o Johannes Stille */
1073 /* If not nagling we can send on the before case too.. */
1074 || (sk
->nonagle
&& before(sk
->send_seq
, sk
->window_seq
))
1076 tcp_send_partial(sk
);
1079 DPRINTF((DBG_TCP
, "tcp_write: return 8\n"));
1085 tcp_sendto(struct sock
*sk
, unsigned char *from
,
1086 int len
, int nonblock
, unsigned flags
,
1087 struct sockaddr_in
*addr
, int addr_len
)
1089 struct sockaddr_in sin
;
1091 if (addr_len
< sizeof(sin
)) return(-EINVAL
);
1092 memcpy_fromfs(&sin
, addr
, sizeof(sin
));
1093 if (sin
.sin_family
&& sin
.sin_family
!= AF_INET
) return(-EINVAL
);
1094 if (sin
.sin_port
!= sk
->dummy_th
.dest
) return(-EINVAL
);
1095 if (sin
.sin_addr
.s_addr
!= sk
->daddr
) return(-EINVAL
);
1096 return(tcp_write(sk
, from
, len
, nonblock
, flags
));
1101 tcp_read_wakeup(struct sock
*sk
)
1104 struct device
*dev
= NULL
;
1106 struct sk_buff
*buff
;
1108 DPRINTF((DBG_TCP
, "in tcp read wakeup\n"));
1109 if (!sk
->ack_backlog
) return;
1112 * FIXME: we need to put code here to prevent this routine from
1113 * being called. Being called once in a while is ok, so only check
1114 * if this is the second time in a row.
1118 * We need to grab some memory, and put together an ack,
1119 * and then put it into the queue to be sent.
1121 buff
= sk
->prot
->wmalloc(sk
,MAX_ACK_SIZE
,1, GFP_ATOMIC
);
1123 /* Try again real soon. */
1124 reset_timer(sk
, TIME_WRITE
, 10);
1128 buff
->mem_addr
= buff
;
1129 buff
->mem_len
= MAX_ACK_SIZE
;
1130 buff
->len
= sizeof(struct tcphdr
);
1133 /* Put in the IP header and routing stuff. */
1134 tmp
= sk
->prot
->build_header(buff
, sk
->saddr
, sk
->daddr
, &dev
,
1135 IPPROTO_TCP
, sk
->opt
, MAX_ACK_SIZE
,sk
->ip_tos
,sk
->ip_ttl
);
1138 sk
->prot
->wfree(sk
, buff
->mem_addr
, buff
->mem_len
);
1143 t1
=(struct tcphdr
*)(buff
->data
+tmp
);
1145 memcpy(t1
,(void *) &sk
->dummy_th
, sizeof(*t1
));
1146 t1
->seq
= ntohl(sk
->send_seq
);
1154 sk
->ack_backlog
= 0;
1156 sk
->window
= tcp_select_window(sk
);/*sk->prot->rspace(sk);*/
1157 t1
->window
= ntohs(sk
->window
);
1158 t1
->ack_seq
= ntohl(sk
->acked_seq
);
1159 t1
->doff
= sizeof(*t1
)/4;
1160 tcp_send_check(t1
, sk
->saddr
, sk
->daddr
, sizeof(*t1
), sk
);
1161 sk
->prot
->queue_xmit(sk
, dev
, buff
, 1);
1167 * This routine frees used buffers.
1168 * It should consider sending an ACK to let the
1169 * other end know we now have a bigger window.
1172 cleanup_rbuf(struct sock
*sk
)
1174 unsigned long flags
;
1176 struct sk_buff
*skb
;
1179 printk("cleaning rbuf for sk=%p\n", sk
);
1184 left
= sk
->prot
->rspace(sk
);
1187 * We have to loop through all the buffer headers,
1188 * and try to free up all the space we can.
1190 while((skb
=skb_peek(&sk
->rqueue
)) != NULL
)
1196 kfree_skb(skb
, FREE_READ
);
1199 restore_flags(flags
);
1203 * At this point we should send an ack if the difference
1204 * in the window, and the amount of space is bigger than
1207 DPRINTF((DBG_TCP
, "sk->window left = %d, sk->prot->rspace(sk)=%d\n",
1208 sk
->window
- sk
->bytes_rcv
, sk
->prot
->rspace(sk
)));
1211 printk("sk->rspace = %lu, was %d\n", sk
->prot
->rspace(sk
),
1213 if (sk
->prot
->rspace(sk
) != left
)
1216 * This area has caused the most trouble. The current strategy
1217 * is to simply do nothing if the other end has room to send at
1218 * least 3 full packets, because the ack from those will auto-
1219 * matically update the window. If the other end doesn't think
1220 * we have much space left, but we have room for at least 1 more
1221 * complete packet than it thinks we do, we will send an ack
1222 * immediately. Otherwise we will wait up to .5 seconds in case
1223 * the user reads some more.
1227 * It's unclear whether to use sk->mtu or sk->mss here. They differ only
1228 * if the other end is offering a window smaller than the agreed on MSS
1229 * (called sk->mtu here). In theory there's no connection between send
1230 * and receive, and so no reason to think that they're going to send
1231 * small packets. For the moment I'm using the hack of reducing the mss
1232 * only on the send side, so I'm putting mtu here.
1234 if ((sk
->prot
->rspace(sk
) > (sk
->window
- sk
->bytes_rcv
+ sk
->mtu
))) {
1235 /* Send an ack right now. */
1236 tcp_read_wakeup(sk
);
1238 /* Force it to send an ack soon. */
1239 int was_active
= del_timer(&sk
->timer
);
1240 if (!was_active
|| TCP_ACK_TIME
< sk
->timer
.expires
) {
1241 reset_timer(sk
, TIME_WRITE
, TCP_ACK_TIME
);
1243 add_timer(&sk
->timer
);
1249 /* Handle reading urgent data. */
1251 tcp_read_urg(struct sock
* sk
, int nonblock
,
1252 unsigned char *to
, int len
, unsigned flags
)
1255 struct sk_buff
*skb
;
1257 DPRINTF((DBG_TCP
, "tcp_read_urg(sk=%X, to=%X, len=%d, flags=%X)\n",
1258 sk
, to
, len
, flags
));
1263 while(sk
->urg
==0 || skb_peek(&sk
->rqueue
) == NULL
) {
1268 if (copied
) return(copied
);
1274 if (sk
->state
== TCP_CLOSE
|| sk
->done
) {
1276 if (copied
) return(copied
);
1284 if (sk
->shutdown
& RCV_SHUTDOWN
) {
1291 if (nonblock
|| copied
) {
1293 if (copied
) return(copied
);
1297 /* Now at this point, we may have gotten some data. */
1300 if ((sk
->urg
== 0 || skb_peek(&sk
->rqueue
) == NULL
) &&
1301 sk
->err
== 0 && !(sk
->shutdown
& RCV_SHUTDOWN
)) {
1302 interruptible_sleep_on(sk
->sleep
);
1303 if (current
->signal
& ~current
->blocked
) {
1305 if (copied
) return(copied
);
1306 return(-ERESTARTSYS
);
1313 skb
= skb_peek(&sk
->rqueue
);
1317 if (skb
->h
.th
->urg
&& !skb
->urg_used
) {
1318 if (skb
->h
.th
->urg_ptr
== 0) {
1319 skb
->h
.th
->urg_ptr
= ntohs(skb
->len
);
1321 amt
= min(ntohs(skb
->h
.th
->urg_ptr
),len
);
1324 memcpy_tofs(to
,(unsigned char *)(skb
->h
.th
) +
1325 skb
->h
.th
->doff
*4, amt
);
1328 if (!(flags
& MSG_PEEK
)) {
1336 skb
=(struct sk_buff
*)skb
->next
;
1337 } while(skb
!= sk
->rqueue
);
1345 /* This routine copies from a sock struct into the user buffer. */
1347 tcp_read(struct sock
*sk
, unsigned char *to
,
1348 int len
, int nonblock
, unsigned flags
)
1350 int copied
=0; /* will be used to say how much has been copied. */
1351 struct sk_buff
*skb
;
1352 unsigned long offset
;
1356 if (len
== 0) return(0);
1361 err
=verify_area(VERIFY_WRITE
,to
,len
);
1365 /* This error should be checked. */
1366 if (sk
->state
== TCP_LISTEN
) return(-ENOTCONN
);
1368 /* Urgent data needs to be handled specially. */
1369 if ((flags
& MSG_OOB
))
1370 return(tcp_read_urg(sk
, nonblock
, to
, len
, flags
));
1372 /* So no-one else will use this socket. */
1375 skb
=skb_peek(&sk
->rqueue
);
1377 DPRINTF((DBG_TCP
, "tcp_read(sk=%X, to=%X, len=%d, nonblock=%d, flags=%X)\n",
1378 sk
, to
, len
, nonblock
, flags
));
1381 /* skb->used just checks to see if we've gone all the way around. */
1383 /* While no data, or first data indicates some is missing, or data is used */
1384 while(skb
== NULL
||
1385 before(sk
->copied_seq
+1, skb
->h
.th
->seq
) || skb
->used
) {
1386 DPRINTF((DBG_TCP
, "skb = %X:\n", skb
));
1395 DPRINTF((DBG_TCP
, "tcp_read: returning %d\n",
1404 if (sk
->state
== TCP_CLOSE
)
1408 DPRINTF((DBG_TCP
, "tcp_read: returning %d\n",
1419 if (sk
->shutdown
& RCV_SHUTDOWN
)
1422 if (copied
== 0) sk
->done
= 1;
1423 DPRINTF((DBG_TCP
, "tcp_read: returning %d\n", copied
));
1427 if (nonblock
|| copied
)
1431 printk("read: EAGAIN\n");
1434 DPRINTF((DBG_TCP
, "tcp_read: returning %d\n",
1441 if ((flags
& MSG_PEEK
) && copied
!= 0)
1444 DPRINTF((DBG_TCP
, "tcp_read: returning %d\n", copied
));
1448 DPRINTF((DBG_TCP
, "tcp_read about to sleep. state = %d\n",
1453 * Now we may have some data waiting or we could
1454 * have changed state.
1457 if (sk
->shutdown
& RCV_SHUTDOWN
|| sk
->err
!= 0) {
1463 if (skb_peek(&sk
->rqueue
) == NULL
||
1464 before(sk
->copied_seq
+1, sk
->rqueue
->h
.th
->seq
)) {
1466 printk("Read wait sleep\n");
1467 interruptible_sleep_on(sk
->sleep
);
1469 printk("Read wait wakes\n");
1470 if (current
->signal
& ~current
->blocked
) {
1473 DPRINTF((DBG_TCP
, "tcp_read: returning %d\n",
1477 return(-ERESTARTSYS
);
1482 DPRINTF((DBG_TCP
, "tcp_read woke up. \n"));
1485 skb
=skb_peek(&sk
->rqueue
);
1486 /* That may have been null if we were beaten, if so we loop again */
1490 * Copy anything from the current block that needs
1491 * to go into the user buffer.
1493 offset
= sk
->copied_seq
+1 - skb
->h
.th
->seq
;
1495 if (skb
->h
.th
->syn
) offset
--;
1496 if (offset
< skb
->len
) /* Some of the packet is useful */
1499 * If there is urgent data we must either
1500 * return or skip over it.
1506 sk
->copied_seq
+= ntohs(skb
->h
.th
->urg_ptr
);
1507 offset
+= ntohs(skb
->h
.th
->urg_ptr
);
1508 if (offset
>= skb
->len
)
1511 skb
=(struct sk_buff
*)skb
->next
;
1520 send_sig(SIGURG
, current
, 0);
1524 /* Ok so how much can we use ? */
1525 used
= min(skb
->len
- offset
, len
);
1527 memcpy_tofs(to
,((unsigned char *)skb
->h
.th
) +
1528 skb
->h
.th
->doff
*4 + offset
, used
);
1533 /* If we were reading the data is 'eaten' */
1534 if (!(flags
& MSG_PEEK
))
1535 sk
->copied_seq
+= used
;
1538 * Mark this data used if we are really reading it,
1539 * and if it doesn't contain any urgent data. And we
1540 * have used all the data.
1542 if (!(flags
& MSG_PEEK
) &&
1543 (!skb
->h
.th
->urg
|| skb
->urg_used
) &&
1544 (used
+ offset
>= skb
->len
))
1548 * See if this is the end of a message or if the
1549 * remaining data is urgent.
1551 if (/*skb->h.th->psh || */skb
->h
.th
->urg
)
1557 { /* already used this data, must be a retransmit */
1560 /* Move along a packet */
1561 skb
=(struct sk_buff
*)skb
->next
;
1563 /* Clean up data we have read: This will do ACK frames */
1566 DPRINTF((DBG_TCP
, "tcp_read: returning %d\n", copied
));
1567 if (copied
== 0 && nonblock
)
1574 * Send a FIN without closing the connection.
1575 * Not called at interrupt time.
1578 tcp_shutdown(struct sock
*sk
, int how
)
1580 struct sk_buff
*buff
;
1581 struct tcphdr
*t1
, *th
;
1584 struct device
*dev
= NULL
;
1587 * We need to grab some memory, and put together a FIN,
1588 * and then put it into the queue to be sent.
1590 * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1591 * Most of this is guesswork, so maybe it will work...
1593 /* If we've already sent a FIN, return. */
1594 if (sk
->state
== TCP_FIN_WAIT1
|| sk
->state
== TCP_FIN_WAIT2
) return;
1595 if (!(how
& SEND_SHUTDOWN
)) return;
1598 /* Clear out any half completed packets. */
1600 tcp_send_partial(sk
);
1602 prot
=(struct proto
*)sk
->prot
;
1603 th
=(struct tcphdr
*)&sk
->dummy_th
;
1604 release_sock(sk
); /* incase the malloc sleeps. */
1605 buff
= prot
->wmalloc(sk
, MAX_RESET_SIZE
,1 , GFP_KERNEL
);
1606 if (buff
== NULL
) return;
1609 DPRINTF((DBG_TCP
, "tcp_shutdown_send buff = %X\n", buff
));
1610 buff
->mem_addr
= buff
;
1611 buff
->mem_len
= MAX_RESET_SIZE
;
1613 buff
->len
= sizeof(*t1
);
1614 t1
=(struct tcphdr
*) buff
->data
;
1616 /* Put in the IP header and routing stuff. */
1617 tmp
= prot
->build_header(buff
,sk
->saddr
, sk
->daddr
, &dev
,
1618 IPPROTO_TCP
, sk
->opt
,
1619 sizeof(struct tcphdr
),sk
->ip_tos
,sk
->ip_ttl
);
1622 prot
->wfree(sk
,buff
->mem_addr
, buff
->mem_len
);
1624 DPRINTF((DBG_TCP
, "Unable to build header for fin.\n"));
1628 t1
=(struct tcphdr
*)((char *)t1
+tmp
);
1631 memcpy(t1
, th
, sizeof(*t1
));
1632 t1
->seq
= ntohl(sk
->send_seq
);
1634 buff
->h
.seq
= sk
->send_seq
;
1636 t1
->ack_seq
= ntohl(sk
->acked_seq
);
1637 t1
->window
= ntohs(sk
->window
=tcp_select_window(sk
)/*sk->prot->rspace(sk)*/);
1640 t1
->doff
= sizeof(*t1
)/4;
1641 tcp_send_check(t1
, sk
->saddr
, sk
->daddr
, sizeof(*t1
), sk
);
1644 * Can't just queue this up.
1645 * It should go at the end of the write queue.
1647 if (sk
->wback
!= NULL
) {
1650 sk
->wback
->next
= buff
;
1652 buff
->magic
= TCP_WRITE_QUEUE_MAGIC
;
1654 sk
->prot
->queue_xmit(sk
, dev
, buff
, 0);
1657 if (sk
->state
== TCP_ESTABLISHED
) sk
->state
= TCP_FIN_WAIT1
;
1658 else sk
->state
= TCP_FIN_WAIT2
;
1665 tcp_recvfrom(struct sock
*sk
, unsigned char *to
,
1666 int to_len
, int nonblock
, unsigned flags
,
1667 struct sockaddr_in
*addr
, int *addr_len
)
1669 struct sockaddr_in sin
;
1674 /* Have to check these first unlike the old code. If
1675 we check them after we lose data on an error
1677 err
= verify_area(VERIFY_WRITE
,addr_len
,sizeof(long));
1680 len
= get_fs_long(addr_len
);
1681 if(len
> sizeof(sin
))
1683 err
=verify_area(VERIFY_WRITE
, addr
, len
);
1687 result
=tcp_read(sk
, to
, to_len
, nonblock
, flags
);
1689 if (result
< 0) return(result
);
1691 sin
.sin_family
= AF_INET
;
1692 sin
.sin_port
= sk
->dummy_th
.dest
;
1693 sin
.sin_addr
.s_addr
= sk
->daddr
;
1695 memcpy_tofs(addr
, &sin
, len
);
1696 put_fs_long(len
, addr_len
);
1701 /* This routine will send an RST to the other tcp. */
1703 tcp_reset(unsigned long saddr
, unsigned long daddr
, struct tcphdr
*th
,
1704 struct proto
*prot
, struct options
*opt
, struct device
*dev
, int tos
, int ttl
)
1706 struct sk_buff
*buff
;
1711 * We need to grab some memory, and put together an RST,
1712 * and then put it into the queue to be sent.
1714 buff
= prot
->wmalloc(NULL
, MAX_RESET_SIZE
, 1, GFP_ATOMIC
);
1718 DPRINTF((DBG_TCP
, "tcp_reset buff = %X\n", buff
));
1719 buff
->mem_addr
= buff
;
1720 buff
->mem_len
= MAX_RESET_SIZE
;
1721 buff
->len
= sizeof(*t1
);
1725 t1
=(struct tcphdr
*) buff
->data
;
1727 /* Put in the IP header and routing stuff. */
1728 tmp
= prot
->build_header(buff
, saddr
, daddr
, &dev
, IPPROTO_TCP
, opt
,
1729 sizeof(struct tcphdr
),tos
,ttl
);
1732 prot
->wfree(NULL
, buff
->mem_addr
, buff
->mem_len
);
1735 t1
=(struct tcphdr
*)((char *)t1
+tmp
);
1737 memcpy(t1
, th
, sizeof(*t1
));
1739 /* Swap the send and the receive. */
1740 t1
->dest
= th
->source
;
1741 t1
->source
= th
->dest
;
1748 t1
->seq
=th
->ack_seq
;
1755 t1
->ack_seq
=htonl(th
->seq
);
1757 t1
->ack_seq
=htonl(th
->seq
+1);
1765 t1
->doff
= sizeof(*t1
)/4;
1766 tcp_send_check(t1
, saddr
, daddr
, sizeof(*t1
), NULL
);
1767 prot
->queue_xmit(NULL
, dev
, buff
, 1);
1772 * Look for tcp options. Parses everything but only knows about MSS.
1773 * This routine is always called with the packet containing the SYN.
1774 * However it may also be called with the ack to the SYN. So you
1775 * can't assume this is always the SYN. It's always called after
1776 * we have set up sk->mtu to our own MTU.
1780 tcp_options(struct sock
*sk
, struct tcphdr
*th
)
1783 int length
=(th
->doff
*4)-sizeof(struct tcphdr
);
1786 ptr
= (unsigned char *)(th
+ 1);
1801 if(opsize
<=2) /* Avoid silly options looping forever */
1806 if(opsize
==4 && th
->syn
)
1808 sk
->mtu
=min(sk
->mtu
,ntohs(*(unsigned short *)ptr
));
1812 /* Add other options here as people feel the urge to implement stuff like large windows */
1820 sk
->mtu
=min(sk
->mtu
, 536); /* default MSS if none sent */
1822 sk
->mss
= min(sk
->max_window
, sk
->mtu
);
1825 static inline unsigned long default_mask(unsigned long dst
)
1829 return htonl(IN_CLASSA_NET
);
1831 return htonl(IN_CLASSB_NET
);
1832 return htonl(IN_CLASSC_NET
);
1836 * This routine handles a connection request.
1837 * It should make sure we haven't already responded.
1838 * Because of the way BSD works, we have to send a syn/ack now.
1839 * This also means it will be harder to close a socket which is
1843 tcp_conn_request(struct sock
*sk
, struct sk_buff
*skb
,
1844 unsigned long daddr
, unsigned long saddr
,
1845 struct options
*opt
, struct device
*dev
)
1847 struct sk_buff
*buff
;
1854 DPRINTF((DBG_TCP
, "tcp_conn_request(sk = %X, skb = %X, daddr = %X, sadd4= %X, \n"
1855 " opt = %X, dev = %X)\n",
1856 sk
, skb
, daddr
, saddr
, opt
, dev
));
1860 /* If the socket is dead, don't accept the connection. */
1862 sk
->data_ready(sk
,0);
1864 DPRINTF((DBG_TCP
, "tcp_conn_request on dead socket\n"));
1865 tcp_reset(daddr
, saddr
, th
, sk
->prot
, opt
, dev
, sk
->ip_tos
,sk
->ip_ttl
);
1866 kfree_skb(skb
, FREE_READ
);
1871 * Make sure we can accept more. This will prevent a
1872 * flurry of syns from eating up all our memory.
1874 if (sk
->ack_backlog
>= sk
->max_ack_backlog
) {
1875 kfree_skb(skb
, FREE_READ
);
1880 * We need to build a new sock struct.
1881 * It is sort of bad to have a socket without an inode attached
1882 * to it, but the wake_up's will just wake up the listening socket,
1883 * and if the listening socket is destroyed before this is taken
1884 * off of the queue, this will take care of it.
1886 newsk
= (struct sock
*) kmalloc(sizeof(struct sock
), GFP_ATOMIC
);
1887 if (newsk
== NULL
) {
1888 /* just ignore the syn. It will get retransmitted. */
1889 kfree_skb(skb
, FREE_READ
);
1893 DPRINTF((DBG_TCP
, "newsk = %X\n", newsk
));
1894 memcpy((void *)newsk
,(void *)sk
, sizeof(*newsk
));
1895 newsk
->wback
= NULL
;
1896 newsk
->wfront
= NULL
;
1897 newsk
->rqueue
= NULL
;
1898 newsk
->send_head
= NULL
;
1899 newsk
->send_tail
= NULL
;
1900 newsk
->back_log
= NULL
;
1901 newsk
->rtt
= TCP_CONNECT_TIME
<< 3;
1902 newsk
->rto
= TCP_CONNECT_TIME
;
1904 newsk
->max_window
= 0;
1905 newsk
->cong_window
= 1;
1906 newsk
->cong_count
= 0;
1907 newsk
->ssthresh
= 0;
1913 newsk
->partial
= NULL
;
1915 newsk
->wmem_alloc
= 0;
1916 newsk
->rmem_alloc
= 0;
1918 newsk
->max_unacked
= MAX_WINDOW
- TCP_WINDOW_DIFF
;
1921 newsk
->shutdown
= 0;
1922 newsk
->ack_backlog
= 0;
1923 newsk
->acked_seq
= skb
->h
.th
->seq
+1;
1924 newsk
->fin_seq
= skb
->h
.th
->seq
;
1925 newsk
->copied_seq
= skb
->h
.th
->seq
;
1926 newsk
->state
= TCP_SYN_RECV
;
1928 newsk
->send_seq
= jiffies
* SEQ_TICK
- seq_offset
;
1929 newsk
->window_seq
= newsk
->send_seq
;
1930 newsk
->rcv_ack_seq
= newsk
->send_seq
;
1932 newsk
->retransmits
= 0;
1934 newsk
->timer
.data
= (unsigned long)newsk
;
1935 newsk
->timer
.function
= &net_timer
;
1936 newsk
->dummy_th
.source
= skb
->h
.th
->dest
;
1937 newsk
->dummy_th
.dest
= skb
->h
.th
->source
;
1939 /* Swap these two, they are from our point of view. */
1940 newsk
->daddr
= saddr
;
1941 newsk
->saddr
= daddr
;
1943 put_sock(newsk
->num
,newsk
);
1944 newsk
->dummy_th
.res1
= 0;
1945 newsk
->dummy_th
.doff
= 6;
1946 newsk
->dummy_th
.fin
= 0;
1947 newsk
->dummy_th
.syn
= 0;
1948 newsk
->dummy_th
.rst
= 0;
1949 newsk
->dummy_th
.psh
= 0;
1950 newsk
->dummy_th
.ack
= 0;
1951 newsk
->dummy_th
.urg
= 0;
1952 newsk
->dummy_th
.res2
= 0;
1953 newsk
->acked_seq
= skb
->h
.th
->seq
+ 1;
1954 newsk
->copied_seq
= skb
->h
.th
->seq
;
1956 /* Grab the ttl and tos values and use them */
1957 newsk
->ip_ttl
=sk
->ip_ttl
;
1958 newsk
->ip_tos
=skb
->ip_hdr
->tos
;
1960 /* use 512 or whatever user asked for */
1961 /* note use of sk->user_mss, since user has no direct access to newsk */
1963 newsk
->mtu
= sk
->user_mss
;
1965 #ifdef SUBNETSARELOCAL
1966 if ((saddr
^ daddr
) & default_mask(saddr
))
1968 if ((saddr
^ daddr
) & dev
->pa_mask
)
1970 newsk
->mtu
= 576 - HEADER_SIZE
;
1972 newsk
->mtu
= MAX_WINDOW
;
1974 /* but not bigger than device MTU */
1975 newsk
->mtu
= min(newsk
->mtu
, dev
->mtu
- HEADER_SIZE
);
1977 /* this will min with what arrived in the packet */
1978 tcp_options(newsk
,skb
->h
.th
);
1980 buff
= newsk
->prot
->wmalloc(newsk
, MAX_SYN_SIZE
, 1, GFP_ATOMIC
);
1984 release_sock(newsk
);
1985 kfree_skb(skb
, FREE_READ
);
1989 buff
->mem_addr
= buff
;
1990 buff
->mem_len
= MAX_SYN_SIZE
;
1991 buff
->len
= sizeof(struct tcphdr
)+4;
1994 t1
=(struct tcphdr
*) buff
->data
;
1996 /* Put in the IP header and routing stuff. */
1997 tmp
= sk
->prot
->build_header(buff
, newsk
->saddr
, newsk
->daddr
, &dev
,
1998 IPPROTO_TCP
, NULL
, MAX_SYN_SIZE
,sk
->ip_tos
,sk
->ip_ttl
);
2000 /* Something went wrong. */
2004 kfree_skb(buff
,FREE_WRITE
);
2006 release_sock(newsk
);
2008 kfree_skb(skb
, FREE_READ
);
2013 t1
=(struct tcphdr
*)((char *)t1
+tmp
);
2015 memcpy(t1
, skb
->h
.th
, sizeof(*t1
));
2016 buff
->h
.seq
= newsk
->send_seq
;
2018 /* Swap the send and the receive. */
2019 t1
->dest
= skb
->h
.th
->source
;
2020 t1
->source
= newsk
->dummy_th
.source
;
2021 t1
->seq
= ntohl(newsk
->send_seq
++);
2023 newsk
->window
= tcp_select_window(newsk
);/*newsk->prot->rspace(newsk);*/
2024 t1
->window
= ntohs(newsk
->window
);
2031 t1
->ack_seq
= ntohl(skb
->h
.th
->seq
+1);
2032 t1
->doff
= sizeof(*t1
)/4+1;
2034 ptr
=(unsigned char *)(t1
+1);
2037 ptr
[2] = ((newsk
->mtu
) >> 8) & 0xff;
2038 ptr
[3] =(newsk
->mtu
) & 0xff;
2040 tcp_send_check(t1
, daddr
, saddr
, sizeof(*t1
)+4, newsk
);
2041 newsk
->prot
->queue_xmit(newsk
, dev
, buff
, 0);
2043 reset_timer(newsk
, TIME_WRITE
/* -1 ? FIXME ??? */, TCP_CONNECT_TIME
);
2046 /* Charge the sock_buff to newsk. */
2047 sk
->rmem_alloc
-= skb
->mem_len
;
2048 newsk
->rmem_alloc
+= skb
->mem_len
;
2050 skb_queue_tail(&sk
->rqueue
,skb
);
2052 release_sock(newsk
);
2057 tcp_close(struct sock
*sk
, int timeout
)
2059 struct sk_buff
*buff
;
2061 struct tcphdr
*t1
, *th
;
2063 struct device
*dev
=NULL
;
2067 * We need to grab some memory, and put together a FIN,
2068 * and then put it into the queue to be sent.
2070 DPRINTF((DBG_TCP
, "tcp_close((struct sock *)%X, %d)\n",sk
, timeout
));
2073 sk
->shutdown
= SHUTDOWN_MASK
;
2076 sk
->state_change(sk
);
2078 /* We need to flush the recv. buffs. */
2079 if (skb_peek(&sk
->rqueue
) != NULL
)
2081 struct sk_buff
*skb
;
2083 printk("Clean rcv queue\n");
2084 while((skb
=skb_dequeue(&sk
->rqueue
))!=NULL
)
2086 if(skb
->len
> 0 && after(skb
->h
.th
->seq
+ skb
->len
+ 1 , sk
->copied_seq
))
2088 kfree_skb(skb
, FREE_READ
);
2091 printk("Cleaned.\n");
2095 /* Get rid off any half-completed packets. */
2097 tcp_send_partial(sk
);
2104 /* start a timer. */
2105 /* original code was 4 * sk->rtt. In converting to the
2106 * new rtt representation, we can't quite use that.
2107 * it seems to make most sense to use the backed off value
2109 reset_timer(sk
, TIME_CLOSE
, 4 * sk
->rto
);
2110 if (timeout
) tcp_time_wait(sk
);
2112 return; /* break causes a double release - messy */
2115 sk
->state
= TCP_CLOSE
;
2120 sk
->state
= TCP_CLOSE
;
2126 case TCP_CLOSE_WAIT
:
2127 case TCP_ESTABLISHED
:
2130 prot
=(struct proto
*)sk
->prot
;
2131 th
=(struct tcphdr
*)&sk
->dummy_th
;
2132 buff
= prot
->wmalloc(sk
, MAX_FIN_SIZE
, 1, GFP_ATOMIC
);
2134 /* This will force it to try again later. */
2135 /* Or it would have if someone released the socket
2136 first. Anyway it might work now */
2138 if (sk
->state
!= TCP_CLOSE_WAIT
)
2139 sk
->state
= TCP_ESTABLISHED
;
2140 reset_timer(sk
, TIME_CLOSE
, 100);
2143 buff
->mem_addr
= buff
;
2144 buff
->mem_len
= MAX_FIN_SIZE
;
2147 buff
->len
= sizeof(*t1
);
2148 t1
=(struct tcphdr
*) buff
->data
;
2150 /* Put in the IP header and routing stuff. */
2151 tmp
= prot
->build_header(buff
,sk
->saddr
, sk
->daddr
, &dev
,
2152 IPPROTO_TCP
, sk
->opt
,
2153 sizeof(struct tcphdr
),sk
->ip_tos
,sk
->ip_ttl
);
2155 kfree_skb(buff
,FREE_WRITE
);
2156 DPRINTF((DBG_TCP
, "Unable to build header for fin.\n"));
2161 t1
=(struct tcphdr
*)((char *)t1
+tmp
);
2164 memcpy(t1
, th
, sizeof(*t1
));
2165 t1
->seq
= ntohl(sk
->send_seq
);
2167 buff
->h
.seq
= sk
->send_seq
;
2170 /* Ack everything immediately from now on. */
2172 t1
->ack_seq
= ntohl(sk
->acked_seq
);
2173 t1
->window
= ntohs(sk
->window
=tcp_select_window(sk
)/*sk->prot->rspace(sk)*/);
2175 t1
->rst
= need_reset
;
2176 t1
->doff
= sizeof(*t1
)/4;
2177 tcp_send_check(t1
, sk
->saddr
, sk
->daddr
, sizeof(*t1
), sk
);
2179 if (sk
->wfront
== NULL
) {
2180 prot
->queue_xmit(sk
, dev
, buff
, 0);
2182 reset_timer(sk
, TIME_WRITE
, sk
->rto
);
2184 if (sk
->wback
== NULL
) {
2187 sk
->wback
->next
= buff
;
2190 buff
->magic
= TCP_WRITE_QUEUE_MAGIC
;
2193 if (sk
->state
== TCP_CLOSE_WAIT
) {
2194 sk
->state
= TCP_FIN_WAIT2
;
2196 sk
->state
= TCP_FIN_WAIT1
;
2204 * This routine takes stuff off of the write queue,
2205 * and puts it in the xmit queue.
2208 tcp_write_xmit(struct sock
*sk
)
2210 struct sk_buff
*skb
;
2212 DPRINTF((DBG_TCP
, "tcp_write_xmit(sk=%X)\n", sk
));
2214 /* The bytes will have to remain here. In time closedown will
2215 empty the write queue and all will be happy */
2219 while(sk
->wfront
!= NULL
&&
2220 before(sk
->wfront
->h
.seq
, sk
->window_seq
+1) &&
2221 (sk
->retransmits
== 0 ||
2222 sk
->timeout
!= TIME_WRITE
||
2223 before(sk
->wfront
->h
.seq
, sk
->rcv_ack_seq
+1))
2224 && sk
->packets_out
< sk
->cong_window
) {
2227 sk
->wfront
= skb
->next
;
2228 if (sk
->wfront
== NULL
) sk
->wback
= NULL
;
2230 if (skb
->magic
!= TCP_WRITE_QUEUE_MAGIC
) {
2231 printk("tcp.c skb with bad magic(%X) on write queue. Squashing "
2232 "queue\n", skb
->magic
);
2238 DPRINTF((DBG_TCP
, "Sending a packet.\n"));
2240 /* See if we really need to send the packet. */
2241 if (before(skb
->h
.seq
, sk
->rcv_ack_seq
+1)) {
2242 sk
->retransmits
= 0;
2243 kfree_skb(skb
, FREE_WRITE
);
2244 if (!sk
->dead
) sk
->write_space(sk
);
2246 sk
->prot
->queue_xmit(sk
, skb
->dev
, skb
, skb
->free
);
2253 * This routine sorts the send list, and resets the
2254 * sk->send_head and sk->send_tail pointers.
2257 sort_send(struct sock
*sk
)
2259 struct sk_buff
*list
= NULL
;
2260 struct sk_buff
*skb
,*skb2
,*skb3
;
2262 for (skb
= sk
->send_head
; skb
!= NULL
; skb
= skb2
) {
2263 skb2
= (struct sk_buff
*)skb
->link3
;
2264 if (list
== NULL
|| before (skb2
->h
.seq
, list
->h
.seq
)) {
2266 sk
->send_tail
= skb
;
2269 for (skb3
= list
; ; skb3
= (struct sk_buff
*)skb3
->link3
) {
2270 if (skb3
->link3
== NULL
||
2271 before(skb
->h
.seq
, skb3
->link3
->h
.seq
)) {
2272 skb
->link3
= skb3
->link3
;
2274 if (skb
->link3
== NULL
) sk
->send_tail
= skb
;
2280 sk
->send_head
= list
;
2284 /* This routine deals with incoming acks, but not outgoing ones. */
2286 tcp_ack(struct sock
*sk
, struct tcphdr
*th
, unsigned long saddr
, int len
)
2291 * 1 - there was data in packet as well as ack or new data is sent or
2293 * 2 - data from retransmit queue was acked and removed
2294 * 4 - window shrunk or data from retransmit queue was acked and removed
2298 return(1); /* Dead, cant ack any more so why bother */
2300 ack
= ntohl(th
->ack_seq
);
2301 DPRINTF((DBG_TCP
, "tcp_ack ack=%d, window=%d, "
2302 "sk->rcv_ack_seq=%d, sk->window_seq = %d\n",
2303 ack
, ntohs(th
->window
), sk
->rcv_ack_seq
, sk
->window_seq
));
2305 if (ntohs(th
->window
) > sk
->max_window
) {
2306 sk
->max_window
= ntohs(th
->window
);
2307 sk
->mss
= min(sk
->max_window
, sk
->mtu
);
2310 if (sk
->retransmits
&& sk
->timeout
== TIME_KEEPOPEN
)
2311 sk
->retransmits
= 0;
2313 if (after(ack
, sk
->send_seq
+1) || before(ack
, sk
->rcv_ack_seq
-1)) {
2314 if (after(ack
, sk
->send_seq
) ||
2315 (sk
->state
!= TCP_ESTABLISHED
&& sk
->state
!= TCP_CLOSE_WAIT
)) {
2319 reset_timer(sk
, TIME_KEEPOPEN
, TCP_TIMEOUT_LEN
);
2324 if (len
!= th
->doff
*4) flag
|= 1;
2326 /* See if our window has been shrunk. */
2327 if (after(sk
->window_seq
, ack
+ntohs(th
->window
))) {
2329 * We may need to move packets from the send queue
2330 * to the write queue, if the window has been shrunk on us.
2331 * The RFC says you are not allowed to shrink your window
2332 * like this, but if the other end does, you must be able
2335 struct sk_buff
*skb
;
2336 struct sk_buff
*skb2
;
2337 struct sk_buff
*wskb
= NULL
;
2339 skb2
= sk
->send_head
;
2340 sk
->send_head
= NULL
;
2341 sk
->send_tail
= NULL
;
2345 sk
->window_seq
= ack
+ ntohs(th
->window
);
2347 while (skb2
!= NULL
) {
2349 skb2
= (struct sk_buff
*)skb
->link3
;
2351 if (after(skb
->h
.seq
, sk
->window_seq
)) {
2352 if (sk
->packets_out
> 0) sk
->packets_out
--;
2353 /* We may need to remove this from the dev send list. */
2354 if (skb
->next
!= NULL
) {
2357 /* Now add it to the write_queue. */
2358 skb
->magic
= TCP_WRITE_QUEUE_MAGIC
;
2360 skb
->next
= sk
->wfront
;
2363 skb
->next
= wskb
->next
;
2366 if (sk
->wback
== wskb
) sk
->wback
= skb
;
2369 if (sk
->send_head
== NULL
) {
2370 sk
->send_head
= skb
;
2371 sk
->send_tail
= skb
;
2373 sk
->send_tail
->link3
= skb
;
2374 sk
->send_tail
= skb
;
2382 if (sk
->send_tail
== NULL
|| sk
->send_head
== NULL
) {
2383 sk
->send_head
= NULL
;
2384 sk
->send_tail
= NULL
;
2388 sk
->window_seq
= ack
+ ntohs(th
->window
);
2390 /* We don't want too many packets out there. */
2391 if (sk
->timeout
== TIME_WRITE
&&
2392 sk
->cong_window
< 2048 && after(ack
, sk
->rcv_ack_seq
)) {
2394 * This is Jacobson's slow start and congestion avoidance.
2395 * SIGCOMM '88, p. 328. Because we keep cong_window in integral
2396 * mss's, we can't do cwnd += 1 / cwnd. Instead, maintain a
2397 * counter and increment it once every cwnd times. It's possible
2398 * that this should be done only if sk->retransmits == 0. I'm
2399 * interpreting "new data is acked" as including data that has
2400 * been retransmitted but is just now being acked.
2402 if (sk
->cong_window
< sk
->ssthresh
)
2403 /* in "safe" area, increase */
2406 /* in dangerous area, increase slowly. In theory this is
2407 sk->cong_window += 1 / sk->cong_window
2409 if (sk
->cong_count
>= sk
->cong_window
) {
2417 DPRINTF((DBG_TCP
, "tcp_ack: Updating rcv ack sequence.\n"));
2418 sk
->rcv_ack_seq
= ack
;
2421 * if this ack opens up a zero window, clear backoff. It was
2422 * being used to time the probes, and is probably far higher than
2423 * it needs to be for normal retransmission
2425 if (sk
->timeout
== TIME_PROBE0
) {
2426 if (sk
->wfront
!= NULL
&& /* should always be non-null */
2427 ! before (sk
->window_seq
, sk
->wfront
->h
.seq
)) {
2428 sk
->retransmits
= 0;
2430 /* recompute rto from rtt. this eliminates any backoff */
2431 sk
->rto
= ((sk
->rtt
>> 2) + sk
->mdev
) >> 1;
2432 if (sk
->rto
> 120*HZ
)
2439 /* See if we can take anything off of the retransmit queue. */
2440 while(sk
->send_head
!= NULL
) {
2441 /* Check for a bug. */
2442 if (sk
->send_head
->link3
&&
2443 after(sk
->send_head
->h
.seq
, sk
->send_head
->link3
->h
.seq
)) {
2444 printk("INET: tcp.c: *** bug send_list out of order.\n");
2448 if (before(sk
->send_head
->h
.seq
, ack
+1)) {
2449 struct sk_buff
*oskb
;
2451 if (sk
->retransmits
) {
2453 /* we were retransmitting. don't count this in RTT est */
2457 * even though we've gotten an ack, we're still
2458 * retransmitting as long as we're sending from
2459 * the retransmit queue. Keeping retransmits non-zero
2460 * prevents us from getting new data interspersed with
2464 if (sk
->send_head
->link3
)
2465 sk
->retransmits
= 1;
2467 sk
->retransmits
= 0;
2472 * Note that we only reset backoff and rto in the
2473 * rtt recomputation code. And that doesn't happen
2474 * if there were retransmissions in effect. So the
2475 * first new packet after the retransmissions is
2476 * sent with the backoff still in effect. Not until
2477 * we get an ack from a non-retransmitted packet do
2478 * we reset the backoff and rto. This allows us to deal
2479 * with a situation where the network delay has increased
2480 * suddenly. I.e. Karn's algorithm. (SIGCOMM '87, p5.)
2483 /* We have one less packet out there. */
2484 if (sk
->packets_out
> 0) sk
->packets_out
--;
2485 DPRINTF((DBG_TCP
, "skb=%X skb->h.seq = %d acked ack=%d\n",
2486 sk
->send_head
, sk
->send_head
->h
.seq
, ack
));
2488 /* Wake up the process, it can probably write more. */
2489 if (!sk
->dead
) sk
->write_space(sk
);
2491 oskb
= sk
->send_head
;
2496 /* The following amusing code comes from Jacobson's
2497 * article in SIGCOMM '88. Note that rtt and mdev
2498 * are scaled versions of rtt and mean deviation.
2499 * This is designed to be as fast as possible
2500 * m stands for "measurement".
2503 m
= jiffies
- oskb
->when
; /* RTT */
2504 m
-= (sk
->rtt
>> 3); /* m is now error in rtt est */
2505 sk
->rtt
+= m
; /* rtt = 7/8 rtt + 1/8 new */
2507 m
= -m
; /* m is now abs(error) */
2508 m
-= (sk
->mdev
>> 2); /* similar update on mdev */
2509 sk
->mdev
+= m
; /* mdev = 3/4 mdev + 1/4 new */
2511 /* now update timeout. Note that this removes any backoff */
2512 sk
->rto
= ((sk
->rtt
>> 2) + sk
->mdev
) >> 1;
2513 if (sk
->rto
> 120*HZ
)
2524 oskb
= sk
->send_head
;
2526 sk
->send_head
=(struct sk_buff
*)oskb
->link3
;
2527 if (sk
->send_head
== NULL
) {
2528 sk
->send_tail
= NULL
;
2531 /* We may need to remove this from the dev send list. */
2532 skb_unlink(oskb
); /* Much easier! */
2535 kfree_skb(oskb
, FREE_WRITE
); /* write. */
2536 if (!sk
->dead
) sk
->write_space(sk
);
2543 * Maybe we can take some stuff off of the write queue,
2544 * and put it onto the xmit queue.
2546 if (sk
->wfront
!= NULL
) {
2547 if (after (sk
->window_seq
+1, sk
->wfront
->h
.seq
) &&
2548 (sk
->retransmits
== 0 ||
2549 sk
->timeout
!= TIME_WRITE
||
2550 before(sk
->wfront
->h
.seq
, sk
->rcv_ack_seq
+1))
2551 && sk
->packets_out
< sk
->cong_window
) {
2554 } else if (before(sk
->window_seq
, sk
->wfront
->h
.seq
) &&
2555 sk
->send_head
== NULL
&&
2556 sk
->ack_backlog
== 0 &&
2557 sk
->state
!= TCP_TIME_WAIT
) {
2558 reset_timer(sk
, TIME_PROBE0
, sk
->rto
);
2561 if (sk
->send_head
== NULL
&& sk
->ack_backlog
== 0 &&
2562 sk
->state
!= TCP_TIME_WAIT
&& !sk
->keepopen
) {
2563 DPRINTF((DBG_TCP
, "Nothing to do, going to sleep.\n"));
2564 if (!sk
->dead
) sk
->write_space(sk
);
2567 reset_timer(sk
, TIME_KEEPOPEN
, TCP_TIMEOUT_LEN
);
2571 if (sk
->state
!= (unsigned char) sk
->keepopen
) {
2572 reset_timer(sk
, TIME_WRITE
, sk
->rto
);
2574 if (sk
->state
== TCP_TIME_WAIT
) {
2575 reset_timer(sk
, TIME_CLOSE
, TCP_TIMEWAIT_LEN
);
2580 if (sk
->packets_out
== 0 && sk
->partial
!= NULL
&&
2581 sk
->wfront
== NULL
&& sk
->send_head
== NULL
) {
2583 tcp_send_partial(sk
);
2586 /* See if we are done. */
2587 if (sk
->state
== TCP_TIME_WAIT
) {
2589 sk
->state_change(sk
);
2590 if (sk
->rcv_ack_seq
== sk
->send_seq
&& sk
->acked_seq
== sk
->fin_seq
) {
2592 sk
->state
= TCP_CLOSE
;
2593 sk
->shutdown
= SHUTDOWN_MASK
;
2597 if (sk
->state
== TCP_LAST_ACK
|| sk
->state
== TCP_FIN_WAIT2
) {
2598 if (!sk
->dead
) sk
->state_change(sk
);
2599 if (sk
->rcv_ack_seq
== sk
->send_seq
) {
2601 if (sk
->acked_seq
!= sk
->fin_seq
) {
2604 DPRINTF((DBG_TCP
, "tcp_ack closing socket - %X\n", sk
));
2605 tcp_send_ack(sk
->send_seq
, sk
->acked_seq
, sk
,
2607 sk
->shutdown
= SHUTDOWN_MASK
;
2608 sk
->state
= TCP_CLOSE
;
2614 * I make no guarantees about the first clause in the following
2615 * test, i.e. "(!flag) || (flag&4)". I'm not entirely sure under
2616 * what conditions "!flag" would be true. However I think the rest
2617 * of the conditions would prevent that from causing any
2618 * unnecessary retransmission.
2619 * Clearly if the first packet has expired it should be
2620 * retransmitted. The other alternative, "flag&2 && retransmits", is
2621 * harder to explain: You have to look carefully at how and when the
2622 * timer is set and with what timeout. The most recent transmission always
2623 * sets the timer. So in general if the most recent thing has timed
2624 * out, everything before it has as well. So we want to go ahead and
2625 * retransmit some more. If we didn't explicitly test for this
2626 * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
2627 * would not be true. If you look at the pattern of timing, you can
2628 * show that rto is increased fast enough that the next packet would
2629 * almost never be retransmitted immediately. Then you'd end up
2630 * waiting for a timeout to send each packet on the retranmission
2631 * queue. With my implementation of the Karn sampling algorithm,
2632 * the timeout would double each time. The net result is that it would
2633 * take a hideous amount of time to recover from a single dropped packet.
2634 * It's possible that there should also be a test for TIME_WRITE, but
2635 * I think as long as "send_head != NULL" and "retransmit" is on, we've
2636 * got to be in real retransmission mode.
2637 * Note that ip_do_retransmit is called with all==1. Setting cong_window
2638 * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
2639 * As long as no further losses occur, this seems reasonable.
2642 if (((!flag
) || (flag
&4)) && sk
->send_head
!= NULL
&&
2643 (((flag
&2) && sk
->retransmits
) ||
2644 (sk
->send_head
->when
+ sk
->rto
< jiffies
))) {
2645 ip_do_retransmit(sk
, 1);
2646 reset_timer(sk
, TIME_WRITE
, sk
->rto
);
2649 DPRINTF((DBG_TCP
, "leaving tcp_ack\n"));
2655 * This routine handles the data. If there is room in the buffer,
2656 * it will be have already been moved into it. If there is no
2657 * room, then we will just have to discard the packet.
2660 tcp_data(struct sk_buff
*skb
, struct sock
*sk
,
2661 unsigned long saddr
, unsigned short len
)
2663 struct sk_buff
*skb1
, *skb2
;
2669 skb
->len
= len
-(th
->doff
*4);
2671 DPRINTF((DBG_TCP
, "tcp_data len = %d sk = %X:\n", skb
->len
, sk
));
2673 sk
->bytes_rcv
+= skb
->len
;
2674 if (skb
->len
== 0 && !th
->fin
&& !th
->urg
&& !th
->psh
) {
2675 /* Don't want to keep passing ack's back and forth. */
2676 if (!th
->ack
) tcp_send_ack(sk
->send_seq
, sk
->acked_seq
,sk
, th
, saddr
);
2677 kfree_skb(skb
, FREE_READ
);
2681 if (sk
->shutdown
& RCV_SHUTDOWN
) {
2682 sk
->acked_seq
= th
->seq
+ skb
->len
+ th
->syn
+ th
->fin
;
2683 tcp_reset(sk
->saddr
, sk
->daddr
, skb
->h
.th
,
2684 sk
->prot
, NULL
, skb
->dev
, sk
->ip_tos
, sk
->ip_ttl
);
2685 sk
->state
= TCP_CLOSE
;
2687 sk
->shutdown
= SHUTDOWN_MASK
;
2688 DPRINTF((DBG_TCP
, "tcp_data: closing socket - %X\n", sk
));
2689 kfree_skb(skb
, FREE_READ
);
2690 if (!sk
->dead
) sk
->state_change(sk
);
2695 * Now we have to walk the chain, and figure out where this one
2696 * goes into it. This is set up so that the last packet we received
2697 * will be the first one we look at, that way if everything comes
2698 * in order, there will be no performance loss, and if they come
2699 * out of order we will be able to fit things in nicely.
2702 /* This should start at the last one, and then go around forwards. */
2703 if (sk
->rqueue
== NULL
) {
2704 DPRINTF((DBG_TCP
, "tcp_data: skb = %X:\n", skb
));
2709 skb
->list
= &sk
->rqueue
;
2711 skb_queue_head(&sk
->rqueue
,skb
);
2715 DPRINTF((DBG_TCP
, "tcp_data adding to chain sk = %X:\n", sk
));
2716 for(skb1
=sk
->rqueue
->prev
; ; skb1
=(struct sk_buff
*)skb1
->prev
) {
2719 printk("skb1=%p :", skb1
);
2720 printk("skb1->h.th->seq = %ld: ", skb1
->h
.th
->seq
);
2721 printk("skb->h.th->seq = %ld\n",skb
->h
.th
->seq
);
2722 printk("copied_seq = %ld acked_seq = %ld\n", sk
->copied_seq
,
2726 if (after(th
->seq
+1, skb1
->h
.th
->seq
)) {
2728 skb
->next
= skb1
->next
;
2729 skb
->next
->prev
= skb
;
2731 if (skb1
== sk
->rqueue
) sk
->rqueue
= skb
;
2734 if (skb1
->prev
== sk
->rqueue
) {
2736 skb
->prev
= skb1
->prev
;
2737 skb
->prev
->next
= skb
;
2739 skb1
= NULL
; /* so we know we might be able
2744 if (th
->seq
==skb1
->h
.th
->seq
&& skb
->len
>= skb1
->len
)
2746 skb_append(skb1
,skb
);
2748 kfree_skb(skb1
,FREE_READ
);
2753 if (after(th
->seq
+1, skb1
->h
.th
->seq
))
2755 skb_append(skb1
,skb
);
2758 if (skb1
== sk
->rqueue
)
2760 skb_queue_head(&sk
->rqueue
, skb
);
2765 DPRINTF((DBG_TCP
, "skb = %X:\n", skb
));
2768 th
->ack_seq
= th
->seq
+ skb
->len
;
2769 if (th
->syn
) th
->ack_seq
++;
2770 if (th
->fin
) th
->ack_seq
++;
2772 if (before(sk
->acked_seq
, sk
->copied_seq
)) {
2773 printk("*** tcp.c:tcp_data bug acked < copied\n");
2774 sk
->acked_seq
= sk
->copied_seq
;
2777 /* Now figure out if we can ack anything. */
2778 if ((!dup_dumped
&& (skb1
== NULL
|| skb1
->acked
)) || before(th
->seq
, sk
->acked_seq
+1)) {
2779 if (before(th
->seq
, sk
->acked_seq
+1)) {
2780 if (after(th
->ack_seq
, sk
->acked_seq
))
2781 sk
->acked_seq
= th
->ack_seq
;
2784 /* When we ack the fin, we turn on the RCV_SHUTDOWN flag. */
2785 if (skb
->h
.th
->fin
) {
2786 if (!sk
->dead
) sk
->state_change(sk
);
2787 sk
->shutdown
|= RCV_SHUTDOWN
;
2790 for(skb2
= (struct sk_buff
*)skb
->next
;
2791 skb2
!=(struct sk_buff
*) sk
->rqueue
;
2792 skb2
= (struct sk_buff
*)skb2
->next
) {
2793 if (before(skb2
->h
.th
->seq
, sk
->acked_seq
+1)) {
2794 if (after(skb2
->h
.th
->ack_seq
, sk
->acked_seq
))
2796 long old_acked_seq
= sk
->acked_seq
;
2797 sk
->acked_seq
= skb2
->h
.th
->ack_seq
;
2798 if((int)(sk
->acked_seq
- old_acked_seq
) >0)
2800 int new_window
=sk
->window
-sk
->acked_seq
+
2804 sk
->window
= new_window
;
2810 * When we ack the fin, we turn on
2811 * the RCV_SHUTDOWN flag.
2813 if (skb2
->h
.th
->fin
) {
2814 sk
->shutdown
|= RCV_SHUTDOWN
;
2815 if (!sk
->dead
) sk
->state_change(sk
);
2818 /* Force an immediate ack. */
2819 sk
->ack_backlog
= sk
->max_ack_backlog
;
2826 * This also takes care of updating the window.
2827 * This if statement needs to be simplified.
2829 if (!sk
->delay_acks
||
2830 sk
->ack_backlog
>= sk
->max_ack_backlog
||
2831 sk
->bytes_rcv
> sk
->max_unacked
|| th
->fin
) {
2832 /* tcp_send_ack(sk->send_seq, sk->acked_seq,sk,th, saddr); */
2836 printk("Ack queued.\n");
2837 reset_timer(sk
, TIME_WRITE
, TCP_ACK_TIME
);
2843 * If we've missed a packet, send an ack.
2844 * Also start a timer to send another.
2848 * This is important. If we don't have much room left,
2849 * we need to throw out a few packets so we have a good
2850 * window. Note that mtu is used, not mss, because mss is really
2851 * for the send side. He could be sending us stuff as large as mtu.
2853 while (sk
->prot
->rspace(sk
) < sk
->mtu
) {
2854 skb1
= skb_peek(&sk
->rqueue
);
2856 printk("INET: tcp.c:tcp_data memory leak detected.\n");
2860 /* Don't throw out something that has been acked. */
2867 if (skb1
->prev
== skb1
) {
2870 sk
->rqueue
= (struct sk_buff
*)skb1
->prev
;
2871 skb1
->next
->prev
= skb1
->prev
;
2872 skb1
->prev
->next
= skb1
->next
;
2875 kfree_skb(skb1
, FREE_READ
);
2877 tcp_send_ack(sk
->send_seq
, sk
->acked_seq
, sk
, th
, saddr
);
2879 reset_timer(sk
, TIME_WRITE
, TCP_ACK_TIME
);
2881 /* We missed a packet. Send an ack to try to resync things. */
2882 tcp_send_ack(sk
->send_seq
, sk
->acked_seq
, sk
, th
, saddr
);
2885 /* Now tell the user we may have some data. */
2888 printk("Data wakeup.\n");
2889 sk
->data_ready(sk
,0);
2891 DPRINTF((DBG_TCP
, "data received on dead socket.\n"));
2894 if (sk
->state
== TCP_FIN_WAIT2
&&
2895 sk
->acked_seq
== sk
->fin_seq
&& sk
->rcv_ack_seq
== sk
->send_seq
) {
2896 DPRINTF((DBG_TCP
, "tcp_data: entering last_ack state sk = %X\n", sk
));
2898 /* tcp_send_ack(sk->send_seq, sk->acked_seq, sk, th, saddr); */
2899 sk
->shutdown
= SHUTDOWN_MASK
;
2900 sk
->state
= TCP_LAST_ACK
;
2901 if (!sk
->dead
) sk
->state_change(sk
);
/*
 * tcp_urg() fragment: handle TCP urgent (OOB) data on a socket.
 * NOTE(review): this listing is a fragmented extraction — tokens are split
 * across physical lines and interior lines appear to be missing (gaps in the
 * embedded original line numbers). Code preserved byte-for-byte; comments only.
 * Visible behavior: wakes any reader via sk->data_ready(), then — unless the
 * socket has urgent-inline mode set (sk->urginline) — signals SIGURG to the
 * owning process (sk->proc > 0) or process group (sk->proc < 0, negated).
 */
2909 tcp_urg(struct sock
*sk
, struct tcphdr
*th
, unsigned long saddr
)
/* Forward declarations for the signal-delivery helpers used below. */
2911 extern int kill_pg(int pg
, int sig
, int priv
);
2912 extern int kill_proc(int pid
, int sig
, int priv
);
/* Wake up anyone sleeping on this socket's data. */
2915 sk
->data_ready(sk
,0);
/* If urgent data is delivered inline, no SIGURG is raised (body of this
 * branch is not visible in this fragment — TODO confirm against full source). */
2917 if (sk
->urginline
) {
2924 /* So if we get more urgent data, we don't signal the user again. */
2925 if (sk
->proc
!= 0) {
/* sk->proc > 0: a single owning pid gets SIGURG with privilege flag 1. */
2927 kill_proc(sk
->proc
, SIGURG
, 1);
/* Otherwise sk->proc is negative: -sk->proc is a process group id. */
2929 kill_pg(-sk
->proc
, SIGURG
, 1);
2938 /* This deals with incoming fins. 'Linus at 9 O'clock' 8-) */
/*
 * tcp_fin() fragment: process an incoming FIN segment and advance the
 * connection state machine accordingly.
 * NOTE(review): fragmented extraction — lines split and some interior lines
 * missing (embedded numbering has gaps; the enclosing switch statement's
 * opening is not visible). Code preserved byte-for-byte; comments only.
 */
2940 tcp_fin(struct sock
*sk
, struct tcphdr
*th
,
2941 unsigned long saddr
, struct device
*dev
)
2943 DPRINTF((DBG_TCP
, "tcp_fin(sk=%X, th=%X, saddr=%X, dev=%X)\n",
2944 sk
, th
, saddr
, dev
));
/* Notify any waiter that the socket state is about to change. */
2947 sk
->state_change(sk
);
/* ESTABLISHED + FIN -> CLOSE_WAIT; remember which sequence must be acked. */
2953 case TCP_ESTABLISHED
:
2954 /* Contains the one that needs to be acked */
2955 sk
->fin_seq
= th
->seq
+1;
2956 sk
->state
= TCP_CLOSE_WAIT
;
/* A FIN carrying RST shuts down both directions at once. */
2957 if (th
->rst
) sk
->shutdown
= SHUTDOWN_MASK
;
2960 case TCP_CLOSE_WAIT
:
2962 break; /* we got a retransmit of the fin. */
/* Presumably the FIN_WAIT1 case (label not visible in this fragment):
 * record the FIN's sequence and move to FIN_WAIT2 — TODO confirm. */
2965 /* Contains the one that needs to be acked */
2966 sk
->fin_seq
= th
->seq
+1;
2967 sk
->state
= TCP_FIN_WAIT2
;
/* Simultaneous-close style path: go to LAST_ACK and arm the close timer. */
2972 sk
->state
= TCP_LAST_ACK
;
2974 /* Start the timers. */
2975 reset_timer(sk
, TIME_CLOSE
, TCP_TIMEWAIT_LEN
);
2984 /* This will accept the next outstanding connection. */
/*
 * tcp_accept() fragment: dequeue the next completed connection from a
 * listening socket, blocking unless O_NONBLOCK is set.
 * NOTE(review): fragmented extraction — lines split and interior lines
 * missing (error-return bodies are not visible). Preserved byte-for-byte.
 */
2985 static struct sock
*
2986 tcp_accept(struct sock
*sk
, int flags
)
2989 struct sk_buff
*skb
;
2991 DPRINTF((DBG_TCP
, "tcp_accept(sk=%X, flags=%X, addr=%s)\n",
2992 sk
, flags
, in_ntoa(sk
->saddr
)));
2995 * We need to make sure that this socket is listening,
2996 * and that it has something pending.
/* Reject accept() on a socket that is not in LISTEN state
 * (error path body not visible in this fragment). */
2998 if (sk
->state
!= TCP_LISTEN
) {
3003 /* avoid the race. */
/* Loop until a pending connection skb is available. */
3006 while((skb
= get_firstr(sk
)) == NULL
) {
/* Non-blocking accept with nothing pending: bail out. */
3007 if (flags
& O_NONBLOCK
) {
/* Sleep interruptibly until a connection arrives or a signal hits. */
3015 interruptible_sleep_on(sk
->sleep
);
/* A pending, unblocked signal interrupts the wait with ERESTARTSYS. */
3016 if (current
->signal
& ~current
->blocked
) {
3018 sk
->err
= ERESTARTSYS
;
3025 /* Now all we need to do is return skb->sk. */
/* The skb only carried the new socket; free the buffer itself. */
3028 kfree_skb(skb
, FREE_READ
);
3035 /* This will initiate an outgoing connection. */
/*
 * tcp_connect() fragment: initiate an active open — validate the user's
 * sockaddr, pick the initial send sequence, build and transmit the SYN
 * (with an MSS option), and enter SYN_SENT with a retransmit timer armed.
 * NOTE(review): fragmented extraction — tokens split across lines and
 * interior lines missing (gaps in the embedded numbering; several error
 * checks' bodies are not visible). Code preserved byte-for-byte.
 */
3037 tcp_connect(struct sock
*sk
, struct sockaddr_in
*usin
, int addr_len
)
3039 struct sk_buff
*buff
;
3040 struct sockaddr_in sin
;
3041 struct device
*dev
=NULL
;
/* Only a fully closed socket may connect(); also require a minimally
 * sized sockaddr (8 bytes covers family+port+address). */
3047 if (sk
->state
!= TCP_CLOSE
) return(-EISCONN
);
3048 if (addr_len
< 8) return(-EINVAL
);
/* Verify and copy the user-space sockaddr into kernel space. */
3050 err
=verify_area(VERIFY_READ
, usin
, addr_len
);
3054 memcpy_fromfs(&sin
,usin
, min(sizeof(sin
), addr_len
));
3056 if (sin
.sin_family
&& sin
.sin_family
!= AF_INET
) return(-EAFNOSUPPORT
);
3058 DPRINTF((DBG_TCP
, "TCP connect daddr=%s\n", in_ntoa(sin
.sin_addr
.s_addr
)));
3060 /* Don't want a TCP connection going to a broadcast address */
3061 if (chk_addr(sin
.sin_addr
.s_addr
) == IS_BROADCAST
) {
3062 DPRINTF((DBG_TCP
, "TCP connection to broadcast address not allowed\n"));
3063 return(-ENETUNREACH
);
3066 /* Connect back to the same socket: Blows up so disallow it */
3067 if(sk
->saddr
== sin
.sin_addr
.s_addr
&& sk
->num
==ntohs(sin
.sin_port
))
/* Record the peer and pick the initial send sequence from jiffies. */
3071 sk
->daddr
= sin
.sin_addr
.s_addr
;
3072 sk
->send_seq
= jiffies
* SEQ_TICK
- seq_offset
;
3073 sk
->window_seq
= sk
->send_seq
;
3074 sk
->rcv_ack_seq
= sk
->send_seq
-1;
3076 sk
->dummy_th
.dest
= sin
.sin_port
;
/* Allocate the SYN buffer (may sleep: GFP_KERNEL). Failure branch is
 * not visible in this fragment. */
3079 buff
= sk
->prot
->wmalloc(sk
,MAX_SYN_SIZE
,0, GFP_KERNEL
);
3084 buff
->mem_addr
= buff
;
3085 buff
->mem_len
= MAX_SYN_SIZE
;
3089 t1
= (struct tcphdr
*) buff
->data
;
3091 /* Put in the IP header and routing stuff. */
3092 /* We need to build the routing stuff fromt the things saved in skb. */
3093 tmp
= sk
->prot
->build_header(buff
, sk
->saddr
, sk
->daddr
, &dev
,
3094 IPPROTO_TCP
, NULL
, MAX_SYN_SIZE
,sk
->ip_tos
,sk
->ip_ttl
);
/* Header build failed: release the buffer and report unreachable. */
3096 sk
->prot
->wfree(sk
, buff
->mem_addr
, buff
->mem_len
);
3098 return(-ENETUNREACH
);
/* Advance past the IP/link headers to the TCP header position. */
3101 t1
= (struct tcphdr
*)((char *)t1
+tmp
);
/* Start from the socket's template header, then set the SYN sequence. */
3103 memcpy(t1
,(void *)&(sk
->dummy_th
), sizeof(*t1
));
3104 t1
->seq
= ntohl(sk
->send_seq
++);
3105 buff
->h
.seq
= sk
->send_seq
;
3117 /* use 512 or whatever user asked for */
3119 sk
->mtu
= sk
->user_mss
;
/* Pick a default MTU: 576-HEADER_SIZE off-subnet, larger on-link
 * (subnet test depends on SUBNETSARELOCAL). */
3121 #ifdef SUBNETSARELOCAL
3122 if ((sk
->saddr
^ sk
->daddr
) & default_mask(sk
->saddr
))
3124 if ((sk
->saddr
^ sk
->daddr
) & dev
->pa_mask
)
3126 sk
->mtu
= 576 - HEADER_SIZE
;
3128 sk
->mtu
= MAX_WINDOW
;
3130 /* but not bigger than device MTU */
3131 sk
->mtu
= min(sk
->mtu
, dev
->mtu
- HEADER_SIZE
);
3133 /* Put in the TCP options to say MTU. */
/* MSS option bytes follow the fixed header; bytes [0] and [1]
 * (kind/length) are set on lines not visible in this fragment. */
3134 ptr
= (unsigned char *)(t1
+1);
3137 ptr
[2] = (sk
->mtu
) >> 8;
3138 ptr
[3] = (sk
->mtu
) & 0xff;
3139 tcp_send_check(t1
, sk
->saddr
, sk
->daddr
,
3140 sizeof(struct tcphdr
) + 4, sk
);
3142 /* This must go first otherwise a really quick response will get reset. */
3143 sk
->state
= TCP_SYN_SENT
;
3144 sk
->rtt
= TCP_CONNECT_TIME
;
3145 reset_timer(sk
, TIME_WRITE
, TCP_CONNECT_TIME
); /* Timer for repeating the SYN until an answer */
3146 sk
->retransmits
= TCP_RETR2
- TCP_SYN_RETRIES
;
/* Hand the SYN to the transmit queue (non-urgent: last arg 0). */
3148 sk
->prot
->queue_xmit(sk
, dev
, buff
, 0);
3155 /* This functions checks to see if the tcp header is actually acceptible. */
/*
 * tcp_sequence() fragment: decide whether an incoming segment is acceptable
 * against the window [acked_seq, acked_seq + window].
 * Visible behavior: accepts (the big between()/before()/after() test) when
 * any part of the segment overlaps the window; otherwise it may RST an
 * unsynchronized connection, ack a too-far-ahead segment, let a bare late
 * ACK through (return 1), or send a resync ack.
 * NOTE(review): fragmented extraction — return statements for the accept
 * and reject paths are on lines not visible here. Preserved byte-for-byte.
 */
3157 tcp_sequence(struct sock
*sk
, struct tcphdr
*th
, short len
,
3158 struct options
*opt
, unsigned long saddr
, struct device
*dev
)
3161 * This isn't quite right. sk->acked_seq could be more recent
3162 * than sk->window. This is however close enough. We will accept
3163 * slightly more packets than we should, but it should not cause
3164 * problems unless someone is trying to forge packets.
3166 DPRINTF((DBG_TCP
, "tcp_sequence(sk=%X, th=%X, len = %d, opt=%d, saddr=%X)\n",
3167 sk
, th
, len
, opt
, saddr
));
/* Accept if the start, the end (seq + payload = len - doff*4), or the
 * span of the segment overlaps the receive window. */
3169 if (between(th
->seq
, sk
->acked_seq
, sk
->acked_seq
+ sk
->window
)||
3170 between(th
->seq
+ len
-(th
->doff
*4), sk
->acked_seq
+ 1,
3171 sk
->acked_seq
+ sk
->window
) ||
3172 (before(th
->seq
, sk
->acked_seq
) &&
3173 after(th
->seq
+ len
-(th
->doff
*4), sk
->acked_seq
+ sk
->window
))) {
3176 DPRINTF((DBG_TCP
, "tcp_sequence: rejecting packet.\n"));
3179 * Send a reset if we get something not ours and we are
3180 * unsynchronized. Note: We don't do anything to our end. We
3181 * are just killing the bogus remote connection then we will
3182 * connect again and it will work (with luck).
3185 if(sk
->state
==TCP_SYN_SENT
||sk
->state
==TCP_SYN_RECV
)
3187 tcp_reset(sk
->saddr
,sk
->daddr
,th
,sk
->prot
,NULL
,dev
, sk
->ip_tos
,sk
->ip_ttl
);
3192 * If it's too far ahead, send an ack to let the
3193 * other end know what we expect.
3195 if (after(th
->seq
, sk
->acked_seq
+ sk
->window
)) {
3197 tcp_send_ack(sk
->send_seq
, sk
->acked_seq
, sk
, th
, saddr
);
3203 * if we do this, we won't respond to keepalive packets, since those
3204 * are slightly out of window, and we have to generate an ack
3205 * a late ack out still not to have a sequence number less than
3206 * one we've seen before. Berkeley doesn't seem to do this, but it's
3207 * always hard to be sure.
3209 /* In case it's just a late ack, let it through. */
/* A pure ACK (no payload: len == doff*4, no SYN/FIN) that is not
 * absurdly old (within 32767 of acked_seq) is accepted. */
3210 if (th
->ack
&& len
== (th
->doff
* 4) &&
3211 after(th
->seq
, sk
->acked_seq
- 32767) &&
3212 !th
->fin
&& !th
->syn
) return(1);
3216 /* Try to resync things. */
3217 tcp_send_ack(sk
->send_seq
, sk
->acked_seq
, sk
, th
, saddr
);
/*
 * tcp_rcv() fragment: main TCP receive entry point. Looks up the owning
 * socket, checksums the segment, queues to the backlog if the socket is
 * busy, then dispatches on sk->state (ESTABLISHED/CLOSE_WAIT, LISTEN,
 * SYN_SENT/SYN_RECV, and later states).
 * NOTE(review): this is a fragmented extraction — statements are split
 * across physical lines and MANY interior lines are missing (large gaps in
 * the embedded original line numbers: switch/case labels, returns, and
 * closing braces are absent from this view). The code is preserved
 * byte-for-byte; only comments are added. Do not treat this fragment as a
 * complete control-flow picture.
 */
3227 tcp_rcv(struct sk_buff
*skb
, struct device
*dev
, struct options
*opt
,
3228 unsigned long daddr
, unsigned short len
,
3229 unsigned long saddr
, int redo
, struct inet_protocol
* protocol
)
/* Defensive NULL checks on the call arguments (bodies partly elided). */
3235 DPRINTF((DBG_TCP
, "tcp.c: tcp_rcv skb = NULL\n"));
3238 #if 0 /* FIXME: it's ok for protocol to be NULL */
3240 DPRINTF((DBG_TCP
, "tcp.c: tcp_rcv protocol = NULL\n"));
3244 if (!opt
) { /* FIXME: it's ok for opt to be NULL */
3245 DPRINTF((DBG_TCP
, "tcp.c: tcp_rcv opt = NULL\n"));
3249 DPRINTF((DBG_TCP
, "tcp.c: tcp_rcv dev = NULL\n"));
3254 /* Find the socket. */
3255 sk
= get_sock(&tcp_prot
, th
->dest
, saddr
, th
->source
, daddr
);
3256 DPRINTF((DBG_TCP
, "<<\n"));
3257 DPRINTF((DBG_TCP
, "len = %d, redo = %d, skb=%X\n", len
, redo
, skb
));
3259 /* If this socket has got a reset its to all intents and purposes
/* A "zapped" socket has seen a valid reset; treat as if no socket. */
3261 if (sk
!=NULL
&& sk
->zapped
)
3265 DPRINTF((DBG_TCP
, "sk = %X:\n", sk
));
/* Verify the TCP checksum; drop silently (well, with a debug print). */
3269 if (tcp_check(th
, len
, saddr
, daddr
)) {
3271 DPRINTF((DBG_TCP
, "packet dropped with bad checksum.\n"));
3272 if (inet_debug
== DBG_SLIP
) printk("\rtcp_rcv: bad checksum\n");
3273 kfree_skb(skb
,FREE_READ
);
3275 * We don't release the socket because it was
3276 * never marked in use.
3281 /* See if we know about the socket. */
/* No socket: convert seq to host order and send a RST back. */
3285 th
->seq
= ntohl(th
->seq
);
3286 /* So reset is always called with th->seq in host order */
3287 tcp_reset(daddr
, saddr
, th
, &tcp_prot
, opt
,dev
,skb
->ip_hdr
->tos
,255);
3290 kfree_skb(skb
, FREE_READ
);
3303 th
->seq
= ntohl(th
->seq
);
3305 /* We may need to add it to the backlog here. */
/* Socket busy: append skb to the circular back_log list. */
3308 if (sk
->back_log
== NULL
) {
3313 skb
->next
= sk
->back_log
;
3314 skb
->prev
= sk
->back_log
->prev
;
3315 skb
->prev
->next
= skb
;
3316 skb
->next
->prev
= skb
;
3325 DPRINTF((DBG_TCP
, "tcp.c: tcp_rcv bug sk=NULL redo = 1\n"));
3331 DPRINTF((DBG_TCP
, "tcp.c: tcp_rcv sk->prot = NULL \n"));
3335 /* Charge the memory to the socket. */
/* Drop the segment if the receive buffer allowance is exhausted. */
3336 if (sk
->rmem_alloc
+ skb
->mem_len
>= sk
->rcvbuf
) {
3338 DPRINTF((DBG_TCP
, "dropping packet due to lack of buffer space.\n"));
3339 kfree_skb(skb
, FREE_READ
);
3343 sk
->rmem_alloc
+= skb
->mem_len
;
3345 DPRINTF((DBG_TCP
, "About to do switch.\n"));
3347 /* Now deal with it. */
/* RST handling (state labels elided): tear the connection down. */
3350 * This should close the system down if it's waiting
3351 * for an ack that is never going to be sent.
3356 sk
->err
= ECONNRESET
;
3357 sk
->state
= TCP_CLOSE
;
3358 sk
->shutdown
= SHUTDOWN_MASK
;
3360 sk
->state_change(sk
);
3362 kfree_skb(skb
, FREE_READ
);
3367 case TCP_ESTABLISHED
:
3368 case TCP_CLOSE_WAIT
:
/* Established path: validate sequence, then RST/ACK/URG/data/FIN in order. */
3372 if (!tcp_sequence(sk
, th
, len
, opt
, saddr
,dev
)) {
3373 if (inet_debug
== DBG_SLIP
) printk("\rtcp_rcv: not in seq\n");
3375 /* nice idea, but tcp_sequence already does this. Maybe it shouldn't?? */
3377 tcp_send_ack(sk
->send_seq
, sk
->acked_seq
,
3380 kfree_skb(skb
, FREE_READ
);
3387 /* This means the thing should really be closed. */
3388 sk
->err
= ECONNRESET
;
3390 if (sk
->state
== TCP_CLOSE_WAIT
) {
3395 * A reset with a fin just means that
3396 * the data was not all read.
3398 sk
->state
= TCP_CLOSE
;
3399 sk
->shutdown
= SHUTDOWN_MASK
;
3401 sk
->state_change(sk
);
3403 kfree_skb(skb
, FREE_READ
);
/* Unexpected security/compartment options (or another condition on an
 * elided line) also reset the connection. */
3409 if ((opt
&& (opt
->security
!= 0 ||
3410 opt
->compartment
!= 0)) ||
3413 sk
->err
= ECONNRESET
;
3414 sk
->state
= TCP_CLOSE
;
3415 sk
->shutdown
= SHUTDOWN_MASK
;
3416 tcp_reset(daddr
, saddr
, th
, sk
->prot
, opt
,dev
, sk
->ip_tos
,sk
->ip_ttl
);
3418 sk
->state_change(sk
);
3420 kfree_skb(skb
, FREE_READ
);
/* Each sub-handler returning nonzero consumed/failed => free and bail. */
3425 if (!tcp_ack(sk
, th
, saddr
, len
)) {
3426 kfree_skb(skb
, FREE_READ
);
3432 if (tcp_urg(sk
, th
, saddr
)) {
3433 kfree_skb(skb
, FREE_READ
);
3439 if (tcp_data(skb
, sk
, saddr
, len
)) {
3440 kfree_skb(skb
, FREE_READ
);
3445 /* Moved: you must do data then fin bit */
3446 if (th
->fin
&& tcp_fin(sk
, th
, saddr
, dev
)) {
3447 kfree_skb(skb
, FREE_READ
);
/* CLOSE-ish path (labels elided): dead or connected sockets just drop. */
3456 if (sk
->dead
|| sk
->daddr
) {
3457 DPRINTF((DBG_TCP
, "packet received for closed,dead socket\n"));
3458 kfree_skb(skb
, FREE_READ
);
3466 tcp_reset(daddr
, saddr
, th
, sk
->prot
, opt
,dev
,sk
->ip_tos
,sk
->ip_ttl
);
3468 kfree_skb(skb
, FREE_READ
);
3474 kfree_skb(skb
, FREE_READ
);
3479 tcp_reset(daddr
, saddr
, th
, sk
->prot
, opt
,dev
,sk
->ip_tos
,sk
->ip_ttl
);
3480 kfree_skb(skb
, FREE_READ
);
/* LISTEN path (label elided): reject secure/compartmented SYNs, otherwise
 * queue the connection request for accept(). */
3487 if (opt
->security
!= 0 || opt
->compartment
!= 0) {
3488 tcp_reset(daddr
, saddr
, th
, prot
, opt
,dev
);
3495 * Now we just put the whole thing including
3496 * the header and saddr, and protocol pointer
3497 * into the buffer. We can't respond until the
3498 * user tells us to accept the connection.
3500 tcp_conn_request(sk
, skb
, daddr
, saddr
, opt
, dev
);
3505 kfree_skb(skb
, FREE_READ
);
/* SYN_RECV/SYN_SENT path (labels elided). */
3510 if (!tcp_sequence(sk
, th
, len
, opt
, saddr
,dev
)) {
3511 kfree_skb(skb
, FREE_READ
);
/* RST during handshake: connection refused. */
3518 sk
->err
= ECONNREFUSED
;
3519 sk
->state
= TCP_CLOSE
;
3520 sk
->shutdown
= SHUTDOWN_MASK
;
3523 sk
->state_change(sk
);
3525 kfree_skb(skb
, FREE_READ
);
3530 if (opt
->security
!= 0 || opt
->compartment
!= 0) {
3531 sk
->err
= ECONNRESET
;
3532 sk
->state
= TCP_CLOSE
;
3533 sk
->shutdown
= SHUTDOWN_MASK
;
3534 tcp_reset(daddr
, saddr
, th
, sk
->prot
, opt
, dev
);
3536 wake_up_interruptible(sk
->sleep
);
3538 kfree_skb(skb
, FREE_READ
);
/* Simultaneous open: a SYN without ACK moves SYN_SENT -> SYN_RECV. */
3545 sk
->state
= TCP_SYN_RECV
;
3548 kfree_skb(skb
, FREE_READ
);
/* A bad ACK during the handshake draws a reset. */
3555 if (!tcp_ack(sk
, th
, saddr
, len
)) {
3556 tcp_reset(daddr
, saddr
, th
,
3557 sk
->prot
, opt
,dev
,sk
->ip_tos
,sk
->ip_ttl
);
3558 kfree_skb(skb
, FREE_READ
);
3564 * If the syn bit is also set, switch to
3565 * tcp_syn_recv, and then to established.
3568 kfree_skb(skb
, FREE_READ
);
3573 /* Ack the syn and fall through. */
3574 sk
->acked_seq
= th
->seq
+1;
3575 sk
->fin_seq
= th
->seq
;
3576 tcp_send_ack(sk
->send_seq
, th
->seq
+1,
/* SYN_RECV completion (label elided): valid ACK => ESTABLISHED. */
3580 if (!tcp_ack(sk
, th
, saddr
, len
)) {
3581 tcp_reset(daddr
, saddr
, th
,
3582 sk
->prot
, opt
, dev
,sk
->ip_tos
,sk
->ip_ttl
);
3583 kfree_skb(skb
, FREE_READ
);
3587 sk
->state
= TCP_ESTABLISHED
;
3590 * Now we need to finish filling out
3591 * some of the tcp header.
3593 /* We need to check for mtu info. */
3594 tcp_options(sk
, th
);
3595 sk
->dummy_th
.dest
= th
->source
;
3596 sk
->copied_seq
= sk
->acked_seq
-1;
3598 sk
->state_change(sk
);
3602 * We've already processed his first
3603 * ack. In just about all cases that
3604 * will have set max_window. This is
3605 * to protect us against the possibility
3606 * that the initial window he sent was 0.
3607 * This must occur after tcp_options, which
3610 if (sk
->max_window
== 0) {
3611 sk
->max_window
= 32;
3612 sk
->mss
= min(sk
->max_window
, sk
->mtu
);
3616 * Now process the rest like we were
3617 * already in the established state.
3620 if (tcp_urg(sk
, th
, saddr
)) {
3621 kfree_skb(skb
, FREE_READ
);
3626 if (tcp_data(skb
, sk
, saddr
, len
))
3627 kfree_skb(skb
, FREE_READ
);
3629 if (th
->fin
) tcp_fin(sk
, th
, saddr
, dev
);
/* Later-state path (labels elided): same urg/data/fin sequence. */
3635 if (tcp_urg(sk
, th
, saddr
)) {
3636 kfree_skb(skb
, FREE_READ
);
3642 if (tcp_data(skb
, sk
, saddr
, len
)) {
3643 kfree_skb(skb
, FREE_READ
);
3652 tcp_fin(sk
, th
, saddr
, dev
);
3660 * This routine sends a packet with an out of date sequence
3661 * number. It assumes the other end will try to ack it.
/*
 * tcp_write_wakeup() fragment: transmit a segment carrying an out-of-date
 * sequence number (send_seq - 1) to provoke an ACK from the peer.
 * Only acts on ESTABLISHED / CLOSE_WAIT sockets that have not been reset.
 * NOTE(review): fragmented extraction — some lines (e.g. the zapped test
 * guarding the first return, the build_header failure test) are elided.
 * Code preserved byte-for-byte; comments only.
 */
3664 tcp_write_wakeup(struct sock
*sk
)
3666 struct sk_buff
*buff
;
3668 struct device
*dev
=NULL
;
/* Guarding condition is on an elided line — presumably sk->zapped. */
3672 return; /* Afer a valid reset we can send no more */
3674 if (sk
-> state
!= TCP_ESTABLISHED
&& sk
->state
!= TCP_CLOSE_WAIT
) return;
/* Atomic allocation: this can run from timer context. */
3676 buff
= sk
->prot
->wmalloc(sk
,MAX_ACK_SIZE
,1, GFP_ATOMIC
);
3677 if (buff
== NULL
) return;
3679 buff
->mem_addr
= buff
;
3680 buff
->mem_len
= MAX_ACK_SIZE
;
3681 buff
->len
= sizeof(struct tcphdr
);
3684 DPRINTF((DBG_TCP
, "in tcp_write_wakeup\n"));
3685 t1
= (struct tcphdr
*) buff
->data
;
3687 /* Put in the IP header and routing stuff. */
3688 tmp
= sk
->prot
->build_header(buff
, sk
->saddr
, sk
->daddr
, &dev
,
3689 IPPROTO_TCP
, sk
->opt
, MAX_ACK_SIZE
,sk
->ip_tos
,sk
->ip_ttl
);
/* Header build failed: release the buffer and give up. */
3691 sk
->prot
->wfree(sk
, buff
->mem_addr
, buff
->mem_len
);
3696 t1
= (struct tcphdr
*)((char *)t1
+tmp
);
/* Clone the socket's template TCP header, then back-date the sequence. */
3698 memcpy(t1
,(void *) &sk
->dummy_th
, sizeof(*t1
));
3701 * Use a previous sequence.
3702 * This should cause the other end to send an ack.
3704 t1
->seq
= ntohl(sk
->send_seq
-1);
3713 t1
->ack_seq
= ntohl(sk
->acked_seq
);
3714 t1
->window
= ntohs(tcp_select_window(sk
)/*sk->prot->rspace(sk)*/);
3715 t1
->doff
= sizeof(*t1
)/4;
3716 tcp_send_check(t1
, sk
->saddr
, sk
->daddr
, sizeof(*t1
), sk
);
3718 /* Send it and free it.
3719 * This will prevent the timer from automatically being restarted.
3721 sk
->prot
->queue_xmit(sk
, dev
, buff
, 1);
3725 * This routine probes a zero window. It makes a copy of the first
3726 * packet in the write queue, but with just one byte of data.
/*
 * tcp_send_probe0() fragment: probe a zero window by copying the headers of
 * the first queued write packet plus at most one data byte, transmitting it,
 * then backing off the probe timer exponentially (capped at 120 s).
 * NOTE(review): fragmented extraction — several lines are elided (the zapped
 * guard, the write-queue peek that sets skb/raw, parts of the header fixup).
 * Code preserved byte-for-byte; comments only.
 */
3729 tcp_send_probe0(struct sock
*sk
)
3733 struct sk_buff
*skb2
, *skb
;
3734 int len
, hlen
, data
;
/* Guarding condition is on an elided line — presumably sk->zapped. */
3739 return; /* Afer a valid reset we can send no more */
/* Only probe in states where data may still flow. */
3741 if (sk
-> state
!= TCP_ESTABLISHED
&& sk
->state
!= TCP_CLOSE_WAIT
&&
3742 sk
-> state
!= TCP_FIN_WAIT1
&& sk
->state
!= TCP_FIN_WAIT2
)
3750 /* I know this can't happen but as it does.. */
3753 printk("tcp_send_probe0: NULL device bug!\n");
/* Locate the IP header inside the queued packet's raw buffer and compute
 * header length, payload length, and the probe's total length (headers
 * plus one data byte when any payload exists). */
3759 iph
= (struct iphdr
*) (raw
+ dev
->hard_header_len
);
3761 hlen
= (iph
->ihl
* sizeof(unsigned long)) + dev
->hard_header_len
;
3762 data
= skb
->len
- hlen
- sizeof(struct tcphdr
);
3763 len
= hlen
+ sizeof(struct tcphdr
) + (data
? 1 : 0);
3765 /* Allocate buffer. */
3766 if ((skb2
= alloc_skb(sizeof(struct sk_buff
) + len
, GFP_ATOMIC
)) == NULL
) {
3767 /* printk("alloc failed raw %x th %x hlen %d data %d len %d\n",
3768 raw, skb->h.th, hlen, data, len); */
3769 reset_timer (sk
, TIME_PROBE0
, 10); /* try again real soon */
/* Carry over ARP state and charge the copy to the socket's write memory. */
3773 skb2
->arp
= skb
->arp
;
3775 skb2
->h
.raw
= (char *)(skb2
->data
);
3777 sk
->wmem_alloc
+= skb2
->mem_len
;
3779 /* Copy the packet header into the new buffer. */
3780 memcpy(skb2
->h
.raw
, raw
, len
);
3782 skb2
->h
.raw
+= hlen
; /* it's now h.th -- pointer to the tcp header */
3785 /* source, dest, seq, from existing packet */
3786 t1
->ack_seq
= ntohl(sk
->acked_seq
);
3788 /* doff, fin, from existing packet. Fin is safe because Linux always
3789 * sends fin in a separate packet
3790 * syn, rst, had better be zero in original */
3792 t1
->urg
= 0; /* urgent pointer might be beyond this fragment */
3794 t1
->window
= ntohs(tcp_select_window(sk
)/*sk->prot->rspace(sk)*/);
3796 tcp_send_check(t1
, sk
->saddr
, sk
->daddr
, len
- hlen
, sk
);
3797 /* Send it and free it.
3798 * This will prevent the timer from automatically being restarted.
3800 sk
->prot
->queue_xmit(sk
, dev
, skb2
, 1);
3803 * in the case of retransmissions, there's good reason to limit
3804 * rto to 120 sec, as that's the maximum legal RTT on the Internet.
3805 * For probes it could reasonably be longer. However making it
3806 * much longer could cause unacceptable delays in some situation,
3807 * so we might as well use the same value
/* Exponential backoff for the next probe, capped at 120 seconds. */
3809 sk
->rto
= min(sk
->rto
<< 1, 120*HZ
);
3810 reset_timer (sk
, TIME_PROBE0
, sk
->rto
);
3812 sk
->prot
->retransmits
++;
3816 * Socket option code for TCP.
/*
 * tcp_setsockopt() fragment: handle SOL_TCP options, delegating any other
 * level to ip_setsockopt(). The visible option logic reads an int from user
 * space and (for the Nagle toggle) sets sk->nonagle; the option-name switch
 * labels themselves are on elided lines.
 * NOTE(review): fragmented extraction — preserved byte-for-byte.
 */
3819 int tcp_setsockopt(struct sock
*sk
, int level
, int optname
, char *optval
, int optlen
)
/* Non-TCP levels are handled by the IP layer. */
3824 return ip_setsockopt(sk
,level
,optname
,optval
,optlen
);
/* Validate and fetch the user-supplied int option value. */
3829 err
=verify_area(VERIFY_READ
, optval
, sizeof(int));
3833 val
= get_fs_long((unsigned long *)optval
);
3838 /* if(val<200||val>2048 || val>sk->mtu) */
3840 * values greater than interface MTU won't take effect. however at
3841 * the point when this call is done we typically don't yet know
3842 * which interface is going to be used
/* Range check for the (elided) MSS-style option: 1..MAX_WINDOW. */
3844 if(val
<1||val
>MAX_WINDOW
)
/* Nagle toggle: any nonzero value enables nonagle. */
3849 sk
->nonagle
=(val
==0)?0:1;
/* Unknown TCP option. */
3852 return(-ENOPROTOOPT
);
/*
 * tcp_getsockopt() fragment: handle SOL_TCP options, delegating other levels
 * to ip_getsockopt(). Visible logic returns sk->nonagle as the option value
 * and copies the length and value back to user space.
 * NOTE(review): fragmented extraction — the option-name switch labels are on
 * elided lines. Preserved byte-for-byte.
 */
3856 int tcp_getsockopt(struct sock
*sk
, int level
, int optname
, char *optval
, int *optlen
)
/* Non-TCP levels are handled by the IP layer. */
3861 return ip_getsockopt(sk
,level
,optname
,optval
,optlen
);
3869 val
=sk
->nonagle
; /* Until Johannes stuff is in */
/* Unknown TCP option. */
3872 return(-ENOPROTOOPT
);
/* Write the result length, then the value, to user space. */
3874 err
=verify_area(VERIFY_WRITE
, optlen
, sizeof(int));
3877 put_fs_long(sizeof(int),(unsigned long *) optlen
);
3879 err
=verify_area(VERIFY_WRITE
, optval
, sizeof(int));
3882 put_fs_long(val
,(unsigned long *)optval
);
3888 struct proto tcp_prot
= {