* added 0.99 linux version
[mascara-docs.git] / i386 / linux / linux-0.99 / net / inet / tcp.c
blob0133e183081f6556284ed534ebc4ded07861173a
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * Version: @(#)tcp.c 1.0.16 05/25/93
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
16 * Fixes:
17 * Alan Cox : Numerous verify_area() calls
18 * Alan Cox : Set the ACK bit on a reset
19 * Alan Cox : Stopped it crashing if it closed while sk->inuse=1
20 * and was trying to connect (tcp_err()).
21 * Alan Cox : All icmp error handling was broken
22 * pointers passed where wrong and the
23 * socket was looked up backwards. Nobody
24 * tested any icmp error code obviously.
25 * Alan Cox : tcp_err() now handled properly. It wakes people
26 * on errors. select behaves and the icmp error race
27 * has gone by moving it into sock.c
28 * Alan Cox : tcp_reset() fixed to work for everything not just
29 * packets for unknown sockets.
30 * Alan Cox : tcp option processing.
31 * Alan Cox : Reset tweaked (still not 100%) [Had syn rule wrong]
32 * Herp Rosmanith : More reset fixes
33 * Alan Cox : No longer acks invalid rst frames. Acking
34 * any kind of RST is right out.
35 * Alan Cox : Sets an ignore me flag on an rst receive
36 * otherwise odd bits of prattle escape still
37 * Alan Cox : Fixed another acking RST frame bug. Should stop
38 * LAN workplace lockups.
39 * Alan Cox : Some tidyups using the new skb list facilities
40 * Alan Cox : sk->keepopen now seems to work
41 * Alan Cox : Pulls options out correctly on accepts
42 * Alan Cox : Fixed assorted sk->rqueue->next errors
43 * Alan Cox : PSH doesn't end a TCP read. Switched a bit to skb ops.
44 * Alan Cox : Tidied tcp_data to avoid a potential nasty.
45 * Alan Cox : Added some better commenting, as the tcp is hard to follow
46 * Alan Cox : Removed incorrect check for 20 * psh
47 * Michael O'Reilly : ack < copied bug fix.
48 * Johannes Stille : Misc tcp fixes (not all in yet).
49 * Alan Cox : FIN with no memory -> CRASH
50 * Alan Cox : Added socket option proto entries. Also added awareness of them to accept.
51 * Alan Cox : Added TCP options (SOL_TCP)
52 * Alan Cox : Switched wakeup calls to callbacks, so the kernel can layer network sockets.
53 * Alan Cox : Use ip_tos/ip_ttl settings.
54 * Alan Cox : Handle FIN (more) properly (we hope).
55 * Alan Cox : RST frames sent on unsynchronised state ack error/
56 * Alan Cox : Put in missing check for SYN bit.
57 * Alan Cox : Added tcp_select_window() aka NET2E
58 * window non shrink trick.
59 * Alan Cox : Added a couple of small NET2E timer fixes
60 * Charles Hedrick : TCP fixes
61 * Toomas Tamm : TCP window fixes
64 * To Fix:
65 * Possibly a problem with accept(). BSD accept never fails after
66 * it causes a select. Linux can - given the official select semantics I
67 * feel that _really_ its the BSD network programs that are bust (notably
68 * inetd, which hangs occasionally because of this).
69 * Add VJ Fastrecovery algorithm ?
70 * Protocol closedown badly messed up.
71 * Incompatibility with spider ports (tcp hangs on that
72 * socket occasionally).
73 * MSG_PEEK and read on same socket at once can cause crashes.
75 * This program is free software; you can redistribute it and/or
76 * modify it under the terms of the GNU General Public License
77 * as published by the Free Software Foundation; either version
78 * 2 of the License, or(at your option) any later version.
80 #include <linux/types.h>
81 #include <linux/sched.h>
82 #include <linux/mm.h>
83 #include <linux/string.h>
84 #include <linux/socket.h>
85 #include <linux/sockios.h>
86 #include <linux/termios.h>
87 #include <linux/in.h>
88 #include <linux/fcntl.h>
89 #include "inet.h"
90 #include "dev.h"
91 #include "ip.h"
92 #include "protocol.h"
93 #include "icmp.h"
94 #include "tcp.h"
95 #include "skbuff.h"
96 #include "sock.h"
97 #include "arp.h"
98 #include <linux/errno.h>
99 #include <linux/timer.h>
100 #include <asm/system.h>
101 #include <asm/segment.h>
102 #include <linux/mm.h>
/* SEQ_TICK / seq_offset: presumably used by the initial-sequence-number
 * generation code elsewhere in this file — TODO confirm against the
 * tcp_connect/tcp_conn_request paths (not visible in this chunk). */
104 #define SEQ_TICK 3
105 unsigned long seq_offset;
/* NOTE(review): flag macro only; its effect (treating same-subnet hosts
 * as local) is implemented elsewhere — verify against ip.c/route code. */
106 #define SUBNETSARELOCAL
/* Return the smaller of two unsigned values. */
static __inline__ int
min(unsigned int a, unsigned int b)
{
	return (a <= b) ? a : b;
}
116 void
117 print_th(struct tcphdr *th)
119 unsigned char *ptr;
121 if (inet_debug != DBG_TCP) return;
123 printk("TCP header:\n");
124 ptr =(unsigned char *)(th + 1);
125 printk(" source=%d, dest=%d, seq =%ld, ack_seq = %ld\n",
126 ntohs(th->source), ntohs(th->dest),
127 ntohl(th->seq), ntohl(th->ack_seq));
128 printk(" fin=%d, syn=%d, rst=%d, psh=%d, ack=%d, urg=%d res1=%d res2=%d\n",
129 th->fin, th->syn, th->rst, th->psh, th->ack,
130 th->urg, th->res1, th->res2);
131 printk(" window = %d, check = %d urg_ptr = %d\n",
132 ntohs(th->window), ntohs(th->check), ntohs(th->urg_ptr));
133 printk(" doff = %d\n", th->doff);
134 printk(" options = %d %d %d %d\n", ptr[0], ptr[1], ptr[2], ptr[3]);
139 /* This routine grabs the first thing off of a rcv queue. */
140 static struct sk_buff *
141 get_firstr(struct sock *sk)
143 return skb_dequeue(&sk->rqueue);
147 * Difference between two values in tcp ack terms.
/*
 * Magnitude of the difference between two TCP sequence numbers,
 * computed with wrap-around (modular) subtraction.
 */
static long
diff(unsigned long seq1, unsigned long seq2)
{
	long delta = seq1 - seq2;

	if (delta > 0)
		return delta;
	/* negate (two's-complement: identical to the old ~delta + 1) */
	return -delta;
}
162 /* This routine picks a TCP windows for a socket based on
163 the following constraints
165 1. The window can never be shrunk once it is offered (RFC 793)
166 2. We limit memory per socket
168 For now we use NET2E3's heuristic of offering half the memory
169 we have handy. All is not as bad as this seems however because
170 of two things. Firstly we will bin packets even within the window
171 in order to get the data we are waiting for into the memory limit.
172 Secondly we bin common duplicate forms at receive time
174 Better heuristics welcome
177 static int tcp_select_window(struct sock *sk)
179 int new_window = sk->prot->rspace(sk);
182 * two things are going on here. First, we don't ever offer a
183 * window less than min(sk->mss, MAX_WINDOW/2). This is the
184 * receiver side of SWS as specified in RFC1122.
185 * Second, we always give them at least the window they
186 * had before, in order to avoid retracting window. This
187 * is technically allowed, but RFC1122 advises against it and
188 * in practice it causes trouble.
190 if (new_window < min(sk->mss, MAX_WINDOW/2) ||
191 new_window < sk->window)
192 return(sk->window);
193 return(new_window);
196 /* Enter the time wait state. */
198 static void tcp_time_wait(struct sock *sk)
200 sk->state = TCP_TIME_WAIT;
201 sk->shutdown = SHUTDOWN_MASK;
202 if (!sk->dead)
203 sk->state_change(sk);
204 reset_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
208 * A timer event has trigger a tcp retransmit timeout. The
209 * socket xmit queue is ready and set up to send. Because
210 * the ack receive code keeps the queue straight we do
211 * nothing clever here.
214 static void
215 tcp_retransmit(struct sock *sk, int all)
217 if (all) {
218 ip_retransmit(sk, all);
219 return;
222 sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */
223 /* sk->ssthresh in theory can be zero. I guess that's OK */
224 sk->cong_count = 0;
226 sk->cong_window = 1;
228 /* Do the actual retransmit. */
229 ip_retransmit(sk, all);
234 * This routine is called by the ICMP module when it gets some
235 * sort of error condition. If err < 0 then the socket should
236 * be closed and the error returned to the user. If err > 0
237 * it's just the icmp type << 8 | icmp code. After adjustment
238 * header points to the first 8 bytes of the tcp header. We need
239 * to find the appropriate port.
/*
 * ICMP error callback for TCP.
 *  err < 0 : treated as a fatal local error; -err is stored in sk->err
 *            and the owner is woken via error_report().
 *  err > 0 : (icmp_type << 8) | icmp_code, translated through
 *            icmp_err_convert[].
 * "header" arrives pointing at the offending packet's IP header; after
 * the ihl adjustment it points at the embedded TCP header, whose first
 * 8 bytes are enough to recover the port pair for the socket lookup.
 */
241 void
242 tcp_err(int err, unsigned char *header, unsigned long daddr,
243 unsigned long saddr, struct inet_protocol *protocol)
245 struct tcphdr *th;
246 struct sock *sk;
247 struct iphdr *iph=(struct iphdr *)header;
/* skip the variable-length IP header to reach the TCP header */
249 header+=4*iph->ihl;
251 DPRINTF((DBG_TCP, "TCP: tcp_err(%d, hdr=%X, daddr=%X saddr=%X, protocol=%X)\n",
252 err, header, daddr, saddr, protocol));
254 th =(struct tcphdr *)header;
/* ports are swapped relative to our own view of the connection,
 * hence source is matched against our local port and vice versa */
255 sk = get_sock(&tcp_prot, th->source/*dest*/, daddr, th->dest/*source*/, saddr);
256 print_th(th);
258 if (sk == NULL) return;
260 if(err<0)
262 sk->err = -err;
263 sk->error_report(sk);
264 return;
/* Source quench: back off the congestion window linearly. */
267 if ((err & 0xff00) == (ICMP_SOURCE_QUENCH << 8)) {
269 * FIXME:
270 * For now we will just trigger a linear backoff.
271 * The slow start code should cause a real backoff here.
273 if (sk->cong_window > 4) sk->cong_window--;
274 return;
277 DPRINTF((DBG_TCP, "TCP: icmp_err got error\n"));
278 sk->err = icmp_err_convert[err & 0xff].errno;
281 * If we've already connected we will keep trying
282 * until we time out, or the user gives up.
/* Only a connection still in SYN_SENT is torn down by a fatal ICMP. */
284 if (icmp_err_convert[err & 0xff].fatal) {
285 if (sk->state == TCP_SYN_SENT) {
286 sk->state = TCP_CLOSE;
287 sk->error_report(sk); /* Wake people up to see the error (see connect in sock.c) */
290 return;
295 * Walk down the receive queue counting readable data until we hit the end or we find a gap
296 * in the received data queue (ie a frame missing that needs sending to us)
/*
 * Count the bytes that a reader could consume right now: walk the
 * receive queue from copied_seq+1 until the end of the queue, a
 * sequence hole (a missing frame), or a PSH segment, summing in-order
 * payload. SYN consumes a sequence number but no data (hence the
 * ++/-- pair below); bytes at/after the urgent pointer are excluded.
 * Runs with interrupts off so the queue cannot change underneath us.
 */
299 static int
300 tcp_readable(struct sock *sk)
302 unsigned long counted;
303 unsigned long amount;
304 struct sk_buff *skb;
305 int count=0;
306 int sum;
307 unsigned long flags;
309 DPRINTF((DBG_TCP, "tcp_readable(sk=%X)\n", sk));
310 if(sk && sk->debug)
311 printk("tcp_readable: %p - ",sk);
313 if (sk == NULL || skb_peek(&sk->rqueue) == NULL) /* Empty sockets are easy! */
315 if(sk && sk->debug)
316 printk("empty\n");
317 return(0);
320 counted = sk->copied_seq+1; /* Where we are at the moment */
321 amount = 0;
323 save_flags(flags); /* So nobody adds things at the wrong moment */
324 cli();
325 skb =(struct sk_buff *)sk->rqueue;
327 /* Do until a push or until we are out of data. */
328 do {
329 count++;
330 #ifdef OLD
331 /* This is wrong: It breaks Chameleon amongst other stacks */
332 if (count > 20) {
333 restore_flags(flags);
334 DPRINTF((DBG_TCP, "tcp_readable, more than 20 packets without a psh\n"));
335 printk("tcp_read: possible read_queue corruption.\n");
336 return(amount);
338 #endif
339 if (before(counted, skb->h.th->seq)) /* Found a hole so stops here */
340 break;
341 sum = skb->len -(counted - skb->h.th->seq); /* Length - header but start from where we are up to (avoid overlaps) */
342 if (skb->h.th->syn) sum++;
343 if (skb->h.th->urg) {
344 sum -= ntohs(skb->h.th->urg_ptr); /* Dont count urg data */
346 if (sum >= 0) { /* Add it up, move on */
347 amount += sum;
/* SYN took one sequence number but carries no readable byte */
348 if (skb->h.th->syn) amount--;
349 counted += sum;
351 if (amount && skb->h.th->psh) break;
352 skb =(struct sk_buff *)skb->next; /* Move along */
/* the receive queue is circular; stop when we wrap to the head */
353 } while(skb != sk->rqueue);
354 restore_flags(flags);
355 DPRINTF((DBG_TCP, "tcp readable returning %d bytes\n", amount));
356 if(sk->debug)
357 printk("got %lu bytes.\n",amount);
358 return(amount);
363 * Wait for a TCP event. Note the oddity with SEL_IN and reading. The
364 * listening socket has a receive queue of sockets to accept.
/*
 * select() support for TCP sockets. Returns 1 (ready) or 0 (caller
 * should sleep) for the requested condition:
 *  SEL_IN  - readable data, a pending accept on a LISTEN socket, a
 *            pending error, or a receive shutdown all count as ready;
 *  SEL_OUT - ready when at least one MSS of write space exists and the
 *            connection is past the SYN states (send-shutdown -> 0);
 *  SEL_EX  - ready only when an error is pending.
 * The socket is locked (sk->inuse) for the duration and released on
 * every exit path; select_wait() registers us before the re-check so
 * a wakeup between test and sleep cannot be lost.
 */
367 static int
368 tcp_select(struct sock *sk, int sel_type, select_table *wait)
370 DPRINTF((DBG_TCP, "tcp_select(sk=%X, sel_type = %d, wait = %X)\n",
371 sk, sel_type, wait));
373 sk->inuse = 1;
374 switch(sel_type) {
375 case SEL_IN:
376 if(sk->debug)
377 printk("select in");
378 select_wait(sk->sleep, wait);
379 if(sk->debug)
380 printk("-select out");
/* A LISTEN socket's rqueue holds connections to accept, not data. */
381 if (skb_peek(&sk->rqueue) != NULL) {
382 if (sk->state == TCP_LISTEN || tcp_readable(sk)) {
383 release_sock(sk);
384 if(sk->debug)
385 printk("-select ok data\n");
386 return(1);
389 if (sk->err != 0) /* Receiver error */
391 release_sock(sk);
392 if(sk->debug)
393 printk("-select ok error");
394 return(1);
396 if (sk->shutdown & RCV_SHUTDOWN) {
397 release_sock(sk);
398 if(sk->debug)
399 printk("-select ok down\n");
400 return(1);
401 } else {
402 release_sock(sk);
403 if(sk->debug)
404 printk("-select fail\n");
405 return(0);
407 case SEL_OUT:
408 select_wait(sk->sleep, wait);
409 if (sk->shutdown & SEND_SHUTDOWN) {
410 DPRINTF((DBG_TCP,
411 "write select on shutdown socket.\n"));
413 /* FIXME: should this return an error? */
414 release_sock(sk);
415 return(0);
419 * FIXME:
420 * Hack so it will probably be able to write
421 * something if it says it's ok to write.
423 if (sk->prot->wspace(sk) >= sk->mss) {
424 release_sock(sk);
425 /* This should cause connect to work ok. */
/* not writable while the handshake is still in flight */
426 if (sk->state == TCP_SYN_RECV ||
427 sk->state == TCP_SYN_SENT) return(0);
428 return(1);
430 DPRINTF((DBG_TCP,
431 "tcp_select: sleeping on write sk->wmem_alloc = %d, "
432 "sk->packets_out = %d\n"
433 "sk->wback = %X, sk->wfront = %X\n"
434 "sk->send_seq = %u, sk->window_seq=%u\n",
435 sk->wmem_alloc, sk->packets_out,
436 sk->wback, sk->wfront,
437 sk->send_seq, sk->window_seq));
439 release_sock(sk);
440 return(0);
441 case SEL_EX:
442 select_wait(sk->sleep,wait);
443 if (sk->err) {
444 release_sock(sk);
445 return(1);
447 release_sock(sk);
448 return(0);
/* unknown sel_type: not ready */
451 release_sock(sk);
452 return(0);
/*
 * ioctl() support for TCP sockets:
 *  DDIOCSDBG - adjust the TCP debugging level;
 *  TIOCINQ   - bytes readable now (via tcp_readable); EINVAL on LISTEN;
 *  SIOCATMARK- 1 if the next byte to read is at the urgent mark;
 *  TIOCOUTQ  - free space in the send buffer; EINVAL on LISTEN.
 * Results are copied to user space after a verify_area() check.
 * NOTE(review): the scrape lost this function's return-type line; the
 * callers' usage here implies it returns int.
 */
457 tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
459 int err;
460 DPRINTF((DBG_TCP, "tcp_ioctl(sk=%X, cmd = %d, arg=%X)\n", sk, cmd, arg));
461 switch(cmd) {
462 case DDIOCSDBG:
463 return(dbg_ioctl((void *) arg, DBG_TCP));
465 case TIOCINQ:
466 #ifdef FIXME /* FIXME: */
467 case FIONREAD:
468 #endif
470 unsigned long amount;
472 if (sk->state == TCP_LISTEN) return(-EINVAL);
/* lock the socket so the queue is stable while we count */
474 sk->inuse = 1;
475 amount = tcp_readable(sk);
476 release_sock(sk);
477 DPRINTF((DBG_TCP, "returning %d\n", amount));
478 err=verify_area(VERIFY_WRITE,(void *)arg,
479 sizeof(unsigned long));
480 if(err)
481 return err;
482 put_fs_long(amount,(unsigned long *)arg);
483 return(0);
485 case SIOCATMARK:
487 struct sk_buff *skb;
488 int answ = 0;
491 * Try to figure out if we need to read
492 * some urgent data.
494 sk->inuse = 1;
495 if ((skb=skb_peek(&sk->rqueue)) != NULL)
/* at the mark when the next in-order byte starts an URG segment */
497 if (sk->copied_seq+1 == skb->h.th->seq && skb->h.th->urg)
498 answ = 1;
500 release_sock(sk);
501 err=verify_area(VERIFY_WRITE,(void *) arg,
502 sizeof(unsigned long));
503 if(err)
504 return err;
505 put_fs_long(answ,(int *) arg);
506 return(0);
508 case TIOCOUTQ:
510 unsigned long amount;
512 if (sk->state == TCP_LISTEN) return(-EINVAL);
513 amount = sk->prot->wspace(sk);
514 err=verify_area(VERIFY_WRITE,(void *)arg,
515 sizeof(unsigned long));
516 if(err)
517 return err;
518 put_fs_long(amount,(unsigned long *)arg);
519 return(0);
521 default:
522 return(-EINVAL);
527 /* This routine computes a TCP checksum. */
/*
 * Compute the TCP checksum over the pseudo-header plus "len" bytes
 * starting at th, using i386 inline assembly:
 *  1. fold daddr + saddr + (length<<16 | proto) into a 32-bit sum;
 *  2. add the segment 32 bits at a time (lodsl/adcl loop);
 *  3. fold 32 -> 16 bits, then absorb a trailing word and/or byte.
 * Returns the one's-complement of the low 16 bits of the sum.
 * NOTE(review): ntohs(len) in the pseudo-header term looks like it is
 * standing in for htons(len) — identical on i386, but confirm before
 * reusing this on a big-endian port.
 */
528 unsigned short
529 tcp_check(struct tcphdr *th, int len,
530 unsigned long saddr, unsigned long daddr)
532 unsigned long sum;
/* a zero source address means "use our own primary address" */
534 if (saddr == 0) saddr = my_addr();
535 print_th(th);
536 __asm__("\t addl %%ecx,%%ebx\n"
537 "\t adcl %%edx,%%ebx\n"
538 "\t adcl $0, %%ebx\n"
539 : "=b"(sum)
540 : "0"(daddr), "c"(saddr), "d"((ntohs(len) << 16) + IPPROTO_TCP*256)
541 : "cx","bx","dx" );
/* main loop: sum len/4 full 32-bit words with carry propagation */
543 if (len > 3) {
544 __asm__("\tclc\n"
545 "1:\n"
546 "\t lodsl\n"
547 "\t adcl %%eax, %%ebx\n"
548 "\t loop 1b\n"
549 "\t adcl $0, %%ebx\n"
550 : "=b"(sum) , "=S"(th)
551 : "0"(sum), "c"(len/4) ,"1"(th)
552 : "ax", "cx", "bx", "si" );
555 /* Convert from 32 bits to 16 bits. */
556 __asm__("\t movl %%ebx, %%ecx\n"
557 "\t shrl $16,%%ecx\n"
558 "\t addw %%cx, %%bx\n"
559 "\t adcw $0, %%bx\n"
560 : "=b"(sum)
561 : "0"(sum)
562 : "bx", "cx");
564 /* Check for an extra word. */
565 if ((len & 2) != 0) {
566 __asm__("\t lodsw\n"
567 "\t addw %%ax,%%bx\n"
568 "\t adcw $0, %%bx\n"
569 : "=b"(sum), "=S"(th)
570 : "0"(sum) ,"1"(th)
571 : "si", "ax", "bx");
574 /* Now check for the extra byte. */
575 if ((len & 1) != 0) {
576 __asm__("\t lodsb\n"
577 "\t movb $0,%%ah\n"
578 "\t addw %%ax,%%bx\n"
579 "\t adcw $0, %%bx\n"
580 : "=b"(sum)
581 : "0"(sum) ,"S"(th)
582 : "si", "ax", "bx");
585 /* We only want the bottom 16 bits, but we never cleared the top 16. */
586 return((~sum) & 0xffff);
590 void tcp_send_check(struct tcphdr *th, unsigned long saddr,
591 unsigned long daddr, int len, struct sock *sk)
593 th->check = 0;
594 th->check = tcp_check(th, len, saddr, daddr);
595 return;
/*
 * Finish and transmit (or queue) one outgoing segment. After sanity
 * checks and checksumming, the segment is queued on the write list
 * instead of sent when any of: the data lies beyond the peer's window,
 * a retransmit is in progress, or the congestion window is full.
 * A pure header is only legal when it carries SYN or FIN, since those
 * flags notionally consume a sequence number.
 */
598 static void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
600 int size;
602 /* length of packet (not counting length of pre-tcp headers) */
603 size = skb->len - ((unsigned char *) skb->h.th - skb->data);
605 /* sanity check it.. */
606 if (size < sizeof(struct tcphdr) || size > skb->len) {
607 printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
608 skb, skb->data, skb->h.th, skb->len);
609 kfree_skb(skb, FREE_WRITE);
610 return;
613 /* If we have queued a header size packet.. */
614 if (size == sizeof(struct tcphdr)) {
615 /* If its got a syn or fin its notionally included in the size..*/
616 if(!skb->h.th->syn && !skb->h.th->fin) {
617 printk("tcp_send_skb: attempt to queue a bogon.\n");
618 kfree_skb(skb,FREE_WRITE);
619 return;
623 /* We need to complete and send the packet. */
624 tcp_send_check(skb->h.th, sk->saddr, sk->daddr, size, sk);
626 skb->h.seq = sk->send_seq;
/* hold back if outside the window, retransmitting, or cwnd is full */
627 if (after(sk->send_seq , sk->window_seq) ||
628 (sk->retransmits && sk->timeout == TIME_WRITE) ||
629 sk->packets_out >= sk->cong_window) {
630 DPRINTF((DBG_TCP, "sk->cong_window = %d, sk->packets_out = %d\n",
631 sk->cong_window, sk->packets_out));
632 DPRINTF((DBG_TCP, "sk->send_seq = %d, sk->window_seq = %d\n",
633 sk->send_seq, sk->window_seq));
/* append to the tail of the write queue (wfront..wback) */
634 skb->next = NULL;
635 skb->magic = TCP_WRITE_QUEUE_MAGIC;
636 if (sk->wback == NULL) {
637 sk->wfront = skb;
638 } else {
639 sk->wback->next = skb;
641 sk->wback = skb;
/* zero-window case with nothing in flight: arm the persist timer */
642 if (before(sk->window_seq, sk->wfront->h.seq) &&
643 sk->send_head == NULL &&
644 sk->ack_backlog == 0)
645 reset_timer(sk, TIME_PROBE0, sk->rto);
646 } else {
647 sk->prot->queue_xmit(sk, skb->dev, skb, 0);
651 struct sk_buff * tcp_dequeue_partial(struct sock * sk)
653 struct sk_buff * skb;
654 unsigned long flags;
656 save_flags(flags);
657 cli();
658 skb = sk->partial;
659 if (skb) {
660 sk->partial = NULL;
661 del_timer(&sk->partial_timer);
663 restore_flags(flags);
664 return skb;
667 static void tcp_send_partial(struct sock *sk)
669 struct sk_buff *skb;
671 if (sk == NULL)
672 return;
673 while ((skb = tcp_dequeue_partial(sk)) != NULL)
674 tcp_send_skb(sk, skb);
/*
 * Install skb as the socket's pending partial packet and (re)arm the
 * flush timer so it cannot linger forever. Any previously queued
 * partial packet is displaced and sent immediately (outside the
 * interrupts-off region).
 * NOTE(review): expires is set to plain HZ with no jiffies offset —
 * presumably this kernel's add_timer() takes a relative expiry;
 * confirm against kernel/sched.c before changing.
 */
677 void tcp_enqueue_partial(struct sk_buff * skb, struct sock * sk)
679 struct sk_buff * tmp;
680 unsigned long flags;
682 save_flags(flags);
683 cli();
684 tmp = sk->partial;
685 if (tmp)
686 del_timer(&sk->partial_timer);
687 sk->partial = skb;
688 sk->partial_timer.expires = HZ;
689 sk->partial_timer.function = (void (*)(unsigned long)) tcp_send_partial;
690 sk->partial_timer.data = (unsigned long) sk;
691 add_timer(&sk->partial_timer);
692 restore_flags(flags);
/* send the displaced packet after interrupts are back on */
693 if (tmp)
694 tcp_send_skb(sk, tmp);
698 /* This routine sends an ack and also updates the window. */
/*
 * Build and transmit a bare ACK carrying "sequence"/"ack" and a
 * freshly selected receive window. If no buffer memory is available
 * the ack is deferred by bumping ack_backlog and arming a short
 * write timer. When this ack covers everything received so far, the
 * ack bookkeeping is cleared and an idle write timer is replaced by
 * the keepalive timer (or deleted).
 * NOTE(review): seq/ack_seq are stored with ntohl() where htonl() is
 * meant — identical on i386, but worth confirming on other ports.
 */
699 static void
700 tcp_send_ack(unsigned long sequence, unsigned long ack,
701 struct sock *sk,
702 struct tcphdr *th, unsigned long daddr)
704 struct sk_buff *buff;
705 struct tcphdr *t1;
706 struct device *dev = NULL;
707 int tmp;
709 if(sk->zapped)
710 return; /* We have been reset, we may not send again */
712 * We need to grab some memory, and put together an ack,
713 * and then put it into the queue to be sent.
715 buff = sk->prot->wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
716 if (buff == NULL) {
717 /* Force it to send an ack. */
718 sk->ack_backlog++;
719 if (sk->timeout != TIME_WRITE && tcp_connected(sk->state)) {
720 reset_timer(sk, TIME_WRITE, 10);
722 if (inet_debug == DBG_SLIP) printk("\rtcp_ack: malloc failed\n");
723 return;
726 buff->mem_addr = buff;
727 buff->mem_len = MAX_ACK_SIZE;
728 buff->len = sizeof(struct tcphdr);
729 buff->sk = sk;
730 t1 =(struct tcphdr *) buff->data;
732 /* Put in the IP header and routing stuff. */
733 tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
734 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
735 if (tmp < 0) {
736 buff->free=1;
737 sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
738 if (inet_debug == DBG_SLIP) printk("\rtcp_ack: build_header failed\n");
739 return;
741 buff->len += tmp;
/* the TCP header starts right after the IP header just built */
742 t1 =(struct tcphdr *)((char *)t1 +tmp);
744 /* FIXME: */
745 memcpy(t1, th, sizeof(*t1)); /* this should probably be removed */
747 /* swap the send and the receive. */
748 t1->dest = th->source;
749 t1->source = th->dest;
750 t1->seq = ntohl(sequence);
751 t1->ack = 1;
752 sk->window = tcp_select_window(sk);/*sk->prot->rspace(sk);*/
753 t1->window = ntohs(sk->window);
754 t1->res1 = 0;
755 t1->res2 = 0;
756 t1->rst = 0;
757 t1->urg = 0;
758 t1->syn = 0;
759 t1->psh = 0;
760 t1->fin = 0;
/* this ack is fully up to date: clear the pending-ack bookkeeping */
761 if (ack == sk->acked_seq) {
762 sk->ack_backlog = 0;
763 sk->bytes_rcv = 0;
764 sk->ack_timed = 0;
765 if (sk->send_head == NULL && sk->wfront == NULL && sk->timeout == TIME_WRITE)
767 if(sk->keepopen)
768 reset_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
769 else
770 delete_timer(sk);
773 t1->ack_seq = ntohl(ack);
774 t1->doff = sizeof(*t1)/4;
775 tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
776 if (sk->debug)
777 printk("\rtcp_ack: seq %lx ack %lx\n", sequence, ack);
778 sk->prot->queue_xmit(sk, dev, buff, 1);
782 /* This routine builds a generic TCP header. */
783 static int
784 tcp_build_header(struct tcphdr *th, struct sock *sk, int push)
787 /* FIXME: want to get rid of this. */
788 memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
789 th->seq = htonl(sk->send_seq);
790 th->psh =(push == 0) ? 1 : 0;
791 th->doff = sizeof(*th)/4;
792 th->ack = 1;
793 th->fin = 0;
794 sk->ack_backlog = 0;
795 sk->bytes_rcv = 0;
796 sk->ack_timed = 0;
797 th->ack_seq = htonl(sk->acked_seq);
798 sk->window = tcp_select_window(sk)/*sk->prot->rspace(sk)*/;
799 th->window = htons(sk->window);
801 return(sizeof(*th));
805 * This routine copies from a user buffer into a socket,
806 * and starts the transmit system.
/*
 * Copy user data into segments and start transmission.
 * Overall loop per iteration: bail on pending error or send-shutdown;
 * wait (unless nonblock) for the connection to reach ESTABLISHED or
 * CLOSE_WAIT; first try to top up a queued half-built packet; else
 * size a new segment against the offered window (with sender-side SWS
 * avoidance), allocate a buffer (sleeping if memory is tight), build
 * IP + TCP headers, copy the payload from user space, and either send
 * it or queue it as a new partial segment. Returns bytes copied, or a
 * negative errno when nothing was copied.
 */
808 static int
809 tcp_write(struct sock *sk, unsigned char *from,
810 int len, int nonblock, unsigned flags)
812 int copied = 0;
813 int copy;
814 int tmp;
815 struct sk_buff *skb;
816 struct sk_buff *send_tmp;
817 unsigned char *buff;
818 struct proto *prot;
819 struct device *dev = NULL;
821 DPRINTF((DBG_TCP, "tcp_write(sk=%X, from=%X, len=%d, nonblock=%d, flags=%X)\n",
822 sk, from, len, nonblock, flags));
824 sk->inuse=1;
825 prot = sk->prot;
826 while(len > 0) {
827 if (sk->err) { /* Stop on an error */
828 release_sock(sk);
829 if (copied) return(copied);
830 tmp = -sk->err;
831 sk->err = 0;
832 return(tmp);
835 /* First thing we do is make sure that we are established. */
836 if (sk->shutdown & SEND_SHUTDOWN) {
837 release_sock(sk);
838 sk->err = EPIPE;
839 if (copied) return(copied);
840 sk->err = 0;
841 return(-EPIPE);
845 /* Wait for a connection to finish. */
847 while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT) {
848 if (sk->err) {
849 release_sock(sk);
850 if (copied) return(copied);
851 tmp = -sk->err;
852 sk->err = 0;
853 return(tmp);
/* a state outside the handshake means the connection is gone */
856 if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV) {
857 release_sock(sk);
858 DPRINTF((DBG_TCP, "tcp_write: return 1\n"));
859 if (copied) return(copied);
861 if (sk->err) {
862 tmp = -sk->err;
863 sk->err = 0;
864 return(tmp);
867 if (sk->keepopen) {
868 send_sig(SIGPIPE, current, 0);
870 return(-EPIPE);
873 if (nonblock || copied) {
874 release_sock(sk);
875 DPRINTF((DBG_TCP, "tcp_write: return 2\n"));
876 if (copied) return(copied);
877 return(-EAGAIN);
/* re-check state with interrupts off before sleeping, so a wakeup
 * between the test and the sleep cannot be lost */
880 release_sock(sk);
881 cli();
882 if (sk->state != TCP_ESTABLISHED &&
883 sk->state != TCP_CLOSE_WAIT && sk->err == 0) {
884 interruptible_sleep_on(sk->sleep);
885 if (current->signal & ~current->blocked) {
886 sti();
887 DPRINTF((DBG_TCP, "tcp_write: return 3\n"));
888 if (copied) return(copied);
889 return(-ERESTARTSYS);
892 sk->inuse = 1;
893 sti();
897 * The following code can result in copy <= 0 if sk->mss is ever
898 * decreased. It shouldn't be. sk->mss is min(sk->mtu, sk->max_window).
899 * sk->mtu is constant once SYN processing is finished. I.e. we
900 * had better not get here until we've seen his SYN and at least one
901 * valid ack. (The SYN sets sk->mtu and the ack sets sk->max_window.)
902 * But ESTABLISHED should guarantee that. sk->max_window is by definition
903 * non-decreasing. Note that any ioctl to set user_mss must be done
904 * before the exchange of SYN's. If the initial ack from the other
905 * end has a window of 0, max_window and thus mss will both be 0.
908 /* Now we need to check if we have a half built packet. */
909 if ((skb = tcp_dequeue_partial(sk)) != NULL) {
910 int hdrlen;
912 /* IP header + TCP header */
913 hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
914 + sizeof(struct tcphdr);
916 /* Add more stuff to the end of skb->len */
917 if (!(flags & MSG_OOB)) {
918 copy = min(sk->mss - (skb->len - hdrlen), len);
919 /* FIXME: this is really a bug. */
920 if (copy <= 0) {
921 printk("TCP: **bug**: \"copy\" <= 0!!\n");
922 copy = 0;
925 memcpy_fromfs(skb->data + skb->len, from, copy);
926 skb->len += copy;
927 from += copy;
928 copied += copy;
929 len -= copy;
930 sk->send_seq += copy;
/* send the topped-up segment once full, OOB, or nothing in flight;
 * otherwise re-queue it as the partial packet */
932 if ((skb->len - hdrlen) >= sk->mss ||
933 (flags & MSG_OOB) ||
934 !sk->packets_out)
935 tcp_send_skb(sk, skb);
936 else
937 tcp_enqueue_partial(skb, sk);
938 continue;
942 * We also need to worry about the window.
943 * If window < 1/2 the maximum window we've seen from this
944 * host, don't use it. This is sender side
945 * silly window prevention, as specified in RFC1122.
946 * (Note that this is different than earlier versions of
947 * SWS prevention, e.g. RFC813.). What we actually do is
948 * use the whole MSS. Since the results in the right
949 * edge of the packet being outside the window, it will
950 * be queued for later rather than sent.
953 copy = diff(sk->window_seq, sk->send_seq);
954 /* what if max_window == 1? In that case max_window >> 1 is 0.
955 * however in that case copy == max_window, so it's OK to use
956 * the window */
957 if (copy < (sk->max_window >> 1))
958 copy = sk->mss;
959 copy = min(copy, sk->mss);
960 copy = min(copy, len);
962 /* We should really check the window here also. */
963 send_tmp = NULL;
964 if (copy < sk->mss && !(flags & MSG_OOB)) {
965 /* We will release the socket incase we sleep here. */
966 release_sock(sk);
967 /* NB: following must be mtu, because mss can be increased.
968 * mss is always <= mtu */
969 skb = prot->wmalloc(sk, sk->mtu + 128 + prot->max_header + sizeof(*skb), 0, GFP_KERNEL);
970 sk->inuse = 1;
971 send_tmp = skb;
972 } else {
973 /* We will release the socket incase we sleep here. */
974 release_sock(sk);
975 skb = prot->wmalloc(sk, copy + prot->max_header + sizeof(*skb), 0, GFP_KERNEL);
976 sk->inuse = 1;
979 /* If we didn't get any memory, we need to sleep. */
980 if (skb == NULL) {
981 if (nonblock /* || copied */) {
982 release_sock(sk);
983 DPRINTF((DBG_TCP, "tcp_write: return 4\n"));
984 if (copied) return(copied);
985 return(-EAGAIN);
988 /* FIXME: here is another race condition. */
989 tmp = sk->wmem_alloc;
990 release_sock(sk);
991 cli();
992 /* Again we will try to avoid it. */
/* only sleep if no write memory was freed in the meantime */
993 if (tmp <= sk->wmem_alloc &&
994 (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
995 && sk->err == 0) {
996 interruptible_sleep_on(sk->sleep);
997 if (current->signal & ~current->blocked) {
998 sti();
999 DPRINTF((DBG_TCP, "tcp_write: return 5\n"));
1000 if (copied) return(copied);
1001 return(-ERESTARTSYS);
1004 sk->inuse = 1;
1005 sti();
1006 continue;
1009 skb->len = 0;
1010 skb->sk = sk;
1011 skb->free = 0;
1013 buff = skb->data;
1016 * FIXME: we need to optimize this.
1017 * Perhaps some hints here would be good.
1019 tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
1020 IPPROTO_TCP, sk->opt, skb->mem_len,sk->ip_tos,sk->ip_ttl);
1021 if (tmp < 0 ) {
1022 prot->wfree(sk, skb->mem_addr, skb->mem_len);
1023 release_sock(sk);
1024 DPRINTF((DBG_TCP, "tcp_write: return 6\n"));
1025 if (copied) return(copied);
1026 return(tmp);
1028 skb->len += tmp;
1029 skb->dev = dev;
1030 buff += tmp;
1031 skb->h.th =(struct tcphdr *) buff;
/* len-copy == 0 on the final segment, which makes the header PSH */
1032 tmp = tcp_build_header((struct tcphdr *)buff, sk, len-copy);
1033 if (tmp < 0) {
1034 prot->wfree(sk, skb->mem_addr, skb->mem_len);
1035 release_sock(sk);
1036 DPRINTF((DBG_TCP, "tcp_write: return 7\n"));
1037 if (copied) return(copied);
1038 return(tmp);
1041 if (flags & MSG_OOB) {
1042 ((struct tcphdr *)buff)->urg = 1;
1043 ((struct tcphdr *)buff)->urg_ptr = ntohs(copy);
1045 skb->len += tmp;
1046 memcpy_fromfs(buff+tmp, from, copy);
1048 from += copy;
1049 copied += copy;
1050 len -= copy;
1051 skb->len += copy;
1052 skb->free = 0;
1053 sk->send_seq += copy;
/* a sub-MSS segment with data in flight waits as a partial (Nagle) */
1055 if (send_tmp != NULL && sk->packets_out) {
1056 tcp_enqueue_partial(send_tmp, sk);
1057 continue;
1059 tcp_send_skb(sk, skb);
1061 sk->err = 0;
1064 * Nagles rule. Turn Nagle off with TCP_NODELAY for highly
1065 * interactive fast network servers. It's meant to be on and
1066 * it really improves the throughput though not the echo time
1067 * on my slow slip link - Alan
1070 /* Avoid possible race on send_tmp - c/o Johannes Stille */
1071 if(sk->partial &&
1072 ((!sk->packets_out)
1073 /* If not nagling we can send on the before case too.. */
1074 || (sk->nonagle && before(sk->send_seq , sk->window_seq))
1076 tcp_send_partial(sk);
1077 /* -- */
1078 release_sock(sk);
1079 DPRINTF((DBG_TCP, "tcp_write: return 8\n"));
1080 return(copied);
1084 static int
1085 tcp_sendto(struct sock *sk, unsigned char *from,
1086 int len, int nonblock, unsigned flags,
1087 struct sockaddr_in *addr, int addr_len)
1089 struct sockaddr_in sin;
1091 if (addr_len < sizeof(sin)) return(-EINVAL);
1092 memcpy_fromfs(&sin, addr, sizeof(sin));
1093 if (sin.sin_family && sin.sin_family != AF_INET) return(-EINVAL);
1094 if (sin.sin_port != sk->dummy_th.dest) return(-EINVAL);
1095 if (sin.sin_addr.s_addr != sk->daddr) return(-EINVAL);
1096 return(tcp_write(sk, from, len, nonblock, flags));
/*
 * Send a window-update ACK after a read has freed receive space, but
 * only when an ack is actually owed (ack_backlog != 0). Builds a bare
 * ACK from the socket's header template with a freshly selected
 * window; on buffer shortage it simply retries via a short write timer.
 * NOTE(review): seq/ack_seq use ntohl() where htonl() is meant —
 * identical on i386, confirm before porting.
 */
1100 static void
1101 tcp_read_wakeup(struct sock *sk)
1103 int tmp;
1104 struct device *dev = NULL;
1105 struct tcphdr *t1;
1106 struct sk_buff *buff;
1108 DPRINTF((DBG_TCP, "in tcp read wakeup\n"));
/* nothing owed: no ack needs to go out */
1109 if (!sk->ack_backlog) return;
1112 * FIXME: we need to put code here to prevent this routine from
1113 * being called. Being called once in a while is ok, so only check
1114 * if this is the second time in a row.
1118 * We need to grab some memory, and put together an ack,
1119 * and then put it into the queue to be sent.
1121 buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
1122 if (buff == NULL) {
1123 /* Try again real soon. */
1124 reset_timer(sk, TIME_WRITE, 10);
1125 return;
1128 buff->mem_addr = buff;
1129 buff->mem_len = MAX_ACK_SIZE;
1130 buff->len = sizeof(struct tcphdr);
1131 buff->sk = sk;
1133 /* Put in the IP header and routing stuff. */
1134 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
1135 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
1136 if (tmp < 0) {
1137 buff->free=1;
1138 sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
1139 return;
1142 buff->len += tmp;
1143 t1 =(struct tcphdr *)(buff->data +tmp);
/* start from the socket's template header, then set the ack fields */
1145 memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
1146 t1->seq = ntohl(sk->send_seq);
1147 t1->ack = 1;
1148 t1->res1 = 0;
1149 t1->res2 = 0;
1150 t1->rst = 0;
1151 t1->urg = 0;
1152 t1->syn = 0;
1153 t1->psh = 0;
1154 sk->ack_backlog = 0;
1155 sk->bytes_rcv = 0;
1156 sk->window = tcp_select_window(sk);/*sk->prot->rspace(sk);*/
1157 t1->window = ntohs(sk->window);
1158 t1->ack_seq = ntohl(sk->acked_seq);
1159 t1->doff = sizeof(*t1)/4;
1160 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
1161 sk->prot->queue_xmit(sk, dev, buff, 1);
/*
 * Free fully-consumed buffers on sk's receive queue and, if that
 * changed the available receive space, arrange for the peer to hear
 * about the bigger window: immediately via tcp_read_wakeup(), or via
 * a short ack timer.
 *
 * FIXME:
 * This routine frees used buffers.
 * It should consider sending an ACK to let the
 * other end know we now have a bigger window.
 */
static void
cleanup_rbuf(struct sock *sk)
{
  unsigned long flags;
  int left;
  struct sk_buff *skb;

  if(sk->debug)
	printk("cleaning rbuf for sk=%p\n", sk);

  save_flags(flags);
  cli();		/* the receive queue is also touched at interrupt time */

  left = sk->prot->rspace(sk);	/* space before freeing anything */

  /*
   * We have to loop through all the buffer headers,
   * and try to free up all the space we can.
   */
  while((skb=skb_peek(&sk->rqueue)) != NULL )
  {
	if (!skb->used)
		break;		/* first unused buffer ends the freeable run */
	skb_unlink(skb);
	skb->sk = sk;		/* make sure the space is credited back to sk */
	kfree_skb(skb, FREE_READ);
  }

  restore_flags(flags);

  /*
   * FIXME:
   * At this point we should send an ack if the difference
   * in the window, and the amount of space is bigger than
   * TCP_WINDOW_DIFF.
   */
  DPRINTF((DBG_TCP, "sk->window left = %d, sk->prot->rspace(sk)=%d\n",
	   sk->window - sk->bytes_rcv, sk->prot->rspace(sk)));

  if(sk->debug)
	printk("sk->rspace = %lu, was %d\n", sk->prot->rspace(sk),
	       left);
  if (sk->prot->rspace(sk) != left)
  {
	/*
	 * This area has caused the most trouble. The current strategy
	 * is to simply do nothing if the other end has room to send at
	 * least 3 full packets, because the ack from those will auto-
	 * matically update the window. If the other end doesn't think
	 * we have much space left, but we have room for atleast 1 more
	 * complete packet than it thinks we do, we will send an ack
	 * immediatedly. Otherwise we will wait up to .5 seconds in case
	 * the user reads some more.
	 */
	sk->ack_backlog++;
	/*
	 * It's unclear whether to use sk->mtu or sk->mss here. They differ only
	 * if the other end is offering a window smaller than the agreed on MSS
	 * (called sk->mtu here). In theory there's no connection between send
	 * and receive, and so no reason to think that they're going to send
	 * small packets. For the moment I'm using the hack of reducing the mss
	 * only on the send side, so I'm putting mtu here.
	 */
	if ((sk->prot->rspace(sk) > (sk->window - sk->bytes_rcv + sk->mtu))) {
		/* Send an ack right now. */
		tcp_read_wakeup(sk);
	} else {
		/* Force it to send an ack soon. */
		int was_active = del_timer(&sk->timer);
		if (!was_active || TCP_ACK_TIME < sk->timer.expires) {
			reset_timer(sk, TIME_WRITE, TCP_ACK_TIME);
		} else
			add_timer(&sk->timer);	/* keep the earlier expiry */
	}
  }
}
/*
 * Handle reading urgent data (MSG_OOB reads).  Waits (unless nonblock)
 * for a segment carrying unread urgent data, then copies up to the
 * urgent pointer's worth of bytes to the user and marks it consumed
 * (unless MSG_PEEK).
 */
static int
tcp_read_urg(struct sock * sk, int nonblock,
	     unsigned char *to, int len, unsigned flags)
{
  int copied = 0;
  struct sk_buff *skb;

  DPRINTF((DBG_TCP, "tcp_read_urg(sk=%X, to=%X, len=%d, flags=%X)\n",
	   sk, to, len, flags));

  while(len > 0)
  {
	sk->inuse = 1;		/* lock the socket against other users */

	/* Wait until urgent data is available, or we cannot wait. */
	while(sk->urg==0 || skb_peek(&sk->rqueue) == NULL) {
		if (sk->err) {		/* pending error: report it once */
			int tmp;

			release_sock(sk);
			if (copied) return(copied);
			tmp = -sk->err;
			sk->err = 0;
			return(tmp);
		}

		if (sk->state == TCP_CLOSE || sk->done) {
			release_sock(sk);
			if (copied) return(copied);
			if (!sk->done) {
				sk->done = 1;	/* first read after close: EOF */
				return(0);
			}
			return(-ENOTCONN);
		}

		if (sk->shutdown & RCV_SHUTDOWN) {
			release_sock(sk);
			if (copied == 0)
				sk->done = 1;
			return(copied);
		}

		if (nonblock || copied) {
			release_sock(sk);
			if (copied) return(copied);
			return(-EAGAIN);
		}

		/* Now at this point, we may have gotten some data. */
		release_sock(sk);
		cli();		/* recheck with interrupts off before sleeping */
		if ((sk->urg == 0 || skb_peek(&sk->rqueue) == NULL) &&
		    sk->err == 0 && !(sk->shutdown & RCV_SHUTDOWN)) {
			interruptible_sleep_on(sk->sleep);
			if (current->signal & ~current->blocked) {
				sti();
				if (copied) return(copied);
				return(-ERESTARTSYS);
			}
		}
		sk->inuse = 1;
		sti();
	}

	/* Walk the receive queue looking for unread urgent data. */
	skb = skb_peek(&sk->rqueue);
	do {
		int amt;

		if (skb->h.th->urg && !skb->urg_used) {
			if (skb->h.th->urg_ptr == 0) {
				skb->h.th->urg_ptr = ntohs(skb->len);
			}
			amt = min(ntohs(skb->h.th->urg_ptr),len);
			if(amt)
			{
				/* Urgent data sits right after the TCP header. */
				memcpy_tofs(to,(unsigned char *)(skb->h.th) +
					skb->h.th->doff*4, amt);
			}

			if (!(flags & MSG_PEEK)) {
				skb->urg_used = 1;	/* consume unless peeking */
				sk->urg--;
			}
			release_sock(sk);
			copied += amt;
			return(copied);
		}
		skb =(struct sk_buff *)skb->next;
	} while(skb != sk->rqueue);	/* queue links are circular */
  }
  /*sk->urg = 0;*/
  release_sock(sk);
  return(0);
}
/*
 * This routine copies from a sock struct into the user buffer.
 *
 * Returns the number of bytes copied, 0 at EOF (reported once via
 * sk->done), or a negative errno.  Blocks unless nonblock is set or
 * some data has already been copied.  MSG_OOB is diverted to
 * tcp_read_urg(); MSG_PEEK copies without consuming.
 */
static int
tcp_read(struct sock *sk, unsigned char *to,
	 int len, int nonblock, unsigned flags)
{
  int copied=0; /* will be used to say how much has been copied. */
  struct sk_buff *skb;
  unsigned long offset;
  unsigned long used;
  int err;

  if (len == 0) return(0);
  if (len < 0) {
	return(-EINVAL);
  }

  /* Fail before consuming anything if the user buffer is bad. */
  err=verify_area(VERIFY_WRITE,to,len);
  if(err)
	return err;

  /* This error should be checked. */
  if (sk->state == TCP_LISTEN) return(-ENOTCONN);

  /* Urgent data needs to be handled specially. */
  if ((flags & MSG_OOB))
	return(tcp_read_urg(sk, nonblock, to, len, flags));

  /* So no-one else will use this socket. */
  sk->inuse = 1;

  skb=skb_peek(&sk->rqueue);

  DPRINTF((DBG_TCP, "tcp_read(sk=%X, to=%X, len=%d, nonblock=%d, flags=%X)\n",
	   sk, to, len, nonblock, flags));

  while(len > 0) {
	/* skb->used just checks to see if we've gone all the way around. */

	/* While no data, or first data indicates some is missing, or data is used */
	while(skb == NULL ||
	      before(sk->copied_seq+1, skb->h.th->seq) || skb->used) {
		DPRINTF((DBG_TCP, "skb = %X:\n", skb));
		cleanup_rbuf(sk);	/* free consumed buffers; may send an ack */
		if (sk->err)
		{
			int tmp;

			release_sock(sk);
			if (copied)
			{
				DPRINTF((DBG_TCP, "tcp_read: returning %d\n",
					 copied));
				return(copied);
			}
			tmp = -sk->err;
			sk->err = 0;
			return(tmp);
		}

		if (sk->state == TCP_CLOSE)
		{
			release_sock(sk);
			if (copied) {
				DPRINTF((DBG_TCP, "tcp_read: returning %d\n",
					 copied));
				return(copied);
			}
			if (!sk->done) {
				sk->done = 1;	/* report EOF exactly once */
				return(0);
			}
			return(-ENOTCONN);
		}

		if (sk->shutdown & RCV_SHUTDOWN)
		{
			release_sock(sk);
			if (copied == 0) sk->done = 1;
			DPRINTF((DBG_TCP, "tcp_read: returning %d\n", copied));
			return(copied);
		}

		if (nonblock || copied)
		{
			release_sock(sk);
			if(sk->debug)
				printk("read: EAGAIN\n");
			if (copied)
			{
				DPRINTF((DBG_TCP, "tcp_read: returning %d\n",
					 copied));
				return(copied);
			}
			return(-EAGAIN);
		}

		if ((flags & MSG_PEEK) && copied != 0)
		{
			release_sock(sk);
			DPRINTF((DBG_TCP, "tcp_read: returning %d\n", copied));
			return(copied);
		}

		DPRINTF((DBG_TCP, "tcp_read about to sleep. state = %d\n",
			 sk->state));
		release_sock(sk);

		/*
		 * Now we may have some data waiting or we could
		 * have changed state.
		 */
		cli();		/* recheck with interrupts off before sleeping */
		if (sk->shutdown & RCV_SHUTDOWN || sk->err != 0) {
			sk->inuse = 1;
			sti();
			continue;	/* go round and report it above */
		}

		if (skb_peek(&sk->rqueue) == NULL ||
		    before(sk->copied_seq+1, sk->rqueue->h.th->seq)) {
			if(sk->debug)
				printk("Read wait sleep\n");
			interruptible_sleep_on(sk->sleep);
			if(sk->debug)
				printk("Read wait wakes\n");
			if (current->signal & ~current->blocked) {
				sti();
				if (copied) {
					DPRINTF((DBG_TCP, "tcp_read: returning %d\n",
						 copied));
					return(copied);
				}
				return(-ERESTARTSYS);
			}
		}
		sk->inuse = 1;
		sti();
		DPRINTF((DBG_TCP, "tcp_read woke up. \n"));

		skb=skb_peek(&sk->rqueue);
		/* That may have been null if we were beaten, if so we loop again */
	}

	/*
	 * Copy anything from the current block that needs
	 * to go into the user buffer.
	 */
	offset = sk->copied_seq+1 - skb->h.th->seq;

	if (skb->h.th->syn) offset--;	/* the syn occupies a sequence number */
	if (offset < skb->len) /* Some of the packet is useful */
	{
		/*
		 * If there is urgent data we must either
		 * return or skip over it.
		 */
		if (skb->h.th->urg)
		{
			if (skb->urg_used)
			{
				/* Already consumed via MSG_OOB: skip past it. */
				sk->copied_seq += ntohs(skb->h.th->urg_ptr);
				offset += ntohs(skb->h.th->urg_ptr);
				if (offset >= skb->len)
				{
					skb->used = 1;
					skb =(struct sk_buff *)skb->next;
					continue;
				}
			}
			else
			{
				/* Unread urgent data: tell the caller. */
				release_sock(sk);
				if (copied)
					return(copied);
				send_sig(SIGURG, current, 0);
				return(-EINTR);
			}
		}

		/* Ok so how much can we use ? */
		used = min(skb->len - offset, len);
		/* Copy it */
		memcpy_tofs(to,((unsigned char *)skb->h.th) +
			    skb->h.th->doff*4 + offset, used);
		copied += used;
		len -= used;
		to += used;

		/* If we were reading the data is 'eaten' */
		if (!(flags & MSG_PEEK))
			sk->copied_seq += used;

		/*
		 * Mark this data used if we are really reading it,
		 * and if it doesn't contain any urgent data. And we
		 * have used all the data.
		 */
		if (!(flags & MSG_PEEK) &&
		    (!skb->h.th->urg || skb->urg_used) &&
		    (used + offset >= skb->len))
			skb->used = 1;

		/*
		 * See if this is the end of a message or if the
		 * remaining data is urgent.
		 */
		if (/*skb->h.th->psh || */skb->h.th->urg)
		{
			break;
		}
	}
	else
	{ /* already used this data, must be a retransmit */
		skb->used = 1;
	}

	/* Move along a packet */
	skb =(struct sk_buff *)skb->next;
  }
  /* Clean up data we have read: This will do ACK frames */
  cleanup_rbuf(sk);
  release_sock(sk);
  DPRINTF((DBG_TCP, "tcp_read: returning %d\n", copied));
  if (copied == 0 && nonblock)
	return(-EAGAIN);
  return(copied);
}
/*
 * Send a FIN without closing the connection.
 * Not called at interrupt time.
 */
void
tcp_shutdown(struct sock *sk, int how)
{
  struct sk_buff *buff;
  struct tcphdr *t1, *th;
  struct proto *prot;
  int tmp;
  struct device *dev = NULL;

  /*
   * We need to grab some memory, and put together a FIN,
   * and then put it into the queue to be sent.
   * FIXME:
   * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
   * Most of this is guesswork, so maybe it will work...
   */
  /* If we've already sent a FIN, return. */
  if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2) return;
  if (!(how & SEND_SHUTDOWN)) return;
  sk->inuse = 1;

  /* Clear out any half completed packets. */
  if (sk->partial)
	tcp_send_partial(sk);

  prot =(struct proto *)sk->prot;
  th =(struct tcphdr *)&sk->dummy_th;
  release_sock(sk); /* incase the malloc sleeps. */
  buff = prot->wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
  if (buff == NULL) return;
  sk->inuse = 1;	/* re-lock after the possibly-blocking allocation */

  DPRINTF((DBG_TCP, "tcp_shutdown_send buff = %X\n", buff));
  buff->mem_addr = buff;
  buff->mem_len = MAX_RESET_SIZE;
  buff->sk = sk;
  buff->len = sizeof(*t1);
  t1 =(struct tcphdr *) buff->data;

  /* Put in the IP header and routing stuff. */
  tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
			   IPPROTO_TCP, sk->opt,
			   sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl);
  if (tmp < 0) {
	buff->free=1;
	prot->wfree(sk,buff->mem_addr, buff->mem_len);
	release_sock(sk);
	DPRINTF((DBG_TCP, "Unable to build header for fin.\n"));
	return;
  }

  t1 =(struct tcphdr *)((char *)t1 +tmp);
  buff->len += tmp;
  buff->dev = dev;
  memcpy(t1, th, sizeof(*t1));
  t1->seq = ntohl(sk->send_seq);
  sk->send_seq++;	/* the FIN consumes one sequence number */
  buff->h.seq = sk->send_seq;
  t1->ack = 1;
  t1->ack_seq = ntohl(sk->acked_seq);
  t1->window = ntohs(sk->window=tcp_select_window(sk)/*sk->prot->rspace(sk)*/);
  t1->fin = 1;
  t1->rst = 0;
  t1->doff = sizeof(*t1)/4;
  tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);

  /*
   * Can't just queue this up.
   * It should go at the end of the write queue.
   */
  if (sk->wback != NULL) {
	/* Pending data: the FIN must follow it on the write queue. */
	buff->free=0;
	buff->next = NULL;
	sk->wback->next = buff;
	sk->wback = buff;
	buff->magic = TCP_WRITE_QUEUE_MAGIC;
  } else {
	sk->prot->queue_xmit(sk, dev, buff, 0);
  }

  if (sk->state == TCP_ESTABLISHED) sk->state = TCP_FIN_WAIT1;
  else sk->state = TCP_FIN_WAIT2;

  release_sock(sk);
}
1664 static int
1665 tcp_recvfrom(struct sock *sk, unsigned char *to,
1666 int to_len, int nonblock, unsigned flags,
1667 struct sockaddr_in *addr, int *addr_len)
1669 struct sockaddr_in sin;
1670 int len;
1671 int err;
1672 int result;
1674 /* Have to check these first unlike the old code. If
1675 we check them after we lose data on an error
1676 which is wrong */
1677 err = verify_area(VERIFY_WRITE,addr_len,sizeof(long));
1678 if(err)
1679 return err;
1680 len = get_fs_long(addr_len);
1681 if(len > sizeof(sin))
1682 len = sizeof(sin);
1683 err=verify_area(VERIFY_WRITE, addr, len);
1684 if(err)
1685 return err;
1687 result=tcp_read(sk, to, to_len, nonblock, flags);
1689 if (result < 0) return(result);
1691 sin.sin_family = AF_INET;
1692 sin.sin_port = sk->dummy_th.dest;
1693 sin.sin_addr.s_addr = sk->daddr;
1695 memcpy_tofs(addr, &sin, len);
1696 put_fs_long(len, addr_len);
1697 return(result);
/* This routine will send an RST to the other tcp. */
static void
tcp_reset(unsigned long saddr, unsigned long daddr, struct tcphdr *th,
	  struct proto *prot, struct options *opt, struct device *dev, int tos, int ttl)
{
  struct sk_buff *buff;
  struct tcphdr *t1;
  int tmp;

  /*
   * We need to grab some memory, and put together an RST,
   * and then put it into the queue to be sent.
   */
  buff = prot->wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
  if (buff == NULL)
	return;		/* out of memory: silently drop the reset */

  DPRINTF((DBG_TCP, "tcp_reset buff = %X\n", buff));
  buff->mem_addr = buff;
  buff->mem_len = MAX_RESET_SIZE;
  buff->len = sizeof(*t1);
  buff->sk = NULL;	/* this frame belongs to no socket */
  buff->dev = dev;

  t1 =(struct tcphdr *) buff->data;

  /* Put in the IP header and routing stuff. */
  tmp = prot->build_header(buff, saddr, daddr, &dev, IPPROTO_TCP, opt,
			   sizeof(struct tcphdr),tos,ttl);
  if (tmp < 0) {
	buff->free = 1;
	prot->wfree(NULL, buff->mem_addr, buff->mem_len);
	return;
  }

  t1 =(struct tcphdr *)((char *)t1 +tmp);
  buff->len += tmp;
  memcpy(t1, th, sizeof(*t1));

  /* Swap the send and the receive. */
  t1->dest = th->source;
  t1->source = th->dest;
  t1->rst = 1;
  t1->window = 0;

  /*
   * RFC 793 reset rules: if the offending segment carried an ack, the
   * reset takes its sequence from that ack and carries no ack of its
   * own; otherwise seq is 0 and we ack the segment (a syn counts as
   * one octet of sequence space).
   */
  if(th->ack)
  {
	t1->ack=0;
	t1->seq=th->ack_seq;
	t1->ack_seq=0;
  }
  else
  {
	t1->ack=1;
	if(!th->syn)
		t1->ack_seq=htonl(th->seq);
	else
		t1->ack_seq=htonl(th->seq+1);
	t1->seq=0;
  }

  t1->syn = 0;
  t1->urg = 0;
  t1->fin = 0;
  t1->psh = 0;
  t1->doff = sizeof(*t1)/4;
  tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
  prot->queue_xmit(NULL, dev, buff, 1);
}
1772 * Look for tcp options. Parses everything but only knows about MSS.
1773 * This routine is always called with the packet containing the SYN.
1774 * However it may also be called with the ack to the SYN. So you
1775 * can't assume this is always the SYN. It's always called after
1776 * we have set up sk->mtu to our own MTU.
1779 static void
1780 tcp_options(struct sock *sk, struct tcphdr *th)
1782 unsigned char *ptr;
1783 int length=(th->doff*4)-sizeof(struct tcphdr);
1784 int mss_seen = 0;
1786 ptr = (unsigned char *)(th + 1);
1788 while(length>0)
1790 int opcode=*ptr++;
1791 int opsize=*ptr++;
1792 switch(opcode)
1794 case TCPOPT_EOL:
1795 return;
1796 case TCPOPT_NOP:
1797 length-=2;
1798 continue;
1800 default:
1801 if(opsize<=2) /* Avoid silly options looping forever */
1802 return;
1803 switch(opcode)
1805 case TCPOPT_MSS:
1806 if(opsize==4 && th->syn)
1808 sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr));
1809 mss_seen = 1;
1811 break;
1812 /* Add other options here as people feel the urge to implement stuff like large windows */
1814 ptr+=opsize-2;
1815 length-=opsize;
1818 if (th->syn) {
1819 if (! mss_seen)
1820 sk->mtu=min(sk->mtu, 536); /* default MSS if none sent */
1822 sk->mss = min(sk->max_window, sk->mtu);
/*
 * Classful default netmask for an IPv4 address.
 * Argument and result are both in network byte order.
 */
static inline unsigned long default_mask(unsigned long dst)
{
  unsigned long host = ntohl(dst);

  if (IN_CLASSA(host))
	return htonl(IN_CLASSA_NET);
  return IN_CLASSB(host) ? htonl(IN_CLASSB_NET) : htonl(IN_CLASSC_NET);
}
/*
 * This routine handles a connection request.
 * It should make sure we haven't already responded.
 * Because of the way BSD works, we have to send a syn/ack now.
 * This also means it will be harder to close a socket which is
 * listening.
 */
static void
tcp_conn_request(struct sock *sk, struct sk_buff *skb,
		 unsigned long daddr, unsigned long saddr,
		 struct options *opt, struct device *dev)
{
  struct sk_buff *buff;
  struct tcphdr *t1;
  unsigned char *ptr;
  struct sock *newsk;
  struct tcphdr *th;
  int tmp;

  DPRINTF((DBG_TCP, "tcp_conn_request(sk = %X, skb = %X, daddr = %X, sadd4= %X, \n"
	   "                  opt = %X, dev = %X)\n",
	   sk, skb, daddr, saddr, opt, dev));

  th = skb->h.th;

  /* If the socket is dead, don't accept the connection. */
  if (!sk->dead) {
	sk->data_ready(sk,0);	/* wake the listener */
  } else {
	DPRINTF((DBG_TCP, "tcp_conn_request on dead socket\n"));
	tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
	kfree_skb(skb, FREE_READ);
	return;
  }

  /*
   * Make sure we can accept more. This will prevent a
   * flurry of syns from eating up all our memory.
   */
  if (sk->ack_backlog >= sk->max_ack_backlog) {
	kfree_skb(skb, FREE_READ);
	return;
  }

  /*
   * We need to build a new sock struct.
   * It is sort of bad to have a socket without an inode attached
   * to it, but the wake_up's will just wake up the listening socket,
   * and if the listening socket is destroyed before this is taken
   * off of the queue, this will take care of it.
   */
  newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
  if (newsk == NULL) {
	/* just ignore the syn. It will get retransmitted. */
	kfree_skb(skb, FREE_READ);
	return;
  }

  DPRINTF((DBG_TCP, "newsk = %X\n", newsk));
  /* Clone the listening socket, then reset all per-connection state. */
  memcpy((void *)newsk,(void *)sk, sizeof(*newsk));
  newsk->wback = NULL;
  newsk->wfront = NULL;
  newsk->rqueue = NULL;
  newsk->send_head = NULL;
  newsk->send_tail = NULL;
  newsk->back_log = NULL;
  newsk->rtt = TCP_CONNECT_TIME << 3;
  newsk->rto = TCP_CONNECT_TIME;
  newsk->mdev = 0;
  newsk->max_window = 0;
  newsk->cong_window = 1;
  newsk->cong_count = 0;
  newsk->ssthresh = 0;
  newsk->backoff = 0;
  newsk->blog = 0;
  newsk->intr = 0;
  newsk->proc = 0;
  newsk->done = 0;
  newsk->partial = NULL;
  newsk->pair = NULL;
  newsk->wmem_alloc = 0;
  newsk->rmem_alloc = 0;

  newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;

  newsk->err = 0;
  newsk->shutdown = 0;
  newsk->ack_backlog = 0;
  newsk->acked_seq = skb->h.th->seq+1;
  newsk->fin_seq = skb->h.th->seq;
  newsk->copied_seq = skb->h.th->seq;
  newsk->state = TCP_SYN_RECV;
  newsk->timeout = 0;
  newsk->send_seq = jiffies * SEQ_TICK - seq_offset;	/* our initial sequence number */
  newsk->window_seq = newsk->send_seq;
  newsk->rcv_ack_seq = newsk->send_seq;
  newsk->urg =0;
  newsk->retransmits = 0;
  newsk->destroy = 0;
  newsk->timer.data = (unsigned long)newsk;
  newsk->timer.function = &net_timer;
  newsk->dummy_th.source = skb->h.th->dest;
  newsk->dummy_th.dest = skb->h.th->source;

  /* Swap these two, they are from our point of view. */
  newsk->daddr = saddr;
  newsk->saddr = daddr;

  put_sock(newsk->num,newsk);
  newsk->dummy_th.res1 = 0;
  newsk->dummy_th.doff = 6;
  newsk->dummy_th.fin = 0;
  newsk->dummy_th.syn = 0;
  newsk->dummy_th.rst = 0;
  newsk->dummy_th.psh = 0;
  newsk->dummy_th.ack = 0;
  newsk->dummy_th.urg = 0;
  newsk->dummy_th.res2 = 0;
  newsk->acked_seq = skb->h.th->seq + 1;
  newsk->copied_seq = skb->h.th->seq;

  /* Grab the ttl and tos values and use them */
  newsk->ip_ttl=sk->ip_ttl;
  newsk->ip_tos=skb->ip_hdr->tos;

  /* use 512 or whatever user asked for */
  /* note use of sk->user_mss, since user has no direct access to newsk */
  if (sk->user_mss)
	newsk->mtu = sk->user_mss;
  else {
#ifdef SUBNETSARELOCAL
	if ((saddr ^ daddr) & default_mask(saddr))
#else
	if ((saddr ^ daddr) & dev->pa_mask)
#endif
		newsk->mtu = 576 - HEADER_SIZE;	/* off-net destination: be conservative */
	else
		newsk->mtu = MAX_WINDOW;
  }

  /* but not bigger than device MTU */
  newsk->mtu = min(newsk->mtu, dev->mtu - HEADER_SIZE);

  /* this will min with what arrived in the packet */
  tcp_options(newsk,skb->h.th);

  buff = newsk->prot->wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
  if (buff == NULL) {
	sk->err = -ENOMEM;
	newsk->dead = 1;
	release_sock(newsk);
	kfree_skb(skb, FREE_READ);
	return;
  }

  buff->mem_addr = buff;
  buff->mem_len = MAX_SYN_SIZE;
  buff->len = sizeof(struct tcphdr)+4;	/* header plus the 4-byte MSS option */
  buff->sk = newsk;

  t1 =(struct tcphdr *) buff->data;

  /* Put in the IP header and routing stuff. */
  tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &dev,
			       IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);

  /* Something went wrong. */
  if (tmp < 0) {
	sk->err = tmp;
	buff->free=1;
	kfree_skb(buff,FREE_WRITE);
	newsk->dead = 1;
	release_sock(newsk);
	skb->sk = sk;
	kfree_skb(skb, FREE_READ);
	return;
  }

  buff->len += tmp;
  t1 =(struct tcphdr *)((char *)t1 +tmp);

  memcpy(t1, skb->h.th, sizeof(*t1));
  buff->h.seq = newsk->send_seq;

  /* Swap the send and the receive. */
  t1->dest = skb->h.th->source;
  t1->source = newsk->dummy_th.source;
  t1->seq = ntohl(newsk->send_seq++);
  t1->ack = 1;
  newsk->window = tcp_select_window(newsk);/*newsk->prot->rspace(newsk);*/
  t1->window = ntohs(newsk->window);
  t1->res1 = 0;
  t1->res2 = 0;
  t1->rst = 0;
  t1->urg = 0;
  t1->psh = 0;
  t1->syn = 1;
  t1->ack_seq = ntohl(skb->h.th->seq+1);
  t1->doff = sizeof(*t1)/4+1;	/* one extra 32-bit word for the option */

  /* MSS option: kind 2, length 4, value newsk->mtu (big-endian). */
  ptr =(unsigned char *)(t1+1);
  ptr[0] = 2;
  ptr[1] = 4;
  ptr[2] = ((newsk->mtu) >> 8) & 0xff;
  ptr[3] =(newsk->mtu) & 0xff;

  tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
  newsk->prot->queue_xmit(newsk, dev, buff, 0);

  reset_timer(newsk, TIME_WRITE /* -1 ? FIXME ??? */, TCP_CONNECT_TIME);
  skb->sk = newsk;

  /* Charge the sock_buff to newsk. */
  sk->rmem_alloc -= skb->mem_len;
  newsk->rmem_alloc += skb->mem_len;

  skb_queue_tail(&sk->rqueue,skb);
  sk->ack_backlog++;
  release_sock(newsk);
}
/*
 * Close a TCP socket.  Flushes any unread receive data (noting whether
 * data is being thrown away so the FIN can carry a reset), then acts
 * according to the connection state: start closedown timers, or build
 * and queue a FIN segment.
 */
static void
tcp_close(struct sock *sk, int timeout)
{
  struct sk_buff *buff;
  int need_reset = 0;
  struct tcphdr *t1, *th;
  struct proto *prot;
  struct device *dev=NULL;
  int tmp;

  /*
   * We need to grab some memory, and put together a FIN,
   * and then put it into the queue to be sent.
   */
  DPRINTF((DBG_TCP, "tcp_close((struct sock *)%X, %d)\n",sk, timeout));
  sk->inuse = 1;
  sk->keepopen = 1;
  sk->shutdown = SHUTDOWN_MASK;

  if (!sk->dead)
	sk->state_change(sk);

  /* We need to flush the recv. buffs. */
  if (skb_peek(&sk->rqueue) != NULL)
  {
	struct sk_buff *skb;

	if(sk->debug)
		printk("Clean rcv queue\n");
	while((skb=skb_dequeue(&sk->rqueue))!=NULL)
	{
		/* Discarding data the user never read: note that a
		   reset is called for. */
		if(skb->len > 0 && after(skb->h.th->seq + skb->len + 1 , sk->copied_seq))
			need_reset = 1;
		kfree_skb(skb, FREE_READ);
	}
	if(sk->debug)
		printk("Cleaned.\n");
  }
  sk->rqueue = NULL;

  /* Get rid off any half-completed packets. */
  if (sk->partial) {
	tcp_send_partial(sk);
  }

  switch(sk->state) {
	case TCP_FIN_WAIT1:
	case TCP_FIN_WAIT2:
	case TCP_LAST_ACK:
		/* start a timer. */
		/* original code was 4 * sk->rtt. In converting to the
		 * new rtt representation, we can't quite use that.
		 * it seems to make most sense to use the backed off value
		 */
		reset_timer(sk, TIME_CLOSE, 4 * sk->rto);
		if (timeout) tcp_time_wait(sk);
		release_sock(sk);
		return;	/* break causes a double release - messy */
	case TCP_TIME_WAIT:
		if (timeout) {
			sk->state = TCP_CLOSE;
		}
		release_sock(sk);
		return;
	case TCP_LISTEN:
		sk->state = TCP_CLOSE;
		release_sock(sk);
		return;
	case TCP_CLOSE:
		release_sock(sk);
		return;
	case TCP_CLOSE_WAIT:
	case TCP_ESTABLISHED:
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		prot =(struct proto *)sk->prot;
		th =(struct tcphdr *)&sk->dummy_th;
		buff = prot->wmalloc(sk, MAX_FIN_SIZE, 1, GFP_ATOMIC);
		if (buff == NULL) {
			/* This will force it to try again later. */
			/* Or it would have if someone released the socket
			   first. Anyway it might work now */
			release_sock(sk);
			if (sk->state != TCP_CLOSE_WAIT)
				sk->state = TCP_ESTABLISHED;
			reset_timer(sk, TIME_CLOSE, 100);
			return;
		}
		buff->mem_addr = buff;
		buff->mem_len = MAX_FIN_SIZE;
		buff->sk = sk;
		buff->free = 1;
		buff->len = sizeof(*t1);
		t1 =(struct tcphdr *) buff->data;

		/* Put in the IP header and routing stuff. */
		tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
					 IPPROTO_TCP, sk->opt,
					 sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl);
		if (tmp < 0) {
			kfree_skb(buff,FREE_WRITE);
			DPRINTF((DBG_TCP, "Unable to build header for fin.\n"));
			release_sock(sk);
			return;
		}

		t1 =(struct tcphdr *)((char *)t1 +tmp);
		buff->len += tmp;
		buff->dev = dev;
		memcpy(t1, th, sizeof(*t1));
		t1->seq = ntohl(sk->send_seq);
		sk->send_seq++;	/* the FIN consumes one sequence number */
		buff->h.seq = sk->send_seq;
		t1->ack = 1;

		/* Ack everything immediately from now on. */
		sk->delay_acks = 0;
		t1->ack_seq = ntohl(sk->acked_seq);
		t1->window = ntohs(sk->window=tcp_select_window(sk)/*sk->prot->rspace(sk)*/);
		t1->fin = 1;
		t1->rst = need_reset;	/* set if unread data was thrown away */
		t1->doff = sizeof(*t1)/4;
		tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);

		if (sk->wfront == NULL) {
			prot->queue_xmit(sk, dev, buff, 0);
		} else {
			/* Data still queued: the FIN must go after it. */
			reset_timer(sk, TIME_WRITE, sk->rto);
			buff->next = NULL;
			if (sk->wback == NULL) {
				sk->wfront = buff;
			} else {
				sk->wback->next = buff;
			}
			sk->wback = buff;
			buff->magic = TCP_WRITE_QUEUE_MAGIC;
		}

		if (sk->state == TCP_CLOSE_WAIT) {
			sk->state = TCP_FIN_WAIT2;
		} else {
			sk->state = TCP_FIN_WAIT1;
		}
		release_sock(sk);
  }
}
/*
 * This routine takes stuff off of the write queue,
 * and puts it in the xmit queue.  Packets are moved only while they
 * fit inside both the peer's advertised window and our congestion
 * window; while retransmitting, only packets already inside the acked
 * range are allowed out.
 */
static void
tcp_write_xmit(struct sock *sk)
{
  struct sk_buff *skb;

  DPRINTF((DBG_TCP, "tcp_write_xmit(sk=%X)\n", sk));

  /* The bytes will have to remain here. In time closedown will
     empty the write queue and all will be happy */
  if(sk->zapped)
	return;

  while(sk->wfront != NULL &&
	before(sk->wfront->h.seq, sk->window_seq +1) &&
	(sk->retransmits == 0 ||
	 sk->timeout != TIME_WRITE ||
	 before(sk->wfront->h.seq, sk->rcv_ack_seq +1))
	&& sk->packets_out < sk->cong_window) {
	skb = sk->wfront;
	IS_SKB(skb);
	sk->wfront = skb->next;
	if (sk->wfront == NULL) sk->wback = NULL;
	skb->next = NULL;
	if (skb->magic != TCP_WRITE_QUEUE_MAGIC) {
		/* Queue corruption detected: drop the whole queue
		   rather than transmit garbage. */
		printk("tcp.c skb with bad magic(%X) on write queue. Squashing "
		       "queue\n", skb->magic);
		sk->wfront = NULL;
		sk->wback = NULL;
		return;
	}
	skb->magic = 0;
	DPRINTF((DBG_TCP, "Sending a packet.\n"));

	/* See if we really need to send the packet. */
	if (before(skb->h.seq, sk->rcv_ack_seq +1)) {
		/* Already acknowledged: just free it. */
		sk->retransmits = 0;
		kfree_skb(skb, FREE_WRITE);
		if (!sk->dead) sk->write_space(sk);
	} else {
		sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);
	}
  }
}
2253 * This routine sorts the send list, and resets the
2254 * sk->send_head and sk->send_tail pointers.
2256 void
2257 sort_send(struct sock *sk)
2259 struct sk_buff *list = NULL;
2260 struct sk_buff *skb,*skb2,*skb3;
2262 for (skb = sk->send_head; skb != NULL; skb = skb2) {
2263 skb2 = (struct sk_buff *)skb->link3;
2264 if (list == NULL || before (skb2->h.seq, list->h.seq)) {
2265 skb->link3 = list;
2266 sk->send_tail = skb;
2267 list = skb;
2268 } else {
2269 for (skb3 = list; ; skb3 = (struct sk_buff *)skb3->link3) {
2270 if (skb3->link3 == NULL ||
2271 before(skb->h.seq, skb3->link3->h.seq)) {
2272 skb->link3 = skb3->link3;
2273 skb3->link3 = skb;
2274 if (skb->link3 == NULL) sk->send_tail = skb;
2275 break;
2280 sk->send_head = list;
2284 /* This routine deals with incoming acks, but not outgoing ones. */
2285 static int
2286 tcp_ack(struct sock *sk, struct tcphdr *th, unsigned long saddr, int len)
2288 unsigned long ack;
2289 int flag = 0;
2291 * 1 - there was data in packet as well as ack or new data is sent or
2292 * in shutdown state
2293 * 2 - data from retransmit queue was acked and removed
2294 * 4 - window shrunk or data from retransmit queue was acked and removed
2297 if(sk->zapped)
2298 return(1); /* Dead, cant ack any more so why bother */
2300 ack = ntohl(th->ack_seq);
2301 DPRINTF((DBG_TCP, "tcp_ack ack=%d, window=%d, "
2302 "sk->rcv_ack_seq=%d, sk->window_seq = %d\n",
2303 ack, ntohs(th->window), sk->rcv_ack_seq, sk->window_seq));
2305 if (ntohs(th->window) > sk->max_window) {
2306 sk->max_window = ntohs(th->window);
2307 sk->mss = min(sk->max_window, sk->mtu);
2310 if (sk->retransmits && sk->timeout == TIME_KEEPOPEN)
2311 sk->retransmits = 0;
2313 if (after(ack, sk->send_seq+1) || before(ack, sk->rcv_ack_seq-1)) {
2314 if (after(ack, sk->send_seq) ||
2315 (sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT)) {
2316 return(0);
2318 if (sk->keepopen) {
2319 reset_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
2321 return(1);
2324 if (len != th->doff*4) flag |= 1;
2326 /* See if our window has been shrunk. */
2327 if (after(sk->window_seq, ack+ntohs(th->window))) {
2329 * We may need to move packets from the send queue
2330 * to the write queue, if the window has been shrunk on us.
2331 * The RFC says you are not allowed to shrink your window
2332 * like this, but if the other end does, you must be able
2333 * to deal with it.
2335 struct sk_buff *skb;
2336 struct sk_buff *skb2;
2337 struct sk_buff *wskb = NULL;
2339 skb2 = sk->send_head;
2340 sk->send_head = NULL;
2341 sk->send_tail = NULL;
2343 flag |= 4;
2345 sk->window_seq = ack + ntohs(th->window);
2346 cli();
2347 while (skb2 != NULL) {
2348 skb = skb2;
2349 skb2 = (struct sk_buff *)skb->link3;
2350 skb->link3 = NULL;
2351 if (after(skb->h.seq, sk->window_seq)) {
2352 if (sk->packets_out > 0) sk->packets_out--;
2353 /* We may need to remove this from the dev send list. */
2354 if (skb->next != NULL) {
2355 skb_unlink(skb);
2357 /* Now add it to the write_queue. */
2358 skb->magic = TCP_WRITE_QUEUE_MAGIC;
2359 if (wskb == NULL) {
2360 skb->next = sk->wfront;
2361 sk->wfront = skb;
2362 } else {
2363 skb->next = wskb->next;
2364 wskb->next = skb;
2366 if (sk->wback == wskb) sk->wback = skb;
2367 wskb = skb;
2368 } else {
2369 if (sk->send_head == NULL) {
2370 sk->send_head = skb;
2371 sk->send_tail = skb;
2372 } else {
2373 sk->send_tail->link3 = skb;
2374 sk->send_tail = skb;
2376 skb->link3 = NULL;
2379 sti();
2382 if (sk->send_tail == NULL || sk->send_head == NULL) {
2383 sk->send_head = NULL;
2384 sk->send_tail = NULL;
2385 sk->packets_out= 0;
2388 sk->window_seq = ack + ntohs(th->window);
2390 /* We don't want too many packets out there. */
2391 if (sk->timeout == TIME_WRITE &&
2392 sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq)) {
2394 * This is Jacobson's slow start and congestion avoidance.
2395 * SIGCOMM '88, p. 328. Because we keep cong_window in integral
2396 * mss's, we can't do cwnd += 1 / cwnd. Instead, maintain a
2397 * counter and increment it once every cwnd times. It's possible
2398 * that this should be done only if sk->retransmits == 0. I'm
2399 * interpreting "new data is acked" as including data that has
2400 * been retransmitted but is just now being acked.
2402 if (sk->cong_window < sk->ssthresh)
2403 /* in "safe" area, increase */
2404 sk->cong_window++;
2405 else {
2406 /* in dangerous area, increase slowly. In theory this is
2407 sk->cong_window += 1 / sk->cong_window
2409 if (sk->cong_count >= sk->cong_window) {
2410 sk->cong_window++;
2411 sk->cong_count = 0;
2412 } else
2413 sk->cong_count++;
2417 DPRINTF((DBG_TCP, "tcp_ack: Updating rcv ack sequence.\n"));
2418 sk->rcv_ack_seq = ack;
2421 * if this ack opens up a zero window, clear backoff. It was
2422 * being used to time the probes, and is probably far higher than
2423 * it needs to be for normal retransmission
2425 if (sk->timeout == TIME_PROBE0) {
2426 if (sk->wfront != NULL && /* should always be non-null */
2427 ! before (sk->window_seq, sk->wfront->h.seq)) {
2428 sk->retransmits = 0;
2429 sk->backoff = 0;
2430 /* recompute rto from rtt. this eliminates any backoff */
2431 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
2432 if (sk->rto > 120*HZ)
2433 sk->rto = 120*HZ;
2434 if (sk->rto < 1*HZ)
2435 sk->rto = 1*HZ;
2439 /* See if we can take anything off of the retransmit queue. */
2440 while(sk->send_head != NULL) {
2441 /* Check for a bug. */
2442 if (sk->send_head->link3 &&
2443 after(sk->send_head->h.seq, sk->send_head->link3->h.seq)) {
2444 printk("INET: tcp.c: *** bug send_list out of order.\n");
2445 sort_send(sk);
2448 if (before(sk->send_head->h.seq, ack+1)) {
2449 struct sk_buff *oskb;
2451 if (sk->retransmits) {
2453 /* we were retransmitting. don't count this in RTT est */
2454 flag |= 2;
2457 * even though we've gotten an ack, we're still
2458 * retransmitting as long as we're sending from
2459 * the retransmit queue. Keeping retransmits non-zero
2460 * prevents us from getting new data interspersed with
2461 * retransmissions.
2464 if (sk->send_head->link3)
2465 sk->retransmits = 1;
2466 else
2467 sk->retransmits = 0;
2472 * Note that we only reset backoff and rto in the
2473 * rtt recomputation code. And that doesn't happen
2474 * if there were retransmissions in effect. So the
2475 * first new packet after the retransmissions is
2476 * sent with the backoff still in effect. Not until
2477 * we get an ack from a non-retransmitted packet do
2478 * we reset the backoff and rto. This allows us to deal
2479 * with a situation where the network delay has increased
2480 * suddenly. I.e. Karn's algorithm. (SIGCOMM '87, p5.)
2483 /* We have one less packet out there. */
2484 if (sk->packets_out > 0) sk->packets_out --;
2485 DPRINTF((DBG_TCP, "skb=%X skb->h.seq = %d acked ack=%d\n",
2486 sk->send_head, sk->send_head->h.seq, ack));
2488 /* Wake up the process, it can probably write more. */
2489 if (!sk->dead) sk->write_space(sk);
2491 oskb = sk->send_head;
2493 if (!(flag&2)) {
2494 long m;
2496 /* The following amusing code comes from Jacobson's
2497 * article in SIGCOMM '88. Note that rtt and mdev
2498 * are scaled versions of rtt and mean deviation.
2499 * This is designed to be as fast as possible
2500 * m stands for "measurement".
2503 m = jiffies - oskb->when; /* RTT */
2504 m -= (sk->rtt >> 3); /* m is now error in rtt est */
2505 sk->rtt += m; /* rtt = 7/8 rtt + 1/8 new */
2506 if (m < 0)
2507 m = -m; /* m is now abs(error) */
2508 m -= (sk->mdev >> 2); /* similar update on mdev */
2509 sk->mdev += m; /* mdev = 3/4 mdev + 1/4 new */
2511 /* now update timeout. Note that this removes any backoff */
2512 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
2513 if (sk->rto > 120*HZ)
2514 sk->rto = 120*HZ;
2515 if (sk->rto < 1*HZ)
2516 sk->rto = 1*HZ;
2517 sk->backoff = 0;
2520 flag |= (2|4);
2522 cli();
2524 oskb = sk->send_head;
2525 IS_SKB(oskb);
2526 sk->send_head =(struct sk_buff *)oskb->link3;
2527 if (sk->send_head == NULL) {
2528 sk->send_tail = NULL;
2531 /* We may need to remove this from the dev send list. */
2532 skb_unlink(oskb); /* Much easier! */
2533 sti();
2534 oskb->magic = 0;
2535 kfree_skb(oskb, FREE_WRITE); /* write. */
2536 if (!sk->dead) sk->write_space(sk);
2537 } else {
2538 break;
2543 * Maybe we can take some stuff off of the write queue,
2544 * and put it onto the xmit queue.
2546 if (sk->wfront != NULL) {
2547 if (after (sk->window_seq+1, sk->wfront->h.seq) &&
2548 (sk->retransmits == 0 ||
2549 sk->timeout != TIME_WRITE ||
2550 before(sk->wfront->h.seq, sk->rcv_ack_seq +1))
2551 && sk->packets_out < sk->cong_window) {
2552 flag |= 1;
2553 tcp_write_xmit(sk);
2554 } else if (before(sk->window_seq, sk->wfront->h.seq) &&
2555 sk->send_head == NULL &&
2556 sk->ack_backlog == 0 &&
2557 sk->state != TCP_TIME_WAIT) {
2558 reset_timer(sk, TIME_PROBE0, sk->rto);
2560 } else {
2561 if (sk->send_head == NULL && sk->ack_backlog == 0 &&
2562 sk->state != TCP_TIME_WAIT && !sk->keepopen) {
2563 DPRINTF((DBG_TCP, "Nothing to do, going to sleep.\n"));
2564 if (!sk->dead) sk->write_space(sk);
2566 if (sk->keepopen)
2567 reset_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
2568 else
2569 delete_timer(sk);
2570 } else {
2571 if (sk->state != (unsigned char) sk->keepopen) {
2572 reset_timer(sk, TIME_WRITE, sk->rto);
2574 if (sk->state == TCP_TIME_WAIT) {
2575 reset_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
2580 if (sk->packets_out == 0 && sk->partial != NULL &&
2581 sk->wfront == NULL && sk->send_head == NULL) {
2582 flag |= 1;
2583 tcp_send_partial(sk);
2586 /* See if we are done. */
2587 if (sk->state == TCP_TIME_WAIT) {
2588 if (!sk->dead)
2589 sk->state_change(sk);
2590 if (sk->rcv_ack_seq == sk->send_seq && sk->acked_seq == sk->fin_seq) {
2591 flag |= 1;
2592 sk->state = TCP_CLOSE;
2593 sk->shutdown = SHUTDOWN_MASK;
2597 if (sk->state == TCP_LAST_ACK || sk->state == TCP_FIN_WAIT2) {
2598 if (!sk->dead) sk->state_change(sk);
2599 if (sk->rcv_ack_seq == sk->send_seq) {
2600 flag |= 1;
2601 if (sk->acked_seq != sk->fin_seq) {
2602 tcp_time_wait(sk);
2603 } else {
2604 DPRINTF((DBG_TCP, "tcp_ack closing socket - %X\n", sk));
2605 tcp_send_ack(sk->send_seq, sk->acked_seq, sk,
2606 th, sk->daddr);
2607 sk->shutdown = SHUTDOWN_MASK;
2608 sk->state = TCP_CLOSE;
2614 * I make no guarantees about the first clause in the following
2615 * test, i.e. "(!flag) || (flag&4)". I'm not entirely sure under
2616 * what conditions "!flag" would be true. However I think the rest
2617 * of the conditions would prevent that from causing any
2618 * unnecessary retransmission.
2619 * Clearly if the first packet has expired it should be
2620 * retransmitted. The other alternative, "flag&2 && retransmits", is
2621 * harder to explain: You have to look carefully at how and when the
2622 * timer is set and with what timeout. The most recent transmission always
2623 * sets the timer. So in general if the most recent thing has timed
2624 * out, everything before it has as well. So we want to go ahead and
2625 * retransmit some more. If we didn't explicitly test for this
2626 * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
2627 * would not be true. If you look at the pattern of timing, you can
2628 * show that rto is increased fast enough that the next packet would
2629 * almost never be retransmitted immediately. Then you'd end up
2630 * waiting for a timeout to send each packet on the retranmission
2631 * queue. With my implementation of the Karn sampling algorithm,
2632 * the timeout would double each time. The net result is that it would
2633 * take a hideous amount of time to recover from a single dropped packet.
2634 * It's possible that there should also be a test for TIME_WRITE, but
2635 * I think as long as "send_head != NULL" and "retransmit" is on, we've
2636 * got to be in real retransmission mode.
2637 * Note that ip_do_retransmit is called with all==1. Setting cong_window
2638 * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
2639 * As long as no further losses occur, this seems reasonable.
2642 if (((!flag) || (flag&4)) && sk->send_head != NULL &&
2643 (((flag&2) && sk->retransmits) ||
2644 (sk->send_head->when + sk->rto < jiffies))) {
2645 ip_do_retransmit(sk, 1);
2646 reset_timer(sk, TIME_WRITE, sk->rto);
2649 DPRINTF((DBG_TCP, "leaving tcp_ack\n"));
2650 return(1);
2655 * This routine handles the data. If there is room in the buffer,
2656 * it will be have already been moved into it. If there is no
2657 * room, then we will just have to discard the packet.
/*
 * NOTE(review): this listing has the original blob line numbers embedded and
 * several source lines (mostly closing braces) missing from the extraction;
 * the comments below describe intent only and must be checked against the
 * pristine tcp.c before any code change.
 *
 * tcp_data() queues an incoming segment on the socket's receive queue
 * (sk->rqueue), updates sk->acked_seq for any in-order data, and decides
 * whether to ack immediately or delay the ack.  Returns 0 in all visible
 * paths; the caller frees the skb on a non-zero return.
 */
2659 static int
2660 tcp_data(struct sk_buff *skb, struct sock *sk,
2661 unsigned long saddr, unsigned short len)
2663 struct sk_buff *skb1, *skb2;
2664 struct tcphdr *th;
2665 int dup_dumped=0;
2667 th = skb->h.th;
2668 print_th(th);
/* Payload length = total length minus the TCP header (doff is in 32-bit words). */
2669 skb->len = len -(th->doff*4);
2671 DPRINTF((DBG_TCP, "tcp_data len = %d sk = %X:\n", skb->len, sk));
2673 sk->bytes_rcv += skb->len;
/* Empty segment with no FIN/URG/PSH: nothing to queue, just drop (ack only
 * if it was not itself an ack, to avoid an ack ping-pong). */
2674 if (skb->len == 0 && !th->fin && !th->urg && !th->psh) {
2675 /* Don't want to keep passing ack's back and forth. */
2676 if (!th->ack) tcp_send_ack(sk->send_seq, sk->acked_seq,sk, th, saddr);
2677 kfree_skb(skb, FREE_READ);
2678 return(0);
/* Data arrived after we shut down the receive side: answer with a reset
 * and tear the connection down with EPIPE. */
2681 if (sk->shutdown & RCV_SHUTDOWN) {
2682 sk->acked_seq = th->seq + skb->len + th->syn + th->fin;
2683 tcp_reset(sk->saddr, sk->daddr, skb->h.th,
2684 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
2685 sk->state = TCP_CLOSE;
2686 sk->err = EPIPE;
2687 sk->shutdown = SHUTDOWN_MASK;
2688 DPRINTF((DBG_TCP, "tcp_data: closing socket - %X\n", sk));
2689 kfree_skb(skb, FREE_READ);
2690 if (!sk->dead) sk->state_change(sk);
2691 return(0);
2695 * Now we have to walk the chain, and figure out where this one
2696 * goes into it. This is set up so that the last packet we received
2697 * will be the first one we look at, that way if everything comes
2698 * in order, there will be no performance loss, and if they come
2699 * out of order we will be able to fit things in nicely.
2702 /* This should start at the last one, and then go around forwards. */
2703 if (sk->rqueue == NULL) {
2704 DPRINTF((DBG_TCP, "tcp_data: skb = %X:\n", skb));
2705 #ifdef OLDWAY
2706 sk->rqueue = skb;
2707 skb->next = skb;
2708 skb->prev = skb;
2709 skb->list = &sk->rqueue;
2710 #else
2711 skb_queue_head(&sk->rqueue,skb);
2712 #endif
2713 skb1= NULL;
2714 } else {
2715 DPRINTF((DBG_TCP, "tcp_data adding to chain sk = %X:\n", sk));
/* Walk the circular receive queue backwards from the newest segment to
 * find the insertion point by sequence number. */
2716 for(skb1=sk->rqueue->prev; ; skb1 =(struct sk_buff *)skb1->prev) {
2717 if(sk->debug)
2719 printk("skb1=%p :", skb1);
2720 printk("skb1->h.th->seq = %ld: ", skb1->h.th->seq);
2721 printk("skb->h.th->seq = %ld\n",skb->h.th->seq);
2722 printk("copied_seq = %ld acked_seq = %ld\n", sk->copied_seq,
2723 sk->acked_seq);
2725 #ifdef OLD
2726 if (after(th->seq+1, skb1->h.th->seq)) {
2727 skb->prev = skb1;
2728 skb->next = skb1->next;
2729 skb->next->prev = skb;
2730 skb1->next = skb;
2731 if (skb1 == sk->rqueue) sk->rqueue = skb;
2732 break;
2734 if (skb1->prev == sk->rqueue) {
2735 skb->next= skb1;
2736 skb->prev = skb1->prev;
2737 skb->prev->next = skb;
2738 skb1->prev = skb;
2739 skb1 = NULL; /* so we know we might be able
2740 to ack stuff. */
2741 break;
2743 #else
/* Exact duplicate (same seq) that is no longer than the new segment:
 * replace the old one with the new skb and free the old. */
2744 if (th->seq==skb1->h.th->seq && skb->len>= skb1->len)
2746 skb_append(skb1,skb);
2747 skb_unlink(skb1);
2748 kfree_skb(skb1,FREE_READ);
2749 dup_dumped=1;
2750 skb1=NULL;
2751 break;
2753 if (after(th->seq+1, skb1->h.th->seq))
2755 skb_append(skb1,skb);
2756 break;
2758 if (skb1 == sk->rqueue)
2760 skb_queue_head(&sk->rqueue, skb);
2761 break;
2763 #endif
2765 DPRINTF((DBG_TCP, "skb = %X:\n", skb));
/* ack_seq = sequence number just past this segment (SYN/FIN each occupy
 * one sequence number). */
2768 th->ack_seq = th->seq + skb->len;
2769 if (th->syn) th->ack_seq++;
2770 if (th->fin) th->ack_seq++;
/* Sanity check: acked_seq should never lag copied_seq; repair if it does. */
2772 if (before(sk->acked_seq, sk->copied_seq)) {
2773 printk("*** tcp.c:tcp_data bug acked < copied\n");
2774 sk->acked_seq = sk->copied_seq;
2777 /* Now figure out if we can ack anything. */
2778 if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(th->seq, sk->acked_seq+1)) {
2779 if (before(th->seq, sk->acked_seq+1)) {
2780 if (after(th->ack_seq, sk->acked_seq))
2781 sk->acked_seq = th->ack_seq;
2782 skb->acked = 1;
2784 /* When we ack the fin, we turn on the RCV_SHUTDOWN flag. */
2785 if (skb->h.th->fin) {
2786 if (!sk->dead) sk->state_change(sk);
2787 sk->shutdown |= RCV_SHUTDOWN;
/* Sweep forward over previously-queued out-of-order segments that are now
 * contiguous with acked_seq, acking each and shrinking the offered window. */
2790 for(skb2 = (struct sk_buff *)skb->next;
2791 skb2 !=(struct sk_buff *) sk->rqueue;
2792 skb2 = (struct sk_buff *)skb2->next) {
2793 if (before(skb2->h.th->seq, sk->acked_seq+1)) {
2794 if (after(skb2->h.th->ack_seq, sk->acked_seq))
2796 long old_acked_seq = sk->acked_seq;
2797 sk->acked_seq = skb2->h.th->ack_seq;
2798 if((int)(sk->acked_seq - old_acked_seq) >0)
2800 int new_window=sk->window-sk->acked_seq+
2801 old_acked_seq;
2802 if(new_window<0)
2803 new_window=0;
2804 sk->window = new_window;
2807 skb2->acked = 1;
2810 * When we ack the fin, we turn on
2811 * the RCV_SHUTDOWN flag.
2813 if (skb2->h.th->fin) {
2814 sk->shutdown |= RCV_SHUTDOWN;
2815 if (!sk->dead) sk->state_change(sk);
2818 /* Force an immediate ack. */
2819 sk->ack_backlog = sk->max_ack_backlog;
2820 } else {
2821 break;
2826 * This also takes care of updating the window.
2827 * This if statement needs to be simplified.
/* Decide between an immediate ack (delayed acks off, backlog full, too
 * many unacked bytes, or FIN) and a delayed ack via the write timer.
 * NOTE(review): the immediate-ack branch's tcp_send_ack call is commented
 * out in this version — the actual ack appears to be sent elsewhere. */
2829 if (!sk->delay_acks ||
2830 sk->ack_backlog >= sk->max_ack_backlog ||
2831 sk->bytes_rcv > sk->max_unacked || th->fin) {
2832 /* tcp_send_ack(sk->send_seq, sk->acked_seq,sk,th, saddr); */
2833 } else {
2834 sk->ack_backlog++;
2835 if(sk->debug)
2836 printk("Ack queued.\n");
2837 reset_timer(sk, TIME_WRITE, TCP_ACK_TIME);
2843 * If we've missed a packet, send an ack.
2844 * Also start a timer to send another.
2846 if (!skb->acked) {
2848 * This is important. If we don't have much room left,
2849 * we need to throw out a few packets so we have a good
2850 * window. Note that mtu is used, not mss, because mss is really
2851 * for the send side. He could be sending us stuff as large as mtu.
/* Out-of-order segment: shed unacked queued packets while receive space
 * is below one MTU, then ack to trigger the peer's retransmit. */
2853 while (sk->prot->rspace(sk) < sk->mtu) {
2854 skb1 = skb_peek(&sk->rqueue);
2855 if (skb1 == NULL) {
2856 printk("INET: tcp.c:tcp_data memory leak detected.\n");
2857 break;
2860 /* Don't throw out something that has been acked. */
2861 if (skb1->acked) {
2862 break;
2865 skb_unlink(skb1);
2866 #ifdef OLDWAY
2867 if (skb1->prev == skb1) {
2868 sk->rqueue = NULL;
2869 } else {
2870 sk->rqueue = (struct sk_buff *)skb1->prev;
2871 skb1->next->prev = skb1->prev;
2872 skb1->prev->next = skb1->next;
2874 #endif
2875 kfree_skb(skb1, FREE_READ);
2877 tcp_send_ack(sk->send_seq, sk->acked_seq, sk, th, saddr);
2878 sk->ack_backlog++;
2879 reset_timer(sk, TIME_WRITE, TCP_ACK_TIME);
2880 } else {
2881 /* We missed a packet. Send an ack to try to resync things. */
2882 tcp_send_ack(sk->send_seq, sk->acked_seq, sk, th, saddr);
2885 /* Now tell the user we may have some data. */
2886 if (!sk->dead) {
2887 if(sk->debug)
2888 printk("Data wakeup.\n");
2889 sk->data_ready(sk,0);
2890 } else {
2891 DPRINTF((DBG_TCP, "data received on dead socket.\n"));
/* Both sides done (our FIN acked, their FIN received): move to LAST_ACK. */
2894 if (sk->state == TCP_FIN_WAIT2 &&
2895 sk->acked_seq == sk->fin_seq && sk->rcv_ack_seq == sk->send_seq) {
2896 DPRINTF((DBG_TCP, "tcp_data: entering last_ack state sk = %X\n", sk));
2898 /* tcp_send_ack(sk->send_seq, sk->acked_seq, sk, th, saddr); */
2899 sk->shutdown = SHUTDOWN_MASK;
2900 sk->state = TCP_LAST_ACK;
2901 if (!sk->dead) sk->state_change(sk);
2904 return(0);
/*
 * Handle the TCP urgent-data flag.  If the socket delivers urgent data
 * inline, clear URG and force PSH so the data flows through the normal
 * read path; otherwise signal SIGURG to the owning process (sk->proc > 0)
 * or process group (sk->proc < 0) the first time urgent data is seen.
 * Returns 0 in all visible paths.
 * NOTE(review): several closing braces are missing from this scraped
 * listing; verify against the pristine source before modifying.
 */
2908 static int
2909 tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long saddr)
2911 extern int kill_pg(int pg, int sig, int priv);
2912 extern int kill_proc(int pid, int sig, int priv);
2914 if (!sk->dead)
2915 sk->data_ready(sk,0);
/* Inline urgent data: strip URG, force PSH, let tcp_data deliver it. */
2917 if (sk->urginline) {
2918 th->urg = 0;
2919 th->psh = 1;
2920 return(0);
/* sk->urg acts as a latch so the user is signalled only once. */
2923 if (!sk->urg) {
2924 /* So if we get more urgent data, we don't signal the user again. */
2925 if (sk->proc != 0) {
2926 if (sk->proc > 0) {
2927 kill_proc(sk->proc, SIGURG, 1);
2928 } else {
2929 kill_pg(-sk->proc, SIGURG, 1);
2933 sk->urg++;
2934 return(0);
2938 /* This deals with incoming fins. 'Linus at 9 O'clock' 8-) */
/*
 * Process a received FIN: record the sequence number to be acked in
 * sk->fin_seq and advance the connection state machine
 * (ESTABLISHED/SYN_* -> CLOSE_WAIT, FIN_WAIT1 -> FIN_WAIT2, otherwise
 * -> LAST_ACK with the close timer armed).  Returns 0.
 */
2939 static int
2940 tcp_fin(struct sock *sk, struct tcphdr *th,
2941 unsigned long saddr, struct device *dev)
2943 DPRINTF((DBG_TCP, "tcp_fin(sk=%X, th=%X, saddr=%X, dev=%X)\n",
2944 sk, th, saddr, dev));
2946 if (!sk->dead) {
2947 sk->state_change(sk);
2950 switch(sk->state) {
2951 case TCP_SYN_RECV:
2952 case TCP_SYN_SENT:
2953 case TCP_ESTABLISHED:
2954 /* Contains the one that needs to be acked */
2955 sk->fin_seq = th->seq+1;
2956 sk->state = TCP_CLOSE_WAIT;
/* FIN combined with RST: peer is gone, shut both directions down. */
2957 if (th->rst) sk->shutdown = SHUTDOWN_MASK;
2958 break;
2960 case TCP_CLOSE_WAIT:
2961 case TCP_FIN_WAIT2:
2962 break; /* we got a retransmit of the fin. */
2964 case TCP_FIN_WAIT1:
2965 /* Contains the one that needs to be acked */
2966 sk->fin_seq = th->seq+1;
2967 sk->state = TCP_FIN_WAIT2;
2968 break;
2970 default:
2971 case TCP_TIME_WAIT:
2972 sk->state = TCP_LAST_ACK;
2974 /* Start the timers. */
2975 reset_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
2976 return(0);
/* Queue an ack for the FIN (sent later by the timer/ack machinery). */
2978 sk->ack_backlog++;
2980 return(0);
2984 /* This will accept the next outstanding connection. */
/*
 * accept() for TCP: pop the next completed connection request off the
 * listening socket's queue and return its newly created sock.  Blocks
 * (interruptibly) unless O_NONBLOCK is set.  On failure returns NULL
 * with sk->err set (EINVAL / EAGAIN / ERESTARTSYS).
 */
2985 static struct sock *
2986 tcp_accept(struct sock *sk, int flags)
2988 struct sock *newsk;
2989 struct sk_buff *skb;
2991 DPRINTF((DBG_TCP, "tcp_accept(sk=%X, flags=%X, addr=%s)\n",
2992 sk, flags, in_ntoa(sk->saddr)));
2995 * We need to make sure that this socket is listening,
2996 * and that it has something pending.
2998 if (sk->state != TCP_LISTEN) {
2999 sk->err = EINVAL;
3000 return(NULL);
3003 /* avoid the race. */
/* Interrupts off while taking ownership of the socket (sk->inuse). */
3004 cli();
3005 sk->inuse = 1;
3006 while((skb = get_firstr(sk)) == NULL) {
3007 if (flags & O_NONBLOCK) {
3008 sti();
3009 release_sock(sk);
3010 sk->err = EAGAIN;
3011 return(NULL);
/* Nothing queued yet: drop the lock and sleep until woken. */
3014 release_sock(sk);
3015 interruptible_sleep_on(sk->sleep);
3016 if (current->signal & ~current->blocked) {
3017 sti();
3018 sk->err = ERESTARTSYS;
3019 return(NULL);
3021 sk->inuse = 1;
3023 sti();
3025 /* Now all we need to do is return skb->sk. */
/* The connection-request skb carries the new socket; the skb itself is
 * no longer needed once we have extracted it. */
3026 newsk = skb->sk;
3028 kfree_skb(skb, FREE_READ);
3029 sk->ack_backlog--;
3030 release_sock(sk);
3031 return(newsk);
3035 /* This will initiate an outgoing connection. */
/*
 * connect() for TCP: validate the destination, pick an initial send
 * sequence, build and transmit a SYN carrying an MSS option, and move
 * the socket to SYN_SENT with the retransmit timer armed.
 * Returns 0 on success or a negative errno
 * (-EISCONN / -EINVAL / -EAFNOSUPPORT / -ENETUNREACH / -EBUSY / -ENOMEM).
 * NOTE(review): closing braces are missing from this scraped listing.
 */
3036 static int
3037 tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
3039 struct sk_buff *buff;
3040 struct sockaddr_in sin;
3041 struct device *dev=NULL;
3042 unsigned char *ptr;
3043 int tmp;
3044 struct tcphdr *t1;
3045 int err;
3047 if (sk->state != TCP_CLOSE) return(-EISCONN);
3048 if (addr_len < 8) return(-EINVAL);
3050 err=verify_area(VERIFY_READ, usin, addr_len);
3051 if(err)
3052 return err;
/* Copy the sockaddr in from user space before inspecting it. */
3054 memcpy_fromfs(&sin,usin, min(sizeof(sin), addr_len));
3056 if (sin.sin_family && sin.sin_family != AF_INET) return(-EAFNOSUPPORT);
3058 DPRINTF((DBG_TCP, "TCP connect daddr=%s\n", in_ntoa(sin.sin_addr.s_addr)));
3060 /* Don't want a TCP connection going to a broadcast address */
3061 if (chk_addr(sin.sin_addr.s_addr) == IS_BROADCAST) {
3062 DPRINTF((DBG_TCP, "TCP connection to broadcast address not allowed\n"));
3063 return(-ENETUNREACH);
3066 /* Connect back to the same socket: Blows up so disallow it */
3067 if(sk->saddr == sin.sin_addr.s_addr && sk->num==ntohs(sin.sin_port))
3068 return -EBUSY;
3070 sk->inuse = 1;
3071 sk->daddr = sin.sin_addr.s_addr;
/* Initial send sequence derived from the clock (pre-random-ISN era). */
3072 sk->send_seq = jiffies * SEQ_TICK - seq_offset;
3073 sk->window_seq = sk->send_seq;
3074 sk->rcv_ack_seq = sk->send_seq -1;
3075 sk->err = 0;
3076 sk->dummy_th.dest = sin.sin_port;
3077 release_sock(sk);
3079 buff = sk->prot->wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
3080 if (buff == NULL) {
3081 return(-ENOMEM);
3083 sk->inuse = 1;
3084 buff->mem_addr = buff;
3085 buff->mem_len = MAX_SYN_SIZE;
/* 24 = 20-byte TCP header + 4-byte MSS option. */
3086 buff->len = 24;
3087 buff->sk = sk;
3088 buff->free = 1;
3089 t1 = (struct tcphdr *) buff->data;
3091 /* Put in the IP header and routing stuff. */
3092 /* We need to build the routing stuff fromt the things saved in skb. */
3093 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
3094 IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
3095 if (tmp < 0) {
3096 sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
3097 release_sock(sk);
3098 return(-ENETUNREACH);
3100 buff->len += tmp;
3101 t1 = (struct tcphdr *)((char *)t1 +tmp);
/* Fill in the SYN: template header, our ISN, no ACK, doff=6 (24 bytes). */
3103 memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
3104 t1->seq = ntohl(sk->send_seq++);
3105 buff->h.seq = sk->send_seq;
3106 t1->ack = 0;
3107 t1->window = 2;
3108 t1->res1=0;
3109 t1->res2=0;
3110 t1->rst = 0;
3111 t1->urg = 0;
3112 t1->psh = 0;
3113 t1->syn = 1;
3114 t1->urg_ptr = 0;
3115 t1->doff = 6;
3117 /* use 512 or whatever user asked for */
3118 if (sk->user_mss)
3119 sk->mtu = sk->user_mss;
3120 else {
3121 #ifdef SUBNETSARELOCAL
/* Off-net destinations get the conservative 576-byte default. */
3122 if ((sk->saddr ^ sk->daddr) & default_mask(sk->saddr))
3123 #else
3124 if ((sk->saddr ^ sk->daddr) & dev->pa_mask)
3125 #endif
3126 sk->mtu = 576 - HEADER_SIZE;
3127 else
3128 sk->mtu = MAX_WINDOW;
3130 /* but not bigger than device MTU */
3131 sk->mtu = min(sk->mtu, dev->mtu - HEADER_SIZE);
3133 /* Put in the TCP options to say MTU. */
/* Option kind 2 (MSS), length 4, value = sk->mtu in network byte order. */
3134 ptr = (unsigned char *)(t1+1);
3135 ptr[0] = 2;
3136 ptr[1] = 4;
3137 ptr[2] = (sk->mtu) >> 8;
3138 ptr[3] = (sk->mtu) & 0xff;
3139 tcp_send_check(t1, sk->saddr, sk->daddr,
3140 sizeof(struct tcphdr) + 4, sk);
3142 /* This must go first otherwise a really quick response will get reset. */
3143 sk->state = TCP_SYN_SENT;
3144 sk->rtt = TCP_CONNECT_TIME;
3145 reset_timer(sk, TIME_WRITE, TCP_CONNECT_TIME); /* Timer for repeating the SYN until an answer */
3146 sk->retransmits = TCP_RETR2 - TCP_SYN_RETRIES;
3148 sk->prot->queue_xmit(sk, dev, buff, 0);
3150 release_sock(sk);
3151 return(0);
3155 /* This functions checks to see if the tcp header is actually acceptible. */
/*
 * Sequence-number acceptability check: returns 1 if the segment overlaps
 * the receive window [acked_seq, acked_seq + window], 0 if it must be
 * discarded.  Side effects on rejection: sends a reset while still
 * unsynchronized (SYN_SENT/SYN_RECV), otherwise sends an ack to resync
 * unless the segment carries RST.
 */
3156 static int
3157 tcp_sequence(struct sock *sk, struct tcphdr *th, short len,
3158 struct options *opt, unsigned long saddr, struct device *dev)
3161 * This isn't quite right. sk->acked_seq could be more recent
3162 * than sk->window. This is however close enough. We will accept
3163 * slightly more packets than we should, but it should not cause
3164 * problems unless someone is trying to forge packets.
3166 DPRINTF((DBG_TCP, "tcp_sequence(sk=%X, th=%X, len = %d, opt=%d, saddr=%X)\n",
3167 sk, th, len, opt, saddr));
/* Accept if segment start, segment end, or a spanning segment falls in
 * the window (len - doff*4 is the payload length). */
3169 if (between(th->seq, sk->acked_seq, sk->acked_seq + sk->window)||
3170 between(th->seq + len-(th->doff*4), sk->acked_seq + 1,
3171 sk->acked_seq + sk->window) ||
3172 (before(th->seq, sk->acked_seq) &&
3173 after(th->seq + len -(th->doff*4), sk->acked_seq + sk->window))) {
3174 return(1);
3176 DPRINTF((DBG_TCP, "tcp_sequence: rejecting packet.\n"));
3179 * Send a reset if we get something not ours and we are
3180 * unsynchronized. Note: We don't do anything to our end. We
3181 * are just killing the bogus remote connection then we will
3182 * connect again and it will work (with luck).
3185 if(sk->state==TCP_SYN_SENT||sk->state==TCP_SYN_RECV)
3187 tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
/* NOTE(review): returns 1 (accept) here even though the reset was sent —
 * looks deliberate for the resync-by-reset path, but confirm upstream. */
3188 return(1);
3192 * If it's too far ahead, send an ack to let the
3193 * other end know what we expect.
3195 if (after(th->seq, sk->acked_seq + sk->window)) {
3196 if(!th->rst)
3197 tcp_send_ack(sk->send_seq, sk->acked_seq, sk, th, saddr);
3198 return(0);
3201 #ifdef undef
3203 * if we do this, we won't respond to keepalive packets, since those
3204 * are slightly out of window, and we have to generate an ack
3205 * a late ack out still not to have a sequence number less than
3206 * one we've seen before. Berkeley doesn't seem to do this, but it's
3207 * always hard to be sure.
3209 /* In case it's just a late ack, let it through. */
3210 if (th->ack && len == (th->doff * 4) &&
3211 after(th->seq, sk->acked_seq - 32767) &&
3212 !th->fin && !th->syn) return(1);
3213 #endif
3215 if (!th->rst) {
3216 /* Try to resync things. */
3217 tcp_send_ack(sk->send_seq, sk->acked_seq, sk, th, saddr);
3219 return(0);
/*
 * Main TCP receive entry point, called from the IP layer for every TCP
 * segment.  Verifies the checksum, looks up the owning socket, defers
 * to the socket backlog when the socket is busy, and then dispatches on
 * sk->state to the state-machine handlers (tcp_ack/tcp_urg/tcp_data/
 * tcp_fin/tcp_conn_request).  Returns 0 in all visible paths.
 * NOTE(review): this scraped listing is missing the return-type line and
 * many closing braces; do not edit code from this copy.
 */
3227 tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
3228 unsigned long daddr, unsigned short len,
3229 unsigned long saddr, int redo, struct inet_protocol * protocol)
3231 struct tcphdr *th;
3232 struct sock *sk;
3234 if (!skb) {
3235 DPRINTF((DBG_TCP, "tcp.c: tcp_rcv skb = NULL\n"));
3236 return(0);
3238 #if 0 /* FIXME: it's ok for protocol to be NULL */
3239 if (!protocol) {
3240 DPRINTF((DBG_TCP, "tcp.c: tcp_rcv protocol = NULL\n"));
3241 return(0);
3244 if (!opt) { /* FIXME: it's ok for opt to be NULL */
3245 DPRINTF((DBG_TCP, "tcp.c: tcp_rcv opt = NULL\n"));
3247 #endif
3248 if (!dev) {
3249 DPRINTF((DBG_TCP, "tcp.c: tcp_rcv dev = NULL\n"));
3250 return(0);
3252 th = skb->h.th;
3254 /* Find the socket. */
/* Demultiplex on (local port, remote addr, remote port, local addr). */
3255 sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
3256 DPRINTF((DBG_TCP, "<<\n"));
3257 DPRINTF((DBG_TCP, "len = %d, redo = %d, skb=%X\n", len, redo, skb));
3259 /* If this socket has got a reset its to all intents and purposes
3260 really dead */
3261 if (sk!=NULL && sk->zapped)
3262 sk=NULL;
3264 if (sk) {
3265 DPRINTF((DBG_TCP, "sk = %X:\n", sk));
/* redo != 0 means this skb is being replayed from the socket backlog:
 * checksum and setup were already done on first arrival. */
3268 if (!redo) {
3269 if (tcp_check(th, len, saddr, daddr )) {
3270 skb->sk = NULL;
3271 DPRINTF((DBG_TCP, "packet dropped with bad checksum.\n"));
3272 if (inet_debug == DBG_SLIP) printk("\rtcp_rcv: bad checksum\n");
3273 kfree_skb(skb,FREE_READ);
3275 * We don't release the socket because it was
3276 * never marked in use.
3278 return(0);
3281 /* See if we know about the socket. */
/* No matching socket: reset the sender (unless the segment is itself
 * a RST, which must never be answered with a RST). */
3282 if (sk == NULL) {
3283 if (!th->rst)
3285 th->seq = ntohl(th->seq);
3286 /* So reset is always called with th->seq in host order */
3287 tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
3289 skb->sk = NULL;
3290 kfree_skb(skb, FREE_READ);
3291 return(0);
/* First pass: initialize the skb bookkeeping fields for this socket. */
3294 skb->len = len;
3295 skb->sk = sk;
3296 skb->acked = 0;
3297 skb->used = 0;
3298 skb->free = 0;
3299 skb->urg_used = 0;
3300 skb->saddr = daddr;
3301 skb->daddr = saddr;
3303 th->seq = ntohl(th->seq);
3305 /* We may need to add it to the backlog here. */
/* Socket busy (sk->inuse): append to the circular back_log ring and
 * return; release_sock() will replay it later with redo=1. */
3306 cli();
3307 if (sk->inuse) {
3308 if (sk->back_log == NULL) {
3309 sk->back_log = skb;
3310 skb->next = skb;
3311 skb->prev = skb;
3312 } else {
3313 skb->next = sk->back_log;
3314 skb->prev = sk->back_log->prev;
3315 skb->prev->next = skb;
3316 skb->next->prev = skb;
3318 sti();
3319 return(0);
3321 sk->inuse = 1;
3322 sti();
3323 } else {
3324 if (!sk) {
3325 DPRINTF((DBG_TCP, "tcp.c: tcp_rcv bug sk=NULL redo = 1\n"));
3326 return(0);
3330 if (!sk->prot) {
3331 DPRINTF((DBG_TCP, "tcp.c: tcp_rcv sk->prot = NULL \n"));
3332 return(0);
3335 /* Charge the memory to the socket. */
/* Receive-buffer accounting: drop the segment when the socket's memory
 * budget (sk->rcvbuf) would be exceeded. */
3336 if (sk->rmem_alloc + skb->mem_len >= sk->rcvbuf) {
3337 skb->sk = NULL;
3338 DPRINTF((DBG_TCP, "dropping packet due to lack of buffer space.\n"));
3339 kfree_skb(skb, FREE_READ);
3340 release_sock(sk);
3341 return(0);
3343 sk->rmem_alloc += skb->mem_len;
3345 DPRINTF((DBG_TCP, "About to do switch.\n"));
3347 /* Now deal with it. */
/* State machine.  LAST_ACK handles RST then falls into the synchronized
 * states (ESTABLISHED .. TIME_WAIT); CLOSE and LISTEN are terminal or
 * passive; default + SYN_SENT handle the active-open handshake. */
3348 switch(sk->state) {
3350 * This should close the system down if it's waiting
3351 * for an ack that is never going to be sent.
3353 case TCP_LAST_ACK:
3354 if (th->rst) {
3355 sk->zapped=1;
3356 sk->err = ECONNRESET;
3357 sk->state = TCP_CLOSE;
3358 sk->shutdown = SHUTDOWN_MASK;
3359 if (!sk->dead) {
3360 sk->state_change(sk);
3362 kfree_skb(skb, FREE_READ);
3363 release_sock(sk);
3364 return(0);
3367 case TCP_ESTABLISHED:
3368 case TCP_CLOSE_WAIT:
3369 case TCP_FIN_WAIT1:
3370 case TCP_FIN_WAIT2:
3371 case TCP_TIME_WAIT:
/* Out-of-window segment: tcp_sequence already acked/reset as needed. */
3372 if (!tcp_sequence(sk, th, len, opt, saddr,dev)) {
3373 if (inet_debug == DBG_SLIP) printk("\rtcp_rcv: not in seq\n");
3374 #ifdef undef
3375 /* nice idea, but tcp_sequence already does this. Maybe it shouldn't?? */
3376 if(!th->rst)
3377 tcp_send_ack(sk->send_seq, sk->acked_seq,
3378 sk, th, saddr);
3379 #endif
3380 kfree_skb(skb, FREE_READ);
3381 release_sock(sk);
3382 return(0);
3385 if (th->rst) {
3386 sk->zapped=1;
3387 /* This means the thing should really be closed. */
3388 sk->err = ECONNRESET;
3390 if (sk->state == TCP_CLOSE_WAIT) {
3391 sk->err = EPIPE;
3395 * A reset with a fin just means that
3396 * the data was not all read.
3398 sk->state = TCP_CLOSE;
3399 sk->shutdown = SHUTDOWN_MASK;
3400 if (!sk->dead) {
3401 sk->state_change(sk);
3403 kfree_skb(skb, FREE_READ);
3404 release_sock(sk);
3405 return(0);
/* SYN in a synchronized state is a protocol violation: reset the peer
 * and close.  (The security/compartment check is compiled out.) */
3407 if (
3408 #if 0
3409 if ((opt && (opt->security != 0 ||
3410 opt->compartment != 0)) ||
3411 #endif
3412 th->syn) {
3413 sk->err = ECONNRESET;
3414 sk->state = TCP_CLOSE;
3415 sk->shutdown = SHUTDOWN_MASK;
3416 tcp_reset(daddr, saddr, th, sk->prot, opt,dev, sk->ip_tos,sk->ip_ttl);
3417 if (!sk->dead) {
3418 sk->state_change(sk);
3420 kfree_skb(skb, FREE_READ);
3421 release_sock(sk);
3422 return(0);
3424 if (th->ack) {
3425 if (!tcp_ack(sk, th, saddr, len)) {
3426 kfree_skb(skb, FREE_READ);
3427 release_sock(sk);
3428 return(0);
3431 if (th->urg) {
3432 if (tcp_urg(sk, th, saddr)) {
3433 kfree_skb(skb, FREE_READ);
3434 release_sock(sk);
3435 return(0);
3439 if (tcp_data(skb, sk, saddr, len)) {
3440 kfree_skb(skb, FREE_READ);
3441 release_sock(sk);
3442 return(0);
3445 /* Moved: you must do data then fin bit */
3446 if (th->fin && tcp_fin(sk, th, saddr, dev)) {
3447 kfree_skb(skb, FREE_READ);
3448 release_sock(sk);
3449 return(0);
3452 release_sock(sk);
3453 return(0);
3455 case TCP_CLOSE:
3456 if (sk->dead || sk->daddr) {
3457 DPRINTF((DBG_TCP, "packet received for closed,dead socket\n"));
3458 kfree_skb(skb, FREE_READ);
3459 release_sock(sk);
3460 return(0);
3463 if (!th->rst) {
3464 if (!th->ack)
3465 th->ack_seq = 0;
3466 tcp_reset(daddr, saddr, th, sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
3468 kfree_skb(skb, FREE_READ);
3469 release_sock(sk);
3470 return(0);
3472 case TCP_LISTEN:
3473 if (th->rst) {
3474 kfree_skb(skb, FREE_READ);
3475 release_sock(sk);
3476 return(0);
/* An ACK to a listening socket is bogus: answer with a reset. */
3478 if (th->ack) {
3479 tcp_reset(daddr, saddr, th, sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
3480 kfree_skb(skb, FREE_READ);
3481 release_sock(sk);
3482 return(0);
3485 if (th->syn) {
3486 #if 0
3487 if (opt->security != 0 || opt->compartment != 0) {
3488 tcp_reset(daddr, saddr, th, prot, opt,dev);
3489 release_sock(sk);
3490 return(0);
3492 #endif
3495 * Now we just put the whole thing including
3496 * the header and saddr, and protocol pointer
3497 * into the buffer. We can't respond until the
3498 * user tells us to accept the connection.
3500 tcp_conn_request(sk, skb, daddr, saddr, opt, dev);
3501 release_sock(sk);
3502 return(0);
3505 kfree_skb(skb, FREE_READ);
3506 release_sock(sk);
3507 return(0);
3509 default:
3510 if (!tcp_sequence(sk, th, len, opt, saddr,dev)) {
3511 kfree_skb(skb, FREE_READ);
3512 release_sock(sk);
3513 return(0);
3516 case TCP_SYN_SENT:
/* RST while connecting means the peer refused the connection. */
3517 if (th->rst) {
3518 sk->err = ECONNREFUSED;
3519 sk->state = TCP_CLOSE;
3520 sk->shutdown = SHUTDOWN_MASK;
3521 sk->zapped = 1;
3522 if (!sk->dead) {
3523 sk->state_change(sk);
3525 kfree_skb(skb, FREE_READ);
3526 release_sock(sk);
3527 return(0);
3529 #if 0
3530 if (opt->security != 0 || opt->compartment != 0) {
3531 sk->err = ECONNRESET;
3532 sk->state = TCP_CLOSE;
3533 sk->shutdown = SHUTDOWN_MASK;
3534 tcp_reset(daddr, saddr, th, sk->prot, opt, dev);
3535 if (!sk->dead) {
3536 wake_up_interruptible(sk->sleep);
3538 kfree_skb(skb, FREE_READ);
3539 release_sock(sk);
3540 return(0);
3542 #endif
/* SYN without ACK: simultaneous open — go to SYN_RECV. */
3543 if (!th->ack) {
3544 if (th->syn) {
3545 sk->state = TCP_SYN_RECV;
3548 kfree_skb(skb, FREE_READ);
3549 release_sock(sk);
3550 return(0);
3553 switch(sk->state) {
3554 case TCP_SYN_SENT:
/* Unacceptable ACK during the handshake: reset and drop. */
3555 if (!tcp_ack(sk, th, saddr, len)) {
3556 tcp_reset(daddr, saddr, th,
3557 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
3558 kfree_skb(skb, FREE_READ);
3559 release_sock(sk);
3560 return(0);
3564 * If the syn bit is also set, switch to
3565 * tcp_syn_recv, and then to established.
3567 if (!th->syn) {
3568 kfree_skb(skb, FREE_READ);
3569 release_sock(sk);
3570 return(0);
3573 /* Ack the syn and fall through. */
3574 sk->acked_seq = th->seq+1;
3575 sk->fin_seq = th->seq;
3576 tcp_send_ack(sk->send_seq, th->seq+1,
3577 sk, th, sk->daddr);
3579 case TCP_SYN_RECV:
3580 if (!tcp_ack(sk, th, saddr, len)) {
3581 tcp_reset(daddr, saddr, th,
3582 sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
3583 kfree_skb(skb, FREE_READ);
3584 release_sock(sk);
3585 return(0);
/* Handshake complete: connection established. */
3587 sk->state = TCP_ESTABLISHED;
3590 * Now we need to finish filling out
3591 * some of the tcp header.
3593 /* We need to check for mtu info. */
3594 tcp_options(sk, th);
3595 sk->dummy_th.dest = th->source;
3596 sk->copied_seq = sk->acked_seq-1;
3597 if (!sk->dead) {
3598 sk->state_change(sk);
3602 * We've already processed his first
3603 * ack. In just about all cases that
3604 * will have set max_window. This is
3605 * to protect us against the possibility
3606 * that the initial window he sent was 0.
3607 * This must occur after tcp_options, which
3608 * sets sk->mtu.
3610 if (sk->max_window == 0) {
3611 sk->max_window = 32;
3612 sk->mss = min(sk->max_window, sk->mtu);
3616 * Now process the rest like we were
3617 * already in the established state.
3619 if (th->urg) {
3620 if (tcp_urg(sk, th, saddr)) {
3621 kfree_skb(skb, FREE_READ);
3622 release_sock(sk);
3623 return(0);
3626 if (tcp_data(skb, sk, saddr, len))
3627 kfree_skb(skb, FREE_READ);
3629 if (th->fin) tcp_fin(sk, th, saddr, dev);
3630 release_sock(sk);
3631 return(0);
/* Fall-through tail shared by the remaining states: urg, data, fin. */
3634 if (th->urg) {
3635 if (tcp_urg(sk, th, saddr)) {
3636 kfree_skb(skb, FREE_READ);
3637 release_sock(sk);
3638 return(0);
3642 if (tcp_data(skb, sk, saddr, len)) {
3643 kfree_skb(skb, FREE_READ);
3644 release_sock(sk);
3645 return(0);
3648 if (!th->fin) {
3649 release_sock(sk);
3650 return(0);
3652 tcp_fin(sk, th, saddr, dev);
3653 release_sock(sk);
3654 return(0);
3660 * This routine sends a packet with an out of date sequence
3661 * number. It assumes the other end will try to ack it.
3663 static void
3664 tcp_write_wakeup(struct sock *sk)
3666 struct sk_buff *buff;
3667 struct tcphdr *t1;
3668 struct device *dev=NULL;
3669 int tmp;
3671 if (sk->zapped)
3672 return; /* Afer a valid reset we can send no more */
3674 if (sk -> state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT) return;
3676 buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
3677 if (buff == NULL) return;
3679 buff->mem_addr = buff;
3680 buff->mem_len = MAX_ACK_SIZE;
3681 buff->len = sizeof(struct tcphdr);
3682 buff->free = 1;
3683 buff->sk = sk;
3684 DPRINTF((DBG_TCP, "in tcp_write_wakeup\n"));
3685 t1 = (struct tcphdr *) buff->data;
3687 /* Put in the IP header and routing stuff. */
3688 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
3689 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
3690 if (tmp < 0) {
3691 sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
3692 return;
3695 buff->len += tmp;
3696 t1 = (struct tcphdr *)((char *)t1 +tmp);
3698 memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
3701 * Use a previous sequence.
3702 * This should cause the other end to send an ack.
3704 t1->seq = ntohl(sk->send_seq-1);
3705 t1->ack = 1;
3706 t1->res1= 0;
3707 t1->res2= 0;
3708 t1->rst = 0;
3709 t1->urg = 0;
3710 t1->psh = 0;
3711 t1->fin = 0;
3712 t1->syn = 0;
3713 t1->ack_seq = ntohl(sk->acked_seq);
3714 t1->window = ntohs(tcp_select_window(sk)/*sk->prot->rspace(sk)*/);
3715 t1->doff = sizeof(*t1)/4;
3716 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
3718 /* Send it and free it.
3719 * This will prevent the timer from automatically being restarted.
3721 sk->prot->queue_xmit(sk, dev, buff, 1);
3725 * This routine probes a zero window. It makes a copy of the first
3726 * packet in the write queue, but with just one byte of data.
3728 void
3729 tcp_send_probe0(struct sock *sk)
3731 unsigned char *raw;
3732 struct iphdr *iph;
3733 struct sk_buff *skb2, *skb;
3734 int len, hlen, data;
3735 struct tcphdr *t1;
3736 struct device *dev;
3738 if (sk->zapped)
3739 return; /* Afer a valid reset we can send no more */
3741 if (sk -> state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT &&
3742 sk -> state != TCP_FIN_WAIT1 && sk->state != TCP_FIN_WAIT2)
3743 return;
3745 skb = sk->wfront;
3746 if (skb == NULL)
3747 return;
3749 dev = skb->dev;
3750 /* I know this can't happen but as it does.. */
3751 if(dev==NULL)
3753 printk("tcp_send_probe0: NULL device bug!\n");
3754 return;
3756 IS_SKB(skb);
3758 raw = skb->data;
3759 iph = (struct iphdr *) (raw + dev->hard_header_len);
3761 hlen = (iph->ihl * sizeof(unsigned long)) + dev->hard_header_len;
3762 data = skb->len - hlen - sizeof(struct tcphdr);
3763 len = hlen + sizeof(struct tcphdr) + (data ? 1 : 0);
3765 /* Allocate buffer. */
3766 if ((skb2 = alloc_skb(sizeof(struct sk_buff) + len, GFP_ATOMIC)) == NULL) {
3767 /* printk("alloc failed raw %x th %x hlen %d data %d len %d\n",
3768 raw, skb->h.th, hlen, data, len); */
3769 reset_timer (sk, TIME_PROBE0, 10); /* try again real soon */
3770 return;
3773 skb2->arp = skb->arp;
3774 skb2->len = len;
3775 skb2->h.raw = (char *)(skb2->data);
3777 sk->wmem_alloc += skb2->mem_len;
3779 /* Copy the packet header into the new buffer. */
3780 memcpy(skb2->h.raw, raw, len);
3782 skb2->h.raw += hlen; /* it's now h.th -- pointer to the tcp header */
3783 t1 = skb2->h.th;
3785 /* source, dest, seq, from existing packet */
3786 t1->ack_seq = ntohl(sk->acked_seq);
3787 t1->res1 = 0;
3788 /* doff, fin, from existing packet. Fin is safe because Linux always
3789 * sends fin in a separate packet
3790 * syn, rst, had better be zero in original */
3791 t1->ack = 1;
3792 t1->urg = 0; /* urgent pointer might be beyond this fragment */
3793 t1->res2 = 0;
3794 t1->window = ntohs(tcp_select_window(sk)/*sk->prot->rspace(sk)*/);
3795 t1->urg_ptr = 0;
3796 tcp_send_check(t1, sk->saddr, sk->daddr, len - hlen, sk);
3797 /* Send it and free it.
3798 * This will prevent the timer from automatically being restarted.
3800 sk->prot->queue_xmit(sk, dev, skb2, 1);
3801 sk->backoff++;
3803 * in the case of retransmissions, there's good reason to limit
3804 * rto to 120 sec, as that's the maximum legal RTT on the Internet.
3805 * For probes it could reasonably be longer. However making it
3806 * much longer could cause unacceptable delays in some situation,
3807 * so we might as well use the same value
3809 sk->rto = min(sk->rto << 1, 120*HZ);
3810 reset_timer (sk, TIME_PROBE0, sk->rto);
3811 sk->retransmits++;
3812 sk->prot->retransmits ++;
3816 * Socket option code for TCP.
3819 int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen)
3821 int val,err;
3823 if(level!=SOL_TCP)
3824 return ip_setsockopt(sk,level,optname,optval,optlen);
3826 if (optval == NULL)
3827 return(-EINVAL);
3829 err=verify_area(VERIFY_READ, optval, sizeof(int));
3830 if(err)
3831 return err;
3833 val = get_fs_long((unsigned long *)optval);
3835 switch(optname)
3837 case TCP_MAXSEG:
3838 /* if(val<200||val>2048 || val>sk->mtu) */
3840 * values greater than interface MTU won't take effect. however at
3841 * the point when this call is done we typically don't yet know
3842 * which interface is going to be used
3844 if(val<1||val>MAX_WINDOW)
3845 return -EINVAL;
3846 sk->user_mss=val;
3847 return 0;
3848 case TCP_NODELAY:
3849 sk->nonagle=(val==0)?0:1;
3850 return 0;
3851 default:
3852 return(-ENOPROTOOPT);
3856 int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen)
3858 int val,err;
3860 if(level!=SOL_TCP)
3861 return ip_getsockopt(sk,level,optname,optval,optlen);
3863 switch(optname)
3865 case TCP_MAXSEG:
3866 val=sk->user_mss;
3867 break;
3868 case TCP_NODELAY:
3869 val=sk->nonagle; /* Until Johannes stuff is in */
3870 break;
3871 default:
3872 return(-ENOPROTOOPT);
3874 err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
3875 if(err)
3876 return err;
3877 put_fs_long(sizeof(int),(unsigned long *) optlen);
3879 err=verify_area(VERIFY_WRITE, optval, sizeof(int));
3880 if(err)
3881 return err;
3882 put_fs_long(val,(unsigned long *)optval);
3884 return(0);
3888 struct proto tcp_prot = {
3889 sock_wmalloc,
3890 sock_rmalloc,
3891 sock_wfree,
3892 sock_rfree,
3893 sock_rspace,
3894 sock_wspace,
3895 tcp_close,
3896 tcp_read,
3897 tcp_write,
3898 tcp_sendto,
3899 tcp_recvfrom,
3900 ip_build_header,
3901 tcp_connect,
3902 tcp_accept,
3903 ip_queue_xmit,
3904 tcp_retransmit,
3905 tcp_write_wakeup,
3906 tcp_read_wakeup,
3907 tcp_rcv,
3908 tcp_select,
3909 tcp_ioctl,
3910 NULL,
3911 tcp_shutdown,
3912 tcp_setsockopt,
3913 tcp_getsockopt,
3914 128,
3916 {NULL,},
3917 "TCP"