[tcp] Use a dedicated timer for the TIME_WAIT state
[gpxe.git] / src / net / tcp.c
blobd703afa77fcc3dcd3fa4fc8f32b19aa18a556ca2
1 #include <string.h>
2 #include <stdlib.h>
3 #include <stdio.h>
4 #include <assert.h>
5 #include <errno.h>
6 #include <byteswap.h>
7 #include <gpxe/timer.h>
8 #include <gpxe/iobuf.h>
9 #include <gpxe/malloc.h>
10 #include <gpxe/retry.h>
11 #include <gpxe/refcnt.h>
12 #include <gpxe/xfer.h>
13 #include <gpxe/open.h>
14 #include <gpxe/uri.h>
15 #include <gpxe/tcpip.h>
16 #include <gpxe/tcp.h>
/** @file
 *
 * TCP protocol
 *
 */
24 FILE_LICENCE ( GPL2_OR_LATER );
26 /** A TCP connection */
27 struct tcp_connection {
28 /** Reference counter */
29 struct refcnt refcnt;
30 /** List of TCP connections */
31 struct list_head list;
33 /** Data transfer interface */
34 struct xfer_interface xfer;
35 /** Data transfer interface closed flag */
36 int xfer_closed;
38 /** Remote socket address */
39 struct sockaddr_tcpip peer;
40 /** Local port */
41 unsigned int local_port;
43 /** Current TCP state */
44 unsigned int tcp_state;
45 /** Previous TCP state
47 * Maintained only for debug messages
49 unsigned int prev_tcp_state;
50 /** Current sequence number
52 * Equivalent to SND.UNA in RFC 793 terminology.
54 uint32_t snd_seq;
55 /** Unacknowledged sequence count
57 * Equivalent to (SND.NXT-SND.UNA) in RFC 793 terminology.
59 uint32_t snd_sent;
60 /** Send window
62 * Equivalent to SND.WND in RFC 793 terminology
64 uint32_t snd_win;
65 /** Current acknowledgement number
67 * Equivalent to RCV.NXT in RFC 793 terminology.
69 uint32_t rcv_ack;
70 /** Receive window
72 * Equivalent to RCV.WND in RFC 793 terminology.
74 uint32_t rcv_win;
75 /** Most recent received timestamp
77 * Equivalent to TS.Recent in RFC 1323 terminology.
79 uint32_t ts_recent;
80 /** Timestamps enabled */
81 int timestamps;
83 /** Transmit queue */
84 struct list_head queue;
85 /** Retransmission timer */
86 struct retry_timer timer;
87 /** Shutdown (TIME_WAIT) timer */
88 struct retry_timer wait;
91 /**
92 * List of registered TCP connections
94 static LIST_HEAD ( tcp_conns );
96 /* Forward declarations */
97 static struct xfer_interface_operations tcp_xfer_operations;
98 static void tcp_expired ( struct retry_timer *timer, int over );
99 static void tcp_wait_expired ( struct retry_timer *timer, int over );
100 static int tcp_rx_ack ( struct tcp_connection *tcp, uint32_t ack,
101 uint32_t win );
104 * Name TCP state
106 * @v state TCP state
107 * @ret name Name of TCP state
109 static inline __attribute__ (( always_inline )) const char *
110 tcp_state ( int state ) {
111 switch ( state ) {
112 case TCP_CLOSED: return "CLOSED";
113 case TCP_LISTEN: return "LISTEN";
114 case TCP_SYN_SENT: return "SYN_SENT";
115 case TCP_SYN_RCVD: return "SYN_RCVD";
116 case TCP_ESTABLISHED: return "ESTABLISHED";
117 case TCP_FIN_WAIT_1: return "FIN_WAIT_1";
118 case TCP_FIN_WAIT_2: return "FIN_WAIT_2";
119 case TCP_CLOSING_OR_LAST_ACK: return "CLOSING/LAST_ACK";
120 case TCP_TIME_WAIT: return "TIME_WAIT";
121 case TCP_CLOSE_WAIT: return "CLOSE_WAIT";
122 default: return "INVALID";
127 * Dump TCP state transition
129 * @v tcp TCP connection
131 static inline __attribute__ (( always_inline )) void
132 tcp_dump_state ( struct tcp_connection *tcp ) {
134 if ( tcp->tcp_state != tcp->prev_tcp_state ) {
135 DBGC ( tcp, "TCP %p transitioned from %s to %s\n", tcp,
136 tcp_state ( tcp->prev_tcp_state ),
137 tcp_state ( tcp->tcp_state ) );
139 tcp->prev_tcp_state = tcp->tcp_state;
143 * Dump TCP flags
145 * @v flags TCP flags
147 static inline __attribute__ (( always_inline )) void
148 tcp_dump_flags ( struct tcp_connection *tcp, unsigned int flags ) {
149 if ( flags & TCP_RST )
150 DBGC2 ( tcp, " RST" );
151 if ( flags & TCP_SYN )
152 DBGC2 ( tcp, " SYN" );
153 if ( flags & TCP_PSH )
154 DBGC2 ( tcp, " PSH" );
155 if ( flags & TCP_FIN )
156 DBGC2 ( tcp, " FIN" );
157 if ( flags & TCP_ACK )
158 DBGC2 ( tcp, " ACK" );
/***************************************************************************
 *
 * Open and close
 *
 ***************************************************************************
 */
169 * Bind TCP connection to local port
171 * @v tcp TCP connection
172 * @v port Local port number
173 * @ret rc Return status code
175 * If the port is 0, the connection is assigned an available port
176 * between 1024 and 65535.
178 static int tcp_bind ( struct tcp_connection *tcp, unsigned int port ) {
179 struct tcp_connection *existing;
180 uint16_t try_port;
181 int i;
183 /* If no port specified, find an available port */
184 if ( ! port ) {
185 try_port = ( random() % 64512 ) + 1023;
186 for ( i = 0 ; i < 65536 ; ++i ) {
187 if ( ++try_port < 1024 )
188 continue;
189 if ( tcp_bind ( tcp, try_port ) == 0 )
190 return 0;
192 DBGC ( tcp, "TCP %p could not bind: no free ports\n", tcp );
193 return -EADDRINUSE;
196 /* Attempt bind to local port */
197 list_for_each_entry ( existing, &tcp_conns, list ) {
198 if ( existing->local_port == port ) {
199 DBGC ( tcp, "TCP %p could not bind: port %d in use\n",
200 tcp, port );
201 return -EADDRINUSE;
204 tcp->local_port = port;
206 DBGC ( tcp, "TCP %p bound to port %d\n", tcp, port );
207 return 0;
211 * Open a TCP connection
213 * @v xfer Data transfer interface
214 * @v peer Peer socket address
215 * @v local Local socket address, or NULL
216 * @ret rc Return status code
218 static int tcp_open ( struct xfer_interface *xfer, struct sockaddr *peer,
219 struct sockaddr *local ) {
220 struct sockaddr_tcpip *st_peer = ( struct sockaddr_tcpip * ) peer;
221 struct sockaddr_tcpip *st_local = ( struct sockaddr_tcpip * ) local;
222 struct tcp_connection *tcp;
223 unsigned int bind_port;
224 int rc;
226 /* Allocate and initialise structure */
227 tcp = zalloc ( sizeof ( *tcp ) );
228 if ( ! tcp )
229 return -ENOMEM;
230 DBGC ( tcp, "TCP %p allocated\n", tcp );
231 ref_init ( &tcp->refcnt, NULL );
232 xfer_init ( &tcp->xfer, &tcp_xfer_operations, &tcp->refcnt );
233 timer_init ( &tcp->timer, tcp_expired );
234 timer_init ( &tcp->wait, tcp_wait_expired );
235 tcp->prev_tcp_state = TCP_CLOSED;
236 tcp->tcp_state = TCP_STATE_SENT ( TCP_SYN );
237 tcp_dump_state ( tcp );
238 tcp->snd_seq = random();
239 INIT_LIST_HEAD ( &tcp->queue );
240 memcpy ( &tcp->peer, st_peer, sizeof ( tcp->peer ) );
242 /* Bind to local port */
243 bind_port = ( st_local ? ntohs ( st_local->st_port ) : 0 );
244 if ( ( rc = tcp_bind ( tcp, bind_port ) ) != 0 )
245 goto err;
247 /* Start timer to initiate SYN */
248 start_timer_nodelay ( &tcp->timer );
250 /* Attach parent interface, transfer reference to connection
251 * list and return
253 xfer_plug_plug ( &tcp->xfer, xfer );
254 list_add ( &tcp->list, &tcp_conns );
255 return 0;
257 err:
258 ref_put ( &tcp->refcnt );
259 return rc;
263 * Close TCP connection
265 * @v tcp TCP connection
266 * @v rc Reason for close
268 * Closes the data transfer interface. If the TCP state machine is in
269 * a suitable state, the connection will be deleted.
271 static void tcp_close ( struct tcp_connection *tcp, int rc ) {
272 struct io_buffer *iobuf;
273 struct io_buffer *tmp;
275 /* Close data transfer interface */
276 xfer_nullify ( &tcp->xfer );
277 xfer_close ( &tcp->xfer, rc );
278 tcp->xfer_closed = 1;
280 /* If we are in CLOSED, or have otherwise not yet received a
281 * SYN (i.e. we are in LISTEN or SYN_SENT), just delete the
282 * connection.
284 if ( ! ( tcp->tcp_state & TCP_STATE_RCVD ( TCP_SYN ) ) ) {
286 /* Transition to CLOSED for the sake of debugging messages */
287 tcp->tcp_state = TCP_CLOSED;
288 tcp_dump_state ( tcp );
290 /* Free any unsent I/O buffers */
291 list_for_each_entry_safe ( iobuf, tmp, &tcp->queue, list ) {
292 list_del ( &iobuf->list );
293 free_iob ( iobuf );
296 /* Remove from list and drop reference */
297 stop_timer ( &tcp->timer );
298 list_del ( &tcp->list );
299 ref_put ( &tcp->refcnt );
300 DBGC ( tcp, "TCP %p connection deleted\n", tcp );
301 return;
304 /* If we have not had our SYN acknowledged (i.e. we are in
305 * SYN_RCVD), pretend that it has been acknowledged so that we
306 * can send a FIN without breaking things.
308 if ( ! ( tcp->tcp_state & TCP_STATE_ACKED ( TCP_SYN ) ) )
309 tcp_rx_ack ( tcp, ( tcp->snd_seq + 1 ), 0 );
311 /* If we have no data remaining to send, start sending FIN */
312 if ( list_empty ( &tcp->queue ) ) {
313 tcp->tcp_state |= TCP_STATE_SENT ( TCP_FIN );
314 tcp_dump_state ( tcp );
/***************************************************************************
 *
 * Transmit data path
 *
 ***************************************************************************
 */
326 * Calculate transmission window
328 * @v tcp TCP connection
329 * @ret len Maximum length that can be sent in a single packet
331 static size_t tcp_xmit_win ( struct tcp_connection *tcp ) {
332 size_t len;
334 /* Not ready if we're not in a suitable connection state */
335 if ( ! TCP_CAN_SEND_DATA ( tcp->tcp_state ) )
336 return 0;
338 /* Length is the minimum of the receiver's window and the path MTU */
339 len = tcp->snd_win;
340 if ( len > TCP_PATH_MTU )
341 len = TCP_PATH_MTU;
343 return len;
347 * Process TCP transmit queue
349 * @v tcp TCP connection
350 * @v max_len Maximum length to process
351 * @v dest I/O buffer to fill with data, or NULL
352 * @v remove Remove data from queue
353 * @ret len Length of data processed
355 * This processes at most @c max_len bytes from the TCP connection's
356 * transmit queue. Data will be copied into the @c dest I/O buffer
357 * (if provided) and, if @c remove is true, removed from the transmit
358 * queue.
360 static size_t tcp_process_queue ( struct tcp_connection *tcp, size_t max_len,
361 struct io_buffer *dest, int remove ) {
362 struct io_buffer *iobuf;
363 struct io_buffer *tmp;
364 size_t frag_len;
365 size_t len = 0;
367 list_for_each_entry_safe ( iobuf, tmp, &tcp->queue, list ) {
368 frag_len = iob_len ( iobuf );
369 if ( frag_len > max_len )
370 frag_len = max_len;
371 if ( dest ) {
372 memcpy ( iob_put ( dest, frag_len ), iobuf->data,
373 frag_len );
375 if ( remove ) {
376 iob_pull ( iobuf, frag_len );
377 if ( ! iob_len ( iobuf ) ) {
378 list_del ( &iobuf->list );
379 free_iob ( iobuf );
382 len += frag_len;
383 max_len -= frag_len;
385 return len;
389 * Transmit any outstanding data
391 * @v tcp TCP connection
392 * @v force_send Force sending of packet
394 * Transmits any outstanding data on the connection.
396 * Note that even if an error is returned, the retransmission timer
397 * will have been started if necessary, and so the stack will
398 * eventually attempt to retransmit the failed packet.
400 static int tcp_xmit ( struct tcp_connection *tcp, int force_send ) {
401 struct io_buffer *iobuf;
402 struct tcp_header *tcphdr;
403 struct tcp_mss_option *mssopt;
404 struct tcp_timestamp_padded_option *tsopt;
405 void *payload;
406 unsigned int flags;
407 size_t len = 0;
408 uint32_t seq_len;
409 uint32_t app_win;
410 uint32_t max_rcv_win;
411 int rc;
413 /* If retransmission timer is already running, do nothing */
414 if ( timer_running ( &tcp->timer ) )
415 return 0;
417 /* Calculate both the actual (payload) and sequence space
418 * lengths that we wish to transmit.
420 if ( TCP_CAN_SEND_DATA ( tcp->tcp_state ) ) {
421 len = tcp_process_queue ( tcp, tcp_xmit_win ( tcp ),
422 NULL, 0 );
424 seq_len = len;
425 flags = TCP_FLAGS_SENDING ( tcp->tcp_state );
426 if ( flags & ( TCP_SYN | TCP_FIN ) ) {
427 /* SYN or FIN consume one byte, and we can never send both */
428 assert ( ! ( ( flags & TCP_SYN ) && ( flags & TCP_FIN ) ) );
429 seq_len++;
431 tcp->snd_sent = seq_len;
433 /* If we have nothing to transmit, stop now */
434 if ( ( seq_len == 0 ) && ! force_send )
435 return 0;
437 /* If we are transmitting anything that requires
438 * acknowledgement (i.e. consumes sequence space), start the
439 * retransmission timer. Do this before attempting to
440 * allocate the I/O buffer, in case allocation itself fails.
442 if ( seq_len )
443 start_timer ( &tcp->timer );
445 /* Allocate I/O buffer */
446 iobuf = alloc_iob ( len + MAX_HDR_LEN );
447 if ( ! iobuf ) {
448 DBGC ( tcp, "TCP %p could not allocate iobuf for %08x..%08x "
449 "%08x\n", tcp, tcp->snd_seq, ( tcp->snd_seq + seq_len ),
450 tcp->rcv_ack );
451 return -ENOMEM;
453 iob_reserve ( iobuf, MAX_HDR_LEN );
455 /* Fill data payload from transmit queue */
456 tcp_process_queue ( tcp, len, iobuf, 0 );
458 /* Expand receive window if possible */
459 max_rcv_win = ( ( freemem * 3 ) / 4 );
460 if ( max_rcv_win > TCP_MAX_WINDOW_SIZE )
461 max_rcv_win = TCP_MAX_WINDOW_SIZE;
462 app_win = xfer_window ( &tcp->xfer );
463 if ( max_rcv_win > app_win )
464 max_rcv_win = app_win;
465 max_rcv_win &= ~0x03; /* Keep everything dword-aligned */
466 if ( tcp->rcv_win < max_rcv_win )
467 tcp->rcv_win = max_rcv_win;
469 /* Fill up the TCP header */
470 payload = iobuf->data;
471 if ( flags & TCP_SYN ) {
472 mssopt = iob_push ( iobuf, sizeof ( *mssopt ) );
473 mssopt->kind = TCP_OPTION_MSS;
474 mssopt->length = sizeof ( *mssopt );
475 mssopt->mss = htons ( TCP_MSS );
477 if ( ( flags & TCP_SYN ) || tcp->timestamps ) {
478 tsopt = iob_push ( iobuf, sizeof ( *tsopt ) );
479 memset ( tsopt->nop, TCP_OPTION_NOP, sizeof ( tsopt->nop ) );
480 tsopt->tsopt.kind = TCP_OPTION_TS;
481 tsopt->tsopt.length = sizeof ( tsopt->tsopt );
482 tsopt->tsopt.tsval = htonl ( currticks() );
483 tsopt->tsopt.tsecr = htonl ( tcp->ts_recent );
485 if ( ! ( flags & TCP_SYN ) )
486 flags |= TCP_PSH;
487 tcphdr = iob_push ( iobuf, sizeof ( *tcphdr ) );
488 memset ( tcphdr, 0, sizeof ( *tcphdr ) );
489 tcphdr->src = htons ( tcp->local_port );
490 tcphdr->dest = tcp->peer.st_port;
491 tcphdr->seq = htonl ( tcp->snd_seq );
492 tcphdr->ack = htonl ( tcp->rcv_ack );
493 tcphdr->hlen = ( ( payload - iobuf->data ) << 2 );
494 tcphdr->flags = flags;
495 tcphdr->win = htons ( tcp->rcv_win );
496 tcphdr->csum = tcpip_chksum ( iobuf->data, iob_len ( iobuf ) );
498 /* Dump header */
499 DBGC2 ( tcp, "TCP %p TX %d->%d %08x..%08x %08x %4zd",
500 tcp, ntohs ( tcphdr->src ), ntohs ( tcphdr->dest ),
501 ntohl ( tcphdr->seq ), ( ntohl ( tcphdr->seq ) + seq_len ),
502 ntohl ( tcphdr->ack ), len );
503 tcp_dump_flags ( tcp, tcphdr->flags );
504 DBGC2 ( tcp, "\n" );
506 /* Transmit packet */
507 if ( ( rc = tcpip_tx ( iobuf, &tcp_protocol, NULL, &tcp->peer, NULL,
508 &tcphdr->csum ) ) != 0 ) {
509 DBGC ( tcp, "TCP %p could not transmit %08x..%08x %08x: %s\n",
510 tcp, tcp->snd_seq, ( tcp->snd_seq + tcp->snd_sent ),
511 tcp->rcv_ack, strerror ( rc ) );
512 return rc;
515 return 0;
519 * Retransmission timer expired
521 * @v timer Retransmission timer
522 * @v over Failure indicator
524 static void tcp_expired ( struct retry_timer *timer, int over ) {
525 struct tcp_connection *tcp =
526 container_of ( timer, struct tcp_connection, timer );
528 DBGC ( tcp, "TCP %p timer %s in %s for %08x..%08x %08x\n", tcp,
529 ( over ? "expired" : "fired" ), tcp_state ( tcp->tcp_state ),
530 tcp->snd_seq, ( tcp->snd_seq + tcp->snd_sent ), tcp->rcv_ack );
532 assert ( ( tcp->tcp_state == TCP_SYN_SENT ) ||
533 ( tcp->tcp_state == TCP_SYN_RCVD ) ||
534 ( tcp->tcp_state == TCP_ESTABLISHED ) ||
535 ( tcp->tcp_state == TCP_FIN_WAIT_1 ) ||
536 ( tcp->tcp_state == TCP_CLOSE_WAIT ) ||
537 ( tcp->tcp_state == TCP_CLOSING_OR_LAST_ACK ) );
539 if ( over ) {
540 /* If we have finally timed out and given up,
541 * terminate the connection
543 tcp->tcp_state = TCP_CLOSED;
544 tcp_dump_state ( tcp );
545 tcp_close ( tcp, -ETIMEDOUT );
546 } else {
547 /* Otherwise, retransmit the packet */
548 tcp_xmit ( tcp, 0 );
553 * Shutdown timer expired
555 * @v timer Shutdown timer
556 * @v over Failure indicator
558 static void tcp_wait_expired ( struct retry_timer *timer, int over __unused ) {
559 struct tcp_connection *tcp =
560 container_of ( timer, struct tcp_connection, wait );
562 assert ( tcp->tcp_state == TCP_TIME_WAIT );
564 DBGC ( tcp, "TCP %p wait complete in %s for %08x..%08x %08x\n", tcp,
565 tcp_state ( tcp->tcp_state ), tcp->snd_seq,
566 ( tcp->snd_seq + tcp->snd_sent ), tcp->rcv_ack );
568 tcp->tcp_state = TCP_CLOSED;
569 tcp_dump_state ( tcp );
570 tcp_close ( tcp, 0 );
574 * Send RST response to incoming packet
576 * @v in_tcphdr TCP header of incoming packet
577 * @ret rc Return status code
579 static int tcp_xmit_reset ( struct tcp_connection *tcp,
580 struct sockaddr_tcpip *st_dest,
581 struct tcp_header *in_tcphdr ) {
582 struct io_buffer *iobuf;
583 struct tcp_header *tcphdr;
584 int rc;
586 /* Allocate space for dataless TX buffer */
587 iobuf = alloc_iob ( MAX_HDR_LEN );
588 if ( ! iobuf ) {
589 DBGC ( tcp, "TCP %p could not allocate iobuf for RST "
590 "%08x..%08x %08x\n", tcp, ntohl ( in_tcphdr->ack ),
591 ntohl ( in_tcphdr->ack ), ntohl ( in_tcphdr->seq ) );
592 return -ENOMEM;
594 iob_reserve ( iobuf, MAX_HDR_LEN );
596 /* Construct RST response */
597 tcphdr = iob_push ( iobuf, sizeof ( *tcphdr ) );
598 memset ( tcphdr, 0, sizeof ( *tcphdr ) );
599 tcphdr->src = in_tcphdr->dest;
600 tcphdr->dest = in_tcphdr->src;
601 tcphdr->seq = in_tcphdr->ack;
602 tcphdr->ack = in_tcphdr->seq;
603 tcphdr->hlen = ( ( sizeof ( *tcphdr ) / 4 ) << 4 );
604 tcphdr->flags = ( TCP_RST | TCP_ACK );
605 tcphdr->win = htons ( TCP_MAX_WINDOW_SIZE );
606 tcphdr->csum = tcpip_chksum ( iobuf->data, iob_len ( iobuf ) );
608 /* Dump header */
609 DBGC2 ( tcp, "TCP %p TX %d->%d %08x..%08x %08x %4d",
610 tcp, ntohs ( tcphdr->src ), ntohs ( tcphdr->dest ),
611 ntohl ( tcphdr->seq ), ( ntohl ( tcphdr->seq ) ),
612 ntohl ( tcphdr->ack ), 0 );
613 tcp_dump_flags ( tcp, tcphdr->flags );
614 DBGC2 ( tcp, "\n" );
616 /* Transmit packet */
617 if ( ( rc = tcpip_tx ( iobuf, &tcp_protocol, NULL, st_dest,
618 NULL, &tcphdr->csum ) ) != 0 ) {
619 DBGC ( tcp, "TCP %p could not transmit RST %08x..%08x %08x: "
620 "%s\n", tcp, ntohl ( in_tcphdr->ack ),
621 ntohl ( in_tcphdr->ack ), ntohl ( in_tcphdr->seq ),
622 strerror ( rc ) );
623 return rc;
626 return 0;
/***************************************************************************
 *
 * Receive data path
 *
 ***************************************************************************
 */
637 * Identify TCP connection by local port number
639 * @v local_port Local port
640 * @ret tcp TCP connection, or NULL
642 static struct tcp_connection * tcp_demux ( unsigned int local_port ) {
643 struct tcp_connection *tcp;
645 list_for_each_entry ( tcp, &tcp_conns, list ) {
646 if ( tcp->local_port == local_port )
647 return tcp;
649 return NULL;
653 * Parse TCP received options
655 * @v tcp TCP connection
656 * @v data Raw options data
657 * @v len Raw options length
658 * @v options Options structure to fill in
660 static void tcp_rx_opts ( struct tcp_connection *tcp, const void *data,
661 size_t len, struct tcp_options *options ) {
662 const void *end = ( data + len );
663 const struct tcp_option *option;
664 unsigned int kind;
666 memset ( options, 0, sizeof ( *options ) );
667 while ( data < end ) {
668 option = data;
669 kind = option->kind;
670 if ( kind == TCP_OPTION_END )
671 return;
672 if ( kind == TCP_OPTION_NOP ) {
673 data++;
674 continue;
676 switch ( kind ) {
677 case TCP_OPTION_MSS:
678 options->mssopt = data;
679 break;
680 case TCP_OPTION_TS:
681 options->tsopt = data;
682 break;
683 default:
684 DBGC ( tcp, "TCP %p received unknown option %d\n",
685 tcp, kind );
686 break;
688 data += option->length;
693 * Consume received sequence space
695 * @v tcp TCP connection
696 * @v seq_len Sequence space length to consume
698 static void tcp_rx_seq ( struct tcp_connection *tcp, uint32_t seq_len ) {
699 tcp->rcv_ack += seq_len;
700 if ( tcp->rcv_win > seq_len ) {
701 tcp->rcv_win -= seq_len;
702 } else {
703 tcp->rcv_win = 0;
708 * Handle TCP received SYN
710 * @v tcp TCP connection
711 * @v seq SEQ value (in host-endian order)
712 * @v options TCP options
713 * @ret rc Return status code
715 static int tcp_rx_syn ( struct tcp_connection *tcp, uint32_t seq,
716 struct tcp_options *options ) {
718 /* Synchronise sequence numbers on first SYN */
719 if ( ! ( tcp->tcp_state & TCP_STATE_RCVD ( TCP_SYN ) ) ) {
720 tcp->rcv_ack = seq;
721 if ( options->tsopt )
722 tcp->timestamps = 1;
725 /* Ignore duplicate SYN */
726 if ( ( tcp->rcv_ack - seq ) > 0 )
727 return 0;
729 /* Acknowledge SYN */
730 tcp_rx_seq ( tcp, 1 );
732 /* Mark SYN as received and start sending ACKs with each packet */
733 tcp->tcp_state |= ( TCP_STATE_SENT ( TCP_ACK ) |
734 TCP_STATE_RCVD ( TCP_SYN ) );
736 return 0;
740 * Handle TCP received ACK
742 * @v tcp TCP connection
743 * @v ack ACK value (in host-endian order)
744 * @v win WIN value (in host-endian order)
745 * @ret rc Return status code
747 static int tcp_rx_ack ( struct tcp_connection *tcp, uint32_t ack,
748 uint32_t win ) {
749 uint32_t ack_len = ( ack - tcp->snd_seq );
750 size_t len;
751 unsigned int acked_flags;
753 /* Check for out-of-range or old duplicate ACKs */
754 if ( ack_len > tcp->snd_sent ) {
755 DBGC ( tcp, "TCP %p received ACK for %08x..%08x, "
756 "sent only %08x..%08x\n", tcp, tcp->snd_seq,
757 ( tcp->snd_seq + ack_len ), tcp->snd_seq,
758 ( tcp->snd_seq + tcp->snd_sent ) );
760 if ( TCP_HAS_BEEN_ESTABLISHED ( tcp->tcp_state ) ) {
761 /* Just ignore what might be old duplicate ACKs */
762 return 0;
763 } else {
764 /* Send RST if an out-of-range ACK is received
765 * on a not-yet-established connection, as per
766 * RFC 793.
768 return -EINVAL;
772 /* Ignore ACKs that don't actually acknowledge any new data.
773 * (In particular, do not stop the retransmission timer; this
774 * avoids creating a sorceror's apprentice syndrome when a
775 * duplicate ACK is received and we still have data in our
776 * transmit queue.)
778 if ( ack_len == 0 )
779 return 0;
781 /* Stop the retransmission timer */
782 stop_timer ( &tcp->timer );
784 /* Determine acknowledged flags and data length */
785 len = ack_len;
786 acked_flags = ( TCP_FLAGS_SENDING ( tcp->tcp_state ) &
787 ( TCP_SYN | TCP_FIN ) );
788 if ( acked_flags )
789 len--;
791 /* Update SEQ and sent counters, and window size */
792 tcp->snd_seq = ack;
793 tcp->snd_sent = 0;
794 tcp->snd_win = win;
796 /* Remove any acknowledged data from transmit queue */
797 tcp_process_queue ( tcp, len, NULL, 1 );
799 /* Mark SYN/FIN as acknowledged if applicable. */
800 if ( acked_flags )
801 tcp->tcp_state |= TCP_STATE_ACKED ( acked_flags );
803 /* Start sending FIN if we've had all possible data ACKed */
804 if ( list_empty ( &tcp->queue ) && tcp->xfer_closed )
805 tcp->tcp_state |= TCP_STATE_SENT ( TCP_FIN );
807 return 0;
811 * Handle TCP received data
813 * @v tcp TCP connection
814 * @v seq SEQ value (in host-endian order)
815 * @v iobuf I/O buffer
816 * @ret rc Return status code
818 * This function takes ownership of the I/O buffer.
820 static int tcp_rx_data ( struct tcp_connection *tcp, uint32_t seq,
821 struct io_buffer *iobuf ) {
822 uint32_t already_rcvd;
823 uint32_t len;
824 int rc;
826 /* Ignore duplicate or out-of-order data */
827 already_rcvd = ( tcp->rcv_ack - seq );
828 len = iob_len ( iobuf );
829 if ( already_rcvd >= len ) {
830 free_iob ( iobuf );
831 return 0;
833 iob_pull ( iobuf, already_rcvd );
834 len -= already_rcvd;
836 /* Acknowledge new data */
837 tcp_rx_seq ( tcp, len );
839 /* Deliver data to application */
840 if ( ( rc = xfer_deliver_iob ( &tcp->xfer, iobuf ) ) != 0 ) {
841 DBGC ( tcp, "TCP %p could not deliver %08x..%08x: %s\n",
842 tcp, seq, ( seq + len ), strerror ( rc ) );
843 return rc;
846 return 0;
850 * Handle TCP received FIN
852 * @v tcp TCP connection
853 * @v seq SEQ value (in host-endian order)
854 * @ret rc Return status code
856 static int tcp_rx_fin ( struct tcp_connection *tcp, uint32_t seq ) {
858 /* Ignore duplicate or out-of-order FIN */
859 if ( ( tcp->rcv_ack - seq ) > 0 )
860 return 0;
862 /* Acknowledge FIN */
863 tcp_rx_seq ( tcp, 1 );
865 /* Mark FIN as received */
866 tcp->tcp_state |= TCP_STATE_RCVD ( TCP_FIN );
868 /* Close connection */
869 tcp_close ( tcp, 0 );
871 return 0;
875 * Handle TCP received RST
877 * @v tcp TCP connection
878 * @v seq SEQ value (in host-endian order)
879 * @ret rc Return status code
881 static int tcp_rx_rst ( struct tcp_connection *tcp, uint32_t seq ) {
883 /* Accept RST only if it falls within the window. If we have
884 * not yet received a SYN, then we have no window to test
885 * against, so fall back to checking that our SYN has been
886 * ACKed.
888 if ( tcp->tcp_state & TCP_STATE_RCVD ( TCP_SYN ) ) {
889 if ( ( seq - tcp->rcv_ack ) >= tcp->rcv_win )
890 return 0;
891 } else {
892 if ( ! ( tcp->tcp_state & TCP_STATE_ACKED ( TCP_SYN ) ) )
893 return 0;
896 /* Abort connection */
897 tcp->tcp_state = TCP_CLOSED;
898 tcp_dump_state ( tcp );
899 tcp_close ( tcp, -ECONNRESET );
901 DBGC ( tcp, "TCP %p connection reset by peer\n", tcp );
902 return -ECONNRESET;
906 * Process received packet
908 * @v iobuf I/O buffer
909 * @v st_src Partially-filled source address
910 * @v st_dest Partially-filled destination address
911 * @v pshdr_csum Pseudo-header checksum
912 * @ret rc Return status code
914 static int tcp_rx ( struct io_buffer *iobuf,
915 struct sockaddr_tcpip *st_src,
916 struct sockaddr_tcpip *st_dest __unused,
917 uint16_t pshdr_csum ) {
918 struct tcp_header *tcphdr = iobuf->data;
919 struct tcp_connection *tcp;
920 struct tcp_options options;
921 size_t hlen;
922 uint16_t csum;
923 uint32_t start_seq;
924 uint32_t seq;
925 uint32_t ack;
926 uint32_t win;
927 uint32_t ts_recent;
928 unsigned int flags;
929 size_t len;
930 int rc;
932 /* Sanity check packet */
933 if ( iob_len ( iobuf ) < sizeof ( *tcphdr ) ) {
934 DBG ( "TCP packet too short at %zd bytes (min %zd bytes)\n",
935 iob_len ( iobuf ), sizeof ( *tcphdr ) );
936 rc = -EINVAL;
937 goto discard;
939 hlen = ( ( tcphdr->hlen & TCP_MASK_HLEN ) / 16 ) * 4;
940 if ( hlen < sizeof ( *tcphdr ) ) {
941 DBG ( "TCP header too short at %zd bytes (min %zd bytes)\n",
942 hlen, sizeof ( *tcphdr ) );
943 rc = -EINVAL;
944 goto discard;
946 if ( hlen > iob_len ( iobuf ) ) {
947 DBG ( "TCP header too long at %zd bytes (max %zd bytes)\n",
948 hlen, iob_len ( iobuf ) );
949 rc = -EINVAL;
950 goto discard;
952 csum = tcpip_continue_chksum ( pshdr_csum, iobuf->data,
953 iob_len ( iobuf ) );
954 if ( csum != 0 ) {
955 DBG ( "TCP checksum incorrect (is %04x including checksum "
956 "field, should be 0000)\n", csum );
957 rc = -EINVAL;
958 goto discard;
961 /* Parse parameters from header and strip header */
962 tcp = tcp_demux ( ntohs ( tcphdr->dest ) );
963 start_seq = seq = ntohl ( tcphdr->seq );
964 ack = ntohl ( tcphdr->ack );
965 win = ntohs ( tcphdr->win );
966 flags = tcphdr->flags;
967 tcp_rx_opts ( tcp, ( ( ( void * ) tcphdr ) + sizeof ( *tcphdr ) ),
968 ( hlen - sizeof ( *tcphdr ) ), &options );
969 ts_recent = ( options.tsopt ?
970 ntohl ( options.tsopt->tsval ) : tcp->ts_recent );
971 iob_pull ( iobuf, hlen );
972 len = iob_len ( iobuf );
974 /* Dump header */
975 DBGC2 ( tcp, "TCP %p RX %d<-%d %08x %08x..%08zx %4zd",
976 tcp, ntohs ( tcphdr->dest ), ntohs ( tcphdr->src ),
977 ntohl ( tcphdr->ack ), ntohl ( tcphdr->seq ),
978 ( ntohl ( tcphdr->seq ) + len +
979 ( ( tcphdr->flags & ( TCP_SYN | TCP_FIN ) ) ? 1 : 0 )), len);
980 tcp_dump_flags ( tcp, tcphdr->flags );
981 DBGC2 ( tcp, "\n" );
983 /* If no connection was found, send RST */
984 if ( ! tcp ) {
985 tcp_xmit_reset ( tcp, st_src, tcphdr );
986 rc = -ENOTCONN;
987 goto discard;
990 /* Handle ACK, if present */
991 if ( flags & TCP_ACK ) {
992 if ( ( rc = tcp_rx_ack ( tcp, ack, win ) ) != 0 ) {
993 tcp_xmit_reset ( tcp, st_src, tcphdr );
994 goto discard;
998 /* Handle SYN, if present */
999 if ( flags & TCP_SYN ) {
1000 tcp_rx_syn ( tcp, seq, &options );
1001 seq++;
1004 /* Handle RST, if present */
1005 if ( flags & TCP_RST ) {
1006 if ( ( rc = tcp_rx_rst ( tcp, seq ) ) != 0 )
1007 goto discard;
1010 /* Handle new data, if any */
1011 tcp_rx_data ( tcp, seq, iob_disown ( iobuf ) );
1012 seq += len;
1014 /* Handle FIN, if present */
1015 if ( flags & TCP_FIN ) {
1016 tcp_rx_fin ( tcp, seq );
1017 seq++;
1020 /* Update timestamp, if applicable */
1021 if ( seq == tcp->rcv_ack )
1022 tcp->ts_recent = ts_recent;
1024 /* Dump out any state change as a result of the received packet */
1025 tcp_dump_state ( tcp );
1027 /* Send out any pending data. We force sending a reply if either
1029 * a) the peer is expecting an ACK (i.e. consumed sequence space), or
1030 * b) either end of the packet was outside the receive window
1032 * Case (b) enables us to support TCP keepalives using
1033 * zero-length packets, which we would otherwise ignore. Note
1034 * that for case (b), we need *only* consider zero-length
1035 * packets, since non-zero-length packets will already be
1036 * caught by case (a).
1038 tcp_xmit ( tcp, ( ( start_seq != seq ) ||
1039 ( ( seq - tcp->rcv_ack ) > tcp->rcv_win ) ) );
1041 /* If this packet was the last we expect to receive, set up
1042 * timer to expire and cause the connection to be freed.
1044 if ( TCP_CLOSED_GRACEFULLY ( tcp->tcp_state ) ) {
1045 stop_timer ( &tcp->wait );
1046 start_timer_fixed ( &tcp->wait, ( 2 * TCP_MSL ) );
1049 return 0;
1051 discard:
1052 /* Free received packet */
1053 free_iob ( iobuf );
1054 return rc;
1057 /** TCP protocol */
1058 struct tcpip_protocol tcp_protocol __tcpip_protocol = {
1059 .name = "TCP",
1060 .rx = tcp_rx,
1061 .tcpip_proto = IP_TCP,
/***************************************************************************
 *
 * Data transfer interface
 *
 ***************************************************************************
 */
1072 * Close interface
1074 * @v xfer Data transfer interface
1075 * @v rc Reason for close
1077 static void tcp_xfer_close ( struct xfer_interface *xfer, int rc ) {
1078 struct tcp_connection *tcp =
1079 container_of ( xfer, struct tcp_connection, xfer );
1081 /* Close data transfer interface */
1082 tcp_close ( tcp, rc );
1084 /* Transmit FIN, if possible */
1085 tcp_xmit ( tcp, 0 );
1089 * Check flow control window
1091 * @v xfer Data transfer interface
1092 * @ret len Length of window
1094 static size_t tcp_xfer_window ( struct xfer_interface *xfer ) {
1095 struct tcp_connection *tcp =
1096 container_of ( xfer, struct tcp_connection, xfer );
1098 /* Not ready if data queue is non-empty. This imposes a limit
1099 * of only one unACKed packet in the TX queue at any time; we
1100 * do this to conserve memory usage.
1102 if ( ! list_empty ( &tcp->queue ) )
1103 return 0;
1105 /* Return TCP window length */
1106 return tcp_xmit_win ( tcp );
1110 * Deliver datagram as I/O buffer
1112 * @v xfer Data transfer interface
1113 * @v iobuf Datagram I/O buffer
1114 * @v meta Data transfer metadata
1115 * @ret rc Return status code
1117 static int tcp_xfer_deliver_iob ( struct xfer_interface *xfer,
1118 struct io_buffer *iobuf,
1119 struct xfer_metadata *meta __unused ) {
1120 struct tcp_connection *tcp =
1121 container_of ( xfer, struct tcp_connection, xfer );
1123 /* Enqueue packet */
1124 list_add_tail ( &iobuf->list, &tcp->queue );
1126 /* Transmit data, if possible */
1127 tcp_xmit ( tcp, 0 );
1129 return 0;
1132 /** TCP data transfer interface operations */
1133 static struct xfer_interface_operations tcp_xfer_operations = {
1134 .close = tcp_xfer_close,
1135 .vredirect = ignore_xfer_vredirect,
1136 .window = tcp_xfer_window,
1137 .alloc_iob = default_xfer_alloc_iob,
1138 .deliver_iob = tcp_xfer_deliver_iob,
1139 .deliver_raw = xfer_deliver_as_iob,
/***************************************************************************
 *
 * Openers
 *
 ***************************************************************************
 */
1149 /** TCP socket opener */
1150 struct socket_opener tcp_socket_opener __socket_opener = {
1151 .semantics = TCP_SOCK_STREAM,
1152 .family = AF_INET,
1153 .open = tcp_open,
1156 /** Linkage hack */
1157 int tcp_sock_stream = TCP_SOCK_STREAM;
1160 * Open TCP URI
1162 * @v xfer Data transfer interface
1163 * @v uri URI
1164 * @ret rc Return status code
1166 static int tcp_open_uri ( struct xfer_interface *xfer, struct uri *uri ) {
1167 struct sockaddr_tcpip peer;
1169 /* Sanity check */
1170 if ( ! uri->host )
1171 return -EINVAL;
1173 memset ( &peer, 0, sizeof ( peer ) );
1174 peer.st_port = htons ( uri_port ( uri, 0 ) );
1175 return xfer_open_named_socket ( xfer, SOCK_STREAM,
1176 ( struct sockaddr * ) &peer,
1177 uri->host, NULL );
1180 /** TCP URI opener */
1181 struct uri_opener tcp_uri_opener __uri_opener = {
1182 .scheme = "tcp",
1183 .open = tcp_open_uri,