vfs: check userland buffers before reading them.
[haiku.git] / src / add-ons / kernel / network / protocols / tcp / TCPEndpoint.cpp
blob007f772aa91ac21910967a67a83f9697bb52550a
1 /*
2 * Copyright 2006-2010, Haiku, Inc. All Rights Reserved.
3 * Distributed under the terms of the MIT License.
5 * Authors:
6 * Andrew Galante, haiku.galante@gmail.com
7 * Axel Dörfler, axeld@pinc-software.de
8 * Hugo Santos, hugosantos@gmail.com
9 */
12 #include "TCPEndpoint.h"
14 #include <netinet/in.h>
15 #include <netinet/ip.h>
16 #include <netinet/tcp.h>
17 #include <new>
18 #include <signal.h>
19 #include <stdlib.h>
20 #include <string.h>
21 #include <stdint.h>
23 #include <KernelExport.h>
24 #include <Select.h>
26 #include <net_buffer.h>
27 #include <net_datalink.h>
28 #include <net_stat.h>
29 #include <NetBufferUtilities.h>
30 #include <NetUtilities.h>
32 #include <lock.h>
33 #include <tracing.h>
34 #include <util/AutoLock.h>
35 #include <util/list.h>
37 #include "EndpointManager.h"
40 // References:
41 // - RFC 793 - Transmission Control Protocol
42 // - RFC 813 - Window and Acknowledgement Strategy in TCP
43 // - RFC 1337 - TIME_WAIT Assassination Hazards in TCP
45 // Things this implementation currently doesn't implement:
46 // - TCP Slow Start, Congestion Avoidance, Fast Retransmit, and Fast Recovery,
47 // RFC 2001, RFC 2581, RFC 3042
48 // - NewReno Modification to TCP's Fast Recovery, RFC 2582
49 // - Explicit Congestion Notification (ECN), RFC 3168
50 // - SYN-Cache
51 // - SACK, Selective Acknowledgment - RFC 2018, RFC 2883, RFC 3517
52 // - Forward RTO-Recovery, RFC 4138
53 // - Time-Wait hash instead of keeping sockets alive
55 // Things incomplete in this implementation:
56 // - TCP Extensions for High Performance, RFC 1323 - RTTM, PAWS
// Formats \a address for debug output through AddressString. The expansion
// calls Domain(), so the macro can only be used where that is in scope.
58 #define PrintAddress(address) \
59 AddressString(Domain(), address, true).Data()
// Uncomment to enable the TRACE() and/or PROBE() debug output below.
61 //#define TRACE_TCP
62 //#define PROBE_TCP
// TRACE() prints via dprintf() and prefixes each message with the current
// thread, the system time, the endpoint pointer and the name of fState; it
// therefore needs "this" and fState in scope. Without TRACE_TCP it expands
// to an empty statement.
64 #ifdef TRACE_TCP
65 // the space before ', ##args' is important in order for this to work with cpp 2.95
66 # define TRACE(format, args...) dprintf("%" B_PRId32 ": TCP [%" \
67 B_PRIdBIGTIME "] %p (%12s) " format "\n", find_thread(NULL), \
68 system_time(), this, name_for_state(fState) , ##args)
69 #else
70 # define TRACE(args...) do { } while (0)
71 #endif
// PROBE() prints the send-side sequence numbers, windows, queue usage and
// retransmit timeout of the endpoint together with the addresses and size of
// \a buffer. Without PROBE_TCP it expands to an empty statement.
73 #ifdef PROBE_TCP
74 # define PROBE(buffer, window) \
75 dprintf("TCP PROBE %" B_PRIdBIGTIME " %s %s %" B_PRIu32 " snxt %" B_PRIu32 \
76 " suna %" B_PRIu32 " cw %" B_PRIu32 " sst %" B_PRIu32 " win %" \
77 B_PRIu32 " swin %" B_PRIu32 " smax-suna %" B_PRIu32 " savail %" \
78 B_PRIuSIZE " sqused %" B_PRIuSIZE " rto %" B_PRIdBIGTIME "\n", \
79 system_time(), PrintAddress(buffer->source), \
80 PrintAddress(buffer->destination), buffer->size, fSendNext.Number(), \
81 fSendUnacknowledged.Number(), fCongestionWindow, fSlowStartThreshold, \
82 window, fSendWindow, (fSendMax - fSendUnacknowledged).Number(), \
83 fSendQueue.Available(fSendNext), fSendQueue.Used(), fRetransmitTimeout)
84 #else
85 # define PROBE(buffer, window) do { } while (0)
86 #endif
88 #if TCP_TRACING
89 namespace TCPTracing {
91 class Receive : public AbstractTraceEntry {
92 public:
93 Receive(TCPEndpoint* endpoint, tcp_segment_header& segment, uint32 window,
94 net_buffer* buffer)
96 fEndpoint(endpoint),
97 fBuffer(buffer),
98 fBufferSize(buffer->size),
99 fSequence(segment.sequence),
100 fAcknowledge(segment.acknowledge),
101 fWindow(window),
102 fState(endpoint->State()),
103 fFlags(segment.flags)
105 Initialized();
108 virtual void AddDump(TraceOutput& out)
110 out.Print("tcp:%p (%12s) receive buffer %p (%" B_PRIu32 " bytes), "
111 "flags %#" B_PRIx8 ", seq %" B_PRIu32 ", ack %" B_PRIu32
112 ", wnd %" B_PRIu32, fEndpoint, name_for_state(fState), fBuffer,
113 fBufferSize, fFlags, fSequence, fAcknowledge, fWindow);
116 protected:
117 TCPEndpoint* fEndpoint;
118 net_buffer* fBuffer;
119 uint32 fBufferSize;
120 uint32 fSequence;
121 uint32 fAcknowledge;
122 uint32 fWindow;
123 tcp_state fState;
124 uint8 fFlags;
127 class Send : public AbstractTraceEntry {
128 public:
129 Send(TCPEndpoint* endpoint, tcp_segment_header& segment, net_buffer* buffer,
130 tcp_sequence firstSequence, tcp_sequence lastSequence)
132 fEndpoint(endpoint),
133 fBuffer(buffer),
134 fBufferSize(buffer->size),
135 fSequence(segment.sequence),
136 fAcknowledge(segment.acknowledge),
137 fFirstSequence(firstSequence.Number()),
138 fLastSequence(lastSequence.Number()),
139 fState(endpoint->State()),
140 fFlags(segment.flags)
142 Initialized();
145 virtual void AddDump(TraceOutput& out)
147 out.Print("tcp:%p (%12s) send buffer %p (%" B_PRIu32 " bytes), "
148 "flags %#" B_PRIx8 ", seq %" B_PRIu32 ", ack %" B_PRIu32
149 ", first %" B_PRIu32 ", last %" B_PRIu32, fEndpoint,
150 name_for_state(fState), fBuffer, fBufferSize, fFlags, fSequence,
151 fAcknowledge, fFirstSequence, fLastSequence);
154 protected:
155 TCPEndpoint* fEndpoint;
156 net_buffer* fBuffer;
157 uint32 fBufferSize;
158 uint32 fSequence;
159 uint32 fAcknowledge;
160 uint32 fFirstSequence;
161 uint32 fLastSequence;
162 tcp_state fState;
163 uint8 fFlags;
166 class State : public AbstractTraceEntry {
167 public:
168 State(TCPEndpoint* endpoint)
170 fEndpoint(endpoint),
171 fState(endpoint->State())
173 Initialized();
176 virtual void AddDump(TraceOutput& out)
178 out.Print("tcp:%p (%12s) state change", fEndpoint,
179 name_for_state(fState));
182 protected:
183 TCPEndpoint* fEndpoint;
184 tcp_state fState;
187 class Spawn : public AbstractTraceEntry {
188 public:
189 Spawn(TCPEndpoint* listeningEndpoint, TCPEndpoint* spawnedEndpoint)
191 fListeningEndpoint(listeningEndpoint),
192 fSpawnedEndpoint(spawnedEndpoint)
194 Initialized();
197 virtual void AddDump(TraceOutput& out)
199 out.Print("tcp:%p spawns %p", fListeningEndpoint, fSpawnedEndpoint);
202 protected:
203 TCPEndpoint* fListeningEndpoint;
204 TCPEndpoint* fSpawnedEndpoint;
207 class Error : public AbstractTraceEntry {
208 public:
209 Error(TCPEndpoint* endpoint, const char* error, int32 line)
211 fEndpoint(endpoint),
212 fLine(line),
213 fError(error),
214 fState(endpoint->State())
216 Initialized();
219 virtual void AddDump(TraceOutput& out)
221 out.Print("tcp:%p (%12s) error at line %" B_PRId32 ": %s", fEndpoint,
222 name_for_state(fState), fLine, fError);
225 protected:
226 TCPEndpoint* fEndpoint;
227 int32 fLine;
228 const char* fError;
229 tcp_state fState;
232 class TimerSet : public AbstractTraceEntry {
233 public:
234 TimerSet(TCPEndpoint* endpoint, const char* which, bigtime_t timeout)
236 fEndpoint(endpoint),
237 fWhich(which),
238 fTimeout(timeout),
239 fState(endpoint->State())
241 Initialized();
244 virtual void AddDump(TraceOutput& out)
246 out.Print("tcp:%p (%12s) %s timer set to %" B_PRIdBIGTIME, fEndpoint,
247 name_for_state(fState), fWhich, fTimeout);
250 protected:
251 TCPEndpoint* fEndpoint;
252 const char* fWhich;
253 bigtime_t fTimeout;
254 tcp_state fState;
257 class TimerTriggered : public AbstractTraceEntry {
258 public:
259 TimerTriggered(TCPEndpoint* endpoint, const char* which)
261 fEndpoint(endpoint),
262 fWhich(which),
263 fState(endpoint->State())
265 Initialized();
268 virtual void AddDump(TraceOutput& out)
270 out.Print("tcp:%p (%12s) %s timer triggered", fEndpoint,
271 name_for_state(fState), fWhich);
274 protected:
275 TCPEndpoint* fEndpoint;
276 const char* fWhich;
277 tcp_state fState;
280 class APICall : public AbstractTraceEntry {
281 public:
282 APICall(TCPEndpoint* endpoint, const char* which)
284 fEndpoint(endpoint),
285 fWhich(which),
286 fState(endpoint->State())
288 Initialized();
291 virtual void AddDump(TraceOutput& out)
293 out.Print("tcp:%p (%12s) api call: %s", fEndpoint,
294 name_for_state(fState), fWhich);
297 protected:
298 TCPEndpoint* fEndpoint;
299 const char* fWhich;
300 tcp_state fState;
303 } // namespace TCPTracing
305 # define T(x) new(std::nothrow) TCPTracing::x
306 #else
307 # define T(x)
308 #endif // TCP_TRACING
// Bit values combined in TCPEndpoint::fFlags.
enum {
	// both option flags are set by default in the constructor
	FLAG_OPTION_WINDOW_SCALE	= 1 << 0,
	FLAG_OPTION_TIMESTAMP		= 1 << 1,
	// TODO: Should FLAG_NO_RECEIVE apply as well to received connections?
	//       That is, what is expected from accept() after a shutdown()
	//       is performed on a listen()ing socket.
	FLAG_NO_RECEIVE				= 1 << 2,
		// set by Shutdown() for SHUT_RD/SHUT_RDWR
	FLAG_CLOSED					= 1 << 3,
		// set by Free()
	FLAG_DELETE_ON_CLOSE		= 1 << 4,
		// set by _Close() and, for local connections, _EnterTimeWait()
	FLAG_LOCAL					= 1 << 5,
		// queried by IsLocal()
	FLAG_RECOVERY				= 1 << 6
		// set by _DuplicateAcknowledge() on the third duplicate ACK
};
// Divisor that converts system_time() microseconds into the millisecond
// ticks used as TCP time (see tcp_now()).
static const int kTimestampFactor = 1000;
330 static inline bigtime_t
331 absolute_timeout(bigtime_t timeout)
333 if (timeout == 0 || timeout == B_INFINITE_TIMEOUT)
334 return timeout;
336 return timeout + system_time();
340 static inline status_t
341 posix_error(status_t error)
343 if (error == B_TIMED_OUT)
344 return B_WOULD_BLOCK;
346 return error;
350 static inline bool
351 in_window(const tcp_sequence& sequence, const tcp_sequence& receiveNext,
352 uint32 receiveWindow)
354 return sequence >= receiveNext && sequence < (receiveNext + receiveWindow);
358 static inline bool
359 segment_in_sequence(const tcp_segment_header& segment, int size,
360 const tcp_sequence& receiveNext, uint32 receiveWindow)
362 tcp_sequence sequence(segment.sequence);
363 if (size == 0) {
364 if (receiveWindow == 0)
365 return sequence == receiveNext;
366 return in_window(sequence, receiveNext, receiveWindow);
367 } else {
368 if (receiveWindow == 0)
369 return false;
370 return in_window(sequence, receiveNext, receiveWindow)
371 || in_window(sequence + size - 1, receiveNext, receiveWindow);
376 static inline bool
377 is_writable(tcp_state state)
379 return state == ESTABLISHED || state == FINISH_RECEIVED;
383 static inline bool
384 is_establishing(tcp_state state)
386 return state == SYNCHRONIZE_SENT || state == SYNCHRONIZE_RECEIVED;
390 static inline uint32 tcp_now()
392 return system_time() / kTimestampFactor;
396 static inline uint32 tcp_diff_timestamp(uint32 base)
398 uint32 now = tcp_now();
400 if (now > base)
401 return now - base;
403 return now + UINT_MAX - base;
407 static inline bool
408 state_needs_finish(int32 state)
410 return state == WAIT_FOR_FINISH_ACKNOWLEDGE
411 || state == FINISH_SENT || state == CLOSING;
415 // #pragma mark -
// Creates an endpoint for \a socket in the CLOSED state. No endpoint manager
// or route is attached yet (fManager and fRoute are NULL); Open() looks up
// the manager.
// NOTE(review): fAcceptSemaphore, fFinishReceived, fFinishReceivedAt and
// fLastAcknowledgeSent are used by other methods in this file but do not
// appear in this initializer list - confirm they are set before first use.
418 TCPEndpoint::TCPEndpoint(net_socket* socket)
420 ProtocolSocket(socket),
421 fManager(NULL),
422 fOptions(0),
423 fSendWindowShift(0),
424 fReceiveWindowShift(0),
425 fSendUnacknowledged(0),
426 fSendNext(0),
427 fSendMax(0),
428 fSendUrgentOffset(0),
429 fSendWindow(0),
430 fSendMaxWindow(0),
431 fSendMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE),
432 fSendMaxSegments(0),
// the send queue is sized from the socket's send buffer size
433 fSendQueue(socket->send.buffer_size),
434 fInitialSendSequence(0),
435 fPreviousHighestAcknowledge(0),
436 fDuplicateAcknowledgeCount(0),
437 fPreviousFlightSize(0),
438 fRecover(0),
439 fRoute(NULL),
440 fReceiveNext(0),
441 fReceiveMaxAdvertised(0),
// receive window and receive queue both start at the socket's receive
// buffer size
442 fReceiveWindow(socket->receive.buffer_size),
443 fReceiveMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE),
444 fReceiveQueue(socket->receive.buffer_size),
445 fSmoothedRoundTripTime(0),
446 fRoundTripVariation(0),
447 fSendTime(0),
448 fRoundTripStartSequence(0),
449 fRetransmitTimeout(TCP_INITIAL_RTT),
450 fReceivedTimestamp(0),
451 fCongestionWindow(0),
452 fSlowStartThreshold(0),
453 fState(CLOSED),
// window scale and timestamp options are enabled by default
454 fFlags(FLAG_OPTION_WINDOW_SCALE | FLAG_OPTION_TIMESTAMP)
456 // TODO: to be replaced with a real read/write locking strategy!
457 mutex_init(&fLock, "tcp lock");
// condition variables the blocking send/receive paths wait on
459 fReceiveCondition.Init(this, "tcp receive");
460 fSendCondition.Init(this, "tcp send");
// register the four per-endpoint timers with the stack; each gets "this"
// as its callback argument
462 gStackModule->init_timer(&fPersistTimer, TCPEndpoint::_PersistTimer, this);
463 gStackModule->init_timer(&fRetransmitTimer, TCPEndpoint::_RetransmitTimer,
464 this);
465 gStackModule->init_timer(&fDelayedAcknowledgeTimer,
466 TCPEndpoint::_DelayedAcknowledgeTimer, this);
467 gStackModule->init_timer(&fTimeWaitTimer, TCPEndpoint::_TimeWaitTimer,
468 this);
470 T(APICall(this, "constructor"));
// Tears the endpoint down: cancels all timers, unbinds from and releases the
// endpoint manager (if any), destroys the lock, waits for timer hooks that
// may still be running, and finally releases the route.
474 TCPEndpoint::~TCPEndpoint()
// the lock is acquired here and never released; it is destroyed below
476 mutex_lock(&fLock);
478 T(APICall(this, "destructor"));
480 _CancelConnectionTimers();
481 gStackModule->cancel_timer(&fTimeWaitTimer);
482 T(TimerSet(this, "time-wait", -1));
// fManager is only set once Open() succeeded
484 if (fManager != NULL) {
485 fManager->Unbind(this);
486 put_endpoint_manager(fManager);
489 mutex_destroy(&fLock);
491 // we need to wait for all timers to return
492 gStackModule->wait_for_timer(&fRetransmitTimer);
493 gStackModule->wait_for_timer(&fPersistTimer);
494 gStackModule->wait_for_timer(&fDelayedAcknowledgeTimer);
495 gStackModule->wait_for_timer(&fTimeWaitTimer);
// NOTE(review): fRoute may still be NULL here - confirm put_route() accepts
// that.
497 gDatalinkModule->put_route(Domain(), fRoute);
501 status_t
502 TCPEndpoint::InitCheck() const
504 return B_OK;
508 // #pragma mark - protocol API
511 status_t
512 TCPEndpoint::Open()
514 TRACE("Open()");
515 T(APICall(this, "open"));
517 status_t status = ProtocolSocket::Open();
518 if (status < B_OK)
519 return status;
521 fManager = get_endpoint_manager(Domain());
522 if (fManager == NULL)
523 return EAFNOSUPPORT;
525 return B_OK;
529 status_t
530 TCPEndpoint::Close()
532 MutexLocker locker(fLock);
534 TRACE("Close()");
535 T(APICall(this, "close"));
537 if (fState == LISTEN)
538 delete_sem(fAcceptSemaphore);
540 if (fState == SYNCHRONIZE_SENT || fState == LISTEN) {
541 // TODO: what about linger in case of SYNCHRONIZE_SENT?
542 fState = CLOSED;
543 T(State(this));
544 return B_OK;
547 status_t status = _Disconnect(true);
548 if (status != B_OK)
549 return status;
551 if (socket->options & SO_LINGER) {
552 TRACE("Close(): Lingering for %i secs", socket->linger);
554 bigtime_t maximum = absolute_timeout(socket->linger * 1000000LL);
556 while (fSendQueue.Used() > 0) {
557 status = _WaitForCondition(fSendCondition, locker, maximum);
558 if (status == B_TIMED_OUT || status == B_WOULD_BLOCK)
559 break;
560 else if (status < B_OK)
561 return status;
564 TRACE("Close(): after waiting, the SendQ was left with %" B_PRIuSIZE
565 " bytes.", fSendQueue.Used());
567 return B_OK;
571 void
572 TCPEndpoint::Free()
574 MutexLocker _(fLock);
576 TRACE("Free()");
577 T(APICall(this, "free"));
579 if (fState <= SYNCHRONIZE_SENT)
580 return;
582 // we are only interested in the timer, not in changing state
583 _EnterTimeWait();
585 fFlags |= FLAG_CLOSED;
586 if ((fFlags & FLAG_DELETE_ON_CLOSE) == 0) {
587 // we'll be freed later when the 2MSL timer expires
588 gSocketModule->acquire_socket(socket);
593 /*! Creates and sends a synchronize packet to /a address, and then waits
594 until the connection has been established or refused.
596 status_t
597 TCPEndpoint::Connect(const sockaddr* address)
599 if (!AddressModule()->is_same_family(address))
600 return EAFNOSUPPORT;
602 MutexLocker locker(fLock);
604 TRACE("Connect() on address %s", PrintAddress(address));
605 T(APICall(this, "connect"));
607 if (gStackModule->is_restarted_syscall()) {
608 bigtime_t timeout = gStackModule->restore_syscall_restart_timeout();
609 status_t status = _WaitForEstablished(locker, timeout);
610 TRACE(" Connect(): Connection complete: %s (timeout was %"
611 B_PRIdBIGTIME ")", strerror(status), timeout);
612 return posix_error(status);
615 // Can only call connect() from CLOSED or LISTEN states
616 // otherwise endpoint is considered already connected
617 if (fState == LISTEN) {
618 // this socket is about to connect; remove pending connections in the backlog
619 gSocketModule->set_max_backlog(socket, 0);
620 } else if (fState == ESTABLISHED) {
621 return EISCONN;
622 } else if (fState != CLOSED)
623 return EALREADY;
625 // consider destination address INADDR_ANY as INADDR_LOOPBACK
626 sockaddr_storage _address;
627 if (AddressModule()->is_empty_address(address, false)) {
628 AddressModule()->get_loopback_address((sockaddr *)&_address);
629 // for IPv4 and IPv6 the port is at the same offset
630 ((sockaddr_in &)_address).sin_port = ((sockaddr_in *)address)->sin_port;
631 address = (sockaddr *)&_address;
634 status_t status = _PrepareSendPath(address);
635 if (status < B_OK)
636 return status;
638 TRACE(" Connect(): starting 3-way handshake...");
640 fState = SYNCHRONIZE_SENT;
641 T(State(this));
643 // send SYN
644 status = _SendQueued();
645 if (status != B_OK) {
646 _Close();
647 return status;
650 // If we are running over Loopback, after _SendQueued() returns we
651 // may be in ESTABLISHED already.
652 if (fState == ESTABLISHED) {
653 TRACE(" Connect() completed after _SendQueued()");
654 return B_OK;
657 // wait until 3-way handshake is complete (if needed)
658 bigtime_t timeout = min_c(socket->send.timeout, TCP_CONNECTION_TIMEOUT);
659 if (timeout == 0) {
660 // we're a non-blocking socket
661 TRACE(" Connect() delayed, return EINPROGRESS");
662 return EINPROGRESS;
665 bigtime_t absoluteTimeout = absolute_timeout(timeout);
666 gStackModule->store_syscall_restart_timeout(absoluteTimeout);
668 status = _WaitForEstablished(locker, absoluteTimeout);
669 TRACE(" Connect(): Connection complete: %s (timeout was %" B_PRIdBIGTIME
670 ")", strerror(status), timeout);
671 return posix_error(status);
675 status_t
676 TCPEndpoint::Accept(struct net_socket** _acceptedSocket)
678 MutexLocker locker(fLock);
680 TRACE("Accept()");
681 T(APICall(this, "accept"));
683 status_t status;
684 bigtime_t timeout = absolute_timeout(socket->receive.timeout);
685 if (gStackModule->is_restarted_syscall())
686 timeout = gStackModule->restore_syscall_restart_timeout();
687 else
688 gStackModule->store_syscall_restart_timeout(timeout);
690 do {
691 locker.Unlock();
693 status = acquire_sem_etc(fAcceptSemaphore, 1, B_ABSOLUTE_TIMEOUT
694 | B_CAN_INTERRUPT, timeout);
695 if (status != B_OK) {
696 if (status == B_TIMED_OUT && socket->receive.timeout == 0)
697 return B_WOULD_BLOCK;
699 return status;
702 locker.Lock();
703 status = gSocketModule->dequeue_connected(socket, _acceptedSocket);
704 #ifdef TRACE_TCP
705 if (status == B_OK)
706 TRACE(" Accept() returning %p", (*_acceptedSocket)->first_protocol);
707 #endif
708 } while (status != B_OK);
710 return status;
714 status_t
715 TCPEndpoint::Bind(const sockaddr *address)
717 if (address == NULL)
718 return B_BAD_VALUE;
720 MutexLocker lock(fLock);
722 TRACE("Bind() on address %s", PrintAddress(address));
723 T(APICall(this, "bind"));
725 if (fState != CLOSED)
726 return EISCONN;
728 return fManager->Bind(this, address);
732 status_t
733 TCPEndpoint::Unbind(struct sockaddr *address)
735 MutexLocker _(fLock);
737 TRACE("Unbind()");
738 T(APICall(this, "unbind"));
740 return fManager->Unbind(this);
744 status_t
745 TCPEndpoint::Listen(int count)
747 MutexLocker _(fLock);
749 TRACE("Listen()");
750 T(APICall(this, "listen"));
752 if (fState != CLOSED && fState != LISTEN)
753 return B_BAD_VALUE;
755 if (fState == CLOSED) {
756 fAcceptSemaphore = create_sem(0, "tcp accept");
757 if (fAcceptSemaphore < B_OK)
758 return ENOBUFS;
760 status_t status = fManager->SetPassive(this);
761 if (status != B_OK) {
762 delete_sem(fAcceptSemaphore);
763 fAcceptSemaphore = -1;
764 return status;
768 gSocketModule->set_max_backlog(socket, count);
770 fState = LISTEN;
771 T(State(this));
772 return B_OK;
776 status_t
777 TCPEndpoint::Shutdown(int direction)
779 MutexLocker lock(fLock);
781 TRACE("Shutdown(%i)", direction);
782 T(APICall(this, "shutdown"));
784 if (direction == SHUT_RD || direction == SHUT_RDWR)
785 fFlags |= FLAG_NO_RECEIVE;
787 if (direction == SHUT_WR || direction == SHUT_RDWR) {
788 // TODO: That's not correct. After read/write shutting down the socket
789 // one should still be able to read previously arrived data.
790 _Disconnect(false);
793 return B_OK;
797 /*! Puts data contained in \a buffer into send buffer */
798 status_t
799 TCPEndpoint::SendData(net_buffer *buffer)
801 MutexLocker lock(fLock);
803 TRACE("SendData(buffer %p, size %" B_PRIu32 ", flags %#" B_PRIx32
804 ") [total %" B_PRIuSIZE " bytes, has %" B_PRIuSIZE "]", buffer,
805 buffer->size, buffer->flags, fSendQueue.Size(), fSendQueue.Free());
806 T(APICall(this, "senddata"));
808 uint32 flags = buffer->flags;
810 if (fState == CLOSED)
811 return ENOTCONN;
812 if (fState == LISTEN)
813 return EDESTADDRREQ;
814 if (!is_writable(fState) && !is_establishing(fState)) {
815 // we only send signals when called from userland
816 if (gStackModule->is_syscall() && (flags & MSG_NOSIGNAL) == 0)
817 send_signal(find_thread(NULL), SIGPIPE);
818 return EPIPE;
821 size_t left = buffer->size;
823 bigtime_t timeout = absolute_timeout(socket->send.timeout);
824 if (gStackModule->is_restarted_syscall())
825 timeout = gStackModule->restore_syscall_restart_timeout();
826 else
827 gStackModule->store_syscall_restart_timeout(timeout);
829 while (left > 0) {
830 while (fSendQueue.Free() < socket->send.low_water_mark) {
831 // wait until enough space is available
832 status_t status = _WaitForCondition(fSendCondition, lock, timeout);
833 if (status < B_OK) {
834 TRACE(" SendData() returning %s (%d)",
835 strerror(posix_error(status)), (int)posix_error(status));
836 return posix_error(status);
839 if (!is_writable(fState) && !is_establishing(fState)) {
840 // we only send signals when called from userland
841 if (gStackModule->is_syscall())
842 send_signal(find_thread(NULL), SIGPIPE);
843 return EPIPE;
847 size_t size = fSendQueue.Free();
848 if (size < left) {
849 // we need to split the original buffer
850 net_buffer* clone = gBufferModule->clone(buffer, false);
851 // TODO: add offset/size parameter to net_buffer::clone() or
852 // even a move_data() function, as this is a bit inefficient
853 if (clone == NULL)
854 return ENOBUFS;
856 status_t status = gBufferModule->trim(clone, size);
857 if (status != B_OK) {
858 gBufferModule->free(clone);
859 return status;
862 gBufferModule->remove_header(buffer, size);
863 left -= size;
864 fSendQueue.Add(clone);
865 } else {
866 left -= buffer->size;
867 fSendQueue.Add(buffer);
871 TRACE(" SendData(): %" B_PRIuSIZE " bytes used.", fSendQueue.Used());
873 bool force = false;
874 if ((flags & MSG_OOB) != 0) {
875 fSendUrgentOffset = fSendQueue.LastSequence();
876 // RFC 961 specifies that the urgent offset points to the last
877 // byte of urgent data. However, this is commonly implemented as
878 // here, ie. it points to the first byte after the urgent data.
879 force = true;
881 if ((flags & MSG_EOF) != 0)
882 _Disconnect(false);
884 if (fState == ESTABLISHED || fState == FINISH_RECEIVED)
885 _SendQueued(force);
887 return B_OK;
891 ssize_t
892 TCPEndpoint::SendAvailable()
894 MutexLocker locker(fLock);
896 ssize_t available;
898 if (is_writable(fState))
899 available = fSendQueue.Free();
900 else if (is_establishing(fState))
901 available = 0;
902 else
903 available = EPIPE;
905 TRACE("SendAvailable(): %" B_PRIdSSIZE, available);
906 T(APICall(this, "sendavailable"));
907 return available;
911 status_t
912 TCPEndpoint::FillStat(net_stat *stat)
914 MutexLocker _(fLock);
916 strlcpy(stat->state, name_for_state(fState), sizeof(stat->state));
917 stat->receive_queue_size = fReceiveQueue.Available();
918 stat->send_queue_size = fSendQueue.Used();
920 return B_OK;
924 status_t
925 TCPEndpoint::ReadData(size_t numBytes, uint32 flags, net_buffer** _buffer)
927 MutexLocker locker(fLock);
929 TRACE("ReadData(%" B_PRIuSIZE " bytes, flags %#" B_PRIx32 ")", numBytes,
930 flags);
931 T(APICall(this, "readdata"));
933 *_buffer = NULL;
935 if (fState == CLOSED)
936 return ENOTCONN;
938 bigtime_t timeout = absolute_timeout(socket->receive.timeout);
939 if (gStackModule->is_restarted_syscall())
940 timeout = gStackModule->restore_syscall_restart_timeout();
941 else
942 gStackModule->store_syscall_restart_timeout(timeout);
944 if (fState == SYNCHRONIZE_SENT || fState == SYNCHRONIZE_RECEIVED) {
945 if (flags & MSG_DONTWAIT)
946 return B_WOULD_BLOCK;
948 status_t status = _WaitForEstablished(locker, timeout);
949 if (status < B_OK)
950 return posix_error(status);
953 size_t dataNeeded = socket->receive.low_water_mark;
955 // When MSG_WAITALL is set then the function should block
956 // until the full amount of data can be returned.
957 if (flags & MSG_WAITALL)
958 dataNeeded = numBytes;
960 // TODO: add support for urgent data (MSG_OOB)
962 while (true) {
963 if (fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE
964 || fState == TIME_WAIT) {
965 // ``Connection closing''.
966 return B_OK;
969 if (fReceiveQueue.Available() > 0) {
970 if (fReceiveQueue.Available() >= dataNeeded
971 || (fReceiveQueue.PushedData() > 0
972 && fReceiveQueue.PushedData() >= fReceiveQueue.Available()))
973 break;
974 } else if (fState == FINISH_RECEIVED) {
975 // ``If no text is awaiting delivery, the RECEIVE will
976 // get a Connection closing''.
977 return B_OK;
980 if ((flags & MSG_DONTWAIT) != 0 || socket->receive.timeout == 0)
981 return B_WOULD_BLOCK;
983 if ((fFlags & FLAG_NO_RECEIVE) != 0)
984 return B_OK;
986 status_t status = _WaitForCondition(fReceiveCondition, locker, timeout);
987 if (status < B_OK) {
988 // The Open Group base specification mentions that EINTR should be
989 // returned if the recv() is interrupted before _any data_ is
990 // available. So we actually check if there is data, and if so,
991 // push it to the user.
992 if ((status == B_TIMED_OUT || status == B_INTERRUPTED)
993 && fReceiveQueue.Available() > 0)
994 break;
996 return posix_error(status);
1000 TRACE(" ReadData(): %" B_PRIuSIZE " are available.",
1001 fReceiveQueue.Available());
1003 if (numBytes < fReceiveQueue.Available())
1004 fReceiveCondition.NotifyAll();
1006 bool clone = (flags & MSG_PEEK) != 0;
1008 ssize_t receivedBytes = fReceiveQueue.Get(numBytes, !clone, _buffer);
1010 TRACE(" ReadData(): %" B_PRIuSIZE " bytes kept.",
1011 fReceiveQueue.Available());
1013 // if we are opening the window, check if we should send an ACK
1014 if (!clone)
1015 SendAcknowledge(false);
1017 return receivedBytes;
1021 ssize_t
1022 TCPEndpoint::ReadAvailable()
1024 MutexLocker locker(fLock);
1026 TRACE("ReadAvailable(): %" B_PRIdSSIZE, _AvailableData());
1027 T(APICall(this, "readavailable"));
1029 return _AvailableData();
1033 status_t
1034 TCPEndpoint::SetSendBufferSize(size_t length)
1036 MutexLocker _(fLock);
1037 fSendQueue.SetMaxBytes(length);
1038 return B_OK;
1042 status_t
1043 TCPEndpoint::SetReceiveBufferSize(size_t length)
1045 MutexLocker _(fLock);
1046 fReceiveQueue.SetMaxBytes(length);
1047 return B_OK;
1051 status_t
1052 TCPEndpoint::GetOption(int option, void* _value, int* _length)
1054 if (*_length != sizeof(int))
1055 return B_BAD_VALUE;
1057 int* value = (int*)_value;
1059 switch (option) {
1060 case TCP_NODELAY:
1061 if ((fOptions & TCP_NODELAY) != 0)
1062 *value = 1;
1063 else
1064 *value = 0;
1065 return B_OK;
1067 case TCP_MAXSEG:
1068 *value = fReceiveMaxSegmentSize;
1069 return B_OK;
1071 default:
1072 return B_BAD_VALUE;
1077 status_t
1078 TCPEndpoint::SetOption(int option, const void* _value, int length)
1080 if (option != TCP_NODELAY)
1081 return B_BAD_VALUE;
1083 if (length != sizeof(int))
1084 return B_BAD_VALUE;
1086 const int* value = (const int*)_value;
1088 MutexLocker _(fLock);
1089 if (*value)
1090 fOptions |= TCP_NODELAY;
1091 else
1092 fOptions &= ~TCP_NODELAY;
1094 return B_OK;
1098 // #pragma mark - misc
1101 bool
1102 TCPEndpoint::IsBound() const
1104 return !LocalAddress().IsEmpty(true);
1108 bool
1109 TCPEndpoint::IsLocal() const
1111 return (fFlags & FLAG_LOCAL) != 0;
1115 status_t
1116 TCPEndpoint::DelayedAcknowledge()
1118 if (gStackModule->cancel_timer(&fDelayedAcknowledgeTimer)) {
1119 // timer was active, send an ACK now (with the exception above,
1120 // we send every other ACK)
1121 T(TimerSet(this, "delayed ack", -1));
1122 return SendAcknowledge(true);
1125 gStackModule->set_timer(&fDelayedAcknowledgeTimer,
1126 TCP_DELAYED_ACKNOWLEDGE_TIMEOUT);
1127 T(TimerSet(this, "delayed ack", TCP_DELAYED_ACKNOWLEDGE_TIMEOUT));
1128 return B_OK;
1132 status_t
1133 TCPEndpoint::SendAcknowledge(bool force)
1135 return _SendQueued(force, 0);
1139 void
1140 TCPEndpoint::_StartPersistTimer()
1142 gStackModule->set_timer(&fPersistTimer, TCP_PERSIST_TIMEOUT);
1143 T(TimerSet(this, "persist", TCP_PERSIST_TIMEOUT));
1147 void
1148 TCPEndpoint::_EnterTimeWait()
1150 TRACE("_EnterTimeWait()");
1152 if (fState == TIME_WAIT) {
1153 _CancelConnectionTimers();
1155 if (IsLocal()) {
1156 // we do not use TIME_WAIT state for local connections
1157 fFlags |= FLAG_DELETE_ON_CLOSE;
1158 return;
1162 _UpdateTimeWait();
1166 void
1167 TCPEndpoint::_UpdateTimeWait()
1169 gStackModule->set_timer(&fTimeWaitTimer, TCP_MAX_SEGMENT_LIFETIME << 1);
1170 T(TimerSet(this, "time-wait", TCP_MAX_SEGMENT_LIFETIME << 1));
1174 void
1175 TCPEndpoint::_CancelConnectionTimers()
1177 gStackModule->cancel_timer(&fRetransmitTimer);
1178 T(TimerSet(this, "retransmit", -1));
1179 gStackModule->cancel_timer(&fPersistTimer);
1180 T(TimerSet(this, "persist", -1));
1181 gStackModule->cancel_timer(&fDelayedAcknowledgeTimer);
1182 T(TimerSet(this, "delayed ack", -1));
1186 /*! Sends the FIN flag to the peer when the connection is still open.
1187 Moves the endpoint to the next state depending on where it was.
1189 status_t
1190 TCPEndpoint::_Disconnect(bool closing)
1192 tcp_state previousState = fState;
1194 if (fState == SYNCHRONIZE_RECEIVED || fState == ESTABLISHED)
1195 fState = FINISH_SENT;
1196 else if (fState == FINISH_RECEIVED)
1197 fState = WAIT_FOR_FINISH_ACKNOWLEDGE;
1198 else
1199 return B_OK;
1201 T(State(this));
1203 status_t status = _SendQueued();
1204 if (status != B_OK) {
1205 fState = previousState;
1206 T(State(this));
1207 return status;
1210 return B_OK;
1214 void
1215 TCPEndpoint::_MarkEstablished()
1217 fState = ESTABLISHED;
1218 T(State(this));
1220 if (gSocketModule->has_parent(socket)) {
1221 gSocketModule->set_connected(socket);
1222 release_sem_etc(fAcceptSemaphore, 1, B_DO_NOT_RESCHEDULE);
1225 fSendCondition.NotifyAll();
1226 gSocketModule->notify(socket, B_SELECT_WRITE, fSendQueue.Free());
1230 status_t
1231 TCPEndpoint::_WaitForEstablished(MutexLocker &locker, bigtime_t timeout)
1233 // TODO: Checking for CLOSED seems correct, but breaks several neon tests.
1234 // When investigating this, also have a look at _Close() and _HandleReset().
1235 while (fState < ESTABLISHED/* && fState != CLOSED*/) {
1236 if (socket->error != B_OK)
1237 return socket->error;
1239 status_t status = _WaitForCondition(fSendCondition, locker, timeout);
1240 if (status < B_OK)
1241 return status;
1244 return B_OK;
1248 // #pragma mark - receive
1251 void
1252 TCPEndpoint::_Close()
1254 _CancelConnectionTimers();
1255 fState = CLOSED;
1256 T(State(this));
1258 fFlags |= FLAG_DELETE_ON_CLOSE;
1260 fSendCondition.NotifyAll();
1261 _NotifyReader();
1263 if (gSocketModule->has_parent(socket)) {
1264 // We still have a parent - obviously, we haven't been accepted yet,
1265 // so no one could ever close us.
1266 _CancelConnectionTimers();
1267 gSocketModule->set_aborted(socket);
1272 void
1273 TCPEndpoint::_HandleReset(status_t error)
1275 socket->error = error;
1276 _Close();
1278 gSocketModule->notify(socket, B_SELECT_WRITE, error);
1279 gSocketModule->notify(socket, B_SELECT_ERROR, error);
// Handles a duplicate acknowledgement. What happens depends on how many
// duplicates have been counted, including this one:
//  - fewer than 3: if unsent data is queued and the send window is open, one
//    more send is attempted with a temporarily enlarged congestion window;
//  - exactly 3: if the recovery condition below holds, the slow start
//    threshold and congestion window are recomputed and sending restarts at
//    the acknowledged sequence number;
//  - more than 3: the congestion window may grow by one segment and queued
//    unsent data is sent.
1283 void
1284 TCPEndpoint::_DuplicateAcknowledge(tcp_segment_header &segment)
// remember the amount of unacknowledged data at the first duplicate
1286 if (fDuplicateAcknowledgeCount == 0)
1287 fPreviousFlightSize = (fSendMax - fSendUnacknowledged).Number();
1289 if (++fDuplicateAcknowledgeCount < 3) {
1290 if (fSendQueue.Available(fSendMax) != 0 && fSendWindow != 0) {
1291 fSendNext = fSendMax;
// the congestion window is only enlarged for this one _SendQueued() call and
// restored right after it
1292 fCongestionWindow += fDuplicateAcknowledgeCount * fSendMaxSegmentSize;
1293 _SendQueued();
1294 TRACE("_DuplicateAcknowledge(): packet sent under limited transmit on receipt of dup ack");
1295 fCongestionWindow -= fDuplicateAcknowledgeCount * fSendMaxSegmentSize;
1299 if (fDuplicateAcknowledgeCount == 3) {
// recovery is only entered if the acknowledgement is beyond fRecover, or the
// congestion window exceeds one segment and the acknowledged range advanced
// by at most four segments
1300 if ((segment.acknowledge - 1) > fRecover || (fCongestionWindow > fSendMaxSegmentSize &&
1301 (fSendUnacknowledged - fPreviousHighestAcknowledge) <= 4 * fSendMaxSegmentSize)) {
1302 fFlags |= FLAG_RECOVERY;
1303 fRecover = fSendMax.Number() - 1;
// threshold: half the flight size recorded at the first duplicate, but at
// least two segments; the window is that plus three segments
1304 fSlowStartThreshold = max_c(fPreviousFlightSize / 2, 2 * fSendMaxSegmentSize);
1305 fCongestionWindow = fSlowStartThreshold + 3 * fSendMaxSegmentSize;
// restart sending at the sequence number the peer keeps acknowledging
1306 fSendNext = segment.acknowledge;
1307 _SendQueued();
1308 TRACE("_DuplicateAcknowledge(): packet sent under fast restransmit on the receipt of 3rd dup ack");
1310 } else if (fDuplicateAcknowledgeCount > 3) {
1311 uint32 flightSize = (fSendMax - fSendUnacknowledged).Number();
// grow the window by one segment per additional duplicate, as long as the
// duplicates beyond the third do not account for more than the flight size
1312 if ((fDuplicateAcknowledgeCount - 3) * fSendMaxSegmentSize <= flightSize)
1313 fCongestionWindow += fSendMaxSegmentSize;
1314 if (fSendQueue.Available(fSendMax) != 0) {
1315 fSendNext = fSendMax;
1316 _SendQueued();
1322 void
1323 TCPEndpoint::_UpdateTimestamps(tcp_segment_header& segment,
1324 size_t segmentLength)
1326 if (fFlags & FLAG_OPTION_TIMESTAMP) {
1327 tcp_sequence sequence(segment.sequence);
1329 if (fLastAcknowledgeSent >= sequence
1330 && fLastAcknowledgeSent < (sequence + segmentLength))
1331 fReceivedTimestamp = segment.timestamp_value;
1336 ssize_t
1337 TCPEndpoint::_AvailableData() const
1339 // TODO: Refer to the FLAG_NO_RECEIVE comment above regarding
1340 // the application of FLAG_NO_RECEIVE in listen()ing
1341 // sockets.
1342 if (fState == LISTEN)
1343 return gSocketModule->count_connected(socket);
1344 if (fState == SYNCHRONIZE_SENT)
1345 return 0;
1347 ssize_t availableData = fReceiveQueue.Available();
1349 if (availableData == 0 && !_ShouldReceive())
1350 return ENOTCONN;
1352 return availableData;
1356 void
1357 TCPEndpoint::_NotifyReader()
1359 fReceiveCondition.NotifyAll();
1360 gSocketModule->notify(socket, B_SELECT_READ, _AvailableData());
1364 bool
1365 TCPEndpoint::_AddData(tcp_segment_header& segment, net_buffer* buffer)
1367 if ((segment.flags & TCP_FLAG_FINISH) != 0) {
1368 // Remember the position of the finish received flag
1369 fFinishReceived = true;
1370 fFinishReceivedAt = segment.sequence + buffer->size;
1373 fReceiveQueue.Add(buffer, segment.sequence);
1374 fReceiveNext = fReceiveQueue.NextSequence();
1376 if (fFinishReceived) {
1377 // Set or reset the finish flag on the current segment
1378 if (fReceiveNext < fFinishReceivedAt)
1379 segment.flags &= ~TCP_FLAG_FINISH;
1380 else
1381 segment.flags |= TCP_FLAG_FINISH;
1384 TRACE(" _AddData(): adding data, receive next = %" B_PRIu32 ". Now have %"
1385 B_PRIuSIZE " bytes.", fReceiveNext.Number(), fReceiveQueue.Available());
1387 if ((segment.flags & TCP_FLAG_PUSH) != 0)
1388 fReceiveQueue.SetPushPointer();
1390 return fReceiveQueue.Available() > 0;
1394 void
1395 TCPEndpoint::_PrepareReceivePath(tcp_segment_header& segment)
1397 fInitialReceiveSequence = segment.sequence;
1398 fFinishReceived = false;
1400 // count the received SYN
1401 segment.sequence++;
1403 fReceiveNext = segment.sequence;
1404 fReceiveQueue.SetInitialSequence(segment.sequence);
1406 if ((fOptions & TCP_NOOPT) == 0) {
1407 if (segment.max_segment_size > 0)
1408 fSendMaxSegmentSize = segment.max_segment_size;
1410 if (segment.options & TCP_HAS_WINDOW_SCALE) {
1411 fFlags |= FLAG_OPTION_WINDOW_SCALE;
1412 fSendWindowShift = segment.window_shift;
1413 } else {
1414 fFlags &= ~FLAG_OPTION_WINDOW_SCALE;
1415 fReceiveWindowShift = 0;
1418 if (segment.options & TCP_HAS_TIMESTAMPS) {
1419 fFlags |= FLAG_OPTION_TIMESTAMP;
1420 fReceivedTimestamp = segment.timestamp_value;
1421 } else
1422 fFlags &= ~FLAG_OPTION_TIMESTAMP;
1425 if (fSendMaxSegmentSize > 2190)
1426 fCongestionWindow = 2 * fSendMaxSegmentSize;
1427 else if (fSendMaxSegmentSize > 1095)
1428 fCongestionWindow = 3 * fSendMaxSegmentSize;
1429 else
1430 fCongestionWindow = 4 * fSendMaxSegmentSize;
1432 fSendMaxSegments = fCongestionWindow / fSendMaxSegmentSize;
1433 fSlowStartThreshold = (uint32)segment.advertised_window << fSendWindowShift;
1437 bool
1438 TCPEndpoint::_ShouldReceive() const
1440 if ((fFlags & FLAG_NO_RECEIVE) != 0)
1441 return false;
1443 return fState == ESTABLISHED || fState == FINISH_SENT
1444 || fState == FINISH_ACKNOWLEDGED;
1448 int32
1449 TCPEndpoint::_Spawn(TCPEndpoint* parent, tcp_segment_header& segment,
1450 net_buffer* buffer)
1452 MutexLocker _(fLock);
1454 // TODO error checking
1455 ProtocolSocket::Open();
1457 fState = SYNCHRONIZE_RECEIVED;
1458 T(Spawn(parent, this));
1460 fManager = parent->fManager;
1462 LocalAddress().SetTo(buffer->destination);
1463 PeerAddress().SetTo(buffer->source);
1465 TRACE("Spawn()");
1467 // TODO: proper error handling!
1468 if (fManager->BindChild(this) != B_OK) {
1469 T(Error(this, "binding failed", __LINE__));
1470 return DROP;
1472 if (_PrepareSendPath(*PeerAddress()) != B_OK) {
1473 T(Error(this, "prepare send faild", __LINE__));
1474 return DROP;
1477 fOptions = parent->fOptions;
1478 fAcceptSemaphore = parent->fAcceptSemaphore;
1480 _PrepareReceivePath(segment);
1482 // send SYN+ACK
1483 if (_SendQueued() != B_OK) {
1484 T(Error(this, "sending failed", __LINE__));
1485 return DROP;
1488 segment.flags &= ~TCP_FLAG_SYNCHRONIZE;
1489 // we handled this flag now, it must not be set for further processing
1491 return _Receive(segment, buffer);
1495 int32
1496 TCPEndpoint::_ListenReceive(tcp_segment_header& segment, net_buffer* buffer)
1498 TRACE("ListenReceive()");
1500 // Essentially, we accept only TCP_FLAG_SYNCHRONIZE in this state,
1501 // but the error behaviour differs
1502 if (segment.flags & TCP_FLAG_RESET)
1503 return DROP;
1504 if (segment.flags & TCP_FLAG_ACKNOWLEDGE)
1505 return DROP | RESET;
1506 if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0)
1507 return DROP;
1509 // TODO: drop broadcast/multicast
1511 // spawn new endpoint for accept()
1512 net_socket* newSocket;
1513 if (gSocketModule->spawn_pending_socket(socket, &newSocket) < B_OK) {
1514 T(Error(this, "spawning failed", __LINE__));
1515 return DROP;
1518 return ((TCPEndpoint *)newSocket->first_protocol)->_Spawn(this,
1519 segment, buffer);
1523 int32
1524 TCPEndpoint::_SynchronizeSentReceive(tcp_segment_header &segment,
1525 net_buffer *buffer)
1527 TRACE("_SynchronizeSentReceive()");
1529 if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0
1530 && (fInitialSendSequence >= segment.acknowledge
1531 || fSendMax < segment.acknowledge))
1532 return DROP | RESET;
1534 if (segment.flags & TCP_FLAG_RESET) {
1535 _HandleReset(ECONNREFUSED);
1536 return DROP;
1539 if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0)
1540 return DROP;
1542 fSendUnacknowledged = segment.acknowledge;
1543 _PrepareReceivePath(segment);
1545 if (segment.flags & TCP_FLAG_ACKNOWLEDGE) {
1546 _MarkEstablished();
1547 } else {
1548 // simultaneous open
1549 fState = SYNCHRONIZE_RECEIVED;
1550 T(State(this));
1553 segment.flags &= ~TCP_FLAG_SYNCHRONIZE;
1554 // we handled this flag now, it must not be set for further processing
1556 return _Receive(segment, buffer) | IMMEDIATE_ACKNOWLEDGE;
1560 int32
1561 TCPEndpoint::_Receive(tcp_segment_header& segment, net_buffer* buffer)
1563 // PAWS processing takes precedence over regular TCP acceptability check
1564 if ((fFlags & FLAG_OPTION_TIMESTAMP) != 0 && (segment.flags & TCP_FLAG_RESET) == 0) {
1565 if ((segment.options & TCP_HAS_TIMESTAMPS) == 0)
1566 return DROP;
1567 if ((int32)(fReceivedTimestamp - segment.timestamp_value) > 0
1568 && (fReceivedTimestamp - segment.timestamp_value) <= INT32_MAX)
1569 return DROP | IMMEDIATE_ACKNOWLEDGE;
1572 uint32 advertisedWindow = (uint32)segment.advertised_window
1573 << fSendWindowShift;
1574 size_t segmentLength = buffer->size;
1576 // First, handle the most common case for uni-directional data transfer
1577 // (known as header prediction - the segment must not change the window,
1578 // and must be the expected sequence, and contain no control flags)
1580 if (fState == ESTABLISHED
1581 && segment.AcknowledgeOnly()
1582 && fReceiveNext == segment.sequence
1583 && advertisedWindow > 0 && advertisedWindow == fSendWindow
1584 && fSendNext == fSendMax) {
1585 _UpdateTimestamps(segment, segmentLength);
1587 if (segmentLength == 0) {
1588 // this is a pure acknowledge segment - we're on the sending end
1589 if (fSendUnacknowledged < segment.acknowledge
1590 && fSendMax >= segment.acknowledge) {
1591 _Acknowledged(segment);
1592 return DROP;
1594 } else if (segment.acknowledge == fSendUnacknowledged
1595 && fReceiveQueue.IsContiguous()
1596 && fReceiveQueue.Free() >= segmentLength
1597 && (fFlags & FLAG_NO_RECEIVE) == 0) {
1598 if (_AddData(segment, buffer))
1599 _NotifyReader();
1601 return KEEP | ((segment.flags & TCP_FLAG_PUSH) != 0
1602 ? IMMEDIATE_ACKNOWLEDGE : ACKNOWLEDGE);
1606 // The fast path was not applicable, so we continue with the standard
1607 // processing of the incoming segment
1609 ASSERT(fState != SYNCHRONIZE_SENT && fState != LISTEN);
1611 if (fState != CLOSED && fState != TIME_WAIT) {
1612 // Check sequence number
1613 if (!segment_in_sequence(segment, segmentLength, fReceiveNext,
1614 fReceiveWindow)) {
1615 TRACE(" Receive(): segment out of window, next: %" B_PRIu32
1616 " wnd: %" B_PRIu32, fReceiveNext.Number(), fReceiveWindow);
1617 if ((segment.flags & TCP_FLAG_RESET) != 0) {
1618 // TODO: this doesn't look right - review!
1619 return DROP;
1621 return DROP | IMMEDIATE_ACKNOWLEDGE;
1625 if ((segment.flags & TCP_FLAG_RESET) != 0) {
1626 // Is this a valid reset?
1627 // We generally ignore resets in time wait state (see RFC 1337)
1628 if (fLastAcknowledgeSent <= segment.sequence
1629 && tcp_sequence(segment.sequence) < (fLastAcknowledgeSent
1630 + fReceiveWindow)
1631 && fState != TIME_WAIT) {
1632 status_t error;
1633 if (fState == SYNCHRONIZE_RECEIVED)
1634 error = ECONNREFUSED;
1635 else if (fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE)
1636 error = ENOTCONN;
1637 else
1638 error = ECONNRESET;
1640 _HandleReset(error);
1643 return DROP;
1646 if ((segment.flags & TCP_FLAG_SYNCHRONIZE) != 0
1647 || (fState == SYNCHRONIZE_RECEIVED
1648 && (fInitialReceiveSequence > segment.sequence
1649 || ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0
1650 && (fSendUnacknowledged > segment.acknowledge
1651 || fSendMax < segment.acknowledge))))) {
1652 // reset the connection - either the initial SYN was faulty, or we
1653 // received a SYN within the data stream
1654 return DROP | RESET;
1657 // TODO: Check this! Why do we advertize a window outside of what we should
1658 // buffer?
1659 fReceiveWindow = max_c(fReceiveQueue.Free(), fReceiveWindow);
1660 // the window must not shrink
1662 // trim buffer to be within the receive window
1663 int32 drop = (int32)(fReceiveNext - segment.sequence).Number();
1664 if (drop > 0) {
1665 if ((uint32)drop > buffer->size
1666 || ((uint32)drop == buffer->size
1667 && (segment.flags & TCP_FLAG_FINISH) == 0)) {
1668 // don't accidently remove a FIN we shouldn't remove
1669 segment.flags &= ~TCP_FLAG_FINISH;
1670 drop = buffer->size;
1673 // remove duplicate data at the start
1674 TRACE("* remove %" B_PRId32 " bytes from the start", drop);
1675 gBufferModule->remove_header(buffer, drop);
1676 segment.sequence += drop;
1679 int32 action = KEEP;
1681 // immediately acknowledge out-of-order segment to trigger fast-retransmit at the sender
1682 if (drop != 0)
1683 action |= IMMEDIATE_ACKNOWLEDGE;
1685 drop = (int32)(segment.sequence + buffer->size
1686 - (fReceiveNext + fReceiveWindow)).Number();
1687 if (drop > 0) {
1688 // remove data exceeding our window
1689 if ((uint32)drop >= buffer->size) {
1690 // if we can accept data, or the segment is not what we'd expect,
1691 // drop the segment (an immediate acknowledge is always triggered)
1692 if (fReceiveWindow != 0 || segment.sequence != fReceiveNext)
1693 return DROP | IMMEDIATE_ACKNOWLEDGE;
1695 action |= IMMEDIATE_ACKNOWLEDGE;
1698 if ((segment.flags & TCP_FLAG_FINISH) != 0) {
1699 // we need to remove the finish, too, as part of the data
1700 drop--;
1703 segment.flags &= ~(TCP_FLAG_FINISH | TCP_FLAG_PUSH);
1704 TRACE("* remove %" B_PRId32 " bytes from the end", drop);
1705 gBufferModule->remove_trailer(buffer, drop);
1708 #ifdef TRACE_TCP
1709 if (advertisedWindow > fSendWindow) {
1710 TRACE(" Receive(): Window update %" B_PRIu32 " -> %" B_PRIu32,
1711 fSendWindow, advertisedWindow);
1713 #endif
1715 if (advertisedWindow > fSendWindow)
1716 action |= IMMEDIATE_ACKNOWLEDGE;
1718 fSendWindow = advertisedWindow;
1719 if (advertisedWindow > fSendMaxWindow)
1720 fSendMaxWindow = advertisedWindow;
1722 // Then look at the acknowledgement for any updates
1724 if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0) {
1725 // process acknowledged data
1726 if (fState == SYNCHRONIZE_RECEIVED)
1727 _MarkEstablished();
1729 if (fSendMax < segment.acknowledge)
1730 return DROP | IMMEDIATE_ACKNOWLEDGE;
1732 if (segment.acknowledge == fSendUnacknowledged) {
1733 if (buffer->size == 0 && advertisedWindow == fSendWindow
1734 && (segment.flags & TCP_FLAG_FINISH) == 0 && fSendUnacknowledged != fSendMax) {
1735 TRACE("Receive(): duplicate ack!");
1736 _DuplicateAcknowledge(segment);
1738 } else if (segment.acknowledge < fSendUnacknowledged) {
1739 return DROP;
1740 } else {
1741 // this segment acknowledges in flight data
1743 if (fDuplicateAcknowledgeCount >= 3) {
1744 // deflate the window.
1745 if (segment.acknowledge > fRecover) {
1746 uint32 flightSize = (fSendMax - fSendUnacknowledged).Number();
1747 fCongestionWindow = min_c(fSlowStartThreshold,
1748 max_c(flightSize, fSendMaxSegmentSize) + fSendMaxSegmentSize);
1749 fFlags &= ~FLAG_RECOVERY;
1753 if (fSendMax == segment.acknowledge)
1754 TRACE("Receive(): all inflight data ack'd!");
1756 if (segment.acknowledge > fSendQueue.LastSequence()
1757 && fState > ESTABLISHED) {
1758 TRACE("Receive(): FIN has been acknowledged!");
1760 switch (fState) {
1761 case FINISH_SENT:
1762 fState = FINISH_ACKNOWLEDGED;
1763 T(State(this));
1764 break;
1765 case CLOSING:
1766 fState = TIME_WAIT;
1767 T(State(this));
1768 _EnterTimeWait();
1769 return DROP;
1770 case WAIT_FOR_FINISH_ACKNOWLEDGE:
1771 _Close();
1772 break;
1774 default:
1775 break;
1779 if (fState != CLOSED)
1780 _Acknowledged(segment);
1784 if (segment.flags & TCP_FLAG_URGENT) {
1785 if (fState == ESTABLISHED || fState == FINISH_SENT
1786 || fState == FINISH_ACKNOWLEDGED) {
1787 // TODO: Handle urgent data:
1788 // - RCV.UP <- max(RCV.UP, SEG.UP)
1789 // - signal the user that urgent data is available (SIGURG)
1793 bool notify = false;
1795 // The buffer may be freed if its data is added to the queue, so cache
1796 // the size as we still need it later.
1797 uint32 bufferSize = buffer->size;
1799 if ((bufferSize > 0 || (segment.flags & TCP_FLAG_FINISH) != 0)
1800 && _ShouldReceive())
1801 notify = _AddData(segment, buffer);
1802 else {
1803 if ((fFlags & FLAG_NO_RECEIVE) != 0)
1804 fReceiveNext += buffer->size;
1806 action = (action & ~KEEP) | DROP;
1809 if ((segment.flags & TCP_FLAG_FINISH) != 0) {
1810 segmentLength++;
1811 if (fState != CLOSED && fState != LISTEN && fState != SYNCHRONIZE_SENT) {
1812 TRACE("Receive(): peer is finishing connection!");
1813 fReceiveNext++;
1814 notify = true;
1816 // FIN implies PUSH
1817 fReceiveQueue.SetPushPointer();
1819 // we'll reply immediately to the FIN if we are not
1820 // transitioning to TIME WAIT so we immediatly ACK it.
1821 action |= IMMEDIATE_ACKNOWLEDGE;
1823 // other side is closing connection; change states
1824 switch (fState) {
1825 case ESTABLISHED:
1826 case SYNCHRONIZE_RECEIVED:
1827 fState = FINISH_RECEIVED;
1828 T(State(this));
1829 break;
1830 case FINISH_SENT:
1831 // simultaneous close
1832 fState = CLOSING;
1833 T(State(this));
1834 break;
1835 case FINISH_ACKNOWLEDGED:
1836 fState = TIME_WAIT;
1837 T(State(this));
1838 _EnterTimeWait();
1839 break;
1840 case TIME_WAIT:
1841 _UpdateTimeWait();
1842 break;
1844 default:
1845 break;
1850 if (notify)
1851 _NotifyReader();
1853 if (bufferSize > 0 || (segment.flags & TCP_FLAG_SYNCHRONIZE) != 0)
1854 action |= ACKNOWLEDGE;
1856 _UpdateTimestamps(segment, segmentLength);
1858 TRACE("Receive() Action %" B_PRId32, action);
1860 return action;
1864 int32
1865 TCPEndpoint::SegmentReceived(tcp_segment_header& segment, net_buffer* buffer)
1867 MutexLocker locker(fLock);
1869 TRACE("SegmentReceived(): buffer %p (%" B_PRIu32 " bytes) address %s "
1870 "to %s flags %#" B_PRIx8 ", seq %" B_PRIu32 ", ack %" B_PRIu32
1871 ", wnd %" B_PRIu32, buffer, buffer->size, PrintAddress(buffer->source),
1872 PrintAddress(buffer->destination), segment.flags, segment.sequence,
1873 segment.acknowledge,
1874 (uint32)segment.advertised_window << fSendWindowShift);
1875 T(Receive(this, segment,
1876 (uint32)segment.advertised_window << fSendWindowShift, buffer));
1877 int32 segmentAction = DROP;
1879 switch (fState) {
1880 case LISTEN:
1881 segmentAction = _ListenReceive(segment, buffer);
1882 break;
1884 case SYNCHRONIZE_SENT:
1885 segmentAction = _SynchronizeSentReceive(segment, buffer);
1886 break;
1888 case SYNCHRONIZE_RECEIVED:
1889 case ESTABLISHED:
1890 case FINISH_RECEIVED:
1891 case WAIT_FOR_FINISH_ACKNOWLEDGE:
1892 case FINISH_SENT:
1893 case FINISH_ACKNOWLEDGED:
1894 case CLOSING:
1895 case TIME_WAIT:
1896 case CLOSED:
1897 segmentAction = _Receive(segment, buffer);
1898 break;
1901 // process acknowledge action as asked for by the *Receive() method
1902 if (segmentAction & IMMEDIATE_ACKNOWLEDGE)
1903 SendAcknowledge(true);
1904 else if (segmentAction & ACKNOWLEDGE)
1905 DelayedAcknowledge();
1907 if ((fFlags & (FLAG_CLOSED | FLAG_DELETE_ON_CLOSE))
1908 == (FLAG_CLOSED | FLAG_DELETE_ON_CLOSE)) {
1909 locker.Unlock();
1910 gSocketModule->release_socket(socket);
1913 return segmentAction;
1917 // #pragma mark - send
1920 inline uint8
1921 TCPEndpoint::_CurrentFlags()
1923 // we don't set FLAG_FINISH here, instead we do it
1924 // conditionally below depending if we are sending
1925 // the last bytes of the send queue.
1927 switch (fState) {
1928 case CLOSED:
1929 return TCP_FLAG_RESET | TCP_FLAG_ACKNOWLEDGE;
1931 case SYNCHRONIZE_SENT:
1932 return TCP_FLAG_SYNCHRONIZE;
1933 case SYNCHRONIZE_RECEIVED:
1934 return TCP_FLAG_SYNCHRONIZE | TCP_FLAG_ACKNOWLEDGE;
1936 case ESTABLISHED:
1937 case FINISH_RECEIVED:
1938 case FINISH_ACKNOWLEDGED:
1939 case TIME_WAIT:
1940 case WAIT_FOR_FINISH_ACKNOWLEDGE:
1941 case FINISH_SENT:
1942 case CLOSING:
1943 return TCP_FLAG_ACKNOWLEDGE;
1945 default:
1946 return 0;
1951 inline bool
1952 TCPEndpoint::_ShouldSendSegment(tcp_segment_header& segment, uint32 length,
1953 uint32 segmentMaxSize, uint32 flightSize)
1955 if (fState == ESTABLISHED && fSendMaxSegments == 0)
1956 return false;
1958 if (length > 0) {
1959 // Avoid the silly window syndrome - we only send a segment in case:
1960 // - we have a full segment to send, or
1961 // - we're at the end of our buffer queue, or
1962 // - the buffer is at least larger than half of the maximum send window,
1963 // or
1964 // - we're retransmitting data
1965 if (length == segmentMaxSize
1966 || (fOptions & TCP_NODELAY) != 0
1967 || tcp_sequence(fSendNext + length) == fSendQueue.LastSequence()
1968 || (fSendMaxWindow > 0 && length >= fSendMaxWindow / 2))
1969 return true;
1972 // check if we need to send a window update to the peer
1973 if (segment.advertised_window > 0) {
1974 // correct the window to take into account what already has been advertised
1975 uint32 window = (segment.advertised_window << fReceiveWindowShift)
1976 - (fReceiveMaxAdvertised - fReceiveNext).Number();
1978 // if we can advertise a window larger than twice the maximum segment
1979 // size, or half the maximum buffer size we send a window update
1980 if (window >= (fReceiveMaxSegmentSize << 1)
1981 || window >= (socket->receive.buffer_size >> 1))
1982 return true;
1985 if ((segment.flags & (TCP_FLAG_SYNCHRONIZE | TCP_FLAG_FINISH
1986 | TCP_FLAG_RESET)) != 0)
1987 return true;
1989 // We do have urgent data pending
1990 if (fSendUrgentOffset > fSendNext)
1991 return true;
1993 // there is no reason to send a segment just now
1994 return false;
1998 status_t
1999 TCPEndpoint::_SendQueued(bool force)
2001 return _SendQueued(force, fSendWindow);
2005 /*! Sends one or more TCP segments with the data waiting in the queue, or some
2006 specific flags that need to be sent.
*/
// Builds segments from the send queue, starting at fSendNext, and passes them
// to the next protocol module via send_routed_data().
//   force      - skips the _ShouldSendSegment() check; if the usable window
//                is zero and there is queued data, one byte is sent anyway.
//   sendWindow - the window to respect; it is lowered to fCongestionWindow
//                if that is set and smaller.
// Returns B_OK on success (also when nothing had to be sent), B_ERROR without
// a route or in LISTEN state, B_NO_MEMORY if no buffer could be created, or
// the error returned by the send queue, add_tcp_header(), or the next module.
2008 status_t
2009 TCPEndpoint::_SendQueued(bool force, uint32 sendWindow)
2011 if (fRoute == NULL)
2012 return B_ERROR;
2014 // in passive state?
2015 if (fState == LISTEN)
2016 return B_ERROR;
// the header starts out with the default flags of the current state
2018 tcp_segment_header segment(_CurrentFlags());
2020 if ((fOptions & TCP_NOOPT) == 0) {
2021 if ((fFlags & FLAG_OPTION_TIMESTAMP) != 0) {
2022 segment.options |= TCP_HAS_TIMESTAMPS;
2023 segment.timestamp_reply = fReceivedTimestamp;
2024 segment.timestamp_value = tcp_now();
// only the very first SYN (nothing sent yet) carries the MSS and window scale
2027 if ((segment.flags & TCP_FLAG_SYNCHRONIZE) != 0
2028 && fSendNext == fInitialSendSequence) {
2029 // add connection establishment options
2030 segment.max_segment_size = fReceiveMaxSegmentSize;
2031 if (fFlags & FLAG_OPTION_WINDOW_SCALE) {
2032 segment.options |= TCP_HAS_WINDOW_SCALE;
2033 segment.window_shift = fReceiveWindowShift;
2038 size_t availableBytes = fReceiveQueue.Free();
2039 // window size must remain same for duplicate acknowledgements
2040 if (!fReceiveQueue.IsContiguous())
2041 availableBytes = (fReceiveMaxAdvertised - fReceiveNext).Number();
2043 if (fFlags & FLAG_OPTION_WINDOW_SCALE)
2044 segment.advertised_window = availableBytes >> fReceiveWindowShift;
2045 else
2046 segment.advertised_window = min_c(TCP_MAX_WINDOW, availableBytes);
2048 segment.acknowledge = fReceiveNext.Number();
2050 // Process urgent data
2051 if (fSendUrgentOffset > fSendNext) {
2052 segment.flags |= TCP_FLAG_URGENT;
2053 segment.urgent_offset = (fSendUrgentOffset - fSendNext).Number();
2054 } else {
2055 fSendUrgentOffset = fSendUnacknowledged.Number();
2056 // Keep urgent offset updated, so that it doesn't reach into our
2057 // send window on overlap
2058 segment.urgent_offset = 0;
// the congestion window caps the window we are allowed to use
2061 if (fCongestionWindow > 0 && fCongestionWindow < sendWindow)
2062 sendWindow = fCongestionWindow;
2064 // fSendUnacknowledged
2065 // | fSendNext fSendMax
2066 // | | |
2067 // v v v
2068 // -----------------------------------
2069 // | effective window |
2070 // -----------------------------------
2072 // Flight size represents the window of data which is currently in the
2073 // ether. We should never send data such as the flight size becomes larger
2074 // than the effective window. Note however that the effective window may be
2075 // reduced (by congestion for instance), so at some point in time flight
2076 // size may be larger than the currently calculated window.
2078 uint32 flightSize = (fSendMax - fSendUnacknowledged).Number();
2079 uint32 consumedWindow = (fSendNext - fSendUnacknowledged).Number();
// from here on sendWindow is what is left of the window beyond fSendNext
2081 if (consumedWindow > sendWindow) {
2082 sendWindow = 0;
2083 // TODO: enter persist state? try to get a window update.
2084 } else
2085 sendWindow -= consumedWindow;
2087 if (force && sendWindow == 0 && fSendNext <= fSendQueue.LastSequence()) {
2088 // send one byte of data to ask for a window update
2089 // (triggered by the persist timer)
2090 sendWindow = 1;
// length is the total number of data bytes this call may send
2093 uint32 length = min_c(fSendQueue.Available(fSendNext), sendWindow);
2094 bool shouldStartRetransmitTimer = fSendNext == fSendUnacknowledged;
2095 bool retransmit = fSendNext < fSendMax;
2097 if (fDuplicateAcknowledgeCount != 0) {
2098 // send at most 1 SMSS of data when under limited transmit, fast transmit/recovery
2099 length = min_c(length, fSendMaxSegmentSize);
// one iteration per segment; the loop runs at least once so that segments
// without data (flags only) can be sent, too
2102 do {
2103 uint32 segmentMaxSize = fSendMaxSegmentSize
2104 - tcp_options_length(segment);
2105 uint32 segmentLength = min_c(length, segmentMaxSize);
// this segment reaches the end of the send queue
2107 if (fSendNext + segmentLength == fSendQueue.LastSequence()) {
2108 if (state_needs_finish(fState))
2109 segment.flags |= TCP_FLAG_FINISH;
2110 if (length > 0)
2111 segment.flags |= TCP_FLAG_PUSH;
2114 // Determine if we should really send this segment
2115 if (!force && !retransmit && !_ShouldSendSegment(segment, segmentLength,
2116 segmentMaxSize, flightSize)) {
// not sending now: make sure some timer will get us back here while there is
// still data queued
2117 if (fSendQueue.Available()
2118 && !gStackModule->is_timer_active(&fPersistTimer)
2119 && !gStackModule->is_timer_active(&fRetransmitTimer))
2120 _StartPersistTimer();
2121 break;
2124 net_buffer *buffer = gBufferModule->create(256);
2125 if (buffer == NULL)
2126 return B_NO_MEMORY;
2128 status_t status = B_OK;
2129 if (segmentLength > 0)
2130 status = fSendQueue.Get(buffer, fSendNext, segmentLength);
2131 if (status < B_OK) {
2132 gBufferModule->free(buffer);
2133 return status;
2136 LocalAddress().CopyTo(buffer->source);
2137 PeerAddress().CopyTo(buffer->destination);
// size is the sequence space this segment consumes: its data, plus one each
// for SYN and FIN (added further below)
2139 uint32 size = buffer->size;
2140 segment.sequence = fSendNext.Number();
2142 TRACE("SendQueued(): buffer %p (%" B_PRIu32 " bytes) address %s to "
2143 "%s flags %#" B_PRIx8 ", seq %" B_PRIu32 ", ack %" B_PRIu32
2144 ", rwnd %" B_PRIu16 ", cwnd %" B_PRIu32 ", ssthresh %" B_PRIu32
2145 ", len %" B_PRIu32 ", first %" B_PRIu32 ", last %" B_PRIu32,
2146 buffer, buffer->size, PrintAddress(buffer->source),
2147 PrintAddress(buffer->destination), segment.flags, segment.sequence,
2148 segment.acknowledge, segment.advertised_window,
2149 fCongestionWindow, fSlowStartThreshold, segmentLength,
2150 fSendQueue.FirstSequence().Number(),
2151 fSendQueue.LastSequence().Number());
2152 T(Send(this, segment, buffer, fSendQueue.FirstSequence(),
2153 fSendQueue.LastSequence()));
2155 PROBE(buffer, sendWindow);
2156 sendWindow -= buffer->size;
2158 status = add_tcp_header(AddressModule(), segment, buffer);
2159 if (status != B_OK) {
2160 gBufferModule->free(buffer);
2161 return status;
2164 // Update send status - we need to do this before we send the data
2165 // for local connections as the answer is directly handled
2167 if (segment.flags & TCP_FLAG_SYNCHRONIZE) {
// the connection establishment options must not be repeated in later segments
2168 segment.options &= ~TCP_HAS_WINDOW_SCALE;
2169 segment.max_segment_size = 0;
2170 size++;
2173 if (segment.flags & TCP_FLAG_FINISH)
2174 size++;
// keep the old fSendMax around so that it can be restored if sending fails
2176 uint32 sendMax = fSendMax.Number();
2177 fSendNext += size;
2178 if (fSendMax < fSendNext)
2179 fSendMax = fSendNext;
2181 fReceiveMaxAdvertised = fReceiveNext
2182 + ((uint32)segment.advertised_window << fReceiveWindowShift);
2184 if (segmentLength != 0 && fState == ESTABLISHED)
2185 --fSendMaxSegments;
2187 status = next->module->send_routed_data(next, fRoute, buffer);
2188 if (status < B_OK) {
2189 gBufferModule->free(buffer);
// NOTE(review): fSendMaxSegments and fReceiveMaxAdvertised were updated above
// as well, but are not restored here - confirm that this is intended.
2191 fSendNext = segment.sequence;
2192 fSendMax = sendMax;
2193 // restore send status
2194 return status;
// start a round trip time measurement if none is running; retransmissions are
// not used for measuring
2197 if (fSendTime == 0 && !retransmit
2198 && (segmentLength != 0 || (segment.flags & TCP_FLAG_SYNCHRONIZE) !=0)) {
2199 fSendTime = tcp_now();
2200 fRoundTripStartSequence = segment.sequence;
2203 if (shouldStartRetransmitTimer && size > 0) {
2204 TRACE("starting initial retransmit timer of: %" B_PRIdBIGTIME,
2205 fRetransmitTimeout);
2206 gStackModule->set_timer(&fRetransmitTimer, fRetransmitTimeout);
2207 T(TimerSet(this, "retransmit", fRetransmitTimeout));
2208 shouldStartRetransmitTimer = false;
2211 if (segment.flags & TCP_FLAG_ACKNOWLEDGE)
2212 fLastAcknowledgeSent = segment.acknowledge;
// the one-time flags must not be carried over to the next segment
2214 length -= segmentLength;
2215 segment.flags &= ~(TCP_FLAG_SYNCHRONIZE | TCP_FLAG_RESET
2216 | TCP_FLAG_FINISH);
// a retransmission sends a single segment only
2218 if (retransmit)
2219 break;
2221 } while (length > 0);
2223 return B_OK;
// Returns the MTU the next protocol module reports for `address`, reduced by
// the size of the TCP header.
// NOTE(review): the line carrying this method's return type is not visible in
// this view of the file.
2228 TCPEndpoint::_MaxSegmentSize(const sockaddr* address) const
2230 return next->module->get_mtu(next, address) - sizeof(tcp_header);
2234 status_t
2235 TCPEndpoint::_PrepareSendPath(const sockaddr* peer)
2237 if (fRoute == NULL) {
2238 fRoute = gDatalinkModule->get_route(Domain(), peer);
2239 if (fRoute == NULL)
2240 return ENETUNREACH;
2242 if ((fRoute->flags & RTF_LOCAL) != 0)
2243 fFlags |= FLAG_LOCAL;
2246 // make sure connection does not already exist
2247 status_t status = fManager->SetConnection(this, *LocalAddress(), peer,
2248 fRoute->interface_address->local);
2249 if (status < B_OK)
2250 return status;
2252 fInitialSendSequence = system_time() >> 4;
2253 fSendNext = fInitialSendSequence;
2254 fSendUnacknowledged = fInitialSendSequence;
2255 fSendMax = fInitialSendSequence;
2256 fSendUrgentOffset = fInitialSendSequence;
2257 fRecover = fInitialSendSequence.Number();
2259 // we are counting the SYN here
2260 fSendQueue.SetInitialSequence(fSendNext + 1);
2262 fReceiveMaxSegmentSize = _MaxSegmentSize(peer);
2264 // Compute the window shift we advertise to our peer - if it doesn't support
2265 // this option, this will be reset to 0 (when its SYN is received)
2266 fReceiveWindowShift = 0;
2267 while (fReceiveWindowShift < TCP_MAX_WINDOW_SHIFT
2268 && (0xffffUL << fReceiveWindowShift) < socket->receive.buffer_size) {
2269 fReceiveWindowShift++;
2272 return B_OK;
// Processes the acknowledgement carried by `segment`: removes acknowledged
// data from the send queue, advances fSendUnacknowledged, adjusts the
// congestion window, updates the round trip time, restarts or cancels the
// retransmit timer, and wakes up writers. Finally, any data left in the send
// queue is sent.
// The caller must make sure that segment.acknowledge is not below
// fSendUnacknowledged (see the ASSERT).
2276 void
2277 TCPEndpoint::_Acknowledged(tcp_segment_header& segment)
2279 TRACE("_Acknowledged(): ack %" B_PRIu32 "; uack %" B_PRIu32 "; next %"
2280 B_PRIu32 "; max %" B_PRIu32, segment.acknowledge,
2281 fSendUnacknowledged.Number(), fSendNext.Number(), fSendMax.Number());
2283 ASSERT(fSendUnacknowledged <= segment.acknowledge);
// everything below, up to the final send, applies only if new data was
// acknowledged
2285 if (fSendUnacknowledged < segment.acknowledge) {
2286 fSendQueue.RemoveUntil(segment.acknowledge);
2288 uint32 bytesAcknowledged = segment.acknowledge - fSendUnacknowledged.Number();
2289 fPreviousHighestAcknowledge = fSendUnacknowledged;
2290 fSendUnacknowledged = segment.acknowledge;
2291 uint32 flightSize = (fSendMax - fSendUnacknowledged).Number();
// passed on to _UpdateRoundTripTime() when timestamps are in use: one sample
// for every two segments in flight
2292 int32 expectedSamples = flightSize / (fSendMaxSegmentSize << 1);
2294 if (fPreviousHighestAcknowledge > fSendUnacknowledged) {
2295 // need to update the recover variable upon a sequence wraparound
2296 fRecover = segment.acknowledge - 1;
2299 // the acknowledgment of the SYN/ACK MUST NOT increase the size of the congestion window
2300 if (fSendUnacknowledged != fInitialSendSequence) {
// below the slow start threshold the window grows by at most one segment
// size per acknowledgement, above it by (segment size)^2 / window, but by one
// at least
2301 if (fCongestionWindow < fSlowStartThreshold)
2302 fCongestionWindow += min_c(bytesAcknowledged, fSendMaxSegmentSize);
2303 else {
2304 uint32 increment = fSendMaxSegmentSize * fSendMaxSegmentSize;
2306 if (increment < fCongestionWindow)
2307 increment = 1;
2308 else
2309 increment /= fCongestionWindow;
2311 fCongestionWindow += increment;
2314 fSendMaxSegments = UINT32_MAX;
2317 if ((fFlags & FLAG_RECOVERY) != 0) {
// Still in recovery: resend from the first unacknowledged byte, then shrink
// the window by what has been acknowledged, and continue at fSendMax.
// NOTE(review): if bytesAcknowledged can exceed fCongestionWindow, the
// subtraction below would wrap for an unsigned window - confirm.
2318 fSendNext = fSendUnacknowledged;
2319 _SendQueued();
2320 fCongestionWindow -= bytesAcknowledged;
2322 if (bytesAcknowledged > fSendMaxSegmentSize)
2323 fCongestionWindow += fSendMaxSegmentSize;
2325 fSendNext = fSendMax;
2326 } else
2327 fDuplicateAcknowledgeCount = 0;
// never try to send data that has been acknowledged already
2329 if (fSendNext < fSendUnacknowledged)
2330 fSendNext = fSendUnacknowledged;
// Take a round trip time sample: with the timestamp option from the echoed
// timestamp, otherwise from fSendTime once the sequence being timed has been
// acknowledged
2332 if (fFlags & FLAG_OPTION_TIMESTAMP) {
2333 _UpdateRoundTripTime(tcp_diff_timestamp(segment.timestamp_reply),
2334 expectedSamples > 0 ? expectedSamples : 1);
2335 } else if (fSendTime != 0 && fRoundTripStartSequence < segment.acknowledge) {
2336 _UpdateRoundTripTime(tcp_diff_timestamp(fSendTime), 1);
2337 fSendTime = 0;
2340 if (fSendUnacknowledged == fSendMax) {
2341 TRACE("all acknowledged, cancelling retransmission timer.");
2342 gStackModule->cancel_timer(&fRetransmitTimer);
2343 T(TimerSet(this, "retransmit", -1));
2344 } else {
2345 TRACE("data acknowledged, resetting retransmission timer to: %"
2346 B_PRIdBIGTIME, fRetransmitTimeout);
2347 gStackModule->set_timer(&fRetransmitTimer, fRetransmitTimeout);
2348 T(TimerSet(this, "retransmit", fRetransmitTimeout));
2351 if (is_writable(fState)) {
2352 // notify threads waiting on the socket to become writable again
2353 fSendCondition.NotifyAll();
2354 gSocketModule->notify(socket, B_SELECT_WRITE, fSendQueue.Free());
2358 // if there is data left to be sent, send it now
2359 if (fSendQueue.Used() > 0)
2360 _SendQueued();
2364 void
2365 TCPEndpoint::_Retransmit()
2367 TRACE("Retransmit()");
2369 if (fState < ESTABLISHED) {
2370 fRetransmitTimeout = TCP_SYN_RETRANSMIT_TIMEOUT;
2371 fCongestionWindow = fSendMaxSegmentSize;
2372 } else {
2373 _ResetSlowStart();
2374 fDuplicateAcknowledgeCount = 0;
2375 // Do exponential back off of the retransmit timeout
2376 fRetransmitTimeout *= 2;
2377 if (fRetransmitTimeout > TCP_MAX_RETRANSMIT_TIMEOUT)
2378 fRetransmitTimeout = TCP_MAX_RETRANSMIT_TIMEOUT;
2381 fSendNext = fSendUnacknowledged;
2382 _SendQueued();
2384 fRecover = fSendNext.Number() - 1;
2385 if ((fFlags & FLAG_RECOVERY) != 0)
2386 fFlags &= ~FLAG_RECOVERY;
2390 void
2391 TCPEndpoint::_UpdateRoundTripTime(int32 roundTripTime, int32 expectedSamples)
2393 if(fSmoothedRoundTripTime == 0) {
2394 fSmoothedRoundTripTime = roundTripTime;
2395 fRoundTripVariation = roundTripTime / 2;
2396 fRetransmitTimeout = (fSmoothedRoundTripTime + max_c(100, fRoundTripVariation * 4))
2397 * kTimestampFactor;
2398 } else {
2399 int32 delta = fSmoothedRoundTripTime - roundTripTime;
2400 if (delta < 0)
2401 delta = -delta;
2402 fRoundTripVariation += (delta - fRoundTripVariation) / (expectedSamples * 4);
2403 fSmoothedRoundTripTime += (roundTripTime - fSmoothedRoundTripTime) / (expectedSamples * 8);
2404 fRetransmitTimeout = (fSmoothedRoundTripTime + max_c(100, fRoundTripVariation * 4))
2405 * kTimestampFactor;
2408 if (fRetransmitTimeout > TCP_MAX_RETRANSMIT_TIMEOUT)
2409 fRetransmitTimeout = TCP_MAX_RETRANSMIT_TIMEOUT;
2411 if (fRetransmitTimeout < TCP_MIN_RETRANSMIT_TIMEOUT)
2412 fRetransmitTimeout = TCP_MIN_RETRANSMIT_TIMEOUT;
2414 TRACE(" RTO is now %" B_PRIdBIGTIME " (after rtt %" B_PRId32 "ms)",
2415 fRetransmitTimeout, roundTripTime);
2419 void
2420 TCPEndpoint::_ResetSlowStart()
2422 fSlowStartThreshold = max_c((fSendMax - fSendUnacknowledged).Number() / 2,
2423 2 * fSendMaxSegmentSize);
2424 fCongestionWindow = fSendMaxSegmentSize;
2428 // #pragma mark - timer
2431 /*static*/ void
2432 TCPEndpoint::_RetransmitTimer(net_timer* timer, void* _endpoint)
2434 TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
2435 T(TimerTriggered(endpoint, "retransmit"));
2437 MutexLocker locker(endpoint->fLock);
2438 if (!locker.IsLocked() || gStackModule->is_timer_active(timer))
2439 return;
2441 endpoint->_Retransmit();
2445 /*static*/ void
2446 TCPEndpoint::_PersistTimer(net_timer* timer, void* _endpoint)
2448 TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
2449 T(TimerTriggered(endpoint, "persist"));
2451 MutexLocker locker(endpoint->fLock);
2452 if (!locker.IsLocked())
2453 return;
2455 // the timer might not have been canceled early enough
2456 if (endpoint->State() == CLOSED)
2457 return;
2459 endpoint->_SendQueued(true);
2463 /*static*/ void
2464 TCPEndpoint::_DelayedAcknowledgeTimer(net_timer* timer, void* _endpoint)
2466 TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
2467 T(TimerTriggered(endpoint, "delayed ack"));
2469 MutexLocker locker(endpoint->fLock);
2470 if (!locker.IsLocked())
2471 return;
2473 // the timer might not have been canceled early enough
2474 if (endpoint->State() == CLOSED)
2475 return;
2477 endpoint->SendAcknowledge(true);
2481 /*static*/ void
2482 TCPEndpoint::_TimeWaitTimer(net_timer* timer, void* _endpoint)
2484 TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
2485 T(TimerTriggered(endpoint, "time-wait"));
2487 MutexLocker locker(endpoint->fLock);
2488 if (!locker.IsLocked())
2489 return;
2491 if ((endpoint->fFlags & FLAG_CLOSED) == 0) {
2492 endpoint->fFlags |= FLAG_DELETE_ON_CLOSE;
2493 return;
2496 locker.Unlock();
2498 gSocketModule->release_socket(endpoint->socket);
2502 /*static*/ status_t
2503 TCPEndpoint::_WaitForCondition(ConditionVariable& condition,
2504 MutexLocker& locker, bigtime_t timeout)
2506 ConditionVariableEntry entry;
2507 condition.Add(&entry);
2509 locker.Unlock();
2510 status_t result = entry.Wait(B_ABSOLUTE_TIMEOUT | B_CAN_INTERRUPT, timeout);
2511 locker.Lock();
2513 return result;
2517 // #pragma mark -
2520 void
2521 TCPEndpoint::Dump() const
2523 kprintf("TCP endpoint %p\n", this);
2524 kprintf(" state: %s\n", name_for_state(fState));
2525 kprintf(" flags: 0x%" B_PRIx32 "\n", fFlags);
2526 #if KDEBUG
2527 kprintf(" lock: { %p, holder: %" B_PRId32 " }\n", &fLock, fLock.holder);
2528 #endif
2529 kprintf(" accept sem: %" B_PRId32 "\n", fAcceptSemaphore);
2530 kprintf(" options: 0x%" B_PRIx32 "\n", (uint32)fOptions);
2531 kprintf(" send\n");
2532 kprintf(" window shift: %" B_PRIu8 "\n", fSendWindowShift);
2533 kprintf(" unacknowledged: %" B_PRIu32 "\n",
2534 fSendUnacknowledged.Number());
2535 kprintf(" next: %" B_PRIu32 "\n", fSendNext.Number());
2536 kprintf(" max: %" B_PRIu32 "\n", fSendMax.Number());
2537 kprintf(" urgent offset: %" B_PRIu32 "\n", fSendUrgentOffset.Number());
2538 kprintf(" window: %" B_PRIu32 "\n", fSendWindow);
2539 kprintf(" max window: %" B_PRIu32 "\n", fSendMaxWindow);
2540 kprintf(" max segment size: %" B_PRIu32 "\n", fSendMaxSegmentSize);
2541 kprintf(" queue: %" B_PRIuSIZE " / %" B_PRIuSIZE "\n", fSendQueue.Used(),
2542 fSendQueue.Size());
2543 #if DEBUG_BUFFER_QUEUE
2544 fSendQueue.Dump();
2545 #endif
2546 kprintf(" last acknowledge sent: %" B_PRIu32 "\n",
2547 fLastAcknowledgeSent.Number());
2548 kprintf(" initial sequence: %" B_PRIu32 "\n",
2549 fInitialSendSequence.Number());
2550 kprintf(" receive\n");
2551 kprintf(" window shift: %" B_PRIu8 "\n", fReceiveWindowShift);
2552 kprintf(" next: %" B_PRIu32 "\n", fReceiveNext.Number());
2553 kprintf(" max advertised: %" B_PRIu32 "\n",
2554 fReceiveMaxAdvertised.Number());
2555 kprintf(" window: %" B_PRIu32 "\n", fReceiveWindow);
2556 kprintf(" max segment size: %" B_PRIu32 "\n", fReceiveMaxSegmentSize);
2557 kprintf(" queue: %" B_PRIuSIZE " / %" B_PRIuSIZE "\n",
2558 fReceiveQueue.Available(), fReceiveQueue.Size());
2559 #if DEBUG_BUFFER_QUEUE
2560 fReceiveQueue.Dump();
2561 #endif
2562 kprintf(" initial sequence: %" B_PRIu32 "\n",
2563 fInitialReceiveSequence.Number());
2564 kprintf(" duplicate acknowledge count: %" B_PRIu32 "\n",
2565 fDuplicateAcknowledgeCount);
2566 kprintf(" smoothed round trip time: %" B_PRId32 " (deviation %" B_PRId32 ")\n",
2567 fSmoothedRoundTripTime, fRoundTripVariation);
2568 kprintf(" retransmit timeout: %" B_PRId64 "\n", fRetransmitTimeout);
2569 kprintf(" congestion window: %" B_PRIu32 "\n", fCongestionWindow);
2570 kprintf(" slow start threshold: %" B_PRIu32 "\n", fSlowStartThreshold);