/* LWIP service - tcpsock.c - TCP sockets */
/*
 * This module implements support for TCP sockets based on lwIP's core TCP
 * PCB module, which is largely but not fully cooperative with exactly what
 * we want to achieve, with the result that this module is rather
 * complicated.
 *
 * Each socket has a send queue and a receive queue. Both use lwIP's own
 * (pbuf) buffers, which largely come out of the main 512-byte buffer pool.
 * The buffers on the send queue are allocated and freed by us--the latter
 * only once they are no longer in use by lwIP as well. A bit
 * counterintuitively, we deliberately use a smaller lwIP per-PCB TCP send
 * buffer limit (TCP_SND_BUF) in the lwIP send configuration (lwipopts.h) in
 * order to more easily trigger conditions where we cannot enqueue data (or
 * the final FIN) right away. This way, we get to test the internal logic of
 * this module a lot more easily. The small lwIP send queue size should not
 * have any impact on performance, as our own per-socket send queues can be
 * much larger and we enqueue more of that on the lwIP PCB as soon as we can
 * in all cases.
 *
 * The receive queue consists of whatever buffers were given to us by lwIP,
 * but since those may be many buffers with small amounts of data each, we
 * perform fairly aggressive merging of consecutive buffers. The intended
 * result is that we waste no more than 50% of memory within the receive
 * queue. Merging requires memory copies, which makes it expensive, but we
 * do not configure lwIP with enough buffers to make running out of buffers
 * a non-issue, so this trade-off is necessary. Practical experience and
 * measurements of the merge policy will have to show whether and how the
 * current policy may be improved.
 *
 * As can be expected, the connection close semantics are by far the most
 * complicated part of this module. We attempt to get rid of the lwIP PCB as
 * soon as we can, letting lwIP take care of the TIME_WAIT state for example.
 * However, there are various conditions that have to be met before we can
 * forget about the PCB here--most importantly, that none of our sent data
 * blocks are still referenced by lwIP because they have not yet been sent
 * or acknowledged. We can only free the data blocks once lwIP is done with
 * them.
 *
 * We do consider the TCP state of lwIP's PCB, in order to avoid duplicating
 * full state tracking here. However, we do not look at a socket's TCP state
 * while in a lwIP-generated event for that socket, because the state may
 * not necessarily reflect the (correct or new) TCP state of the connection,
 * nor may the PCB be available--this is the case for error events. For
 * these reasons we use a few internal TCPF_ flags to perform partial state
 * tracking.
 *
 * More generally, we tend to access lwIP PCB fields directly only when
 * lwIP's own BSD API implementation does that too and there is no better
 * alternative. One example of this is the check to see if our FIN was
 * acknowledged, for SO_LINGER support. In terms of maintenance, our hope is
 * that if lwIP's API changes later, we can change our code to imitate
 * whatever lwIP's BSD API implementation does at that point.
 */
#include <sys/socketvar.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <netinet/ip_var.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_fsm.h>
/*
 * Unfortunately, NetBSD and lwIP have different definitions of a few
 * relevant preprocessor variables. Make sure we do not attempt to use the
 * NetBSD one where it matters. We do need one of the NetBSD definitions
 * though.
 */
static const unsigned int NETBSD_TF_NODELAY = TF_NODELAY;
#undef TF_NODELAY

#include "lwip/priv/tcp_priv.h"	/* for tcp_pcb_lists */
/*
 * The number of TCP sockets (NR_TCPSOCK) is defined in the lwIP
 * configuration.
 */
/*
 * We fully control the send buffer, so we can let its size be set to
 * whatever we want. The receive buffer is different: if it is smaller than
 * the window size, we may have to refuse data that lwIP hands us, at which
 * point more incoming data will cause lwIP to abort the TCP
 * connection--even aside from performance issues. Therefore, we must make
 * sure the receive buffer is no smaller than the TCP window at all times.
 */
#define TCP_SNDBUF_MIN	1		/* minimum TCP send buffer size */
#define TCP_SNDBUF_DEF	32768		/* default TCP send buffer size */
#define TCP_SNDBUF_MAX	131072		/* maximum TCP send buffer size */
#define TCP_RCVBUF_MIN	TCP_WND		/* minimum TCP receive buffer size */
#define TCP_RCVBUF_DEF	MAX(TCP_WND, 32768)  /* default TCP recv buffer size */
#define TCP_RCVBUF_MAX	MAX(TCP_WND, 131072) /* maximum TCP recv buffer size */
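/*
 * Worked example (illustration, with an assumed window size): if TCP_WND
 * were 32768, then TCP_RCVBUF_DEF = MAX(32768, 32768) = 32768 and
 * TCP_RCVBUF_MAX = MAX(32768, 131072) = 131072, so all receive buffer sizes
 * stay at or above the window size as required above.
 */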
/*
 * The total number of buffers that may be in use for TCP socket send queues.
 * The goal is to allow at least some progress to be made on receiving from
 * TCP sockets and on differently-typed sockets, at least as long as the LWIP
 * service can manage to allocate the memory it wants. For the case that it
 * does not, we can only reactively kill off TCP sockets and/or free enqueued
 * ethernet packets, neither of which is currently implemented (TODO).
 */
#define TCP_MAX_SENDBUFS	(mempool_max_buffers() * 3 / 4)
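/*
 * For example (illustration, not from the original source): if the memory
 * pool maxes out at 1024 buffers, TCP_MAX_SENDBUFS evaluates to 768, leaving
 * at least a quarter of the pool for receive-side pbufs and for other socket
 * types.
 */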
/* Polling intervals, in 500-millisecond units. */
#define TCP_POLL_REG_INTERVAL	10	/* interval for reattempting sends */
#define TCP_POLL_CLOSE_INTERVAL	1	/* interval while closing connection */
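/*
 * In other words, regular sockets are polled once every 10 * 500 ms = 5
 * seconds, while sockets that are being closed are polled every 500 ms.
 */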
static struct tcpsock {
	struct ipsock tcp_ipsock;		/* IP socket, MUST be first */
	struct tcp_pcb *tcp_pcb;		/* lwIP TCP control block */
	union pxfer_tcp_queue {			/* free/accept queue */
		TAILQ_ENTRY(tcpsock) tq_next;	/* next in queue */
		TAILQ_HEAD(, tcpsock) tq_head;	/* head of queue */
	} tcp_queue;
	struct tcpsock *tcp_listener;		/* listener if on accept q. */
	struct {				/* send queue */
		struct pbuf *ts_head;		/* first pbuf w/unacked data */
		struct pbuf *ts_unsent;		/* first pbuf w/unsent data */
		struct pbuf *ts_tail;		/* most recently added data */
		size_t ts_len;			/* total sent + unsent */
		unsigned short ts_head_off;	/* offset into head pbuf */
		unsigned short ts_unsent_off;	/* offset into unsent pbuf */
	} tcp_snd;
	struct {				/* receive queue */
		struct pbuf *tr_head;		/* first pbuf w/unrecvd data */
		struct pbuf **tr_pre_tailp;	/* ptr-ptr to newest pbuf */
		size_t tr_len;			/* bytes on receive queue */
		unsigned short tr_head_off;	/* offset into head pbuf */
		unsigned short tr_unacked;	/* current window reduction */
	} tcp_rcv;
} tcp_array[NR_TCPSOCK];
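/*
 * Note that tcp_queue can be a union because its two fields are never in use
 * at the same time: tq_next links a socket that is on the free list or on a
 * listening socket's accept queue, while tq_head is used only by a listening
 * socket to maintain that accept queue.
 */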
static TAILQ_HEAD(, tcpsock) tcp_freelist;	/* list of free TCP sockets */

static const struct sockevent_ops tcpsock_ops;

static unsigned int tcpsock_sendbufs;		/* # send buffers in use */
static unsigned int tcpsock_recvbufs;		/* # receive buffers in use */
/* A bunch of macros that are just for convenience. */
#define tcpsock_get_id(tcp)	(SOCKID_TCP | (sockid_t)((tcp) - tcp_array))
#define tcpsock_get_ipsock(tcp)	(&(tcp)->tcp_ipsock)
#define tcpsock_get_sock(tcp)	(ipsock_get_sock(tcpsock_get_ipsock(tcp)))
#define tcpsock_get_sndbuf(tcp)	(ipsock_get_sndbuf(tcpsock_get_ipsock(tcp)))
#define tcpsock_get_rcvbuf(tcp)	(ipsock_get_rcvbuf(tcpsock_get_ipsock(tcp)))
#define tcpsock_is_ipv6(tcp)	(ipsock_is_ipv6(tcpsock_get_ipsock(tcp)))
#define tcpsock_is_shutdown(tcp,fl)	\
	(sockevent_is_shutdown(tcpsock_get_sock(tcp), fl))
#define tcpsock_is_listening(tcp)	\
	(sockevent_is_listening(tcpsock_get_sock(tcp)))
#define tcpsock_get_flags(tcp)	(ipsock_get_flags(tcpsock_get_ipsock(tcp)))
#define tcpsock_set_flag(tcp,fl)	\
	(ipsock_set_flag(tcpsock_get_ipsock(tcp), fl))
#define tcpsock_clear_flag(tcp,fl)	\
	(ipsock_clear_flag(tcpsock_get_ipsock(tcp), fl))
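/*
 * Illustration (not part of the original source): tcpsock_get_id() tags the
 * tcp_array slot number with SOCKID_TCP. Assuming that the tag occupies only
 * bits above the slot number, a hypothetical inverse lookup could be
 * sketched as follows.
 */
#if 0
static struct tcpsock *
tcpsock_from_id(sockid_t id)
{

	/* Hypothetical helper: strip the SOCKID_TCP tag to get the slot. */
	return &tcp_array[id & ~SOCKID_TCP];
}
#endif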
static ssize_t tcpsock_pcblist(struct rmib_call *, struct rmib_node *,
	struct rmib_oldp *, struct rmib_newp *);
/* The CTL_NET {PF_INET,PF_INET6} IPPROTO_TCP subtree. */
/* TODO: add many more and make some of them writable.. */
static struct rmib_node net_inet_tcp_table[] = {
/* 2*/	[TCPCTL_SENDSPACE]	= RMIB_INT(RMIB_RO, TCP_SNDBUF_DEF,
				    "Default TCP send buffer size"),
/* 3*/	[TCPCTL_RECVSPACE]	= RMIB_INT(RMIB_RO, TCP_RCVBUF_DEF,
				    "Default TCP receive buffer size"),
/*29*/	[TCPCTL_LOOPBACKCKSUM]	= RMIB_FUNC(RMIB_RW | CTLTYPE_INT, sizeof(int),
				    loopif_cksum, "do_loopback_cksum",
				    "Perform TCP checksum on loopback"),
/*+0*/	[TCPCTL_MAXID]		= RMIB_FUNC(RMIB_RO | CTLTYPE_NODE, 0,
				    tcpsock_pcblist, "pcblist",
				    "TCP protocol control block list"),
/*+1*/	[TCPCTL_MAXID + 1]	= RMIB_FUNC(RMIB_RW | CTLFLAG_PRIVATE |
				    CTLFLAG_HIDDEN | CTLTYPE_STRING,
				    TCPISN_SECRET_HEX_LENGTH, tcpisn_secret,
				    "TCP ISN secret (MINIX 3 specific)")
};
static struct rmib_node net_inet_tcp_node =
    RMIB_NODE(RMIB_RO, net_inet_tcp_table, "tcp", "TCP related settings");
static struct rmib_node net_inet6_tcp6_node =
    RMIB_NODE(RMIB_RO, net_inet_tcp_table, "tcp6", "TCP related settings");
/*
 * Initialize the TCP sockets module.
 */
void
tcpsock_init(void)
{
	unsigned int slot;

	/* Initialize the list of free TCP sockets. */
	TAILQ_INIT(&tcp_freelist);

	for (slot = 0; slot < __arraycount(tcp_array); slot++)
		TAILQ_INSERT_TAIL(&tcp_freelist, &tcp_array[slot],
		    tcp_queue.tq_next);

	/* Initialize other variables. */
	tcpsock_sendbufs = 0;
	tcpsock_recvbufs = 0;

	/* Register the net.inet.tcp and net.inet6.tcp6 RMIB subtrees. */
	mibtree_register_inet(PF_INET, IPPROTO_TCP, &net_inet_tcp_node);
	mibtree_register_inet(PF_INET6, IPPROTO_TCP, &net_inet6_tcp6_node);
}
/*
 * Initialize the state of a TCP socket's send queue.
 */
static void
tcpsock_reset_send(struct tcpsock * tcp)
{

	tcp->tcp_snd.ts_tail = NULL;
	tcp->tcp_snd.ts_unsent = NULL;
	tcp->tcp_snd.ts_head = NULL;
	tcp->tcp_snd.ts_len = 0;
	tcp->tcp_snd.ts_unsent_off = 0;
	tcp->tcp_snd.ts_head_off = 0;
}
/*
 * Initialize the state of a TCP socket's receive queue.
 */
static void
tcpsock_reset_recv(struct tcpsock * tcp)
{

	tcp->tcp_rcv.tr_pre_tailp = NULL;
	tcp->tcp_rcv.tr_head = NULL;
	tcp->tcp_rcv.tr_len = 0;
	tcp->tcp_rcv.tr_head_off = 0;
	tcp->tcp_rcv.tr_unacked = 0;
}
/*
 * Create a TCP socket.
 */
sockid_t
tcpsock_socket(int domain, int protocol, struct sock ** sockp,
	const struct sockevent_ops ** ops)
{
	struct tcpsock *tcp;
	uint8_t ip_type;

	switch (protocol) {
	case 0:
	case IPPROTO_TCP:
		break;
	default:
		return EPROTONOSUPPORT;
	}

	if (TAILQ_EMPTY(&tcp_freelist))
		return ENOBUFS;

	tcp = TAILQ_FIRST(&tcp_freelist);

	/*
	 * Initialize the structure. Do not memset it to zero, as it is still
	 * part of the linked free list. Initialization may still fail. When
	 * adding new fields, make sure to change tcpsock_clone() accordingly.
	 */

	ip_type = ipsock_socket(tcpsock_get_ipsock(tcp), domain,
	    TCP_SNDBUF_DEF, TCP_RCVBUF_DEF, sockp);

	if ((tcp->tcp_pcb = tcp_new_ip_type(ip_type)) == NULL)
		return ENOBUFS;

	tcp_arg(tcp->tcp_pcb, tcp);

	tcp->tcp_listener = NULL;

	tcpsock_reset_send(tcp);
	tcpsock_reset_recv(tcp);

	TAILQ_REMOVE(&tcp_freelist, tcp, tcp_queue.tq_next);

	*ops = &tcpsock_ops;

	return tcpsock_get_id(tcp);
}
/*
 * Create a TCP socket for the TCP PCB 'pcb' which identifies a new
 * connection incoming on listening socket 'listener'. The new socket is
 * essentially a "clone" of the listening TCP socket, in that it should
 * inherit any settings from the listening socket. The socket has not yet
 * been accepted by userland, so add it to the queue of connections pending
 * for the listening socket. On success, return OK. On failure, return a
 * negative error code.
 */
static int
tcpsock_clone(struct tcpsock * listener, struct tcp_pcb * pcb)
{
	struct tcpsock *tcp;

	if (TAILQ_EMPTY(&tcp_freelist))
		return ENOBUFS;

	tcp = TAILQ_FIRST(&tcp_freelist);

	/*
	 * Initialize the structure. Do not memset it to zero, as it is still
	 * part of the linked free list. Initialization may still fail. Most
	 * settings should be inherited from the listening socket here,
	 * rather than being initialized to their default state.
	 */

	ipsock_clone(tcpsock_get_ipsock(listener), tcpsock_get_ipsock(tcp),
	    tcpsock_get_id(tcp));

	tcp->tcp_pcb = pcb;
	tcp_arg(pcb, tcp);

	tcpsock_reset_send(tcp);
	tcpsock_reset_recv(tcp);

	/*
	 * Remove the new socket from the free list, and add it to the queue
	 * of the listening socket--in this order, because the same next
	 * pointer is used for both.
	 */
	TAILQ_REMOVE(&tcp_freelist, tcp, tcp_queue.tq_next);

	TAILQ_INSERT_TAIL(&listener->tcp_queue.tq_head, tcp,
	    tcp_queue.tq_next);

	tcp->tcp_listener = listener;

	return OK;
}
/*
 * Allocate a buffer from the pool, using the standard pool size. The
 * returned buffer is a single element--never a chain.
 */
static struct pbuf *
tcpsock_alloc_buf(void)
{
	struct pbuf *pbuf;

	pbuf = pbuf_alloc(PBUF_RAW, MEMPOOL_BUFSIZE, PBUF_RAM);

	assert(pbuf == NULL || pbuf->len == pbuf->tot_len);

	return pbuf;
}
/*
 * Free the given buffer. Ensure that pbuf_free() will not attempt to free
 * the next buffer(s) in the chain as well. This may be called for pbufs
 * other than those allocated with tcpsock_alloc_buf().
 */
static void
tcpsock_free_buf(struct pbuf * pbuf)
{

	/*
	 * Resetting the length is currently not necessary, but better safe
	 * than sorry..
	 */
	pbuf->len = pbuf->tot_len;
	pbuf->next = NULL;

	pbuf_free(pbuf);
}
/*
 * Clear the send queue of a TCP socket. The caller must ensure that lwIP
 * will no longer access any of the data on the send queue.
 */
static void
tcpsock_clear_send(struct tcpsock * tcp)
{
	struct pbuf *phead;

	assert(tcp->tcp_pcb == NULL);

	while ((phead = tcp->tcp_snd.ts_head) != NULL) {
		tcp->tcp_snd.ts_head = phead->next;

		assert(tcpsock_sendbufs > 0);
		tcpsock_sendbufs--;

		tcpsock_free_buf(phead);
	}

	tcpsock_reset_send(tcp);
}
/*
 * Clear the receive queue of a TCP socket. If 'ack_data' is set, also
 * acknowledge the previous contents of the receive queue to lwIP.
 */
static size_t
tcpsock_clear_recv(struct tcpsock * tcp, int ack_data)
{
	struct pbuf *phead;
	size_t rlen;

	rlen = tcp->tcp_rcv.tr_len;

	while ((phead = tcp->tcp_rcv.tr_head) != NULL) {
		tcp->tcp_rcv.tr_head = phead->next;

		assert(tcpsock_recvbufs > 0);
		tcpsock_recvbufs--;

		tcpsock_free_buf(phead);
	}

	/*
	 * From now on, we will basically be discarding incoming data as fast
	 * as possible, to keep the full window open at all times.
	 */
	if (ack_data && tcp->tcp_pcb != NULL && tcp->tcp_rcv.tr_unacked > 0)
		tcp_recved(tcp->tcp_pcb, tcp->tcp_rcv.tr_unacked);

	tcpsock_reset_recv(tcp);

	return rlen;
}
/*
 * The TCP socket's PCB has been detached from the socket, typically because
 * the connection was aborted, either by us or by lwIP. Either way, any TCP
 * connection is gone. Clear the socket's send queue, remove the socket from
 * a listening socket's queue, and if the socket itself is ready and allowed
 * to be freed, free it now. The socket is ready to be freed if it was
 * either on a listening queue or being closed already. The socket is
 * allowed to be freed only if 'may_free' is TRUE. If the socket is not
 * freed, its receive queue is left as is, as it may still have data to be
 * received by userland.
 */
static int
tcpsock_cleanup(struct tcpsock * tcp, int may_free)
{
	int destroy;

	assert(tcp->tcp_pcb == NULL);

	/*
	 * Free any data on the send queue. This is safe to do right now,
	 * because the PCB has been aborted (or was already gone). We must be
	 * very careful about clearing the send queue in all other
	 * situations.
	 */
	tcpsock_clear_send(tcp);

	/*
	 * If this was a socket pending acceptance, remove it from the
	 * corresponding listener socket's queue, and free it. Otherwise,
	 * free the socket only if it suspended a graceful close operation.
	 */
	if (tcp->tcp_listener != NULL) {
		TAILQ_REMOVE(&tcp->tcp_listener->tcp_queue.tq_head, tcp,
		    tcp_queue.tq_next);
		tcp->tcp_listener = NULL;

		/*
		 * The listener socket's backlog count should be adjusted by
		 * lwIP whenever the PCB is freed up, so we need (and must)
		 * not attempt to do that here.
		 */

		destroy = TRUE;
	} else
		destroy = sockevent_is_closing(tcpsock_get_sock(tcp));

	/*
	 * Do not free the socket if 'may_free' is FALSE. That flag may be
	 * set if we are currently in the second tcpsock_close() call on the
	 * socket, in which case sockevent_is_closing() is TRUE but we must
	 * still not free the socket now: doing so would derail libsockevent.
	 */
	if (destroy && may_free) {
		(void)tcpsock_clear_recv(tcp, FALSE /*ack_data*/);

		sockevent_raise(tcpsock_get_sock(tcp), SEV_CLOSE);
	}

	return destroy;
}
/*
 * Abort the lwIP PCB for the given socket, using tcp_abort(). If the PCB is
 * connected, this will cause the connection to be reset. The PCB, which
 * must have still been present before the call, will be gone after the
 * call.
 */
static void
tcpsock_pcb_abort(struct tcpsock * tcp)
{

	assert(tcp->tcp_pcb != NULL);
	assert(!tcpsock_is_listening(tcp));

	tcp_recv(tcp->tcp_pcb, NULL);
	tcp_sent(tcp->tcp_pcb, NULL);
	tcp_err(tcp->tcp_pcb, NULL);
	tcp_poll(tcp->tcp_pcb, NULL, TCP_POLL_REG_INTERVAL);

	tcp_arg(tcp->tcp_pcb, NULL);

	tcp_abort(tcp->tcp_pcb);

	tcp->tcp_pcb = NULL;
}
/*
 * Close the lwIP PCB for the given socket, using tcp_close(). If the PCB is
 * connected, its graceful close will be finished by lwIP in the background.
 * The PCB, which must have still been present before the call, will be gone
 * after the call.
 */
static void
tcpsock_pcb_close(struct tcpsock * tcp)
{
	err_t err;

	assert(tcp->tcp_pcb != NULL);
	assert(tcp->tcp_snd.ts_len == 0);

	if (!tcpsock_is_listening(tcp)) {
		tcp_recv(tcp->tcp_pcb, NULL);
		tcp_sent(tcp->tcp_pcb, NULL);
		tcp_err(tcp->tcp_pcb, NULL);
		tcp_poll(tcp->tcp_pcb, NULL, TCP_POLL_REG_INTERVAL);
	}

	tcp_arg(tcp->tcp_pcb, NULL);

	if ((err = tcp_close(tcp->tcp_pcb)) != ERR_OK)
		panic("unexpected TCP close failure: %d", err);

	tcp->tcp_pcb = NULL;
}
/*
 * Return TRUE if all conditions are met for closing the TCP socket's PCB,
 * or FALSE if they are not. Upon calling this function, the socket's PCB
 * must still be around.
 */
static int
tcpsock_may_close(struct tcpsock * tcp)
{

	assert(tcp->tcp_pcb != NULL);

	/*
	 * Regular closing of the PCB requires three conditions to be met:
	 *
	 * 1. all our data has been transmitted AND acknowledged, so that we
	 *    do not risk corruption in case there are still unsent or
	 *    unack'ed data buffers that may otherwise be recycled too soon;
	 * 2. we have sent our FIN to the peer; and,
	 * 3. we have received a FIN from the peer.
	 */
	return ((tcpsock_get_flags(tcp) & (TCPF_SENT_FIN | TCPF_RCVD_FIN)) ==
	    (TCPF_SENT_FIN | TCPF_RCVD_FIN) && tcp->tcp_snd.ts_len == 0);
}
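/*
 * For example, a socket that has called shutdown(SHUT_WR) and has already
 * received the peer's FIN, but that still has unacknowledged data on its
 * send queue (ts_len > 0), does not yet meet condition 1 and must keep its
 * PCB around until the "sent" event below drains the queue.
 */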
/*
 * The given socket is ready to be closed as per the tcpsock_may_close()
 * rules. This implies that its send queue is already empty. Gracefully
 * close the PCB. In addition, if the socket is being closed gracefully,
 * meaning we suspended an earlier tcpsock_close() call (and as such already
 * emptied the receive queue as well), then tell libsockevent that the close
 * is finished, freeing the socket. Return TRUE if the socket has indeed
 * been freed this way, or FALSE if the socket is still around.
 */
static int
tcpsock_finish_close(struct tcpsock * tcp)
{

	assert(tcp->tcp_snd.ts_len == 0);
	assert(tcp->tcp_listener == NULL);

	/*
	 * If we get here, we have already shut down the sending side of the
	 * PCB. Technically, we are interested only in shutting down the
	 * receiving side of the PCB here, so that lwIP may decide to recycle
	 * the socket later etcetera. We call tcp_close() because we do not
	 * want to rely on tcp_shutdown(RX) doing the exact same thing.
	 * However, we do rely on the fact that the PCB is not immediately
	 * destroyed by the tcp_close() call: otherwise we may have to return
	 * ERR_ABRT if this function is called from a lwIP-generated event.
	 */
	tcpsock_pcb_close(tcp);

	/*
	 * If we suspended an earlier tcpsock_close() call, we have to tell
	 * libsockevent that the close operation is now complete.
	 */
	if (sockevent_is_closing(tcpsock_get_sock(tcp))) {
		assert(tcp->tcp_rcv.tr_len == 0);

		sockevent_raise(tcpsock_get_sock(tcp), SEV_CLOSE);

		return TRUE;
	} else
		return FALSE;
}
/*
 * Attempt to start or resume enqueuing data and/or a FIN to send on the
 * given TCP socket. Return TRUE if anything at all could be newly enqueued
 * on the lwIP PCB, even if less than desired. In that case, the caller
 * should try to send whatever was enqueued, and if applicable, check if the
 * socket may now be closed (due to the FIN being enqueued). In particular,
 * in any situation where the socket may be in the process of being closed,
 * the caller must use tcpsock_may_close() if TRUE is returned. Return FALSE
 * if nothing new could be enqueued, in which case no send attempt needs to
 * be made either.
 */
static int
tcpsock_pcb_enqueue(struct tcpsock * tcp)
{
	struct pbuf *punsent;
	size_t space, chunk;
	unsigned int flags;
	err_t err;
	int enqueued;

	assert(tcp->tcp_pcb != NULL);

	if (tcpsock_get_flags(tcp) & TCPF_FULL)
		return FALSE;

	/*
	 * Attempt to enqueue more unsent data, if any, on the PCB's send
	 * queue.
	 */
	enqueued = FALSE;

	while (tcp->tcp_snd.ts_unsent != NULL) {
		if ((space = tcp_sndbuf(tcp->tcp_pcb)) == 0)
			break;

		/*
		 * We may maintain a non-NULL unsent pointer even when there
		 * is nothing more to send right now, because the tail buffer
		 * may be filled up further later on.
		 */
		punsent = tcp->tcp_snd.ts_unsent;

		assert(punsent->len >= tcp->tcp_snd.ts_unsent_off);

		chunk = (size_t)punsent->len - tcp->tcp_snd.ts_unsent_off;
		if (chunk == 0)
			break;

		if (chunk > space)
			chunk = space;

		/* Try to enqueue more data for sending. */
		if (chunk < punsent->len || punsent->next != NULL)
			flags = TCP_WRITE_FLAG_MORE;
		else
			flags = 0;

		err = tcp_write(tcp->tcp_pcb, (char *)punsent->payload +
		    tcp->tcp_snd.ts_unsent_off, chunk, flags);

		/*
		 * Since tcp_write() enqueues data only, it should only
		 * return out-of-memory errors; no fatal ones. In any case,
		 * stop.
		 */
		if (err != ERR_OK) {
			assert(err == ERR_MEM);

			break;
		}

		/* We have successfully enqueued data. */
		enqueued = TRUE;

		tcp->tcp_snd.ts_unsent_off += chunk;

		if (tcp->tcp_snd.ts_unsent_off < punsent->tot_len) {
			assert(tcp->tcp_snd.ts_unsent_off < punsent->len ||
			    punsent->next == NULL);

			break;
		}

		tcp->tcp_snd.ts_unsent = punsent->next;
		tcp->tcp_snd.ts_unsent_off = 0;
	}

	/*
	 * If all pending data has been enqueued for sending, and we should
	 * shut down the sending end of the socket, try that now.
	 */
	if ((tcp->tcp_snd.ts_unsent == NULL ||
	    tcp->tcp_snd.ts_unsent_off == tcp->tcp_snd.ts_unsent->len) &&
	    tcpsock_is_shutdown(tcp, SFL_SHUT_WR) &&
	    !(tcpsock_get_flags(tcp) & TCPF_SENT_FIN)) {
		err = tcp_shutdown(tcp->tcp_pcb, 0 /*shut_rx*/, 1 /*shut_tx*/);

		if (err == ERR_OK) {
			/*
			 * We have successfully enqueued a FIN. The caller is
			 * now responsible for checking whether the PCB and
			 * possibly even the socket object can now be freed.
			 */
			tcpsock_set_flag(tcp, TCPF_SENT_FIN);

			enqueued = TRUE;
		} else {
			assert(err == ERR_MEM);

			/*
			 * FIXME: the resolution for lwIP bug #47485 has
			 * taken away even more control over the closing
			 * process from us, making tracking sockets
			 * especially for SO_LINGER even harder. For now, we
			 * simply effectively undo the patch by clearing
			 * TF_CLOSEPEND if tcp_shutdown() returns ERR_MEM.
			 * This will not be sustainable in the long term,
			 * though.
			 */
			tcp->tcp_pcb->flags &= ~TF_CLOSEPEND;

			tcpsock_set_flag(tcp, TCPF_FULL);
		}
	}

	return enqueued;
}
/*
 * Request lwIP to start sending any enqueued data and/or FIN on the TCP
 * socket's lwIP PCB. On success, return OK. On failure, return a negative
 * error code, after cleaning up the socket, freeing the PCB. If the socket
 * was already being closed, also free the socket object in that case; the
 * caller must then not touch the socket object anymore upon return. If the
 * socket object is not freed, and if 'raise_error' is TRUE, raise the error
 * on the socket object.
 */
static int
tcpsock_pcb_send(struct tcpsock * tcp, int raise_error)
{
	err_t err;
	int r;

	assert(tcp->tcp_pcb != NULL);

	/*
	 * If we have enqueued something, ask lwIP to send TCP packets now.
	 * This may result in a fatal error, in which case we clean up the
	 * socket and return the error to the caller. Since cleaning up the
	 * socket may free the socket object, and the caller cannot tell
	 * whether that will happen or has happened, also possibly raise the
	 * error on the socket object if it is not gone. As such, callers
	 * that set 'raise_error' to FALSE must know for sure that the socket
	 * was not being closed, for example because the caller is processing
	 * a (send) call from userland.
	 */
	err = tcp_output(tcp->tcp_pcb);

	if (err != ERR_OK && err != ERR_MEM) {
		tcpsock_pcb_abort(tcp);

		r = util_convert_err(err);

		if (!tcpsock_cleanup(tcp, TRUE /*may_free*/)) {
			if (raise_error)
				sockevent_set_error(tcpsock_get_sock(tcp), r);
		}
		/* Otherwise, do not touch the socket object anymore! */

		return r;
	} else
		return OK;
}
/*
 * Callback from lwIP. The given number of data bytes have been acknowledged
 * as received by the remote end. Dequeue and free data from the TCP
 * socket's send queue as appropriate.
 */
static err_t
tcpsock_event_sent(void * arg, struct tcp_pcb * pcb __unused, uint16_t len)
{
	struct tcpsock *tcp = (struct tcpsock *)arg;
	struct pbuf *phead;
	size_t left;

	assert(tcp != NULL);
	assert(pcb == tcp->tcp_pcb);

	assert(tcp->tcp_snd.ts_len >= len);
	assert(tcp->tcp_snd.ts_head != NULL);

	left = (size_t)len;

	/*
	 * First see if we can free up whole buffers. Check against the head
	 * buffer's 'len' rather than 'tot_len', or we may end up leaving an
	 * empty buffer on the chain.
	 */
	while ((phead = tcp->tcp_snd.ts_head) != NULL &&
	    left >= (size_t)phead->len - tcp->tcp_snd.ts_head_off) {
		left -= (size_t)phead->len - tcp->tcp_snd.ts_head_off;

		tcp->tcp_snd.ts_head = phead->next;
		tcp->tcp_snd.ts_head_off = 0;

		if (phead == tcp->tcp_snd.ts_unsent) {
			assert(tcp->tcp_snd.ts_unsent_off == phead->len);

			tcp->tcp_snd.ts_unsent = phead->next;
			tcp->tcp_snd.ts_unsent_off = 0;
		}

		assert(tcpsock_sendbufs > 0);
		tcpsock_sendbufs--;

		tcpsock_free_buf(phead);
	}

	/*
	 * The rest of the given length is for less than the current head
	 * buffer.
	 */
	if (left > 0) {
		assert(tcp->tcp_snd.ts_head != NULL);
		assert((size_t)tcp->tcp_snd.ts_head->len -
		    tcp->tcp_snd.ts_head_off > left);

		tcp->tcp_snd.ts_head_off += left;
	}

	tcp->tcp_snd.ts_len -= (size_t)len;

	if (tcp->tcp_snd.ts_head == NULL) {
		assert(tcp->tcp_snd.ts_len == 0);
		assert(tcp->tcp_snd.ts_unsent == NULL);
		tcp->tcp_snd.ts_tail = NULL;
	} else
		assert(tcp->tcp_snd.ts_len > 0);

	/*
	 * If we emptied the send queue, and we already managed to send a FIN
	 * earlier, we may now have met all requirements to close the
	 * socket's PCB. Otherwise, we may also be able to send more now, so
	 * try to resume sending. Since we are invoked from the "sent" event,
	 * tcp_output() will not actually process anything, and so we do not
	 * call it either. If we did, we would have to deal with errors here.
	 */
	if (tcpsock_may_close(tcp)) {
		if (tcpsock_finish_close(tcp))
			return ERR_OK;
	} else {
		tcpsock_clear_flag(tcp, TCPF_FULL);

		/*
		 * If we now manage to enqueue a FIN, we may be ready to close
		 * the PCB after all.
		 */
		if (tcpsock_pcb_enqueue(tcp)) {
			if (tcpsock_may_close(tcp) &&
			    tcpsock_finish_close(tcp))
				return ERR_OK;
		}
	}

	/* The user may also be able to send more now. */
	sockevent_raise(tcpsock_get_sock(tcp), SEV_SEND);

	return ERR_OK;
}
/*
 * Check whether any (additional) data previously received on a TCP socket
 * should be acknowledged, possibly allowing the remote end to send
 * additional data as a result.
 */
static void
tcpsock_ack_recv(struct tcpsock * tcp)
{
	size_t rcvbuf, left, delta, ack;

	assert(tcp->tcp_pcb != NULL);

	/*
	 * We must make sure that at all times, we can still add an entire
	 * window's worth of data to the receive queue. If the amount of free
	 * space drops below that threshold, we stop acknowledging received
	 * data. The user may change the receive buffer size at all times; we
	 * update the window size lazily as appropriate.
	 */
	rcvbuf = tcpsock_get_rcvbuf(tcp);

	if (rcvbuf > tcp->tcp_rcv.tr_len && tcp->tcp_rcv.tr_unacked > 0) {
		/*
		 * The number of bytes that lwIP can still give us at any
		 * time is represented as 'left'. The number of bytes that we
		 * still allow to be stored in the receive queue is
		 * represented as 'delta'. We must make sure that 'left' does
		 * not ever exceed 'delta' while acknowledging as many bytes
		 * as possible under that rule.
		 */
		left = TCP_WND - tcp->tcp_rcv.tr_unacked;
		delta = rcvbuf - tcp->tcp_rcv.tr_len;

		if (left < delta) {
			ack = delta - left;

			if (ack > tcp->tcp_rcv.tr_unacked)
				ack = tcp->tcp_rcv.tr_unacked;

			tcp_recved(tcp->tcp_pcb, ack);

			tcp->tcp_rcv.tr_unacked -= ack;

			assert(tcp->tcp_rcv.tr_len + TCP_WND -
			    tcp->tcp_rcv.tr_unacked <= rcvbuf);
		}
	}
}
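/*
 * Worked example (illustration, with assumed numbers): say TCP_WND is
 * 32768, the receive buffer size is 65536, tr_len is 40000 and tr_unacked
 * is 8192. Then left = 32768 - 8192 = 24576 and delta = 65536 - 40000 =
 * 25536, so at most 25536 - 24576 = 960 bytes may be acknowledged: after
 * the tcp_recved() call, lwIP may again hand us a full window's worth of
 * data without overrunning the receive buffer.
 */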
/*
 * Attempt to merge two consecutive underfilled buffers in the receive queue
 * of a TCP socket, freeing up one of the two buffers as a result. The first
 * (oldest) buffer is 'ptail', and the pointer to this buffer is stored at
 * 'pnext'. The second (new) buffer is 'pbuf', which is already attached to
 * the first buffer. The second buffer may be followed by additional buffers
 * with even more new data. Return TRUE if buffers have been merged, in
 * which case the pointer at 'pnext' may have changed, and no assumptions
 * should be made about whether 'ptail' and 'pbuf' still exist in any form.
 * Return FALSE if no merging was necessary or if no new buffer could be
 * allocated.
 */
static int
tcpsock_try_merge(struct pbuf **pnext, struct pbuf * ptail,
	struct pbuf * pbuf)
{
	struct pbuf *pnew;

	assert(*pnext == ptail);
	assert(ptail->next == pbuf);

	/*
	 * Unfortunately, we cannot figure out what kind of pbuf we were
	 * given by the lower layers, so we cannot merge two buffers without
	 * first allocating a third. Once we have done that, though, we can
	 * easily merge more into that new buffer. For now we use the
	 * following policy:
	 *
	 * 1. if two consecutive lwIP-provided buffers are both used less
	 *    than half the size of a full buffer, try to allocate a new
	 *    buffer and copy both lwIP-provided buffers into that new
	 *    buffer, freeing up the pair afterwards;
	 * 2. if the tail buffer on the chain is allocated by us and not yet
	 *    full, and the next buffer's contents can be added to the tail
	 *    buffer in their entirety, do just that.
	 *
	 * Obviously there is a trade-off between the performance overhead of
	 * copying and the resource overhead of keeping less-than-full
	 * buffers on the receive queue, but this policy should both keep
	 * actual memory usage to no more than twice the receive queue length
	 * and prevent excessive copying. The policy deliberately performs
	 * more aggressive merging into a buffer that we allocated ourselves.
	 */
	if (ptail->tot_len <= MEMPOOL_BUFSIZE / 2 &&
	    pbuf->len <= MEMPOOL_BUFSIZE / 2) {
		/* Case #1. */
		assert(ptail->tot_len == ptail->len);
		assert(pbuf->tot_len == pbuf->len);

		pnew = tcpsock_alloc_buf();
		if (pnew == NULL)
			return FALSE;

		memcpy(pnew->payload, ptail->payload, ptail->len);
		memcpy((char *)pnew->payload + ptail->len, pbuf->payload,
		    pbuf->len);
		pnew->len = ptail->len + pbuf->len;
		assert(pnew->len <= pnew->tot_len);

		pnew->next = pbuf->next;
		/* For now, we need not inherit any flags from either pbuf. */

		*pnext = pnew;

		/* One allocated, two about to be deallocated. */
		assert(tcpsock_recvbufs > 0);
		tcpsock_recvbufs--;

		tcpsock_free_buf(ptail);
		tcpsock_free_buf(pbuf);

		return TRUE;
	} else if (ptail->tot_len - ptail->len >= pbuf->len) {
		/* Case #2. */
		memcpy((char *)ptail->payload + ptail->len, pbuf->payload,
		    pbuf->len);

		ptail->len += pbuf->len;

		ptail->next = pbuf->next;

		assert(tcpsock_recvbufs > 0);
		tcpsock_recvbufs--;

		tcpsock_free_buf(pbuf);

		return TRUE;
	} else
		return FALSE;
}
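/*
 * Worked example (illustration): with 512-byte pool buffers, a 100-byte
 * lwIP-provided tail buffer followed by a 150-byte lwIP-provided buffer
 * falls under case #1: both use at most MEMPOOL_BUFSIZE / 2 = 256 bytes, so
 * both are copied into one newly allocated buffer now holding 250 bytes. If
 * a 200-byte buffer arrives next, case #2 applies: the new tail buffer
 * still has 512 - 250 = 262 bytes of room, so the 200 bytes are appended to
 * it in place and the source buffer is freed.
 */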
/*
 * Callback from lwIP. New data or flags have been received on a TCP socket.
 */
static err_t
tcpsock_event_recv(void * arg, struct tcp_pcb * pcb __unused,
	struct pbuf * pbuf, err_t err)
{
	struct tcpsock *tcp = (struct tcpsock *)arg;
	struct pbuf *ptail, **pprevp;
	size_t len;

	assert(tcp != NULL);
	assert(pcb == tcp->tcp_pcb);

	/*
	 * lwIP should never provide anything other than ERR_OK in 'err', and
	 * it is not clear what we should do if it would. If lwIP ever
	 * changes in this regard, we will likely have to change this code
	 * accordingly.
	 */
	if (err != ERR_OK)
		panic("TCP receive event with error: %d", err);

	/* If the given buffer is NULL, we have received a FIN. */
	if (pbuf == NULL) {
		tcpsock_set_flag(tcp, TCPF_RCVD_FIN);

		/* Userland may now receive EOF. */
		if (!tcpsock_is_shutdown(tcp, SFL_SHUT_RD))
			sockevent_raise(tcpsock_get_sock(tcp), SEV_RECV);

		/*
		 * If we were in the process of closing the socket, and we
		 * receive a FIN before our FIN got acknowledged, we close
		 * the socket anyway, as described in tcpsock_close().
		 * However, if there is still unacknowledged outgoing data or
		 * we did not even manage to send our FIN yet, hold off
		 * closing the socket for now.
		 */
		if (tcpsock_may_close(tcp))
			(void)tcpsock_finish_close(tcp);

		return ERR_OK;
	}

	/*
	 * If the socket is being closed, receiving new data should cause a
	 * reset.
	 */
	if (sockevent_is_closing(tcpsock_get_sock(tcp))) {
		tcpsock_pcb_abort(tcp);

		(void)tcpsock_cleanup(tcp, TRUE /*may_free*/);
		/* Do not touch the socket object anymore! */

		pbuf_free(pbuf);

		return ERR_ABRT;
	}

	/*
	 * If the socket has already been shut down for reading, discard the
	 * incoming data and do nothing else.
	 */
	if (tcpsock_is_shutdown(tcp, SFL_SHUT_RD)) {
		tcp_recved(tcp->tcp_pcb, pbuf->tot_len);

		pbuf_free(pbuf);

		return ERR_OK;
	}

	/*
	 * We deliberately ignore the PBUF_FLAG_PUSH flag. This flag would
	 * enable the receive functionality to delay delivering "un-pushed"
	 * data to applications. The implementation of this scheme could
	 * track the amount of data up to and including the last-pushed
	 * segment using a "tr_push_len" field or so. Deciding when to
	 * deliver "un-pushed" data after all is a bit trickier though. As
	 * far as I can tell, the BSDs do not implement anything like that.
	 * Windows does, and this results in interaction problems with even
	 * more lightweight TCP/IP stacks that do not send the TCP PSH flag.
	 * Currently, there is no obvious benefit for us to support delaying
	 * data delivery like that. In addition, testing its implementation
	 * reliably would be difficult.
	 */

	len = (size_t)pbuf->tot_len;

	/*
	 * Count the number of buffers that are now owned by us. The new
	 * total of buffers owned by us must not exceed the size of the
	 * memory pool. Any more would indicate an accounting error. Note
	 * that tcpsock_recvbufs is currently used for debugging only!
	 */
	tcpsock_recvbufs += pbuf_clen(pbuf);
	assert(tcpsock_recvbufs < mempool_cur_buffers());

	/*
	 * The pre-tail pointer points to whatever is pointing to the tail
	 * buffer. The latter pointer may be the 'tr_head' field in our
	 * tcpsock structure, or the 'next' field in the penultimate buffer,
	 * or NULL if there are currently no buffers on the receive queue.
	 */
	if ((pprevp = tcp->tcp_rcv.tr_pre_tailp) != NULL) {
		ptail = *pprevp;

		assert(ptail != NULL);
		assert(ptail->next == NULL);
		assert(tcp->tcp_rcv.tr_head != NULL);

		ptail->next = pbuf;
		pbuf->tot_len = pbuf->len;	/* to help freeing on merges */

		if (tcpsock_try_merge(pprevp, ptail, pbuf)) {
			ptail = *pprevp;
			pbuf = ptail->next;
		}

		if (pbuf != NULL)
			pprevp = &ptail->next;
	} else {
		assert(tcp->tcp_rcv.tr_head == NULL);
		assert(tcp->tcp_rcv.tr_head_off == 0);

		tcp->tcp_rcv.tr_head = pbuf;

		pprevp = &tcp->tcp_rcv.tr_head;
	}

	/*
	 * Chop up the chain into individual buffers. This is necessary as we
	 * overload 'tot_len' to mean "space available in the buffer", as we
	 * want for buffers allocated by us as part of buffer merges. Also
	 * get a pointer to the pointer to the new penultimate tail buffer.
	 * Due to merging, the chain may already be empty by now, though.
	 */
	if (pbuf != NULL) {
		for (; pbuf->next != NULL; pbuf = pbuf->next) {
			pbuf->tot_len = pbuf->len;

			pprevp = &pbuf->next;
		}
		assert(pbuf->len == pbuf->tot_len);
	}

	assert(*pprevp != NULL);
	assert((*pprevp)->next == NULL);
	tcp->tcp_rcv.tr_pre_tailp = pprevp;

	tcp->tcp_rcv.tr_len += len;
	tcp->tcp_rcv.tr_unacked += len;

	assert(tcp->tcp_rcv.tr_unacked <= TCP_WND);

	/*
	 * Note that tr_len may now exceed the receive buffer size in the
	 * highly exceptional case that the user is adjusting the latter
	 * after the socket had already received data.
	 */

	/* See if we can immediately acknowledge some or all of the data. */
	tcpsock_ack_recv(tcp);

	/* Also wake up any receivers now. */
	sockevent_raise(tcpsock_get_sock(tcp), SEV_RECV);

	return ERR_OK;
}
/*
 * Callback from lwIP. The PCB corresponding to the socket identified by
 * 'arg' has been closed by lwIP, with the reason specified in 'err': either
 * the connection has been aborted locally (ERR_ABRT), it has been reset by
 * the remote end (ERR_RST), or it is closed due to state transitions
 * (ERR_CLSD).
 */
static void
tcpsock_event_err(void * arg, err_t err)
{
	struct tcpsock *tcp = (struct tcpsock *)arg;
	int r;

	assert(tcp != NULL);
	assert(tcp->tcp_pcb != NULL);
	assert(err != ERR_OK);

	/* The original PCB is now gone, or will be shortly. */
	tcp->tcp_pcb = NULL;

	/*
	 * Clean up the socket. As a result it may be freed, in which case we
	 * must not touch it anymore. No need to return ERR_ABRT from here,
	 * as the PCB has been aborted already.
	 */
	if (tcpsock_cleanup(tcp, TRUE /*may_free*/))
		return;

	if (err == ERR_CLSD) {
		/*
		 * We may get here if the socket is shut down for writing and
		 * we already received a FIN from the remote side, thus
		 * putting the socket in LAST_ACK state, and we receive that
		 * last acknowledgment. There is nothing more we need to do.
		 *
		 * We will never get here in the other case that ERR_CLSD is
		 * raised, which is when the socket is reset because of
		 * unacknowledged data while closing: we handle the
		 * reset-on-ACK case ourselves in tcpsock_close(), and the
		 * socket is in closing state after that.
		 */
		assert(tcpsock_is_shutdown(tcp, SFL_SHUT_WR));
		assert(tcpsock_get_flags(tcp) & TCPF_RCVD_FIN);
	} else {
		/*
		 * Anything else should be an error directly from lwIP;
		 * currently either ERR_ABRT or ERR_RST. Convert it to a
		 * regular error and set it on the socket. Doing so will also
		 * raise the appropriate events.
		 *
		 * Unfortunately, lwIP is not throwing accurate errors even
		 * when it can. We convert some errors to reflect more
		 * accurately the most likely cause.
		 *
		 * TODO: fix lwIP in this regard..
		 */
		r = util_convert_err(err);

		if (tcpsock_get_flags(tcp) & TCPF_CONNECTING) {
			switch (err) {
			case ERR_ABRT:	r = ETIMEDOUT;		break;
			case ERR_RST:	r = ECONNREFUSED;	break;
			}
		}

		sockevent_set_error(tcpsock_get_sock(tcp), r);
	}
}
/*
 * Callback from lwIP. Perform regular checks on a TCP socket. This function
 * is called once per five seconds on connected sockets, and twice per
 * second on sockets that are being closed.
 */
static err_t
tcpsock_event_poll(void * arg, struct tcp_pcb * pcb __unused)
{
	struct tcpsock *tcp = (struct tcpsock *)arg;
	err_t err;
	int r;

	assert(tcp != NULL);
	assert(pcb == tcp->tcp_pcb);

	/*
	 * If we ended up running out of buffers earlier, try resuming any
	 * send requests now, both for enqueuing TCP data with lwIP and for
	 * user requests.
	 */
	if (tcpsock_get_flags(tcp) & (TCPF_FULL | TCPF_OOM)) {
		tcpsock_clear_flag(tcp, TCPF_FULL);
		tcpsock_clear_flag(tcp, TCPF_OOM);

		/* See if we can enqueue more data with lwIP. */
		if (tcpsock_pcb_enqueue(tcp)) {
			/* In some cases, we can now close the PCB. */
			if (tcpsock_may_close(tcp)) {
				(void)tcpsock_finish_close(tcp);
				/*
				 * The PCB is definitely gone here, and the
				 * entire socket object may be gone now too.
				 * Do not touch either anymore!
				 */
				return ERR_ABRT;
			}

			/*
			 * If actually sending the data fails, the PCB will
			 * be gone, and the socket object may be gone as
			 * well. Do not touch either anymore in that case!
			 */
			if (tcpsock_pcb_send(tcp, TRUE /*raise_error*/) != OK)
				return ERR_ABRT;
		}

		/*
		 * If we ran out of buffers earlier, it may be possible to
		 * take in more data from a user process now, even if we did
		 * not manage to enqueue any more pending data with lwIP.
		 */
		sockevent_raise(tcpsock_get_sock(tcp), SEV_SEND);

		assert(tcp->tcp_pcb != NULL);
	} else if (tcp->tcp_snd.ts_unsent != NULL &&
	    tcp->tcp_snd.ts_unsent_off < tcp->tcp_snd.ts_unsent->len) {
		/*
		 * If the send buffer is full, we will no longer call
		 * tcp_output(), which means we may also miss out on fatal
		 * errors that would otherwise kill the connection (e.g., no
		 * route). As a result, the connection may erroneously
		 * continue to exist for a long time. To avoid this, we call
		 * tcp_output() every once in a while when there are still
		 * unsent data.
		 */
		err = tcp_output(tcp->tcp_pcb);

		if (err != ERR_OK && err != ERR_MEM) {
			tcpsock_pcb_abort(tcp);

			if (!tcpsock_cleanup(tcp, TRUE /*may_free*/)) {
				r = util_convert_err(err);

				sockevent_set_error(tcpsock_get_sock(tcp), r);
			}
			/* Otherwise do not touch the socket object anymore! */

			return ERR_ABRT;
		}
	}

	/*
	 * If we are closing the socket, and we sent a FIN, see if the FIN
	 * got acknowledged. If so, finish closing the socket.
	 * Unfortunately, we can perform this check by polling only.
	 * TODO: change lwIP..
	 */
	if (sockevent_is_closing(tcpsock_get_sock(tcp)) &&
	    (tcpsock_get_flags(tcp) & TCPF_SENT_FIN) &&
	    tcp->tcp_pcb->unsent == NULL && tcp->tcp_pcb->unacked == NULL) {
		assert(tcp->tcp_snd.ts_len == 0);

		tcpsock_finish_close(tcp);
	}

	return ERR_OK;
}
/*
 * Bind a TCP socket to a local address.
 */
static int
tcpsock_bind(struct sock * sock, const struct sockaddr * addr,
	socklen_t addr_len, endpoint_t user_endpt)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	ip_addr_t ipaddr;
	uint16_t port;
	err_t err;
	int r;

	if (tcp->tcp_pcb == NULL || tcp->tcp_pcb->state != CLOSED)
		return EINVAL;

	if ((r = ipsock_get_src_addr(tcpsock_get_ipsock(tcp), addr, addr_len,
	    user_endpt, &tcp->tcp_pcb->local_ip, tcp->tcp_pcb->local_port,
	    FALSE /*allow_mcast*/, &ipaddr, &port)) != OK)
		return r;

	err = tcp_bind(tcp->tcp_pcb, &ipaddr, port);

	return util_convert_err(err);
}
/*
 * Callback from lwIP. A new connection 'pcb' has arrived on the listening
 * socket identified by 'arg'. Note that 'pcb' may be NULL in the case that
 * lwIP could not accept the connection itself.
 */
static err_t
tcpsock_event_accept(void * arg, struct tcp_pcb * pcb, err_t err)
{
	struct tcpsock *tcp = (struct tcpsock *)arg;

	assert(tcp != NULL);
	assert(tcpsock_is_listening(tcp));

	/*
	 * If the given PCB is NULL, then lwIP ran out of memory allocating a
	 * PCB for the new connection. There is nothing we can do with that
	 * information. Also check 'err' just to make sure.
	 */
	if (pcb == NULL || err != OK)
		return ERR_OK;

	/*
	 * The TCP socket is the listening socket, but the PCB is for the
	 * incoming connection.
	 */
	if (tcpsock_clone(tcp, pcb) != OK) {
		/*
		 * We could not allocate the resources necessary to accept
		 * the connection. Abort it immediately.
		 */
		tcp_abort(pcb);

		return ERR_ABRT;
	}

	/*
	 * The connection has not yet been accepted, and thus should still be
	 * considered on the listen queue.
	 */
	tcp_backlog_delayed(pcb);

	/* Set the callback functions. */
	tcp_recv(pcb, tcpsock_event_recv);
	tcp_sent(pcb, tcpsock_event_sent);
	tcp_err(pcb, tcpsock_event_err);
	tcp_poll(pcb, tcpsock_event_poll, TCP_POLL_REG_INTERVAL);

	sockevent_raise(tcpsock_get_sock(tcp), SEV_ACCEPT);

	return ERR_OK;
}
/*
 * Put a TCP socket in listening mode.
 */
static int
tcpsock_listen(struct sock * sock, int backlog)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	struct tcp_pcb *pcb;
	err_t err;

	/* The maximum backlog value must not exceed its field size. */
	assert(SOMAXCONN <= UINT8_MAX);

	/*
	 * Allow only CLOSED sockets to enter listening mode. If the socket
	 * was already in listening mode, allow its backlog value to be
	 * updated, even if it was shut down already (making this a no-op).
	 */
	if (!tcpsock_is_listening(tcp) &&
	    (tcp->tcp_pcb == NULL || tcp->tcp_pcb->state != CLOSED))
		return EINVAL;

	/*
	 * If the socket was not already in listening mode, put it in that
	 * mode now. That involves switching PCBs as lwIP attempts to save
	 * memory by replacing the original PCB with a smaller one. If the
	 * socket was already in listening mode, simply update its backlog
	 * value--this has no effect on the sockets already in the backlog.
	 */
	if (!tcpsock_is_listening(tcp)) {
		assert(tcp->tcp_pcb != NULL);

		/*
		 * If the socket has not been bound to a port yet, do that
		 * first. This does mean that the listen call may fail with
		 * side effects, but that is acceptable in this case.
		 */
		if (tcp->tcp_pcb->local_port == 0) {
			err = tcp_bind(tcp->tcp_pcb, &tcp->tcp_pcb->local_ip,
			    0 /*port*/);

			if (err != ERR_OK)
				return util_convert_err(err);
		}

		/*
		 * Clear the argument on the PCB that is about to be
		 * replaced, because if we do not, once the PCB is reused
		 * (which does not clear the argument), we might get weird
		 * events. Do this before the tcp_listen() call, because we
		 * should no longer access the old PCB afterwards (even if we
		 * can).
		 */
		tcp_arg(tcp->tcp_pcb, NULL);

		pcb = tcp_listen_with_backlog_and_err(tcp->tcp_pcb, backlog,
		    &err);

		if (pcb == NULL) {
			tcp_arg(tcp->tcp_pcb, tcp);	/* oops, undo. */

			return util_convert_err(err);
		}

		tcp_arg(pcb, tcp);
		tcp->tcp_pcb = pcb;

		tcp_accept(pcb, tcpsock_event_accept);

		/* Initialize the queue head for sockets pending acceptance. */
		TAILQ_INIT(&tcp->tcp_queue.tq_head);
	} else if (tcp->tcp_pcb != NULL)
		tcp_backlog_set(tcp->tcp_pcb, backlog);

	return OK;
}
/*
 * Callback from lwIP. A socket connection attempt has succeeded. Note that
 * failed socket events will trigger the tcpsock_event_err() callback
 * instead.
 */
static err_t
tcpsock_event_connected(void * arg, struct tcp_pcb * pcb __unused, err_t err)
{
	struct tcpsock *tcp = (struct tcpsock *)arg;

	assert(tcp != NULL);
	assert(pcb == tcp->tcp_pcb);
	assert(tcpsock_get_flags(tcp) & TCPF_CONNECTING);

	/*
	 * If lwIP ever changes so that this callback is called for connect
	 * failures as well, then we need to change the code here accordingly.
	 */
	if (err != ERR_OK)
		panic("TCP connected event with error: %d", err);

	tcpsock_clear_flag(tcp, TCPF_CONNECTING);

	sockevent_raise(tcpsock_get_sock(tcp), SEV_CONNECT | SEV_SEND);

	return ERR_OK;
}
/*
 * Connect a TCP socket to a remote address.
 */
static int
tcpsock_connect(struct sock * sock, const struct sockaddr * addr,
	socklen_t addr_len, endpoint_t user_endpt)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	ip_addr_t dst_addr;
	uint16_t dst_port;
	err_t err;
	int r;

	/*
	 * Listening sockets may not have a PCB, so we use higher-level flags
	 * to throw the correct error code for those instead.
	 */
	if (tcpsock_is_listening(tcp))
		return EOPNOTSUPP;

	/*
	 * If there is no longer any PCB, we obviously cannot perform the
	 * connection, but POSIX is not clear on which error to return; we
	 * follow NetBSD here.
	 */
	if (tcp->tcp_pcb == NULL)
		return EINVAL;

	/*
	 * The only state from which a connection can be initiated, is
	 * CLOSED. Some of the other states require distinct error codes,
	 * though.
	 */
	switch (tcp->tcp_pcb->state) {
	case CLOSED:
		break;
	case SYN_SENT:
	case SYN_RCVD:
		return EALREADY;
	case LISTEN:
		assert(0);	/* we just checked.. */
	default:
		return EISCONN;
	}

	/*
	 * Get the destination address, and attempt to start connecting. If
	 * the socket was not bound before, or it was bound to a port only,
	 * then lwIP will select a source address for us. We cannot do this
	 * ourselves even if we wanted to: it is impossible to re-bind a TCP
	 * PCB in the case it was previously bound to a port only.
	 */
	if ((r = ipsock_get_dst_addr(tcpsock_get_ipsock(tcp), addr, addr_len,
	    &tcp->tcp_pcb->local_ip, &dst_addr, &dst_port)) != OK)
		return r;

	err = tcp_connect(tcp->tcp_pcb, &dst_addr, dst_port,
	    tcpsock_event_connected);

	/*
	 * Note that various tcp_connect() error cases will leave the PCB
	 * with a newly set local and remote IP address anyway. We should be
	 * careful not to rely on the addresses being as they were before.
	 */
	if (err != ERR_OK)
		return util_convert_err(err);

	/* Set the other callback functions. */
	tcp_recv(tcp->tcp_pcb, tcpsock_event_recv);
	tcp_sent(tcp->tcp_pcb, tcpsock_event_sent);
	tcp_err(tcp->tcp_pcb, tcpsock_event_err);
	tcp_poll(tcp->tcp_pcb, tcpsock_event_poll, TCP_POLL_REG_INTERVAL);

	/*
	 * Set a flag so that we can correct lwIP's error codes in case the
	 * connection attempt fails.
	 */
	tcpsock_set_flag(tcp, TCPF_CONNECTING);

	return SUSPEND;
}
/*
 * Test whether any new connections are pending on a listening TCP socket.
 */
static int
tcpsock_test_accept(struct sock * sock)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;

	/* Is this socket in listening mode at all? */
	if (!tcpsock_is_listening(tcp))
		return EINVAL;

	/* Are there any connections to accept right now? */
	if (!TAILQ_EMPTY(&tcp->tcp_queue.tq_head))
		return OK;

	/* If the socket has been shut down, we return ECONNABORTED. */
	if (tcp->tcp_pcb == NULL)
		return ECONNABORTED;

	/* Otherwise, wait for a new connection first. */
	return SUSPEND;
}
/*
 * Accept a connection on a listening TCP socket, creating a new TCP socket.
 */
static sockid_t
tcpsock_accept(struct sock * sock, struct sockaddr * addr,
	socklen_t * addr_len, endpoint_t user_endpt __unused,
	struct sock ** newsockp)
{
	struct tcpsock *listener = (struct tcpsock *)sock;
	struct tcpsock *tcp;
	int r;

	if ((r = tcpsock_test_accept(sock)) != OK)
		return r;
	/* Below, we must not assume that the listener has a PCB. */

	tcp = TAILQ_FIRST(&listener->tcp_queue.tq_head);
	assert(tcp->tcp_listener == listener);
	assert(tcp->tcp_pcb != NULL);

	TAILQ_REMOVE(&listener->tcp_queue.tq_head, tcp, tcp_queue.tq_next);
	tcp->tcp_listener = NULL;

	tcp_backlog_accepted(tcp->tcp_pcb);

	ipsock_put_addr(tcpsock_get_ipsock(tcp), addr, addr_len,
	    &tcp->tcp_pcb->remote_ip, tcp->tcp_pcb->remote_port);

	/*
	 * Set 'newsockp' to NULL so that libsockevent knows we already
	 * cloned the socket, and it must not be reinitialized anymore.
	 */
	*newsockp = NULL;

	return tcpsock_get_id(tcp);
}
/*
 * Perform preliminary checks on a send request.
 */
static int
tcpsock_pre_send(struct sock * sock, size_t len __unused,
	socklen_t ctl_len __unused, const struct sockaddr * addr __unused,
	socklen_t addr_len __unused, endpoint_t user_endpt __unused,
	int flags)
{

	/*
	 * Reject calls with unknown flags. Since libsockevent strips out the
	 * flags it handles itself here, we only have to test for ones we
	 * cannot handle. Currently, there are no send flags that we support.
	 */
	if (flags != 0)
		return EOPNOTSUPP;

	return OK;
}
/*
 * Test whether the given number of data bytes can be sent on a TCP socket.
 */
static int
tcpsock_test_send(struct sock * sock, size_t min)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	size_t sndbuf;

	if (tcp->tcp_pcb == NULL)
		return EPIPE;

	switch (tcp->tcp_pcb->state) {
	case CLOSED:		/* new */
	case LISTEN:		/* listening */
		return ENOTCONN;
	case SYN_SENT:		/* connecting */
	case SYN_RCVD:		/* simultaneous open, maybe someday? */
		return SUSPEND;
	case ESTABLISHED:	/* connected */
	case CLOSE_WAIT:	/* closed remotely */
		break;
	default:		/* shut down locally */
		assert(tcpsock_is_shutdown(tcp, SFL_SHUT_WR));
		return EPIPE;
	}

	sndbuf = tcpsock_get_sndbuf(tcp);
	if (min > sndbuf)
		min = sndbuf;

	if (tcp->tcp_snd.ts_len + min > sndbuf)
		return SUSPEND;

	return OK;
}
/*
 * Send data on a TCP socket.
 */
static int
tcpsock_send(struct sock * sock, const struct sockdriver_data * data,
	size_t len, size_t * offp, const struct sockdriver_data * ctl __unused,
	socklen_t ctl_len __unused, socklen_t * ctl_off __unused,
	const struct sockaddr * addr __unused, socklen_t addr_len __unused,
	endpoint_t user_endpt __unused, int flags __unused, size_t min)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	struct pbuf *ptail, *pfirst, *pnext, *plast;
	size_t off, tail_off, chunk, left, sndbuf;
	int r;

	if ((r = tcpsock_test_send(sock, min)) != OK)
		return r;

	if (len == 0)
		return OK;	/* nothing to do */

	sndbuf = tcpsock_get_sndbuf(tcp);
	if (min > sndbuf)
		min = sndbuf;
	assert(min > 0);

	assert(sndbuf > tcp->tcp_snd.ts_len);
	left = sndbuf - tcp->tcp_snd.ts_len;
	if (left > len)
		left = len;

	/*
	 * First see if we can fit any more data in the current tail buffer.
	 * If so, we set 'ptail' to point to it and 'tail_off' to the
	 * previous length of the tail buffer, while optimistically extending
	 * it to include the new data. If not, we set them to NULL/0.
	 */
	if ((ptail = tcp->tcp_snd.ts_tail) != NULL &&
	    ptail->len < ptail->tot_len) {
		assert(ptail->len > 0);
		tail_off = (size_t)ptail->len;

		/*
		 * Optimistically extend the head buffer to include whatever
		 * fits in it. This is needed for util_copy_data().
		 */
		assert(ptail->tot_len > ptail->len);
		off = (size_t)ptail->tot_len - (size_t)ptail->len;
		if (off > left)
			off = left;
		ptail->len += off;
	} else {
		ptail = NULL;
		tail_off = 0;
		off = 0;
	}

	/*
	 * Then, if there is more to send, allocate new buffers as needed. If
	 * we run out of memory, work with whatever we did manage to grab.
	 */
	pfirst = NULL;
	plast = NULL;
	while (off < left) {
		if (tcpsock_sendbufs >= TCP_MAX_SENDBUFS ||
		    (pnext = tcpsock_alloc_buf()) == NULL) {
			/*
			 * Chances are that we will end up suspending this
			 * send request because of being out of buffers. We
			 * try to resume such requests from the polling
			 * function.
			 */
			tcpsock_set_flag(tcp, TCPF_OOM);

			break;
		}

		tcpsock_sendbufs++;

		if (pfirst == NULL)
			pfirst = pnext;
		else
			plast->next = pnext;
		plast = pnext;

		chunk = (size_t)pnext->tot_len;
		if (chunk > left - off)
			chunk = left - off;
		pnext->len = chunk;
		off += chunk;
	}

	/*
	 * Copy in the data and continue, unless we did not manage to find
	 * enough space to even meet the low send watermark, in which case we
	 * undo any allocation and suspend the call until later.
	 */
	if (off >= min) {
		/*
		 * Optimistically attach the new buffers to the tail, also
		 * for util_copy_data(). We undo all this if the copy fails.
		 */
		if (ptail != NULL) {
			ptail->next = pfirst;

			pnext = ptail;
		} else
			pnext = pfirst;

		assert(pnext != NULL);

		r = util_copy_data(data, off, *offp, pnext, tail_off,
		    TRUE /*copy_in*/);
	} else
		r = SUSPEND;

	if (r != OK) {
		/* Undo the modifications made so far. */
		while (pfirst != NULL) {
			pnext = pfirst->next;

			assert(tcpsock_sendbufs > 0);
			tcpsock_sendbufs--;

			tcpsock_free_buf(pfirst);

			pfirst = pnext;
		}

		if (ptail != NULL) {
			ptail->next = NULL;

			ptail->len = tail_off;
		}

		return r;
	}

	/* Attach the new buffers, if any, to the buffer tail. */
	if (pfirst != NULL) {
		if ((ptail = tcp->tcp_snd.ts_tail) != NULL) {
			assert(ptail->len == ptail->tot_len);

			/*
			 * Due to our earlier optimistic modifications, this
			 * may or may not be redundant.
			 */
			ptail->next = pfirst;
		}

		assert(plast != NULL);
		tcp->tcp_snd.ts_tail = plast;

		if (tcp->tcp_snd.ts_head == NULL) {
			tcp->tcp_snd.ts_head = pfirst;
			assert(tcp->tcp_snd.ts_head_off == 0);
		}
		if (tcp->tcp_snd.ts_unsent == NULL) {
			tcp->tcp_snd.ts_unsent = pfirst;
			assert(tcp->tcp_snd.ts_unsent_off == 0);
		}
	}

	tcp->tcp_snd.ts_len += off;

	/*
	 * See if we can send any of the data we just enqueued. The socket is
	 * still open as we are still processing a call from userland on it;
	 * this saves us from having to deal with the cases that the
	 * following calls end up freeing the socket object.
	 */
	if (tcpsock_pcb_enqueue(tcp) &&
	    (r = tcpsock_pcb_send(tcp, FALSE /*raise_error*/)) != OK) {
		/*
		 * That did not go well. Return the error immediately if we
		 * had not made any progress earlier. Otherwise, return our
		 * partial progress and leave the error to be picked up
		 * later.
		 */
		if (*offp > 0) {
			sockevent_set_error(tcpsock_get_sock(tcp), r);

			return OK;
		} else
			return r;
	}

	*offp += off;

	return (off < len) ? SUSPEND : OK;
}
/*
 * Perform preliminary checks on a receive request.
 */
static int
tcpsock_pre_recv(struct sock * sock __unused, endpoint_t user_endpt __unused,
	int flags)
{

	/*
	 * Reject calls with unknown flags. Since libsockevent strips out the
	 * flags it handles itself here, we only have to test for ones we
	 * cannot handle.
	 */
	if ((flags & ~(MSG_PEEK | MSG_WAITALL)) != 0)
		return EOPNOTSUPP;

	return OK;
}

/*
 * Return TRUE if receive calls may wait for more data to come in on the
 * connection, or FALSE if we already know that that is not going to happen.
 */
static int
tcpsock_may_wait(struct tcpsock * tcp)
{

	return (tcp->tcp_pcb != NULL &&
	    !(tcpsock_get_flags(tcp) & TCPF_RCVD_FIN));
}
/*
 * Test whether data can be received on a TCP socket, and if so, how many
 * bytes at most.
 */
static int
tcpsock_test_recv(struct sock * sock, size_t min, size_t * size)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	int may_wait;

	/*
	 * If there is currently no connection and there never was one,
	 * refuse the call altogether.
	 */
	if (tcp->tcp_pcb != NULL && (tcp->tcp_pcb->state == CLOSED ||
	    tcp->tcp_pcb->state == LISTEN))
		return ENOTCONN;

	/*
	 * If we are certain that no more data will come in later, ignore the
	 * low receive watermark. Otherwise, bound it to the size of the
	 * receive buffer, or receive calls may block forever.
	 */
	if (!(may_wait = tcpsock_may_wait(tcp)))
		min = 1;
	else if (min > tcpsock_get_rcvbuf(tcp))
		min = tcpsock_get_rcvbuf(tcp);

	if (tcp->tcp_rcv.tr_len >= min) {
		if (size != NULL)
			*size = tcp->tcp_rcv.tr_len;

		return OK;
	}

	return (may_wait) ? SUSPEND : SOCKEVENT_EOF;
}
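/*
 * For example, if the caller's low receive watermark ('min', presumably
 * deriving from SO_RCVLOWAT) were larger than the receive buffer itself,
 * clamping it to tcpsock_get_rcvbuf() above is what keeps a receive call
 * from suspending forever on a connection that is still open.
 */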
/*
 * Receive data on a TCP socket.
 */
static int
tcpsock_recv(struct sock * sock, const struct sockdriver_data * data,
	size_t len, size_t * offp, const struct sockdriver_data * ctl __unused,
	socklen_t ctl_len __unused, socklen_t * ctl_off __unused,
	struct sockaddr * addr __unused, socklen_t * addr_len __unused,
	endpoint_t user_endpt __unused, int flags, size_t min,
	int * rflags __unused)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	struct pbuf *ptail;
	size_t off, left;
	int r;

	/* See if we can receive at all, and if so, how much at most. */
	if ((r = tcpsock_test_recv(sock, min, NULL)) != OK)
		return r;

	if (len == 0)
		return OK;	/* nothing to do */

	off = tcp->tcp_rcv.tr_len;
	if (off > len)
		off = len;

	assert(tcp->tcp_rcv.tr_head != NULL);
	assert(tcp->tcp_rcv.tr_head_off < tcp->tcp_rcv.tr_head->len);

	/* Copy out the data to the caller. */
	if ((r = util_copy_data(data, off, *offp, tcp->tcp_rcv.tr_head,
	    tcp->tcp_rcv.tr_head_off, FALSE /*copy_in*/)) != OK)
		return r;

	/* Unless peeking, remove the data from the receive queue. */
	if (!(flags & MSG_PEEK)) {
		left = off;

		/* Dequeue and free as many entire buffers as possible. */
		while ((ptail = tcp->tcp_rcv.tr_head) != NULL &&
		    left >= (size_t)ptail->len - tcp->tcp_rcv.tr_head_off) {
			left -= (size_t)ptail->len - tcp->tcp_rcv.tr_head_off;

			tcp->tcp_rcv.tr_head = ptail->next;
			tcp->tcp_rcv.tr_head_off = 0;

			if (tcp->tcp_rcv.tr_head == NULL)
				tcp->tcp_rcv.tr_pre_tailp = NULL;
			else if (tcp->tcp_rcv.tr_pre_tailp == &ptail->next)
				tcp->tcp_rcv.tr_pre_tailp =
				    &tcp->tcp_rcv.tr_head;

			assert(tcpsock_recvbufs > 0);
			tcpsock_recvbufs--;

			tcpsock_free_buf(ptail);
		}

		/*
		 * If only part of the (new) head buffer is consumed, adjust
		 * the saved offset into that buffer.
		 */
		if (left > 0) {
			assert(tcp->tcp_rcv.tr_head != NULL);
			assert((size_t)tcp->tcp_rcv.tr_head->len -
			    tcp->tcp_rcv.tr_head_off > left);

			tcp->tcp_rcv.tr_head_off += left;
		}

		tcp->tcp_rcv.tr_len -= off;

		if (tcp->tcp_rcv.tr_head != NULL) {
			assert(tcp->tcp_rcv.tr_pre_tailp != NULL);
			assert(tcp->tcp_rcv.tr_len > 0);
		} else {
			assert(tcp->tcp_rcv.tr_pre_tailp == NULL);
			assert(tcp->tcp_rcv.tr_len == 0);
		}

		/*
		 * The receive buffer has shrunk, so there may now be space to
		 * receive more data.
		 */
		if (tcp->tcp_pcb != NULL)
			tcpsock_ack_recv(tcp);
	} else
		flags &= ~MSG_WAITALL;	/* for the check below */

	/* Advance the current copy position, and see if we are done. */
	*offp += off;
	if ((flags & MSG_WAITALL) && off < len && tcpsock_may_wait(tcp))
		return SUSPEND;
	else
		return OK;
}
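/*
 * Note on the MSG_PEEK path above: peeking does not advance the queue head,
 * so a suspended MSG_WAITALL peek would be resumed only to copy the same
 * leading bytes again.  Clearing MSG_WAITALL for the final check makes a
 * peek return whatever could be copied in a single pass instead.
 */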
/*
 * Update the set of flag-type socket options on a TCP socket.
 */
static void
tcpsock_setsockmask(struct sock * sock, unsigned int mask)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;

	if (tcp->tcp_pcb == NULL)
		return;

	if (mask & SO_REUSEADDR)
		ip_set_option(tcp->tcp_pcb, SOF_REUSEADDR);
	else
		ip_reset_option(tcp->tcp_pcb, SOF_REUSEADDR);

	if (mask & SO_KEEPALIVE)
		ip_set_option(tcp->tcp_pcb, SOF_KEEPALIVE);
	else
		ip_reset_option(tcp->tcp_pcb, SOF_KEEPALIVE);
}
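/*
 * For illustration: a userland setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE,
 * &on, sizeof(on)) call is processed by libsockevent, which tracks the
 * on/off state of these options itself and hands us the resulting set of
 * enabled options as 'mask'; all we do here is mirror that set into the
 * lwIP PCB option flags.
 */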
/*
 * Prepare a helper structure for IP-level option processing.
 */
static void
tcpsock_get_ipopts(struct tcpsock * tcp, struct ipopts * ipopts)
{

	ipopts->local_ip = &tcp->tcp_pcb->local_ip;
	ipopts->remote_ip = &tcp->tcp_pcb->remote_ip;
	ipopts->tos = &tcp->tcp_pcb->tos;
	ipopts->ttl = &tcp->tcp_pcb->ttl;
	ipopts->sndmin = TCP_SNDBUF_MIN;
	ipopts->sndmax = TCP_SNDBUF_MAX;
	ipopts->rcvmin = TCP_RCVBUF_MIN;
	ipopts->rcvmax = TCP_RCVBUF_MAX;
}
/*
 * Set socket options on a TCP socket.
 */
static int
tcpsock_setsockopt(struct sock * sock, int level, int name,
	const struct sockdriver_data * data, socklen_t len)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	struct ipopts ipopts;
	uint32_t uval;
	int r, val;

	if (tcp->tcp_pcb == NULL)
		return ECONNRESET;

	/* Handle TCP-level options. */
	switch (level) {
	case IPPROTO_IPV6:
		switch (name) {
		case IPV6_RECVTCLASS:
			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
			    len)) != OK)
				return r;

			/*
			 * This option is not supported for TCP sockets; it
			 * would not even make sense.  However, named(8)
			 * insists on trying to set it anyway.  We accept the
			 * request but ignore the value, not even returning
			 * what was set through getsockopt(2).
			 */
			return OK;

		case IPV6_FAITH:
			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
			    len)) != OK)
				return r;

			/*
			 * This option is not supported at all, but to save
			 * ourselves from having to remember the current state
			 * for getsockopt(2), we also refuse to enable it.
			 */
			if (val != 0)
				return EINVAL;

			return OK;
		}

		break;

	case IPPROTO_TCP:
		switch (name) {
		case TCP_NODELAY:
			/*
			 * lwIP's listening TCP PCBs do not have this field.
			 * If this ever becomes an issue, we can create our own
			 * shadow flag and do the inheritance ourselves.
			 */
			if (tcp->tcp_pcb->state == LISTEN)
				return EINVAL;

			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
			    len)) != OK)
				return r;

			if (val)
				tcp_nagle_disable(tcp->tcp_pcb);
			else
				tcp_nagle_enable(tcp->tcp_pcb);

			return OK;

		case TCP_KEEPIDLE:
		case TCP_KEEPINTVL:
			/*
			 * lwIP's listening TCP PCBs do not have these fields.
			 */
			if (tcp->tcp_pcb->state == LISTEN)
				return EINVAL;

			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
			    len)) != OK)
				return r;

			/*
			 * The given value is unsigned, but lwIP stores the
			 * value in milliseconds in a uint32_t field, so we
			 * have to limit large values to whatever fits in the
			 * field anyway.
			 */
			if (val < 0 || (uint32_t)val > UINT32_MAX / 1000)
				uval = UINT32_MAX;
			else
				uval = (uint32_t)val * 1000;

			if (name == TCP_KEEPIDLE)
				tcp->tcp_pcb->keep_idle = uval;
			else
				tcp->tcp_pcb->keep_intvl = uval;

			return OK;

		case TCP_KEEPCNT:
			/* lwIP's listening TCP PCBs do not have this field. */
			if (tcp->tcp_pcb->state == LISTEN)
				return EINVAL;

			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
			    len)) != OK)
				return r;

			if (val == 0)
				return EINVAL;

			tcp->tcp_pcb->keep_cnt = (uint32_t)val;

			return OK;
		}

		break;
	}

	/* Handle all other options at the IP level. */
	tcpsock_get_ipopts(tcp, &ipopts);

	return ipsock_setsockopt(tcpsock_get_ipsock(tcp), level, name, data,
	    len, &ipopts);
}
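/*
 * A worked example of the keepalive conversion above: setting TCP_KEEPIDLE
 * to 7200 (seconds) stores keep_idle = 7200 * 1000 = 7200000 milliseconds
 * in the PCB, while any value above UINT32_MAX / 1000 saturates to
 * UINT32_MAX milliseconds rather than overflowing the 32-bit field.
 */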
/*
 * Retrieve socket options on a TCP socket.
 */
static int
tcpsock_getsockopt(struct sock * sock, int level, int name,
	const struct sockdriver_data * data, socklen_t * len)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	struct ipopts ipopts;
	int val;

	if (tcp->tcp_pcb == NULL)
		return ECONNRESET;

	/* Handle TCP-level options. */
	switch (level) {
	case IPPROTO_IPV6:
		switch (name) {
		case IPV6_RECVTCLASS:
		case IPV6_FAITH:
			val = 0;

			return sockdriver_copyout_opt(data, &val, sizeof(val),
			    len);
		}

		break;

	case IPPROTO_TCP:
		switch (name) {
		case TCP_NODELAY:
			/* lwIP's listening TCP PCBs do not have this field. */
			if (tcp->tcp_pcb->state == LISTEN)
				return EINVAL;

			val = tcp_nagle_disabled(tcp->tcp_pcb);

			return sockdriver_copyout_opt(data, &val, sizeof(val),
			    len);

		case TCP_MAXSEG:
			/* lwIP's listening TCP PCBs do not have this field. */
			if (tcp->tcp_pcb->state == LISTEN)
				return EINVAL;

			/* This option is read-only at this time. */
			val = tcp->tcp_pcb->mss;

			return sockdriver_copyout_opt(data, &val, sizeof(val),
			    len);

		case TCP_KEEPIDLE:
			/* lwIP's listening TCP PCBs do not have this field. */
			if (tcp->tcp_pcb->state == LISTEN)
				return EINVAL;

			val = (int)(tcp->tcp_pcb->keep_idle / 1000);

			return sockdriver_copyout_opt(data, &val, sizeof(val),
			    len);

		case TCP_KEEPINTVL:
			/* lwIP's listening TCP PCBs do not have this field. */
			if (tcp->tcp_pcb->state == LISTEN)
				return EINVAL;

			val = (int)(tcp->tcp_pcb->keep_intvl / 1000);

			return sockdriver_copyout_opt(data, &val, sizeof(val),
			    len);

		case TCP_KEEPCNT:
			/* lwIP's listening TCP PCBs do not have this field. */
			if (tcp->tcp_pcb->state == LISTEN)
				return EINVAL;

			val = (int)tcp->tcp_pcb->keep_cnt;

			return sockdriver_copyout_opt(data, &val, sizeof(val),
			    len);
		}

		break;
	}

	/* Handle all other options at the IP level. */
	tcpsock_get_ipopts(tcp, &ipopts);

	return ipsock_getsockopt(tcpsock_get_ipsock(tcp), level, name, data,
	    len, &ipopts);
}
/*
 * Retrieve the local socket address of a TCP socket.
 */
static int
tcpsock_getsockname(struct sock * sock, struct sockaddr * addr,
	socklen_t * addr_len)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;

	if (tcp->tcp_pcb == NULL)
		return EINVAL;

	ipsock_put_addr(tcpsock_get_ipsock(tcp), addr, addr_len,
	    &tcp->tcp_pcb->local_ip, tcp->tcp_pcb->local_port);

	return OK;
}
/*
 * Retrieve the remote socket address of a TCP socket.
 */
static int
tcpsock_getpeername(struct sock * sock, struct sockaddr * addr,
	socklen_t * addr_len)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;

	if (tcp->tcp_pcb == NULL || tcp->tcp_pcb->state == CLOSED ||
	    tcp->tcp_pcb->state == LISTEN || tcp->tcp_pcb->state == SYN_SENT)
		return ENOTCONN;

	ipsock_put_addr(tcpsock_get_ipsock(tcp), addr, addr_len,
	    &tcp->tcp_pcb->remote_ip, tcp->tcp_pcb->remote_port);

	return OK;
}
/*
 * Perform a TCP half-close on a TCP socket.  This operation may not complete
 * immediately due to memory conditions, in which case it will be completed
 * at a later time.
 */
static void
tcpsock_send_fin(struct tcpsock * tcp)
{

	sockevent_set_shutdown(tcpsock_get_sock(tcp), SFL_SHUT_WR);

	/*
	 * Attempt to send the FIN.  If a fatal error occurs as a result,
	 * raise it as an asynchronous error, because this function's callers
	 * cannot do much with it.  That happens to match the way these
	 * functions are used elsewhere.  In any case, as a result, the PCB
	 * may be closed.  However, we are never called from a situation
	 * where the socket is being closed here, so the socket object will
	 * not be freed either.
	 */
	if (tcpsock_pcb_enqueue(tcp)) {
		assert(!sockevent_is_closing(tcpsock_get_sock(tcp)));

		if (tcpsock_may_close(tcp))
			tcpsock_finish_close(tcp);
		else
			(void)tcpsock_pcb_send(tcp, TRUE /*raise_error*/);
	}
}
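/*
 * Usage note: this function backs both shutdown(fd, SHUT_WR) and the
 * graceful side of close(2).  If the FIN cannot be enqueued right away,
 * only the shutdown flag is recorded here; the enqueue attempt is repeated
 * from later events, at which point the half-close completes transparently
 * to userland.
 */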
/*
 * Shut down a TCP socket for reading and/or writing.
 */
static int
tcpsock_shutdown(struct sock * sock, unsigned int mask)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;

	/*
	 * If the PCB is gone, we want to allow shutdowns for reading but not
	 * writing: shutting down for writing affects the PCB, shutting down
	 * for reading does not.  Also, if the PCB is in CLOSED state, we
	 * would not know how to deal with subsequent operations after a
	 * shutdown for writing, so forbid such calls altogether.
	 */
	if ((tcp->tcp_pcb == NULL || tcp->tcp_pcb->state == CLOSED) &&
	    (mask & SFL_SHUT_WR))
		return ENOTCONN;

	/*
	 * Handle listening sockets as a special case.  Shutting down a
	 * listening socket frees its PCB.  Sockets pending on the accept
	 * queue may still be accepted, but after that, accept(2) will start
	 * returning ECONNABORTED.  This feature allows multi-process server
	 * applications to shut down gracefully, supposedly.
	 */
	if (tcpsock_is_listening(tcp)) {
		if (tcp->tcp_pcb != NULL)
			tcpsock_pcb_close(tcp);

		return OK;
	}

	/*
	 * We control shutdown-for-reading locally, and intentionally do not
	 * tell lwIP about it: if we do that and also shut down for writing,
	 * the PCB may disappear (now or eventually), which is not what we
	 * want.  Instead, we only tell lwIP to shut down for reading once we
	 * actually want to get rid of the PCB, using tcp_close().  In the
	 * meantime, if the socket is shut down for reading by the user, we
	 * simply discard received data as fast as we can--one out of a
	 * number of possible design choices there, and (reportedly) the one
	 * used by the BSDs.
	 */
	if (mask & SFL_SHUT_RD)
		(void)tcpsock_clear_recv(tcp, TRUE /*ack_data*/);

	/*
	 * Shutting down for writing a connecting socket simply closes its
	 * PCB.  Closing a PCB in SYN_SENT state simply deallocates it, so
	 * this cannot fail.  On the other hand, for connected sockets we
	 * want to send a FIN, which may fail due to memory shortage, in
	 * which case we have to try again later.
	 */
	if (mask & SFL_SHUT_WR) {
		if (tcp->tcp_pcb->state == SYN_SENT)
			tcpsock_pcb_close(tcp);
		else if (!tcpsock_is_shutdown(tcp, SFL_SHUT_WR))
			tcpsock_send_fin(tcp);
	}

	return OK;
}
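/*
 * To make the cases above concrete (a sketch): shutdown(fd, SHUT_WR) on an
 * ESTABLISHED socket queues a FIN through tcpsock_send_fin() while reads
 * keep working; the same call on a socket still in SYN_SENT simply
 * deallocates its PCB; and on a listening socket, connections already on
 * the accept queue may still be accepted, after which accept(2) fails with
 * ECONNABORTED.
 */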
/*
 * Close a TCP socket.  Complete the operation immediately if possible, or
 * otherwise initiate the closing process and complete it later, notifying
 * libsockevent about that as well.  Depending on linger settings, this
 * function may be called twice on the same socket: the first time with the
 * 'force' flag cleared, and the second time with the 'force' flag set.
 */
static int
tcpsock_close(struct sock * sock, int force)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	struct tcpsock *queued;
	size_t rlen;

	assert(tcp->tcp_listener == NULL);

	/*
	 * If this was a listening socket, abort and clean up any and all
	 * connections on its listener queue.  Note that the listening socket
	 * may or may not have a PCB at this point.
	 */
	if (tcpsock_is_listening(tcp)) {
		while (!TAILQ_EMPTY(&tcp->tcp_queue.tq_head)) {
			queued = TAILQ_FIRST(&tcp->tcp_queue.tq_head);

			tcpsock_pcb_abort(queued);

			(void)tcpsock_cleanup(queued, TRUE /*may_free*/);
		}
	}

	/*
	 * Clear the receive queue, and make sure that we no longer add new
	 * data to it.  The latter is relevant only for the case that we end
	 * up returning SUSPEND below.  Remember whether there were bytes
	 * left, because we should reset the connection if there were.
	 */
	rlen = tcpsock_clear_recv(tcp, FALSE /*ack_data*/);

	sockevent_set_shutdown(tcpsock_get_sock(tcp), SFL_SHUT_RD);

	/*
	 * If the socket is connected, perform a graceful shutdown, unless 1)
	 * we are asked to force-close the socket, or 2) the local side has
	 * not consumed all data, as per RFC 1122 Sec. 4.2.2.13.  Normally
	 * lwIP would take care of the second point, but we may have data in
	 * our receive buffer of which lwIP is not aware.
	 *
	 * Implementing proper linger support is somewhat difficult with
	 * lwIP.  In particular, we cannot reliably wait for our FIN to be
	 * ACK'ed by the other side in all cases:
	 *
	 * - the lwIP TCP transition from states CLOSING to TIME_WAIT does
	 *   not trigger any event and once in the TIME_WAIT state, the poll
	 *   event no longer triggers either;
	 * - the lwIP TCP transition from states FIN_WAIT_1 and FIN_WAIT_2 to
	 *   TIME_WAIT will trigger a receive event, but it is not clear
	 *   whether we can reliably check that our FIN was ACK'ed from
	 *   there.
	 *
	 * That means we have to compromise.  Instead of the proper approach,
	 * we complete our side of the close operation whenever:
	 *
	 *   1. all of our data was acknowledged, AND,
	 *   2. our FIN was sent, AND,
	 *   3a. our FIN was acknowledged, OR,
	 *   3b. we received a FIN from the other side.
	 *
	 * With the addition of rule 3b, we do not run into the above
	 * reliability problems, but we may return from SO_LINGER-blocked
	 * close calls too early and thus give callers a false impression of
	 * success.  TODO: if lwIP ever gets improved on this point, the code
	 * in this module should be rewritten to make use of the
	 * improvements.
	 *
	 * The set of rules is basically the same as for closing the PCB
	 * early as per tcpsock_may_close(), except with the check for our
	 * FIN being acknowledged.  Unfortunately only the FIN_WAIT_2,
	 * TIME_WAIT, and (reentered) CLOSED TCP states guarantee that there
	 * are no unacknowledged data segments anymore, so we may have to
	 * wait for reaching any one of these before we can actually finish
	 * closing the socket with tcp_close().
	 *
	 * In addition, lwIP does not tell us when our FIN gets acknowledged,
	 * so we have to use polling and direct access to lwIP's PCB fields
	 * instead, just like lwIP's BSD API does.  There is no other way.
	 * Also, we may not even be able to send the FIN right away, in which
	 * case we must defer that until later.
	 */
	if (tcp->tcp_pcb != NULL) {
		switch (tcp->tcp_pcb->state) {
		case CLOSE_WAIT:
		case CLOSING:
		case LAST_ACK:
			assert(tcpsock_get_flags(tcp) & TCPF_RCVD_FIN);

			/* FALLTHROUGH */
		case SYN_RCVD:
		case ESTABLISHED:
		case FIN_WAIT_1:
			/* First check if we should abort the connection. */
			if (force || rlen > 0)
				break;

			/*
			 * If we have not sent a FIN yet, try sending it now;
			 * if all other conditions are met for closing the
			 * socket, successful FIN transmission will complete
			 * the close.  Otherwise, perform the close check
			 * here explicitly.
			 */
			if (!tcpsock_is_shutdown(tcp, SFL_SHUT_WR))
				tcpsock_send_fin(tcp);
			else if (tcpsock_may_close(tcp))
				tcpsock_pcb_close(tcp);

			/*
			 * If at this point the PCB is gone, we managed to
			 * close the connection immediately, and the socket
			 * has already been cleaned up by now.  This may
			 * occur if there is no unacknowledged data and we
			 * already received a FIN earlier on.
			 */
			if (tcp->tcp_pcb == NULL)
				return OK;

			/*
			 * Complete the close operation at a later time.
			 * Adjust the polling interval, so that we can detect
			 * completion of the close as quickly as possible.
			 */
			tcp_poll(tcp->tcp_pcb, tcpsock_event_poll,
			    TCP_POLL_CLOSE_INTERVAL);

			return SUSPEND;

		default:
			/*
			 * The connection is either not yet established, or
			 * already in a state where we can close it right
			 * now.
			 */
			tcpsock_pcb_close(tcp);
		}
	}

	/*
	 * Abort the connection if the PCB is still around, and clean up the
	 * socket.  We cannot let tcpsock_cleanup() free the socket object
	 * yet, because we are still in the callback from libsockevent, and
	 * the latter cannot handle the socket object being freed from here.
	 */
	if (tcp->tcp_pcb != NULL)
		tcpsock_pcb_abort(tcp);

	(void)tcpsock_cleanup(tcp, FALSE /*may_free*/);

	return OK;
}
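/*
 * Linger illustration (a sketch of the intended flow): with SO_LINGER set
 * to a nonzero timeout, close(2) first reaches this function with 'force'
 * cleared; if the graceful path returns SUSPEND, libsockevent keeps the
 * close pending until the rules above are satisfied or the linger timer
 * fires, in which case this function is invoked once more with 'force' set
 * and the connection is aborted instead.
 */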
/*
 * Free up a closed TCP socket.
 */
static void
tcpsock_free(struct sock * sock)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;

	assert(tcp->tcp_pcb == NULL);
	assert(tcp->tcp_snd.ts_len == 0);
	assert(tcp->tcp_snd.ts_head == NULL);
	assert(tcp->tcp_rcv.tr_len == 0);
	assert(tcp->tcp_rcv.tr_head == NULL);

	TAILQ_INSERT_HEAD(&tcp_freelist, tcp, tcp_queue.tq_next);
}
/* This table maps TCP states from lwIP numbers to NetBSD numbers. */
static const struct {
	int tsm_tstate;
	int tsm_sostate;
} tcpsock_statemap[] = {
	[CLOSED]	= { TCPS_CLOSED,	SS_ISDISCONNECTED	},
	[LISTEN]	= { TCPS_LISTEN,	0			},
	[SYN_SENT]	= { TCPS_SYN_SENT,	SS_ISCONNECTING		},
	[SYN_RCVD]	= { TCPS_SYN_RECEIVED,	SS_ISCONNECTING		},
	[ESTABLISHED]	= { TCPS_ESTABLISHED,	SS_ISCONNECTED		},
	[FIN_WAIT_1]	= { TCPS_FIN_WAIT_1,	SS_ISDISCONNECTING	},
	[FIN_WAIT_2]	= { TCPS_FIN_WAIT_2,	SS_ISDISCONNECTING	},
	[CLOSE_WAIT]	= { TCPS_CLOSE_WAIT,	SS_ISCONNECTED		},
	[CLOSING]	= { TCPS_CLOSING,	SS_ISDISCONNECTING	},
	[LAST_ACK]	= { TCPS_LAST_ACK,	SS_ISDISCONNECTING	},
	[TIME_WAIT]	= { TCPS_TIME_WAIT,	SS_ISDISCONNECTED	},
};
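/*
 * For example, a PCB in lwIP's ESTABLISHED state is reported through the
 * sysctl(7) interface below as TCPS_ESTABLISHED with the SS_ISCONNECTED
 * socket-state bit set, which is the representation that NetBSD userland
 * tools such as netstat(1) expect.
 */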
/*
 * Fill the given kinfo_pcb sysctl(7) structure with information about the
 * TCP PCB identified by the given pointer.
 */
static void
tcpsock_get_info(struct kinfo_pcb * ki, const void * ptr)
{
	const struct tcp_pcb *pcb = (const struct tcp_pcb *)ptr;
	struct tcpsock *tcp;

	/*
	 * Not all TCP PCBs have an associated tcpsock structure.  We take
	 * care to clear the callback argument of PCBs on any of the TCP
	 * lists, so that we can use the callback argument to determine
	 * whether there is an associated tcpsock structure--although with
	 * one exception: PCBs for incoming connections that have not yet
	 * been fully established (i.e., in SYN_RCVD state).  These will have
	 * the callback argument of the listening socket (which itself may
	 * already have been deallocated at this point) but should not be
	 * considered as associated with the listening socket's tcpsock
	 * structure.
	 */
	if (pcb->callback_arg != NULL && pcb->state != SYN_RCVD) {
		tcp = (struct tcpsock *)pcb->callback_arg;
		assert(tcp >= tcp_array &&
		    tcp < &tcp_array[__arraycount(tcp_array)]);

		/* TODO: change this so that sockstat(1) may work one day. */
		ki->ki_sockaddr = (uint64_t)(uintptr_t)tcpsock_get_sock(tcp);
	} else {
		/* No tcpsock.  Could also be in TIME_WAIT state etc. */
		tcp = NULL;

		ki->ki_sostate = SS_NOFDREF;
	}

	ki->ki_type = SOCK_STREAM;

	if ((unsigned int)pcb->state < __arraycount(tcpsock_statemap)) {
		ki->ki_tstate = tcpsock_statemap[pcb->state].tsm_tstate;
		/* TODO: this needs work, but does anything rely on it? */
		ki->ki_sostate |= tcpsock_statemap[pcb->state].tsm_sostate;
	}

	/* Careful with the LISTEN state here (see below). */
	ipsock_get_info(ki, &pcb->local_ip, pcb->local_port,
	    &pcb->remote_ip, (pcb->state != LISTEN) ? pcb->remote_port : 0);

	/*
	 * The PCBs for listening sockets are actually smaller.  Thus, for
	 * listening sockets, do not attempt to access any of the fields
	 * beyond those provided in the smaller structure.
	 */
	if (pcb->state == LISTEN) {
		assert(tcp != NULL);
		ki->ki_refs =
		    (uint64_t)(uintptr_t)TAILQ_FIRST(&tcp->tcp_queue.tq_head);
	} else {
		if (tcp_nagle_disabled(pcb))
			ki->ki_tflags |= NETBSD_TF_NODELAY;

		if (tcp != NULL) {
			ki->ki_rcvq = tcp->tcp_rcv.tr_len;
			ki->ki_sndq = tcp->tcp_snd.ts_len;

			if (tcp->tcp_listener != NULL)
				ki->ki_nextref = (uint64_t)(uintptr_t)
				    TAILQ_NEXT(tcp, tcp_queue.tq_next);
		}
	}
}
/*
 * Given either NULL or a previously returned TCP PCB pointer, return the
 * first or next TCP PCB pointer, or NULL if there are no more.  The current
 * implementation supports only one concurrent iteration at once.
 */
static const void *
tcpsock_enum(const void * last)
{
	static struct {
		unsigned int i;
		const struct tcp_pcb *pcb;
	} iter;

	if (last != NULL && (iter.pcb = iter.pcb->next) != NULL)
		return (const void *)iter.pcb;

	for (iter.i = (last != NULL) ? iter.i + 1 : 0;
	    iter.i < __arraycount(tcp_pcb_lists); iter.i++) {
		if ((iter.pcb = *tcp_pcb_lists[iter.i]) != NULL)
			return (const void *)iter.pcb;
	}

	return NULL;
}
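/*
 * Typical traversal (a sketch, with hypothetical locals 'ptr' and 'ki', as
 * performed on our behalf by util_pcblist() below):
 *
 *	for (ptr = tcpsock_enum(NULL); ptr != NULL; ptr = tcpsock_enum(ptr))
 *		tcpsock_get_info(&ki, ptr);
 *
 * Since the iterator state lives in the static 'iter' structure, only one
 * such traversal may be in progress at any time.
 */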
/*
 * Obtain the list of TCP protocol control blocks, for sysctl(7).
 */
static ssize_t
tcpsock_pcblist(struct rmib_call * call, struct rmib_node * node __unused,
	struct rmib_oldp * oldp, struct rmib_newp * newp __unused)
{

	return util_pcblist(call, oldp, tcpsock_enum, tcpsock_get_info);
}
static const struct sockevent_ops tcpsock_ops = {
	.sop_bind		= tcpsock_bind,
	.sop_listen		= tcpsock_listen,
	.sop_connect		= tcpsock_connect,
	.sop_accept		= tcpsock_accept,
	.sop_test_accept	= tcpsock_test_accept,
	.sop_pre_send		= tcpsock_pre_send,
	.sop_send		= tcpsock_send,
	.sop_test_send		= tcpsock_test_send,
	.sop_pre_recv		= tcpsock_pre_recv,
	.sop_recv		= tcpsock_recv,
	.sop_test_recv		= tcpsock_test_recv,
	.sop_ioctl		= ifconf_ioctl,
	.sop_setsockmask	= tcpsock_setsockmask,
	.sop_setsockopt		= tcpsock_setsockopt,
	.sop_getsockopt		= tcpsock_getsockopt,
	.sop_getsockname	= tcpsock_getsockname,
	.sop_getpeername	= tcpsock_getpeername,
	.sop_shutdown		= tcpsock_shutdown,
	.sop_close		= tcpsock_close,
	.sop_free		= tcpsock_free
};