/* LWIP service - tcpsock.c - TCP sockets */
/*
 * This module implements support for TCP sockets based on lwIP's core TCP PCB
 * module, which largely, but not fully, accommodates what we want to achieve;
 * as a result, this module is rather complicated.
 *
 * Each socket has a send queue and a receive queue.  Both use lwIP's own
 * (pbuf) buffers, which largely come out of the main 512-byte buffer pool.
 * The buffers on the send queue are allocated and freed by us--the latter only
 * once they are no longer in use by lwIP as well.  A bit counterintuitively,
 * we deliberately use a smaller per-PCB TCP send buffer limit (TCP_SND_BUF)
 * in the lwIP configuration (lwipopts.h) in order to more easily trigger
 * conditions where we cannot enqueue data (or the final FIN) right away.
 * This way, we get to test the internal logic of this module a lot more
 * easily.  The small lwIP send queue size should not have any impact on
 * performance, as our own per-socket send queues can be much larger and we
 * enqueue more data on the lwIP PCB as soon as we can in all cases.
 *
 * The receive queue consists of whatever buffers were given to us by lwIP,
 * but since those may be many buffers with small amounts of data each, we
 * perform fairly aggressive merging of consecutive buffers.  The intended
 * result is that we waste no more than 50% of memory within the receive
 * queue.  Merging requires memory copies, which makes it expensive, but we do
 * not configure lwIP with enough buffers to make running out of buffers a
 * non-issue, so this trade-off is necessary.  Practical experience and
 * measurements of the merge policy will have to show whether and how the
 * current policy may be improved.
 *
 * As can be expected, the connection close semantics are by far the most
 * complicated part of this module.  We attempt to get rid of the lwIP PCB as
 * soon as we can, letting lwIP take care of the TIME_WAIT state for example.
 * However, various conditions have to be met before we can forget about the
 * PCB here--most importantly, that none of our sent data blocks are still
 * referenced by lwIP because they have not yet been sent or acknowledged.  We
 * can only free the data blocks once lwIP is done with them.
 *
 * We do consider the TCP state of lwIP's PCB, in order to avoid duplicating
 * full state tracking here.  However, we do not look at a socket's TCP state
 * while in a lwIP-generated event for that socket, because the state may not
 * necessarily reflect the (correct or new) TCP state of the connection, nor
 * may the PCB be available--this is the case for error events.  For these
 * reasons we use a few internal TCPF_ flags to perform partial state
 * tracking.
 *
 * More generally, we tend to access lwIP PCB fields directly only when lwIP's
 * own BSD API implementation does that too and there is no better
 * alternative.  One example of this is the check to see if our FIN was
 * acknowledged, for SO_LINGER support.  In terms of maintenance, our hope is
 * that if lwIP's API changes later, we can change our code to imitate
 * whatever lwIP's BSD API implementation does at that point.
 */
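
/*
 * For orientation, the overall data flow through this module looks roughly
 * like this (all function names below refer to functions in this file):
 *
 *   send:    tcpsock_send() -> tcp_snd queue -> tcpsock_pcb_enqueue()
 *            -> tcp_write()/tcp_shutdown() -> tcpsock_pcb_send()
 *            -> tcp_output()
 *   receive: tcpsock_event_recv() -> tcp_rcv queue -> userland receive
 *            -> tcpsock_ack_recv() -> tcp_recved()
 */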

#include <sys/socketvar.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <netinet/ip_var.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_fsm.h>

/*
 * Unfortunately, NetBSD and lwIP have different definitions of a few relevant
 * preprocessor variables.  Make sure we do not attempt to use the NetBSD one
 * where it matters.  We do need one of the NetBSD definitions though.
 */
static const unsigned int NETBSD_TF_NODELAY = TF_NODELAY;
#undef TF_NODELAY
#undef TCP_MSS

#include "lwip.h"
#include "tcpisn.h"

#include "lwip/tcp.h"
#include "lwip/priv/tcp_priv.h"	/* for tcp_pcb_lists */

/*
 * The number of TCP sockets (NR_TCPSOCK) is defined in the lwIP configuration.
 */

/*
 * We fully control the send buffer, so we can let its size be set to whatever
 * we want.  The receive buffer is different: if it is smaller than the window
 * size, we may have to refuse data that lwIP hands us, at which point more
 * incoming data will cause lwIP to abort the TCP connection--even aside from
 * performance issues.  Therefore, we must make sure the receive buffer is
 * larger than the TCP window at all times.
 */
#define TCP_SNDBUF_MIN	1		/* minimum TCP send buffer size */
#define TCP_SNDBUF_DEF	32768		/* default TCP send buffer size */
#define TCP_SNDBUF_MAX	131072		/* maximum TCP send buffer size */
#define TCP_RCVBUF_MIN	TCP_WND		/* minimum TCP receive buffer size */
#define TCP_RCVBUF_DEF	MAX(TCP_WND, 32768)   /* default TCP recv buffer size */
#define TCP_RCVBUF_MAX	MAX(TCP_WND, 131072)  /* maximum TCP recv buffer size */
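
/*
 * For example, if TCP_WND is 32768 in the lwIP configuration (the actual
 * value is configuration dependent), the definitions above yield a receive
 * buffer that is at least 32768 bytes, defaults to 32768 bytes, and may be
 * raised to 131072 bytes--thus never dropping below the window size.
 */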

/*
 * The total number of buffers that may be in use for TCP socket send queues.
 * The goal is to allow at least some progress to be made on receiving from
 * TCP sockets and on differently-typed sockets, at least as long as the LWIP
 * service can manage to allocate the memory it wants.  For the case that it
 * does not, we can only reactively kill off TCP sockets and/or free enqueued
 * ethernet packets, neither of which is currently implemented (TODO).
 */
#define TCP_MAX_SENDBUFS	(mempool_max_buffers() * 3 / 4)

/* Polling intervals, in 500-millisecond units. */
#define TCP_POLL_REG_INTERVAL	10	/* interval for reattempting sends */
#define TCP_POLL_CLOSE_INTERVAL	1	/* interval while closing connection */
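
/*
 * Since these intervals are expressed in 500-millisecond units, regular
 * polling takes place once every five seconds, and close polling twice per
 * second (see also the comment above tcpsock_event_poll() below).
 */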

static struct tcpsock {
	struct ipsock tcp_ipsock;		/* IP socket, MUST be first */
	struct tcp_pcb *tcp_pcb;		/* lwIP TCP control block */
	union pxfer_tcp_queue {			/* free/accept queue */
		TAILQ_ENTRY(tcpsock) tq_next;	/* next in queue */
		TAILQ_HEAD(, tcpsock) tq_head;	/* head of queue */
	} tcp_queue;
	struct tcpsock *tcp_listener;		/* listener if on accept q. */
	struct {				/* send queue */
		struct pbuf *ts_head;		/* first pbuf w/unacked data */
		struct pbuf *ts_unsent;		/* first pbuf w/unsent data */
		struct pbuf *ts_tail;		/* most recently added data */
		size_t ts_len;			/* total sent + unsent */
		unsigned short ts_head_off;	/* offset into head pbuf */
		unsigned short ts_unsent_off;	/* offset into unsent pbuf */
	} tcp_snd;
	struct {				/* receive queue */
		struct pbuf *tr_head;		/* first pbuf w/unrecvd data */
		struct pbuf **tr_pre_tailp;	/* ptr-ptr to newest pbuf */
		size_t tr_len;			/* bytes on receive queue */
		unsigned short tr_head_off;	/* offset into head pbuf */
		unsigned short tr_unacked;	/* current window reduction */
	} tcp_rcv;
} tcp_array[NR_TCPSOCK];
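
/*
 * Note that 'tcp_queue' is a union: a socket is linked into the free list or
 * into a listening socket's accept queue through 'tq_next', while a listening
 * socket uses 'tq_head' as the head of its own accept queue.  These roles are
 * mutually exclusive, which is what makes sharing the storage safe (see also
 * tcpsock_clone() below).
 */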

static TAILQ_HEAD(, tcpsock) tcp_freelist;	/* list of free TCP sockets */

static const struct sockevent_ops tcpsock_ops;

static unsigned int tcpsock_sendbufs;		/* # send buffers in use */
static unsigned int tcpsock_recvbufs;		/* # receive buffers in use */

/* A bunch of macros that are just for convenience. */
#define tcpsock_get_id(tcp)	(SOCKID_TCP | (sockid_t)((tcp) - tcp_array))
#define tcpsock_get_ipsock(tcp)	(&(tcp)->tcp_ipsock)
#define tcpsock_get_sock(tcp)	(ipsock_get_sock(tcpsock_get_ipsock(tcp)))
#define tcpsock_get_sndbuf(tcp)	(ipsock_get_sndbuf(tcpsock_get_ipsock(tcp)))
#define tcpsock_get_rcvbuf(tcp)	(ipsock_get_rcvbuf(tcpsock_get_ipsock(tcp)))
#define tcpsock_is_ipv6(tcp)	(ipsock_is_ipv6(tcpsock_get_ipsock(tcp)))
#define tcpsock_is_shutdown(tcp,fl)	\
	(sockevent_is_shutdown(tcpsock_get_sock(tcp), fl))
#define tcpsock_is_listening(tcp)	\
	(sockevent_is_listening(tcpsock_get_sock(tcp)))
#define tcpsock_get_flags(tcp)	(ipsock_get_flags(tcpsock_get_ipsock(tcp)))
#define tcpsock_set_flag(tcp,fl)	\
	(ipsock_set_flag(tcpsock_get_ipsock(tcp), fl))
#define tcpsock_clear_flag(tcp,fl)	\
	(ipsock_clear_flag(tcpsock_get_ipsock(tcp), fl))

static ssize_t tcpsock_pcblist(struct rmib_call *, struct rmib_node *,
	struct rmib_oldp *, struct rmib_newp *);

/* The CTL_NET {PF_INET,PF_INET6} IPPROTO_TCP subtree. */
/* TODO: add many more and make some of them writable.. */
static struct rmib_node net_inet_tcp_table[] = {
/* 2*/	[TCPCTL_SENDSPACE]	= RMIB_INT(RMIB_RO, TCP_SNDBUF_DEF,
				    "sendspace",
				    "Default TCP send buffer size"),
/* 3*/	[TCPCTL_RECVSPACE]	= RMIB_INT(RMIB_RO, TCP_RCVBUF_DEF,
				    "recvspace",
				    "Default TCP receive buffer size"),
/*29*/	[TCPCTL_LOOPBACKCKSUM]	= RMIB_FUNC(RMIB_RW | CTLTYPE_INT, sizeof(int),
				    loopif_cksum, "do_loopback_cksum",
				    "Perform TCP checksum on loopback"),
/*+0*/	[TCPCTL_MAXID]		= RMIB_FUNC(RMIB_RO | CTLTYPE_NODE, 0,
				    tcpsock_pcblist, "pcblist",
				    "TCP protocol control block list"),
/*+1*/	[TCPCTL_MAXID + 1]	= RMIB_FUNC(RMIB_RW | CTLFLAG_PRIVATE |
				    CTLFLAG_HIDDEN | CTLTYPE_STRING,
				    TCPISN_SECRET_HEX_LENGTH, tcpisn_secret,
				    "isn_secret",
				    "TCP ISN secret (MINIX 3 specific)")
};

static struct rmib_node net_inet_tcp_node =
    RMIB_NODE(RMIB_RO, net_inet_tcp_table, "tcp", "TCP related settings");
static struct rmib_node net_inet6_tcp6_node =
    RMIB_NODE(RMIB_RO, net_inet_tcp_table, "tcp6", "TCP related settings");
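
/*
 * These nodes mirror a subset of NetBSD's sysctl net.inet.tcp tree, so the
 * values can be inspected from userland with, for example, a command along
 * the lines of "sysctl net.inet.tcp.sendspace" (a hypothetical invocation;
 * the exact set of supported names is defined by the table above).
 */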

/*
 * Initialize the TCP sockets module.
 */
void
tcpsock_init(void)
{
	unsigned int slot;

	/* Initialize the list of free TCP sockets. */
	TAILQ_INIT(&tcp_freelist);

	for (slot = 0; slot < __arraycount(tcp_array); slot++)
		TAILQ_INSERT_TAIL(&tcp_freelist, &tcp_array[slot],
		    tcp_queue.tq_next);

	/* Initialize other variables. */
	tcpsock_sendbufs = 0;

	/* Register the net.inet.tcp and net.inet6.tcp6 RMIB subtrees. */
	mibtree_register_inet(PF_INET, IPPROTO_TCP, &net_inet_tcp_node);
	mibtree_register_inet(PF_INET6, IPPROTO_TCP, &net_inet6_tcp6_node);
}

/*
 * Initialize the state of a TCP socket's send queue.
 */
static void
tcpsock_reset_send(struct tcpsock * tcp)
{

	tcp->tcp_snd.ts_tail = NULL;
	tcp->tcp_snd.ts_unsent = NULL;
	tcp->tcp_snd.ts_head = NULL;
	tcp->tcp_snd.ts_len = 0;
	tcp->tcp_snd.ts_unsent_off = 0;
	tcp->tcp_snd.ts_head_off = 0;
}

/*
 * Initialize the state of a TCP socket's receive queue.
 */
static void
tcpsock_reset_recv(struct tcpsock * tcp)
{

	tcp->tcp_rcv.tr_pre_tailp = NULL;
	tcp->tcp_rcv.tr_head = NULL;
	tcp->tcp_rcv.tr_len = 0;
	tcp->tcp_rcv.tr_head_off = 0;
	tcp->tcp_rcv.tr_unacked = 0;
}

/*
 * Create a TCP socket.
 */
sockid_t
tcpsock_socket(int domain, int protocol, struct sock ** sockp,
	const struct sockevent_ops ** ops)
{
	struct tcpsock *tcp;
	uint8_t ip_type;

	switch (protocol) {
	case 0:
	case IPPROTO_TCP:
		break;

	default:
		return EPROTONOSUPPORT;
	}

	if (TAILQ_EMPTY(&tcp_freelist))
		return ENOBUFS;

	tcp = TAILQ_FIRST(&tcp_freelist);

	/*
	 * Initialize the structure.  Do not memset it to zero, as it is still
	 * part of the linked free list.  Initialization may still fail.  When
	 * adding new fields, make sure to change tcpsock_clone() accordingly.
	 */

	ip_type = ipsock_socket(tcpsock_get_ipsock(tcp), domain,
	    TCP_SNDBUF_DEF, TCP_RCVBUF_DEF, sockp);

	if ((tcp->tcp_pcb = tcp_new_ip_type(ip_type)) == NULL)
		return ENOBUFS;
	tcp_arg(tcp->tcp_pcb, tcp);

	tcp->tcp_listener = NULL;

	tcpsock_reset_send(tcp);
	tcpsock_reset_recv(tcp);

	TAILQ_REMOVE(&tcp_freelist, tcp, tcp_queue.tq_next);

	*ops = &tcpsock_ops;
	return tcpsock_get_id(tcp);
}

/*
 * Create a TCP socket for the TCP PCB 'pcb' which identifies a new connection
 * incoming on listening socket 'listener'.  The new socket is essentially a
 * "clone" of the listening TCP socket, in that it should inherit any settings
 * from the listening socket.  The socket has not yet been accepted by
 * userland, so add it to the queue of connections pending for the listening
 * socket.  On success, return OK.  On failure, return a negative error code.
 */
static int
tcpsock_clone(struct tcpsock * listener, struct tcp_pcb * pcb)
{
	struct tcpsock *tcp;

	if (TAILQ_EMPTY(&tcp_freelist))
		return ENOBUFS;

	tcp = TAILQ_FIRST(&tcp_freelist);

	/*
	 * Initialize the structure.  Do not memset it to zero, as it is still
	 * part of the linked free list.  Initialization may still fail.  Most
	 * settings should be inherited from the listening socket here, rather
	 * than being initialized to their default state.
	 */

	ipsock_clone(tcpsock_get_ipsock(listener), tcpsock_get_ipsock(tcp),
	    tcpsock_get_id(tcp));

	tcp->tcp_pcb = pcb;
	tcp_arg(pcb, tcp);

	tcpsock_reset_send(tcp);
	tcpsock_reset_recv(tcp);

	/*
	 * Remove the new socket from the free list, and add it to the queue
	 * of the listening socket--in this order, because the same next
	 * pointer is used for both.
	 */
	TAILQ_REMOVE(&tcp_freelist, tcp, tcp_queue.tq_next);

	TAILQ_INSERT_TAIL(&listener->tcp_queue.tq_head, tcp,
	    tcp_queue.tq_next);
	tcp->tcp_listener = listener;

	return OK;
}

/*
 * Allocate a buffer from the pool, using the standard pool size.  The
 * returned buffer is a single element--never a chain.
 */
static struct pbuf *
tcpsock_alloc_buf(void)
{
	struct pbuf *pbuf;

	pbuf = pbuf_alloc(PBUF_RAW, MEMPOOL_BUFSIZE, PBUF_RAM);

	assert(pbuf == NULL || pbuf->len == pbuf->tot_len);

	return pbuf;
}

/*
 * Free the given buffer.  Ensure that pbuf_free() will not attempt to free
 * the next buffer(s) in the chain as well.  This may be called for pbufs
 * other than those allocated with tcpsock_alloc_buf().
 */
static void
tcpsock_free_buf(struct pbuf * pbuf)
{

	/*
	 * Resetting the length is currently not necessary, but better safe
	 * than sorry..
	 */
	pbuf->len = pbuf->tot_len;
	pbuf->next = NULL;

	pbuf_free(pbuf);
}

/*
 * Clear the send queue of a TCP socket.  The caller must ensure that lwIP
 * will no longer access any of the data on the send queue.
 */
static void
tcpsock_clear_send(struct tcpsock * tcp)
{
	struct pbuf *phead;

	assert(tcp->tcp_pcb == NULL);

	while ((phead = tcp->tcp_snd.ts_head) != NULL) {
		tcp->tcp_snd.ts_head = phead->next;

		assert(tcpsock_sendbufs > 0);
		tcpsock_sendbufs--;

		tcpsock_free_buf(phead);
	}

	tcpsock_reset_send(tcp);
}

/*
 * Clear the receive queue of a TCP socket.  If 'ack_data' is set, also
 * acknowledge the previous contents of the receive queue to lwIP.
 */
static size_t
tcpsock_clear_recv(struct tcpsock * tcp, int ack_data)
{
	struct pbuf *phead;
	size_t rlen;

	rlen = tcp->tcp_rcv.tr_len;

	while ((phead = tcp->tcp_rcv.tr_head) != NULL) {
		tcp->tcp_rcv.tr_head = phead->next;

		assert(tcpsock_recvbufs > 0);
		tcpsock_recvbufs--;

		tcpsock_free_buf(phead);
	}

	/*
	 * From now on, we will basically be discarding incoming data as fast
	 * as possible, to keep the full window open at all times.
	 */
	if (ack_data && tcp->tcp_pcb != NULL && tcp->tcp_rcv.tr_unacked > 0)
		tcp_recved(tcp->tcp_pcb, tcp->tcp_rcv.tr_unacked);

	tcpsock_reset_recv(tcp);

	return rlen;
}

/*
 * The TCP socket's PCB has been detached from the socket, typically because
 * the connection was aborted, either by us or by lwIP.  Either way, any TCP
 * connection is gone.  Clear the socket's send queue, remove the socket from
 * a listening socket's queue, and if the socket itself is ready and allowed
 * to be freed, free it now.  The socket is ready to be freed if it was either
 * on a listening queue or being closed already.  The socket is allowed to be
 * freed only if 'may_free' is TRUE.  If the socket is not freed, its receive
 * queue is left as is, as it may still have data to be received by userland.
 */
static int
tcpsock_cleanup(struct tcpsock * tcp, int may_free)
{
	int destroy;

	assert(tcp->tcp_pcb == NULL);

	/*
	 * Free any data on the send queue.  This is safe to do right now,
	 * because the PCB has been aborted (or was already gone).  We must be
	 * very careful about clearing the send queue in all other situations.
	 */
	tcpsock_clear_send(tcp);

	/*
	 * If this was a socket pending acceptance, remove it from the
	 * corresponding listener socket's queue, and free it.  Otherwise,
	 * free the socket only if it suspended a graceful close operation.
	 */
	if (tcp->tcp_listener != NULL) {
		TAILQ_REMOVE(&tcp->tcp_listener->tcp_queue.tq_head, tcp,
		    tcp_queue.tq_next);
		tcp->tcp_listener = NULL;

		/*
		 * The listener socket's backlog count should be adjusted by
		 * lwIP whenever the PCB is freed up, so we need not (and must
		 * not) attempt to do that here.
		 */

		destroy = TRUE;
	} else
		destroy = sockevent_is_closing(tcpsock_get_sock(tcp));

	/*
	 * Do not free the socket if 'may_free' is FALSE.  That flag may be
	 * set if we are currently in the second tcpsock_close() call on the
	 * socket, in which case sockevent_is_closing() is TRUE but we must
	 * still not free the socket now: doing so would derail libsockevent.
	 */
	if (destroy && may_free) {
		(void)tcpsock_clear_recv(tcp, FALSE /*ack_data*/);

		sockevent_raise(tcpsock_get_sock(tcp), SEV_CLOSE);
	}

	return destroy;
}

/*
 * Abort the lwIP PCB for the given socket, using tcp_abort().  If the PCB is
 * connected, this will cause the connection to be reset.  The PCB, which must
 * have still been present before the call, will be gone after the call.
 */
static void
tcpsock_pcb_abort(struct tcpsock * tcp)
{

	assert(tcp->tcp_pcb != NULL);
	assert(!tcpsock_is_listening(tcp));

	tcp_recv(tcp->tcp_pcb, NULL);
	tcp_sent(tcp->tcp_pcb, NULL);
	tcp_err(tcp->tcp_pcb, NULL);
	tcp_poll(tcp->tcp_pcb, NULL, TCP_POLL_REG_INTERVAL);

	tcp_arg(tcp->tcp_pcb, NULL);

	tcp_abort(tcp->tcp_pcb);

	tcp->tcp_pcb = NULL;
}

/*
 * Close the lwIP PCB for the given socket, using tcp_close().  If the PCB is
 * connected, its graceful close will be finished by lwIP in the background.
 * The PCB, which must have still been present before the call, will be gone
 * after the call.
 */
static void
tcpsock_pcb_close(struct tcpsock * tcp)
{
	err_t err;

	assert(tcp->tcp_pcb != NULL);
	assert(tcp->tcp_snd.ts_len == 0);

	if (!tcpsock_is_listening(tcp)) {
		tcp_recv(tcp->tcp_pcb, NULL);
		tcp_sent(tcp->tcp_pcb, NULL);
		tcp_err(tcp->tcp_pcb, NULL);
		tcp_poll(tcp->tcp_pcb, NULL, TCP_POLL_REG_INTERVAL);
	}

	tcp_arg(tcp->tcp_pcb, NULL);

	if ((err = tcp_close(tcp->tcp_pcb)) != ERR_OK)
		panic("unexpected TCP close failure: %d", err);

	tcp->tcp_pcb = NULL;
}

/*
 * Return TRUE if all conditions are met for closing the TCP socket's PCB, or
 * FALSE if they are not.  Upon calling this function, the socket's PCB must
 * still be around.
 */
static int
tcpsock_may_close(struct tcpsock * tcp)
{

	assert(tcp->tcp_pcb != NULL);

	/*
	 * Regular closing of the PCB requires three conditions to be met:
	 *
	 * 1. all our data has been transmitted AND acknowledged, so that we
	 *    do not risk corruption in case there are still unsent or
	 *    unack'ed data buffers that may otherwise be recycled too soon;
	 * 2. we have sent our FIN to the peer; and,
	 * 3. we have received a FIN from the peer.
	 */
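
	/*
	 * In terms of the fields used below: condition 1 corresponds to the
	 * send queue being empty (ts_len == 0), since buffers stay on the
	 * send queue until the "sent" event acknowledges them, and conditions
	 * 2 and 3 correspond to the TCPF_SENT_FIN and TCPF_RCVD_FIN flags,
	 * respectively.
	 */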
	return ((tcpsock_get_flags(tcp) & (TCPF_SENT_FIN | TCPF_RCVD_FIN)) ==
	    (TCPF_SENT_FIN | TCPF_RCVD_FIN) && tcp->tcp_snd.ts_len == 0);
}

/*
 * The given socket is ready to be closed as per the tcpsock_may_close()
 * rules.  This implies that its send queue is already empty.  Gracefully
 * close the PCB.  In addition, if the socket is being closed gracefully,
 * meaning we suspended an earlier tcpsock_close() call (and as such already
 * emptied the receive queue as well), then tell libsockevent that the close
 * is finished, freeing the socket.  Return TRUE if the socket has indeed
 * been freed this way, or FALSE if the socket is still around.
 */
static int
tcpsock_finish_close(struct tcpsock * tcp)
{

	assert(tcp->tcp_snd.ts_len == 0);
	assert(tcp->tcp_listener == NULL);

	/*
	 * If we get here, we have already shut down the sending side of the
	 * PCB.  Technically, we are interested only in shutting down the
	 * receiving side of the PCB here, so that lwIP may decide to recycle
	 * the socket later etcetera.  We call tcp_close() because we do not
	 * want to rely on tcp_shutdown(RX) doing the exact same thing.
	 * However, we do rely on the fact that the PCB is not immediately
	 * destroyed by the tcp_close() call: otherwise we may have to return
	 * ERR_ABRT if this function is called from a lwIP-generated event.
	 */
	tcpsock_pcb_close(tcp);

	/*
	 * If we suspended an earlier tcpsock_close() call, we have to tell
	 * libsockevent that the close operation is now complete.
	 */
	if (sockevent_is_closing(tcpsock_get_sock(tcp))) {
		assert(tcp->tcp_rcv.tr_len == 0);

		sockevent_raise(tcpsock_get_sock(tcp), SEV_CLOSE);

		return TRUE;
	} else
		return FALSE;
}

/*
 * Attempt to start or resume enqueuing data and/or a FIN to send on the given
 * TCP socket.  Return TRUE if anything at all could be newly enqueued on the
 * lwIP PCB, even if less than desired.  In that case, the caller should try
 * to send whatever was enqueued, and if applicable, check if the socket may
 * now be closed (due to the FIN being enqueued).  In particular, in any
 * situation where the socket may be in the process of being closed, the
 * caller must use tcpsock_may_close() if TRUE is returned.  Return FALSE if
 * nothing new could be enqueued, in which case no send attempt needs to be
 * made either.
 */
static int
tcpsock_pcb_enqueue(struct tcpsock * tcp)
{
	struct pbuf *punsent;
	size_t space, chunk;
	unsigned int flags;
	err_t err;
	int enqueued;

	assert(tcp->tcp_pcb != NULL);

	if (tcpsock_get_flags(tcp) & TCPF_FULL)
		return FALSE;

	/*
	 * Attempt to enqueue more unsent data, if any, on the PCB's send
	 * queue.
	 */
	enqueued = FALSE;

	while (tcp->tcp_snd.ts_unsent != NULL) {
		if ((space = tcp_sndbuf(tcp->tcp_pcb)) == 0)
			break;

		/*
		 * We may maintain a non-NULL unsent pointer even when there
		 * is nothing more to send right now, because the tail buffer
		 * may be filled up further later on.
		 */
		punsent = tcp->tcp_snd.ts_unsent;

		assert(punsent->len >= tcp->tcp_snd.ts_unsent_off);

		chunk = (size_t)punsent->len - tcp->tcp_snd.ts_unsent_off;
		if (chunk == 0)
			break;

		if (chunk > space)
			chunk = space;

		/* Try to enqueue more data for sending. */
		if (chunk < punsent->len || punsent->next != NULL)
			flags = TCP_WRITE_FLAG_MORE;
		else
			flags = 0;

		err = tcp_write(tcp->tcp_pcb, (char *)punsent->payload +
		    tcp->tcp_snd.ts_unsent_off, chunk, flags);

		/*
		 * Since tcp_write() enqueues data only, it should only return
		 * out-of-memory errors; no fatal ones.  In any case, stop.
		 */
		if (err != ERR_OK) {
			assert(err == ERR_MEM);

			break;
		}

		/* We have successfully enqueued data. */
		enqueued = TRUE;

		tcp->tcp_snd.ts_unsent_off += chunk;

		if (tcp->tcp_snd.ts_unsent_off < punsent->tot_len) {
			assert(tcp->tcp_snd.ts_unsent_off < punsent->len ||
			    punsent->next == NULL);

			break;
		}

		tcp->tcp_snd.ts_unsent = punsent->next;
		tcp->tcp_snd.ts_unsent_off = 0;
	}

	/*
	 * If all pending data has been enqueued for sending, and we should
	 * shut down the sending end of the socket, try that now.
	 */
	if ((tcp->tcp_snd.ts_unsent == NULL ||
	    tcp->tcp_snd.ts_unsent_off == tcp->tcp_snd.ts_unsent->len) &&
	    tcpsock_is_shutdown(tcp, SFL_SHUT_WR) &&
	    !(tcpsock_get_flags(tcp) & TCPF_SENT_FIN)) {
		err = tcp_shutdown(tcp->tcp_pcb, 0 /*shut_rx*/, 1 /*shut_tx*/);

		if (err == ERR_OK) {
			/*
			 * We have successfully enqueued a FIN.  The caller is
			 * now responsible for checking whether the PCB and
			 * possibly even the socket object can now be freed.
			 */
			tcpsock_set_flag(tcp, TCPF_SENT_FIN);

			enqueued = TRUE;
		} else {
			assert(err == ERR_MEM);

			/*
			 * FIXME: the resolution for lwIP bug #47485 has taken
			 * away even more control over the closing process
			 * from us, making tracking sockets especially for
			 * SO_LINGER even harder.  For now, we simply
			 * effectively undo the patch by clearing TF_CLOSEPEND
			 * if tcp_shutdown() returns ERR_MEM.  This will not
			 * be sustainable in the long term, though.
			 */
			tcp->tcp_pcb->flags &= ~TF_CLOSEPEND;

			tcpsock_set_flag(tcp, TCPF_FULL);
		}
	}

	return enqueued;
}
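
/*
 * A sketch of the typical caller pattern for the function above, as also
 * used in tcpsock_send(), tcpsock_event_sent(), and tcpsock_event_poll()
 * below:
 *
 *	if (tcpsock_pcb_enqueue(tcp)) {
 *		if (tcpsock_may_close(tcp))
 *			(void)tcpsock_finish_close(tcp);
 *		else if (tcpsock_pcb_send(tcp, ...) != OK)
 *			return ...;	(socket object may be gone here)
 *	}
 *
 * The exact error handling differs per calling context; see the actual call
 * sites for the details.
 */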

/*
 * Request lwIP to start sending any enqueued data and/or FIN on the TCP
 * socket's lwIP PCB.  On success, return OK.  On failure, return a negative
 * error code, after cleaning up the socket, freeing the PCB.  If the socket
 * was already being closed, also free the socket object in that case; the
 * caller must then not touch the socket object anymore upon return.  If the
 * socket object is not freed, and if 'raise_error' is TRUE, raise the error
 * on the socket object.
 */
static int
tcpsock_pcb_send(struct tcpsock * tcp, int raise_error)
{
	err_t err;
	int r;

	assert(tcp->tcp_pcb != NULL);

	/*
	 * If we have enqueued something, ask lwIP to send TCP packets now.
	 * This may result in a fatal error, in which case we clean up the
	 * socket and return the error to the caller.  Since cleaning up the
	 * socket may free the socket object, and the caller cannot tell
	 * whether that will happen or has happened, also possibly raise the
	 * error on the socket object if it is not gone.  As such, callers
	 * that set 'raise_error' to FALSE must know for sure that the socket
	 * was not being closed, for example because the caller is processing
	 * a (send) call from userland.
	 */
	err = tcp_output(tcp->tcp_pcb);

	if (err != ERR_OK && err != ERR_MEM) {
		tcpsock_pcb_abort(tcp);

		r = util_convert_err(err);

		if (!tcpsock_cleanup(tcp, TRUE /*may_free*/)) {
			if (raise_error)
				sockevent_set_error(tcpsock_get_sock(tcp), r);
		}
		/* Otherwise, do not touch the socket object anymore! */

		return r;
	} else
		return OK;
}

/*
 * Callback from lwIP.  The given number of data bytes have been acknowledged
 * as received by the remote end.  Dequeue and free data from the TCP
 * socket's send queue as appropriate.
 */
static err_t
tcpsock_event_sent(void * arg, struct tcp_pcb * pcb __unused, uint16_t len)
{
	struct tcpsock *tcp = (struct tcpsock *)arg;
	struct pbuf *phead;
	size_t left;

	assert(tcp != NULL);
	assert(pcb == tcp->tcp_pcb);
	assert(len > 0);

	assert(tcp->tcp_snd.ts_len >= len);
	assert(tcp->tcp_snd.ts_head != NULL);

	left = len;

	/*
	 * First see if we can free up whole buffers.  Check against the head
	 * buffer's 'len' rather than 'tot_len', or we may end up leaving an
	 * empty buffer on the chain.
	 */
	while ((phead = tcp->tcp_snd.ts_head) != NULL &&
	    left >= (size_t)phead->len - tcp->tcp_snd.ts_head_off) {
		left -= (size_t)phead->len - tcp->tcp_snd.ts_head_off;

		tcp->tcp_snd.ts_head = phead->next;
		tcp->tcp_snd.ts_head_off = 0;

		if (phead == tcp->tcp_snd.ts_unsent) {
			assert(tcp->tcp_snd.ts_unsent_off == phead->len);

			tcp->tcp_snd.ts_unsent = phead->next;
			tcp->tcp_snd.ts_unsent_off = 0;
		}

		assert(tcpsock_sendbufs > 0);
		tcpsock_sendbufs--;

		tcpsock_free_buf(phead);
	}

	/*
	 * The rest of the given length is for less than the current head
	 * buffer.
	 */
	if (left > 0) {
		assert(tcp->tcp_snd.ts_head != NULL);
		assert((size_t)tcp->tcp_snd.ts_head->len -
		    tcp->tcp_snd.ts_head_off > left);

		tcp->tcp_snd.ts_head_off += left;
	}

	tcp->tcp_snd.ts_len -= (size_t)len;

	if (tcp->tcp_snd.ts_head == NULL) {
		assert(tcp->tcp_snd.ts_len == 0);
		assert(tcp->tcp_snd.ts_unsent == NULL);
		tcp->tcp_snd.ts_tail = NULL;
	} else
		assert(tcp->tcp_snd.ts_len > 0);

	/*
	 * If we emptied the send queue, and we already managed to send a FIN
	 * earlier, we may now have met all requirements to close the socket's
	 * PCB.  Otherwise, we may also be able to send more now, so try to
	 * resume sending.  Since we are invoked from the "sent" event,
	 * tcp_output() will not actually process anything, and so we do not
	 * call it either.  If we did, we would have to deal with errors here.
	 */
	if (tcpsock_may_close(tcp)) {
		if (tcpsock_finish_close(tcp))
			return ERR_OK;
	} else {
		tcpsock_clear_flag(tcp, TCPF_FULL);

		/*
		 * If we now manage to enqueue a FIN, we may be ready to close
		 * the PCB after all.
		 */
		if (tcpsock_pcb_enqueue(tcp)) {
			if (tcpsock_may_close(tcp) &&
			    tcpsock_finish_close(tcp))
				return ERR_OK;
		}
	}

	/* The user may also be able to send more now. */
	sockevent_raise(tcpsock_get_sock(tcp), SEV_SEND);

	return ERR_OK;
}

/*
 * Check whether any (additional) data previously received on a TCP socket
 * should be acknowledged, possibly allowing the remote end to send additional
 * data as a result.
 */
static void
tcpsock_ack_recv(struct tcpsock * tcp)
{
	size_t rcvbuf, left, delta, ack;

	assert(tcp->tcp_pcb != NULL);

	/*
	 * We must make sure that at all times, we can still add an entire
	 * window's worth of data to the receive queue.  If the amount of free
	 * space drops below that threshold, we stop acknowledging received
	 * data.  The user may change the receive buffer size at all times; we
	 * update the window size lazily as appropriate.
	 */
	rcvbuf = tcpsock_get_rcvbuf(tcp);

	if (rcvbuf > tcp->tcp_rcv.tr_len && tcp->tcp_rcv.tr_unacked > 0) {
		/*
		 * The number of bytes that lwIP can still give us at any time
		 * is represented as 'left'.  The number of bytes that we
		 * still allow to be stored in the receive queue is
		 * represented as 'delta'.  We must make sure that 'left' does
		 * not ever exceed 'delta' while acknowledging as many bytes
		 * as possible under that rule.
		 */
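
		/*
		 * As a worked example, suppose TCP_WND is 32768 (the actual
		 * value is set in the lwIP configuration), the receive buffer
		 * size is also 32768, 4096 bytes are on the receive queue,
		 * and 8192 bytes are unacknowledged.  Then 'left' is 24576
		 * and 'delta' is 28672, so we may acknowledge up to 4096
		 * bytes, which is within the 8192-byte unacknowledged limit.
		 */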
		left = TCP_WND - tcp->tcp_rcv.tr_unacked;
		delta = rcvbuf - tcp->tcp_rcv.tr_len;

		if (left < delta) {
			ack = delta - left;

			if (ack > tcp->tcp_rcv.tr_unacked)
				ack = tcp->tcp_rcv.tr_unacked;

			tcp_recved(tcp->tcp_pcb, ack);

			tcp->tcp_rcv.tr_unacked -= ack;

			assert(tcp->tcp_rcv.tr_len + TCP_WND -
			    tcp->tcp_rcv.tr_unacked <= rcvbuf);
		}
	}
}

/*
 * Attempt to merge two consecutive underfilled buffers in the receive queue
 * of a TCP socket, freeing up one of the two buffers as a result.  The first
 * (oldest) buffer is 'ptail', and the pointer to this buffer is stored at
 * 'pnext'.  The second (new) buffer is 'pbuf', which is already attached to
 * the first buffer.  The second buffer may be followed by additional buffers
 * with even more new data.  Return TRUE if buffers have been merged, in
 * which case the pointer at 'pnext' may have changed, and no assumptions
 * should be made about whether 'ptail' and 'pbuf' still exist in any form.
 * Return FALSE if no merging was necessary or if no new buffer could be
 * allocated.
 */
static int
tcpsock_try_merge(struct pbuf **pnext, struct pbuf * ptail, struct pbuf * pbuf)
{
	struct pbuf *pnew;

	assert(*pnext == ptail);
	assert(ptail->next == pbuf);

	/*
	 * Unfortunately, we cannot figure out what kind of pbuf we were given
	 * by the lower layers, so we cannot merge two buffers without first
	 * allocating a third.  Once we have done that, though, we can easily
	 * merge more into that new buffer.  For now we use the following
	 * policies:
	 *
	 * 1. if two consecutive lwIP-provided buffers are both used less than
	 *    half the size of a full buffer, try to allocate a new buffer and
	 *    copy both lwIP-provided buffers into that new buffer, freeing up
	 *    the pair afterwards;
	 * 2. if the tail buffer on the chain is allocated by us and not yet
	 *    full, and the next buffer's contents can be added to the tail
	 *    buffer in their entirety, do just that.
	 *
	 * Obviously there is a trade-off between the performance overhead of
	 * copying and the resource overhead of keeping less-than-full buffers
	 * on the receive queue, but this policy should both keep actual
	 * memory usage to no more than twice the receive queue length and
	 * prevent excessive copying.  The policy deliberately performs more
	 * aggressive merging into a buffer that we allocated ourselves.
	 */
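
	/*
	 * For illustration, with the standard 512-byte pool buffers: two
	 * consecutive lwIP-provided buffers holding 200 and 180 bytes fall
	 * under case #1, so both are copied into one newly allocated buffer
	 * holding 380 bytes.  If a next buffer then holds 100 bytes, case #2
	 * applies: 512 - 380 = 132 bytes of space remain in our own tail
	 * buffer, so the 100 bytes are appended to it directly.
	 */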
	if (ptail->tot_len <= MEMPOOL_BUFSIZE / 2 &&
	    pbuf->len <= MEMPOOL_BUFSIZE / 2) {
		/*
		 * Case #1.
		 */
		assert(ptail->tot_len == ptail->len);
		assert(pbuf->tot_len == pbuf->len);

		pnew = tcpsock_alloc_buf();
		if (pnew == NULL)
			return FALSE;

		memcpy(pnew->payload, ptail->payload, ptail->len);
		memcpy((char *)pnew->payload + ptail->len, pbuf->payload,
		    pbuf->len);
		pnew->len = ptail->len + pbuf->len;
		assert(pnew->len <= pnew->tot_len);

		pnew->next = pbuf->next;
		/* For now, we need not inherit any flags from either pbuf. */

		*pnext = pnew;

		/* One allocated, two about to be deallocated. */
		assert(tcpsock_recvbufs > 0);
		tcpsock_recvbufs--;

		tcpsock_free_buf(ptail);
		tcpsock_free_buf(pbuf);

		return TRUE;
	} else if (ptail->tot_len - ptail->len >= pbuf->len) {
		/*
		 * Case #2.
		 */
		memcpy((char *)ptail->payload + ptail->len, pbuf->payload,
		    pbuf->len);

		ptail->len += pbuf->len;

		ptail->next = pbuf->next;

		assert(tcpsock_recvbufs > 0);
		tcpsock_recvbufs--;

		tcpsock_free_buf(pbuf);

		return TRUE;
	} else
		return FALSE;
}

/*
 * Callback from lwIP.  New data or flags have been received on a TCP socket.
 */
static err_t
tcpsock_event_recv(void * arg, struct tcp_pcb * pcb __unused,
	struct pbuf * pbuf, err_t err)
{
	struct tcpsock *tcp = (struct tcpsock *)arg;
	struct pbuf *ptail, **pprevp;
	size_t len;

	assert(tcp != NULL);
	assert(pcb == tcp->tcp_pcb);

	/*
	 * lwIP should never provide anything other than ERR_OK in 'err', and
	 * it is not clear what we should do if it would.  If lwIP ever
	 * changes in this regard, we will likely have to change this code
	 * accordingly.
	 */
	if (err != ERR_OK)
		panic("TCP receive event with error: %d", err);

	/* If the given buffer is NULL, we have received a FIN. */
	if (pbuf == NULL) {
		tcpsock_set_flag(tcp, TCPF_RCVD_FIN);

		/* Userland may now receive EOF. */
		if (!tcpsock_is_shutdown(tcp, SFL_SHUT_RD))
			sockevent_raise(tcpsock_get_sock(tcp), SEV_RECV);

		/*
		 * If we were in the process of closing the socket, and we
		 * receive a FIN before our FIN got acknowledged, we close the
		 * socket anyway, as described in tcpsock_close().  However,
		 * if there is still unacknowledged outgoing data or we did
		 * not even manage to send our FIN yet, hold off closing the
		 * socket for now.
		 */
		if (tcpsock_may_close(tcp))
			(void)tcpsock_finish_close(tcp);

		return ERR_OK;
	}

	/*
	 * If the socket is being closed, receiving new data should cause a
	 * reset.
	 */
	if (sockevent_is_closing(tcpsock_get_sock(tcp))) {
		tcpsock_pcb_abort(tcp);

		(void)tcpsock_cleanup(tcp, TRUE /*may_free*/);
		/* Do not touch the socket object anymore! */

		pbuf_free(pbuf);

		return ERR_ABRT;
	}

	/*
	 * If the socket has already been shut down for reading, discard the
	 * incoming data and do nothing else.
	 */
	if (tcpsock_is_shutdown(tcp, SFL_SHUT_RD)) {
		tcp_recved(tcp->tcp_pcb, pbuf->tot_len);

		pbuf_free(pbuf);

		return ERR_OK;
	}

	/*
	 * We deliberately ignore the PBUF_FLAG_PUSH flag.  This flag would
	 * enable the receive functionality to delay delivering "un-pushed"
	 * data to applications.  The implementation of this scheme could
	 * track the amount of data up to and including the last-pushed
	 * segment using a "tr_push_len" field or so.  Deciding when to
	 * deliver "un-pushed" data after all is a bit trickier though.  As
	 * far as I can tell, the BSDs do not implement anything like that.
	 * Windows does, and this results in interaction problems with even
	 * more lightweight TCP/IP stacks that do not send the TCP PSH flag.
	 * Currently, there is no obvious benefit for us to support delaying
	 * data delivery like that.  In addition, testing its implementation
	 * reliably would be difficult.
	 */

	len = (size_t)pbuf->tot_len;

	/*
	 * Count the number of buffers that are now owned by us.  The new
	 * total of buffers owned by us must not exceed the size of the memory
	 * pool.  Any more would indicate an accounting error.  Note that
	 * tcpsock_recvbufs is currently used for debugging only!
	 */
	tcpsock_recvbufs += pbuf_clen(pbuf);
	assert(tcpsock_recvbufs < mempool_cur_buffers());

	/*
	 * The pre-tail pointer points to whatever is pointing to the tail
	 * buffer.  The latter pointer may be the 'tr_head' field in our
	 * tcpsock structure, or the 'next' field in the penultimate buffer,
	 * or NULL if there are currently no buffers on the receive queue.
	 */
	if ((pprevp = tcp->tcp_rcv.tr_pre_tailp) != NULL) {
		ptail = *pprevp;

		assert(ptail != NULL);
		assert(ptail->next == NULL);
		assert(tcp->tcp_rcv.tr_head != NULL);

		ptail->next = pbuf;
		pbuf->tot_len = pbuf->len;	/* to help freeing on merges */

		if (tcpsock_try_merge(pprevp, ptail, pbuf)) {
			ptail = *pprevp;
			pbuf = ptail->next;
		}

		if (pbuf != NULL)
			pprevp = &ptail->next;
	} else {
		assert(tcp->tcp_rcv.tr_head == NULL);
		assert(tcp->tcp_rcv.tr_head_off == 0);

		tcp->tcp_rcv.tr_head = pbuf;

		pprevp = &tcp->tcp_rcv.tr_head;
	}

	/*
	 * Chop up the chain into individual buffers.  This is necessary as we
	 * overload 'tot_len' to mean "space available in the buffer", as we
	 * want for buffers allocated by us as part of buffer merges.  Also
	 * get a pointer to the pointer to the new penultimate tail buffer.
	 * Due to merging, the chain may already be empty by now, though.
	 */
	if (pbuf != NULL) {
		for (; pbuf->next != NULL; pbuf = pbuf->next) {
			pbuf->tot_len = pbuf->len;

			pprevp = &pbuf->next;
		}
		assert(pbuf->len == pbuf->tot_len);
	}

	assert(*pprevp != NULL);
	assert((*pprevp)->next == NULL);
	tcp->tcp_rcv.tr_pre_tailp = pprevp;

	tcp->tcp_rcv.tr_len += len;
	tcp->tcp_rcv.tr_unacked += len;

	assert(tcp->tcp_rcv.tr_unacked <= TCP_WND);

	/*
	 * Note that tr_len may now exceed the receive buffer size in the
	 * highly exceptional case that the user is adjusting the latter after
	 * the socket had already received data.
	 */

	/* See if we can immediately acknowledge some or all of the data. */
	tcpsock_ack_recv(tcp);

	/* Also wake up any receivers now. */
	sockevent_raise(tcpsock_get_sock(tcp), SEV_RECV);

	return ERR_OK;
}

/*
 * Callback from lwIP.  The PCB corresponding to the socket identified by
 * 'arg' has been closed by lwIP, with the reason specified in 'err': either
 * the connection has been aborted locally (ERR_ABRT), it has been reset by
 * the remote end (ERR_RST), or it is closed due to state transitions
 * (ERR_CLSD).
 */
static void
tcpsock_event_err(void * arg, err_t err)
{
	struct tcpsock *tcp = (struct tcpsock *)arg;
	int r;

	assert(tcp != NULL);
	assert(tcp->tcp_pcb != NULL);
	assert(err != ERR_OK);

	/* The original PCB is now gone, or will be shortly. */
	tcp->tcp_pcb = NULL;

	/*
	 * Clean up the socket.  As a result it may be freed, in which case we
	 * must not touch it anymore.  No need to return ERR_ABRT from here,
	 * as the PCB has been aborted already.
	 */
	if (tcpsock_cleanup(tcp, TRUE /*may_free*/))
		return;

	if (err == ERR_CLSD) {
		/*
		 * We may get here if the socket is shut down for writing and
		 * we already received a FIN from the remote side, thus
		 * putting the socket in LAST_ACK state, and we receive that
		 * last acknowledgment.  There is nothing more we need to do.
		 *
		 * We will never get here in the other case that ERR_CLSD is
		 * raised, which is when the socket is reset because of
		 * unacknowledged data while closing: we handle the
		 * reset-on-ACK case ourselves in tcpsock_close(), and the
		 * socket is in closing state after that.
		 */
		assert(tcpsock_is_shutdown(tcp, SFL_SHUT_WR));
		assert(tcpsock_get_flags(tcp) & TCPF_RCVD_FIN);
	} else {
		/*
		 * Anything else should be an error directly from lwIP;
		 * currently either ERR_ABRT or ERR_RST.  Convert it to a
		 * regular error and set it on the socket.  Doing so will also
		 * raise the appropriate events.
		 *
		 * Unfortunately, lwIP is not throwing accurate errors even
		 * when it can.  We convert some errors to reflect more
		 * accurately the most likely cause.
		 *
		 * TODO: fix lwIP in this regard..
		 */
		r = util_convert_err(err);

		if (tcpsock_get_flags(tcp) & TCPF_CONNECTING) {
			switch (err) {
			case ERR_ABRT:	r = ETIMEDOUT;		break;
			case ERR_RST:	r = ECONNREFUSED;	break;
			}
		}

		sockevent_set_error(tcpsock_get_sock(tcp), r);
	}
}

/*
 * Callback from lwIP.  Perform regular checks on a TCP socket.  This function
 * is called once every five seconds on connected sockets, and twice per
 * second on closing sockets.
 */
static err_t
tcpsock_event_poll(void * arg, struct tcp_pcb * pcb __unused)
{
	struct tcpsock *tcp = (struct tcpsock *)arg;
	err_t err;
	int r;

	assert(tcp != NULL);
	assert(pcb == tcp->tcp_pcb);

	/*
	 * If we ended up running out of buffers earlier, try resuming any
	 * send requests now, both for enqueuing TCP data with lwIP and for
	 * user requests.
	 */
	if (tcpsock_get_flags(tcp) & (TCPF_FULL | TCPF_OOM)) {
		tcpsock_clear_flag(tcp, TCPF_FULL);
		tcpsock_clear_flag(tcp, TCPF_OOM);

		/* See if we can enqueue more data with lwIP. */
		if (tcpsock_pcb_enqueue(tcp)) {
			/* In some cases, we can now close the PCB. */
			if (tcpsock_may_close(tcp)) {
				(void)tcpsock_finish_close(tcp);
				/*
				 * The PCB is definitely gone here, and the
				 * entire socket object may be gone now too.
				 * Do not touch either anymore!
				 */

				return ERR_OK;
			}

			/*
			 * If actually sending the data fails, the PCB will be
			 * gone, and the socket object may be gone as well.
			 * Do not touch either anymore in that case!
			 */
			if (tcpsock_pcb_send(tcp, TRUE /*raise_error*/) != OK)
				return ERR_ABRT;
		}

		/*
		 * If we ran out of buffers earlier, it may be possible to
		 * take in more data from a user process now, even if we did
		 * not manage to enqueue any more pending data with lwIP.
		 */
		sockevent_raise(tcpsock_get_sock(tcp), SEV_SEND);

		assert(tcp->tcp_pcb != NULL);
	} else if (tcp->tcp_snd.ts_unsent != NULL &&
	    tcp->tcp_snd.ts_unsent_off < tcp->tcp_snd.ts_unsent->len) {
		/*
		 * If the send buffer is full, we will no longer call
		 * tcp_output(), which means we may also miss out on fatal
		 * errors that would otherwise kill the connection (e.g., no
		 * route).  As a result, the connection may erroneously
		 * continue to exist for a long time.  To avoid this, we call
		 * tcp_output() every once in a while when there is still
		 * unsent data.
		 */
		err = tcp_output(tcp->tcp_pcb);

		if (err != ERR_OK && err != ERR_MEM) {
			tcpsock_pcb_abort(tcp);

			if (!tcpsock_cleanup(tcp, TRUE /*may_free*/)) {
				r = util_convert_err(err);

				sockevent_set_error(tcpsock_get_sock(tcp), r);
			}
			/* Otherwise do not touch the socket object anymore! */

			return ERR_ABRT;
		}
	}

	/*
	 * If we are closing the socket, and we sent a FIN, see if the FIN got
	 * acknowledged.  If so, finish closing the socket.  Unfortunately, we
	 * can perform this check by polling only.  TODO: change lwIP..
	 */
	if (sockevent_is_closing(tcpsock_get_sock(tcp)) &&
	    (tcpsock_get_flags(tcp) & TCPF_SENT_FIN) &&
	    tcp->tcp_pcb->unsent == NULL && tcp->tcp_pcb->unacked == NULL) {
		assert(tcp->tcp_snd.ts_len == 0);

		tcpsock_finish_close(tcp);
	}

	return ERR_OK;
}

/*
 * Bind a TCP socket to a local address.
 */
static int
tcpsock_bind(struct sock * sock, const struct sockaddr * addr,
	socklen_t addr_len, endpoint_t user_endpt)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	ip_addr_t ipaddr;
	uint16_t port;
	err_t err;
	int r;

	if (tcp->tcp_pcb == NULL || tcp->tcp_pcb->state != CLOSED)
		return EINVAL;

	if ((r = ipsock_get_src_addr(tcpsock_get_ipsock(tcp), addr, addr_len,
	    user_endpt, &tcp->tcp_pcb->local_ip, tcp->tcp_pcb->local_port,
	    FALSE /*allow_mcast*/, &ipaddr, &port)) != OK)
		return r;

	err = tcp_bind(tcp->tcp_pcb, &ipaddr, port);

	return util_convert_err(err);
}

/*
 * Callback from lwIP.  A new connection 'pcb' has arrived on the listening
 * socket identified by 'arg'.  Note that 'pcb' may be NULL in the case that
 * lwIP could not accept the connection itself.
 */
static err_t
tcpsock_event_accept(void * arg, struct tcp_pcb * pcb, err_t err)
{
	struct tcpsock *tcp = (struct tcpsock *)arg;

	assert(tcp != NULL);
	assert(tcpsock_is_listening(tcp));

	/*
	 * If the given PCB is NULL, then lwIP ran out of memory allocating a
	 * PCB for the new connection.  There is nothing we can do with that
	 * information.  Also check 'err' just to make sure.
	 */
	if (pcb == NULL || err != ERR_OK)
		return ERR_OK;

	/*
	 * The TCP socket is the listening socket, but the PCB is for the
	 * incoming connection.
	 */
	if (tcpsock_clone(tcp, pcb) != OK) {
		/*
		 * We could not allocate the resources necessary to accept the
		 * connection.  Abort it immediately.
		 */
		tcp_abort(pcb);

		return ERR_ABRT;
	}

	/*
	 * The connection has not yet been accepted, and thus should still be
	 * considered on the listen queue.
	 */
	tcp_backlog_delayed(pcb);

	/* Set the callback functions. */
	tcp_recv(pcb, tcpsock_event_recv);
	tcp_sent(pcb, tcpsock_event_sent);
	tcp_err(pcb, tcpsock_event_err);
	tcp_poll(pcb, tcpsock_event_poll, TCP_POLL_REG_INTERVAL);

	sockevent_raise(tcpsock_get_sock(tcp), SEV_ACCEPT);

	return ERR_OK;
}

/*
 * Put a TCP socket in listening mode.
 */
static int
tcpsock_listen(struct sock * sock, int backlog)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	struct tcp_pcb *pcb;
	err_t err;

	/* The maximum backlog value must not exceed its field size. */
	assert(SOMAXCONN <= UINT8_MAX);

	/*
	 * Allow only CLOSED sockets to enter listening mode.  If the socket
	 * was already in listening mode, allow its backlog value to be
	 * updated, even if it was shut down already (making this a no-op).
	 */
	if (!tcpsock_is_listening(tcp) &&
	    (tcp->tcp_pcb == NULL || tcp->tcp_pcb->state != CLOSED))
		return EINVAL;

	/*
	 * If the socket was not already in listening mode, put it in that
	 * mode now.  That involves switching PCBs as lwIP attempts to save
	 * memory by replacing the original PCB with a smaller one.  If the
	 * socket was already in listening mode, simply update its backlog
	 * value--this has no effect on the sockets already in the backlog.
	 */
	if (!tcpsock_is_listening(tcp)) {
		assert(tcp->tcp_pcb != NULL);

		/*
		 * If the socket has not been bound to a port yet, do that
		 * first.  This does mean that the listen call may fail with
		 * side effects, but that is acceptable in this case.
		 */
		if (tcp->tcp_pcb->local_port == 0) {
			err = tcp_bind(tcp->tcp_pcb, &tcp->tcp_pcb->local_ip,
			    0 /*port*/);

			if (err != ERR_OK)
				return util_convert_err(err);
		}

		/*
		 * Clear the argument on the PCB that is about to be replaced,
		 * because if we do not, once the PCB is reused (which does
		 * not clear the argument), we might get weird events.  Do
		 * this before the tcp_listen() call, because we should no
		 * longer access the old PCB afterwards (even if we can).
		 */
		tcp_arg(tcp->tcp_pcb, NULL);

		pcb = tcp_listen_with_backlog_and_err(tcp->tcp_pcb, backlog,
		    &err);

		if (pcb == NULL) {
			tcp_arg(tcp->tcp_pcb, tcp);	/* oops, undo. */

			return util_convert_err(err);
		}

		tcp_arg(pcb, tcp);
		tcp->tcp_pcb = pcb;

		tcp_accept(pcb, tcpsock_event_accept);

		/* Initialize the queue head for sockets pending acceptance. */
		TAILQ_INIT(&tcp->tcp_queue.tq_head);
	} else if (tcp->tcp_pcb != NULL)
		tcp_backlog_set(tcp->tcp_pcb, backlog);

	return OK;
}

/*
 * Callback from lwIP.  A socket connection attempt has succeeded.  Note that
 * failed connection attempts will trigger the tcpsock_event_err() callback
 * instead.
 */
static err_t
tcpsock_event_connected(void * arg, struct tcp_pcb * pcb __unused, err_t err)
{
	struct tcpsock *tcp = (struct tcpsock *)arg;

	assert(tcp != NULL);
	assert(pcb == tcp->tcp_pcb);
	assert(tcpsock_get_flags(tcp) & TCPF_CONNECTING);

	/*
	 * If lwIP ever changes so that this callback is called for connect
	 * failures as well, then we need to change the code here accordingly.
	 */
	if (err != ERR_OK)
		panic("TCP connected event with error: %d", err);

	tcpsock_clear_flag(tcp, TCPF_CONNECTING);

	sockevent_raise(tcpsock_get_sock(tcp), SEV_CONNECT | SEV_SEND);

	return ERR_OK;
}

/*
 * Connect a TCP socket to a remote address.
 */
static int
tcpsock_connect(struct sock * sock, const struct sockaddr * addr,
	socklen_t addr_len, endpoint_t user_endpt)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	ip_addr_t dst_addr;
	uint16_t dst_port;
	err_t err;
	int r;

	/*
	 * Listening sockets may not have a PCB, so we use higher-level flags
	 * to throw the correct error code for those instead.
	 */
	if (tcpsock_is_listening(tcp))
		return EOPNOTSUPP;

	/*
	 * If there is no longer any PCB, we obviously cannot perform the
	 * connection, but POSIX is not clear on which error to return.  We
	 * copy NetBSD's.
	 */
	if (tcp->tcp_pcb == NULL)
		return EINVAL;

	/*
	 * The only state from which a connection can be initiated, is CLOSED.
	 * Some of the other states require distinct error codes, though.
	 */
	switch (tcp->tcp_pcb->state) {
	case CLOSED:
		break;
	case SYN_SENT:
		return EALREADY;
	case LISTEN:
		assert(0);	/* we just checked.. */
	default:
		return EISCONN;
	}

	/*
	 * Get the destination address, and attempt to start connecting.  If
	 * the socket was not bound before, or it was bound to a port only,
	 * then lwIP will select a source address for us.  We cannot do this
	 * ourselves even if we wanted to: it is impossible to re-bind a TCP
	 * PCB in the case it was previously bound to a port only.
	 */
	if ((r = ipsock_get_dst_addr(tcpsock_get_ipsock(tcp), addr, addr_len,
	    &tcp->tcp_pcb->local_ip, &dst_addr, &dst_port)) != OK)
		return r;

	err = tcp_connect(tcp->tcp_pcb, &dst_addr, dst_port,
	    tcpsock_event_connected);

	/*
	 * Note that various tcp_connect() error cases will leave the PCB with
	 * a newly set local and remote IP address anyway.  We should be
	 * careful not to rely on the addresses being as they were before.
	 */
	if (err != ERR_OK)
		return util_convert_err(err);

	/* Set the other callback functions. */
	tcp_recv(tcp->tcp_pcb, tcpsock_event_recv);
	tcp_sent(tcp->tcp_pcb, tcpsock_event_sent);
	tcp_err(tcp->tcp_pcb, tcpsock_event_err);
	tcp_poll(tcp->tcp_pcb, tcpsock_event_poll, TCP_POLL_REG_INTERVAL);

	/*
	 * Set a flag so that we can correct lwIP's error codes in case the
	 * connection fails.
	 */
	tcpsock_set_flag(tcp, TCPF_CONNECTING);

	return SUSPEND;
}
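
/*
 * Since the connect call above returns SUSPEND, completion is reported
 * asynchronously: on success, tcpsock_event_connected() raises SEV_CONNECT,
 * and on failure, tcpsock_event_err() translates the lwIP error (using the
 * TCPF_CONNECTING flag set above) and raises the appropriate events through
 * sockevent_set_error().
 */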

/*
 * Test whether any new connections are pending on a listening TCP socket.
 */
static int
tcpsock_test_accept(struct sock * sock)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;

	/* Is this socket in listening mode at all? */
	if (!tcpsock_is_listening(tcp))
		return EINVAL;

	/* Are there any connections to accept right now? */
	if (!TAILQ_EMPTY(&tcp->tcp_queue.tq_head))
		return OK;

	/* If the socket has been shut down, we return ECONNABORTED. */
	if (tcp->tcp_pcb == NULL)
		return ECONNABORTED;

	/* Otherwise, wait for a new connection first. */
	return SUSPEND;
}

/*
 * Accept a connection on a listening TCP socket, creating a new TCP socket.
 */
static sockid_t
tcpsock_accept(struct sock * sock, struct sockaddr * addr,
	socklen_t * addr_len, endpoint_t user_endpt __unused,
	struct sock ** newsockp)
{
	struct tcpsock *listener = (struct tcpsock *)sock;
	struct tcpsock *tcp;
	int r;

	if ((r = tcpsock_test_accept(sock)) != OK)
		return r;
	/* Below, we must not assume that the listener has a PCB. */

	tcp = TAILQ_FIRST(&listener->tcp_queue.tq_head);
	assert(tcp->tcp_listener == listener);
	assert(tcp->tcp_pcb != NULL);

	TAILQ_REMOVE(&listener->tcp_queue.tq_head, tcp, tcp_queue.tq_next);
	tcp->tcp_listener = NULL;

	tcp_backlog_accepted(tcp->tcp_pcb);

	ipsock_put_addr(tcpsock_get_ipsock(tcp), addr, addr_len,
	    &tcp->tcp_pcb->remote_ip, tcp->tcp_pcb->remote_port);

	/*
	 * Set 'newsockp' to NULL so that libsockevent knows we already cloned
	 * the socket, and it must not be reinitialized anymore.
	 */
	*newsockp = NULL;
	return tcpsock_get_id(tcp);
}

/*
 * Perform preliminary checks on a send request.
 */
static int
tcpsock_pre_send(struct sock * sock, size_t len __unused,
	socklen_t ctl_len __unused, const struct sockaddr * addr __unused,
	socklen_t addr_len __unused, endpoint_t user_endpt __unused, int flags)
{

	/*
	 * Reject calls with unknown flags.  Since libsockevent strips out the
	 * flags it handles itself here, we only have to test for ones we
	 * cannot handle.  Currently, there are no send flags that we support.
	 */
	if (flags != 0)
		return EOPNOTSUPP;

	return OK;
}

/*
 * Test whether the given number of data bytes can be sent on a TCP socket.
 */
static int
tcpsock_test_send(struct sock * sock, size_t min)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	size_t sndbuf;

	if (tcp->tcp_pcb == NULL)
		return EPIPE;

	switch (tcp->tcp_pcb->state) {
	case CLOSED:		/* new */
	case LISTEN:		/* listening */
		return ENOTCONN;
	case SYN_SENT:		/* connecting */
	case SYN_RCVD:		/* simultaneous open, maybe someday? */
		return SUSPEND;
	case ESTABLISHED:	/* connected */
	case CLOSE_WAIT:	/* closed remotely */
		break;
	default:		/* shut down locally */
		assert(tcpsock_is_shutdown(tcp, SFL_SHUT_WR));
		return EPIPE;
	}

	sndbuf = tcpsock_get_sndbuf(tcp);
	if (min > sndbuf)
		min = sndbuf;

	if (tcp->tcp_snd.ts_len + min > sndbuf)
		return SUSPEND;
	else
		return OK;
}
1728 * Send data on a TCP socket.
1730 static int
1731 tcpsock_send(struct sock * sock, const struct sockdriver_data * data,
1732 size_t len, size_t * offp, const struct sockdriver_data * ctl __unused,
1733 socklen_t ctl_len __unused, socklen_t * ctl_off __unused,
1734 const struct sockaddr * addr __unused, socklen_t addr_len __unused,
1735 endpoint_t user_endpt __unused, int flags __unused, size_t min)
1737 struct tcpsock *tcp = (struct tcpsock *)sock;
1738 struct pbuf *ptail, *pfirst, *pnext, *plast;
1739 size_t off, tail_off, chunk, left, sndbuf;
1740 int r;
1742 if ((r = tcpsock_test_send(sock, min)) != OK)
1743 return r;
1745 if (len == 0)
1746 return OK; /* nothing to do */
1748 sndbuf = tcpsock_get_sndbuf(tcp);
1749 if (min > sndbuf)
1750 min = sndbuf;
1751 assert(min > 0);
1753 assert(sndbuf > tcp->tcp_snd.ts_len);
1754 left = sndbuf - tcp->tcp_snd.ts_len;
1755 if (left > len)
1756 left = len;
1759 * First see if we can fit any more data in the current tail buffer.
1760 * If so, we set 'ptail' to point to it and 'tail_off' to the previous
1761 * length of the tail buffer, while optimistically extending it to
1762 * include the new data. If not, we set them to NULL/0.
1764 if ((ptail = tcp->tcp_snd.ts_tail) != NULL &&
1765 ptail->len < ptail->tot_len) {
1766 assert(ptail->len > 0);
1767 tail_off = (size_t)ptail->len;
1770 * Optimistically extend the head buffer to include whatever
1771 * fits in it. This is needed for util_copy_data().
1773 assert(ptail->tot_len > ptail->len);
1774 off = (size_t)ptail->tot_len - (size_t)ptail->len;
1775 if (off > left)
1776 off = left;
1777 ptail->len += off;
1778 } else {
1779 ptail = NULL;
1780 tail_off = 0;
1781 off = 0;
1785 * Then, if there is more to send, allocate new buffers as needed. If
1786 * we run out of memory, work with whatever we did manage to grab.
1788 pfirst = NULL;
1789 plast = NULL;
1790 while (off < left) {
1791 if (tcpsock_sendbufs >= TCP_MAX_SENDBUFS ||
1792 (pnext = tcpsock_alloc_buf()) == NULL) {
1794 * Chances are that we will end up suspending this send
1795 * request because of being out of buffers. We try to
1796 * resume such requests from the polling function.
1798 tcpsock_set_flag(tcp, TCPF_OOM);
1800 break;
1803 tcpsock_sendbufs++;
1805 if (pfirst == NULL)
1806 pfirst = pnext;
1807 else
1808 plast->next = pnext;
1809 plast = pnext;
1811 chunk = (size_t)pnext->tot_len;
1812 if (chunk > left - off)
1813 chunk = left - off;
1814 pnext->len = chunk;
1815 off += chunk;
1819 * Copy in the data and continue, unless we did not manage to find
1820 * enough space to even meet the low send watermark, in which case we
1821 * undo any allocation and suspend the call until later.
1823 if (off >= min) {
1825 * Optimistically attach the new buffers to the tail, also for
1826 * util_copy_data(). We undo all this if the copy fails.
1828 if (ptail != NULL) {
1829 ptail->next = pfirst;
1831 pnext = ptail;
1832 } else
1833 pnext = pfirst;
1835 assert(pnext != NULL);
1837 r = util_copy_data(data, off, *offp, pnext, tail_off,
1838 TRUE /*copy_in*/);
1839 } else
1840 r = SUSPEND;
1842 if (r != OK) {
1843 /* Undo the modifications made so far. */
1844 while (pfirst != NULL) {
1845 pnext = pfirst->next;
1847 assert(tcpsock_sendbufs > 0);
1848 tcpsock_sendbufs--;
1850 tcpsock_free_buf(pfirst);
1852 pfirst = pnext;
1855 if (ptail != NULL) {
1856 ptail->next = NULL;
1858 ptail->len = tail_off;
1861 return r;
1864 /* Attach the new buffers, if any, to the buffer tail. */
1865 if (pfirst != NULL) {
1866 if ((ptail = tcp->tcp_snd.ts_tail) != NULL) {
1867 assert(ptail->len == ptail->tot_len);
1870 * Due to our earlier optimistic modifications, this
1871 * may or may not be redundant.
1873 ptail->next = pfirst;
1876 assert(plast != NULL);
1877 tcp->tcp_snd.ts_tail = plast;
1879 if (tcp->tcp_snd.ts_head == NULL) {
1880 tcp->tcp_snd.ts_head = pfirst;
1881 assert(tcp->tcp_snd.ts_head_off == 0);
1883 if (tcp->tcp_snd.ts_unsent == NULL) {
1884 tcp->tcp_snd.ts_unsent = pfirst;
1885 assert(tcp->tcp_snd.ts_unsent_off == 0);
1889 tcp->tcp_snd.ts_len += off;
1892 * See if we can send any of the data we just enqueued. The socket is
1893 * still open as we are still processing a call from userland on it;
1894	 * this saves us from having to deal with the case where the following
1895 * calls end up freeing the socket object.
1897 if (tcpsock_pcb_enqueue(tcp) &&
1898 (r = tcpsock_pcb_send(tcp, FALSE /*raise_error*/)) != OK) {
1900 * That did not go well. Return the error immediately if we
1901 * had not made any progress earlier. Otherwise, return our
1902 * partial progress and leave the error to be picked up later.
1904 if (*offp > 0) {
1905 sockevent_set_error(tcpsock_get_sock(tcp), r);
1907 return OK;
1908 } else
1909 return r;
1912 *offp += off;
1913 return (off < len) ? SUSPEND : OK;
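/*
 * A minimal userland sketch (hypothetical, not part of this module): since
 * the code above may report partial progress, with any send error delivered
 * only on a later call, callers should loop over short writes.
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>

static ssize_t
example_send_all(int fd, const char * buf, size_t len)
{
	size_t off;
	ssize_t r;

	for (off = 0; off < len; off += (size_t)r) {
		/* A short write is not an error; simply send the rest. */
		if ((r = send(fd, &buf[off], len - off, 0)) < 0)
			return -1;
	}

	return (ssize_t)len;
}
#endif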
1917 * Perform preliminary checks on a receive request.
1919 static int
1920 tcpsock_pre_recv(struct sock * sock __unused, endpoint_t user_endpt __unused,
1921 int flags)
1925 * Reject calls with unknown flags. Since libsockevent strips out the
1926	 * flags it handles itself here, we only have to test for ones we
1927	 * cannot handle.
1929 if ((flags & ~(MSG_PEEK | MSG_WAITALL)) != 0)
1930 return EOPNOTSUPP;
1932 return OK;
1936 * Return TRUE if receive calls may wait for more data to come in on the
1937 * connection, or FALSE if we already know that that is not going to happen.
1939 static int
1940 tcpsock_may_wait(struct tcpsock * tcp)
1943 return (tcp->tcp_pcb != NULL &&
1944 !(tcpsock_get_flags(tcp) & TCPF_RCVD_FIN));
1948 * Test whether data can be received on a TCP socket, and if so, how many bytes
1949 * of data.
1951 static int
1952 tcpsock_test_recv(struct sock * sock, size_t min, size_t * size)
1954 struct tcpsock *tcp = (struct tcpsock *)sock;
1955 int may_wait;
1957	/* If there is a PCB but there never was a connection, refuse the call. */
1958 if (tcp->tcp_pcb != NULL && (tcp->tcp_pcb->state == CLOSED ||
1959 tcp->tcp_pcb->state == LISTEN))
1960 return ENOTCONN;
1963 * If we are certain that no more data will come in later, ignore the
1964 * low receive watermark. Otherwise, bound it to the size of the
1965 * receive buffer, or receive calls may block forever.
1967 if (!(may_wait = tcpsock_may_wait(tcp)))
1968 min = 1;
1969 else if (min > tcpsock_get_rcvbuf(tcp))
1970 min = tcpsock_get_rcvbuf(tcp);
1972 if (tcp->tcp_rcv.tr_len >= min) {
1973 if (size != NULL)
1974 *size = tcp->tcp_rcv.tr_len;
1976 return OK;
1979 return (may_wait) ? SUSPEND : SOCKEVENT_EOF;
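/*
 * A minimal userland sketch (hypothetical, not part of this module): the
 * 'min' value bounded above is presumably the receive low watermark
 * (SO_RCVLOWAT).  Without the clamp to the receive buffer size, a watermark
 * larger than the buffer could never be reached, and receive calls would
 * block forever.
 */
#if 0
#include <sys/socket.h>

static int
example_set_recv_lowat(int fd, int lowat)
{

	/* Unblock recv(2) only once at least 'lowat' bytes have arrived. */
	return setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT, &lowat, sizeof(lowat));
}
#endif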
1983 * Receive data on a TCP socket.
1985 static int
1986 tcpsock_recv(struct sock * sock, const struct sockdriver_data * data,
1987 size_t len, size_t * offp, const struct sockdriver_data * ctl __unused,
1988 socklen_t ctl_len __unused, socklen_t * ctl_off __unused,
1989 struct sockaddr * addr __unused, socklen_t * addr_len __unused,
1990 endpoint_t user_endpt __unused, int flags, size_t min,
1991 int * rflags __unused)
1993 struct tcpsock *tcp = (struct tcpsock *)sock;
1994 struct pbuf *ptail;
1995 size_t off, left;
1996 int r;
1998 /* See if we can receive at all, and if so, how much at most. */
1999 if ((r = tcpsock_test_recv(sock, min, NULL)) != OK)
2000 return r;
2002 if (len == 0)
2003 return OK; /* nothing to do */
2005 off = tcp->tcp_rcv.tr_len;
2006 if (off > len)
2007 off = len;
2009 assert(tcp->tcp_rcv.tr_head != NULL);
2010 assert(tcp->tcp_rcv.tr_head_off < tcp->tcp_rcv.tr_head->len);
2012 /* Copy out the data to the caller. */
2013 if ((r = util_copy_data(data, off, *offp, tcp->tcp_rcv.tr_head,
2014 tcp->tcp_rcv.tr_head_off, FALSE /*copy_in*/)) != OK)
2015 return r;
2017 /* Unless peeking, remove the data from the receive queue. */
2018 if (!(flags & MSG_PEEK)) {
2019 left = off;
2021 /* Dequeue and free as many entire buffers as possible. */
2022 while ((ptail = tcp->tcp_rcv.tr_head) != NULL &&
2023 left >= (size_t)ptail->len - tcp->tcp_rcv.tr_head_off) {
2024 left -= (size_t)ptail->len - tcp->tcp_rcv.tr_head_off;
2026 tcp->tcp_rcv.tr_head = ptail->next;
2027 tcp->tcp_rcv.tr_head_off = 0;
2029 if (tcp->tcp_rcv.tr_head == NULL)
2030 tcp->tcp_rcv.tr_pre_tailp = NULL;
2031 else if (tcp->tcp_rcv.tr_pre_tailp == &ptail->next)
2032 tcp->tcp_rcv.tr_pre_tailp =
2033 &tcp->tcp_rcv.tr_head;
2035 assert(tcpsock_recvbufs > 0);
2036 tcpsock_recvbufs--;
2038 tcpsock_free_buf(ptail);
2042 * If only part of the (new) head buffer is consumed, adjust
2043 * the saved offset into that buffer.
2045 if (left > 0) {
2046 assert(tcp->tcp_rcv.tr_head != NULL);
2047 assert((size_t)tcp->tcp_rcv.tr_head->len -
2048 tcp->tcp_rcv.tr_head_off > left);
2050 tcp->tcp_rcv.tr_head_off += left;
2053 tcp->tcp_rcv.tr_len -= off;
2055 if (tcp->tcp_rcv.tr_head != NULL) {
2056 assert(tcp->tcp_rcv.tr_pre_tailp != NULL);
2057 assert(tcp->tcp_rcv.tr_len > 0);
2058 } else {
2059 assert(tcp->tcp_rcv.tr_pre_tailp == NULL);
2060 assert(tcp->tcp_rcv.tr_len == 0);
2064 * The receive buffer has shrunk, so there may now be space to
2065 * receive more data.
2067 if (tcp->tcp_pcb != NULL)
2068 tcpsock_ack_recv(tcp);
2069 } else
2070 flags &= ~MSG_WAITALL; /* for the check below */
2072 /* Advance the current copy position, and see if we are done. */
2073 *offp += off;
2074 if ((flags & MSG_WAITALL) && off < len && tcpsock_may_wait(tcp))
2075 return SUSPEND;
2076 else
2077 return OK;
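/*
 * A minimal userland sketch (hypothetical, not part of this module) of the
 * two receive flags supported here: MSG_PEEK copies out data without
 * dequeuing it, and MSG_WAITALL resumes the call until the full amount has
 * arrived or no more data can be expected (see tcpsock_may_wait()).
 */
#if 0
#include <stddef.h>
#include <sys/socket.h>

static void
example_recv_flags(int fd, char * buf, size_t len)
{

	/* Look at pending data; a later recv(2) sees the same bytes. */
	(void)recv(fd, buf, len, MSG_PEEK);

	/* Block until 'len' bytes have been received, or EOF/error. */
	(void)recv(fd, buf, len, MSG_WAITALL);
}
#endif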
2081 * Update the set of flag-type socket options on a TCP socket.
2083 static void
2084 tcpsock_setsockmask(struct sock * sock, unsigned int mask)
2086 struct tcpsock *tcp = (struct tcpsock *)sock;
2088 if (tcp->tcp_pcb == NULL)
2089 return;
2091 if (mask & SO_REUSEADDR)
2092 ip_set_option(tcp->tcp_pcb, SOF_REUSEADDR);
2093 else
2094 ip_reset_option(tcp->tcp_pcb, SOF_REUSEADDR);
2096 if (mask & SO_KEEPALIVE)
2097 ip_set_option(tcp->tcp_pcb, SOF_KEEPALIVE);
2098 else
2099 ip_reset_option(tcp->tcp_pcb, SOF_KEEPALIVE);
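/*
 * A minimal userland sketch (hypothetical, not part of this module): a
 * standard SO_KEEPALIVE setsockopt(2) call ends up in the code above, which
 * mirrors the flag into lwIP's SOF_KEEPALIVE option on the PCB.
 */
#if 0
#include <sys/socket.h>

static int
example_enable_keepalive(int fd)
{
	int on = 1;

	return setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
}
#endif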
2103 * Prepare a helper structure for IP-level option processing.
2105 static void
2106 tcpsock_get_ipopts(struct tcpsock * tcp, struct ipopts * ipopts)
2109 ipopts->local_ip = &tcp->tcp_pcb->local_ip;
2110 ipopts->remote_ip = &tcp->tcp_pcb->remote_ip;
2111 ipopts->tos = &tcp->tcp_pcb->tos;
2112 ipopts->ttl = &tcp->tcp_pcb->ttl;
2113 ipopts->sndmin = TCP_SNDBUF_MIN;
2114 ipopts->sndmax = TCP_SNDBUF_MAX;
2115 ipopts->rcvmin = TCP_RCVBUF_MIN;
2116 ipopts->rcvmax = TCP_RCVBUF_MAX;
2120 * Set socket options on a TCP socket.
2122 static int
2123 tcpsock_setsockopt(struct sock * sock, int level, int name,
2124 const struct sockdriver_data * data, socklen_t len)
2126 struct tcpsock *tcp = (struct tcpsock *)sock;
2127 struct ipopts ipopts;
2128 uint32_t uval;
2129 int r, val;
2131 if (tcp->tcp_pcb == NULL)
2132 return ECONNRESET;
2134 /* Handle TCP-level options. */
2135 switch (level) {
2136 case IPPROTO_IPV6:
2137 switch (name) {
2138 case IPV6_RECVTCLASS:
2139 if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
2140 len)) != OK)
2141 return r;
2144 * This option is not supported for TCP sockets; it
2145 * would not even make sense. However, named(8)
2146 * insists on trying to set it anyway. We accept the
2147 * request but ignore the value, not even returning
2148 * what was set through getsockopt(2).
2150 return OK;
2152 case IPV6_FAITH:
2153 if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
2154 len)) != OK)
2155 return r;
2158 * This option is not supported at all, but to save
2159 * ourselves from having to remember the current state
2160 * for getsockopt(2), we also refuse to enable it.
2162 if (val != 0)
2163 return EINVAL;
2165 return OK;
2168 break;
2170 case IPPROTO_TCP:
2171 switch (name) {
2172 case TCP_NODELAY:
2174 * lwIP's listening TCP PCBs do not have this field.
2175 * If this ever becomes an issue, we can create our own
2176 * shadow flag and do the inheritance ourselves.
2178 if (tcp->tcp_pcb->state == LISTEN)
2179 return EINVAL;
2181 if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
2182 len)) != OK)
2183 return r;
2185 if (val)
2186 tcp_nagle_disable(tcp->tcp_pcb);
2187 else
2188 tcp_nagle_enable(tcp->tcp_pcb);
2190 return OK;
2192 case TCP_KEEPIDLE:
2193 case TCP_KEEPINTVL:
2195 * lwIP's listening TCP PCBs do not have these fields.
2197 if (tcp->tcp_pcb->state == LISTEN)
2198 return EINVAL;
2200 if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
2201 len)) != OK)
2202 return r;
2204 if (val == 0)
2205 return EINVAL;
2208			 * The given value is in seconds and interpreted as
2209			 * unsigned, but lwIP stores the value in milliseconds
2210			 * in a uint32_t field, so we have to limit large values
2211			 * to whatever fits in that field anyway.
2213 if (val < 0 || (uint32_t)val > UINT32_MAX / 1000)
2214 uval = UINT32_MAX;
2215 else
2216 uval = (uint32_t)val * 1000;
2218 if (name == TCP_KEEPIDLE)
2219 tcp->tcp_pcb->keep_idle = uval;
2220 else
2221 tcp->tcp_pcb->keep_intvl = uval;
2223 return OK;
2225 case TCP_KEEPCNT:
2226 /* lwIP's listening TCP PCBs do not have this field. */
2227 if (tcp->tcp_pcb->state == LISTEN)
2228 return EINVAL;
2230 if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
2231 len)) != OK)
2232 return r;
2234 if (val == 0)
2235 return EINVAL;
2237 tcp->tcp_pcb->keep_cnt = (uint32_t)val;
2239 return OK;
2242 return EOPNOTSUPP;
2245 /* Handle all other options at the IP level. */
2246 tcpsock_get_ipopts(tcp, &ipopts);
2248 return ipsock_setsockopt(tcpsock_get_ipsock(tcp), level, name, data,
2249 len, &ipopts);
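/*
 * A minimal userland sketch (hypothetical, not part of this module) of the
 * TCP keepalive options handled above.  The values are given in seconds; as
 * the code above shows, lwIP stores the idle and interval times in
 * milliseconds, so for example a TCP_KEEPIDLE of 75 is stored as 75000, and
 * values above UINT32_MAX / 1000 seconds are clamped.
 */
#if 0
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

static void
example_tune_keepalive(int fd)
{
	int idle = 75, intvl = 10, cnt = 5;

	/* Start probing after 75 seconds of idle time.. */
	(void)setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle));

	/* ..then probe every 10 seconds.. */
	(void)setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl,
	    sizeof(intvl));

	/* ..and drop the connection after 5 unanswered probes. */
	(void)setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt));
}
#endif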
2253 * Retrieve socket options on a TCP socket.
2255 static int
2256 tcpsock_getsockopt(struct sock * sock, int level, int name,
2257 const struct sockdriver_data * data, socklen_t * len)
2259 struct tcpsock *tcp = (struct tcpsock *)sock;
2260 struct ipopts ipopts;
2261 int val;
2263 if (tcp->tcp_pcb == NULL)
2264 return ECONNRESET;
2266 /* Handle TCP-level options. */
2267 switch (level) {
2268 case IPPROTO_IPV6:
2269 switch (name) {
2270 case IPV6_RECVTCLASS:
2271 case IPV6_FAITH:
2272 val = 0;
2274 return sockdriver_copyout_opt(data, &val, sizeof(val),
2275 len);
2278 break;
2280 case IPPROTO_TCP:
2281 switch (name) {
2282 case TCP_NODELAY:
2283 /* lwIP's listening TCP PCBs do not have this field. */
2284 if (tcp->tcp_pcb->state == LISTEN)
2285 return EINVAL;
2287 val = tcp_nagle_disabled(tcp->tcp_pcb);
2289 return sockdriver_copyout_opt(data, &val, sizeof(val),
2290 len);
2292 case TCP_MAXSEG:
2293 /* lwIP's listening TCP PCBs do not have this field. */
2294 if (tcp->tcp_pcb->state == LISTEN)
2295 return EINVAL;
2297 /* This option is read-only at this time. */
2298 val = tcp->tcp_pcb->mss;
2300 return sockdriver_copyout_opt(data, &val, sizeof(val),
2301 len);
2303 case TCP_KEEPIDLE:
2304 /* lwIP's listening TCP PCBs do not have this field. */
2305 if (tcp->tcp_pcb->state == LISTEN)
2306 return EINVAL;
2308 val = (int)(tcp->tcp_pcb->keep_idle / 1000);
2310 return sockdriver_copyout_opt(data, &val, sizeof(val),
2311 len);
2313 case TCP_KEEPINTVL:
2314 /* lwIP's listening TCP PCBs do not have this field. */
2315 if (tcp->tcp_pcb->state == LISTEN)
2316 return EINVAL;
2318 val = (int)(tcp->tcp_pcb->keep_intvl / 1000);
2320 return sockdriver_copyout_opt(data, &val, sizeof(val),
2321 len);
2323 case TCP_KEEPCNT:
2324 /* lwIP's listening TCP PCBs do not have this field. */
2325 if (tcp->tcp_pcb->state == LISTEN)
2326 return EINVAL;
2328 val = (int)tcp->tcp_pcb->keep_cnt;
2330 return sockdriver_copyout_opt(data, &val, sizeof(val),
2331 len);
2334 return EOPNOTSUPP;
2337 /* Handle all other options at the IP level. */
2338 tcpsock_get_ipopts(tcp, &ipopts);
2340 return ipsock_getsockopt(tcpsock_get_ipsock(tcp), level, name, data,
2341 len, &ipopts);
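/*
 * A minimal userland sketch (hypothetical, not part of this module):
 * TCP_MAXSEG is read-only here, so it can be queried as below but not set;
 * attempts to set it fall through to the EOPNOTSUPP case in the setsockopt
 * code above.
 */
#if 0
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

static int
example_get_mss(int fd)
{
	socklen_t len;
	int mss;

	len = sizeof(mss);
	if (getsockopt(fd, IPPROTO_TCP, TCP_MAXSEG, &mss, &len) != 0)
		return -1;

	return mss;
}
#endif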
2345 * Retrieve the local socket address of a TCP socket.
2347 static int
2348 tcpsock_getsockname(struct sock * sock, struct sockaddr * addr,
2349 socklen_t * addr_len)
2351 struct tcpsock *tcp = (struct tcpsock *)sock;
2353 if (tcp->tcp_pcb == NULL)
2354 return EINVAL;
2356 ipsock_put_addr(tcpsock_get_ipsock(tcp), addr, addr_len,
2357 &tcp->tcp_pcb->local_ip, tcp->tcp_pcb->local_port);
2359 return OK;
2363 * Retrieve the remote socket address of a TCP socket.
2365 static int
2366 tcpsock_getpeername(struct sock * sock, struct sockaddr * addr,
2367 socklen_t * addr_len)
2369 struct tcpsock *tcp = (struct tcpsock *)sock;
2371 if (tcp->tcp_pcb == NULL || tcp->tcp_pcb->state == CLOSED ||
2372 tcp->tcp_pcb->state == LISTEN || tcp->tcp_pcb->state == SYN_SENT)
2373 return ENOTCONN;
2375 ipsock_put_addr(tcpsock_get_ipsock(tcp), addr, addr_len,
2376 &tcp->tcp_pcb->remote_ip, tcp->tcp_pcb->remote_port);
2378 return OK;
2382 * Perform a TCP half-close on a TCP socket. This operation may not complete
2383 * immediately due to memory conditions, in which case it will be completed at
2384 * a later time.
2386 static void
2387 tcpsock_send_fin(struct tcpsock * tcp)
2390 sockevent_set_shutdown(tcpsock_get_sock(tcp), SFL_SHUT_WR);
2393 * Attempt to send the FIN. If a fatal error occurs as a result, raise
2394 * it as an asynchronous error, because this function's callers cannot
2395 * do much with it. That happens to match the way these functions are
2396 * used elsewhere. In any case, as a result, the PCB may be closed.
2397 * However, we are never called from a situation where the socket is
2398 * being closed here, so the socket object will not be freed either.
2400 if (tcpsock_pcb_enqueue(tcp)) {
2401 assert(!sockevent_is_closing(tcpsock_get_sock(tcp)));
2403 if (tcpsock_may_close(tcp))
2404 tcpsock_finish_close(tcp);
2405 else
2406 (void)tcpsock_pcb_send(tcp, TRUE /*raise_error*/);
2411 * Shut down a TCP socket for reading and/or writing.
2413 static int
2414 tcpsock_shutdown(struct sock * sock, unsigned int mask)
2416 struct tcpsock *tcp = (struct tcpsock *)sock;
2419 * If the PCB is gone, we want to allow shutdowns for reading but not
2420 * writing: shutting down for writing affects the PCB, shutting down
2421 * for reading does not. Also, if the PCB is in CLOSED state, we would
2422 * not know how to deal with subsequent operations after a shutdown for
2423 * writing, so forbid such calls altogether.
2425 if ((tcp->tcp_pcb == NULL || tcp->tcp_pcb->state == CLOSED) &&
2426 (mask & SFL_SHUT_WR))
2427 return ENOTCONN;
2430 * Handle listening sockets as a special case. Shutting down a
2431 * listening socket frees its PCB. Sockets pending on the accept queue
2432 * may still be accepted, but after that, accept(2) will start
2433 * returning ECONNABORTED. This feature allows multi-process server
2434	 * applications to shut down gracefully, supposedly.
2436 if (tcpsock_is_listening(tcp)) {
2437 if (tcp->tcp_pcb != NULL)
2438 tcpsock_pcb_close(tcp);
2440 return OK;
2444	 * We control shutdown-for-reading locally, and intentionally do not tell
2445 * lwIP about it: if we do that and also shut down for writing, the PCB
2446 * may disappear (now or eventually), which is not what we want.
2447 * Instead, we only tell lwIP to shut down for reading once we actually
2448 * want to get rid of the PCB, using tcp_close(). In the meantime, if
2449 * the socket is shut down for reading by the user, we simply discard
2450 * received data as fast as we can--one out of a number of possible
2451 * design choices there, and (reportedly) the one used by the BSDs.
2453 if (mask & SFL_SHUT_RD)
2454 (void)tcpsock_clear_recv(tcp, TRUE /*ack_data*/);
2457	 * Shutting down a connecting socket for writing simply closes its PCB.
2458	 * Closing a PCB in SYN_SENT state simply deallocates it, so this
2459	 * cannot fail.  On the other hand, for connected sockets we want to send
2460	 * a FIN, which may fail due to memory shortage, in which case we have
2461	 * to try again later.
2463 if (mask & SFL_SHUT_WR) {
2464 if (tcp->tcp_pcb->state == SYN_SENT)
2465 tcpsock_pcb_close(tcp);
2466 else if (!tcpsock_is_shutdown(tcp, SFL_SHUT_WR))
2467 tcpsock_send_fin(tcp);
2470 return OK;
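/*
 * A minimal userland sketch (hypothetical, not part of this module) of the
 * half-close handled above: send a request, shut down for writing so that a
 * FIN goes out, and keep reading until the peer closes its side as well.
 */
#if 0
#include <sys/socket.h>
#include <unistd.h>

static void
example_half_close(int fd, const char * req, size_t len)
{
	char buf[512];

	(void)send(fd, req, len, 0);

	/* Send our FIN; the receiving side of the socket stays usable. */
	(void)shutdown(fd, SHUT_WR);

	/* Drain the response until the peer's FIN results in EOF. */
	while (read(fd, buf, sizeof(buf)) > 0)
		continue;
}
#endif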
2474 * Close a TCP socket. Complete the operation immediately if possible, or
2475 * otherwise initiate the closing process and complete it later, notifying
2476 * libsockevent about that as well. Depending on linger settings, this
2477 * function may be called twice on the same socket: the first time with the
2478 * 'force' flag cleared, and the second time with the 'force' flag set.
2480 static int
2481 tcpsock_close(struct sock * sock, int force)
2483 struct tcpsock *tcp = (struct tcpsock *)sock;
2484 struct tcpsock *queued;
2485 size_t rlen;
2487 assert(tcp->tcp_listener == NULL);
2490	 * If this is a listening socket, abort and clean up any and all
2491 * connections on its listener queue. Note that the listening socket
2492 * may or may not have a PCB at this point.
2494 if (tcpsock_is_listening(tcp)) {
2495 while (!TAILQ_EMPTY(&tcp->tcp_queue.tq_head)) {
2496 queued = TAILQ_FIRST(&tcp->tcp_queue.tq_head);
2498 tcpsock_pcb_abort(queued);
2500 (void)tcpsock_cleanup(queued, TRUE /*may_free*/);
2505 * Clear the receive queue, and make sure that we no longer add new
2506 * data to it. The latter is relevant only for the case that we end up
2507 * returning SUSPEND below. Remember whether there were bytes left,
2508 * because we should reset the connection if there were.
2510 rlen = tcpsock_clear_recv(tcp, FALSE /*ack_data*/);
2512 sockevent_set_shutdown(tcpsock_get_sock(tcp), SFL_SHUT_RD);
2515 * If the socket is connected, perform a graceful shutdown, unless 1)
2516 * we are asked to force-close the socket, or 2) if the local side has
2517 * not consumed all data, as per RFC 1122 Sec.4.2.2.13. Normally lwIP
2518 * would take care of the second point, but we may have data in our
2519 * receive buffer of which lwIP is not aware.
2521 * Implementing proper linger support is somewhat difficult with lwIP.
2522 * In particular, we cannot reliably wait for our FIN to be ACK'ed by
2523 * the other side in all cases:
2525	 * - the lwIP TCP transition from state CLOSING to TIME_WAIT does not
2526 * trigger any event and once in the TIME_WAIT state, the poll event
2527 * no longer triggers either;
2528 * - the lwIP TCP transition from states FIN_WAIT_1 and FIN_WAIT_2 to
2529 * TIME_WAIT will trigger a receive event, but it is not clear
2530 * whether we can reliably check that our FIN was ACK'ed from there.
2532 * That means we have to compromise. Instead of the proper approach,
2533 * we complete our side of the close operation whenever:
2535	 * 1. all of our data was acknowledged, AND,
2536 * 2. our FIN was sent, AND,
2537 * 3a. our FIN was acknowledged, OR,
2538 * 3b. we received a FIN from the other side.
2540	 * With the addition of rule 3b, we do not run into the above
2541 * reliability problems, but we may return from SO_LINGER-blocked close
2542 * calls too early and thus give callers a false impression of success.
2543 * TODO: if lwIP ever gets improved on this point, the code in this
2544 * module should be rewritten to make use of the improvements.
2546 * The set of rules is basically the same as for closing the PCB early
2547 * as per tcpsock_may_close(), except with the check for our FIN being
2548 * acknowledged. Unfortunately only the FIN_WAIT_2, TIME_WAIT, and
2549 * (reentered) CLOSED TCP states guarantee that there are no
2550 * unacknowledged data segments anymore, so we may have to wait for
2551 * reaching any one of these before we can actually finish closing the
2552 * socket with tcp_close().
2554 * In addition, lwIP does not tell us when our FIN gets acknowledged,
2555 * so we have to use polling and direct access to lwIP's PCB fields
2556 * instead, just like lwIP's BSD API does. There is no other way.
2557 * Also, we may not even be able to send the FIN right away, in which
2558 * case we must defer that until later.
2560 if (tcp->tcp_pcb != NULL) {
2561 switch (tcp->tcp_pcb->state) {
2562 case CLOSE_WAIT:
2563 case CLOSING:
2564 case LAST_ACK:
2565 assert(tcpsock_get_flags(tcp) & TCPF_RCVD_FIN);
2567 /* FALLTHROUGH */
2568 case SYN_RCVD:
2569 case ESTABLISHED:
2570 case FIN_WAIT_1:
2571 /* First check if we should abort the connection. */
2572 if (force || rlen > 0)
2573 break;
2576 * If we have not sent a FIN yet, try sending it now;
2577 * if all other conditions are met for closing the
2578 * socket, successful FIN transmission will complete
2579 * the close. Otherwise, perform the close check
2580 * explicitly.
2582 if (!tcpsock_is_shutdown(tcp, SFL_SHUT_WR))
2583 tcpsock_send_fin(tcp);
2584 else if (tcpsock_may_close(tcp))
2585 tcpsock_pcb_close(tcp);
2588 * If at this point the PCB is gone, we managed to
2589 * close the connection immediately, and the socket has
2590 * already been cleaned up by now. This may occur if
2591 * there is no unacknowledged data and we already
2592 * received a FIN earlier on.
2594 if (tcp->tcp_pcb == NULL)
2595 return OK;
2598 * Complete the close operation at a later time.
2599 * Adjust the polling interval, so that we can detect
2600 * completion of the close as quickly as possible.
2602 tcp_poll(tcp->tcp_pcb, tcpsock_event_poll,
2603 TCP_POLL_CLOSE_INTERVAL);
2605 return SUSPEND;
2607 default:
2609 * The connection is either not yet established, or
2610 * already in a state where we can close it right now.
2612 tcpsock_pcb_close(tcp);
2617	 * Abort the connection if the PCB is still around, and clean up the
2618 * socket. We cannot let tcpsock_cleanup() free the socket object yet,
2619 * because we are still in the callback from libsockevent, and the
2620 * latter cannot handle the socket object being freed from here.
2622 if (tcp->tcp_pcb != NULL)
2623 tcpsock_pcb_abort(tcp);
2625 (void)tcpsock_cleanup(tcp, FALSE /*may_free*/);
2627 return OK;
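/*
 * A minimal userland sketch (hypothetical, not part of this module): with
 * SO_LINGER set as below, close(2) blocks for up to l_linger seconds while
 * the completion rules above are being met; if the timer expires first,
 * this function is called a second time with 'force' set and the
 * connection is aborted.
 */
#if 0
#include <sys/socket.h>
#include <unistd.h>

static void
example_linger_close(int fd)
{
	struct linger l;

	l.l_onoff = 1;		/* enable lingering on close */
	l.l_linger = 5;		/* wait for at most five seconds */

	(void)setsockopt(fd, SOL_SOCKET, SO_LINGER, &l, sizeof(l));
	(void)close(fd);
}
#endif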
2631 * Free up a closed TCP socket.
2633 static void
2634 tcpsock_free(struct sock * sock)
2636 struct tcpsock *tcp = (struct tcpsock *)sock;
2638 assert(tcp->tcp_pcb == NULL);
2639 assert(tcp->tcp_snd.ts_len == 0);
2640 assert(tcp->tcp_snd.ts_head == NULL);
2641 assert(tcp->tcp_rcv.tr_len == 0);
2642 assert(tcp->tcp_rcv.tr_head == NULL);
2644 TAILQ_INSERT_HEAD(&tcp_freelist, tcp, tcp_queue.tq_next);
2647 /* This table maps TCP states from lwIP numbers to NetBSD numbers. */
2648 static const struct {
2649 int tsm_tstate;
2650 int tsm_sostate;
2651 } tcpsock_statemap[] = {
2652 [CLOSED] = { TCPS_CLOSED, SS_ISDISCONNECTED },
2653 [LISTEN] = { TCPS_LISTEN, 0 },
2654 [SYN_SENT] = { TCPS_SYN_SENT, SS_ISCONNECTING },
2655 [SYN_RCVD] = { TCPS_SYN_RECEIVED, SS_ISCONNECTING },
2656 [ESTABLISHED] = { TCPS_ESTABLISHED, SS_ISCONNECTED },
2657 [FIN_WAIT_1] = { TCPS_FIN_WAIT_1, SS_ISDISCONNECTING },
2658 [FIN_WAIT_2] = { TCPS_FIN_WAIT_2, SS_ISDISCONNECTING },
2659 [CLOSE_WAIT] = { TCPS_CLOSE_WAIT, SS_ISCONNECTED },
2660 [CLOSING] = { TCPS_CLOSING, SS_ISDISCONNECTING },
2661 [LAST_ACK] = { TCPS_LAST_ACK, SS_ISDISCONNECTING },
2662 [TIME_WAIT] = { TCPS_TIME_WAIT, SS_ISDISCONNECTED },
2666 * Fill the given kinfo_pcb sysctl(7) structure with information about the TCP
2667 * PCB identified by the given pointer.
2669 static void
2670 tcpsock_get_info(struct kinfo_pcb * ki, const void * ptr)
2672 const struct tcp_pcb *pcb = (const struct tcp_pcb *)ptr;
2673 struct tcpsock *tcp;
2676	 * Not all TCP PCBs have an associated tcpsock structure.  We take care
2677	 * to clear the callback argument for PCBs on any of the TCP lists, so
2678	 * that we can use the callback argument to determine whether there is
2679	 * an associated tcpsock structure.  There is one exception: PCBs for
2680	 * incoming connections that have not yet been fully established (i.e.,
2681	 * in SYN_RCVD state).  These have the callback argument of the
2682	 * listening socket (which itself may already have been deallocated at
2683	 * this point), but must not be considered as associated with the
2684	 * listening socket's tcpsock structure.
2686 if (pcb->callback_arg != NULL && pcb->state != SYN_RCVD) {
2687 tcp = (struct tcpsock *)pcb->callback_arg;
2688 assert(tcp >= tcp_array &&
2689 tcp < &tcp_array[__arraycount(tcp_array)]);
2691 /* TODO: change this so that sockstat(1) may work one day. */
2692 ki->ki_sockaddr = (uint64_t)(uintptr_t)tcpsock_get_sock(tcp);
2693 } else {
2694 /* No tcpsock. Could also be in TIME_WAIT state etc. */
2695 tcp = NULL;
2697 ki->ki_sostate = SS_NOFDREF;
2700 ki->ki_type = SOCK_STREAM;
2702 if ((unsigned int)pcb->state < __arraycount(tcpsock_statemap)) {
2703 ki->ki_tstate = tcpsock_statemap[pcb->state].tsm_tstate;
2704 /* TODO: this needs work, but does anything rely on it? */
2705 ki->ki_sostate |= tcpsock_statemap[pcb->state].tsm_sostate;
2708 /* Careful with the LISTEN state here (see below). */
2709 ipsock_get_info(ki, &pcb->local_ip, pcb->local_port,
2710 &pcb->remote_ip, (pcb->state != LISTEN) ? pcb->remote_port : 0);
2713 * The PCBs for listening sockets are actually smaller. Thus, for
2714 * listening sockets, do not attempt to access any of the fields beyond
2715 * those provided in the smaller structure.
2717 if (pcb->state == LISTEN) {
2718 assert(tcp != NULL);
2719 ki->ki_refs =
2720 (uint64_t)(uintptr_t)TAILQ_FIRST(&tcp->tcp_queue.tq_head);
2721 } else {
2722 if (tcp_nagle_disabled(pcb))
2723 ki->ki_tflags |= NETBSD_TF_NODELAY;
2725 if (tcp != NULL) {
2726 ki->ki_rcvq = tcp->tcp_rcv.tr_len;
2727 ki->ki_sndq = tcp->tcp_snd.ts_len;
2729 if (tcp->tcp_listener != NULL)
2730 ki->ki_nextref = (uint64_t)(uintptr_t)
2731 TAILQ_NEXT(tcp, tcp_queue.tq_next);
2737 * Given either NULL or a previously returned TCP PCB pointer, return the first
2738	 * or next TCP PCB pointer, or NULL if there are no more.  The current
2739	 * implementation supports only one iteration at a time.
2741 static const void *
2742 tcpsock_enum(const void * last)
2744 static struct {
2745 unsigned int i;
2746 const struct tcp_pcb *pcb;
2747 } iter;
2749 if (last != NULL && (iter.pcb = iter.pcb->next) != NULL)
2750 return (const void *)iter.pcb;
2752 for (iter.i = (last != NULL) ? iter.i + 1 : 0;
2753 iter.i < __arraycount(tcp_pcb_lists); iter.i++) {
2754 if ((iter.pcb = *tcp_pcb_lists[iter.i]) != NULL)
2755 return (const void *)iter.pcb;
2758 return NULL;
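/*
 * A minimal sketch (hypothetical, not part of this module) of the intended
 * calling pattern for the enumeration function above, as performed through
 * util_pcblist(): pass NULL to start, then feed each returned pointer back
 * in.  Only one such walk may be in progress at a time.
 */
#if 0
#include <string.h>

static void
example_walk_pcbs(void)
{
	struct kinfo_pcb ki;
	const void *ptr;

	for (ptr = tcpsock_enum(NULL); ptr != NULL; ptr = tcpsock_enum(ptr)) {
		memset(&ki, 0, sizeof(ki));

		tcpsock_get_info(&ki, ptr);
	}
}
#endif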
2762 * Obtain the list of TCP protocol control blocks, for sysctl(7).
2764 static ssize_t
2765 tcpsock_pcblist(struct rmib_call * call, struct rmib_node * node __unused,
2766 struct rmib_oldp * oldp, struct rmib_newp * newp __unused)
2769 return util_pcblist(call, oldp, tcpsock_enum, tcpsock_get_info);
2772 static const struct sockevent_ops tcpsock_ops = {
2773 .sop_bind = tcpsock_bind,
2774 .sop_listen = tcpsock_listen,
2775 .sop_connect = tcpsock_connect,
2776 .sop_accept = tcpsock_accept,
2777 .sop_test_accept = tcpsock_test_accept,
2778 .sop_pre_send = tcpsock_pre_send,
2779 .sop_send = tcpsock_send,
2780 .sop_test_send = tcpsock_test_send,
2781 .sop_pre_recv = tcpsock_pre_recv,
2782 .sop_recv = tcpsock_recv,
2783 .sop_test_recv = tcpsock_test_recv,
2784 .sop_ioctl = ifconf_ioctl,
2785 .sop_setsockmask = tcpsock_setsockmask,
2786 .sop_setsockopt = tcpsock_setsockopt,
2787 .sop_getsockopt = tcpsock_getsockopt,
2788 .sop_getsockname = tcpsock_getsockname,
2789 .sop_getpeername = tcpsock_getpeername,
2790 .sop_shutdown = tcpsock_shutdown,
2791 .sop_close = tcpsock_close,
2792 .sop_free = tcpsock_free