/* LWIP service - tcpsock.c - TCP sockets */
/*
 * This module implements support for TCP sockets based on lwIP's core TCP
 * PCB module, which is largely but not fully cooperative with exactly what
 * we want to achieve, with the result that this module is rather
 * complicated.
 *
 * Each socket has a send queue and a receive queue. Both use lwIP's own
 * (pbuf) buffers, which largely come out of the main 512-byte buffer pool.
 * The buffers on the send queue are allocated and freed by us--the latter
 * only once they are no longer in use by lwIP as well. A bit
 * counterintuitively, we deliberately use a smaller lwIP per-PCB TCP send
 * buffer limit (TCP_SND_BUF) in the lwIP send configuration (lwipopts.h) in
 * order to more easily trigger conditions where we cannot enqueue data (or
 * the final FIN) right away. This way, we get to test the internal logic of
 * this module a lot more easily. The small lwIP send queue size should not
 * have any impact on performance, as our own per-socket send queues can be
 * much larger and we enqueue more of that on the lwIP PCB as soon as we can
 * in all cases.
 *
 * The receive queue consists of whatever buffers were given to us by lwIP,
 * but since those may be many buffers with small amounts of data each, we
 * perform fairly aggressive merging of consecutive buffers. The intended
 * result is that we waste no more than 50% of memory within the receive
 * queue. Merging requires memory copies, which makes it expensive, but we
 * do not configure lwIP with enough buffers to make running out of buffers
 * a non-issue, so this trade-off is necessary. Practical experience and
 * measurements of the merge policy will have to show whether and how the
 * current policy may be improved.
 *
 * As can be expected, the connection close semantics are by far the most
 * complicated part of this module. We attempt to get rid of the lwIP PCB as
 * soon as we can, letting lwIP take care of the TIME_WAIT state for example.
 * However, there are various conditions that have to be met before we can
 * forget about the PCB here--most importantly, that none of our sent data
 * blocks are still referenced by lwIP because they have not yet been sent
 * or acknowledged. We can only free the data blocks once lwIP is done with
 * them.
 *
 * We do consider the TCP state of lwIP's PCB, in order to avoid duplicating
 * full state tracking here. However, we do not look at a socket's TCP state
 * while in a lwIP-generated event for that socket, because the state may
 * not necessarily reflect the (correct or new) TCP state of the connection,
 * nor may the PCB be available--this is the case for error events. For
 * these reasons we use a few internal TCPF_ flags to perform partial state
 * tracking.
 *
 * More generally, we tend to access lwIP PCB fields directly only when
 * lwIP's own BSD API implementation does that too and there is no better
 * alternative. One example of this is the check to see if our FIN was
 * acknowledged, for SO_LINGER support. In terms of maintenance, our hope is
 * that if lwIP's API changes later, we can change our code to imitate
 * whatever lwIP's BSD API implementation does at that point.
 */
#include <sys/socketvar.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <netinet/ip_var.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_fsm.h>
/*
 * Unfortunately, NetBSD and lwIP have different definitions of a few
 * relevant preprocessor variables. Make sure we do not attempt to use the
 * NetBSD one where it matters. We do need one of the NetBSD definitions
 * though.
 */
static const unsigned int NETBSD_TF_NODELAY = TF_NODELAY;
#undef TF_NODELAY

#include "lwip/priv/tcp_priv.h"	/* for tcp_pcb_lists */
/*
 * The number of TCP sockets (NR_TCPSOCK) is defined in the lwIP
 * configuration.
 */
/*
 * We fully control the send buffer, so we can let its size be set to
 * whatever we want. The receive buffer is different: if it is smaller than
 * the window size, we may have to refuse data that lwIP hands us, at which
 * point more incoming data will cause lwIP to abort the TCP
 * connection--even aside from performance issues. Therefore, we must make
 * sure the receive buffer is no smaller than the TCP window at all times.
 */
#define TCP_SNDBUF_MIN	1		/* minimum TCP send buffer size */
#define TCP_SNDBUF_DEF	32768		/* default TCP send buffer size */
#define TCP_SNDBUF_MAX	131072		/* maximum TCP send buffer size */
#define TCP_RCVBUF_MIN	TCP_WND		/* minimum TCP receive buffer size */
#define TCP_RCVBUF_DEF	MAX(TCP_WND, 32768)  /* default TCP recv buffer size */
#define TCP_RCVBUF_MAX	MAX(TCP_WND, 131072) /* maximum TCP recv buffer size */
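/*
 * Worked example (illustration, with an assumed window size): if TCP_WND
 * were 32768, then TCP_RCVBUF_DEF = MAX(32768, 32768) = 32768 and
 * TCP_RCVBUF_MAX = MAX(32768, 131072) = 131072, so all receive buffer sizes
 * stay at or above the window size as required above.
 */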
/*
 * The total number of buffers that may be in use for TCP socket send queues.
 * The goal is to allow at least some progress to be made on receiving from
 * TCP sockets and on differently-typed sockets, at least as long as the LWIP
 * service can manage to allocate the memory it wants. For the case that it
 * does not, we can only reactively kill off TCP sockets and/or free enqueued
 * ethernet packets, neither of which is currently implemented (TODO).
 */
#define TCP_MAX_SENDBUFS	(mempool_max_buffers() * 3 / 4)
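/*
 * For example (illustration, not from the original source): if the memory
 * pool maxes out at 1024 buffers, TCP_MAX_SENDBUFS evaluates to 768, leaving
 * at least a quarter of the pool for receive-side pbufs and for other socket
 * types.
 */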
/* Polling intervals, in 500-millisecond units. */
#define TCP_POLL_REG_INTERVAL	10	/* interval for reattempting sends */
#define TCP_POLL_CLOSE_INTERVAL	1	/* interval while closing connection */
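/*
 * In other words, regular sockets are polled once every 10 * 500 ms = 5
 * seconds, while sockets that are being closed are polled every 500 ms.
 */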
static struct tcpsock {
	struct ipsock tcp_ipsock;		/* IP socket, MUST be first */
	struct tcp_pcb *tcp_pcb;		/* lwIP TCP control block */
	union pxfer_tcp_queue {			/* free/accept queue */
		TAILQ_ENTRY(tcpsock) tq_next;	/* next in queue */
		TAILQ_HEAD(, tcpsock) tq_head;	/* head of queue */
	} tcp_queue;
	struct tcpsock *tcp_listener;		/* listener if on accept q. */
	struct {				/* send queue */
		struct pbuf *ts_head;		/* first pbuf w/unacked data */
		struct pbuf *ts_unsent;		/* first pbuf w/unsent data */
		struct pbuf *ts_tail;		/* most recently added data */
		size_t ts_len;			/* total sent + unsent */
		unsigned short ts_head_off;	/* offset into head pbuf */
		unsigned short ts_unsent_off;	/* offset into unsent pbuf */
	} tcp_snd;
	struct {				/* receive queue */
		struct pbuf *tr_head;		/* first pbuf w/unrecvd data */
		struct pbuf **tr_pre_tailp;	/* ptr-ptr to newest pbuf */
		size_t tr_len;			/* bytes on receive queue */
		unsigned short tr_head_off;	/* offset into head pbuf */
		unsigned short tr_unacked;	/* current window reduction */
	} tcp_rcv;
} tcp_array[NR_TCPSOCK];
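/*
 * Note that tcp_queue can be a union because its two fields are never in use
 * at the same time: tq_next links a socket that is on the free list or on a
 * listening socket's accept queue, while tq_head is used only by a listening
 * socket to maintain that accept queue.
 */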
static TAILQ_HEAD(, tcpsock) tcp_freelist;	/* list of free TCP sockets */

static const struct sockevent_ops tcpsock_ops;

static unsigned int tcpsock_sendbufs;		/* # send buffers in use */
static unsigned int tcpsock_recvbufs;		/* # receive buffers in use */
/* A bunch of macros that are just for convenience. */
#define tcpsock_get_id(tcp)	(SOCKID_TCP | (sockid_t)((tcp) - tcp_array))
#define tcpsock_get_ipsock(tcp)	(&(tcp)->tcp_ipsock)
#define tcpsock_get_sock(tcp)	(ipsock_get_sock(tcpsock_get_ipsock(tcp)))
#define tcpsock_get_sndbuf(tcp)	(ipsock_get_sndbuf(tcpsock_get_ipsock(tcp)))
#define tcpsock_get_rcvbuf(tcp)	(ipsock_get_rcvbuf(tcpsock_get_ipsock(tcp)))
#define tcpsock_is_ipv6(tcp)	(ipsock_is_ipv6(tcpsock_get_ipsock(tcp)))
#define tcpsock_is_shutdown(tcp,fl)	\
	(sockevent_is_shutdown(tcpsock_get_sock(tcp), fl))
#define tcpsock_is_listening(tcp)	\
	(sockevent_is_listening(tcpsock_get_sock(tcp)))
#define tcpsock_get_flags(tcp)	(ipsock_get_flags(tcpsock_get_ipsock(tcp)))
#define tcpsock_set_flag(tcp,fl)	\
	(ipsock_set_flag(tcpsock_get_ipsock(tcp), fl))
#define tcpsock_clear_flag(tcp,fl)	\
	(ipsock_clear_flag(tcpsock_get_ipsock(tcp), fl))
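/*
 * Illustration (not part of the original source): tcpsock_get_id() tags the
 * tcp_array slot number with SOCKID_TCP. Assuming that the tag occupies only
 * bits above the slot number, a hypothetical inverse lookup could be
 * sketched as follows.
 */
#if 0
static struct tcpsock *
tcpsock_from_id(sockid_t id)
{

	/* Hypothetical helper: strip the SOCKID_TCP tag to get the slot. */
	return &tcp_array[id & ~SOCKID_TCP];
}
#endif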
static ssize_t tcpsock_pcblist(struct rmib_call *, struct rmib_node *,
	struct rmib_oldp *, struct rmib_newp *);
/* The CTL_NET {PF_INET,PF_INET6} IPPROTO_TCP subtree. */
/* TODO: add many more and make some of them writable.. */
static struct rmib_node net_inet_tcp_table[] = {
/* 2*/	[TCPCTL_SENDSPACE]	= RMIB_INT(RMIB_RO, TCP_SNDBUF_DEF,
				    "Default TCP send buffer size"),
/* 3*/	[TCPCTL_RECVSPACE]	= RMIB_INT(RMIB_RO, TCP_RCVBUF_DEF,
				    "Default TCP receive buffer size"),
/*29*/	[TCPCTL_LOOPBACKCKSUM]	= RMIB_FUNC(RMIB_RW | CTLTYPE_INT, sizeof(int),
				    loopif_cksum, "do_loopback_cksum",
				    "Perform TCP checksum on loopback"),
/*+0*/	[TCPCTL_MAXID]		= RMIB_FUNC(RMIB_RO | CTLTYPE_NODE, 0,
				    tcpsock_pcblist, "pcblist",
				    "TCP protocol control block list"),
/*+1*/	[TCPCTL_MAXID + 1]	= RMIB_FUNC(RMIB_RW | CTLFLAG_PRIVATE |
				    CTLFLAG_HIDDEN | CTLTYPE_STRING,
				    TCPISN_SECRET_HEX_LENGTH, tcpisn_secret,
				    "TCP ISN secret (MINIX 3 specific)")
};
static struct rmib_node net_inet_tcp_node =
    RMIB_NODE(RMIB_RO, net_inet_tcp_table, "tcp", "TCP related settings");
static struct rmib_node net_inet6_tcp6_node =
    RMIB_NODE(RMIB_RO, net_inet_tcp_table, "tcp6", "TCP related settings");
/*
 * Initialize the TCP sockets module.
 */
void
tcpsock_init(void)
{
	unsigned int slot;

	/* Initialize the list of free TCP sockets. */
	TAILQ_INIT(&tcp_freelist);

	for (slot = 0; slot < __arraycount(tcp_array); slot++)
		TAILQ_INSERT_TAIL(&tcp_freelist, &tcp_array[slot],
		    tcp_queue.tq_next);

	/* Initialize other variables. */
	tcpsock_sendbufs = 0;
	tcpsock_recvbufs = 0;

	/* Register the net.inet.tcp and net.inet6.tcp6 RMIB subtrees. */
	mibtree_register_inet(PF_INET, IPPROTO_TCP, &net_inet_tcp_node);
	mibtree_register_inet(PF_INET6, IPPROTO_TCP, &net_inet6_tcp6_node);
}
/*
 * Initialize the state of a TCP socket's send queue.
 */
static void
tcpsock_reset_send(struct tcpsock * tcp)
{

	tcp->tcp_snd.ts_tail = NULL;
	tcp->tcp_snd.ts_unsent = NULL;
	tcp->tcp_snd.ts_head = NULL;
	tcp->tcp_snd.ts_len = 0;
	tcp->tcp_snd.ts_unsent_off = 0;
	tcp->tcp_snd.ts_head_off = 0;
}
/*
 * Initialize the state of a TCP socket's receive queue.
 */
static void
tcpsock_reset_recv(struct tcpsock * tcp)
{

	tcp->tcp_rcv.tr_pre_tailp = NULL;
	tcp->tcp_rcv.tr_head = NULL;
	tcp->tcp_rcv.tr_len = 0;
	tcp->tcp_rcv.tr_head_off = 0;
	tcp->tcp_rcv.tr_unacked = 0;
}
/*
 * Create a TCP socket.
 */
sockid_t
tcpsock_socket(int domain, int protocol, struct sock ** sockp,
	const struct sockevent_ops ** ops)
{
	struct tcpsock *tcp;
	uint8_t ip_type;

	switch (protocol) {
	case 0:
	case IPPROTO_TCP:
		break;
	default:
		return EPROTONOSUPPORT;
	}

	if (TAILQ_EMPTY(&tcp_freelist))
		return ENOBUFS;

	tcp = TAILQ_FIRST(&tcp_freelist);

	/*
	 * Initialize the structure. Do not memset it to zero, as it is still
	 * part of the linked free list. Initialization may still fail. When
	 * adding new fields, make sure to change tcpsock_clone() accordingly.
	 */

	ip_type = ipsock_socket(tcpsock_get_ipsock(tcp), domain,
	    TCP_SNDBUF_DEF, TCP_RCVBUF_DEF, sockp);

	if ((tcp->tcp_pcb = tcp_new_ip_type(ip_type)) == NULL)
		return ENOBUFS;

	tcp_arg(tcp->tcp_pcb, tcp);

	tcp->tcp_listener = NULL;

	tcpsock_reset_send(tcp);
	tcpsock_reset_recv(tcp);

	TAILQ_REMOVE(&tcp_freelist, tcp, tcp_queue.tq_next);

	*ops = &tcpsock_ops;

	return tcpsock_get_id(tcp);
}
/*
 * Create a TCP socket for the TCP PCB 'pcb' which identifies a new
 * connection incoming on listening socket 'listener'. The new socket is
 * essentially a "clone" of the listening TCP socket, in that it should
 * inherit any settings from the listening socket. The socket has not yet
 * been accepted by userland, so add it to the queue of connections pending
 * for the listening socket. On success, return OK. On failure, return a
 * negative error code.
 */
static int
tcpsock_clone(struct tcpsock * listener, struct tcp_pcb * pcb)
{
	struct tcpsock *tcp;

	if (TAILQ_EMPTY(&tcp_freelist))
		return ENOBUFS;

	tcp = TAILQ_FIRST(&tcp_freelist);

	/*
	 * Initialize the structure. Do not memset it to zero, as it is still
	 * part of the linked free list. Initialization may still fail. Most
	 * settings should be inherited from the listening socket here,
	 * rather than being initialized to their default state.
	 */

	ipsock_clone(tcpsock_get_ipsock(listener), tcpsock_get_ipsock(tcp),
	    tcpsock_get_id(tcp));

	tcp->tcp_pcb = pcb;
	tcp_arg(pcb, tcp);

	tcpsock_reset_send(tcp);
	tcpsock_reset_recv(tcp);

	/*
	 * Remove the new socket from the free list, and add it to the queue
	 * of the listening socket--in this order, because the same next
	 * pointer is used for both.
	 */
	TAILQ_REMOVE(&tcp_freelist, tcp, tcp_queue.tq_next);

	TAILQ_INSERT_TAIL(&listener->tcp_queue.tq_head, tcp,
	    tcp_queue.tq_next);

	tcp->tcp_listener = listener;

	return OK;
}
/*
 * Allocate a buffer from the pool, using the standard pool size. The
 * returned buffer is a single element--never a chain.
 */
static struct pbuf *
tcpsock_alloc_buf(void)
{
	struct pbuf *pbuf;

	pbuf = pbuf_alloc(PBUF_RAW, MEMPOOL_BUFSIZE, PBUF_RAM);

	assert(pbuf == NULL || pbuf->len == pbuf->tot_len);

	return pbuf;
}
/*
 * Free the given buffer. Ensure that pbuf_free() will not attempt to free
 * the next buffer(s) in the chain as well. This may be called for pbufs
 * other than those allocated with tcpsock_alloc_buf().
 */
static void
tcpsock_free_buf(struct pbuf * pbuf)
{

	/*
	 * Resetting the length is currently not necessary, but better safe
	 * than sorry..
	 */
	pbuf->len = pbuf->tot_len;
	pbuf->next = NULL;

	pbuf_free(pbuf);
}
/*
 * Clear the send queue of a TCP socket. The caller must ensure that lwIP
 * will no longer access any of the data on the send queue.
 */
static void
tcpsock_clear_send(struct tcpsock * tcp)
{
	struct pbuf *phead;

	assert(tcp->tcp_pcb == NULL);

	while ((phead = tcp->tcp_snd.ts_head) != NULL) {
		tcp->tcp_snd.ts_head = phead->next;

		assert(tcpsock_sendbufs > 0);
		tcpsock_sendbufs--;

		tcpsock_free_buf(phead);
	}

	tcpsock_reset_send(tcp);
}
/*
 * Clear the receive queue of a TCP socket. If 'ack_data' is set, also
 * acknowledge the previous contents of the receive queue to lwIP.
 */
static size_t
tcpsock_clear_recv(struct tcpsock * tcp, int ack_data)
{
	struct pbuf *phead;
	size_t rlen;

	rlen = tcp->tcp_rcv.tr_len;

	while ((phead = tcp->tcp_rcv.tr_head) != NULL) {
		tcp->tcp_rcv.tr_head = phead->next;

		assert(tcpsock_recvbufs > 0);
		tcpsock_recvbufs--;

		tcpsock_free_buf(phead);
	}

	/*
	 * From now on, we will basically be discarding incoming data as fast
	 * as possible, to keep the full window open at all times.
	 */
	if (ack_data && tcp->tcp_pcb != NULL && tcp->tcp_rcv.tr_unacked > 0)
		tcp_recved(tcp->tcp_pcb, tcp->tcp_rcv.tr_unacked);

	tcpsock_reset_recv(tcp);

	return rlen;
}
/*
 * The TCP socket's PCB has been detached from the socket, typically because
 * the connection was aborted, either by us or by lwIP. Either way, any TCP
 * connection is gone. Clear the socket's send queue, remove the socket from
 * a listening socket's queue, and if the socket itself is ready and allowed
 * to be freed, free it now. The socket is ready to be freed if it was
 * either on a listening queue or being closed already. The socket is
 * allowed to be freed only if 'may_free' is TRUE. If the socket is not
 * freed, its receive queue is left as is, as it may still have data to be
 * received by userland.
 */
static int
tcpsock_cleanup(struct tcpsock * tcp, int may_free)
{
	int destroy;

	assert(tcp->tcp_pcb == NULL);

	/*
	 * Free any data on the send queue. This is safe to do right now,
	 * because the PCB has been aborted (or was already gone). We must be
	 * very careful about clearing the send queue in all other
	 * situations.
	 */
	tcpsock_clear_send(tcp);

	/*
	 * If this was a socket pending acceptance, remove it from the
	 * corresponding listener socket's queue, and free it. Otherwise,
	 * free the socket only if it suspended a graceful close operation.
	 */
	if (tcp->tcp_listener != NULL) {
		TAILQ_REMOVE(&tcp->tcp_listener->tcp_queue.tq_head, tcp,
		    tcp_queue.tq_next);
		tcp->tcp_listener = NULL;

		/*
		 * The listener socket's backlog count should be adjusted by
		 * lwIP whenever the PCB is freed up, so we need (and must)
		 * not attempt to do that here.
		 */

		destroy = TRUE;
	} else
		destroy = sockevent_is_closing(tcpsock_get_sock(tcp));

	/*
	 * Do not free the socket if 'may_free' is FALSE. That flag may be
	 * set if we are currently in the second tcpsock_close() call on the
	 * socket, in which case sockevent_is_closing() is TRUE but we must
	 * still not free the socket now: doing so would derail libsockevent.
	 */
	if (destroy && may_free) {
		(void)tcpsock_clear_recv(tcp, FALSE /*ack_data*/);

		sockevent_raise(tcpsock_get_sock(tcp), SEV_CLOSE);
	}

	return destroy;
}
/*
 * Abort the lwIP PCB for the given socket, using tcp_abort(). If the PCB is
 * connected, this will cause the connection to be reset. The PCB, which
 * must have still been present before the call, will be gone after the
 * call.
 */
static void
tcpsock_pcb_abort(struct tcpsock * tcp)
{

	assert(tcp->tcp_pcb != NULL);
	assert(!tcpsock_is_listening(tcp));

	tcp_recv(tcp->tcp_pcb, NULL);
	tcp_sent(tcp->tcp_pcb, NULL);
	tcp_err(tcp->tcp_pcb, NULL);
	tcp_poll(tcp->tcp_pcb, NULL, TCP_POLL_REG_INTERVAL);

	tcp_arg(tcp->tcp_pcb, NULL);

	tcp_abort(tcp->tcp_pcb);

	tcp->tcp_pcb = NULL;
}
/*
 * Close the lwIP PCB for the given socket, using tcp_close(). If the PCB is
 * connected, its graceful close will be finished by lwIP in the background.
 * The PCB, which must have still been present before the call, will be gone
 * after the call.
 */
static void
tcpsock_pcb_close(struct tcpsock * tcp)
{
	err_t err;

	assert(tcp->tcp_pcb != NULL);
	assert(tcp->tcp_snd.ts_len == 0);

	if (!tcpsock_is_listening(tcp)) {
		tcp_recv(tcp->tcp_pcb, NULL);
		tcp_sent(tcp->tcp_pcb, NULL);
		tcp_err(tcp->tcp_pcb, NULL);
		tcp_poll(tcp->tcp_pcb, NULL, TCP_POLL_REG_INTERVAL);
	}

	tcp_arg(tcp->tcp_pcb, NULL);

	if ((err = tcp_close(tcp->tcp_pcb)) != ERR_OK)
		panic("unexpected TCP close failure: %d", err);

	tcp->tcp_pcb = NULL;
}
/*
 * Return TRUE if all conditions are met for closing the TCP socket's PCB,
 * or FALSE if they are not. Upon calling this function, the socket's PCB
 * must still be around.
 */
static int
tcpsock_may_close(struct tcpsock * tcp)
{

	assert(tcp->tcp_pcb != NULL);

	/*
	 * Regular closing of the PCB requires three conditions to be met:
	 *
	 * 1. all our data has been transmitted AND acknowledged, so that we
	 *    do not risk corruption in case there are still unsent or
	 *    unack'ed data buffers that may otherwise be recycled too soon;
	 * 2. we have sent our FIN to the peer; and,
	 * 3. we have received a FIN from the peer.
	 */
	return ((tcpsock_get_flags(tcp) & (TCPF_SENT_FIN | TCPF_RCVD_FIN)) ==
	    (TCPF_SENT_FIN | TCPF_RCVD_FIN) && tcp->tcp_snd.ts_len == 0);
}
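/*
 * For example, a socket that has called shutdown(SHUT_WR) and has already
 * received the peer's FIN, but that still has unacknowledged data on its
 * send queue (ts_len > 0), does not yet meet condition 1 and must keep its
 * PCB around until the "sent" event below drains the queue.
 */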
/*
 * The given socket is ready to be closed as per the tcpsock_may_close()
 * rules. This implies that its send queue is already empty. Gracefully
 * close the PCB. In addition, if the socket is being closed gracefully,
 * meaning we suspended an earlier tcpsock_close() call (and as such already
 * emptied the receive queue as well), then tell libsockevent that the close
 * is finished, freeing the socket. Return TRUE if the socket has indeed
 * been freed this way, or FALSE if the socket is still around.
 */
static int
tcpsock_finish_close(struct tcpsock * tcp)
{

	assert(tcp->tcp_snd.ts_len == 0);
	assert(tcp->tcp_listener == NULL);

	/*
	 * If we get here, we have already shut down the sending side of the
	 * PCB. Technically, we are interested only in shutting down the
	 * receiving side of the PCB here, so that lwIP may decide to recycle
	 * the socket later etcetera. We call tcp_close() because we do not
	 * want to rely on tcp_shutdown(RX) doing the exact same thing.
	 * However, we do rely on the fact that the PCB is not immediately
	 * destroyed by the tcp_close() call: otherwise we may have to return
	 * ERR_ABRT if this function is called from a lwIP-generated event.
	 */
	tcpsock_pcb_close(tcp);

	/*
	 * If we suspended an earlier tcpsock_close() call, we have to tell
	 * libsockevent that the close operation is now complete.
	 */
	if (sockevent_is_closing(tcpsock_get_sock(tcp))) {
		assert(tcp->tcp_rcv.tr_len == 0);

		sockevent_raise(tcpsock_get_sock(tcp), SEV_CLOSE);

		return TRUE;
	} else
		return FALSE;
}
/*
 * Attempt to start or resume enqueuing data and/or a FIN to send on the
 * given TCP socket. Return TRUE if anything at all could be newly enqueued
 * on the lwIP PCB, even if less than desired. In that case, the caller
 * should try to send whatever was enqueued, and if applicable, check if the
 * socket may now be closed (due to the FIN being enqueued). In particular,
 * in any situation where the socket may be in the process of being closed,
 * the caller must use tcpsock_may_close() if TRUE is returned. Return FALSE
 * if nothing new could be enqueued, in which case no send attempt needs to
 * be made either.
 */
static int
tcpsock_pcb_enqueue(struct tcpsock * tcp)
{
	struct pbuf *punsent;
	size_t space, chunk;
	unsigned int flags;
	err_t err;
	int enqueued;

	assert(tcp->tcp_pcb != NULL);

	if (tcpsock_get_flags(tcp) & TCPF_FULL)
		return FALSE;

	/*
	 * Attempt to enqueue more unsent data, if any, on the PCB's send
	 * queue.
	 */
	enqueued = FALSE;

	while (tcp->tcp_snd.ts_unsent != NULL) {
		if ((space = tcp_sndbuf(tcp->tcp_pcb)) == 0)
			break;

		/*
		 * We may maintain a non-NULL unsent pointer even when there
		 * is nothing more to send right now, because the tail buffer
		 * may be filled up further later on.
		 */
		punsent = tcp->tcp_snd.ts_unsent;

		assert(punsent->len >= tcp->tcp_snd.ts_unsent_off);

		chunk = (size_t)punsent->len - tcp->tcp_snd.ts_unsent_off;
		if (chunk == 0)
			break;

		if (chunk > space)
			chunk = space;

		/* Try to enqueue more data for sending. */
		if (chunk < punsent->len || punsent->next != NULL)
			flags = TCP_WRITE_FLAG_MORE;
		else
			flags = 0;

		err = tcp_write(tcp->tcp_pcb, (char *)punsent->payload +
		    tcp->tcp_snd.ts_unsent_off, chunk, flags);

		/*
		 * Since tcp_write() enqueues data only, it should only
		 * return out-of-memory errors; no fatal ones. In any case,
		 * stop.
		 */
		if (err != ERR_OK) {
			assert(err == ERR_MEM);

			break;
		}

		/* We have successfully enqueued data. */
		enqueued = TRUE;

		tcp->tcp_snd.ts_unsent_off += chunk;

		if (tcp->tcp_snd.ts_unsent_off < punsent->tot_len) {
			assert(tcp->tcp_snd.ts_unsent_off < punsent->len ||
			    punsent->next == NULL);

			break;
		}

		tcp->tcp_snd.ts_unsent = punsent->next;
		tcp->tcp_snd.ts_unsent_off = 0;
	}

	/*
	 * If all pending data has been enqueued for sending, and we should
	 * shut down the sending end of the socket, try that now.
	 */
	if ((tcp->tcp_snd.ts_unsent == NULL ||
	    tcp->tcp_snd.ts_unsent_off == tcp->tcp_snd.ts_unsent->len) &&
	    tcpsock_is_shutdown(tcp, SFL_SHUT_WR) &&
	    !(tcpsock_get_flags(tcp) & TCPF_SENT_FIN)) {
		err = tcp_shutdown(tcp->tcp_pcb, 0 /*shut_rx*/, 1 /*shut_tx*/);

		if (err == ERR_OK) {
			/*
			 * We have successfully enqueued a FIN. The caller is
			 * now responsible for checking whether the PCB and
			 * possibly even the socket object can now be freed.
			 */
			tcpsock_set_flag(tcp, TCPF_SENT_FIN);

			enqueued = TRUE;
		} else {
			assert(err == ERR_MEM);

			/*
			 * FIXME: the resolution for lwIP bug #47485 has
			 * taken away even more control over the closing
			 * process from us, making tracking sockets
			 * especially for SO_LINGER even harder. For now, we
			 * simply effectively undo the patch by clearing
			 * TF_CLOSEPEND if tcp_shutdown() returns ERR_MEM.
			 * This will not be sustainable in the long term,
			 * though.
			 */
			tcp->tcp_pcb->flags &= ~TF_CLOSEPEND;

			tcpsock_set_flag(tcp, TCPF_FULL);
		}
	}

	return enqueued;
}
/*
 * Request lwIP to start sending any enqueued data and/or FIN on the TCP
 * socket's lwIP PCB. On success, return OK. On failure, return a negative
 * error code, after cleaning up the socket, freeing the PCB. If the socket
 * was already being closed, also free the socket object in that case; the
 * caller must then not touch the socket object anymore upon return. If the
 * socket object is not freed, and if 'raise_error' is TRUE, raise the error
 * on the socket object.
 */
static int
tcpsock_pcb_send(struct tcpsock * tcp, int raise_error)
{
	err_t err;
	int r;

	assert(tcp->tcp_pcb != NULL);

	/*
	 * If we have enqueued something, ask lwIP to send TCP packets now.
	 * This may result in a fatal error, in which case we clean up the
	 * socket and return the error to the caller. Since cleaning up the
	 * socket may free the socket object, and the caller cannot tell
	 * whether that will happen or has happened, also possibly raise the
	 * error on the socket object if it is not gone. As such, callers
	 * that set 'raise_error' to FALSE must know for sure that the socket
	 * was not being closed, for example because the caller is processing
	 * a (send) call from userland.
	 */
	err = tcp_output(tcp->tcp_pcb);

	if (err != ERR_OK && err != ERR_MEM) {
		tcpsock_pcb_abort(tcp);

		r = util_convert_err(err);

		if (!tcpsock_cleanup(tcp, TRUE /*may_free*/)) {
			if (raise_error)
				sockevent_set_error(tcpsock_get_sock(tcp), r);
		}
		/* Otherwise, do not touch the socket object anymore! */

		return r;
	} else
		return OK;
}
/*
 * Callback from lwIP. The given number of data bytes have been acknowledged
 * as received by the remote end. Dequeue and free data from the TCP
 * socket's send queue as appropriate.
 */
static err_t
tcpsock_event_sent(void * arg, struct tcp_pcb * pcb __unused, uint16_t len)
{
	struct tcpsock *tcp = (struct tcpsock *)arg;
	struct pbuf *phead;
	size_t left;

	assert(tcp != NULL);
	assert(pcb == tcp->tcp_pcb);

	assert(tcp->tcp_snd.ts_len >= len);
	assert(tcp->tcp_snd.ts_head != NULL);

	left = (size_t)len;

	/*
	 * First see if we can free up whole buffers. Check against the head
	 * buffer's 'len' rather than 'tot_len', or we may end up leaving an
	 * empty buffer on the chain.
	 */
	while ((phead = tcp->tcp_snd.ts_head) != NULL &&
	    left >= (size_t)phead->len - tcp->tcp_snd.ts_head_off) {
		left -= (size_t)phead->len - tcp->tcp_snd.ts_head_off;

		tcp->tcp_snd.ts_head = phead->next;
		tcp->tcp_snd.ts_head_off = 0;

		if (phead == tcp->tcp_snd.ts_unsent) {
			assert(tcp->tcp_snd.ts_unsent_off == phead->len);

			tcp->tcp_snd.ts_unsent = phead->next;
			tcp->tcp_snd.ts_unsent_off = 0;
		}

		assert(tcpsock_sendbufs > 0);
		tcpsock_sendbufs--;

		tcpsock_free_buf(phead);
	}

	/*
	 * The rest of the given length is for less than the current head
	 * buffer.
	 */
	if (left > 0) {
		assert(tcp->tcp_snd.ts_head != NULL);
		assert((size_t)tcp->tcp_snd.ts_head->len -
		    tcp->tcp_snd.ts_head_off > left);

		tcp->tcp_snd.ts_head_off += left;
	}

	tcp->tcp_snd.ts_len -= (size_t)len;

	if (tcp->tcp_snd.ts_head == NULL) {
		assert(tcp->tcp_snd.ts_len == 0);
		assert(tcp->tcp_snd.ts_unsent == NULL);
		tcp->tcp_snd.ts_tail = NULL;
	} else
		assert(tcp->tcp_snd.ts_len > 0);

	/*
	 * If we emptied the send queue, and we already managed to send a FIN
	 * earlier, we may now have met all requirements to close the
	 * socket's PCB. Otherwise, we may also be able to send more now, so
	 * try to resume sending. Since we are invoked from the "sent" event,
	 * tcp_output() will not actually process anything, and so we do not
	 * call it either. If we did, we would have to deal with errors here.
	 */
	if (tcpsock_may_close(tcp)) {
		if (tcpsock_finish_close(tcp))
			return ERR_OK;
	} else {
		tcpsock_clear_flag(tcp, TCPF_FULL);

		/*
		 * If we now manage to enqueue a FIN, we may be ready to close
		 * the PCB after all.
		 */
		if (tcpsock_pcb_enqueue(tcp)) {
			if (tcpsock_may_close(tcp) &&
			    tcpsock_finish_close(tcp))
				return ERR_OK;
		}
	}

	/* The user may also be able to send more now. */
	sockevent_raise(tcpsock_get_sock(tcp), SEV_SEND);

	return ERR_OK;
}
/*
 * Check whether any (additional) data previously received on a TCP socket
 * should be acknowledged, possibly allowing the remote end to send
 * additional data as a result.
 */
static void
tcpsock_ack_recv(struct tcpsock * tcp)
{
	size_t rcvbuf, left, delta, ack;

	assert(tcp->tcp_pcb != NULL);

	/*
	 * We must make sure that at all times, we can still add an entire
	 * window's worth of data to the receive queue. If the amount of free
	 * space drops below that threshold, we stop acknowledging received
	 * data. The user may change the receive buffer size at all times; we
	 * update the window size lazily as appropriate.
	 */
	rcvbuf = tcpsock_get_rcvbuf(tcp);

	if (rcvbuf > tcp->tcp_rcv.tr_len && tcp->tcp_rcv.tr_unacked > 0) {
		/*
		 * The number of bytes that lwIP can still give us at any
		 * time is represented as 'left'. The number of bytes that we
		 * still allow to be stored in the receive queue is
		 * represented as 'delta'. We must make sure that 'left' does
		 * not ever exceed 'delta' while acknowledging as many bytes
		 * as possible under that rule.
		 */
		left = TCP_WND - tcp->tcp_rcv.tr_unacked;
		delta = rcvbuf - tcp->tcp_rcv.tr_len;

		if (left < delta) {
			ack = delta - left;

			if (ack > tcp->tcp_rcv.tr_unacked)
				ack = tcp->tcp_rcv.tr_unacked;

			tcp_recved(tcp->tcp_pcb, ack);

			tcp->tcp_rcv.tr_unacked -= ack;

			assert(tcp->tcp_rcv.tr_len + TCP_WND -
			    tcp->tcp_rcv.tr_unacked <= rcvbuf);
		}
	}
}
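/*
 * Worked example (illustration, with assumed numbers): say TCP_WND is
 * 32768, the receive buffer size is 65536, tr_len is 40000 and tr_unacked
 * is 8192. Then left = 32768 - 8192 = 24576 and delta = 65536 - 40000 =
 * 25536, so at most 25536 - 24576 = 960 bytes may be acknowledged: after
 * the tcp_recved() call, lwIP may again hand us a full window's worth of
 * data without overrunning the receive buffer.
 */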
/*
 * Attempt to merge two consecutive underfilled buffers in the receive queue
 * of a TCP socket, freeing up one of the two buffers as a result. The first
 * (oldest) buffer is 'ptail', and the pointer to this buffer is stored at
 * 'pnext'. The second (new) buffer is 'pbuf', which is already attached to
 * the first buffer. The second buffer may be followed by additional buffers
 * with even more new data. Return TRUE if buffers have been merged, in
 * which case the pointer at 'pnext' may have changed, and no assumptions
 * should be made about whether 'ptail' and 'pbuf' still exist in any form.
 * Return FALSE if no merging was necessary or if no new buffer could be
 * allocated.
 */
static int
tcpsock_try_merge(struct pbuf **pnext, struct pbuf * ptail,
	struct pbuf * pbuf)
{
	struct pbuf *pnew;

	assert(*pnext == ptail);
	assert(ptail->next == pbuf);

	/*
	 * Unfortunately, we cannot figure out what kind of pbuf we were
	 * given by the lower layers, so we cannot merge two buffers without
	 * first allocating a third. Once we have done that, though, we can
	 * easily merge more into that new buffer. For now we use the
	 * following policy:
	 *
	 * 1. if two consecutive lwIP-provided buffers are both used less
	 *    than half the size of a full buffer, try to allocate a new
	 *    buffer and copy both lwIP-provided buffers into that new
	 *    buffer, freeing up the pair afterwards;
	 * 2. if the tail buffer on the chain is allocated by us and not yet
	 *    full, and the next buffer's contents can be added to the tail
	 *    buffer in their entirety, do just that.
	 *
	 * Obviously there is a trade-off between the performance overhead of
	 * copying and the resource overhead of keeping less-than-full
	 * buffers on the receive queue, but this policy should both keep
	 * actual memory usage to no more than twice the receive queue length
	 * and prevent excessive copying. The policy deliberately performs
	 * more aggressive merging into a buffer that we allocated ourselves.
	 */
	if (ptail->tot_len <= MEMPOOL_BUFSIZE / 2 &&
	    pbuf->len <= MEMPOOL_BUFSIZE / 2) {
		/* Case #1. */
		assert(ptail->tot_len == ptail->len);
		assert(pbuf->tot_len == pbuf->len);

		pnew = tcpsock_alloc_buf();
		if (pnew == NULL)
			return FALSE;

		memcpy(pnew->payload, ptail->payload, ptail->len);
		memcpy((char *)pnew->payload + ptail->len, pbuf->payload,
		    pbuf->len);
		pnew->len = ptail->len + pbuf->len;
		assert(pnew->len <= pnew->tot_len);

		pnew->next = pbuf->next;
		/* For now, we need not inherit any flags from either pbuf. */

		*pnext = pnew;

		/* One allocated, two about to be deallocated. */
		assert(tcpsock_recvbufs > 0);
		tcpsock_recvbufs--;

		tcpsock_free_buf(ptail);
		tcpsock_free_buf(pbuf);

		return TRUE;
	} else if (ptail->tot_len - ptail->len >= pbuf->len) {
		/* Case #2. */
		memcpy((char *)ptail->payload + ptail->len, pbuf->payload,
		    pbuf->len);

		ptail->len += pbuf->len;

		ptail->next = pbuf->next;

		assert(tcpsock_recvbufs > 0);
		tcpsock_recvbufs--;

		tcpsock_free_buf(pbuf);

		return TRUE;
	} else
		return FALSE;
}
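/*
 * Worked example (illustration): with 512-byte pool buffers, a 100-byte
 * lwIP-provided tail buffer followed by a 150-byte lwIP-provided buffer
 * falls under case #1: both use at most MEMPOOL_BUFSIZE / 2 = 256 bytes, so
 * both are copied into one newly allocated buffer now holding 250 bytes. If
 * a 200-byte buffer arrives next, case #2 applies: the new tail buffer
 * still has 512 - 250 = 262 bytes of room, so the 200 bytes are appended to
 * it in place and the source buffer is freed.
 */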
/*
 * Callback from lwIP. New data or flags have been received on a TCP socket.
 */
static err_t
tcpsock_event_recv(void * arg, struct tcp_pcb * pcb __unused,
	struct pbuf * pbuf, err_t err)
{
	struct tcpsock *tcp = (struct tcpsock *)arg;
	struct pbuf *ptail, **pprevp;
	size_t len;

	assert(tcp != NULL);
	assert(pcb == tcp->tcp_pcb);

	/*
	 * lwIP should never provide anything other than ERR_OK in 'err', and
	 * it is not clear what we should do if it would. If lwIP ever
	 * changes in this regard, we will likely have to change this code
	 * accordingly.
	 */
	if (err != ERR_OK)
		panic("TCP receive event with error: %d", err);

	/* If the given buffer is NULL, we have received a FIN. */
	if (pbuf == NULL) {
		tcpsock_set_flag(tcp, TCPF_RCVD_FIN);

		/* Userland may now receive EOF. */
		if (!tcpsock_is_shutdown(tcp, SFL_SHUT_RD))
			sockevent_raise(tcpsock_get_sock(tcp), SEV_RECV);

		/*
		 * If we were in the process of closing the socket, and we
		 * receive a FIN before our FIN got acknowledged, we close
		 * the socket anyway, as described in tcpsock_close().
		 * However, if there is still unacknowledged outgoing data or
		 * we did not even manage to send our FIN yet, hold off
		 * closing the socket for now.
		 */
		if (tcpsock_may_close(tcp))
			(void)tcpsock_finish_close(tcp);

		return ERR_OK;
	}

	/*
	 * If the socket is being closed, receiving new data should cause a
	 * reset.
	 */
	if (sockevent_is_closing(tcpsock_get_sock(tcp))) {
		tcpsock_pcb_abort(tcp);

		(void)tcpsock_cleanup(tcp, TRUE /*may_free*/);
		/* Do not touch the socket object anymore! */

		pbuf_free(pbuf);

		return ERR_ABRT;
	}

	/*
	 * If the socket has already been shut down for reading, discard the
	 * incoming data and do nothing else.
	 */
	if (tcpsock_is_shutdown(tcp, SFL_SHUT_RD)) {
		tcp_recved(tcp->tcp_pcb, pbuf->tot_len);

		pbuf_free(pbuf);

		return ERR_OK;
	}

	/*
	 * We deliberately ignore the PBUF_FLAG_PUSH flag. This flag would
	 * enable the receive functionality to delay delivering "un-pushed"
	 * data to applications. The implementation of this scheme could
	 * track the amount of data up to and including the last-pushed
	 * segment using a "tr_push_len" field or so. Deciding when to
	 * deliver "un-pushed" data after all is a bit trickier though. As
	 * far as I can tell, the BSDs do not implement anything like that.
	 * Windows does, and this results in interaction problems with even
	 * more lightweight TCP/IP stacks that do not send the TCP PSH flag.
	 * Currently, there is no obvious benefit for us to support delaying
	 * data delivery like that. In addition, testing its implementation
	 * reliably would be difficult.
	 */

	len = (size_t)pbuf->tot_len;

	/*
	 * Count the number of buffers that are now owned by us. The new
	 * total of buffers owned by us must not exceed the size of the
	 * memory pool. Any more would indicate an accounting error. Note
	 * that tcpsock_recvbufs is currently used for debugging only!
	 */
	tcpsock_recvbufs += pbuf_clen(pbuf);
	assert(tcpsock_recvbufs < mempool_cur_buffers());

	/*
	 * The pre-tail pointer points to whatever is pointing to the tail
	 * buffer. The latter pointer may be the 'tr_head' field in our
	 * tcpsock structure, or the 'next' field in the penultimate buffer,
	 * or NULL if there are currently no buffers on the receive queue.
	 */
	if ((pprevp = tcp->tcp_rcv.tr_pre_tailp) != NULL) {
		ptail = *pprevp;

		assert(ptail != NULL);
		assert(ptail->next == NULL);
		assert(tcp->tcp_rcv.tr_head != NULL);

		ptail->next = pbuf;
		pbuf->tot_len = pbuf->len;	/* to help freeing on merges */

		if (tcpsock_try_merge(pprevp, ptail, pbuf)) {
			ptail = *pprevp;
			pbuf = ptail->next;
		}

		if (pbuf != NULL)
			pprevp = &ptail->next;
	} else {
		assert(tcp->tcp_rcv.tr_head == NULL);
		assert(tcp->tcp_rcv.tr_head_off == 0);

		tcp->tcp_rcv.tr_head = pbuf;

		pprevp = &tcp->tcp_rcv.tr_head;
	}

	/*
	 * Chop up the chain into individual buffers. This is necessary as we
	 * overload 'tot_len' to mean "space available in the buffer", as we
	 * want for buffers allocated by us as part of buffer merges. Also
	 * get a pointer to the pointer to the new penultimate tail buffer.
	 * Due to merging, the chain may already be empty by now, though.
	 */
	if (pbuf != NULL) {
		for (; pbuf->next != NULL; pbuf = pbuf->next) {
			pbuf->tot_len = pbuf->len;

			pprevp = &pbuf->next;
		}
		assert(pbuf->len == pbuf->tot_len);
	}

	assert(*pprevp != NULL);
	assert((*pprevp)->next == NULL);
	tcp->tcp_rcv.tr_pre_tailp = pprevp;

	tcp->tcp_rcv.tr_len += len;
	tcp->tcp_rcv.tr_unacked += len;

	assert(tcp->tcp_rcv.tr_unacked <= TCP_WND);

	/*
	 * Note that tr_len may now exceed the receive buffer size in the
	 * highly exceptional case that the user is adjusting the latter
	 * after the socket had already received data.
	 */

	/* See if we can immediately acknowledge some or all of the data. */
	tcpsock_ack_recv(tcp);

	/* Also wake up any receivers now. */
	sockevent_raise(tcpsock_get_sock(tcp), SEV_RECV);

	return ERR_OK;
}
/*
 * Callback from lwIP. The PCB corresponding to the socket identified by
 * 'arg' has been closed by lwIP, with the reason specified in 'err': either
 * the connection has been aborted locally (ERR_ABRT), it has been reset by
 * the remote end (ERR_RST), or it is closed due to state transitions
 * (ERR_CLSD).
 */
static void
tcpsock_event_err(void * arg, err_t err)
{
	struct tcpsock *tcp = (struct tcpsock *)arg;
	int r;

	assert(tcp != NULL);
	assert(tcp->tcp_pcb != NULL);
	assert(err != ERR_OK);

	/* The original PCB is now gone, or will be shortly. */
	tcp->tcp_pcb = NULL;

	/*
	 * Clean up the socket. As a result it may be freed, in which case we
	 * must not touch it anymore. No need to return ERR_ABRT from here,
	 * as the PCB has been aborted already.
	 */
	if (tcpsock_cleanup(tcp, TRUE /*may_free*/))
		return;

	if (err == ERR_CLSD) {
		/*
		 * We may get here if the socket is shut down for writing and
		 * we already received a FIN from the remote side, thus
		 * putting the socket in LAST_ACK state, and we receive that
		 * last acknowledgment. There is nothing more we need to do.
		 *
		 * We will never get here in the other case that ERR_CLSD is
		 * raised, which is when the socket is reset because of
		 * unacknowledged data while closing: we handle the
		 * reset-on-ACK case ourselves in tcpsock_close(), and the
		 * socket is in closing state after that.
		 */
		assert(tcpsock_is_shutdown(tcp, SFL_SHUT_WR));
		assert(tcpsock_get_flags(tcp) & TCPF_RCVD_FIN);
	} else {
		/*
		 * Anything else should be an error directly from lwIP;
		 * currently either ERR_ABRT or ERR_RST. Convert it to a
		 * regular error and set it on the socket. Doing so will also
		 * raise the appropriate events.
		 *
		 * Unfortunately, lwIP is not throwing accurate errors even
		 * when it can. We convert some errors to reflect more
		 * accurately the most likely cause.
		 *
		 * TODO: fix lwIP in this regard..
		 */
		r = util_convert_err(err);

		if (tcpsock_get_flags(tcp) & TCPF_CONNECTING) {
			switch (err) {
			case ERR_ABRT:	r = ETIMEDOUT;		break;
			case ERR_RST:	r = ECONNREFUSED;	break;
			}
		}

		sockevent_set_error(tcpsock_get_sock(tcp), r);
	}
}
/*
 * Callback from lwIP. Perform regular checks on a TCP socket. This function
 * is called once per five seconds on connected sockets, and twice per
 * second on sockets that are being closed.
 */
static err_t
tcpsock_event_poll(void * arg, struct tcp_pcb * pcb __unused)
{
	struct tcpsock *tcp = (struct tcpsock *)arg;
	err_t err;
	int r;

	assert(tcp != NULL);
	assert(pcb == tcp->tcp_pcb);

	/*
	 * If we ended up running out of buffers earlier, try resuming any
	 * send requests now, both for enqueuing TCP data with lwIP and for
	 * user requests.
	 */
	if (tcpsock_get_flags(tcp) & (TCPF_FULL | TCPF_OOM)) {
		tcpsock_clear_flag(tcp, TCPF_FULL);
		tcpsock_clear_flag(tcp, TCPF_OOM);

		/* See if we can enqueue more data with lwIP. */
		if (tcpsock_pcb_enqueue(tcp)) {
			/* In some cases, we can now close the PCB. */
			if (tcpsock_may_close(tcp)) {
				(void)tcpsock_finish_close(tcp);
				/*
				 * The PCB is definitely gone here, and the
				 * entire socket object may be gone now too.
				 * Do not touch either anymore!
				 */
				return ERR_ABRT;
			}

			/*
			 * If actually sending the data fails, the PCB will
			 * be gone, and the socket object may be gone as
			 * well. Do not touch either anymore in that case!
			 */
			if (tcpsock_pcb_send(tcp, TRUE /*raise_error*/) != OK)
				return ERR_ABRT;
		}

		/*
		 * If we ran out of buffers earlier, it may be possible to
		 * take in more data from a user process now, even if we did
		 * not manage to enqueue any more pending data with lwIP.
		 */
		sockevent_raise(tcpsock_get_sock(tcp), SEV_SEND);

		assert(tcp->tcp_pcb != NULL);
	} else if (tcp->tcp_snd.ts_unsent != NULL &&
	    tcp->tcp_snd.ts_unsent_off < tcp->tcp_snd.ts_unsent->len) {
		/*
		 * If the send buffer is full, we will no longer call
		 * tcp_output(), which means we may also miss out on fatal
		 * errors that would otherwise kill the connection (e.g., no
		 * route). As a result, the connection may erroneously
		 * continue to exist for a long time. To avoid this, we call
		 * tcp_output() every once in a while when there are still
		 * unsent data.
		 */
		err = tcp_output(tcp->tcp_pcb);

		if (err != ERR_OK && err != ERR_MEM) {
			tcpsock_pcb_abort(tcp);

			if (!tcpsock_cleanup(tcp, TRUE /*may_free*/)) {
				r = util_convert_err(err);

				sockevent_set_error(tcpsock_get_sock(tcp), r);
			}
			/* Otherwise do not touch the socket object anymore! */

			return ERR_ABRT;
		}
	}

	/*
	 * If we are closing the socket, and we sent a FIN, see if the FIN
	 * got acknowledged. If so, finish closing the socket.
	 * Unfortunately, we can perform this check by polling only.
	 * TODO: change lwIP..
	 */
	if (sockevent_is_closing(tcpsock_get_sock(tcp)) &&
	    (tcpsock_get_flags(tcp) & TCPF_SENT_FIN) &&
	    tcp->tcp_pcb->unsent == NULL && tcp->tcp_pcb->unacked == NULL) {
		assert(tcp->tcp_snd.ts_len == 0);

		tcpsock_finish_close(tcp);
	}

	return ERR_OK;
}
/*
 * Bind a TCP socket to a local address.
 */
static int
tcpsock_bind(struct sock * sock, const struct sockaddr * addr,
	socklen_t addr_len, endpoint_t user_endpt)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	ip_addr_t ipaddr;
	uint16_t port;
	err_t err;
	int r;

	if (tcp->tcp_pcb == NULL || tcp->tcp_pcb->state != CLOSED)
		return EINVAL;

	if ((r = ipsock_get_src_addr(tcpsock_get_ipsock(tcp), addr, addr_len,
	    user_endpt, &tcp->tcp_pcb->local_ip, tcp->tcp_pcb->local_port,
	    FALSE /*allow_mcast*/, &ipaddr, &port)) != OK)
		return r;

	err = tcp_bind(tcp->tcp_pcb, &ipaddr, port);

	return util_convert_err(err);
}
/*
 * Callback from lwIP. A new connection 'pcb' has arrived on the listening
 * socket identified by 'arg'. Note that 'pcb' may be NULL in the case that
 * lwIP could not accept the connection itself.
 */
static err_t
tcpsock_event_accept(void * arg, struct tcp_pcb * pcb, err_t err)
{
	struct tcpsock *tcp = (struct tcpsock *)arg;

	assert(tcp != NULL);
	assert(tcpsock_is_listening(tcp));

	/*
	 * If the given PCB is NULL, then lwIP ran out of memory allocating a
	 * PCB for the new connection. There is nothing we can do with that
	 * information. Also check 'err' just to make sure.
	 */
	if (pcb == NULL || err != OK)
		return ERR_OK;

	/*
	 * The TCP socket is the listening socket, but the PCB is for the
	 * incoming connection.
	 */
	if (tcpsock_clone(tcp, pcb) != OK) {
		/*
		 * We could not allocate the resources necessary to accept
		 * the connection. Abort it immediately.
		 */
		tcp_abort(pcb);

		return ERR_ABRT;
	}

	/*
	 * The connection has not yet been accepted, and thus should still be
	 * considered on the listen queue.
	 */
	tcp_backlog_delayed(pcb);

	/* Set the callback functions. */
	tcp_recv(pcb, tcpsock_event_recv);
	tcp_sent(pcb, tcpsock_event_sent);
	tcp_err(pcb, tcpsock_event_err);
	tcp_poll(pcb, tcpsock_event_poll, TCP_POLL_REG_INTERVAL);

	sockevent_raise(tcpsock_get_sock(tcp), SEV_ACCEPT);

	return ERR_OK;
}
/*
 * Put a TCP socket in listening mode.
 */
static int
tcpsock_listen(struct sock * sock, int backlog)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	struct tcp_pcb *pcb;
	err_t err;

	/* The maximum backlog value must not exceed its field size. */
	assert(SOMAXCONN <= UINT8_MAX);

	/*
	 * Allow only CLOSED sockets to enter listening mode. If the socket
	 * was already in listening mode, allow its backlog value to be
	 * updated, even if it was shut down already (making this a no-op).
	 */
	if (!tcpsock_is_listening(tcp) &&
	    (tcp->tcp_pcb == NULL || tcp->tcp_pcb->state != CLOSED))
		return EINVAL;

	/*
	 * If the socket was not already in listening mode, put it in that
	 * mode now. That involves switching PCBs as lwIP attempts to save
	 * memory by replacing the original PCB with a smaller one. If the
	 * socket was already in listening mode, simply update its backlog
	 * value--this has no effect on the sockets already in the backlog.
	 */
	if (!tcpsock_is_listening(tcp)) {
		assert(tcp->tcp_pcb != NULL);

		/*
		 * If the socket has not been bound to a port yet, do that
		 * first. This does mean that the listen call may fail with
		 * side effects, but that is acceptable in this case.
		 */
		if (tcp->tcp_pcb->local_port == 0) {
			err = tcp_bind(tcp->tcp_pcb, &tcp->tcp_pcb->local_ip,
			    0 /*port*/);

			if (err != ERR_OK)
				return util_convert_err(err);
		}

		/*
		 * Clear the argument on the PCB that is about to be
		 * replaced, because if we do not, once the PCB is reused
		 * (which does not clear the argument), we might get weird
		 * events. Do this before the tcp_listen() call, because we
		 * should no longer access the old PCB afterwards (even if we
		 * can).
		 */
		tcp_arg(tcp->tcp_pcb, NULL);

		pcb = tcp_listen_with_backlog_and_err(tcp->tcp_pcb, backlog,
		    &err);

		if (pcb == NULL) {
			tcp_arg(tcp->tcp_pcb, tcp);	/* oops, undo. */

			return util_convert_err(err);
		}

		tcp_arg(pcb, tcp);
		tcp->tcp_pcb = pcb;

		tcp_accept(pcb, tcpsock_event_accept);

		/* Initialize the queue head for sockets pending acceptance. */
		TAILQ_INIT(&tcp->tcp_queue.tq_head);
	} else if (tcp->tcp_pcb != NULL)
		tcp_backlog_set(tcp->tcp_pcb, backlog);

	return OK;
}
/*
 * Callback from lwIP. A socket connection attempt has succeeded. Note that
 * failed socket events will trigger the tcpsock_event_err() callback
 * instead.
 */
static err_t
tcpsock_event_connected(void * arg, struct tcp_pcb * pcb __unused, err_t err)
{
	struct tcpsock *tcp = (struct tcpsock *)arg;

	assert(tcp != NULL);
	assert(pcb == tcp->tcp_pcb);
	assert(tcpsock_get_flags(tcp) & TCPF_CONNECTING);

	/*
	 * If lwIP ever changes so that this callback is called for connect
	 * failures as well, then we need to change the code here accordingly.
	 */
	if (err != ERR_OK)
		panic("TCP connected event with error: %d", err);

	tcpsock_clear_flag(tcp, TCPF_CONNECTING);

	sockevent_raise(tcpsock_get_sock(tcp), SEV_CONNECT | SEV_SEND);

	return ERR_OK;
}
/*
 * Connect a TCP socket to a remote address.
 */
static int
tcpsock_connect(struct sock * sock, const struct sockaddr * addr,
	socklen_t addr_len, endpoint_t user_endpt)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	ip_addr_t dst_addr;
	uint16_t dst_port;
	err_t err;
	int r;

	/*
	 * Listening sockets may not have a PCB, so we use higher-level flags
	 * to throw the correct error code for those instead.
	 */
	if (tcpsock_is_listening(tcp))
		return EOPNOTSUPP;

	/*
	 * If there is no longer any PCB, we obviously cannot perform the
	 * connection, but POSIX is not clear on which error to return; we
	 * follow NetBSD here.
	 */
	if (tcp->tcp_pcb == NULL)
		return EINVAL;

	/*
	 * The only state from which a connection can be initiated, is
	 * CLOSED. Some of the other states require distinct error codes,
	 * though.
	 */
	switch (tcp->tcp_pcb->state) {
	case CLOSED:
		break;
	case SYN_SENT:
	case SYN_RCVD:
		return EALREADY;
	case LISTEN:
		assert(0);	/* we just checked.. */
	default:
		return EISCONN;
	}

	/*
	 * Get the destination address, and attempt to start connecting. If
	 * the socket was not bound before, or it was bound to a port only,
	 * then lwIP will select a source address for us. We cannot do this
	 * ourselves even if we wanted to: it is impossible to re-bind a TCP
	 * PCB in the case it was previously bound to a port only.
	 */
	if ((r = ipsock_get_dst_addr(tcpsock_get_ipsock(tcp), addr, addr_len,
	    &tcp->tcp_pcb->local_ip, &dst_addr, &dst_port)) != OK)
		return r;

	err = tcp_connect(tcp->tcp_pcb, &dst_addr, dst_port,
	    tcpsock_event_connected);

	/*
	 * Note that various tcp_connect() error cases will leave the PCB
	 * with a newly set local and remote IP address anyway. We should be
	 * careful not to rely on the addresses being as they were before.
	 */
	if (err != ERR_OK)
		return util_convert_err(err);

	/* Set the other callback functions. */
	tcp_recv(tcp->tcp_pcb, tcpsock_event_recv);
	tcp_sent(tcp->tcp_pcb, tcpsock_event_sent);
	tcp_err(tcp->tcp_pcb, tcpsock_event_err);
	tcp_poll(tcp->tcp_pcb, tcpsock_event_poll, TCP_POLL_REG_INTERVAL);

	/*
	 * Set a flag so that we can correct lwIP's error codes in case the
	 * connection attempt fails.
	 */
	tcpsock_set_flag(tcp, TCPF_CONNECTING);

	return SUSPEND;
}
/*
 * Test whether any new connections are pending on a listening TCP socket.
 */
static int
tcpsock_test_accept(struct sock * sock)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;

	/* Is this socket in listening mode at all? */
	if (!tcpsock_is_listening(tcp))
		return EINVAL;

	/* Are there any connections to accept right now? */
	if (!TAILQ_EMPTY(&tcp->tcp_queue.tq_head))
		return OK;

	/* If the socket has been shut down, we return ECONNABORTED. */
	if (tcp->tcp_pcb == NULL)
		return ECONNABORTED;

	/* Otherwise, wait for a new connection first. */
	return SUSPEND;
}
/*
 * Accept a connection on a listening TCP socket, creating a new TCP socket.
 */
static sockid_t
tcpsock_accept(struct sock * sock, struct sockaddr * addr,
	socklen_t * addr_len, endpoint_t user_endpt __unused,
	struct sock ** newsockp)
{
	struct tcpsock *listener = (struct tcpsock *)sock;
	struct tcpsock *tcp;
	int r;

	if ((r = tcpsock_test_accept(sock)) != OK)
		return r;
	/* Below, we must not assume that the listener has a PCB. */

	tcp = TAILQ_FIRST(&listener->tcp_queue.tq_head);
	assert(tcp->tcp_listener == listener);
	assert(tcp->tcp_pcb != NULL);

	TAILQ_REMOVE(&listener->tcp_queue.tq_head, tcp, tcp_queue.tq_next);
	tcp->tcp_listener = NULL;

	tcp_backlog_accepted(tcp->tcp_pcb);

	ipsock_put_addr(tcpsock_get_ipsock(tcp), addr, addr_len,
	    &tcp->tcp_pcb->remote_ip, tcp->tcp_pcb->remote_port);

	/*
	 * Set 'newsockp' to NULL so that libsockevent knows we already
	 * cloned the socket, and it must not be reinitialized anymore.
	 */
	*newsockp = NULL;

	return tcpsock_get_id(tcp);
}
/*
 * Perform preliminary checks on a send request.
 */
static int
tcpsock_pre_send(struct sock * sock, size_t len __unused,
	socklen_t ctl_len __unused, const struct sockaddr * addr __unused,
	socklen_t addr_len __unused, endpoint_t user_endpt __unused,
	int flags)
{

	/*
	 * Reject calls with unknown flags. Since libsockevent strips out the
	 * flags it handles itself here, we only have to test for ones we
	 * cannot handle. Currently, there are no send flags that we support.
	 */
	if (flags != 0)
		return EOPNOTSUPP;

	return OK;
}
/*
 * Test whether the given number of data bytes can be sent on a TCP socket.
 */
static int
tcpsock_test_send(struct sock * sock, size_t min)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	size_t sndbuf;

	if (tcp->tcp_pcb == NULL)
		return EPIPE;

	switch (tcp->tcp_pcb->state) {
	case CLOSED:		/* new */
	case LISTEN:		/* listening */
		return ENOTCONN;
	case SYN_SENT:		/* connecting */
	case SYN_RCVD:		/* simultaneous open, maybe someday? */
		return SUSPEND;
	case ESTABLISHED:	/* connected */
	case CLOSE_WAIT:	/* closed remotely */
		break;
	default:		/* shut down locally */
		assert(tcpsock_is_shutdown(tcp, SFL_SHUT_WR));
		return EPIPE;
	}

	sndbuf = tcpsock_get_sndbuf(tcp);
	if (min > sndbuf)
		min = sndbuf;

	if (tcp->tcp_snd.ts_len + min > sndbuf)
		return SUSPEND;

	return OK;
}
/*
 * Send data on a TCP socket.
 */
static int
tcpsock_send(struct sock * sock, const struct sockdriver_data * data,
	size_t len, size_t * offp, const struct sockdriver_data * ctl __unused,
	socklen_t ctl_len __unused, socklen_t * ctl_off __unused,
	const struct sockaddr * addr __unused, socklen_t addr_len __unused,
	endpoint_t user_endpt __unused, int flags __unused, size_t min)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	struct pbuf *ptail, *pfirst, *pnext, *plast;
	size_t off, tail_off, chunk, left, sndbuf;
	int r;

	if ((r = tcpsock_test_send(sock, min)) != OK)
		return r;

	if (len == 0)
		return OK;	/* nothing to do */

	sndbuf = tcpsock_get_sndbuf(tcp);
	if (min > sndbuf)
		min = sndbuf;
	assert(min > 0);

	assert(sndbuf > tcp->tcp_snd.ts_len);
	left = sndbuf - tcp->tcp_snd.ts_len;
	if (left > len)
		left = len;

	/*
	 * First see if we can fit any more data in the current tail buffer.
	 * If so, we set 'ptail' to point to it and 'tail_off' to the
	 * previous length of the tail buffer, while optimistically extending
	 * it to include the new data. If not, we set them to NULL/0.
	 */
	if ((ptail = tcp->tcp_snd.ts_tail) != NULL &&
	    ptail->len < ptail->tot_len) {
		assert(ptail->len > 0);
		tail_off = (size_t)ptail->len;

		/*
		 * Optimistically extend the head buffer to include whatever
		 * fits in it. This is needed for util_copy_data().
		 */
		assert(ptail->tot_len > ptail->len);
		off = (size_t)ptail->tot_len - (size_t)ptail->len;
		if (off > left)
			off = left;
		ptail->len += off;
	} else {
		ptail = NULL;
		tail_off = 0;
		off = 0;
	}

	/*
	 * Then, if there is more to send, allocate new buffers as needed. If
	 * we run out of memory, work with whatever we did manage to grab.
	 */
	pfirst = NULL;
	plast = NULL;
	while (off < left) {
		if (tcpsock_sendbufs >= TCP_MAX_SENDBUFS ||
		    (pnext = tcpsock_alloc_buf()) == NULL) {
			/*
			 * Chances are that we will end up suspending this
			 * send request because of being out of buffers. We
			 * try to resume such requests from the polling
			 * function.
			 */
			tcpsock_set_flag(tcp, TCPF_OOM);

			break;
		}

		tcpsock_sendbufs++;

		if (pfirst == NULL)
			pfirst = pnext;
		else
			plast->next = pnext;
		plast = pnext;

		chunk = (size_t)pnext->tot_len;
		if (chunk > left - off)
			chunk = left - off;
		pnext->len = chunk;
		off += chunk;
	}

	/*
	 * Copy in the data and continue, unless we did not manage to find
	 * enough space to even meet the low send watermark, in which case we
	 * undo any allocation and suspend the call until later.
	 */
	if (off >= min) {
		/*
		 * Optimistically attach the new buffers to the tail, also
		 * for util_copy_data(). We undo all this if the copy fails.
		 */
		if (ptail != NULL) {
			ptail->next = pfirst;

			pnext = ptail;
		} else
			pnext = pfirst;

		assert(pnext != NULL);

		r = util_copy_data(data, off, *offp, pnext, tail_off,
		    TRUE /*copy_in*/);
	} else
		r = SUSPEND;

	if (r != OK) {
		/* Undo the modifications made so far. */
		while (pfirst != NULL) {
			pnext = pfirst->next;

			assert(tcpsock_sendbufs > 0);
			tcpsock_sendbufs--;

			tcpsock_free_buf(pfirst);

			pfirst = pnext;
		}

		if (ptail != NULL) {
			ptail->next = NULL;

			ptail->len = tail_off;
		}

		return r;
	}

	/* Attach the new buffers, if any, to the buffer tail. */
	if (pfirst != NULL) {
		if ((ptail = tcp->tcp_snd.ts_tail) != NULL) {
			assert(ptail->len == ptail->tot_len);

			/*
			 * Due to our earlier optimistic modifications, this
			 * may or may not be redundant.
			 */
			ptail->next = pfirst;
		}

		assert(plast != NULL);
		tcp->tcp_snd.ts_tail = plast;

		if (tcp->tcp_snd.ts_head == NULL) {
			tcp->tcp_snd.ts_head = pfirst;
			assert(tcp->tcp_snd.ts_head_off == 0);
		}
		if (tcp->tcp_snd.ts_unsent == NULL) {
			tcp->tcp_snd.ts_unsent = pfirst;
			assert(tcp->tcp_snd.ts_unsent_off == 0);
		}
	}

	tcp->tcp_snd.ts_len += off;

	/*
	 * See if we can send any of the data we just enqueued. The socket is
	 * still open as we are still processing a call from userland on it;
	 * this saves us from having to deal with the cases that the
	 * following calls end up freeing the socket object.
	 */
	if (tcpsock_pcb_enqueue(tcp) &&
	    (r = tcpsock_pcb_send(tcp, FALSE /*raise_error*/)) != OK) {
		/*
		 * That did not go well. Return the error immediately if we
		 * had not made any progress earlier. Otherwise, return our
		 * partial progress and leave the error to be picked up
		 * later.
		 */
		if (*offp > 0) {
			sockevent_set_error(tcpsock_get_sock(tcp), r);

			return OK;
		} else
			return r;
	}

	*offp += off;

	return (off < len) ? SUSPEND : OK;
}
/*
 * Perform preliminary checks on a receive request.
 */
static int
tcpsock_pre_recv(struct sock * sock __unused, endpoint_t user_endpt __unused,
	int flags)
{

	/*
	 * Reject calls with unknown flags. Since libsockevent strips out the
	 * flags it handles itself here, we only have to test for ones we
	 * cannot handle.
	 */
	if ((flags & ~(MSG_PEEK | MSG_WAITALL)) != 0)
		return EOPNOTSUPP;

	return OK;
}

/*
 * Return TRUE if receive calls may wait for more data to come in on the
 * connection, or FALSE if we already know that that is not going to happen.
 */
static int
tcpsock_may_wait(struct tcpsock * tcp)
{

	return (tcp->tcp_pcb != NULL &&
	    !(tcpsock_get_flags(tcp) & TCPF_RCVD_FIN));
}
/*
 * Test whether data can be received on a TCP socket, and if so, how many
 * bytes at most.
 */
static int
tcpsock_test_recv(struct sock * sock, size_t min, size_t * size)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	int may_wait;

	/*
	 * If there is currently no connection and there never was one,
	 * refuse the call altogether.
	 */
	if (tcp->tcp_pcb != NULL && (tcp->tcp_pcb->state == CLOSED ||
	    tcp->tcp_pcb->state == LISTEN))
		return ENOTCONN;

	/*
	 * If we are certain that no more data will come in later, ignore the
	 * low receive watermark. Otherwise, bound it to the size of the
	 * receive buffer, or receive calls may block forever.
	 */
	if (!(may_wait = tcpsock_may_wait(tcp)))
		min = 1;
	else if (min > tcpsock_get_rcvbuf(tcp))
		min = tcpsock_get_rcvbuf(tcp);

	if (tcp->tcp_rcv.tr_len >= min) {
		if (size != NULL)
			*size = tcp->tcp_rcv.tr_len;

		return OK;
	}

	return (may_wait) ? SUSPEND : SOCKEVENT_EOF;
}
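/*
 * For example, if the caller's low receive watermark ('min', presumably
 * deriving from SO_RCVLOWAT) were larger than the receive buffer itself,
 * clamping it to tcpsock_get_rcvbuf() above is what keeps a receive call
 * from suspending forever on a connection that is still open.
 */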
/*
 * Receive data on a TCP socket.
 */
static int
tcpsock_recv(struct sock * sock, const struct sockdriver_data * data,
	size_t len, size_t * offp, const struct sockdriver_data * ctl __unused,
	socklen_t ctl_len __unused, socklen_t * ctl_off __unused,
	struct sockaddr * addr __unused, socklen_t * addr_len __unused,
	endpoint_t user_endpt __unused, int flags, size_t min,
	int * rflags __unused)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	struct pbuf *ptail;
	size_t off, left;
	int r;

	/* See if we can receive at all, and if so, how much at most. */
	if ((r = tcpsock_test_recv(sock, min, NULL)) != OK)
		return r;

	if (len == 0)
		return OK;	/* nothing to do */

	off = tcp->tcp_rcv.tr_len;
	if (off > len)
		off = len;

	assert(tcp->tcp_rcv.tr_head != NULL);
	assert(tcp->tcp_rcv.tr_head_off < tcp->tcp_rcv.tr_head->len);

	/* Copy out the data to the caller. */
	if ((r = util_copy_data(data, off, *offp, tcp->tcp_rcv.tr_head,
	    tcp->tcp_rcv.tr_head_off, FALSE /*copy_in*/)) != OK)
		return r;

	/* Unless peeking, remove the data from the receive queue. */
	if (!(flags & MSG_PEEK)) {
		left = off;

		/* Dequeue and free as many entire buffers as possible. */
		while ((ptail = tcp->tcp_rcv.tr_head) != NULL &&
		    left >= (size_t)ptail->len - tcp->tcp_rcv.tr_head_off) {
			left -= (size_t)ptail->len - tcp->tcp_rcv.tr_head_off;

			tcp->tcp_rcv.tr_head = ptail->next;
			tcp->tcp_rcv.tr_head_off = 0;

			if (tcp->tcp_rcv.tr_head == NULL)
				tcp->tcp_rcv.tr_pre_tailp = NULL;
			else if (tcp->tcp_rcv.tr_pre_tailp == &ptail->next)
				tcp->tcp_rcv.tr_pre_tailp =
				    &tcp->tcp_rcv.tr_head;

			assert(tcpsock_recvbufs > 0);
			tcpsock_recvbufs--;

			tcpsock_free_buf(ptail);
		}

		/*
		 * If only part of the (new) head buffer is consumed, adjust
		 * the saved offset into that buffer.
		 */
		if (left > 0) {
			assert(tcp->tcp_rcv.tr_head != NULL);
			assert((size_t)tcp->tcp_rcv.tr_head->len -
			    tcp->tcp_rcv.tr_head_off > left);

			tcp->tcp_rcv.tr_head_off += left;
		}

		tcp->tcp_rcv.tr_len -= off;

		if (tcp->tcp_rcv.tr_head != NULL) {
			assert(tcp->tcp_rcv.tr_pre_tailp != NULL);
			assert(tcp->tcp_rcv.tr_len > 0);
		} else {
			assert(tcp->tcp_rcv.tr_pre_tailp == NULL);
			assert(tcp->tcp_rcv.tr_len == 0);
		}

		/*
		 * The receive buffer has shrunk, so there may now be space to
		 * receive more data.
		 */
		if (tcp->tcp_pcb != NULL)
			tcpsock_ack_recv(tcp);
	} else
		flags &= ~MSG_WAITALL;	/* for the check below */

	/* Advance the current copy position, and see if we are done. */
	*offp += off;
	if ((flags & MSG_WAITALL) && off < len && tcpsock_may_wait(tcp))
		return SUSPEND;
	else
		return OK;
}
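/*
 * Note on the MSG_PEEK path above: peeking does not advance the queue head,
 * so a suspended MSG_WAITALL peek would be resumed only to copy the same
 * leading bytes again.  Clearing MSG_WAITALL for the final check makes a
 * peek return whatever could be copied in a single pass instead.
 */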
/*
 * Update the set of flag-type socket options on a TCP socket.
 */
static void
tcpsock_setsockmask(struct sock * sock, unsigned int mask)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;

	if (tcp->tcp_pcb == NULL)
		return;

	if (mask & SO_REUSEADDR)
		ip_set_option(tcp->tcp_pcb, SOF_REUSEADDR);
	else
		ip_reset_option(tcp->tcp_pcb, SOF_REUSEADDR);

	if (mask & SO_KEEPALIVE)
		ip_set_option(tcp->tcp_pcb, SOF_KEEPALIVE);
	else
		ip_reset_option(tcp->tcp_pcb, SOF_KEEPALIVE);
}
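/*
 * For illustration: a userland setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE,
 * &on, sizeof(on)) call is processed by libsockevent, which tracks the
 * on/off state of these options itself and hands us the resulting set of
 * enabled options as 'mask'; all we do here is mirror that set into the
 * lwIP PCB option flags.
 */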
/*
 * Prepare a helper structure for IP-level option processing.
 */
static void
tcpsock_get_ipopts(struct tcpsock * tcp, struct ipopts * ipopts)
{

	ipopts->local_ip = &tcp->tcp_pcb->local_ip;
	ipopts->remote_ip = &tcp->tcp_pcb->remote_ip;
	ipopts->tos = &tcp->tcp_pcb->tos;
	ipopts->ttl = &tcp->tcp_pcb->ttl;
	ipopts->sndmin = TCP_SNDBUF_MIN;
	ipopts->sndmax = TCP_SNDBUF_MAX;
	ipopts->rcvmin = TCP_RCVBUF_MIN;
	ipopts->rcvmax = TCP_RCVBUF_MAX;
}
/*
 * Set socket options on a TCP socket.
 */
static int
tcpsock_setsockopt(struct sock * sock, int level, int name,
	const struct sockdriver_data * data, socklen_t len)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	struct ipopts ipopts;
	uint32_t uval;
	int r, val;

	if (tcp->tcp_pcb == NULL)
		return ECONNRESET;

	/* Handle TCP-level options. */
	switch (level) {
	case IPPROTO_IPV6:
		switch (name) {
		case IPV6_RECVTCLASS:
			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
			    len)) != OK)
				return r;

			/*
			 * This option is not supported for TCP sockets; it
			 * would not even make sense.  However, named(8)
			 * insists on trying to set it anyway.  We accept the
			 * request but ignore the value, not even returning
			 * what was set through getsockopt(2).
			 */
			return OK;

		case IPV6_FAITH:
			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
			    len)) != OK)
				return r;

			/*
			 * This option is not supported at all, but to save
			 * ourselves from having to remember the current state
			 * for getsockopt(2), we also refuse to enable it.
			 */
			if (val != 0)
				return EINVAL;

			return OK;
		}

		break;

	case IPPROTO_TCP:
		switch (name) {
		case TCP_NODELAY:
			/*
			 * lwIP's listening TCP PCBs do not have this field.
			 * If this ever becomes an issue, we can create our own
			 * shadow flag and do the inheritance ourselves.
			 */
			if (tcp->tcp_pcb->state == LISTEN)
				return EINVAL;

			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
			    len)) != OK)
				return r;

			if (val)
				tcp_nagle_disable(tcp->tcp_pcb);
			else
				tcp_nagle_enable(tcp->tcp_pcb);

			return OK;

		case TCP_KEEPIDLE:
		case TCP_KEEPINTVL:
			/*
			 * lwIP's listening TCP PCBs do not have these fields.
			 */
			if (tcp->tcp_pcb->state == LISTEN)
				return EINVAL;

			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
			    len)) != OK)
				return r;

			/*
			 * The given value is unsigned, but lwIP stores the
			 * value in milliseconds in a uint32_t field, so we
			 * have to limit large values to whatever fits in the
			 * field anyway.
			 */
			if (val < 0 || (uint32_t)val > UINT32_MAX / 1000)
				uval = UINT32_MAX;
			else
				uval = (uint32_t)val * 1000;

			if (name == TCP_KEEPIDLE)
				tcp->tcp_pcb->keep_idle = uval;
			else
				tcp->tcp_pcb->keep_intvl = uval;

			return OK;

		case TCP_KEEPCNT:
			/* lwIP's listening TCP PCBs do not have this field. */
			if (tcp->tcp_pcb->state == LISTEN)
				return EINVAL;

			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
			    len)) != OK)
				return r;

			if (val == 0)
				return EINVAL;

			tcp->tcp_pcb->keep_cnt = (uint32_t)val;

			return OK;
		}

		break;
	}

	/* Handle all other options at the IP level. */
	tcpsock_get_ipopts(tcp, &ipopts);

	return ipsock_setsockopt(tcpsock_get_ipsock(tcp), level, name, data,
	    len, &ipopts);
}
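/*
 * A worked example of the keepalive conversion above: setting TCP_KEEPIDLE
 * to 7200 (seconds) stores keep_idle = 7200 * 1000 = 7200000 milliseconds
 * in the PCB, while any value above UINT32_MAX / 1000 saturates to
 * UINT32_MAX milliseconds rather than overflowing the 32-bit field.
 */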
/*
 * Retrieve socket options on a TCP socket.
 */
static int
tcpsock_getsockopt(struct sock * sock, int level, int name,
	const struct sockdriver_data * data, socklen_t * len)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	struct ipopts ipopts;
	int val;

	if (tcp->tcp_pcb == NULL)
		return ECONNRESET;

	/* Handle TCP-level options. */
	switch (level) {
	case IPPROTO_IPV6:
		switch (name) {
		case IPV6_RECVTCLASS:
		case IPV6_FAITH:
			val = 0;

			return sockdriver_copyout_opt(data, &val, sizeof(val),
			    len);
		}

		break;

	case IPPROTO_TCP:
		switch (name) {
		case TCP_NODELAY:
			/* lwIP's listening TCP PCBs do not have this field. */
			if (tcp->tcp_pcb->state == LISTEN)
				return EINVAL;

			val = tcp_nagle_disabled(tcp->tcp_pcb);

			return sockdriver_copyout_opt(data, &val, sizeof(val),
			    len);

		case TCP_MAXSEG:
			/* lwIP's listening TCP PCBs do not have this field. */
			if (tcp->tcp_pcb->state == LISTEN)
				return EINVAL;

			/* This option is read-only at this time. */
			val = tcp->tcp_pcb->mss;

			return sockdriver_copyout_opt(data, &val, sizeof(val),
			    len);

		case TCP_KEEPIDLE:
			/* lwIP's listening TCP PCBs do not have this field. */
			if (tcp->tcp_pcb->state == LISTEN)
				return EINVAL;

			val = (int)(tcp->tcp_pcb->keep_idle / 1000);

			return sockdriver_copyout_opt(data, &val, sizeof(val),
			    len);

		case TCP_KEEPINTVL:
			/* lwIP's listening TCP PCBs do not have this field. */
			if (tcp->tcp_pcb->state == LISTEN)
				return EINVAL;

			val = (int)(tcp->tcp_pcb->keep_intvl / 1000);

			return sockdriver_copyout_opt(data, &val, sizeof(val),
			    len);

		case TCP_KEEPCNT:
			/* lwIP's listening TCP PCBs do not have this field. */
			if (tcp->tcp_pcb->state == LISTEN)
				return EINVAL;

			val = (int)tcp->tcp_pcb->keep_cnt;

			return sockdriver_copyout_opt(data, &val, sizeof(val),
			    len);
		}

		break;
	}

	/* Handle all other options at the IP level. */
	tcpsock_get_ipopts(tcp, &ipopts);

	return ipsock_getsockopt(tcpsock_get_ipsock(tcp), level, name, data,
	    len, &ipopts);
}
/*
 * Retrieve the local socket address of a TCP socket.
 */
static int
tcpsock_getsockname(struct sock * sock, struct sockaddr * addr,
	socklen_t * addr_len)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;

	if (tcp->tcp_pcb == NULL)
		return EINVAL;

	ipsock_put_addr(tcpsock_get_ipsock(tcp), addr, addr_len,
	    &tcp->tcp_pcb->local_ip, tcp->tcp_pcb->local_port);

	return OK;
}
/*
 * Retrieve the remote socket address of a TCP socket.
 */
static int
tcpsock_getpeername(struct sock * sock, struct sockaddr * addr,
	socklen_t * addr_len)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;

	if (tcp->tcp_pcb == NULL || tcp->tcp_pcb->state == CLOSED ||
	    tcp->tcp_pcb->state == LISTEN || tcp->tcp_pcb->state == SYN_SENT)
		return ENOTCONN;

	ipsock_put_addr(tcpsock_get_ipsock(tcp), addr, addr_len,
	    &tcp->tcp_pcb->remote_ip, tcp->tcp_pcb->remote_port);

	return OK;
}
/*
 * Perform a TCP half-close on a TCP socket.  This operation may not complete
 * immediately due to memory conditions, in which case it will be completed
 * at a later time.
 */
static void
tcpsock_send_fin(struct tcpsock * tcp)
{

	sockevent_set_shutdown(tcpsock_get_sock(tcp), SFL_SHUT_WR);

	/*
	 * Attempt to send the FIN.  If a fatal error occurs as a result,
	 * raise it as an asynchronous error, because this function's callers
	 * cannot do much with it.  That happens to match the way these
	 * functions are used elsewhere.  In any case, as a result, the PCB
	 * may be closed.  However, we are never called from a situation
	 * where the socket is being closed here, so the socket object will
	 * not be freed either.
	 */
	if (tcpsock_pcb_enqueue(tcp)) {
		assert(!sockevent_is_closing(tcpsock_get_sock(tcp)));

		if (tcpsock_may_close(tcp))
			tcpsock_finish_close(tcp);
		else
			(void)tcpsock_pcb_send(tcp, TRUE /*raise_error*/);
	}
}
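/*
 * Usage note: this function backs both shutdown(fd, SHUT_WR) and the
 * graceful side of close(2).  If the FIN cannot be enqueued right away,
 * only the shutdown flag is recorded here; the enqueue attempt is repeated
 * from later events, at which point the half-close completes transparently
 * to userland.
 */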
/*
 * Shut down a TCP socket for reading and/or writing.
 */
static int
tcpsock_shutdown(struct sock * sock, unsigned int mask)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;

	/*
	 * If the PCB is gone, we want to allow shutdowns for reading but not
	 * writing: shutting down for writing affects the PCB, shutting down
	 * for reading does not.  Also, if the PCB is in CLOSED state, we
	 * would not know how to deal with subsequent operations after a
	 * shutdown for writing, so forbid such calls altogether.
	 */
	if ((tcp->tcp_pcb == NULL || tcp->tcp_pcb->state == CLOSED) &&
	    (mask & SFL_SHUT_WR))
		return ENOTCONN;

	/*
	 * Handle listening sockets as a special case.  Shutting down a
	 * listening socket frees its PCB.  Sockets pending on the accept
	 * queue may still be accepted, but after that, accept(2) will start
	 * returning ECONNABORTED.  This feature allows multi-process server
	 * applications to shut down gracefully, supposedly.
	 */
	if (tcpsock_is_listening(tcp)) {
		if (tcp->tcp_pcb != NULL)
			tcpsock_pcb_close(tcp);

		return OK;
	}

	/*
	 * We control shutdown-for-reading locally, and intentionally do not
	 * tell lwIP about it: if we do that and also shut down for writing,
	 * the PCB may disappear (now or eventually), which is not what we
	 * want.  Instead, we only tell lwIP to shut down for reading once we
	 * actually want to get rid of the PCB, using tcp_close().  In the
	 * meantime, if the socket is shut down for reading by the user, we
	 * simply discard received data as fast as we can--one out of a
	 * number of possible design choices there, and (reportedly) the one
	 * used by the BSDs.
	 */
	if (mask & SFL_SHUT_RD)
		(void)tcpsock_clear_recv(tcp, TRUE /*ack_data*/);

	/*
	 * Shutting down for writing a connecting socket simply closes its
	 * PCB.  Closing a PCB in SYN_SENT state simply deallocates it, so
	 * this cannot fail.  On the other hand, for connected sockets we
	 * want to send a FIN, which may fail due to memory shortage, in
	 * which case we have to try again later.
	 */
	if (mask & SFL_SHUT_WR) {
		if (tcp->tcp_pcb->state == SYN_SENT)
			tcpsock_pcb_close(tcp);
		else if (!tcpsock_is_shutdown(tcp, SFL_SHUT_WR))
			tcpsock_send_fin(tcp);
	}

	return OK;
}
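/*
 * To make the cases above concrete (a sketch): shutdown(fd, SHUT_WR) on an
 * ESTABLISHED socket queues a FIN through tcpsock_send_fin() while reads
 * keep working; the same call on a socket still in SYN_SENT simply
 * deallocates its PCB; and on a listening socket, connections already on
 * the accept queue may still be accepted, after which accept(2) fails with
 * ECONNABORTED.
 */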
/*
 * Close a TCP socket.  Complete the operation immediately if possible, or
 * otherwise initiate the closing process and complete it later, notifying
 * libsockevent about that as well.  Depending on linger settings, this
 * function may be called twice on the same socket: the first time with the
 * 'force' flag cleared, and the second time with the 'force' flag set.
 */
static int
tcpsock_close(struct sock * sock, int force)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	struct tcpsock *queued;
	size_t rlen;

	assert(tcp->tcp_listener == NULL);

	/*
	 * If this was a listening socket, abort and clean up any and all
	 * connections on its listener queue.  Note that the listening socket
	 * may or may not have a PCB at this point.
	 */
	if (tcpsock_is_listening(tcp)) {
		while (!TAILQ_EMPTY(&tcp->tcp_queue.tq_head)) {
			queued = TAILQ_FIRST(&tcp->tcp_queue.tq_head);

			tcpsock_pcb_abort(queued);

			(void)tcpsock_cleanup(queued, TRUE /*may_free*/);
		}
	}

	/*
	 * Clear the receive queue, and make sure that we no longer add new
	 * data to it.  The latter is relevant only for the case that we end
	 * up returning SUSPEND below.  Remember whether there were bytes
	 * left, because we should reset the connection if there were.
	 */
	rlen = tcpsock_clear_recv(tcp, FALSE /*ack_data*/);

	sockevent_set_shutdown(tcpsock_get_sock(tcp), SFL_SHUT_RD);

	/*
	 * If the socket is connected, perform a graceful shutdown, unless 1)
	 * we are asked to force-close the socket, or 2) the local side has
	 * not consumed all data, as per RFC 1122 Sec. 4.2.2.13.  Normally
	 * lwIP would take care of the second point, but we may have data in
	 * our receive buffer of which lwIP is not aware.
	 *
	 * Implementing proper linger support is somewhat difficult with
	 * lwIP.  In particular, we cannot reliably wait for our FIN to be
	 * ACK'ed by the other side in all cases:
	 *
	 * - the lwIP TCP transition from states CLOSING to TIME_WAIT does
	 *   not trigger any event and once in the TIME_WAIT state, the poll
	 *   event no longer triggers either;
	 * - the lwIP TCP transition from states FIN_WAIT_1 and FIN_WAIT_2 to
	 *   TIME_WAIT will trigger a receive event, but it is not clear
	 *   whether we can reliably check that our FIN was ACK'ed from
	 *   there.
	 *
	 * That means we have to compromise.  Instead of the proper approach,
	 * we complete our side of the close operation whenever:
	 *
	 *   1. all of our data was acknowledged, AND,
	 *   2. our FIN was sent, AND,
	 *   3a. our FIN was acknowledged, OR,
	 *   3b. we received a FIN from the other side.
	 *
	 * With the addition of rule 3b, we do not run into the above
	 * reliability problems, but we may return from SO_LINGER-blocked
	 * close calls too early and thus give callers a false impression of
	 * success.  TODO: if lwIP ever gets improved on this point, the code
	 * in this module should be rewritten to make use of the
	 * improvements.
	 *
	 * The set of rules is basically the same as for closing the PCB
	 * early as per tcpsock_may_close(), except with the check for our
	 * FIN being acknowledged.  Unfortunately only the FIN_WAIT_2,
	 * TIME_WAIT, and (reentered) CLOSED TCP states guarantee that there
	 * are no unacknowledged data segments anymore, so we may have to
	 * wait for reaching any one of these before we can actually finish
	 * closing the socket with tcp_close().
	 *
	 * In addition, lwIP does not tell us when our FIN gets acknowledged,
	 * so we have to use polling and direct access to lwIP's PCB fields
	 * instead, just like lwIP's BSD API does.  There is no other way.
	 * Also, we may not even be able to send the FIN right away, in which
	 * case we must defer that until later.
	 */
	if (tcp->tcp_pcb != NULL) {
		switch (tcp->tcp_pcb->state) {
		case CLOSE_WAIT:
		case CLOSING:
		case LAST_ACK:
			assert(tcpsock_get_flags(tcp) & TCPF_RCVD_FIN);

			/* FALLTHROUGH */
		case SYN_RCVD:
		case ESTABLISHED:
		case FIN_WAIT_1:
			/* First check if we should abort the connection. */
			if (force || rlen > 0)
				break;

			/*
			 * If we have not sent a FIN yet, try sending it now;
			 * if all other conditions are met for closing the
			 * socket, successful FIN transmission will complete
			 * the close.  Otherwise, perform the close check
			 * here explicitly.
			 */
			if (!tcpsock_is_shutdown(tcp, SFL_SHUT_WR))
				tcpsock_send_fin(tcp);
			else if (tcpsock_may_close(tcp))
				tcpsock_pcb_close(tcp);

			/*
			 * If at this point the PCB is gone, we managed to
			 * close the connection immediately, and the socket
			 * has already been cleaned up by now.  This may
			 * occur if there is no unacknowledged data and we
			 * already received a FIN earlier on.
			 */
			if (tcp->tcp_pcb == NULL)
				return OK;

			/*
			 * Complete the close operation at a later time.
			 * Adjust the polling interval, so that we can detect
			 * completion of the close as quickly as possible.
			 */
			tcp_poll(tcp->tcp_pcb, tcpsock_event_poll,
			    TCP_POLL_CLOSE_INTERVAL);

			return SUSPEND;

		default:
			/*
			 * The connection is either not yet established, or
			 * already in a state where we can close it right
			 * now.
			 */
			tcpsock_pcb_close(tcp);
		}
	}

	/*
	 * Abort the connection if the PCB is still around, and clean up the
	 * socket.  We cannot let tcpsock_cleanup() free the socket object
	 * yet, because we are still in the callback from libsockevent, and
	 * the latter cannot handle the socket object being freed from here.
	 */
	if (tcp->tcp_pcb != NULL)
		tcpsock_pcb_abort(tcp);

	(void)tcpsock_cleanup(tcp, FALSE /*may_free*/);

	return OK;
}
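/*
 * Linger illustration (a sketch of the intended flow): with SO_LINGER set
 * to a nonzero timeout, close(2) first reaches this function with 'force'
 * cleared; if the graceful path returns SUSPEND, libsockevent keeps the
 * close pending until the rules above are satisfied or the linger timer
 * fires, in which case this function is invoked once more with 'force' set
 * and the connection is aborted instead.
 */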
/*
 * Free up a closed TCP socket.
 */
static void
tcpsock_free(struct sock * sock)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;

	assert(tcp->tcp_pcb == NULL);
	assert(tcp->tcp_snd.ts_len == 0);
	assert(tcp->tcp_snd.ts_head == NULL);
	assert(tcp->tcp_rcv.tr_len == 0);
	assert(tcp->tcp_rcv.tr_head == NULL);

	TAILQ_INSERT_HEAD(&tcp_freelist, tcp, tcp_queue.tq_next);
}
/* This table maps TCP states from lwIP numbers to NetBSD numbers. */
static const struct {
	int tsm_tstate;
	int tsm_sostate;
} tcpsock_statemap[] = {
	[CLOSED]	= { TCPS_CLOSED,	SS_ISDISCONNECTED	},
	[LISTEN]	= { TCPS_LISTEN,	0			},
	[SYN_SENT]	= { TCPS_SYN_SENT,	SS_ISCONNECTING		},
	[SYN_RCVD]	= { TCPS_SYN_RECEIVED,	SS_ISCONNECTING		},
	[ESTABLISHED]	= { TCPS_ESTABLISHED,	SS_ISCONNECTED		},
	[FIN_WAIT_1]	= { TCPS_FIN_WAIT_1,	SS_ISDISCONNECTING	},
	[FIN_WAIT_2]	= { TCPS_FIN_WAIT_2,	SS_ISDISCONNECTING	},
	[CLOSE_WAIT]	= { TCPS_CLOSE_WAIT,	SS_ISCONNECTED		},
	[CLOSING]	= { TCPS_CLOSING,	SS_ISDISCONNECTING	},
	[LAST_ACK]	= { TCPS_LAST_ACK,	SS_ISDISCONNECTING	},
	[TIME_WAIT]	= { TCPS_TIME_WAIT,	SS_ISDISCONNECTED	},
};
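/*
 * For example, a PCB in lwIP's ESTABLISHED state is reported through the
 * sysctl(7) interface below as TCPS_ESTABLISHED with the SS_ISCONNECTED
 * socket-state bit set, which is the representation that NetBSD userland
 * tools such as netstat(1) expect.
 */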
/*
 * Fill the given kinfo_pcb sysctl(7) structure with information about the
 * TCP PCB identified by the given pointer.
 */
static void
tcpsock_get_info(struct kinfo_pcb * ki, const void * ptr)
{
	const struct tcp_pcb *pcb = (const struct tcp_pcb *)ptr;
	struct tcpsock *tcp;

	/*
	 * Not all TCP PCBs have an associated tcpsock structure.  We take
	 * care to clear the callback argument of PCBs on any of the TCP
	 * lists, so that we can use the callback argument to determine
	 * whether there is an associated tcpsock structure--although with
	 * one exception: PCBs for incoming connections that have not yet
	 * been fully established (i.e., in SYN_RCVD state).  These will have
	 * the callback argument of the listening socket (which itself may
	 * already have been deallocated at this point) but should not be
	 * considered as associated with the listening socket's tcpsock
	 * structure.
	 */
	if (pcb->callback_arg != NULL && pcb->state != SYN_RCVD) {
		tcp = (struct tcpsock *)pcb->callback_arg;
		assert(tcp >= tcp_array &&
		    tcp < &tcp_array[__arraycount(tcp_array)]);

		/* TODO: change this so that sockstat(1) may work one day. */
		ki->ki_sockaddr = (uint64_t)(uintptr_t)tcpsock_get_sock(tcp);
	} else {
		/* No tcpsock.  Could also be in TIME_WAIT state etc. */
		tcp = NULL;

		ki->ki_sostate = SS_NOFDREF;
	}

	ki->ki_type = SOCK_STREAM;

	if ((unsigned int)pcb->state < __arraycount(tcpsock_statemap)) {
		ki->ki_tstate = tcpsock_statemap[pcb->state].tsm_tstate;
		/* TODO: this needs work, but does anything rely on it? */
		ki->ki_sostate |= tcpsock_statemap[pcb->state].tsm_sostate;
	}

	/* Careful with the LISTEN state here (see below). */
	ipsock_get_info(ki, &pcb->local_ip, pcb->local_port,
	    &pcb->remote_ip, (pcb->state != LISTEN) ? pcb->remote_port : 0);

	/*
	 * The PCBs for listening sockets are actually smaller.  Thus, for
	 * listening sockets, do not attempt to access any of the fields
	 * beyond those provided in the smaller structure.
	 */
	if (pcb->state == LISTEN) {
		assert(tcp != NULL);
		ki->ki_refs =
		    (uint64_t)(uintptr_t)TAILQ_FIRST(&tcp->tcp_queue.tq_head);
	} else {
		if (tcp_nagle_disabled(pcb))
			ki->ki_tflags |= NETBSD_TF_NODELAY;

		if (tcp != NULL) {
			ki->ki_rcvq = tcp->tcp_rcv.tr_len;
			ki->ki_sndq = tcp->tcp_snd.ts_len;

			if (tcp->tcp_listener != NULL)
				ki->ki_nextref = (uint64_t)(uintptr_t)
				    TAILQ_NEXT(tcp, tcp_queue.tq_next);
		}
	}
}
/*
 * Given either NULL or a previously returned TCP PCB pointer, return the
 * first or next TCP PCB pointer, or NULL if there are no more.  The current
 * implementation supports only one concurrent iteration at once.
 */
static const void *
tcpsock_enum(const void * last)
{
	static struct {
		unsigned int i;
		const struct tcp_pcb *pcb;
	} iter;

	if (last != NULL && (iter.pcb = iter.pcb->next) != NULL)
		return (const void *)iter.pcb;

	for (iter.i = (last != NULL) ? iter.i + 1 : 0;
	    iter.i < __arraycount(tcp_pcb_lists); iter.i++) {
		if ((iter.pcb = *tcp_pcb_lists[iter.i]) != NULL)
			return (const void *)iter.pcb;
	}

	return NULL;
}
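/*
 * Typical traversal (a sketch, with hypothetical locals 'ptr' and 'ki', as
 * performed on our behalf by util_pcblist() below):
 *
 *	for (ptr = tcpsock_enum(NULL); ptr != NULL; ptr = tcpsock_enum(ptr))
 *		tcpsock_get_info(&ki, ptr);
 *
 * Since the iterator state lives in the static 'iter' structure, only one
 * such traversal may be in progress at any time.
 */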
/*
 * Obtain the list of TCP protocol control blocks, for sysctl(7).
 */
static ssize_t
tcpsock_pcblist(struct rmib_call * call, struct rmib_node * node __unused,
	struct rmib_oldp * oldp, struct rmib_newp * newp __unused)
{

	return util_pcblist(call, oldp, tcpsock_enum, tcpsock_get_info);
}
static const struct sockevent_ops tcpsock_ops = {
	.sop_bind		= tcpsock_bind,
	.sop_listen		= tcpsock_listen,
	.sop_connect		= tcpsock_connect,
	.sop_accept		= tcpsock_accept,
	.sop_test_accept	= tcpsock_test_accept,
	.sop_pre_send		= tcpsock_pre_send,
	.sop_send		= tcpsock_send,
	.sop_test_send		= tcpsock_test_send,
	.sop_pre_recv		= tcpsock_pre_recv,
	.sop_recv		= tcpsock_recv,
	.sop_test_recv		= tcpsock_test_recv,
	.sop_ioctl		= ifconf_ioctl,
	.sop_setsockmask	= tcpsock_setsockmask,
	.sop_setsockopt		= tcpsock_setsockopt,
	.sop_getsockopt		= tcpsock_getsockopt,
	.sop_getsockname	= tcpsock_getsockname,
	.sop_getpeername	= tcpsock_getpeername,
	.sop_shutdown		= tcpsock_shutdown,
	.sop_close		= tcpsock_close,
	.sop_free		= tcpsock_free
};