/* Socket event dispatching library - by D.C. van Moolenbroek */

#include <minix/drivers.h>
#include <minix/sockdriver.h>
#include <minix/sockevent.h>

#include "sockevent_proc.h"

#define US		1000000UL	/* microseconds per second */

#define SOCKHASH_SLOTS	256		/* # slots in ID-to-sock hash table */

static SLIST_HEAD(, sock) sockhash[SOCKHASH_SLOTS];

static SLIST_HEAD(, sock) socktimer;

static minix_timer_t sockevent_timer;

static SIMPLEQ_HEAD(, sock) sockevent_pending;

static sockevent_socket_cb_t sockevent_socket_cb = NULL;

static int sockevent_working;

static void socktimer_del(struct sock * sock);
static void sockevent_cancel_send(struct sock * sock,
	struct sockevent_proc * spr, int err);
static void sockevent_cancel_recv(struct sock * sock,
	struct sockevent_proc * spr, int err);
/*
 * Initialize the hash table of sock objects.
 */
	for (slot = 0; slot < __arraycount(sockhash); slot++)
		SLIST_INIT(&sockhash[slot]);
/*
 * Given a socket identifier, return a hash table slot number.
 */
sockhash_slot(sockid_t id)

	/*
	 * The idea of the shift is that a socket driver may offer multiple
	 * classes of sockets, and put the class in the higher bits.  The shift
	 * aims to prevent that all classes' first sockets end up in the same
	 * hash slot.
	 */
	return (id + (id >> 16)) % SOCKHASH_SLOTS;
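	/*
	 * Worked example (added for illustration, not in the original code):
	 * with a socket class stored in the upper 16 bits of the ID, the
	 * first sockets of three classes, 0x00000001, 0x00010001 and
	 * 0x00020001, map to slots 1, 2 and 3 respectively rather than all
	 * colliding in slot 1.
	 */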
/*
 * Obtain a sock object from the hash table using its unique identifier.
 * Return a pointer to the object if found, or NULL otherwise.
 */
sockhash_get(sockid_t id)

	slot = sockhash_slot(id);

	SLIST_FOREACH(sock, &sockhash[slot], sock_hash) {
		if (sock->sock_id == id)
/*
 * Add a sock object to the hash table.  The sock object must have a valid ID
 * in its 'sock_id' field, and must not be in the hash table already.
 */
sockhash_add(struct sock * sock)

	slot = sockhash_slot(sock->sock_id);

	SLIST_INSERT_HEAD(&sockhash[slot], sock, sock_hash);
/*
 * Remove a sock object from the hash table.  The sock object must be in the
 * hash table.
 */
sockhash_del(struct sock * sock)

	slot = sockhash_slot(sock->sock_id);

	/* This macro is O(n). */
	SLIST_REMOVE(&sockhash[slot], sock, sock, sock_hash);
/*
 * Reset a socket object to a proper initial state, with a particular socket
 * identifier, a SOCK_ type, and a socket operations table.  The socket is
 * added to the ID-to-object hash table.  This function always succeeds.
 */
sockevent_reset(struct sock * sock, sockid_t id, int domain, int type,
	const struct sockevent_ops * ops)

	assert(sock != NULL);

	memset(sock, 0, sizeof(*sock));

	sock->sock_domain = domain;
	sock->sock_type = type;

	sock->sock_slowat = 1;
	sock->sock_rlowat = 1;

	sock->sock_ops = ops;
	sock->sock_proc = NULL;
	sock->sock_select.ss_endpt = NONE;
/*
 * Initialize a new socket that will serve as an accepted socket on the given
 * listening socket 'sock'.  The new socket is given as 'newsock', and its new
 * socket identifier is given as 'newid'.  This function always succeeds.
 */
sockevent_clone(struct sock * sock, struct sock * newsock, sockid_t newid)

	sockevent_reset(newsock, newid, (int)sock->sock_domain,
	    sock->sock_type, sock->sock_ops);

	/* These are the settings that are currently inherited. */
	newsock->sock_opt = sock->sock_opt & ~SO_ACCEPTCONN;
	newsock->sock_linger = sock->sock_linger;
	newsock->sock_stimeo = sock->sock_stimeo;
	newsock->sock_rtimeo = sock->sock_rtimeo;
	newsock->sock_slowat = sock->sock_slowat;
	newsock->sock_rlowat = sock->sock_rlowat;

	newsock->sock_flags |= SFL_CLONED;
/*
 * A new socket has just been accepted.  The corresponding listening socket is
 * given as 'sock'.  The new socket has ID 'newid', and if it had not already
 * been added to the hash table through sockevent_clone() before, 'newsock' is
 * a non-NULL pointer which identifies the socket object to clone into.
 */
sockevent_accepted(struct sock * sock, struct sock * newsock, sockid_t newid)

	if (newsock == NULL) {
		if ((newsock = sockhash_get(newid)) == NULL)
			panic("libsockdriver: socket driver returned unknown "
			    "ID %d from accept callback", newid);
	} else
		sockevent_clone(sock, newsock, newid);

	assert(newsock->sock_flags & SFL_CLONED);
	newsock->sock_flags &= ~SFL_CLONED;
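	/*
	 * Recap (added comment, not in the original code): a socket driver
	 * that already ran the new object through sockevent_clone() at
	 * connection time returns only the new ID and leaves the 'newsock'
	 * pointer NULL, in which case the object is looked up by ID above;
	 * a driver that hands over a fresh object passes it in 'newsock' and
	 * it is cloned from the listening socket here.
	 */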
/*
 * Allocate a sock object, by asking the socket driver for one.  On success,
 * return OK, with a pointer to the new object stored in 'sockp'.  This new
 * object has all its fields set to initial values, in part based on the given
 * parameters.  On failure, return an error code.  Failure has two typical
 * causes: either the given domain, type, protocol combination is not
 * supported, or the socket driver is out of sockets (globally or for this
 * combination).
 */
sockevent_alloc(int domain, int type, int protocol, endpoint_t user_endpt,
	struct sock ** sockp)

	const struct sockevent_ops *ops;

	/*
	 * Verify that the given domain is sane.  Unlike the type and protocol,
	 * the domain is already verified by VFS, so we do not limit ourselves
	 * here.  The result is that we can store the domain in just a byte.
	 */
	if (domain < 0 || domain > UINT8_MAX)

	/* Make sure that the library has actually been initialized. */
	if (sockevent_socket_cb == NULL)
		panic("libsockevent: not initialized");

	/*
	 * Ask the socket driver to create a socket for the given combination
	 * of domain, type, and protocol.  If so, let it return a new sock
	 * object, a unique socket identifier for that object, and an
	 * operations table for it.
	 */
	if ((r = sockevent_socket_cb(domain, type, protocol, user_endpt, &sock,

	assert(sock != NULL);

	sockevent_reset(sock, r, domain, type, ops);
/*
 * Free a previously allocated sock object.
 */
sockevent_free(struct sock * sock)

	const struct sockevent_ops *ops;

	assert(sock->sock_proc == NULL);

	/*
	 * Invalidate the operations table on the socket, before freeing the
	 * socket.  This allows us to detect cases where sockevent functions
	 * are called on sockets that have already been freed.
	 */
	ops = sock->sock_ops;
	sock->sock_ops = NULL;

	assert(ops->sop_free != NULL);
/*
 * Create a new socket.
 */
sockevent_socket(int domain, int type, int protocol, endpoint_t user_endpt)

	if ((r = sockevent_alloc(domain, type, protocol, user_endpt,

	return sock->sock_id;
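/*
 * Summary of the call flow (added comment, not in the original code): a
 * socket(2) request arriving through libsockdriver ends up in
 * sockevent_socket(), which calls sockevent_alloc(); that in turn asks the
 * socket driver for a sock object through the callback registered at
 * initialization time, and the resulting socket identifier is returned to
 * the caller.
 */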
/*
 * Create a pair of connected sockets.
 */
sockevent_socketpair(int domain, int type, int protocol, endpoint_t user_endpt,

	struct sock *sock1, *sock2;

	if ((r = sockevent_alloc(domain, type, protocol, user_endpt,

	/* Creating socket pairs is not always supported. */
	if (sock1->sock_ops->sop_pair == NULL) {
		sockevent_free(sock1);

	if ((r = sockevent_alloc(domain, type, protocol, user_endpt,
		sockevent_free(sock1);

	assert(sock1->sock_ops == sock2->sock_ops);

	r = sock1->sock_ops->sop_pair(sock1, sock2, user_endpt);

		sockevent_free(sock2);
		sockevent_free(sock1);

	id[0] = sock1->sock_id;
	id[1] = sock2->sock_id;
/*
 * A send request returned EPIPE.  If desired, send a SIGPIPE signal to the
 * user process that issued the request.
 */
sockevent_sigpipe(struct sock * sock, endpoint_t user_endpt, int flags)

	/*
	 * POSIX says that pipe signals should be generated for SOCK_STREAM
	 * sockets.  Linux does just this, NetBSD raises signals for all socket
	 * types.
	 */
	if (sock->sock_type != SOCK_STREAM)

	/*
	 * Why would there be fewer than four ways to do the same thing?
	 * O_NOSIGPIPE, MSG_NOSIGNAL, SO_NOSIGPIPE, and of course blocking
	 * SIGPIPE.  VFS already sets MSG_NOSIGNAL for calls on sockets with
	 * O_NOSIGPIPE.  The fact that SO_NOSIGPIPE is a thing, is also the
	 * reason why we cannot let VFS handle signal generation altogether.
	 */
	if (flags & MSG_NOSIGNAL)

	if (sock->sock_opt & SO_NOSIGPIPE)

	/*
	 * Send a SIGPIPE signal to the user process.  Unfortunately we cannot
	 * guarantee that the SIGPIPE reaches the user process before the send
	 * call returns.  Usually, the scheduling priorities of system services
	 * are such that the signal is likely to arrive first anyway, but if
	 * timely arrival of the signal is required, a more fundamental change
	 * to the system would be needed.
	 */
	sys_kill(user_endpt, SIGPIPE);
/*
 * Suspend a request without data, that is, a bind, connect, accept, or close
 * request.
 */
sockevent_suspend(struct sock * sock, unsigned int event,
	const struct sockdriver_call * __restrict call, endpoint_t user_endpt)

	struct sockevent_proc *spr, **sprp;

	/* There is one slot for each process, so this should never fail. */
	if ((spr = sockevent_proc_alloc()) == NULL)
		panic("libsockevent: too many suspended processes");

	spr->spr_next = NULL;
	spr->spr_event = event;
	spr->spr_timer = FALSE;
	spr->spr_call = *call;
	spr->spr_endpt = user_endpt;

	/*
	 * Add the request to the tail of the queue.  This operation is O(n),
	 * but the number of suspended requests per socket is expected to be
	 * low.
	 */
	for (sprp = &sock->sock_proc; *sprp != NULL;
	    sprp = &(*sprp)->spr_next);
/*
 * Suspend a request with data, that is, a send or receive request.
 */
sockevent_suspend_data(struct sock * sock, unsigned int event, int timer,
	const struct sockdriver_call * __restrict call, endpoint_t user_endpt,
	const struct sockdriver_data * __restrict data, size_t len, size_t off,
	const struct sockdriver_data * __restrict ctl, socklen_t ctl_len,
	socklen_t ctl_off, int flags, int rflags, clock_t time)

	struct sockevent_proc *spr, **sprp;

	/* There is one slot for each process, so this should never fail. */
	if ((spr = sockevent_proc_alloc()) == NULL)
		panic("libsockevent: too many suspended processes");

	spr->spr_next = NULL;
	spr->spr_event = event;
	spr->spr_timer = timer;
	spr->spr_call = *call;
	spr->spr_endpt = user_endpt;
	sockdriver_pack_data(&spr->spr_data, call, data, len);
	spr->spr_datalen = len;
	spr->spr_dataoff = off;
	sockdriver_pack_data(&spr->spr_ctl, call, ctl, ctl_len);
	spr->spr_ctllen = ctl_len;
	spr->spr_ctloff = ctl_off;
	spr->spr_flags = flags;
	spr->spr_rflags = rflags;
	spr->spr_time = time;

	/*
	 * Add the request to the tail of the queue.  This operation is O(n),
	 * but the number of suspended requests per socket is expected to be
	 * low.
	 */
	for (sprp = &sock->sock_proc; *sprp != NULL;
	    sprp = &(*sprp)->spr_next);
/*
 * Return TRUE if there are any suspended requests on the given socket's queue
 * that match any of the events in the given event mask, or FALSE otherwise.
 */
sockevent_has_suspended(struct sock * sock, unsigned int mask)

	struct sockevent_proc *spr;

	for (spr = sock->sock_proc; spr != NULL; spr = spr->spr_next)
		if (spr->spr_event & mask)
/*
 * Check whether the given call is on the given socket's queue of suspended
 * requests.  If so, remove it from the queue and return a pointer to the
 * suspension data structure.  The caller is then responsible for freeing that
 * data structure using sockevent_proc_free().  If the call was not found, the
 * function returns NULL.
 */
static struct sockevent_proc *
sockevent_unsuspend(struct sock * sock, const struct sockdriver_call * call)

	struct sockevent_proc *spr, **sprp;

	/* Find the suspended request being canceled. */
	for (sprp = &sock->sock_proc; (spr = *sprp) != NULL;
	    sprp = &spr->spr_next) {
		if (spr->spr_call.sc_endpt == call->sc_endpt &&
		    spr->spr_call.sc_req == call->sc_req) {
			/* Found; remove and return it. */
			*sprp = spr->spr_next;
/*
 * Attempt to resume the given suspended request for the given socket object.
 * Return TRUE if the suspended request has been fully resumed and can be
 * removed from the queue of suspended requests, or FALSE if it has not been
 * fully resumed and should stay on the queue.  In the latter case, no
 * resumption will be attempted for other suspended requests of the same type.
 */
sockevent_resume(struct sock * sock, struct sockevent_proc * spr)

	struct sock *newsock;
	struct sockdriver_data data, ctl;
	char addr[SOCKADDR_MAX];

	switch (spr->spr_event) {
		/*
		 * If the connect call was suspended for the purpose of
		 * intercepting resumption, simply remove it from the queue.
		 */
		if (spr->spr_call.sc_endpt == NONE)

		if ((r = sock->sock_err) != OK)

		sockdriver_reply_generic(&spr->spr_call, r);

		/*
		 * A previous accept call may not have blocked on a socket that
		 * was not in listening mode.
		 */
		assert(sock->sock_opt & SO_ACCEPTCONN);

		/*
		 * This call is suspended, which implies that the call table
		 * pointer has already tested to be non-NULL.
		 */
		if ((r = sock->sock_ops->sop_accept(sock,
		    (struct sockaddr *)&addr, &addr_len, spr->spr_endpt,
		    &newsock)) == SUSPEND)

		assert(addr_len <= sizeof(addr));

		sockevent_accepted(sock, newsock, r);

		sockdriver_reply_accept(&spr->spr_call, r,
		    (struct sockaddr *)&addr, addr_len);

		if (sock->sock_err != OK || (sock->sock_flags & SFL_SHUT_WR)) {
			if (spr->spr_dataoff > 0 || spr->spr_ctloff > 0)
				r = (int)spr->spr_dataoff;
			else if ((r = sock->sock_err) != OK)

			sockdriver_unpack_data(&data, &spr->spr_call,
			    &spr->spr_data, spr->spr_datalen);
			sockdriver_unpack_data(&ctl, &spr->spr_call,
			    &spr->spr_ctl, spr->spr_ctllen);

			len = spr->spr_datalen - spr->spr_dataoff;

			min = sock->sock_slowat;

			/*
			 * As mentioned elsewhere, we do not save the address
			 * upon suspension so we cannot supply it anymore here.
			 */
			r = sock->sock_ops->sop_send(sock, &data, len,
			    &spr->spr_dataoff, &ctl,
			    spr->spr_ctllen - spr->spr_ctloff,
			    &spr->spr_ctloff, NULL, 0, spr->spr_endpt,
			    spr->spr_flags, min);

			/*
			 * If an error occurred but some data were already
			 * sent, return the progress rather than the error.
			 * Note that if the socket driver detects an
			 * asynchronous error during the send, it itself must
			 * perform this check and call sockevent_set_error() as
			 * needed, to make sure the error does not get lost.
			 */
			if (spr->spr_dataoff > 0 || spr->spr_ctloff > 0)
				r = spr->spr_dataoff;

			sockevent_sigpipe(sock, spr->spr_endpt,

		sockdriver_reply_generic(&spr->spr_call, r);

		if (sock->sock_flags & SFL_SHUT_RD)

		len = spr->spr_datalen - spr->spr_dataoff;

		if (sock->sock_err == OK) {
			min = sock->sock_rlowat;

		sockdriver_unpack_data(&data, &spr->spr_call,
		    &spr->spr_data, spr->spr_datalen);
		sockdriver_unpack_data(&ctl, &spr->spr_call,
		    &spr->spr_ctl, spr->spr_ctllen);

		r = sock->sock_ops->sop_recv(sock, &data, len,
		    &spr->spr_dataoff, &ctl,
		    spr->spr_ctllen - spr->spr_ctloff,
		    &spr->spr_ctloff, (struct sockaddr *)&addr,
		    &addr_len, spr->spr_endpt, spr->spr_flags, min,

		/*
		 * If the call remains suspended but a socket error is
		 * pending, return the pending socket error instead.
		 */
		if (sock->sock_err == OK)

		assert(addr_len <= sizeof(addr));

		/*
		 * If the receive call reported success, or if some data were
		 * already received, return the (partial) result.  Otherwise,
		 * return a pending error if any, or otherwise a regular error
		 * or EOF.
		 */
		if (r == OK || spr->spr_dataoff > 0 || spr->spr_ctloff > 0)
			r = (int)spr->spr_dataoff;
		else if (sock->sock_err != OK) {

		} else if (r == SOCKEVENT_EOF)

		sockdriver_reply_recv(&spr->spr_call, r, spr->spr_ctloff,
		    (struct sockaddr *)&addr, addr_len, spr->spr_rflags);

		sockdriver_reply_generic(&spr->spr_call, OK);

		panic("libsockevent: process suspended on unknown event 0x%x",
/*
 * Return TRUE if the given socket is ready for reading for a select call, or
 * FALSE otherwise.
 */
sockevent_test_readable(struct sock * sock)

	/*
	 * The meaning of "ready-to-read" depends on whether the socket is a
	 * listening socket or not.  For the former, it is a test on whether
	 * there are any new sockets to accept.  However, shutdown flags take
	 * precedence in both cases.
	 */
	if (sock->sock_flags & SFL_SHUT_RD)

	if (sock->sock_err != OK)

	/*
	 * Depending on whether this is a listening-mode socket, test whether
	 * either accepts or receives would block.
	 */
	if (sock->sock_opt & SO_ACCEPTCONN) {
		if (sock->sock_ops->sop_test_accept == NULL)

		r = sock->sock_ops->sop_test_accept(sock);

		if (sock->sock_ops->sop_test_recv == NULL)

		r = sock->sock_ops->sop_test_recv(sock, sock->sock_rlowat,

	return (r != SUSPEND);
/*
 * Return TRUE if the given socket is ready for writing for a select call, or
 * FALSE otherwise.
 */
sockevent_test_writable(struct sock * sock)

	if (sock->sock_err != OK)

	if (sock->sock_flags & SFL_SHUT_WR)

	if (sock->sock_ops->sop_test_send == NULL)

	/*
	 * Test whether sends would block.  The low send watermark is relevant
	 * for stream-type sockets only.
	 */
	r = sock->sock_ops->sop_test_send(sock, sock->sock_slowat);

	return (r != SUSPEND);
/*
 * Test whether any of the given select operations are ready on the given
 * socket.  Return the subset of ready operations; zero if none.
 */
sockevent_test_select(struct sock * sock, unsigned int ops)

	unsigned int ready_ops;

	assert(!(ops & ~(SDEV_OP_RD | SDEV_OP_WR | SDEV_OP_ERR)));

	/*
	 * We do not support the "bind in progress" case here.  If a blocking
	 * bind call is in progress, the file descriptor should not be ready
	 * for either reading or writing.  Currently, socket drivers will have
	 * to cover this case themselves.  Otherwise we would have to check the
	 * queue of suspended calls, or create a custom flag for this.
	 */

	if ((ops & SDEV_OP_RD) && sockevent_test_readable(sock))
		ready_ops |= SDEV_OP_RD;

	if ((ops & SDEV_OP_WR) && sockevent_test_writable(sock))
		ready_ops |= SDEV_OP_WR;

	/* TODO: OOB receive support. */
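	/*
	 * Illustrative use (added comment; an assumption about typical use,
	 * not from the original code): a select query for both readability
	 * and writability would be tested as
	 *
	 *	ready = sockevent_test_select(sock, SDEV_OP_RD | SDEV_OP_WR);
	 *
	 * after which 'ready' holds the subset of those operations that would
	 * not block, or zero if neither is ready.
	 */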
/*
 * Fire the given mask of events on the given socket object now.
 */
sockevent_fire(struct sock * sock, unsigned int mask)

	struct sockevent_proc *spr, **sprp;
	unsigned int r, flag, ops;

	/*
	 * A completed connection attempt (successful or not) also always
	 * implies that the socket becomes writable.  For convenience we
	 * enforce this rule here, because it is easy to forget.  Note that in
	 * any case, a suspended connect request should be the first in the
	 * list, so we do not risk returning 0 from a connect call as a result
	 * of sock_err getting eaten by another resumed call.
	 */
	if (mask & SEV_CONNECT)

	/*
	 * First try resuming regular system calls.
	 */
	for (sprp = &sock->sock_proc; (spr = *sprp) != NULL; ) {
		flag = spr->spr_event;

		if ((mask & flag) && sockevent_resume(sock, spr)) {
			*sprp = spr->spr_next;

			sockevent_proc_free(spr);

			sprp = &spr->spr_next;

	/*
	 * Then see if we can satisfy pending select queries.
	 */
	if ((mask & (SEV_ACCEPT | SEV_SEND | SEV_RECV)) &&
	    sock->sock_select.ss_endpt != NONE) {
		assert(sock->sock_selops != 0);

		/*
		 * Only retest select operations that, based on the given event
		 * mask, could possibly be satisfied now.
		 */
		ops = sock->sock_selops;
		if (!(mask & (SEV_ACCEPT | SEV_RECV)))
		if (!(mask & SEV_SEND))
		if (!(0)) /* TODO: OOB receive support */

		/* Are there any operations to test? */

		/* Test those operations. */
		r = sockevent_test_select(sock, ops);

		/* Were any satisfied? */

		/* Let the caller know. */
		sockdriver_reply_select(&sock->sock_select,

		sock->sock_selops &= ~r;

		/* Are there any saved operations left now? */
		if (sock->sock_selops == 0)
			sock->sock_select.ss_endpt = NONE;

	/*
	 * Finally, a SEV_CLOSE event unconditionally frees the sock object.
	 * This event should be fired only for sockets that are either not yet,
	 * or not anymore, in use by userland.
	 */
	if (mask & SEV_CLOSE) {
		assert(sock->sock_flags & (SFL_CLONED | SFL_CLOSING));

		sockevent_free(sock);
/*
 * Process all pending events.  Events must still be blocked, so that if
 * handling one event generates a new event, that event is handled from here
 * rather than immediately.
 */

	assert(sockevent_working);

	while (!SIMPLEQ_EMPTY(&sockevent_pending)) {
		sock = SIMPLEQ_FIRST(&sockevent_pending);
		SIMPLEQ_REMOVE_HEAD(&sockevent_pending, sock_next);

		mask = sock->sock_events;

		sock->sock_events = 0;

		sockevent_fire(sock, mask);
		/*
		 * At this point, the sock object may already have been readded
		 * to the event list, or even be deallocated altogether.
		 */
/*
 * Return TRUE if any events are pending on any sockets, or FALSE otherwise.
 */
sockevent_has_events(void)

	return (!SIMPLEQ_EMPTY(&sockevent_pending));
/*
 * Raise the given bitwise-OR'ed set of events on the given socket object.
 * Depending on the context of the call, the events may or may not be
 * processed immediately.
 */
sockevent_raise(struct sock * sock, unsigned int mask)

	assert(sock->sock_ops != NULL);

	/*
	 * Handle SEV_CLOSE first.  This event must not be deferred, so as to
	 * let socket drivers recycle sock objects as they are needed.  For
	 * example, a user-closed TCP socket may stay open to transmit the
	 * remainder of its send buffer, until the TCP driver runs out of
	 * sockets, in which case the connection is aborted.  The driver would
	 * then raise SEV_CLOSE on the sock object so as to clean it up, and
	 * immediately reuse it afterward.  If the close event were to be
	 * deferred, this immediate reuse would not be possible.
	 *
	 * The sop_free() callback routine may not raise new events, and thus,
	 * the state of 'sockevent_working' need not be checked or set here.
	 */
	if (mask & SEV_CLOSE) {
		assert(mask == SEV_CLOSE);

		sockevent_fire(sock, mask);

	/*
	 * If we are currently processing a socket message, store the event for
	 * later.  If not, this call is not coming from inside libsockevent,
	 * and we must handle the event immediately.
	 */
	if (sockevent_working) {
		assert(mask <= UCHAR_MAX);	/* sock_events field size check */

		if (sock->sock_events == 0)
			SIMPLEQ_INSERT_TAIL(&sockevent_pending, sock,

		sock->sock_events |= mask;

		sockevent_working = TRUE;

		sockevent_fire(sock, mask);

		if (sockevent_has_events())

		sockevent_working = FALSE;
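/*
 * Illustrative use (added comment, a sketch rather than part of the original
 * code): a socket driver that has just queued incoming data on a connection
 * would typically call
 *
 *	sockevent_raise(sock, SEV_RECV);
 *
 * which either handles the event immediately or, if the library is already
 * processing a socket message, defers it until that work completes.
 */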
/*
 * Set a pending error on the socket object, and wake up any suspended
 * operations that are affected by this.
 */
sockevent_set_error(struct sock * sock, int err)

	assert(sock->sock_ops != NULL);

	/* If an error was set already, it will be overridden. */
	sock->sock_err = err;

	sockevent_raise(sock, SEV_BIND | SEV_CONNECT | SEV_SEND | SEV_RECV);
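/*
 * Illustrative use (added comment, a sketch rather than part of the original
 * code): a connection-oriented driver that detects a connection reset could
 * call
 *
 *	sockevent_set_error(sock, ECONNRESET);
 *
 * so that any suspended bind, connect, send, or receive calls on the socket
 * are woken up and the pending error is eventually reported to userland.
 */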
/*
 * Initialize timer-related data structures.
 */

	SLIST_INIT(&socktimer);

	init_timer(&sockevent_timer);
/*
 * Check whether the given socket object has any suspended requests that have
 * now expired.  If so, cancel them.  Also, if the socket object has any
 * suspended requests with a timeout that has not yet expired, return the
 * earliest (relative) timeout of all of them, or TMR_NEVER if no such requests
 * are left.
 */
sockevent_expire(struct sock * sock, clock_t now)

	struct sockevent_proc *spr, **sprp;
	clock_t lowest, left;

	/*
	 * First handle the case that the socket is closed.  In this case,
	 * there may be a linger timer, although the socket may also simply
	 * still be on the timer list because of a request that did not time
	 * out right before the socket was closed.
	 */
	if (sock->sock_flags & SFL_CLOSING) {
		/* Was there a linger timer and has it expired? */
		if ((sock->sock_opt & SO_LINGER) &&
		    tmr_is_first(sock->sock_linger, now)) {
			assert(sock->sock_ops->sop_close != NULL);

			/*
			 * Whatever happens next, we must now resume the
			 * pending close operation, if it was not canceled
			 * earlier.  As before, we return OK rather than the
			 * standardized EWOULDBLOCK, to ensure that the user
			 * process knows the file descriptor has been closed.
			 */
			if ((spr = sock->sock_proc) != NULL) {
				assert(spr->spr_event == SEV_CLOSE);
				assert(spr->spr_next == NULL);

				sock->sock_proc = NULL;

				sockdriver_reply_generic(&spr->spr_call, OK);

				sockevent_proc_free(spr);

			/*
			 * Tell the socket driver that closing the socket is
			 * now a bit more desired than the last time we asked.
			 */
			r = sock->sock_ops->sop_close(sock, TRUE /*force*/);

			assert(r == OK || r == SUSPEND);

			/*
			 * The linger timer fires once.  After that, the socket
			 * driver is free to decide that it still will not
			 * close the socket.  If it does, do not fire the
			 * linger timer again.
			 */
			sock->sock_opt &= ~SO_LINGER;

			sockevent_free(sock);

	/*
	 * Then see if any send and/or receive requests have expired.  Also see
	 * if there are any send and/or receive requests left that have not yet
	 * expired but do have a timeout, so that we can return the lowest of
	 * them.
	 */
	for (sprp = &sock->sock_proc; (spr = *sprp) != NULL; ) {
		/* Skip requests without a timeout. */
		if (spr->spr_timer == 0) {
			sprp = &spr->spr_next;

		assert(spr->spr_event == SEV_SEND ||
		    spr->spr_event == SEV_RECV);

		/*
		 * If the request has expired, cancel it and remove it from the
		 * list.  Otherwise, see if the request has the lowest number
		 * of ticks until its timeout so far.
		 */
		if (tmr_is_first(spr->spr_time, now)) {
			*sprp = spr->spr_next;

			if (spr->spr_event == SEV_SEND)
				sockevent_cancel_send(sock, spr, EWOULDBLOCK);
			else
				sockevent_cancel_recv(sock, spr, EWOULDBLOCK);

			sockevent_proc_free(spr);
		} else {
			left = spr->spr_time - now;

			if (lowest == TMR_NEVER || lowest > left)

			sprp = &spr->spr_next;
/*
 * The socket event alarm went off.  Go through the set of socket objects with
 * timers, and see if any of their requests have now expired.  Set a new alarm
 * if necessary.
 */
socktimer_expire(int arg __unused)

	SLIST_HEAD(, sock) oldtimer;
	struct sock *sock, *tsock;
	clock_t now, lowest, left;

	/*
	 * This function may or may not be called from a context where we are
	 * already deferring events, so we have to cover both cases here.
	 */
	if ((working = sockevent_working) == FALSE)
		sockevent_working = TRUE;

	/* Start a new list. */
	memcpy(&oldtimer, &socktimer, sizeof(oldtimer));
	SLIST_INIT(&socktimer);

	/*
	 * Go through all sockets that have or had a request with a timeout,
	 * canceling any expired requests and building a new list of sockets
	 * that still have requests with timeouts as we go.
	 */
	SLIST_FOREACH_SAFE(sock, &oldtimer, sock_timer, tsock) {
		assert(sock->sock_flags & SFL_TIMER);
		sock->sock_flags &= ~SFL_TIMER;

		left = sockevent_expire(sock, now);
		/*
		 * The sock object may already have been deallocated now.
		 * If 'left' is TMR_NEVER, do not touch 'sock' anymore.
		 */
		if (left != TMR_NEVER) {
			if (lowest == TMR_NEVER || lowest > left)

			SLIST_INSERT_HEAD(&socktimer, sock, sock_timer);

			sock->sock_flags |= SFL_TIMER;

	/* If there is a new lowest timeout at all, set a new timer. */
	if (lowest != TMR_NEVER)
		set_timer(&sockevent_timer, lowest, socktimer_expire, 0);

	/* If any new events were raised, process them now. */
	if (sockevent_has_events())

	sockevent_working = FALSE;
/*
 * Set a timer for the given (relative) number of clock ticks, adding the
 * associated socket object to the set of socket objects with timers, if it was
 * not already in that set.  Set a new alarm if necessary, and return the
 * absolute timeout for the timer.  Since the timers list is maintained lazily,
 * the caller need not take the object off the set if the call was canceled
 * later; see also socktimer_del().
 */
socktimer_add(struct sock * sock, clock_t ticks)

	/*
	 * Relative time comparisons require that any two times are no more
	 * than half the comparison space (clock_t, unsigned long) apart.
	 */
	assert(ticks <= TMRDIFF_MAX);

	/* If the socket was not already on the timers list, put it on. */
	if (!(sock->sock_flags & SFL_TIMER)) {
		SLIST_INSERT_HEAD(&socktimer, sock, sock_timer);

		sock->sock_flags |= SFL_TIMER;

	/*
	 * (Re)set the timer if either it was not running at all or this new
	 * timeout will occur sooner than the currently scheduled alarm.  Note
	 * that setting a timer that was already set is allowed.
	 */
	if (!tmr_is_set(&sockevent_timer) ||
	    tmr_is_first(now + ticks, tmr_exp_time(&sockevent_timer)))
		set_timer(&sockevent_timer, ticks, socktimer_expire, 0);

	/* Return the absolute timeout. */
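	/*
	 * Worked example (added for illustration, assuming a 100Hz system
	 * clock): a 5-second receive timeout is passed in as ticks=500; if
	 * 'now' is 12000, the absolute timeout returned here is 12500, which
	 * is what sockevent_expire() later compares against the current time.
	 */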
/*
 * Remove a socket object from the set of socket objects with timers.  Since
 * the timer list is maintained lazily, this needs to be done only right before
 * the socket object is freed.
 */
socktimer_del(struct sock * sock)

	if (sock->sock_flags & SFL_TIMER) {
		/* This macro is O(n). */
		SLIST_REMOVE(&socktimer, sock, sock, sock_timer);

		sock->sock_flags &= ~SFL_TIMER;
/*
 * Bind a socket to a local address.
 */
sockevent_bind(sockid_t id, const struct sockaddr * __restrict addr,
	socklen_t addr_len, endpoint_t user_endpt,
	const struct sockdriver_call * __restrict call)

	if ((sock = sockhash_get(id)) == NULL)

	if (sock->sock_ops->sop_bind == NULL)

	/* Binding a socket in listening mode is never supported. */
	if (sock->sock_opt & SO_ACCEPTCONN)

	r = sock->sock_ops->sop_bind(sock, addr, addr_len, user_endpt);

		sockevent_suspend(sock, SEV_BIND, call, user_endpt);
/*
 * Connect a socket to a remote address.
 */
sockevent_connect(sockid_t id, const struct sockaddr * __restrict addr,
	socklen_t addr_len, endpoint_t user_endpt,
	const struct sockdriver_call * call)

	struct sockdriver_call fakecall;
	struct sockevent_proc *spr;

	if ((sock = sockhash_get(id)) == NULL)

	if (sock->sock_ops->sop_connect == NULL)

	/* Connecting a socket in listening mode is never supported. */
	if (sock->sock_opt & SO_ACCEPTCONN)

	/*
	 * The upcoming connect call may fire an accept event for which the
	 * handler may in turn fire a connect event on this socket.  Since we
	 * delay event processing until after processing calls, this would
	 * create the problem that even if the connection is accepted right
	 * away, non-blocking connect requests would return EINPROGRESS.  For
	 * UDS, this is undesirable behavior.  To remedy this, we use a hack:
	 * we temporarily suspend the connect even if non-blocking, then
	 * process events, and then cancel the connect request again.  If the
	 * connection was accepted immediately, the cancellation will have no
	 * effect, since the request has already been replied to.  In order not
	 * to violate libsockdriver rules with this hack, we fabricate a fake
	 * call structure.
	 */
	r = sock->sock_ops->sop_connect(sock, addr, addr_len, user_endpt);

	if (call != NULL || sockevent_has_events()) {
			fakecall.sc_endpt = NONE;

		assert(!sockevent_has_suspended(sock,
		    SEV_SEND | SEV_RECV));

		sockevent_suspend(sock, SEV_CONNECT, call, user_endpt);

		if (call == &fakecall) {
			/* Process any pending events first now. */

			/*
			 * If the connect request has not been resumed
			 * yet now, we must remove it from the queue
			 * again, and return EINPROGRESS ourselves.
			 * Otherwise, return OK or a pending error.
			 */
			spr = sockevent_unsuspend(sock, call);

				sockevent_proc_free(spr);

	} else if ((r = sock->sock_err) != OK)
		sock->sock_err = OK;

		/*
		 * A completed connection attempt also always implies that the
		 * socket becomes writable.  For convenience we enforce this
		 * rule here, because it is easy to forget.
		 */
		sockevent_raise(sock, SEV_SEND);
/*
 * Put a socket in listening mode.
 */
sockevent_listen(sockid_t id, int backlog)

	if ((sock = sockhash_get(id)) == NULL)

	if (sock->sock_ops->sop_listen == NULL)

	/*
	 * Perform a general adjustment on the backlog value, applying the
	 * customary BSD "fudge factor" of 1.5x.  Keep the value within bounds
	 * though.  POSIX imposes that a negative backlog value is equal to a
	 * backlog value of zero.  A backlog value of zero, in turn, may mean
	 * anything; we take it to be one.  POSIX also imposes that all socket
	 * drivers accept up to at least SOMAXCONN connections on the queue.
	 */
	if (backlog < SOMAXCONN)
		backlog += 1 + ((unsigned int)backlog >> 1);
	if (backlog > SOMAXCONN)
		backlog = SOMAXCONN;
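	/*
	 * Worked example (added for illustration, not in the original code):
	 * with the adjustment above, a backlog of 0 becomes 0 + 1 + 0 = 1 and
	 * a backlog of 16 becomes 16 + 1 + 8 = 25; any value that ends up
	 * above SOMAXCONN is clamped to SOMAXCONN.
	 */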
	r = sock->sock_ops->sop_listen(sock, backlog);

	/*
	 * On success, the socket is now in listening mode.  As part of that,
	 * a select(2) ready-to-read condition now indicates that a connection
	 * may be accepted on the socket, rather than that data may be read.
	 * Since libsockevent is responsible for this distinction, we keep
	 * track of the listening mode at this level.  Conveniently, there is a
	 * socket option for this, which we support out of the box as a result.
	 */
	sock->sock_opt |= SO_ACCEPTCONN;

	/*
	 * For the extremely unlikely case that right after the socket
	 * is put into listening mode, it has a connection ready to
	 * accept, we retest blocked ready-to-read select queries now.
	 */
	sockevent_raise(sock, SEV_ACCEPT);
/*
 * Accept a connection on a listening socket, creating a new socket.
 */
sockevent_accept(sockid_t id, struct sockaddr * __restrict addr,
	socklen_t * __restrict addr_len, endpoint_t user_endpt,
	const struct sockdriver_call * __restrict call)

	struct sock *sock, *newsock;

	if ((sock = sockhash_get(id)) == NULL)

	if (sock->sock_ops->sop_accept == NULL)

	/*
	 * Attempt to accept a connection.  The socket driver is responsible
	 * for allocating a sock object (and identifier) on success.  It may
	 * already have done so before, in which case it should leave newsock
	 * filled with NULL; otherwise, the returned sock object is cloned from
	 * the listening socket.  The socket driver is also responsible for
	 * failing the call if the socket is not in listening mode, because it
	 * must specify the error to return: EOPNOTSUPP or EINVAL.
	 */
	if ((r = sock->sock_ops->sop_accept(sock, addr, addr_len, user_endpt,
	    &newsock)) == SUSPEND) {
		assert(sock->sock_opt & SO_ACCEPTCONN);

		sockevent_suspend(sock, SEV_ACCEPT, call, user_endpt);

	sockevent_accepted(sock, newsock, r);
/*
 * Send regular and/or control data.
 */
sockevent_send(sockid_t id, const struct sockdriver_data * __restrict data,
	size_t len, const struct sockdriver_data * __restrict ctl_data,
	socklen_t ctl_len, const struct sockaddr * __restrict addr,
	socklen_t addr_len, endpoint_t user_endpt, int flags,
	const struct sockdriver_call * __restrict call)

	if ((sock = sockhash_get(id)) == NULL)

	/*
	 * The order of the following checks is not necessarily fixed, and may
	 * be changed later.  As far as applicable, they should match the order
	 * of the checks during call resumption, though.
	 */
	if ((r = sock->sock_err) != OK) {
		sock->sock_err = OK;

	if (sock->sock_flags & SFL_SHUT_WR) {
		sockevent_sigpipe(sock, user_endpt, flags);

	/*
	 * Translate the sticky SO_DONTROUTE option to a per-request
	 * MSG_DONTROUTE flag.  This achieves two purposes: socket drivers have
	 * to check only one flag, and socket drivers that do not support the
	 * flag will fail send requests in a consistent way.
	 */
	if (sock->sock_opt & SO_DONTROUTE)
		flags |= MSG_DONTROUTE;

	/*
	 * Check if this is a valid send request as far as the socket driver is
	 * concerned.  We do this separately from sop_send for the reason that
	 * this send request may immediately be queued behind other pending
	 * send requests (without a call to sop_send), which means even invalid
	 * requests would be queued and not return failure until much later.
	 */
	if (sock->sock_ops->sop_pre_send != NULL &&
	    (r = sock->sock_ops->sop_pre_send(sock, len, ctl_len, addr,
	    addr_len, user_endpt,
	    flags & ~(MSG_DONTWAIT | MSG_NOSIGNAL))) != OK)

	if (sock->sock_ops->sop_send == NULL)

	/*
	 * Sending out-of-band data is treated differently from regular data:
	 *
	 * - sop_send is called immediately, even if a partial non-OOB send
	 *   operation is currently suspended (TODO: it may have to be aborted
	 *   in order to maintain atomicity guarantees - that should be easy);
	 * - sop_send must not return SUSPEND; instead, if it cannot process
	 *   the OOB data immediately, it must return an appropriate error;
	 * - the send low watermark is ignored.
	 *
	 * Given that none of the current socket drivers support OOB data at
	 * all, more sophisticated approaches would have no added value now.
	 */
	if (flags & MSG_OOB) {
		r = sock->sock_ops->sop_send(sock, data, len, &off, ctl_data,
		    ctl_len, &ctl_off, addr, addr_len, user_endpt, flags, 0);

			panic("libsockevent: MSG_OOB send calls may not be "

		return (r == OK) ? (int)off : r;

	/*
	 * Only call the actual sop_send function now if no other send calls
	 * are suspended already.
	 *
	 * Call sop_send with 'min' set to the minimum of the request size and
	 * the socket's send low water mark, but only if the call is non-
	 * blocking.  For stream-oriented sockets, this should have the effect
	 * that non-blocking calls fail with EWOULDBLOCK if not at least that
	 * much can be sent immediately.  For consistency, we choose to apply
	 * the same threshold to blocking calls.  For datagram-oriented
	 * sockets, the minimum is not a factor to be considered.
	 */
	if (!sockevent_has_suspended(sock, SEV_SEND)) {
		min = sock->sock_slowat;

		r = sock->sock_ops->sop_send(sock, data, len, &off, ctl_data,
		    ctl_len, &ctl_off, addr, addr_len, user_endpt, flags, min);

		/*
		 * We do not store the target's address on suspension, because
		 * that would add significantly to the per-process suspension
		 * state.  As a result, we disallow socket drivers from
		 * suspending send calls with addresses, because we would no
		 * longer have the address for proper call resumption.
		 * However, we do not know here whether the socket is in
		 * connection-oriented mode; if it is, the address is to be
		 * ignored altogether.  Therefore, there is no test on 'addr'
		 * here.  Resumed calls will get a NULL address pointer, and
		 * the socket driver is expected to do the right thing.
		 */

		/*
		 * For non-blocking socket calls, return an error only if we
		 * were not able to send anything at all.  If only control data
		 * were sent, the return value is therefore zero.
		 */
		if (sock->sock_stimeo != 0) {
			time = socktimer_add(sock, sock->sock_stimeo);

			sockevent_suspend_data(sock, SEV_SEND, timer, call,
			    user_endpt, data, len, off, ctl_data, ctl_len,
			    ctl_off, flags, 0, time);

			r = (off > 0 || ctl_off > 0) ? OK : EWOULDBLOCK;
	} else if (r == EPIPE)
		sockevent_sigpipe(sock, user_endpt, flags);

	return (r == OK) ? (int)off : r;
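	/*
	 * Illustrative note on the return value (added comment, not in the
	 * original code): a non-blocking send of 1024 bytes of which the
	 * driver immediately accepts only 512 returns 512 to the caller;
	 * only when no regular or control data could be sent at all does the
	 * call fail with EWOULDBLOCK.
	 */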
/*
 * The inner part of the receive request handler.  An error returned from here
 * may be overridden by an error pending on the socket, although data returned
 * from here trumps such pending errors.
 */
sockevent_recv_inner(struct sock * sock,
	const struct sockdriver_data * __restrict data,
	size_t len, size_t * __restrict off,
	const struct sockdriver_data * __restrict ctl_data,
	socklen_t ctl_len, socklen_t * __restrict ctl_off,
	struct sockaddr * __restrict addr,
	socklen_t * __restrict addr_len, endpoint_t user_endpt,
	int * __restrict flags, const struct sockdriver_call * __restrict call)

	int r, oob, inflags, timer;

	/*
	 * Check if this is a valid receive request as far as the socket driver
	 * is concerned.  We do this separately from sop_recv for the reason
	 * that this receive request may immediately be queued behind other
	 * pending receive requests (without a call to sop_recv), which means
	 * even invalid requests would be queued and not return failure until
	 * much later.
	 */
	if (sock->sock_ops->sop_pre_recv != NULL &&
	    (r = sock->sock_ops->sop_pre_recv(sock, user_endpt,
	    inflags & ~(MSG_DONTWAIT | MSG_NOSIGNAL))) != OK)

	/*
	 * The order of the following checks is not necessarily fixed, and may
	 * be changed later.  As far as applicable, they should match the order
	 * of the checks during call resumption, though.
	 */
	if (sock->sock_flags & SFL_SHUT_RD)
		return SOCKEVENT_EOF;

	if (sock->sock_ops->sop_recv == NULL)

	/*
	 * Receiving out-of-band data is treated differently from regular data:
	 *
	 * - sop_recv is called immediately, even if a partial non-OOB receive
	 *   operation is currently suspended (TODO: it may have to be aborted
	 *   in order to maintain atomicity guarantees - that should be easy);
	 * - sop_recv must not return SUSPEND; instead, if it cannot return any
	 *   of the OOB data immediately, it must return an appropriate error;
	 * - the receive low watermark is ignored.
	 *
	 * Given that none of the current socket drivers support OOB data at
	 * all, more sophisticated approaches would have no added value now.
	 */
	oob = (inflags & MSG_OOB);

	if (oob && (sock->sock_opt & SO_OOBINLINE))

	/*
	 * Only call the actual sop_recv function now if no other receive
	 * calls are suspended already.
	 *
	 * Call sop_recv with 'min' set to the minimum of the request size and
	 * the socket's receive low water mark, unless there is a pending
	 * error.  As a result, blocking calls will block, and non-blocking
	 * calls will yield EWOULDBLOCK, if not at least that much can be
	 * received, unless another condition (EOF or that pending error)
	 * prevents more from being received anyway.  For datagram-oriented
	 * sockets, the minimum is not a factor to be considered.
	 */
	if (oob || !sockevent_has_suspended(sock, SEV_RECV)) {
		if (!oob && sock->sock_err == OK) {
			min = sock->sock_rlowat;

			min = 0;	/* receive even no-data segments */

		r = sock->sock_ops->sop_recv(sock, data, len, off, ctl_data,
		    ctl_len, ctl_off, addr, addr_len, user_endpt, inflags, min,

	assert(r <= 0 || r == SOCKEVENT_EOF);

		panic("libsockevent: MSG_OOB receive calls may not be "

	/*
	 * For non-blocking socket calls, return EWOULDBLOCK only if we
	 * did not receive anything at all.  If only control data were
	 * received, the return value is therefore zero.  Suspension
	 * implies that there is nothing to read.  For the purpose of
	 * the calling wrapper function, never suspend a call when
	 * there is a pending error.
	 */
	if (call != NULL && sock->sock_err == OK) {
		if (sock->sock_rtimeo != 0) {
			time = socktimer_add(sock, sock->sock_rtimeo);

		sockevent_suspend_data(sock, SEV_RECV, timer, call,
		    user_endpt, data, len, *off, ctl_data,
		    ctl_len, *ctl_off, inflags, *flags, time);
/*
 * Receive regular and/or control data.
 */
sockevent_recv(sockid_t id, const struct sockdriver_data * __restrict data,
	size_t len, const struct sockdriver_data * __restrict ctl_data,
	socklen_t * __restrict ctl_len, struct sockaddr * __restrict addr,
	socklen_t * __restrict addr_len, endpoint_t user_endpt,
	int * __restrict flags, const struct sockdriver_call * __restrict call)

	socklen_t ctl_inlen;

	if ((sock = sockhash_get(id)) == NULL)

	/*
	 * This function is a wrapper around the actual receive functionality.
	 * The reason for this is that receiving data should take precedence
	 * over a pending socket error, while a pending socket error should
	 * take precedence over both regular errors as well as EOF.  In other
	 * words: if there is a pending error, we must try to receive anything
	 * at all; if receiving does not work, we must fail the call with the
	 * pending error.  However, until we call the receive callback, we have
	 * no way of telling whether any data can be received.  So we must try
	 * that before we can decide whether to return a pending error.
	 */
	ctl_inlen = *ctl_len;

	/*
	 * Attempt to perform the actual receive call.
	 */
	r = sockevent_recv_inner(sock, data, len, &off, ctl_data, ctl_inlen,
	    ctl_len, addr, addr_len, user_endpt, flags, call);

	/*
	 * If the receive request succeeded, or it failed but yielded a partial
	 * result, then return the (partial) result.  Otherwise, if an error is
	 * pending, return that error.  Otherwise, return either a regular
	 * error or 0 for EOF.
	 */
	if (r == OK || (r != SUSPEND && (off > 0 || *ctl_len > 0)))
	else if (sock->sock_err != OK) {
		assert(r != SUSPEND);

		sock->sock_err = OK;
	} else if (r == SOCKEVENT_EOF)
/*
 * Process an I/O control call.
 */
sockevent_ioctl(sockid_t id, unsigned long request,
	const struct sockdriver_data * __restrict data, endpoint_t user_endpt,
	const struct sockdriver_call * __restrict call __unused)

	if ((sock = sockhash_get(id)) == NULL)

	/* We handle a very small subset of generic IOCTLs here. */
		if (!(sock->sock_flags & SFL_SHUT_RD) &&
		    sock->sock_ops->sop_test_recv != NULL)
			(void)sock->sock_ops->sop_test_recv(sock, 0, &size);

		return sockdriver_copyout(data, 0, &val, sizeof(val));

	if (sock->sock_ops->sop_ioctl == NULL)

	r = sock->sock_ops->sop_ioctl(sock, request, data, user_endpt);

	/*
	 * Suspending IOCTL requests is not currently supported by this
	 * library, even though the VFS protocol and libsockdriver do support
	 * it.  The reason is that IOCTLs do not match our process suspension
	 * model: they could be neither queued nor repeated.  For now, it seems
	 * that this feature is not needed by the socket drivers either.  Thus,
	 * even though there are possible solutions, we defer implementing them
	 * until we know what exactly is needed.
	 */
		panic("libsockevent: socket driver suspended IOCTL 0x%lx",
/*
 * Set socket options.
 */
sockevent_setsockopt(sockid_t id, int level, int name,
	const struct sockdriver_data * data, socklen_t len)

	struct linger linger;
	clock_t secs, ticks;

	if ((sock = sockhash_get(id)) == NULL)

	if (level == SOL_SOCKET) {
		/*
		 * Handle a subset of the socket-level options here.  For most
		 * of them, this means that the socket driver itself need not
		 * handle changing or returning the options, but still needs to
		 * implement the correct behavior based on them where needed.
		 * A few of them are handled exclusively in this library:
		 * SO_ACCEPTCONN, SO_NOSIGPIPE, SO_ERROR, SO_TYPE, SO_LINGER,
		 * SO_SNDLOWAT, SO_RCVLOWAT, SO_SNDTIMEO, and SO_RCVTIMEO.
		 * The SO_USELOOPBACK option is explicitly absent, as it is
		 * valid for routing sockets only and is set by default there.
		 */

			/*
			 * Simple on-off options.  Changing them does not
			 * involve the socket driver.
			 */
			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),

				sock->sock_opt |= (unsigned int)name;
			else
				sock->sock_opt &= ~(unsigned int)name;

			/*
			 * In principle these on-off options are maintained in
			 * this library, but some socket drivers may need to
			 * apply the options elsewhere, so we notify them that
			 * something has changed.  Using the sop_setsockopt
			 * callback would be inconvenient for this for two
			 * reasons: multiple value copy-ins and default errors.
			 */
			if (sock->sock_ops->sop_setsockmask != NULL)
				sock->sock_ops->sop_setsockmask(sock,

			/*
			 * The inlining of OOB data may make new data available
			 * through regular receive calls.  Thus, see if we can
			 * wake up any suspended receive calls now.
			 */
			if (name == SO_OOBINLINE && val)
				sockevent_raise(sock, SEV_RECV);

			/* The only on-off option with an associated value. */
			if ((r = sockdriver_copyin_opt(data, &linger,
			    sizeof(linger), len)) != OK)

			if (linger.l_onoff) {
				if (linger.l_linger < 0)

				/* EDOM is the closest applicable error.. */
				secs = (clock_t)linger.l_linger;
				if (secs >= TMRDIFF_MAX / sys_hz())

				sock->sock_opt |= SO_LINGER;
				sock->sock_linger = secs * sys_hz();
			} else {
				sock->sock_opt &= ~SO_LINGER;
				sock->sock_linger = 0;
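			/*
			 * Worked example (added for illustration, assuming a
			 * 100Hz system clock): an l_linger value of 5 seconds
			 * is stored as 5 * sys_hz() = 500 ticks in
			 * sock_linger; switching the option off clears both
			 * the flag and the stored value.
			 */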
			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),

			/*
			 * Setting these values may allow suspended operations
			 * (send, recv, select) to be resumed, so recheck.
			 */
			if (name == SO_SNDLOWAT) {
				sock->sock_slowat = (size_t)val;

				sockevent_raise(sock, SEV_SEND);
			} else {
				sock->sock_rlowat = (size_t)val;

				sockevent_raise(sock, SEV_RECV);
			}

			if ((r = sockdriver_copyin_opt(data, &tv, sizeof(tv),

			if (tv.tv_sec < 0 || tv.tv_usec < 0 ||
			    (unsigned long)tv.tv_usec >= US)

			if (tv.tv_sec >= TMRDIFF_MAX / sys_hz())

			ticks = tv.tv_sec * sys_hz() +
			    (tv.tv_usec * sys_hz() + US - 1) / US;

			if (name == SO_SNDTIMEO)
				sock->sock_stimeo = ticks;
			else
				sock->sock_rtimeo = ticks;

			/*
			 * The timeouts for any calls already in progress for
			 * this socket are left as is.
			 */
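			/*
			 * Worked example (added for illustration, assuming a
			 * 100Hz system clock): a timeout of { tv_sec = 2,
			 * tv_usec = 500000 } converts to 2 * 100 +
			 * (500000 * 100 + 999999) / 1000000 = 250 ticks, with
			 * the microseconds part rounded upward.
			 */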
			/* These options may be retrieved but not set. */

			/*
			 * The remaining options either cannot be handled in a
			 * generic way, or are not recognized altogether.  Pass
			 * them to the socket driver, which should handle what
			 * it knows and reject the rest.
			 */

	if (sock->sock_ops->sop_setsockopt == NULL)

	/*
	 * The socket driver must return ENOPROTOOPT for all options it does
	 * not support.
	 */
	return sock->sock_ops->sop_setsockopt(sock, level, name, data, len);
/*
 * Retrieve socket options.
 */
sockevent_getsockopt(sockid_t id, int level, int name,
	const struct sockdriver_data * __restrict data,
	socklen_t * __restrict len)

	struct linger linger;

	if ((sock = sockhash_get(id)) == NULL)

	if (level == SOL_SOCKET) {
		/*
		 * As with setting, handle a subset of the socket-level options
		 * here.  The rest is to be taken care of by the socket driver.
		 */

			val = !!(sock->sock_opt & (unsigned int)name);

			return sockdriver_copyout_opt(data, &val, sizeof(val),

			linger.l_onoff = !!(sock->sock_opt & SO_LINGER);
			linger.l_linger = sock->sock_linger / sys_hz();

			return sockdriver_copyout_opt(data, &linger,
			    sizeof(linger), len);

			if ((val = -sock->sock_err) != OK)
				sock->sock_err = OK;

			return sockdriver_copyout_opt(data, &val, sizeof(val),

			val = sock->sock_type;

			return sockdriver_copyout_opt(data, &val, sizeof(val),

			val = (int)sock->sock_slowat;

			return sockdriver_copyout_opt(data, &val, sizeof(val),

			val = (int)sock->sock_rlowat;

			return sockdriver_copyout_opt(data, &val, sizeof(val),

			if (name == SO_SNDTIMEO)
				ticks = sock->sock_stimeo;
			else
				ticks = sock->sock_rtimeo;

			tv.tv_sec = ticks / sys_hz();
			tv.tv_usec = (ticks % sys_hz()) * US / sys_hz();

			return sockdriver_copyout_opt(data, &tv, sizeof(tv),

	if (sock->sock_ops->sop_getsockopt == NULL)

	/*
	 * The socket driver must return ENOPROTOOPT for all options it does
	 * not support.
	 */
	return sock->sock_ops->sop_getsockopt(sock, level, name, data, len);
/*
 * Retrieve a socket's local address.
 */
sockevent_getsockname(sockid_t id, struct sockaddr * __restrict addr,
	socklen_t * __restrict addr_len)

	if ((sock = sockhash_get(id)) == NULL)

	if (sock->sock_ops->sop_getsockname == NULL)

	return sock->sock_ops->sop_getsockname(sock, addr, addr_len);
/*
 * Retrieve a socket's remote address.
 */
sockevent_getpeername(sockid_t id, struct sockaddr * __restrict addr,
	socklen_t * __restrict addr_len)

	if ((sock = sockhash_get(id)) == NULL)

	/* Listening-mode sockets cannot possibly have a peer address. */
	if (sock->sock_opt & SO_ACCEPTCONN)

	if (sock->sock_ops->sop_getpeername == NULL)

	return sock->sock_ops->sop_getpeername(sock, addr, addr_len);
/*
 * Mark the socket object as shut down for sending and/or receiving.  The flags
 * parameter may be a bitwise-OR'ed combination of SFL_SHUT_RD and SFL_SHUT_WR.
 * This function will wake up any suspended requests affected by this change,
 * but it will not invoke the sop_shutdown() callback function on the socket.
 * The function may in fact be called from sop_shutdown() before completion to
 * mark the socket as shut down as reflected by sockevent_is_shutdown().
 */
sockevent_set_shutdown(struct sock * sock, unsigned int flags)

	assert(sock->sock_ops != NULL);
	assert(!(flags & ~(SFL_SHUT_RD | SFL_SHUT_WR)));

	/* Look at the newly set flags only. */
	flags &= ~(unsigned int)sock->sock_flags;

		sock->sock_flags |= flags;

		/*
		 * Wake up any blocked calls that are affected by the shutdown.
		 * Shutting down listening sockets causes ongoing accept calls
		 * to fail.
		 */
		if (flags & SFL_SHUT_RD)
		if (flags & SFL_SHUT_WR)
		if (sock->sock_opt & SO_ACCEPTCONN)

		sockevent_raise(sock, mask);
/*
 * Shut down socket send and receive operations.
 */
sockevent_shutdown(sockid_t id, int how)

	if ((sock = sockhash_get(id)) == NULL)

	/* Convert the request to a set of flags. */
	if (how == SHUT_RD || how == SHUT_RDWR)
		flags |= SFL_SHUT_RD;
	if (how == SHUT_WR || how == SHUT_RDWR)
		flags |= SFL_SHUT_WR;
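	/*
	 * For example, SHUT_RD maps to SFL_SHUT_RD, SHUT_WR to SFL_SHUT_WR,
	 * and SHUT_RDWR to both flags at once (comment added for
	 * illustration).
	 */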
	if (sock->sock_ops->sop_shutdown != NULL)
		r = sock->sock_ops->sop_shutdown(sock, flags);

	/* On success, update our internal state as well. */
		sockevent_set_shutdown(sock, flags);
/*
 * Close a socket.
 */
static int
sockevent_close(sockid_t id, const struct sockdriver_call * call)
{
	struct sock *sock;
	int r, force;

	if ((sock = sockhash_get(id)) == NULL)
		return EINVAL;

	assert(sock->sock_proc == NULL);
	sock->sock_select.ss_endpt = NONE;

	/*
	 * There are several scenarios when it comes to closing sockets. First
	 * of all, we never actually force the socket driver to close a socket.
	 * The driver may always suspend the close call and take as long as it
	 * wants.  After a suspension, it signals its completion of the close
	 * through the SEV_CLOSE socket event.
	 *
	 * With that said, we offer two levels of urgency regarding the close
	 * request: regular and forced.  The former allows for a graceful
	 * close; the latter urges the socket driver to close the socket as
	 * soon as possible.  A socket that has been requested to be closed
	 * gracefully can, as long as it is still open (i.e., no SEV_CLOSE was
	 * fired yet), later be requested to be closed forcefully.  This is how
	 * SO_LINGER with a nonzero timeout is implemented.  If SO_LINGER is
	 * set with a zero timeout, the socket is force-closed immediately.
	 * Finally, if SO_LINGER is not set, the socket will be closed normally
	 * and never be forced--akin to SO_LINGER with an infinite timeout.
	 *
	 * The return value of the caller's close(2) may only ever be either
	 * OK or EINPROGRESS, to ensure that the caller knows that the file
	 * descriptor is freed up, as per Austin Group Defect #529.  In fact,
	 * EINPROGRESS is to be returned only on signal interruption (i.e.,
	 * cancel).  For that reason, this function only ever returns OK.
	 */
	force = ((sock->sock_opt & SO_LINGER) && sock->sock_linger == 0);

	if (sock->sock_ops->sop_close != NULL)
		r = sock->sock_ops->sop_close(sock, force);
	else
		r = OK;

	assert(r == OK || r == SUSPEND);

	if (r == SUSPEND) {
		sock->sock_flags |= SFL_CLOSING;

		/*
		 * If we were requested to force-close the socket immediately,
		 * but the socket driver needs more time anyway, then tell the
		 * caller that the socket was closed right away.
		 */
		if (force)
			return OK;

		/*
		 * If we are to force-close the socket only after a specific
		 * linger timeout, set the timer for that now, even if the call
		 * is non-blocking.  This also means that we cannot associate
		 * the linger timeout with the close call.  Instead, we convert
		 * the sock_linger value from a (relative) duration to an
		 * (absolute) timeout time, and use the SFL_CLOSING flag (along
		 * with SFL_TIMER) to tell the difference.  Since the socket is
		 * otherwise unreachable from userland at this point, the
		 * conversion is never visible in any way.
		 *
		 * The socket may already be in the timers list, so we must
		 * always check the SO_LINGER flag before checking sock_linger.
		 *
		 * If SO_LINGER is not set, we must never suspend the call.
		 */
		if (sock->sock_opt & SO_LINGER) {
			socktimer_add(sock, sock->sock_linger);
		} else
			call = NULL;

		/*
		 * A non-blocking close is completed asynchronously.  The
		 * caller is not told about this with EWOULDBLOCK as usual, for
		 * the reasons mentioned above.
		 */
		if (call != NULL)
			sockevent_suspend(sock, SEV_CLOSE, call, NONE);
	} else
		sockevent_free(sock);

	return OK;
}
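
/*
 * Usage sketch: from the application's point of view, the close behaviors
 * described above map onto the standard SO_LINGER option roughly as follows
 * (ordinary POSIX calls, shown for illustration only):
 *
 *	struct linger l;
 *
 *	l.l_onoff = 1;
 *	l.l_linger = 0;
 *	setsockopt(fd, SOL_SOCKET, SO_LINGER, &l, sizeof(l));
 *	close(fd);		(socket is force-closed immediately)
 *
 *	l.l_onoff = 1;
 *	l.l_linger = 10;
 *	setsockopt(fd, SOL_SOCKET, SO_LINGER, &l, sizeof(l));
 *	close(fd);		(graceful close, forced after the timeout)
 *
 *	close(fd);		(SO_LINGER unset: graceful, never forced)
 */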
/*
 * Cancel a suspended send request.
 */
static void
sockevent_cancel_send(struct sock * sock, struct sockevent_proc * spr, int err)
{
	int r;

	/*
	 * If any regular or control data were sent, return the number of data
	 * bytes sent--possibly zero.  Otherwise return the given error code.
	 */
	if (spr->spr_dataoff > 0 || spr->spr_ctloff > 0)
		r = (int)spr->spr_dataoff;
	else
		r = err;

	sockdriver_reply_generic(&spr->spr_call, r);

	/*
	 * In extremely rare circumstances, one send may be queued behind
	 * another send even though the former can actually be sent on the
	 * socket right away.  For this reason, we retry sending when canceling
	 * a send.  We need to do this only when the first send in the queue
	 * was canceled, but multiple blocked sends on a single socket should
	 * be rare anyway.
	 */
	sockevent_raise(sock, SEV_SEND);
}
/*
 * Cancel a suspended receive request.
 */
static void
sockevent_cancel_recv(struct sock * sock, struct sockevent_proc * spr, int err)
{
	int r;

	/*
	 * If any regular or control data were received, return the number of
	 * data bytes received--possibly zero.  Otherwise return the given
	 * error code.
	 */
	if (spr->spr_dataoff > 0 || spr->spr_ctloff > 0)
		r = (int)spr->spr_dataoff;
	else
		r = err;

	/*
	 * Also return any flags set for the data received so far, e.g.
	 * MSG_CTRUNC.  Do not return an address: receive calls on unconnected
	 * sockets must never block after receiving some data--instead, they
	 * are supposed to return MSG_TRUNC if not all data were copied out.
	 */
	sockdriver_reply_recv(&spr->spr_call, r, spr->spr_ctloff, NULL, 0,
	    spr->spr_rflags);

	/*
	 * The same story as for sends (see above) applies to receives,
	 * although this case should be even more rare in practice.
	 */
	sockevent_raise(sock, SEV_RECV);
}
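
/*
 * Note on the semantics above: when a blocked send or receive call is
 * interrupted by a signal, the caller sees EINTR only if no data were
 * transferred yet; otherwise it sees the partial byte count, much as with a
 * short read(2) or write(2).
 */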
/*
 * Cancel a previous request that may currently be suspended.  The cancel
 * operation itself does not have a reply.  Instead, if the given request was
 * found to be suspended, that request must be aborted and an appropriate reply
 * must be sent for the request.  If no matching request was found, no reply
 * must be sent at all.
 */
static void
sockevent_cancel(sockid_t id, const struct sockdriver_call * call)
{
	struct sockevent_proc *spr;
	struct sock *sock;

	/*
	 * Due to asynchronous close(2) operations, not even the sock object
	 * may be found.  In this (entirely legitimate) case, do not send any
	 * reply.
	 */
	if ((sock = sockhash_get(id)) == NULL)
		return;

	/*
	 * The request may already have completed by the time we receive the
	 * cancel request, in which case we can not find it.  In this (entirely
	 * legitimate) case, do not send any reply.
	 */
	if ((spr = sockevent_unsuspend(sock, call)) == NULL)
		return;

	/*
	 * We found the operation.  Cancel it according to its call type.
	 * Then, once fully done with it, free the suspension data structure.
	 *
	 * Note that we have to use the call structure from the suspension data
	 * structure rather than the given 'call' pointer: only the former
	 * includes all the information necessary to resume the request!
	 */
	switch (spr->spr_event) {
	case SEV_BIND:
	case SEV_CONNECT:
		assert(spr->spr_call.sc_endpt != NONE);

		sockdriver_reply_generic(&spr->spr_call, EINTR);

		break;

	case SEV_ACCEPT:
		sockdriver_reply_accept(&spr->spr_call, EINTR, NULL, 0);

		break;

	case SEV_SEND:
		sockevent_cancel_send(sock, spr, EINTR);

		break;

	case SEV_RECV:
		sockevent_cancel_recv(sock, spr, EINTR);

		break;

	case SEV_CLOSE:
		/*
		 * Return EINPROGRESS rather than EINTR, so that the user
		 * process can tell from the close(2) result that the file
		 * descriptor has in fact been closed.
		 */
		sockdriver_reply_generic(&spr->spr_call, EINPROGRESS);

		/*
		 * Do not free the sock object here: the socket driver will
		 * complete the close in the background, and fire SEV_CLOSE
		 * once it is done.  Only then is the sock object freed.
		 */
		break;

	default:
		panic("libsockevent: process suspended on unknown event 0x%x",
		    spr->spr_event);
	}

	sockevent_proc_free(spr);
}
/*
 * Process a select request.
 */
static int
sockevent_select(sockid_t id, unsigned int ops,
	const struct sockdriver_select * sel)
{
	struct sock *sock;
	unsigned int r, notify;

	if ((sock = sockhash_get(id)) == NULL)
		return EINVAL;

	notify = (ops & SDEV_NOTIFY);
	ops &= (SDEV_OP_RD | SDEV_OP_WR | SDEV_OP_ERR);

	/*
	 * See if any of the requested select operations can be satisfied
	 * immediately.
	 */
	r = sockevent_test_select(sock, ops);

	/*
	 * If select operations were pending, the new results must not indicate
	 * that any of those were satisfied, as that would indicate an internal
	 * logic error: the socket driver is supposed to update its state
	 * proactively, and thus, discovering that things have changed here is
	 * not something that should ever happen.
	 */
	assert(!(sock->sock_selops & r));

	/*
	 * If any select operations are not satisfied immediately, and we are
	 * asked to notify the caller when they are satisfied later, save them
	 * for later retesting.
	 */
	ops &= ~r;

	if (notify && ops != 0) {
		/*
		 * For now, we support only one caller when it comes to select
		 * queries: VFS.  If we want to support a networked file system
		 * (or so) directly calling select as well, this library will
		 * have to be extended accordingly (should not be too hard).
		 */
		if (sock->sock_select.ss_endpt != NONE) {
			if (sock->sock_select.ss_endpt != sel->ss_endpt) {
				printf("libsockevent: no support for multiple "
				    "select callers yet\n");

				return EIO;
			}

			/*
			 * If a select query was already pending for this
			 * caller, we must simply merge in the new operations.
			 */
			sock->sock_selops |= ops;
		} else {
			assert(sel->ss_endpt != NONE);

			sock->sock_select = *sel;
			sock->sock_selops = ops;
		}
	}

	return r;
}
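
/*
 * Usage sketch: the caller composes a select request as a set of SDEV_OP_*
 * bits, optionally combined with SDEV_NOTIFY.  For example, a query such as
 *
 *	ops = SDEV_OP_RD | SDEV_OP_WR | SDEV_NOTIFY;
 *
 * asks which of the read and write operations are ready right now, and
 * requests a later notification for those that are not.  The return value of
 * sockevent_select() is the subset of the given operations that are already
 * satisfied.
 */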
/*
 * An alarm has triggered.  Expire any timers.  Socket drivers that do not pass
 * clock notification messages to libsockevent must call expire_timers(3)
 * themselves instead.
 */
static void
sockevent_alarm(clock_t now)
{

	expire_timers(now);
}
static const struct sockdriver sockevent_tab = {
	.sdr_socket		= sockevent_socket,
	.sdr_socketpair		= sockevent_socketpair,
	.sdr_bind		= sockevent_bind,
	.sdr_connect		= sockevent_connect,
	.sdr_listen		= sockevent_listen,
	.sdr_accept		= sockevent_accept,
	.sdr_send		= sockevent_send,
	.sdr_recv		= sockevent_recv,
	.sdr_ioctl		= sockevent_ioctl,
	.sdr_setsockopt		= sockevent_setsockopt,
	.sdr_getsockopt		= sockevent_getsockopt,
	.sdr_getsockname	= sockevent_getsockname,
	.sdr_getpeername	= sockevent_getpeername,
	.sdr_shutdown		= sockevent_shutdown,
	.sdr_close		= sockevent_close,
	.sdr_cancel		= sockevent_cancel,
	.sdr_select		= sockevent_select,
	.sdr_alarm		= sockevent_alarm
};
/*
 * Initialize the socket event library.
 */
void
sockevent_init(sockevent_socket_cb_t socket_cb)
{

	sockhash_init();

	socktimer_init();

	sockevent_proc_init();

	SIMPLEQ_INIT(&sockevent_pending);

	assert(socket_cb != NULL);
	sockevent_socket_cb = socket_cb;

	/* Announce we are up. */
	sockdriver_announce();

	sockevent_working = FALSE;
}
/*
 * Process a socket driver request message.
 */
void
sockevent_process(const message * m_ptr, int ipc_status)
{

	/* Block events until after we have processed the request. */
	assert(!sockevent_working);
	sockevent_working = TRUE;

	/* Actually process the request. */
	sockdriver_process(&sockevent_tab, m_ptr, ipc_status);

	/*
	 * If any events were fired while processing the request, they will
	 * have been queued for later.  Go through them now.
	 */
	if (sockevent_has_events())
		sockevent_pump();

	sockevent_working = FALSE;
}
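
/*
 * Usage sketch: a socket driver built on this library typically initializes
 * it once and then hands every incoming message to sockevent_process() from
 * its main loop.  The callback name "mydrv_socket" is hypothetical and the
 * loop below is only a minimal outline of the usual MINIX service pattern:
 *
 *	int
 *	main(void)
 *	{
 *		message m;
 *		int r, ipc_status;
 *
 *		sockevent_init(mydrv_socket);
 *
 *		for (;;) {
 *			if ((r = sef_receive_status(ANY, &m,
 *			    &ipc_status)) != OK)
 *				panic("sef_receive_status: %d", r);
 *
 *			sockevent_process(&m, ipc_status);
 *		}
 *	}
 */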