minix/net/uds/io.c

   1 /* UNIX Domain Sockets - io.c - sending and receiving */
   2
   3 #include "uds.h"
   4 #include <sys/mman.h>
   5
   6 /*
   7  * Our UDS sockets do not have a send buffer.  They only have a receive buffer.
   8  * This receive buffer, when not empty, is split up in segments.  Each segment
   9  * may contain regular data, ancillary data, both, or (for SOCK_SEQPACKET and
  10  * (SOCK_DGRAM) neither.  There are two types of ancillary data: in-flight file
  11  * descriptors and sender credentials.  In addition, for SOCK_DGRAM sockets,
  12  * the segment may contain the sender's socket path (if the sender's socket is
  13  * bound).  Each segment has a header, containing the full segment size, the
  14  * size of the actual data in the segment (if any), and a flags field that
  15  * states which ancillary are associated with the segment (if any).  For
  16  * SOCK_STREAM type sockets, new data may be merged into a previous segment,
  17  * but only if it has no ancillary data.  For the other two socket types, each
  18  * packet has its own header.  The resulting behavior should be in line with
  19  * the POSIX "Socket Receive Queue" specification.
  20  *
  21  * More specifically, each segment consists of the following parts:
  22  * - always a five-byte header, containing a two-byte segment length (including
  23  *   the header, so always non-zero), a two-byte regular data length (zero or
  24  *   more), and a one-byte flags field which is a bitwise combination of
  25  *   UDS_HAS_{FD,CRED,PATH} flags;
  26  * - next, if UDS_HAS_CRED is set in the segment header: a sockcred structure;
  27  *   since this structure is variable-size, the structure is prepended by a
  28  *   single byte that contains the length of the structure (excluding the byte
  29  *   itself, thus ranging from sizeof(struct sockcred) to UDS_MAXCREDLEN);
  30  * - next, if UDS_HAS_PATH is set in the segment header:
  31  * - next, if the data length is non-zero, the actual regular data.
  32  * If the segment is not the last in the receive buffer, it is followed by the
  33  * next segment immediately afterward.  There is no alignment.
  34  *
  35  * It is the sender's responsibility to merge new data into the last segment
  36  * whenever possible, so that the receiver side never needs to consider more
  37  * than one segment at once.  In order to allow such merging, each receive
  38  * buffer has not only a tail and in-use length (pointing to the head when
  39  * combined) but also an offset from the tail to the last header, if any.  Note
  40  * that the receiver may over time still look at multiple segments for a single
  41  * request: this happens when a MSG_WAITALL request empties the buffer and then
  42  * blocks - the next piece of arriving data can then obviously not be merged.
  43  *
  44  * If a segment has the UDS_HAS_FD flag set, then one or more in-flight file
  45  * descriptors are associated with the segment.  These are stored in a separate
  46  * data structure, mainly to simplify cleaning up when the socket is shut down
  47  * for reading or closed.  That structure also contains the number of file
  48  * descriptors associated with the current segment, so this is not stored in
  49  * the segment itself.  As mentioned later, this may be changed in the future.
  50  *
  51  * On the sender side, there is a trade-off between fully utilizing the receive
  52  * buffer, and not repeatedly performing expensive actions for the same call:
  53  * it may be costly to determine exactly how many in-flight file descriptors
  54  * there will be (if any) and/or how much space is needed to store credentials.
  55  * We currently use the policy that we rather block/reject a send request that
  56  * may (just) have fit in the remaining part of the receive buffer, than obtain
  57  * the same information multiple times or keep state between callbacks.  In
  58  * practice this is not expected to make a difference, especially since
  59  * transfer of ancillary data should be rare anyway.
  60  */
  61 /*
  62  * The current layout of the segment header is as follows.
  63  *
  64  * The first byte contains the upper eight bits of the total segment length.
  65  * The second byte contains the lower eight bits of the total segment length.
  66  * The third byte contains the upper eight bits of the data length.
  67  * The fourth byte contains the lower eight bits of the data length.
  68  * The fifth byte is a bitmask for ancillary data associated with the segment.
  69  */
  70 #define UDS_HDRLEN      5
  71
  72 #define UDS_HAS_FDS     0x01    /* segment has in-flight file descriptors */
  73 #define UDS_HAS_CRED    0x02    /* segment has sender credentials */
  74 #define UDS_HAS_PATH    0x04    /* segment has source socket path */
  75
  76 #define UDS_MAXCREDLEN  SOCKCREDSIZE(NGROUPS_MAX)
  77
  78 #define uds_get_head(uds)       \
  79         ((size_t)((uds)->uds_tail + (uds)->uds_len) % UDS_BUF)
  80 #define uds_get_last(uds)       \
  81         ((size_t)((uds)->uds_tail + (uds)->uds_last) % UDS_BUF)
  82 #define uds_advance(pos,add) (((pos) + (add)) % UDS_BUF)
  83
  84 /*
  85  * All in-flight file descriptors are (co-)owned by the UDS driver itself, as
  86  * local open file descriptors.  Like any other process, the UDS driver can not
  87  * have more than OPEN_MAX open file descriptors at any time.  Thus, this is
  88  * also the inherent maximum number of in-flight file descriptors.  Therefore,
  89  * we maintain a single pool of in-flight FD structures, and we associate these
  90  * structures with sockets as needed.
  91  */
  92 static struct uds_fd uds_fds[OPEN_MAX];
  93 static SIMPLEQ_HEAD(uds_freefds, uds_fd) uds_freefds;
  94
  95 static char uds_ctlbuf[UDS_CTL_MAX];
  96 static int uds_ctlfds[UDS_CTL_MAX / sizeof(int)];
  97
  98 /*
  99  * Initialize the input/output part of the UDS service.
 100  */
 101 void
 102 uds_io_init(void)
 103 {
 104         unsigned int slot;
 105
 106         SIMPLEQ_INIT(&uds_freefds);
 107
 108         for (slot = 0; slot < __arraycount(uds_fds); slot++)
 109                 SIMPLEQ_INSERT_TAIL(&uds_freefds, &uds_fds[slot], ufd_next);
 110 }
 111
 112 /*
 113  * Set up all input/output state for the given socket, which has just been
 114  * allocated.  As part of this, allocate memory for the receive buffer of the
 115  * socket.  Return OK or a negative error code.
 116  */
 117 int
 118 uds_io_setup(struct udssock * uds)
 119 {
 120
 121         /* TODO: decide if we should preallocate the memory. */
 122         if ((uds->uds_buf = mmap(NULL, UDS_BUF, PROT_READ | PROT_WRITE,
 123             MAP_ANON | MAP_PRIVATE, -1, 0)) == MAP_FAILED)
 124                 return ENOMEM;
 125
 126         uds->uds_tail = 0;
 127         uds->uds_len = 0;
 128         uds->uds_last = 0;
 129
 130         SIMPLEQ_INIT(&uds->uds_fds);
 131
 132         return OK;
 133 }
 134
 135 /*
 136  * Clean up the input/output state for the given socket, which is about to be
 137  * freed.  As part of this, deallocate memory for the receive buffer and close
 138  * any file descriptors still in flight on the socket.
 139  */
 140 void
 141 uds_io_cleanup(struct udssock * uds)
 142 {
 143
 144         /* Close any in-flight file descriptors. */
 145         uds_io_reset(uds);
 146
 147         /* Free the receive buffer memory. */
 148         if (munmap(uds->uds_buf, UDS_BUF) != 0)
 149                 panic("UDS: munmap failed: %d", errno);
 150 }
 151
 152 /*
 153  * The socket is being closed or shut down for reading.  If there are still any
 154  * in-flight file descriptors, theey will never be received anymore, so close
 155  * them now.
 156  */
 157 void
 158 uds_io_reset(struct udssock * uds)
 159 {
 160         struct uds_fd *ufd;
 161
 162         /*
 163          * The UDS service may have the last and only reference to any of these
 164          * file descriptors here.  For that reason, we currently disallow
 165          * transfer of UDS file descriptors, because the close(2) here could
 166          * block on a socket close operation back to us, leading to a deadlock.
 167          * Also, we use a non-blocking variant of close(2), to prevent that we
 168          * end up hanging on sockets with SO_LINGER turned on.
 169          */
 170         SIMPLEQ_FOREACH(ufd, &uds->uds_fds, ufd_next) {
 171                 dprintf(("UDS: closing local fd %d\n", ufd->ufd_fd));
 172
 173                 closenb(ufd->ufd_fd);
 174         }
 175
 176         SIMPLEQ_CONCAT(&uds_freefds, &uds->uds_fds);
 177
 178         /*
 179          * If this reset happens as part of a shutdown, it might be done
 180          * again on close, so ensure that it will find a clean state.  The
 181          * receive buffer should never be looked at again either way, but reset
 182          * it too just to be sure.
 183          */
 184         uds->uds_tail = 0;
 185         uds->uds_len = 0;
 186         uds->uds_last = 0;
 187
 188         SIMPLEQ_INIT(&uds->uds_fds);
 189 }
 190
 191 /*
 192  * Return the maximum usable part of the receive buffer, in bytes.  The return
 193  * value is used for the SO_SNDBUF and SO_RCVBUF socket options.
 194  */
 195 size_t
 196 uds_io_buflen(void)
 197 {
 198
 199         /*
 200          * TODO: it would be nicer if at least for SOCK_STREAM-type sockets, we
 201          * could use the full receive buffer for data.  This would require that
 202          * we store up to one header in the socket object rather than in the
 203          * receive buffer.
 204          */
 205         return UDS_BUF - UDS_HDRLEN;
 206 }
 207
 208 /*
 209  * Fetch 'len' bytes starting from absolute position 'pos' into the receive
 210  * buffer of socket 'uds', and copy them into the buffer pointed to by 'ptr'.
 211  * Return the absolute position of the first byte after the fetched data in the
 212  * receive buffer.
 213  */
 214 static size_t
 215 uds_fetch(struct udssock * uds, size_t off, void * ptr, size_t len)
 216 {
 217         size_t left;
 218
 219         assert(off < UDS_BUF);
 220
 221         left = UDS_BUF - off;
 222         if (len >= left) {
 223                 memcpy(ptr, &uds->uds_buf[off], left);
 224
 225                 if ((len -= left) > 0)
 226                         memcpy((char *)ptr + left, &uds->uds_buf[0], len);
 227
 228                 return len;
 229         } else {
 230                 memcpy(ptr, &uds->uds_buf[off], len);
 231
 232                 return off + len;
 233         }
 234 }
 235
 236 /*
 237  * Store 'len' bytes from the buffer pointed to by 'ptr' into the receive
 238  * buffer of socket 'uds', starting at absolute position 'pos' into the receive
 239  * buffer.  Return the absolute position of the first byte after the stored
 240  * data in the receive buffer.
 241  */
 242 static size_t
 243 uds_store(struct udssock * uds, size_t off, const void * ptr, size_t len)
 244 {
 245         size_t left;
 246
 247         assert(off < UDS_BUF);
 248
 249         left = UDS_BUF - off;
 250         if (len >= left) {
 251                 memcpy(&uds->uds_buf[off], ptr, left);
 252
 253                 if ((len -= left) > 0)
 254                         memcpy(&uds->uds_buf[0], (const char *)ptr + left,
 255                             len);
 256
 257                 return len;
 258         } else {
 259                 memcpy(&uds->uds_buf[off], ptr, len);
 260
 261                 return off + len;
 262         }
 263 }
 264
 265 /*
 266  * Fetch a segment header previously stored in the receive buffer of socket
 267  * 'uds' at absolute position 'off'.  Return the absolute position of the first
 268  * byte after the header, as well as the entire segment length in 'seglen', the
 269  * length of the data in the segment in 'datalen', and the segment flags in
 270  * 'segflags'.
 271  */
 272 static size_t
 273 uds_fetch_hdr(struct udssock * uds, size_t off, size_t * seglen,
 274         size_t * datalen, unsigned int * segflags)
 275 {
 276         unsigned char hdr[UDS_HDRLEN];
 277
 278         off = uds_fetch(uds, off, hdr, sizeof(hdr));
 279
 280         *seglen = ((size_t)hdr[0] << 8) | (size_t)hdr[1];
 281         *datalen = ((size_t)hdr[2] << 8) | (size_t)hdr[3];
 282         *segflags = hdr[4];
 283
 284         assert(*seglen >= UDS_HDRLEN);
 285         assert(*seglen <= uds->uds_len);
 286         assert(*datalen <= *seglen - UDS_HDRLEN);
 287         assert(*segflags != 0 || *datalen == *seglen - UDS_HDRLEN);
 288         assert(!(*segflags & ~(UDS_HAS_FDS | UDS_HAS_CRED | UDS_HAS_PATH)));
 289
 290         return off;
 291 }
 292
 293 /*
 294  * Store a segment header in the receive buffer of socket 'uds' at absolute
 295  * position 'off', with the segment length 'seglen', the segment data length
 296  * 'datalen', and the segment flags 'segflags'.  Return the absolute receive
 297  * buffer position of the first data byte after the stored header.
 298  */
 299 static size_t
 300 uds_store_hdr(struct udssock * uds, size_t off, size_t seglen, size_t datalen,
 301         unsigned int segflags)
 302 {
 303         unsigned char hdr[UDS_HDRLEN];
 304
 305         assert(seglen <= USHRT_MAX);
 306         assert(datalen <= seglen);
 307         assert(segflags <= UCHAR_MAX);
 308         assert(!(segflags & ~(UDS_HAS_FDS | UDS_HAS_CRED | UDS_HAS_PATH)));
 309
 310         hdr[0] = (seglen >> 8) & 0xff;
 311         hdr[1] = seglen & 0xff;
 312         hdr[2] = (datalen >> 8) & 0xff;
 313         hdr[3] = datalen & 0xff;
 314         hdr[4] = segflags;
 315
 316         return uds_store(uds, off, hdr, sizeof(hdr));
 317 }
 318
 319 /*
 320  * Perform initial checks on a send request, before it may potentially be
 321  * suspended.  Return OK if this send request is valid, or a negative error
 322  * code if it is not.
 323  */
 324 int
 325 uds_pre_send(struct sock * sock, size_t len, socklen_t ctl_len __unused,
 326         const struct sockaddr * addr, socklen_t addr_len __unused,
 327         endpoint_t user_endpt __unused, int flags)
 328 {
 329         struct udssock *uds = (struct udssock *)sock;
 330         size_t pathlen;
 331
 332         /*
 333          * Reject calls with unknown flags.  Besides the flags handled entirely
 334          * by libsockevent (which are not part of 'flags' here), that is all of
 335          * them.  TODO: ensure that we should really reject all other flags
 336          * rather than ignore them.
 337          */
 338         if (flags != 0)
 339                 return EOPNOTSUPP;
 340
 341         /*
 342          * Perform very basic address and message size checks on the send call.
 343          * For non-stream sockets, we must reject packets that may never fit in
 344          * the receive buffer, or otherwise (at least for SOCK_SEQPACKET) the
 345          * send call may end up being suspended indefinitely.  Therefore, we
 346          * assume the worst-case scenario, which is that a full set of
 347          * credentials must be associated with the packet.  As a result, we may
 348          * reject some large packets that could actually just fit.  Checking
 349          * the peer's LOCAL_CREDS setting here is not safe: even if we know the
 350          * peer already at all (for SOCK_DGRAM we do not), the send may still
 351          * block and the option toggled before it unblocks.
 352          */
 353         switch (uds_get_type(uds)) {
 354         case SOCK_STREAM:
 355                 /* Nothing to check for this case. */
 356                 break;
 357
 358         case SOCK_SEQPACKET:
 359                 if (len > UDS_BUF - UDS_HDRLEN - 1 - UDS_MAXCREDLEN)
 360                         return EMSGSIZE;
 361
 362                 break;
 363
 364         case SOCK_DGRAM:
 365                 if (!uds_has_link(uds) && addr == NULL)
 366                         return EDESTADDRREQ;
 367
 368                 /*
 369                  * The path is stored without null terminator, but with leading
 370                  * byte containing the path length--if there is a path at all.
 371                  */
 372                 pathlen = (size_t)uds->uds_pathlen;
 373                 if (pathlen > 0)
 374                         pathlen++;
 375
 376                 if (len > UDS_BUF - UDS_HDRLEN - pathlen - 1 - UDS_MAXCREDLEN)
 377                         return EMSGSIZE;
 378
 379                 break;
 380
 381         default:
 382                 assert(0);
 383         }
 384
 385         return OK;
 386 }
 387
 388 /*
 389  * Determine whether the (real or pretend) send request should be processed
 390  * now, suspended until later, or rejected based on the current socket state.
 391  * Return OK if the send request should be processed now.  Return SUSPEND if
 392  * the send request should be retried later.  Return an appropriate negative
 393  * error code if the send request should fail.
 394  */
 395 static int
 396 uds_send_test(struct udssock * uds, size_t len, socklen_t ctl_len, size_t min,
 397         int partial)
 398 {
 399         struct udssock *conn;
 400         size_t avail, hdrlen, credlen;
 401
 402         assert(!uds_is_shutdown(uds, SFL_SHUT_WR));
 403
 404         if (uds_get_type(uds) != SOCK_DGRAM) {
 405                 if (uds_is_connecting(uds))
 406                         return SUSPEND;
 407                 if (!uds_is_connected(uds) && !uds_is_disconnected(uds))
 408                         return ENOTCONN;
 409                 if (!uds_has_conn(uds))
 410                         return EPIPE;
 411
 412                 conn = uds->uds_conn;
 413
 414                 if (uds_is_shutdown(conn, SFL_SHUT_RD))
 415                         return EPIPE;
 416
 417                 /*
 418                  * For connection-type sockets, we now have to check if there
 419                  * is enough room in the receive buffer.  For SOCK_STREAM
 420                  * sockets, we must check if at least 'min' bytes can be moved
 421                  * into the receive buffer, at least if that is a reasonable
 422                  * value for ever making any forward progress at all.  For
 423                  * SOCK_SEQPACKET sockets, we must check if the entire packet
 424                  * of size 'len' can be stored in the receive buffer.  In both
 425                  * cases, we must take into account any metadata to store along
 426                  * with the data.
 427                  *
 428                  * Unlike in uds_pre_send(), we can now check safely whether
 429                  * the peer is expecting credentials, but we still don't know
 430                  * the actual size of the credentials, so again we take the
 431                  * maximum possible size.  The same applies to file descriptors
 432                  * transferred via control data: all we have the control length
 433                  * right now, which if non-zero we assume to mean there might
 434                  * be file descriptors.
 435                  *
 436                  * In both cases, the reason of overestimating is that actually
 437                  * getting accurate sizes, by obtaining credentials or copying
 438                  * in control data, is very costly.  We want to do that only
 439                  * when we are sure we will not suspend the send call after
 440                  * all.  It is no problem to overestimate how much space will
 441                  * be needed here, but not to underestimate: that could cause
 442                  * applications that use select(2) and non-blocking sockets to
 443                  * end up in a busy-wait loop.
 444                  */
 445                 if (!partial && (conn->uds_flags & UDSF_PASSCRED))
 446                         credlen = 1 + UDS_MAXCREDLEN;
 447                 else
 448                         credlen = 0;
 449
 450                 avail = UDS_BUF - conn->uds_len;
 451
 452                 if (uds_get_type(uds) == SOCK_STREAM) {
 453                         /*
 454                          * Limit the low threshold to the maximum that can ever
 455                          * be sent at once.
 456                          */
 457                         if (min > UDS_BUF - UDS_HDRLEN - credlen)
 458                                 min = UDS_BUF - UDS_HDRLEN - credlen;
 459
 460                         /*
 461                          * Suspend the call only if not even the low threshold
 462                          * is met.  Otherwise we may make (partial) progress.
 463                          */
 464                         if (len > min)
 465                                 len = min;
 466
 467                         /*
 468                          * If the receive buffer already has at least one
 469                          * segment, and there are certainly no file descriptors
 470                          * to transfer now, and we do not have to store
 471                          * credentials either, then this segment can be merged
 472                          * with the previous one.  In that case, we need no
 473                          * space for a header.  That is certainly the case if
 474                          * we are resuming an already partially completed send.
 475                          */
 476                         hdrlen = (avail == UDS_BUF || ctl_len != 0 ||
 477                             credlen > 0) ? UDS_HDRLEN : 0;
 478                 } else
 479                         hdrlen = UDS_HDRLEN;
 480
 481                 if (avail < hdrlen + credlen + len)
 482                         return SUSPEND;
 483         }
 484
 485         return OK;
 486 }
 487
 488 /*
 489  * Get the destination peer for a send request.  The send test has already been
 490  * performed first.  On success, return OK, with a pointer to the peer socket
 491  * stored in 'peerp'.  On failure, return an appropriate error code.
 492  */
 493 static int
 494 uds_send_peer(struct udssock * uds, const struct sockaddr * addr,
 495         socklen_t addr_len, endpoint_t user_endpt, struct udssock ** peerp)
 496 {
 497         struct udssock *peer;
 498         int r;
 499
 500         if (uds_get_type(uds) == SOCK_DGRAM) {
 501                 if (!uds_has_link(uds)) {
 502                         /* This was already checked in uds_pre_check(). */
 503                         assert(addr != NULL);
 504
 505                         /*
 506                          * Find the socket identified by the given address.
 507                          * If it exists at all, see if it is a proper match.
 508                          */
 509                         if ((r = uds_lookup(uds, addr, addr_len, user_endpt,
 510                             &peer)) != OK)
 511                                 return r;
 512
 513                         /*
 514                          * If the peer socket is connected to a target, it
 515                          * must be this socket.  Unfortunately, POSIX does not
 516                          * specify an error code for this.  We borrow Linux's.
 517                          */
 518                         if (uds_has_link(peer) && peer->uds_link != uds)
 519                                 return EPERM;
 520                 } else
 521                         peer = uds->uds_link;
 522
 523                 /*
 524                  * If the receiving end will never receive this packet, we
 525                  * might as well not send it, so drop it immeiately.  Indicate
 526                  * as such to the caller, using NetBSD's chosen error code.
 527                  */
 528                 if (uds_is_shutdown(peer, SFL_SHUT_RD))
 529                         return ENOBUFS;
 530         } else {
 531                 assert(uds_has_conn(uds));
 532
 533                 peer = uds->uds_conn;
 534         }
 535
 536         *peerp = peer;
 537         return OK;
 538 }
 539
 540 /*
 541  * Generate a new segment for the current send request, or arrange things such
 542  * that new data can be merged with a previous segment.  As part of this,
 543  * decide whether we can merge data at all.  The segment will be merged if, and
 544  * only if, all of the following requirements are met:
 545  *
 546  *   1) the socket is of type SOCK_STREAM;
 547  *   2) there is a previous segment in the receive buffer;
 548  *   3) there is no ancillary data for the current send request.
 549  *
 550  * Also copy in regular data (if any), retrieve the sender's credentials (if
 551  * needed), and copy over the source path (if applicable).  However, do not yet
 552  * commit the segment (or the new part to be merged), because the send request
 553  * may still fail for other reasons.
 554  *
 555  * On success, return the length of the new segment (or, when merging, the
 556  * length to be added to the last segment), as well as a flag indicating
 557  * whether we are merging into the last segment in 'mergep', the length of the
 558  * (new) data in the segment in 'datalenp', and the new segment's flags in
 559  * 'segflagsp' (always zero when merging).  Note that a return value of zero
 560  * implies that we are merging zero extra bytes into the last segment, which
 561  * means that effectively nothing changes; in that case the send call will be
 562  * cut short and return zero to the caller as well.  On failure, return a
 563  * negative error code.
 564  */
 565 static int
 566 uds_send_data(struct udssock * uds, struct udssock * peer,
 567         const struct sockdriver_data * data, size_t len, size_t off,
 568         endpoint_t user_endpt, unsigned int nfds, int * __restrict mergep,
 569         size_t * __restrict datalenp, unsigned int * __restrict segflagsp)
 570 {
 571         struct sockcred sockcred;
 572         gid_t groups[NGROUPS_MAX];
 573         iovec_t iov[2];
 574         unsigned int iovcnt, segflags;
 575         unsigned char lenbyte;
 576         size_t credlen, pathlen, datalen, seglen;
 577         size_t avail, pos, left;
 578         int r, merge;
 579
 580         /*
 581          * At this point we should add the data to the peer's receive buffer.
 582          * In the case of SOCK_STREAM sockets, we should add as much of the
 583          * data as possible and suspend the call to send the rest later, if
 584          * applicable.  In the case of SOCK_DGRAM sockets, we should drop the
 585          * packet if it does not fit in the buffer.
 586          *
 587          * Due to the checks in uds_can_send(), we know for sure that we no
 588          * longer have to suspend without making any progress at this point.
 589          */
 590         segflags = (nfds > 0) ? UDS_HAS_FDS : 0;
 591
 592         /*
 593          * Obtain the credentials now.  Doing so allows us to determine how
 594          * much space we actually need for them.
 595          */
 596         if (off == 0 && (peer->uds_flags & UDSF_PASSCRED)) {
 597                 memset(&sockcred, 0, sizeof(sockcred));
 598
 599                 if ((r = getsockcred(user_endpt, &sockcred, groups,
 600                     __arraycount(groups))) != OK)
 601                         return r;
 602
 603                 /*
 604                  * getsockcred(3) returns the total number of groups for the
 605                  * process, which may exceed the size of the given array.  Our
 606                  * groups array should always be large enough for all groups,
 607                  * but we check to be sure anyway.
 608                  */
 609                 assert(sockcred.sc_ngroups <= (int)__arraycount(groups));
 610
 611                 credlen = 1 + SOCKCREDSIZE(sockcred.sc_ngroups);
 612
 613                 segflags |= UDS_HAS_CRED;
 614         } else
 615                 credlen = 0;
 616
 617         /* For bound source datagram sockets, include the source path. */
 618         if (uds_get_type(uds) == SOCK_DGRAM && uds->uds_pathlen != 0) {
 619                 pathlen = (size_t)uds->uds_pathlen + 1;
 620
 621                 segflags |= UDS_HAS_PATH;
 622         } else
 623                 pathlen = 0;
 624
 625         avail = UDS_BUF - peer->uds_len;
 626
 627         if (uds_get_type(uds) == SOCK_STREAM) {
 628                 /*
 629                  * Determine whether we can merge data into the previous
 630                  * segment.  This is a more refined version of the test in
 631                  * uds_can_send(), as we now know whether there are actually
 632                  * any FDs to transfer.
 633                  */
 634                 merge = (peer->uds_len != 0 && nfds == 0 && credlen == 0);
 635
 636                 /* Determine how much we can send at once. */
 637                 if (!merge) {
 638                         assert(avail > UDS_HDRLEN + credlen);
 639                         datalen = avail - UDS_HDRLEN - credlen;
 640                 } else
 641                         datalen = avail;
 642
 643                 if (datalen > len)
 644                         datalen = len;
 645
 646                 /* If we cannot make progress, we should have suspended.. */
 647                 assert(datalen != 0 || len == 0);
 648         } else {
 649                 merge = FALSE;
 650
 651                 datalen = len;
 652         }
 653         assert(datalen <= len);
 654         assert(datalen <= UDS_BUF);
 655
 656         /*
 657          * Compute the total amount of space we need for the segment in the
 658          * receive buffer.  Given that we have done will-it-fit tests in
 659          * uds_can_send() for SOCK_STREAM and SOCK_SEQPACKET, there is only one
 660          * case left where the result may not fit, and that is for SOCK_DGRAM
 661          * packets.  In that case, we drop the packet.  POSIX says we should
 662          * throw an error in that case, and that is also what NetBSD does.
 663          */
 664         if (!merge)
 665                 seglen = UDS_HDRLEN + credlen + pathlen + datalen;
 666         else
 667                 seglen = datalen;
 668
 669         if (seglen > avail) {
 670                 assert(uds_get_type(uds) == SOCK_DGRAM);
 671
 672                 /* Drop the packet, borrowing NetBSD's chosen error code. */
 673                 return ENOBUFS;
 674         }
 675
 676         /*
 677          * Generate the full segment, but do not yet update the buffer head.
 678          * We may still run into an error (copying in file descriptors) or even
 679          * decide that nothing gets sent after all (if there are no data or
 680          * file descriptors).  If we are merging the new data into the previous
 681          * segment, do not generate a header.
 682          */
 683         pos = uds_get_head(peer);
 684
 685         /* Generate the header, if needed. */
 686         if (!merge)
 687                 pos = uds_store_hdr(peer, pos, seglen, datalen, segflags);
 688         else
 689                 assert(segflags == 0);
 690
 691         /* Copy in and store the sender's credentials, if desired. */
 692         if (credlen > 0) {
 693                 assert(credlen >= 1 + sizeof(sockcred));
 694                 assert(credlen <= UCHAR_MAX);
 695
 696                 lenbyte = credlen - 1;
 697                 pos = uds_store(peer, pos, &lenbyte, 1);
 698
 699                 if (sockcred.sc_ngroups > 0) {
 700                         pos = uds_store(peer, pos, &sockcred,
 701                             offsetof(struct sockcred, sc_groups));
 702                         pos = uds_store(peer, pos, groups,
 703                             sockcred.sc_ngroups * sizeof(gid_t));
 704                 } else
 705                         pos = uds_store(peer, pos, &sockcred,
 706                             sizeof(sockcred));
 707         }
 708
 709         /* Store the sender's address if any.  Datagram sockets only. */
 710         if (pathlen > 0) {
 711                 assert(pathlen > 1);
 712                 assert(pathlen <= UCHAR_MAX);
 713
 714                 lenbyte = uds->uds_pathlen;
 715                 pos = uds_store(peer, pos, &lenbyte, 1);
 716                 pos = uds_store(peer, pos, uds->uds_path, pathlen - 1);
 717         }
 718
 719         /* Lastly, copy in the actual data (if any) from the caller. */
 720         if (datalen > 0) {
 721                 iov[0].iov_addr = (vir_bytes)&peer->uds_buf[pos];
 722                 left = UDS_BUF - pos;
 723
 724                 if (left < datalen) {
 725                         assert(left > 0);
 726                         iov[0].iov_size = left;
 727                         iov[1].iov_addr = (vir_bytes)&peer->uds_buf[0];
 728                         iov[1].iov_size = datalen - left;
 729                         iovcnt = 2;
 730                 } else {
 731                         iov[0].iov_size = datalen;
 732                         iovcnt = 1;
 733                 }
 734
 735                 if ((r = sockdriver_vcopyin(data, off, iov, iovcnt)) != OK)
 736                         return r;
 737         }
 738
 739         *mergep = merge;
 740         *datalenp = datalen;
 741         *segflagsp = segflags;
 742         return seglen;
 743 }
 744
 745 /*
 746  * Copy in control data for the current send request, and extract any file
 747  * descriptors to be transferred.  Do not yet duplicate the file descriptors,
 748  * but rather store a list in a temporary buffer: the send request may still
 749  * fail in which case we want to avoid having to undo the duplication.
 750  *
 751  * On success, return the number of (zero or more) file descriptors extracted
 752  * from the request and stored in the temporary buffer.  On failure, return a
 753  * negative error code.
 754  */
 755 static int
 756 uds_send_ctl(const struct sockdriver_data * ctl, socklen_t ctl_len,
 757         endpoint_t user_endpt)
 758 {
 759         struct msghdr msghdr;
 760         struct cmsghdr *cmsg;
 761         socklen_t left;
 762         unsigned int i, n, nfds;
 763         int r;
 764
 765         /*
 766          * Copy in the control data.  We can spend a lot of effort copying in
 767          * the data in small chunks, and change the receiving side to do the
 768          * same, but it is really not worth it: applications never send a whole
 769          * lot of file descriptors at once, and the buffer size is currently
 770          * such that the UDS service itself will exhaust its OPEN_MAX limit
 771          * anyway if they do.
 772          */
 773         if (ctl_len > sizeof(uds_ctlbuf))
 774                 return ENOBUFS;
 775
 776         if ((r = sockdriver_copyin(ctl, 0, uds_ctlbuf, ctl_len)) != OK)
 777                 return r;
 778
 779         if (ctl_len < sizeof(uds_ctlbuf))
 780                 memset(&uds_ctlbuf[ctl_len], 0, sizeof(uds_ctlbuf) - ctl_len);
 781
 782         /*
 783          * Look for any file descriptors, and store their remote file
 784          * descriptor numbers into a temporary array.
 785          */
 786         memset(&msghdr, 0, sizeof(msghdr));
 787         msghdr.msg_control = uds_ctlbuf;
 788         msghdr.msg_controllen = ctl_len;
 789
 790         nfds = 0;
 791         r = OK;
 792
 793         /*
 794          * The sender may provide file descriptors in multiple chunks.
 795          * Currently we do not preserve these chunk boundaries, instead
 796          * generating one single chunk with all file descriptors for the
 797          * segment upon receipt.  If needed, we can fairly easily adapt this
 798          * later.
 799          */
 800         for (cmsg = CMSG_FIRSTHDR(&msghdr); cmsg != NULL;
 801             cmsg = CMSG_NXTHDR(&msghdr, cmsg)) {
 802                 /*
 803                  * Check for bogus lengths.  There is no excuse for this;
 804                  * either the caller does not know what they are doing or we
 805                  * are looking at a hacking attempt.
 806                  */
 807                 assert((socklen_t)((char *)cmsg - uds_ctlbuf) <= ctl_len);
 808                 left = ctl_len - (socklen_t)((char *)cmsg - uds_ctlbuf);
 809                 assert(left >= CMSG_LEN(0)); /* guaranteed by CMSG_xxHDR */
 810
 811                 if (cmsg->cmsg_len < CMSG_LEN(0) || cmsg->cmsg_len > left) {
 812                         printf("UDS: malformed control data from %u\n",
 813                             user_endpt);
 814                         r = EINVAL;
 815                         break;
 816                 }
 817
 818                 if (cmsg->cmsg_level != SOL_SOCKET ||
 819                     cmsg->cmsg_type != SCM_RIGHTS)
 820                         continue;
 821
 822                 n = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int);
 823
 824                 for (i = 0; i < n; i++) {
 825                         /*
 826                          * Copy the file descriptor to the temporary buffer,
 827                          * whose size is based on the control data buffer, so
 828                          * it is always large enough to contain all FDs.
 829                          */
 830                         assert(nfds < __arraycount(uds_ctlfds));
 831
 832                         memcpy(&uds_ctlfds[nfds],
 833                             &((int *)CMSG_DATA(cmsg))[i], sizeof(int));
 834
 835                         nfds++;
 836                 }
 837         }
 838
 839         return nfds;
 840 }
 841
 842 /*
 843  * Actually duplicate any file descriptors that we extracted from the sender's
 844  * control data and stored in our temporary buffer.  On success, return OK,
 845  * with all file descriptors stored in file descriptor objects that are
 846  * appended to the socket's list of in-flight FD objects.  Thus, on success,
 847  * the send request may no longer fail.  On failure, return a negative error
 848  * code, with any partial duplication undone.
 849  */
 850 static int
 851 uds_send_fds(struct udssock * peer, unsigned int nfds, endpoint_t user_endpt)
 852 {
 853         SIMPLEQ_HEAD(, uds_fd) fds;
 854         struct uds_fd *ufd;
 855         unsigned int i;
 856         int r;
 857
 858         SIMPLEQ_INIT(&fds);
 859
 860         for (i = 0; i < nfds; i++) {
 861                 if (SIMPLEQ_EMPTY(&uds_freefds)) {
 862                         /* UDS itself may already have OPEN_MAX FDs. */
 863                         r = ENFILE;
 864                         break;
 865                 }
 866
 867                 /*
 868                  * The caller may have given an invalid FD, or UDS itself may
 869                  * unexpectedly have run out of available file descriptors etc.
 870                  */
 871                 if ((r = copyfd(user_endpt, uds_ctlfds[i], COPYFD_FROM)) < 0)
 872                         break;
 873
 874                 ufd = SIMPLEQ_FIRST(&uds_freefds);
 875                 SIMPLEQ_REMOVE_HEAD(&uds_freefds, ufd_next);
 876
 877                 ufd->ufd_fd = r;
 878                 ufd->ufd_count = 0;
 879
 880                 SIMPLEQ_INSERT_TAIL(&fds, ufd, ufd_next);
 881
 882                 dprintf(("UDS: copied in fd %d -> %d\n", uds_ctlfds[i], r));
 883         }
 884
 885         /* Did we experience an error while copying in the file descriptors? */
 886         if (r < 0) {
 887                 /* Revert the successful copyfd() calls made so far. */
 888                 SIMPLEQ_FOREACH(ufd, &fds, ufd_next) {
 889                         dprintf(("UDS: closing local fd %d\n", ufd->ufd_fd));
 890
 891                         closenb(ufd->ufd_fd);
 892                 }
 893
 894                 SIMPLEQ_CONCAT(&uds_freefds, &fds);
 895
 896                 return r;
 897         }
 898
 899         /*
 900          * Success.  If there were any file descriptors at all, add them to the
 901          * peer's list of in-flight file descriptors.  Assign the number of
 902          * file descriptors copied in to the first file descriptor object, so
 903          * that we know how many to copy out (or discard) for this segment.
 904          * Also set the UDS_HAS_FDS flag on the segment.
 905          */
 906         ufd = SIMPLEQ_FIRST(&fds);
 907         ufd->ufd_count = nfds;
 908
 909         SIMPLEQ_CONCAT(&peer->uds_fds, &fds);
 910
 911         return OK;
 912 }
 913
 914 /*
 915  * The current send request is successful or at least has made progress.
 916  * Commit the new segment or, if we decided to merge the new data into the last
 917  * segment, update the header of the last segment.  Also wake up the receiving
 918  * side, because there will now be new data to receive.
 919  */
 920 static void
 921 uds_send_advance(struct udssock * uds, struct udssock * peer, size_t datalen,
 922         int merge, size_t seglen, unsigned int segflags)
 923 {
 924         size_t pos, prevseglen, prevdatalen;
 925
 926         /*
 927          * For non-datagram sockets, credentials are sent only once after
 928          * setting the LOCAL_CREDS option.  After that, the option is unset.
 929          */
 930         if ((segflags & UDS_HAS_CRED) && uds_get_type(uds) != SOCK_DGRAM)
 931                 peer->uds_flags &= ~UDSF_PASSCRED;
 932
 933         if (merge) {
 934                 assert(segflags == 0);
 935
 936                 pos = uds_get_last(peer);
 937
 938                 (void)uds_fetch_hdr(peer, pos, &prevseglen, &prevdatalen,
 939                     &segflags);
 940
 941                 peer->uds_len += seglen;
 942                 assert(peer->uds_len <= UDS_BUF);
 943
 944                 seglen += prevseglen;
 945                 datalen += prevdatalen;
 946                 assert(seglen <= UDS_BUF);
 947
 948                 uds_store_hdr(peer, pos, seglen, datalen, segflags);
 949         } else {
 950                 peer->uds_last = peer->uds_len;
 951
 952                 peer->uds_len += seglen;
 953                 assert(peer->uds_len <= UDS_BUF);
 954         }
 955
 956         /* Now that there are new data, wake up the receiver side. */
 957         sockevent_raise(&peer->uds_sock, SEV_RECV);
 958 }
 959
 960 /*
 961  * Process a send request.  Return OK if the send request has successfully
 962  * completed, SUSPEND if it should be tried again later, or a negative error
 963  * code on failure.  In all cases, the values of 'off' and 'ctl_off' must be
 964  * updated if any progress has been made; if either is non-zero, libsockevent
 965  * will return the partial progress rather than an error code.
 966  */
 967 int
 968 uds_send(struct sock * sock, const struct sockdriver_data * data, size_t len,
 969         size_t * off, const struct sockdriver_data * ctl, socklen_t ctl_len,
 970         socklen_t * ctl_off, const struct sockaddr * addr, socklen_t addr_len,
 971         endpoint_t user_endpt, int flags __unused, size_t min)
 972 {
 973         struct udssock *uds = (struct udssock *)sock;
 974         struct udssock *peer;
 975         size_t seglen, datalen = 0 /*gcc*/;
 976         unsigned int nfds, segflags = 0 /*gcc*/;
 977         int r, partial, merge = 0 /*gcc*/;
 978
 979         dprintf(("UDS: send(%d,%zu,%zu,%u,%u,0x%x)\n",
 980             uds_get_id(uds), len, (off != NULL) ? *off : 0, ctl_len,
 981             (ctl_off != NULL) ? *ctl_off : 0, flags));
 982
 983         partial = (off != NULL && *off > 0);
 984
 985         /*
 986          * First see whether we can process this send call at all right now.
 987          * Most importantly, for connected sockets, if the peer's receive
 988          * buffer is full, we may have to suspend the call until some space has
 989          * been freed up.
 990          */
 991         if ((r = uds_send_test(uds, len, ctl_len, min, partial)) != OK)
 992                 return r;
 993
 994         /*
 995          * Then get the peer socket.  For connected sockets, this is trivial.
 996          * For unconnected sockets, it may involve a lookup of the given
 997          * address.
 998          */
 999         if ((r = uds_send_peer(uds, addr, addr_len, user_endpt, &peer)) != OK)
1000                 return r;
1001
1002         /*
1003          * We now know for sure that we will not suspend this call without
1004          * making any progress.  However, the call may still fail.  Copy in
1005          * control data first now, so that we know whether there are any file
1006          * descriptors to transfer.  This aspect may determine whether or not
1007          * we can merge data with a previous segment.  Do not actually copy in
1008          * the actual file descriptors yet, because that is much harder to undo
1009          * in case of a failure later on.
1010          */
1011         if (ctl_len > 0) {
1012                 /* We process control data once, in full. */
1013                 assert(*ctl_off == 0);
1014
1015                 if ((r = uds_send_ctl(ctl, ctl_len, user_endpt)) < 0)
1016                         return r;
1017                 nfds = (unsigned int)r;
1018         } else
1019                 nfds = 0;
1020
1021         /*
1022          * Now generate a new segment, or (if possible) merge new data into the
1023          * last segment.  Since the call may still fail, prepare the segment
1024          * but do not update the buffer head yet.  Note that the segment
1025          * contains not just regular data (in fact it may contain no data at
1026          * all) but (also) certain ancillary data.
1027          */
1028         if ((r = uds_send_data(uds, peer, data, len, *off, user_endpt, nfds,
1029             &merge, &datalen, &segflags)) <= 0)
1030                 return r;
1031         seglen = (size_t)r;
1032
1033         /*
1034          * If we extracted any file descriptors from the control data earlier,
1035          * copy them over to ourselves now.  The resulting in-flight file
1036          * descriptors are stored in a separate data structure.  This is the
1037          * last point where the send call may actually fail.
1038          */
1039         if (nfds > 0) {
1040                 if ((r = uds_send_fds(peer, nfds, user_endpt)) != OK)
1041                         return r;
1042         }
1043
1044         /*
1045          * The transmission is now known to be (partially) successful.  Commit
1046          * the new work by moving the receive buffer head.
1047          */
1048         uds_send_advance(uds, peer, datalen, merge, seglen, segflags);
1049
1050         /*
1051          * Register the result.  For stream-type sockets, the expected behavior
1052          * is that all data be sent, and so we may still have to suspend the
1053          * call after partial progress.  Otherwise, we are now done.  Either
1054          * way, we are done with the control data, so mark it as consumed.
1055          */
1056         *off += datalen;
1057         *ctl_off += ctl_len;
1058         if (uds_get_type(uds) == SOCK_STREAM && datalen < len)
1059                 return SUSPEND;
1060         else
1061                 return OK;
1062 }
1063
1064 /*
1065  * Test whether a send request would block.  The given 'min' parameter contains
1066  * the minimum number of bytes that should be possible to send without blocking
1067  * (the low send watermark).  Return SUSPEND if the send request would block,
1068  * or any other error code if it would not.
1069  */
1070 int
1071 uds_test_send(struct sock * sock, size_t min)
1072 {
1073         struct udssock *uds = (struct udssock *)sock;
1074
1075         return uds_send_test(uds, min, 0, min, FALSE /*partial*/);
1076 }
1077
1078 /*
1079  * Perform initial checks on a receive request, before it may potentially be
1080  * suspended.  Return OK if this receive request is valid, or a negative error
1081  * code if it is not.
1082  */
1083 int
1084 uds_pre_recv(struct sock * sock __unused, endpoint_t user_endpt __unused,
1085         int flags)
1086 {
1087
1088         /*
1089          * Reject calls with unknown flags.  TODO: ensure that we should really
1090          * reject all other flags rather than ignore them.
1091          */
1092         if ((flags & ~(MSG_PEEK | MSG_WAITALL | MSG_CMSG_CLOEXEC)) != 0)
1093                 return EOPNOTSUPP;
1094
1095         return OK;
1096 }
1097
1098 /*
1099  * Determine whether the (real or pretend) receive request should be processed
1100  * now, suspended until later, or rejected based on the current socket state.
1101  * Return OK if the receive request should be processed now, along with a first
1102  * indication whether the call may still be suspended later in 'may_block'.
1103  * Return SUSPEND if the receive request should be retried later.  Return an
1104  * appropriate negative error code if the receive request should fail.
1105  */
1106 static int
1107 uds_recv_test(struct udssock * uds, size_t len, size_t min, int partial,
1108         int * may_block)
1109 {
1110         size_t seglen, datalen;
1111         unsigned int segflags;
1112         int r;
1113
1114         /*
1115          * If there are any pending data, those should always be received
1116          * first.  However, if there is nothing to receive, then whether we
1117          * should suspend the receive call or fail immediately depends on other
1118          * conditions.  We first look at these other conditions.
1119          */
1120         r = OK;
1121
1122         if (uds_get_type(uds) != SOCK_DGRAM) {
1123                 if (uds_is_connecting(uds))
1124                         r = SUSPEND;
1125                 else if (!uds_is_connected(uds) && !uds_is_disconnected(uds))
1126                         r = ENOTCONN;
1127                 else if (!uds_has_conn(uds) ||
1128                     uds_is_shutdown(uds->uds_conn, SFL_SHUT_WR))
1129                         r = SOCKEVENT_EOF;
1130         }
1131
1132         if (uds->uds_len == 0) {
1133                 /*
1134                  * For stream-type sockets, we use the policy: if no regular
1135                  * data is requested, then end the call without receiving
1136                  * anything.  For packet-type sockets, the request should block
1137                  * until there is a packet to discard, though.
1138                  */
1139                 if (r != OK || (uds_get_type(uds) == SOCK_STREAM && len == 0))
1140                         return r;
1141
1142                 return SUSPEND;
1143         }
1144
1145         /*
1146          * For stream-type sockets, we should still suspend the call if fewer
1147          * than 'min' bytes are available right now, and there is a possibility
1148          * that more data may arrive later.  More may arrive later iff 'r' is
1149          * OK (i.e., no EOF or error will follow) and, in case we already
1150          * received some partial results, there is not already a next segment
1151          * with ancillary data (i.e, nonzero segment flags), or in any case
1152          * there isn't more than one segment in the buffer.  Limit 'min' to the
1153          * maximum that can ever be received, though.  Since that is difficult
1154          * in our case, we check whether the buffer is entirely full instead.
1155          */
1156         if (r == OK && uds_get_type(uds) == SOCK_STREAM && min > 0 &&
1157             uds->uds_len < UDS_BUF) {
1158                 assert(uds->uds_len >= UDS_HDRLEN);
1159
1160                 (void)uds_fetch_hdr(uds, uds->uds_tail, &seglen, &datalen,
1161                     &segflags);
1162
1163                 if (datalen < min && seglen == uds->uds_len &&
1164                     (!partial || segflags == 0))
1165                         return SUSPEND;
1166         }
1167
1168         /*
1169          * Also start the decision process as to whether we should suspend the
1170          * current call if MSG_WAITALL is given.  Unfortunately there is no one
1171          * place where we can conveniently do all the required checks.
1172          */
1173         if (may_block != NULL)
1174                 *may_block = (r == OK && uds_get_type(uds) == SOCK_STREAM);
1175         return OK;
1176 }
1177
1178 /*
1179  * Receive regular data, and possibly the source path, from the tail segment in
1180  * the receive buffer.  On success, return the positive non-zero length of the
1181  * tail segment, with 'addr' and 'addr_len' modified to store the source
1182  * address if applicable, the result flags in 'rflags' updated as appropriate,
1183  * the tail segment's data length stored in 'datalen', the number of received
1184  * regular data bytes stored in 'reslen', the segment flags stored in
1185  * 'segflags', and the absolute receive buffer position of the credentials in
1186  * the segment stored in 'credpos' if applicable.  Since the receive call may
1187  * still fail, this function must not yet update the tail or any other aspect
1188  * of the receive buffer.  Return zero if the current receive call was already
1189  * partially successful (due to MSG_WAITALL) and can no longer make progress,
1190  * and thus should be ended.  Return a negative error code on failure.
1191  */
1192 static int
1193 uds_recv_data(struct udssock * uds, const struct sockdriver_data * data,
1194         size_t len, size_t off, struct sockaddr * addr, socklen_t * addr_len,
1195         int * __restrict rflags, size_t * __restrict datalen,
1196         size_t * __restrict reslen, unsigned int * __restrict segflags,
1197         size_t * __restrict credpos)
1198 {
1199         iovec_t iov[2];
1200         unsigned char lenbyte;
1201         unsigned int iovcnt;
1202         size_t pos, seglen, left;
1203         int r;
1204
1205         pos = uds_fetch_hdr(uds, uds->uds_tail, &seglen, datalen, segflags);
1206
1207         /*
1208          * If a partially completed receive now runs into a segment that cannot
1209          * be logically merged with the previous one (because it has at least
1210          * one segment flag set, meaning it has ancillary data), then we must
1211          * shortcut the receive now.
1212          */
1213         if (off != 0 && *segflags != 0)
1214                 return OK;
1215
1216         /*
1217          * As stated, for stream-type sockets, we choose to ignore zero-size
1218          * receive calls.  This has the consequence that reading a zero-sized
1219          * segment (with ancillary data) requires a receive request for at
1220          * least one regular data byte.  Such a receive call would then return
1221          * zero.  The problem with handling zero-data receive requests is that
1222          * we need to know whether the current segment is terminated (i.e., no
1223          * more data can possibly be merged into it later), which is a test
1224          * that we rather not perform, not in the least because we do not know
1225          * whether there is an error pending on the socket.
1226          *
1227          * For datagrams, we currently allow a zero-size receive call to
1228          * discard the next datagram.
1229          *
1230          * TODO: compare this against policies on other platforms.
1231          */
1232         if (len == 0 && uds_get_type(uds) == SOCK_STREAM)
1233                 return OK;
1234
1235         /*
1236          * We have to skip the credentials for now: these are copied out as
1237          * control data, and thus will (well, may) be looked at when dealing
1238          * with the control data.  For the same reason, we do not even look at
1239          * UDS_HAS_FDS here.
1240          */
1241         if (*segflags & UDS_HAS_CRED) {
1242                 *credpos = pos;
1243
1244                 pos = uds_fetch(uds, pos, &lenbyte, 1);
1245                 pos = uds_advance(pos, (size_t)lenbyte);
1246         }
1247
1248         /*
1249          * Copy out the source address, but only if the (datagram) socket is
1250          * not connected.  TODO: even when it is connected, it may still
1251          * receive packets sent to it from other sockets *before* being
1252          * connected, and the receiver has no way of knowing that those packets
1253          * did not come from its new peer.  Ideally, the older packets should
1254          * be dropped..
1255          */
1256         if (*segflags & UDS_HAS_PATH) {
1257                 pos = uds_fetch(uds, pos, &lenbyte, 1);
1258
1259                 if (uds_get_type(uds) == SOCK_DGRAM && !uds_has_link(uds))
1260                         uds_make_addr((const char *)&uds->uds_buf[pos],
1261                             (size_t)lenbyte, addr, addr_len);
1262
1263                 pos = uds_advance(pos, (size_t)lenbyte);
1264         }
1265
1266         /*
1267          * We can receive no more data than those that are present in the
1268          * segment, obviously.  For stream-type sockets, any more data that
1269          * could have been received along with the current data would have been
1270          * merged in the current segment, so we need not search for any next
1271          * segments.
1272          *
1273          * For non-stream sockets, the caller may receive less than a whole
1274          * packet if it supplied a small buffer.  In that case, the rest of the
1275          * packet will be discarded (but not here yet!) and the caller gets
1276          * the MSG_TRUNC flag in its result, if it was using sendmsg(2) anyway.
1277          */
1278         if (len > *datalen)
1279                 len = *datalen;
1280         else if (len < *datalen && uds_get_type(uds) != SOCK_STREAM)
1281                 *rflags |= MSG_TRUNC;
1282
1283         /* Copy out the data to the caller. */
1284         if (len > 0) {
1285                 iov[0].iov_addr = (vir_bytes)&uds->uds_buf[pos];
1286                 left = UDS_BUF - pos;
1287
1288                 if (left < len) {
1289                         iov[0].iov_size = left;
1290                         iov[1].iov_addr = (vir_bytes)&uds->uds_buf[0];
1291                         iov[1].iov_size = len - left;
1292                         iovcnt = 2;
1293                 } else {
1294                         iov[0].iov_size = len;
1295                         iovcnt = 1;
1296                 }
1297
1298                 if ((r = sockdriver_vcopyout(data, off, iov, iovcnt)) != OK)
1299                         return r;
1300         }
1301
1302         *reslen = len;
1303         assert(seglen > 0 && seglen <= INT_MAX);
1304         return (int)seglen;
1305 }
1306
1307 /*
1308  * The current segment has associated file descriptors.  If possible, copy out
1309  * all file descriptors to the receiver, and generate and copy out a chunk of
1310  * control data that contains their file descriptor numbers.  If not all
1311  * file descriptors fit in the receiver's buffer, or if any error occurs, no
1312  * file descriptors are copied out.
1313  */
1314 static int
1315 uds_recv_fds(struct udssock * uds, const struct sockdriver_data * ctl,
1316         socklen_t ctl_len, socklen_t ctl_off, endpoint_t user_endpt, int flags)
1317 {
1318         struct msghdr msghdr;
1319         struct cmsghdr *cmsg;
1320         struct uds_fd *ufd;
1321         unsigned int i, nfds;
1322         socklen_t chunklen, chunkspace;
1323         int r, fd, what;
1324
1325         /* See how many file descriptors should be part of this chunk. */
1326         assert(!SIMPLEQ_EMPTY(&uds->uds_fds));
1327         ufd = SIMPLEQ_FIRST(&uds->uds_fds);
1328         nfds = ufd->ufd_count;
1329         assert(nfds > 0);
1330
1331         /*
1332          * We produce and copy out potentially unaligned chunks, using
1333          * CMSG_LEN, but return the aligned size at the end, using CMSG_SPACE.
1334          * This may leave "gap" bytes unchanged in userland, but that should
1335          * not be a problem.  By producing unaligned chunks, we eliminate a
1336          * potential boundary case where the unaligned chunk passed in (by the
1337          * sender) no longer fits in the same buffer after being aligned here.
1338          */
1339         chunklen = CMSG_LEN(sizeof(int) * nfds);
1340         chunkspace = CMSG_SPACE(sizeof(int) * nfds);
1341         assert(chunklen <= sizeof(uds_ctlbuf));
1342         if (chunklen > ctl_len)
1343                 return 0; /* chunk would not fit, so produce nothing instead */
1344         if (chunkspace > ctl_len)
1345                 chunkspace = ctl_len;
1346
1347         memset(&msghdr, 0, sizeof(msghdr));
1348         msghdr.msg_control = uds_ctlbuf;
1349         msghdr.msg_controllen = sizeof(uds_ctlbuf);
1350
1351         memset(uds_ctlbuf, 0, chunklen);
1352         cmsg = CMSG_FIRSTHDR(&msghdr);
1353         cmsg->cmsg_len = chunklen;
1354         cmsg->cmsg_level = SOL_SOCKET;
1355         cmsg->cmsg_type = SCM_RIGHTS;
1356
1357         /*
1358          * Copy the group's local file descriptors to the target endpoint, and
1359          * store the resulting remote file descriptors in the chunk buffer.
1360          */
1361         r = OK;
1362
1363         for (i = 0; i < nfds; i++) {
1364                 assert(ufd != SIMPLEQ_END(&uds->uds_fds));
1365                 assert(i == 0 || ufd->ufd_count == 0);
1366
1367                 what = COPYFD_TO;
1368                 if (flags & MSG_CMSG_CLOEXEC)
1369                         what |= COPYFD_CLOEXEC;
1370
1371                 /* Failure may happen legitimately here (e.g., EMFILE). */
1372                 if ((r = copyfd(user_endpt, ufd->ufd_fd, what)) < 0)
1373                         break; /* we keep our progress so far in 'i' */
1374
1375                 fd = r;
1376
1377                 dprintf(("UDS: copied out fd %d -> %d\n", ufd->ufd_fd, fd));
1378
1379                 memcpy(&((int *)CMSG_DATA(cmsg))[i], &fd, sizeof(int));
1380
1381                 ufd = SIMPLEQ_NEXT(ufd, ufd_next);
1382         }
1383
1384         /* If everything went well so far, copy out the produced chunk. */
1385         if (r >= 0)
1386                 r = sockdriver_copyout(ctl, ctl_off, uds_ctlbuf, chunklen);
1387
1388         /*
1389          * Handle errors.  At this point, the 'i' variable contains the number
1390          * of file descriptors that have already been successfully copied out.
1391          */
1392         if (r < 0) {
1393                 /* Revert the successful copyfd() calls made so far. */
1394                 while (i-- > 0) {
1395                         memcpy(&fd, &((int *)CMSG_DATA(cmsg))[i], sizeof(int));
1396
1397                         (void)copyfd(user_endpt, fd, COPYFD_CLOSE);
1398                 }
1399
1400                 return r;
1401         }
1402
1403         /*
1404          * Success.  Return the aligned size of the produced chunk, if the
1405          * given length permits it.  From here on, the receive call may no
1406          * longer fail, as that would result in lost file descriptors.
1407          */
1408         return chunkspace;
1409 }
1410
1411 /*
1412  * Generate and copy out a chunk of control data with the sender's credentials.
1413  * Return the aligned chunk size on success, or a negative error code on
1414  * failure.
1415  */
1416 static int
1417 uds_recv_cred(struct udssock * uds, const struct sockdriver_data * ctl,
1418         socklen_t ctl_len, socklen_t ctl_off, size_t credpos)
1419 {
1420         struct msghdr msghdr;
1421         struct cmsghdr *cmsg;
1422         socklen_t chunklen, chunkspace;
1423         unsigned char lenbyte;
1424         size_t credlen;
1425         int r;
1426
1427         /*
1428          * Since the sender side already did the hard work of producing the
1429          * (variable-size) sockcred structure as it should be received, there
1430          * is relatively little work to be done here.
1431          */
1432         credpos = uds_fetch(uds, credpos, &lenbyte, 1);
1433         credlen = (size_t)lenbyte;
1434
1435         chunklen = CMSG_LEN(credlen);
1436         chunkspace = CMSG_SPACE(credlen);
1437         assert(chunklen <= sizeof(uds_ctlbuf));
1438         if (chunklen > ctl_len)
1439                 return 0; /* chunk would not fit, so produce nothing instead */
1440         if (chunkspace > ctl_len)
1441                 chunkspace = ctl_len;
1442
1443         memset(&msghdr, 0, sizeof(msghdr));
1444         msghdr.msg_control = uds_ctlbuf;
1445         msghdr.msg_controllen = sizeof(uds_ctlbuf);
1446
1447         memset(uds_ctlbuf, 0, chunklen);
1448         cmsg = CMSG_FIRSTHDR(&msghdr);
1449         cmsg->cmsg_len = chunklen;
1450         cmsg->cmsg_level = SOL_SOCKET;
1451         cmsg->cmsg_type = SCM_CREDS;
1452
1453         uds_fetch(uds, credpos, CMSG_DATA(cmsg), credlen);
1454
1455         if ((r = sockdriver_copyout(ctl, ctl_off, uds_ctlbuf, chunklen)) != OK)
1456                 return r;
1457
1458         return chunkspace;
1459 }
1460
1461 /*
1462  * Copy out control data for the ancillary data associated with the current
1463  * segment, if any.  Return OK on success, at which point the current receive
1464  * call may no longer fail.  'rflags' may be updated with additional result
1465  * flags.  Return a negative error code on failure.
1466  */
1467 static int
1468 uds_recv_ctl(struct udssock * uds, const struct sockdriver_data * ctl,
1469         socklen_t ctl_len, socklen_t * ctl_off, endpoint_t user_endpt,
1470         int flags, unsigned int segflags, size_t credpos, int * rflags)
1471 {
1472         int r;
1473
1474         /*
1475          * We first copy out all file descriptors, if any.  We put them in one
1476          * SCM_RIGHTS chunk, even if the sender put them in separate SCM_RIGHTS
1477          * chunks.  We believe that this should not cause application-level
1478          * issues, but if it does, we can change that later with some effort.
1479          * We then copy out credentials, if any.
1480          *
1481          * We copy out each control chunk independently of the others, and also
1482          * perform error recovery on a per-chunk basis.  This implies the
1483          * following.  If producing or copying out the first chunk fails, the
1484          * entire recvmsg(2) call will fail with an appropriate error.  If
1485          * producing or copying out any subsequent chunk fails, the recvmsg(2)
1486          * call will still return the previously generated chunks (a "short
1487          * control read" if you will) as well as the MSG_CTRUNC flag.  This
1488          * approach is simple and clean, and it guarantees that we can always
1489          * copy out at least as many file descriptors as we copied in for this
1490          * segment, even if credentials are present as well.  However, the
1491          * approach does cause slightly more overhead when there are multiple
1492          * chunks per call, as those are copied out separately.
1493          *
1494          * Since the generated SCM_RIGHTS chunk is never larger than the
1495          * originally received SCM_RIGHTS chunk, the temporary "uds_ctlbuf"
1496          * buffer is always large enough to contain the chunk in its entirety.
1497          * SCM_CREDS chunks should always fit easily as well.
1498          *
1499          * The MSG_CTRUNC flag will be returned iff not the entire user-given
1500          * control buffer was filled and not all control chunks were delivered.
1501          * Our current implementation does not deliver partial chunks.  NetBSD
1502          * does, except for SCM_RIGHTS chunks.
1503          *
1504          * TODO: get rid of the redundancy in processing return values.
1505          */
1506         if (segflags & UDS_HAS_FDS) {
1507                 r = uds_recv_fds(uds, ctl, ctl_len, *ctl_off, user_endpt,
1508                     flags);
1509
1510                 /*
1511                  * At this point, 'r' contains one of the following:
1512                  *
1513                  *   r > 0      a chunk of 'r' bytes was added successfully.
1514                  *   r == 0     not enough space left; the chunk was not added.
1515                  *   r < 0      an error occurred; the chunk was not added.
1516                  */
1517                 if (r < 0 && *ctl_off == 0)
1518                         return r;
1519
1520                 if (r > 0) {
1521                         ctl_len -= r;
1522                         *ctl_off += r;
1523                 } else
1524                         *rflags |= MSG_CTRUNC;
1525         }
1526
1527         if (segflags & UDS_HAS_CRED) {
1528                 r = uds_recv_cred(uds, ctl, ctl_len, *ctl_off, credpos);
1529
1530                 /* As above. */
1531                 if (r < 0 && *ctl_off == 0)
1532                         return r;
1533
1534                 if (r > 0) {
1535                         ctl_len -= r;
1536                         *ctl_off += r;
1537                 } else
1538                         *rflags |= MSG_CTRUNC;
1539         }
1540
1541         return OK;
1542 }
1543
1544 /*
1545  * The current receive request is successful or, in the case of MSG_WAITALL,
1546  * has made progress.  Advance the receive buffer tail, either by discarding
1547  * the entire tail segment or by generating a new, smaller tail segment that
1548  * contains only the regular data left to be received from the original tail
1549  * segment.  Also wake up the sending side for connection-oriented sockets if
1550  * applicable, because there may now be room for more data to be sent.  Update
1551  * 'may_block' if we are now sure that the call may not block on MSG_WAITALL
1552  * after all.
1553  */
1554 static void
1555 uds_recv_advance(struct udssock * uds, size_t seglen, size_t datalen,
1556         size_t reslen, unsigned int segflags, int * may_block)
1557 {
1558         struct udssock *conn;
1559         struct uds_fd *ufd;
1560         size_t delta, nseglen, advance;
1561         unsigned int nfds;
1562
1563         /* Note that 'reslen' may be legitimately zero. */
1564         assert(reslen <= datalen);
1565
1566         if (uds_get_type(uds) != SOCK_STREAM && reslen < datalen)
1567                 reslen = datalen;
1568
1569         delta = datalen - reslen;
1570
1571         if (delta == 0) {
1572                 /*
1573                  * Fully consume the tail segment.  We advance the tail by the
1574                  * full segment length, thus moving up to either the next
1575                  * segment in the receive buffer, or an empty receive buffer.
1576                  */
1577                 advance = seglen;
1578
1579                 uds->uds_tail = uds_advance(uds->uds_tail, advance);
1580         } else {
1581                 /*
1582                  * Partially consume the tail segment.  We put a new segment
1583                  * header right in front of the remaining data, which obviously
1584                  * always fits.  Since any ancillary data was consumed along
1585                  * with the first data byte of the segment, the new segment has
1586                  * no ancillary data anymore (and thus a zero flags field).
1587                  */
1588                 nseglen = UDS_HDRLEN + delta;
1589                 assert(nseglen < seglen);
1590
1591                 advance = seglen - nseglen;
1592
1593                 uds->uds_tail = uds_advance(uds->uds_tail, advance);
1594
1595                 uds_store_hdr(uds, uds->uds_tail, nseglen, delta, 0);
1596         }
1597
1598         /*
1599          * For datagram-oriented sockets, we always consume at least a header.
1600          * For stream-type sockets, we either consume a zero-data segment along
1601          * with its ancillary data, or we consume at least one byte from a
1602          * segment that does have regular data.  In all other cases, the
1603          * receive call has already been ended by now.  Thus, we always advance
1604          * the tail of the receive buffer here.
1605          */
1606         assert(advance > 0);
1607
1608         /*
1609          * The receive buffer's used length (uds_len) and pointer to the
1610          * previous segment header (uds_last) are offsets from the tail.  Now
1611          * that we have moved the tail, we need to adjust these accordingly.
1612          * If the buffer is now empty, reset the tail to the buffer start so as
1613          * to avoid splitting inter-process copies whenever possible.
1614          */
1615         assert(uds->uds_len >= advance);
1616         uds->uds_len -= advance;
1617
1618         if (uds->uds_len == 0)
1619                 uds->uds_tail = 0;
1620
1621         /*
1622          * If uds_last is zero here, it was pointing to the segment we just
1623          * (partially) consumed.  By leaving it zero, it will still point to
1624          * the new or next segment.
1625          */
1626         if (uds->uds_last > 0) {
1627                 assert(uds->uds_len > 0);
1628                 assert(uds->uds_last >= advance);
1629                 uds->uds_last -= advance;
1630         }
1631
1632         /*
1633          * If there were any file descriptors associated with this segment,
1634          * close and free them now.
1635          */
1636         if (segflags & UDS_HAS_FDS) {
1637                 assert(!SIMPLEQ_EMPTY(&uds->uds_fds));
1638                 ufd = SIMPLEQ_FIRST(&uds->uds_fds);
1639                 nfds = ufd->ufd_count;
1640                 assert(nfds > 0);
1641
1642                 while (nfds-- > 0) {
1643                         assert(!SIMPLEQ_EMPTY(&uds->uds_fds));
1644                         ufd = SIMPLEQ_FIRST(&uds->uds_fds);
1645                         SIMPLEQ_REMOVE_HEAD(&uds->uds_fds, ufd_next);
1646
1647                         dprintf(("UDS: closing local fd %d\n", ufd->ufd_fd));
1648
1649                         closenb(ufd->ufd_fd);
1650
1651                         SIMPLEQ_INSERT_TAIL(&uds_freefds, ufd, ufd_next);
1652                 }
1653         }
1654
1655         /*
1656          * If there is now any data left in the receive buffer, then there has
1657          * been a reason that we haven't received it.  For stream sockets, that
1658          * reason is that the next segment has ancillary data.  In any case,
1659          * this means we should never block the current receive operation
1660          * waiting for more data.  Otherwise, we may block on MSG_WAITALL.
1661          */
1662         if (uds->uds_len > 0)
1663                 *may_block = FALSE;
1664
1665         /*
1666          * If the (non-datagram) socket has a peer that is not shut down for
1667          * writing, see if it can be woken up to send more data.  Note that
1668          * the event will never be processed immediately.
1669          */
1670         if (uds_is_connected(uds)) {
1671                 assert(uds_get_type(uds) != SOCK_DGRAM);
1672
1673                 conn = uds->uds_conn;
1674
1675                 if (!uds_is_shutdown(conn, SFL_SHUT_WR))
1676                         sockevent_raise(&conn->uds_sock, SEV_SEND);
1677         }
1678 }
1679
1680 /*
1681  * Process a receive request.  Return OK if the receive request has completed
1682  * successfully, SUSPEND if it should be tried again later, SOCKEVENT_EOF if an
1683  * end-of-file condition is reached, or a negative error code on failure.  In
1684  * all cases, the values of 'off' and 'ctl_off' must be updated if any progress
1685  * has been made; if either is non-zero, libsockevent will return the partial
1686  * progress rather than an error code or EOF.
1687  */
1688 int
1689 uds_recv(struct sock * sock, const struct sockdriver_data * data, size_t len,
1690         size_t * off, const struct sockdriver_data * ctl, socklen_t ctl_len,
1691         socklen_t * ctl_off, struct sockaddr * addr, socklen_t * addr_len,
1692         endpoint_t user_endpt, int flags, size_t min, int * rflags)
1693 {
1694         struct udssock *uds = (struct udssock *)sock;
1695         size_t seglen, datalen, reslen = 0 /*gcc*/, credpos = 0 /*gcc*/;
1696         unsigned int segflags;
1697         int r, partial, may_block = 0 /*gcc*/;
1698
1699         dprintf(("UDS: recv(%d,%zu,%zu,%u,%u,0x%x)\n",
1700             uds_get_id(uds), len, (off != NULL) ? *off : 0, ctl_len,
1701             (ctl_off != NULL) ? *ctl_off : 0, flags));
1702
1703         /*
1704          * Start by testing whether anything can be received at all, or whether
1705          * an error or EOF should be returned instead, or whether the receive
1706          * call should be suspended until later otherwise.  If no (regular or
1707          * control) data can be received, or if this was a test for select,
1708          * we bail out right after.
1709          */
1710         partial = (off != NULL && *off > 0);
1711
1712         if ((r = uds_recv_test(uds, len, min, partial, &may_block)) != OK)
1713                 return r;
1714
1715         /*
1716          * Copy out regular data, if any.  Do this before copying out control
1717          * data, because the latter is harder to undo on failure.  This data
1718          * copy function returns returns OK (0) if we are to return a result of
1719          * zero bytes (which is *not* EOF) to the caller without doing anything
1720          * else.  The function returns a nonzero positive segment length if we
1721          * should carry on with the receive call (as it happens, all its other
1722          * returned values may in fact be zero).
1723          */
1724         if ((r = uds_recv_data(uds, data, len, *off, addr, addr_len, rflags,
1725             &datalen, &reslen, &segflags, &credpos)) <= 0)
1726                 return r;
1727         seglen = (size_t)r;
1728
1729         /*
1730          * Copy out control data, if any: transfer and copy out records of file
1731          * descriptors, and/or copy out sender credentials.  This is the last
1732          * part of the call that may fail.
1733          */
1734         if ((r = uds_recv_ctl(uds, ctl, ctl_len, ctl_off, user_endpt, flags,
1735             segflags, credpos, rflags)) != OK)
1736                 return r;
1737
1738         /*
1739          * Now that the call has succeeded, move the tail of the receive
1740          * buffer, unless we were merely peeking.
1741          */
1742         if (!(flags & MSG_PEEK))
1743                 uds_recv_advance(uds, seglen, datalen, reslen, segflags,
1744                     &may_block);
1745         else
1746                 may_block = FALSE;
1747
1748         /*
1749          * If the MSG_WAITALL flag was given, we may still have to suspend the
1750          * call after partial success.  In particular, the receive call may
1751          * suspend after partial success if all of these conditions are met:
1752          *
1753          *   1) the socket is a stream-type socket;
1754          *   2) MSG_WAITALL is set;
1755          *   3) MSG_PEEK is not set;
1756          *   4) MSG_DONTWAIT is not set (tested upon return);
1757          *   5) the socket must not have a pending error (tested upon return);
1758          *   6) the socket must not be shut down for reading (tested later);
1759          *   7) the socket must still be connected to a peer (no EOF);
1760          *   8) the peer must not have been shut down for writing (no EOF);
1761          *   9) the next segment, if any, contains no ancillary data.
1762          *
1763          * Together, these points guarantee that the call could conceivably
1764          * receive more after being resumed.  Points 4 to 6 are covered by
1765          * libsockevent, which will end the call even if we return SUSPEND
1766          * here.  Due to segment merging, we cover point 9 by checking that
1767          * there is currently no next segment at all.  Once a new segment
1768          * arrives, the ancillary-data test is done then.
1769          */
1770         *off += reslen;
1771         if ((flags & MSG_WAITALL) && reslen < len && may_block)
1772                 return SUSPEND;
1773         else
1774                 return OK;
1775 }
1776
1777 /*
1778  * Test whether a receive request would block.  The given 'min' parameter
1779  * contains the minimum number of bytes that should be possible to receive
1780  * without blocking (the low receive watermark).  Return SUSPEND if the send
1781  * request would block.  Otherwise, return any other error code (including OK
1782  * or SOCKEVENT_EOF), and if 'size' is not a NULL pointer, it should be filled
1783  * with the number of bytes available for receipt right now (if not zero).
1784  * Note that if 'size' is not NULL, 'min' will always be zero.
1785  */
1786 int
1787 uds_test_recv(struct sock * sock, size_t min, size_t * size)
1788 {
1789         struct udssock *uds = (struct udssock *)sock;
1790         size_t seglen;
1791         unsigned int segflags;
1792         int r;
1793
1794         if ((r = uds_recv_test(uds, min, min, FALSE /*partial*/,
1795             NULL /*may_block*/)) == SUSPEND)
1796                 return r;
1797
1798         if (size != NULL && uds->uds_len > 0)
1799                 (void)uds_fetch_hdr(uds, uds->uds_tail, &seglen, size,
1800                     &segflags);
1801
1802         return r;
1803 }