/* UNIX Domain Sockets - io.c - sending and receiving */

/*
 * Our UDS sockets do not have a send buffer. They only have a receive buffer.
 * This receive buffer, when not empty, is split up in segments. Each segment
 * may contain regular data, ancillary data, both, or (for SOCK_SEQPACKET and
 * SOCK_DGRAM) neither. There are two types of ancillary data: in-flight file
 * descriptors and sender credentials. In addition, for SOCK_DGRAM sockets,
 * the segment may contain the sender's socket path (if the sender's socket is
 * bound). Each segment has a header, containing the full segment size, the
 * size of the actual data in the segment (if any), and a flags field that
 * states which ancillary data are associated with the segment (if any). For
 * SOCK_STREAM type sockets, new data may be merged into a previous segment,
 * but only if it has no ancillary data. For the other two socket types, each
 * packet has its own header. The resulting behavior should be in line with
 * the POSIX "Socket Receive Queue" specification.
 *
 * More specifically, each segment consists of the following parts:
 * - always a five-byte header, containing a two-byte segment length (including
 *   the header, so always non-zero), a two-byte regular data length (zero or
 *   more), and a one-byte flags field which is a bitwise combination of
 *   UDS_HAS_{FDS,CRED,PATH} flags;
 * - next, if UDS_HAS_CRED is set in the segment header: a sockcred structure;
 *   since this structure is variable-size, the structure is prepended by a
 *   single byte that contains the length of the structure (excluding the byte
 *   itself, thus ranging from sizeof(struct sockcred) to UDS_MAXCREDLEN);
 * - next, if UDS_HAS_PATH is set in the segment header: the source socket
 *   path, stored without null terminator and prepended by a single byte that
 *   contains the length of the path (excluding the byte itself);
 * - next, if the data length is non-zero, the actual regular data.
 * If the segment is not the last in the receive buffer, it is followed by the
 * next segment immediately afterward. There is no alignment.
 *
 * It is the sender's responsibility to merge new data into the last segment
 * whenever possible, so that the receiver side never needs to consider more
 * than one segment at once. In order to allow such merging, each receive
 * buffer has not only a tail and in-use length (pointing to the head when
 * combined) but also an offset from the tail to the last header, if any. Note
 * that the receiver may over time still look at multiple segments for a single
 * request: this happens when a MSG_WAITALL request empties the buffer and then
 * blocks - the next piece of arriving data can then obviously not be merged.
 *
 * If a segment has the UDS_HAS_FDS flag set, then one or more in-flight file
 * descriptors are associated with the segment. These are stored in a separate
 * data structure, mainly to simplify cleaning up when the socket is shut down
 * for reading or closed. That structure also contains the number of file
 * descriptors associated with the current segment, so this is not stored in
 * the segment itself. As mentioned later, this may be changed in the future.
 *
 * On the sender side, there is a trade-off between fully utilizing the receive
 * buffer, and not repeatedly performing expensive actions for the same call:
 * it may be costly to determine exactly how many in-flight file descriptors
 * there will be (if any) and/or how much space is needed to store credentials.
 * Our current policy is that we would rather block or reject a send request
 * that might (just) have fit in the remaining part of the receive buffer, than
 * obtain the same information multiple times or keep state between callbacks.
 * In practice this is not expected to make a difference, especially since
 * transfer of ancillary data should be rare anyway.
 *
 * The current layout of the segment header is as follows.
 *
 * The first byte contains the upper eight bits of the total segment length.
 * The second byte contains the lower eight bits of the total segment length.
 * The third byte contains the upper eight bits of the data length.
 * The fourth byte contains the lower eight bits of the data length.
 * The fifth byte is a bitmask for ancillary data associated with the segment.
 */
#define UDS_HAS_FDS	0x01	/* segment has in-flight file descriptors */
#define UDS_HAS_CRED	0x02	/* segment has sender credentials */
#define UDS_HAS_PATH	0x04	/* segment has source socket path */

#define UDS_MAXCREDLEN	SOCKCREDSIZE(NGROUPS_MAX)
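/*
 * Illustrative sketch, not part of the original source: the worst-case
 * per-segment overhead implied by the layout described at the top of this
 * file. The helper name is hypothetical; 'pathlen' is the length of the
 * source path to be included (without null terminator), or zero if none.
 */
static size_t
uds_worst_case_overhead(size_t pathlen)
{

	/*
	 * Header, plus credential length byte and maximum-size credentials,
	 * plus path length byte and path (bound SOCK_DGRAM senders only).
	 */
	return UDS_HDRLEN + 1 + UDS_MAXCREDLEN +
	    ((pathlen > 0) ? 1 + pathlen : 0);
}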
#define uds_get_head(uds)	\
	((size_t)((uds)->uds_tail + (uds)->uds_len) % UDS_BUF)
#define uds_get_last(uds)	\
	((size_t)((uds)->uds_tail + (uds)->uds_last) % UDS_BUF)
#define uds_advance(pos,add)	(((pos) + (add)) % UDS_BUF)
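/*
 * Illustrative sketch, not part of the original source: the invariants tying
 * together the tail, the in-use length, and the last-header offset of a
 * receive buffer, as used by the macros above. The head (where new data is
 * stored) and the last segment header (where stream data may be merged) are
 * both derived from the tail with modular arithmetic. For exposition only.
 */
static void
uds_buf_check_invariants(const struct udssock * uds)
{

	assert(uds->uds_tail < UDS_BUF);	/* absolute buffer position */
	assert(uds->uds_len <= UDS_BUF);	/* in-use length, from tail */
	assert(uds->uds_last <= uds->uds_len);	/* last header is in use */
	/* uds_get_head(uds) == ((uds)->uds_tail + (uds)->uds_len) % UDS_BUF */
	/* uds_get_last(uds) == ((uds)->uds_tail + (uds)->uds_last) % UDS_BUF */
}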
/*
 * All in-flight file descriptors are (co-)owned by the UDS driver itself, as
 * local open file descriptors. Like any other process, the UDS driver cannot
 * have more than OPEN_MAX open file descriptors at any time. Thus, this is
 * also the inherent maximum number of in-flight file descriptors. Therefore,
 * we maintain a single pool of in-flight FD structures, and we associate these
 * structures with sockets as needed.
 */
static struct uds_fd uds_fds[OPEN_MAX];
static SIMPLEQ_HEAD(uds_freefds, uds_fd) uds_freefds;

static char uds_ctlbuf[UDS_CTL_MAX];
static int uds_ctlfds[UDS_CTL_MAX / sizeof(int)];
/*
 * Initialize the input/output part of the UDS service.
 */
	SIMPLEQ_INIT(&uds_freefds);

	for (slot = 0; slot < __arraycount(uds_fds); slot++)
		SIMPLEQ_INSERT_TAIL(&uds_freefds, &uds_fds[slot], ufd_next);
/*
 * Set up all input/output state for the given socket, which has just been
 * allocated. As part of this, allocate memory for the receive buffer of the
 * socket. Return OK or a negative error code.
 */
uds_io_setup(struct udssock * uds)

	/* TODO: decide if we should preallocate the memory. */
	if ((uds->uds_buf = mmap(NULL, UDS_BUF, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE, -1, 0)) == MAP_FAILED)

	SIMPLEQ_INIT(&uds->uds_fds);
/*
 * Clean up the input/output state for the given socket, which is about to be
 * freed. As part of this, deallocate memory for the receive buffer and close
 * any file descriptors still in flight on the socket.
 */
uds_io_cleanup(struct udssock * uds)

	/* Close any in-flight file descriptors. */

	/* Free the receive buffer memory. */
	if (munmap(uds->uds_buf, UDS_BUF) != 0)
		panic("UDS: munmap failed: %d", errno);
/*
 * The socket is being closed or shut down for reading. If there are still any
 * in-flight file descriptors, they will never be received anymore, so close
 * them now.
 */
uds_io_reset(struct udssock * uds)

	/*
	 * The UDS service may have the last and only reference to any of these
	 * file descriptors here. For that reason, we currently disallow
	 * transfer of UDS file descriptors, because the close(2) here could
	 * block on a socket close operation back to us, leading to a deadlock.
	 * Also, we use a non-blocking variant of close(2), to prevent us from
	 * hanging on sockets with SO_LINGER turned on.
	 */
	SIMPLEQ_FOREACH(ufd, &uds->uds_fds, ufd_next) {
		dprintf(("UDS: closing local fd %d\n", ufd->ufd_fd));

		closenb(ufd->ufd_fd);

	SIMPLEQ_CONCAT(&uds_freefds, &uds->uds_fds);

	/*
	 * If this reset happens as part of a shutdown, it might be done
	 * again on close, so ensure that it will find a clean state. The
	 * receive buffer should never be looked at again either way, but reset
	 * it too just to be sure.
	 */

	SIMPLEQ_INIT(&uds->uds_fds);
/*
 * Return the maximum usable part of the receive buffer, in bytes. The return
 * value is used for the SO_SNDBUF and SO_RCVBUF socket options.
 */

	/*
	 * TODO: it would be nicer if at least for SOCK_STREAM-type sockets, we
	 * could use the full receive buffer for data. This would require that
	 * we store up to one header in the socket object rather than in the
	 * receive buffer.
	 */
	return UDS_BUF - UDS_HDRLEN;
/*
 * Fetch 'len' bytes starting from absolute position 'off' in the receive
 * buffer of socket 'uds', and copy them into the buffer pointed to by 'ptr'.
 * Return the absolute position of the first byte after the fetched data in
 * the receive buffer.
 */
uds_fetch(struct udssock * uds, size_t off, void * ptr, size_t len)

	assert(off < UDS_BUF);

	left = UDS_BUF - off;

	memcpy(ptr, &uds->uds_buf[off], left);

	if ((len -= left) > 0)
		memcpy((char *)ptr + left, &uds->uds_buf[0], len);

	memcpy(ptr, &uds->uds_buf[off], len);
/*
 * Store 'len' bytes from the buffer pointed to by 'ptr' into the receive
 * buffer of socket 'uds', starting at absolute position 'off' in the receive
 * buffer. Return the absolute position of the first byte after the stored
 * data in the receive buffer.
 */
uds_store(struct udssock * uds, size_t off, const void * ptr, size_t len)

	assert(off < UDS_BUF);

	left = UDS_BUF - off;

	memcpy(&uds->uds_buf[off], ptr, left);

	if ((len -= left) > 0)
		memcpy(&uds->uds_buf[0], (const char *)ptr + left,
		    len);

	memcpy(&uds->uds_buf[off], ptr, len);
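/*
 * Illustrative sketch, not part of the original source: a store followed by a
 * fetch at the same absolute position yields the original bytes, even when
 * the data wraps around the end of the circular receive buffer. This is the
 * property that the two helpers above provide to the rest of this file. The
 * function name is hypothetical and the code is for exposition only.
 */
static void
uds_copy_wraparound_example(struct udssock * uds)
{
	static const char in[] = "wrap-around";
	char out[sizeof(in)];
	size_t pos;

	pos = UDS_BUF - 3;	/* force the copy to wrap around */
	(void)uds_store(uds, pos, in, sizeof(in));
	(void)uds_fetch(uds, pos, out, sizeof(out));
	assert(memcmp(in, out, sizeof(in)) == 0);
}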
/*
 * Fetch a segment header previously stored in the receive buffer of socket
 * 'uds' at absolute position 'off'. Return the absolute position of the first
 * byte after the header, as well as the entire segment length in 'seglen', the
 * length of the data in the segment in 'datalen', and the segment flags in
 * 'segflags'.
 */
uds_fetch_hdr(struct udssock * uds, size_t off, size_t * seglen,
    size_t * datalen, unsigned int * segflags)

	unsigned char hdr[UDS_HDRLEN];

	off = uds_fetch(uds, off, hdr, sizeof(hdr));

	*seglen = ((size_t)hdr[0] << 8) | (size_t)hdr[1];
	*datalen = ((size_t)hdr[2] << 8) | (size_t)hdr[3];

	assert(*seglen >= UDS_HDRLEN);
	assert(*seglen <= uds->uds_len);
	assert(*datalen <= *seglen - UDS_HDRLEN);
	assert(*segflags != 0 || *datalen == *seglen - UDS_HDRLEN);
	assert(!(*segflags & ~(UDS_HAS_FDS | UDS_HAS_CRED | UDS_HAS_PATH)));
/*
 * Store a segment header in the receive buffer of socket 'uds' at absolute
 * position 'off', with the segment length 'seglen', the segment data length
 * 'datalen', and the segment flags 'segflags'. Return the absolute receive
 * buffer position of the first data byte after the stored header.
 */
uds_store_hdr(struct udssock * uds, size_t off, size_t seglen, size_t datalen,
    unsigned int segflags)

	unsigned char hdr[UDS_HDRLEN];

	assert(seglen <= USHRT_MAX);
	assert(datalen <= seglen);
	assert(segflags <= UCHAR_MAX);
	assert(!(segflags & ~(UDS_HAS_FDS | UDS_HAS_CRED | UDS_HAS_PATH)));

	hdr[0] = (seglen >> 8) & 0xff;
	hdr[1] = seglen & 0xff;
	hdr[2] = (datalen >> 8) & 0xff;
	hdr[3] = datalen & 0xff;

	return uds_store(uds, off, hdr, sizeof(hdr));
/*
 * Perform initial checks on a send request, before it may potentially be
 * suspended. Return OK if this send request is valid, or a negative error
 * code if it is not.
 */
uds_pre_send(struct sock * sock, size_t len, socklen_t ctl_len __unused,
    const struct sockaddr * addr, socklen_t addr_len __unused,
    endpoint_t user_endpt __unused, int flags)

	struct udssock *uds = (struct udssock *)sock;

	/*
	 * Reject calls with unknown flags. Besides the flags handled entirely
	 * by libsockevent (which are not part of 'flags' here), that is all of
	 * them. TODO: ensure that we should really reject all other flags
	 * rather than ignore them.
	 */

	/*
	 * Perform very basic address and message size checks on the send call.
	 * For non-stream sockets, we must reject packets that may never fit in
	 * the receive buffer, or otherwise (at least for SOCK_SEQPACKET) the
	 * send call may end up being suspended indefinitely. Therefore, we
	 * assume the worst-case scenario, which is that a full set of
	 * credentials must be associated with the packet. As a result, we may
	 * reject some large packets that could actually just fit. Checking
	 * the peer's LOCAL_CREDS setting here is not safe: even if we already
	 * know who the peer is (for SOCK_DGRAM we do not), the send may still
	 * block and the option toggled before it unblocks.
	 */
	switch (uds_get_type(uds)) {

		/* Nothing to check for this case. */

		if (len > UDS_BUF - UDS_HDRLEN - 1 - UDS_MAXCREDLEN)

		if (!uds_has_link(uds) && addr == NULL)

		/*
		 * The path is stored without null terminator, but with leading
		 * byte containing the path length--if there is a path at all.
		 */
		pathlen = (size_t)uds->uds_pathlen;

		if (len > UDS_BUF - UDS_HDRLEN - pathlen - 1 - UDS_MAXCREDLEN)
/*
 * Determine whether the (real or pretend) send request should be processed
 * now, suspended until later, or rejected based on the current socket state.
 * Return OK if the send request should be processed now. Return SUSPEND if
 * the send request should be retried later. Return an appropriate negative
 * error code if the send request should fail.
 */
uds_send_test(struct udssock * uds, size_t len, socklen_t ctl_len, size_t min,

	struct udssock *conn;
	size_t avail, hdrlen, credlen;

	assert(!uds_is_shutdown(uds, SFL_SHUT_WR));

	if (uds_get_type(uds) != SOCK_DGRAM) {
		if (uds_is_connecting(uds))

		if (!uds_is_connected(uds) && !uds_is_disconnected(uds))

		if (!uds_has_conn(uds))

		conn = uds->uds_conn;

		if (uds_is_shutdown(conn, SFL_SHUT_RD))

		/*
		 * For connection-type sockets, we now have to check if there
		 * is enough room in the receive buffer. For SOCK_STREAM
		 * sockets, we must check if at least 'min' bytes can be moved
		 * into the receive buffer, at least if that is a reasonable
		 * value for ever making any forward progress at all. For
		 * SOCK_SEQPACKET sockets, we must check if the entire packet
		 * of size 'len' can be stored in the receive buffer. In both
		 * cases, we must take into account any metadata to store along
		 * with the data.
		 *
		 * Unlike in uds_pre_send(), we can now check safely whether
		 * the peer is expecting credentials, but we still don't know
		 * the actual size of the credentials, so again we take the
		 * maximum possible size. The same applies to file descriptors
		 * transferred via control data: all we have is the control
		 * length right now, which if non-zero we assume to mean there
		 * might be file descriptors.
		 *
		 * In both cases, the reason for overestimating is that
		 * actually getting accurate sizes, by obtaining credentials or
		 * copying in control data, is very costly. We want to do that
		 * only when we are sure we will not suspend the send call
		 * after all. It is no problem to overestimate how much space
		 * will be needed here, but not to underestimate: that could
		 * cause applications that use select(2) and non-blocking
		 * sockets to end up in a busy-wait loop.
		 */
		if (!partial && (conn->uds_flags & UDSF_PASSCRED))
			credlen = 1 + UDS_MAXCREDLEN;

		avail = UDS_BUF - conn->uds_len;

		if (uds_get_type(uds) == SOCK_STREAM) {
			/*
			 * Limit the low threshold to the maximum that can ever
			 * be reached.
			 */
			if (min > UDS_BUF - UDS_HDRLEN - credlen)
				min = UDS_BUF - UDS_HDRLEN - credlen;

			/*
			 * Suspend the call only if not even the low threshold
			 * is met. Otherwise we may make (partial) progress.
			 */

		/*
		 * If the receive buffer already has at least one
		 * segment, and there are certainly no file descriptors
		 * to transfer now, and we do not have to store
		 * credentials either, then this segment can be merged
		 * with the previous one. In that case, we need no
		 * space for a header. That is certainly the case if
		 * we are resuming an already partially completed send.
		 */
		hdrlen = (avail == UDS_BUF || ctl_len != 0 ||
		    credlen > 0) ? UDS_HDRLEN : 0;

		if (avail < hdrlen + credlen + len)
/*
 * Get the destination peer for a send request. The send test has already been
 * performed first. On success, return OK, with a pointer to the peer socket
 * stored in 'peerp'. On failure, return an appropriate error code.
 */
uds_send_peer(struct udssock * uds, const struct sockaddr * addr,
    socklen_t addr_len, endpoint_t user_endpt, struct udssock ** peerp)

	struct udssock *peer;

	if (uds_get_type(uds) == SOCK_DGRAM) {
		if (!uds_has_link(uds)) {
			/* This was already checked in uds_pre_send(). */
			assert(addr != NULL);

			/*
			 * Find the socket identified by the given address.
			 * If it exists at all, see if it is a proper match.
			 */
			if ((r = uds_lookup(uds, addr, addr_len, user_endpt,

			/*
			 * If the peer socket is connected to a target, it
			 * must be this socket. Unfortunately, POSIX does not
			 * specify an error code for this. We borrow Linux's.
			 */
			if (uds_has_link(peer) && peer->uds_link != uds)

			peer = uds->uds_link;

		/*
		 * If the receiving end will never receive this packet, we
		 * might as well not send it, so drop it immediately. Indicate
		 * as such to the caller, using NetBSD's chosen error code.
		 */
		if (uds_is_shutdown(peer, SFL_SHUT_RD))

		assert(uds_has_conn(uds));

		peer = uds->uds_conn;
/*
 * Generate a new segment for the current send request, or arrange things such
 * that new data can be merged with a previous segment. As part of this,
 * decide whether we can merge data at all. The segment will be merged if, and
 * only if, all of the following requirements are met:
 *
 * 1) the socket is of type SOCK_STREAM;
 * 2) there is a previous segment in the receive buffer;
 * 3) there is no ancillary data for the current send request.
 *
 * Also copy in regular data (if any), retrieve the sender's credentials (if
 * needed), and copy over the source path (if applicable). However, do not yet
 * commit the segment (or the new part to be merged), because the send request
 * may still fail for other reasons.
 *
 * On success, return the length of the new segment (or, when merging, the
 * length to be added to the last segment), as well as a flag indicating
 * whether we are merging into the last segment in 'mergep', the length of the
 * (new) data in the segment in 'datalenp', and the new segment's flags in
 * 'segflagsp' (always zero when merging). Note that a return value of zero
 * implies that we are merging zero extra bytes into the last segment, which
 * means that effectively nothing changes; in that case the send call will be
 * cut short and return zero to the caller as well. On failure, return a
 * negative error code.
 */
uds_send_data(struct udssock * uds, struct udssock * peer,
    const struct sockdriver_data * data, size_t len, size_t off,
    endpoint_t user_endpt, unsigned int nfds, int * __restrict mergep,
    size_t * __restrict datalenp, unsigned int * __restrict segflagsp)

	struct sockcred sockcred;
	gid_t groups[NGROUPS_MAX];

	unsigned int iovcnt, segflags;
	unsigned char lenbyte;
	size_t credlen, pathlen, datalen, seglen;
	size_t avail, pos, left;

	/*
	 * At this point we should add the data to the peer's receive buffer.
	 * In the case of SOCK_STREAM sockets, we should add as much of the
	 * data as possible and suspend the call to send the rest later, if
	 * applicable. In the case of SOCK_DGRAM sockets, we should drop the
	 * packet if it does not fit in the buffer.
	 *
	 * Due to the checks in uds_send_test(), we know for sure that we no
	 * longer have to suspend without making any progress at this point.
	 */
	segflags = (nfds > 0) ? UDS_HAS_FDS : 0;
	/*
	 * Obtain the credentials now. Doing so allows us to determine how
	 * much space we actually need for them.
	 */
	if (off == 0 && (peer->uds_flags & UDSF_PASSCRED)) {
		memset(&sockcred, 0, sizeof(sockcred));

		if ((r = getsockcred(user_endpt, &sockcred, groups,
		    __arraycount(groups))) != OK)

		/*
		 * getsockcred(3) returns the total number of groups for the
		 * process, which may exceed the size of the given array. Our
		 * groups array should always be large enough for all groups,
		 * but we check to be sure anyway.
		 */
		assert(sockcred.sc_ngroups <= (int)__arraycount(groups));

		credlen = 1 + SOCKCREDSIZE(sockcred.sc_ngroups);

		segflags |= UDS_HAS_CRED;

	/* For bound source datagram sockets, include the source path. */
	if (uds_get_type(uds) == SOCK_DGRAM && uds->uds_pathlen != 0) {
		pathlen = (size_t)uds->uds_pathlen + 1;

		segflags |= UDS_HAS_PATH;
	avail = UDS_BUF - peer->uds_len;

	if (uds_get_type(uds) == SOCK_STREAM) {
		/*
		 * Determine whether we can merge data into the previous
		 * segment. This is a more refined version of the test in
		 * uds_send_test(), as we now know whether there are actually
		 * any FDs to transfer.
		 */
		merge = (peer->uds_len != 0 && nfds == 0 && credlen == 0);

		/* Determine how much we can send at once. */

		assert(avail > UDS_HDRLEN + credlen);
		datalen = avail - UDS_HDRLEN - credlen;

		/* If we cannot make progress, we should have suspended. */
		assert(datalen != 0 || len == 0);

	assert(datalen <= len);
	assert(datalen <= UDS_BUF);

	/*
	 * Compute the total amount of space we need for the segment in the
	 * receive buffer. Given that we have done will-it-fit tests in
	 * uds_send_test() for SOCK_STREAM and SOCK_SEQPACKET, there is only
	 * one case left where the result may not fit, and that is for
	 * SOCK_DGRAM packets. In that case, we drop the packet. POSIX says
	 * we should throw an error in that case, and that is also what NetBSD
	 * does.
	 */
	seglen = UDS_HDRLEN + credlen + pathlen + datalen;

	if (seglen > avail) {
		assert(uds_get_type(uds) == SOCK_DGRAM);

		/* Drop the packet, borrowing NetBSD's chosen error code. */
	/*
	 * Generate the full segment, but do not yet update the buffer head.
	 * We may still run into an error (copying in file descriptors) or even
	 * decide that nothing gets sent after all (if there are no data or
	 * file descriptors). If we are merging the new data into the previous
	 * segment, do not generate a header.
	 */
	pos = uds_get_head(peer);

	/* Generate the header, if needed. */
		pos = uds_store_hdr(peer, pos, seglen, datalen, segflags);

		assert(segflags == 0);

	/* Copy in and store the sender's credentials, if desired. */
		assert(credlen >= 1 + sizeof(sockcred));
		assert(credlen <= UCHAR_MAX);

		lenbyte = credlen - 1;
		pos = uds_store(peer, pos, &lenbyte, 1);

		if (sockcred.sc_ngroups > 0) {
			pos = uds_store(peer, pos, &sockcred,
			    offsetof(struct sockcred, sc_groups));
			pos = uds_store(peer, pos, groups,
			    sockcred.sc_ngroups * sizeof(gid_t));

			pos = uds_store(peer, pos, &sockcred,

	/* Store the sender's address if any. Datagram sockets only. */
		assert(pathlen <= UCHAR_MAX);

		lenbyte = uds->uds_pathlen;
		pos = uds_store(peer, pos, &lenbyte, 1);
		pos = uds_store(peer, pos, uds->uds_path, pathlen - 1);

	/* Lastly, copy in the actual data (if any) from the caller. */
		iov[0].iov_addr = (vir_bytes)&peer->uds_buf[pos];
		left = UDS_BUF - pos;

		if (left < datalen) {
			iov[0].iov_size = left;
			iov[1].iov_addr = (vir_bytes)&peer->uds_buf[0];
			iov[1].iov_size = datalen - left;

			iov[0].iov_size = datalen;

		if ((r = sockdriver_vcopyin(data, off, iov, iovcnt)) != OK)

	*segflagsp = segflags;
/*
 * Copy in control data for the current send request, and extract any file
 * descriptors to be transferred. Do not yet duplicate the file descriptors,
 * but rather store a list in a temporary buffer: the send request may still
 * fail, in which case we want to avoid having to undo the duplication.
 *
 * On success, return the number of (zero or more) file descriptors extracted
 * from the request and stored in the temporary buffer. On failure, return a
 * negative error code.
 */
uds_send_ctl(const struct sockdriver_data * ctl, socklen_t ctl_len,
    endpoint_t user_endpt)

	struct msghdr msghdr;
	struct cmsghdr *cmsg;

	unsigned int i, n, nfds;

	/*
	 * Copy in the control data. We can spend a lot of effort copying in
	 * the data in small chunks, and change the receiving side to do the
	 * same, but it is really not worth it: applications never send a whole
	 * lot of file descriptors at once, and the buffer size is currently
	 * such that the UDS service itself will exhaust its OPEN_MAX limit
	 */
	if (ctl_len > sizeof(uds_ctlbuf))

	if ((r = sockdriver_copyin(ctl, 0, uds_ctlbuf, ctl_len)) != OK)

	if (ctl_len < sizeof(uds_ctlbuf))
		memset(&uds_ctlbuf[ctl_len], 0, sizeof(uds_ctlbuf) - ctl_len);

	/*
	 * Look for any file descriptors, and store their remote file
	 * descriptor numbers into a temporary array.
	 */
	memset(&msghdr, 0, sizeof(msghdr));
	msghdr.msg_control = uds_ctlbuf;
	msghdr.msg_controllen = ctl_len;

	/*
	 * The sender may provide file descriptors in multiple chunks.
	 * Currently we do not preserve these chunk boundaries, instead
	 * generating one single chunk with all file descriptors for the
	 * segment upon receipt. If needed, we can fairly easily adapt this
	 */
	for (cmsg = CMSG_FIRSTHDR(&msghdr); cmsg != NULL;
	    cmsg = CMSG_NXTHDR(&msghdr, cmsg)) {
		/*
		 * Check for bogus lengths. There is no excuse for this;
		 * either the caller does not know what they are doing or we
		 * are looking at a hacking attempt.
		 */
		assert((socklen_t)((char *)cmsg - uds_ctlbuf) <= ctl_len);
		left = ctl_len - (socklen_t)((char *)cmsg - uds_ctlbuf);
		assert(left >= CMSG_LEN(0)); /* guaranteed by CMSG_xxHDR */

		if (cmsg->cmsg_len < CMSG_LEN(0) || cmsg->cmsg_len > left) {
			printf("UDS: malformed control data from %u\n",

		if (cmsg->cmsg_level != SOL_SOCKET ||
		    cmsg->cmsg_type != SCM_RIGHTS)

		n = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int);

		for (i = 0; i < n; i++) {
			/*
			 * Copy the file descriptor to the temporary buffer,
			 * whose size is based on the control data buffer, so
			 * it is always large enough to contain all FDs.
			 */
			assert(nfds < __arraycount(uds_ctlfds));

			memcpy(&uds_ctlfds[nfds],
			    &((int *)CMSG_DATA(cmsg))[i], sizeof(int));
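/*
 * Illustrative sketch, not part of the original source: what the control data
 * parsed by uds_send_ctl() above typically looks like from the application
 * side. A sender passes file descriptors as one or more SCM_RIGHTS chunks to
 * sendmsg(2). The helper below is hypothetical and uses only standard POSIX
 * interfaces; it sends a single descriptor 'fd' over the connected UNIX
 * domain socket 'sock'. It is kept disabled because it is userland code.
 */
#if 0
static int
send_one_fd(int sock, int fd)
{
	struct msghdr msg;
	struct cmsghdr *cmsg;
	struct iovec iov;
	union {
		struct cmsghdr align;
		char buf[CMSG_SPACE(sizeof(int))];
	} u;
	char dummy = 0;

	iov.iov_base = &dummy;		/* at least one regular data byte */
	iov.iov_len = sizeof(dummy);

	memset(&msg, 0, sizeof(msg));
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = u.buf;
	msg.msg_controllen = sizeof(u.buf);

	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));

	return (sendmsg(sock, &msg, 0) < 0) ? -1 : 0;
}
#endif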
/*
 * Actually duplicate any file descriptors that we extracted from the sender's
 * control data and stored in our temporary buffer. On success, return OK,
 * with all file descriptors stored in file descriptor objects that are
 * appended to the socket's list of in-flight FD objects. Thus, on success,
 * the send request may no longer fail. On failure, return a negative error
 * code, with any partial duplication undone.
 */
uds_send_fds(struct udssock * peer, unsigned int nfds, endpoint_t user_endpt)

	SIMPLEQ_HEAD(, uds_fd) fds;

	for (i = 0; i < nfds; i++) {
		if (SIMPLEQ_EMPTY(&uds_freefds)) {
			/* UDS itself may already have OPEN_MAX FDs. */

		/*
		 * The caller may have given an invalid FD, or UDS itself may
		 * unexpectedly have run out of available file descriptors etc.
		 */
		if ((r = copyfd(user_endpt, uds_ctlfds[i], COPYFD_FROM)) < 0)

		ufd = SIMPLEQ_FIRST(&uds_freefds);
		SIMPLEQ_REMOVE_HEAD(&uds_freefds, ufd_next);

		SIMPLEQ_INSERT_TAIL(&fds, ufd, ufd_next);

		dprintf(("UDS: copied in fd %d -> %d\n", uds_ctlfds[i], r));

	/* Did we experience an error while copying in the file descriptors? */

		/* Revert the successful copyfd() calls made so far. */
		SIMPLEQ_FOREACH(ufd, &fds, ufd_next) {
			dprintf(("UDS: closing local fd %d\n", ufd->ufd_fd));

			closenb(ufd->ufd_fd);

		SIMPLEQ_CONCAT(&uds_freefds, &fds);

	/*
	 * Success. If there were any file descriptors at all, add them to the
	 * peer's list of in-flight file descriptors. Assign the number of
	 * file descriptors copied in to the first file descriptor object, so
	 * that we know how many to copy out (or discard) for this segment.
	 * Also set the UDS_HAS_FDS flag on the segment.
	 */
		ufd = SIMPLEQ_FIRST(&fds);
		ufd->ufd_count = nfds;

		SIMPLEQ_CONCAT(&peer->uds_fds, &fds);
/*
 * The current send request is successful or at least has made progress.
 * Commit the new segment or, if we decided to merge the new data into the last
 * segment, update the header of the last segment. Also wake up the receiving
 * side, because there will now be new data to receive.
 */
uds_send_advance(struct udssock * uds, struct udssock * peer, size_t datalen,
    int merge, size_t seglen, unsigned int segflags)

	size_t pos, prevseglen, prevdatalen;

	/*
	 * For non-datagram sockets, credentials are sent only once after
	 * setting the LOCAL_CREDS option. After that, the option is unset.
	 */
	if ((segflags & UDS_HAS_CRED) && uds_get_type(uds) != SOCK_DGRAM)
		peer->uds_flags &= ~UDSF_PASSCRED;

		assert(segflags == 0);

		pos = uds_get_last(peer);

		(void)uds_fetch_hdr(peer, pos, &prevseglen, &prevdatalen,

		peer->uds_len += seglen;
		assert(peer->uds_len <= UDS_BUF);

		seglen += prevseglen;
		datalen += prevdatalen;
		assert(seglen <= UDS_BUF);

		uds_store_hdr(peer, pos, seglen, datalen, segflags);

		peer->uds_last = peer->uds_len;

		peer->uds_len += seglen;
		assert(peer->uds_len <= UDS_BUF);

	/* Now that there are new data, wake up the receiver side. */
	sockevent_raise(&peer->uds_sock, SEV_RECV);
/*
 * Process a send request. Return OK if the send request has successfully
 * completed, SUSPEND if it should be tried again later, or a negative error
 * code on failure. In all cases, the values of 'off' and 'ctl_off' must be
 * updated if any progress has been made; if either is non-zero, libsockevent
 * will return the partial progress rather than an error code.
 */
uds_send(struct sock * sock, const struct sockdriver_data * data, size_t len,
    size_t * off, const struct sockdriver_data * ctl, socklen_t ctl_len,
    socklen_t * ctl_off, const struct sockaddr * addr, socklen_t addr_len,
    endpoint_t user_endpt, int flags __unused, size_t min)

	struct udssock *uds = (struct udssock *)sock;
	struct udssock *peer;
	size_t seglen, datalen = 0 /*gcc*/;
	unsigned int nfds, segflags = 0 /*gcc*/;
	int r, partial, merge = 0 /*gcc*/;

	dprintf(("UDS: send(%d,%zu,%zu,%u,%u,0x%x)\n",
	    uds_get_id(uds), len, (off != NULL) ? *off : 0, ctl_len,
	    (ctl_off != NULL) ? *ctl_off : 0, flags));

	partial = (off != NULL && *off > 0);

	/*
	 * First see whether we can process this send call at all right now.
	 * Most importantly, for connected sockets, if the peer's receive
	 * buffer is full, we may have to suspend the call until some space has
	 * been freed up.
	 */
	if ((r = uds_send_test(uds, len, ctl_len, min, partial)) != OK)

	/*
	 * Then get the peer socket. For connected sockets, this is trivial.
	 * For unconnected sockets, it may involve a lookup of the given
	 * address.
	 */
	if ((r = uds_send_peer(uds, addr, addr_len, user_endpt, &peer)) != OK)

	/*
	 * We now know for sure that we will not suspend this call without
	 * making any progress. However, the call may still fail. Copy in
	 * control data first now, so that we know whether there are any file
	 * descriptors to transfer. This aspect may determine whether or not
	 * we can merge data with a previous segment. Do not copy in the
	 * actual file descriptors yet, because that is much harder to undo
	 * in case of a failure later on.
	 */
		/* We process control data once, in full. */
		assert(*ctl_off == 0);

		if ((r = uds_send_ctl(ctl, ctl_len, user_endpt)) < 0)

		nfds = (unsigned int)r;

	/*
	 * Now generate a new segment, or (if possible) merge new data into the
	 * last segment. Since the call may still fail, prepare the segment
	 * but do not update the buffer head yet. Note that the segment
	 * contains not just regular data (in fact it may contain no data at
	 * all) but (also) certain ancillary data.
	 */
	if ((r = uds_send_data(uds, peer, data, len, *off, user_endpt, nfds,
	    &merge, &datalen, &segflags)) <= 0)

	/*
	 * If we extracted any file descriptors from the control data earlier,
	 * copy them over to ourselves now. The resulting in-flight file
	 * descriptors are stored in a separate data structure. This is the
	 * last point where the send call may actually fail.
	 */
		if ((r = uds_send_fds(peer, nfds, user_endpt)) != OK)

	/*
	 * The transmission is now known to be (partially) successful. Commit
	 * the new work by moving the receive buffer head.
	 */
	uds_send_advance(uds, peer, datalen, merge, seglen, segflags);

	/*
	 * Register the result. For stream-type sockets, the expected behavior
	 * is that all data be sent, and so we may still have to suspend the
	 * call after partial progress. Otherwise, we are now done. Either
	 * way, we are done with the control data, so mark it as consumed.
	 */
		*ctl_off += ctl_len;

	if (uds_get_type(uds) == SOCK_STREAM && datalen < len)
/*
 * Test whether a send request would block. The given 'min' parameter contains
 * the minimum number of bytes that should be possible to send without blocking
 * (the low send watermark). Return SUSPEND if the send request would block,
 * or any other error code if it would not.
 */
uds_test_send(struct sock * sock, size_t min)

	struct udssock *uds = (struct udssock *)sock;

	return uds_send_test(uds, min, 0, min, FALSE /*partial*/);
/*
 * Perform initial checks on a receive request, before it may potentially be
 * suspended. Return OK if this receive request is valid, or a negative error
 * code if it is not.
 */
uds_pre_recv(struct sock * sock __unused, endpoint_t user_endpt __unused,

	/*
	 * Reject calls with unknown flags. TODO: ensure that we should really
	 * reject all other flags rather than ignore them.
	 */
	if ((flags & ~(MSG_PEEK | MSG_WAITALL | MSG_CMSG_CLOEXEC)) != 0)
/*
 * Determine whether the (real or pretend) receive request should be processed
 * now, suspended until later, or rejected based on the current socket state.
 * Return OK if the receive request should be processed now, along with a first
 * indication whether the call may still be suspended later in 'may_block'.
 * Return SUSPEND if the receive request should be retried later. Return an
 * appropriate negative error code if the receive request should fail.
 */
uds_recv_test(struct udssock * uds, size_t len, size_t min, int partial,

	size_t seglen, datalen;
	unsigned int segflags;

	/*
	 * If there are any pending data, those should always be received
	 * first. However, if there is nothing to receive, then whether we
	 * should suspend the receive call or fail immediately depends on other
	 * conditions. We first look at these other conditions.
	 */
	if (uds_get_type(uds) != SOCK_DGRAM) {
		if (uds_is_connecting(uds))

		else if (!uds_is_connected(uds) && !uds_is_disconnected(uds))

		else if (!uds_has_conn(uds) ||
		    uds_is_shutdown(uds->uds_conn, SFL_SHUT_WR))

	if (uds->uds_len == 0) {
		/*
		 * For stream-type sockets, we use the policy: if no regular
		 * data is requested, then end the call without receiving
		 * anything. For packet-type sockets, the request should block
		 * until there is a packet to discard, though.
		 */
		if (r != OK || (uds_get_type(uds) == SOCK_STREAM && len == 0))

	/*
	 * For stream-type sockets, we should still suspend the call if fewer
	 * than 'min' bytes are available right now, and there is a possibility
	 * that more data may arrive later. More may arrive later iff 'r' is
	 * OK (i.e., no EOF or error will follow) and, in case we already
	 * received some partial results, there is not already a next segment
	 * with ancillary data (i.e., nonzero segment flags), or in any case
	 * there isn't more than one segment in the buffer. Limit 'min' to the
	 * maximum that can ever be received, though. Since that is difficult
	 * in our case, we check whether the buffer is entirely full instead.
	 */
	if (r == OK && uds_get_type(uds) == SOCK_STREAM && min > 0 &&
	    uds->uds_len < UDS_BUF) {
		assert(uds->uds_len >= UDS_HDRLEN);

		(void)uds_fetch_hdr(uds, uds->uds_tail, &seglen, &datalen,

		if (datalen < min && seglen == uds->uds_len &&
		    (!partial || segflags == 0))

	/*
	 * Also start the decision process as to whether we should suspend the
	 * current call if MSG_WAITALL is given. Unfortunately there is no one
	 * place where we can conveniently do all the required checks.
	 */
	if (may_block != NULL)
		*may_block = (r == OK && uds_get_type(uds) == SOCK_STREAM);
/*
 * Receive regular data, and possibly the source path, from the tail segment in
 * the receive buffer. On success, return the positive non-zero length of the
 * tail segment, with 'addr' and 'addr_len' modified to store the source
 * address if applicable, the result flags in 'rflags' updated as appropriate,
 * the tail segment's data length stored in 'datalen', the number of received
 * regular data bytes stored in 'reslen', the segment flags stored in
 * 'segflags', and the absolute receive buffer position of the credentials in
 * the segment stored in 'credpos' if applicable. Since the receive call may
 * still fail, this function must not yet update the tail or any other aspect
 * of the receive buffer. Return zero if the current receive call was already
 * partially successful (due to MSG_WAITALL) and can no longer make progress,
 * and thus should be ended. Return a negative error code on failure.
 */
uds_recv_data(struct udssock * uds, const struct sockdriver_data * data,
    size_t len, size_t off, struct sockaddr * addr, socklen_t * addr_len,
    int * __restrict rflags, size_t * __restrict datalen,
    size_t * __restrict reslen, unsigned int * __restrict segflags,
    size_t * __restrict credpos)

	unsigned char lenbyte;
	unsigned int iovcnt;
	size_t pos, seglen, left;

	pos = uds_fetch_hdr(uds, uds->uds_tail, &seglen, datalen, segflags);

	/*
	 * If a partially completed receive now runs into a segment that cannot
	 * be logically merged with the previous one (because it has at least
	 * one segment flag set, meaning it has ancillary data), then we must
	 * shortcut the receive now.
	 */
	if (off != 0 && *segflags != 0)

	/*
	 * As stated, for stream-type sockets, we choose to ignore zero-size
	 * receive calls. This has the consequence that reading a zero-sized
	 * segment (with ancillary data) requires a receive request for at
	 * least one regular data byte. Such a receive call would then return
	 * zero. The problem with handling zero-data receive requests is that
	 * we need to know whether the current segment is terminated (i.e., no
	 * more data can possibly be merged into it later), which is a test
	 * that we would rather not perform, not least because we do not know
	 * whether there is an error pending on the socket.
	 *
	 * For datagrams, we currently allow a zero-size receive call to
	 * discard the next datagram.
	 *
	 * TODO: compare this against policies on other platforms.
	 */
	if (len == 0 && uds_get_type(uds) == SOCK_STREAM)

	/*
	 * We have to skip the credentials for now: these are copied out as
	 * control data, and thus will (well, may) be looked at when dealing
	 * with the control data. For the same reason, we do not even look at
	 * their contents here.
	 */
	if (*segflags & UDS_HAS_CRED) {

		pos = uds_fetch(uds, pos, &lenbyte, 1);
		pos = uds_advance(pos, (size_t)lenbyte);

	/*
	 * Copy out the source address, but only if the (datagram) socket is
	 * not connected. TODO: even when it is connected, it may still
	 * receive packets sent to it from other sockets *before* being
	 * connected, and the receiver has no way of knowing that those packets
	 * did not come from its new peer. Ideally, the older packets should
	 * be dropped.
	 */
	if (*segflags & UDS_HAS_PATH) {
		pos = uds_fetch(uds, pos, &lenbyte, 1);

		if (uds_get_type(uds) == SOCK_DGRAM && !uds_has_link(uds))
			uds_make_addr((const char *)&uds->uds_buf[pos],
			    (size_t)lenbyte, addr, addr_len);

		pos = uds_advance(pos, (size_t)lenbyte);

	/*
	 * We can receive no more data than those that are present in the
	 * segment, obviously. For stream-type sockets, any more data that
	 * could have been received along with the current data would have been
	 * merged in the current segment, so we need not search for any next
	 * segment here.
	 *
	 * For non-stream sockets, the caller may receive less than a whole
	 * packet if it supplied a small buffer. In that case, the rest of the
	 * packet will be discarded (but not here yet!) and the caller gets
	 * the MSG_TRUNC flag in its result, if it was using recvmsg(2) anyway.
	 */

	else if (len < *datalen && uds_get_type(uds) != SOCK_STREAM)
		*rflags |= MSG_TRUNC;

	/* Copy out the data to the caller. */
	iov[0].iov_addr = (vir_bytes)&uds->uds_buf[pos];
	left = UDS_BUF - pos;

		iov[0].iov_size = left;
		iov[1].iov_addr = (vir_bytes)&uds->uds_buf[0];
		iov[1].iov_size = len - left;

		iov[0].iov_size = len;

	if ((r = sockdriver_vcopyout(data, off, iov, iovcnt)) != OK)

	assert(seglen > 0 && seglen <= INT_MAX);
/*
 * The current segment has associated file descriptors. If possible, copy out
 * all file descriptors to the receiver, and generate and copy out a chunk of
 * control data that contains their file descriptor numbers. If not all
 * file descriptors fit in the receiver's buffer, or if any error occurs, no
 * file descriptors are copied out.
 */
uds_recv_fds(struct udssock * uds, const struct sockdriver_data * ctl,
    socklen_t ctl_len, socklen_t ctl_off, endpoint_t user_endpt, int flags)

	struct msghdr msghdr;
	struct cmsghdr *cmsg;

	unsigned int i, nfds;
	socklen_t chunklen, chunkspace;

	/* See how many file descriptors should be part of this chunk. */
	assert(!SIMPLEQ_EMPTY(&uds->uds_fds));
	ufd = SIMPLEQ_FIRST(&uds->uds_fds);
	nfds = ufd->ufd_count;

	/*
	 * We produce and copy out potentially unaligned chunks, using
	 * CMSG_LEN, but return the aligned size at the end, using CMSG_SPACE.
	 * This may leave "gap" bytes unchanged in userland, but that should
	 * not be a problem. By producing unaligned chunks, we eliminate a
	 * potential boundary case where the unaligned chunk passed in (by the
	 * sender) no longer fits in the same buffer after being aligned here.
	 */
	chunklen = CMSG_LEN(sizeof(int) * nfds);
	chunkspace = CMSG_SPACE(sizeof(int) * nfds);
	assert(chunklen <= sizeof(uds_ctlbuf));
	if (chunklen > ctl_len)
		return 0; /* chunk would not fit, so produce nothing instead */
	if (chunkspace > ctl_len)
		chunkspace = ctl_len;

	memset(&msghdr, 0, sizeof(msghdr));
	msghdr.msg_control = uds_ctlbuf;
	msghdr.msg_controllen = sizeof(uds_ctlbuf);

	memset(uds_ctlbuf, 0, chunklen);
	cmsg = CMSG_FIRSTHDR(&msghdr);
	cmsg->cmsg_len = chunklen;
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;

	/*
	 * Copy the group's local file descriptors to the target endpoint, and
	 * store the resulting remote file descriptors in the chunk buffer.
	 */
	for (i = 0; i < nfds; i++) {
		assert(ufd != SIMPLEQ_END(&uds->uds_fds));
		assert(i == 0 || ufd->ufd_count == 0);

		if (flags & MSG_CMSG_CLOEXEC)
			what |= COPYFD_CLOEXEC;

		/* Failure may happen legitimately here (e.g., EMFILE). */
		if ((r = copyfd(user_endpt, ufd->ufd_fd, what)) < 0)
			break; /* we keep our progress so far in 'i' */

		dprintf(("UDS: copied out fd %d -> %d\n", ufd->ufd_fd, fd));

		memcpy(&((int *)CMSG_DATA(cmsg))[i], &fd, sizeof(int));

		ufd = SIMPLEQ_NEXT(ufd, ufd_next);

	/* If everything went well so far, copy out the produced chunk. */

		r = sockdriver_copyout(ctl, ctl_off, uds_ctlbuf, chunklen);

	/*
	 * Handle errors. At this point, the 'i' variable contains the number
	 * of file descriptors that have already been successfully copied out.
	 */

		/* Revert the successful copyfd() calls made so far. */

			memcpy(&fd, &((int *)CMSG_DATA(cmsg))[i], sizeof(int));

			(void)copyfd(user_endpt, fd, COPYFD_CLOSE);

	/*
	 * Success. Return the aligned size of the produced chunk, if the
	 * given length permits it. From here on, the receive call may no
	 * longer fail, as that would result in lost file descriptors.
	 */
/*
 * Generate and copy out a chunk of control data with the sender's credentials.
 * Return the aligned chunk size on success, or a negative error code on
 * failure.
 */
uds_recv_cred(struct udssock * uds, const struct sockdriver_data * ctl,
    socklen_t ctl_len, socklen_t ctl_off, size_t credpos)

	struct msghdr msghdr;
	struct cmsghdr *cmsg;
	socklen_t chunklen, chunkspace;
	unsigned char lenbyte;

	/*
	 * Since the sender side already did the hard work of producing the
	 * (variable-size) sockcred structure as it should be received, there
	 * is relatively little work to be done here.
	 */
	credpos = uds_fetch(uds, credpos, &lenbyte, 1);
	credlen = (size_t)lenbyte;

	chunklen = CMSG_LEN(credlen);
	chunkspace = CMSG_SPACE(credlen);
	assert(chunklen <= sizeof(uds_ctlbuf));
	if (chunklen > ctl_len)
		return 0; /* chunk would not fit, so produce nothing instead */
	if (chunkspace > ctl_len)
		chunkspace = ctl_len;

	memset(&msghdr, 0, sizeof(msghdr));
	msghdr.msg_control = uds_ctlbuf;
	msghdr.msg_controllen = sizeof(uds_ctlbuf);

	memset(uds_ctlbuf, 0, chunklen);
	cmsg = CMSG_FIRSTHDR(&msghdr);
	cmsg->cmsg_len = chunklen;
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_CREDS;

	uds_fetch(uds, credpos, CMSG_DATA(cmsg), credlen);

	if ((r = sockdriver_copyout(ctl, ctl_off, uds_ctlbuf, chunklen)) != OK)
/*
 * Copy out control data for the ancillary data associated with the current
 * segment, if any. Return OK on success, at which point the current receive
 * call may no longer fail. 'rflags' may be updated with additional result
 * flags. Return a negative error code on failure.
 */
uds_recv_ctl(struct udssock * uds, const struct sockdriver_data * ctl,
    socklen_t ctl_len, socklen_t * ctl_off, endpoint_t user_endpt,
    int flags, unsigned int segflags, size_t credpos, int * rflags)

	/*
	 * We first copy out all file descriptors, if any. We put them in one
	 * SCM_RIGHTS chunk, even if the sender put them in separate SCM_RIGHTS
	 * chunks. We believe that this should not cause application-level
	 * issues, but if it does, we can change that later with some effort.
	 * We then copy out credentials, if any.
	 *
	 * We copy out each control chunk independently of the others, and also
	 * perform error recovery on a per-chunk basis. This implies the
	 * following. If producing or copying out the first chunk fails, the
	 * entire recvmsg(2) call will fail with an appropriate error. If
	 * producing or copying out any subsequent chunk fails, the recvmsg(2)
	 * call will still return the previously generated chunks (a "short
	 * control read" if you will) as well as the MSG_CTRUNC flag. This
	 * approach is simple and clean, and it guarantees that we can always
	 * copy out at least as many file descriptors as we copied in for this
	 * segment, even if credentials are present as well. However, the
	 * approach does cause slightly more overhead when there are multiple
	 * chunks per call, as those are copied out separately.
	 *
	 * Since the generated SCM_RIGHTS chunk is never larger than the
	 * originally received SCM_RIGHTS chunk, the temporary "uds_ctlbuf"
	 * buffer is always large enough to contain the chunk in its entirety.
	 * SCM_CREDS chunks should always fit easily as well.
	 *
	 * The MSG_CTRUNC flag will be returned iff not the entire user-given
	 * control buffer was filled and not all control chunks were delivered.
	 * Our current implementation does not deliver partial chunks. NetBSD
	 * does, except for SCM_RIGHTS chunks.
	 *
	 * TODO: get rid of the redundancy in processing return values.
	 */
	if (segflags & UDS_HAS_FDS) {
		r = uds_recv_fds(uds, ctl, ctl_len, *ctl_off, user_endpt,

		/*
		 * At this point, 'r' contains one of the following:
		 *
		 *   r > 0	a chunk of 'r' bytes was added successfully.
		 *   r == 0	not enough space left; the chunk was not added.
		 *   r < 0	an error occurred; the chunk was not added.
		 */
		if (r < 0 && *ctl_off == 0)

			*rflags |= MSG_CTRUNC;

	if (segflags & UDS_HAS_CRED) {
		r = uds_recv_cred(uds, ctl, ctl_len, *ctl_off, credpos);

		if (r < 0 && *ctl_off == 0)

			*rflags |= MSG_CTRUNC;
/*
 * The current receive request is successful or, in the case of MSG_WAITALL,
 * has made progress. Advance the receive buffer tail, either by discarding
 * the entire tail segment or by generating a new, smaller tail segment that
 * contains only the regular data left to be received from the original tail
 * segment. Also wake up the sending side for connection-oriented sockets if
 * applicable, because there may now be room for more data to be sent. Update
 * 'may_block' if we are now sure that the call may not block on MSG_WAITALL.
 */
uds_recv_advance(struct udssock * uds, size_t seglen, size_t datalen,
    size_t reslen, unsigned int segflags, int * may_block)

	struct udssock *conn;

	size_t delta, nseglen, advance;

	/* Note that 'reslen' may be legitimately zero. */
	assert(reslen <= datalen);

	if (uds_get_type(uds) != SOCK_STREAM && reslen < datalen)

	delta = datalen - reslen;

		/*
		 * Fully consume the tail segment. We advance the tail by the
		 * full segment length, thus moving up to either the next
		 * segment in the receive buffer, or an empty receive buffer.
		 */

		uds->uds_tail = uds_advance(uds->uds_tail, advance);

		/*
		 * Partially consume the tail segment. We put a new segment
		 * header right in front of the remaining data, which obviously
		 * always fits. Since any ancillary data was consumed along
		 * with the first data byte of the segment, the new segment has
		 * no ancillary data anymore (and thus a zero flags field).
		 */
		nseglen = UDS_HDRLEN + delta;
		assert(nseglen < seglen);

		advance = seglen - nseglen;

		uds->uds_tail = uds_advance(uds->uds_tail, advance);

		uds_store_hdr(uds, uds->uds_tail, nseglen, delta, 0);

	/*
	 * For datagram-oriented sockets, we always consume at least a header.
	 * For stream-type sockets, we either consume a zero-data segment along
	 * with its ancillary data, or we consume at least one byte from a
	 * segment that does have regular data. In all other cases, the
	 * receive call has already been ended by now. Thus, we always advance
	 * the tail of the receive buffer here.
	 */
	assert(advance > 0);

	/*
	 * The receive buffer's used length (uds_len) and pointer to the
	 * previous segment header (uds_last) are offsets from the tail. Now
	 * that we have moved the tail, we need to adjust these accordingly.
	 * If the buffer is now empty, reset the tail to the buffer start so as
	 * to avoid splitting inter-process copies whenever possible.
	 */
	assert(uds->uds_len >= advance);
	uds->uds_len -= advance;

	if (uds->uds_len == 0)

	/*
	 * If uds_last is zero here, it was pointing to the segment we just
	 * (partially) consumed. By leaving it zero, it will still point to
	 * the new or next segment.
	 */
	if (uds->uds_last > 0) {
		assert(uds->uds_len > 0);
		assert(uds->uds_last >= advance);
		uds->uds_last -= advance;

	/*
	 * If there were any file descriptors associated with this segment,
	 * close and free them now.
	 */
	if (segflags & UDS_HAS_FDS) {
		assert(!SIMPLEQ_EMPTY(&uds->uds_fds));
		ufd = SIMPLEQ_FIRST(&uds->uds_fds);
		nfds = ufd->ufd_count;

		while (nfds-- > 0) {
			assert(!SIMPLEQ_EMPTY(&uds->uds_fds));
			ufd = SIMPLEQ_FIRST(&uds->uds_fds);
			SIMPLEQ_REMOVE_HEAD(&uds->uds_fds, ufd_next);

			dprintf(("UDS: closing local fd %d\n", ufd->ufd_fd));

			closenb(ufd->ufd_fd);

			SIMPLEQ_INSERT_TAIL(&uds_freefds, ufd, ufd_next);

	/*
	 * If there is now any data left in the receive buffer, then there is
	 * a reason that we have not received it. For stream sockets, that
	 * reason is that the next segment has ancillary data. In any case,
	 * this means we should never block the current receive operation
	 * waiting for more data. Otherwise, we may block on MSG_WAITALL.
	 */
	if (uds->uds_len > 0)

	/*
	 * If the (non-datagram) socket has a peer that is not shut down for
	 * writing, see if it can be woken up to send more data. Note that
	 * the event will never be processed immediately.
	 */
	if (uds_is_connected(uds)) {
		assert(uds_get_type(uds) != SOCK_DGRAM);

		conn = uds->uds_conn;

		if (!uds_is_shutdown(conn, SFL_SHUT_WR))
			sockevent_raise(&conn->uds_sock, SEV_SEND);
/*
 * Process a receive request. Return OK if the receive request has completed
 * successfully, SUSPEND if it should be tried again later, SOCKEVENT_EOF if an
 * end-of-file condition is reached, or a negative error code on failure. In
 * all cases, the values of 'off' and 'ctl_off' must be updated if any progress
 * has been made; if either is non-zero, libsockevent will return the partial
 * progress rather than an error code or EOF.
 */
uds_recv(struct sock * sock, const struct sockdriver_data * data, size_t len,
    size_t * off, const struct sockdriver_data * ctl, socklen_t ctl_len,
    socklen_t * ctl_off, struct sockaddr * addr, socklen_t * addr_len,
    endpoint_t user_endpt, int flags, size_t min, int * rflags)

	struct udssock *uds = (struct udssock *)sock;
	size_t seglen, datalen, reslen = 0 /*gcc*/, credpos = 0 /*gcc*/;
	unsigned int segflags;
	int r, partial, may_block = 0 /*gcc*/;

	dprintf(("UDS: recv(%d,%zu,%zu,%u,%u,0x%x)\n",
	    uds_get_id(uds), len, (off != NULL) ? *off : 0, ctl_len,
	    (ctl_off != NULL) ? *ctl_off : 0, flags));

	/*
	 * Start by testing whether anything can be received at all, or whether
	 * an error or EOF should be returned instead, or whether the receive
	 * call should be suspended until later otherwise. If no (regular or
	 * control) data can be received, or if this was a test for select,
	 * we bail out right after.
	 */
	partial = (off != NULL && *off > 0);

	if ((r = uds_recv_test(uds, len, min, partial, &may_block)) != OK)

	/*
	 * Copy out regular data, if any. Do this before copying out control
	 * data, because the latter is harder to undo on failure. This data
	 * copy function returns OK (0) if we are to return a result of zero
	 * bytes (which is *not* EOF) to the caller without doing anything
	 * else. The function returns a nonzero positive segment length if we
	 * should carry on with the receive call (as it happens, all its other
	 * returned values may in fact be zero).
	 */
	if ((r = uds_recv_data(uds, data, len, *off, addr, addr_len, rflags,
	    &datalen, &reslen, &segflags, &credpos)) <= 0)

	/*
	 * Copy out control data, if any: transfer and copy out records of file
	 * descriptors, and/or copy out sender credentials. This is the last
	 * part of the call that may fail.
	 */
	if ((r = uds_recv_ctl(uds, ctl, ctl_len, ctl_off, user_endpt, flags,
	    segflags, credpos, rflags)) != OK)

	/*
	 * Now that the call has succeeded, move the tail of the receive
	 * buffer, unless we were merely peeking.
	 */
	if (!(flags & MSG_PEEK))
		uds_recv_advance(uds, seglen, datalen, reslen, segflags,

	/*
	 * If the MSG_WAITALL flag was given, we may still have to suspend the
	 * call after partial success. In particular, the receive call may
	 * suspend after partial success if all of these conditions are met:
	 *
	 * 1) the socket is a stream-type socket;
	 * 2) MSG_WAITALL is set;
	 * 3) MSG_PEEK is not set;
	 * 4) MSG_DONTWAIT is not set (tested upon return);
	 * 5) the socket must not have a pending error (tested upon return);
	 * 6) the socket must not be shut down for reading (tested later);
	 * 7) the socket must still be connected to a peer (no EOF);
	 * 8) the peer must not have been shut down for writing (no EOF);
	 * 9) the next segment, if any, contains no ancillary data.
	 *
	 * Together, these points guarantee that the call could conceivably
	 * receive more after being resumed. Points 4 to 6 are covered by
	 * libsockevent, which will end the call even if we return SUSPEND
	 * here. Due to segment merging, we cover point 9 by checking that
	 * there is currently no next segment at all. Once a new segment
	 * arrives, the ancillary-data test is done then.
	 */
	if ((flags & MSG_WAITALL) && reslen < len && may_block)
/*
 * Test whether a receive request would block. The given 'min' parameter
 * contains the minimum number of bytes that should be possible to receive
 * without blocking (the low receive watermark). Return SUSPEND if the receive
 * request would block. Otherwise, return any other error code (including OK
 * or SOCKEVENT_EOF), and if 'size' is not a NULL pointer, it should be filled
 * with the number of bytes available for receipt right now (if not zero).
 * Note that if 'size' is not NULL, 'min' will always be zero.
 */
uds_test_recv(struct sock * sock, size_t min, size_t * size)

	struct udssock *uds = (struct udssock *)sock;

	unsigned int segflags;

	if ((r = uds_recv_test(uds, min, min, FALSE /*partial*/,
	    NULL /*may_block*/)) == SUSPEND)

	if (size != NULL && uds->uds_len > 0)
		(void)uds_fetch_hdr(uds, uds->uds_tail, &seglen, size,