/*
 * This file implements the upper socket layer of VFS: the BSD socket system
 * calls, and any associated file descriptor, file pointer, vnode, and file
 * system processing.  In most cases, this layer will call into the lower
 * socket layer in order to send the request to a socket driver.  Generic file
 * calls (e.g., read, write, ioctl, and select) are not implemented here, and
 * will directly call into the lower socket layer as well.
 *
 * The following table shows the system call numbers implemented in this file,
 * along with their request and reply message types.  Each request layout
 * message type is prefixed with "m_lc_vfs_".  Each reply layout message type
 * is prefixed with "m_vfs_lc_".  For requests without a specific reply
 * layout, only the "m_type" message field is used in the reply message.
 *
 * Type			Request layout		Reply layout
 * ----			--------------		------------
 * VFS_SOCKET		socket
 * VFS_SOCKETPAIR	socket			fdpair
 * VFS_BIND		sockaddr
 * VFS_CONNECT		sockaddr
 * VFS_LISTEN		listen
 * VFS_ACCEPT		sockaddr		socklen
 * VFS_SENDTO		sendrecv
 * VFS_RECVFROM		sendrecv		socklen
 * VFS_SENDMSG		sockmsg
 * VFS_RECVMSG		sockmsg
 * VFS_SETSOCKOPT	sockopt
 * VFS_GETSOCKOPT	sockopt			socklen
 * VFS_GETSOCKNAME	sockaddr		socklen
 * VFS_GETPEERNAME	sockaddr		socklen
 * VFS_SHUTDOWN		shutdown
 */
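
/*
 * As a concrete example of the table above: a getsockname(2) call arrives as
 * a VFS_GETSOCKNAME request using the "sockaddr" request layout, and is
 * answered using the "socklen" reply layout.  In terms of message fields:
 *
 *	request:	m_lc_vfs_sockaddr.fd		socket descriptor
 *			m_lc_vfs_sockaddr.addr		user address buffer
 *			m_lc_vfs_sockaddr.addr_len	size of that buffer
 *	reply:		m_type				OK or negative error
 *			m_vfs_lc_socklen.len		stored address length
 */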
#include "fs.h"

#include <sys/socket.h>
#include <sys/stat.h>
#include <string.h>
#include <assert.h>
/*
 * Convert any SOCK_xx open flags to O_xx open flags.
 */
static int
get_sock_flags(int type)
{
	int flags;

	flags = 0;
	if (type & SOCK_CLOEXEC)
		flags |= O_CLOEXEC;
	if (type & SOCK_NONBLOCK)
		flags |= O_NONBLOCK;
	if (type & SOCK_NOSIGPIPE)
		flags |= O_NOSIGPIPE;

	return flags;
}
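
/*
 * For example, a socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK, 0) call yields
 * O_NONBLOCK here, which make_sock_fd() below then merges into the new file
 * pointer's flags as O_RDWR | O_NONBLOCK.
 */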
/*
 * Perform cheap pre-call checks to ensure that the given number of socket FDs
 * can be created for the current process.
 */
static int
check_sock_fds(int nfds)
{
	/*
	 * For now, we simply check if there are enough file descriptor slots
	 * free in the process.  Since the process is blocked on a socket
	 * call, this aspect will not change.  Availability of file pointers,
	 * vnodes, and PFS nodes may vary, and is therefore less interesting
	 * to check here - it will have to be checked again upon completion
	 * anyway.
	 */
	return check_fds(fp, nfds);
}
/*
 * Create a new file descriptor, including supporting objects, for the open
 * socket identified by 'dev', in the current process, using the O_xx open
 * flags 'flags'.  On success, return the file descriptor number.  The results
 * of a successful call can be undone with close_fd(), which will also close
 * the socket itself.  On failure, return a negative error code.  In this
 * case, the socket will be left open.
 */
static int
make_sock_fd(dev_t dev, int flags)
{
	struct vmnt *vmp;
	struct vnode *vp;
	struct filp *filp;
	struct node_details res;
	int r, fd;

	assert((flags & ~(O_CLOEXEC | O_NONBLOCK | O_NOSIGPIPE)) == 0);

#ifndef NDEBUG
	/*
	 * Check whether there is a socket object for the new device already.
	 * This is an expensive check, but if the socket driver sends us a new
	 * socket ID that is already in use, this is a sure sign of driver
	 * misbehavior.  So far it does seem like nothing would go wrong
	 * within VFS in this case though, which is why this is a debug-only
	 * check.
	 */
	if (find_filp_by_sock_dev(dev) != NULL) {
		printf("VFS: socket driver %d generated in-use socket ID!\n",
		    get_smap_by_dev(dev, NULL)->smap_endpt);
		return EIO;
	}
#endif /* !NDEBUG */

	/*
	 * Get a lock on PFS.  TODO: it is not clear whether locking PFS is
	 * needed at all, let alone which lock: map_vnode() uses a write lock,
	 * create_pipe() uses a read lock, and cdev_clone() uses no lock at
	 * all.  As is, the README prescribes VMNT_READ, so that's what we use
	 * here.  The code below largely copies the create_pipe() code anyway.
	 */
	if ((vmp = find_vmnt(PFS_PROC_NR)) == NULL)
		panic("PFS gone");
	if ((r = lock_vmnt(vmp, VMNT_READ)) != OK)
		return r;

	/* Obtain a free vnode. */
	if ((vp = get_free_vnode()) == NULL) {
		unlock_vmnt(vmp);
		return err_code;
	}
	lock_vnode(vp, VNODE_OPCL);

	/* Acquire a file descriptor. */
	if ((r = get_fd(fp, 0, R_BIT | W_BIT, &fd, &filp)) != OK) {
		unlock_vnode(vp);
		unlock_vmnt(vmp);
		return r;
	}

	/* Create a PFS node for the socket. */
	if ((r = req_newnode(PFS_PROC_NR, fp->fp_effuid, fp->fp_effgid,
	    S_IFSOCK | ACCESSPERMS, dev, &res)) != OK) {
		unlock_filp(filp);
		unlock_vnode(vp);
		unlock_vmnt(vmp);
		return r;
	}

	/* Fill in the objects, and link them together. */
	vp->v_fs_e = res.fs_e;
	vp->v_inode_nr = res.inode_nr;
	vp->v_mode = res.fmode;
	vp->v_sdev = dev;
	vp->v_fs_count = 1;
	vp->v_ref_count = 1;

	filp->filp_vno = vp;
	filp->filp_flags = O_RDWR | flags;
	filp->filp_count = 1;

	fp->fp_filp[fd] = filp;
	if (flags & O_CLOEXEC)
		FD_SET(fd, &fp->fp_cloexec_set);

	/* Release locks, and return the new file descriptor. */
	unlock_filp(filp);	/* this also unlocks the vnode now! */
	unlock_vmnt(vmp);

	return fd;
}
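
/*
 * Typical caller pattern, as used by do_socket() below: create the socket in
 * the driver first, and close it again if no file descriptor can be created
 * for it:
 *
 *	if ((r = sdev_socket(domain, sock_type, protocol, &dev, FALSE)) != OK)
 *		return r;
 *	if ((r = make_sock_fd(dev, flags)) < 0)
 *		(void)sdev_close(dev, FALSE);
 *	return r;
 */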
/*
 * Create a socket.
 */
int
do_socket(void)
{
	int domain, type, sock_type, protocol;
	int r, flags;
	dev_t dev;

	domain = job_m_in.m_lc_vfs_socket.domain;
	type = job_m_in.m_lc_vfs_socket.type;
	protocol = job_m_in.m_lc_vfs_socket.protocol;

	/* Is there a socket driver for this domain at all? */
	if (get_smap_by_domain(domain) == NULL)
		return EAFNOSUPPORT;

	/*
	 * Ensure that it is at least likely that after creating a socket, we
	 * will be able to create a file descriptor for it, along with all the
	 * necessary supporting objects.  While it would be slightly neater to
	 * allocate these objects before trying to create the socket, that
	 * approach would result in a downright mess in do_socketpair() below,
	 * whereas with the current approach we can reuse the same code for
	 * accepting sockets as well.  For newly created sockets, it is no big
	 * deal to close them right after creation; for newly accepted
	 * sockets, we have no choice but to do that anyway.  Moreover, object
	 * creation failures should be rare, and the current approach does not
	 * add significant overhead, so the issue is largely philosophical.
	 * For now, this will do.
	 */
	if ((r = check_sock_fds(1)) != OK)
		return r;

	sock_type = type & ~SOCK_FLAGS_MASK;
	flags = get_sock_flags(type);

	if ((r = sdev_socket(domain, sock_type, protocol, &dev,
	    FALSE /*pair*/)) != OK)
		return r;

	if ((r = make_sock_fd(dev, flags)) < 0)
		(void)sdev_close(dev, FALSE /*may_suspend*/);

	return r;
}
/*
 * Create a pair of connected sockets.
 */
int
do_socketpair(void)
{
	int domain, type, sock_type, protocol;
	int r, fd0, fd1, flags;
	dev_t dev[2];

	domain = job_m_in.m_lc_vfs_socket.domain;
	type = job_m_in.m_lc_vfs_socket.type;
	protocol = job_m_in.m_lc_vfs_socket.protocol;

	/* Is there a socket driver for this domain at all? */
	if (get_smap_by_domain(domain) == NULL)
		return EAFNOSUPPORT;

	/*
	 * See the lengthy comment in do_socket().  This time we need two of
	 * everything, though.
	 */
	if ((r = check_sock_fds(2)) != OK)
		return r;

	sock_type = type & ~SOCK_FLAGS_MASK;
	flags = get_sock_flags(type);

	if ((r = sdev_socket(domain, sock_type, protocol, dev,
	    TRUE /*pair*/)) != OK)
		return r;

	if ((fd0 = make_sock_fd(dev[0], flags)) < 0) {
		(void)sdev_close(dev[0], FALSE /*may_suspend*/);
		(void)sdev_close(dev[1], FALSE /*may_suspend*/);
		return fd0;
	}

	if ((fd1 = make_sock_fd(dev[1], flags)) < 0) {
		close_fd(fp, fd0, FALSE /*may_suspend*/);
		(void)sdev_close(dev[1], FALSE /*may_suspend*/);
		return fd1;
	}

	job_m_out.m_vfs_lc_fdpair.fd0 = fd0;
	job_m_out.m_vfs_lc_fdpair.fd1 = fd1;
	return OK;
}
/*
 * Check whether the given file descriptor identifies an open socket in the
 * current process.  If so, return OK, with the socket device number stored in
 * 'dev' and its file pointer flags stored in 'flags' (if not NULL).  If not,
 * return an appropriate error code.
 */
static int
get_sock(int fd, dev_t * dev, int * flags)
{
	struct filp *filp;

	if ((filp = get_filp(fd, VNODE_READ)) == NULL)
		return err_code;

	if (!S_ISSOCK(filp->filp_vno->v_mode)) {
		unlock_filp(filp);
		return ENOTSOCK;
	}

	*dev = filp->filp_vno->v_sdev;
	if (flags != NULL)
		*flags = filp->filp_flags;

	/*
	 * It is safe to leave the file pointer object unlocked during the
	 * actual call.  Since the current process is blocked for the duration
	 * of the socket call, we know the socket's file descriptor, and thus
	 * its file pointer, can not possibly be freed.  In addition, we will
	 * not be accessing the file pointer anymore later, with the exception
	 * of accept calls, which reacquire the lock when the reply comes in.
	 */
	unlock_filp(filp);

	return OK;
}
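
/*
 * All do_* calls below use get_sock() in the same way: look up the device
 * while the file pointer is locked only briefly, then issue the potentially
 * blocking driver request with no locks held.  Schematically (compare
 * do_connect() below):
 *
 *	if ((r = get_sock(fd, &dev, &flags)) != OK)
 *		return r;
 *	return sdev_connect(dev, addr, addr_len, flags);
 */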
/*
 * Bind a socket to a local address.
 */
int
do_bind(void)
{
	dev_t dev;
	int r, fd, flags;

	fd = job_m_in.m_lc_vfs_sockaddr.fd;

	if ((r = get_sock(fd, &dev, &flags)) != OK)
		return r;

	return sdev_bind(dev, job_m_in.m_lc_vfs_sockaddr.addr,
	    job_m_in.m_lc_vfs_sockaddr.addr_len, flags);
}
/*
 * Connect a socket to a remote address.
 */
int
do_connect(void)
{
	dev_t dev;
	int r, fd, flags;

	fd = job_m_in.m_lc_vfs_sockaddr.fd;

	if ((r = get_sock(fd, &dev, &flags)) != OK)
		return r;

	return sdev_connect(dev, job_m_in.m_lc_vfs_sockaddr.addr,
	    job_m_in.m_lc_vfs_sockaddr.addr_len, flags);
}
/*
 * Put a socket in listening mode.
 */
int
do_listen(void)
{
	dev_t dev;
	int r, fd, backlog;

	fd = job_m_in.m_lc_vfs_listen.fd;
	backlog = job_m_in.m_lc_vfs_listen.backlog;

	if ((r = get_sock(fd, &dev, NULL)) != OK)
		return r;

	return sdev_listen(dev, backlog);
}
/*
 * Accept a connection on a listening socket, creating a new socket.
 */
int
do_accept(void)
{
	dev_t dev;
	int r, fd, flags;

	fd = job_m_in.m_lc_vfs_sockaddr.fd;

	if ((r = get_sock(fd, &dev, &flags)) != OK)
		return r;

	if ((r = check_sock_fds(1)) != OK)
		return r;

	return sdev_accept(dev, job_m_in.m_lc_vfs_sockaddr.addr,
	    job_m_in.m_lc_vfs_sockaddr.addr_len, flags, fd);
}
/*
 * Resume a previously suspended accept(2) system call.  This routine must
 * cover three distinct cases, depending on the 'status' and 'dev' values:
 *
 * #1. If the 'status' parameter is set to OK, the accept call succeeded.  In
 *     that case, the function is guaranteed to be called from a worker
 *     thread, with 'fp' set to the user process that made the system call,
 *     and it may therefore block its calling thread.  The 'dev' parameter
 *     will contain the device number of the newly accepted socket.
 * #2. If the 'status' parameter contains a negative error code, but 'dev' is
 *     *not* set to NO_DEV, then the same as above applies, except that the
 *     new socket must be closed immediately.
 * #3. If 'status' is a negative error code and 'dev' is set to NO_DEV, then
 *     the accept call has failed and no new socket was ever created.  In
 *     this case, the function MUST NOT block its calling thread.
 */
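/*
 * In tabular form (a restatement of the three cases above, nothing new):
 *
 *	status	dev	meaning				may block?
 *	------	------	-------				----------
 *	OK	valid	socket accepted (case #1)	yes
 *	error	valid	accepted, close it (case #2)	yes
 *	error	NO_DEV	nothing was created (case #3)	no
 */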
void
resume_accept(struct fproc * rfp, int status, dev_t dev, unsigned int addr_len,
	int listen_fd)
{
	message m;
	dev_t ldev;
	int r, flags;

	/*
	 * If the call did not succeed and no socket was created (case #3), we
	 * cannot and should not do more than send the error to the user
	 * process.
	 */
	if (status != OK && dev == NO_DEV) {
		replycode(rfp->fp_endpoint, status);
		return;
	}

	/*
	 * A new socket was created (cases #1 and #2).  The lower socket layer
	 * (sdev.c) ensures that in that case, we are called from a worker
	 * thread which is associated with the original user process.  Thus,
	 * we can block the current thread.  Start by verifying that the
	 * listening socket is still around.  If it is not, it must have been
	 * invalidated as a result of a socket driver death, in which case we
	 * must report an error but need not close the new socket.  As a side
	 * effect, obtain the listening socket's flags, which on BSD systems
	 * are inherited by the accepted socket.
	 */
	assert(fp == rfp);	/* needed for get_sock() and make_sock_fd() */

	if (get_sock(listen_fd, &ldev, &flags) != OK) {
		replycode(rfp->fp_endpoint, EIO);
		return;
	}

	/* The same socket driver must host both sockets, obviously. */
	assert(get_smap_by_dev(ldev, NULL) == get_smap_by_dev(dev, NULL));

	/*
	 * If an error status was returned (case #2), we must now close the
	 * newly accepted socket.  Effectively, this allows socket drivers to
	 * handle address copy failures in the cleanest possible way.
	 */
	if (status != OK) {
		(void)sdev_close(dev, FALSE /*may_suspend*/);

		replycode(rfp->fp_endpoint, status);
		return;
	}

	/*
	 * A new socket has been successfully accepted (case #1).  Try to
	 * create a file descriptor for the new socket.  If this fails, we
	 * have to close the new socket after all.  That is not great, but we
	 * have no way to prevent this except by preallocating all objects for
	 * the duration of the accept call, which is not exactly great either.
	 */
	flags &= O_CLOEXEC | O_NONBLOCK | O_NOSIGPIPE;

	if ((r = make_sock_fd(dev, flags)) < 0) {
		(void)sdev_close(dev, FALSE /*may_suspend*/);

		replycode(rfp->fp_endpoint, r);
		return;
	}

	/*
	 * The accept call has succeeded.  Send a reply message with the new
	 * file descriptor and an address length (which may be zero).
	 */
	memset(&m, 0, sizeof(m));
	m.m_vfs_lc_socklen.len = addr_len;

	reply(&m, rfp->fp_endpoint, r);
}
/*
 * Send a message on a socket.
 */
int
do_sendto(void)
{
	dev_t dev;
	int r, fd, flags;

	fd = job_m_in.m_lc_vfs_sendrecv.fd;

	if ((r = get_sock(fd, &dev, &flags)) != OK)
		return r;

	return sdev_readwrite(dev, job_m_in.m_lc_vfs_sendrecv.buf,
	    job_m_in.m_lc_vfs_sendrecv.len, 0, 0,
	    job_m_in.m_lc_vfs_sendrecv.addr,
	    job_m_in.m_lc_vfs_sendrecv.addr_len,
	    job_m_in.m_lc_vfs_sendrecv.flags, WRITING, flags, 0);
}
/*
 * Receive a message from a socket.
 */
int
do_recvfrom(void)
{
	dev_t dev;
	int r, fd, flags;

	fd = job_m_in.m_lc_vfs_sendrecv.fd;

	if ((r = get_sock(fd, &dev, &flags)) != OK)
		return r;

	return sdev_readwrite(dev, job_m_in.m_lc_vfs_sendrecv.buf,
	    job_m_in.m_lc_vfs_sendrecv.len, 0, 0,
	    job_m_in.m_lc_vfs_sendrecv.addr,
	    job_m_in.m_lc_vfs_sendrecv.addr_len,
	    job_m_in.m_lc_vfs_sendrecv.flags, READING, flags, 0);
}
/*
 * Resume a previously suspended recvfrom(2) system call.  This function MUST
 * NOT block its calling thread.
 */
void
resume_recvfrom(struct fproc * rfp, int status, unsigned int addr_len)
{
	message m;

	if (status >= 0) {
		memset(&m, 0, sizeof(m));
		m.m_vfs_lc_socklen.len = addr_len;

		reply(&m, rfp->fp_endpoint, status);
	} else
		replycode(rfp->fp_endpoint, status);
}
/*
 * Send or receive a message on a socket using a message structure.
 */
int
do_sockmsg(void)
{
	struct msghdr msg;
	struct iovec iov;
	vir_bytes msg_buf, data_buf;
	size_t data_len;
	dev_t dev;
	int r, fd, flags;

	assert(job_call_nr == VFS_SENDMSG || job_call_nr == VFS_RECVMSG);

	fd = job_m_in.m_lc_vfs_sockmsg.fd;
	msg_buf = job_m_in.m_lc_vfs_sockmsg.msgbuf;

	if ((r = get_sock(fd, &dev, &flags)) != OK)
		return r;

	if ((r = sys_datacopy_wrapper(who_e, msg_buf, SELF, (vir_bytes)&msg,
	    sizeof(msg))) != OK)
		return r;

	data_buf = 0;
	data_len = 0;

	if (msg.msg_iovlen > 0) {
		/*
		 * We do not yet support vectors with more than one element;
		 * for this reason, libc is currently expected to consolidate
		 * the entire vector into a single element.  Once we do add
		 * proper vector support, the ABI itself need not be changed.
		 */
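		/*
		 * As a sketch of what that consolidation might look like on
		 * the libc side (hypothetical code, not the actual MINIX
		 * libc): flatten the vector into one buffer before the call,
		 *
		 *	char *buf = malloc(total);
		 *	for (i = 0, off = 0; i < iovlen;
		 *	    off += iov[i].iov_len, i++)
		 *		memcpy(buf + off, iov[i].iov_base,
		 *		    iov[i].iov_len);
		 *
		 * and pass a single-element vector pointing to 'buf'.
		 */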
		if (msg.msg_iovlen > 1)
			return EMSGSIZE;

		if ((r = sys_datacopy_wrapper(who_e, (vir_bytes)msg.msg_iov,
		    SELF, (vir_bytes)&iov, sizeof(iov))) != OK)
			return r;

		if (iov.iov_len > SSIZE_MAX)
			return EINVAL;

		if (iov.iov_len > 0) {
			data_buf = (vir_bytes)iov.iov_base;
			data_len = iov.iov_len;
		}
	}

	return sdev_readwrite(dev, data_buf, data_len,
	    (vir_bytes)msg.msg_control, msg.msg_controllen,
	    (vir_bytes)msg.msg_name, msg.msg_namelen,
	    job_m_in.m_lc_vfs_sockmsg.flags,
	    (job_call_nr == VFS_RECVMSG) ? READING : WRITING, flags,
	    (job_call_nr == VFS_RECVMSG) ? msg_buf : 0);
}
/*
 * Resume a previously suspended recvmsg(2) system call.  The 'status'
 * parameter contains either the number of data bytes received or a negative
 * error code.  The 'msg_buf' parameter contains the user address of the
 * msghdr structure.  If a failure occurs in this function, the received data
 * (including, in the worst case, references to received file descriptors)
 * will be lost - while seriously ugly, this is always the calling process's
 * fault, extremely hard to deal with, and on par with current behavior in
 * other operating systems.  This function MUST NOT block its calling thread.
 */
void
resume_recvmsg(struct fproc * rfp, int status, unsigned int ctl_len,
	unsigned int addr_len, int flags, vir_bytes msg_buf)
{
	struct msghdr msg;
	int r;

	if (status < 0) {
		replycode(rfp->fp_endpoint, status);
		return;
	}

	/*
	 * Unfortunately, we now need to update a subset of the fields of the
	 * msghdr structure.  We can 1) copy in the entire structure a second
	 * time, modify some fields, and copy it out in its entirety again,
	 * 2) copy out only the individual fields that have been changed, or
	 * 3) save a copy of the original structure somewhere.  The third
	 * option is the most efficient, but would increase the fproc
	 * structure size by quite a bit.  The main difference between the
	 * first and second options is the number of kernel calls; we choose
	 * to use the first option.
	 */
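	/*
	 * For comparison, option 2 would look roughly like this for each
	 * modified field (a sketch, not the code used below):
	 *
	 *	socklen_t clen = ctl_len;
	 *	r = sys_datacopy_wrapper(SELF, (vir_bytes)&clen,
	 *	    rfp->fp_endpoint, msg_buf + offsetof(struct msghdr,
	 *	    msg_controllen), sizeof(clen));
	 *
	 * That avoids copying the whole structure twice, but costs one kernel
	 * call per modified field, which is why option 1 is used instead.
	 */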
	if ((r = sys_datacopy_wrapper(rfp->fp_endpoint, msg_buf, SELF,
	    (vir_bytes)&msg, sizeof(msg))) != OK) {
		/* We copied it in before, how could it fail now? */
		printf("VFS: resume_recvmsg cannot copy in msghdr? (%d)\n", r);

		replycode(rfp->fp_endpoint, r);
		return;
	}

	/* Modify and copy out the structure, and wake up the caller. */
	msg.msg_controllen = ctl_len;
	msg.msg_flags = flags;
	if (addr_len > 0)
		msg.msg_namelen = addr_len;

	if ((r = sys_datacopy_wrapper(SELF, (vir_bytes)&msg, rfp->fp_endpoint,
	    msg_buf, sizeof(msg))) != OK)
		status = r;

	replycode(rfp->fp_endpoint, status);
}
/*
 * Set socket options.
 */
int
do_setsockopt(void)
{
	dev_t dev;
	int r, fd;

	fd = job_m_in.m_lc_vfs_sockopt.fd;

	if ((r = get_sock(fd, &dev, NULL)) != OK)
		return r;

	return sdev_setsockopt(dev, job_m_in.m_lc_vfs_sockopt.level,
	    job_m_in.m_lc_vfs_sockopt.name, job_m_in.m_lc_vfs_sockopt.buf,
	    job_m_in.m_lc_vfs_sockopt.len);
}
/*
 * Get socket options.
 */
int
do_getsockopt(void)
{
	unsigned int len;
	dev_t dev;
	int r, fd;

	fd = job_m_in.m_lc_vfs_sockopt.fd;
	len = job_m_in.m_lc_vfs_sockopt.len;

	if ((r = get_sock(fd, &dev, NULL)) != OK)
		return r;

	r = sdev_getsockopt(dev, job_m_in.m_lc_vfs_sockopt.level,
	    job_m_in.m_lc_vfs_sockopt.name, job_m_in.m_lc_vfs_sockopt.buf,
	    &len);

	if (r == OK)
		job_m_out.m_vfs_lc_socklen.len = len;
	return r;
}
/*
 * Get the local address of a socket.
 */
int
do_getsockname(void)
{
	unsigned int len;
	dev_t dev;
	int r, fd;

	fd = job_m_in.m_lc_vfs_sockaddr.fd;
	len = job_m_in.m_lc_vfs_sockaddr.addr_len;

	if ((r = get_sock(fd, &dev, NULL)) != OK)
		return r;

	r = sdev_getsockname(dev, job_m_in.m_lc_vfs_sockaddr.addr, &len);

	if (r == OK)
		job_m_out.m_vfs_lc_socklen.len = len;
	return r;
}
/*
 * Get the remote address of a socket.
 */
int
do_getpeername(void)
{
	unsigned int len;
	dev_t dev;
	int r, fd;

	fd = job_m_in.m_lc_vfs_sockaddr.fd;
	len = job_m_in.m_lc_vfs_sockaddr.addr_len;

	if ((r = get_sock(fd, &dev, NULL)) != OK)
		return r;

	r = sdev_getpeername(dev, job_m_in.m_lc_vfs_sockaddr.addr, &len);

	if (r == OK)
		job_m_out.m_vfs_lc_socklen.len = len;
	return r;
}
/*
 * Shut down socket send and receive operations.
 */
int
do_shutdown(void)
{
	dev_t dev;
	int r, fd, how;

	fd = job_m_in.m_lc_vfs_shutdown.fd;
	how = job_m_in.m_lc_vfs_shutdown.how;

	if ((r = get_sock(fd, &dev, NULL)) != OK)
		return r;

	if (how != SHUT_RD && how != SHUT_WR && how != SHUT_RDWR)
		return EINVAL;

	return sdev_shutdown(dev, how);
}