/* minix/servers/vfs/socket.c */
/*
 * This file implements the upper socket layer of VFS: the BSD socket system
 * calls, and any associated file descriptor, file pointer, vnode, and file
 * system processing.  In most cases, this layer will call into the lower
 * socket layer in order to send the request to a socket driver.  Generic file
 * calls (e.g., read, write, ioctl, and select) are not implemented here, and
 * will directly call into the lower socket layer as well.
 *
 * The following table shows the system call numbers implemented in this file,
 * along with their request and reply message types.  Each request layout
 * message type is prefixed with "m_lc_vfs_".  Each reply layout message type
 * is prefixed with "m_vfs_lc_".  For requests without a specific reply
 * layout, only the "m_type" message field is used in the reply message.
 *
 * Type			Request layout		Reply layout
 * ----			--------------		------------
 * VFS_SOCKET		socket
 * VFS_SOCKETPAIR	socket			fdpair
 * VFS_BIND		sockaddr
 * VFS_CONNECT		sockaddr
 * VFS_LISTEN		listen
 * VFS_ACCEPT		sockaddr		socklen
 * VFS_SENDTO		sendrecv
 * VFS_RECVFROM		sendrecv		socklen
 * VFS_SENDMSG		sockmsg
 * VFS_RECVMSG		sockmsg
 * VFS_SETSOCKOPT	sockopt
 * VFS_GETSOCKOPT	sockopt			socklen
 * VFS_GETSOCKNAME	sockaddr		socklen
 * VFS_GETPEERNAME	sockaddr		socklen
 * VFS_SHUTDOWN		shutdown
 */
34 #include "fs.h"
35 #include "vnode.h"
36 #include "file.h"
38 #include <sys/socket.h>
41 * Convert any SOCK_xx open flags to O_xx open flags.
43 static int
44 get_sock_flags(int type)
46 int flags;
48 flags = 0;
49 if (type & SOCK_CLOEXEC)
50 flags |= O_CLOEXEC;
51 if (type & SOCK_NONBLOCK)
52 flags |= O_NONBLOCK;
53 if (type & SOCK_NOSIGPIPE)
54 flags |= O_NOSIGPIPE;
56 return flags;
60 * Perform cheap pre-call checks to ensure that the given number of socket FDs
61 * can be created for the current process.
63 static int
64 check_sock_fds(int nfds)
68 * For now, we simply check if there are enough file descriptor slots
69 * free in the process. Since the process is blocked on a socket call,
70 * this aspect will not change. Availability of file pointers, vnodes,
71 * and PFS nodes may vary, and is therefore less interesting to check
72 * here - it will have to be checked again upon completion anyway.
74 return check_fds(fp, nfds);
78 * Create a new file descriptor, including supporting objects, for the open
79 * socket identified by 'dev', in the current process, using the O_xx open
80 * flags 'flags'. On success, return the file descriptor number. The results
81 * of a successful call can be undone with close_fd(), which will also close
82 * the socket itself. On failure, return a negative error code. In this case,
83 * the socket will be left open.
85 static int
86 make_sock_fd(dev_t dev, int flags)
88 struct vmnt *vmp;
89 struct vnode *vp;
90 struct filp *filp;
91 struct node_details res;
92 int r, fd;
94 assert((flags & ~(O_CLOEXEC | O_NONBLOCK | O_NOSIGPIPE)) == 0);
96 #if !NDEBUG
98 * Check whether there is a socket object for the new device already.
99 * This is an expensive check, but if the socket driver sends us a new
100 * socket ID that is already in use, this is a sure sign of driver
101 * misbehavior. So far it does seem like nothing would go wrong within
102 * VFS in this case though, which is why this is a debug-only check.
104 if (find_filp_by_sock_dev(dev) != NULL) {
105 printf("VFS: socket driver %d generated in-use socket ID!\n",
106 get_smap_by_dev(dev, NULL)->smap_endpt);
107 return EIO;
109 #endif /* !NDEBUG */
112 * Get a lock on PFS. TODO: it is not clear whether locking PFS is
113 * needed at all, let alone which lock: map_vnode() uses a write lock,
114 * create_pipe() uses a read lock, and cdev_clone() uses no lock at
115 * all. As is, the README prescribes VMNT_READ, so that's what we use
116 * here. The code below largely copies the create_pipe() code anyway.
118 if ((vmp = find_vmnt(PFS_PROC_NR)) == NULL)
119 panic("PFS gone");
120 if ((r = lock_vmnt(vmp, VMNT_READ)) != OK)
121 return r;
123 /* Obtain a free vnode. */
124 if ((vp = get_free_vnode()) == NULL) {
125 unlock_vmnt(vmp);
126 return err_code;
128 lock_vnode(vp, VNODE_OPCL);
130 /* Acquire a file descriptor. */
131 if ((r = get_fd(fp, 0, R_BIT | W_BIT, &fd, &filp)) != OK) {
132 unlock_vnode(vp);
133 unlock_vmnt(vmp);
134 return r;
137 /* Create a PFS node for the socket. */
138 if ((r = req_newnode(PFS_PROC_NR, fp->fp_effuid, fp->fp_effgid,
139 S_IFSOCK | ACCESSPERMS, dev, &res)) != OK) {
140 unlock_filp(filp);
141 unlock_vnode(vp);
142 unlock_vmnt(vmp);
143 return r;
146 /* Fill in the objects, and link them together. */
147 vp->v_fs_e = res.fs_e;
148 vp->v_inode_nr = res.inode_nr;
149 vp->v_mode = res.fmode;
150 vp->v_sdev = dev;
151 vp->v_fs_count = 1;
152 vp->v_ref_count = 1;
153 vp->v_vmnt = NULL;
154 vp->v_dev = NO_DEV;
155 vp->v_size = 0;
157 filp->filp_vno = vp;
158 filp->filp_flags = O_RDWR | flags;
159 filp->filp_count = 1;
161 fp->fp_filp[fd] = filp;
162 if (flags & O_CLOEXEC)
163 FD_SET(fd, &fp->fp_cloexec_set);
165 /* Release locks, and return the new file descriptor. */
166 unlock_filp(filp); /* this also unlocks the vnode now! */
167 unlock_vmnt(vmp);
169 return fd;
173 * Create a socket.
176 do_socket(void)
178 int domain, type, sock_type, protocol;
179 dev_t dev;
180 int r, flags;
182 domain = job_m_in.m_lc_vfs_socket.domain;
183 type = job_m_in.m_lc_vfs_socket.type;
184 protocol = job_m_in.m_lc_vfs_socket.protocol;
186 /* Is there a socket driver for this domain at all? */
187 if (get_smap_by_domain(domain) == NULL)
188 return EAFNOSUPPORT;
191 * Ensure that it is at least likely that after creating a socket, we
192 * will be able to create a file descriptor for it, along with all the
193 * necessary supporting objects. While it would be slightly neater to
194 * allocate these objects before trying to create the socket, this is
195 * offset by the fact that that approach results in a downright mess in
196 * do_socketpair() below, and with the current approach we can reuse
197 * the same code for accepting sockets as well. For newly created
198 * sockets, it is no big deal to close them right after creation; for
199 * newly accepted sockets, we have no choice but to do that anyway.
200 * Moreover, object creation failures should be rare and our approach
201 * does not cause significantly more overhead anyway, so the entire
202 * issue is largely philosophical anyway. For now, this will do.
204 if ((r = check_sock_fds(1)) != OK)
205 return r;
207 sock_type = type & ~SOCK_FLAGS_MASK;
208 flags = get_sock_flags(type);
210 if ((r = sdev_socket(domain, sock_type, protocol, &dev,
211 FALSE /*pair*/)) != OK)
212 return r;
214 if ((r = make_sock_fd(dev, flags)) < 0)
215 (void)sdev_close(dev, FALSE /*may_suspend*/);
217 return r;
221 * Create a pair of connected sockets.
224 do_socketpair(void)
226 int domain, type, sock_type, protocol;
227 dev_t dev[2];
228 int r, fd0, fd1, flags;
230 domain = job_m_in.m_lc_vfs_socket.domain;
231 type = job_m_in.m_lc_vfs_socket.type;
232 protocol = job_m_in.m_lc_vfs_socket.protocol;
234 /* Is there a socket driver for this domain at all? */
235 if (get_smap_by_domain(domain) == NULL)
236 return EAFNOSUPPORT;
239 * See the lengthy comment in do_socket(). This time we need two of
240 * everything, though.
242 if ((r = check_sock_fds(2)) != OK)
243 return r;
245 sock_type = type & ~SOCK_FLAGS_MASK;
246 flags = get_sock_flags(type);
248 if ((r = sdev_socket(domain, sock_type, protocol, dev,
249 TRUE /*pair*/)) != OK)
250 return r;
252 if ((fd0 = make_sock_fd(dev[0], flags)) < 0) {
253 (void)sdev_close(dev[0], FALSE /*may_suspend*/);
254 (void)sdev_close(dev[1], FALSE /*may_suspend*/);
255 return fd0;
258 if ((fd1 = make_sock_fd(dev[1], flags)) < 0) {
259 close_fd(fp, fd0, FALSE /*may_suspend*/);
260 (void)sdev_close(dev[1], FALSE /*may_suspend*/);
261 return fd1;
264 job_m_out.m_vfs_lc_fdpair.fd0 = fd0;
265 job_m_out.m_vfs_lc_fdpair.fd1 = fd1;
266 return OK;
270 * Check whether the given file descriptor identifies an open socket in the
271 * current process. If so, return OK, with the socket device number stored in
272 * 'dev' and its file pointer flags stored in 'flags' (if not NULL). If not,
273 * return an appropriate error code.
275 static int
276 get_sock(int fd, dev_t * dev, int * flags)
278 struct filp *filp;
280 if ((filp = get_filp(fd, VNODE_READ)) == NULL)
281 return err_code;
283 if (!S_ISSOCK(filp->filp_vno->v_mode)) {
284 unlock_filp(filp);
285 return ENOTSOCK;
288 *dev = filp->filp_vno->v_sdev;
289 if (flags != NULL)
290 *flags = filp->filp_flags;
293 * It is safe to leave the file pointer object unlocked during the
294 * actual call. Since the current process is blocked for the duration
295 * of the socket call, we know the socket's file descriptor, and thus
296 * its file pointer, can not possibly be freed. In addition, we will
297 * not be accessing the file pointer anymore later, with the exception
298 * of accept calls, which reacquire the lock when the reply comes in.
300 unlock_filp(filp);
301 return OK;
305 * Bind a socket to a local address.
308 do_bind(void)
310 dev_t dev;
311 int r, fd, flags;
313 fd = job_m_in.m_lc_vfs_sockaddr.fd;
315 if ((r = get_sock(fd, &dev, &flags)) != OK)
316 return r;
318 return sdev_bind(dev, job_m_in.m_lc_vfs_sockaddr.addr,
319 job_m_in.m_lc_vfs_sockaddr.addr_len, flags);
323 * Connect a socket to a remote address.
326 do_connect(void)
328 dev_t dev;
329 int r, fd, flags;
331 fd = job_m_in.m_lc_vfs_sockaddr.fd;
333 if ((r = get_sock(fd, &dev, &flags)) != OK)
334 return r;
336 return sdev_connect(dev, job_m_in.m_lc_vfs_sockaddr.addr,
337 job_m_in.m_lc_vfs_sockaddr.addr_len, flags);
341 * Put a socket in listening mode.
344 do_listen(void)
346 dev_t dev;
347 int r, fd, backlog;
349 fd = job_m_in.m_lc_vfs_listen.fd;
350 backlog = job_m_in.m_lc_vfs_listen.backlog;
352 if ((r = get_sock(fd, &dev, NULL)) != OK)
353 return r;
355 if (backlog < 0)
356 backlog = 0;
358 return sdev_listen(dev, backlog);
362 * Accept a connection on a listening socket, creating a new socket.
365 do_accept(void)
367 dev_t dev;
368 int r, fd, flags;
370 fd = job_m_in.m_lc_vfs_sockaddr.fd;
372 if ((r = get_sock(fd, &dev, &flags)) != OK)
373 return r;
375 if ((r = check_sock_fds(1)) != OK)
376 return r;
378 return sdev_accept(dev, job_m_in.m_lc_vfs_sockaddr.addr,
379 job_m_in.m_lc_vfs_sockaddr.addr_len, flags, fd);
383 * Resume a previously suspended accept(2) system call. This routine must
384 * cover three distinct cases, depending on the 'status' and 'dev' values:
386 * #1. If the 'status' parameter is set to OK, the accept call succeeded. In
387 * that case, the function is guaranteed to be called from a worker thread,
388 * with 'fp' set to the user process that made the system call. In that
389 * case, this function may block its calling thread. The 'dev' parameter
390 * will contain the device number of the newly accepted socket.
391 * #2. If the 'status' parameter contains a negative error code, but 'dev' is
392 * *not* set to NO_DEV, then the same as above applies, except that the new
393 * socket must be closed immediately.
394 * #3. If 'status' is a negative error code and 'dev' is set to NO_DEV, then
395 * the accept call has failed and no new socket was ever created. In this
396 * case, the function MUST NOT block its calling thread.
398 void
399 resume_accept(struct fproc * rfp, int status, dev_t dev, unsigned int addr_len,
400 int listen_fd)
402 message m;
403 dev_t ldev;
404 int r, flags;
407 * If the call did not succeed and no socket was created (case #3), we
408 * cannot and should not do more than send the error to the user
409 * process.
411 if (status != OK && dev == NO_DEV) {
412 replycode(rfp->fp_endpoint, status);
414 return;
418 * The call succeeded. The lower socket layer (sdev.c) ensures that in
419 * that case, we are called from a worker thread which is associated
420 * with the original user process. Thus, we can block the current
421 * thread. Start by verifying that the listening socket is still
422 * around. If it is not, it must have been invalidated as a result of
423 * a socket driver death, in which case we must report an error but
424 * need not close the new socket. As a side effect, obtain the
425 * listening socket's flags, which on BSD systems are inherited by the
426 * accepted socket.
428 assert(fp == rfp); /* needed for get_sock() and make_sock_fd() */
430 if (get_sock(listen_fd, &ldev, &flags) != OK) {
431 replycode(rfp->fp_endpoint, EIO);
433 return;
436 /* The same socket driver must host both sockets, obviously. */
437 assert(get_smap_by_dev(ldev, NULL) == get_smap_by_dev(dev, NULL));
440 * If an error status was returned (case #2), we must now close the
441 * newly accepted socket. Effectively, this allows socket drivers to
442 * handle address copy failures in the cleanest possible way.
444 if (status != OK) {
445 (void)sdev_close(dev, FALSE /*may_suspend*/);
447 replycode(rfp->fp_endpoint, status);
449 return;
453 * A new socket has been successfully accepted (case #1). Try to
454 * create a file descriptor for the new socket. If this fails, we have
455 * to close the new socket after all. That is not great, but we have
456 * no way to prevent this except by preallocating all objects for the
457 * duration of the accept call, which is not exactly great either.
459 flags &= O_CLOEXEC | O_NONBLOCK | O_NOSIGPIPE;
461 if ((r = make_sock_fd(dev, flags)) < 0) {
462 (void)sdev_close(dev, FALSE /*may_suspend*/);
464 replycode(rfp->fp_endpoint, r);
466 return;
470 * The accept call has succeeded. Send a reply message with the new
471 * file descriptor and an address length (which may be zero).
473 memset(&m, 0, sizeof(m));
474 m.m_vfs_lc_socklen.len = addr_len;
476 reply(&m, rfp->fp_endpoint, r);
480 * Send a message on a socket.
483 do_sendto(void)
485 dev_t dev;
486 int r, fd, flags;
488 fd = job_m_in.m_lc_vfs_sendrecv.fd;
490 if ((r = get_sock(fd, &dev, &flags)) != OK)
491 return r;
493 return sdev_readwrite(dev, job_m_in.m_lc_vfs_sendrecv.buf,
494 job_m_in.m_lc_vfs_sendrecv.len, 0, 0,
495 job_m_in.m_lc_vfs_sendrecv.addr,
496 job_m_in.m_lc_vfs_sendrecv.addr_len,
497 job_m_in.m_lc_vfs_sendrecv.flags, WRITING, flags, 0);
501 * Receive a message from a socket.
504 do_recvfrom(void)
506 dev_t dev;
507 int r, fd, flags;
509 fd = job_m_in.m_lc_vfs_sendrecv.fd;
511 if ((r = get_sock(fd, &dev, &flags)) != OK)
512 return r;
514 return sdev_readwrite(dev, job_m_in.m_lc_vfs_sendrecv.buf,
515 job_m_in.m_lc_vfs_sendrecv.len, 0, 0,
516 job_m_in.m_lc_vfs_sendrecv.addr,
517 job_m_in.m_lc_vfs_sendrecv.addr_len,
518 job_m_in.m_lc_vfs_sendrecv.flags, READING, flags, 0);
522 * Resume a previously suspended recvfrom(2) system call. This function MUST
523 * NOT block its calling thread.
525 void
526 resume_recvfrom(struct fproc * rfp, int status, unsigned int addr_len)
528 message m;
530 if (status >= 0) {
531 memset(&m, 0, sizeof(m));
532 m.m_vfs_lc_socklen.len = addr_len;
534 reply(&m, rfp->fp_endpoint, status);
535 } else
536 replycode(rfp->fp_endpoint, status);
540 * Send or receive a message on a socket using a message structure.
543 do_sockmsg(void)
545 struct msghdr msg;
546 struct iovec iov;
547 vir_bytes msg_buf, data_buf;
548 size_t data_len;
549 dev_t dev;
550 int r, fd, flags;
552 assert(job_call_nr == VFS_SENDMSG || job_call_nr == VFS_RECVMSG);
554 fd = job_m_in.m_lc_vfs_sockmsg.fd;
555 msg_buf = job_m_in.m_lc_vfs_sockmsg.msgbuf;
557 if ((r = get_sock(fd, &dev, &flags)) != OK)
558 return r;
560 if ((r = sys_datacopy_wrapper(who_e, msg_buf, SELF, (vir_bytes)&msg,
561 sizeof(msg))) != OK)
562 return r;
564 data_buf = 0;
565 data_len = 0;
566 if (msg.msg_iovlen > 0) {
568 * We do not yet support vectors with more than one element;
569 * for this reason, libc is currently expected to consolidate
570 * the entire vector into a single element. Once we do add
571 * proper vector support, the ABI itself need not be changed.
573 if (msg.msg_iovlen > 1)
574 return EMSGSIZE;
576 if ((r = sys_datacopy_wrapper(who_e, (vir_bytes)msg.msg_iov,
577 SELF, (vir_bytes)&iov, sizeof(iov))) != OK)
578 return r;
580 if (iov.iov_len > SSIZE_MAX)
581 return EINVAL;
583 if (iov.iov_len > 0) {
584 data_buf = (vir_bytes)iov.iov_base;
585 data_len = iov.iov_len;
589 return sdev_readwrite(dev, data_buf, data_len,
590 (vir_bytes)msg.msg_control, msg.msg_controllen,
591 (vir_bytes)msg.msg_name, msg.msg_namelen,
592 job_m_in.m_lc_vfs_sockmsg.flags,
593 (job_call_nr == VFS_RECVMSG) ? READING : WRITING, flags,
594 (job_call_nr == VFS_RECVMSG) ? msg_buf : 0);
598 * Resume a previously suspended recvmsg(2) system call. The 'status'
599 * parameter contains either the number of data bytes received or a negative
600 * error code. The 'msg_buf' parameter contains the user address of the msghdr
601 * structure. If a failure occurs in this function, the received data
602 * (including, in the worst case, references to received file descriptors) will
603 * be lost - while seriously ugly, this is always the calling process's fault,
604 * extremely hard to deal with, and on par with current behavior in other
605 * operating systems. This function MUST NOT block its calling thread.
607 void
608 resume_recvmsg(struct fproc * rfp, int status, unsigned int ctl_len,
609 unsigned int addr_len, int flags, vir_bytes msg_buf)
611 struct msghdr msg;
612 int r;
614 if (status < 0) {
615 replycode(rfp->fp_endpoint, status);
617 return;
621 * Unfortunately, we now need to update a subset of the fields of the
622 * msghdr structure. We can 1) copy in the entire structure for the
623 * second time, modify some fields, and copy it out in its entirety
624 * again, 2) copy out individual fields that have been changed, 3) save
625 * a copy of the original structure somewhere. The third option is the
626 * most efficient, but would increase the fproc structure size by quite
627 * a bit. The main difference between the first and second options is
628 * the number of kernel calls; we choose to use the first option.
630 if ((r = sys_datacopy_wrapper(rfp->fp_endpoint, msg_buf, SELF,
631 (vir_bytes)&msg, sizeof(msg))) != OK) {
632 /* We copied it in before, how could it fail now? */
633 printf("VFS: resume_recvmsg cannot copy in msghdr? (%d)\n", r);
635 replycode(rfp->fp_endpoint, r);
637 return;
640 /* Modify and copy out the structure, and wake up the caller. */
641 msg.msg_controllen = ctl_len;
642 msg.msg_flags = flags;
643 if (addr_len > 0)
644 msg.msg_namelen = addr_len;
646 if ((r = sys_datacopy_wrapper(SELF, (vir_bytes)&msg, rfp->fp_endpoint,
647 msg_buf, sizeof(msg))) != OK)
648 status = r;
650 replycode(rfp->fp_endpoint, status);
654 * Set socket options.
657 do_setsockopt(void)
659 dev_t dev;
660 int r, fd;
662 fd = job_m_in.m_lc_vfs_sockopt.fd;
664 if ((r = get_sock(fd, &dev, NULL)) != OK)
665 return r;
667 return sdev_setsockopt(dev, job_m_in.m_lc_vfs_sockopt.level,
668 job_m_in.m_lc_vfs_sockopt.name, job_m_in.m_lc_vfs_sockopt.buf,
669 job_m_in.m_lc_vfs_sockopt.len);
673 * Get socket options.
676 do_getsockopt(void)
678 unsigned int len;
679 dev_t dev;
680 int r, fd;
682 fd = job_m_in.m_lc_vfs_sockopt.fd;
683 len = job_m_in.m_lc_vfs_sockopt.len;
685 if ((r = get_sock(fd, &dev, NULL)) != OK)
686 return r;
688 r = sdev_getsockopt(dev, job_m_in.m_lc_vfs_sockopt.level,
689 job_m_in.m_lc_vfs_sockopt.name, job_m_in.m_lc_vfs_sockopt.buf,
690 &len);
692 if (r == OK)
693 job_m_out.m_vfs_lc_socklen.len = len;
694 return r;
698 * Get the local address of a socket.
701 do_getsockname(void)
703 unsigned int len;
704 dev_t dev;
705 int r, fd;
707 fd = job_m_in.m_lc_vfs_sockaddr.fd;
708 len = job_m_in.m_lc_vfs_sockaddr.addr_len;
710 if ((r = get_sock(fd, &dev, NULL)) != OK)
711 return r;
713 r = sdev_getsockname(dev, job_m_in.m_lc_vfs_sockaddr.addr, &len);
715 if (r == OK)
716 job_m_out.m_vfs_lc_socklen.len = len;
717 return r;
721 * Get the remote address of a socket.
724 do_getpeername(void)
726 unsigned int len;
727 dev_t dev;
728 int r, fd;
730 fd = job_m_in.m_lc_vfs_sockaddr.fd;
731 len = job_m_in.m_lc_vfs_sockaddr.addr_len;
733 if ((r = get_sock(fd, &dev, NULL)) != OK)
734 return r;
736 r = sdev_getpeername(dev, job_m_in.m_lc_vfs_sockaddr.addr, &len);
738 if (r == OK)
739 job_m_out.m_vfs_lc_socklen.len = len;
740 return r;
744 * Shut down socket send and receive operations.
747 do_shutdown(void)
749 dev_t dev;
750 int r, fd, how;
752 fd = job_m_in.m_lc_vfs_shutdown.fd;
753 how = job_m_in.m_lc_vfs_shutdown.how;
755 if ((r = get_sock(fd, &dev, NULL)) != OK)
756 return r;
758 if (how != SHUT_RD && how != SHUT_WR && how != SHUT_RDWR)
759 return EINVAL;
761 return sdev_shutdown(dev, how);