dmake: do not set MAKEFLAGS=k
[unleashed/tickless.git] / kernel / fs / sockfs / socksubr.c
blob5752c0519c601350a40f54ff022e942c7d747ea0
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
23 * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2015, Joyent, Inc. All rights reserved.
25 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
28 #include <sys/types.h>
29 #include <sys/t_lock.h>
30 #include <sys/param.h>
31 #include <sys/systm.h>
32 #include <sys/buf.h>
33 #include <sys/conf.h>
34 #include <sys/cred.h>
35 #include <sys/kmem.h>
36 #include <sys/sysmacros.h>
37 #include <sys/vfs.h>
38 #include <sys/vnode.h>
39 #include <sys/debug.h>
40 #include <sys/errno.h>
41 #include <sys/time.h>
42 #include <sys/file.h>
43 #include <sys/open.h>
44 #include <sys/user.h>
45 #include <sys/termios.h>
46 #include <sys/stream.h>
47 #include <sys/strsubr.h>
48 #include <sys/strsun.h>
49 #include <sys/esunddi.h>
50 #include <sys/flock.h>
51 #include <sys/modctl.h>
52 #include <sys/cmn_err.h>
53 #include <sys/mkdev.h>
54 #include <sys/pathname.h>
55 #include <sys/ddi.h>
56 #include <sys/stat.h>
57 #include <sys/fs/snode.h>
58 #include <sys/fs/dv_node.h>
59 #include <sys/zone.h>
61 #include <sys/socket.h>
62 #include <sys/socketvar.h>
63 #include <netinet/in.h>
64 #include <sys/un.h>
65 #include <sys/ucred.h>
67 #include <sys/tiuser.h>
68 #define _SUN_TPI_VERSION 2
69 #include <sys/tihdr.h>
71 #include <c2/audit.h>
73 #include "sockcommon.h"
74 #include "sockfilter_impl.h"
75 #include "socktpi.h"
76 #include "socktpi_impl.h"
77 #include "sodirect.h"
/*
 * Macros that operate on struct cmsghdr.
 * The CMSG_VALID macro does not assume that the last option buffer is padded.
 *
 * CMSG_CONTENT(cmsg)	 - address of the first byte after the header.
 * CMSG_CONTENTLEN(cmsg) - cmsg_len minus the size of the header.
 * CMSG_VALID(cmsg, start, end) - non-zero when cmsg is aligned, lies within
 *	[start, end), claims at least a full header, and header plus content
 *	end at or before `end'.
 *
 * The final bound is expressed as "length <= bytes remaining" rather than
 * "pointer + length <= end" so that a huge cmsg_len cannot wrap the pointer
 * arithmetic and sneak past the check.  The subtraction cannot underflow
 * because the preceding clause already established cmsg < end.
 */
#define	CMSG_CONTENT(cmsg)	(&((cmsg)[1]))
#define	CMSG_CONTENTLEN(cmsg)	((cmsg)->cmsg_len - sizeof (struct cmsghdr))
#define	CMSG_VALID(cmsg, start, end)					\
	(ISALIGNED_cmsghdr(cmsg) &&					\
	((uintptr_t)(cmsg) >= (uintptr_t)(start)) &&			\
	((uintptr_t)(cmsg) < (uintptr_t)(end)) &&			\
	((cmsg)->cmsg_len >= sizeof (struct cmsghdr)) &&		\
	((cmsg)->cmsg_len <= (uintptr_t)(end) - (uintptr_t)(cmsg)))
#define	SO_LOCK_WAKEUP_TIME	3000	/* Wakeup time in milliseconds */
93 dev_t sockdev; /* For fsid in getattr */
95 struct socklist socklist;
97 struct kmem_cache *socket_cache;
100 * sockconf_lock protects the socket configuration (socket types and
101 * socket filters) which is changed via the sockconfig system call.
103 krwlock_t sockconf_lock;
105 static int sockfs_update(kstat_t *, int);
106 static int sockfs_snapshot(kstat_t *, void *, int);
107 extern smod_info_t *sotpi_smod_create(void);
109 extern void sendfile_init();
111 extern int modrootloaded;
113 #define ADRSTRLEN (2 * sizeof (void *) + 1)
115 * kernel structure for passing the sockinfo data back up to the user.
116 * the strings array allows us to convert AF_UNIX addresses into strings
117 * with a common method regardless of which n-bit kernel we're running.
119 struct k_sockinfo {
120 struct sockinfo ks_si;
121 char ks_straddr[3][ADRSTRLEN];
125 * Translate from a device pathname (e.g. "/dev/tcp") to a vnode.
126 * Returns with the vnode held.
129 sogetvp(char *devpath, vnode_t **vpp, int uioflag)
131 struct snode *csp;
132 vnode_t *vp, *dvp;
133 major_t maj;
134 int error;
136 ASSERT(uioflag == UIO_SYSSPACE || uioflag == UIO_USERSPACE);
139 * Lookup the underlying filesystem vnode.
141 error = lookupname(devpath, uioflag, FOLLOW, NULLVPP, &vp);
142 if (error)
143 return (error);
145 /* Check that it is the correct vnode */
146 if (vp->v_type != VCHR) {
147 VN_RELE(vp);
148 return (ENOTSOCK);
152 * If devpath went through devfs, the device should already
153 * be configured. If devpath is a mknod file, however, we
154 * need to make sure the device is properly configured.
155 * To do this, we do something similar to spec_open()
156 * except that we resolve to the minor/leaf level since
157 * we need to return a vnode.
159 csp = VTOS(VTOS(vp)->s_commonvp);
160 if (!(csp->s_flag & SDIPSET)) {
161 char *pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
162 error = ddi_dev_pathname(vp->v_rdev, S_IFCHR, pathname);
163 if (error == 0)
164 error = devfs_lookupname(pathname, NULLVPP, &dvp);
165 VN_RELE(vp);
166 kmem_free(pathname, MAXPATHLEN);
167 if (error != 0)
168 return (ENXIO);
169 vp = dvp; /* use the devfs vp */
172 /* device is configured at this point */
173 maj = getmajor(vp->v_rdev);
174 if (!STREAMSTAB(maj)) {
175 VN_RELE(vp);
176 return (ENOSTR);
179 *vpp = vp;
180 return (0);
184 * Update the accessed, updated, or changed times in an sonode
185 * with the current time.
187 * Note that both SunOS 4.X and 4.4BSD sockets do not present reasonable
188 * attributes in a fstat call. (They return the current time and 0 for
189 * all timestamps, respectively.) We maintain the current timestamps
190 * here primarily so that should sockmod be popped the resulting
191 * file descriptor will behave like a stream w.r.t. the timestamps.
193 void
194 so_update_attrs(struct sonode *so, int flag)
196 time_t now = gethrestime_sec();
198 if (SOCK_IS_NONSTR(so))
199 return;
201 mutex_enter(&so->so_lock);
202 so->so_flag |= flag;
203 if (flag & SOACC)
204 SOTOTPI(so)->sti_atime = now;
205 if (flag & SOMOD)
206 SOTOTPI(so)->sti_mtime = now;
207 mutex_exit(&so->so_lock);
210 extern so_create_func_t sock_comm_create_function;
211 extern so_destroy_func_t sock_comm_destroy_function;
213 /* yes, we want all defaults */
214 static const struct vfsops sock_vfsops;
217 * Init function called when sockfs is loaded.
220 sockinit(int fstype, char *name)
222 int error;
223 major_t dev;
224 char *err_str;
226 error = vfs_setfsops(fstype, &sock_vfsops);
227 if (error != 0) {
228 zcmn_err(GLOBAL_ZONEID, CE_WARN,
229 "sockinit: bad fstype");
230 return (error);
233 socket_cache = kmem_cache_create("socket_cache",
234 sizeof (struct sonode), 0, sonode_constructor,
235 sonode_destructor, NULL, NULL, NULL, 0);
237 rw_init(&sockconf_lock, NULL, RW_DEFAULT, NULL);
239 error = socktpi_init();
240 if (error != 0) {
241 err_str = NULL;
242 goto failure;
245 error = sod_init();
246 if (error != 0) {
247 err_str = NULL;
248 goto failure;
252 * Set up the default create and destroy functions
254 sock_comm_create_function = socket_sonode_create;
255 sock_comm_destroy_function = socket_sonode_destroy;
258 * Build initial list mapping socket parameters to vnode.
260 smod_init();
261 smod_add(sotpi_smod_create());
263 sockparams_init();
266 * If sockets are needed before init runs /sbin/soconfig
267 * it is possible to preload the sockparams list here using
268 * calls like:
269 * sockconfig(1,2,3, "/dev/tcp", 0);
273 * Create a unique dev_t for use in so_fsid.
276 if ((dev = getudev()) == (major_t)-1)
277 dev = 0;
278 sockdev = makedevice(dev, 0);
280 mutex_init(&socklist.sl_lock, NULL, MUTEX_DEFAULT, NULL);
281 sendfile_init();
282 /* Initialize socket filters */
283 sof_init();
285 return (0);
287 failure:
288 (void) vfs_freevfsops_by_type(fstype);
289 if (err_str != NULL)
290 zcmn_err(GLOBAL_ZONEID, CE_WARN, err_str);
291 return (error);
295 * Caller must hold the mutex. Used to set SOLOCKED.
297 void
298 so_lock_single(struct sonode *so)
300 ASSERT(MUTEX_HELD(&so->so_lock));
302 while (so->so_flag & (SOLOCKED | SOASYNC_UNBIND)) {
303 cv_wait_stop(&so->so_single_cv, &so->so_lock,
304 SO_LOCK_WAKEUP_TIME);
306 so->so_flag |= SOLOCKED;
310 * Caller must hold the mutex and pass in SOLOCKED or SOASYNC_UNBIND.
311 * Used to clear SOLOCKED or SOASYNC_UNBIND.
313 void
314 so_unlock_single(struct sonode *so, int flag)
316 ASSERT(MUTEX_HELD(&so->so_lock));
317 ASSERT(flag & (SOLOCKED|SOASYNC_UNBIND));
318 ASSERT((flag & ~(SOLOCKED|SOASYNC_UNBIND)) == 0);
319 ASSERT(so->so_flag & flag);
321 * Process the T_DISCON_IND on sti_discon_ind_mp.
323 * Call to so_drain_discon_ind will result in so_lock
324 * being dropped and re-acquired later.
326 if (!SOCK_IS_NONSTR(so)) {
327 sotpi_info_t *sti = SOTOTPI(so);
329 if (sti->sti_discon_ind_mp != NULL)
330 so_drain_discon_ind(so);
333 cv_signal(&so->so_single_cv);
334 so->so_flag &= ~flag;
338 * Caller must hold the mutex. Used to set SOREADLOCKED.
339 * If the caller wants nonblocking behavior it should set fmode.
342 so_lock_read(struct sonode *so, int fmode)
344 ASSERT(MUTEX_HELD(&so->so_lock));
346 while (so->so_flag & SOREADLOCKED) {
347 if (fmode & (FNDELAY|FNONBLOCK))
348 return (EWOULDBLOCK);
349 cv_wait_stop(&so->so_read_cv, &so->so_lock,
350 SO_LOCK_WAKEUP_TIME);
352 so->so_flag |= SOREADLOCKED;
353 return (0);
357 * Like so_lock_read above but allows signals.
360 so_lock_read_intr(struct sonode *so, int fmode)
362 ASSERT(MUTEX_HELD(&so->so_lock));
364 while (so->so_flag & SOREADLOCKED) {
365 if (fmode & (FNDELAY|FNONBLOCK))
366 return (EWOULDBLOCK);
367 if (!cv_wait_sig(&so->so_read_cv, &so->so_lock))
368 return (EINTR);
370 so->so_flag |= SOREADLOCKED;
371 return (0);
375 * Caller must hold the mutex. Used to clear SOREADLOCKED,
376 * set in so_lock_read() or so_lock_read_intr().
378 void
379 so_unlock_read(struct sonode *so)
381 ASSERT(MUTEX_HELD(&so->so_lock));
382 ASSERT(so->so_flag & SOREADLOCKED);
384 cv_signal(&so->so_read_cv);
385 so->so_flag &= ~SOREADLOCKED;
389 * Verify that the specified offset falls within the mblk and
390 * that the resulting pointer is aligned.
391 * Returns NULL if not.
393 void *
394 sogetoff(mblk_t *mp, t_uscalar_t offset,
395 t_uscalar_t length, uint_t align_size)
397 uintptr_t ptr1, ptr2;
399 ASSERT(mp && mp->b_wptr >= mp->b_rptr);
400 ptr1 = (uintptr_t)mp->b_rptr + offset;
401 ptr2 = (uintptr_t)ptr1 + length;
402 if (ptr1 < (uintptr_t)mp->b_rptr || ptr2 > (uintptr_t)mp->b_wptr) {
403 eprintline(0);
404 return (NULL);
406 if ((ptr1 & (align_size - 1)) != 0) {
407 eprintline(0);
408 return (NULL);
410 return ((void *)ptr1);
414 * Return the AF_UNIX underlying filesystem vnode matching a given name.
415 * Makes sure the sending and the destination sonodes are compatible.
416 * The vnode is returned held.
418 * The underlying filesystem VSOCK vnode has a v_stream pointer that
419 * references the actual stream head (hence indirectly the actual sonode).
421 static int
422 so_ux_lookup(struct sonode *so, struct sockaddr_un *soun, vnode_t **vpp)
424 vnode_t *vp; /* Underlying filesystem vnode */
425 vnode_t *rvp; /* real vnode */
426 vnode_t *svp; /* sockfs vnode */
427 struct sonode *so2;
428 int error;
430 dprintso(so, 1, ("so_ux_lookup(%p) name <%s>\n", (void *)so,
431 soun->sun_path));
433 error = lookupname(soun->sun_path, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp);
434 if (error) {
435 eprintsoline(so, error);
436 return (error);
440 * Traverse lofs mounts get the real vnode
442 if (fop_realvp(vp, &rvp, NULL) == 0) {
443 VN_HOLD(rvp); /* hold the real vnode */
444 VN_RELE(vp); /* release hold from lookup */
445 vp = rvp;
448 if (vp->v_type != VSOCK) {
449 error = ENOTSOCK;
450 eprintsoline(so, error);
451 goto done2;
455 * Check that we have permissions to access the destination
456 * vnode.
458 if (error = fop_access(vp, VREAD|VWRITE, 0, CRED(), NULL)) {
459 eprintsoline(so, error);
460 goto done2;
464 * Check if the remote socket has been closed.
466 * Synchronize with vn_rele_stream by holding v_lock while traversing
467 * v_stream->sd_vnode.
469 mutex_enter(&vp->v_lock);
470 if (vp->v_stream == NULL) {
471 mutex_exit(&vp->v_lock);
472 if (so->so_type == SOCK_DGRAM)
473 error = EDESTADDRREQ;
474 else
475 error = ECONNREFUSED;
477 eprintsoline(so, error);
478 goto done2;
480 ASSERT(vp->v_stream->sd_vnode);
481 svp = vp->v_stream->sd_vnode;
483 * holding v_lock on underlying filesystem vnode and acquiring
484 * it on sockfs vnode. Assumes that no code ever attempts to
485 * acquire these locks in the reverse order.
487 VN_HOLD(svp);
488 mutex_exit(&vp->v_lock);
490 if (svp->v_type != VSOCK) {
491 error = ENOTSOCK;
492 eprintsoline(so, error);
493 goto done;
496 so2 = VTOSO(svp);
498 if (so->so_type != so2->so_type) {
499 error = EPROTOTYPE;
500 eprintsoline(so, error);
501 goto done;
504 VN_RELE(svp);
505 *vpp = vp;
506 return (0);
508 done:
509 VN_RELE(svp);
510 done2:
511 VN_RELE(vp);
512 return (error);
516 * Verify peer address for connect and sendto/sendmsg.
517 * Since sendto/sendmsg would not get synchronous errors from the transport
518 * provider we have to do these ugly checks in the socket layer to
519 * preserve compatibility with SunOS 4.X.
522 so_addr_verify(struct sonode *so, const struct sockaddr *name,
523 socklen_t namelen)
525 int family;
527 dprintso(so, 1, ("so_addr_verify(%p, %p, %d)\n",
528 (void *)so, (void *)name, namelen));
530 ASSERT(name != NULL);
532 family = so->so_family;
533 switch (family) {
534 case AF_INET:
535 if (name->sa_family != family) {
536 eprintsoline(so, EAFNOSUPPORT);
537 return (EAFNOSUPPORT);
539 if (namelen != (socklen_t)sizeof (struct sockaddr_in)) {
540 eprintsoline(so, EINVAL);
541 return (EINVAL);
543 break;
544 case AF_INET6: {
545 #ifdef DEBUG
546 struct sockaddr_in6 *sin6;
547 #endif /* DEBUG */
549 if (name->sa_family != family) {
550 eprintsoline(so, EAFNOSUPPORT);
551 return (EAFNOSUPPORT);
553 if (namelen != (socklen_t)sizeof (struct sockaddr_in6)) {
554 eprintsoline(so, EINVAL);
555 return (EINVAL);
557 #ifdef DEBUG
558 /* Verify that apps don't forget to clear sin6_scope_id etc */
559 sin6 = (struct sockaddr_in6 *)name;
560 if (sin6->sin6_scope_id != 0 &&
561 !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
562 zcmn_err(getzoneid(), CE_WARN,
563 "connect/send* with uninitialized sin6_scope_id "
564 "(%d) on socket. Pid = %d\n",
565 (int)sin6->sin6_scope_id, (int)curproc->p_pid);
567 #endif /* DEBUG */
568 break;
570 case AF_UNIX:
571 if (SOTOTPI(so)->sti_faddr_noxlate) {
572 return (0);
574 if (namelen < (socklen_t)sizeof (short)) {
575 eprintsoline(so, ENOENT);
576 return (ENOENT);
578 if (name->sa_family != family) {
579 eprintsoline(so, EAFNOSUPPORT);
580 return (EAFNOSUPPORT);
582 /* MAXPATHLEN + soun_family + nul termination */
583 if (namelen > (socklen_t)(MAXPATHLEN + sizeof (short) + 1)) {
584 eprintsoline(so, ENAMETOOLONG);
585 return (ENAMETOOLONG);
588 break;
590 default:
592 * Default is don't do any length or sa_family check
593 * to allow non-sockaddr style addresses.
595 break;
598 return (0);
603 * Translate an AF_UNIX sockaddr_un to the transport internal name.
604 * Assumes caller has called so_addr_verify first. The translated
605 * (internal form) address is stored in sti->sti_ux_taddr.
607 /*ARGSUSED*/
609 so_ux_addr_xlate(struct sonode *so, struct sockaddr *name,
610 socklen_t namelen, void **addrp, socklen_t *addrlenp)
612 int error;
613 struct sockaddr_un *soun;
614 vnode_t *vp;
615 void *addr;
616 socklen_t addrlen;
617 sotpi_info_t *sti = SOTOTPI(so);
619 dprintso(so, 1, ("so_ux_addr_xlate(%p, %p, %d)\n",
620 (void *)so, (void *)name, namelen));
622 ASSERT(name != NULL);
623 ASSERT(so->so_family == AF_UNIX);
624 ASSERT(!sti->sti_faddr_noxlate);
625 ASSERT(namelen >= (socklen_t)sizeof (short));
626 ASSERT(name->sa_family == AF_UNIX);
627 soun = (struct sockaddr_un *)name;
629 * Lookup vnode for the specified path name and verify that
630 * it is a socket.
632 error = so_ux_lookup(so, soun, &vp);
633 if (error) {
634 eprintsoline(so, error);
635 return (error);
638 * Use the address of the peer vnode as the address to send
639 * to. We release the peer vnode here. In case it has been
640 * closed by the time the T_CONN_REQ or T_UNITDATA_REQ reaches the
641 * transport the message will get an error or be dropped.
642 * Note that that soua_vp is never dereferenced; it's just a
643 * convenient value by which we can identify the peer.
645 sti->sti_ux_taddr.soua_vp = vp;
646 sti->sti_ux_taddr.soua_magic = SOU_MAGIC_EXPLICIT;
647 addr = &sti->sti_ux_taddr;
648 addrlen = (socklen_t)sizeof (sti->sti_ux_taddr);
649 dprintso(so, 1, ("ux_xlate UNIX: addrlen %d, vp %p\n",
650 addrlen, (void *)vp));
651 VN_RELE(vp);
652 *addrp = addr;
653 *addrlenp = (socklen_t)addrlen;
654 return (0);
658 * Esballoc free function for messages that contain SO_FILEP option.
659 * Decrement the reference count on the file pointers using closef.
661 void
662 fdbuf_free(struct fdbuf *fdbuf)
664 int i;
665 struct file *fp;
667 dprint(1, ("fdbuf_free: %d fds\n", fdbuf->fd_numfd));
668 for (i = 0; i < fdbuf->fd_numfd; i++) {
670 * We need pointer size alignment for fd_fds. On a LP64
671 * kernel, the required alignment is 8 bytes while
672 * the option headers and values are only 4 bytes
673 * aligned. So its safer to do a bcopy compared to
674 * assigning fdbuf->fd_fds[i] to fp.
676 bcopy((char *)&fdbuf->fd_fds[i], (char *)&fp, sizeof (fp));
677 dprint(1, ("fdbuf_free: [%d] = %p\n", i, (void *)fp));
678 (void) closef(fp);
680 if (fdbuf->fd_ebuf != NULL)
681 kmem_free(fdbuf->fd_ebuf, fdbuf->fd_ebuflen);
682 kmem_free(fdbuf, fdbuf->fd_size);
686 * Allocate an esballoc'ed message for AF_UNIX file descriptor passing.
687 * Waits if memory is not available.
689 mblk_t *
690 fdbuf_allocmsg(int size, struct fdbuf *fdbuf)
692 uchar_t *buf;
693 mblk_t *mp;
695 dprint(1, ("fdbuf_allocmsg: size %d, %d fds\n", size, fdbuf->fd_numfd));
696 buf = kmem_alloc(size, KM_SLEEP);
697 fdbuf->fd_ebuf = (caddr_t)buf;
698 fdbuf->fd_ebuflen = size;
699 fdbuf->fd_frtn.free_func = fdbuf_free;
700 fdbuf->fd_frtn.free_arg = (caddr_t)fdbuf;
702 mp = esballoc_wait(buf, size, BPRI_MED, &fdbuf->fd_frtn);
703 mp->b_datap->db_type = M_PROTO;
704 return (mp);
708 * Extract file descriptors from a fdbuf.
709 * Return list in rights/rightslen.
711 /*ARGSUSED*/
712 static int
713 fdbuf_extract(struct fdbuf *fdbuf, void *rights, int rightslen)
715 int i, fd;
716 int *rp;
717 struct file *fp;
718 int numfd;
720 dprint(1, ("fdbuf_extract: %d fds, len %d\n",
721 fdbuf->fd_numfd, rightslen));
723 numfd = fdbuf->fd_numfd;
724 ASSERT(rightslen == numfd * (int)sizeof (int));
727 * Allocate a file descriptor and increment the f_count.
728 * The latter is needed since we always call fdbuf_free
729 * which performs a closef.
731 rp = (int *)rights;
732 for (i = 0; i < numfd; i++) {
733 if ((fd = ufalloc(0)) == -1)
734 goto cleanup;
736 * We need pointer size alignment for fd_fds. On a LP64
737 * kernel, the required alignment is 8 bytes while
738 * the option headers and values are only 4 bytes
739 * aligned. So its safer to do a bcopy compared to
740 * assigning fdbuf->fd_fds[i] to fp.
742 bcopy((char *)&fdbuf->fd_fds[i], (char *)&fp, sizeof (fp));
743 mutex_enter(&fp->f_tlock);
744 fp->f_count++;
745 mutex_exit(&fp->f_tlock);
746 setf(fd, fp);
747 *rp++ = fd;
748 if (AU_AUDITING())
749 audit_fdrecv(fd, fp);
750 dprint(1, ("fdbuf_extract: [%d] = %d, %p refcnt %d\n",
751 i, fd, (void *)fp, fp->f_count));
753 return (0);
755 cleanup:
757 * Undo whatever partial work the loop above has done.
760 int j;
762 rp = (int *)rights;
763 for (j = 0; j < i; j++) {
764 dprint(0,
765 ("fdbuf_extract: cleanup[%d] = %d\n", j, *rp));
766 (void) closeandsetf(*rp++, NULL);
770 return (EMFILE);
774 * Insert file descriptors into an fdbuf.
775 * Returns a kmem_alloc'ed fdbuf. The fdbuf should be freed
776 * by calling fdbuf_free().
779 fdbuf_create(void *rights, int rightslen, struct fdbuf **fdbufp)
781 int numfd, i;
782 int *fds;
783 struct file *fp;
784 struct fdbuf *fdbuf;
785 int fdbufsize;
787 dprint(1, ("fdbuf_create: len %d\n", rightslen));
789 numfd = rightslen / (int)sizeof (int);
791 fdbufsize = (int)FDBUF_HDRSIZE + (numfd * (int)sizeof (struct file *));
792 fdbuf = kmem_alloc(fdbufsize, KM_SLEEP);
793 fdbuf->fd_size = fdbufsize;
794 fdbuf->fd_numfd = 0;
795 fdbuf->fd_ebuf = NULL;
796 fdbuf->fd_ebuflen = 0;
797 fds = (int *)rights;
798 for (i = 0; i < numfd; i++) {
799 if ((fp = getf(fds[i])) == NULL) {
800 fdbuf_free(fdbuf);
801 return (EBADF);
803 dprint(1, ("fdbuf_create: [%d] = %d, %p refcnt %d\n",
804 i, fds[i], (void *)fp, fp->f_count));
805 mutex_enter(&fp->f_tlock);
806 fp->f_count++;
807 mutex_exit(&fp->f_tlock);
809 * The maximum alignment for fdbuf (or any option header
810 * and its value) it 4 bytes. On a LP64 kernel, the alignment
811 * is not sufficient for pointers (fd_fds in this case). Since
812 * we just did a kmem_alloc (we get a double word alignment),
813 * we don't need to do anything on the send side (we loose
814 * the double word alignment because fdbuf goes after an
815 * option header (eg T_unitdata_req) which is only 4 byte
816 * aligned). We take care of this when we extract the file
817 * descriptor in fdbuf_extract or fdbuf_free.
819 fdbuf->fd_fds[i] = fp;
820 fdbuf->fd_numfd++;
821 releasef(fds[i]);
822 if (AU_AUDITING())
823 audit_fdsend(fds[i], fp, 0);
825 *fdbufp = fdbuf;
826 return (0);
829 static int
830 fdbuf_optlen(int rightslen)
832 int numfd;
834 numfd = rightslen / (int)sizeof (int);
836 return ((int)FDBUF_HDRSIZE + (numfd * (int)sizeof (struct file *)));
839 static t_uscalar_t
840 fdbuf_cmsglen(int fdbuflen)
842 return (t_uscalar_t)((fdbuflen - FDBUF_HDRSIZE) /
843 (int)sizeof (struct file *) * (int)sizeof (int));
848 * Return non-zero if the mblk and fdbuf are consistent.
850 static int
851 fdbuf_verify(mblk_t *mp, struct fdbuf *fdbuf, int fdbuflen)
853 if (fdbuflen >= FDBUF_HDRSIZE &&
854 fdbuflen == fdbuf->fd_size) {
855 frtn_t *frp = mp->b_datap->db_frtnp;
857 * Check that the SO_FILEP portion of the
858 * message has not been modified by
859 * the loopback transport. The sending sockfs generates
860 * a message that is esballoc'ed with the free function
861 * being fdbuf_free() and where free_arg contains the
862 * identical information as the SO_FILEP content.
864 * If any of these constraints are not satisfied we
865 * silently ignore the option.
867 ASSERT(mp);
868 if (frp != NULL &&
869 frp->free_func == fdbuf_free &&
870 frp->free_arg != NULL &&
871 bcmp(frp->free_arg, fdbuf, fdbuflen) == 0) {
872 dprint(1, ("fdbuf_verify: fdbuf %p len %d\n",
873 (void *)fdbuf, fdbuflen));
874 return (1);
875 } else {
876 zcmn_err(getzoneid(), CE_WARN,
877 "sockfs: mismatched fdbuf content (%p)",
878 (void *)mp);
879 return (0);
881 } else {
882 zcmn_err(getzoneid(), CE_WARN,
883 "sockfs: mismatched fdbuf len %d, %d\n",
884 fdbuflen, fdbuf->fd_size);
885 return (0);
890 * When the file descriptors returned by sorecvmsg can not be passed
891 * to the application this routine will cleanup the references on
892 * the files. Start at startoff bytes into the buffer.
894 static void
895 close_fds(void *fdbuf, int fdbuflen, int startoff)
897 int *fds = (int *)fdbuf;
898 int numfd = fdbuflen / (int)sizeof (int);
899 int i;
901 dprint(1, ("close_fds(%p, %d, %d)\n", fdbuf, fdbuflen, startoff));
903 for (i = 0; i < numfd; i++) {
904 if (startoff < 0)
905 startoff = 0;
906 if (startoff < (int)sizeof (int)) {
908 * This file descriptor is partially or fully after
909 * the offset
911 dprint(0,
912 ("close_fds: cleanup[%d] = %d\n", i, fds[i]));
913 (void) closeandsetf(fds[i], NULL);
915 startoff -= (int)sizeof (int);
920 * Close all file descriptors contained in the control part starting at
921 * the startoffset.
923 void
924 so_closefds(void *control, t_uscalar_t controllen, int startoff)
926 struct cmsghdr *cmsg;
928 if (control == NULL)
929 return;
931 /* Scan control part for file descriptors. */
932 for (cmsg = (struct cmsghdr *)control;
933 CMSG_VALID(cmsg, control, (uintptr_t)control + controllen);
934 cmsg = CMSG_NEXT(cmsg)) {
935 if (cmsg->cmsg_level == SOL_SOCKET &&
936 cmsg->cmsg_type == SCM_RIGHTS) {
937 close_fds(CMSG_CONTENT(cmsg),
938 (int)CMSG_CONTENTLEN(cmsg),
939 startoff - (int)sizeof (struct cmsghdr));
941 startoff -= cmsg->cmsg_len;
946 * Returns a pointer/length for the file descriptors contained
947 * in the control buffer. Returns with *fdlenp == -1 if there are no
948 * file descriptor options present. This is different than there being
949 * a zero-length file descriptor option.
950 * Fail if there are multiple SCM_RIGHT cmsgs.
953 so_getfdopt(void *control, t_uscalar_t controllen, void **fdsp, int *fdlenp)
955 struct cmsghdr *cmsg;
956 void *fds;
957 int fdlen;
959 if (control == NULL) {
960 *fdsp = NULL;
961 *fdlenp = -1;
962 return (0);
965 fds = NULL;
966 fdlen = 0;
968 for (cmsg = (struct cmsghdr *)control;
969 CMSG_VALID(cmsg, control, (uintptr_t)control + controllen);
970 cmsg = CMSG_NEXT(cmsg)) {
971 if (cmsg->cmsg_level == SOL_SOCKET &&
972 cmsg->cmsg_type == SCM_RIGHTS) {
973 if (fds != NULL)
974 return (EINVAL);
975 fds = CMSG_CONTENT(cmsg);
976 fdlen = (int)CMSG_CONTENTLEN(cmsg);
977 dprint(1, ("so_getfdopt: new %lu\n",
978 (size_t)CMSG_CONTENTLEN(cmsg)));
981 if (fds == NULL) {
982 dprint(1, ("so_getfdopt: NONE\n"));
983 *fdlenp = -1;
984 } else
985 *fdlenp = fdlen;
986 *fdsp = fds;
987 return (0);
991 * Return the length of the options including any file descriptor options.
993 t_uscalar_t
994 so_optlen(void *control, t_uscalar_t controllen)
996 struct cmsghdr *cmsg;
997 t_uscalar_t optlen = 0;
998 t_uscalar_t len;
1000 if (control == NULL)
1001 return (0);
1003 for (cmsg = (struct cmsghdr *)control;
1004 CMSG_VALID(cmsg, control, (uintptr_t)control + controllen);
1005 cmsg = CMSG_NEXT(cmsg)) {
1006 if (cmsg->cmsg_level == SOL_SOCKET &&
1007 cmsg->cmsg_type == SCM_RIGHTS) {
1008 len = fdbuf_optlen((int)CMSG_CONTENTLEN(cmsg));
1009 } else {
1010 len = (t_uscalar_t)CMSG_CONTENTLEN(cmsg);
1012 optlen += (t_uscalar_t)(_TPI_ALIGN_TOPT(len) +
1013 sizeof (struct T_opthdr));
1015 dprint(1, ("so_optlen: controllen %d -> optlen %d\n",
1016 controllen, optlen));
1017 return (optlen);
1021 * Copy options from control to the mblk. Skip any file descriptor options.
1023 void
1024 so_cmsg2opt(void *control, t_uscalar_t controllen, mblk_t *mp)
1026 struct T_opthdr toh;
1027 struct cmsghdr *cmsg;
1029 if (control == NULL)
1030 return;
1032 for (cmsg = (struct cmsghdr *)control;
1033 CMSG_VALID(cmsg, control, (uintptr_t)control + controllen);
1034 cmsg = CMSG_NEXT(cmsg)) {
1036 * Note: The caller handles file descriptors prior
1037 * to calling this function.
1039 t_uscalar_t len;
1041 if (cmsg->cmsg_level == SOL_SOCKET &&
1042 cmsg->cmsg_type == SCM_RIGHTS)
1043 continue;
1045 len = (t_uscalar_t)CMSG_CONTENTLEN(cmsg);
1046 toh.level = cmsg->cmsg_level;
1047 toh.name = cmsg->cmsg_type;
1048 toh.len = len + (t_uscalar_t)sizeof (struct T_opthdr);
1049 toh.status = 0;
1051 soappendmsg(mp, &toh, sizeof (toh));
1052 soappendmsg(mp, CMSG_CONTENT(cmsg), len);
1053 mp->b_wptr += _TPI_ALIGN_TOPT(len) - len;
1054 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
1059 * Return the length of the control message derived from the options.
1060 * Exclude SO_SRCADDR and SO_UNIX_CLOSE options. Include SO_FILEP.
1061 * so_opt2cmsg and so_cmsglen are inter-related since so_cmsglen
1062 * allocates the space that so_opt2cmsg fills. If one changes, the other should
1063 * also be checked for any possible impacts.
1065 t_uscalar_t
1066 so_cmsglen(mblk_t *mp, void *opt, t_uscalar_t optlen)
1068 t_uscalar_t cmsglen = 0;
1069 struct T_opthdr *tohp;
1070 t_uscalar_t len;
1071 t_uscalar_t last_roundup = 0;
1073 ASSERT(__TPI_TOPT_ISALIGNED(opt));
1075 for (tohp = (struct T_opthdr *)opt;
1076 tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen);
1077 tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) {
1078 dprint(1, ("so_cmsglen: level 0x%x, name %d, len %d\n",
1079 tohp->level, tohp->name, tohp->len));
1080 if (tohp->level == SOL_SOCKET &&
1081 (tohp->name == SO_SRCADDR ||
1082 tohp->name == SO_UNIX_CLOSE)) {
1083 continue;
1085 if (tohp->level == SOL_SOCKET && tohp->name == SO_FILEP) {
1086 struct fdbuf *fdbuf;
1087 int fdbuflen;
1089 fdbuf = (struct fdbuf *)_TPI_TOPT_DATA(tohp);
1090 fdbuflen = (int)_TPI_TOPT_DATALEN(tohp);
1092 if (!fdbuf_verify(mp, fdbuf, fdbuflen))
1093 continue;
1094 len = fdbuf_cmsglen(fdbuflen);
1095 } else if (tohp->level == SOL_SOCKET &&
1096 tohp->name == SCM_TIMESTAMP) {
1098 if (get_udatamodel() == DATAMODEL_NATIVE) {
1099 len = sizeof (struct timeval);
1100 } else {
1101 len = sizeof (struct timeval32);
1103 } else {
1104 len = (t_uscalar_t)_TPI_TOPT_DATALEN(tohp);
1107 * Exclude roundup for last option to not set
1108 * MSG_CTRUNC when the cmsg fits but the padding doesn't fit.
1110 last_roundup = (t_uscalar_t)
1111 (ROUNDUP_cmsglen(len + (int)sizeof (struct cmsghdr)) -
1112 (len + (int)sizeof (struct cmsghdr)));
1113 cmsglen += (t_uscalar_t)(len + (int)sizeof (struct cmsghdr)) +
1114 last_roundup;
1116 cmsglen -= last_roundup;
1117 dprint(1, ("so_cmsglen: optlen %d -> cmsglen %d\n",
1118 optlen, cmsglen));
1119 return (cmsglen);
1123 * Copy options from options to the control. Convert SO_FILEP to
1124 * file descriptors.
1125 * Returns errno or zero.
1126 * so_opt2cmsg and so_cmsglen are inter-related since so_cmsglen
1127 * allocates the space that so_opt2cmsg fills. If one changes, the other should
1128 * also be checked for any possible impacts.
1131 so_opt2cmsg(mblk_t *mp, void *opt, t_uscalar_t optlen, void *control,
1132 t_uscalar_t controllen)
1134 struct T_opthdr *tohp;
1135 struct cmsghdr *cmsg;
1136 struct fdbuf *fdbuf;
1137 int fdbuflen;
1138 int error;
1139 #if defined(DEBUG) || defined(__lint)
1140 struct cmsghdr *cend = (struct cmsghdr *)
1141 (((uint8_t *)control) + ROUNDUP_cmsglen(controllen));
1142 #endif
1143 cmsg = (struct cmsghdr *)control;
1145 ASSERT(__TPI_TOPT_ISALIGNED(opt));
1147 for (tohp = (struct T_opthdr *)opt;
1148 tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen);
1149 tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) {
1150 dprint(1, ("so_opt2cmsg: level 0x%x, name %d, len %d\n",
1151 tohp->level, tohp->name, tohp->len));
1153 if (tohp->level == SOL_SOCKET &&
1154 (tohp->name == SO_SRCADDR ||
1155 tohp->name == SO_UNIX_CLOSE)) {
1156 continue;
1158 ASSERT((uintptr_t)cmsg <= (uintptr_t)control + controllen);
1159 if (tohp->level == SOL_SOCKET && tohp->name == SO_FILEP) {
1160 fdbuf = (struct fdbuf *)_TPI_TOPT_DATA(tohp);
1161 fdbuflen = (int)_TPI_TOPT_DATALEN(tohp);
1163 if (!fdbuf_verify(mp, fdbuf, fdbuflen))
1164 return (EPROTO);
1165 int fdlen;
1167 fdlen = (int)fdbuf_cmsglen(
1168 (int)_TPI_TOPT_DATALEN(tohp));
1170 cmsg->cmsg_level = tohp->level;
1171 cmsg->cmsg_type = SCM_RIGHTS;
1172 cmsg->cmsg_len = (socklen_t)(fdlen +
1173 sizeof (struct cmsghdr));
1175 error = fdbuf_extract(fdbuf,
1176 CMSG_CONTENT(cmsg), fdlen);
1177 if (error != 0)
1178 return (error);
1179 } else if (tohp->level == SOL_SOCKET &&
1180 tohp->name == SCM_TIMESTAMP) {
1181 timestruc_t *timestamp;
1183 cmsg->cmsg_level = tohp->level;
1184 cmsg->cmsg_type = tohp->name;
1186 timestamp =
1187 (timestruc_t *)P2ROUNDUP((intptr_t)&tohp[1],
1188 sizeof (intptr_t));
1190 if (get_udatamodel() == DATAMODEL_NATIVE) {
1191 struct timeval tv;
1193 cmsg->cmsg_len = sizeof (struct timeval) +
1194 sizeof (struct cmsghdr);
1195 tv.tv_sec = timestamp->tv_sec;
1196 tv.tv_usec = timestamp->tv_nsec /
1197 (NANOSEC / MICROSEC);
1199 * on LP64 systems, the struct timeval in
1200 * the destination will not be 8-byte aligned,
1201 * so use bcopy to avoid alignment trouble
1203 bcopy(&tv, CMSG_CONTENT(cmsg), sizeof (tv));
1204 } else {
1205 struct timeval32 *time32;
1207 cmsg->cmsg_len = sizeof (struct timeval32) +
1208 sizeof (struct cmsghdr);
1209 time32 = (struct timeval32 *)CMSG_CONTENT(cmsg);
1210 time32->tv_sec = (time32_t)timestamp->tv_sec;
1211 time32->tv_usec =
1212 (int32_t)(timestamp->tv_nsec /
1213 (NANOSEC / MICROSEC));
1216 } else {
1217 cmsg->cmsg_level = tohp->level;
1218 cmsg->cmsg_type = tohp->name;
1219 cmsg->cmsg_len = (socklen_t)(_TPI_TOPT_DATALEN(tohp) +
1220 sizeof (struct cmsghdr));
1222 /* copy content to control data part */
1223 bcopy(&tohp[1], CMSG_CONTENT(cmsg),
1224 CMSG_CONTENTLEN(cmsg));
1226 /* move to next CMSG structure! */
1227 cmsg = CMSG_NEXT(cmsg);
1229 dprint(1, ("so_opt2cmsg: buf %p len %d; cend %p; final cmsg %p\n",
1230 control, controllen, (void *)cend, (void *)cmsg));
1231 ASSERT(cmsg <= cend);
1232 return (0);
1236 * Extract the SO_SRCADDR option value if present.
1238 void
1239 so_getopt_srcaddr(void *opt, t_uscalar_t optlen, void **srcp,
1240 t_uscalar_t *srclenp)
1242 struct T_opthdr *tohp;
1244 ASSERT(__TPI_TOPT_ISALIGNED(opt));
1246 ASSERT(srcp != NULL && srclenp != NULL);
1247 *srcp = NULL;
1248 *srclenp = 0;
1250 for (tohp = (struct T_opthdr *)opt;
1251 tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen);
1252 tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) {
1253 dprint(1, ("so_getopt_srcaddr: level 0x%x, name %d, len %d\n",
1254 tohp->level, tohp->name, tohp->len));
1255 if (tohp->level == SOL_SOCKET &&
1256 tohp->name == SO_SRCADDR) {
1257 *srcp = _TPI_TOPT_DATA(tohp);
1258 *srclenp = (t_uscalar_t)_TPI_TOPT_DATALEN(tohp);
1264 * Verify if the SO_UNIX_CLOSE option is present.
1267 so_getopt_unix_close(void *opt, t_uscalar_t optlen)
1269 struct T_opthdr *tohp;
1271 ASSERT(__TPI_TOPT_ISALIGNED(opt));
1273 for (tohp = (struct T_opthdr *)opt;
1274 tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen);
1275 tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) {
1276 dprint(1,
1277 ("so_getopt_unix_close: level 0x%x, name %d, len %d\n",
1278 tohp->level, tohp->name, tohp->len));
1279 if (tohp->level == SOL_SOCKET &&
1280 tohp->name == SO_UNIX_CLOSE)
1281 return (1);
1283 return (0);
1287 * Allocate an M_PROTO message.
1289 * If allocation fails the behavior depends on sleepflg:
1290 * _ALLOC_NOSLEEP fail immediately
1291 * _ALLOC_INTR sleep for memory until a signal is caught
1292 * _ALLOC_SLEEP sleep forever. Don't return NULL.
1294 mblk_t *
1295 soallocproto(size_t size, int sleepflg, cred_t *cr)
1297 mblk_t *mp;
1299 /* Round up size for reuse */
1300 size = MAX(size, 64);
1301 if (cr != NULL)
1302 mp = allocb_cred(size, cr, curproc->p_pid);
1303 else
1304 mp = allocb(size, BPRI_MED);
1306 if (mp == NULL) {
1307 int error; /* Dummy - error not returned to caller */
1309 switch (sleepflg) {
1310 case _ALLOC_SLEEP:
1311 if (cr != NULL) {
1312 mp = allocb_cred_wait(size, STR_NOSIG, &error,
1313 cr, curproc->p_pid);
1314 } else {
1315 mp = allocb_wait(size, BPRI_MED, STR_NOSIG,
1316 &error);
1318 ASSERT(mp);
1319 break;
1320 case _ALLOC_INTR:
1321 if (cr != NULL) {
1322 mp = allocb_cred_wait(size, 0, &error, cr,
1323 curproc->p_pid);
1324 } else {
1325 mp = allocb_wait(size, BPRI_MED, 0, &error);
1327 if (mp == NULL) {
1328 /* Caught signal while sleeping for memory */
1329 eprintline(ENOBUFS);
1330 return (NULL);
1332 break;
1333 case _ALLOC_NOSLEEP:
1334 default:
1335 eprintline(ENOBUFS);
1336 return (NULL);
1339 DB_TYPE(mp) = M_PROTO;
1340 return (mp);
1344 * Allocate an M_PROTO message with a single component.
1345 * len is the length of buf. size is the amount to allocate.
1347 * buf can be NULL with a non-zero len.
1348 * This results in a bzero'ed chunk being placed the message.
1350 mblk_t *
1351 soallocproto1(const void *buf, ssize_t len, ssize_t size, int sleepflg,
1352 cred_t *cr)
1354 mblk_t *mp;
1356 if (size == 0)
1357 size = len;
1359 ASSERT(size >= len);
1360 /* Round up size for reuse */
1361 size = MAX(size, 64);
1362 mp = soallocproto(size, sleepflg, cr);
1363 if (mp == NULL)
1364 return (NULL);
1365 mp->b_datap->db_type = M_PROTO;
1366 if (len != 0) {
1367 if (buf != NULL)
1368 bcopy(buf, mp->b_wptr, len);
1369 else
1370 bzero(mp->b_wptr, len);
1371 mp->b_wptr += len;
1373 return (mp);
1377 * Append buf/len to mp.
1378 * The caller has to ensure that there is enough room in the mblk.
1380 * buf can be NULL with a non-zero len.
1381 * This results in a bzero'ed chunk being placed the message.
1383 void
1384 soappendmsg(mblk_t *mp, const void *buf, ssize_t len)
1386 ASSERT(mp);
1388 if (len != 0) {
1389 /* Assert for room left */
1390 ASSERT(mp->b_datap->db_lim - mp->b_wptr >= len);
1391 if (buf != NULL)
1392 bcopy(buf, mp->b_wptr, len);
1393 else
1394 bzero(mp->b_wptr, len);
1396 mp->b_wptr += len;
1400 * Create a message using two kernel buffers.
1401 * If size is set that will determine the allocation size (e.g. for future
1402 * soappendmsg calls). If size is zero it is derived from the buffer
1403 * lengths.
1405 mblk_t *
1406 soallocproto2(const void *buf1, ssize_t len1, const void *buf2, ssize_t len2,
1407 ssize_t size, int sleepflg, cred_t *cr)
1409 mblk_t *mp;
1411 if (size == 0)
1412 size = len1 + len2;
1413 ASSERT(size >= len1 + len2);
1415 mp = soallocproto1(buf1, len1, size, sleepflg, cr);
1416 if (mp)
1417 soappendmsg(mp, buf2, len2);
1418 return (mp);
1422 * Create a message using three kernel buffers.
1423 * If size is set that will determine the allocation size (for future
1424 * soappendmsg calls). If size is zero it is derived from the buffer
1425 * lengths.
1427 mblk_t *
1428 soallocproto3(const void *buf1, ssize_t len1, const void *buf2, ssize_t len2,
1429 const void *buf3, ssize_t len3, ssize_t size, int sleepflg, cred_t *cr)
1431 mblk_t *mp;
1433 if (size == 0)
1434 size = len1 + len2 +len3;
1435 ASSERT(size >= len1 + len2 + len3);
1437 mp = soallocproto1(buf1, len1, size, sleepflg, cr);
1438 if (mp != NULL) {
1439 soappendmsg(mp, buf2, len2);
1440 soappendmsg(mp, buf3, len3);
1442 return (mp);
1445 #ifdef DEBUG
1446 char *
1447 pr_state(uint_t state, uint_t mode)
1449 static char buf[1024];
1451 buf[0] = 0;
1452 if (state & SS_ISCONNECTED)
1453 (void) strcat(buf, "ISCONNECTED ");
1454 if (state & SS_ISCONNECTING)
1455 (void) strcat(buf, "ISCONNECTING ");
1456 if (state & SS_ISDISCONNECTING)
1457 (void) strcat(buf, "ISDISCONNECTING ");
1458 if (state & SS_CANTSENDMORE)
1459 (void) strcat(buf, "CANTSENDMORE ");
1461 if (state & SS_CANTRCVMORE)
1462 (void) strcat(buf, "CANTRCVMORE ");
1463 if (state & SS_ISBOUND)
1464 (void) strcat(buf, "ISBOUND ");
1465 if (state & SS_NDELAY)
1466 (void) strcat(buf, "NDELAY ");
1467 if (state & SS_NONBLOCK)
1468 (void) strcat(buf, "NONBLOCK ");
1470 if (state & SS_ASYNC)
1471 (void) strcat(buf, "ASYNC ");
1472 if (state & SS_ACCEPTCONN)
1473 (void) strcat(buf, "ACCEPTCONN ");
1474 if (state & SS_SAVEDEOR)
1475 (void) strcat(buf, "SAVEDEOR ");
1477 if (state & SS_RCVATMARK)
1478 (void) strcat(buf, "RCVATMARK ");
1479 if (state & SS_OOBPEND)
1480 (void) strcat(buf, "OOBPEND ");
1481 if (state & SS_HAVEOOBDATA)
1482 (void) strcat(buf, "HAVEOOBDATA ");
1483 if (state & SS_HADOOBDATA)
1484 (void) strcat(buf, "HADOOBDATA ");
1486 if (mode & SM_PRIV)
1487 (void) strcat(buf, "PRIV ");
1488 if (mode & SM_ATOMIC)
1489 (void) strcat(buf, "ATOMIC ");
1490 if (mode & SM_ADDR)
1491 (void) strcat(buf, "ADDR ");
1492 if (mode & SM_CONNREQUIRED)
1493 (void) strcat(buf, "CONNREQUIRED ");
1495 if (mode & SM_FDPASSING)
1496 (void) strcat(buf, "FDPASSING ");
1497 if (mode & SM_EXDATA)
1498 (void) strcat(buf, "EXDATA ");
1499 if (mode & SM_OPTDATA)
1500 (void) strcat(buf, "OPTDATA ");
1501 if (mode & SM_BYTESTREAM)
1502 (void) strcat(buf, "BYTESTREAM ");
1503 return (buf);
1506 char *
1507 pr_addr(int family, struct sockaddr *addr, t_uscalar_t addrlen)
1509 static char buf[1024];
1511 if (addr == NULL || addrlen == 0) {
1512 (void) sprintf(buf, "(len %d) %p", addrlen, (void *)addr);
1513 return (buf);
1515 switch (family) {
1516 case AF_INET: {
1517 struct sockaddr_in sin;
1519 bcopy(addr, &sin, sizeof (sin));
1521 (void) sprintf(buf, "(len %d) %x/%d",
1522 addrlen, ntohl(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1523 break;
1525 case AF_INET6: {
1526 struct sockaddr_in6 sin6;
1527 uint16_t *piece = (uint16_t *)&sin6.sin6_addr;
1529 bcopy((char *)addr, (char *)&sin6, sizeof (sin6));
1530 (void) sprintf(buf, "(len %d) %x:%x:%x:%x:%x:%x:%x:%x/%d",
1531 addrlen,
1532 ntohs(piece[0]), ntohs(piece[1]),
1533 ntohs(piece[2]), ntohs(piece[3]),
1534 ntohs(piece[4]), ntohs(piece[5]),
1535 ntohs(piece[6]), ntohs(piece[7]),
1536 ntohs(sin6.sin6_port));
1537 break;
1539 case AF_UNIX: {
1540 struct sockaddr_un *soun = (struct sockaddr_un *)addr;
1542 (void) sprintf(buf, "(len %d) %s", addrlen,
1543 (soun == NULL) ? "(none)" : soun->sun_path);
1544 break;
1546 default:
1547 (void) sprintf(buf, "(unknown af %d)", family);
1548 break;
1550 return (buf);
1553 /* The logical equivalence operator (a if-and-only-if b) */
1554 #define EQUIVALENT(a, b) (((a) && (b)) || (!(a) && (!(b))))
1557 * Verify limitations and invariants on oob state.
1558 * Return 1 if OK, otherwise 0 so that it can be used as
1559 * ASSERT(verify_oobstate(so));
1562 so_verify_oobstate(struct sonode *so)
1564 boolean_t havemark;
1566 ASSERT(MUTEX_HELD(&so->so_lock));
1569 * The possible state combinations are:
1571 * SS_OOBPEND
1572 * SS_OOBPEND|SS_HAVEOOBDATA
1573 * SS_OOBPEND|SS_HADOOBDATA
1574 * SS_HADOOBDATA
1576 switch (so->so_state & (SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA)) {
1577 case 0:
1578 case SS_OOBPEND:
1579 case SS_OOBPEND|SS_HAVEOOBDATA:
1580 case SS_OOBPEND|SS_HADOOBDATA:
1581 case SS_HADOOBDATA:
1582 break;
1583 default:
1584 printf("Bad oob state 1 (%p): state %s\n",
1585 (void *)so, pr_state(so->so_state, so->so_mode));
1586 return (0);
1589 /* SS_RCVATMARK should only be set when SS_OOBPEND is set */
1590 if ((so->so_state & (SS_RCVATMARK|SS_OOBPEND)) == SS_RCVATMARK) {
1591 printf("Bad oob state 2 (%p): state %s\n",
1592 (void *)so, pr_state(so->so_state, so->so_mode));
1593 return (0);
1597 * (havemark != 0 or SS_RCVATMARK) iff SS_OOBPEND
1598 * For TPI, the presence of a "mark" is indicated by sti_oobsigcnt.
1600 havemark = (SOCK_IS_NONSTR(so)) ? so->so_oobmark > 0 :
1601 SOTOTPI(so)->sti_oobsigcnt > 0;
1603 if (!EQUIVALENT(havemark || (so->so_state & SS_RCVATMARK),
1604 so->so_state & SS_OOBPEND)) {
1605 printf("Bad oob state 3 (%p): state %s\n",
1606 (void *)so, pr_state(so->so_state, so->so_mode));
1607 return (0);
1611 * Unless SO_OOBINLINE we have so_oobmsg != NULL iff SS_HAVEOOBDATA
1613 if (!(so->so_options & SO_OOBINLINE) &&
1614 !EQUIVALENT(so->so_oobmsg != NULL, so->so_state & SS_HAVEOOBDATA)) {
1615 printf("Bad oob state 4 (%p): state %s\n",
1616 (void *)so, pr_state(so->so_state, so->so_mode));
1617 return (0);
1620 if (!SOCK_IS_NONSTR(so) &&
1621 SOTOTPI(so)->sti_oobsigcnt < SOTOTPI(so)->sti_oobcnt) {
1622 printf("Bad oob state 5 (%p): counts %d/%d state %s\n",
1623 (void *)so, SOTOTPI(so)->sti_oobsigcnt,
1624 SOTOTPI(so)->sti_oobcnt,
1625 pr_state(so->so_state, so->so_mode));
1626 return (0);
1629 return (1);
1631 #undef EQUIVALENT
1632 #endif /* DEBUG */
1634 /* initialize sockfs zone specific kstat related items */
1635 void *
1636 sock_kstat_init(zoneid_t zoneid)
1638 kstat_t *ksp;
1640 ksp = kstat_create_zone("sockfs", 0, "sock_unix_list", "misc",
1641 KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE|KSTAT_FLAG_VIRTUAL, zoneid);
1643 if (ksp != NULL) {
1644 ksp->ks_update = sockfs_update;
1645 ksp->ks_snapshot = sockfs_snapshot;
1646 ksp->ks_lock = &socklist.sl_lock;
1647 ksp->ks_private = (void *)(uintptr_t)zoneid;
1648 kstat_install(ksp);
1651 return (ksp);
1654 /* tear down sockfs zone specific kstat related items */
1655 /*ARGSUSED*/
1656 void
1657 sock_kstat_fini(zoneid_t zoneid, void *arg)
1659 kstat_t *ksp = (kstat_t *)arg;
1661 if (ksp != NULL) {
1662 ASSERT(zoneid == (zoneid_t)(uintptr_t)ksp->ks_private);
1663 kstat_delete(ksp);
1668 * Zones:
1669 * Note that nactive is going to be different for each zone.
1670 * This means we require kstat to call sockfs_update and then sockfs_snapshot
1671 * for the same zone, or sockfs_snapshot will be taken into the wrong size
1672 * buffer. This is safe, but if the buffer is too small, user will not be
1673 * given details of all sockets. However, as this kstat has a ks_lock, kstat
1674 * driver will keep it locked between the update and the snapshot, so no
1675 * other process (zone) can currently get inbetween resulting in a wrong size
1676 * buffer allocation.
1678 static int
1679 sockfs_update(kstat_t *ksp, int rw)
1681 uint_t nactive = 0; /* # of active AF_UNIX sockets */
1682 struct sonode *so; /* current sonode on socklist */
1683 zoneid_t myzoneid = (zoneid_t)(uintptr_t)ksp->ks_private;
1685 ASSERT((zoneid_t)(uintptr_t)ksp->ks_private == getzoneid());
1687 if (rw == KSTAT_WRITE) { /* bounce all writes */
1688 return (EACCES);
1691 for (so = socklist.sl_list; so != NULL; so = SOTOTPI(so)->sti_next_so) {
1692 if (so->so_count != 0 && so->so_zoneid == myzoneid) {
1693 nactive++;
1696 ksp->ks_ndata = nactive;
1697 ksp->ks_data_size = nactive * sizeof (struct k_sockinfo);
1699 return (0);
1702 static int
1703 sockfs_snapshot(kstat_t *ksp, void *buf, int rw)
1705 int ns; /* # of sonodes we've copied */
1706 struct sonode *so; /* current sonode on socklist */
1707 struct k_sockinfo *pksi; /* where we put sockinfo data */
1708 t_uscalar_t sn_len; /* soa_len */
1709 zoneid_t myzoneid = (zoneid_t)(uintptr_t)ksp->ks_private;
1710 sotpi_info_t *sti;
1712 ASSERT((zoneid_t)(uintptr_t)ksp->ks_private == getzoneid());
1714 ksp->ks_snaptime = gethrtime();
1716 if (rw == KSTAT_WRITE) { /* bounce all writes */
1717 return (EACCES);
1721 * for each sonode on the socklist, we massage the important
1722 * info into buf, in k_sockinfo format.
1724 pksi = (struct k_sockinfo *)buf;
1725 ns = 0;
1726 for (so = socklist.sl_list; so != NULL; so = SOTOTPI(so)->sti_next_so) {
1727 /* only stuff active sonodes and the same zone: */
1728 if (so->so_count == 0 || so->so_zoneid != myzoneid) {
1729 continue;
1733 * If the sonode was activated between the update and the
1734 * snapshot, we're done - as this is only a snapshot.
1736 if ((caddr_t)(pksi) >= (caddr_t)buf + ksp->ks_data_size) {
1737 break;
1740 sti = SOTOTPI(so);
1741 /* copy important info into buf: */
1742 pksi->ks_si.si_size = sizeof (struct k_sockinfo);
1743 pksi->ks_si.si_family = so->so_family;
1744 pksi->ks_si.si_type = so->so_type;
1745 pksi->ks_si.si_flag = so->so_flag;
1746 pksi->ks_si.si_state = so->so_state;
1747 pksi->ks_si.si_serv_type = sti->sti_serv_type;
1748 pksi->ks_si.si_ux_laddr_sou_magic =
1749 sti->sti_ux_laddr.soua_magic;
1750 pksi->ks_si.si_ux_faddr_sou_magic =
1751 sti->sti_ux_faddr.soua_magic;
1752 pksi->ks_si.si_laddr_soa_len = sti->sti_laddr.soa_len;
1753 pksi->ks_si.si_faddr_soa_len = sti->sti_faddr.soa_len;
1754 pksi->ks_si.si_szoneid = so->so_zoneid;
1755 pksi->ks_si.si_faddr_noxlate = sti->sti_faddr_noxlate;
1757 mutex_enter(&so->so_lock);
1759 if (sti->sti_laddr_sa != NULL) {
1760 ASSERT(sti->sti_laddr_sa->sa_data != NULL);
1761 sn_len = sti->sti_laddr_len;
1762 ASSERT(sn_len <= sizeof (short) +
1763 sizeof (pksi->ks_si.si_laddr_sun_path));
1765 pksi->ks_si.si_laddr_family =
1766 sti->sti_laddr_sa->sa_family;
1767 if (sn_len != 0) {
1768 /* AF_UNIX socket names are NULL terminated */
1769 (void) strncpy(pksi->ks_si.si_laddr_sun_path,
1770 sti->sti_laddr_sa->sa_data,
1771 sizeof (pksi->ks_si.si_laddr_sun_path));
1772 sn_len = strlen(pksi->ks_si.si_laddr_sun_path);
1774 pksi->ks_si.si_laddr_sun_path[sn_len] = 0;
1777 if (sti->sti_faddr_sa != NULL) {
1778 ASSERT(sti->sti_faddr_sa->sa_data != NULL);
1779 sn_len = sti->sti_faddr_len;
1780 ASSERT(sn_len <= sizeof (short) +
1781 sizeof (pksi->ks_si.si_faddr_sun_path));
1783 pksi->ks_si.si_faddr_family =
1784 sti->sti_faddr_sa->sa_family;
1785 if (sn_len != 0) {
1786 (void) strncpy(pksi->ks_si.si_faddr_sun_path,
1787 sti->sti_faddr_sa->sa_data,
1788 sizeof (pksi->ks_si.si_faddr_sun_path));
1789 sn_len = strlen(pksi->ks_si.si_faddr_sun_path);
1791 pksi->ks_si.si_faddr_sun_path[sn_len] = 0;
1794 mutex_exit(&so->so_lock);
1796 (void) sprintf(pksi->ks_straddr[0], "%p", (void *)so);
1797 (void) sprintf(pksi->ks_straddr[1], "%p",
1798 (void *)sti->sti_ux_laddr.soua_vp);
1799 (void) sprintf(pksi->ks_straddr[2], "%p",
1800 (void *)sti->sti_ux_faddr.soua_vp);
1802 ns++;
1803 pksi++;
1806 ksp->ks_ndata = ns;
1807 return (0);
1810 ssize_t
1811 soreadfile(file_t *fp, uchar_t *buf, uoff_t fileoff, int *err, size_t size)
1813 struct uio auio;
1814 struct iovec aiov[1];
1815 register vnode_t *vp;
1816 int ioflag, rwflag;
1817 ssize_t cnt;
1818 int error = 0;
1819 int iovcnt = 0;
1820 short fflag;
1822 vp = fp->f_vnode;
1823 fflag = fp->f_flag;
1825 rwflag = 0;
1826 aiov[0].iov_base = (caddr_t)buf;
1827 aiov[0].iov_len = size;
1828 iovcnt = 1;
1829 cnt = (ssize_t)size;
1830 (void) fop_rwlock(vp, rwflag, NULL);
1832 auio.uio_loffset = fileoff;
1833 auio.uio_iov = aiov;
1834 auio.uio_iovcnt = iovcnt;
1835 auio.uio_resid = cnt;
1836 auio.uio_segflg = UIO_SYSSPACE;
1837 auio.uio_llimit = MAXOFFSET_T;
1838 auio.uio_fmode = fflag;
1839 auio.uio_extflg = UIO_COPY_CACHED;
1841 ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
1843 /* If read sync is not asked for, filter sync flags */
1844 if ((ioflag & FRSYNC) == 0)
1845 ioflag &= ~(FSYNC|FDSYNC);
1846 error = fop_read(vp, &auio, ioflag, fp->f_cred, NULL);
1847 cnt -= auio.uio_resid;
1849 fop_rwunlock(vp, rwflag, NULL);
1851 if (error == EINTR && cnt != 0)
1852 error = 0;
1853 out:
1854 if (error != 0) {
1855 *err = error;
1856 return (0);
1857 } else {
1858 *err = 0;
1859 return (cnt);
1864 so_copyin(const void *from, void *to, size_t size, int fromkernel)
1866 if (fromkernel) {
1867 bcopy(from, to, size);
1868 return (0);
1870 return (xcopyin(from, to, size));
1874 so_copyout(const void *from, void *to, size_t size, int tokernel)
1876 if (tokernel) {
1877 bcopy(from, to, size);
1878 return (0);
1880 return (xcopyout(from, to, size));