dmake: do not set MAKEFLAGS=k
[unleashed/tickless.git] / kernel / fs / sockfs / socktpi.c
blobdc737ca86e126821afb0089bd629bc3a9bc4ceed
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
23 * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2015, Joyent, Inc.
25 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
28 #include <sys/types.h>
29 #include <sys/t_lock.h>
30 #include <sys/param.h>
31 #include <sys/systm.h>
32 #include <sys/buf.h>
33 #include <sys/conf.h>
34 #include <sys/cred.h>
35 #include <sys/kmem.h>
36 #include <sys/kmem_impl.h>
37 #include <sys/sysmacros.h>
38 #include <sys/vfs.h>
39 #include <sys/vnode.h>
40 #include <sys/debug.h>
41 #include <sys/errno.h>
42 #include <sys/time.h>
43 #include <sys/file.h>
44 #include <sys/open.h>
45 #include <sys/user.h>
46 #include <sys/termios.h>
47 #include <sys/stream.h>
48 #include <sys/strsubr.h>
49 #include <sys/strsun.h>
50 #include <sys/suntpi.h>
51 #include <sys/ddi.h>
52 #include <sys/esunddi.h>
53 #include <sys/flock.h>
54 #include <sys/modctl.h>
55 #include <sys/vtrace.h>
56 #include <sys/cmn_err.h>
57 #include <sys/pathname.h>
59 #include <sys/socket.h>
60 #include <sys/socketvar.h>
61 #include <sys/sockio.h>
62 #include <netinet/in.h>
63 #include <sys/un.h>
64 #include <sys/strsun.h>
66 #include <sys/tiuser.h>
67 #define _SUN_TPI_VERSION 2
68 #include <sys/tihdr.h>
69 #include <sys/timod.h> /* TI_GETMYNAME, TI_GETPEERNAME */
71 #include <c2/audit.h>
73 #include <inet/common.h>
74 #include <inet/ip.h>
75 #include <inet/ip6.h>
76 #include <inet/tcp.h>
77 #include <inet/udp_impl.h>
79 #include <sys/zone.h>
82 #include "sockcommon.h"
83 #include "socktpi.h"
84 #include "socktpi_impl.h"
/*
 * Possible failures when memory can't be allocated. The documented behavior:
 *
 *              5.5:                    4.X:            XNET:
 * accept:      ENOMEM/ENOSR/EINTR      - (EINTR)       ENOMEM/ENOBUFS/ENOSR/
 *                                                      EINTR
 *      (4.X does not document EINTR but returns it)
 * bind:        ENOSR                   -               ENOBUFS/ENOSR
 * connect:     EINTR                   EINTR           ENOBUFS/ENOSR/EINTR
 * getpeername: ENOMEM/ENOSR            ENOBUFS (-)     ENOBUFS/ENOSR
 * getsockname: ENOMEM/ENOSR            ENOBUFS (-)     ENOBUFS/ENOSR
 *      (4.X getpeername and getsockname do not fail in practice)
 * getsockopt:  ENOMEM/ENOSR            -               ENOBUFS/ENOSR
 * listen:      -                       -               ENOBUFS
 * recv:        ENOMEM/ENOSR/EINTR      EINTR           ENOBUFS/ENOMEM/ENOSR/
 *                                                      EINTR
 * send:        ENOMEM/ENOSR/EINTR      ENOBUFS/EINTR   ENOBUFS/ENOMEM/ENOSR/
 *                                                      EINTR
 * setsockopt:  ENOMEM/ENOSR            -               ENOBUFS/ENOMEM/ENOSR
 * shutdown:    ENOMEM/ENOSR            -               ENOBUFS/ENOSR
 * socket:      ENOMEM/ENOSR            ENOBUFS         ENOBUFS/ENOMEM/ENOSR
 * socketpair:  ENOMEM/ENOSR            -               ENOBUFS/ENOMEM/ENOSR
 *
 * Resolution. When allocation fails:
 *      recv: return EINTR
 *      send: return EINTR
 *      connect, accept: EINTR
 *      bind, listen, shutdown (unbind, unix_close, disconnect): sleep
 *      socket, socketpair: ENOBUFS
 *      getpeername, getsockname: sleep
 *      getsockopt, setsockopt: sleep
 */
#ifdef SOCK_TEST
/*
 * Variables that make sockfs do something other than the standard TPI
 * for the AF_INET transports.
 *
 * solisten_tpi_tcp:
 *	TCP can handle a O_T_BIND_REQ with an increased backlog even though
 *	the transport is already bound. This is needed to avoid losing the
 *	port number should listen() do a T_UNBIND_REQ followed by a
 *	O_T_BIND_REQ.
 *
 * soconnect_tpi_udp:
 *	UDP and ICMP can handle a T_CONN_REQ.
 *	This is needed to make the sequence of connect(), getsockname()
 *	return the local IP address used to send packets to the connected to
 *	destination.
 *
 * soconnect_tpi_tcp:
 *	TCP can handle a T_CONN_REQ without seeing a O_T_BIND_REQ.
 *	Set this to non-zero to send TPI conformant messages to TCP in this
 *	respect. This is a performance optimization.
 *
 * soaccept_tpi_tcp:
 *	TCP can handle a T_CONN_REQ without the acceptor being bound.
 *	This is a performance optimization that has been picked up in XTI.
 *
 * soaccept_tpi_multioptions:
 *	When inheriting SOL_SOCKET options from the listener to the accepting
 *	socket send them as a single message for AF_INET{,6}.
 */
int solisten_tpi_tcp = 0;
int soconnect_tpi_udp = 0;
int soconnect_tpi_tcp = 0;
int soaccept_tpi_tcp = 0;
int soaccept_tpi_multioptions = 1;
#else /* SOCK_TEST */
/*
 * Without SOCK_TEST the knobs are compile-time constants, so code that
 * tests them can be folded away by the compiler.
 */
#define	soconnect_tpi_tcp	0
#define	soconnect_tpi_udp	0
#define	solisten_tpi_tcp	0
#define	soaccept_tpi_tcp	0
#define	soaccept_tpi_multioptions	1
#endif /* SOCK_TEST */
#ifdef SOCK_TEST
/* Test-only hooks; both are defined outside this file. */
extern int do_useracc;
extern clock_t sock_test_timelimit;
#endif /* SOCK_TEST */

extern uint32_t ucredsize;

/*
 * Some X/Open added checks might have to be backed out to keep SunOS 4.X
 * applications working. Turn on this flag to disable these checks.
 */
int xnet_skip_checks = 0;
int xnet_check_print = 0;
int xnet_truncate_print = 0;
177 static void sotpi_destroy(struct sonode *);
178 static struct sonode *sotpi_create(struct sockparams *, int, int, int, int,
179 int *, cred_t *cr);
181 static boolean_t sotpi_info_create(struct sonode *, int);
182 static void sotpi_info_init(struct sonode *);
183 static void sotpi_info_fini(struct sonode *);
184 static void sotpi_info_destroy(struct sonode *);
187 * Do direct function call to the transport layer below; this would
188 * also allow the transport to utilize read-side synchronous stream
189 * interface if necessary. This is a /etc/system tunable that must
190 * not be modified on a running system. By default this is enabled
191 * for performance reasons and may be disabled for debugging purposes.
193 boolean_t socktpi_direct = B_TRUE;
195 static struct kmem_cache *socktpi_cache, *socktpi_unix_cache;
197 extern void sigintr(k_sigset_t *, int);
198 extern void sigunintr(k_sigset_t *);
200 static int sotpi_unbind(struct sonode *, int);
202 /* TPI sockfs sonode operations */
203 int sotpi_init(struct sonode *, struct sonode *, struct cred *,
204 int);
205 static int sotpi_accept(struct sonode *, int, struct cred *,
206 struct sonode **);
207 static int sotpi_bind(struct sonode *, struct sockaddr *, socklen_t,
208 int, struct cred *);
209 static int sotpi_listen(struct sonode *, int, struct cred *);
210 static int sotpi_connect(struct sonode *, struct sockaddr *,
211 socklen_t, int, int, struct cred *);
212 extern int sotpi_recvmsg(struct sonode *, struct msghdr *,
213 struct uio *, struct cred *);
214 static int sotpi_sendmsg(struct sonode *, struct msghdr *,
215 struct uio *, struct cred *);
216 static int sotpi_sendmblk(struct sonode *, struct msghdr *, int,
217 struct cred *, mblk_t **);
218 static int sosend_dgramcmsg(struct sonode *, struct sockaddr *, socklen_t,
219 struct uio *, void *, t_uscalar_t, int);
220 static int sodgram_direct(struct sonode *, struct sockaddr *,
221 socklen_t, struct uio *, int);
222 extern int sotpi_getpeername(struct sonode *, struct sockaddr *,
223 socklen_t *, boolean_t, struct cred *);
224 static int sotpi_getsockname(struct sonode *, struct sockaddr *,
225 socklen_t *, struct cred *);
226 static int sotpi_shutdown(struct sonode *, int, struct cred *);
227 extern int sotpi_getsockopt(struct sonode *, int, int, void *,
228 socklen_t *, int, struct cred *);
229 extern int sotpi_setsockopt(struct sonode *, int, int, const void *,
230 socklen_t, struct cred *);
231 static int sotpi_ioctl(struct sonode *, int, intptr_t, int, struct cred *,
232 int32_t *);
233 static int socktpi_plumbioctl(struct vnode *, int, intptr_t, int,
234 struct cred *, int32_t *);
235 static int sotpi_poll(struct sonode *, short, int, short *,
236 struct pollhead **);
237 static int sotpi_close(struct sonode *, int, struct cred *);
239 static int i_sotpi_info_constructor(sotpi_info_t *);
240 static void i_sotpi_info_destructor(sotpi_info_t *);
242 sonodeops_t sotpi_sonodeops = {
243 sotpi_init, /* sop_init */
244 sotpi_accept, /* sop_accept */
245 sotpi_bind, /* sop_bind */
246 sotpi_listen, /* sop_listen */
247 sotpi_connect, /* sop_connect */
248 sotpi_recvmsg, /* sop_recvmsg */
249 sotpi_sendmsg, /* sop_sendmsg */
250 sotpi_sendmblk, /* sop_sendmblk */
251 sotpi_getpeername, /* sop_getpeername */
252 sotpi_getsockname, /* sop_getsockname */
253 sotpi_shutdown, /* sop_shutdown */
254 sotpi_getsockopt, /* sop_getsockopt */
255 sotpi_setsockopt, /* sop_setsockopt */
256 sotpi_ioctl, /* sop_ioctl */
257 sotpi_poll, /* sop_poll */
258 sotpi_close, /* sop_close */
262 * Return a TPI socket vnode.
264 * Note that sockets assume that the driver will clone (either itself
265 * or by using the clone driver) i.e. a socket() call will always
266 * result in a new vnode being created.
270 * Common create code for socket and accept. If tso is set the values
271 * from that node is used instead of issuing a T_INFO_REQ.
274 /* ARGSUSED */
275 static struct sonode *
276 sotpi_create(struct sockparams *sp, int family, int type, int protocol,
277 int sflags, int *errorp, cred_t *cr)
279 struct sonode *so;
280 kmem_cache_t *cp;
281 int sfamily = family;
283 ASSERT(sp->sp_sdev_info.sd_vnode != NULL);
286 * to be compatible with old tpi socket implementation ignore
287 * sleep flag (sflags) passed in
289 cp = (family == AF_UNIX) ? socktpi_unix_cache : socktpi_cache;
290 so = kmem_cache_alloc(cp, KM_SLEEP);
291 if (so == NULL) {
292 *errorp = ENOMEM;
293 return (NULL);
296 sonode_init(so, sp, family, type, protocol, &sotpi_sonodeops);
297 sotpi_info_init(so);
299 so->so_is_stream = false;
300 *errorp = 0;
302 return (so);
305 static void
306 sotpi_destroy(struct sonode *so)
308 kmem_cache_t *cp;
309 struct sockparams *origsp;
312 * If there is a new dealloc function (ie. smod_destroy_func),
313 * then it should check the correctness of the ops.
316 ASSERT(so->so_ops == &sotpi_sonodeops);
318 origsp = SOTOTPI(so)->sti_orig_sp;
320 sotpi_info_fini(so);
322 if (so->so_state & SS_FALLBACK_COMP) {
324 * A fallback happend, which means that a sotpi_info_t struct
325 * was allocated (as opposed to being allocated from the TPI
326 * sonode cache. Therefore we explicitly free the struct
327 * here.
329 sotpi_info_destroy(so);
330 ASSERT(origsp != NULL);
332 origsp->sp_smod_info->smod_sock_destroy_func(so);
333 SOCKPARAMS_DEC_REF(origsp);
334 } else {
335 sonode_fini(so);
336 cp = (so->so_family == AF_UNIX) ? socktpi_unix_cache :
337 socktpi_cache;
338 kmem_cache_free(cp, so);
342 /* ARGSUSED1 */
344 sotpi_init(struct sonode *so, struct sonode *tso, struct cred *cr, int flags)
346 major_t maj;
347 dev_t newdev;
348 struct vnode *vp;
349 int error = 0;
350 struct stdata *stp;
352 sotpi_info_t *sti = SOTOTPI(so);
354 dprint(1, ("sotpi_init()\n"));
357 * over write the sleep flag passed in but that is ok
358 * as tpi socket does not honor sleep flag.
360 flags |= FREAD|FWRITE;
363 * Record in so_flag that it is a clone.
365 if (getmajor(sti->sti_dev) == clone_major)
366 so->so_flag |= SOCLONE;
368 if ((so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM) &&
369 (so->so_family == AF_INET || so->so_family == AF_INET6) &&
370 (so->so_protocol == IPPROTO_TCP || so->so_protocol == IPPROTO_UDP ||
371 so->so_protocol == IPPROTO_IP)) {
372 /* Tell tcp or udp that it's talking to sockets */
373 flags |= SO_SOCKSTR;
376 * Here we indicate to socktpi_open() our attempt to
377 * make direct calls between sockfs and transport.
378 * The final decision is left to socktpi_open().
380 sti->sti_direct = 1;
382 ASSERT(so->so_type != SOCK_DGRAM || tso == NULL);
383 if (so->so_type == SOCK_STREAM && tso != NULL) {
384 if (SOTOTPI(tso)->sti_direct) {
386 * Inherit sti_direct from listener and pass
387 * SO_ACCEPTOR open flag to tcp, indicating
388 * that this is an accept fast-path instance.
390 flags |= SO_ACCEPTOR;
391 } else {
393 * sti_direct is not set on listener, meaning
394 * that the listener has been converted from
395 * a socket to a stream. Ensure that the
396 * acceptor inherits these settings.
398 sti->sti_direct = 0;
399 flags &= ~SO_SOCKSTR;
405 * Tell local transport that it is talking to sockets.
407 if (so->so_family == AF_UNIX) {
408 flags |= SO_SOCKSTR;
411 vp = SOTOV(so);
412 newdev = vp->v_rdev;
413 maj = getmajor(newdev);
414 ASSERT(STREAMSTAB(maj));
416 error = stropen(vp, &newdev, flags, cr);
418 stp = vp->v_stream;
419 if (error == 0) {
420 if (so->so_flag & SOCLONE)
421 ASSERT(newdev != vp->v_rdev);
422 mutex_enter(&so->so_lock);
423 sti->sti_dev = newdev;
424 vp->v_rdev = newdev;
425 mutex_exit(&so->so_lock);
427 if (stp->sd_flag & STRISTTY) {
429 * this is a post SVR4 tty driver - a socket can not
430 * be a controlling terminal. Fail the open.
432 (void) sotpi_close(so, flags, cr);
433 return (ENOTTY); /* XXX */
436 ASSERT(stp->sd_wrq != NULL);
437 sti->sti_provinfo = tpi_findprov(stp->sd_wrq);
440 * If caller is interested in doing direct function call
441 * interface to/from transport module, probe the module
442 * directly beneath the streamhead to see if it qualifies.
444 * We turn off the direct interface when qualifications fail.
445 * In the acceptor case, we simply turn off the sti_direct
446 * flag on the socket. We do the fallback after the accept
447 * has completed, before the new socket is returned to the
448 * application.
450 if (sti->sti_direct) {
451 queue_t *tq = stp->sd_wrq->q_next;
454 * sti_direct is currently supported and tested
455 * only for tcp/udp; this is the main reason to
456 * have the following assertions.
458 ASSERT(so->so_family == AF_INET ||
459 so->so_family == AF_INET6);
460 ASSERT(so->so_protocol == IPPROTO_UDP ||
461 so->so_protocol == IPPROTO_TCP ||
462 so->so_protocol == IPPROTO_IP);
463 ASSERT(so->so_type == SOCK_DGRAM ||
464 so->so_type == SOCK_STREAM);
467 * Abort direct call interface if the module directly
468 * underneath the stream head is not defined with the
469 * _D_DIRECT flag. This could happen in the tcp or
470 * udp case, when some other module is autopushed
471 * above it, or for some reasons the expected module
472 * isn't purely D_MP (which is the main requirement).
474 if (!socktpi_direct || !(tq->q_flag & _QDIRECT) ||
475 !(_OTHERQ(tq)->q_flag & _QDIRECT)) {
476 int rval;
478 /* Continue on without direct calls */
479 sti->sti_direct = 0;
482 * Cannot issue ioctl on fallback socket since
483 * there is no conn associated with the queue.
484 * The fallback downcall will notify the proto
485 * of the change.
487 if (!(flags & SO_ACCEPTOR) &&
488 !(flags & SO_FALLBACK)) {
489 if ((error = strioctl(vp,
490 _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
491 cr, &rval)) != 0) {
492 (void) sotpi_close(so, flags,
493 cr);
494 return (error);
500 if (flags & SO_FALLBACK) {
502 * The stream created does not have a conn.
503 * do stream set up after conn has been assigned
505 return (error);
507 if (error = so_strinit(so, tso)) {
508 (void) sotpi_close(so, flags, cr);
509 return (error);
512 /* Enable sendfile() on AF_UNIX streams */
513 if (so->so_family == AF_UNIX && so->so_type == SOCK_STREAM) {
514 mutex_enter(&so->so_lock);
515 so->so_mode |= SM_SENDFILESUPP;
516 mutex_exit(&so->so_lock);
519 /* Wildcard */
520 if (so->so_protocol != so->so_sockparams->sp_protocol) {
521 int protocol = so->so_protocol;
523 * Issue SO_PROTOTYPE setsockopt.
525 error = sotpi_setsockopt(so, SOL_SOCKET, SO_PROTOTYPE,
526 &protocol, (t_uscalar_t)sizeof (protocol), cr);
527 if (error != 0) {
528 (void) sotpi_close(so, flags, cr);
530 * Setsockopt often fails with ENOPROTOOPT but
531 * socket() should fail with
532 * EPROTONOSUPPORT/EPROTOTYPE.
534 return (EPROTONOSUPPORT);
538 } else {
540 * While the same socket can not be reopened (unlike specfs)
541 * the stream head sets STREOPENFAIL when the autopush fails.
543 if ((stp != NULL) &&
544 (stp->sd_flag & STREOPENFAIL)) {
546 * Open failed part way through.
548 mutex_enter(&stp->sd_lock);
549 stp->sd_flag &= ~STREOPENFAIL;
550 mutex_exit(&stp->sd_lock);
551 (void) sotpi_close(so, flags, cr);
552 return (error);
553 /*NOTREACHED*/
555 ASSERT(stp == NULL);
557 TRACE_4(TR_FAC_SOCKFS, TR_SOCKFS_OPEN,
558 "sockfs open:maj %d vp %p so %p error %d",
559 maj, vp, so, error);
560 return (error);
564 * Bind the socket to an unspecified address in sockfs only.
565 * Used for TCP/UDP transports where we know that the O_T_BIND_REQ isn't
566 * required in all cases.
568 static void
569 so_automatic_bind(struct sonode *so)
571 sotpi_info_t *sti = SOTOTPI(so);
572 ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
574 ASSERT(MUTEX_HELD(&so->so_lock));
575 ASSERT(!(so->so_state & SS_ISBOUND));
576 ASSERT(sti->sti_unbind_mp);
578 ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
579 bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
580 sti->sti_laddr_sa->sa_family = so->so_family;
581 so->so_state |= SS_ISBOUND;
586 * bind the socket.
588 * A null "name" can be used to unbind the socket if:
589 * - it is a SOCK_DGRAM, or
590 * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
591 * and no listen() has been done.
593 /* ARGSUSED */
594 static int
595 sotpi_bindlisten(struct sonode *so, struct sockaddr *name,
596 socklen_t namelen, int backlog, int flags, struct cred *cr)
598 struct T_bind_req bind_req;
599 struct T_bind_ack *bind_ack;
600 int error = 0;
601 mblk_t *mp;
602 void *addr;
603 t_uscalar_t addrlen;
604 int unbind_on_err = 1;
605 boolean_t clear_acceptconn_on_err = B_FALSE;
606 boolean_t restore_backlog_on_err = B_FALSE;
607 int save_so_backlog;
608 t_scalar_t PRIM_type = O_T_BIND_REQ;
609 boolean_t tcp_udp_xport;
610 sotpi_info_t *sti = SOTOTPI(so);
612 dprintso(so, 1, ("sotpi_bindlisten(%p, %p, %d, %d, 0x%x) %s\n",
613 (void *)so, (void *)name, namelen, backlog, flags,
614 pr_state(so->so_state, so->so_mode)));
616 tcp_udp_xport = so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM;
618 if (!(flags & _SOBIND_LOCK_HELD)) {
619 mutex_enter(&so->so_lock);
620 so_lock_single(so); /* Set SOLOCKED */
621 } else {
622 ASSERT(MUTEX_HELD(&so->so_lock));
623 ASSERT(so->so_flag & SOLOCKED);
	/*
	 * Make sure that there is a preallocated unbind_req message
	 * before binding. This message is allocated when the socket is
	 * created but it might have been consumed.
	 */
631 if (sti->sti_unbind_mp == NULL) {
632 dprintso(so, 1, ("sobind: allocating unbind_req\n"));
633 /* NOTE: holding so_lock while sleeping */
634 sti->sti_unbind_mp =
635 soallocproto(sizeof (struct T_unbind_req), _ALLOC_SLEEP,
636 cr);
639 if (flags & _SOBIND_REBIND) {
641 * Called from solisten after doing an sotpi_unbind() or
642 * potentially without the unbind (latter for AF_INET{,6}).
644 ASSERT(name == NULL && namelen == 0);
646 if (so->so_family == AF_UNIX) {
647 ASSERT(sti->sti_ux_bound_vp);
648 addr = &sti->sti_ux_laddr;
649 addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
650 dprintso(so, 1, ("sobind rebind UNIX: addrlen %d, "
651 "addr 0x%p, vp %p\n",
652 addrlen,
653 (void *)((struct so_ux_addr *)addr)->soua_vp,
654 (void *)sti->sti_ux_bound_vp));
655 } else {
656 addr = sti->sti_laddr_sa;
657 addrlen = (t_uscalar_t)sti->sti_laddr_len;
659 } else if (flags & _SOBIND_UNSPEC) {
660 ASSERT(name == NULL && namelen == 0);
663 * The caller checked SS_ISBOUND but not necessarily
664 * under so_lock
666 if (so->so_state & SS_ISBOUND) {
667 /* No error */
668 goto done;
671 /* Set an initial local address */
672 switch (so->so_family) {
673 case AF_UNIX:
675 * Use an address with same size as struct sockaddr
676 * just like BSD.
678 sti->sti_laddr_len =
679 (socklen_t)sizeof (struct sockaddr);
680 ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
681 bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
682 sti->sti_laddr_sa->sa_family = so->so_family;
685 * Pass down an address with the implicit bind
686 * magic number and the rest all zeros.
687 * The transport will return a unique address.
689 sti->sti_ux_laddr.soua_vp = NULL;
690 sti->sti_ux_laddr.soua_magic = SOU_MAGIC_IMPLICIT;
691 addr = &sti->sti_ux_laddr;
692 addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
693 break;
695 case AF_INET:
696 case AF_INET6:
698 * An unspecified bind in TPI has a NULL address.
699 * Set the address in sockfs to have the sa_family.
701 sti->sti_laddr_len = (so->so_family == AF_INET) ?
702 (socklen_t)sizeof (sin_t) :
703 (socklen_t)sizeof (sin6_t);
704 ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
705 bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
706 sti->sti_laddr_sa->sa_family = so->so_family;
707 addr = NULL;
708 addrlen = 0;
709 break;
711 default:
713 * An unspecified bind in TPI has a NULL address.
714 * Set the address in sockfs to be zero length.
716 * Can not assume there is a sa_family for all
717 * protocol families. For example, AF_X25 does not
718 * have a family field.
720 bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
721 sti->sti_laddr_len = 0; /* XXX correct? */
722 addr = NULL;
723 addrlen = 0;
724 break;
727 } else {
728 if (so->so_state & SS_ISBOUND) {
729 error = EINVAL;
730 unbind_on_err = 0;
731 eprintsoline(so, error);
732 goto done;
735 /* X/Open requires this check */
736 if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
737 if (xnet_check_print) {
738 printf("sockfs: X/Open bind state check "
739 "caused EINVAL\n");
741 error = EINVAL;
742 goto done;
745 switch (so->so_family) {
746 case AF_UNIX:
748 * All AF_UNIX addresses are nul terminated
749 * when copied (copyin_name) in so the minimum
750 * length is 3 bytes.
752 if (name == NULL ||
753 (ssize_t)namelen <= sizeof (short) + 1) {
754 error = EISDIR;
755 eprintsoline(so, error);
756 goto done;
759 * Verify so_family matches the bound family.
760 * BSD does not check this for AF_UNIX resulting
761 * in funny mknods.
763 if (name->sa_family != so->so_family) {
764 error = EAFNOSUPPORT;
765 goto done;
767 break;
768 case AF_INET:
769 if (name == NULL) {
770 error = EINVAL;
771 eprintsoline(so, error);
772 goto done;
774 if ((size_t)namelen != sizeof (sin_t)) {
775 error = name->sa_family != so->so_family ?
776 EAFNOSUPPORT : EINVAL;
777 eprintsoline(so, error);
778 goto done;
780 if ((name->sa_family != so->so_family)) {
781 error = EAFNOSUPPORT;
782 eprintsoline(so, error);
783 goto done;
786 * Force a zero sa_family to match so_family.
788 * Some programs like inetd(1M) don't set the
789 * family field. Other programs leave
790 * sin_family set to garbage - SunOS 4.X does
791 * not check the family field on a bind.
792 * We use the family field that
793 * was passed in to the socket() call.
795 name->sa_family = so->so_family;
796 break;
798 case AF_INET6: {
799 #ifdef DEBUG
800 sin6_t *sin6 = (sin6_t *)name;
801 #endif /* DEBUG */
803 if (name == NULL) {
804 error = EINVAL;
805 eprintsoline(so, error);
806 goto done;
808 if ((size_t)namelen != sizeof (sin6_t)) {
809 error = name->sa_family != so->so_family ?
810 EAFNOSUPPORT : EINVAL;
811 eprintsoline(so, error);
812 goto done;
814 if (name->sa_family != so->so_family) {
816 * With IPv6 we require the family to match
817 * unlike in IPv4.
819 error = EAFNOSUPPORT;
820 eprintsoline(so, error);
821 goto done;
823 #ifdef DEBUG
825 * Verify that apps don't forget to clear
826 * sin6_scope_id etc
828 if (sin6->sin6_scope_id != 0 &&
829 !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
830 zcmn_err(getzoneid(), CE_WARN,
831 "bind with uninitialized sin6_scope_id "
832 "(%d) on socket. Pid = %d\n",
833 (int)sin6->sin6_scope_id,
834 (int)curproc->p_pid);
836 if (sin6->__sin6_src_id != 0) {
837 zcmn_err(getzoneid(), CE_WARN,
838 "bind with uninitialized __sin6_src_id "
839 "(%d) on socket. Pid = %d\n",
840 (int)sin6->__sin6_src_id,
841 (int)curproc->p_pid);
843 #endif /* DEBUG */
844 break;
846 default:
848 * Don't do any length or sa_family check to allow
849 * non-sockaddr style addresses.
851 if (name == NULL) {
852 error = EINVAL;
853 eprintsoline(so, error);
854 goto done;
856 break;
859 if (namelen > (t_uscalar_t)sti->sti_laddr_maxlen) {
860 error = ENAMETOOLONG;
861 eprintsoline(so, error);
862 goto done;
865 * Save local address.
867 sti->sti_laddr_len = (socklen_t)namelen;
868 ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
869 bcopy(name, sti->sti_laddr_sa, namelen);
871 addr = sti->sti_laddr_sa;
872 addrlen = (t_uscalar_t)sti->sti_laddr_len;
873 switch (so->so_family) {
874 case AF_INET6:
875 case AF_INET:
876 break;
877 case AF_UNIX: {
878 struct sockaddr_un *soun =
879 (struct sockaddr_un *)sti->sti_laddr_sa;
880 struct vnode *vp, *rvp;
881 struct vattr vattr;
883 ASSERT(sti->sti_ux_bound_vp == NULL);
885 * Create vnode for the specified path name.
886 * Keep vnode held with a reference in sti_ux_bound_vp.
887 * Use the vnode pointer as the address used in the
888 * bind with the transport.
890 * Use the same mode as in BSD. In particular this does
891 * not observe the umask.
893 /* MAXPATHLEN + soun_family + nul termination */
894 if (sti->sti_laddr_len >
895 (socklen_t)(MAXPATHLEN + sizeof (short) + 1)) {
896 error = ENAMETOOLONG;
897 eprintsoline(so, error);
898 goto done;
900 vattr.va_type = VSOCK;
901 vattr.va_mode = 0777 & ~PTOU(curproc)->u_cmask;
902 vattr.va_mask = AT_TYPE|AT_MODE;
903 /* NOTE: holding so_lock */
904 error = vn_create(soun->sun_path, UIO_SYSSPACE, &vattr,
905 EXCL, 0, &vp, CRMKNOD, 0, 0);
906 if (error) {
907 if (error == EEXIST)
908 error = EADDRINUSE;
909 eprintsoline(so, error);
910 goto done;
913 * Establish pointer from the underlying filesystem
914 * vnode to the socket node.
915 * sti_ux_bound_vp and v_stream->sd_vnode form the
916 * cross-linkage between the underlying filesystem
917 * node and the socket node.
920 if ((fop_realvp(vp, &rvp, NULL) == 0) && (vp != rvp)) {
921 VN_HOLD(rvp);
922 VN_RELE(vp);
923 vp = rvp;
926 ASSERT(SOTOV(so)->v_stream);
927 mutex_enter(&vp->v_lock);
928 vp->v_stream = SOTOV(so)->v_stream;
929 sti->sti_ux_bound_vp = vp;
930 mutex_exit(&vp->v_lock);
933 * Use the vnode pointer value as a unique address
934 * (together with the magic number to avoid conflicts
935 * with implicit binds) in the transport provider.
937 sti->sti_ux_laddr.soua_vp =
938 (void *)sti->sti_ux_bound_vp;
939 sti->sti_ux_laddr.soua_magic = SOU_MAGIC_EXPLICIT;
940 addr = &sti->sti_ux_laddr;
941 addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
942 dprintso(so, 1, ("sobind UNIX: addrlen %d, addr %p\n",
943 addrlen,
944 (void *)((struct so_ux_addr *)addr)->soua_vp));
945 break;
947 } /* end switch (so->so_family) */
951 * set SS_ACCEPTCONN before sending down O_T_BIND_REQ since
952 * the transport can start passing up T_CONN_IND messages
953 * as soon as it receives the bind req and strsock_proto()
954 * insists that SS_ACCEPTCONN is set when processing T_CONN_INDs.
956 if (flags & _SOBIND_LISTEN) {
957 if ((so->so_state & SS_ACCEPTCONN) == 0)
958 clear_acceptconn_on_err = B_TRUE;
959 save_so_backlog = so->so_backlog;
960 restore_backlog_on_err = B_TRUE;
961 so->so_state |= SS_ACCEPTCONN;
962 so->so_backlog = backlog;
966 * We send a T_BIND_REQ for TCP/UDP since we know it supports it,
967 * for other transports we will send in a O_T_BIND_REQ.
969 if (tcp_udp_xport &&
970 (so->so_family == AF_INET || so->so_family == AF_INET6))
971 PRIM_type = T_BIND_REQ;
973 bind_req.PRIM_type = PRIM_type;
974 bind_req.ADDR_length = addrlen;
975 bind_req.ADDR_offset = (t_scalar_t)sizeof (bind_req);
976 bind_req.CONIND_number = backlog;
977 /* NOTE: holding so_lock while sleeping */
978 mp = soallocproto2(&bind_req, sizeof (bind_req),
979 addr, addrlen, 0, _ALLOC_SLEEP, cr);
980 sti->sti_laddr_valid = 0;
982 /* Done using sti_laddr_sa - can drop the lock */
983 mutex_exit(&so->so_lock);
985 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
986 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
987 if (error) {
988 eprintsoline(so, error);
989 mutex_enter(&so->so_lock);
990 goto done;
993 mutex_enter(&so->so_lock);
994 error = sowaitprim(so, PRIM_type, T_BIND_ACK,
995 (t_uscalar_t)sizeof (*bind_ack), &mp, 0);
996 if (error) {
997 eprintsoline(so, error);
998 goto done;
1000 ASSERT(mp);
1002 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1003 * strsock_proto while the lock was dropped above, the bind
1004 * is allowed to complete.
1007 /* Mark as bound. This will be undone if we detect errors below. */
1008 if (flags & _SOBIND_NOXLATE) {
1009 ASSERT(so->so_family == AF_UNIX);
1010 sti->sti_faddr_noxlate = 1;
1012 ASSERT(!(so->so_state & SS_ISBOUND) || (flags & _SOBIND_REBIND));
1013 so->so_state |= SS_ISBOUND;
1014 ASSERT(sti->sti_unbind_mp);
1016 /* note that we've already set SS_ACCEPTCONN above */
	/*
	 * Recompute addrlen - an unspecified bind sent down an
	 * address of length zero but we expect the appropriate length
	 * in return.
	 */
1023 addrlen = (t_uscalar_t)(so->so_family == AF_UNIX ?
1024 sizeof (sti->sti_ux_laddr) : sti->sti_laddr_len);
1026 bind_ack = (struct T_bind_ack *)mp->b_rptr;
1028 * The alignment restriction is really too strict but
1029 * we want enough alignment to inspect the fields of
1030 * a sockaddr_in.
1032 addr = sogetoff(mp, bind_ack->ADDR_offset,
1033 bind_ack->ADDR_length,
1034 __TPI_ALIGN_SIZE);
1035 if (addr == NULL) {
1036 freemsg(mp);
1037 error = EPROTO;
1038 eprintsoline(so, error);
1039 goto done;
1041 if (!(flags & _SOBIND_UNSPEC)) {
1043 * Verify that the transport didn't return something we
1044 * did not want e.g. an address other than what we asked for.
1046 * NOTE: These checks would go away if/when we switch to
1047 * using the new TPI (in which the transport would fail
1048 * the request instead of assigning a different address).
1050 * NOTE2: For protocols that we don't know (i.e. any
1051 * other than AF_INET6, AF_INET and AF_UNIX), we
1052 * cannot know if the transport should be expected to
1053 * return the same address as that requested.
1055 * NOTE3: For AF_INET and AF_INET6, TCP/UDP, we send
1056 * down a T_BIND_REQ. We use O_T_BIND_REQ for others.
1058 * For example, in the case of netatalk it may be
1059 * inappropriate for the transport to return the
1060 * requested address (as it may have allocated a local
1061 * port number in behaviour similar to that of an
1062 * AF_INET bind request with a port number of zero).
1064 * Given the definition of O_T_BIND_REQ, where the
1065 * transport may bind to an address other than the
1066 * requested address, it's not possible to determine
1067 * whether a returned address that differs from the
1068 * requested address is a reason to fail (because the
1069 * requested address was not available) or succeed
1070 * (because the transport allocated an appropriate
1071 * address and/or port).
1073 * sockfs currently requires that the transport return
1074 * the requested address in the T_BIND_ACK, unless
1075 * there is code here to allow for any discrepancy.
1076 * Such code exists for AF_INET and AF_INET6.
1078 * Netatalk chooses to return the requested address
1079 * rather than the (correct) allocated address. This
1080 * means that netatalk violates the TPI specification
1081 * (and would not function correctly if used from a
1082 * TLI application), but it does mean that it works
1083 * with sockfs.
1085 * As noted above, using the newer XTI bind primitive
1086 * (T_BIND_REQ) in preference to O_T_BIND_REQ would
1087 * allow sockfs to be more sure about whether or not
1088 * the bind request had succeeded (as transports are
1089 * not permitted to bind to a different address than
1090 * that requested - they must return failure).
1091 * Unfortunately, support for T_BIND_REQ may not be
1092 * present in all transport implementations (netatalk,
1093 * for example, doesn't have it), making the
1094 * transition difficult.
1096 if (bind_ack->ADDR_length != addrlen) {
1097 /* Assumes that the requested address was in use */
1098 freemsg(mp);
1099 error = EADDRINUSE;
1100 eprintsoline(so, error);
1101 goto done;
1104 switch (so->so_family) {
1105 case AF_INET6:
1106 case AF_INET: {
1107 sin_t *rname, *aname;
1109 rname = (sin_t *)addr;
1110 aname = (sin_t *)sti->sti_laddr_sa;
1113 * Take advantage of the alignment
1114 * of sin_port and sin6_port which fall
1115 * in the same place in their data structures.
1116 * Just use sin_port for either address family.
1118 * This may become a problem if (heaven forbid)
1119 * there's a separate ipv6port_reserved... :-P
1121 * Binding to port 0 has the semantics of letting
1122 * the transport bind to any port.
1124 * If the transport is TCP or UDP since we had sent
1125 * a T_BIND_REQ we would not get a port other than
1126 * what we asked for.
1128 if (tcp_udp_xport) {
1130 * Pick up the new port number if we bound to
1131 * port 0.
1133 if (aname->sin_port == 0)
1134 aname->sin_port = rname->sin_port;
1135 sti->sti_laddr_valid = 1;
1136 break;
1138 if (aname->sin_port != 0 &&
1139 aname->sin_port != rname->sin_port) {
1140 freemsg(mp);
1141 error = EADDRINUSE;
1142 eprintsoline(so, error);
1143 goto done;
1146 * Pick up the new port number if we bound to port 0.
1148 aname->sin_port = rname->sin_port;
1151 * Unfortunately, addresses aren't _quite_ the same.
1153 if (so->so_family == AF_INET) {
1154 if (aname->sin_addr.s_addr !=
1155 rname->sin_addr.s_addr) {
1156 freemsg(mp);
1157 error = EADDRNOTAVAIL;
1158 eprintsoline(so, error);
1159 goto done;
1161 } else {
1162 sin6_t *rname6 = (sin6_t *)rname;
1163 sin6_t *aname6 = (sin6_t *)aname;
1165 if (!IN6_ARE_ADDR_EQUAL(&aname6->sin6_addr,
1166 &rname6->sin6_addr)) {
1167 freemsg(mp);
1168 error = EADDRNOTAVAIL;
1169 eprintsoline(so, error);
1170 goto done;
1173 break;
1175 case AF_UNIX:
1176 if (bcmp(addr, &sti->sti_ux_laddr, addrlen) != 0) {
1177 freemsg(mp);
1178 error = EADDRINUSE;
1179 eprintsoline(so, error);
1180 eprintso(so,
1181 ("addrlen %d, addr 0x%x, vp %p\n",
1182 addrlen, *((int *)addr),
1183 (void *)sti->sti_ux_bound_vp));
1184 goto done;
1186 sti->sti_laddr_valid = 1;
1187 break;
1188 default:
1190 * NOTE: This assumes that addresses can be
1191 * byte-compared for equivalence.
1193 if (bcmp(addr, sti->sti_laddr_sa, addrlen) != 0) {
1194 freemsg(mp);
1195 error = EADDRINUSE;
1196 eprintsoline(so, error);
1197 goto done;
1200 * Don't mark sti_laddr_valid, as we cannot be
1201 * sure that the returned address is the real
1202 * bound address when talking to an unknown
1203 * transport.
1205 break;
1207 } else {
1209 * Save for returned address for getsockname.
1210 * Needed for unspecific bind unless transport supports
1211 * the TI_GETMYNAME ioctl.
1212 * Do this for AF_INET{,6} even though they do, as
1213 * caching info here is much better performance than
1214 * a TPI/STREAMS trip to the transport for getsockname.
1215 * Any which can't for some reason _must_ _not_ set
1216 * sti_laddr_valid here for the caching version of
1217 * getsockname to not break;
1219 switch (so->so_family) {
1220 case AF_UNIX:
1222 * Record the address bound with the transport
1223 * for use by socketpair.
1225 bcopy(addr, &sti->sti_ux_laddr, addrlen);
1226 sti->sti_laddr_valid = 1;
1227 break;
1228 case AF_INET:
1229 case AF_INET6:
1230 ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
1231 bcopy(addr, sti->sti_laddr_sa, sti->sti_laddr_len);
1232 sti->sti_laddr_valid = 1;
1233 break;
1234 default:
1236 * Don't mark sti_laddr_valid, as we cannot be
1237 * sure that the returned address is the real
1238 * bound address when talking to an unknown
1239 * transport.
1241 break;
1245 freemsg(mp);
1247 done:
1248 if (error) {
1249 /* reset state & backlog to values held on entry */
1250 if (clear_acceptconn_on_err == B_TRUE)
1251 so->so_state &= ~SS_ACCEPTCONN;
1252 if (restore_backlog_on_err == B_TRUE)
1253 so->so_backlog = save_so_backlog;
1255 if (unbind_on_err && so->so_state & SS_ISBOUND) {
1256 int err;
1258 err = sotpi_unbind(so, 0);
1259 /* LINTED - statement has no consequent: if */
1260 if (err) {
1261 eprintsoline(so, error);
1262 } else {
1263 ASSERT(!(so->so_state & SS_ISBOUND));
1267 if (!(flags & _SOBIND_LOCK_HELD)) {
1268 so_unlock_single(so, SOLOCKED);
1269 mutex_exit(&so->so_lock);
1270 } else {
1271 ASSERT(MUTEX_HELD(&so->so_lock));
1272 ASSERT(so->so_flag & SOLOCKED);
1274 return (error);
1277 /* bind the socket */
1278 static int
1279 sotpi_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
1280 int flags, struct cred *cr)
1282 if ((flags & _SOBIND_SOCKETPAIR) == 0)
1283 return (sotpi_bindlisten(so, name, namelen, 0, flags, cr));
1285 flags &= ~_SOBIND_SOCKETPAIR;
1286 return (sotpi_bindlisten(so, name, namelen, 1, flags, cr));
1290 * Unbind a socket - used when bind() fails, when bind() specifies a NULL
1291 * address, or when listen needs to unbind and bind.
1292 * If the _SOUNBIND_REBIND flag is specified the addresses are retained
1293 * so that a sobind can pick them up.
1295 static int
1296 sotpi_unbind(struct sonode *so, int flags)
1298 struct T_unbind_req unbind_req;
1299 int error = 0;
1300 mblk_t *mp;
1301 sotpi_info_t *sti = SOTOTPI(so);
1303 dprintso(so, 1, ("sotpi_unbind(%p, 0x%x) %s\n",
1304 (void *)so, flags, pr_state(so->so_state, so->so_mode)));
1306 ASSERT(MUTEX_HELD(&so->so_lock));
1307 ASSERT(so->so_flag & SOLOCKED);
1309 if (!(so->so_state & SS_ISBOUND)) {
1310 error = EINVAL;
1311 eprintsoline(so, error);
1312 goto done;
1315 mutex_exit(&so->so_lock);
1318 * Flush the read and write side (except stream head read queue)
1319 * and send down T_UNBIND_REQ.
1321 (void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHRW);
1323 unbind_req.PRIM_type = T_UNBIND_REQ;
1324 mp = soallocproto1(&unbind_req, sizeof (unbind_req),
1325 0, _ALLOC_SLEEP, CRED());
1326 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1327 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1328 mutex_enter(&so->so_lock);
1329 if (error) {
1330 eprintsoline(so, error);
1331 goto done;
1334 error = sowaitokack(so, T_UNBIND_REQ);
1335 if (error) {
1336 eprintsoline(so, error);
1337 goto done;
1341 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1342 * strsock_proto while the lock was dropped above, the unbind
1343 * is allowed to complete.
1345 if (!(flags & _SOUNBIND_REBIND)) {
1347 * Clear out bound address.
1349 vnode_t *vp;
1351 if ((vp = sti->sti_ux_bound_vp) != NULL) {
1352 sti->sti_ux_bound_vp = NULL;
1353 vn_rele_stream(vp);
1355 /* Clear out address */
1356 sti->sti_laddr_len = 0;
1358 so->so_state &= ~(SS_ISBOUND|SS_ACCEPTCONN);
1359 sti->sti_laddr_valid = 0;
1361 done:
1363 /* If the caller held the lock don't release it here */
1364 ASSERT(MUTEX_HELD(&so->so_lock));
1365 ASSERT(so->so_flag & SOLOCKED);
1367 return (error);
1371 * listen on the socket.
1372 * For TPI conforming transports this has to first unbind with the transport
1373 * and then bind again using the new backlog.
1375 /* ARGSUSED */
1377 sotpi_listen(struct sonode *so, int backlog, struct cred *cr)
1379 int error = 0;
1380 sotpi_info_t *sti = SOTOTPI(so);
1382 dprintso(so, 1, ("sotpi_listen(%p, %d) %s\n",
1383 (void *)so, backlog, pr_state(so->so_state, so->so_mode)));
1385 if (sti->sti_serv_type == T_CLTS)
1386 return (EOPNOTSUPP);
1389 * If the socket is ready to accept connections already, then
1390 * return without doing anything. This avoids a problem where
1391 * a second listen() call fails if a connection is pending and
1392 * leaves the socket unbound. Only when we are not unbinding
1393 * with the transport can we safely increase the backlog.
1395 if (so->so_state & SS_ACCEPTCONN &&
1396 !((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1397 /*CONSTCOND*/
1398 !solisten_tpi_tcp))
1399 return (0);
1401 if (so->so_state & SS_ISCONNECTED)
1402 return (EINVAL);
1404 mutex_enter(&so->so_lock);
1405 so_lock_single(so); /* Set SOLOCKED */
1408 * If the listen doesn't change the backlog we do nothing.
1409 * This avoids an EPROTO error from the transport.
1411 if ((so->so_state & SS_ACCEPTCONN) &&
1412 so->so_backlog == backlog)
1413 goto done;
1415 if (!(so->so_state & SS_ISBOUND)) {
1417 * Must have been explicitly bound in the UNIX domain.
1419 if (so->so_family == AF_UNIX) {
1420 error = EINVAL;
1421 goto done;
1423 error = sotpi_bindlisten(so, NULL, 0, backlog,
1424 _SOBIND_UNSPEC|_SOBIND_LOCK_HELD|_SOBIND_LISTEN, cr);
1425 } else if (backlog > 0) {
1427 * AF_INET{,6} hack to avoid losing the port.
1428 * Assumes that all AF_INET{,6} transports can handle a
1429 * O_T_BIND_REQ with a non-zero CONIND_number when the TPI
1430 * has already bound thus it is possible to avoid the unbind.
1432 if (!((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1433 /*CONSTCOND*/
1434 !solisten_tpi_tcp)) {
1435 error = sotpi_unbind(so, _SOUNBIND_REBIND);
1436 if (error)
1437 goto done;
1439 error = sotpi_bindlisten(so, NULL, 0, backlog,
1440 _SOBIND_REBIND|_SOBIND_LOCK_HELD|_SOBIND_LISTEN, cr);
1441 } else {
1442 so->so_state |= SS_ACCEPTCONN;
1443 so->so_backlog = backlog;
1445 if (error)
1446 goto done;
1447 ASSERT(so->so_state & SS_ACCEPTCONN);
1448 done:
1449 so_unlock_single(so, SOLOCKED);
1450 mutex_exit(&so->so_lock);
1451 return (error);
1455 * Disconnect either a specified seqno or all (-1).
1456 * The former is used on listening sockets only.
1458 * When seqno == -1 sodisconnect could call sotpi_unbind. However,
1459 * the current use of sodisconnect(seqno == -1) is only for shutdown
1460 * so there is no point (and potentially incorrect) to unbind.
1462 static int
1463 sodisconnect(struct sonode *so, t_scalar_t seqno, int flags)
1465 struct T_discon_req discon_req;
1466 int error = 0;
1467 mblk_t *mp;
1469 dprintso(so, 1, ("sodisconnect(%p, %d, 0x%x) %s\n",
1470 (void *)so, seqno, flags, pr_state(so->so_state, so->so_mode)));
1472 if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1473 mutex_enter(&so->so_lock);
1474 so_lock_single(so); /* Set SOLOCKED */
1475 } else {
1476 ASSERT(MUTEX_HELD(&so->so_lock));
1477 ASSERT(so->so_flag & SOLOCKED);
1480 if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ACCEPTCONN))) {
1481 error = EINVAL;
1482 eprintsoline(so, error);
1483 goto done;
1486 mutex_exit(&so->so_lock);
1488 * Flush the write side (unless this is a listener)
1489 * and then send down a T_DISCON_REQ.
1490 * (Don't flush on listener since it could flush {O_}T_CONN_RES
1491 * and other messages.)
1493 if (!(so->so_state & SS_ACCEPTCONN))
1494 (void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHW);
1496 discon_req.PRIM_type = T_DISCON_REQ;
1497 discon_req.SEQ_number = seqno;
1498 mp = soallocproto1(&discon_req, sizeof (discon_req),
1499 0, _ALLOC_SLEEP, CRED());
1500 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1501 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1502 mutex_enter(&so->so_lock);
1503 if (error) {
1504 eprintsoline(so, error);
1505 goto done;
1508 error = sowaitokack(so, T_DISCON_REQ);
1509 if (error) {
1510 eprintsoline(so, error);
1511 goto done;
1514 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1515 * strsock_proto while the lock was dropped above, the disconnect
1516 * is allowed to complete. However, it is not possible to
1517 * assert that SS_ISCONNECTED|SS_ISCONNECTING are set.
1519 so->so_state &= ~(SS_ISCONNECTED|SS_ISCONNECTING);
1520 SOTOTPI(so)->sti_laddr_valid = 0;
1521 SOTOTPI(so)->sti_faddr_valid = 0;
1522 done:
1523 if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1524 so_unlock_single(so, SOLOCKED);
1525 mutex_exit(&so->so_lock);
1526 } else {
1527 /* If the caller held the lock don't release it here */
1528 ASSERT(MUTEX_HELD(&so->so_lock));
1529 ASSERT(so->so_flag & SOLOCKED);
1531 return (error);
1534 /* ARGSUSED */
1536 sotpi_accept(struct sonode *so, int fflag, struct cred *cr,
1537 struct sonode **nsop)
1539 struct T_conn_ind *conn_ind;
1540 struct T_conn_res *conn_res;
1541 int error = 0;
1542 mblk_t *mp, *ack_mp;
1543 struct sonode *nso;
1544 vnode_t *nvp;
1545 void *src;
1546 t_uscalar_t srclen;
1547 void *opt;
1548 t_uscalar_t optlen;
1549 t_scalar_t PRIM_type;
1550 t_scalar_t SEQ_number;
1551 size_t sinlen;
1552 sotpi_info_t *sti = SOTOTPI(so);
1553 sotpi_info_t *nsti;
1555 dprintso(so, 1, ("sotpi_accept(%p, 0x%x, %p) %s\n",
1556 (void *)so, fflag, (void *)nsop,
1557 pr_state(so->so_state, so->so_mode)));
1560 * Defer single-threading the accepting socket until
1561 * the T_CONN_IND has been received and parsed and the
1562 * new sonode has been opened.
1565 /* Check that we are not already connected */
1566 if ((so->so_state & SS_ACCEPTCONN) == 0)
1567 goto conn_bad;
1568 again:
1569 if ((error = sowaitconnind(so, fflag, &mp)) != 0)
1570 goto e_bad;
1572 ASSERT(mp != NULL);
1573 conn_ind = (struct T_conn_ind *)mp->b_rptr;
1576 * Save SEQ_number for error paths.
1578 SEQ_number = conn_ind->SEQ_number;
1580 srclen = conn_ind->SRC_length;
1581 src = sogetoff(mp, conn_ind->SRC_offset, srclen, 1);
1582 if (src == NULL) {
1583 error = EPROTO;
1584 freemsg(mp);
1585 eprintsoline(so, error);
1586 goto disconnect_unlocked;
1588 optlen = conn_ind->OPT_length;
1589 switch (so->so_family) {
1590 case AF_INET:
1591 case AF_INET6:
1592 if ((optlen == sizeof (intptr_t)) && (sti->sti_direct != 0)) {
1593 bcopy(mp->b_rptr + conn_ind->OPT_offset,
1594 &opt, conn_ind->OPT_length);
1595 } else {
1597 * The transport (in this case TCP) hasn't sent up
1598 * a pointer to an instance for the accept fast-path.
1599 * Disable fast-path completely because the call to
1600 * sotpi_create() below would otherwise create an
1601 * incomplete TCP instance, which would lead to
1602 * problems when sockfs sends a normal T_CONN_RES
1603 * message down the new stream.
1605 if (sti->sti_direct) {
1606 int rval;
1608 * For consistency we inform tcp to disable
1609 * direct interface on the listener, though
1610 * we can certainly live without doing this
1611 * because no data will ever travel upstream
1612 * on the listening socket.
1614 sti->sti_direct = 0;
1615 (void) strioctl(SOTOV(so), _SIOCSOCKFALLBACK,
1616 0, 0, K_TO_K, cr, &rval);
1618 opt = NULL;
1619 optlen = 0;
1621 break;
1622 case AF_UNIX:
1623 default:
1624 if (optlen != 0) {
1625 opt = sogetoff(mp, conn_ind->OPT_offset, optlen,
1626 __TPI_ALIGN_SIZE);
1627 if (opt == NULL) {
1628 error = EPROTO;
1629 freemsg(mp);
1630 eprintsoline(so, error);
1631 goto disconnect_unlocked;
1634 if (so->so_family == AF_UNIX) {
1635 if (!sti->sti_faddr_noxlate) {
1636 src = NULL;
1637 srclen = 0;
1639 /* Extract src address from options */
1640 if (optlen != 0)
1641 so_getopt_srcaddr(opt, optlen, &src, &srclen);
1643 break;
1647 * Create the new socket.
1649 nso = socket_newconn(so, NULL, NULL, SOCKET_SLEEP, &error);
1650 if (nso == NULL) {
1651 ASSERT(error != 0);
1653 * Accept can not fail with ENOBUFS. sotpi_create
1654 * sleeps waiting for memory until a signal is caught
1655 * so return EINTR.
1657 freemsg(mp);
1658 if (error == ENOBUFS)
1659 error = EINTR;
1660 goto e_disc_unl;
1662 nvp = SOTOV(nso);
1663 nsti = SOTOTPI(nso);
1665 #ifdef DEBUG
1667 * SO_DEBUG is used to trigger the dprint* and eprint* macros thus
1668 * it's inherited early to allow debugging of the accept code itself.
1670 nso->so_options |= so->so_options & SO_DEBUG;
1671 #endif /* DEBUG */
1674 * Save the SRC address from the T_CONN_IND
1675 * for getpeername to work on AF_UNIX and on transports that do not
1676 * support TI_GETPEERNAME.
1678 * NOTE: AF_UNIX NUL termination is ensured by the sender's
1679 * copyin_name().
1681 if (srclen > (t_uscalar_t)nsti->sti_faddr_maxlen) {
1682 error = EINVAL;
1683 freemsg(mp);
1684 eprintsoline(so, error);
1685 goto disconnect_vp_unlocked;
1687 nsti->sti_faddr_len = (socklen_t)srclen;
1688 ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
1689 bcopy(src, nsti->sti_faddr_sa, srclen);
1690 nsti->sti_faddr_valid = 1;
1693 * Record so_peercred and so_cpid from a cred in the T_CONN_IND.
1695 if ((DB_REF(mp) > 1) || MBLKSIZE(mp) <
1696 (sizeof (struct T_conn_res) + sizeof (intptr_t))) {
1697 cred_t *cr;
1698 pid_t cpid;
1700 cr = msg_getcred(mp, &cpid);
1701 if (cr != NULL) {
1702 crhold(cr);
1703 nso->so_peercred = cr;
1704 nso->so_cpid = cpid;
1706 freemsg(mp);
1708 mp = soallocproto1(NULL, sizeof (struct T_conn_res) +
1709 sizeof (intptr_t), 0, _ALLOC_INTR, cr);
1710 if (mp == NULL) {
1712 * Accept can not fail with ENOBUFS.
1713 * A signal was caught so return EINTR.
1715 error = EINTR;
1716 eprintsoline(so, error);
1717 goto disconnect_vp_unlocked;
1719 conn_res = (struct T_conn_res *)mp->b_rptr;
1720 } else {
1722 * For efficency reasons we use msg_extractcred; no crhold
1723 * needed since db_credp is cleared (i.e., we move the cred
1724 * from the message to so_peercred.
1726 nso->so_peercred = msg_extractcred(mp, &nso->so_cpid);
1728 mp->b_rptr = DB_BASE(mp);
1729 conn_res = (struct T_conn_res *)mp->b_rptr;
1730 mp->b_wptr = mp->b_rptr + sizeof (struct T_conn_res);
1732 mblk_setcred(mp, cr, curproc->p_pid);
1736 * New socket must be bound at least in sockfs and, except for AF_INET,
1737 * (or AF_INET6) it also has to be bound in the transport provider.
1738 * We set the local address in the sonode from the T_OK_ACK of the
1739 * T_CONN_RES. For this reason the address we bind to here isn't
1740 * important.
1742 if ((nso->so_family == AF_INET || nso->so_family == AF_INET6) &&
1743 /*CONSTCOND*/
1744 nso->so_type == SOCK_STREAM && !soaccept_tpi_tcp) {
1746 * Optimization for AF_INET{,6} transports
1747 * that can handle a T_CONN_RES without being bound.
1749 mutex_enter(&nso->so_lock);
1750 so_automatic_bind(nso);
1751 mutex_exit(&nso->so_lock);
1752 } else {
1753 /* Perform NULL bind with the transport provider. */
1754 if ((error = sotpi_bind(nso, NULL, 0, _SOBIND_UNSPEC,
1755 cr)) != 0) {
1756 ASSERT(error != ENOBUFS);
1757 freemsg(mp);
1758 eprintsoline(nso, error);
1759 goto disconnect_vp_unlocked;
1764 * Inherit SIOCSPGRP, SS_ASYNC before we send the {O_}T_CONN_RES
1765 * so that any data arriving on the new socket will cause the
1766 * appropriate signals to be delivered for the new socket.
1768 * No other thread (except strsock_proto and strsock_misc)
1769 * can access the new socket thus we relax the locking.
1771 nso->so_pgrp = so->so_pgrp;
1772 nso->so_state |= so->so_state & SS_ASYNC;
1773 nsti->sti_faddr_noxlate = sti->sti_faddr_noxlate;
1775 if (nso->so_pgrp != 0) {
1776 if ((error = so_set_events(nso, nvp, cr)) != 0) {
1777 eprintsoline(nso, error);
1778 error = 0;
1779 nso->so_pgrp = 0;
1784 * Make note of the socket level options. TCP and IP level options
1785 * are already inherited. We could do all this after accept is
1786 * successful but doing it here simplifies code and no harm done
1787 * for error case.
1789 nso->so_options = so->so_options & (SO_DEBUG|SO_REUSEADDR|SO_KEEPALIVE|
1790 SO_DONTROUTE|SO_BROADCAST|SO_USELOOPBACK|
1791 SO_OOBINLINE|SO_DGRAM_ERRIND|SO_LINGER);
1792 nso->so_sndbuf = so->so_sndbuf;
1793 nso->so_rcvbuf = so->so_rcvbuf;
1794 if (nso->so_options & SO_LINGER)
1795 nso->so_linger = so->so_linger;
1798 * Note that the following sti_direct code path should be
1799 * removed once we are confident that the direct sockets
1800 * do not result in any degradation.
1802 if (sti->sti_direct) {
1804 ASSERT(opt != NULL);
1806 conn_res->OPT_length = optlen;
1807 conn_res->OPT_offset = MBLKL(mp);
1808 bcopy(&opt, mp->b_wptr, optlen);
1809 mp->b_wptr += optlen;
1810 conn_res->PRIM_type = T_CONN_RES;
1811 conn_res->ACCEPTOR_id = 0;
1812 PRIM_type = T_CONN_RES;
1814 /* Send down the T_CONN_RES on acceptor STREAM */
1815 error = kstrputmsg(SOTOV(nso), mp, NULL,
1816 0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1817 if (error) {
1818 mutex_enter(&so->so_lock);
1819 so_lock_single(so);
1820 eprintsoline(so, error);
1821 goto disconnect_vp;
1823 mutex_enter(&nso->so_lock);
1824 error = sowaitprim(nso, T_CONN_RES, T_OK_ACK,
1825 (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
1826 if (error) {
1827 mutex_exit(&nso->so_lock);
1828 mutex_enter(&so->so_lock);
1829 so_lock_single(so);
1830 eprintsoline(so, error);
1831 goto disconnect_vp;
1833 if (nso->so_family == AF_INET) {
1834 sin_t *sin;
1836 sin = (sin_t *)(ack_mp->b_rptr +
1837 sizeof (struct T_ok_ack));
1838 bcopy(sin, nsti->sti_laddr_sa, sizeof (sin_t));
1839 nsti->sti_laddr_len = sizeof (sin_t);
1840 } else {
1841 sin6_t *sin6;
1843 sin6 = (sin6_t *)(ack_mp->b_rptr +
1844 sizeof (struct T_ok_ack));
1845 bcopy(sin6, nsti->sti_laddr_sa, sizeof (sin6_t));
1846 nsti->sti_laddr_len = sizeof (sin6_t);
1848 freemsg(ack_mp);
1850 nso->so_state |= SS_ISCONNECTED;
1851 nso->so_proto_handle = (sock_lower_handle_t)opt;
1852 nsti->sti_laddr_valid = 1;
1854 mutex_exit(&nso->so_lock);
1857 * It's possible, through the use of autopush for example,
1858 * that the acceptor stream may not support sti_direct
1859 * semantics. If the new socket does not support sti_direct
1860 * we issue a _SIOCSOCKFALLBACK to inform the transport
1861 * as we would in the I_PUSH case.
1863 if (nsti->sti_direct == 0) {
1864 int rval;
1866 if ((error = strioctl(SOTOV(nso), _SIOCSOCKFALLBACK,
1867 0, 0, K_TO_K, cr, &rval)) != 0) {
1868 mutex_enter(&so->so_lock);
1869 so_lock_single(so);
1870 eprintsoline(so, error);
1871 goto disconnect_vp;
1876 * Pass out new socket.
1878 if (nsop != NULL)
1879 *nsop = nso;
1881 return (0);
1885 * This is the non-performance case for sockets (e.g. AF_UNIX sockets)
1886 * which don't support the FireEngine accept fast-path. It is also
1887 * used when the virtual "sockmod" has been I_POP'd and I_PUSH'd
1888 * again. Neither sockfs nor TCP attempt to find out if some other
1889 * random module has been inserted in between (in which case we
1890 * should follow TLI accept behaviour). We blindly assume the worst
1891 * case and revert back to old behaviour i.e. TCP will not send us
1892 * any option (eager) and the accept should happen on the listener
1893 * queue. Any queued T_conn_ind have already got their options removed
1894 * by so_sock2_stream() when "sockmod" was I_POP'd.
1897 * Fill in the {O_}T_CONN_RES before getting SOLOCKED.
1899 if ((nso->so_mode & SM_ACCEPTOR_ID) == 0) {
1900 #ifdef _ILP32
1901 queue_t *q;
1904 * Find read queue in driver
1905 * Can safely do this since we "own" nso/nvp.
1907 q = strvp2wq(nvp)->q_next;
1908 while (SAMESTR(q))
1909 q = q->q_next;
1910 q = RD(q);
1911 conn_res->ACCEPTOR_id = (t_uscalar_t)q;
1912 #else
1913 conn_res->ACCEPTOR_id = (t_uscalar_t)getminor(nvp->v_rdev);
1914 #endif /* _ILP32 */
1915 conn_res->PRIM_type = O_T_CONN_RES;
1916 PRIM_type = O_T_CONN_RES;
1917 } else {
1918 conn_res->ACCEPTOR_id = nsti->sti_acceptor_id;
1919 conn_res->PRIM_type = T_CONN_RES;
1920 PRIM_type = T_CONN_RES;
1922 conn_res->SEQ_number = SEQ_number;
1923 conn_res->OPT_length = 0;
1924 conn_res->OPT_offset = 0;
1926 mutex_enter(&so->so_lock);
1927 so_lock_single(so); /* Set SOLOCKED */
1928 mutex_exit(&so->so_lock);
1930 error = kstrputmsg(SOTOV(so), mp, NULL,
1931 0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1932 mutex_enter(&so->so_lock);
1933 if (error) {
1934 eprintsoline(so, error);
1935 goto disconnect_vp;
1937 error = sowaitprim(so, PRIM_type, T_OK_ACK,
1938 (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
1939 if (error) {
1940 eprintsoline(so, error);
1941 goto disconnect_vp;
1943 mutex_exit(&so->so_lock);
1945 * If there is a sin/sin6 appended onto the T_OK_ACK use
1946 * that to set the local address. If this is not present
1947 * then we zero out the address and don't set the
1948 * sti_laddr_valid bit. For AF_UNIX endpoints we copy over
1949 * the pathname from the listening socket.
1950 * In the case where this is TCP or an AF_UNIX socket the
1951 * client side may have queued data or a T_ORDREL in the
1952 * transport. Having now sent the T_CONN_RES we may receive
1953 * those queued messages at any time. Hold the acceptor
1954 * so_lock until its state and laddr are finalized.
1956 mutex_enter(&nso->so_lock);
1957 sinlen = (nso->so_family == AF_INET) ? sizeof (sin_t) : sizeof (sin6_t);
1958 if ((nso->so_family == AF_INET) || (nso->so_family == AF_INET6) &&
1959 MBLKL(ack_mp) == (sizeof (struct T_ok_ack) + sinlen)) {
1960 ack_mp->b_rptr += sizeof (struct T_ok_ack);
1961 bcopy(ack_mp->b_rptr, nsti->sti_laddr_sa, sinlen);
1962 nsti->sti_laddr_len = sinlen;
1963 nsti->sti_laddr_valid = 1;
1964 } else if (nso->so_family == AF_UNIX) {
1965 ASSERT(so->so_family == AF_UNIX);
1966 nsti->sti_laddr_len = sti->sti_laddr_len;
1967 ASSERT(nsti->sti_laddr_len <= nsti->sti_laddr_maxlen);
1968 bcopy(sti->sti_laddr_sa, nsti->sti_laddr_sa,
1969 nsti->sti_laddr_len);
1970 nsti->sti_laddr_valid = 1;
1971 } else {
1972 nsti->sti_laddr_len = sti->sti_laddr_len;
1973 ASSERT(nsti->sti_laddr_len <= nsti->sti_laddr_maxlen);
1974 bzero(nsti->sti_laddr_sa, nsti->sti_addr_size);
1975 nsti->sti_laddr_sa->sa_family = nso->so_family;
1977 nso->so_state |= SS_ISCONNECTED;
1978 mutex_exit(&nso->so_lock);
1980 freemsg(ack_mp);
1982 mutex_enter(&so->so_lock);
1983 so_unlock_single(so, SOLOCKED);
1984 mutex_exit(&so->so_lock);
1987 * Pass out new socket.
1989 if (nsop != NULL)
1990 *nsop = nso;
1992 return (0);
1995 eproto_disc_unl:
1996 error = EPROTO;
1997 e_disc_unl:
1998 eprintsoline(so, error);
1999 goto disconnect_unlocked;
2001 pr_disc_vp_unl:
2002 eprintsoline(so, error);
2003 disconnect_vp_unlocked:
2004 (void) fop_close(nvp, 0, 1, 0, cr, NULL);
2005 VN_RELE(nvp);
2006 disconnect_unlocked:
2007 (void) sodisconnect(so, SEQ_number, 0);
2008 return (error);
2010 pr_disc_vp:
2011 eprintsoline(so, error);
2012 disconnect_vp:
2013 (void) sodisconnect(so, SEQ_number, _SODISCONNECT_LOCK_HELD);
2014 so_unlock_single(so, SOLOCKED);
2015 mutex_exit(&so->so_lock);
2016 (void) fop_close(nvp, 0, 1, 0, cr, NULL);
2017 VN_RELE(nvp);
2018 return (error);
2020 conn_bad: /* Note: SunOS 4/BSD unconditionally returns EINVAL here */
2021 error = (so->so_type == SOCK_DGRAM || so->so_type == SOCK_RAW)
2022 ? EOPNOTSUPP : EINVAL;
2023 e_bad:
2024 eprintsoline(so, error);
2025 return (error);
2029 * connect a socket.
2031 * Allow SOCK_DGRAM sockets to reconnect (by specifying a new address) and to
2032 * unconnect (by specifying a null address).
2035 sotpi_connect(struct sonode *so,
2036 struct sockaddr *name,
2037 socklen_t namelen,
2038 int fflag,
2039 int flags,
2040 struct cred *cr)
2042 struct T_conn_req conn_req;
2043 int error = 0;
2044 mblk_t *mp;
2045 void *src;
2046 socklen_t srclen;
2047 void *addr;
2048 socklen_t addrlen;
2049 boolean_t need_unlock;
2050 sotpi_info_t *sti = SOTOTPI(so);
2052 dprintso(so, 1, ("sotpi_connect(%p, %p, %d, 0x%x, 0x%x) %s\n",
2053 (void *)so, (void *)name, namelen, fflag, flags,
2054 pr_state(so->so_state, so->so_mode)));
2057 * Preallocate the T_CONN_REQ mblk before grabbing SOLOCKED to
2058 * avoid sleeping for memory with SOLOCKED held.
2059 * We know that the T_CONN_REQ can't be larger than 2 * sti_faddr_maxlen
2060 * + sizeof (struct T_opthdr).
2061 * (the AF_UNIX so_ux_addr_xlate() does not make the address
2062 * exceed sti_faddr_maxlen).
2064 mp = soallocproto(sizeof (struct T_conn_req) +
2065 2 * sti->sti_faddr_maxlen + sizeof (struct T_opthdr), _ALLOC_INTR,
2066 cr);
2067 if (mp == NULL) {
2069 * Connect can not fail with ENOBUFS. A signal was
2070 * caught so return EINTR.
2072 error = EINTR;
2073 eprintsoline(so, error);
2074 return (error);
2077 mutex_enter(&so->so_lock);
2079 * Make sure there is a preallocated T_unbind_req message
2080 * before any binding. This message is allocated when the
2081 * socket is created. Since another thread can consume
2082 * so_unbind_mp by the time we return from so_lock_single(),
2083 * we should check the availability of so_unbind_mp after
2084 * we return from so_lock_single().
2087 so_lock_single(so); /* Set SOLOCKED */
2088 need_unlock = B_TRUE;
2090 if (sti->sti_unbind_mp == NULL) {
2091 dprintso(so, 1, ("sotpi_connect: allocating unbind_req\n"));
2092 /* NOTE: holding so_lock while sleeping */
2093 sti->sti_unbind_mp =
2094 soallocproto(sizeof (struct T_unbind_req), _ALLOC_INTR, cr);
2095 if (sti->sti_unbind_mp == NULL) {
2096 error = EINTR;
2097 goto done;
2102 * Can't have done a listen before connecting.
2104 if (so->so_state & SS_ACCEPTCONN) {
2105 error = EOPNOTSUPP;
2106 goto done;
2110 * Must be bound with the transport
2112 if (!(so->so_state & SS_ISBOUND)) {
2113 if ((so->so_family == AF_INET || so->so_family == AF_INET6) &&
2114 /*CONSTCOND*/
2115 so->so_type == SOCK_STREAM && !soconnect_tpi_tcp) {
2117 * Optimization for AF_INET{,6} transports
2118 * that can handle a T_CONN_REQ without being bound.
2120 so_automatic_bind(so);
2121 } else {
2122 error = sotpi_bind(so, NULL, 0,
2123 _SOBIND_UNSPEC|_SOBIND_LOCK_HELD, cr);
2124 if (error)
2125 goto done;
2127 ASSERT(so->so_state & SS_ISBOUND);
2128 flags |= _SOCONNECT_DID_BIND;
2132 * Handle a connect to a name parameter of type AF_UNSPEC like a
2133 * connect to a null address. This is the portable method to
2134 * unconnect a socket.
2136 if ((namelen >= sizeof (sa_family_t)) &&
2137 (name->sa_family == AF_UNSPEC)) {
2138 name = NULL;
2139 namelen = 0;
2143 * Check that we are not already connected.
2144 * A connection-oriented socket cannot be reconnected.
2145 * A connected connection-less socket can be
2146 * - connected to a different address by a subsequent connect
2147 * - "unconnected" by a connect to the NULL address
2149 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) {
2150 ASSERT(!(flags & _SOCONNECT_DID_BIND));
2151 if (so->so_mode & SM_CONNREQUIRED) {
2152 /* Connection-oriented socket */
2153 error = so->so_state & SS_ISCONNECTED ?
2154 EISCONN : EALREADY;
2155 goto done;
2157 /* Connection-less socket */
2158 if (name == NULL) {
2160 * Remove the connected state and clear SO_DGRAM_ERRIND
2161 * since it was set when the socket was connected.
2162 * If this is UDP also send down a T_DISCON_REQ.
2164 int val;
2166 if ((so->so_family == AF_INET ||
2167 so->so_family == AF_INET6) &&
2168 (so->so_type == SOCK_DGRAM ||
2169 so->so_type == SOCK_RAW) &&
2170 /*CONSTCOND*/
2171 !soconnect_tpi_udp) {
2172 /* XXX What about implicitly unbinding here? */
2173 error = sodisconnect(so, -1,
2174 _SODISCONNECT_LOCK_HELD);
2175 } else {
2176 so->so_state &=
2177 ~(SS_ISCONNECTED | SS_ISCONNECTING);
2178 sti->sti_faddr_valid = 0;
2179 sti->sti_faddr_len = 0;
2182 /* Remove SOLOCKED since setsockopt will grab it */
2183 so_unlock_single(so, SOLOCKED);
2184 mutex_exit(&so->so_lock);
2186 val = 0;
2187 (void) sotpi_setsockopt(so, SOL_SOCKET,
2188 SO_DGRAM_ERRIND, &val, (t_uscalar_t)sizeof (val),
2189 cr);
2191 mutex_enter(&so->so_lock);
2192 so_lock_single(so); /* Set SOLOCKED */
2193 goto done;
2196 ASSERT(so->so_state & SS_ISBOUND);
2198 if (name == NULL || namelen == 0) {
2199 error = EINVAL;
2200 goto done;
2203 * Mark the socket if sti_faddr_sa represents the transport level
2204 * address.
2206 if (flags & _SOCONNECT_NOXLATE) {
2207 struct sockaddr_ux *soaddr_ux;
2209 ASSERT(so->so_family == AF_UNIX);
2210 if (namelen != sizeof (struct sockaddr_ux)) {
2211 error = EINVAL;
2212 goto done;
2214 soaddr_ux = (struct sockaddr_ux *)name;
2215 name = (struct sockaddr *)&soaddr_ux->sou_addr;
2216 namelen = sizeof (soaddr_ux->sou_addr);
2217 sti->sti_faddr_noxlate = 1;
2221 * Length and family checks.
2223 error = so_addr_verify(so, name, namelen);
2224 if (error)
2225 goto bad;
2228 * Save foreign address. Needed for AF_UNIX as well as
2229 * transport providers that do not support TI_GETPEERNAME.
2230 * Also used for cached foreign address for TCP and UDP.
2232 if (namelen > (t_uscalar_t)sti->sti_faddr_maxlen) {
2233 error = EINVAL;
2234 goto done;
2236 sti->sti_faddr_len = (socklen_t)namelen;
2237 ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
2238 bcopy(name, sti->sti_faddr_sa, namelen);
2239 sti->sti_faddr_valid = 1;
2241 if (so->so_family == AF_UNIX) {
2242 if (sti->sti_faddr_noxlate) {
2244 * sti_faddr is a transport-level address, so
2245 * don't pass it as an option. Do save it in
2246 * sti_ux_faddr, used for connected DG send.
2248 src = NULL;
2249 srclen = 0;
2250 addr = sti->sti_faddr_sa;
2251 addrlen = (t_uscalar_t)sti->sti_faddr_len;
2252 bcopy(addr, &sti->sti_ux_faddr,
2253 sizeof (sti->sti_ux_faddr));
2254 } else {
2256 * Pass the sockaddr_un source address as an option
2257 * and translate the remote address.
2258 * Holding so_lock thus sti_laddr_sa can not change.
2260 src = sti->sti_laddr_sa;
2261 srclen = (t_uscalar_t)sti->sti_laddr_len;
2262 dprintso(so, 1,
2263 ("sotpi_connect UNIX: srclen %d, src %p\n",
2264 srclen, src));
2266 * Translate the destination address into our
2267 * internal form, and save it in sti_ux_faddr.
2268 * After this call, addr==&sti->sti_ux_taddr,
2269 * and we copy that to sti->sti_ux_faddr so
2270 * we save the connected peer address.
2272 error = so_ux_addr_xlate(so,
2273 sti->sti_faddr_sa, (socklen_t)sti->sti_faddr_len,
2274 &addr, &addrlen);
2275 if (error)
2276 goto bad;
2277 bcopy(&sti->sti_ux_taddr, &sti->sti_ux_faddr,
2278 sizeof (sti->sti_ux_faddr));
2280 } else {
2281 addr = sti->sti_faddr_sa;
2282 addrlen = (t_uscalar_t)sti->sti_faddr_len;
2283 src = NULL;
2284 srclen = 0;
2287 * When connecting a datagram socket we issue the SO_DGRAM_ERRIND
2288 * option which asks the transport provider to send T_UDERR_IND
2289 * messages. These T_UDERR_IND messages are used to return connected
2290 * style errors (e.g. ECONNRESET) for connected datagram sockets.
2292 * In addition, for UDP (and SOCK_RAW AF_INET{,6} sockets)
2293 * we send down a T_CONN_REQ. This is needed to let the
2294 * transport assign a local address that is consistent with
2295 * the remote address. Applications depend on a getsockname()
2296 * after a connect() to retrieve the "source" IP address for
2297 * the connected socket. Invalidate the cached local address
2298 * to force getsockname() to enquire of the transport.
2300 if (!(so->so_mode & SM_CONNREQUIRED)) {
2302 * Datagram socket.
2304 int32_t val;
2306 so_unlock_single(so, SOLOCKED);
2307 mutex_exit(&so->so_lock);
2309 val = 1;
2310 (void) sotpi_setsockopt(so, SOL_SOCKET, SO_DGRAM_ERRIND,
2311 &val, (t_uscalar_t)sizeof (val), cr);
2313 mutex_enter(&so->so_lock);
2314 so_lock_single(so); /* Set SOLOCKED */
2315 if ((so->so_family != AF_INET && so->so_family != AF_INET6) ||
2316 (so->so_type != SOCK_DGRAM && so->so_type != SOCK_RAW) ||
2317 soconnect_tpi_udp) {
2318 soisconnected(so);
2319 goto done;
2322 * Send down T_CONN_REQ etc.
2323 * Clear fflag to avoid returning EWOULDBLOCK.
2325 fflag = 0;
2326 ASSERT(so->so_family != AF_UNIX);
2327 sti->sti_laddr_valid = 0;
2328 } else if (sti->sti_laddr_len != 0) {
2330 * If the local address or port was "any" then it may be
2331 * changed by the transport as a result of the
2332 * connect. Invalidate the cached version if we have one.
2334 switch (so->so_family) {
2335 case AF_INET:
2336 ASSERT(sti->sti_laddr_len == (socklen_t)sizeof (sin_t));
2337 if (((sin_t *)sti->sti_laddr_sa)->sin_addr.s_addr ==
2338 INADDR_ANY ||
2339 ((sin_t *)sti->sti_laddr_sa)->sin_port == 0)
2340 sti->sti_laddr_valid = 0;
2341 break;
2343 case AF_INET6:
2344 ASSERT(sti->sti_laddr_len ==
2345 (socklen_t)sizeof (sin6_t));
2346 if (IN6_IS_ADDR_UNSPECIFIED(
2347 &((sin6_t *)sti->sti_laddr_sa) ->sin6_addr) ||
2348 IN6_IS_ADDR_V4MAPPED_ANY(
2349 &((sin6_t *)sti->sti_laddr_sa)->sin6_addr) ||
2350 ((sin6_t *)sti->sti_laddr_sa)->sin6_port == 0)
2351 sti->sti_laddr_valid = 0;
2352 break;
2354 default:
2355 break;
2360 * Check for failure of an earlier call
2362 if (so->so_error != 0)
2363 goto so_bad;
2366 * Send down T_CONN_REQ. Message was allocated above.
2368 conn_req.PRIM_type = T_CONN_REQ;
2369 conn_req.DEST_length = addrlen;
2370 conn_req.DEST_offset = (t_scalar_t)sizeof (conn_req);
2371 if (srclen == 0) {
2372 conn_req.OPT_length = 0;
2373 conn_req.OPT_offset = 0;
2374 soappendmsg(mp, &conn_req, sizeof (conn_req));
2375 soappendmsg(mp, addr, addrlen);
2376 } else {
2378 * There is a AF_UNIX sockaddr_un to include as a source
2379 * address option.
2381 struct T_opthdr toh;
2383 toh.level = SOL_SOCKET;
2384 toh.name = SO_SRCADDR;
2385 toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
2386 toh.status = 0;
2387 conn_req.OPT_length =
2388 (t_scalar_t)(sizeof (toh) + _TPI_ALIGN_TOPT(srclen));
2389 conn_req.OPT_offset = (t_scalar_t)(sizeof (conn_req) +
2390 _TPI_ALIGN_TOPT(addrlen));
2392 soappendmsg(mp, &conn_req, sizeof (conn_req));
2393 soappendmsg(mp, addr, addrlen);
2394 mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2395 soappendmsg(mp, &toh, sizeof (toh));
2396 soappendmsg(mp, src, srclen);
2397 mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2398 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2401 * Set SS_ISCONNECTING before sending down the T_CONN_REQ
2402 * in order to have the right state when the T_CONN_CON shows up.
2404 soisconnecting(so);
2405 mutex_exit(&so->so_lock);
2407 if (AU_AUDITING())
2408 audit_sock(T_CONN_REQ, strvp2wq(SOTOV(so)), mp, 0);
2410 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2411 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
2412 mp = NULL;
2413 mutex_enter(&so->so_lock);
2414 if (error != 0)
2415 goto bad;
2417 if ((error = sowaitokack(so, T_CONN_REQ)) != 0)
2418 goto bad;
2420 /* Allow other threads to access the socket */
2421 so_unlock_single(so, SOLOCKED);
2422 need_unlock = B_FALSE;
2425 * Wait until we get a T_CONN_CON or an error
2427 if ((error = sowaitconnected(so, fflag, 0)) != 0) {
2428 so_lock_single(so); /* Set SOLOCKED */
2429 need_unlock = B_TRUE;
2432 done:
2433 freemsg(mp);
2434 switch (error) {
2435 case EINPROGRESS:
2436 case EALREADY:
2437 case EISCONN:
2438 case EINTR:
2439 /* Non-fatal errors */
2440 sti->sti_laddr_valid = 0;
2441 /* FALLTHRU */
2442 case 0:
2443 break;
2444 default:
2445 ASSERT(need_unlock);
2447 * Fatal errors: clear SS_ISCONNECTING in case it was set,
2448 * and invalidate local-address cache
2450 so->so_state &= ~SS_ISCONNECTING;
2451 sti->sti_laddr_valid = 0;
2452 /* A discon_ind might have already unbound us */
2453 if ((flags & _SOCONNECT_DID_BIND) &&
2454 (so->so_state & SS_ISBOUND)) {
2455 int err;
2457 err = sotpi_unbind(so, 0);
2458 /* LINTED - statement has no conseq */
2459 if (err) {
2460 eprintsoline(so, err);
2463 break;
2465 if (need_unlock)
2466 so_unlock_single(so, SOLOCKED);
2467 mutex_exit(&so->so_lock);
2468 return (error);
2470 so_bad: error = sogeterr(so, B_TRUE);
2471 bad: eprintsoline(so, error);
2472 goto done;
2475 /* ARGSUSED */
2477 sotpi_shutdown(struct sonode *so, int how, struct cred *cr)
2479 struct T_ordrel_req ordrel_req;
2480 mblk_t *mp;
2481 uint_t old_state, state_change;
2482 int error = 0;
2483 sotpi_info_t *sti = SOTOTPI(so);
2485 dprintso(so, 1, ("sotpi_shutdown(%p, %d) %s\n",
2486 (void *)so, how, pr_state(so->so_state, so->so_mode)));
2488 mutex_enter(&so->so_lock);
2489 so_lock_single(so); /* Set SOLOCKED */
2492 * SunOS 4.X has no check for datagram sockets.
2493 * 5.X checks that it is connected (ENOTCONN)
2494 * X/Open requires that we check the connected state.
2496 if (!(so->so_state & SS_ISCONNECTED)) {
2497 if (!xnet_skip_checks) {
2498 error = ENOTCONN;
2499 if (xnet_check_print) {
2500 printf("sockfs: X/Open shutdown check "
2501 "caused ENOTCONN\n");
2504 goto done;
2507 * Record the current state and then perform any state changes.
2508 * Then use the difference between the old and new states to
2509 * determine which messages need to be sent.
2510 * This prevents e.g. duplicate T_ORDREL_REQ when there are
2511 * duplicate calls to shutdown().
2513 old_state = so->so_state;
2515 switch (how) {
2516 case 0:
2517 socantrcvmore(so);
2518 break;
2519 case 1:
2520 socantsendmore(so);
2521 break;
2522 case 2:
2523 socantsendmore(so);
2524 socantrcvmore(so);
2525 break;
2526 default:
2527 error = EINVAL;
2528 goto done;
2532 * Assumes that the SS_CANT* flags are never cleared in the above code.
2534 state_change = (so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) -
2535 (old_state & (SS_CANTRCVMORE|SS_CANTSENDMORE));
2536 ASSERT((state_change & ~(SS_CANTRCVMORE|SS_CANTSENDMORE)) == 0);
2538 switch (state_change) {
2539 case 0:
2540 dprintso(so, 1,
2541 ("sotpi_shutdown: nothing to send in state 0x%x\n",
2542 so->so_state));
2543 goto done;
2545 case SS_CANTRCVMORE:
2546 mutex_exit(&so->so_lock);
2547 strseteof(SOTOV(so), 1);
2549 * strseteof takes care of read side wakeups,
2550 * pollwakeups, and signals.
2553 * Get the read lock before flushing data to avoid problems
2554 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2556 mutex_enter(&so->so_lock);
2557 (void) so_lock_read(so, 0); /* Set SOREADLOCKED */
2558 mutex_exit(&so->so_lock);
2560 /* Flush read side queue */
2561 strflushrq(SOTOV(so), FLUSHALL);
2563 mutex_enter(&so->so_lock);
2564 so_unlock_read(so); /* Clear SOREADLOCKED */
2565 break;
2567 case SS_CANTSENDMORE:
2568 mutex_exit(&so->so_lock);
2569 strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2570 mutex_enter(&so->so_lock);
2571 break;
2573 case SS_CANTSENDMORE|SS_CANTRCVMORE:
2574 mutex_exit(&so->so_lock);
2575 strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2576 strseteof(SOTOV(so), 1);
2578 * strseteof takes care of read side wakeups,
2579 * pollwakeups, and signals.
2582 * Get the read lock before flushing data to avoid problems
2583 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2585 mutex_enter(&so->so_lock);
2586 (void) so_lock_read(so, 0); /* Set SOREADLOCKED */
2587 mutex_exit(&so->so_lock);
2589 /* Flush read side queue */
2590 strflushrq(SOTOV(so), FLUSHALL);
2592 mutex_enter(&so->so_lock);
2593 so_unlock_read(so); /* Clear SOREADLOCKED */
2594 break;
2597 ASSERT(MUTEX_HELD(&so->so_lock));
2600 * If either SS_CANTSENDMORE or SS_CANTRCVMORE or both of them
2601 * was set due to this call and the new state has both of them set:
2602 * Send the AF_UNIX close indication
2603 * For T_COTS send a discon_ind
2605 * If cantsend was set due to this call:
2606 * For T_COTSORD send an ordrel_ind
2608 * Note that for T_CLTS there is no message sent here.
2610 if ((so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) ==
2611 (SS_CANTRCVMORE|SS_CANTSENDMORE)) {
2613 * For SunOS 4.X compatibility we tell the other end
2614 * that we are unable to receive at this point.
2616 if (so->so_family == AF_UNIX && sti->sti_serv_type != T_CLTS)
2617 so_unix_close(so);
2619 if (sti->sti_serv_type == T_COTS)
2620 error = sodisconnect(so, -1, _SODISCONNECT_LOCK_HELD);
2622 if ((state_change & SS_CANTSENDMORE) &&
2623 (sti->sti_serv_type == T_COTS_ORD)) {
2624 /* Send an orderly release */
2625 ordrel_req.PRIM_type = T_ORDREL_REQ;
2627 mutex_exit(&so->so_lock);
2628 mp = soallocproto1(&ordrel_req, sizeof (ordrel_req),
2629 0, _ALLOC_SLEEP, cr);
2631 * Send down the T_ORDREL_REQ even if there is flow control.
2632 * This prevents shutdown from blocking.
2633 * Note that there is no T_OK_ACK for ordrel_req.
2635 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2636 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2637 mutex_enter(&so->so_lock);
2638 if (error) {
2639 eprintsoline(so, error);
2640 goto done;
2644 done:
2645 so_unlock_single(so, SOLOCKED);
2646 mutex_exit(&so->so_lock);
2647 return (error);
2651 * For any connected SOCK_STREAM/SOCK_SEQPACKET AF_UNIX socket we send
2652 * a zero-length T_OPTDATA_REQ with the SO_UNIX_CLOSE option to inform the peer
2653 * that we have closed.
2654 * Also, for connected AF_UNIX SOCK_DGRAM sockets we send a zero-length
2655 * T_UNITDATA_REQ containing the same option.
2657 * For SOCK_DGRAM half-connections (somebody connected to this end
2658 * but this end is not connect) we don't know where to send any
2659 * SO_UNIX_CLOSE.
2661 * We have to ignore stream head errors just in case there has been
2662 * a shutdown(output).
2663 * Ignore any flow control to try to get the message more quickly to the peer.
2664 * While locally ignoring flow control solves the problem when there
2665 * is only the loopback transport on the stream it would not provide
2666 * the correct AF_UNIX socket semantics when one or more modules have
2667 * been pushed.
2669 void
2670 so_unix_close(struct sonode *so)
2672 struct T_opthdr toh;
2673 mblk_t *mp;
2674 sotpi_info_t *sti = SOTOTPI(so);
2676 ASSERT(MUTEX_HELD(&so->so_lock));
2678 ASSERT(so->so_family == AF_UNIX);
2680 if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) !=
2681 (SS_ISCONNECTED|SS_ISBOUND))
2682 return;
2684 dprintso(so, 1, ("so_unix_close(%p) %s\n",
2685 (void *)so, pr_state(so->so_state, so->so_mode)));
2687 toh.level = SOL_SOCKET;
2688 toh.name = SO_UNIX_CLOSE;
2690 /* zero length + header */
2691 toh.len = (t_uscalar_t)sizeof (struct T_opthdr);
2692 toh.status = 0;
2694 if (so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET) {
2695 struct T_optdata_req tdr;
2697 tdr.PRIM_type = T_OPTDATA_REQ;
2698 tdr.DATA_flag = 0;
2700 tdr.OPT_length = (t_scalar_t)sizeof (toh);
2701 tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
2703 /* NOTE: holding so_lock while sleeping */
2704 mp = soallocproto2(&tdr, sizeof (tdr),
2705 &toh, sizeof (toh), 0, _ALLOC_SLEEP, CRED());
2706 } else {
2707 struct T_unitdata_req tudr;
2708 void *addr;
2709 socklen_t addrlen;
2710 void *src;
2711 socklen_t srclen;
2712 struct T_opthdr toh2;
2713 t_scalar_t size;
2716 * We know this is an AF_UNIX connected DGRAM socket.
2717 * We therefore already have the destination address
2718 * in the internal form needed for this send. This is
2719 * similar to the sosend_dgram call later in this file
2720 * when there's no user-specified destination address.
2722 if (sti->sti_faddr_noxlate) {
2724 * Already have a transport internal address. Do not
2725 * pass any (transport internal) source address.
2727 addr = sti->sti_faddr_sa;
2728 addrlen = (t_uscalar_t)sti->sti_faddr_len;
2729 src = NULL;
2730 srclen = 0;
2731 } else {
2733 * Pass the sockaddr_un source address as an option
2734 * and translate the remote address.
2735 * Holding so_lock thus sti_laddr_sa can not change.
2737 src = sti->sti_laddr_sa;
2738 srclen = (socklen_t)sti->sti_laddr_len;
2739 dprintso(so, 1,
2740 ("so_ux_close: srclen %d, src %p\n",
2741 srclen, src));
2743 * Use the destination address saved in connect.
2745 addr = &sti->sti_ux_faddr;
2746 addrlen = sizeof (sti->sti_ux_faddr);
2748 tudr.PRIM_type = T_UNITDATA_REQ;
2749 tudr.DEST_length = addrlen;
2750 tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
2751 if (srclen == 0) {
2752 tudr.OPT_length = (t_scalar_t)sizeof (toh);
2753 tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2754 _TPI_ALIGN_TOPT(addrlen));
2756 size = tudr.OPT_offset + tudr.OPT_length;
2757 /* NOTE: holding so_lock while sleeping */
2758 mp = soallocproto2(&tudr, sizeof (tudr),
2759 addr, addrlen, size, _ALLOC_SLEEP, CRED());
2760 mp->b_wptr += (_TPI_ALIGN_TOPT(addrlen) - addrlen);
2761 soappendmsg(mp, &toh, sizeof (toh));
2762 } else {
2764 * There is a AF_UNIX sockaddr_un to include as a
2765 * source address option.
2767 tudr.OPT_length = (t_scalar_t)(2 * sizeof (toh) +
2768 _TPI_ALIGN_TOPT(srclen));
2769 tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2770 _TPI_ALIGN_TOPT(addrlen));
2772 toh2.level = SOL_SOCKET;
2773 toh2.name = SO_SRCADDR;
2774 toh2.len = (t_uscalar_t)(srclen +
2775 sizeof (struct T_opthdr));
2776 toh2.status = 0;
2778 size = tudr.OPT_offset + tudr.OPT_length;
2780 /* NOTE: holding so_lock while sleeping */
2781 mp = soallocproto2(&tudr, sizeof (tudr),
2782 addr, addrlen, size, _ALLOC_SLEEP, CRED());
2783 mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2784 soappendmsg(mp, &toh, sizeof (toh));
2785 soappendmsg(mp, &toh2, sizeof (toh2));
2786 soappendmsg(mp, src, srclen);
2787 mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2789 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2791 mutex_exit(&so->so_lock);
2792 (void) kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2793 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2794 mutex_enter(&so->so_lock);
2798 * Called by sotpi_recvmsg when reading a non-zero amount of data.
2799 * In addition, the caller typically verifies that there is some
2800 * potential state to clear by checking
2801 * if (so->so_state & (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK))
2802 * before calling this routine.
2803 * Note that such a check can be made without holding so_lock since
2804 * sotpi_recvmsg is single-threaded (using SOREADLOCKED) and only sotpi_recvmsg
2805 * decrements sti_oobsigcnt.
2807 * When data is read *after* the point that all pending
2808 * oob data has been consumed the oob indication is cleared.
2810 * This logic keeps select/poll returning POLLRDBAND and
2811 * SIOCATMARK returning true until we have read past
2812 * the mark.
2814 static void
2815 sorecv_update_oobstate(struct sonode *so)
2817 sotpi_info_t *sti = SOTOTPI(so);
2819 mutex_enter(&so->so_lock);
2820 ASSERT(so_verify_oobstate(so));
2821 dprintso(so, 1,
2822 ("sorecv_update_oobstate: counts %d/%d state %s\n",
2823 sti->sti_oobsigcnt,
2824 sti->sti_oobcnt, pr_state(so->so_state, so->so_mode)));
2825 if (sti->sti_oobsigcnt == 0) {
2826 /* No more pending oob indications */
2827 so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK);
2828 freemsg(so->so_oobmsg);
2829 so->so_oobmsg = NULL;
2831 ASSERT(so_verify_oobstate(so));
2832 mutex_exit(&so->so_lock);
2836 * Receive the next message on the queue.
2837 * If msg_controllen is non-zero when called the caller is interested in
2838 * any received control info (options).
2839 * If msg_namelen is non-zero when called the caller is interested in
2840 * any received source address.
2841 * The routine returns with msg_control and msg_name pointing to
2842 * kmem_alloc'ed memory which the caller has to free.
2844 /* ARGSUSED */
2846 sotpi_recvmsg(struct sonode *so, struct msghdr *msg, struct uio *uiop,
2847 struct cred *cr)
2849 union T_primitives *tpr;
2850 mblk_t *mp;
2851 uchar_t pri;
2852 int pflag, opflag;
2853 void *control;
2854 t_uscalar_t controllen;
2855 t_uscalar_t namelen;
2856 int so_state = so->so_state; /* Snapshot */
2857 ssize_t saved_resid;
2858 rval_t rval;
2859 int flags;
2860 clock_t timout;
2861 int error = 0;
2862 sotpi_info_t *sti = SOTOTPI(so);
2864 flags = msg->msg_flags;
2865 msg->msg_flags = 0;
2867 dprintso(so, 1, ("sotpi_recvmsg(%p, %p, 0x%x) state %s err %d\n",
2868 (void *)so, (void *)msg, flags,
2869 pr_state(so->so_state, so->so_mode), so->so_error));
2871 if (so->so_is_stream) {
2872 so_update_attrs(so, SOACC);
2873 /* The imaginary "sockmod" has been popped - act as a stream */
2874 return (strread(SOTOV(so), uiop, cr));
2878 * If we are not connected because we have never been connected
2879 * we return ENOTCONN. If we have been connected (but are no longer
2880 * connected) then SS_CANTRCVMORE is set and we let kstrgetmsg return
2881 * the EOF.
2883 * An alternative would be to post an ENOTCONN error in stream head
2884 * (read+write) and clear it when we're connected. However, that error
2885 * would cause incorrect poll/select behavior!
2887 if ((so_state & (SS_ISCONNECTED|SS_CANTRCVMORE)) == 0 &&
2888 (so->so_mode & SM_CONNREQUIRED)) {
2889 return (ENOTCONN);
2893 * Note: SunOS 4.X checks uio_resid == 0 before going to sleep (but
2894 * after checking that the read queue is empty) and returns zero.
2895 * This implementation will sleep (in kstrgetmsg) even if uio_resid
2896 * is zero.
2899 if (flags & MSG_OOB) {
2900 /* Check that the transport supports OOB */
2901 if (!(so->so_mode & SM_EXDATA))
2902 return (EOPNOTSUPP);
2903 so_update_attrs(so, SOACC);
2904 return (sorecvoob(so, msg, uiop, flags,
2905 (so->so_options & SO_OOBINLINE)));
2908 so_update_attrs(so, SOACC);
2911 * Set msg_controllen and msg_namelen to zero here to make it
2912 * simpler in the cases that no control or name is returned.
2914 controllen = msg->msg_controllen;
2915 namelen = msg->msg_namelen;
2916 msg->msg_controllen = 0;
2917 msg->msg_namelen = 0;
2919 dprintso(so, 1, ("sotpi_recvmsg: namelen %d controllen %d\n",
2920 namelen, controllen));
2922 mutex_enter(&so->so_lock);
2924 * Only one reader is allowed at any given time. This is needed
2925 * for T_EXDATA handling and, in the future, MSG_WAITALL.
2927 * This is slightly different that BSD behavior in that it fails with
2928 * EWOULDBLOCK when using nonblocking io. In BSD the read queue access
2929 * is single-threaded using sblock(), which is dropped while waiting
2930 * for data to appear. The difference shows up e.g. if one
2931 * file descriptor does not have O_NONBLOCK but a dup'ed file descriptor
2932 * does use nonblocking io and different threads are reading each
2933 * file descriptor. In BSD there would never be an EWOULDBLOCK error
2934 * in this case as long as the read queue doesn't get empty.
2935 * In this implementation the thread using nonblocking io can
2936 * get an EWOULDBLOCK error due to the blocking thread executing
2937 * e.g. in the uiomove in kstrgetmsg.
2938 * This difference is not believed to be significant.
2940 /* Set SOREADLOCKED */
2941 error = so_lock_read_intr(so,
2942 uiop->uio_fmode | ((flags & MSG_DONTWAIT) ? FNONBLOCK : 0));
2943 mutex_exit(&so->so_lock);
2944 if (error)
2945 return (error);
2948 * Tell kstrgetmsg to not inspect the stream head errors until all
2949 * queued data has been consumed.
2950 * Use a timeout=-1 to wait forever unless MSG_DONTWAIT is set.
2951 * Also, If uio_fmode indicates nonblocking kstrgetmsg will not block.
2953 * MSG_WAITALL only applies to M_DATA and T_DATA_IND messages and
2954 * to T_OPTDATA_IND that do not contain any user-visible control msg.
2955 * Note that MSG_WAITALL set with MSG_PEEK is a noop.
2957 pflag = MSG_ANY | MSG_DELAYERROR;
2958 if (flags & MSG_PEEK) {
2959 pflag |= MSG_IPEEK;
2960 flags &= ~MSG_WAITALL;
2962 if (so->so_mode & SM_ATOMIC)
2963 pflag |= MSG_DISCARDTAIL;
2965 if (flags & MSG_DONTWAIT)
2966 timout = 0;
2967 else if (so->so_rcvtimeo != 0)
2968 timout = TICK_TO_MSEC(so->so_rcvtimeo);
2969 else
2970 timout = -1;
2971 opflag = pflag;
2972 retry:
2973 saved_resid = uiop->uio_resid;
2974 pri = 0;
2975 mp = NULL;
2976 error = kstrgetmsg(SOTOV(so), &mp, uiop, &pri, &pflag, timout, &rval);
2977 if (error != 0) {
2978 /* kstrgetmsg returns ETIME when timeout expires */
2979 if (error == ETIME)
2980 error = EWOULDBLOCK;
2981 goto out;
2984 * For datagrams the MOREDATA flag is used to set MSG_TRUNC.
2985 * For non-datagrams MOREDATA is used to set MSG_EOR.
2987 ASSERT(!(rval.r_val1 & MORECTL));
2988 if ((rval.r_val1 & MOREDATA) && (so->so_mode & SM_ATOMIC))
2989 msg->msg_flags |= MSG_TRUNC;
2991 if (mp == NULL) {
2992 dprintso(so, 1, ("sotpi_recvmsg: got M_DATA\n"));
2994 * 4.3BSD and 4.4BSD clears the mark when peeking across it.
2995 * The draft Posix socket spec states that the mark should
2996 * not be cleared when peeking. We follow the latter.
2998 if ((so->so_state &
2999 (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3000 (uiop->uio_resid != saved_resid) &&
3001 !(flags & MSG_PEEK)) {
3002 sorecv_update_oobstate(so);
3005 mutex_enter(&so->so_lock);
3006 /* Set MSG_EOR based on MOREDATA */
3007 if (!(rval.r_val1 & MOREDATA)) {
3008 if (so->so_state & SS_SAVEDEOR) {
3009 msg->msg_flags |= MSG_EOR;
3010 so->so_state &= ~SS_SAVEDEOR;
3014 * If some data was received (i.e. not EOF) and the
3015 * read/recv* has not been satisfied wait for some more.
3017 if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3018 uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3019 mutex_exit(&so->so_lock);
3020 pflag = opflag | MSG_NOMARK;
3021 goto retry;
3023 goto out_locked;
3026 /* strsock_proto has already verified length and alignment */
3027 tpr = (union T_primitives *)mp->b_rptr;
3028 dprintso(so, 1, ("sotpi_recvmsg: type %d\n", tpr->type));
3030 switch (tpr->type) {
3031 case T_DATA_IND: {
3032 if ((so->so_state &
3033 (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3034 (uiop->uio_resid != saved_resid) &&
3035 !(flags & MSG_PEEK)) {
3036 sorecv_update_oobstate(so);
3040 * Set msg_flags to MSG_EOR based on
3041 * MORE_flag and MOREDATA.
3043 mutex_enter(&so->so_lock);
3044 so->so_state &= ~SS_SAVEDEOR;
3045 if (!(tpr->data_ind.MORE_flag & 1)) {
3046 if (!(rval.r_val1 & MOREDATA))
3047 msg->msg_flags |= MSG_EOR;
3048 else
3049 so->so_state |= SS_SAVEDEOR;
3051 freemsg(mp);
3053 * If some data was received (i.e. not EOF) and the
3054 * read/recv* has not been satisfied wait for some more.
3056 if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3057 uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3058 mutex_exit(&so->so_lock);
3059 pflag = opflag | MSG_NOMARK;
3060 goto retry;
3062 goto out_locked;
3064 case T_UNITDATA_IND: {
3065 void *addr;
3066 t_uscalar_t addrlen;
3067 void *abuf;
3068 t_uscalar_t optlen;
3069 void *opt;
3071 if ((so->so_state &
3072 (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3073 (uiop->uio_resid != saved_resid) &&
3074 !(flags & MSG_PEEK)) {
3075 sorecv_update_oobstate(so);
3078 if (namelen != 0) {
3079 /* Caller wants source address */
3080 addrlen = tpr->unitdata_ind.SRC_length;
3081 addr = sogetoff(mp,
3082 tpr->unitdata_ind.SRC_offset,
3083 addrlen, 1);
3084 if (addr == NULL) {
3085 freemsg(mp);
3086 error = EPROTO;
3087 eprintsoline(so, error);
3088 goto out;
3090 if (so->so_family == AF_UNIX) {
3092 * Can not use the transport level address.
3093 * If there is a SO_SRCADDR option carrying
3094 * the socket level address it will be
3095 * extracted below.
3097 addr = NULL;
3098 addrlen = 0;
3101 optlen = tpr->unitdata_ind.OPT_length;
3102 if (optlen != 0) {
3103 t_uscalar_t ncontrollen;
3106 * Extract any source address option.
3107 * Determine how large cmsg buffer is needed.
3109 opt = sogetoff(mp,
3110 tpr->unitdata_ind.OPT_offset,
3111 optlen, __TPI_ALIGN_SIZE);
3113 if (opt == NULL) {
3114 freemsg(mp);
3115 error = EPROTO;
3116 eprintsoline(so, error);
3117 goto out;
3119 if (so->so_family == AF_UNIX)
3120 so_getopt_srcaddr(opt, optlen, &addr, &addrlen);
3121 ncontrollen = so_cmsglen(mp, opt, optlen);
3122 if (controllen != 0)
3123 controllen = ncontrollen;
3124 else if (ncontrollen != 0)
3125 msg->msg_flags |= MSG_CTRUNC;
3126 } else {
3127 controllen = 0;
3130 if (namelen != 0) {
3132 * Return address to caller.
3133 * Caller handles truncation if length
3134 * exceeds msg_namelen.
3135 * NOTE: AF_UNIX NUL termination is ensured by
3136 * the sender's copyin_name().
3138 abuf = kmem_alloc(addrlen, KM_SLEEP);
3140 bcopy(addr, abuf, addrlen);
3141 msg->msg_name = abuf;
3142 msg->msg_namelen = addrlen;
3145 if (controllen != 0) {
3147 * Return control msg to caller.
3148 * Caller handles truncation if length
3149 * exceeds msg_controllen.
3151 control = kmem_zalloc(controllen, KM_SLEEP);
3153 error = so_opt2cmsg(mp, opt, optlen, control,
3154 controllen);
3155 if (error) {
3156 freemsg(mp);
3157 if (msg->msg_namelen != 0)
3158 kmem_free(msg->msg_name,
3159 msg->msg_namelen);
3160 kmem_free(control, controllen);
3161 eprintsoline(so, error);
3162 goto out;
3164 msg->msg_control = control;
3165 msg->msg_controllen = controllen;
3168 freemsg(mp);
3169 goto out;
3171 case T_OPTDATA_IND: {
3172 struct T_optdata_req *tdr;
3173 void *opt;
3174 t_uscalar_t optlen;
3176 if ((so->so_state &
3177 (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3178 (uiop->uio_resid != saved_resid) &&
3179 !(flags & MSG_PEEK)) {
3180 sorecv_update_oobstate(so);
3183 tdr = (struct T_optdata_req *)mp->b_rptr;
3184 optlen = tdr->OPT_length;
3185 if (optlen != 0) {
3186 t_uscalar_t ncontrollen;
3188 * Determine how large cmsg buffer is needed.
3190 opt = sogetoff(mp,
3191 tpr->optdata_ind.OPT_offset,
3192 optlen, __TPI_ALIGN_SIZE);
3194 if (opt == NULL) {
3195 freemsg(mp);
3196 error = EPROTO;
3197 eprintsoline(so, error);
3198 goto out;
3201 ncontrollen = so_cmsglen(mp, opt, optlen);
3202 if (controllen != 0)
3203 controllen = ncontrollen;
3204 else if (ncontrollen != 0)
3205 msg->msg_flags |= MSG_CTRUNC;
3206 } else {
3207 controllen = 0;
3210 if (controllen != 0) {
3212 * Return control msg to caller.
3213 * Caller handles truncation if length
3214 * exceeds msg_controllen.
3216 control = kmem_zalloc(controllen, KM_SLEEP);
3218 error = so_opt2cmsg(mp, opt, optlen, control,
3219 controllen);
3220 if (error) {
3221 freemsg(mp);
3222 kmem_free(control, controllen);
3223 eprintsoline(so, error);
3224 goto out;
3226 msg->msg_control = control;
3227 msg->msg_controllen = controllen;
3231 * Set msg_flags to MSG_EOR based on
3232 * DATA_flag and MOREDATA.
3234 mutex_enter(&so->so_lock);
3235 so->so_state &= ~SS_SAVEDEOR;
3236 if (!(tpr->data_ind.MORE_flag & 1)) {
3237 if (!(rval.r_val1 & MOREDATA))
3238 msg->msg_flags |= MSG_EOR;
3239 else
3240 so->so_state |= SS_SAVEDEOR;
3242 freemsg(mp);
3244 * If some data was received (i.e. not EOF) and the
3245 * read/recv* has not been satisfied wait for some more.
3246 * Not possible to wait if control info was received.
3248 if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3249 controllen == 0 &&
3250 uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3251 mutex_exit(&so->so_lock);
3252 pflag = opflag | MSG_NOMARK;
3253 goto retry;
3255 goto out_locked;
3257 case T_EXDATA_IND: {
3258 dprintso(so, 1,
3259 ("sotpi_recvmsg: EXDATA_IND counts %d/%d consumed %ld "
3260 "state %s\n",
3261 sti->sti_oobsigcnt, sti->sti_oobcnt,
3262 saved_resid - uiop->uio_resid,
3263 pr_state(so->so_state, so->so_mode)));
3265 * kstrgetmsg handles MSGMARK so there is nothing to
3266 * inspect in the T_EXDATA_IND.
3267 * strsock_proto makes the stream head queue the T_EXDATA_IND
3268 * as a separate message with no M_DATA component. Furthermore,
3269 * the stream head does not consolidate M_DATA messages onto
3270 * an MSGMARK'ed message ensuring that the T_EXDATA_IND
3271 * remains a message by itself. This is needed since MSGMARK
3272 * marks both the whole message as well as the last byte
3273 * of the message.
3275 freemsg(mp);
3276 ASSERT(uiop->uio_resid == saved_resid); /* No data */
3277 if (flags & MSG_PEEK) {
3279 * Even though we are peeking we consume the
3280 * T_EXDATA_IND thereby moving the mark information
3281 * to SS_RCVATMARK. Then the oob code below will
3282 * retry the peeking kstrgetmsg.
3283 * Note that the stream head read queue is
3284 * never flushed without holding SOREADLOCKED
3285 * thus the T_EXDATA_IND can not disappear
3286 * underneath us.
3288 dprintso(so, 1,
3289 ("sotpi_recvmsg: consume EXDATA_IND "
3290 "counts %d/%d state %s\n",
3291 sti->sti_oobsigcnt,
3292 sti->sti_oobcnt,
3293 pr_state(so->so_state, so->so_mode)));
3295 pflag = MSG_ANY | MSG_DELAYERROR;
3296 if (so->so_mode & SM_ATOMIC)
3297 pflag |= MSG_DISCARDTAIL;
3299 pri = 0;
3300 mp = NULL;
3302 error = kstrgetmsg(SOTOV(so), &mp, uiop,
3303 &pri, &pflag, (clock_t)-1, &rval);
3304 ASSERT(uiop->uio_resid == saved_resid);
3306 if (error) {
3307 #ifdef SOCK_DEBUG
3308 if (error != EWOULDBLOCK && error != EINTR) {
3309 eprintsoline(so, error);
3311 #endif /* SOCK_DEBUG */
3312 goto out;
3314 ASSERT(mp);
3315 tpr = (union T_primitives *)mp->b_rptr;
3316 ASSERT(tpr->type == T_EXDATA_IND);
3317 freemsg(mp);
3318 } /* end "if (flags & MSG_PEEK)" */
3321 * Decrement the number of queued and pending oob.
3323 * SS_RCVATMARK is cleared when we read past a mark.
3324 * SS_HAVEOOBDATA is cleared when we've read past the
3325 * last mark.
3326 * SS_OOBPEND is cleared if we've read past the last
3327 * mark and no (new) SIGURG has been posted.
3329 mutex_enter(&so->so_lock);
3330 ASSERT(so_verify_oobstate(so));
3331 ASSERT(sti->sti_oobsigcnt >= sti->sti_oobcnt);
3332 ASSERT(sti->sti_oobsigcnt > 0);
3333 sti->sti_oobsigcnt--;
3334 ASSERT(sti->sti_oobcnt > 0);
3335 sti->sti_oobcnt--;
3337 * Since the T_EXDATA_IND has been removed from the stream
3338 * head, but we have not read data past the mark,
3339 * sockfs needs to track that the socket is still at the mark.
3341 * Since no data was received call kstrgetmsg again to wait
3342 * for data.
3344 so->so_state |= SS_RCVATMARK;
3345 mutex_exit(&so->so_lock);
3346 dprintso(so, 1,
3347 ("sotpi_recvmsg: retry EXDATA_IND counts %d/%d state %s\n",
3348 sti->sti_oobsigcnt, sti->sti_oobcnt,
3349 pr_state(so->so_state, so->so_mode)));
3350 pflag = opflag;
3351 goto retry;
3353 default:
3354 cmn_err(CE_CONT, "sotpi_recvmsg: so %p prim %d mp %p\n",
3355 (void *)so, tpr->type, (void *)mp);
3356 ASSERT(0);
3357 freemsg(mp);
3358 error = EPROTO;
3359 eprintsoline(so, error);
3360 goto out;
3362 /* NOTREACHED */
3363 out:
3364 mutex_enter(&so->so_lock);
3365 out_locked:
3366 so_unlock_read(so); /* Clear SOREADLOCKED */
3367 mutex_exit(&so->so_lock);
3368 return (error);
3372 * Sending data with options on a datagram socket.
3373 * Assumes caller has verified that SS_ISBOUND etc. are set.
3375 * For AF_UNIX the destination address may be already in
3376 * internal form, as indicated by sti->sti_faddr_noxlate
3377 * or the MSG_SENDTO_NOXLATE flag. Otherwise we need to
3378 * translate the destination address to internal form.
3380 * The source address is passed as an option. If passing
3381 * file descriptors, those are passed as file pointers in
3382 * another option.
3384 static int
3385 sosend_dgramcmsg(struct sonode *so, struct sockaddr *name, socklen_t namelen,
3386 struct uio *uiop, void *control, t_uscalar_t controllen, int flags)
3388 struct T_unitdata_req tudr;
3389 mblk_t *mp;
3390 int error;
3391 void *addr;
3392 socklen_t addrlen;
3393 void *src;
3394 socklen_t srclen;
3395 ssize_t len;
3396 int size;
3397 struct T_opthdr toh;
3398 struct fdbuf *fdbuf;
3399 t_uscalar_t optlen;
3400 void *fds;
3401 int fdlen;
3402 sotpi_info_t *sti = SOTOTPI(so);
3404 ASSERT(name && namelen);
3405 ASSERT(control && controllen);
3407 len = uiop->uio_resid;
3408 if (len > (ssize_t)sti->sti_tidu_size) {
3409 return (EMSGSIZE);
3412 if (sti->sti_faddr_noxlate == 0 &&
3413 (flags & MSG_SENDTO_NOXLATE) == 0) {
3415 * Length and family checks.
3416 * Don't verify internal form.
3418 error = so_addr_verify(so, name, namelen);
3419 if (error) {
3420 eprintsoline(so, error);
3421 return (error);
3425 if (so->so_family == AF_UNIX) {
3426 if (sti->sti_faddr_noxlate) {
3428 * Already have a transport internal address. Do not
3429 * pass any (transport internal) source address.
3431 addr = name;
3432 addrlen = namelen;
3433 src = NULL;
3434 srclen = 0;
3435 } else if (flags & MSG_SENDTO_NOXLATE) {
3437 * Have an internal form dest. address.
3438 * Pass the source address as usual.
3440 addr = name;
3441 addrlen = namelen;
3442 src = sti->sti_laddr_sa;
3443 srclen = (socklen_t)sti->sti_laddr_len;
3444 } else {
3446 * Pass the sockaddr_un source address as an option
3447 * and translate the remote address.
3449 * Note that this code does not prevent sti_laddr_sa
3450 * from changing while it is being used. Thus
3451 * if an unbind+bind occurs concurrently with this
3452 * send the peer might see a partially new and a
3453 * partially old "from" address.
3455 src = sti->sti_laddr_sa;
3456 srclen = (socklen_t)sti->sti_laddr_len;
3457 dprintso(so, 1,
3458 ("sosend_dgramcmsg UNIX: srclen %d, src %p\n",
3459 srclen, src));
3461 * The sendmsg caller specified a destination
3462 * address, which we must translate into our
3463 * internal form. addr = &sti->sti_ux_taddr
3465 error = so_ux_addr_xlate(so, name, namelen,
3466 &addr, &addrlen);
3467 if (error) {
3468 eprintsoline(so, error);
3469 return (error);
3472 } else {
3473 addr = name;
3474 addrlen = namelen;
3475 src = NULL;
3476 srclen = 0;
3478 optlen = so_optlen(control, controllen);
3479 tudr.PRIM_type = T_UNITDATA_REQ;
3480 tudr.DEST_length = addrlen;
3481 tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
3482 if (srclen != 0)
3483 tudr.OPT_length = (t_scalar_t)(optlen + sizeof (toh) +
3484 _TPI_ALIGN_TOPT(srclen));
3485 else
3486 tudr.OPT_length = optlen;
3487 tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
3488 _TPI_ALIGN_TOPT(addrlen));
3490 size = tudr.OPT_offset + tudr.OPT_length;
3493 * File descriptors only when SM_FDPASSING set.
3495 error = so_getfdopt(control, controllen, &fds, &fdlen);
3496 if (error)
3497 return (error);
3498 if (fdlen != -1) {
3499 if (!(so->so_mode & SM_FDPASSING))
3500 return (EOPNOTSUPP);
3502 error = fdbuf_create(fds, fdlen, &fdbuf);
3503 if (error)
3504 return (error);
3507 * Pre-allocate enough additional space for lower level modules
3508 * to append an option (e.g. see tl_unitdata). The following
3509 * is enough extra space for the largest option we might append.
3511 size += sizeof (struct T_opthdr) + ucredsize;
3512 mp = fdbuf_allocmsg(size, fdbuf);
3513 } else {
3514 mp = soallocproto(size, _ALLOC_INTR, CRED());
3515 if (mp == NULL) {
3517 * Caught a signal waiting for memory.
3518 * Let send* return EINTR.
3520 return (EINTR);
3523 soappendmsg(mp, &tudr, sizeof (tudr));
3524 soappendmsg(mp, addr, addrlen);
3525 mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
3527 if (fdlen != -1) {
3528 ASSERT(fdbuf != NULL);
3529 toh.level = SOL_SOCKET;
3530 toh.name = SO_FILEP;
3531 toh.len = fdbuf->fd_size +
3532 (t_uscalar_t)sizeof (struct T_opthdr);
3533 toh.status = 0;
3534 soappendmsg(mp, &toh, sizeof (toh));
3535 soappendmsg(mp, fdbuf, fdbuf->fd_size);
3536 ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3538 if (srclen != 0) {
3540 * There is a AF_UNIX sockaddr_un to include as a source
3541 * address option.
3543 toh.level = SOL_SOCKET;
3544 toh.name = SO_SRCADDR;
3545 toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
3546 toh.status = 0;
3547 soappendmsg(mp, &toh, sizeof (toh));
3548 soappendmsg(mp, src, srclen);
3549 mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
3550 ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3552 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3553 so_cmsg2opt(control, controllen, mp);
3555 * Normally at most 3 bytes left in the message, but we might have
3556 * allowed for extra space if we're passing fd's through.
3558 ASSERT(MBLKL(mp) <= (ssize_t)size);
3560 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3561 if (AU_AUDITING())
3562 audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
3564 error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
3565 #ifdef SOCK_DEBUG
3566 if (error) {
3567 eprintsoline(so, error);
3569 #endif /* SOCK_DEBUG */
3570 return (error);
3574 * Sending data with options on a connected stream socket.
3575 * Assumes caller has verified that SS_ISCONNECTED is set.
3577 static int
3578 sosend_svccmsg(struct sonode *so, struct uio *uiop, int more, void *control,
3579 t_uscalar_t controllen, int flags)
3581 struct T_optdata_req tdr;
3582 mblk_t *mp;
3583 int error;
3584 ssize_t iosize;
3585 int size;
3586 struct fdbuf *fdbuf;
3587 t_uscalar_t optlen;
3588 void *fds;
3589 int fdlen;
3590 struct T_opthdr toh;
3591 sotpi_info_t *sti = SOTOTPI(so);
3593 dprintso(so, 1,
3594 ("sosend_svccmsg: resid %ld bytes\n", uiop->uio_resid));
3597 * Has to be bound and connected. However, since no locks are
3598 * held the state could have changed after sotpi_sendmsg checked it
3599 * thus it is not possible to ASSERT on the state.
3602 /* Options on connection-oriented only when SM_OPTDATA set. */
3603 if (!(so->so_mode & SM_OPTDATA))
3604 return (EOPNOTSUPP);
3606 do {
3608 * Set the MORE flag if uio_resid does not fit in this
3609 * message or if the caller passed in "more".
3610 * Error for transports with zero tidu_size.
3612 tdr.PRIM_type = T_OPTDATA_REQ;
3613 iosize = sti->sti_tidu_size;
3614 if (iosize <= 0)
3615 return (EMSGSIZE);
3616 if (uiop->uio_resid > iosize) {
3617 tdr.DATA_flag = 1;
3618 } else {
3619 if (more)
3620 tdr.DATA_flag = 1;
3621 else
3622 tdr.DATA_flag = 0;
3623 iosize = uiop->uio_resid;
3625 dprintso(so, 1, ("sosend_svccmsg: sending %d, %ld bytes\n",
3626 tdr.DATA_flag, iosize));
3628 optlen = so_optlen(control, controllen);
3629 tdr.OPT_length = optlen;
3630 tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
3632 size = (int)sizeof (tdr) + optlen;
3634 * File descriptors only when SM_FDPASSING set.
3636 error = so_getfdopt(control, controllen, &fds, &fdlen);
3637 if (error)
3638 return (error);
3639 if (fdlen != -1) {
3640 if (!(so->so_mode & SM_FDPASSING))
3641 return (EOPNOTSUPP);
3643 error = fdbuf_create(fds, fdlen, &fdbuf);
3644 if (error)
3645 return (error);
3648 * Pre-allocate enough additional space for lower level
3649 * modules to append an option (e.g. see tl_unitdata).
3650 * The following is enough extra space for the largest
3651 * option we might append.
3653 size += sizeof (struct T_opthdr) + ucredsize;
3654 mp = fdbuf_allocmsg(size, fdbuf);
3655 } else {
3656 mp = soallocproto(size, _ALLOC_INTR, CRED());
3657 if (mp == NULL) {
3659 * Caught a signal waiting for memory.
3660 * Let send* return EINTR.
3662 return (EINTR);
3665 soappendmsg(mp, &tdr, sizeof (tdr));
3667 if (fdlen != -1) {
3668 ASSERT(fdbuf != NULL);
3669 toh.level = SOL_SOCKET;
3670 toh.name = SO_FILEP;
3671 toh.len = fdbuf->fd_size +
3672 (t_uscalar_t)sizeof (struct T_opthdr);
3673 toh.status = 0;
3674 soappendmsg(mp, &toh, sizeof (toh));
3675 soappendmsg(mp, fdbuf, fdbuf->fd_size);
3676 ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3678 so_cmsg2opt(control, controllen, mp);
3680 * Normally at most 3 bytes left in the message, but we might
3681 * have allowed for extra space if we're passing fd's through.
3683 ASSERT(MBLKL(mp) <= (ssize_t)size);
3685 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3687 error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
3688 0, MSG_BAND, 0);
3689 if (error) {
3690 eprintsoline(so, error);
3691 return (error);
3693 control = NULL;
3694 if (uiop->uio_resid > 0) {
3696 * Recheck for fatal errors. Fail write even though
3697 * some data have been written. This is consistent
3698 * with strwrite semantics and BSD sockets semantics.
3700 if (so->so_state & SS_CANTSENDMORE) {
3701 eprintsoline(so, error);
3702 return (EPIPE);
3704 if (so->so_error != 0) {
3705 mutex_enter(&so->so_lock);
3706 error = sogeterr(so, B_TRUE);
3707 mutex_exit(&so->so_lock);
3708 if (error != 0) {
3709 eprintsoline(so, error);
3710 return (error);
3714 } while (uiop->uio_resid > 0);
3715 return (0);
3719 * Sending data on a datagram socket.
3720 * Assumes caller has verified that SS_ISBOUND etc. are set.
3722 * For AF_UNIX the destination address may be already in
3723 * internal form, as indicated by sti->sti_faddr_noxlate
3724 * or the MSG_SENDTO_NOXLATE flag. Otherwise we need to
3725 * translate the destination address to internal form.
3727 * The source address is passed as an option.
3730 sosend_dgram(struct sonode *so, struct sockaddr *name, socklen_t namelen,
3731 struct uio *uiop, int flags)
3733 struct T_unitdata_req tudr;
3734 mblk_t *mp;
3735 int error;
3736 void *addr;
3737 socklen_t addrlen;
3738 void *src;
3739 socklen_t srclen;
3740 ssize_t len;
3741 sotpi_info_t *sti = SOTOTPI(so);
3743 ASSERT(name != NULL && namelen != 0);
3745 len = uiop->uio_resid;
3746 if (len > sti->sti_tidu_size) {
3747 error = EMSGSIZE;
3748 goto done;
3751 if (sti->sti_faddr_noxlate == 0 &&
3752 (flags & MSG_SENDTO_NOXLATE) == 0) {
3754 * Length and family checks.
3755 * Don't verify internal form.
3757 error = so_addr_verify(so, name, namelen);
3758 if (error != 0)
3759 goto done;
3762 if (sti->sti_direct) /* Never on AF_UNIX */
3763 return (sodgram_direct(so, name, namelen, uiop, flags));
3765 if (so->so_family == AF_UNIX) {
3766 if (sti->sti_faddr_noxlate) {
3768 * Already have a transport internal address. Do not
3769 * pass any (transport internal) source address.
3771 addr = name;
3772 addrlen = namelen;
3773 src = NULL;
3774 srclen = 0;
3775 } else if (flags & MSG_SENDTO_NOXLATE) {
3777 * Have an internal form dest. address.
3778 * Pass the source address as usual.
3780 addr = name;
3781 addrlen = namelen;
3782 src = sti->sti_laddr_sa;
3783 srclen = (socklen_t)sti->sti_laddr_len;
3784 } else {
3786 * Pass the sockaddr_un source address as an option
3787 * and translate the remote address.
3789 * Note that this code does not prevent sti_laddr_sa
3790 * from changing while it is being used. Thus
3791 * if an unbind+bind occurs concurrently with this
3792 * send the peer might see a partially new and a
3793 * partially old "from" address.
3795 src = sti->sti_laddr_sa;
3796 srclen = (socklen_t)sti->sti_laddr_len;
3797 dprintso(so, 1,
3798 ("sosend_dgram UNIX: srclen %d, src %p\n",
3799 srclen, src));
3801 * The sendmsg caller specified a destination
3802 * address, which we must translate into our
3803 * internal form. addr = &sti->sti_ux_taddr
3805 error = so_ux_addr_xlate(so, name, namelen,
3806 &addr, &addrlen);
3807 if (error) {
3808 eprintsoline(so, error);
3809 goto done;
3812 } else {
3813 addr = name;
3814 addrlen = namelen;
3815 src = NULL;
3816 srclen = 0;
3818 tudr.PRIM_type = T_UNITDATA_REQ;
3819 tudr.DEST_length = addrlen;
3820 tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
3821 if (srclen == 0) {
3822 tudr.OPT_length = 0;
3823 tudr.OPT_offset = 0;
3825 mp = soallocproto2(&tudr, sizeof (tudr),
3826 addr, addrlen, 0, _ALLOC_INTR, CRED());
3827 if (mp == NULL) {
3829 * Caught a signal waiting for memory.
3830 * Let send* return EINTR.
3832 error = EINTR;
3833 goto done;
3835 } else {
3837 * There is a AF_UNIX sockaddr_un to include as a source
3838 * address option.
3840 struct T_opthdr toh;
3841 ssize_t size;
3843 tudr.OPT_length = (t_scalar_t)(sizeof (toh) +
3844 _TPI_ALIGN_TOPT(srclen));
3845 tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
3846 _TPI_ALIGN_TOPT(addrlen));
3848 toh.level = SOL_SOCKET;
3849 toh.name = SO_SRCADDR;
3850 toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
3851 toh.status = 0;
3853 size = tudr.OPT_offset + tudr.OPT_length;
3854 mp = soallocproto2(&tudr, sizeof (tudr),
3855 addr, addrlen, size, _ALLOC_INTR, CRED());
3856 if (mp == NULL) {
3858 * Caught a signal waiting for memory.
3859 * Let send* return EINTR.
3861 error = EINTR;
3862 goto done;
3864 mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
3865 soappendmsg(mp, &toh, sizeof (toh));
3866 soappendmsg(mp, src, srclen);
3867 mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
3868 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3871 if (AU_AUDITING())
3872 audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
3874 error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
3875 done:
3876 #ifdef SOCK_DEBUG
3877 if (error) {
3878 eprintsoline(so, error);
3880 #endif /* SOCK_DEBUG */
3881 return (error);
3885 * Sending data on a connected stream socket.
3886 * Assumes caller has verified that SS_ISCONNECTED is set.
3889 sosend_svc(struct sonode *so, struct uio *uiop, t_scalar_t prim, int more,
3890 int sflag)
3892 struct T_data_req tdr;
3893 mblk_t *mp;
3894 int error;
3895 ssize_t iosize;
3896 sotpi_info_t *sti = SOTOTPI(so);
3898 dprintso(so, 1,
3899 ("sosend_svc: %p, resid %ld bytes, prim %d, sflag 0x%x\n",
3900 (void *)so, uiop->uio_resid, prim, sflag));
3903 * Has to be bound and connected. However, since no locks are
3904 * held the state could have changed after sotpi_sendmsg checked it
3905 * thus it is not possible to ASSERT on the state.
3908 do {
3910 * Set the MORE flag if uio_resid does not fit in this
3911 * message or if the caller passed in "more".
3912 * Error for transports with zero tidu_size.
3914 tdr.PRIM_type = prim;
3915 iosize = sti->sti_tidu_size;
3916 if (iosize <= 0)
3917 return (EMSGSIZE);
3918 if (uiop->uio_resid > iosize) {
3919 tdr.MORE_flag = 1;
3920 } else {
3921 if (more)
3922 tdr.MORE_flag = 1;
3923 else
3924 tdr.MORE_flag = 0;
3925 iosize = uiop->uio_resid;
3927 dprintso(so, 1, ("sosend_svc: sending 0x%x %d, %ld bytes\n",
3928 prim, tdr.MORE_flag, iosize));
3929 mp = soallocproto1(&tdr, sizeof (tdr), 0, _ALLOC_INTR, CRED());
3930 if (mp == NULL) {
3932 * Caught a signal waiting for memory.
3933 * Let send* return EINTR.
3935 return (EINTR);
3938 error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
3939 0, sflag | MSG_BAND, 0);
3940 if (error) {
3941 eprintsoline(so, error);
3942 return (error);
3944 if (uiop->uio_resid > 0) {
3946 * Recheck for fatal errors. Fail write even though
3947 * some data have been written. This is consistent
3948 * with strwrite semantics and BSD sockets semantics.
3950 if (so->so_state & SS_CANTSENDMORE) {
3951 eprintsoline(so, error);
3952 return (EPIPE);
3954 if (so->so_error != 0) {
3955 mutex_enter(&so->so_lock);
3956 error = sogeterr(so, B_TRUE);
3957 mutex_exit(&so->so_lock);
3958 if (error != 0) {
3959 eprintsoline(so, error);
3960 return (error);
3964 } while (uiop->uio_resid > 0);
3965 return (0);
3969 * Check the state for errors and call the appropriate send function.
3971 * If MSG_DONTROUTE is set (and SO_DONTROUTE isn't already set)
3972 * this function issues a setsockopt to toggle SO_DONTROUTE before and
3973 * after sending the message.
3975 * The caller may optionally specify a destination address, for either
3976 * stream or datagram sockets. This table summarizes the cases:
3978 * Socket type Dest. given Connected Result
3979 * ----------- ----------- --------- --------------
3980 * Stream * Yes send to conn. addr.
3981 * Stream * No error ENOTCONN
3982 * Dgram yes * send to given addr.
3983 * Dgram no yes send to conn. addr.
3984 * Dgram no no error EDESTADDRREQ
3986 * There are subtleties around the destination address when using
3987 * AF_UNIX datagram sockets. When the sendmsg call specifies the
3988 * destination address, it's in (struct sockaddr_un) form and we
3989 * need to translate it to our internal form (struct so_ux_addr).
3991 * When the sendmsg call does not specify a destination address
3992 * we're using the peer address saved during sotpi_connect, and
3993 * that address is already in internal form. In this case, the
3994 * (internal only) flag MSG_SENDTO_NOXLATE is set in the flags
3995 * passed to sosend_dgram or sosend_dgramcmsg to indicate that
3996 * those functions should skip translation to internal form.
3997 * Avoiding that translation is not only more efficient, but it's
3998 * also necessary when a process does a connect on an AF_UNIX
3999 * datagram socket and then drops privileges. After the process
4000 * has dropped privileges, it may no longer be able to lookup the
4001 * the external name in the filesystem, but it should still be
4002 * able to send messages on the connected socket by leaving the
4003 * destination name unspecified.
4005 * Yet more subtleties arise with sockets connected by socketpair(),
4006 * which puts internal form addresses in the fields where normally
4007 * the external form is found, and sets sti_faddr_noxlate=1, which
4008 * (like flag MSG_SENDTO_NOXLATE) causes the sosend_dgram functions
4009 * to skip translation of destination addresses to internal form.
4010 * However, beware that the flag sti_faddr_noxlate=1 also triggers
4011 * different behaviour almost everywhere AF_UNIX addresses appear.
4013 static int
4014 sotpi_sendmsg(struct sonode *so, struct msghdr *msg, struct uio *uiop,
4015 struct cred *cr)
4017 int so_state;
4018 int so_mode;
4019 int error;
4020 struct sockaddr *name;
4021 t_uscalar_t namelen;
4022 int dontroute;
4023 int flags;
4024 sotpi_info_t *sti = SOTOTPI(so);
4026 dprintso(so, 1, ("sotpi_sendmsg(%p, %p, 0x%x) state %s, error %d\n",
4027 (void *)so, (void *)msg, msg->msg_flags,
4028 pr_state(so->so_state, so->so_mode), so->so_error));
4030 if (so->so_is_stream) {
4031 /* The imaginary "sockmod" has been popped - act as a stream */
4032 so_update_attrs(so, SOMOD);
4033 return (strwrite(SOTOV(so), uiop, cr));
4036 mutex_enter(&so->so_lock);
4037 so_state = so->so_state;
4039 if (so_state & SS_CANTSENDMORE) {
4040 mutex_exit(&so->so_lock);
4041 return (EPIPE);
4044 if (so->so_error != 0) {
4045 error = sogeterr(so, B_TRUE);
4046 if (error != 0) {
4047 mutex_exit(&so->so_lock);
4048 return (error);
4052 name = (struct sockaddr *)msg->msg_name;
4053 namelen = msg->msg_namelen;
4054 flags = msg->msg_flags;
4057 * Historically, this function does not validate the flags
4058 * passed in, and any errant bits are ignored. However,
4059 * we would not want any such errant flag bits accidently
4060 * being treated as one of the internal-only flags, so
4061 * clear the internal-only flag bits.
4063 flags &= ~MSG_SENDTO_NOXLATE;
4065 so_mode = so->so_mode;
4067 if (name == NULL) {
4068 if (!(so_state & SS_ISCONNECTED)) {
4069 mutex_exit(&so->so_lock);
4070 if (so_mode & SM_CONNREQUIRED)
4071 return (ENOTCONN);
4072 else
4073 return (EDESTADDRREQ);
4076 * This is a connected socket.
4078 if (so_mode & SM_CONNREQUIRED) {
4080 * This is a connected STREAM socket,
4081 * destination not specified.
4083 name = NULL;
4084 namelen = 0;
4085 } else {
4087 * Datagram send on connected socket with
4088 * the destination name not specified.
4089 * Use the peer address from connect.
4091 if (so->so_family == AF_UNIX) {
4093 * Use the (internal form) address saved
4094 * in sotpi_connect. See above.
4096 name = (void *)&sti->sti_ux_faddr;
4097 namelen = sizeof (sti->sti_ux_faddr);
4098 flags |= MSG_SENDTO_NOXLATE;
4099 } else {
4100 ASSERT(sti->sti_faddr_sa);
4101 name = sti->sti_faddr_sa;
4102 namelen = (t_uscalar_t)sti->sti_faddr_len;
4105 } else {
4107 * Sendmsg specifies a destination name
4109 if (!(so_state & SS_ISCONNECTED) &&
4110 (so_mode & SM_CONNREQUIRED)) {
4111 /* i.e. TCP not connected */
4112 mutex_exit(&so->so_lock);
4113 return (ENOTCONN);
4116 * Ignore the address on connection-oriented sockets.
4117 * Just like BSD this code does not generate an error for
4118 * TCP (a CONNREQUIRED socket) when sending to an address
4119 * passed in with sendto/sendmsg. Instead the data is
4120 * delivered on the connection as if no address had been
4121 * supplied.
4123 if ((so_state & SS_ISCONNECTED) &&
4124 !(so_mode & SM_CONNREQUIRED)) {
4125 mutex_exit(&so->so_lock);
4126 return (EISCONN);
4128 if (!(so_state & SS_ISBOUND)) {
4129 so_lock_single(so); /* Set SOLOCKED */
4130 error = sotpi_bind(so, NULL, 0,
4131 _SOBIND_UNSPEC|_SOBIND_LOCK_HELD, cr);
4132 so_unlock_single(so, SOLOCKED);
4133 if (error) {
4134 mutex_exit(&so->so_lock);
4135 eprintsoline(so, error);
4136 return (error);
4140 * Handle delayed datagram errors. These are only queued
4141 * when the application sets SO_DGRAM_ERRIND.
4142 * Return the error if we are sending to the address
4143 * that was returned in the last T_UDERROR_IND.
4144 * If sending to some other address discard the delayed
4145 * error indication.
4147 if (sti->sti_delayed_error) {
4148 struct T_uderror_ind *tudi;
4149 void *addr;
4150 t_uscalar_t addrlen;
4151 boolean_t match = B_FALSE;
4153 ASSERT(sti->sti_eaddr_mp);
4154 error = sti->sti_delayed_error;
4155 sti->sti_delayed_error = 0;
4156 tudi =
4157 (struct T_uderror_ind *)sti->sti_eaddr_mp->b_rptr;
4158 addrlen = tudi->DEST_length;
4159 addr = sogetoff(sti->sti_eaddr_mp,
4160 tudi->DEST_offset, addrlen, 1);
4161 ASSERT(addr); /* Checked by strsock_proto */
4162 switch (so->so_family) {
4163 case AF_INET: {
4164 /* Compare just IP address and port */
4165 sin_t *sin1 = (sin_t *)name;
4166 sin_t *sin2 = (sin_t *)addr;
4168 if (addrlen == sizeof (sin_t) &&
4169 namelen == addrlen &&
4170 sin1->sin_port == sin2->sin_port &&
4171 sin1->sin_addr.s_addr ==
4172 sin2->sin_addr.s_addr)
4173 match = B_TRUE;
4174 break;
4176 case AF_INET6: {
4177 /* Compare just IP address and port. Not flow */
4178 sin6_t *sin1 = (sin6_t *)name;
4179 sin6_t *sin2 = (sin6_t *)addr;
4181 if (addrlen == sizeof (sin6_t) &&
4182 namelen == addrlen &&
4183 sin1->sin6_port == sin2->sin6_port &&
4184 IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr,
4185 &sin2->sin6_addr))
4186 match = B_TRUE;
4187 break;
4189 case AF_UNIX:
4190 default:
4191 if (namelen == addrlen &&
4192 bcmp(name, addr, namelen) == 0)
4193 match = B_TRUE;
4195 if (match) {
4196 freemsg(sti->sti_eaddr_mp);
4197 sti->sti_eaddr_mp = NULL;
4198 mutex_exit(&so->so_lock);
4199 #ifdef DEBUG
4200 dprintso(so, 0,
4201 ("sockfs delayed error %d for %s\n",
4202 error,
4203 pr_addr(so->so_family, name, namelen)));
4204 #endif /* DEBUG */
4205 return (error);
4207 freemsg(sti->sti_eaddr_mp);
4208 sti->sti_eaddr_mp = NULL;
4211 mutex_exit(&so->so_lock);
4213 dontroute = 0;
4214 if ((flags & MSG_DONTROUTE) && !(so->so_options & SO_DONTROUTE)) {
4215 uint32_t val;
4217 val = 1;
4218 error = sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4219 &val, (t_uscalar_t)sizeof (val), cr);
4220 if (error)
4221 return (error);
4222 dontroute = 1;
4225 if ((flags & MSG_OOB) && !(so_mode & SM_EXDATA)) {
4226 error = EOPNOTSUPP;
4227 goto done;
4229 if (msg->msg_controllen != 0) {
4230 if (!(so_mode & SM_CONNREQUIRED)) {
4231 so_update_attrs(so, SOMOD);
4232 error = sosend_dgramcmsg(so, name, namelen, uiop,
4233 msg->msg_control, msg->msg_controllen, flags);
4234 } else {
4235 if (flags & MSG_OOB) {
4236 /* Can't generate T_EXDATA_REQ with options */
4237 error = EOPNOTSUPP;
4238 goto done;
4240 so_update_attrs(so, SOMOD);
4241 error = sosend_svccmsg(so, uiop,
4242 !(flags & MSG_EOR),
4243 msg->msg_control, msg->msg_controllen,
4244 flags);
4246 goto done;
4249 so_update_attrs(so, SOMOD);
4250 if (!(so_mode & SM_CONNREQUIRED)) {
4252 * If there is no SO_DONTROUTE to turn off return immediately
4253 * from send_dgram. This can allow tail-call optimizations.
4255 if (!dontroute) {
4256 return (sosend_dgram(so, name, namelen, uiop, flags));
4258 error = sosend_dgram(so, name, namelen, uiop, flags);
4259 } else {
4260 t_scalar_t prim;
4261 int sflag;
4263 /* Ignore msg_name in the connected state */
4264 if (flags & MSG_OOB) {
4265 prim = T_EXDATA_REQ;
4267 * Send down T_EXDATA_REQ even if there is flow
4268 * control for data.
4270 sflag = MSG_IGNFLOW;
4271 } else {
4272 if (so_mode & SM_BYTESTREAM) {
4273 /* Byte stream transport - use write */
4274 dprintso(so, 1, ("sotpi_sendmsg: write\n"));
4277 * If there is no SO_DONTROUTE to turn off,
4278 * sti_direct is on, and there is no flow
4279 * control, we can take the fast path.
4281 if (!dontroute && sti->sti_direct != 0 &&
4282 canputnext(SOTOV(so)->v_stream->sd_wrq)) {
4283 return (sostream_direct(so, uiop,
4284 NULL, cr));
4286 error = strwrite(SOTOV(so), uiop, cr);
4287 goto done;
4289 prim = T_DATA_REQ;
4290 sflag = 0;
4293 * If there is no SO_DONTROUTE to turn off return immediately
4294 * from sosend_svc. This can allow tail-call optimizations.
4296 if (!dontroute)
4297 return (sosend_svc(so, uiop, prim,
4298 !(flags & MSG_EOR), sflag));
4299 error = sosend_svc(so, uiop, prim,
4300 !(flags & MSG_EOR), sflag);
4302 ASSERT(dontroute);
4303 done:
4304 if (dontroute) {
4305 uint32_t val;
4307 val = 0;
4308 (void) sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4309 &val, (t_uscalar_t)sizeof (val), cr);
4311 return (error);
4315 * kstrwritemp() has very similar semantics as that of strwrite().
4316 * The main difference is it obtains mblks from the caller and also
4317 * does not do any copy as done in strwrite() from user buffers to
4318 * kernel buffers.
4320 * Currently, this routine is used by sendfile to send data allocated
4321 * within the kernel without any copying. This interface does not use the
4322 * synchronous stream interface as synch. stream interface implies
4323 * copying.
4326 kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode)
4328 struct stdata *stp;
4329 struct queue *wqp;
4330 mblk_t *newmp;
4331 char waitflag;
4332 int tempmode;
4333 int error = 0;
4334 int done = 0;
4335 struct sonode *so;
4336 boolean_t direct;
4338 ASSERT(vp->v_stream);
4339 stp = vp->v_stream;
4341 so = VTOSO(vp);
4342 direct = _SOTOTPI(so)->sti_direct;
4345 * This is the sockfs direct fast path. canputnext() need
4346 * not be accurate so we don't grab the sd_lock here. If
4347 * we get flow-controlled, we grab sd_lock just before the
4348 * do..while loop below to emulate what strwrite() does.
4350 wqp = stp->sd_wrq;
4351 if (canputnext(wqp) && direct &&
4352 !(stp->sd_flag & (STWRERR|STRHUP|STPLEX))) {
4353 return (sostream_direct(so, NULL, mp, CRED()));
4354 } else if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) {
4355 /* Fast check of flags before acquiring the lock */
4356 mutex_enter(&stp->sd_lock);
4357 error = strgeterr(stp, STWRERR|STRHUP|STPLEX, 0);
4358 mutex_exit(&stp->sd_lock);
4359 if (error != 0) {
4360 if (!(stp->sd_flag & STPLEX) &&
4361 (stp->sd_wput_opt & SW_SIGPIPE)) {
4362 error = EPIPE;
4364 return (error);
4368 waitflag = WRITEWAIT;
4369 if (stp->sd_flag & OLDNDELAY)
4370 tempmode = fmode & ~FNDELAY;
4371 else
4372 tempmode = fmode;
4374 mutex_enter(&stp->sd_lock);
4375 do {
4376 if (canputnext(wqp)) {
4377 mutex_exit(&stp->sd_lock);
4378 if (stp->sd_wputdatafunc != NULL) {
4379 newmp = (stp->sd_wputdatafunc)(vp, mp, NULL,
4380 NULL, NULL, NULL);
4381 if (newmp == NULL) {
4382 /* The caller will free mp */
4383 return (ECOMM);
4385 mp = newmp;
4387 putnext(wqp, mp);
4388 return (0);
4390 error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1,
4391 &done);
4392 } while (error == 0 && !done);
4394 mutex_exit(&stp->sd_lock);
4396 * EAGAIN tells the application to try again. ENOMEM
4397 * is returned only if the memory allocation size
4398 * exceeds the physical limits of the system. ENOMEM
4399 * can't be true here.
4401 if (error == ENOMEM)
4402 error = EAGAIN;
4403 return (error);
4406 /* ARGSUSED */
4407 static int
4408 sotpi_sendmblk(struct sonode *so, struct msghdr *msg, int fflag,
4409 struct cred *cr, mblk_t **mpp)
4411 int error;
4413 switch (so->so_family) {
4414 case AF_INET:
4415 case AF_INET6:
4416 case AF_UNIX:
4417 break;
4418 default:
4419 return (EAFNOSUPPORT);
4423 if (so->so_state & SS_CANTSENDMORE)
4424 return (EPIPE);
4426 if (so->so_type != SOCK_STREAM)
4427 return (EOPNOTSUPP);
4429 if ((so->so_state & SS_ISCONNECTED) == 0)
4430 return (ENOTCONN);
4432 error = kstrwritemp(so->so_vnode, *mpp, fflag);
4433 if (error == 0)
4434 *mpp = NULL;
4435 return (error);
4439 * Sending data on a datagram socket.
4440 * Assumes caller has verified that SS_ISBOUND etc. are set.
4442 /* ARGSUSED */
4443 static int
4444 sodgram_direct(struct sonode *so, struct sockaddr *name,
4445 socklen_t namelen, struct uio *uiop, int flags)
4447 struct T_unitdata_req tudr;
4448 mblk_t *mp = NULL;
4449 int error = 0;
4450 void *addr;
4451 socklen_t addrlen;
4452 ssize_t len;
4453 struct stdata *stp = SOTOV(so)->v_stream;
4454 int so_state;
4455 queue_t *udp_wq;
4456 boolean_t connected;
4457 mblk_t *mpdata = NULL;
4458 sotpi_info_t *sti = SOTOTPI(so);
4459 uint32_t auditing = AU_AUDITING();
4461 ASSERT(name != NULL && namelen != 0);
4462 ASSERT(!(so->so_mode & SM_CONNREQUIRED));
4463 ASSERT(!(so->so_mode & SM_EXDATA));
4464 ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
4465 ASSERT(SOTOV(so)->v_type == VSOCK);
4467 /* Caller checked for proper length */
4468 len = uiop->uio_resid;
4469 ASSERT(len <= sti->sti_tidu_size);
4471 /* Length and family checks have been done by caller */
4472 ASSERT(name->sa_family == so->so_family);
4473 ASSERT(so->so_family == AF_INET ||
4474 (namelen == (socklen_t)sizeof (struct sockaddr_in6)));
4475 ASSERT(so->so_family == AF_INET6 ||
4476 (namelen == (socklen_t)sizeof (struct sockaddr_in)));
4478 addr = name;
4479 addrlen = namelen;
4481 if (stp->sd_sidp != NULL &&
4482 (error = straccess(stp, JCWRITE)) != 0)
4483 goto done;
4485 so_state = so->so_state;
4487 connected = so_state & SS_ISCONNECTED;
4488 if (!connected) {
4489 tudr.PRIM_type = T_UNITDATA_REQ;
4490 tudr.DEST_length = addrlen;
4491 tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
4492 tudr.OPT_length = 0;
4493 tudr.OPT_offset = 0;
4495 mp = soallocproto2(&tudr, sizeof (tudr), addr, addrlen, 0,
4496 _ALLOC_INTR, CRED());
4497 if (mp == NULL) {
4499 * Caught a signal waiting for memory.
4500 * Let send* return EINTR.
4502 error = EINTR;
4503 goto done;
4508 * For UDP we don't break up the copyin into smaller pieces
4509 * as in the TCP case. That means if ENOMEM is returned by
4510 * mcopyinuio() then the uio vector has not been modified at
4511 * all and we fallback to either strwrite() or kstrputmsg()
4512 * below. Note also that we never generate priority messages
4513 * from here.
4515 udp_wq = stp->sd_wrq->q_next;
4516 if (canput(udp_wq) &&
4517 (mpdata = mcopyinuio(stp, uiop, -1, -1, &error)) != NULL) {
4518 ASSERT(DB_TYPE(mpdata) == M_DATA);
4519 ASSERT(uiop->uio_resid == 0);
4520 if (!connected)
4521 linkb(mp, mpdata);
4522 else
4523 mp = mpdata;
4524 if (auditing)
4525 audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4527 udp_wput(udp_wq, mp);
4528 return (0);
4531 ASSERT(mpdata == NULL);
4532 if (error != 0 && error != ENOMEM) {
4533 freemsg(mp);
4534 return (error);
4538 * For connected, let strwrite() handle the blocking case.
4539 * Otherwise we fall thru and use kstrputmsg().
4541 if (connected)
4542 return (strwrite(SOTOV(so), uiop, CRED()));
4544 if (auditing)
4545 audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4547 error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
4548 done:
4549 #ifdef SOCK_DEBUG
4550 if (error != 0) {
4551 eprintsoline(so, error);
4553 #endif /* SOCK_DEBUG */
4554 return (error);
4558 sostream_direct(struct sonode *so, struct uio *uiop, mblk_t *mp, cred_t *cr)
4560 struct stdata *stp = SOTOV(so)->v_stream;
4561 ssize_t iosize, rmax, maxblk;
4562 queue_t *tcp_wq = stp->sd_wrq->q_next;
4563 mblk_t *newmp;
4564 int error = 0, wflag = 0;
4566 ASSERT(so->so_mode & SM_BYTESTREAM);
4567 ASSERT(SOTOV(so)->v_type == VSOCK);
4569 if (stp->sd_sidp != NULL &&
4570 (error = straccess(stp, JCWRITE)) != 0)
4571 return (error);
4573 if (uiop == NULL) {
4575 * kstrwritemp() should have checked sd_flag and
4576 * flow-control before coming here. If we end up
4577 * here it means that we can simply pass down the
4578 * data to tcp.
4580 ASSERT(mp != NULL);
4581 if (stp->sd_wputdatafunc != NULL) {
4582 newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
4583 NULL, NULL, NULL);
4584 if (newmp == NULL) {
4585 /* The caller will free mp */
4586 return (ECOMM);
4588 mp = newmp;
4590 tcp_wput(tcp_wq, mp);
4591 return (0);
4594 /* Fallback to strwrite() to do proper error handling */
4595 if (stp->sd_flag & (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))
4596 return (strwrite(SOTOV(so), uiop, cr));
4598 rmax = stp->sd_qn_maxpsz;
4599 ASSERT(rmax >= 0 || rmax == INFPSZ);
4600 if (rmax == 0 || uiop->uio_resid <= 0)
4601 return (0);
4603 if (rmax == INFPSZ)
4604 rmax = uiop->uio_resid;
4606 maxblk = stp->sd_maxblk;
4608 for (;;) {
4609 iosize = MIN(uiop->uio_resid, rmax);
4611 mp = mcopyinuio(stp, uiop, iosize, maxblk, &error);
4612 if (mp == NULL) {
4614 * Fallback to strwrite() for ENOMEM; if this
4615 * is our first time in this routine and the uio
4616 * vector has not been modified, we will end up
4617 * calling strwrite() without any flag set.
4619 if (error == ENOMEM)
4620 goto slow_send;
4621 else
4622 return (error);
4624 ASSERT(uiop->uio_resid >= 0);
4626 * If mp is non-NULL and ENOMEM is set, it means that
4627 * mcopyinuio() was able to break down some of the user
4628 * data into one or more mblks. Send the partial data
4629 * to tcp and let the rest be handled in strwrite().
4631 ASSERT(error == 0 || error == ENOMEM);
4632 if (stp->sd_wputdatafunc != NULL) {
4633 newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
4634 NULL, NULL, NULL);
4635 if (newmp == NULL) {
4636 /* The caller will free mp */
4637 return (ECOMM);
4639 mp = newmp;
4641 tcp_wput(tcp_wq, mp);
4643 wflag |= NOINTR;
4645 if (uiop->uio_resid == 0) { /* No more data; we're done */
4646 ASSERT(error == 0);
4647 break;
4648 } else if (error == ENOMEM || !canput(tcp_wq) || (stp->sd_flag &
4649 (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))) {
4650 slow_send:
4652 * We were able to send down partial data using
4653 * the direct call interface, but are now relying
4654 * on strwrite() to handle the non-fastpath cases.
4655 * If the socket is blocking we will sleep in
4656 * strwaitq() until write is permitted, otherwise,
4657 * we will need to return the amount of bytes
4658 * written so far back to the app. This is the
4659 * reason why we pass NOINTR flag to strwrite()
4660 * for non-blocking socket, because we don't want
4661 * to return EAGAIN when portion of the user data
4662 * has actually been sent down.
4664 return (strwrite_common(SOTOV(so), uiop, cr, wflag));
4667 return (0);
4671 * Update sti_faddr by asking the transport (unless AF_UNIX).
4673 /* ARGSUSED */
4675 sotpi_getpeername(struct sonode *so, struct sockaddr *name, socklen_t *namelen,
4676 boolean_t accept, struct cred *cr)
4678 struct strbuf strbuf;
4679 int error = 0, res;
4680 void *addr;
4681 t_uscalar_t addrlen;
4682 k_sigset_t smask;
4683 sotpi_info_t *sti = SOTOTPI(so);
4685 dprintso(so, 1, ("sotpi_getpeername(%p) %s\n",
4686 (void *)so, pr_state(so->so_state, so->so_mode)));
4688 ASSERT(*namelen > 0);
4689 mutex_enter(&so->so_lock);
4690 so_lock_single(so); /* Set SOLOCKED */
4692 if (accept) {
4693 bcopy(sti->sti_faddr_sa, name,
4694 MIN(*namelen, sti->sti_faddr_len));
4695 *namelen = sti->sti_faddr_noxlate ? 0: sti->sti_faddr_len;
4696 goto done;
4699 if (!(so->so_state & SS_ISCONNECTED)) {
4700 error = ENOTCONN;
4701 goto done;
4703 /* Added this check for X/Open */
4704 if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
4705 error = EINVAL;
4706 if (xnet_check_print) {
4707 printf("sockfs: X/Open getpeername check => EINVAL\n");
4709 goto done;
4712 if (sti->sti_faddr_valid) {
4713 bcopy(sti->sti_faddr_sa, name,
4714 MIN(*namelen, sti->sti_faddr_len));
4715 *namelen = sti->sti_faddr_noxlate ? 0: sti->sti_faddr_len;
4716 goto done;
4719 #ifdef DEBUG
4720 dprintso(so, 1, ("sotpi_getpeername (local): %s\n",
4721 pr_addr(so->so_family, sti->sti_faddr_sa,
4722 (t_uscalar_t)sti->sti_faddr_len)));
4723 #endif /* DEBUG */
4725 if (so->so_family == AF_UNIX) {
4726 /* Transport has different name space - return local info */
4727 if (sti->sti_faddr_noxlate)
4728 *namelen = 0;
4729 error = 0;
4730 goto done;
4733 ASSERT(so->so_family != AF_UNIX && sti->sti_faddr_noxlate == 0);
4735 ASSERT(sti->sti_faddr_sa);
4736 /* Allocate local buffer to use with ioctl */
4737 addrlen = (t_uscalar_t)sti->sti_faddr_maxlen;
4738 mutex_exit(&so->so_lock);
4739 addr = kmem_alloc(addrlen, KM_SLEEP);
4742 * Issue TI_GETPEERNAME with signals masked.
4743 * Put the result in sti_faddr_sa so that getpeername works after
4744 * a shutdown(output).
4745 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
4746 * back to the socket.
4748 strbuf.buf = addr;
4749 strbuf.maxlen = addrlen;
4750 strbuf.len = 0;
4752 sigintr(&smask, 0);
4753 res = 0;
4754 ASSERT(cr);
4755 error = strioctl(SOTOV(so), TI_GETPEERNAME, (intptr_t)&strbuf,
4756 0, K_TO_K, cr, &res);
4757 sigunintr(&smask);
4759 mutex_enter(&so->so_lock);
4761 * If there is an error record the error in so_error put don't fail
4762 * the getpeername. Instead fallback on the recorded
4763 * sti->sti_faddr_sa.
4765 if (error) {
4767 * Various stream head errors can be returned to the ioctl.
4768 * However, it is impossible to determine which ones of
4769 * these are really socket level errors that were incorrectly
4770 * consumed by the ioctl. Thus this code silently ignores the
4771 * error - to code explicitly does not reinstate the error
4772 * using soseterror().
4773 * Experiments have shows that at least this set of
4774 * errors are reported and should not be reinstated on the
4775 * socket:
4776 * EINVAL E.g. if an I_LINK was in effect when
4777 * getpeername was called.
4778 * EPIPE The ioctl error semantics prefer the write
4779 * side error over the read side error.
4780 * ENOTCONN The transport just got disconnected but
4781 * sockfs had not yet seen the T_DISCON_IND
4782 * when issuing the ioctl.
4784 error = 0;
4785 } else if (res == 0 && strbuf.len > 0 &&
4786 (so->so_state & SS_ISCONNECTED)) {
4787 ASSERT(strbuf.len <= (int)sti->sti_faddr_maxlen);
4788 sti->sti_faddr_len = (socklen_t)strbuf.len;
4789 bcopy(addr, sti->sti_faddr_sa, sti->sti_faddr_len);
4790 sti->sti_faddr_valid = 1;
4792 bcopy(addr, name, MIN(*namelen, sti->sti_faddr_len));
4793 *namelen = sti->sti_faddr_len;
4795 kmem_free(addr, addrlen);
4796 #ifdef DEBUG
4797 dprintso(so, 1, ("sotpi_getpeername (tp): %s\n",
4798 pr_addr(so->so_family, sti->sti_faddr_sa,
4799 (t_uscalar_t)sti->sti_faddr_len)));
4800 #endif /* DEBUG */
4801 done:
4802 so_unlock_single(so, SOLOCKED);
4803 mutex_exit(&so->so_lock);
4804 return (error);
4808 * Update sti_laddr by asking the transport (unless AF_UNIX).
4811 sotpi_getsockname(struct sonode *so, struct sockaddr *name, socklen_t *namelen,
4812 struct cred *cr)
4814 struct strbuf strbuf;
4815 int error = 0, res;
4816 void *addr;
4817 t_uscalar_t addrlen;
4818 k_sigset_t smask;
4819 sotpi_info_t *sti = SOTOTPI(so);
4821 dprintso(so, 1, ("sotpi_getsockname(%p) %s\n",
4822 (void *)so, pr_state(so->so_state, so->so_mode)));
4824 ASSERT(*namelen > 0);
4825 mutex_enter(&so->so_lock);
4826 so_lock_single(so); /* Set SOLOCKED */
4828 #ifdef DEBUG
4830 dprintso(so, 1, ("sotpi_getsockname (local): %s\n",
4831 pr_addr(so->so_family, sti->sti_laddr_sa,
4832 (t_uscalar_t)sti->sti_laddr_len)));
4833 #endif /* DEBUG */
4834 if (sti->sti_laddr_valid) {
4835 bcopy(sti->sti_laddr_sa, name,
4836 MIN(*namelen, sti->sti_laddr_len));
4837 *namelen = sti->sti_laddr_len;
4838 goto done;
4841 if (so->so_family == AF_UNIX) {
4843 * Transport has different name space - return local info. If we
4844 * have enough space, let consumers know the family.
4846 if (*namelen >= sizeof (sa_family_t)) {
4847 name->sa_family = AF_UNIX;
4848 *namelen = sizeof (sa_family_t);
4849 } else {
4850 *namelen = 0;
4852 error = 0;
4853 goto done;
4855 if (!(so->so_state & SS_ISBOUND)) {
4856 /* If not bound, then nothing to return. */
4857 error = 0;
4858 goto done;
4861 /* Allocate local buffer to use with ioctl */
4862 addrlen = (t_uscalar_t)sti->sti_laddr_maxlen;
4863 mutex_exit(&so->so_lock);
4864 addr = kmem_alloc(addrlen, KM_SLEEP);
4867 * Issue TI_GETMYNAME with signals masked.
4868 * Put the result in sti_laddr_sa so that getsockname works after
4869 * a shutdown(output).
4870 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
4871 * back to the socket.
4873 strbuf.buf = addr;
4874 strbuf.maxlen = addrlen;
4875 strbuf.len = 0;
4877 sigintr(&smask, 0);
4878 res = 0;
4879 ASSERT(cr);
4880 error = strioctl(SOTOV(so), TI_GETMYNAME, (intptr_t)&strbuf,
4881 0, K_TO_K, cr, &res);
4882 sigunintr(&smask);
4884 mutex_enter(&so->so_lock);
4886 * If there is an error record the error in so_error put don't fail
4887 * the getsockname. Instead fallback on the recorded
4888 * sti->sti_laddr_sa.
4890 if (error) {
4892 * Various stream head errors can be returned to the ioctl.
4893 * However, it is impossible to determine which ones of
4894 * these are really socket level errors that were incorrectly
4895 * consumed by the ioctl. Thus this code silently ignores the
4896 * error - to code explicitly does not reinstate the error
4897 * using soseterror().
4898 * Experiments have shows that at least this set of
4899 * errors are reported and should not be reinstated on the
4900 * socket:
4901 * EINVAL E.g. if an I_LINK was in effect when
4902 * getsockname was called.
4903 * EPIPE The ioctl error semantics prefer the write
4904 * side error over the read side error.
4906 error = 0;
4907 } else if (res == 0 && strbuf.len > 0 &&
4908 (so->so_state & SS_ISBOUND)) {
4909 ASSERT(strbuf.len <= (int)sti->sti_laddr_maxlen);
4910 sti->sti_laddr_len = (socklen_t)strbuf.len;
4911 bcopy(addr, sti->sti_laddr_sa, sti->sti_laddr_len);
4912 sti->sti_laddr_valid = 1;
4914 bcopy(addr, name, MIN(sti->sti_laddr_len, *namelen));
4915 *namelen = sti->sti_laddr_len;
4917 kmem_free(addr, addrlen);
4918 #ifdef DEBUG
4919 dprintso(so, 1, ("sotpi_getsockname (tp): %s\n",
4920 pr_addr(so->so_family, sti->sti_laddr_sa,
4921 (t_uscalar_t)sti->sti_laddr_len)));
4922 #endif /* DEBUG */
4923 done:
4924 so_unlock_single(so, SOLOCKED);
4925 mutex_exit(&so->so_lock);
4926 return (error);
4930 * Get socket options. For SOL_SOCKET options some options are handled
4931 * by the sockfs while others use the value recorded in the sonode as a
4932 * fallback should the T_SVR4_OPTMGMT_REQ fail.
4934 * On the return most *optlenp bytes are copied to optval.
4936 /* ARGSUSED */
4938 sotpi_getsockopt(struct sonode *so, int level, int option_name,
4939 void *optval, socklen_t *optlenp, int flags, struct cred *cr)
4941 struct T_optmgmt_req optmgmt_req;
4942 struct T_optmgmt_ack *optmgmt_ack;
4943 struct opthdr oh;
4944 struct opthdr *opt_res;
4945 mblk_t *mp = NULL;
4946 int error = 0;
4947 void *option = NULL; /* Set if fallback value */
4948 t_uscalar_t maxlen = *optlenp;
4949 t_uscalar_t len;
4950 uint32_t value;
4951 struct timeval tmo_val; /* used for SO_RCVTIMEO, SO_SNDTIMEO */
4952 struct timeval32 tmo_val32;
4953 struct so_snd_bufinfo snd_bufinfo; /* used for zero copy */
4955 dprintso(so, 1, ("sotpi_getsockopt(%p, 0x%x, 0x%x, %p, %p) %s\n",
4956 (void *)so, level, option_name, optval, (void *)optlenp,
4957 pr_state(so->so_state, so->so_mode)));
4959 mutex_enter(&so->so_lock);
4960 so_lock_single(so); /* Set SOLOCKED */
4963 * Check for SOL_SOCKET options.
4964 * Certain SOL_SOCKET options are returned directly whereas
4965 * others only provide a default (fallback) value should
4966 * the T_SVR4_OPTMGMT_REQ fail.
4968 if (level == SOL_SOCKET) {
4969 /* Check parameters */
4970 switch (option_name) {
4971 case SO_TYPE:
4972 case SO_ERROR:
4973 case SO_DEBUG:
4974 case SO_ACCEPTCONN:
4975 case SO_REUSEADDR:
4976 case SO_KEEPALIVE:
4977 case SO_DONTROUTE:
4978 case SO_BROADCAST:
4979 case SO_USELOOPBACK:
4980 case SO_OOBINLINE:
4981 case SO_SNDBUF:
4982 case SO_RCVBUF:
4983 #ifdef notyet
4984 case SO_SNDLOWAT:
4985 case SO_RCVLOWAT:
4986 #endif /* notyet */
4987 case SO_DOMAIN:
4988 case SO_DGRAM_ERRIND:
4989 if (maxlen < (t_uscalar_t)sizeof (int32_t)) {
4990 error = EINVAL;
4991 eprintsoline(so, error);
4992 goto done2;
4994 break;
4995 case SO_RCVTIMEO:
4996 case SO_SNDTIMEO:
4997 if (get_udatamodel() == DATAMODEL_NONE ||
4998 get_udatamodel() == DATAMODEL_NATIVE) {
4999 if (maxlen < sizeof (struct timeval)) {
5000 error = EINVAL;
5001 eprintsoline(so, error);
5002 goto done2;
5004 } else {
5005 if (maxlen < sizeof (struct timeval32)) {
5006 error = EINVAL;
5007 eprintsoline(so, error);
5008 goto done2;
5012 break;
5013 case SO_LINGER:
5014 if (maxlen < (t_uscalar_t)sizeof (struct linger)) {
5015 error = EINVAL;
5016 eprintsoline(so, error);
5017 goto done2;
5019 break;
5020 case SO_SND_BUFINFO:
5021 if (maxlen < (t_uscalar_t)
5022 sizeof (struct so_snd_bufinfo)) {
5023 error = EINVAL;
5024 eprintsoline(so, error);
5025 goto done2;
5027 break;
5030 len = (t_uscalar_t)sizeof (uint32_t); /* Default */
5032 switch (option_name) {
5033 case SO_TYPE:
5034 value = so->so_type;
5035 option = &value;
5036 goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5038 case SO_ERROR:
5039 value = sogeterr(so, B_TRUE);
5040 option = &value;
5041 goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5043 case SO_ACCEPTCONN:
5044 if (so->so_state & SS_ACCEPTCONN)
5045 value = SO_ACCEPTCONN;
5046 else
5047 value = 0;
5048 #ifdef DEBUG
5049 if (value) {
5050 dprintso(so, 1,
5051 ("sotpi_getsockopt: 0x%x is set\n",
5052 option_name));
5053 } else {
5054 dprintso(so, 1,
5055 ("sotpi_getsockopt: 0x%x not set\n",
5056 option_name));
5058 #endif /* DEBUG */
5059 option = &value;
5060 goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5062 case SO_DEBUG:
5063 case SO_REUSEADDR:
5064 case SO_KEEPALIVE:
5065 case SO_DONTROUTE:
5066 case SO_BROADCAST:
5067 case SO_USELOOPBACK:
5068 case SO_OOBINLINE:
5069 case SO_DGRAM_ERRIND:
5070 value = (so->so_options & option_name);
5071 #ifdef DEBUG
5072 if (value) {
5073 dprintso(so, 1,
5074 ("sotpi_getsockopt: 0x%x is set\n",
5075 option_name));
5076 } else {
5077 dprintso(so, 1,
5078 ("sotpi_getsockopt: 0x%x not set\n",
5079 option_name));
5081 #endif /* DEBUG */
5082 option = &value;
5083 goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5086 * The following options are only returned by sockfs when the
5087 * T_SVR4_OPTMGMT_REQ fails.
5089 case SO_LINGER:
5090 option = &so->so_linger;
5091 len = (t_uscalar_t)sizeof (struct linger);
5092 break;
5093 case SO_SNDBUF: {
5094 ssize_t lvalue;
5097 * If the option has not been set then get a default
5098 * value from the read queue. This value is
5099 * returned if the transport fails
5100 * the T_SVR4_OPTMGMT_REQ.
5102 lvalue = so->so_sndbuf;
5103 if (lvalue == 0) {
5104 mutex_exit(&so->so_lock);
5105 (void) strqget(strvp2wq(SOTOV(so))->q_next,
5106 QHIWAT, 0, &lvalue);
5107 mutex_enter(&so->so_lock);
5108 dprintso(so, 1,
5109 ("got SO_SNDBUF %ld from q\n", lvalue));
5111 value = (int)lvalue;
5112 option = &value;
5113 len = (t_uscalar_t)sizeof (so->so_sndbuf);
5114 break;
5116 case SO_RCVBUF: {
5117 ssize_t lvalue;
5120 * If the option has not been set then get a default
5121 * value from the read queue. This value is
5122 * returned if the transport fails
5123 * the T_SVR4_OPTMGMT_REQ.
5125 lvalue = so->so_rcvbuf;
5126 if (lvalue == 0) {
5127 mutex_exit(&so->so_lock);
5128 (void) strqget(RD(strvp2wq(SOTOV(so))),
5129 QHIWAT, 0, &lvalue);
5130 mutex_enter(&so->so_lock);
5131 dprintso(so, 1,
5132 ("got SO_RCVBUF %ld from q\n", lvalue));
5134 value = (int)lvalue;
5135 option = &value;
5136 len = (t_uscalar_t)sizeof (so->so_rcvbuf);
5137 break;
5139 case SO_DOMAIN:
5140 value = so->so_family;
5141 option = &value;
5142 goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5144 #ifdef notyet
5146 * We do not implement the semantics of these options
5147 * thus we shouldn't implement the options either.
5149 case SO_SNDLOWAT:
5150 value = so->so_sndlowat;
5151 option = &value;
5152 break;
5153 case SO_RCVLOWAT:
5154 value = so->so_rcvlowat;
5155 option = &value;
5156 break;
5157 #endif /* notyet */
5158 case SO_SNDTIMEO:
5159 case SO_RCVTIMEO: {
5160 clock_t val;
5162 if (option_name == SO_RCVTIMEO)
5163 val = drv_hztousec(so->so_rcvtimeo);
5164 else
5165 val = drv_hztousec(so->so_sndtimeo);
5166 tmo_val.tv_sec = val / (1000 * 1000);
5167 tmo_val.tv_usec = val % (1000 * 1000);
5168 if (get_udatamodel() == DATAMODEL_NONE ||
5169 get_udatamodel() == DATAMODEL_NATIVE) {
5170 option = &tmo_val;
5171 len = sizeof (struct timeval);
5172 } else {
5173 TIMEVAL_TO_TIMEVAL32(&tmo_val32, &tmo_val);
5174 option = &tmo_val32;
5175 len = sizeof (struct timeval32);
5177 break;
5179 case SO_SND_BUFINFO: {
5180 snd_bufinfo.sbi_wroff =
5181 (so->so_proto_props).sopp_wroff;
5182 snd_bufinfo.sbi_maxblk =
5183 (so->so_proto_props).sopp_maxblk;
5184 snd_bufinfo.sbi_maxpsz =
5185 (so->so_proto_props).sopp_maxpsz;
5186 snd_bufinfo.sbi_tail =
5187 (so->so_proto_props).sopp_tail;
5188 option = &snd_bufinfo;
5189 len = (t_uscalar_t)sizeof (struct so_snd_bufinfo);
5190 break;
5195 mutex_exit(&so->so_lock);
5197 /* Send request */
5198 optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5199 optmgmt_req.MGMT_flags = T_CHECK;
5200 optmgmt_req.OPT_length = (t_scalar_t)(sizeof (oh) + maxlen);
5201 optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5203 oh.level = level;
5204 oh.name = option_name;
5205 oh.len = maxlen;
5207 mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5208 &oh, sizeof (oh), NULL, maxlen, 0, _ALLOC_SLEEP, cr);
5209 /* Let option management work in the presence of data flow control */
5210 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5211 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5212 mp = NULL;
5213 mutex_enter(&so->so_lock);
5214 if (error) {
5215 eprintsoline(so, error);
5216 goto done2;
5218 error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5219 (t_uscalar_t)(sizeof (*optmgmt_ack) + sizeof (*opt_res)), &mp, 0);
5220 if (error) {
5221 if (option != NULL) {
5222 /* We have a fallback value */
5223 error = 0;
5224 goto copyout;
5226 eprintsoline(so, error);
5227 goto done2;
5229 ASSERT(mp);
5230 optmgmt_ack = (struct T_optmgmt_ack *)mp->b_rptr;
5231 opt_res = (struct opthdr *)sogetoff(mp, optmgmt_ack->OPT_offset,
5232 optmgmt_ack->OPT_length, __TPI_ALIGN_SIZE);
5233 if (opt_res == NULL) {
5234 if (option != NULL) {
5235 /* We have a fallback value */
5236 error = 0;
5237 goto copyout;
5239 error = EPROTO;
5240 eprintsoline(so, error);
5241 goto done;
5243 option = &opt_res[1];
5245 /* check to ensure that the option is within bounds */
5246 if (((uintptr_t)option + opt_res->len < (uintptr_t)option) ||
5247 (uintptr_t)option + opt_res->len > (uintptr_t)mp->b_wptr) {
5248 if (option != NULL) {
5249 /* We have a fallback value */
5250 error = 0;
5251 goto copyout;
5253 error = EPROTO;
5254 eprintsoline(so, error);
5255 goto done;
5258 len = opt_res->len;
5260 copyout: {
5261 t_uscalar_t size = MIN(len, maxlen);
5262 bcopy(option, optval, size);
5263 bcopy(&size, optlenp, sizeof (size));
5265 done:
5266 freemsg(mp);
5267 done2:
5268 so_unlock_single(so, SOLOCKED);
5269 mutex_exit(&so->so_lock);
5271 return (error);
5275 * Set socket options. All options are passed down in a T_SVR4_OPTMGMT_REQ.
5276 * SOL_SOCKET options are also recorded in the sonode. A setsockopt for
5277 * SOL_SOCKET options will not fail just because the T_SVR4_OPTMGMT_REQ fails -
5278 * setsockopt has to work even if the transport does not support the option.
5280 /* ARGSUSED */
5282 sotpi_setsockopt(struct sonode *so, int level, int option_name,
5283 const void *optval, t_uscalar_t optlen, struct cred *cr)
5285 struct T_optmgmt_req optmgmt_req;
5286 struct opthdr oh;
5287 mblk_t *mp;
5288 int error = 0;
5289 boolean_t handled = B_FALSE;
5291 dprintso(so, 1, ("sotpi_setsockopt(%p, 0x%x, 0x%x, %p, %d) %s\n",
5292 (void *)so, level, option_name, optval, optlen,
5293 pr_state(so->so_state, so->so_mode)));
5295 /* X/Open requires this check */
5296 if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
5297 if (xnet_check_print)
5298 printf("sockfs: X/Open setsockopt check => EINVAL\n");
5299 return (EINVAL);
5302 mutex_enter(&so->so_lock);
5303 so_lock_single(so); /* Set SOLOCKED */
5304 mutex_exit(&so->so_lock);
5306 optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5307 optmgmt_req.MGMT_flags = T_NEGOTIATE;
5308 optmgmt_req.OPT_length = (t_scalar_t)sizeof (oh) + optlen;
5309 optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5311 oh.level = level;
5312 oh.name = option_name;
5313 oh.len = optlen;
5315 mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5316 &oh, sizeof (oh), optval, optlen, 0, _ALLOC_SLEEP, cr);
5317 /* Let option management work in the presence of data flow control */
5318 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5319 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5320 mp = NULL;
5321 mutex_enter(&so->so_lock);
5322 if (error) {
5323 eprintsoline(so, error);
5324 goto done2;
5326 error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5327 (t_uscalar_t)sizeof (struct T_optmgmt_ack), &mp, 0);
5328 if (error) {
5329 eprintsoline(so, error);
5330 goto done;
5332 ASSERT(mp);
5333 /* No need to verify T_optmgmt_ack */
5334 freemsg(mp);
5335 done:
5337 * Check for SOL_SOCKET options and record their values.
5338 * If we know about a SOL_SOCKET parameter and the transport
5339 * failed it with TBADOPT or TOUTSTATE (i.e. ENOPROTOOPT or
5340 * EPROTO) we let the setsockopt succeed.
5342 if (level == SOL_SOCKET) {
5343 /* Check parameters */
5344 switch (option_name) {
5345 case SO_DEBUG:
5346 case SO_REUSEADDR:
5347 case SO_KEEPALIVE:
5348 case SO_DONTROUTE:
5349 case SO_BROADCAST:
5350 case SO_USELOOPBACK:
5351 case SO_OOBINLINE:
5352 case SO_SNDBUF:
5353 case SO_RCVBUF:
5354 #ifdef notyet
5355 case SO_SNDLOWAT:
5356 case SO_RCVLOWAT:
5357 #endif /* notyet */
5358 case SO_DGRAM_ERRIND:
5359 if (optlen != (t_uscalar_t)sizeof (int32_t)) {
5360 error = EINVAL;
5361 eprintsoline(so, error);
5362 goto done2;
5364 ASSERT(optval);
5365 handled = B_TRUE;
5366 break;
5367 case SO_SNDTIMEO:
5368 case SO_RCVTIMEO:
5369 if (get_udatamodel() == DATAMODEL_NONE ||
5370 get_udatamodel() == DATAMODEL_NATIVE) {
5371 if (optlen != sizeof (struct timeval)) {
5372 error = EINVAL;
5373 eprintsoline(so, error);
5374 goto done2;
5376 } else {
5377 if (optlen != sizeof (struct timeval32)) {
5378 error = EINVAL;
5379 eprintsoline(so, error);
5380 goto done2;
5383 ASSERT(optval);
5384 handled = B_TRUE;
5385 break;
5386 case SO_LINGER:
5387 if (optlen != (t_uscalar_t)sizeof (struct linger)) {
5388 error = EINVAL;
5389 eprintsoline(so, error);
5390 goto done2;
5392 ASSERT(optval);
5393 handled = B_TRUE;
5394 break;
5397 #define intvalue (*(int32_t *)optval)
5399 switch (option_name) {
5400 case SO_TYPE:
5401 case SO_ERROR:
5402 case SO_ACCEPTCONN:
5403 /* Can't be set */
5404 error = ENOPROTOOPT;
5405 goto done2;
5406 case SO_LINGER: {
5407 struct linger *l = (struct linger *)optval;
5409 so->so_linger.l_linger = l->l_linger;
5410 if (l->l_onoff) {
5411 so->so_linger.l_onoff = SO_LINGER;
5412 so->so_options |= SO_LINGER;
5413 } else {
5414 so->so_linger.l_onoff = 0;
5415 so->so_options &= ~SO_LINGER;
5417 break;
5420 case SO_DEBUG:
5421 #ifdef SOCK_TEST
5422 if (intvalue & 2)
5423 sock_test_timelimit = 10 * hz;
5424 else
5425 sock_test_timelimit = 0;
5427 if (intvalue & 4)
5428 do_useracc = 0;
5429 else
5430 do_useracc = 1;
5431 #endif /* SOCK_TEST */
5432 /* FALLTHRU */
5433 case SO_REUSEADDR:
5434 case SO_KEEPALIVE:
5435 case SO_DONTROUTE:
5436 case SO_BROADCAST:
5437 case SO_USELOOPBACK:
5438 case SO_OOBINLINE:
5439 case SO_DGRAM_ERRIND:
5440 if (intvalue != 0) {
5441 dprintso(so, 1,
5442 ("socket_setsockopt: setting 0x%x\n",
5443 option_name));
5444 so->so_options |= option_name;
5445 } else {
5446 dprintso(so, 1,
5447 ("socket_setsockopt: clearing 0x%x\n",
5448 option_name));
5449 so->so_options &= ~option_name;
5451 break;
5453 * The following options are only returned by us when the
5454 * transport layer fails.
5455 * XXX XPG 4.2 applications retrieve SO_RCVBUF from sockfs
5456 * since the transport might adjust the value and not
5457 * return exactly what was set by the application.
5459 case SO_SNDBUF:
5460 so->so_sndbuf = intvalue;
5461 break;
5462 case SO_RCVBUF:
5463 so->so_rcvbuf = intvalue;
5464 break;
5465 case SO_RCVPSH:
5466 so->so_rcv_timer_interval = intvalue;
5467 break;
5468 #ifdef notyet
5470 * We do not implement the semantics of these options
5471 * thus we shouldn't implement the options either.
5473 case SO_SNDLOWAT:
5474 so->so_sndlowat = intvalue;
5475 break;
5476 case SO_RCVLOWAT:
5477 so->so_rcvlowat = intvalue;
5478 break;
5479 #endif /* notyet */
5480 case SO_SNDTIMEO:
5481 case SO_RCVTIMEO: {
5482 struct timeval tl;
5483 clock_t val;
5485 if (get_udatamodel() == DATAMODEL_NONE ||
5486 get_udatamodel() == DATAMODEL_NATIVE)
5487 bcopy(&tl, (struct timeval *)optval,
5488 sizeof (struct timeval));
5489 else
5490 TIMEVAL32_TO_TIMEVAL(&tl,
5491 (struct timeval32 *)optval);
5492 val = tl.tv_sec * 1000 * 1000 + tl.tv_usec;
5493 if (option_name == SO_RCVTIMEO)
5494 so->so_rcvtimeo = drv_usectohz(val);
5495 else
5496 so->so_sndtimeo = drv_usectohz(val);
5497 break;
5500 #undef intvalue
5502 if (error) {
5503 if ((error == ENOPROTOOPT || error == EPROTO ||
5504 error == EINVAL) && handled) {
5505 dprintso(so, 1,
5506 ("setsockopt: ignoring error %d for 0x%x\n",
5507 error, option_name));
5508 error = 0;
5512 done2:
5513 so_unlock_single(so, SOLOCKED);
5514 mutex_exit(&so->so_lock);
5515 return (error);
5519 * sotpi_close() is called when the last open reference goes away.
5521 /* ARGSUSED */
5523 sotpi_close(struct sonode *so, int flag, struct cred *cr)
5525 struct vnode *vp = SOTOV(so);
5526 dev_t dev;
5527 int error = 0;
5528 sotpi_info_t *sti = SOTOTPI(so);
5530 dprintso(so, 1, ("sotpi_close(%p, %x) %s\n",
5531 (void *)vp, flag, pr_state(so->so_state, so->so_mode)));
5533 dev = sti->sti_dev;
5535 ASSERT(STREAMSTAB(getmajor(dev)));
5537 mutex_enter(&so->so_lock);
5538 so_lock_single(so); /* Set SOLOCKED */
5540 ASSERT(so_verify_oobstate(so));
5542 if (vp->v_stream != NULL) {
5543 vnode_t *ux_vp;
5545 if (so->so_family == AF_UNIX) {
5546 /* Could avoid this when CANTSENDMORE for !dgram */
5547 so_unix_close(so);
5550 mutex_exit(&so->so_lock);
5552 * Disassemble the linkage from the AF_UNIX underlying file
5553 * system vnode to this socket (by atomically clearing
5554 * v_stream in vn_rele_stream) before strclose clears sd_vnode
5555 * and frees the stream head.
5557 if ((ux_vp = sti->sti_ux_bound_vp) != NULL) {
5558 ASSERT(ux_vp->v_stream);
5559 sti->sti_ux_bound_vp = NULL;
5560 vn_rele_stream(ux_vp);
5562 error = strclose(vp, flag, cr);
5563 vp->v_stream = NULL;
5564 mutex_enter(&so->so_lock);
5568 * Flush the T_DISCON_IND on sti_discon_ind_mp.
5570 so_flush_discon_ind(so);
5572 so_unlock_single(so, SOLOCKED);
5573 mutex_exit(&so->so_lock);
5576 * Needed for STREAMs.
5577 * Decrement the device driver's reference count for streams
5578 * opened via the clone dip. The driver was held in clone_open().
5579 * The absence of clone_close() forces this asymmetry.
5581 if (so->so_flag & SOCLONE)
5582 ddi_rele_driver(getmajor(dev));
5584 return (error);
5587 static int
5588 sotpi_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
5589 struct cred *cr, int32_t *rvalp)
5591 struct vnode *vp = SOTOV(so);
5592 sotpi_info_t *sti = SOTOTPI(so);
5593 int error = 0;
5595 dprintso(so, 0, ("sotpi_ioctl: cmd 0x%x, arg 0x%lx, state %s\n",
5596 cmd, arg, pr_state(so->so_state, so->so_mode)));
5598 switch (cmd) {
5599 case SIOCSQPTR:
5601 * SIOCSQPTR is valid only when helper stream is created
5602 * by the protocol.
5604 case _I_INSERT:
5605 case _I_REMOVE:
5607 * Since there's no compelling reason to support these ioctls
5608 * on sockets, and doing so would increase the complexity
5609 * markedly, prevent it.
5611 return (EOPNOTSUPP);
5613 case I_FIND:
5614 case I_LIST:
5615 case I_LOOK:
5616 case I_POP:
5617 case I_PUSH:
5619 * To prevent races and inconsistencies between the actual
5620 * state of the stream and the state according to the sonode,
5621 * we serialize all operations which modify or operate on the
5622 * list of modules on the socket's stream.
5624 mutex_enter(&sti->sti_plumb_lock);
5625 error = socktpi_plumbioctl(vp, cmd, arg, mode, cr, rvalp);
5626 mutex_exit(&sti->sti_plumb_lock);
5627 return (error);
5629 default:
5630 if (!so->so_is_stream)
5631 break;
5634 * The imaginary "sockmod" has been popped; act as a stream.
5636 return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
5639 ASSERT(!so->so_is_stream);
5642 * Process socket-specific ioctls.
5644 switch (cmd) {
5645 case FIONBIO: {
5646 int32_t value;
5648 if (so_copyin((void *)arg, &value, sizeof (int32_t),
5649 (mode & (int)FKIOCTL)))
5650 return (EFAULT);
5652 mutex_enter(&so->so_lock);
5653 if (value) {
5654 so->so_state |= SS_NDELAY;
5655 } else {
5656 so->so_state &= ~SS_NDELAY;
5658 mutex_exit(&so->so_lock);
5659 return (0);
5662 case FIOASYNC: {
5663 int32_t value;
5665 if (so_copyin((void *)arg, &value, sizeof (int32_t),
5666 (mode & (int)FKIOCTL)))
5667 return (EFAULT);
5669 mutex_enter(&so->so_lock);
5671 * SS_ASYNC flag not already set correctly?
5672 * (!value != !(so->so_state & SS_ASYNC))
5673 * but some engineers find that too hard to read.
5675 if (value == 0 && (so->so_state & SS_ASYNC) != 0 ||
5676 value != 0 && (so->so_state & SS_ASYNC) == 0)
5677 error = so_flip_async(so, vp, mode, cr);
5678 mutex_exit(&so->so_lock);
5679 return (error);
5682 case SIOCSPGRP:
5683 case FIOSETOWN: {
5684 pid_t pgrp;
5686 if (so_copyin((void *)arg, &pgrp, sizeof (pid_t),
5687 (mode & (int)FKIOCTL)))
5688 return (EFAULT);
5690 mutex_enter(&so->so_lock);
5691 dprintso(so, 1, ("setown: new %d old %d\n", pgrp, so->so_pgrp));
5692 /* Any change? */
5693 if (pgrp != so->so_pgrp)
5694 error = so_set_siggrp(so, vp, pgrp, mode, cr);
5695 mutex_exit(&so->so_lock);
5696 return (error);
5698 case SIOCGPGRP:
5699 case FIOGETOWN:
5700 if (so_copyout(&so->so_pgrp, (void *)arg,
5701 sizeof (pid_t), (mode & (int)FKIOCTL)))
5702 return (EFAULT);
5703 return (0);
5705 case SIOCATMARK: {
5706 int retval;
5707 uint_t so_state;
5710 * strwaitmark has a finite timeout after which it
5711 * returns -1 if the mark state is undetermined.
5712 * In order to avoid any race between the mark state
5713 * in sockfs and the mark state in the stream head this
5714 * routine loops until the mark state can be determined
5715 * (or the urgent data indication has been removed by some
5716 * other thread).
5718 do {
5719 mutex_enter(&so->so_lock);
5720 so_state = so->so_state;
5721 mutex_exit(&so->so_lock);
5722 if (so_state & SS_RCVATMARK) {
5723 retval = 1;
5724 } else if (!(so_state & SS_OOBPEND)) {
5726 * No SIGURG has been generated -- there is no
5727 * pending or present urgent data. Thus can't
5728 * possibly be at the mark.
5730 retval = 0;
5731 } else {
5733 * Have the stream head wait until there is
5734 * either some messages on the read queue, or
5735 * STRATMARK or STRNOTATMARK gets set. The
5736 * STRNOTATMARK flag is used so that the
5737 * transport can send up a MSGNOTMARKNEXT
5738 * M_DATA to indicate that it is not
5739 * at the mark and additional data is not about
5740 * to be sent upstream.
5742 * If the mark state is undetermined this will
5743 * return -1 and we will loop rechecking the
5744 * socket state.
5746 retval = strwaitmark(vp);
5748 } while (retval == -1);
5750 if (so_copyout(&retval, (void *)arg, sizeof (int),
5751 (mode & (int)FKIOCTL)))
5752 return (EFAULT);
5753 return (0);
5756 case I_FDINSERT:
5757 case I_SENDFD:
5758 case I_RECVFD:
5759 case I_ATMARK:
5760 case _SIOCSOCKFALLBACK:
5762 * These ioctls do not apply to sockets. I_FDINSERT can be
5763 * used to send M_PROTO messages without modifying the socket
5764 * state. I_SENDFD/RECVFD should not be used for socket file
5765 * descriptor passing since they assume a twisted stream.
5766 * SIOCATMARK must be used instead of I_ATMARK.
5768 * _SIOCSOCKFALLBACK from an application should never be
5769 * processed. It is only generated by socktpi_open() or
5770 * in response to I_POP or I_PUSH.
5772 #ifdef DEBUG
5773 zcmn_err(getzoneid(), CE_WARN,
5774 "Unsupported STREAMS ioctl 0x%x on socket. "
5775 "Pid = %d\n", cmd, curproc->p_pid);
5776 #endif /* DEBUG */
5777 return (EOPNOTSUPP);
5779 case _I_GETPEERCRED:
5780 if ((mode & FKIOCTL) == 0)
5781 return (EINVAL);
5783 mutex_enter(&so->so_lock);
5784 if ((so->so_mode & SM_CONNREQUIRED) == 0) {
5785 error = ENOTSUP;
5786 } else if ((so->so_state & SS_ISCONNECTED) == 0) {
5787 error = ENOTCONN;
5788 } else if (so->so_peercred != NULL) {
5789 k_peercred_t *kp = (k_peercred_t *)arg;
5790 kp->pc_cr = so->so_peercred;
5791 kp->pc_cpid = so->so_cpid;
5792 crhold(so->so_peercred);
5793 } else {
5794 error = EINVAL;
5796 mutex_exit(&so->so_lock);
5797 return (error);
5799 default:
5801 * Do the higher-order bits of the ioctl cmd indicate
5802 * that it is an I_* streams ioctl?
5804 if ((cmd & 0xffffff00U) == STR &&
5805 !so->so_is_stream) {
5806 #ifdef DEBUG
5807 zcmn_err(getzoneid(), CE_WARN,
5808 "Unsupported STREAMS ioctl 0x%x on socket. "
5809 "Pid = %d\n", cmd, curproc->p_pid);
5810 #endif /* DEBUG */
5811 return (EOPNOTSUPP);
5813 return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
5818 * Handle plumbing-related ioctls.
5820 static int
5821 socktpi_plumbioctl(struct vnode *vp, int cmd, intptr_t arg, int mode,
5822 struct cred *cr, int32_t *rvalp)
5824 static const char sockmod_name[] = "sockmod";
5825 struct sonode *so = VTOSO(vp);
5826 char mname[FMNAMESZ + 1];
5827 int error;
5828 sotpi_info_t *sti = SOTOTPI(so);
5830 ASSERT(MUTEX_HELD(&sti->sti_plumb_lock));
5832 if (!so->so_is_stream)
5833 return (EOPNOTSUPP);
5835 if (so->so_is_stream) {
5837 * The imaginary "sockmod" has been popped - act as a stream.
5838 * If this is a push of sockmod then change back to a socket.
5840 if (cmd == I_PUSH) {
5841 error = ((mode & FKIOCTL) ? copystr : copyinstr)(
5842 (void *)arg, mname, sizeof (mname), NULL);
5844 if (error == 0 && strcmp(mname, sockmod_name) == 0) {
5845 dprintso(so, 0, ("socktpi_ioctl: going to "
5846 "socket version\n"));
5847 so_stream2sock(so);
5848 return (0);
5851 return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
5854 switch (cmd) {
5855 case I_PUSH:
5856 if (sti->sti_direct) {
5857 mutex_enter(&so->so_lock);
5858 so_lock_single(so);
5859 mutex_exit(&so->so_lock);
5861 error = strioctl(vp, _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
5862 cr, rvalp);
5864 mutex_enter(&so->so_lock);
5865 if (error == 0)
5866 sti->sti_direct = 0;
5867 so_unlock_single(so, SOLOCKED);
5868 mutex_exit(&so->so_lock);
5870 if (error != 0)
5871 return (error);
5874 error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
5875 if (error == 0)
5876 sti->sti_pushcnt++;
5877 return (error);
5879 case I_POP:
5880 if (sti->sti_pushcnt == 0) {
5881 /* Emulate sockmod being popped */
5882 dprintso(so, 0,
5883 ("socktpi_ioctl: going to STREAMS version\n"));
5884 return (so_sock2stream(so));
5887 error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
5888 if (error == 0)
5889 sti->sti_pushcnt--;
5890 return (error);
5892 case I_LIST: {
5893 struct str_mlist *kmlistp, *umlistp;
5894 struct str_list kstrlist;
5895 ssize_t kstrlistsize;
5896 int i, nmods;
5898 STRUCT_DECL(str_list, ustrlist);
5899 STRUCT_INIT(ustrlist, mode);
5901 if (arg == (intptr_t)NULL) {
5902 error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
5903 if (error == 0)
5904 (*rvalp)++; /* Add one for sockmod */
5905 return (error);
5908 error = so_copyin((void *)arg, STRUCT_BUF(ustrlist),
5909 STRUCT_SIZE(ustrlist), mode & FKIOCTL);
5910 if (error != 0)
5911 return (error);
5913 nmods = STRUCT_FGET(ustrlist, sl_nmods);
5914 if (nmods <= 0)
5915 return (EINVAL);
5917 * Ceiling nmods at nstrpush to prevent someone from
5918 * maliciously consuming lots of kernel memory.
5920 nmods = MIN(nmods, nstrpush);
5922 kstrlistsize = (nmods + 1) * sizeof (struct str_mlist);
5923 kstrlist.sl_nmods = nmods;
5924 kstrlist.sl_modlist = kmem_zalloc(kstrlistsize, KM_SLEEP);
5926 error = strioctl(vp, cmd, (intptr_t)&kstrlist, mode, K_TO_K,
5927 cr, rvalp);
5928 if (error != 0)
5929 goto done;
5932 * Considering the module list as a 0-based array of sl_nmods
5933 * modules, sockmod should conceptually exist at slot
5934 * sti_pushcnt. Insert sockmod at this location by sliding all
5935 * of the module names after so_pushcnt over by one. We know
5936 * that there will be room to do this since we allocated
5937 * sl_modlist with an additional slot.
5939 for (i = kstrlist.sl_nmods; i > sti->sti_pushcnt; i--)
5940 kstrlist.sl_modlist[i] = kstrlist.sl_modlist[i - 1];
5942 (void) strcpy(kstrlist.sl_modlist[i].l_name, sockmod_name);
5943 kstrlist.sl_nmods++;
5946 * Copy all of the entries out to ustrlist.
5948 kmlistp = kstrlist.sl_modlist;
5949 umlistp = STRUCT_FGETP(ustrlist, sl_modlist);
5950 for (i = 0; i < nmods && i < kstrlist.sl_nmods; i++) {
5951 error = so_copyout(kmlistp++, umlistp++,
5952 sizeof (struct str_mlist), mode & FKIOCTL);
5953 if (error != 0)
5954 goto done;
5957 error = so_copyout(&i, (void *)arg, sizeof (int32_t),
5958 mode & FKIOCTL);
5959 if (error == 0)
5960 *rvalp = 0;
5961 done:
5962 kmem_free(kstrlist.sl_modlist, kstrlistsize);
5963 return (error);
5965 case I_LOOK:
5966 if (sti->sti_pushcnt == 0) {
5967 return (so_copyout(sockmod_name, (void *)arg,
5968 sizeof (sockmod_name), mode & FKIOCTL));
5970 return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
5972 case I_FIND:
5973 error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
5974 if (error && error != EINVAL)
5975 return (error);
5977 /* if not found and string was sockmod return 1 */
5978 if (*rvalp == 0 || error == EINVAL) {
5979 error = ((mode & FKIOCTL) ? copystr : copyinstr)(
5980 (void *)arg, mname, sizeof (mname), NULL);
5981 if (error == ENAMETOOLONG)
5982 error = EINVAL;
5984 if (error == 0 && strcmp(mname, sockmod_name) == 0)
5985 *rvalp = 1;
5987 return (error);
5989 default:
5990 panic("socktpi_plumbioctl: unknown ioctl %d", cmd);
5991 break;
5994 return (0);
5998 * Wrapper around the streams poll routine that implements socket poll
5999 * semantics.
6000 * The sockfs never calls pollwakeup itself - the stream head take care
6001 * of all pollwakeups. Since sockfs never holds so_lock when calling the
6002 * stream head there can never be a deadlock due to holding so_lock across
6003 * pollwakeup and acquiring so_lock in this routine.
6005 * However, since the performance of fop_poll is critical we avoid
6006 * acquiring so_lock here. This is based on two assumptions:
6007 * - The poll implementation holds locks to serialize the fop_poll call
6008 * and a pollwakeup for the same pollhead. This ensures that should
6009 * e.g. so_state change during a socktpi_poll call the pollwakeup
6010 * (which strsock_* and strrput conspire to issue) is issued after
6011 * the state change. Thus the pollwakeup will block until fop_poll has
6012 * returned and then wake up poll and have it call fop_poll again.
6013 * - The reading of so_state without holding so_lock does not result in
6014 * stale data that is older than the latest state change that has dropped
6015 * so_lock. This is ensured by the mutex_exit issuing the appropriate
6016 * memory barrier to force the data into the coherency domain.
6018 static int
6019 sotpi_poll(
6020 struct sonode *so,
6021 short events,
6022 int anyyet,
6023 short *reventsp,
6024 struct pollhead **phpp)
6026 short origevents = events;
6027 struct vnode *vp = SOTOV(so);
6028 int error;
6029 int so_state = so->so_state; /* snapshot */
6030 sotpi_info_t *sti = SOTOTPI(so);
6032 dprintso(so, 0, ("socktpi_poll(%p): state %s err %d\n",
6033 (void *)vp, pr_state(so_state, so->so_mode), so->so_error));
6035 ASSERT(vp->v_type == VSOCK);
6036 ASSERT(vp->v_stream != NULL);
6038 if (so->so_is_stream) {
6039 /* The imaginary "sockmod" has been popped - act as a stream */
6040 return (strpoll(vp->v_stream, events, anyyet,
6041 reventsp, phpp));
6044 if (!(so_state & SS_ISCONNECTED) &&
6045 (so->so_mode & SM_CONNREQUIRED)) {
6046 /* Not connected yet - turn off write side events */
6047 events &= ~(POLLOUT|POLLWRBAND);
6050 * Check for errors without calling strpoll if the caller wants them.
6051 * In sockets the errors are represented as input/output events
6052 * and there is no need to ask the stream head for this information.
6054 if (so->so_error != 0 &&
6055 ((POLLIN|POLLRDNORM|POLLOUT) & origevents) != 0) {
6056 *reventsp = (POLLIN|POLLRDNORM|POLLOUT) & origevents;
6057 return (0);
6060 * Ignore M_PROTO only messages such as the T_EXDATA_IND messages.
6061 * These message with only an M_PROTO/M_PCPROTO part and no M_DATA
6062 * will not trigger a POLLIN event with POLLRDDATA set.
6063 * The handling of urgent data (causing POLLRDBAND) is done by
6064 * inspecting SS_OOBPEND below.
6066 events |= POLLRDDATA;
6069 * After shutdown(output) a stream head write error is set.
6070 * However, we should not return output events.
6072 events |= POLLNOERR;
6073 error = strpoll(vp->v_stream, events, anyyet,
6074 reventsp, phpp);
6075 if (error)
6076 return (error);
6078 ASSERT(!(*reventsp & POLLERR));
6081 * Notes on T_CONN_IND handling for sockets.
6083 * If strpoll() returned without events, SR_POLLIN is guaranteed
6084 * to be set, ensuring any subsequent strrput() runs pollwakeup().
6086 * Since the so_lock is not held, soqueueconnind() may have run
6087 * and a T_CONN_IND may be waiting. We now check for any queued
6088 * T_CONN_IND msgs on sti_conn_ind_head and set appropriate events
6089 * to ensure poll returns.
6091 * However:
6092 * If the T_CONN_IND hasn't arrived by the time strpoll() returns,
6093 * when strrput() does run for an arriving M_PROTO with T_CONN_IND
6094 * the following actions will occur; taken together they ensure the
6095 * syscall will return.
6097 * 1. If a socket, soqueueconnind() will queue the T_CONN_IND but if
6098 * the accept() was run on a non-blocking socket sowaitconnind()
6099 * may have already returned EWOULDBLOCK, so not be waiting to
6100 * process the message. Additionally socktpi_poll() has probably
6101 * proceeded past the sti_conn_ind_head check below.
6102 * 2. strrput() runs pollwakeup()->pollnotify()->cv_signal() to wake
6103 * this thread, however that could occur before poll_common()
6104 * has entered cv_wait.
6105 * 3. pollnotify() sets T_POLLWAKE, while holding the pc_lock.
6107 * Before proceeding to cv_wait() in poll_common() for an event,
6108 * poll_common() atomically checks for T_POLLWAKE under the pc_lock,
6109 * and if set, re-calls strpoll() to ensure the late arriving
6110 * T_CONN_IND is recognized, and pollsys() returns.
6113 if (sti->sti_conn_ind_head != NULL)
6114 *reventsp |= (POLLIN|POLLRDNORM) & events;
6116 if (so->so_state & SS_CANTRCVMORE) {
6117 *reventsp |= POLLRDHUP & events;
6119 if (so->so_state & SS_CANTSENDMORE)
6120 *reventsp |= POLLHUP;
6123 if (so->so_state & SS_OOBPEND)
6124 *reventsp |= POLLRDBAND & events;
6126 return (0);
6129 /*ARGSUSED*/
6130 static int
6131 socktpi_constructor(void *buf, void *cdrarg, int kmflags)
6133 sotpi_sonode_t *st = (sotpi_sonode_t *)buf;
6134 int error = 0;
6136 error = sonode_constructor(buf, cdrarg, kmflags);
6137 if (error != 0)
6138 return (error);
6140 error = i_sotpi_info_constructor(&st->st_info);
6141 if (error != 0)
6142 sonode_destructor(buf, cdrarg);
6144 st->st_sonode.so_priv = &st->st_info;
6146 return (error);
6149 /*ARGSUSED1*/
6150 static void
6151 socktpi_destructor(void *buf, void *cdrarg)
6153 sotpi_sonode_t *st = (sotpi_sonode_t *)buf;
6155 ASSERT(st->st_sonode.so_priv == &st->st_info);
6156 st->st_sonode.so_priv = NULL;
6158 i_sotpi_info_destructor(&st->st_info);
6159 sonode_destructor(buf, cdrarg);
6162 static int
6163 socktpi_unix_constructor(void *buf, void *cdrarg, int kmflags)
6165 int retval;
6167 if ((retval = socktpi_constructor(buf, cdrarg, kmflags)) == 0) {
6168 struct sonode *so = (struct sonode *)buf;
6169 sotpi_info_t *sti = SOTOTPI(so);
6171 mutex_enter(&socklist.sl_lock);
6173 sti->sti_next_so = socklist.sl_list;
6174 sti->sti_prev_so = NULL;
6175 if (sti->sti_next_so != NULL)
6176 SOTOTPI(sti->sti_next_so)->sti_prev_so = so;
6177 socklist.sl_list = so;
6179 mutex_exit(&socklist.sl_lock);
6182 return (retval);
6185 static void
6186 socktpi_unix_destructor(void *buf, void *cdrarg)
6188 struct sonode *so = (struct sonode *)buf;
6189 sotpi_info_t *sti = SOTOTPI(so);
6191 mutex_enter(&socklist.sl_lock);
6193 if (sti->sti_next_so != NULL)
6194 SOTOTPI(sti->sti_next_so)->sti_prev_so = sti->sti_prev_so;
6195 if (sti->sti_prev_so != NULL)
6196 SOTOTPI(sti->sti_prev_so)->sti_next_so = sti->sti_next_so;
6197 else
6198 socklist.sl_list = sti->sti_next_so;
6200 mutex_exit(&socklist.sl_lock);
6202 socktpi_destructor(buf, cdrarg);
6206 socktpi_init(void)
6209 * Create sonode caches. We create a special one for AF_UNIX so
6210 * that we can track them for netstat(1m).
6212 socktpi_cache = kmem_cache_create("socktpi_cache",
6213 sizeof (struct sotpi_sonode), 0, socktpi_constructor,
6214 socktpi_destructor, NULL, NULL, NULL, 0);
6216 socktpi_unix_cache = kmem_cache_create("socktpi_unix_cache",
6217 sizeof (struct sotpi_sonode), 0, socktpi_unix_constructor,
6218 socktpi_unix_destructor, NULL, NULL, NULL, 0);
6220 return (0);
6224 * Given a non-TPI sonode, allocate and prep it to be ready for TPI.
6226 * Caller must still update state and mode using sotpi_update_state().
6229 sotpi_convert_sonode(struct sonode *so, struct sockparams *newsp,
6230 boolean_t *direct, queue_t **qp, struct cred *cr)
6232 sotpi_info_t *sti;
6233 struct sockparams *origsp = so->so_sockparams;
6234 sock_lower_handle_t handle = so->so_proto_handle;
6235 struct stdata *stp;
6236 struct vnode *vp;
6237 queue_t *q;
6238 int error = 0;
6240 ASSERT((so->so_state & (SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) ==
6241 SS_FALLBACK_PENDING);
6242 ASSERT(SOCK_IS_NONSTR(so));
6244 *qp = NULL;
6245 *direct = B_FALSE;
6246 so->so_sockparams = newsp;
6248 * Allocate and initalize fields required by TPI.
6250 (void) sotpi_info_create(so, KM_SLEEP);
6251 sotpi_info_init(so);
6253 if ((error = sotpi_init(so, NULL, cr, SO_FALLBACK)) != 0) {
6254 sotpi_info_fini(so);
6255 sotpi_info_destroy(so);
6256 return (error);
6258 ASSERT(handle == so->so_proto_handle);
6259 sti = SOTOTPI(so);
6260 if (sti->sti_direct != 0)
6261 *direct = B_TRUE;
6264 * Keep the original sp around so we can properly dispose of the
6265 * sonode when the socket is being closed.
6267 sti->sti_orig_sp = origsp;
6269 so_basic_strinit(so); /* skips the T_CAPABILITY_REQ */
6270 so_alloc_addr(so, so->so_max_addr_len);
6273 * If the application has done a SIOCSPGRP, make sure the
6274 * STREAM head is aware. This needs to take place before
6275 * the protocol start sending up messages. Otherwise we
6276 * might miss to generate SIGPOLL.
6278 * It is possible that the application will receive duplicate
6279 * signals if some were already generated for either data or
6280 * connection indications.
6282 if (so->so_pgrp != 0) {
6283 if (so_set_events(so, so->so_vnode, cr) != 0)
6284 so->so_pgrp = 0;
6288 * Determine which queue to use.
6290 vp = SOTOV(so);
6291 stp = vp->v_stream;
6292 ASSERT(stp != NULL);
6293 q = stp->sd_wrq->q_next;
6296 * Skip any modules that may have been auto pushed when the device
6297 * was opened
6299 while (q->q_next != NULL)
6300 q = q->q_next;
6301 *qp = _RD(q);
6303 /* This is now a STREAMS sockets */
6304 so->so_not_str = B_FALSE;
6306 return (error);
6310 * Revert a TPI sonode. It is only allowed to revert the sonode during
6311 * the fallback process.
6313 void
6314 sotpi_revert_sonode(struct sonode *so, struct cred *cr)
6316 vnode_t *vp = SOTOV(so);
6318 ASSERT((so->so_state & (SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) ==
6319 SS_FALLBACK_PENDING);
6320 ASSERT(!SOCK_IS_NONSTR(so));
6321 ASSERT(vp->v_stream != NULL);
6323 strclean(vp);
6324 (void) strclose(vp, FREAD|FWRITE|SO_FALLBACK, cr);
6327 * Restore the original sockparams. The caller is responsible for
6328 * dropping the ref to the new sp.
6330 so->so_sockparams = SOTOTPI(so)->sti_orig_sp;
6332 sotpi_info_fini(so);
6333 sotpi_info_destroy(so);
6335 /* This is no longer a STREAMS sockets */
6336 so->so_not_str = B_TRUE;
6339 void
6340 sotpi_update_state(struct sonode *so, struct T_capability_ack *tcap,
6341 struct sockaddr *laddr, socklen_t laddrlen, struct sockaddr *faddr,
6342 socklen_t faddrlen, short opts)
6344 sotpi_info_t *sti = SOTOTPI(so);
6346 so_proc_tcapability_ack(so, tcap);
6348 so->so_options |= opts;
6351 * Determine whether the foreign and local address are valid
6353 if (laddrlen != 0) {
6354 ASSERT(laddrlen <= sti->sti_laddr_maxlen);
6355 sti->sti_laddr_len = laddrlen;
6356 bcopy(laddr, sti->sti_laddr_sa, laddrlen);
6357 sti->sti_laddr_valid = (so->so_state & SS_ISBOUND);
6360 if (faddrlen != 0) {
6361 ASSERT(faddrlen <= sti->sti_faddr_maxlen);
6362 sti->sti_faddr_len = faddrlen;
6363 bcopy(faddr, sti->sti_faddr_sa, faddrlen);
6364 sti->sti_faddr_valid = (so->so_state & SS_ISCONNECTED);
6370 * Allocate enough space to cache the local and foreign addresses.
6372 void
6373 so_alloc_addr(struct sonode *so, t_uscalar_t maxlen)
6375 sotpi_info_t *sti = SOTOTPI(so);
6377 ASSERT(sti->sti_laddr_sa == NULL && sti->sti_faddr_sa == NULL);
6378 ASSERT(sti->sti_laddr_len == 0 && sti->sti_faddr_len == 0);
6379 sti->sti_laddr_maxlen = sti->sti_faddr_maxlen =
6380 P2ROUNDUP(maxlen, KMEM_ALIGN);
6381 so->so_max_addr_len = sti->sti_laddr_maxlen;
6382 sti->sti_laddr_sa = kmem_alloc(sti->sti_laddr_maxlen * 2, KM_SLEEP);
6383 sti->sti_faddr_sa = (struct sockaddr *)((caddr_t)sti->sti_laddr_sa
6384 + sti->sti_laddr_maxlen);
6386 if (so->so_family == AF_UNIX) {
6388 * Initialize AF_UNIX related fields.
6390 bzero(&sti->sti_ux_laddr, sizeof (sti->sti_ux_laddr));
6391 bzero(&sti->sti_ux_faddr, sizeof (sti->sti_ux_faddr));
6396 sotpi_info_t *
6397 sotpi_sototpi(struct sonode *so)
6399 sotpi_info_t *sti;
6401 ASSERT(so != NULL);
6403 sti = (sotpi_info_t *)so->so_priv;
6405 ASSERT(sti != NULL);
6406 ASSERT(sti->sti_magic == SOTPI_INFO_MAGIC);
6408 return (sti);
6411 static int
6412 i_sotpi_info_constructor(sotpi_info_t *sti)
6414 sti->sti_magic = SOTPI_INFO_MAGIC;
6415 sti->sti_ack_mp = NULL;
6416 sti->sti_discon_ind_mp = NULL;
6417 sti->sti_ux_bound_vp = NULL;
6418 sti->sti_unbind_mp = NULL;
6420 sti->sti_conn_ind_head = NULL;
6421 sti->sti_conn_ind_tail = NULL;
6423 sti->sti_laddr_sa = NULL;
6424 sti->sti_faddr_sa = NULL;
6426 mutex_init(&sti->sti_plumb_lock, NULL, MUTEX_DEFAULT, NULL);
6427 cv_init(&sti->sti_ack_cv, NULL, CV_DEFAULT, NULL);
6429 return (0);
6432 static void
6433 i_sotpi_info_destructor(sotpi_info_t *sti)
6435 ASSERT(sti->sti_magic == SOTPI_INFO_MAGIC);
6436 ASSERT(sti->sti_ack_mp == NULL);
6437 ASSERT(sti->sti_discon_ind_mp == NULL);
6438 ASSERT(sti->sti_ux_bound_vp == NULL);
6439 ASSERT(sti->sti_unbind_mp == NULL);
6441 ASSERT(sti->sti_conn_ind_head == NULL);
6442 ASSERT(sti->sti_conn_ind_tail == NULL);
6444 ASSERT(sti->sti_laddr_sa == NULL);
6445 ASSERT(sti->sti_faddr_sa == NULL);
6447 mutex_destroy(&sti->sti_plumb_lock);
6448 cv_destroy(&sti->sti_ack_cv);
6452 * Creates and attaches TPI information to the given sonode
6454 static boolean_t
6455 sotpi_info_create(struct sonode *so, int kmflags)
6457 sotpi_info_t *sti;
6459 ASSERT(so->so_priv == NULL);
6461 if ((sti = kmem_zalloc(sizeof (*sti), kmflags)) == NULL)
6462 return (B_FALSE);
6464 if (i_sotpi_info_constructor(sti) != 0) {
6465 kmem_free(sti, sizeof (*sti));
6466 return (B_FALSE);
6469 so->so_priv = (void *)sti;
6470 return (B_TRUE);
6474 * Initializes the TPI information.
6476 static void
6477 sotpi_info_init(struct sonode *so)
6479 struct vnode *vp = SOTOV(so);
6480 sotpi_info_t *sti = SOTOTPI(so);
6481 time_t now;
6483 sti->sti_dev = so->so_sockparams->sp_sdev_info.sd_vnode->v_rdev;
6484 vp->v_rdev = sti->sti_dev;
6486 sti->sti_orig_sp = NULL;
6488 sti->sti_pushcnt = 0;
6490 now = gethrestime_sec();
6491 sti->sti_atime = now;
6492 sti->sti_mtime = now;
6493 sti->sti_ctime = now;
6495 sti->sti_eaddr_mp = NULL;
6496 sti->sti_delayed_error = 0;
6498 sti->sti_provinfo = NULL;
6500 sti->sti_oobcnt = 0;
6501 sti->sti_oobsigcnt = 0;
6503 ASSERT(sti->sti_laddr_sa == NULL && sti->sti_faddr_sa == NULL);
6505 sti->sti_laddr_sa = 0;
6506 sti->sti_faddr_sa = 0;
6507 sti->sti_laddr_maxlen = sti->sti_faddr_maxlen = 0;
6508 sti->sti_laddr_len = sti->sti_faddr_len = 0;
6510 sti->sti_laddr_valid = 0;
6511 sti->sti_faddr_valid = 0;
6512 sti->sti_faddr_noxlate = 0;
6514 sti->sti_direct = 0;
6516 ASSERT(sti->sti_ack_mp == NULL);
6517 ASSERT(sti->sti_ux_bound_vp == NULL);
6518 ASSERT(sti->sti_unbind_mp == NULL);
6520 ASSERT(sti->sti_conn_ind_head == NULL);
6521 ASSERT(sti->sti_conn_ind_tail == NULL);
6525 * Given a sonode, grab the TPI info and free any data.
6527 static void
6528 sotpi_info_fini(struct sonode *so)
6530 sotpi_info_t *sti = SOTOTPI(so);
6531 mblk_t *mp;
6533 ASSERT(sti->sti_discon_ind_mp == NULL);
6535 if ((mp = sti->sti_conn_ind_head) != NULL) {
6536 mblk_t *mp1;
6538 while (mp) {
6539 mp1 = mp->b_next;
6540 mp->b_next = NULL;
6541 freemsg(mp);
6542 mp = mp1;
6544 sti->sti_conn_ind_head = sti->sti_conn_ind_tail = NULL;
6548 * Protect so->so_[lf]addr_sa so that sockfs_snapshot() can safely
6549 * indirect them. It also uses so_count as a validity test.
6551 mutex_enter(&so->so_lock);
6553 if (sti->sti_laddr_sa) {
6554 ASSERT((caddr_t)sti->sti_faddr_sa ==
6555 (caddr_t)sti->sti_laddr_sa + sti->sti_laddr_maxlen);
6556 ASSERT(sti->sti_faddr_maxlen == sti->sti_laddr_maxlen);
6557 sti->sti_laddr_valid = 0;
6558 sti->sti_faddr_valid = 0;
6559 kmem_free(sti->sti_laddr_sa, sti->sti_laddr_maxlen * 2);
6560 sti->sti_laddr_sa = NULL;
6561 sti->sti_laddr_len = sti->sti_laddr_maxlen = 0;
6562 sti->sti_faddr_sa = NULL;
6563 sti->sti_faddr_len = sti->sti_faddr_maxlen = 0;
6566 mutex_exit(&so->so_lock);
6568 if ((mp = sti->sti_eaddr_mp) != NULL) {
6569 freemsg(mp);
6570 sti->sti_eaddr_mp = NULL;
6571 sti->sti_delayed_error = 0;
6574 if ((mp = sti->sti_ack_mp) != NULL) {
6575 freemsg(mp);
6576 sti->sti_ack_mp = NULL;
6579 ASSERT(sti->sti_ux_bound_vp == NULL);
6580 if ((mp = sti->sti_unbind_mp) != NULL) {
6581 freemsg(mp);
6582 sti->sti_unbind_mp = NULL;
6587 * Destroys the TPI information attached to a sonode.
6589 static void
6590 sotpi_info_destroy(struct sonode *so)
6592 sotpi_info_t *sti = SOTOTPI(so);
6594 i_sotpi_info_destructor(sti);
6595 kmem_free(sti, sizeof (*sti));
6597 so->so_priv = NULL;
6601 * Create the global sotpi socket module entry. It will never be freed.
6603 smod_info_t *
6604 sotpi_smod_create(void)
6606 smod_info_t *smodp;
6608 smodp = kmem_zalloc(sizeof (*smodp), KM_SLEEP);
6609 smodp->smod_name = kmem_alloc(sizeof (SOTPI_SMOD_NAME), KM_SLEEP);
6610 (void) strcpy(smodp->smod_name, SOTPI_SMOD_NAME);
6612 * Initialize the smod_refcnt to 1 so it will never be freed.
6614 smodp->smod_refcnt = 1;
6615 smodp->smod_uc_version = SOCK_UC_VERSION;
6616 smodp->smod_dc_version = SOCK_DC_VERSION;
6617 smodp->smod_sock_create_func = &sotpi_create;
6618 smodp->smod_sock_destroy_func = &sotpi_destroy;
6619 return (smodp);