/*
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 */

/*
 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved.
 */
#include <sys/types.h>
#include <sys/param.h>
#include <sys/signal.h>
#include <sys/cmn_err.h>

#include <sys/stropts.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sockio.h>
#include <sys/strsubr.h>
#include <sys/strsun.h>
#include <sys/atomic.h>
#include <sys/tihdr.h>

#include "sockcommon.h"
#include "sockfilter_impl.h"

#include <sys/cmn_err.h>

#ifdef SOCK_TEST
extern int do_useracc;
extern clock_t sock_test_timelimit;
#endif /* SOCK_TEST */
#define MBLK_PULL_LEN 64
uint32_t so_mblk_pull_len = MBLK_PULL_LEN;

boolean_t so_debug_length = B_FALSE;
static boolean_t so_check_length(sonode_t *so);
static int
so_acceptq_dequeue_locked(struct sonode *so, boolean_t dontblock,
    struct sonode **nsop)
{
    struct sonode *nso = NULL;

    *nsop = NULL;
    ASSERT(MUTEX_HELD(&so->so_acceptq_lock));
    while ((nso = list_remove_head(&so->so_acceptq_list)) == NULL) {
        /*
         * No need to check so_error here, because it is not
         * possible for a listening socket to be reset or otherwise
         * disconnected.
         *
         * So now we just need to check if it's ok to wait.
         */
        if (dontblock)
            return (EWOULDBLOCK);
        if (so->so_state & (SS_CLOSING | SS_FALLBACK_PENDING))
            return (EINTR);

        if (cv_wait_sig_swap(&so->so_acceptq_cv,
            &so->so_acceptq_lock) == 0)
            return (EINTR);
    }

    ASSERT(nso != NULL);
    ASSERT(so->so_acceptq_len > 0);
    so->so_acceptq_len--;
    nso->so_listener = NULL;

    *nsop = nso;

    return (0);
}
/*
 * int so_acceptq_dequeue(struct sonode *, boolean_t, struct sonode **)
 *
 * Pulls a connection off of the accept queue.
 *
 * Arguments:
 *   so        - listening socket
 *   dontblock - indicate whether it's ok to sleep if there are no
 *               connections on the queue
 *   nsop      - value-return argument
 *
 * Return values:
 *   0 when a connection is successfully dequeued, in which case nsop
 *   is set to point to the new connection. Upon failure a non-zero
 *   value is returned, and the value of nsop is set to NULL.
 *
 * Note:
 *   so_acceptq_dequeue() may return prematurely if the socket is falling
 *   back to TPI.
 */
int
so_acceptq_dequeue(struct sonode *so, boolean_t dontblock,
    struct sonode **nsop)
{
    int error;

    mutex_enter(&so->so_acceptq_lock);
    error = so_acceptq_dequeue_locked(so, dontblock, nsop);
    mutex_exit(&so->so_acceptq_lock);

    return (error);
}
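/*
 * Illustrative sketch, not part of the original file: a typical caller
 * (e.g. an accept(3SOCKET) implementation) would use so_acceptq_dequeue()
 * roughly as follows; the surrounding names are hypothetical.
 *
 *	struct sonode *nso;
 *	int error;
 *
 *	error = so_acceptq_dequeue(so, dontblock, &nso);
 *	if (error != 0)
 *		return (error);		nso is NULL on failure
 *	... hand nso back to the caller ...
 */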
static void
so_acceptq_flush_impl(struct sonode *so, list_t *list, boolean_t doclose)
{
    struct sonode *nso;

    while ((nso = list_remove_head(list)) != NULL) {
        nso->so_listener = NULL;
        if (doclose) {
            (void) socket_close(nso, 0, CRED());
        } else {
            /*
             * Only used for fallback - not possible when filters
             * are present.
             */
            ASSERT(so->so_filter_active == 0);
            /*
             * Since the socket is on the accept queue, there can
             * only be one reference. We drop the reference and
             * just blow off the socket.
             */
            ASSERT(nso->so_count == 1);
            nso->so_count--;
            /* drop the proto ref */
            VN_RELE(SOTOV(nso));
        }
        socket_destroy(nso);
    }
}
/*
 * void so_acceptq_flush(struct sonode *so, boolean_t doclose)
 *
 * Removes all pending connections from a listening socket, and
 * frees the associated resources.
 *
 * Arguments:
 *   so      - listening socket
 *   doclose - make a close downcall for each socket on the accept queue
 *
 * Return values:
 *   None.
 *
 * Note:
 *   The caller has to ensure that no calls to so_acceptq_enqueue() or
 *   so_acceptq_dequeue() occur while the accept queue is being flushed.
 *   So either the socket needs to be in a state where no operations
 *   would come in, or so_lock needs to be obtained.
 */
void
so_acceptq_flush(struct sonode *so, boolean_t doclose)
{
    so_acceptq_flush_impl(so, &so->so_acceptq_list, doclose);
    so_acceptq_flush_impl(so, &so->so_acceptq_defer, doclose);

    so->so_acceptq_len = 0;
}
static int
so_wait_connected_locked(struct sonode *so, boolean_t nonblock,
    sock_connid_t id)
{
    ASSERT(MUTEX_HELD(&so->so_lock));

    /*
     * The protocol has notified us that a connection attempt is being
     * made, so before we wait for a notification to arrive we must
     * clear out any errors associated with earlier connection attempts.
     */
    if (so->so_error != 0 && SOCK_CONNID_LT(so->so_proto_connid, id))
        so->so_error = 0;

    while (SOCK_CONNID_LT(so->so_proto_connid, id)) {
        if (nonblock)
            return (EINPROGRESS);

        if (so->so_state & (SS_CLOSING | SS_FALLBACK_PENDING))
            return (EINTR);

        if (cv_wait_sig_swap(&so->so_state_cv, &so->so_lock) == 0)
            return (EINTR);
    }

    if (so->so_error != 0)
        return (sogeterr(so, B_TRUE));
    /*
     * Under normal circumstances, so_error should contain an error
     * in case the connect failed. However, it is possible for another
     * thread to come in and consume the error, so generate a sensible
     * error in that case.
     */
    if ((so->so_state & SS_ISCONNECTED) == 0)
        return (ECONNREFUSED);

    return (0);
}
/*
 * int so_wait_connected(struct sonode *so, boolean_t nonblock,
 *     sock_connid_t id)
 *
 * Wait until the socket is connected or an error has occurred.
 *
 * Arguments:
 *   so       - socket
 *   nonblock - indicate whether it's ok to sleep if the connection has
 *              not yet been established
 *   gen      - generation number that was returned by the protocol
 *              when the operation was started
 *
 * Return values:
 *   0 if the connection attempt was successful, or an error indicating why
 *   the connection attempt failed.
 */
int
so_wait_connected(struct sonode *so, boolean_t nonblock, sock_connid_t id)
{
    int error;

    mutex_enter(&so->so_lock);
    error = so_wait_connected_locked(so, nonblock, id);
    mutex_exit(&so->so_lock);

    return (error);
}
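/*
 * Illustrative sketch, not part of the original file: a connect
 * implementation would typically capture the connection generation id
 * from the protocol's connect downcall and then block here. The downcall
 * usage shown is an assumption about the caller's context.
 *
 *	sock_connid_t id;
 *
 *	error = (*so->so_downcalls->sd_connect)(so->so_proto_handle,
 *	    name, namelen, &id, cr);
 *	if (error == EINPROGRESS)
 *		error = so_wait_connected(so,
 *		    uiop->uio_fmode & (FNDELAY|FNONBLOCK), id);
 */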
int
so_snd_wait_qnotfull_locked(struct sonode *so, boolean_t dontblock)
{
    int error;

    ASSERT(MUTEX_HELD(&so->so_lock));
    while (SO_SND_FLOWCTRLD(so)) {
        if (so->so_state & SS_CANTSENDMORE)
            return (EPIPE);
        if (dontblock)
            return (EWOULDBLOCK);

        if (so->so_state & (SS_CLOSING | SS_FALLBACK_PENDING))
            return (EINTR);

        if (so->so_sndtimeo == 0) {
            /*
             * Zero means disable timeout.
             */
            error = cv_wait_sig(&so->so_snd_cv, &so->so_lock);
        } else {
            error = cv_reltimedwait_sig(&so->so_snd_cv,
                &so->so_lock, so->so_sndtimeo, TR_CLOCK_TICK);
        }
        if (error == 0)
            return (EINTR);
        else if (error == -1)
            return (EAGAIN);
    }
    return (0);
}
/*
 * int so_snd_wait_qnotfull(struct sonode *so, boolean_t dontblock)
 *
 * Wait for the transport to notify us about send buffers becoming
 * available.
 */
int
so_snd_wait_qnotfull(struct sonode *so, boolean_t dontblock)
{
    int error;

    mutex_enter(&so->so_lock);
    so->so_snd_wakeup = B_TRUE;
    error = so_snd_wait_qnotfull_locked(so, dontblock);
    so->so_snd_wakeup = B_FALSE;
    mutex_exit(&so->so_lock);

    return (error);
}
void
so_snd_qfull(struct sonode *so)
{
    mutex_enter(&so->so_lock);
    so->so_snd_qfull = B_TRUE;
    mutex_exit(&so->so_lock);
}
void
so_snd_qnotfull(struct sonode *so)
{
    mutex_enter(&so->so_lock);
    so->so_snd_qfull = B_FALSE;
    /* wake up everyone waiting for buffers */
    cv_broadcast(&so->so_snd_cv);
    mutex_exit(&so->so_lock);
}
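/*
 * Illustrative note, not part of the original file: so_snd_qfull() and
 * so_snd_qnotfull() are the two halves of send-side flow control. A
 * protocol would assert and release back-pressure roughly like this:
 *
 *	so_snd_qfull(so);	senders now block in so_snd_wait_qnotfull()
 *	...transport drains its write queue...
 *	so_snd_qnotfull(so);	cv_broadcast() wakes the blocked senders
 *
 * Each woken sender re-evaluates SO_SND_FLOWCTRLD() before proceeding.
 */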
/*
 * Change the process/process group to which SIGIO is sent.
 */
int
socket_chgpgrp(struct sonode *so, pid_t pid)
{
    int error;

    ASSERT(MUTEX_HELD(&so->so_lock));
    if (pid != 0) {
        /*
         * Permissions check by sending signal 0.
         * Note that when kill fails it does a
         * set_errno causing the system call to fail.
         */
        error = kill(pid, 0);
        if (error != 0)
            return (error);
    }
    so->so_pgrp = pid;
    return (0);
}
/*
 * Generate a SIGIO, for 'writable' events include siginfo structure,
 * for read events just send the signal.
 */
static void
socket_sigproc(proc_t *proc, int event)
{
    k_siginfo_t info;

    ASSERT(event & (SOCKETSIG_WRITE | SOCKETSIG_READ | SOCKETSIG_URG));

    if (event & SOCKETSIG_WRITE) {
        info.si_signo = SIGPOLL;
        info.si_code = POLL_OUT;
        info.si_errno = 0;
        info.si_fd = 0;
        info.si_band = 0;
        sigaddq(proc, NULL, &info, KM_NOSLEEP);
    }
    if (event & SOCKETSIG_READ) {
        sigtoproc(proc, NULL, SIGPOLL);
    }
    if (event & SOCKETSIG_URG) {
        sigtoproc(proc, NULL, SIGURG);
    }
}
void
socket_sendsig(struct sonode *so, int event)
{
    proc_t *proc;

    ASSERT(MUTEX_HELD(&so->so_lock));

    if (so->so_pgrp == 0 || (!(so->so_state & SS_ASYNC) &&
        event != SOCKETSIG_URG)) {
        return;
    }

    dprint(3, ("sending sig %d to %d\n", event, so->so_pgrp));

    if (so->so_pgrp > 0) {
        /*
         * XXX This unfortunately still generates
         * a signal when a fd is closed but
         * the proc is active.
         */
        mutex_enter(&pidlock);
        /*
         * Even if the thread started in another zone, we're receiving
         * on behalf of this socket's zone, so find the proc using the
         * socket's zone ID.
         */
        proc = prfind_zone(so->so_pgrp, so->so_zoneid);
        if (proc == NULL) {
            mutex_exit(&pidlock);
            return;
        }
        mutex_enter(&proc->p_lock);
        mutex_exit(&pidlock);
        socket_sigproc(proc, event);
        mutex_exit(&proc->p_lock);
    } else {
        /*
         * Send to process group. Hold pidlock across
         * calls to socket_sigproc().
         */
        pid_t pgrp = -so->so_pgrp;

        mutex_enter(&pidlock);
        /*
         * Even if the thread started in another zone, we're receiving
         * on behalf of this socket's zone, so find the pgrp using the
         * socket's zone ID.
         */
        proc = pgfind_zone(pgrp, so->so_zoneid);
        while (proc != NULL) {
            mutex_enter(&proc->p_lock);
            socket_sigproc(proc, event);
            mutex_exit(&proc->p_lock);
            proc = proc->p_pglink;
        }
        mutex_exit(&pidlock);
    }
}
#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* Copy userdata into a new mblk_t */
mblk_t *
socopyinuio(uio_t *uiop, ssize_t iosize, size_t wroff, ssize_t maxblk,
    size_t tail_len, int *errorp)
{
    mblk_t *head = NULL, **tail = &head;

    ASSERT(iosize == INFPSZ || iosize > 0);

    if (iosize == INFPSZ || iosize > uiop->uio_resid)
        iosize = uiop->uio_resid;

    if (maxblk == INFPSZ)
        maxblk = iosize;

    /* Nothing to do in these cases, so we're done */
    if (iosize < 0 || maxblk < 0 || (maxblk == 0 && iosize > 0))
        goto done;

    /*
     * We will enter the loop below if iosize is 0; it will allocate an
     * empty message block and call uiomove(9F) which will just return.
     * We could avoid that with an extra check but would only slow
     * down the much more likely case where iosize is larger than 0.
     */
    do {
        ssize_t blocksize;
        mblk_t *mp;

        blocksize = MIN(iosize, maxblk);
        ASSERT(blocksize >= 0);
        mp = allocb(wroff + blocksize + tail_len, BPRI_MED);
        if (mp == NULL) {
            *errorp = ENOMEM;
            return (head);
        }
        mp->b_rptr += wroff;
        mp->b_wptr = mp->b_rptr + blocksize;

        *tail = mp;
        tail = &mp->b_cont;

        /* uiomove(9F) either returns 0 or EFAULT */
        if ((*errorp = uiomove(mp->b_rptr, (size_t)blocksize,
            UIO_WRITE, uiop)) != 0) {
            ASSERT(*errorp != ENOMEM);
            freemsg(head);
            return (NULL);
        }

        iosize -= blocksize;
    } while (iosize > 0);

done:
    *errorp = 0;
    return (head);
}
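/*
 * Illustrative sketch, not part of the original file: a send path could
 * use socopyinuio() to turn user data into a properly sized mblk chain
 * before handing it to the protocol, e.g.:
 *
 *	mblk_t *mp;
 *	int error;
 *
 *	mp = socopyinuio(uiop, uiop->uio_resid,
 *	    so->so_proto_props.sopp_wroff, so->so_proto_props.sopp_maxblk,
 *	    so->so_proto_props.sopp_tail, &error);
 *	if (mp == NULL)
 *		return (error);
 *
 * Using the protocol's advertised wroff/maxblk/tail values ensures each
 * allocated mblk leaves the head- and tailroom the transport expects.
 */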
mblk_t *
socopyoutuio(mblk_t *mp, struct uio *uiop, ssize_t max_read, int *errorp)
{
    int error;
    ptrdiff_t n;
    mblk_t *nmp;

    ASSERT(mp->b_wptr >= mp->b_rptr);

    /*
     * max_read is the offset of the oobmark and the read cannot go past
     * the oobmark.
     */
    if (max_read == INFPSZ || max_read > uiop->uio_resid)
        max_read = uiop->uio_resid;

    do {
        if ((n = MIN(max_read, MBLKL(mp))) != 0) {
            error = uiomove(mp->b_rptr, n, UIO_READ, uiop);
            if (error != 0) {
                freemsg(mp);
                *errorp = error;
                return (NULL);
            }
            mp->b_rptr += n;
            max_read -= n;
        }
        while (mp != NULL && (mp->b_rptr >= mp->b_wptr)) {
            /*
             * get rid of zero length mblks
             */
            nmp = mp;
            mp = mp->b_cont;
            freeb(nmp);
        }
    } while (mp != NULL && max_read > 0);

    *errorp = 0;
    return (mp);
}
static void
so_prepend_msg(struct sonode *so, mblk_t *mp, mblk_t *last_tail)
{
    ASSERT(last_tail != NULL);
    mp->b_next = so->so_rcv_q_head;
    mp->b_prev = last_tail;
    ASSERT(!(DB_FLAGS(mp) & DBLK_UIOA));

    if (so->so_rcv_q_head == NULL) {
        ASSERT(so->so_rcv_q_last_head == NULL);
        so->so_rcv_q_last_head = mp;
    } else {
        ASSERT(!(DB_FLAGS(so->so_rcv_q_head) & DBLK_UIOA));
    }
    so->so_rcv_q_head = mp;

    if (so_debug_length) {
        mutex_enter(&so->so_lock);
        ASSERT(so_check_length(so));
        mutex_exit(&so->so_lock);
    }
}
/*
 * Move a mblk chain (mp_head, mp_last_head) to the sonode's rcv queue so it
 * can be processed by so_dequeue_msg().
 */
void
so_process_new_message(struct sonode *so, mblk_t *mp_head, mblk_t *mp_last_head)
{
    if (so->so_filter_active > 0 &&
        (mp_head = sof_filter_data_in_proc(so, mp_head,
        &mp_last_head)) == NULL)
        return;

    ASSERT(mp_head->b_prev != NULL);
    if (so->so_rcv_q_head == NULL) {
        so->so_rcv_q_head = mp_head;
        so->so_rcv_q_last_head = mp_last_head;
        ASSERT(so->so_rcv_q_last_head->b_prev != NULL);
    } else {
        boolean_t flag_equal = ((DB_FLAGS(mp_head) & DBLK_UIOA) ==
            (DB_FLAGS(so->so_rcv_q_last_head) & DBLK_UIOA));

        if (mp_head->b_next == NULL &&
            DB_TYPE(mp_head) == M_DATA &&
            DB_TYPE(so->so_rcv_q_last_head) == M_DATA && flag_equal) {
            so->so_rcv_q_last_head->b_prev->b_cont = mp_head;
            so->so_rcv_q_last_head->b_prev = mp_head->b_prev;
            mp_head->b_prev = NULL;
        } else if (flag_equal && (DB_FLAGS(mp_head) & DBLK_UIOA)) {
            /*
             * Append to last_head if more than one mblks, and both
             * mp_head and last_head are I/OAT mblks.
             */
            ASSERT(mp_head->b_next != NULL);
            so->so_rcv_q_last_head->b_prev->b_cont = mp_head;
            so->so_rcv_q_last_head->b_prev = mp_head->b_prev;
            mp_head->b_prev = NULL;

            so->so_rcv_q_last_head->b_next = mp_head->b_next;
            mp_head->b_next = NULL;
            so->so_rcv_q_last_head = mp_last_head;
        } else {
            if (so_debug_length) {
                mblk_t *tmp_mblk = mp_head;

                while (tmp_mblk != NULL) {
                    ASSERT(tmp_mblk->b_prev != NULL);
                    tmp_mblk = tmp_mblk->b_next;
                }
            }
            so->so_rcv_q_last_head->b_next = mp_head;
            so->so_rcv_q_last_head = mp_last_head;
        }
    }
}
/*
 * Check flow control on a given sonode. Must have so_lock held, and
 * this function will release the hold. Return true if flow control
 * has been cleared.
 */
boolean_t
so_check_flow_control(struct sonode *so)
{
    ASSERT(MUTEX_HELD(&so->so_lock));

    if (so->so_flowctrld && (so->so_rcv_queued < so->so_rcvlowat &&
        !(so->so_state & SS_FIL_RCV_FLOWCTRL))) {
        so->so_flowctrld = B_FALSE;
        mutex_exit(&so->so_lock);
        /*
         * Open up flow control. SCTP does not have any downcalls, and
         * it will clr flow ctrl in sosctp_recvmsg().
         */
        if (so->so_downcalls != NULL &&
            so->so_downcalls->sd_clr_flowctrl != NULL) {
            (*so->so_downcalls->sd_clr_flowctrl)
                (so->so_proto_handle);
        }
        /* filters can start injecting data */
        sof_sonode_notify_filters(so, SOF_EV_INJECT_DATA_IN_OK, 0);
        return (B_TRUE);
    } else {
        mutex_exit(&so->so_lock);
        return (B_FALSE);
    }
}
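/*
 * Illustrative note, not part of the original file: since
 * so_check_flow_control() releases so_lock, a caller enters with the lock
 * held and must not touch lock-protected state afterwards, e.g.:
 *
 *	mutex_enter(&so->so_lock);
 *	so->so_rcv_queued -= copied;
 *	rvalp->r_val2 = so_check_flow_control(so);
 *	so_lock is no longer held at this point
 */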
int
so_dequeue_msg(struct sonode *so, mblk_t **mctlp, struct uio *uiop,
    rval_t *rvalp, int flags)
{
    mblk_t *mp, *nmp;
    mblk_t *savemp, *savemptail;
    mblk_t *new_msg_head;
    mblk_t *new_msg_last_head;
    mblk_t *last_tail;
    boolean_t partial_read;
    boolean_t reset_atmark = B_FALSE;
    int more = 0;
    int error = 0;
    ssize_t oobmark;
    sodirect_t *sodp = so->so_direct;

    partial_read = B_FALSE;
    *mctlp = NULL;
again:
    mutex_enter(&so->so_lock);
again1:
    if (so_debug_length) {
        ASSERT(so_check_length(so));
    }
    if (so->so_state & SS_RCVATMARK) {
        /* Check whether the caller is OK to read past the mark */
        if (flags & MSG_NOMARK) {
            mutex_exit(&so->so_lock);
            return (EWOULDBLOCK);
        }
        reset_atmark = B_TRUE;
    }
    /*
     * First move messages from the dump area to processing area
     */
    if (sodp != NULL) {
        if (sodp->sod_enabled) {
            if (sodp->sod_uioa.uioa_state & UIOA_ALLOC) {
                /* nothing to uioamove */
                sodp = NULL;
            } else if (sodp->sod_uioa.uioa_state & UIOA_INIT) {
                sodp->sod_uioa.uioa_state &= UIOA_CLR;
                sodp->sod_uioa.uioa_state |= UIOA_ENABLED;
                /*
                 * try to uioamove() the data that
                 * has already queued.
                 */
                sod_uioa_so_init(so, sodp, uiop);
            }
        } else {
            sodp = NULL;
        }
    }

    new_msg_head = so->so_rcv_head;
    new_msg_last_head = so->so_rcv_last_head;
    so->so_rcv_head = NULL;
    so->so_rcv_last_head = NULL;
    oobmark = so->so_oobmark;
    /*
     * We can release the lock as there can only be one reader
     */
    mutex_exit(&so->so_lock);

    if (new_msg_head != NULL) {
        so_process_new_message(so, new_msg_head, new_msg_last_head);
    }
    savemp = savemptail = NULL;
    rvalp->r_val1 = 0;
    error = 0;
    mp = so->so_rcv_q_head;

    if (mp != NULL &&
        (so->so_rcv_timer_tid == 0 ||
        so->so_rcv_queued >= so->so_rcv_thresh)) {
        partial_read = B_FALSE;

        if (flags & MSG_PEEK) {
            if ((nmp = dupmsg(mp)) == NULL &&
                (nmp = copymsg(mp)) == NULL) {
                size_t size = msgsize(mp);

                error = strwaitbuf(size, BPRI_HI);
                if (error != 0)
                    return (error);
                goto again;
            }
            mp = nmp;
        } else {
            ASSERT(mp->b_prev != NULL);
            last_tail = mp->b_prev;
            mp->b_prev = NULL;
            so->so_rcv_q_head = mp->b_next;
            if (so->so_rcv_q_head == NULL) {
                so->so_rcv_q_last_head = NULL;
            }
            mp->b_next = NULL;
        }

        ASSERT(mctlp != NULL);
        /*
         * First process PROTO or PCPROTO blocks, if any.
         */
        if (DB_TYPE(mp) != M_DATA) {
            *mctlp = mp;
            savemp = mp;
            savemptail = mp;
            ASSERT(DB_TYPE(mp) == M_PROTO ||
                DB_TYPE(mp) == M_PCPROTO);
            while (mp->b_cont != NULL &&
                DB_TYPE(mp->b_cont) != M_DATA) {
                ASSERT(DB_TYPE(mp->b_cont) == M_PROTO ||
                    DB_TYPE(mp->b_cont) == M_PCPROTO);
                mp = mp->b_cont;
                savemptail = mp;
            }
            mp = savemptail->b_cont;
            savemptail->b_cont = NULL;
        }

        ASSERT(DB_TYPE(mp) == M_DATA);
        /*
         * Now process DATA blocks, if any. Note that for sodirect
         * enabled socket, uio_resid can be 0.
         */
        if (uiop->uio_resid >= 0) {
            ssize_t copied = 0;

            if (sodp != NULL && (DB_FLAGS(mp) & DBLK_UIOA)) {
                mutex_enter(&so->so_lock);
                ASSERT(uiop == (uio_t *)&sodp->sod_uioa);
                copied = sod_uioa_mblk(so, mp);
                if (copied > 0)
                    partial_read = B_TRUE;
                mutex_exit(&so->so_lock);
                /* mark this mblk as processed */
                mp = NULL;
            } else {
                ssize_t oldresid = uiop->uio_resid;

                if (MBLKL(mp) < so_mblk_pull_len) {
                    if (pullupmsg(mp, -1) == 1) {
                        last_tail = mp;
                    }
                }
                /*
                 * Can not read beyond the oobmark
                 */
                mp = socopyoutuio(mp, uiop,
                    oobmark == 0 ? INFPSZ : oobmark, &error);
                if (error != 0) {
                    freemsg(*mctlp);
                    *mctlp = NULL;
                    more = 0;
                    goto done;
                }
                ASSERT(oldresid >= uiop->uio_resid);
                copied = oldresid - uiop->uio_resid;
                if (oldresid > uiop->uio_resid)
                    partial_read = B_TRUE;
            }
            if (copied > 0 && !(flags & MSG_PEEK)) {
                mutex_enter(&so->so_lock);
                so->so_rcv_queued -= copied;
                ASSERT(so->so_oobmark >= 0);
                if (so->so_oobmark > 0) {
                    so->so_oobmark -= copied;
                    ASSERT(so->so_oobmark >= 0);
                    if (so->so_oobmark == 0) {
                        ASSERT(so->so_state &
                            SS_OOBPEND);
                        so->so_state |= SS_RCVATMARK;
                    }
                }
                /*
                 * so_check_flow_control() will drop
                 * so->so_lock.
                 */
                rvalp->r_val2 = so_check_flow_control(so);
            }
        }
        if (mp != NULL) { /* more data blocks in msg */
            more |= MOREDATA;
            if ((flags & (MSG_PEEK|MSG_TRUNC))) {
                if (flags & MSG_PEEK) {
                    freemsg(mp);
                } else {
                    unsigned int msize = msgdsize(mp);

                    freemsg(mp);
                    mutex_enter(&so->so_lock);
                    so->so_rcv_queued -= msize;
                    /*
                     * so_check_flow_control() will drop
                     * so->so_lock.
                     */
                    so_check_flow_control(so);
                }
            } else if (partial_read && !somsghasdata(mp)) {
                /*
                 * Avoid queuing a zero-length tail part of
                 * a message. partial_read == 1 indicates that
                 * we read some of the message.
                 */
                freemsg(mp);
                more &= ~MOREDATA;
            } else {
                if (savemp != NULL &&
                    (flags & MSG_DUPCTRL)) {
                    mblk_t *nmp;
                    /*
                     * There should only be non data mblks
                     */
                    ASSERT(DB_TYPE(savemp) != M_DATA &&
                        DB_TYPE(savemptail) != M_DATA);
try_again:
                    if ((nmp = dupmsg(savemp)) == NULL &&
                        (nmp = copymsg(savemp)) == NULL) {
                        size_t size = msgsize(savemp);

                        error = strwaitbuf(size,
                            BPRI_HI);
                        if (error != 0) {
                            /*
                             * In case we cannot copy
                             * control data, free the
                             * remaining data.
                             */
                            freemsg(mp);
                            goto done;
                        }
                        goto try_again;
                    }

                    ASSERT(DB_TYPE(nmp) != M_DATA);
                    savemptail->b_cont = mp;
                    *mctlp = nmp;
                    mp = savemp;
                }
                /*
                 * putback mp
                 */
                so_prepend_msg(so, mp, last_tail);
            }
        }

        /* fast check so_rcv_head if there is more data */
        if (partial_read && !(so->so_state & SS_RCVATMARK) &&
            *mctlp == NULL && uiop->uio_resid > 0 &&
            !(flags & MSG_PEEK) && so->so_rcv_head != NULL) {
            goto again;
        }
    } else if (!partial_read) {
        mutex_enter(&so->so_lock);
        if (so->so_error != 0) {
            error = sogeterr(so, !(flags & MSG_PEEK));
            mutex_exit(&so->so_lock);
            return (error);
        }
        /*
         * No pending data. Return right away for nonblocking
         * socket, otherwise sleep waiting for data.
         */
        if (!(so->so_state & SS_CANTRCVMORE) && uiop->uio_resid > 0) {
            if ((uiop->uio_fmode & (FNDELAY|FNONBLOCK)) ||
                (flags & MSG_DONTWAIT)) {
                error = EWOULDBLOCK;
            } else {
                if (so->so_state & (SS_CLOSING |
                    SS_FALLBACK_PENDING)) {
                    mutex_exit(&so->so_lock);
                    error = EINTR;
                    goto done;
                }

                if (so->so_rcv_head != NULL) {
                    goto again1;
                }
                so->so_rcv_wakeup = B_TRUE;
                so->so_rcv_wanted = uiop->uio_resid;
                if (so->so_rcvtimeo == 0) {
                    /*
                     * Zero means disable timeout.
                     */
                    error = cv_wait_sig(&so->so_rcv_cv,
                        &so->so_lock);
                } else {
                    error = cv_reltimedwait_sig(
                        &so->so_rcv_cv, &so->so_lock,
                        so->so_rcvtimeo, TR_CLOCK_TICK);
                }
                so->so_rcv_wakeup = B_FALSE;
                so->so_rcv_wanted = 0;

                if (error == 0) {
                    error = EINTR;
                } else if (error == -1) {
                    error = EAGAIN;
                } else {
                    goto again1;
                }
            }
        }
        mutex_exit(&so->so_lock);
    }
    if (reset_atmark && partial_read && !(flags & MSG_PEEK)) {
        /*
         * We are past the mark; update state.
         * 4.3BSD and 4.4BSD clears the mark when peeking across it.
         * The draft Posix socket spec states that the mark should
         * not be cleared when peeking. We follow the latter.
         */
        mutex_enter(&so->so_lock);
        ASSERT(so_verify_oobstate(so));
        so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK);
        freemsg(so->so_oobmsg);
        so->so_oobmsg = NULL;
        ASSERT(so_verify_oobstate(so));
        mutex_exit(&so->so_lock);
    }
    ASSERT(so->so_rcv_wakeup == B_FALSE);
done:
    if (sodp != NULL) {
        mutex_enter(&so->so_lock);
        if (sodp->sod_enabled &&
            (sodp->sod_uioa.uioa_state & UIOA_ENABLED)) {
            if (sodp->sod_uioa.uioa_mbytes > 0) {
                ASSERT(so->so_rcv_q_head != NULL ||
                    so->so_rcv_head != NULL);
                so->so_rcv_queued -= sod_uioa_mblk(so, NULL);
                if (error == EWOULDBLOCK)
                    error = 0;
            }
        }
        mutex_exit(&so->so_lock);
    }
    if (so_debug_length) {
        mutex_enter(&so->so_lock);
        ASSERT(so_check_length(so));
        mutex_exit(&so->so_lock);
    }
    rvalp->r_val1 = more;
    ASSERT(MUTEX_NOT_HELD(&so->so_lock));
    return (error);
}
/*
 * Enqueue data from the protocol on the socket's rcv queue.
 *
 * We try to hook new M_DATA mblks onto an existing chain, however,
 * that cannot be done if the existing chain has already been
 * processed by I/OAT. Non-M_DATA mblks are just linked together via
 * b_next. In all cases the b_prev of the enqueued mblk is set to
 * point to the last mblk in its b_cont chain.
 */
void
so_enqueue_msg(struct sonode *so, mblk_t *mp, size_t msg_size)
{
    ASSERT(MUTEX_HELD(&so->so_lock));

    if (so_debug_length) {
        ASSERT(so_check_length(so));
    }

    so->so_rcv_queued += msg_size;

    if (so->so_rcv_head == NULL) {
        ASSERT(so->so_rcv_last_head == NULL);
        so->so_rcv_head = mp;
        so->so_rcv_last_head = mp;
    } else if ((DB_TYPE(mp) == M_DATA &&
        DB_TYPE(so->so_rcv_last_head) == M_DATA) &&
        ((DB_FLAGS(mp) & DBLK_UIOA) ==
        (DB_FLAGS(so->so_rcv_last_head) & DBLK_UIOA))) {
        /* Added to the end */
        ASSERT(so->so_rcv_last_head != NULL);
        ASSERT(so->so_rcv_last_head->b_prev != NULL);
        so->so_rcv_last_head->b_prev->b_cont = mp;
    } else {
        /* Start a new end */
        so->so_rcv_last_head->b_next = mp;
        so->so_rcv_last_head = mp;
    }
    while (mp->b_cont != NULL)
        mp = mp->b_cont;

    so->so_rcv_last_head->b_prev = mp;

    if (so_debug_length) {
        ASSERT(so_check_length(so));
    }
}
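/*
 * Illustrative sketch, not part of the original file: a protocol's
 * data-in upcall would typically queue a message roughly as follows
 * (the notification step is simplified):
 *
 *	mutex_enter(&so->so_lock);
 *	so_enqueue_msg(so, mp, msg_size);
 *	so_notify_data(so, msg_size);	wakes readers and drops so_lock
 */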
/*
 * Return B_TRUE if there is data in the message, B_FALSE otherwise.
 */
boolean_t
somsghasdata(mblk_t *mp)
{
    for (; mp; mp = mp->b_cont)
        if (mp->b_datap->db_type == M_DATA) {
            ASSERT(mp->b_wptr >= mp->b_rptr);
            if (mp->b_wptr > mp->b_rptr)
                return (B_TRUE);
        }
    return (B_FALSE);
}
/*
 * Flush the read side of sockfs.
 *
 * The caller must be sure that a reader is not already active when the
 * buffer is being flushed.
 */
void
so_rcv_flush(struct sonode *so)
{
    mblk_t *mp;

    ASSERT(MUTEX_HELD(&so->so_lock));

    if (so->so_oobmsg != NULL) {
        freemsg(so->so_oobmsg);
        so->so_oobmsg = NULL;
        so->so_oobmark = 0;
        so->so_state &=
            ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA|SS_RCVATMARK);
    }

    /*
     * Free messages sitting in the recv queues
     */
    while (so->so_rcv_q_head != NULL) {
        mp = so->so_rcv_q_head;
        so->so_rcv_q_head = mp->b_next;
        mp->b_next = mp->b_prev = NULL;
        freemsg(mp);
    }
    while (so->so_rcv_head != NULL) {
        mp = so->so_rcv_head;
        so->so_rcv_head = mp->b_next;
        mp->b_next = mp->b_prev = NULL;
        freemsg(mp);
    }
    so->so_rcv_queued = 0;
    so->so_rcv_q_head = NULL;
    so->so_rcv_q_last_head = NULL;
    so->so_rcv_head = NULL;
    so->so_rcv_last_head = NULL;
}
/*
 * Handle recv* calls that set MSG_OOB or MSG_OOB together with MSG_PEEK.
 */
int
sorecvoob(struct sonode *so, struct msghdr *msg, struct uio *uiop, int flags,
    boolean_t oob_inline)
{
    mblk_t *mp, *nmp;
    int error;

    dprintso(so, 1, ("sorecvoob(%p, %p, 0x%x)\n", (void *)so, (void *)msg,
        flags));

    /*
     * There is never any oob data with addresses or control since
     * the T_EXDATA_IND does not carry any options.
     */
    msg->msg_controllen = 0;
    msg->msg_namelen = 0;

    mutex_enter(&so->so_lock);
    ASSERT(so_verify_oobstate(so));
    if (oob_inline ||
        (so->so_state & (SS_OOBPEND|SS_HADOOBDATA)) != SS_OOBPEND) {
        dprintso(so, 1, ("sorecvoob: inline or data consumed\n"));
        mutex_exit(&so->so_lock);
        return (EINVAL);
    }
    if (!(so->so_state & SS_HAVEOOBDATA)) {
        dprintso(so, 1, ("sorecvoob: no data yet\n"));
        mutex_exit(&so->so_lock);
        return (EWOULDBLOCK);
    }
    ASSERT(so->so_oobmsg != NULL);
    mp = so->so_oobmsg;
    if (flags & MSG_PEEK) {
        /*
         * Since recv* cannot return ENOBUFS we cannot use dupmsg.
         * Instead we revert to the consolidation private
         * allocb_wait plus bcopy.
         */
        mblk_t *mp1;

        mp1 = allocb_wait(msgdsize(mp), BPRI_MED, STR_NOSIG, NULL);
        ASSERT(mp1 != NULL);

        while (mp != NULL) {
            ssize_t size = MBLKL(mp);

            bcopy(mp->b_rptr, mp1->b_wptr, size);
            mp1->b_wptr += size;
            ASSERT(mp1->b_wptr <= mp1->b_datap->db_lim);
            mp = mp->b_cont;
        }
        mp = mp1;
    } else {
        /*
         * Update the state indicating that the data has been consumed.
         * Keep SS_OOBPEND set until data is consumed past the mark.
         */
        so->so_oobmsg = NULL;
        so->so_state ^= SS_HAVEOOBDATA|SS_HADOOBDATA;
    }
    ASSERT(so_verify_oobstate(so));
    mutex_exit(&so->so_lock);

    error = 0;
    nmp = mp;
    while (nmp != NULL && uiop->uio_resid > 0) {
        ssize_t n = MBLKL(nmp);

        n = MIN(n, uiop->uio_resid);
        if (n > 0)
            error = uiomove(nmp->b_rptr, n,
                UIO_READ, uiop);
        if (error)
            break;
        nmp = nmp->b_cont;
    }
    ASSERT(mp->b_next == NULL && mp->b_prev == NULL);
    freemsg(mp);
    return (error);
}
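/*
 * Illustrative note, not part of the original file: from userland this
 * path is reached by a receive call with MSG_OOB on a socket that does
 * not have SO_OOBINLINE set, e.g.:
 *
 *	char c;
 *	ssize_t n = recv(fd, &c, 1, MSG_OOB);
 *
 * With MSG_OOB|MSG_PEEK the urgent byte is copied out but remains
 * available for a later non-peek read.
 */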
/*
 * Allocate and initialize a new sonode.
 */
struct sonode *
socket_sonode_create(struct sockparams *sp, int family, int type,
    int protocol, int sflags, int *errorp, struct cred *cr)
{
    struct sonode *so;
    int kmflags;

    /*
     * Choose the right set of sonodeops based on the upcall and
     * down call version that the protocol has provided.
     */
    if (SOCK_UC_VERSION != sp->sp_smod_info->smod_uc_version ||
        SOCK_DC_VERSION != sp->sp_smod_info->smod_dc_version) {
        /*
         * The version numbers did not match.
         */
        cmn_err(CE_CONT, "protocol and socket module version mismatch");
        *errorp = EINVAL;
        return (NULL);
    }

    kmflags = (sflags & SOCKET_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;

    so = kmem_cache_alloc(socket_cache, kmflags);
    if (so == NULL) {
        *errorp = ENOMEM;
        return (NULL);
    }

    sonode_init(so, sp, family, type, protocol, &so_sonodeops);

    so->so_is_stream = false;

    /*
     * set the default values to be INFPSZ;
     * if a protocol desires it can change the value later
     */
    so->so_proto_props.sopp_rxhiwat = SOCKET_RECVHIWATER;
    so->so_proto_props.sopp_rxlowat = SOCKET_RECVLOWATER;
    so->so_proto_props.sopp_maxpsz = INFPSZ;
    so->so_proto_props.sopp_maxblk = INFPSZ;

    return (so);
}
int
socket_init_common(struct sonode *so, struct sonode *pso, int flags, cred_t *cr)
{
    int error = 0;

    if (pso != NULL) {
        /*
         * We have a passive open, so inherit basic state from
         * the parent (listener).
         *
         * No need to grab the new sonode's lock, since there is no
         * one that can have a reference to it.
         */
        mutex_enter(&pso->so_lock);

        so->so_state |= SS_ISCONNECTED | (pso->so_state & SS_ASYNC);
        so->so_pgrp = pso->so_pgrp;
        so->so_rcvtimeo = pso->so_rcvtimeo;
        so->so_sndtimeo = pso->so_sndtimeo;
        /*
         * Make note of the socket level options. TCP and IP level
         * options are already inherited. We could do all this after
         * accept is successful but doing it here simplifies code and
         * no harm done for error case.
         */
        so->so_options = pso->so_options & (SO_DEBUG|SO_REUSEADDR|
            SO_KEEPALIVE|SO_DONTROUTE|SO_BROADCAST|SO_USELOOPBACK|
            SO_OOBINLINE|SO_DGRAM_ERRIND|SO_LINGER);
        so->so_proto_props = pso->so_proto_props;
        so->so_mode = pso->so_mode;
        so->so_pollev = pso->so_pollev & SO_POLLEV_ALWAYS;

        mutex_exit(&pso->so_lock);

        /*
         * If the parent has any filters, try to inherit them.
         */
        if (pso->so_filter_active > 0 &&
            (error = sof_sonode_inherit_filters(so, pso)) != 0)
            return (error);
    } else {
        struct sockparams *sp = so->so_sockparams;
        sock_upcalls_t *upcalls_to_use;

        /*
         * Attach automatic filters, if there are any.
         */
        if (!list_is_empty(&sp->sp_auto_filters) &&
            (error = sof_sonode_autoattach_filters(so, cr)) != 0)
            return (error);

        /* OK to attach filters */
        so->so_state |= SS_FILOP_OK;

        /*
         * Based on the version number select the right upcalls to
         * pass down. Currently we only have one version so choose
         * the default.
         */
        upcalls_to_use = &so_upcalls;

        /* active open, so create a lower handle */
        so->so_proto_handle =
            sp->sp_smod_info->smod_proto_create_func(so->so_family,
            so->so_type, so->so_protocol, &so->so_downcalls,
            &so->so_mode, &error, flags, cr);

        if (so->so_proto_handle == NULL) {
            /*
             * To be safe; if a lower handle cannot be created, and
             * the proto does not give a reason why, assume there
             * was a lack of memory.
             */
            return ((error == 0) ? ENOMEM : error);
        }
        ASSERT(so->so_downcalls != NULL);
        ASSERT(so->so_downcalls->sd_send != NULL ||
            so->so_downcalls->sd_send_uio != NULL);
        if (so->so_downcalls->sd_recv_uio != NULL) {
            ASSERT(so->so_downcalls->sd_poll != NULL);
            so->so_pollev |= SO_POLLEV_ALWAYS;
        }

        (*so->so_downcalls->sd_activate)(so->so_proto_handle,
            (sock_upper_handle_t)so, upcalls_to_use, 0, cr);

        /*
         * FIXME No need for this, the protocol can deal with it in
         * sd_create(). Should update ICMP.
         */
        if (so->so_protocol != so->so_sockparams->sp_protocol) {
            int protocol = so->so_protocol;

            /*
             * Issue SO_PROTOTYPE setsockopt.
             */
            error = socket_setsockopt(so, SOL_SOCKET, SO_PROTOTYPE,
                &protocol, (t_uscalar_t)sizeof (protocol), cr);
            if (error != 0) {
                (void) (*so->so_downcalls->sd_close)
                    (so->so_proto_handle, 0, cr);

                mutex_enter(&so->so_lock);
                so_rcv_flush(so);
                mutex_exit(&so->so_lock);
                /*
                 * Setsockopt often fails with ENOPROTOOPT but
                 * socket() should fail with
                 * EPROTONOSUPPORT/EPROTOTYPE.
                 */
                return (EPROTONOSUPPORT);
            }
        }
    }

    if (uioasync.enabled)
        sod_sock_init(so);

    /* put an extra reference on the socket for the protocol */
    VN_HOLD(SOTOV(so));

    return (0);
}
/*
 * int socket_ioctl_common(struct sonode *so, int cmd, intptr_t arg, int mode,
 *     struct cred *cr, int32_t *rvalp)
 *
 * Handle ioctls that manipulate basic socket state; non-blocking,
 * async, etc.
 *
 * Returns:
 *   < 0  - ioctl was not handled (pass the request on to the protocol)
 *   >= 0 - ioctl was handled, if > 0, then it is an errno
 *
 * Notes:
 *   Assumes the standard receive buffer is used to obtain info for
 *   FIONREAD.
 */
int
socket_ioctl_common(struct sonode *so, int cmd, intptr_t arg, int mode,
    struct cred *cr, int32_t *rvalp)
{
    switch (cmd) {
    case SIOCSQPTR:
        /*
         * SIOCSQPTR is valid only when a helper stream is created
         * by the protocol.
         */
        return (EOPNOTSUPP);
    case FIONBIO: {
        int32_t value;

        if (so_copyin((void *)arg, &value, sizeof (int32_t),
            (mode & (int)FKIOCTL)))
            return (EFAULT);

        mutex_enter(&so->so_lock);
        if (value) {
            so->so_state |= SS_NDELAY;
        } else {
            so->so_state &= ~SS_NDELAY;
        }
        mutex_exit(&so->so_lock);
        return (0);
    }
    case FIOASYNC: {
        int32_t value;

        if (so_copyin((void *)arg, &value, sizeof (int32_t),
            (mode & (int)FKIOCTL)))
            return (EFAULT);

        mutex_enter(&so->so_lock);
        if (value) {
            /* Turn on SIGIO */
            so->so_state |= SS_ASYNC;
        } else {
            /* Turn off SIGIO */
            so->so_state &= ~SS_ASYNC;
        }
        mutex_exit(&so->so_lock);
        return (0);
    }
    case SIOCSPGRP:
    case FIOSETOWN: {
        int error;
        pid_t pid;

        if (so_copyin((void *)arg, &pid, sizeof (pid_t),
            (mode & (int)FKIOCTL)))
            return (EFAULT);

        mutex_enter(&so->so_lock);
        error = (pid != so->so_pgrp) ? socket_chgpgrp(so, pid) : 0;
        mutex_exit(&so->so_lock);
        return (error);
    }
    case SIOCGPGRP:
    case FIOGETOWN:
        if (so_copyout(&so->so_pgrp, (void *)arg,
            sizeof (pid_t), (mode & (int)FKIOCTL)))
            return (EFAULT);
        return (0);
    case SIOCATMARK: {
        int retval;

        /*
         * Only protocols that support urgent data can handle ATMARK.
         */
        if ((so->so_mode & SM_EXDATA) == 0)
            return (EINVAL);

        /*
         * If the protocol is maintaining its own buffer, then the
         * request must be passed down.
         */
        if (so->so_downcalls->sd_recv_uio != NULL)
            return (-1);

        retval = (so->so_state & SS_RCVATMARK) != 0;

        if (so_copyout(&retval, (void *)arg, sizeof (int),
            (mode & (int)FKIOCTL))) {
            return (EFAULT);
        }
        return (0);
    }
    case FIONREAD: {
        int retval;

        /*
         * If the protocol is maintaining its own buffer, then the
         * request must be passed down.
         */
        if (so->so_downcalls->sd_recv_uio != NULL)
            return (-1);

        retval = MIN(so->so_rcv_queued, INT_MAX);

        if (so_copyout(&retval, (void *)arg,
            sizeof (retval), (mode & (int)FKIOCTL))) {
            return (EFAULT);
        }
        return (0);
    }
    case _I_GETPEERCRED: {
        int error = 0;

        if ((mode & FKIOCTL) == 0)
            return (EINVAL);

        mutex_enter(&so->so_lock);
        if ((so->so_mode & SM_CONNREQUIRED) == 0) {
            error = ENOTSUP;
        } else if ((so->so_state & SS_ISCONNECTED) == 0) {
            error = ENOTCONN;
        } else if (so->so_peercred != NULL) {
            k_peercred_t *kp = (k_peercred_t *)arg;

            kp->pc_cr = so->so_peercred;
            kp->pc_cpid = so->so_cpid;
            crhold(so->so_peercred);
        } else {
            error = EINVAL;
        }
        mutex_exit(&so->so_lock);
        return (error);
    }
    default:
        return (-1);
    }
}
/*
 * Handle the I_NREAD STREAM ioctl.
 */
static int
so_strioc_nread(struct sonode *so, intptr_t arg, int mode, int32_t *rvalp)
{
    size_t size = 0;
    int retval;
    int count = 0;
    mblk_t *mp;
    clock_t wakeup = drv_usectohz(10);

    if (so->so_downcalls == NULL ||
        so->so_downcalls->sd_recv_uio != NULL)
        return (EINVAL);

    mutex_enter(&so->so_lock);
    /* Wait for reader to get out of the way. */
    while (so->so_flag & SOREADLOCKED) {
        /*
         * If the reader is waiting for data, then there should be
         * nothing on the rcv queue.
         */
        if (so->so_rcv_wakeup)
            goto out;

        /* Do a timed sleep, in case the reader goes to sleep. */
        (void) cv_reltimedwait(&so->so_read_cv, &so->so_lock, wakeup,
            TR_CLOCK_TICK);
    }

    /*
     * Since we are holding so_lock no new reader will come in, and the
     * protocol will not be able to enqueue data. So it's safe to walk
     * both rcv queues.
     */
    mp = so->so_rcv_q_head;
    if (mp != NULL) {
        size = msgdsize(so->so_rcv_q_head);
        for (; mp != NULL; mp = mp->b_next)
            count++;
    } else {
        /*
         * In case the processing list was empty, get the size of the
         * next msg in line.
         */
        size = msgdsize(so->so_rcv_head);
    }

    for (mp = so->so_rcv_head; mp != NULL; mp = mp->b_next)
        count++;
out:
    mutex_exit(&so->so_lock);

    /*
     * Drop down from size_t to the "int" required by the
     * interface. Cap at INT_MAX.
     */
    retval = MIN(size, INT_MAX);
    if (so_copyout(&retval, (void *)arg, sizeof (retval),
        (mode & (int)FKIOCTL))) {
        return (EFAULT);
    } else {
        *rvalp = count;
        return (0);
    }
}
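/*
 * Illustrative note, not part of the original file: I_NREAD is the
 * STREAMS counterpart of FIONREAD; an application would issue it as:
 *
 *	int avail;
 *	int nmsgs = ioctl(fd, I_NREAD, &avail);
 *
 * The ioctl return value is the message count (*rvalp above) and "avail"
 * receives the byte count of the first message, capped at INT_MAX.
 */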
/*
 * Process STREAM ioctls.
 *
 * Returns:
 *   < 0  - ioctl was not handled (pass the request on to the protocol)
 *   >= 0 - ioctl was handled, if > 0, then it is an errno
 */
int
socket_strioc_common(struct sonode *so, int cmd, intptr_t arg, int mode,
    struct cred *cr, int32_t *rvalp)
{
    int retval;

    /* Only STREAM ioctls are handled here */
    if ((cmd & 0xffffff00U) != STR)
        return (-1);

    switch (cmd) {
    case I_CANPUT:
        /*
         * We return an error for I_CANPUT so that isastream(3C) will
         * not report the socket as being a STREAM.
         */
        return (EOPNOTSUPP);
    case I_NREAD:
        /* Avoid doing a fallback for I_NREAD. */
        return (so_strioc_nread(so, arg, mode, rvalp));
    case I_LOOK:
        /* Avoid doing a fallback for I_LOOK. */
        if (so_copyout("sockmod", (void *)arg, strlen("sockmod") + 1,
            (mode & (int)FKIOCTL))) {
            return (EFAULT);
        }
        return (0);
    default:
        break;
    }

    /*
     * Try to fall back to TPI, and if successful, reissue the ioctl.
     */
    if ((retval = so_tpi_fallback(so, cr)) == 0) {
        /* Reissue the ioctl */
        ASSERT(so->so_rcv_q_head == NULL);
        return (SOP_IOCTL(so, cmd, arg, mode, cr, rvalp));
    } else {
        return (retval);
    }
}
/*
 * This is called for all socket types to verify that the buffer size is large
 * enough for the option, and if we can, handle the request as well. Most
 * options will be forwarded to the protocol.
 */
int
socket_getopt_common(struct sonode *so, int level, int option_name,
    void *optval, socklen_t *optlenp, int flags)
{
    if (level != SOL_SOCKET)
        return (-1);

    switch (option_name) {
    case SO_ERROR:
    case SO_DOMAIN:
    case SO_TYPE:
    case SO_ACCEPTCONN: {
        int32_t value;
        socklen_t optlen = *optlenp;

        if (optlen < (t_uscalar_t)sizeof (int32_t)) {
            return (EINVAL);
        }

        switch (option_name) {
        case SO_ERROR:
            mutex_enter(&so->so_lock);
            value = sogeterr(so, B_TRUE);
            mutex_exit(&so->so_lock);
            break;
        case SO_DOMAIN:
            value = so->so_family;
            break;
        case SO_TYPE:
            value = so->so_type;
            break;
        case SO_ACCEPTCONN:
            if (so->so_state & SS_ACCEPTCONN)
                value = SO_ACCEPTCONN;
            else
                value = 0;
            break;
        }

        bcopy(&value, optval, sizeof (value));
        *optlenp = sizeof (value);

        return (0);
    }
    case SO_SNDTIMEO:
    case SO_RCVTIMEO: {
        clock_t value;
        socklen_t optlen = *optlenp;

        if (get_udatamodel() == DATAMODEL_NONE ||
            get_udatamodel() == DATAMODEL_NATIVE) {
            if (optlen < sizeof (struct timeval))
                return (EINVAL);
        } else {
            if (optlen < sizeof (struct timeval32))
                return (EINVAL);
        }
        if (option_name == SO_RCVTIMEO)
            value = drv_hztousec(so->so_rcvtimeo);
        else
            value = drv_hztousec(so->so_sndtimeo);

        if (get_udatamodel() == DATAMODEL_NONE ||
            get_udatamodel() == DATAMODEL_NATIVE) {
            ((struct timeval *)(optval))->tv_sec =
                value / (1000 * 1000);
            ((struct timeval *)(optval))->tv_usec =
                value % (1000 * 1000);
            *optlenp = sizeof (struct timeval);
        } else {
            ((struct timeval32 *)(optval))->tv_sec =
                value / (1000 * 1000);
            ((struct timeval32 *)(optval))->tv_usec =
                value % (1000 * 1000);
            *optlenp = sizeof (struct timeval32);
        }
        return (0);
    }
    case SO_DEBUG:
    case SO_REUSEADDR:
    case SO_KEEPALIVE:
    case SO_DONTROUTE:
    case SO_BROADCAST:
    case SO_USELOOPBACK:
    case SO_OOBINLINE:
    case SO_SNDBUF:
    case SO_RCVBUF:
    case SO_DGRAM_ERRIND: {
        socklen_t optlen = *optlenp;

        if (optlen < (t_uscalar_t)sizeof (int32_t))
            return (EINVAL);
        break;
    }
    case SO_TIMESTAMP: {
        socklen_t optlen = *optlenp;

        if (optlen < (t_uscalar_t)sizeof (int32_t))
            return (EINVAL);
        break;
    }
    case SO_LINGER: {
        socklen_t optlen = *optlenp;

        if (optlen < (t_uscalar_t)sizeof (struct linger))
            return (EINVAL);
        break;
    }
    case SO_SND_BUFINFO: {
        socklen_t optlen = *optlenp;

        if (optlen < (t_uscalar_t)sizeof (struct so_snd_bufinfo))
            return (EINVAL);
        ((struct so_snd_bufinfo *)(optval))->sbi_wroff =
            (so->so_proto_props).sopp_wroff;
        ((struct so_snd_bufinfo *)(optval))->sbi_maxblk =
            (so->so_proto_props).sopp_maxblk;
        ((struct so_snd_bufinfo *)(optval))->sbi_maxpsz =
            (so->so_proto_props).sopp_maxpsz;
        ((struct so_snd_bufinfo *)(optval))->sbi_tail =
            (so->so_proto_props).sopp_tail;
        *optlenp = sizeof (struct so_snd_bufinfo);
        return (0);
    }
    case SO_SND_COPYAVOID: {
        sof_instance_t *inst;

        /*
         * Avoid zero-copy if there is a filter with a data_out
         * callback. We could let the operation succeed, but then
         * the filter would have to copy the data anyway.
         */
        for (inst = so->so_filter_top; inst != NULL;
            inst = inst->sofi_next) {
            if (SOF_INTERESTED(inst, data_out))
                return (EOPNOTSUPP);
        }
        break;
    }
    default:
        break;
    }

    /* Unknown Option */
    return (-1);
}
void
socket_sonode_destroy(struct sonode *so)
{
    sonode_fini(so);
    kmem_cache_free(socket_cache, so);
}
int
so_zcopy_wait(struct sonode *so)
{
    int error = 0;

    mutex_enter(&so->so_lock);
    while (!(so->so_copyflag & STZCNOTIFY)) {
        if (so->so_state & SS_CLOSING) {
            mutex_exit(&so->so_lock);
            return (EINTR);
        }
        if (cv_wait_sig(&so->so_copy_cv, &so->so_lock) == 0) {
            error = EINTR;
            break;
        }
    }
    so->so_copyflag &= ~STZCNOTIFY;
    mutex_exit(&so->so_lock);

    return (error);
}
void
so_timer_callback(void *arg)
{
    struct sonode *so = (struct sonode *)arg;

    mutex_enter(&so->so_lock);

    so->so_rcv_timer_tid = 0;
    if (so->so_rcv_queued > 0) {
        /* so_notify_data() drops so_lock */
        so_notify_data(so, so->so_rcv_queued);
    } else {
        mutex_exit(&so->so_lock);
    }
}
/*
 * Verify that the length stored in so_rcv_queued and the length of data blocks
 * queued are the same.
 */
static boolean_t
so_check_length(sonode_t *so)
{
    mblk_t *mp = so->so_rcv_q_head;
    int len = 0;

    ASSERT(MUTEX_HELD(&so->so_lock));

    if (mp != NULL) {
        len = msgdsize(mp);
        while ((mp = mp->b_next) != NULL)
            len += msgdsize(mp);
    }
    mp = so->so_rcv_head;
    if (mp != NULL) {
        len += msgdsize(mp);
        while ((mp = mp->b_next) != NULL)
            len += msgdsize(mp);
    }
    return ((len == so->so_rcv_queued) ? B_TRUE : B_FALSE);
}
int
so_get_mod_version(struct sockparams *sp)
{
    ASSERT(sp != NULL && sp->sp_smod_info != NULL);
    return (sp->sp_smod_info->smod_version);
}
/*
 * so_start_fallback()
 *
 * Block new socket operations from coming in, and wait for active operations
 * to complete. Threads that are sleeping will be woken up so they can get
 * out of the way.
 *
 * The caller must be a reader on so_fallback_rwlock.
 */
static boolean_t
so_start_fallback(struct sonode *so)
{
    ASSERT(RW_READ_HELD(&so->so_fallback_rwlock));

    mutex_enter(&so->so_lock);
    if (so->so_state & SS_FALLBACK_PENDING) {
        mutex_exit(&so->so_lock);
        return (B_FALSE);
    }
    so->so_state |= SS_FALLBACK_PENDING;
    /*
     * Poke all threads that might be sleeping. Any operation that comes
     * in after the cv_broadcast will observe the fallback pending flag,
     * which causes the call to return where it would normally sleep.
     */
    cv_broadcast(&so->so_state_cv);     /* threads in connect() */
    cv_broadcast(&so->so_rcv_cv);       /* threads in recvmsg() */
    cv_broadcast(&so->so_snd_cv);       /* threads in sendmsg() */
    mutex_enter(&so->so_acceptq_lock);
    cv_broadcast(&so->so_acceptq_cv);   /* threads in accept() */
    mutex_exit(&so->so_acceptq_lock);
    mutex_exit(&so->so_lock);

    /*
     * The main reason for the rw_tryupgrade call is to provide
     * observability during the fallback process. We want to
     * be able to see if there are pending operations.
     */
    if (rw_tryupgrade(&so->so_fallback_rwlock) == 0) {
        /*
         * It is safe to drop and reacquire the fallback lock, because
         * we are guaranteed that another fallback cannot take place.
         */
        rw_exit(&so->so_fallback_rwlock);
        DTRACE_PROBE1(pending__ops__wait, (struct sonode *), so);
        rw_enter(&so->so_fallback_rwlock, RW_WRITER);
        DTRACE_PROBE1(pending__ops__complete, (struct sonode *), so);
    }

    return (B_TRUE);
}
/*
 * so_end_fallback()
 *
 * Allow socket operations back in.
 *
 * The caller must be a writer on so_fallback_rwlock.
 */
static void
so_end_fallback(struct sonode *so)
{
    ASSERT(RW_ISWRITER(&so->so_fallback_rwlock));

    mutex_enter(&so->so_lock);
    so->so_state &= ~(SS_FALLBACK_PENDING|SS_FALLBACK_DRAIN);
    mutex_exit(&so->so_lock);

    rw_downgrade(&so->so_fallback_rwlock);
}
/*
 * so_quiesced_cb()
 *
 * Callback passed to the protocol during fallback. It is called once
 * the endpoint is quiescent.
 *
 * No requests from the user, no notifications from the protocol, so it
 * is safe to synchronize the state. Data can also be moved without
 * risk for reordering.
 *
 * We do not need to hold so_lock, since there can be only one thread
 * operating on the sonode.
 */
static mblk_t *
so_quiesced_cb(sock_upper_handle_t sock_handle, sock_quiesce_arg_t *arg,
    struct T_capability_ack *tcap,
    struct sockaddr *laddr, socklen_t laddrlen,
    struct sockaddr *faddr, socklen_t faddrlen, short opts)
{
    struct sonode *so = (struct sonode *)sock_handle;
    boolean_t atmark;
    mblk_t *retmp = NULL, **tailmpp = &retmp;

    if (tcap != NULL)
        sotpi_update_state(so, tcap, laddr, laddrlen, faddr, faddrlen,
            opts);

    /*
     * Some protocols do not quiesce the data path during fallback. Once
     * we set the SS_FALLBACK_DRAIN flag any attempt to queue data will
     * fail and the protocol is responsible for saving the data for later
     * delivery (i.e., once the fallback has completed).
     */
    mutex_enter(&so->so_lock);
    so->so_state |= SS_FALLBACK_DRAIN;
    SOCKET_TIMER_CANCEL(so);
    mutex_exit(&so->so_lock);

    if (so->so_rcv_head != NULL) {
        if (so->so_rcv_q_last_head == NULL)
            so->so_rcv_q_head = so->so_rcv_head;
        else
            so->so_rcv_q_last_head->b_next = so->so_rcv_head;
        so->so_rcv_q_last_head = so->so_rcv_last_head;
    }

    atmark = (so->so_state & SS_RCVATMARK) != 0;
    /*
     * Clear any OOB state having to do with pending data. The TPI
     * code path will set the appropriate oob state when we move the
     * oob data to the STREAM head. We leave SS_HADOOBDATA since the oob
     * data has already been consumed.
     */
    so->so_state &= ~(SS_RCVATMARK|SS_OOBPEND|SS_HAVEOOBDATA);

    ASSERT(so->so_oobmsg != NULL || so->so_oobmark <= so->so_rcv_queued);

    /*
     * Move data to the STREAM head.
     */
    while (so->so_rcv_q_head != NULL) {
        mblk_t *mp = so->so_rcv_q_head;
        size_t mlen = msgdsize(mp);

        so->so_rcv_q_head = mp->b_next;
        mp->b_next = NULL;
        mp->b_prev = NULL;

        /*
         * Send T_EXDATA_IND if we are at the oob mark.
         */
        if (atmark) {
            struct T_exdata_ind *tei;
            mblk_t *mp1 = arg->soqa_exdata_mp;

            arg->soqa_exdata_mp = NULL;
            ASSERT(mp1 != NULL);
            mp1->b_datap->db_type = M_PROTO;
            tei = (struct T_exdata_ind *)mp1->b_rptr;
            tei->PRIM_type = T_EXDATA_IND;
            tei->MORE_flag = 0;
            mp1->b_wptr = (uchar_t *)&tei[1];

            if (IS_SO_OOB_INLINE(so)) {
                mp1->b_cont = mp;
            } else {
                ASSERT(so->so_oobmsg != NULL);
                mp1->b_cont = so->so_oobmsg;
                so->so_oobmsg = NULL;

                /* process current mp next time around */
                mp->b_next = so->so_rcv_q_head;
                so->so_rcv_q_head = mp;
                mlen = 0;
            }
            mp = mp1;

            /* we have consumed the oob mark */
            atmark = B_FALSE;
        } else if (so->so_oobmark > 0) {
            /*
             * Check if the OOB mark is within the current
             * mblk chain. In that case we have to split it up.
             */
            if (so->so_oobmark < mlen) {
                mblk_t *urg_mp = mp;

                atmark = B_TRUE;
                mp = NULL;
                mlen = so->so_oobmark;

                /*
                 * It is assumed that the OOB mark does
                 * not land within a mblk.
                 */
                do {
                    so->so_oobmark -= MBLKL(urg_mp);
                    mp = urg_mp;
                    urg_mp = urg_mp->b_cont;
                } while (so->so_oobmark > 0);
                mp->b_cont = NULL;
                if (urg_mp != NULL) {
                    urg_mp->b_next = so->so_rcv_q_head;
                    so->so_rcv_q_head = urg_mp;
                }
            } else {
                so->so_oobmark -= mlen;
                if (so->so_oobmark == 0)
                    atmark = B_TRUE;
            }
        }

        /*
         * Queue data on the STREAM head.
         */
        so->so_rcv_queued -= mlen;
        *tailmpp = mp;
        tailmpp = &mp->b_next;
    }
    so->so_rcv_head = NULL;
    so->so_rcv_last_head = NULL;
    so->so_rcv_q_head = NULL;
    so->so_rcv_q_last_head = NULL;

    /*
     * Check if the oob byte is at the end of the data stream, or if the
     * oob byte has not yet arrived. In the latter case we have to send a
     * SIGURG and a mark indicator to the STREAM head. The mark indicator
     * is needed to guarantee correct behavior for SIOCATMARK. See block
     * comment in socktpi.h for more details.
     */
    if (atmark || so->so_oobmark > 0) {
        mblk_t *mp;

        if (atmark && so->so_oobmsg != NULL) {
            struct T_exdata_ind *tei;

            mp = arg->soqa_exdata_mp;
            arg->soqa_exdata_mp = NULL;
            ASSERT(mp != NULL);
            mp->b_datap->db_type = M_PROTO;
            tei = (struct T_exdata_ind *)mp->b_rptr;
            tei->PRIM_type = T_EXDATA_IND;
            tei->MORE_flag = 0;
            mp->b_wptr = (uchar_t *)&tei[1];

            mp->b_cont = so->so_oobmsg;
            so->so_oobmsg = NULL;

            *tailmpp = mp;
            tailmpp = &mp->b_next;
        } else {
            /* Send up the signal */
            mp = arg->soqa_exdata_mp;
            arg->soqa_exdata_mp = NULL;
            ASSERT(mp != NULL);
            DB_TYPE(mp) = M_PCSIG;
            *mp->b_wptr++ = (uchar_t)SIGURG;
            *tailmpp = mp;
            tailmpp = &mp->b_next;

            /* Send up the mark indicator */
            mp = arg->soqa_urgmark_mp;
            arg->soqa_urgmark_mp = NULL;
            ASSERT(mp != NULL);
            mp->b_flag = atmark ? MSGMARKNEXT : MSGNOTMARKNEXT;
            *tailmpp = mp;
            tailmpp = &mp->b_next;

            so->so_oobmark = 0;
        }
    }
    ASSERT(so->so_oobmark == 0);
    ASSERT(so->so_rcv_queued == 0);

    return (retmp);
}
/*
 * Do an integrity check of the sonode. This should be done if a
 * fallback fails after the sonode has initially been converted to use
 * TPI and subsequently has to be reverted.
 *
 * Failure to pass the integrity check will panic the system.
 */
static void
so_integrity_check(struct sonode *cur, struct sonode *orig)
{
    VERIFY(cur->so_vnode == orig->so_vnode);
    VERIFY(cur->so_ops == orig->so_ops);
    /*
     * For so_state we can only VERIFY the state flags in CHECK_STATE.
     * The other state flags might be affected by a notification from the
     * protocol.
     */
#define CHECK_STATE (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_NDELAY|SS_NONBLOCK| \
    SS_ASYNC|SS_ACCEPTCONN|SS_SAVEDEOR|SS_RCVATMARK|SS_OOBPEND| \
    SS_HAVEOOBDATA|SS_HADOOBDATA|SS_SENTLASTREADSIG|SS_SENTLASTWRITESIG)
    VERIFY((cur->so_state & (orig->so_state & CHECK_STATE)) ==
        (orig->so_state & CHECK_STATE));
    VERIFY(cur->so_mode == orig->so_mode);
    VERIFY(cur->so_flag == orig->so_flag);
    VERIFY(cur->so_count == orig->so_count);
    /* Cannot VERIFY so_proto_connid; proto can update it */
    VERIFY(cur->so_sockparams == orig->so_sockparams);
    /* an error might have been recorded, but it cannot be lost */
    VERIFY(cur->so_error != 0 || orig->so_error == 0);
    VERIFY(cur->so_family == orig->so_family);
    VERIFY(cur->so_type == orig->so_type);
    VERIFY(cur->so_protocol == orig->so_protocol);
    VERIFY(cur->so_is_stream == orig->so_is_stream);
    /* New conns might have arrived, but none should have been lost */
    VERIFY(cur->so_acceptq_len >= orig->so_acceptq_len);
    VERIFY(list_head(&cur->so_acceptq_list) ==
        list_head(&orig->so_acceptq_list));
    VERIFY(cur->so_backlog == orig->so_backlog);
    /* New OOB might have arrived, but the mark should not have been lost */
    VERIFY(cur->so_oobmark >= orig->so_oobmark);
    /* Cannot VERIFY so_oobmsg; the proto might have sent up a new one */
    VERIFY(cur->so_pgrp == orig->so_pgrp);
    VERIFY(cur->so_peercred == orig->so_peercred);
    VERIFY(cur->so_cpid == orig->so_cpid);
    VERIFY(cur->so_zoneid == orig->so_zoneid);
    /* New data might have arrived, but none should have been lost */
    VERIFY(cur->so_rcv_queued >= orig->so_rcv_queued);
    VERIFY(cur->so_rcv_q_head == orig->so_rcv_q_head);
    VERIFY(cur->so_rcv_head == orig->so_rcv_head);
    VERIFY(cur->so_proto_handle == orig->so_proto_handle);
    VERIFY(cur->so_downcalls == orig->so_downcalls);
    /* Cannot VERIFY so_proto_props; they can be updated by proto */
}
/*
 * so_tpi_fallback()
 *
 * This is the fallback initiation routine; things start here.
 *
 * Basic strategy:
 *   o Block new socket operations from coming in
 *   o Allocate/initiate info needed by TPI
 *   o Quiesce the connection, at which point we sync
 *     state and move data
 *   o Change operations (sonodeops) associated with the socket
 *   o Unblock threads waiting for the fallback to finish
 */
int
so_tpi_fallback(struct sonode *so, struct cred *cr)
{
    int error;
    queue_t *q;
    struct sockparams *sp;
    struct sockparams *newsp = NULL;
    so_proto_fallback_func_t fbfunc;
    const char *devpath;
    boolean_t direct;
    struct sonode *nso;
    sock_quiesce_arg_t arg = { NULL, NULL };
    struct sonode origso;

    error = 0;
    sp = so->so_sockparams;
    fbfunc = sp->sp_smod_info->smod_proto_fallback_func;

    /*
     * Cannot fallback if the socket has active filters
     */
    if (so->so_filter_active > 0)
        return (EINVAL);

    switch (so->so_family) {
    case AF_INET:
        devpath = sp->sp_smod_info->smod_fallback_devpath_v4;
        break;
    case AF_INET6:
        devpath = sp->sp_smod_info->smod_fallback_devpath_v6;
        break;
    default:
        return (EINVAL);
    }

    /*
     * Fallback can only happen if the socket module has a TPI device
     * and fallback function.
     */
    if (devpath == NULL || fbfunc == NULL)
        return (EINVAL);

    /*
     * Initiate fallback; upon success we know that no new requests
     * will come in from the user.
     */
    if (!so_start_fallback(so))
        return (EAGAIN);

    /*
     * Make a copy of the sonode in case we need to make an integrity
     * check later on.
     */
    bcopy(so, &origso, sizeof (*so));

    sp->sp_stats.sps_nfallback.value.ui64++;

    newsp = sockparams_hold_ephemeral_bydev(so->so_family, so->so_type,
        so->so_protocol, devpath, KM_SLEEP, &error);
    if (error != 0)
        goto out;

    if (so->so_direct != NULL) {
        sodirect_t *sodp = so->so_direct;
        mutex_enter(&so->so_lock);

        so->so_direct->sod_enabled = B_FALSE;
        so->so_state &= ~SS_SODIRECT;
        ASSERT(sodp->sod_uioafh == NULL);
        mutex_exit(&so->so_lock);
    }

    /* Turn sonode into a TPI socket */
    error = sotpi_convert_sonode(so, newsp, &direct, &q, cr);
    if (error != 0)
        goto out;

    /*
     * When it comes to urgent data we have two cases to deal with;
     * (1) The oob byte has already arrived, or (2) the protocol has
     * notified that oob data is pending, but it has not yet arrived.
     *
     * For (1) all we need to do is send a T_EXDATA_IND to indicate where
     * in the byte stream the oob byte is. For (2) we have to send a
     * SIGURG (M_PCSIG), followed by a zero-length mblk indicating whether
     * the oob byte will be the next byte from the protocol.
     *
     * So in the worst case we need two mblks, one for the signal, another
     * for mark indication. In that case we use the exdata_mp for the sig.
     */
    arg.soqa_exdata_mp = allocb_wait(sizeof (struct T_exdata_ind),
        BPRI_MED, STR_NOSIG, NULL);
    arg.soqa_urgmark_mp = allocb_wait(0, BPRI_MED, STR_NOSIG, NULL);

    /*
     * Now tell the protocol to start using TPI. so_quiesced_cb() will be
     * called once it's safe to synchronize state.
     */
    DTRACE_PROBE1(proto__fallback__begin, struct sonode *, so);
    error = (*fbfunc)(so->so_proto_handle, q, direct, so_quiesced_cb,
        &arg);
    DTRACE_PROBE1(proto__fallback__end, struct sonode *, so);

    if (error != 0) {
        /* protocol was unable to do a fallback, revert the sonode */
        sotpi_revert_sonode(so, cr);
        goto out;
    }

    /*
     * Walk the accept queue and notify the proto that they should
     * fall back to TPI. The protocol will send up the T_CONN_IND.
     */
    nso = list_head(&so->so_acceptq_list);
    while (nso != NULL) {
        int rval;
        struct sonode *next;

        if (arg.soqa_exdata_mp == NULL) {
            arg.soqa_exdata_mp =
                allocb_wait(sizeof (struct T_exdata_ind),
                BPRI_MED, STR_NOSIG, NULL);
        }
        if (arg.soqa_urgmark_mp == NULL) {
            arg.soqa_urgmark_mp = allocb_wait(0, BPRI_MED,
                STR_NOSIG, NULL);
        }

        DTRACE_PROBE1(proto__fallback__begin, struct sonode *, nso);
        rval = (*fbfunc)(nso->so_proto_handle, NULL, direct,
            so_quiesced_cb, &arg);
        DTRACE_PROBE1(proto__fallback__end, struct sonode *, nso);
        if (rval != 0) {
            /* Abort the connection */
            zcmn_err(getzoneid(), CE_WARN,
                "Failed to convert socket in accept queue to TPI. "
                "Pid = %d\n", curproc->p_pid);
            next = list_next(&so->so_acceptq_list, nso);
            list_remove(&so->so_acceptq_list, nso);
            so->so_acceptq_len--;

            (void) socket_close(nso, 0, CRED());
            socket_destroy(nso);
            nso = next;
        } else {
            nso = list_next(&so->so_acceptq_list, nso);
        }
    }

    /*
     * Now flush the acceptq, this will destroy all sockets. They will
     * be recreated in sotpi_accept().
     */
    so_acceptq_flush(so, B_FALSE);

    mutex_enter(&so->so_lock);
    so->so_state |= SS_FALLBACK_COMP;
    mutex_exit(&so->so_lock);

    /*
     * Swap the sonode ops. Socket operations that come in once this
     * is done will proceed without blocking.
     */
    so->so_ops = &sotpi_sonodeops;

    /*
     * Wake up any threads stuck in poll. This is needed since the poll
     * head changes when the fallback happens (moves from the sonode to
     * the STREAMS head).
     */
    pollwakeup(&so->so_poll_list, POLLERR);

    /*
     * When this non-STREAM socket was created we placed an extra ref on
     * the associated vnode to support asynchronous close. Drop that ref
     * here.
     */
    ASSERT(SOTOV(so)->v_count >= 2);
    VN_RELE(SOTOV(so));
out:
    so_end_fallback(so);

    if (error == 0) {
        so_integrity_check(so, &origso);
    } else {
        zcmn_err(getzoneid(), CE_WARN,
            "Failed to convert socket to TPI (err=%d). Pid = %d\n",
            error, curproc->p_pid);
        if (newsp != NULL)
            SOCKPARAMS_DEC_REF(newsp);
    }

    if (arg.soqa_exdata_mp != NULL)
        freemsg(arg.soqa_exdata_mp);
    if (arg.soqa_urgmark_mp != NULL)
        freemsg(arg.soqa_urgmark_mp);

    return (error);
}