1 /* $NetBSD: uipc_socket.c,v 1.199 2009/12/30 06:58:50 elad Exp $ */
4 * Copyright (c) 2002, 2007, 2008, 2009 The NetBSD Foundation, Inc.
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Jason R. Thorpe of Wasabi Systems, Inc, and by Andrew Doran.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
33 * Copyright (c) 2004 The FreeBSD Foundation
34 * Copyright (c) 2004 Robert Watson
35 * Copyright (c) 1982, 1986, 1988, 1990, 1993
36 * The Regents of the University of California. All rights reserved.
38 * Redistribution and use in source and binary forms, with or without
39 * modification, are permitted provided that the following conditions
41 * 1. Redistributions of source code must retain the above copyright
42 * notice, this list of conditions and the following disclaimer.
43 * 2. Redistributions in binary form must reproduce the above copyright
44 * notice, this list of conditions and the following disclaimer in the
45 * documentation and/or other materials provided with the distribution.
46 * 3. Neither the name of the University nor the names of its contributors
47 * may be used to endorse or promote products derived from this software
48 * without specific prior written permission.
50 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
51 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
52 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
53 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
54 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
55 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
56 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
57 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
58 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
59 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
62 * @(#)uipc_socket.c 8.6 (Berkeley) 5/2/95
65 #include <sys/cdefs.h>
66 __KERNEL_RCSID(0, "$NetBSD: uipc_socket.c,v 1.199 2009/12/30 06:58:50 elad Exp $");
68 #include "opt_compat_netbsd.h"
69 #include "opt_sock_counters.h"
70 #include "opt_sosend_loan.h"
71 #include "opt_mbuftrace.h"
72 #include "opt_somaxkva.h"
73 #include "opt_multiprocessor.h" /* XXX */
75 #include <sys/param.h>
76 #include <sys/systm.h>
79 #include <sys/filedesc.h>
82 #include <sys/domain.h>
83 #include <sys/kernel.h>
84 #include <sys/protosw.h>
85 #include <sys/socket.h>
86 #include <sys/socketvar.h>
87 #include <sys/signalvar.h>
88 #include <sys/resourcevar.h>
89 #include <sys/uidinfo.h>
90 #include <sys/event.h>
92 #include <sys/kauth.h>
93 #include <sys/mutex.h>
94 #include <sys/condvar.h>
97 #include <compat/sys/time.h>
98 #include <compat/sys/socket.h>
103 MALLOC_DEFINE(M_SOOPTS
, "soopts", "socket options");
104 MALLOC_DEFINE(M_SONAME
, "soname", "socket name");
106 extern const struct fileops socketops
;
108 extern int somaxconn
; /* patchable (XXX sysctl) */
109 int somaxconn
= SOMAXCONN
;
110 kmutex_t
*softnet_lock
;
112 #ifdef SOSEND_COUNTERS
113 #include <sys/device.h>
115 static struct evcnt sosend_loan_big
= EVCNT_INITIALIZER(EVCNT_TYPE_MISC
,
116 NULL
, "sosend", "loan big");
117 static struct evcnt sosend_copy_big
= EVCNT_INITIALIZER(EVCNT_TYPE_MISC
,
118 NULL
, "sosend", "copy big");
119 static struct evcnt sosend_copy_small
= EVCNT_INITIALIZER(EVCNT_TYPE_MISC
,
120 NULL
, "sosend", "copy small");
121 static struct evcnt sosend_kvalimit
= EVCNT_INITIALIZER(EVCNT_TYPE_MISC
,
122 NULL
, "sosend", "kva limit");
124 #define SOSEND_COUNTER_INCR(ev) (ev)->ev_count++
126 EVCNT_ATTACH_STATIC(sosend_loan_big
);
127 EVCNT_ATTACH_STATIC(sosend_copy_big
);
128 EVCNT_ATTACH_STATIC(sosend_copy_small
);
129 EVCNT_ATTACH_STATIC(sosend_kvalimit
);
132 #define SOSEND_COUNTER_INCR(ev) /* nothing */
134 #endif /* SOSEND_COUNTERS */
136 static struct callback_entry sokva_reclaimerentry
;
138 #if defined(SOSEND_NO_LOAN) || defined(MULTIPROCESSOR)
139 int sock_loan_thresh
= -1;
141 int sock_loan_thresh
= 4096;
144 static kmutex_t so_pendfree_lock
;
145 static struct mbuf
*so_pendfree
;
148 #define SOMAXKVA (16 * 1024 * 1024)
150 int somaxkva
= SOMAXKVA
;
152 static kcondvar_t socurkva_cv
;
154 static kauth_listener_t socket_listener
;
156 #define SOCK_LOAN_CHUNK 65536
158 static size_t sodopendfree(void);
159 static size_t sodopendfreel(void);
161 static void sysctl_kern_somaxkva_setup(void);
162 static struct sysctllog
*socket_sysctllog
;
165 sokvareserve(struct socket
*so
, vsize_t len
)
169 mutex_enter(&so_pendfree_lock
);
170 while (socurkva
+ len
> somaxkva
) {
174 * try to do pendfree.
177 freed
= sodopendfreel();
180 * if some kva was freed, try again.
186 SOSEND_COUNTER_INCR(&sosend_kvalimit
);
187 error
= cv_wait_sig(&socurkva_cv
, &so_pendfree_lock
);
194 mutex_exit(&so_pendfree_lock
);
199 sokvaunreserve(vsize_t len
)
202 mutex_enter(&so_pendfree_lock
);
204 cv_broadcast(&socurkva_cv
);
205 mutex_exit(&so_pendfree_lock
);
209 * sokvaalloc: allocate kva for loan.
213 sokvaalloc(vsize_t len
, struct socket
*so
)
221 if (sokvareserve(so
, len
) == 0)
228 lva
= uvm_km_alloc(kernel_map
, len
, 0, UVM_KMF_VAONLY
| UVM_KMF_WAITVA
);
238 * sokvafree: free kva for loan.
242 sokvafree(vaddr_t sva
, vsize_t len
)
249 uvm_km_free(kernel_map
, sva
, len
, UVM_KMF_VAONLY
);
259 sodoloanfree(struct vm_page
**pgs
, void *buf
, size_t size
)
265 KASSERT(pgs
!= NULL
);
267 eva
= round_page((vaddr_t
) buf
+ size
);
268 sva
= trunc_page((vaddr_t
) buf
);
270 npgs
= len
>> PAGE_SHIFT
;
272 pmap_kremove(sva
, len
);
273 pmap_update(pmap_kernel());
274 uvm_unloan(pgs
, npgs
, UVM_LOAN_TOPAGE
);
283 if (__predict_true(so_pendfree
== NULL
))
286 mutex_enter(&so_pendfree_lock
);
287 rv
= sodopendfreel();
288 mutex_exit(&so_pendfree_lock
);
294 * sodopendfreel: free mbufs on "pendfree" list.
295 * unlock and relock so_pendfree_lock when freeing mbufs.
297 * => called with so_pendfree_lock held.
303 struct mbuf
*m
, *next
;
306 KASSERT(mutex_owned(&so_pendfree_lock
));
308 while (so_pendfree
!= NULL
) {
311 mutex_exit(&so_pendfree_lock
);
313 for (; m
!= NULL
; m
= next
) {
315 KASSERT((~m
->m_flags
& (M_EXT
|M_EXT_PAGES
)) == 0);
316 KASSERT(m
->m_ext
.ext_refcnt
== 0);
318 rv
+= m
->m_ext
.ext_size
;
319 sodoloanfree(m
->m_ext
.ext_pgs
, m
->m_ext
.ext_buf
,
321 pool_cache_put(mb_cache
, m
);
324 mutex_enter(&so_pendfree_lock
);
331 soloanfree(struct mbuf
*m
, void *buf
, size_t size
, void *arg
)
337 * postpone freeing mbuf.
339 * we can't do it in interrupt context
340 * because we need to put kva back to kernel_map.
343 mutex_enter(&so_pendfree_lock
);
344 m
->m_next
= so_pendfree
;
346 cv_broadcast(&socurkva_cv
);
347 mutex_exit(&so_pendfree_lock
);
351 sosend_loan(struct socket
*so
, struct uio
*uio
, struct mbuf
*m
, long space
)
353 struct iovec
*iov
= uio
->uio_iov
;
361 if (VMSPACE_IS_KERNEL_P(uio
->uio_vmspace
))
364 if (iov
->iov_len
< (size_t) space
)
365 space
= iov
->iov_len
;
366 if (space
> SOCK_LOAN_CHUNK
)
367 space
= SOCK_LOAN_CHUNK
;
369 eva
= round_page((vaddr_t
) iov
->iov_base
+ space
);
370 sva
= trunc_page((vaddr_t
) iov
->iov_base
);
372 npgs
= len
>> PAGE_SHIFT
;
374 KASSERT(npgs
<= M_EXT_MAXPAGES
);
376 lva
= sokvaalloc(len
, so
);
380 error
= uvm_loan(&uio
->uio_vmspace
->vm_map
, sva
, len
,
381 m
->m_ext
.ext_pgs
, UVM_LOAN_TOPAGE
);
387 for (i
= 0, va
= lva
; i
< npgs
; i
++, va
+= PAGE_SIZE
)
388 pmap_kenter_pa(va
, VM_PAGE_TO_PHYS(m
->m_ext
.ext_pgs
[i
]),
390 pmap_update(pmap_kernel());
392 lva
+= (vaddr_t
) iov
->iov_base
& PAGE_MASK
;
394 MEXTADD(m
, (void *) lva
, space
, M_MBUF
, soloanfree
, so
);
395 m
->m_flags
|= M_EXT_PAGES
| M_EXT_ROMAP
;
397 uio
->uio_resid
-= space
;
398 /* uio_offset not updated, not set/used for write(2) */
399 uio
->uio_iov
->iov_base
= (char *)uio
->uio_iov
->iov_base
+ space
;
400 uio
->uio_iov
->iov_len
-= space
;
401 if (uio
->uio_iov
->iov_len
== 0) {
410 sokva_reclaim_callback(struct callback_entry
*ce
, void *obj
, void *arg
)
413 KASSERT(ce
== &sokva_reclaimerentry
);
414 KASSERT(obj
== NULL
);
417 if (!vm_map_starved_p(kernel_map
)) {
418 return CALLBACK_CHAIN_ABORT
;
420 return CALLBACK_CHAIN_CONTINUE
;
424 getsombuf(struct socket
*so
, int type
)
428 m
= m_get(M_WAIT
, type
);
429 MCLAIM(m
, so
->so_mowner
);
434 socket_listener_cb(kauth_cred_t cred
, kauth_action_t action
, void *cookie
,
435 void *arg0
, void *arg1
, void *arg2
, void *arg3
)
438 enum kauth_network_req req
;
440 result
= KAUTH_RESULT_DEFER
;
441 req
= (enum kauth_network_req
)arg0
;
443 if ((action
!= KAUTH_NETWORK_SOCKET
) &&
444 (action
!= KAUTH_NETWORK_BIND
))
448 case KAUTH_REQ_NETWORK_BIND_PORT
:
449 result
= KAUTH_RESULT_ALLOW
;
452 case KAUTH_REQ_NETWORK_SOCKET_DROP
: {
453 /* Normal users can only drop their own connections. */
454 struct socket
*so
= (struct socket
*)arg1
;
456 if (proc_uidmatch(cred
, so
->so_cred
))
457 result
= KAUTH_RESULT_ALLOW
;
462 case KAUTH_REQ_NETWORK_SOCKET_OPEN
:
463 /* We allow "raw" routing/bluetooth sockets to anyone. */
464 if ((u_long
)arg1
== PF_ROUTE
|| (u_long
)arg1
== PF_BLUETOOTH
)
465 result
= KAUTH_RESULT_ALLOW
;
467 /* Privileged, let secmodel handle this. */
468 if ((u_long
)arg2
== SOCK_RAW
)
472 result
= KAUTH_RESULT_ALLOW
;
476 case KAUTH_REQ_NETWORK_SOCKET_CANSEE
:
477 result
= KAUTH_RESULT_ALLOW
;
492 sysctl_kern_somaxkva_setup();
494 mutex_init(&so_pendfree_lock
, MUTEX_DEFAULT
, IPL_VM
);
495 softnet_lock
= mutex_obj_alloc(MUTEX_DEFAULT
, IPL_NONE
);
496 cv_init(&socurkva_cv
, "sokva");
499 /* Set the initial adjusted socket buffer size. */
500 if (sb_max_set(sb_max
))
501 panic("bad initial sb_max value: %lu", sb_max
);
503 callback_register(&vm_map_to_kernel(kernel_map
)->vmk_reclaim_callback
,
504 &sokva_reclaimerentry
, NULL
, sokva_reclaim_callback
);
506 socket_listener
= kauth_listen_scope(KAUTH_SCOPE_NETWORK
,
507 socket_listener_cb
, NULL
);
511 * Socket operation routines.
512 * These routines are called by the routines in
513 * sys_socket.c or from a system process, and
514 * implement the semantics of socket operations by
515 * switching out to the protocol specific routines.
519 socreate(int dom
, struct socket
**aso
, int type
, int proto
, struct lwp
*l
,
520 struct socket
*lockso
)
522 const struct protosw
*prp
;
528 error
= kauth_authorize_network(l
->l_cred
, KAUTH_NETWORK_SOCKET
,
529 KAUTH_REQ_NETWORK_SOCKET_OPEN
, KAUTH_ARG(dom
), KAUTH_ARG(type
),
535 prp
= pffindproto(dom
, proto
, type
);
537 prp
= pffindtype(dom
, type
);
539 /* no support for domain */
540 if (pffinddomain(dom
) == 0)
542 /* no support for socket type */
543 if (proto
== 0 && type
!= 0)
545 return EPROTONOSUPPORT
;
547 if (prp
->pr_usrreq
== NULL
)
548 return EPROTONOSUPPORT
;
549 if (prp
->pr_type
!= type
)
555 so
->so_send
= sosend
;
556 so
->so_receive
= soreceive
;
558 so
->so_rcv
.sb_mowner
= &prp
->pr_domain
->dom_mowner
;
559 so
->so_snd
.sb_mowner
= &prp
->pr_domain
->dom_mowner
;
560 so
->so_mowner
= &prp
->pr_domain
->dom_mowner
;
562 uid
= kauth_cred_geteuid(l
->l_cred
);
563 so
->so_uidinfo
= uid_find(uid
);
564 so
->so_cpid
= l
->l_proc
->p_pid
;
565 if (lockso
!= NULL
) {
566 /* Caller wants us to share a lock. */
567 lock
= lockso
->so_lock
;
569 mutex_obj_hold(lock
);
572 /* Lock assigned and taken during PRU_ATTACH. */
574 error
= (*prp
->pr_usrreq
)(so
, PRU_ATTACH
, NULL
,
575 (struct mbuf
*)(long)proto
, NULL
, l
);
576 KASSERT(solocked(so
));
578 so
->so_state
|= SS_NOFDREF
;
582 so
->so_cred
= kauth_cred_dup(l
->l_cred
);
588 /* On success, write file descriptor to fdout and return zero. On
589 * failure, return non-zero; *fdout will be undefined.
592 fsocreate(int domain
, struct socket
**sop
, int type
, int protocol
,
593 struct lwp
*l
, int *fdout
)
599 if ((error
= fd_allocfile(&fp
, &fd
)) != 0)
601 fp
->f_flag
= FREAD
|FWRITE
;
602 fp
->f_type
= DTYPE_SOCKET
;
603 fp
->f_ops
= &socketops
;
604 error
= socreate(domain
, &so
, type
, protocol
, l
, NULL
);
606 fd_abort(curproc
, fp
, fd
);
611 fd_affix(curproc
, fp
, fd
);
618 sofamily(const struct socket
*so
)
620 const struct protosw
*pr
;
621 const struct domain
*dom
;
623 if ((pr
= so
->so_proto
) == NULL
)
625 if ((dom
= pr
->pr_domain
) == NULL
)
627 return dom
->dom_family
;
631 sobind(struct socket
*so
, struct mbuf
*nam
, struct lwp
*l
)
636 error
= (*so
->so_proto
->pr_usrreq
)(so
, PRU_BIND
, NULL
, nam
, NULL
, l
);
642 solisten(struct socket
*so
, int backlog
, struct lwp
*l
)
647 if ((so
->so_state
& (SS_ISCONNECTED
| SS_ISCONNECTING
|
648 SS_ISDISCONNECTING
)) != 0) {
652 error
= (*so
->so_proto
->pr_usrreq
)(so
, PRU_LISTEN
, NULL
,
658 if (TAILQ_EMPTY(&so
->so_q
))
659 so
->so_options
|= SO_ACCEPTCONN
;
662 so
->so_qlimit
= min(backlog
, somaxconn
);
668 sofree(struct socket
*so
)
672 KASSERT(solocked(so
));
674 if (so
->so_pcb
|| (so
->so_state
& SS_NOFDREF
) == 0) {
680 * We must not decommission a socket that's on the accept(2)
681 * queue. If we do, then accept(2) may hang after select(2)
682 * indicated that the listening socket was ready.
684 if (!soqremque(so
, 0)) {
689 if (so
->so_rcv
.sb_hiwat
)
690 (void)chgsbsize(so
->so_uidinfo
, &so
->so_rcv
.sb_hiwat
, 0,
692 if (so
->so_snd
.sb_hiwat
)
693 (void)chgsbsize(so
->so_uidinfo
, &so
->so_snd
.sb_hiwat
, 0,
695 sbrelease(&so
->so_snd
, so
);
696 KASSERT(!cv_has_waiters(&so
->so_cv
));
697 KASSERT(!cv_has_waiters(&so
->so_rcv
.sb_cv
));
698 KASSERT(!cv_has_waiters(&so
->so_snd
.sb_cv
));
700 refs
= so
->so_aborting
; /* XXX */
701 /* Remove acccept filter if one is present. */
702 if (so
->so_accf
!= NULL
)
703 (void)accept_filt_clear(so
);
705 if (refs
== 0) /* XXX */
710 * Close a socket on last file table reference removal.
711 * Initiate disconnect if connected.
712 * Free socket when disconnect complete.
715 soclose(struct socket
*so
)
723 if (so
->so_options
& SO_ACCEPTCONN
) {
725 if ((so2
= TAILQ_FIRST(&so
->so_q0
)) != 0) {
726 KASSERT(solocked2(so
, so2
));
727 (void) soqremque(so2
, 0);
728 /* soabort drops the lock. */
733 if ((so2
= TAILQ_FIRST(&so
->so_q
)) != 0) {
734 KASSERT(solocked2(so
, so2
));
735 (void) soqremque(so2
, 1);
736 /* soabort drops the lock. */
746 if (so
->so_state
& SS_ISCONNECTED
) {
747 if ((so
->so_state
& SS_ISDISCONNECTING
) == 0) {
748 error
= sodisconnect(so
);
752 if (so
->so_options
& SO_LINGER
) {
753 if ((so
->so_state
& SS_ISDISCONNECTING
) && so
->so_nbio
)
755 while (so
->so_state
& SS_ISCONNECTED
) {
756 error
= sowait(so
, true, so
->so_linger
* hz
);
764 error2
= (*so
->so_proto
->pr_usrreq
)(so
, PRU_DETACH
,
765 NULL
, NULL
, NULL
, NULL
);
770 if (so
->so_state
& SS_NOFDREF
)
771 panic("soclose: NOFDREF");
772 kauth_cred_free(so
->so_cred
);
773 so
->so_state
|= SS_NOFDREF
;
779 * Must be called with the socket locked.. Will return with it unlocked.
782 soabort(struct socket
*so
)
787 KASSERT(solocked(so
));
788 KASSERT(so
->so_head
== NULL
);
790 so
->so_aborting
++; /* XXX */
791 error
= (*so
->so_proto
->pr_usrreq
)(so
, PRU_ABORT
, NULL
,
793 refs
= --so
->so_aborting
; /* XXX */
794 if (error
|| (refs
== 0)) {
803 soaccept(struct socket
*so
, struct mbuf
*nam
)
807 KASSERT(solocked(so
));
810 if ((so
->so_state
& SS_NOFDREF
) == 0)
811 panic("soaccept: !NOFDREF");
812 so
->so_state
&= ~SS_NOFDREF
;
813 if ((so
->so_state
& SS_ISDISCONNECTED
) == 0 ||
814 (so
->so_proto
->pr_flags
& PR_ABRTACPTDIS
) == 0)
815 error
= (*so
->so_proto
->pr_usrreq
)(so
, PRU_ACCEPT
,
816 NULL
, nam
, NULL
, NULL
);
818 error
= ECONNABORTED
;
824 soconnect(struct socket
*so
, struct mbuf
*nam
, struct lwp
*l
)
828 KASSERT(solocked(so
));
830 if (so
->so_options
& SO_ACCEPTCONN
)
833 * If protocol is connection-based, can only connect once.
834 * Otherwise, if connected, try to disconnect first.
835 * This allows user to disconnect by connecting to, e.g.,
838 if (so
->so_state
& (SS_ISCONNECTED
|SS_ISCONNECTING
) &&
839 ((so
->so_proto
->pr_flags
& PR_CONNREQUIRED
) ||
840 (error
= sodisconnect(so
))))
843 error
= (*so
->so_proto
->pr_usrreq
)(so
, PRU_CONNECT
,
849 soconnect2(struct socket
*so1
, struct socket
*so2
)
853 KASSERT(solocked2(so1
, so2
));
855 error
= (*so1
->so_proto
->pr_usrreq
)(so1
, PRU_CONNECT2
,
856 NULL
, (struct mbuf
*)so2
, NULL
, NULL
);
861 sodisconnect(struct socket
*so
)
865 KASSERT(solocked(so
));
867 if ((so
->so_state
& SS_ISCONNECTED
) == 0) {
869 } else if (so
->so_state
& SS_ISDISCONNECTING
) {
872 error
= (*so
->so_proto
->pr_usrreq
)(so
, PRU_DISCONNECT
,
873 NULL
, NULL
, NULL
, NULL
);
879 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
882 * If send must go all at once and message is larger than
883 * send buffering, then hard error.
884 * Lock against other senders.
885 * If must go all at once and not enough room now, then
886 * inform user that this would block and do nothing.
887 * Otherwise, if nonblocking, send as much as possible.
888 * The data to be sent is described by "uio" if nonzero,
889 * otherwise by the mbuf chain "top" (which must be null
890 * if uio is not). Data provided in mbuf chain must be small
891 * enough to send all at once.
893 * Returns nonzero on error, timeout or signal; callers
894 * must check for short counts if EINTR/ERESTART are returned.
895 * Data and control buffers are freed on return.
898 sosend(struct socket
*so
, struct mbuf
*addr
, struct uio
*uio
, struct mbuf
*top
,
899 struct mbuf
*control
, int flags
, struct lwp
*l
)
901 struct mbuf
**mp
, *m
;
903 long space
, len
, resid
, clen
, mlen
;
904 int error
, s
, dontroute
, atomic
;
905 short wakeup_state
= 0;
912 * solock() provides atomicity of access. splsoftnet() prevents
913 * protocol processing soft interrupts from interrupting us and
914 * blocking (expensive).
918 atomic
= sosendallatonce(so
) || top
;
920 resid
= uio
->uio_resid
;
922 resid
= top
->m_pkthdr
.len
;
924 * In theory resid should be unsigned.
925 * However, space must be signed, as it might be less than 0
926 * if we over-committed, and we must use a signed comparison
927 * of space and resid. On the other hand, a negative resid
928 * causes us to loop sending 0-length segments to the protocol.
935 (flags
& MSG_DONTROUTE
) && (so
->so_options
& SO_DONTROUTE
) == 0 &&
936 (so
->so_proto
->pr_flags
& PR_ATOMIC
);
939 clen
= control
->m_len
;
941 if ((error
= sblock(&so
->so_snd
, SBLOCKWAIT(flags
))) != 0)
944 if (so
->so_state
& SS_CANTSENDMORE
) {
949 error
= so
->so_error
;
953 if ((so
->so_state
& SS_ISCONNECTED
) == 0) {
954 if (so
->so_proto
->pr_flags
& PR_CONNREQUIRED
) {
955 if ((so
->so_state
& SS_ISCONFIRMING
) == 0 &&
956 !(resid
== 0 && clen
!= 0)) {
960 } else if (addr
== 0) {
961 error
= EDESTADDRREQ
;
965 space
= sbspace(&so
->so_snd
);
968 if ((atomic
&& resid
> so
->so_snd
.sb_hiwat
) ||
969 clen
> so
->so_snd
.sb_hiwat
) {
973 if (space
< resid
+ clen
&&
974 (atomic
|| space
< so
->so_snd
.sb_lowat
|| space
< clen
)) {
979 sbunlock(&so
->so_snd
);
980 if (wakeup_state
& SS_RESTARTSYS
) {
984 error
= sbwait(&so
->so_snd
);
987 wakeup_state
= so
->so_state
;
996 * Data is prepackaged in "top".
1000 top
->m_flags
|= M_EOR
;
1005 m
= m_gethdr(M_WAIT
, MT_DATA
);
1007 m
->m_pkthdr
.len
= 0;
1008 m
->m_pkthdr
.rcvif
= NULL
;
1010 m
= m_get(M_WAIT
, MT_DATA
);
1013 MCLAIM(m
, so
->so_snd
.sb_mowner
);
1014 if (sock_loan_thresh
>= 0 &&
1015 uio
->uio_iov
->iov_len
>= sock_loan_thresh
&&
1016 space
>= sock_loan_thresh
&&
1017 (len
= sosend_loan(so
, uio
, m
,
1019 SOSEND_COUNTER_INCR(&sosend_loan_big
);
1023 if (resid
>= MINCLSIZE
&& space
>= MCLBYTES
) {
1024 SOSEND_COUNTER_INCR(&sosend_copy_big
);
1026 if ((m
->m_flags
& M_EXT
) == 0)
1029 if (atomic
&& top
== 0) {
1030 len
= lmin(MCLBYTES
- max_hdr
,
1032 m
->m_data
+= max_hdr
;
1034 len
= lmin(MCLBYTES
, resid
);
1038 SOSEND_COUNTER_INCR(&sosend_copy_small
);
1039 len
= lmin(lmin(mlen
, resid
), space
);
1042 * For datagram protocols, leave room
1043 * for protocol headers in first mbuf.
1045 if (atomic
&& top
== 0 && len
< mlen
)
1048 error
= uiomove(mtod(m
, void *), (int)len
, uio
);
1050 resid
= uio
->uio_resid
;
1053 top
->m_pkthdr
.len
+= len
;
1060 if (flags
& MSG_EOR
)
1061 top
->m_flags
|= M_EOR
;
1064 } while (space
> 0 && atomic
);
1066 if (so
->so_state
& SS_CANTSENDMORE
) {
1071 so
->so_options
|= SO_DONTROUTE
;
1073 so
->so_state
|= SS_MORETOCOME
;
1074 error
= (*so
->so_proto
->pr_usrreq
)(so
,
1075 (flags
& MSG_OOB
) ? PRU_SENDOOB
: PRU_SEND
,
1076 top
, addr
, control
, curlwp
);
1078 so
->so_options
&= ~SO_DONTROUTE
;
1080 so
->so_state
&= ~SS_MORETOCOME
;
1087 } while (resid
&& space
> 0);
1091 sbunlock(&so
->so_snd
);
1103 * Following replacement or removal of the first mbuf on the first
1104 * mbuf chain of a socket buffer, push necessary state changes back
1105 * into the socket buffer so that other consumers see the values
1106 * consistently. 'nextrecord' is the callers locally stored value of
1107 * the original value of sb->sb_mb->m_nextpkt which must be restored
1108 * when the lead mbuf changes. NOTE: 'nextrecord' may be NULL.
1111 sbsync(struct sockbuf
*sb
, struct mbuf
*nextrecord
)
1114 KASSERT(solocked(sb
->sb_so
));
1117 * First, update for the new value of nextrecord. If necessary,
1118 * make it the first record.
1120 if (sb
->sb_mb
!= NULL
)
1121 sb
->sb_mb
->m_nextpkt
= nextrecord
;
1123 sb
->sb_mb
= nextrecord
;
1126 * Now update any dependent socket buffer fields to reflect
1127 * the new state. This is an inline of SB_EMPTY_FIXUP, with
1128 * the addition of a second clause that takes care of the
1129 * case where sb_mb has been updated, but remains the last
1132 if (sb
->sb_mb
== NULL
) {
1133 sb
->sb_mbtail
= NULL
;
1134 sb
->sb_lastrecord
= NULL
;
1135 } else if (sb
->sb_mb
->m_nextpkt
== NULL
)
1136 sb
->sb_lastrecord
= sb
->sb_mb
;
1140 * Implement receive operations on a socket.
1141 * We depend on the way that records are added to the sockbuf
1142 * by sbappend*. In particular, each record (mbufs linked through m_next)
1143 * must begin with an address if the protocol so specifies,
1144 * followed by an optional mbuf or mbufs containing ancillary data,
1145 * and then zero or more mbufs of data.
1146 * In order to avoid blocking network interrupts for the entire time here,
1147 * we splx() while doing the actual copy to user space.
1148 * Although the sockbuf is locked, new data may still be appended,
1149 * and thus we must maintain consistency of the sockbuf during that time.
1151 * The caller may receive the data as a single mbuf chain by supplying
1152 * an mbuf **mp0 for use in returning the chain. The uio is then used
1153 * only for the count in uio_resid.
1156 soreceive(struct socket
*so
, struct mbuf
**paddr
, struct uio
*uio
,
1157 struct mbuf
**mp0
, struct mbuf
**controlp
, int *flagsp
)
1159 struct lwp
*l
= curlwp
;
1160 struct mbuf
*m
, **mp
, *mt
;
1161 int atomic
, flags
, len
, error
, s
, offset
, moff
, type
, orig_resid
;
1162 const struct protosw
*pr
;
1163 struct mbuf
*nextrecord
;
1164 int mbuf_removed
= 0;
1165 const struct domain
*dom
;
1166 short wakeup_state
= 0;
1169 atomic
= pr
->pr_flags
& PR_ATOMIC
;
1170 dom
= pr
->pr_domain
;
1173 orig_resid
= uio
->uio_resid
;
1177 if (controlp
!= NULL
)
1180 flags
= *flagsp
&~ MSG_EOR
;
1184 if ((flags
& MSG_DONTWAIT
) == 0)
1187 if (flags
& MSG_OOB
) {
1188 m
= m_get(M_WAIT
, MT_DATA
);
1190 error
= (*pr
->pr_usrreq
)(so
, PRU_RCVOOB
, m
,
1191 (struct mbuf
*)(long)(flags
& MSG_PEEK
), NULL
, l
);
1196 error
= uiomove(mtod(m
, void *),
1197 (int) min(uio
->uio_resid
, m
->m_len
), uio
);
1199 } while (uio
->uio_resid
> 0 && error
== 0 && m
);
1209 * solock() provides atomicity of access. splsoftnet() prevents
1210 * protocol processing soft interrupts from interrupting us and
1211 * blocking (expensive).
1215 if (so
->so_state
& SS_ISCONFIRMING
&& uio
->uio_resid
)
1216 (*pr
->pr_usrreq
)(so
, PRU_RCVD
, NULL
, NULL
, NULL
, l
);
1219 if ((error
= sblock(&so
->so_rcv
, SBLOCKWAIT(flags
))) != 0) {
1225 m
= so
->so_rcv
.sb_mb
;
1227 * If we have less data than requested, block awaiting more
1228 * (subject to any timeout) if:
1229 * 1. the current count is less than the low water mark,
1230 * 2. MSG_WAITALL is set, and it is possible to do the entire
1231 * receive operation at once if we block (resid <= hiwat), or
1232 * 3. MSG_DONTWAIT is not set.
1233 * If MSG_WAITALL is set but resid is larger than the receive buffer,
1234 * we have to do the receive in sections, and thus risk returning
1235 * a short count if a timeout or signal occurs after we start.
1238 ((flags
& MSG_DONTWAIT
) == 0 &&
1239 so
->so_rcv
.sb_cc
< uio
->uio_resid
&&
1240 (so
->so_rcv
.sb_cc
< so
->so_rcv
.sb_lowat
||
1241 ((flags
& MSG_WAITALL
) &&
1242 uio
->uio_resid
<= so
->so_rcv
.sb_hiwat
)) &&
1243 m
->m_nextpkt
== NULL
&& !atomic
)) {
1245 if (m
== NULL
&& so
->so_rcv
.sb_cc
)
1251 error
= so
->so_error
;
1252 if ((flags
& MSG_PEEK
) == 0)
1256 if (so
->so_state
& SS_CANTRCVMORE
) {
1262 for (; m
!= NULL
; m
= m
->m_next
)
1263 if (m
->m_type
== MT_OOBDATA
|| (m
->m_flags
& M_EOR
)) {
1264 m
= so
->so_rcv
.sb_mb
;
1267 if ((so
->so_state
& (SS_ISCONNECTED
|SS_ISCONNECTING
)) == 0 &&
1268 (so
->so_proto
->pr_flags
& PR_CONNREQUIRED
)) {
1272 if (uio
->uio_resid
== 0)
1274 if (so
->so_nbio
|| (flags
& MSG_DONTWAIT
)) {
1275 error
= EWOULDBLOCK
;
1278 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive sbwait 1");
1279 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive sbwait 1");
1280 sbunlock(&so
->so_rcv
);
1281 if (wakeup_state
& SS_RESTARTSYS
)
1284 error
= sbwait(&so
->so_rcv
);
1290 wakeup_state
= so
->so_state
;
1295 * On entry here, m points to the first record of the socket buffer.
1296 * From this point onward, we maintain 'nextrecord' as a cache of the
1297 * pointer to the next record in the socket buffer. We must keep the
1298 * various socket buffer pointers and local stack versions of the
1299 * pointers in sync, pushing out modifications before dropping the
1300 * socket lock, and re-reading them when picking it up.
1302 * Otherwise, we will race with the network stack appending new data
1303 * or records onto the socket buffer by using inconsistent/stale
1304 * versions of the field, possibly resulting in socket buffer
1307 * By holding the high-level sblock(), we prevent simultaneous
1308 * readers from pulling off the front of the socket buffer.
1311 l
->l_ru
.ru_msgrcv
++;
1312 KASSERT(m
== so
->so_rcv
.sb_mb
);
1313 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 1");
1314 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 1");
1315 nextrecord
= m
->m_nextpkt
;
1316 if (pr
->pr_flags
& PR_ADDR
) {
1318 if (m
->m_type
!= MT_SONAME
)
1319 panic("receive 1a");
1322 if (flags
& MSG_PEEK
) {
1324 *paddr
= m_copy(m
, 0, m
->m_len
);
1327 sbfree(&so
->so_rcv
, m
);
1329 if (paddr
!= NULL
) {
1331 so
->so_rcv
.sb_mb
= m
->m_next
;
1333 m
= so
->so_rcv
.sb_mb
;
1335 MFREE(m
, so
->so_rcv
.sb_mb
);
1336 m
= so
->so_rcv
.sb_mb
;
1338 sbsync(&so
->so_rcv
, nextrecord
);
1343 * Process one or more MT_CONTROL mbufs present before any data mbufs
1344 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
1345 * just copy the data; if !MSG_PEEK, we call into the protocol to
1346 * perform externalization (or freeing if controlp == NULL).
1348 if (__predict_false(m
!= NULL
&& m
->m_type
== MT_CONTROL
)) {
1349 struct mbuf
*cm
= NULL
, *cmn
;
1350 struct mbuf
**cme
= &cm
;
1353 if (flags
& MSG_PEEK
) {
1354 if (controlp
!= NULL
) {
1355 *controlp
= m_copy(m
, 0, m
->m_len
);
1356 controlp
= &(*controlp
)->m_next
;
1360 sbfree(&so
->so_rcv
, m
);
1361 so
->so_rcv
.sb_mb
= m
->m_next
;
1364 cme
= &(*cme
)->m_next
;
1365 m
= so
->so_rcv
.sb_mb
;
1367 } while (m
!= NULL
&& m
->m_type
== MT_CONTROL
);
1368 if ((flags
& MSG_PEEK
) == 0)
1369 sbsync(&so
->so_rcv
, nextrecord
);
1370 for (; cm
!= NULL
; cm
= cmn
) {
1373 type
= mtod(cm
, struct cmsghdr
*)->cmsg_type
;
1374 if (controlp
!= NULL
) {
1375 if (dom
->dom_externalize
!= NULL
&&
1376 type
== SCM_RIGHTS
) {
1379 error
= (*dom
->dom_externalize
)(cm
, l
);
1384 while (*controlp
!= NULL
)
1385 controlp
= &(*controlp
)->m_next
;
1388 * Dispose of any SCM_RIGHTS message that went
1389 * through the read path rather than recv.
1391 if (dom
->dom_dispose
!= NULL
&&
1392 type
== SCM_RIGHTS
) {
1394 (*dom
->dom_dispose
)(cm
);
1401 nextrecord
= so
->so_rcv
.sb_mb
->m_nextpkt
;
1403 nextrecord
= so
->so_rcv
.sb_mb
;
1407 /* If m is non-NULL, we have some data to read. */
1408 if (__predict_true(m
!= NULL
)) {
1410 if (type
== MT_OOBDATA
)
1413 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 2");
1414 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 2");
1418 while (m
!= NULL
&& uio
->uio_resid
> 0 && error
== 0) {
1419 if (m
->m_type
== MT_OOBDATA
) {
1420 if (type
!= MT_OOBDATA
)
1422 } else if (type
== MT_OOBDATA
)
1425 else if (m
->m_type
!= MT_DATA
&& m
->m_type
!= MT_HEADER
)
1428 so
->so_state
&= ~SS_RCVATMARK
;
1430 len
= uio
->uio_resid
;
1431 if (so
->so_oobmark
&& len
> so
->so_oobmark
- offset
)
1432 len
= so
->so_oobmark
- offset
;
1433 if (len
> m
->m_len
- moff
)
1434 len
= m
->m_len
- moff
;
1436 * If mp is set, just pass back the mbufs.
1437 * Otherwise copy them out via the uio, then free.
1438 * Sockbuf must be consistent here (points to current mbuf,
1439 * it points to next record) when we drop priority;
1440 * we must note any additions to the sockbuf when we
1441 * block interrupts again.
1444 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive uiomove");
1445 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive uiomove");
1448 error
= uiomove(mtod(m
, char *) + moff
, (int)len
, uio
);
1453 * If any part of the record has been removed
1454 * (such as the MT_SONAME mbuf, which will
1455 * happen when PR_ADDR, and thus also
1456 * PR_ATOMIC, is set), then drop the entire
1457 * record to maintain the atomicity of the
1458 * receive operation.
1460 * This avoids a later panic("receive 1a")
1461 * when compiled with DIAGNOSTIC.
1463 if (m
&& mbuf_removed
&& atomic
)
1464 (void) sbdroprecord(&so
->so_rcv
);
1469 uio
->uio_resid
-= len
;
1470 if (len
== m
->m_len
- moff
) {
1471 if (m
->m_flags
& M_EOR
)
1473 if (flags
& MSG_PEEK
) {
1477 nextrecord
= m
->m_nextpkt
;
1478 sbfree(&so
->so_rcv
, m
);
1482 so
->so_rcv
.sb_mb
= m
= m
->m_next
;
1485 MFREE(m
, so
->so_rcv
.sb_mb
);
1486 m
= so
->so_rcv
.sb_mb
;
1489 * If m != NULL, we also know that
1490 * so->so_rcv.sb_mb != NULL.
1492 KASSERT(so
->so_rcv
.sb_mb
== m
);
1494 m
->m_nextpkt
= nextrecord
;
1495 if (nextrecord
== NULL
)
1496 so
->so_rcv
.sb_lastrecord
= m
;
1498 so
->so_rcv
.sb_mb
= nextrecord
;
1499 SB_EMPTY_FIXUP(&so
->so_rcv
);
1501 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 3");
1502 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 3");
1504 } else if (flags
& MSG_PEEK
)
1508 mt
= m_copym(m
, 0, len
, M_NOWAIT
);
1509 if (__predict_false(mt
== NULL
)) {
1511 mt
= m_copym(m
, 0, len
, M_WAIT
);
1518 so
->so_rcv
.sb_cc
-= len
;
1520 if (so
->so_oobmark
) {
1521 if ((flags
& MSG_PEEK
) == 0) {
1522 so
->so_oobmark
-= len
;
1523 if (so
->so_oobmark
== 0) {
1524 so
->so_state
|= SS_RCVATMARK
;
1529 if (offset
== so
->so_oobmark
)
1533 if (flags
& MSG_EOR
)
1536 * If the MSG_WAITALL flag is set (for non-atomic socket),
1537 * we must not quit until "uio->uio_resid == 0" or an error
1538 * termination. If a signal/timeout occurs, return
1539 * with a short count but without error.
1540 * Keep sockbuf locked against other readers.
1542 while (flags
& MSG_WAITALL
&& m
== NULL
&& uio
->uio_resid
> 0 &&
1543 !sosendallatonce(so
) && !nextrecord
) {
1544 if (so
->so_error
|| so
->so_state
& SS_CANTRCVMORE
)
1547 * If we are peeking and the socket receive buffer is
1548 * full, stop since we can't get more data to peek at.
1550 if ((flags
& MSG_PEEK
) && sbspace(&so
->so_rcv
) <= 0)
1553 * If we've drained the socket buffer, tell the
1554 * protocol in case it needs to do something to
1555 * get it filled again.
1557 if ((pr
->pr_flags
& PR_WANTRCVD
) && so
->so_pcb
)
1558 (*pr
->pr_usrreq
)(so
, PRU_RCVD
,
1559 NULL
, (struct mbuf
*)(long)flags
, NULL
, l
);
1560 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive sbwait 2");
1561 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive sbwait 2");
1562 if (wakeup_state
& SS_RESTARTSYS
)
1565 error
= sbwait(&so
->so_rcv
);
1567 sbunlock(&so
->so_rcv
);
1572 if ((m
= so
->so_rcv
.sb_mb
) != NULL
)
1573 nextrecord
= m
->m_nextpkt
;
1574 wakeup_state
= so
->so_state
;
1580 if ((flags
& MSG_PEEK
) == 0)
1581 (void) sbdroprecord(&so
->so_rcv
);
1583 if ((flags
& MSG_PEEK
) == 0) {
1586 * First part is an inline SB_EMPTY_FIXUP(). Second
1587 * part makes sure sb_lastrecord is up-to-date if
1588 * there is still data in the socket buffer.
1590 so
->so_rcv
.sb_mb
= nextrecord
;
1591 if (so
->so_rcv
.sb_mb
== NULL
) {
1592 so
->so_rcv
.sb_mbtail
= NULL
;
1593 so
->so_rcv
.sb_lastrecord
= NULL
;
1594 } else if (nextrecord
->m_nextpkt
== NULL
)
1595 so
->so_rcv
.sb_lastrecord
= nextrecord
;
1597 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 4");
1598 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 4");
1599 if (pr
->pr_flags
& PR_WANTRCVD
&& so
->so_pcb
)
1600 (*pr
->pr_usrreq
)(so
, PRU_RCVD
, NULL
,
1601 (struct mbuf
*)(long)flags
, NULL
, l
);
1603 if (orig_resid
== uio
->uio_resid
&& orig_resid
&&
1604 (flags
& MSG_EOR
) == 0 && (so
->so_state
& SS_CANTRCVMORE
) == 0) {
1605 sbunlock(&so
->so_rcv
);
1612 sbunlock(&so
->so_rcv
);
1619 soshutdown(struct socket
*so
, int how
)
1621 const struct protosw
*pr
;
1624 KASSERT(solocked(so
));
1627 if (!(how
== SHUT_RD
|| how
== SHUT_WR
|| how
== SHUT_RDWR
))
1630 if (how
== SHUT_RD
|| how
== SHUT_RDWR
) {
1634 if (how
== SHUT_WR
|| how
== SHUT_RDWR
)
1635 error
= (*pr
->pr_usrreq
)(so
, PRU_SHUTDOWN
, NULL
,
1642 sorestart(struct socket
*so
)
1645 * An application has called close() on an fd on which another
1646 * of its threads has called a socket system call.
1647 * Mark this and wake everyone up, and code that would block again
1648 * instead returns ERESTART.
1649 * On system call re-entry the fd is validated and EBADF returned.
1650 * Any other fd will block again on the 2nd syscall.
1653 so
->so_state
|= SS_RESTARTSYS
;
1654 cv_broadcast(&so
->so_cv
);
1655 cv_broadcast(&so
->so_snd
.sb_cv
);
1656 cv_broadcast(&so
->so_rcv
.sb_cv
);
1661 sorflush(struct socket
*so
)
1663 struct sockbuf
*sb
, asb
;
1664 const struct protosw
*pr
;
1666 KASSERT(solocked(so
));
1671 sb
->sb_flags
|= SB_NOINTR
;
1672 (void )sblock(sb
, M_WAITOK
);
1676 * Clear most of the sockbuf structure, but leave some of the
1679 memset(&sb
->sb_startzero
, 0,
1680 sizeof(*sb
) - offsetof(struct sockbuf
, sb_startzero
));
1681 if (pr
->pr_flags
& PR_RIGHTS
&& pr
->pr_domain
->dom_dispose
) {
1683 (*pr
->pr_domain
->dom_dispose
)(asb
.sb_mb
);
1686 sbrelease(&asb
, so
);
1690 * internal set SOL_SOCKET options
1693 sosetopt1(struct socket
*so
, const struct sockopt
*sopt
)
1695 int error
= EINVAL
, optval
, opt
;
1699 switch ((opt
= sopt
->sopt_name
)) {
1701 case SO_ACCEPTFILTER
:
1702 error
= accept_filt_setopt(so
, sopt
);
1703 KASSERT(solocked(so
));
1707 error
= sockopt_get(sopt
, &l
, sizeof(l
));
1711 if (l
.l_linger
< 0 || l
.l_linger
> USHRT_MAX
||
1712 l
.l_linger
> (INT_MAX
/ hz
)) {
1716 so
->so_linger
= l
.l_linger
;
1718 so
->so_options
|= SO_LINGER
;
1720 so
->so_options
&= ~SO_LINGER
;
1726 case SO_USELOOPBACK
:
1732 #ifdef SO_OTIMESTAMP
1735 error
= sockopt_getint(sopt
, &optval
);
1740 so
->so_options
|= opt
;
1742 so
->so_options
&= ~opt
;
1749 error
= sockopt_getint(sopt
, &optval
);
1755 * Values < 1 make no sense for any of these
1756 * options, so disallow them.
1765 if (sbreserve(&so
->so_snd
, (u_long
)optval
, so
) == 0) {
1769 so
->so_snd
.sb_flags
&= ~SB_AUTOSIZE
;
1773 if (sbreserve(&so
->so_rcv
, (u_long
)optval
, so
) == 0) {
1777 so
->so_rcv
.sb_flags
&= ~SB_AUTOSIZE
;
1781 * Make sure the low-water is never greater than
1785 if (optval
> so
->so_snd
.sb_hiwat
)
1786 optval
= so
->so_snd
.sb_hiwat
;
1788 so
->so_snd
.sb_lowat
= optval
;
1792 if (optval
> so
->so_rcv
.sb_hiwat
)
1793 optval
= so
->so_rcv
.sb_hiwat
;
1795 so
->so_rcv
.sb_lowat
= optval
;
1802 case SO_ORCVTIMEO
: {
1803 struct timeval50 otv
;
1804 error
= sockopt_get(sopt
, &otv
, sizeof(otv
));
1809 timeval50_to_timeval(&otv
, &tv
);
1810 opt
= opt
== SO_OSNDTIMEO
? SO_SNDTIMEO
: SO_RCVTIMEO
;
1814 #endif /* COMPAT_50 */
1819 error
= sockopt_get(sopt
, &tv
, sizeof(tv
));
1824 if (tv
.tv_sec
> (INT_MAX
- tv
.tv_usec
/ tick
) / hz
) {
1829 optval
= tv
.tv_sec
* hz
+ tv
.tv_usec
/ tick
;
1830 if (optval
== 0 && tv
.tv_usec
!= 0)
1835 so
->so_snd
.sb_timeo
= optval
;
1838 so
->so_rcv
.sb_timeo
= optval
;
1845 error
= ENOPROTOOPT
;
1848 KASSERT(solocked(so
));
1853 sosetopt(struct socket
*so
, struct sockopt
*sopt
)
1857 if (sopt
->sopt_level
== SOL_SOCKET
) {
1858 error
= sosetopt1(so
, sopt
);
1859 KASSERT(solocked(so
));
1861 error
= ENOPROTOOPT
;
1865 if ((error
== 0 || error
== ENOPROTOOPT
) &&
1866 so
->so_proto
!= NULL
&& so
->so_proto
->pr_ctloutput
!= NULL
) {
1867 /* give the protocol stack a shot */
1868 prerr
= (*so
->so_proto
->pr_ctloutput
)(PRCO_SETOPT
, so
, sopt
);
1871 else if (prerr
!= ENOPROTOOPT
)
1879 * so_setsockopt() is a wrapper providing a sockopt structure for sosetopt()
1882 so_setsockopt(struct lwp
*l
, struct socket
*so
, int level
, int name
,
1883 const void *val
, size_t valsize
)
1885 struct sockopt sopt
;
1888 KASSERT(valsize
== 0 || val
!= NULL
);
1890 sockopt_init(&sopt
, level
, name
, valsize
);
1891 sockopt_set(&sopt
, val
, valsize
);
1893 error
= sosetopt(so
, &sopt
);
1895 sockopt_destroy(&sopt
);
1901 * internal get SOL_SOCKET options
1904 sogetopt1(struct socket
*so
, struct sockopt
*sopt
)
1906 int error
, optval
, opt
;
1910 switch ((opt
= sopt
->sopt_name
)) {
1912 case SO_ACCEPTFILTER
:
1913 error
= accept_filt_getopt(so
, sopt
);
1917 l
.l_onoff
= (so
->so_options
& SO_LINGER
) ? 1 : 0;
1918 l
.l_linger
= so
->so_linger
;
1920 error
= sockopt_set(sopt
, &l
, sizeof(l
));
1923 case SO_USELOOPBACK
:
1932 #ifdef SO_OTIMESTAMP
1935 error
= sockopt_setint(sopt
, (so
->so_options
& opt
) ? 1 : 0);
1939 error
= sockopt_setint(sopt
, so
->so_type
);
1943 error
= sockopt_setint(sopt
, so
->so_error
);
1948 error
= sockopt_setint(sopt
, so
->so_snd
.sb_hiwat
);
1952 error
= sockopt_setint(sopt
, so
->so_rcv
.sb_hiwat
);
1956 error
= sockopt_setint(sopt
, so
->so_snd
.sb_lowat
);
1960 error
= sockopt_setint(sopt
, so
->so_rcv
.sb_lowat
);
1965 case SO_ORCVTIMEO
: {
1966 struct timeval50 otv
;
1968 optval
= (opt
== SO_OSNDTIMEO
?
1969 so
->so_snd
.sb_timeo
: so
->so_rcv
.sb_timeo
);
1971 otv
.tv_sec
= optval
/ hz
;
1972 otv
.tv_usec
= (optval
% hz
) * tick
;
1974 error
= sockopt_set(sopt
, &otv
, sizeof(otv
));
1977 #endif /* COMPAT_50 */
1981 optval
= (opt
== SO_SNDTIMEO
?
1982 so
->so_snd
.sb_timeo
: so
->so_rcv
.sb_timeo
);
1984 tv
.tv_sec
= optval
/ hz
;
1985 tv
.tv_usec
= (optval
% hz
) * tick
;
1987 error
= sockopt_set(sopt
, &tv
, sizeof(tv
));
1991 error
= sockopt_setint(sopt
, so
->so_rcv
.sb_overflowed
);
1995 error
= ENOPROTOOPT
;
2003 sogetopt(struct socket
*so
, struct sockopt
*sopt
)
2008 if (sopt
->sopt_level
!= SOL_SOCKET
) {
2009 if (so
->so_proto
&& so
->so_proto
->pr_ctloutput
) {
2010 error
= ((*so
->so_proto
->pr_ctloutput
)
2011 (PRCO_GETOPT
, so
, sopt
));
2013 error
= (ENOPROTOOPT
);
2015 error
= sogetopt1(so
, sopt
);
2022 * alloc sockopt data buffer buffer
2023 * - will be released at destroy
2026 sockopt_alloc(struct sockopt
*sopt
, size_t len
, km_flag_t kmflag
)
2029 KASSERT(sopt
->sopt_size
== 0);
2031 if (len
> sizeof(sopt
->sopt_buf
)) {
2032 sopt
->sopt_data
= kmem_zalloc(len
, kmflag
);
2033 if (sopt
->sopt_data
== NULL
)
2036 sopt
->sopt_data
= sopt
->sopt_buf
;
2038 sopt
->sopt_size
= len
;
2043 * initialise sockopt storage
2044 * - MAY sleep during allocation
2047 sockopt_init(struct sockopt
*sopt
, int level
, int name
, size_t size
)
2050 memset(sopt
, 0, sizeof(*sopt
));
2052 sopt
->sopt_level
= level
;
2053 sopt
->sopt_name
= name
;
2054 (void)sockopt_alloc(sopt
, size
, KM_SLEEP
);
2058 * destroy sockopt storage
2059 * - will release any held memory references
2062 sockopt_destroy(struct sockopt
*sopt
)
2065 if (sopt
->sopt_data
!= sopt
->sopt_buf
)
2066 kmem_free(sopt
->sopt_data
, sopt
->sopt_size
);
2068 memset(sopt
, 0, sizeof(*sopt
));
2073 * - value is copied into sockopt
2074 * - memory is allocated when necessary, will not sleep
2077 sockopt_set(struct sockopt
*sopt
, const void *buf
, size_t len
)
2081 if (sopt
->sopt_size
== 0) {
2082 error
= sockopt_alloc(sopt
, len
, KM_NOSLEEP
);
2087 KASSERT(sopt
->sopt_size
== len
);
2088 memcpy(sopt
->sopt_data
, buf
, len
);
/*
 * common case of set sockopt integer value
 */
int
sockopt_setint(struct sockopt *sopt, int val)
{

	return sockopt_set(sopt, &val, sizeof(int));
}
2104 * - correct size must be given
2107 sockopt_get(const struct sockopt
*sopt
, void *buf
, size_t len
)
2110 if (sopt
->sopt_size
!= len
)
2113 memcpy(buf
, sopt
->sopt_data
, len
);
/*
 * common case of get sockopt integer value
 */
int
sockopt_getint(const struct sockopt *sopt, int *valp)
{

	return sockopt_get(sopt, valp, sizeof(int));
}
2128 * set sockopt value from mbuf
2129 * - ONLY for legacy code
2130 * - mbuf is released by sockopt
2134 sockopt_setmbuf(struct sockopt
*sopt
, struct mbuf
*m
)
2141 if (sopt
->sopt_size
== 0) {
2142 error
= sockopt_alloc(sopt
, len
, KM_NOSLEEP
);
2147 KASSERT(sopt
->sopt_size
== len
);
2148 m_copydata(m
, 0, len
, sopt
->sopt_data
);
2155 * get sockopt value into mbuf
2156 * - ONLY for legacy code
2157 * - mbuf to be released by the caller
2161 sockopt_getmbuf(const struct sockopt
*sopt
)
2165 if (sopt
->sopt_size
> MCLBYTES
)
2168 m
= m_get(M_DONTWAIT
, MT_SOOPTS
);
2172 if (sopt
->sopt_size
> MLEN
) {
2173 MCLGET(m
, M_DONTWAIT
);
2174 if ((m
->m_flags
& M_EXT
) == 0) {
2180 memcpy(mtod(m
, void *), sopt
->sopt_data
, sopt
->sopt_size
);
2181 m
->m_len
= sopt
->sopt_size
;
2187 sohasoutofband(struct socket
*so
)
2190 fownsignal(so
->so_pgid
, SIGURG
, POLL_PRI
, POLLPRI
|POLLRDBAND
, so
);
2191 selnotify(&so
->so_rcv
.sb_sel
, POLLPRI
| POLLRDBAND
, NOTE_SUBMIT
);
2195 filt_sordetach(struct knote
*kn
)
2199 so
= ((file_t
*)kn
->kn_obj
)->f_data
;
2201 SLIST_REMOVE(&so
->so_rcv
.sb_sel
.sel_klist
, kn
, knote
, kn_selnext
);
2202 if (SLIST_EMPTY(&so
->so_rcv
.sb_sel
.sel_klist
))
2203 so
->so_rcv
.sb_flags
&= ~SB_KNOTE
;
2209 filt_soread(struct knote
*kn
, long hint
)
2214 so
= ((file_t
*)kn
->kn_obj
)->f_data
;
2215 if (hint
!= NOTE_SUBMIT
)
2217 kn
->kn_data
= so
->so_rcv
.sb_cc
;
2218 if (so
->so_state
& SS_CANTRCVMORE
) {
2219 kn
->kn_flags
|= EV_EOF
;
2220 kn
->kn_fflags
= so
->so_error
;
2222 } else if (so
->so_error
) /* temporary udp error */
2224 else if (kn
->kn_sfflags
& NOTE_LOWAT
)
2225 rv
= (kn
->kn_data
>= kn
->kn_sdata
);
2227 rv
= (kn
->kn_data
>= so
->so_rcv
.sb_lowat
);
2228 if (hint
!= NOTE_SUBMIT
)
2234 filt_sowdetach(struct knote
*kn
)
2238 so
= ((file_t
*)kn
->kn_obj
)->f_data
;
2240 SLIST_REMOVE(&so
->so_snd
.sb_sel
.sel_klist
, kn
, knote
, kn_selnext
);
2241 if (SLIST_EMPTY(&so
->so_snd
.sb_sel
.sel_klist
))
2242 so
->so_snd
.sb_flags
&= ~SB_KNOTE
;
2248 filt_sowrite(struct knote
*kn
, long hint
)
2253 so
= ((file_t
*)kn
->kn_obj
)->f_data
;
2254 if (hint
!= NOTE_SUBMIT
)
2256 kn
->kn_data
= sbspace(&so
->so_snd
);
2257 if (so
->so_state
& SS_CANTSENDMORE
) {
2258 kn
->kn_flags
|= EV_EOF
;
2259 kn
->kn_fflags
= so
->so_error
;
2261 } else if (so
->so_error
) /* temporary udp error */
2263 else if (((so
->so_state
& SS_ISCONNECTED
) == 0) &&
2264 (so
->so_proto
->pr_flags
& PR_CONNREQUIRED
))
2266 else if (kn
->kn_sfflags
& NOTE_LOWAT
)
2267 rv
= (kn
->kn_data
>= kn
->kn_sdata
);
2269 rv
= (kn
->kn_data
>= so
->so_snd
.sb_lowat
);
2270 if (hint
!= NOTE_SUBMIT
)
2277 filt_solisten(struct knote
*kn
, long hint
)
2282 so
= ((file_t
*)kn
->kn_obj
)->f_data
;
2285 * Set kn_data to number of incoming connections, not
2286 * counting partial (incomplete) connections.
2288 if (hint
!= NOTE_SUBMIT
)
2290 kn
->kn_data
= so
->so_qlen
;
2291 rv
= (kn
->kn_data
> 0);
2292 if (hint
!= NOTE_SUBMIT
)
2297 static const struct filterops solisten_filtops
=
2298 { 1, NULL
, filt_sordetach
, filt_solisten
};
2299 static const struct filterops soread_filtops
=
2300 { 1, NULL
, filt_sordetach
, filt_soread
};
2301 static const struct filterops sowrite_filtops
=
2302 { 1, NULL
, filt_sowdetach
, filt_sowrite
};
2305 soo_kqfilter(struct file
*fp
, struct knote
*kn
)
2310 so
= ((file_t
*)kn
->kn_obj
)->f_data
;
2312 switch (kn
->kn_filter
) {
2314 if (so
->so_options
& SO_ACCEPTCONN
)
2315 kn
->kn_fop
= &solisten_filtops
;
2317 kn
->kn_fop
= &soread_filtops
;
2321 kn
->kn_fop
= &sowrite_filtops
;
2328 SLIST_INSERT_HEAD(&sb
->sb_sel
.sel_klist
, kn
, kn_selnext
);
2329 sb
->sb_flags
|= SB_KNOTE
;
2335 sodopoll(struct socket
*so
, int events
)
2341 if (events
& (POLLIN
| POLLRDNORM
))
2343 revents
|= events
& (POLLIN
| POLLRDNORM
);
2345 if (events
& (POLLOUT
| POLLWRNORM
))
2347 revents
|= events
& (POLLOUT
| POLLWRNORM
);
2349 if (events
& (POLLPRI
| POLLRDBAND
))
2350 if (so
->so_oobmark
|| (so
->so_state
& SS_RCVATMARK
))
2351 revents
|= events
& (POLLPRI
| POLLRDBAND
);
2357 sopoll(struct socket
*so
, int events
)
2363 * Do a quick, unlocked check in expectation that the socket
2364 * will be ready for I/O. Don't do this check if DIAGNOSTIC,
2365 * as the solocked() assertions will fail.
2367 if ((revents
= sodopoll(so
, events
)) != 0)
2372 if ((revents
= sodopoll(so
, events
)) == 0) {
2373 if (events
& (POLLIN
| POLLPRI
| POLLRDNORM
| POLLRDBAND
)) {
2374 selrecord(curlwp
, &so
->so_rcv
.sb_sel
);
2375 so
->so_rcv
.sb_flags
|= SB_NOTIFY
;
2378 if (events
& (POLLOUT
| POLLWRNORM
)) {
2379 selrecord(curlwp
, &so
->so_snd
.sb_sel
);
2380 so
->so_snd
.sb_flags
|= SB_NOTIFY
;
2389 #include <sys/sysctl.h>
2391 static int sysctl_kern_somaxkva(SYSCTLFN_PROTO
);
2394 * sysctl helper routine for kern.somaxkva. ensures that the given
2395 * value is not too small.
2396 * (XXX should we maybe make sure it's not too large as well?)
2399 sysctl_kern_somaxkva(SYSCTLFN_ARGS
)
2401 int error
, new_somaxkva
;
2402 struct sysctlnode node
;
2404 new_somaxkva
= somaxkva
;
2406 node
.sysctl_data
= &new_somaxkva
;
2407 error
= sysctl_lookup(SYSCTLFN_CALL(&node
));
2408 if (error
|| newp
== NULL
)
2411 if (new_somaxkva
< (16 * 1024 * 1024)) /* sanity */
2414 mutex_enter(&so_pendfree_lock
);
2415 somaxkva
= new_somaxkva
;
2416 cv_broadcast(&socurkva_cv
);
2417 mutex_exit(&so_pendfree_lock
);
2423 sysctl_kern_somaxkva_setup(void)
2426 KASSERT(socket_sysctllog
== NULL
);
2427 sysctl_createv(&socket_sysctllog
, 0, NULL
, NULL
,
2429 CTLTYPE_NODE
, "kern", NULL
,
2433 sysctl_createv(&socket_sysctllog
, 0, NULL
, NULL
,
2434 CTLFLAG_PERMANENT
|CTLFLAG_READWRITE
,
2435 CTLTYPE_INT
, "somaxkva",
2436 SYSCTL_DESCR("Maximum amount of kernel memory to be "
2437 "used for socket buffers"),
2438 sysctl_kern_somaxkva
, 0, NULL
, 0,
2439 CTL_KERN
, KERN_SOMAXKVA
, CTL_EOL
);