/*	$NetBSD: uipc_socket2.c,v 1.105 2009/12/30 18:33:53 elad Exp $	*/

/*-
 * Copyright (c) 2008 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket2.c	8.2 (Berkeley) 2/14/95
 */
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uipc_socket2.c,v 1.105 2009/12/30 18:33:53 elad Exp $");

#include "opt_mbuftrace.h"
#include "opt_sb_max.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/buf.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/poll.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/kauth.h>
#include <sys/pool.h>
#include <sys/uidinfo.h>
/*
 * Primitive routines for operating on sockets and socket buffers.
 *
 * Locking rules and assumptions:
 *
 * o socket::so_lock can change on the fly.  The low level routines used
 *   to lock sockets are aware of this.  When so_lock is acquired, the
 *   routine locking must check to see if so_lock still points to the
 *   lock that was acquired.  If so_lock has changed in the meantime, the
 *   now irrelevant lock that was acquired must be dropped and the lock
 *   operation retried.  Although not proven here, this is completely safe
 *   on a multiprocessor system, even with relaxed memory ordering, given
 *   the next two rules:
 *
 * o In order to mutate so_lock, the lock pointed to by the current value
 *   of so_lock must be held: i.e., the socket must be held locked by the
 *   changing thread.  The thread must issue membar_exit() to prevent
 *   memory accesses being reordered, and can set so_lock to the desired
 *   value.  If the lock pointed to by the new value of so_lock is not
 *   held by the changing thread, the socket must then be considered
 *   unlocked.
 *
 * o If so_lock is mutated, and the previous lock referred to by so_lock
 *   could still be visible to other threads in the system (e.g. via file
 *   descriptor or protocol-internal reference), then the old lock must
 *   remain valid until the socket and/or protocol control block has been
 *   torn down.
 *
 * o If a socket has a non-NULL so_head value (i.e. is in the process of
 *   connecting), then locking the socket must also lock the socket pointed
 *   to by so_head: their lock pointers must match.
 *
 * o If a socket has connections in progress (so_q, so_q0 not empty) then
 *   locking the socket must also lock the sockets attached to both queues.
 *   Again, their lock pointers must match.
 *
 * o Beyond the initial lock assignment in socreate(), assigning locks to
 *   sockets is the responsibility of the individual protocols / protocol
 *   domains.
 */
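/*
 * Illustrative sketch (not compiled): the acquire-and-retry pattern the
 * first rule above describes.  The real implementation is solock() (in
 * sys/socketvar.h) together with solockretry() later in this file; this
 * restatement exists only to make the rule concrete.
 */
#if 0
static void
example_solock(struct socket *so)
{
	kmutex_t *lock;

	for (;;) {
		lock = so->so_lock;
		mutex_enter(lock);
		/* so_lock may have been changed while we slept. */
		if (__predict_true(lock == so->so_lock))
			break;
		/* Drop the now irrelevant lock and retry. */
		mutex_exit(lock);
	}
}
#endif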
static pool_cache_t	socket_cache;

u_long	sb_max = SB_MAX;	/* maximum socket buffer size */
static u_long	sb_max_adj;	/* adjusted sb_max */
/*
 * Procedures to manipulate state flags of socket
 * and do appropriate wakeups.  Normal sequence from the
 * active (originating) side is that soisconnecting() is
 * called during processing of connect() call,
 * resulting in an eventual call to soisconnected() if/when the
 * connection is established.  When the connection is torn down
 * soisdisconnecting() is called during processing of disconnect() call,
 * and soisdisconnected() is called when the connection to the peer
 * is totally severed.  The semantics of these routines are such that
 * connectionless protocols can call soisconnected() and soisdisconnected()
 * only, bypassing the in-progress calls when setting up a ``connection''
 * takes no time.
 *
 * From the passive side, a socket is created with
 * two queues of sockets: so_q0 for connections in progress
 * and so_q for connections already made and awaiting user acceptance.
 * As a protocol is preparing incoming connections, it creates a socket
 * structure queued on so_q0 by calling sonewconn().  When the connection
 * is established, soisconnected() is called, and transfers the
 * socket structure to so_q, making it available to accept().
 *
 * If a socket is closed with sockets on either
 * so_q0 or so_q, these sockets are dropped.
 *
 * If higher level protocols are implemented in
 * the kernel, the wakeups done here will sometimes
 * cause software-interrupt process scheduling.
 */
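/*
 * Illustrative sketch (not compiled): the passive-side sequence described
 * above, as a connection-oriented protocol might drive it.  The entry
 * point name is an assumption for illustration; error handling and the
 * actual handshake are omitted.
 */
#if 0
static void
example_incoming_connection(struct socket *head)
{
	struct socket *so;

	/* Queue an embryonic socket on head->so_q0. */
	so = sonewconn(head, 0);
	if (so == NULL)
		return;		/* listen queue full, or no memory */

	/* ... protocol handshake completes ... */

	/* Move the socket to head->so_q, where accept() will find it. */
	soisconnected(so);
}
#endif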
void
soisconnecting(struct socket *so)
{

	KASSERT(solocked(so));

	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= SS_ISCONNECTING;
}
void
soisconnected(struct socket *so)
{
	struct socket	*head;

	head = so->so_head;

	KASSERT(solocked(so));
	KASSERT(head == NULL || solocked2(so, head));

	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
	so->so_state |= SS_ISCONNECTED;
	if (head && so->so_onq == &head->so_q0) {
		if ((so->so_options & SO_ACCEPTFILTER) == 0) {
			soqremque(so, 0);
			soqinsque(head, so, 1);
			sorwakeup(head);
			cv_broadcast(&head->so_cv);
		} else {
			so->so_upcall =
			    head->so_accf->so_accept_filter->accf_callback;
			so->so_upcallarg = head->so_accf->so_accept_filter_arg;
			so->so_rcv.sb_flags |= SB_UPCALL;
			so->so_options &= ~SO_ACCEPTFILTER;
			(*so->so_upcall)(so, so->so_upcallarg,
			    POLLIN|POLLRDNORM, M_DONTWAIT);
		}
	} else {
		cv_broadcast(&so->so_cv);
	}
}
void
soisdisconnecting(struct socket *so)
{

	KASSERT(solocked(so));

	so->so_state &= ~SS_ISCONNECTING;
	so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE);
	cv_broadcast(&so->so_cv);
	sowwakeup(so);
	sorwakeup(so);
}
void
soisdisconnected(struct socket *so)
{

	KASSERT(solocked(so));

	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED);
	cv_broadcast(&so->so_cv);
	sowwakeup(so);
	sorwakeup(so);
}
void
soinit2(void)
{

	socket_cache = pool_cache_init(sizeof(struct socket), 0, 0, 0,
	    "socket", NULL, IPL_SOFTNET, NULL, NULL, NULL);
}
/*
 * When an attempt at a new connection is noted on a socket
 * which accepts connections, sonewconn is called.  If the
 * connection is possible (subject to space constraints, etc.)
 * then we allocate a new structure, properly linked into the
 * data structure of the original socket, and return this.
 * Connstatus may be 0, SS_ISCONFIRMING, or SS_ISCONNECTED.
 */
struct socket *
sonewconn(struct socket *head, int connstatus)
{
	struct socket	*so;
	int		soqueue, error;

	KASSERT(connstatus == 0 || connstatus == SS_ISCONFIRMING ||
	    connstatus == SS_ISCONNECTED);
	KASSERT(solocked(head));

	if ((head->so_options & SO_ACCEPTFILTER) != 0)
		connstatus = 0;
	soqueue = connstatus ? 1 : 0;
	if (head->so_qlen + head->so_q0len > 3 * head->so_qlimit / 2)
		return NULL;
	so = soget(false);
	if (so == NULL)
		return NULL;
	mutex_obj_hold(head->so_lock);
	so->so_lock = head->so_lock;
	so->so_type = head->so_type;
	so->so_options = head->so_options &~ SO_ACCEPTCONN;
	so->so_linger = head->so_linger;
	so->so_state = head->so_state | SS_NOFDREF;
	so->so_nbio = head->so_nbio;
	so->so_proto = head->so_proto;
	so->so_timeo = head->so_timeo;
	so->so_pgid = head->so_pgid;
	so->so_send = head->so_send;
	so->so_receive = head->so_receive;
	so->so_uidinfo = head->so_uidinfo;
	so->so_cpid = head->so_cpid;
#ifdef MBUFTRACE
	so->so_mowner = head->so_mowner;
	so->so_rcv.sb_mowner = head->so_rcv.sb_mowner;
	so->so_snd.sb_mowner = head->so_snd.sb_mowner;
#endif
	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) != 0)
		goto out;
	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
	so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
	so->so_snd.sb_timeo = head->so_snd.sb_timeo;
	so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
	so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
	soqinsque(head, so, soqueue);
	error = (*so->so_proto->pr_usrreq)(so, PRU_ATTACH, NULL, NULL,
	    NULL, NULL);
	KASSERT(solocked(so));
	if (error != 0) {
		(void) soqremque(so, soqueue);
 out:
		/*
		 * Remove accept filter if one is present.
		 * XXX Is this really needed?
		 */
		if (so->so_accf != NULL)
			(void)accept_filt_clear(so);
		so->so_state &= ~SS_NOFDREF;
		sofree(so);
		return NULL;
	}
	if (connstatus) {
		cv_broadcast(&head->so_cv);
		so->so_state |= connstatus;
	}
	return so;
}
struct socket *
soget(bool waitok)
{
	struct socket	*so;

	so = pool_cache_get(socket_cache, (waitok ? PR_WAITOK : PR_NOWAIT));
	if (__predict_false(so == NULL))
		return (NULL);
	memset(so, 0, sizeof(*so));
	TAILQ_INIT(&so->so_q0);
	TAILQ_INIT(&so->so_q);
	cv_init(&so->so_cv, "socket");
	cv_init(&so->so_rcv.sb_cv, "netio");
	cv_init(&so->so_snd.sb_cv, "netio");
	selinit(&so->so_rcv.sb_sel);
	selinit(&so->so_snd.sb_sel);
	so->so_rcv.sb_so = so;
	so->so_snd.sb_so = so;
	return so;
}
void
soput(struct socket *so)
{

	KASSERT(!cv_has_waiters(&so->so_cv));
	KASSERT(!cv_has_waiters(&so->so_rcv.sb_cv));
	KASSERT(!cv_has_waiters(&so->so_snd.sb_cv));
	seldestroy(&so->so_rcv.sb_sel);
	seldestroy(&so->so_snd.sb_sel);
	mutex_obj_free(so->so_lock);
	cv_destroy(&so->so_cv);
	cv_destroy(&so->so_rcv.sb_cv);
	cv_destroy(&so->so_snd.sb_cv);
	pool_cache_put(socket_cache, so);
}
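/*
 * Illustrative sketch (not compiled): soget() and soput() bracket a
 * socket's lifetime; socreate() and sofree() are the real callers.  The
 * pairing below is a simplified assumption-laden example: soput() frees
 * the so_lock reference, so one must be held first.
 */
#if 0
static void
example_socket_lifetime(void)
{
	struct socket *so;

	so = soget(true);		/* may sleep for memory */
	if (so == NULL)
		return;
	mutex_obj_hold(softnet_lock);	/* assumed: default lock choice */
	so->so_lock = softnet_lock;
	/* ... use the socket ... */
	soput(so);			/* drops the so_lock reference too */
}
#endif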
/*
 * Insert socket into queue of sockets.
 */
void
soqinsque(struct socket *head, struct socket *so, int q)
{

	KASSERT(solocked2(head, so));

#ifdef DIAGNOSTIC
	if (so->so_onq != NULL)
		panic("soqinsque");
#endif

	so->so_head = head;
	if (q == 0) {
		head->so_q0len++;
		so->so_onq = &head->so_q0;
	} else {
		head->so_qlen++;
		so->so_onq = &head->so_q;
	}
	TAILQ_INSERT_TAIL(so->so_onq, so, so_qe);
}
/*
 * Remove socket from queue of sockets.
 */
int
soqremque(struct socket *so, int q)
{
	struct socket	*head;

	head = so->so_head;

	KASSERT(solocked(so));
	if (q == 0) {
		if (so->so_onq != &head->so_q0)
			return (0);
		head->so_q0len--;
	} else {
		if (so->so_onq != &head->so_q)
			return (0);
		head->so_qlen--;
	}
	KASSERT(solocked2(so, head));
	TAILQ_REMOVE(so->so_onq, so, so_qe);
	so->so_onq = NULL;
	so->so_head = NULL;
	return (1);
}
/*
 * Socantsendmore indicates that no more data will be sent on the
 * socket; it would normally be applied to a socket when the user
 * informs the system that no more data is to be sent, by the protocol
 * code (in case PRU_SHUTDOWN).  Socantrcvmore indicates that no more data
 * will be received, and will normally be applied to the socket by a
 * protocol when it detects that the peer will send no more data.
 * Data queued for reading in the socket may yet be read.
 */
void
socantsendmore(struct socket *so)
{

	KASSERT(solocked(so));

	so->so_state |= SS_CANTSENDMORE;
	sowwakeup(so);
}
void
socantrcvmore(struct socket *so)
{

	KASSERT(solocked(so));

	so->so_state |= SS_CANTRCVMORE;
	sorwakeup(so);
}
/*
 * Wait for data to arrive at/drain from a socket buffer.
 */
int
sbwait(struct sockbuf *sb)
{
	struct socket	*so;
	kmutex_t	*lock;
	int		error;

	so = sb->sb_so;

	KASSERT(solocked(so));

	sb->sb_flags |= SB_NOTIFY;
	lock = so->so_lock;
	if ((sb->sb_flags & SB_NOINTR) != 0)
		error = cv_timedwait(&sb->sb_cv, lock, sb->sb_timeo);
	else
		error = cv_timedwait_sig(&sb->sb_cv, lock, sb->sb_timeo);
	if (__predict_false(lock != so->so_lock))
		solockretry(so, lock);
	return error;
}
/*
 * Wakeup processes waiting on a socket buffer.
 * Do asynchronous notification via SIGIO
 * if the socket buffer has the SB_ASYNC flag set.
 */
void
sowakeup(struct socket *so, struct sockbuf *sb, int code)
{
	int band;

	KASSERT(solocked(so));
	KASSERT(sb->sb_so == so);

	if (code == POLL_IN)
		band = POLLIN|POLLRDNORM;
	else
		band = POLLOUT|POLLWRNORM;
	sb->sb_flags &= ~SB_NOTIFY;
	selnotify(&sb->sb_sel, band, NOTE_SUBMIT);
	cv_broadcast(&sb->sb_cv);
	if (sb->sb_flags & SB_ASYNC)
		fownsignal(so->so_pgid, SIGIO, code, band, so);
	if (sb->sb_flags & SB_UPCALL)
		(*so->so_upcall)(so, so->so_upcallarg, band, M_DONTWAIT);
}
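/*
 * Illustrative sketch (not compiled): how protocol code normally reaches
 * sowakeup().  sorwakeup()/sowwakeup() are the real wrappers, defined in
 * sys/socketvar.h as sowakeup(so, &so->so_rcv, POLL_IN) and
 * sowakeup(so, &so->so_snd, POLL_OUT); shown here only to make the `code'
 * argument concrete.
 */
#if 0
static void
example_data_arrived(struct socket *so, struct mbuf *m)
{

	sbappend(&so->so_rcv, m);
	sorwakeup(so);	/* == sowakeup(so, &so->so_rcv, POLL_IN) */
}
#endif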
/*
 * Reset a socket's lock pointer.  Wake all threads waiting on the
 * socket's condition variables so that they can restart their waits
 * using the new lock.  The existing lock must be held.
 */
void
solockreset(struct socket *so, kmutex_t *lock)
{

	KASSERT(solocked(so));

	so->so_lock = lock;
	cv_broadcast(&so->so_snd.sb_cv);
	cv_broadcast(&so->so_rcv.sb_cv);
	cv_broadcast(&so->so_cv);
}
/*
 * Socket buffer (struct sockbuf) utility routines.
 *
 * Each socket contains two socket buffers: one for sending data and
 * one for receiving data.  Each buffer contains a queue of mbufs,
 * information about the number of mbufs and amount of data in the
 * queue, and other fields allowing poll() statements and notification
 * on data availability to be implemented.
 *
 * Data stored in a socket buffer is maintained as a list of records.
 * Each record is a list of mbufs chained together with the m_next
 * field.  Records are chained together with the m_nextpkt field.  The upper
 * level routine soreceive() expects the following conventions to be
 * observed when placing information in the receive buffer:
 *
 * 1. If the protocol requires each message be preceded by the sender's
 *    name, then a record containing that name must be present before
 *    any associated data (mbuf's must be of type MT_SONAME).
 * 2. If the protocol supports the exchange of ``access rights'' (really
 *    just additional data associated with the message), and there are
 *    ``rights'' to be received, then a record containing this data
 *    should be present (mbuf's must be of type MT_CONTROL).
 * 3. If a name or rights record exists, then it must be followed by
 *    a data record, perhaps of zero length.
 *
 * Before using a new socket structure it is first necessary to reserve
 * buffer space to the socket, by calling sbreserve().  This should commit
 * some of the available buffer space in the system buffer pool for the
 * socket (currently, it does nothing but enforce limits).  The space
 * should be released by calling sbrelease() when the socket is destroyed.
 */
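/*
 * Illustrative sketch (not compiled): a datagram protocol's input path
 * honouring the record conventions above.  sbappendaddr() below builds
 * the MT_SONAME record and performs the space check; the entry point
 * name is an assumption for illustration only.
 */
#if 0
static void
example_datagram_input(struct socket *so, const struct sockaddr *from,
    struct mbuf *m, struct mbuf *control)
{

	if (sbappendaddr(&so->so_rcv, from, m, control) == 0) {
		/* No space: the datagram is dropped. */
		m_freem(m);
		if (control != NULL)
			m_freem(control);
		return;
	}
	sorwakeup(so);
}
#endif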
int
sb_max_set(u_long new_sbmax)
{
	int s;

	if (new_sbmax < (16 * 1024))
		return (EINVAL);

	s = splsoftnet();
	sb_max = new_sbmax;
	sb_max_adj = (u_quad_t)new_sbmax * MCLBYTES / (MSIZE + MCLBYTES);
	splx(s);

	return (0);
}
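/*
 * Worked example (assuming the common MSIZE = 512 and MCLBYTES = 2048):
 *
 *	sb_max_adj = sb_max * 2048 / (512 + 2048) = sb_max * 4 / 5
 *
 * i.e. a 256 KB sb_max yields roughly a 204.8 KB usable data limit,
 * because in the worst case each cluster mbuf holds at most MCLBYTES of
 * data while consuming MSIZE + MCLBYTES of storage.
 */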
int
soreserve(struct socket *so, u_long sndcc, u_long rcvcc)
{

	KASSERT(so->so_lock == NULL || solocked(so));

	/*
	 * there's at least one application (a configure script of screen)
	 * which expects a fifo to be writable even if it already has
	 * "some" bytes in it.
	 *
	 * so we want to make sure (hiwat - lowat) >= (some bytes).
	 *
	 * PIPE_BUF here is an arbitrary value chosen as (some bytes) above.
	 * we expect it's large enough for such applications.
	 */
	u_long  lowat = MAX(sock_loan_thresh, MCLBYTES);
	u_long  hiwat = lowat + PIPE_BUF;

	if (sndcc < hiwat)
		sndcc = hiwat;
	if (sbreserve(&so->so_snd, sndcc, so) == 0)
		goto bad;
	if (sbreserve(&so->so_rcv, rcvcc, so) == 0)
		goto bad2;
	if (so->so_rcv.sb_lowat == 0)
		so->so_rcv.sb_lowat = 1;
	if (so->so_snd.sb_lowat == 0)
		so->so_snd.sb_lowat = lowat;
	if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
		so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
	return (0);
 bad2:
	sbrelease(&so->so_snd, so);
 bad:
	return (ENOBUFS);
}
/*
 * Allot mbufs to a sockbuf.
 * Attempt to scale mbmax so that mbcnt doesn't become limiting
 * if buffering efficiency is near the normal case.
 */
int
sbreserve(struct sockbuf *sb, u_long cc, struct socket *so)
{
	struct lwp *l = curlwp; /* XXX */
	rlim_t maxcc;
	struct uidinfo *uidinfo;

	KASSERT(so->so_lock == NULL || solocked(so));
	KASSERT(sb->sb_so == so);
	KASSERT(sb_max_adj != 0);

	if (cc == 0 || cc > sb_max_adj)
		return (0);

	maxcc = l->l_proc->p_rlimit[RLIMIT_SBSIZE].rlim_cur;

	uidinfo = so->so_uidinfo;
	if (!chgsbsize(uidinfo, &sb->sb_hiwat, cc, maxcc))
		return 0;
	sb->sb_mbmax = min(cc * 2, sb_max);
	if (sb->sb_lowat > sb->sb_hiwat)
		sb->sb_lowat = sb->sb_hiwat;
	return (1);
}
/*
 * Free mbufs held by a socket, and reserved mbuf space.  We do not assert
 * that the socket is held locked here: see sorflush().
 */
void
sbrelease(struct sockbuf *sb, struct socket *so)
{

	KASSERT(sb->sb_so == so);

	sbflush(sb);
	(void)chgsbsize(so->so_uidinfo, &sb->sb_hiwat, 0, RLIM_INFINITY);
	sb->sb_mbmax = 0;
}
/*
 * Routines to add and remove
 * data from an mbuf queue.
 *
 * The routines sbappend() or sbappendrecord() are normally called to
 * append new mbufs to a socket buffer, after checking that adequate
 * space is available, comparing the function sbspace() with the amount
 * of data to be added.  sbappendrecord() differs from sbappend() in
 * that data supplied is treated as the beginning of a new record.
 * To place a sender's address, optional access rights, and data in a
 * socket receive buffer, sbappendaddr() should be used.  To place
 * access rights and data in a socket receive buffer, sbappendrights()
 * should be used.  In either case, the new data begins a new record.
 * Note that unlike sbappend() and sbappendrecord(), these routines check
 * for the caller that there will be enough space to store the data.
 * Each fails if there is not enough space, or if it cannot find mbufs
 * to store additional information in.
 *
 * Reliable protocols may use the socket send buffer to hold data
 * awaiting acknowledgement.  Data is normally copied from a socket
 * send buffer in a protocol with m_copy for output to a peer,
 * and then removed from the socket buffer with sbdrop() or
 * sbdroprecord() when the data is acknowledged by the peer.
 */
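/*
 * Illustrative sketch (not compiled): a reliable protocol using the send
 * buffer as a retransmission store, per the comment above.  The entry
 * points and example_transmit() are assumed names, not part of this file.
 */
#if 0
static void
example_output(struct socket *so, int len)
{
	struct mbuf *m;

	/* Copy (do not remove) the first len bytes for transmission. */
	m = m_copym(so->so_snd.sb_mb, 0, len, M_DONTWAIT);
	if (m != NULL)
		example_transmit(m);
}

static void
example_acked(struct socket *so, int len)
{

	/* The peer acknowledged len bytes: drop them from the buffer. */
	sbdrop(&so->so_snd, len);
	sowwakeup(so);
}
#endif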
#ifdef SOCKBUF_DEBUG
void
sblastrecordchk(struct sockbuf *sb, const char *where)
{
	struct mbuf *m = sb->sb_mb;

	KASSERT(solocked(sb->sb_so));

	while (m && m->m_nextpkt)
		m = m->m_nextpkt;

	if (m != sb->sb_lastrecord) {
		printf("sblastrecordchk: sb_mb %p sb_lastrecord %p last %p\n",
		    sb->sb_mb, sb->sb_lastrecord, m);
		printf("packet chain:\n");
		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt)
			printf("\t%p\n", m);
		panic("sblastrecordchk from %s", where);
	}
}

void
sblastmbufchk(struct sockbuf *sb, const char *where)
{
	struct mbuf *m = sb->sb_mb;
	struct mbuf *n;

	KASSERT(solocked(sb->sb_so));

	while (m && m->m_nextpkt)
		m = m->m_nextpkt;

	while (m && m->m_next)
		m = m->m_next;

	if (m != sb->sb_mbtail) {
		printf("sblastmbufchk: sb_mb %p sb_mbtail %p last %p\n",
		    sb->sb_mb, sb->sb_mbtail, m);
		printf("packet tree:\n");
		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) {
			printf("\t");
			for (n = m; n != NULL; n = n->m_next)
				printf("%p ", n);
			printf("\n");
		}
		panic("sblastmbufchk from %s", where);
	}
}
#endif /* SOCKBUF_DEBUG */
/*
 * Link a chain of records onto a socket buffer
 */
#define	SBLINKRECORDCHAIN(sb, m0, mlast)				\
do {									\
	if ((sb)->sb_lastrecord != NULL)				\
		(sb)->sb_lastrecord->m_nextpkt = (m0);			\
	else								\
		(sb)->sb_mb = (m0);					\
	(sb)->sb_lastrecord = (mlast);					\
} while (/*CONSTCOND*/0)


#define	SBLINKRECORD(sb, m0)						\
    SBLINKRECORDCHAIN(sb, m0, m0)
/*
 * Append mbuf chain m to the last record in the
 * socket buffer sb.  The additional space associated
 * with the mbuf chain is recorded in sb.  Empty mbufs are
 * discarded and mbufs are compacted where possible.
 */
void
sbappend(struct sockbuf *sb, struct mbuf *m)
{
	struct mbuf	*n;

	KASSERT(solocked(sb->sb_so));

	if (m == NULL)
		return;

#ifdef MBUFTRACE
	m_claimm(m, sb->sb_mowner);
#endif

	SBLASTRECORDCHK(sb, "sbappend 1");

	if ((n = sb->sb_lastrecord) != NULL) {
		/*
		 * XXX Would like to simply use sb_mbtail here, but
		 * XXX I need to verify that I won't miss an EOR that
		 * XXX way.
		 */
		do {
			if (n->m_flags & M_EOR) {
				sbappendrecord(sb, m); /* XXXXXX!!!! */
				return;
			}
		} while (n->m_next && (n = n->m_next));
	} else {
		/*
		 * If this is the first record in the socket buffer, it's
		 * also the last record.
		 */
		sb->sb_lastrecord = m;
	}
	sbcompress(sb, m, n);
	SBLASTRECORDCHK(sb, "sbappend 2");
}
/*
 * This version of sbappend() should only be used when the caller
 * absolutely knows that there will never be more than one record
 * in the socket buffer, that is, a stream protocol (such as TCP).
 */
void
sbappendstream(struct sockbuf *sb, struct mbuf *m)
{

	KASSERT(solocked(sb->sb_so));
	KDASSERT(m->m_nextpkt == NULL);
	KASSERT(sb->sb_mb == sb->sb_lastrecord);

	SBLASTMBUFCHK(sb, __func__);

#ifdef MBUFTRACE
	m_claimm(m, sb->sb_mowner);
#endif

	sbcompress(sb, m, sb->sb_mbtail);

	sb->sb_lastrecord = sb->sb_mb;
	SBLASTRECORDCHK(sb, __func__);
}
void
sbcheck(struct sockbuf *sb)
{
	struct mbuf	*m, *m2;
	u_long		len, mbcnt;

	KASSERT(solocked(sb->sb_so));

	len = 0;
	mbcnt = 0;
	for (m = sb->sb_mb; m; m = m->m_nextpkt) {
		for (m2 = m; m2 != NULL; m2 = m2->m_next) {
			len += m2->m_len;
			mbcnt += MSIZE;
			if (m2->m_flags & M_EXT)
				mbcnt += m2->m_ext.ext_size;
			if (m2->m_nextpkt != NULL)
				panic("sbcheck nextpkt");
		}
	}
	if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
		printf("cc %lu != %lu || mbcnt %lu != %lu\n", len, sb->sb_cc,
		    mbcnt, sb->sb_mbcnt);
		panic("sbcheck");
	}
}
/*
 * As above, except the mbuf chain
 * begins a new record.
 */
void
sbappendrecord(struct sockbuf *sb, struct mbuf *m0)
{
	struct mbuf	*m;

	KASSERT(solocked(sb->sb_so));

	if (m0 == NULL)
		return;

#ifdef MBUFTRACE
	m_claimm(m0, sb->sb_mowner);
#endif
	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(sb, m0);
	SBLASTRECORDCHK(sb, "sbappendrecord 1");
	SBLINKRECORD(sb, m0);
	sb->sb_mbtail = m0;
	m = m0->m_next;
	m0->m_next = 0;
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	sbcompress(sb, m, m0);
	SBLASTRECORDCHK(sb, "sbappendrecord 2");
}
/*
 * As above except that OOB data
 * is inserted at the beginning of the sockbuf,
 * but after any other OOB data.
 */
void
sbinsertoob(struct sockbuf *sb, struct mbuf *m0)
{
	struct mbuf	*m, **mp;

	KASSERT(solocked(sb->sb_so));

	if (m0 == NULL)
		return;

	SBLASTRECORDCHK(sb, "sbinsertoob 1");

	for (mp = &sb->sb_mb; (m = *mp) != NULL; mp = &((*mp)->m_nextpkt)) {
	    again:
		switch (m->m_type) {

		case MT_OOBDATA:
			continue;		/* WANT next train */

		case MT_CONTROL:
			if ((m = m->m_next) != NULL)
				goto again;	/* inspect THIS train further */
		}
		break;
	}
	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(sb, m0);
	m0->m_nextpkt = *mp;
	if (*mp == NULL) {
		/* m0 is actually the new tail */
		sb->sb_lastrecord = m0;
	}
	*mp = m0;
	m = m0->m_next;
	m0->m_next = 0;
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	sbcompress(sb, m, m0);
	SBLASTRECORDCHK(sb, "sbinsertoob 2");
}
/*
 * Append address and data, and optionally, control (ancillary) data
 * to the receive queue of a socket.  If present,
 * m0 must include a packet header with total length.
 * Returns 0 if no space in sockbuf or insufficient mbufs.
 */
int
sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0,
	struct mbuf *control)
{
	struct mbuf	*m, *n, *nlast;
	int		space, len;

	KASSERT(solocked(sb->sb_so));

	space = asa->sa_len;

	if (m0 != NULL) {
		if ((m0->m_flags & M_PKTHDR) == 0)
			panic("sbappendaddr");
		space += m0->m_pkthdr.len;
#ifdef MBUFTRACE
		m_claimm(m0, sb->sb_mowner);
#endif
	}
	for (n = control; n; n = n->m_next) {
		space += n->m_len;
		MCLAIM(n, sb->sb_mowner);
		if (n->m_next == 0)	/* keep pointer to last control buf */
			break;
	}
	if (space > sbspace(sb))
		return (0);
	MGET(m, M_DONTWAIT, MT_SONAME);
	if (m == NULL)
		return (0);
	MCLAIM(m, sb->sb_mowner);
	/*
	 * XXX avoid 'comparison always true' warning which isn't easily
	 * avoided.
	 */
	len = asa->sa_len;
	if (len > MLEN) {
		MEXTMALLOC(m, asa->sa_len, M_NOWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return (0);
		}
	}
	m->m_len = asa->sa_len;
	memcpy(mtod(m, void *), asa, asa->sa_len);
	if (n)
		n->m_next = m0;		/* concatenate data to control */
	else
		control = m0;
	m->m_next = control;

	SBLASTRECORDCHK(sb, "sbappendaddr 1");

	for (n = m; n->m_next != NULL; n = n->m_next)
		sballoc(sb, n);
	sballoc(sb, n);
	nlast = n;
	SBLINKRECORD(sb, m);

	sb->sb_mbtail = nlast;
	SBLASTMBUFCHK(sb, "sbappendaddr");
	SBLASTRECORDCHK(sb, "sbappendaddr 2");

	return (1);
}
/*
 * Helper for sbappendchainaddr: prepend a struct sockaddr* to
 * an mbuf chain.
 */
static inline struct mbuf *
m_prepend_sockaddr(struct sockbuf *sb, struct mbuf *m0,
		   const struct sockaddr *asa)
{
	struct mbuf *m;
	const int salen = asa->sa_len;

	KASSERT(solocked(sb->sb_so));

	/* only the first in each chain need be a pkthdr */
	MGETHDR(m, M_DONTWAIT, MT_SONAME);
	if (m == NULL)
		return NULL;
	MCLAIM(m, sb->sb_mowner);

#ifdef notyet
	if (salen > MHLEN) {
		MEXTMALLOC(m, salen, M_NOWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return NULL;
		}
	}
#else
	KASSERT(salen <= MHLEN);
#endif
	m->m_len = salen;
	memcpy(mtod(m, void *), asa, salen);
	m->m_next = m0;
	m->m_pkthdr.len = salen + m0->m_pkthdr.len;

	return m;
}
int
sbappendaddrchain(struct sockbuf *sb, const struct sockaddr *asa,
	struct mbuf *m0, int sbprio)
{
	struct mbuf	*m, *n, *n0, *nlast;
	int		space;

	KASSERT(solocked(sb->sb_so));

	/*
	 * XXX sbprio reserved for encoding priority of this request:
	 *  SB_PRIO_NONE --> honour normal sb limits
	 *  SB_PRIO_ONESHOT_OVERFLOW --> if socket has any space,
	 *	take whole chain.  Intended for large requests
	 *	that should be delivered atomically (all, or none).
	 *  SB_PRIO_OVERDRAFT -- allow a small (2*MLEN) overflow
	 *	over normal socket limits, for messages indicating
	 *	buffer overflow in earlier normal/lower-priority messages
	 *  SB_PRIO_BESTEFFORT --> ignore limits entirely.
	 *	Intended for kernel-generated messages only.
	 *	Up to generator to avoid total mbuf resource exhaustion.
	 */

	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
		panic("sbappendaddrchain");

	space = sbspace(sb);

	/*
	 * Enforce SB_PRIO_* limits as described above.
	 */

	n0 = NULL;
	nlast = NULL;
	for (m = m0; m; m = m->m_nextpkt) {
		struct mbuf *np;

#ifdef MBUFTRACE
		m_claimm(m, sb->sb_mowner);
#endif

		/* Prepend sockaddr to this record (m) of input chain m0 */
		n = m_prepend_sockaddr(sb, m, asa);
		if (n == NULL)
			goto bad;

		/* Append record (asa+m) to end of new chain n0 */
		if (n0 == NULL)
			n0 = n;
		else
			nlast->m_nextpkt = n;

		/* Keep track of last record on new chain */
		nlast = n;

		for (np = n; np; np = np->m_next)
			sballoc(sb, np);
	}

	SBLASTRECORDCHK(sb, "sbappendaddrchain 1");

	/* Drop the entire chain of (asa+m) records onto the socket */
	SBLINKRECORDCHAIN(sb, n0, nlast);

	SBLASTRECORDCHK(sb, "sbappendaddrchain 2");

	for (m = nlast; m->m_next; m = m->m_next)
		;
	sb->sb_mbtail = m;
	SBLASTMBUFCHK(sb, "sbappendaddrchain");

	return (1);

 bad:
	/*
	 * On error, free the prepended addresses.  For consistency
	 * with sbappendaddr(), leave it to our caller to free
	 * the input record chain passed to us as m0.
	 */
	while ((n = n0) != NULL) {
		struct mbuf *np;

		/* Undo the sballoc() of this record */
		for (np = n; np; np = np->m_next)
			sbfree(sb, np);

		n0 = n->m_nextpkt;	/* iterate at next prepended address */
		MFREE(n, np);		/* free prepended address (not data) */
	}
	return (0);
}
int
sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control)
{
	struct mbuf	*m, *mlast, *n;
	int		space;

	KASSERT(solocked(sb->sb_so));

	space = 0;
	if (control == NULL)
		panic("sbappendcontrol");
	for (m = control; ; m = m->m_next) {
		space += m->m_len;
		MCLAIM(m, sb->sb_mowner);
		if (m->m_next == 0)
			break;
	}
	n = m;			/* save pointer to last control buffer */
	for (m = m0; m; m = m->m_next) {
		MCLAIM(m, sb->sb_mowner);
		space += m->m_len;
	}
	if (space > sbspace(sb))
		return (0);
	n->m_next = m0;		/* concatenate data to control */

	SBLASTRECORDCHK(sb, "sbappendcontrol 1");

	for (m = control; m->m_next != NULL; m = m->m_next)
		sballoc(sb, m);
	sballoc(sb, m);
	mlast = m;
	SBLINKRECORD(sb, control);

	sb->sb_mbtail = mlast;
	SBLASTMBUFCHK(sb, "sbappendcontrol");
	SBLASTRECORDCHK(sb, "sbappendcontrol 2");

	return (1);
}
/*
 * Compress mbuf chain m into the socket
 * buffer sb following mbuf n.  If n
 * is null, the buffer is presumed empty.
 */
void
sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n)
{
	int		eor;
	struct mbuf	*o;

	KASSERT(solocked(sb->sb_so));

	eor = 0;
	while (m) {
		eor |= m->m_flags & M_EOR;
		if (m->m_len == 0 &&
		    (eor == 0 ||
		     (((o = m->m_next) || (o = n)) &&
		      o->m_type == m->m_type))) {
			if (sb->sb_lastrecord == m)
				sb->sb_lastrecord = m->m_next;
			m = m_free(m);
			continue;
		}
		if (n && (n->m_flags & M_EOR) == 0 &&
		    /* M_TRAILINGSPACE() checks buffer writeability */
		    m->m_len <= MCLBYTES / 4 && /* XXX Don't copy too much */
		    m->m_len <= M_TRAILINGSPACE(n) &&
		    n->m_type == m->m_type) {
			memcpy(mtod(n, char *) + n->m_len, mtod(m, void *),
			    (unsigned)m->m_len);
			n->m_len += m->m_len;
			sb->sb_cc += m->m_len;
			m = m_free(m);
			continue;
		}
		if (n)
			n->m_next = m;
		else
			sb->sb_mb = m;
		sb->sb_mbtail = m;
		sballoc(sb, m);
		n = m;
		m->m_flags &= ~M_EOR;
		m = m->m_next;
		n->m_next = 0;
	}
	if (eor) {
		if (n)
			n->m_flags |= eor;
		else
			printf("semi-panic: sbcompress\n");
	}
	SBLASTMBUFCHK(sb, __func__);
}
/*
 * Free all mbufs in a sockbuf.
 * Check that all resources are reclaimed.
 */
void
sbflush(struct sockbuf *sb)
{

	KASSERT(solocked(sb->sb_so));
	KASSERT((sb->sb_flags & SB_LOCK) == 0);

	while (sb->sb_mbcnt)
		sbdrop(sb, (int)sb->sb_cc);

	KASSERT(sb->sb_cc == 0);
	KASSERT(sb->sb_mb == NULL);
	KASSERT(sb->sb_mbtail == NULL);
	KASSERT(sb->sb_lastrecord == NULL);
}
/*
 * Drop data from (the front of) a sockbuf.
 */
void
sbdrop(struct sockbuf *sb, int len)
{
	struct mbuf	*m, *mn, *next;

	KASSERT(solocked(sb->sb_so));

	next = (m = sb->sb_mb) ? m->m_nextpkt : 0;
	while (len > 0) {
		if (m == NULL) {
			if (next == NULL)
				panic("sbdrop");
			m = next;
			next = m->m_nextpkt;
			continue;
		}
		if (m->m_len > len) {
			m->m_len -= len;
			m->m_data += len;
			sb->sb_cc -= len;
			break;
		}
		len -= m->m_len;
		sbfree(sb, m);
		MFREE(m, mn);
		m = mn;
	}
	while (m && m->m_len == 0) {
		sbfree(sb, m);
		MFREE(m, mn);
		m = mn;
	}
	if (m) {
		sb->sb_mb = m;
		m->m_nextpkt = next;
	} else
		sb->sb_mb = next;
	/*
	 * First part is an inline SB_EMPTY_FIXUP().  Second part
	 * makes sure sb_lastrecord is up-to-date if we dropped
	 * part of the last record.
	 */
	m = sb->sb_mb;
	if (m == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (m->m_nextpkt == NULL)
		sb->sb_lastrecord = m;
}
/*
 * Drop a record off the front of a sockbuf
 * and move the next record to the front.
 */
void
sbdroprecord(struct sockbuf *sb)
{
	struct mbuf	*m, *mn;

	KASSERT(solocked(sb->sb_so));

	m = sb->sb_mb;
	if (m) {
		sb->sb_mb = m->m_nextpkt;
		do {
			sbfree(sb, m);
			MFREE(m, mn);
		} while ((m = mn) != NULL);
	}
	SB_EMPTY_FIXUP(sb);
}
/*
 * Create a "control" mbuf containing the specified data
 * with the specified type for presentation on a socket buffer.
 */
struct mbuf *
sbcreatecontrol(void *p, int size, int type, int level)
{
	struct cmsghdr	*cp;
	struct mbuf	*m;

	if (CMSG_SPACE(size) > MCLBYTES) {
		printf("sbcreatecontrol: message too large %d\n", size);
		return NULL;
	}

	if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL)
		return ((struct mbuf *) NULL);
	if (CMSG_SPACE(size) > MLEN) {
		MCLGET(m, M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return NULL;
		}
	}
	cp = mtod(m, struct cmsghdr *);
	memcpy(CMSG_DATA(cp), p, size);
	m->m_len = CMSG_SPACE(size);
	cp->cmsg_len = CMSG_LEN(size);
	cp->cmsg_level = level;
	cp->cmsg_type = type;

	return (m);
}
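/*
 * Illustrative sketch (not compiled): a typical use of sbcreatecontrol(),
 * wrapping a struct timeval as an SCM_TIMESTAMP control message the way a
 * protocol input path might before calling sbappendaddr().  The helper
 * name is an assumption for illustration.
 */
#if 0
static struct mbuf *
example_timestamp_control(void)
{
	struct timeval tv;

	microtime(&tv);
	return sbcreatecontrol(&tv, sizeof(tv), SCM_TIMESTAMP, SOL_SOCKET);
}
#endif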
void
solockretry(struct socket *so, kmutex_t *lock)
{

	while (lock != so->so_lock) {
		mutex_exit(lock);
		lock = so->so_lock;
		mutex_enter(lock);
	}
}
bool
solocked(struct socket *so)
{

	return mutex_owned(so->so_lock);
}
bool
solocked2(struct socket *so1, struct socket *so2)
{
	kmutex_t *lock;

	lock = so1->so_lock;
	if (lock != so2->so_lock)
		return false;
	return mutex_owned(lock);
}
/*
 * Assign a default lock to a new socket.  For PRU_ATTACH, and done by
 * protocols that do not have special locking requirements.
 */
void
sosetlock(struct socket *so)
{
	kmutex_t *lock;

	if (so->so_lock == NULL) {
		lock = softnet_lock;
		so->so_lock = lock;
		mutex_obj_hold(lock);
		mutex_enter(lock);
	}

	/* In all cases, lock must be held on return from PRU_ATTACH. */
	KASSERT(solocked(so));
}
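/*
 * Illustrative sketch (not compiled): a protocol PRU_ATTACH handler with
 * no special locking needs simply calls sosetlock() first, as described
 * above.  The handler name is an assumption, not part of this file.
 */
#if 0
static int
example_attach(struct socket *so)
{

	sosetlock(so);	/* defaults so->so_lock to softnet_lock, held */
	/* ... allocate the protocol control block ... */
	return 0;
}
#endif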
/*
 * Set lock on sockbuf sb; sleep if lock is already held.
 * Unless SB_NOINTR is set on sockbuf, sleep is interruptible.
 * Returns error without lock if sleep is interrupted.
 */
int
sblock(struct sockbuf *sb, int wf)
{
	struct socket	*so;
	kmutex_t	*lock;
	int		error;

	KASSERT(solocked(sb->sb_so));

	for (;;) {
		if (__predict_true((sb->sb_flags & SB_LOCK) == 0)) {
			sb->sb_flags |= SB_LOCK;
			return 0;
		}
		if (wf != M_WAITOK)
			return EWOULDBLOCK;
		so = sb->sb_so;
		lock = so->so_lock;
		if ((sb->sb_flags & SB_NOINTR) != 0) {
			cv_wait(&so->so_cv, lock);
			error = 0;
		} else
			error = cv_wait_sig(&so->so_cv, lock);
		if (__predict_false(lock != so->so_lock))
			solockretry(so, lock);
		if (error != 0)
			return error;
	}
}
void
sbunlock(struct sockbuf *sb)
{
	struct socket	*so;

	so = sb->sb_so;

	KASSERT(solocked(so));
	KASSERT((sb->sb_flags & SB_LOCK) != 0);

	sb->sb_flags &= ~SB_LOCK;
	cv_broadcast(&so->so_cv);
}
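/*
 * Illustrative sketch (not compiled): the pattern soreceive()/sosend()
 * follow around sblock()/sbunlock(), greatly simplified.  Data
 * consumption is elided.
 */
#if 0
static int
example_receive(struct socket *so)
{
	struct sockbuf *sb = &so->so_rcv;
	int error;

	solock(so);
	if ((error = sblock(sb, M_WAITOK)) != 0) {
		sounlock(so);
		return error;
	}
	while (sb->sb_cc == 0) {
		/* sbwait() drops and reacquires the socket lock. */
		if ((error = sbwait(sb)) != 0)
			break;
	}
	/* ... consume data from sb->sb_mb ... */
	sbunlock(sb);
	sounlock(so);
	return error;
}
#endif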
int
sowait(struct socket *so, bool catch, int timo)
{
	kmutex_t	*lock;
	int		error;

	KASSERT(solocked(so));
	KASSERT(catch || timo != 0);

	lock = so->so_lock;
	if (catch)
		error = cv_timedwait_sig(&so->so_cv, lock, timo);
	else
		error = cv_timedwait(&so->so_cv, lock, timo);
	if (__predict_false(lock != so->so_lock))
		solockretry(so, lock);

	return error;
}