2 * Copyright (c) 1982, 1986, 1989, 1991, 1993
3 * The Regents of the University of California.
4 * Copyright (c) 2004-2007 Robert N. M. Watson
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 * 4. Neither the name of the University nor the names of its contributors
16 * may be used to endorse or promote products derived from this software
17 * without specific prior written permission.
19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * From: @(#)uipc_usrreq.c 8.3 (Berkeley) 1/4/94
35 * UNIX Domain (Local) Sockets
37 * This is an implementation of UNIX (local) domain sockets. Each socket has
38 * an associated struct unpcb (UNIX protocol control block). Stream sockets
39 * may be connected to 0 or 1 other socket. Datagram sockets may be
40 * connected to 0, 1, or many other sockets. Sockets may be created and
41 * connected in pairs (socketpair(2)), or bound/connected to using the file
42 * system name space. For most purposes, only the receive socket buffer is
43 * used, as sending on one socket delivers directly to the receive socket
44 * buffer of a second socket.
46 * The implementation is substantially complicated by the fact that
47 * "ancillary data", such as file descriptors or credentials, may be passed
48 * across UNIX domain sockets. The potential for passing UNIX domain sockets
49 * over other UNIX domain sockets requires the implementation of a simple
50 * garbage collector to find and tear down cycles of disconnected sockets.
54 * rethink name space problems
55 * need a proper out-of-band
58 #include <sys/cdefs.h>
59 __FBSDID("$FreeBSD$");
64 #include <sys/param.h>
65 #include <sys/domain.h>
66 #include <sys/fcntl.h>
67 #include <sys/malloc.h> /* XXX must be before <sys/file.h> */
68 #include <sys/eventhandler.h>
70 #include <sys/filedesc.h>
72 #include <sys/kernel.h>
75 #include <sys/mount.h>
76 #include <sys/mutex.h>
77 #include <sys/namei.h>
79 #include <sys/protosw.h>
80 #include <sys/resourcevar.h>
81 #include <sys/rwlock.h>
82 #include <sys/socket.h>
83 #include <sys/socketvar.h>
84 #include <sys/signalvar.h>
87 #include <sys/sysctl.h>
88 #include <sys/systm.h>
89 #include <sys/taskqueue.h>
91 #include <sys/unpcb.h>
92 #include <sys/vnode.h>
98 #include <security/mac/mac_framework.h>
102 static uma_zone_t unp_zone
;
103 static unp_gen_t unp_gencnt
;
104 static u_int unp_count
; /* Count of local sockets. */
105 static ino_t unp_ino
; /* Prototype for fake inode numbers. */
106 static int unp_rights
; /* File descriptors in flight. */
107 static struct unp_head unp_shead
; /* List of local stream sockets. */
108 static struct unp_head unp_dhead
; /* List of local datagram sockets. */
110 static const struct sockaddr sun_noname
= { sizeof(sun_noname
), AF_LOCAL
};
113 * Garbage collection of cyclic file descriptor/socket references occurs
114 * asynchronously in a taskqueue context in order to avoid recursion and
115 * reentrance in the UNIX domain socket, file descriptor, and socket layer
116 * code. See unp_gc() for a full description.
118 static struct task unp_gc_task
;
121 * Both send and receive buffers are allocated PIPSIZ bytes of buffering for
122 * stream sockets, although the total for sender and receiver is actually only PIPSIZ.
125 * Datagram sockets really use the sendspace as the maximum datagram size,
126 * and don't really want to reserve the sendspace. Their recvspace should be
127 * large enough for at least one max-size datagram plus address.
132 static u_long unpst_sendspace
= PIPSIZ
;
133 static u_long unpst_recvspace
= PIPSIZ
;
134 static u_long unpdg_sendspace
= 2*1024; /* really max datagram size */
135 static u_long unpdg_recvspace
= 4*1024;
137 SYSCTL_NODE(_net
, PF_LOCAL
, local
, CTLFLAG_RW
, 0, "Local domain");
138 SYSCTL_NODE(_net_local
, SOCK_STREAM
, stream
, CTLFLAG_RW
, 0, "SOCK_STREAM");
139 SYSCTL_NODE(_net_local
, SOCK_DGRAM
, dgram
, CTLFLAG_RW
, 0, "SOCK_DGRAM");
141 SYSCTL_ULONG(_net_local_stream
, OID_AUTO
, sendspace
, CTLFLAG_RW
,
142 &unpst_sendspace
, 0, "Default stream send space.");
143 SYSCTL_ULONG(_net_local_stream
, OID_AUTO
, recvspace
, CTLFLAG_RW
,
144 &unpst_recvspace
, 0, "Default stream receive space.");
145 SYSCTL_ULONG(_net_local_dgram
, OID_AUTO
, maxdgram
, CTLFLAG_RW
,
146 &unpdg_sendspace
, 0, "Default datagram send space.");
147 SYSCTL_ULONG(_net_local_dgram
, OID_AUTO
, recvspace
, CTLFLAG_RW
,
148 &unpdg_recvspace
, 0, "Default datagram receive space.");
149 SYSCTL_INT(_net_local
, OID_AUTO
, inflight
, CTLFLAG_RD
, &unp_rights
, 0,
150 "File descriptors in flight.");
153 * Locking and synchronization:
155 * The global UNIX domain socket rwlock (unp_global_rwlock) protects all
156 * global variables, including the linked lists tracking the set of allocated
157 * UNIX domain sockets. The global rwlock also serves to prevent deadlock
158 * when more than one PCB lock is acquired at a time (i.e., during
159 * connect()). Finally, the global rwlock protects uncounted references from
160 * vnodes to sockets bound to those vnodes: to safely dereference the
161 * v_socket pointer, the global rwlock must be held while a full reference is acquired.
164 * UNIX domain sockets each have an unpcb hung off of their so_pcb pointer,
165 * allocated in pru_attach() and freed in pru_detach(). The validity of that
166 * pointer is an invariant, so no lock is required to dereference the so_pcb
167 * pointer if a valid socket reference is held by the caller. In practice,
168 * this is always true during operations performed on a socket. Each unpcb
169 * has a back-pointer to its socket, unp_socket, which will be stable under
170 * the same circumstances.
172 * This pointer may only be safely dereferenced as long as a valid reference
173 * to the unpcb is held. Typically, this reference will be from the socket,
174 * or from another unpcb when the referring unpcb's lock is held (in order
175 * that the reference not be invalidated during use). For example, to follow
176 * unp->unp_conn->unp_socket, you need to hold the lock on unp, not unp_conn,
177 * as unp_socket remains valid as long as the reference to unp_conn is valid.
179 * Fields of unpcbs are locked using a per-unpcb lock, unp_mtx. Individual
180 * atomic reads without the lock may be performed "lockless", but more
181 * complex reads and read-modify-writes require the mutex to be held. No
182 * lock order is defined between unpcb locks -- multiple unpcb locks may be
183 * acquired at the same time only when holding the global UNIX domain socket
184 * rwlock exclusively, which prevents deadlocks.
186 * Blocking with UNIX domain sockets is a tricky issue: unlike most network
187 * protocols, bind() is a non-atomic operation, and connect() requires
188 * potential sleeping in the protocol, due to potentially waiting on local or
189 * distributed file systems. We try to separate "lookup" operations, which
190 * may sleep, and the IPC operations themselves, which typically can occur
191 * with relative atomicity as locks can be held over the entire operation.
193 * Another tricky issue is simultaneous multi-threaded or multi-process
194 * access to a single UNIX domain socket. These are handled by the flags
195 * UNP_CONNECTING and UNP_BINDING, which prevent concurrent connecting or
196 * binding, both of which involve dropping UNIX domain socket locks in order
197 * to perform namei() and other file system operations.
199 static struct rwlock unp_global_rwlock
;
201 #define UNP_GLOBAL_LOCK_INIT() rw_init(&unp_global_rwlock, \
204 #define UNP_GLOBAL_LOCK_ASSERT() rw_assert(&unp_global_rwlock, \
206 #define UNP_GLOBAL_UNLOCK_ASSERT() rw_assert(&unp_global_rwlock, \
209 #define UNP_GLOBAL_WLOCK() rw_wlock(&unp_global_rwlock)
210 #define UNP_GLOBAL_WUNLOCK() rw_wunlock(&unp_global_rwlock)
211 #define UNP_GLOBAL_WLOCK_ASSERT() rw_assert(&unp_global_rwlock, \
213 #define UNP_GLOBAL_WOWNED() rw_wowned(&unp_global_rwlock)
215 #define UNP_GLOBAL_RLOCK() rw_rlock(&unp_global_rwlock)
216 #define UNP_GLOBAL_RUNLOCK() rw_runlock(&unp_global_rwlock)
217 #define UNP_GLOBAL_RLOCK_ASSERT() rw_assert(&unp_global_rwlock, \
220 #define UNP_PCB_LOCK_INIT(unp) mtx_init(&(unp)->unp_mtx, \
221 "unp_mtx", "unp_mtx", \
222 MTX_DUPOK|MTX_DEF|MTX_RECURSE)
223 #define UNP_PCB_LOCK_DESTROY(unp) mtx_destroy(&(unp)->unp_mtx)
224 #define UNP_PCB_LOCK(unp) mtx_lock(&(unp)->unp_mtx)
225 #define UNP_PCB_UNLOCK(unp) mtx_unlock(&(unp)->unp_mtx)
226 #define UNP_PCB_LOCK_ASSERT(unp) mtx_assert(&(unp)->unp_mtx, MA_OWNED)
228 static int unp_connect(struct socket
*, struct sockaddr
*,
230 static int unp_connect2(struct socket
*so
, struct socket
*so2
, int);
231 static void unp_disconnect(struct unpcb
*unp
, struct unpcb
*unp2
);
232 static void unp_shutdown(struct unpcb
*);
233 static void unp_drop(struct unpcb
*, int);
234 static void unp_gc(__unused
void *, int);
235 static void unp_scan(struct mbuf
*, void (*)(struct file
*));
236 static void unp_discard(struct file
*);
237 static void unp_freerights(struct file
**, int);
238 static int unp_internalize(struct mbuf
**, struct thread
*);
239 static void unp_internalize_fp(struct file
*);
240 static void unp_externalize_fp(struct file
*);
241 static struct mbuf
*unp_addsockcred(struct thread
*, struct mbuf
*);
244 * Definitions of protocols supported in the LOCAL domain.
246 static struct domain localdomain
;
247 static struct protosw localsw
[] = {
249 .pr_type
= SOCK_STREAM
,
250 .pr_domain
= &localdomain
,
251 .pr_flags
= PR_CONNREQUIRED
|PR_WANTRCVD
|PR_RIGHTS
,
252 .pr_ctloutput
= &uipc_ctloutput
,
253 .pr_usrreqs
= &uipc_usrreqs
256 .pr_type
= SOCK_DGRAM
,
257 .pr_domain
= &localdomain
,
258 .pr_flags
= PR_ATOMIC
|PR_ADDR
|PR_RIGHTS
,
259 .pr_usrreqs
= &uipc_usrreqs
263 static struct domain localdomain
= {
264 .dom_family
= AF_LOCAL
,
266 .dom_init
= unp_init
,
267 .dom_externalize
= unp_externalize
,
268 .dom_dispose
= unp_dispose
,
269 .dom_protosw
= localsw
,
270 .dom_protoswNPROTOSW
= &localsw
[sizeof(localsw
)/sizeof(localsw
[0])]
275 uipc_abort(struct socket
*so
)
277 struct unpcb
*unp
, *unp2
;
280 KASSERT(unp
!= NULL
, ("uipc_abort: unp == NULL"));
284 unp2
= unp
->unp_conn
;
287 unp_drop(unp2
, ECONNABORTED
);
288 UNP_PCB_UNLOCK(unp2
);
291 UNP_GLOBAL_WUNLOCK();
295 uipc_accept(struct socket
*so
, struct sockaddr
**nam
)
297 struct unpcb
*unp
, *unp2
;
298 const struct sockaddr
*sa
;
301 * Pass back name of connected socket, if it was bound and we are
302 * still connected (our peer may have closed already!).
305 KASSERT(unp
!= NULL
, ("uipc_accept: unp == NULL"));
307 *nam
= malloc(sizeof(struct sockaddr_un
), M_SONAME
, M_WAITOK
);
309 unp2
= unp
->unp_conn
;
310 if (unp2
!= NULL
&& unp2
->unp_addr
!= NULL
) {
312 sa
= (struct sockaddr
*) unp2
->unp_addr
;
313 bcopy(sa
, *nam
, sa
->sa_len
);
314 UNP_PCB_UNLOCK(unp2
);
317 bcopy(sa
, *nam
, sa
->sa_len
);
319 UNP_GLOBAL_RUNLOCK();
324 uipc_attach(struct socket
*so
, int proto
, struct thread
*td
)
326 u_long sendspace
, recvspace
;
330 KASSERT(so
->so_pcb
== NULL
, ("uipc_attach: so_pcb != NULL"));
331 if (so
->so_snd
.sb_hiwat
== 0 || so
->so_rcv
.sb_hiwat
== 0) {
332 switch (so
->so_type
) {
334 sendspace
= unpst_sendspace
;
335 recvspace
= unpst_recvspace
;
339 sendspace
= unpdg_sendspace
;
340 recvspace
= unpdg_recvspace
;
344 panic("uipc_attach");
346 error
= soreserve(so
, sendspace
, recvspace
);
350 unp
= uma_zalloc(unp_zone
, M_NOWAIT
| M_ZERO
);
353 LIST_INIT(&unp
->unp_refs
);
354 UNP_PCB_LOCK_INIT(unp
);
355 unp
->unp_socket
= so
;
357 unp
->unp_refcount
= 1;
360 * uipc_attach() may be called indirectly from within the UNIX domain
361 * socket code via sonewconn() in unp_connect(). Since rwlocks can
362 * not be recursed, we do the closest thing.
365 if (!UNP_GLOBAL_WOWNED()) {
369 unp
->unp_gencnt
= ++unp_gencnt
;
371 LIST_INSERT_HEAD(so
->so_type
== SOCK_DGRAM
? &unp_dhead
: &unp_shead
,
374 UNP_GLOBAL_WUNLOCK();
380 uipc_bind(struct socket
*so
, struct sockaddr
*nam
, struct thread
*td
)
382 struct sockaddr_un
*soun
= (struct sockaddr_un
*)nam
;
384 int error
, namelen
, vfslocked
;
392 KASSERT(unp
!= NULL
, ("uipc_bind: unp == NULL"));
394 namelen
= soun
->sun_len
- offsetof(struct sockaddr_un
, sun_path
);
399 * We don't allow simultaneous bind() calls on a single UNIX domain
400 * socket, so flag in-progress operations, and return an error if an
401 * operation is already in progress.
403 * Historically, we have not allowed a socket to be rebound, so this
404 * also returns an error. Not allowing re-binding simplifies the
405 * implementation and avoids a great many possible failure modes.
408 if (unp
->unp_vnode
!= NULL
) {
412 if (unp
->unp_flags
& UNP_BINDING
) {
416 unp
->unp_flags
|= UNP_BINDING
;
419 buf
= malloc(namelen
+ 1, M_TEMP
, M_WAITOK
);
420 bcopy(soun
->sun_path
, buf
, namelen
);
425 NDINIT(&nd
, CREATE
, MPSAFE
| NOFOLLOW
| LOCKPARENT
| SAVENAME
,
426 UIO_SYSSPACE
, buf
, td
);
427 /* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
432 vfslocked
= NDHASGIANT(&nd
);
433 if (vp
!= NULL
|| vn_start_write(nd
.ni_dvp
, &mp
, V_NOWAIT
) != 0) {
434 NDFREE(&nd
, NDF_ONLY_PNBUF
);
444 error
= vn_start_write(NULL
, &mp
, V_XSLEEP
| PCATCH
);
447 VFS_UNLOCK_GIANT(vfslocked
);
451 vattr
.va_type
= VSOCK
;
452 vattr
.va_mode
= (ACCESSPERMS
& ~td
->td_proc
->p_fd
->fd_cmask
);
454 error
= mac_vnode_check_create(td
->td_ucred
, nd
.ni_dvp
, &nd
.ni_cnd
,
458 VOP_LEASE(nd
.ni_dvp
, td
, td
->td_ucred
, LEASE_WRITE
);
459 error
= VOP_CREATE(nd
.ni_dvp
, &nd
.ni_vp
, &nd
.ni_cnd
, &vattr
);
461 NDFREE(&nd
, NDF_ONLY_PNBUF
);
464 vn_finished_write(mp
);
468 ASSERT_VOP_ELOCKED(vp
, "uipc_bind");
469 soun
= (struct sockaddr_un
*)sodupsockaddr(nam
, M_WAITOK
);
473 vp
->v_socket
= unp
->unp_socket
;
475 unp
->unp_addr
= soun
;
476 unp
->unp_flags
&= ~UNP_BINDING
;
478 UNP_GLOBAL_WUNLOCK();
480 vn_finished_write(mp
);
481 VFS_UNLOCK_GIANT(vfslocked
);
486 VFS_UNLOCK_GIANT(vfslocked
);
488 unp
->unp_flags
&= ~UNP_BINDING
;
495 uipc_connect(struct socket
*so
, struct sockaddr
*nam
, struct thread
*td
)
499 KASSERT(td
== curthread
, ("uipc_connect: td != curthread"));
501 error
= unp_connect(so
, nam
, td
);
502 UNP_GLOBAL_WUNLOCK();
507 uipc_close(struct socket
*so
)
509 struct unpcb
*unp
, *unp2
;
512 KASSERT(unp
!= NULL
, ("uipc_close: unp == NULL"));
516 unp2
= unp
->unp_conn
;
519 unp_disconnect(unp
, unp2
);
520 UNP_PCB_UNLOCK(unp2
);
523 UNP_GLOBAL_WUNLOCK();
527 uipc_connect2(struct socket
*so1
, struct socket
*so2
)
529 struct unpcb
*unp
, *unp2
;
534 KASSERT(unp
!= NULL
, ("uipc_connect2: unp == NULL"));
537 KASSERT(unp2
!= NULL
, ("uipc_connect2: unp2 == NULL"));
539 error
= unp_connect2(so1
, so2
, PRU_CONNECT2
);
540 UNP_PCB_UNLOCK(unp2
);
542 UNP_GLOBAL_WUNLOCK();
546 /* control is EOPNOTSUPP */
549 uipc_detach(struct socket
*so
)
551 struct unpcb
*unp
, *unp2
;
552 struct sockaddr_un
*saved_unp_addr
;
554 int freeunp
, local_unp_rights
;
557 KASSERT(unp
!= NULL
, ("uipc_detach: unp == NULL"));
562 LIST_REMOVE(unp
, unp_link
);
563 unp
->unp_gencnt
= ++unp_gencnt
;
567 * XXXRW: Should assert vp->v_socket == so.
569 if ((vp
= unp
->unp_vnode
) != NULL
) {
570 unp
->unp_vnode
->v_socket
= NULL
;
571 unp
->unp_vnode
= NULL
;
573 unp2
= unp
->unp_conn
;
576 unp_disconnect(unp
, unp2
);
577 UNP_PCB_UNLOCK(unp2
);
581 * We hold the global lock, so it's OK to acquire multiple pcb locks
584 while (!LIST_EMPTY(&unp
->unp_refs
)) {
585 struct unpcb
*ref
= LIST_FIRST(&unp
->unp_refs
);
588 unp_drop(ref
, ECONNRESET
);
591 local_unp_rights
= unp_rights
;
592 UNP_GLOBAL_WUNLOCK();
593 unp
->unp_socket
->so_pcb
= NULL
;
594 saved_unp_addr
= unp
->unp_addr
;
595 unp
->unp_addr
= NULL
;
597 freeunp
= (unp
->unp_refcount
== 0);
598 if (saved_unp_addr
!= NULL
)
599 FREE(saved_unp_addr
, M_SONAME
);
601 UNP_PCB_LOCK_DESTROY(unp
);
602 uma_zfree(unp_zone
, unp
);
608 vfslocked
= VFS_LOCK_GIANT(vp
->v_mount
);
610 VFS_UNLOCK_GIANT(vfslocked
);
612 if (local_unp_rights
)
613 taskqueue_enqueue(taskqueue_thread
, &unp_gc_task
);
617 uipc_disconnect(struct socket
*so
)
619 struct unpcb
*unp
, *unp2
;
622 KASSERT(unp
!= NULL
, ("uipc_disconnect: unp == NULL"));
626 unp2
= unp
->unp_conn
;
629 unp_disconnect(unp
, unp2
);
630 UNP_PCB_UNLOCK(unp2
);
633 UNP_GLOBAL_WUNLOCK();
638 uipc_listen(struct socket
*so
, int backlog
, struct thread
*td
)
644 KASSERT(unp
!= NULL
, ("uipc_listen: unp == NULL"));
647 if (unp
->unp_vnode
== NULL
) {
653 error
= solisten_proto_check(so
);
655 cru2x(td
->td_ucred
, &unp
->unp_peercred
);
656 unp
->unp_flags
|= UNP_HAVEPCCACHED
;
657 solisten_proto(so
, backlog
);
665 uipc_peeraddr(struct socket
*so
, struct sockaddr
**nam
)
667 struct unpcb
*unp
, *unp2
;
668 const struct sockaddr
*sa
;
671 KASSERT(unp
!= NULL
, ("uipc_peeraddr: unp == NULL"));
673 *nam
= malloc(sizeof(struct sockaddr_un
), M_SONAME
, M_WAITOK
);
676 * XXX: It seems that this test always fails even when connection is
677 * established. So, this else clause is added as workaround to
678 * return PF_LOCAL sockaddr.
680 unp2
= unp
->unp_conn
;
683 if (unp2
->unp_addr
!= NULL
)
684 sa
= (struct sockaddr
*) unp
->unp_conn
->unp_addr
;
687 bcopy(sa
, *nam
, sa
->sa_len
);
688 UNP_PCB_UNLOCK(unp2
);
691 bcopy(sa
, *nam
, sa
->sa_len
);
698 uipc_rcvd(struct socket
*so
, int flags
)
700 struct unpcb
*unp
, *unp2
;
706 KASSERT(unp
!= NULL
, ("uipc_rcvd: unp == NULL"));
708 if (so
->so_type
== SOCK_DGRAM
)
709 panic("uipc_rcvd DGRAM?");
711 if (so
->so_type
!= SOCK_STREAM
)
712 panic("uipc_rcvd unknown socktype");
715 * Adjust backpressure on sender and wakeup any waiting to write.
717 * The unp lock is acquired to maintain the validity of the unp_conn
718 * pointer; no lock on unp2 is required as unp2->unp_socket will be
719 * static as long as we don't permit unp2 to disconnect from unp,
720 * which is prevented by the lock on unp. We cache values from
721 * so_rcv to avoid holding the so_rcv lock over the entire
722 * transaction on the remote so_snd.
724 SOCKBUF_LOCK(&so
->so_rcv
);
725 mbcnt
= so
->so_rcv
.sb_mbcnt
;
726 sbcc
= so
->so_rcv
.sb_cc
;
727 SOCKBUF_UNLOCK(&so
->so_rcv
);
729 unp2
= unp
->unp_conn
;
734 so2
= unp2
->unp_socket
;
735 SOCKBUF_LOCK(&so2
->so_snd
);
736 so2
->so_snd
.sb_mbmax
+= unp
->unp_mbcnt
- mbcnt
;
737 newhiwat
= so2
->so_snd
.sb_hiwat
+ unp
->unp_cc
- sbcc
;
738 (void)chgsbsize(so2
->so_cred
->cr_uidinfo
, &so2
->so_snd
.sb_hiwat
,
739 newhiwat
, RLIM_INFINITY
);
740 sowwakeup_locked(so2
);
741 unp
->unp_mbcnt
= mbcnt
;
747 /* pru_rcvoob is EOPNOTSUPP */
750 uipc_send(struct socket
*so
, int flags
, struct mbuf
*m
, struct sockaddr
*nam
,
751 struct mbuf
*control
, struct thread
*td
)
753 struct unpcb
*unp
, *unp2
;
760 KASSERT(unp
!= NULL
, ("uipc_send: unp == NULL"));
762 if (flags
& PRUS_OOB
) {
767 if (control
!= NULL
&& (error
= unp_internalize(&control
, td
)))
770 if ((nam
!= NULL
) || (flags
& PRUS_EOF
))
775 switch (so
->so_type
) {
778 const struct sockaddr
*from
;
780 unp2
= unp
->unp_conn
;
782 UNP_GLOBAL_WLOCK_ASSERT();
787 error
= unp_connect(so
, nam
, td
);
790 unp2
= unp
->unp_conn
;
793 * Because connect() and send() are non-atomic in a sendto()
794 * with a target address, it's possible that the socket will
795 * have disconnected before the send() can run. In that case
796 * return the slightly counter-intuitive but otherwise
797 * correct error that the socket is not connected.
804 if (unp2
->unp_flags
& UNP_WANTCRED
)
805 control
= unp_addsockcred(td
, control
);
807 if (unp
->unp_addr
!= NULL
)
808 from
= (struct sockaddr
*)unp
->unp_addr
;
811 so2
= unp2
->unp_socket
;
812 SOCKBUF_LOCK(&so2
->so_rcv
);
813 if (sbappendaddr_locked(&so2
->so_rcv
, from
, m
, control
)) {
814 sorwakeup_locked(so2
);
818 SOCKBUF_UNLOCK(&so2
->so_rcv
);
822 UNP_GLOBAL_WLOCK_ASSERT();
824 unp_disconnect(unp
, unp2
);
825 UNP_PCB_UNLOCK(unp2
);
833 * Connect if not connected yet.
835 * Note: A better implementation would complain if not equal
836 * to the peer's address.
838 if ((so
->so_state
& SS_ISCONNECTED
) == 0) {
840 UNP_GLOBAL_WLOCK_ASSERT();
841 error
= unp_connect(so
, nam
, td
);
851 if (so
->so_snd
.sb_state
& SBS_CANTSENDMORE
) {
856 * Because connect() and send() are non-atomic in a sendto()
857 * with a target address, it's possible that the socket will
858 * have disconnected before the send() can run. In that case
859 * return the slightly counter-intuitive but otherwise
860 * correct error that the socket is not connected.
862 * Locking here must be done carefully: the global lock
863 * prevents interconnections between unpcbs from changing, so
864 * we can traverse from unp to unp2 without acquiring unp's
865 * lock. Socket buffer locks follow unpcb locks, so we can
866 * acquire both remote and local socket buffer locks.
868 unp2
= unp
->unp_conn
;
873 so2
= unp2
->unp_socket
;
875 SOCKBUF_LOCK(&so2
->so_rcv
);
876 if (unp2
->unp_flags
& UNP_WANTCRED
) {
878 * Credentials are passed only once on SOCK_STREAM.
880 unp2
->unp_flags
&= ~UNP_WANTCRED
;
881 control
= unp_addsockcred(td
, control
);
884 * Send to paired receive port, and then reduce send buffer
885 * hiwater marks to maintain backpressure. Wake up readers.
887 if (control
!= NULL
) {
888 if (sbappendcontrol_locked(&so2
->so_rcv
, m
, control
))
891 sbappend_locked(&so2
->so_rcv
, m
);
892 mbcnt
= so2
->so_rcv
.sb_mbcnt
- unp2
->unp_mbcnt
;
893 unp2
->unp_mbcnt
= so2
->so_rcv
.sb_mbcnt
;
894 sbcc
= so2
->so_rcv
.sb_cc
;
895 sorwakeup_locked(so2
);
897 SOCKBUF_LOCK(&so
->so_snd
);
898 newhiwat
= so
->so_snd
.sb_hiwat
- (sbcc
- unp2
->unp_cc
);
899 (void)chgsbsize(so
->so_cred
->cr_uidinfo
, &so
->so_snd
.sb_hiwat
,
900 newhiwat
, RLIM_INFINITY
);
901 so
->so_snd
.sb_mbmax
-= mbcnt
;
902 SOCKBUF_UNLOCK(&so
->so_snd
);
904 UNP_PCB_UNLOCK(unp2
);
909 panic("uipc_send unknown socktype");
913 * SEND_EOF is equivalent to a SEND followed by a SHUTDOWN.
915 if (flags
& PRUS_EOF
) {
922 if ((nam
!= NULL
) || (flags
& PRUS_EOF
))
923 UNP_GLOBAL_WUNLOCK();
925 UNP_GLOBAL_RUNLOCK();
927 if (control
!= NULL
&& error
!= 0)
928 unp_dispose(control
);
939 uipc_sense(struct socket
*so
, struct stat
*sb
)
941 struct unpcb
*unp
, *unp2
;
945 KASSERT(unp
!= NULL
, ("uipc_sense: unp == NULL"));
947 sb
->st_blksize
= so
->so_snd
.sb_hiwat
;
950 unp2
= unp
->unp_conn
;
951 if (so
->so_type
== SOCK_STREAM
&& unp2
!= NULL
) {
952 so2
= unp2
->unp_socket
;
953 sb
->st_blksize
+= so2
->so_rcv
.sb_cc
;
956 if (unp
->unp_ino
== 0)
957 unp
->unp_ino
= (++unp_ino
== 0) ? ++unp_ino
: unp_ino
;
958 sb
->st_ino
= unp
->unp_ino
;
960 UNP_GLOBAL_RUNLOCK();
965 uipc_shutdown(struct socket
*so
)
970 KASSERT(unp
!= NULL
, ("uipc_shutdown: unp == NULL"));
977 UNP_GLOBAL_WUNLOCK();
982 uipc_sockaddr(struct socket
*so
, struct sockaddr
**nam
)
985 const struct sockaddr
*sa
;
988 KASSERT(unp
!= NULL
, ("uipc_sockaddr: unp == NULL"));
990 *nam
= malloc(sizeof(struct sockaddr_un
), M_SONAME
, M_WAITOK
);
992 if (unp
->unp_addr
!= NULL
)
993 sa
= (struct sockaddr
*) unp
->unp_addr
;
996 bcopy(sa
, *nam
, sa
->sa_len
);
1001 struct pr_usrreqs uipc_usrreqs
= {
1002 .pru_abort
= uipc_abort
,
1003 .pru_accept
= uipc_accept
,
1004 .pru_attach
= uipc_attach
,
1005 .pru_bind
= uipc_bind
,
1006 .pru_connect
= uipc_connect
,
1007 .pru_connect2
= uipc_connect2
,
1008 .pru_detach
= uipc_detach
,
1009 .pru_disconnect
= uipc_disconnect
,
1010 .pru_listen
= uipc_listen
,
1011 .pru_peeraddr
= uipc_peeraddr
,
1012 .pru_rcvd
= uipc_rcvd
,
1013 .pru_send
= uipc_send
,
1014 .pru_sense
= uipc_sense
,
1015 .pru_shutdown
= uipc_shutdown
,
1016 .pru_sockaddr
= uipc_sockaddr
,
1017 .pru_close
= uipc_close
,
1021 uipc_ctloutput(struct socket
*so
, struct sockopt
*sopt
)
1027 if (sopt
->sopt_level
!= 0)
1030 unp
= sotounpcb(so
);
1031 KASSERT(unp
!= NULL
, ("uipc_ctloutput: unp == NULL"));
1033 switch (sopt
->sopt_dir
) {
1035 switch (sopt
->sopt_name
) {
1036 case LOCAL_PEERCRED
:
1038 if (unp
->unp_flags
& UNP_HAVEPC
)
1039 xu
= unp
->unp_peercred
;
1041 if (so
->so_type
== SOCK_STREAM
)
1046 UNP_PCB_UNLOCK(unp
);
1048 error
= sooptcopyout(sopt
, &xu
, sizeof(xu
));
1052 /* Unlocked read. */
1053 optval
= unp
->unp_flags
& UNP_WANTCRED
? 1 : 0;
1054 error
= sooptcopyout(sopt
, &optval
, sizeof(optval
));
1057 case LOCAL_CONNWAIT
:
1058 /* Unlocked read. */
1059 optval
= unp
->unp_flags
& UNP_CONNWAIT
? 1 : 0;
1060 error
= sooptcopyout(sopt
, &optval
, sizeof(optval
));
1070 switch (sopt
->sopt_name
) {
1072 case LOCAL_CONNWAIT
:
1073 error
= sooptcopyin(sopt
, &optval
, sizeof(optval
),
1078 #define OPTSET(bit) do { \
1079 UNP_PCB_LOCK(unp); \
1081 unp->unp_flags |= bit; \
1083 unp->unp_flags &= ~bit; \
1084 UNP_PCB_UNLOCK(unp); \
1087 switch (sopt
->sopt_name
) {
1089 OPTSET(UNP_WANTCRED
);
1092 case LOCAL_CONNWAIT
:
1093 OPTSET(UNP_CONNWAIT
);
1102 error
= ENOPROTOOPT
;
1115 unp_connect(struct socket
*so
, struct sockaddr
*nam
, struct thread
*td
)
1117 struct sockaddr_un
*soun
= (struct sockaddr_un
*)nam
;
1119 struct socket
*so2
, *so3
;
1120 struct unpcb
*unp
, *unp2
, *unp3
;
1121 int error
, len
, vfslocked
;
1122 struct nameidata nd
;
1123 char buf
[SOCK_MAXADDRLEN
];
1124 struct sockaddr
*sa
;
1126 UNP_GLOBAL_WLOCK_ASSERT();
1128 unp
= sotounpcb(so
);
1129 KASSERT(unp
!= NULL
, ("unp_connect: unp == NULL"));
1131 len
= nam
->sa_len
- offsetof(struct sockaddr_un
, sun_path
);
1134 bcopy(soun
->sun_path
, buf
, len
);
1138 if (unp
->unp_flags
& UNP_CONNECTING
) {
1139 UNP_PCB_UNLOCK(unp
);
1142 UNP_GLOBAL_WUNLOCK();
1143 unp
->unp_flags
|= UNP_CONNECTING
;
1144 UNP_PCB_UNLOCK(unp
);
1146 sa
= malloc(sizeof(struct sockaddr_un
), M_SONAME
, M_WAITOK
);
1147 NDINIT(&nd
, LOOKUP
, MPSAFE
| FOLLOW
| LOCKLEAF
, UIO_SYSSPACE
, buf
,
1154 ASSERT_VOP_LOCKED(vp
, "unp_connect");
1155 vfslocked
= NDHASGIANT(&nd
);
1156 NDFREE(&nd
, NDF_ONLY_PNBUF
);
1160 if (vp
->v_type
!= VSOCK
) {
1165 error
= mac_vnode_check_open(td
->td_ucred
, vp
, VWRITE
| VREAD
);
1169 error
= VOP_ACCESS(vp
, VWRITE
, td
->td_ucred
, td
);
1172 VFS_UNLOCK_GIANT(vfslocked
);
1174 unp
= sotounpcb(so
);
1175 KASSERT(unp
!= NULL
, ("unp_connect: unp == NULL"));
1178 * Lock global lock for two reasons: make sure v_socket is stable,
1179 * and to protect simultaneous locking of multiple pcbs.
1184 error
= ECONNREFUSED
;
1187 if (so
->so_type
!= so2
->so_type
) {
1191 if (so
->so_proto
->pr_flags
& PR_CONNREQUIRED
) {
1192 if (so2
->so_options
& SO_ACCEPTCONN
) {
1194 * We can't drop the global lock here or 'so2' may
1195 * become invalid. As a result, we need to handle
1196 * possible lock recursion in uipc_attach.
1198 so3
= sonewconn(so2
, 0);
1202 error
= ECONNREFUSED
;
1205 unp
= sotounpcb(so
);
1206 unp2
= sotounpcb(so2
);
1207 unp3
= sotounpcb(so3
);
1211 if (unp2
->unp_addr
!= NULL
) {
1212 bcopy(unp2
->unp_addr
, sa
, unp2
->unp_addr
->sun_len
);
1213 unp3
->unp_addr
= (struct sockaddr_un
*) sa
;
1217 * unp_peercred management:
1219 * The connecter's (client's) credentials are copied from its
1220 * process structure at the time of connect() (which is now).
1222 cru2x(td
->td_ucred
, &unp3
->unp_peercred
);
1223 unp3
->unp_flags
|= UNP_HAVEPC
;
1225 * The receiver's (server's) credentials are copied from the
1226 * unp_peercred member of socket on which the former called
1227 * listen(); uipc_listen() cached that process's credentials
1228 * at that time so we can use them now.
1230 KASSERT(unp2
->unp_flags
& UNP_HAVEPCCACHED
,
1231 ("unp_connect: listener without cached peercred"));
1232 memcpy(&unp
->unp_peercred
, &unp2
->unp_peercred
,
1233 sizeof(unp
->unp_peercred
));
1234 unp
->unp_flags
|= UNP_HAVEPC
;
1235 if (unp2
->unp_flags
& UNP_WANTCRED
)
1236 unp3
->unp_flags
|= UNP_WANTCRED
;
1237 UNP_PCB_UNLOCK(unp3
);
1238 UNP_PCB_UNLOCK(unp2
);
1239 UNP_PCB_UNLOCK(unp
);
1242 mac_socketpeer_set_from_socket(so
, so3
);
1243 mac_socketpeer_set_from_socket(so3
, so
);
1249 unp
= sotounpcb(so
);
1250 KASSERT(unp
!= NULL
, ("unp_connect: unp == NULL"));
1251 unp2
= sotounpcb(so2
);
1252 KASSERT(unp2
!= NULL
, ("unp_connect: unp2 == NULL"));
1255 error
= unp_connect2(so
, so2
, PRU_CONNECT
);
1256 UNP_PCB_UNLOCK(unp2
);
1257 UNP_PCB_UNLOCK(unp
);
1259 UNP_GLOBAL_WUNLOCK();
1262 * Giant has been previously acquired. This means filesystem
1263 * isn't MPSAFE. Do it once again.
1269 VFS_UNLOCK_GIANT(vfslocked
);
1273 unp
->unp_flags
&= ~UNP_CONNECTING
;
1274 UNP_PCB_UNLOCK(unp
);
1279 unp_connect2(struct socket
*so
, struct socket
*so2
, int req
)
1284 unp
= sotounpcb(so
);
1285 KASSERT(unp
!= NULL
, ("unp_connect2: unp == NULL"));
1286 unp2
= sotounpcb(so2
);
1287 KASSERT(unp2
!= NULL
, ("unp_connect2: unp2 == NULL"));
1289 UNP_GLOBAL_WLOCK_ASSERT();
1290 UNP_PCB_LOCK_ASSERT(unp
);
1291 UNP_PCB_LOCK_ASSERT(unp2
);
1293 if (so2
->so_type
!= so
->so_type
)
1294 return (EPROTOTYPE
);
1295 unp
->unp_conn
= unp2
;
1297 switch (so
->so_type
) {
1299 LIST_INSERT_HEAD(&unp2
->unp_refs
, unp
, unp_reflink
);
1304 unp2
->unp_conn
= unp
;
1305 if (req
== PRU_CONNECT
&&
1306 ((unp
->unp_flags
| unp2
->unp_flags
) & UNP_CONNWAIT
))
1314 panic("unp_connect2");
1320 unp_disconnect(struct unpcb
*unp
, struct unpcb
*unp2
)
1324 KASSERT(unp2
!= NULL
, ("unp_disconnect: unp2 == NULL"));
1326 UNP_GLOBAL_WLOCK_ASSERT();
1327 UNP_PCB_LOCK_ASSERT(unp
);
1328 UNP_PCB_LOCK_ASSERT(unp2
);
1330 unp
->unp_conn
= NULL
;
1331 switch (unp
->unp_socket
->so_type
) {
1333 LIST_REMOVE(unp
, unp_reflink
);
1334 so
= unp
->unp_socket
;
1336 so
->so_state
&= ~SS_ISCONNECTED
;
1341 soisdisconnected(unp
->unp_socket
);
1342 unp2
->unp_conn
= NULL
;
1343 soisdisconnected(unp2
->unp_socket
);
1349 * unp_pcblist() walks the global list of struct unpcb's to generate a
1350 * pointer list, bumping the refcount on each unpcb. It then copies them out
1351 * sequentially, validating the generation number on each to see if it has
1352 * been detached. All of this is necessary because copyout() may sleep on
1356 unp_pcblist(SYSCTL_HANDLER_ARGS
)
1360 struct unpcb
*unp
, **unp_list
;
1362 struct xunpgen
*xug
;
1363 struct unp_head
*head
;
1366 head
= ((intptr_t)arg1
== SOCK_DGRAM
? &unp_dhead
: &unp_shead
);
1369 * The process of preparing the PCB list is too time-consuming and
1370 * resource-intensive to repeat twice on every request.
1372 if (req
->oldptr
== NULL
) {
1374 req
->oldidx
= 2 * (sizeof *xug
)
1375 + (n
+ n
/8) * sizeof(struct xunpcb
);
1379 if (req
->newptr
!= NULL
)
1383 * OK, now we're committed to doing something.
1385 xug
= malloc(sizeof(*xug
), M_TEMP
, M_WAITOK
);
1387 gencnt
= unp_gencnt
;
1389 UNP_GLOBAL_RUNLOCK();
1391 xug
->xug_len
= sizeof *xug
;
1393 xug
->xug_gen
= gencnt
;
1394 xug
->xug_sogen
= so_gencnt
;
1395 error
= SYSCTL_OUT(req
, xug
, sizeof *xug
);
1401 unp_list
= malloc(n
* sizeof *unp_list
, M_TEMP
, M_WAITOK
);
1404 for (unp
= LIST_FIRST(head
), i
= 0; unp
&& i
< n
;
1405 unp
= LIST_NEXT(unp
, unp_link
)) {
1407 if (unp
->unp_gencnt
<= gencnt
) {
1408 if (cr_cansee(req
->td
->td_ucred
,
1409 unp
->unp_socket
->so_cred
)) {
1410 UNP_PCB_UNLOCK(unp
);
1413 unp_list
[i
++] = unp
;
1414 unp
->unp_refcount
++;
1416 UNP_PCB_UNLOCK(unp
);
1418 UNP_GLOBAL_RUNLOCK();
1419 n
= i
; /* In case we lost some during malloc. */
1422 xu
= malloc(sizeof(*xu
), M_TEMP
, M_WAITOK
| M_ZERO
);
1423 for (i
= 0; i
< n
; i
++) {
1426 unp
->unp_refcount
--;
1427 if (unp
->unp_refcount
!= 0 && unp
->unp_gencnt
<= gencnt
) {
1428 xu
->xu_len
= sizeof *xu
;
1431 * XXX - need more locking here to protect against
1432 * connect/disconnect races for SMP.
1434 if (unp
->unp_addr
!= NULL
)
1435 bcopy(unp
->unp_addr
, &xu
->xu_addr
,
1436 unp
->unp_addr
->sun_len
);
1437 if (unp
->unp_conn
!= NULL
&&
1438 unp
->unp_conn
->unp_addr
!= NULL
)
1439 bcopy(unp
->unp_conn
->unp_addr
,
1441 unp
->unp_conn
->unp_addr
->sun_len
);
1442 bcopy(unp
, &xu
->xu_unp
, sizeof *unp
);
1443 sotoxsocket(unp
->unp_socket
, &xu
->xu_socket
);
1444 UNP_PCB_UNLOCK(unp
);
1445 error
= SYSCTL_OUT(req
, xu
, sizeof *xu
);
1447 freeunp
= (unp
->unp_refcount
== 0);
1448 UNP_PCB_UNLOCK(unp
);
1450 UNP_PCB_LOCK_DESTROY(unp
);
1451 uma_zfree(unp_zone
, unp
);
1458 * Give the user an updated idea of our state. If the
1459 * generation differs from what we told her before, she knows
1460 * that something happened while we were processing this
1461 * request, and it might be necessary to retry.
1463 xug
->xug_gen
= unp_gencnt
;
1464 xug
->xug_sogen
= so_gencnt
;
1465 xug
->xug_count
= unp_count
;
1466 error
= SYSCTL_OUT(req
, xug
, sizeof *xug
);
1468 free(unp_list
, M_TEMP
);
1473 SYSCTL_PROC(_net_local_dgram
, OID_AUTO
, pcblist
, CTLFLAG_RD
,
1474 (caddr_t
)(long)SOCK_DGRAM
, 0, unp_pcblist
, "S,xunpcb",
1475 "List of active local datagram sockets");
1476 SYSCTL_PROC(_net_local_stream
, OID_AUTO
, pcblist
, CTLFLAG_RD
,
1477 (caddr_t
)(long)SOCK_STREAM
, 0, unp_pcblist
, "S,xunpcb",
1478 "List of active local stream sockets");
1481 unp_shutdown(struct unpcb
*unp
)
1486 UNP_GLOBAL_WLOCK_ASSERT();
1487 UNP_PCB_LOCK_ASSERT(unp
);
1489 unp2
= unp
->unp_conn
;
1490 if (unp
->unp_socket
->so_type
== SOCK_STREAM
&& unp2
!= NULL
) {
1491 so
= unp2
->unp_socket
;
1498 unp_drop(struct unpcb
*unp
, int errno
)
1500 struct socket
*so
= unp
->unp_socket
;
1503 UNP_GLOBAL_WLOCK_ASSERT();
1504 UNP_PCB_LOCK_ASSERT(unp
);
1506 so
->so_error
= errno
;
1507 unp2
= unp
->unp_conn
;
1512 unp_disconnect(unp
, unp2
);
1513 UNP_PCB_UNLOCK(unp2
);
1517 unp_freerights(struct file
**rp
, int fdcount
)
1522 for (i
= 0; i
< fdcount
; i
++) {
1524 * Zero the pointer before calling unp_discard since it may
1525 * end up in unp_gc().
1527 * XXXRW: This is less true than it used to be.
1536 unp_externalize(struct mbuf
*control
, struct mbuf
**controlp
)
1538 struct thread
*td
= curthread
; /* XXX */
1539 struct cmsghdr
*cm
= mtod(control
, struct cmsghdr
*);
1545 socklen_t clen
= control
->m_len
, datalen
;
1550 UNP_GLOBAL_UNLOCK_ASSERT();
1553 if (controlp
!= NULL
) /* controlp == NULL => free control messages */
1556 while (cm
!= NULL
) {
1557 if (sizeof(*cm
) > clen
|| cm
->cmsg_len
> clen
) {
1562 data
= CMSG_DATA(cm
);
1563 datalen
= (caddr_t
)cm
+ cm
->cmsg_len
- (caddr_t
)data
;
1565 if (cm
->cmsg_level
== SOL_SOCKET
1566 && cm
->cmsg_type
== SCM_RIGHTS
) {
1567 newfds
= datalen
/ sizeof(struct file
*);
1570 /* If we're not outputting the descriptors, free them. */
1571 if (error
|| controlp
== NULL
) {
1572 unp_freerights(rp
, newfds
);
1575 FILEDESC_XLOCK(td
->td_proc
->p_fd
);
1576 /* If the new FDs will not fit, free them. */
1577 if (!fdavail(td
, newfds
)) {
1578 FILEDESC_XUNLOCK(td
->td_proc
->p_fd
);
1580 unp_freerights(rp
, newfds
);
1584 * Now change each pointer to an fd in the global
1585 * table to an integer that is the index to the local
1586 * fd table entry that we set up to point to the
1587 * global one we are transferring.
1589 newlen
= newfds
* sizeof(int);
1590 *controlp
= sbcreatecontrol(NULL
, newlen
,
1591 SCM_RIGHTS
, SOL_SOCKET
);
1592 if (*controlp
== NULL
) {
1593 FILEDESC_XUNLOCK(td
->td_proc
->p_fd
);
1595 unp_freerights(rp
, newfds
);
1600 CMSG_DATA(mtod(*controlp
, struct cmsghdr
*));
1601 for (i
= 0; i
< newfds
; i
++) {
1602 if (fdalloc(td
, 0, &f
))
1603 panic("unp_externalize fdalloc failed");
1605 td
->td_proc
->p_fd
->fd_ofiles
[f
] = fp
;
1606 unp_externalize_fp(fp
);
1609 FILEDESC_XUNLOCK(td
->td_proc
->p_fd
);
1611 /* We can just copy anything else across. */
1612 if (error
|| controlp
== NULL
)
1614 *controlp
= sbcreatecontrol(NULL
, datalen
,
1615 cm
->cmsg_type
, cm
->cmsg_level
);
1616 if (*controlp
== NULL
) {
1621 CMSG_DATA(mtod(*controlp
, struct cmsghdr
*)),
1625 controlp
= &(*controlp
)->m_next
;
1628 if (CMSG_SPACE(datalen
) < clen
) {
1629 clen
-= CMSG_SPACE(datalen
);
1630 cm
= (struct cmsghdr
*)
1631 ((caddr_t
)cm
+ CMSG_SPACE(datalen
));
1644 unp_zone_change(void *tag
)
1647 uma_zone_set_max(unp_zone
, maxsockets
);
1654 unp_zone
= uma_zcreate("unpcb", sizeof(struct unpcb
), NULL
, NULL
,
1655 NULL
, NULL
, UMA_ALIGN_PTR
, 0);
1656 if (unp_zone
== NULL
)
1658 uma_zone_set_max(unp_zone
, maxsockets
);
1659 EVENTHANDLER_REGISTER(maxsockets_change
, unp_zone_change
,
1660 NULL
, EVENTHANDLER_PRI_ANY
);
1661 LIST_INIT(&unp_dhead
);
1662 LIST_INIT(&unp_shead
);
1663 TASK_INIT(&unp_gc_task
, 0, unp_gc
, NULL
);
1664 UNP_GLOBAL_LOCK_INIT();
1668 unp_internalize(struct mbuf
**controlp
, struct thread
*td
)
1670 struct mbuf
*control
= *controlp
;
1671 struct proc
*p
= td
->td_proc
;
1672 struct filedesc
*fdescp
= p
->p_fd
;
1673 struct cmsghdr
*cm
= mtod(control
, struct cmsghdr
*);
1674 struct cmsgcred
*cmcred
;
1680 socklen_t clen
= control
->m_len
, datalen
;
1684 UNP_GLOBAL_UNLOCK_ASSERT();
1689 while (cm
!= NULL
) {
1690 if (sizeof(*cm
) > clen
|| cm
->cmsg_level
!= SOL_SOCKET
1691 || cm
->cmsg_len
> clen
) {
1696 data
= CMSG_DATA(cm
);
1697 datalen
= (caddr_t
)cm
+ cm
->cmsg_len
- (caddr_t
)data
;
1699 switch (cm
->cmsg_type
) {
1701 * Fill in credential information.
1704 *controlp
= sbcreatecontrol(NULL
, sizeof(*cmcred
),
1705 SCM_CREDS
, SOL_SOCKET
);
1706 if (*controlp
== NULL
) {
1711 cmcred
= (struct cmsgcred
*)
1712 CMSG_DATA(mtod(*controlp
, struct cmsghdr
*));
1713 cmcred
->cmcred_pid
= p
->p_pid
;
1714 cmcred
->cmcred_uid
= td
->td_ucred
->cr_ruid
;
1715 cmcred
->cmcred_gid
= td
->td_ucred
->cr_rgid
;
1716 cmcred
->cmcred_euid
= td
->td_ucred
->cr_uid
;
1717 cmcred
->cmcred_ngroups
= MIN(td
->td_ucred
->cr_ngroups
,
1719 for (i
= 0; i
< cmcred
->cmcred_ngroups
; i
++)
1720 cmcred
->cmcred_groups
[i
] =
1721 td
->td_ucred
->cr_groups
[i
];
1725 oldfds
= datalen
/ sizeof (int);
1727 * Check that all the FDs passed in refer to legal
1728 * files. If not, reject the entire operation.
1731 FILEDESC_SLOCK(fdescp
);
1732 for (i
= 0; i
< oldfds
; i
++) {
1734 if ((unsigned)fd
>= fdescp
->fd_nfiles
||
1735 fdescp
->fd_ofiles
[fd
] == NULL
) {
1736 FILEDESC_SUNLOCK(fdescp
);
1740 fp
= fdescp
->fd_ofiles
[fd
];
1741 if (!(fp
->f_ops
->fo_flags
& DFLAG_PASSABLE
)) {
1742 FILEDESC_SUNLOCK(fdescp
);
1750 * Now replace the integer FDs with pointers to
1751 * the associated global file table entry.
1753 newlen
= oldfds
* sizeof(struct file
*);
1754 *controlp
= sbcreatecontrol(NULL
, newlen
,
1755 SCM_RIGHTS
, SOL_SOCKET
);
1756 if (*controlp
== NULL
) {
1757 FILEDESC_SUNLOCK(fdescp
);
1763 rp
= (struct file
**)
1764 CMSG_DATA(mtod(*controlp
, struct cmsghdr
*));
1765 for (i
= 0; i
< oldfds
; i
++) {
1766 fp
= fdescp
->fd_ofiles
[*fdp
++];
1768 unp_internalize_fp(fp
);
1770 FILEDESC_SUNLOCK(fdescp
);
1774 *controlp
= sbcreatecontrol(NULL
, sizeof(*tv
),
1775 SCM_TIMESTAMP
, SOL_SOCKET
);
1776 if (*controlp
== NULL
) {
1780 tv
= (struct timeval
*)
1781 CMSG_DATA(mtod(*controlp
, struct cmsghdr
*));
1790 controlp
= &(*controlp
)->m_next
;
1792 if (CMSG_SPACE(datalen
) < clen
) {
1793 clen
-= CMSG_SPACE(datalen
);
1794 cm
= (struct cmsghdr
*)
1795 ((caddr_t
)cm
+ CMSG_SPACE(datalen
));
1808 static struct mbuf
*
1809 unp_addsockcred(struct thread
*td
, struct mbuf
*control
)
1811 struct mbuf
*m
, *n
, *n_prev
;
1812 struct sockcred
*sc
;
1813 const struct cmsghdr
*cm
;
1817 ngroups
= MIN(td
->td_ucred
->cr_ngroups
, CMGROUP_MAX
);
1819 m
= sbcreatecontrol(NULL
, SOCKCREDSIZE(ngroups
), SCM_CREDS
, SOL_SOCKET
);
1823 sc
= (struct sockcred
*) CMSG_DATA(mtod(m
, struct cmsghdr
*));
1824 sc
->sc_uid
= td
->td_ucred
->cr_ruid
;
1825 sc
->sc_euid
= td
->td_ucred
->cr_uid
;
1826 sc
->sc_gid
= td
->td_ucred
->cr_rgid
;
1827 sc
->sc_egid
= td
->td_ucred
->cr_gid
;
1828 sc
->sc_ngroups
= ngroups
;
1829 for (i
= 0; i
< sc
->sc_ngroups
; i
++)
1830 sc
->sc_groups
[i
] = td
->td_ucred
->cr_groups
[i
];
1833 * Unlink SCM_CREDS control messages (struct cmsgcred), since the
1834 * just-created SCM_CREDS control message (struct sockcred) has another
1837 if (control
!= NULL
)
1838 for (n
= control
, n_prev
= NULL
; n
!= NULL
;) {
1839 cm
= mtod(n
, struct cmsghdr
*);
1840 if (cm
->cmsg_level
== SOL_SOCKET
&&
1841 cm
->cmsg_type
== SCM_CREDS
) {
1843 control
= n
->m_next
;
1845 n_prev
->m_next
= n
->m_next
;
1853 /* Prepend it to the head. */
1854 m
->m_next
= control
;
1859 static struct unpcb
*
1860 fptounp(struct file
*fp
)
1864 if (fp
->f_type
!= DTYPE_SOCKET
)
1866 if ((so
= fp
->f_data
) == NULL
)
1868 if (so
->so_proto
->pr_domain
!= &localdomain
)
1870 return sotounpcb(so
);
1874 unp_discard(struct file
*fp
)
1877 unp_externalize_fp(fp
);
1878 (void) closef(fp
, (struct thread
*)NULL
);
1882 unp_internalize_fp(struct file
*fp
)
1887 if ((unp
= fptounp(fp
)) != NULL
) {
1889 unp
->unp_msgcount
++;
1893 UNP_GLOBAL_WUNLOCK();
1897 unp_externalize_fp(struct file
*fp
)
1902 if ((unp
= fptounp(fp
)) != NULL
)
1903 unp
->unp_msgcount
--;
1905 UNP_GLOBAL_WUNLOCK();
1909 * unp_defer indicates whether additional work has been deferred for a future
1910 * pass through unp_gc(). It is thread local and does not require explicit
1913 static int unp_marked
;
1914 static int unp_unreachable
;
1917 unp_accessable(struct file
*fp
)
1921 if ((unp
= fptounp(fp
)) == NULL
)
1923 if (unp
->unp_gcflag
& UNPGC_REF
)
1925 unp
->unp_gcflag
&= ~UNPGC_DEAD
;
1926 unp
->unp_gcflag
|= UNPGC_REF
;
1931 unp_gc_process(struct unpcb
*unp
)
1937 /* Already processed. */
1938 if (unp
->unp_gcflag
& UNPGC_SCANNED
)
1942 * Check for a socket potentially in a cycle. It must be in a
1943 * queue as indicated by msgcount, and this must equal the file
1944 * reference count. Note that when msgcount is 0 the file is NULL.
1946 if ((unp
->unp_gcflag
& UNPGC_REF
) == 0 && fp
&&
1947 unp
->unp_msgcount
!= 0 && fp
->f_count
== unp
->unp_msgcount
) {
1948 unp
->unp_gcflag
|= UNPGC_DEAD
;
1953 * Mark all sockets we reference with RIGHTS.
1955 so
= unp
->unp_socket
;
1956 SOCKBUF_LOCK(&so
->so_rcv
);
1957 unp_scan(so
->so_rcv
.sb_mb
, unp_accessable
);
1958 SOCKBUF_UNLOCK(&so
->so_rcv
);
1960 * Mark all sockets in our accept queue.
1963 TAILQ_FOREACH(soa
, &so
->so_comp
, so_list
) {
1964 SOCKBUF_LOCK(&soa
->so_rcv
);
1965 unp_scan(soa
->so_rcv
.sb_mb
, unp_accessable
);
1966 SOCKBUF_UNLOCK(&soa
->so_rcv
);
1969 unp
->unp_gcflag
|= UNPGC_SCANNED
;
1972 static int unp_recycled
;
1973 SYSCTL_INT(_net_local
, OID_AUTO
, recycled
, CTLFLAG_RD
, &unp_recycled
, 0,
1974 "Number of unreachable sockets claimed by the garbage collector.");
1976 static int unp_taskcount
;
1977 SYSCTL_INT(_net_local
, OID_AUTO
, taskcount
, CTLFLAG_RD
, &unp_taskcount
, 0,
1978 "Number of times the garbage collector has run.");
1981 unp_gc(__unused
void *arg
, int pending
)
1983 struct unp_head
*heads
[] = { &unp_dhead
, &unp_shead
, NULL
};
1984 struct unp_head
**head
;
1985 struct file
**unref
;
1992 * First clear all gc flags from previous runs.
1994 for (head
= heads
; *head
!= NULL
; head
++)
1995 LIST_FOREACH(unp
, *head
, unp_link
)
1996 unp
->unp_gcflag
= 0;
1998 * Scan marking all reachable sockets with UNPGC_REF. Once a socket
1999 * is reachable all of the sockets it references are reachable.
2000 * Stop the scan once we do a complete loop without discovering
2001 * a new reachable socket.
2004 unp_unreachable
= 0;
2006 for (head
= heads
; *head
!= NULL
; head
++)
2007 LIST_FOREACH(unp
, *head
, unp_link
)
2008 unp_gc_process(unp
);
2009 } while (unp_marked
);
2010 UNP_GLOBAL_RUNLOCK();
2011 if (unp_unreachable
== 0)
2014 * Allocate space for a local list of dead unpcbs.
2016 unref
= malloc(unp_unreachable
* sizeof(struct file
*),
2019 * Iterate looking for sockets which have been specifically marked
2020 * as unreachable and store them locally.
2023 for (i
= 0, head
= heads
; *head
!= NULL
; head
++)
2024 LIST_FOREACH(unp
, *head
, unp_link
)
2025 if (unp
->unp_gcflag
& UNPGC_DEAD
) {
2026 unref
[i
++] = unp
->unp_file
;
2027 fhold(unp
->unp_file
);
2028 KASSERT(unp
->unp_file
!= NULL
,
2029 ("unp_gc: Invalid unpcb."));
2030 KASSERT(i
<= unp_unreachable
,
2031 ("unp_gc: incorrect unreachable count."));
2033 UNP_GLOBAL_RUNLOCK();
2035 * Now flush all sockets, freeing rights. This will free the
2036 * struct files associated with these sockets but leave each socket
2037 * with one remaining ref.
2039 for (i
= 0; i
< unp_unreachable
; i
++)
2040 sorflush(unref
[i
]->f_data
);
2042 * And finally release the sockets so they can be reclaimed.
2044 for (i
= 0; i
< unp_unreachable
; i
++)
2045 fdrop(unref
[i
], NULL
);
2046 unp_recycled
+= unp_unreachable
;
2047 free(unref
, M_TEMP
);
2051 unp_dispose(struct mbuf
*m
)
2055 unp_scan(m
, unp_discard
);
2059 unp_scan(struct mbuf
*m0
, void (*op
)(struct file
*))
2066 socklen_t clen
, datalen
;
2069 while (m0
!= NULL
) {
2070 for (m
= m0
; m
; m
= m
->m_next
) {
2071 if (m
->m_type
!= MT_CONTROL
)
2074 cm
= mtod(m
, struct cmsghdr
*);
2077 while (cm
!= NULL
) {
2078 if (sizeof(*cm
) > clen
|| cm
->cmsg_len
> clen
)
2081 data
= CMSG_DATA(cm
);
2082 datalen
= (caddr_t
)cm
+ cm
->cmsg_len
2085 if (cm
->cmsg_level
== SOL_SOCKET
&&
2086 cm
->cmsg_type
== SCM_RIGHTS
) {
2087 qfds
= datalen
/ sizeof (struct file
*);
2089 for (i
= 0; i
< qfds
; i
++)
2093 if (CMSG_SPACE(datalen
) < clen
) {
2094 clen
-= CMSG_SPACE(datalen
);
2095 cm
= (struct cmsghdr
*)
2096 ((caddr_t
)cm
+ CMSG_SPACE(datalen
));
2109 db_print_indent(int indent
)
2113 for (i
= 0; i
< indent
; i
++)
2118 db_print_unpflags(int unp_flags
)
2123 if (unp_flags
& UNP_HAVEPC
) {
2124 db_printf("%sUNP_HAVEPC", comma
? ", " : "");
2127 if (unp_flags
& UNP_HAVEPCCACHED
) {
2128 db_printf("%sUNP_HAVEPCCACHED", comma
? ", " : "");
2131 if (unp_flags
& UNP_WANTCRED
) {
2132 db_printf("%sUNP_WANTCRED", comma
? ", " : "");
2135 if (unp_flags
& UNP_CONNWAIT
) {
2136 db_printf("%sUNP_CONNWAIT", comma
? ", " : "");
2139 if (unp_flags
& UNP_CONNECTING
) {
2140 db_printf("%sUNP_CONNECTING", comma
? ", " : "");
2143 if (unp_flags
& UNP_BINDING
) {
2144 db_printf("%sUNP_BINDING", comma
? ", " : "");
2150 db_print_xucred(int indent
, struct xucred
*xu
)
2154 db_print_indent(indent
);
2155 db_printf("cr_version: %u cr_uid: %u cr_ngroups: %d\n",
2156 xu
->cr_version
, xu
->cr_uid
, xu
->cr_ngroups
);
2157 db_print_indent(indent
);
2158 db_printf("cr_groups: ");
2160 for (i
= 0; i
< xu
->cr_ngroups
; i
++) {
2161 db_printf("%s%u", comma
? ", " : "", xu
->cr_groups
[i
]);
2168 db_print_unprefs(int indent
, struct unp_head
*uh
)
2174 LIST_FOREACH(unp
, uh
, unp_reflink
) {
2175 if (counter
% 4 == 0)
2176 db_print_indent(indent
);
2177 db_printf("%p ", unp
);
2178 if (counter
% 4 == 3)
2182 if (counter
!= 0 && counter
% 4 != 0)
2186 DB_SHOW_COMMAND(unpcb
, db_show_unpcb
)
2191 db_printf("usage: show unpcb <addr>\n");
2194 unp
= (struct unpcb
*)addr
;
2196 db_printf("unp_socket: %p unp_vnode: %p\n", unp
->unp_socket
,
2199 db_printf("unp_ino: %d unp_conn: %p\n", unp
->unp_ino
,
2202 db_printf("unp_refs:\n");
2203 db_print_unprefs(2, &unp
->unp_refs
);
2205 /* XXXRW: Would be nice to print the full address, if any. */
2206 db_printf("unp_addr: %p\n", unp
->unp_addr
);
2208 db_printf("unp_cc: %d unp_mbcnt: %d unp_gencnt: %llu\n",
2209 unp
->unp_cc
, unp
->unp_mbcnt
,
2210 (unsigned long long)unp
->unp_gencnt
);
2212 db_printf("unp_flags: %x (", unp
->unp_flags
);
2213 db_print_unpflags(unp
->unp_flags
);
2216 db_printf("unp_peercred:\n");
2217 db_print_xucred(2, &unp
->unp_peercred
);
2219 db_printf("unp_refcount: %u\n", unp
->unp_refcount
);