sys/kern/uipc_usrreq.c

   1 /*      $NetBSD: uipc_usrreq.c,v 1.127 2009/08/26 22:34:47 bouyer Exp $ */
   2
   3 /*-
   4  * Copyright (c) 1998, 2000, 2004, 2008, 2009 The NetBSD Foundation, Inc.
   5  * All rights reserved.
   6  *
   7  * This code is derived from software contributed to The NetBSD Foundation
   8  * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
   9  * NASA Ames Research Center, and by Andrew Doran.
  10  *
  11  * Redistribution and use in source and binary forms, with or without
  12  * modification, are permitted provided that the following conditions
  13  * are met:
  14  * 1. Redistributions of source code must retain the above copyright
  15  *    notice, this list of conditions and the following disclaimer.
  16  * 2. Redistributions in binary form must reproduce the above copyright
  17  *    notice, this list of conditions and the following disclaimer in the
  18  *    documentation and/or other materials provided with the distribution.
  19  *
  20  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  22  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  23  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  24  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  25  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  26  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  27  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  28  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  29  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  30  * POSSIBILITY OF SUCH DAMAGE.
  31  */
  32
  33 /*
  34  * Copyright (c) 1982, 1986, 1989, 1991, 1993
  35  *      The Regents of the University of California.  All rights reserved.
  36  *
  37  * Redistribution and use in source and binary forms, with or without
  38  * modification, are permitted provided that the following conditions
  39  * are met:
  40  * 1. Redistributions of source code must retain the above copyright
  41  *    notice, this list of conditions and the following disclaimer.
  42  * 2. Redistributions in binary form must reproduce the above copyright
  43  *    notice, this list of conditions and the following disclaimer in the
  44  *    documentation and/or other materials provided with the distribution.
  45  * 3. Neither the name of the University nor the names of its contributors
  46  *    may be used to endorse or promote products derived from this software
  47  *    without specific prior written permission.
  48  *
  49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  59  * SUCH DAMAGE.
  60  *
  61  *      @(#)uipc_usrreq.c       8.9 (Berkeley) 5/14/95
  62  */
  63
  64 /*
  65  * Copyright (c) 1997 Christopher G. Demetriou.  All rights reserved.
  66  *
  67  * Redistribution and use in source and binary forms, with or without
  68  * modification, are permitted provided that the following conditions
  69  * are met:
  70  * 1. Redistributions of source code must retain the above copyright
  71  *    notice, this list of conditions and the following disclaimer.
  72  * 2. Redistributions in binary form must reproduce the above copyright
  73  *    notice, this list of conditions and the following disclaimer in the
  74  *    documentation and/or other materials provided with the distribution.
  75  * 3. All advertising materials mentioning features or use of this software
  76  *    must display the following acknowledgement:
  77  *      This product includes software developed by the University of
  78  *      California, Berkeley and its contributors.
  79  * 4. Neither the name of the University nor the names of its contributors
  80  *    may be used to endorse or promote products derived from this software
  81  *    without specific prior written permission.
  82  *
  83  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  84  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  85  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  86  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  87  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  88  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  89  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  90  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  91  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  92  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  93  * SUCH DAMAGE.
  94  *
  95  *      @(#)uipc_usrreq.c       8.9 (Berkeley) 5/14/95
  96  */
  97
  98 #include <sys/cdefs.h>
  99 __KERNEL_RCSID(0, "$NetBSD: uipc_usrreq.c,v 1.127 2009/08/26 22:34:47 bouyer Exp $");
 100
 101 #include <sys/param.h>
 102 #include <sys/systm.h>
 103 #include <sys/proc.h>
 104 #include <sys/filedesc.h>
 105 #include <sys/domain.h>
 106 #include <sys/protosw.h>
 107 #include <sys/socket.h>
 108 #include <sys/socketvar.h>
 109 #include <sys/unpcb.h>
 110 #include <sys/un.h>
 111 #include <sys/namei.h>
 112 #include <sys/vnode.h>
 113 #include <sys/file.h>
 114 #include <sys/stat.h>
 115 #include <sys/mbuf.h>
 116 #include <sys/kauth.h>
 117 #include <sys/kmem.h>
 118 #include <sys/atomic.h>
 119 #include <sys/uidinfo.h>
 120 #include <sys/kernel.h>
 121 #include <sys/kthread.h>
 122
 123 /*
 124  * Unix communications domain.
 125  *
 126  * TODO:
 127  *      SEQPACKET, RDM
 128  *      rethink name space problems
 129  *      need a proper out-of-band
 130  *
 131  * Notes on locking:
 132  *
 133  * The generic rules noted in uipc_socket2.c apply.  In addition:
 134  *
 135  * o We have a global lock, uipc_lock.
 136  *
 137  * o All datagram sockets are locked by uipc_lock.
 138  *
 139  * o For stream socketpairs, the two endpoints are created sharing the same
 140  *   independent lock.  Sockets presented to PRU_CONNECT2 must already have
 141  *   matching locks.
 142  *
 143  * o Stream sockets created via socket() start life with their own
 144  *   independent lock.
 145  *
 146  * o Stream connections to a named endpoint are slightly more complicated.
 147  *   Sockets that have called listen() have their lock pointer mutated to
 148  *   the global uipc_lock.  When establishing a connection, the connecting
 149  *   socket also has its lock mutated to uipc_lock, which matches the head
 150  *   (listening socket).  We create a new socket for accept() to return, and
 151  *   that also shares the head's lock.  Until the connection is completely
 152  *   done on both ends, all three sockets are locked by uipc_lock.  Once the
 153  *   connection is complete, the association with the head's lock is broken.
 154  *   The connecting socket and the socket returned from accept() have their
 155  *   lock pointers mutated away from uipc_lock, and back to the connecting
 156  *   socket's original, independent lock.  The head continues to be locked
 157  *   by uipc_lock.
 158  *
 159  * o If uipc_lock is determined to be a significant source of contention,
 160  *   it could easily be hashed out.  It is difficult to simply make it an
 161  *   independent lock because of visibility / garbage collection issues:
 162  *   if a socket has been associated with a lock at any point, that lock
 163  *   must remain valid until the socket is no longer visible in the system.
 164  *   The lock must not be freed or otherwise destroyed until any sockets
 165  *   that had referenced it have also been destroyed.
 166  */
 167 const struct sockaddr_un sun_noname = {
 168         .sun_len = sizeof(sun_noname),
 169         .sun_family = AF_LOCAL,
 170 };
 171 ino_t   unp_ino;                        /* prototype for fake inode numbers */
 172
 173 struct mbuf *unp_addsockcred(struct lwp *, struct mbuf *);
 174 static void unp_mark(file_t *);
 175 static void unp_scan(struct mbuf *, void (*)(file_t *), int);
 176 static void unp_discard_now(file_t *);
 177 static void unp_discard_later(file_t *);
 178 static void unp_thread(void *);
 179 static void unp_thread_kick(void);
 180 static kmutex_t *uipc_lock;
 181
 182 static kcondvar_t unp_thread_cv;
 183 static lwp_t *unp_thread_lwp;
 184 static SLIST_HEAD(,file) unp_thread_discard;
 185 static int unp_defer;
 186
 187 /*
 188  * Initialize Unix protocols.
 189  */
 190 void
 191 uipc_init(void)
 192 {
 193         int error;
 194
 195         uipc_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
 196         cv_init(&unp_thread_cv, "unpgc");
 197
 198         error = kthread_create(PRI_NONE, KTHREAD_MPSAFE, NULL, unp_thread,
 199             NULL, &unp_thread_lwp, "unpgc");
 200         if (error != 0)
 201                 panic("uipc_init %d", error);
 202 }
 203
 204 /*
 205  * A connection succeeded: disassociate both endpoints from the head's
 206  * lock, and make them share their own lock.  There is a race here: for
 207  * a very brief time one endpoint will be locked by a different lock
 208  * than the other end.  However, since the current thread holds the old
 209  * lock (the listening socket's lock, the head) access can still only be
 210  * made to one side of the connection.
 211  */
 212 static void
 213 unp_setpeerlocks(struct socket *so, struct socket *so2)
 214 {
 215         struct unpcb *unp;
 216         kmutex_t *lock;
 217
 218         KASSERT(solocked2(so, so2));
 219
 220         /*
 221          * Bail out if either end of the socket is not yet fully
 222          * connected or accepted.  We only break the lock association
 223          * with the head when the pair of sockets stand completely
 224          * on their own.
 225          */
 226         KASSERT(so->so_head == NULL);
 227         if (so2->so_head != NULL)
 228                 return;
 229
 230         /*
 231          * Drop references to old lock.  A third reference (from the
 232          * queue head) must be held as we still hold its lock.  Bonus:
 233          * we don't need to worry about garbage collecting the lock.
 234          */
 235         lock = so->so_lock;
 236         KASSERT(lock == uipc_lock);
 237         mutex_obj_free(lock);
 238         mutex_obj_free(lock);
 239
 240         /*
 241          * Grab stream lock from the initiator and share between the two
 242          * endpoints.  Issue memory barrier to ensure all modifications
 243          * become globally visible before the lock change.  so2 is
 244          * assumed not to have a stream lock, because it was created
 245          * purely for the server side to accept this connection and
 246          * started out life using the domain-wide lock.
 247          */
 248         unp = sotounpcb(so);
 249         KASSERT(unp->unp_streamlock != NULL);
 250         KASSERT(sotounpcb(so2)->unp_streamlock == NULL);
 251         lock = unp->unp_streamlock;
 252         unp->unp_streamlock = NULL;
 253         mutex_obj_hold(lock);
 254         membar_exit();
 255         /*
 256          * possible race if lock is not held - see comment in
 257          * uipc_usrreq(PRU_ACCEPT).
 258          */
 259         KASSERT(mutex_owned(lock));
 260         solockreset(so, lock);
 261         solockreset(so2, lock);
 262 }
 263
 264 /*
 265  * Reset a socket's lock back to the domain-wide lock.
 266  */
 267 static void
 268 unp_resetlock(struct socket *so)
 269 {
 270         kmutex_t *olock, *nlock;
 271         struct unpcb *unp;
 272
 273         KASSERT(solocked(so));
 274
 275         olock = so->so_lock;
 276         nlock = uipc_lock;
 277         if (olock == nlock)
 278                 return;
 279         unp = sotounpcb(so);
 280         KASSERT(unp->unp_streamlock == NULL);
 281         unp->unp_streamlock = olock;
 282         mutex_obj_hold(nlock);
 283         mutex_enter(nlock);
 284         solockreset(so, nlock);
 285         mutex_exit(olock);
 286 }
 287
 288 static void
 289 unp_free(struct unpcb *unp)
 290 {
 291
 292         if (unp->unp_addr)
 293                 free(unp->unp_addr, M_SONAME);
 294         if (unp->unp_streamlock != NULL)
 295                 mutex_obj_free(unp->unp_streamlock);
 296         free(unp, M_PCB);
 297 }
 298
 299 int
 300 unp_output(struct mbuf *m, struct mbuf *control, struct unpcb *unp,
 301         struct lwp *l)
 302 {
 303         struct socket *so2;
 304         const struct sockaddr_un *sun;
 305
 306         so2 = unp->unp_conn->unp_socket;
 307
 308         KASSERT(solocked(so2));
 309
 310         if (unp->unp_addr)
 311                 sun = unp->unp_addr;
 312         else
 313                 sun = &sun_noname;
 314         if (unp->unp_conn->unp_flags & UNP_WANTCRED)
 315                 control = unp_addsockcred(l, control);
 316         if (sbappendaddr(&so2->so_rcv, (const struct sockaddr *)sun, m,
 317             control) == 0) {
 318                 so2->so_rcv.sb_overflowed++;
 319                 unp_dispose(control);
 320                 m_freem(control);
 321                 m_freem(m);
 322                 return (ENOBUFS);
 323         } else {
 324                 sorwakeup(so2);
 325                 return (0);
 326         }
 327 }
 328
 329 void
 330 unp_setaddr(struct socket *so, struct mbuf *nam, bool peeraddr)
 331 {
 332         const struct sockaddr_un *sun;
 333         struct unpcb *unp;
 334         bool ext;
 335
 336         KASSERT(solocked(so));
 337         unp = sotounpcb(so);
 338         ext = false;
 339
 340         for (;;) {
 341                 sun = NULL;
 342                 if (peeraddr) {
 343                         if (unp->unp_conn && unp->unp_conn->unp_addr)
 344                                 sun = unp->unp_conn->unp_addr;
 345                 } else {
 346                         if (unp->unp_addr)
 347                                 sun = unp->unp_addr;
 348                 }
 349                 if (sun == NULL)
 350                         sun = &sun_noname;
 351                 nam->m_len = sun->sun_len;
 352                 if (nam->m_len > MLEN && !ext) {
 353                         sounlock(so);
 354                         MEXTMALLOC(nam, MAXPATHLEN * 2, M_WAITOK);
 355                         solock(so);
 356                         ext = true;
 357                 } else {
 358                         KASSERT(nam->m_len <= MAXPATHLEN * 2);
 359                         memcpy(mtod(nam, void *), sun, (size_t)nam->m_len);
 360                         break;
 361                 }
 362         }
 363 }
 364
 365 /*ARGSUSED*/
 366 int
 367 uipc_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
 368         struct mbuf *control, struct lwp *l)
 369 {
 370         struct unpcb *unp = sotounpcb(so);
 371         struct socket *so2;
 372         struct proc *p;
 373         u_int newhiwat;
 374         int error = 0;
 375
 376         if (req == PRU_CONTROL)
 377                 return (EOPNOTSUPP);
 378
 379 #ifdef DIAGNOSTIC
 380         if (req != PRU_SEND && req != PRU_SENDOOB && control)
 381                 panic("uipc_usrreq: unexpected control mbuf");
 382 #endif
 383         p = l ? l->l_proc : NULL;
 384         if (req != PRU_ATTACH) {
 385                 if (unp == NULL) {
 386                         error = EINVAL;
 387                         goto release;
 388                 }
 389                 KASSERT(solocked(so));
 390         }
 391
 392         switch (req) {
 393
 394         case PRU_ATTACH:
 395                 if (unp != NULL) {
 396                         error = EISCONN;
 397                         break;
 398                 }
 399                 error = unp_attach(so);
 400                 break;
 401
 402         case PRU_DETACH:
 403                 unp_detach(unp);
 404                 break;
 405
 406         case PRU_BIND:
 407                 KASSERT(l != NULL);
 408                 error = unp_bind(so, nam, l);
 409                 break;
 410
 411         case PRU_LISTEN:
 412                 /*
 413                  * If the socket can accept a connection, it must be
 414                  * locked by uipc_lock.
 415                  */
 416                 unp_resetlock(so);
 417                 if (unp->unp_vnode == NULL)
 418                         error = EINVAL;
 419                 break;
 420
 421         case PRU_CONNECT:
 422                 KASSERT(l != NULL);
 423                 error = unp_connect(so, nam, l);
 424                 break;
 425
 426         case PRU_CONNECT2:
 427                 error = unp_connect2(so, (struct socket *)nam, PRU_CONNECT2);
 428                 break;
 429
 430         case PRU_DISCONNECT:
 431                 unp_disconnect(unp);
 432                 break;
 433
 434         case PRU_ACCEPT:
 435                 KASSERT(so->so_lock == uipc_lock);
 436                 /*
 437                  * Mark the initiating STREAM socket as connected *ONLY*
 438                  * after it's been accepted.  This prevents a client from
 439                  * overrunning a server and receiving ECONNREFUSED.
 440                  */
 441                 if (unp->unp_conn == NULL)
 442                         break;
 443                 so2 = unp->unp_conn->unp_socket;
 444                 if (so2->so_state & SS_ISCONNECTING) {
 445                         KASSERT(solocked2(so, so->so_head));
 446                         KASSERT(solocked2(so2, so->so_head));
 447                         soisconnected(so2);
 448                 }
 449                 /*
 450                  * If the connection is fully established, break the
 451                  * association with uipc_lock and give the connected
 452                  * pair a seperate lock to share.
 453                  * There is a race here: sotounpcb(so2)->unp_streamlock
 454                  * is not locked, so when changing so2->so_lock
 455                  * another thread can grab it while so->so_lock is still
 456                  * pointing to the (locked) uipc_lock.
 457                  * this should be harmless, exept that this makes
 458                  * solocked2() and solocked() unreliable.
 459                  * Another problem is that unp_setaddr() expects the
 460                  * the socket locked. Grabing sotounpcb(so2)->unp_streamlock
 461                  * fixes both issues.
 462                  */
 463                 mutex_enter(sotounpcb(so2)->unp_streamlock);
 464                 unp_setpeerlocks(so2, so);
 465                 /*
 466                  * Only now return peer's address, as we may need to
 467                  * block in order to allocate memory.
 468                  *
 469                  * XXX Minor race: connection can be broken while
 470                  * lock is dropped in unp_setaddr().  We will return
 471                  * error == 0 and sun_noname as the peer address.
 472                  */
 473                 unp_setaddr(so, nam, true);
 474                 /* so_lock now points to unp_streamlock */
 475                 mutex_exit(so2->so_lock);
 476                 break;
 477
 478         case PRU_SHUTDOWN:
 479                 socantsendmore(so);
 480                 unp_shutdown(unp);
 481                 break;
 482
 483         case PRU_RCVD:
 484                 switch (so->so_type) {
 485
 486                 case SOCK_DGRAM:
 487                         panic("uipc 1");
 488                         /*NOTREACHED*/
 489
 490                 case SOCK_STREAM:
 491 #define rcv (&so->so_rcv)
 492 #define snd (&so2->so_snd)
 493                         if (unp->unp_conn == 0)
 494                                 break;
 495                         so2 = unp->unp_conn->unp_socket;
 496                         KASSERT(solocked2(so, so2));
 497                         /*
 498                          * Adjust backpressure on sender
 499                          * and wakeup any waiting to write.
 500                          */
 501                         snd->sb_mbmax += unp->unp_mbcnt - rcv->sb_mbcnt;
 502                         unp->unp_mbcnt = rcv->sb_mbcnt;
 503                         newhiwat = snd->sb_hiwat + unp->unp_cc - rcv->sb_cc;
 504                         (void)chgsbsize(so2->so_uidinfo,
 505                             &snd->sb_hiwat, newhiwat, RLIM_INFINITY);
 506                         unp->unp_cc = rcv->sb_cc;
 507                         sowwakeup(so2);
 508 #undef snd
 509 #undef rcv
 510                         break;
 511
 512                 default:
 513                         panic("uipc 2");
 514                 }
 515                 break;
 516
 517         case PRU_SEND:
 518                 /*
 519                  * Note: unp_internalize() rejects any control message
 520                  * other than SCM_RIGHTS, and only allows one.  This
 521                  * has the side-effect of preventing a caller from
 522                  * forging SCM_CREDS.
 523                  */
 524                 if (control) {
 525                         sounlock(so);
 526                         error = unp_internalize(&control);
 527                         solock(so);
 528                         if (error != 0) {
 529                                 m_freem(control);
 530                                 m_freem(m);
 531                                 break;
 532                         }
 533                 }
 534                 switch (so->so_type) {
 535
 536                 case SOCK_DGRAM: {
 537                         KASSERT(so->so_lock == uipc_lock);
 538                         if (nam) {
 539                                 if ((so->so_state & SS_ISCONNECTED) != 0)
 540                                         error = EISCONN;
 541                                 else {
 542                                         /*
 543                                          * Note: once connected, the
 544                                          * socket's lock must not be
 545                                          * dropped until we have sent
 546                                          * the message and disconnected.
 547                                          * This is necessary to prevent
 548                                          * intervening control ops, like
 549                                          * another connection.
 550                                          */
 551                                         error = unp_connect(so, nam, l);
 552                                 }
 553                         } else {
 554                                 if ((so->so_state & SS_ISCONNECTED) == 0)
 555                                         error = ENOTCONN;
 556                         }
 557                         if (error) {
 558                                 unp_dispose(control);
 559                                 m_freem(control);
 560                                 m_freem(m);
 561                                 break;
 562                         }
 563                         KASSERT(p != NULL);
 564                         error = unp_output(m, control, unp, l);
 565                         if (nam)
 566                                 unp_disconnect(unp);
 567                         break;
 568                 }
 569
 570                 case SOCK_STREAM:
 571 #define rcv (&so2->so_rcv)
 572 #define snd (&so->so_snd)
 573                         if (unp->unp_conn == NULL) {
 574                                 error = ENOTCONN;
 575                                 break;
 576                         }
 577                         so2 = unp->unp_conn->unp_socket;
 578                         KASSERT(solocked2(so, so2));
 579                         if (unp->unp_conn->unp_flags & UNP_WANTCRED) {
 580                                 /*
 581                                  * Credentials are passed only once on
 582                                  * SOCK_STREAM.
 583                                  */
 584                                 unp->unp_conn->unp_flags &= ~UNP_WANTCRED;
 585                                 control = unp_addsockcred(l, control);
 586                         }
 587                         /*
 588                          * Send to paired receive port, and then reduce
 589                          * send buffer hiwater marks to maintain backpressure.
 590                          * Wake up readers.
 591                          */
 592                         if (control) {
 593                                 if (sbappendcontrol(rcv, m, control) != 0)
 594                                         control = NULL;
 595                         } else
 596                                 sbappend(rcv, m);
 597                         snd->sb_mbmax -=
 598                             rcv->sb_mbcnt - unp->unp_conn->unp_mbcnt;
 599                         unp->unp_conn->unp_mbcnt = rcv->sb_mbcnt;
 600                         newhiwat = snd->sb_hiwat -
 601                             (rcv->sb_cc - unp->unp_conn->unp_cc);
 602                         (void)chgsbsize(so->so_uidinfo,
 603                             &snd->sb_hiwat, newhiwat, RLIM_INFINITY);
 604                         unp->unp_conn->unp_cc = rcv->sb_cc;
 605                         sorwakeup(so2);
 606 #undef snd
 607 #undef rcv
 608                         if (control != NULL) {
 609                                 unp_dispose(control);
 610                                 m_freem(control);
 611                         }
 612                         break;
 613
 614                 default:
 615                         panic("uipc 4");
 616                 }
 617                 break;
 618
 619         case PRU_ABORT:
 620                 (void)unp_drop(unp, ECONNABORTED);
 621
 622                 KASSERT(so->so_head == NULL);
 623 #ifdef DIAGNOSTIC
 624                 if (so->so_pcb == NULL)
 625                         panic("uipc 5: drop killed pcb");
 626 #endif
 627                 unp_detach(unp);
 628                 break;
 629
 630         case PRU_SENSE:
 631                 ((struct stat *) m)->st_blksize = so->so_snd.sb_hiwat;
 632                 if (so->so_type == SOCK_STREAM && unp->unp_conn != 0) {
 633                         so2 = unp->unp_conn->unp_socket;
 634                         KASSERT(solocked2(so, so2));
 635                         ((struct stat *) m)->st_blksize += so2->so_rcv.sb_cc;
 636                 }
 637                 ((struct stat *) m)->st_dev = NODEV;
 638                 if (unp->unp_ino == 0)
 639                         unp->unp_ino = unp_ino++;
 640                 ((struct stat *) m)->st_atimespec =
 641                     ((struct stat *) m)->st_mtimespec =
 642                     ((struct stat *) m)->st_ctimespec = unp->unp_ctime;
 643                 ((struct stat *) m)->st_ino = unp->unp_ino;
 644                 return (0);
 645
 646         case PRU_RCVOOB:
 647                 error = EOPNOTSUPP;
 648                 break;
 649
 650         case PRU_SENDOOB:
 651                 m_freem(control);
 652                 m_freem(m);
 653                 error = EOPNOTSUPP;
 654                 break;
 655
 656         case PRU_SOCKADDR:
 657                 unp_setaddr(so, nam, false);
 658                 break;
 659
 660         case PRU_PEERADDR:
 661                 unp_setaddr(so, nam, true);
 662                 break;
 663
 664         default:
 665                 panic("piusrreq");
 666         }
 667
 668 release:
 669         return (error);
 670 }
 671
 672 /*
 673  * Unix domain socket option processing.
 674  */
 675 int
 676 uipc_ctloutput(int op, struct socket *so, struct sockopt *sopt)
 677 {
 678         struct unpcb *unp = sotounpcb(so);
 679         int optval = 0, error = 0;
 680
 681         KASSERT(solocked(so));
 682
 683         if (sopt->sopt_level != 0) {
 684                 error = ENOPROTOOPT;
 685         } else switch (op) {
 686
 687         case PRCO_SETOPT:
 688                 switch (sopt->sopt_name) {
 689                 case LOCAL_CREDS:
 690                 case LOCAL_CONNWAIT:
 691                         error = sockopt_getint(sopt, &optval);
 692                         if (error)
 693                                 break;
 694                         switch (sopt->sopt_name) {
 695 #define OPTSET(bit) \
 696         if (optval) \
 697                 unp->unp_flags |= (bit); \
 698         else \
 699                 unp->unp_flags &= ~(bit);
 700
 701                         case LOCAL_CREDS:
 702                                 OPTSET(UNP_WANTCRED);
 703                                 break;
 704                         case LOCAL_CONNWAIT:
 705                                 OPTSET(UNP_CONNWAIT);
 706                                 break;
 707                         }
 708                         break;
 709 #undef OPTSET
 710
 711                 default:
 712                         error = ENOPROTOOPT;
 713                         break;
 714                 }
 715                 break;
 716
 717         case PRCO_GETOPT:
 718                 sounlock(so);
 719                 switch (sopt->sopt_name) {
 720                 case LOCAL_PEEREID:
 721                         if (unp->unp_flags & UNP_EIDSVALID) {
 722                                 error = sockopt_set(sopt,
 723                                     &unp->unp_connid, sizeof(unp->unp_connid));
 724                         } else {
 725                                 error = EINVAL;
 726                         }
 727                         break;
 728                 case LOCAL_CREDS:
 729 #define OPTBIT(bit)     (unp->unp_flags & (bit) ? 1 : 0)
 730
 731                         optval = OPTBIT(UNP_WANTCRED);
 732                         error = sockopt_setint(sopt, optval);
 733                         break;
 734 #undef OPTBIT
 735
 736                 default:
 737                         error = ENOPROTOOPT;
 738                         break;
 739                 }
 740                 solock(so);
 741                 break;
 742         }
 743         return (error);
 744 }
 745
 746 /*
 747  * Both send and receive buffers are allocated PIPSIZ bytes of buffering
 748  * for stream sockets, although the total for sender and receiver is
 749  * actually only PIPSIZ.
 750  * Datagram sockets really use the sendspace as the maximum datagram size,
 751  * and don't really want to reserve the sendspace.  Their recvspace should
 752  * be large enough for at least one max-size datagram plus address.
 753  */
 754 #define PIPSIZ  4096
 755 u_long  unpst_sendspace = PIPSIZ;
 756 u_long  unpst_recvspace = PIPSIZ;
 757 u_long  unpdg_sendspace = 2*1024;       /* really max datagram size */
 758 u_long  unpdg_recvspace = 4*1024;
 759
 760 u_int   unp_rights;                     /* files in flight */
 761 u_int   unp_rights_ratio = 2;           /* limit, fraction of maxfiles */
 762
 763 int
 764 unp_attach(struct socket *so)
 765 {
 766         struct unpcb *unp;
 767         int error;
 768
 769         switch (so->so_type) {
 770         case SOCK_STREAM:
 771                 if (so->so_lock == NULL) {
 772                         /*
 773                          * XXX Assuming that no socket locks are held,
 774                          * as this call may sleep.
 775                          */
 776                         so->so_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
 777                         solock(so);
 778                 }
 779                 if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
 780                         error = soreserve(so, unpst_sendspace, unpst_recvspace);
 781                         if (error != 0)
 782                                 return (error);
 783                 }
 784                 break;
 785
 786         case SOCK_DGRAM:
 787                 if (so->so_lock == NULL) {
 788                         mutex_obj_hold(uipc_lock);
 789                         so->so_lock = uipc_lock;
 790                         solock(so);
 791                 }
 792                 if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
 793                         error = soreserve(so, unpdg_sendspace, unpdg_recvspace);
 794                         if (error != 0)
 795                                 return (error);
 796                 }
 797                 break;
 798
 799         default:
 800                 panic("unp_attach");
 801         }
 802         KASSERT(solocked(so));
 803         unp = malloc(sizeof(*unp), M_PCB, M_NOWAIT);
 804         if (unp == NULL)
 805                 return (ENOBUFS);
 806         memset(unp, 0, sizeof(*unp));
 807         unp->unp_socket = so;
 808         so->so_pcb = unp;
 809         nanotime(&unp->unp_ctime);
 810         return (0);
 811 }
 812
 813 void
 814 unp_detach(struct unpcb *unp)
 815 {
 816         struct socket *so;
 817         vnode_t *vp;
 818
 819         so = unp->unp_socket;
 820
 821  retry:
 822         if ((vp = unp->unp_vnode) != NULL) {
 823                 sounlock(so);
 824                 /* Acquire v_interlock to protect against unp_connect(). */
 825                 /* XXXAD racy */
 826                 mutex_enter(&vp->v_interlock);
 827                 vp->v_socket = NULL;
 828                 vrelel(vp, 0);
 829                 solock(so);
 830                 unp->unp_vnode = NULL;
 831         }
 832         if (unp->unp_conn)
 833                 unp_disconnect(unp);
 834         while (unp->unp_refs) {
 835                 KASSERT(solocked2(so, unp->unp_refs->unp_socket));
 836                 if (unp_drop(unp->unp_refs, ECONNRESET)) {
 837                         solock(so);
 838                         goto retry;
 839                 }
 840         }
 841         soisdisconnected(so);
 842         so->so_pcb = NULL;
 843         if (unp_rights) {
 844                 /*
 845                  * Normally the receive buffer is flushed later, in sofree,
 846                  * but if our receive buffer holds references to files that
 847                  * are now garbage, we will enqueue those file references to
 848                  * the garbage collector and kick it into action.
 849                  */
 850                 sorflush(so);
 851                 unp_free(unp);
 852                 unp_thread_kick();
 853         } else
 854                 unp_free(unp);
 855 }
 856
 857 int
 858 unp_bind(struct socket *so, struct mbuf *nam, struct lwp *l)
 859 {
 860         struct sockaddr_un *sun;
 861         struct unpcb *unp;
 862         vnode_t *vp;
 863         struct vattr vattr;
 864         size_t addrlen;
 865         int error;
 866         struct nameidata nd;
 867         proc_t *p;
 868
 869         unp = sotounpcb(so);
 870         if (unp->unp_vnode != NULL)
 871                 return (EINVAL);
 872         if ((unp->unp_flags & UNP_BUSY) != 0) {
 873                 /*
 874                  * EALREADY may not be strictly accurate, but since this
 875                  * is a major application error it's hardly a big deal.
 876                  */
 877                 return (EALREADY);
 878         }
 879         unp->unp_flags |= UNP_BUSY;
 880         sounlock(so);
 881
 882         /*
 883          * Allocate the new sockaddr.  We have to allocate one
 884          * extra byte so that we can ensure that the pathname
 885          * is nul-terminated.
 886          */
 887         p = l->l_proc;
 888         addrlen = nam->m_len + 1;
 889         sun = malloc(addrlen, M_SONAME, M_WAITOK);
 890         m_copydata(nam, 0, nam->m_len, (void *)sun);
 891         *(((char *)sun) + nam->m_len) = '\0';
 892
 893         NDINIT(&nd, CREATE, FOLLOW | LOCKPARENT | TRYEMULROOT, UIO_SYSSPACE,
 894             sun->sun_path);
 895
 896 /* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
 897         if ((error = namei(&nd)) != 0)
 898                 goto bad;
 899         vp = nd.ni_vp;
 900         if (vp != NULL) {
 901                 VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
 902                 if (nd.ni_dvp == vp)
 903                         vrele(nd.ni_dvp);
 904                 else
 905                         vput(nd.ni_dvp);
 906                 vrele(vp);
 907                 error = EADDRINUSE;
 908                 goto bad;
 909         }
 910         vattr_null(&vattr);
 911         vattr.va_type = VSOCK;
 912         vattr.va_mode = ACCESSPERMS & ~(p->p_cwdi->cwdi_cmask);
 913         error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
 914         if (error)
 915                 goto bad;
 916         vp = nd.ni_vp;
 917         solock(so);
 918         vp->v_socket = unp->unp_socket;
 919         unp->unp_vnode = vp;
 920         unp->unp_addrlen = addrlen;
 921         unp->unp_addr = sun;
 922         unp->unp_connid.unp_pid = p->p_pid;
 923         unp->unp_connid.unp_euid = kauth_cred_geteuid(l->l_cred);
 924         unp->unp_connid.unp_egid = kauth_cred_getegid(l->l_cred);
 925         unp->unp_flags |= UNP_EIDSBIND;
 926         VOP_UNLOCK(vp, 0);
 927         unp->unp_flags &= ~UNP_BUSY;
 928         return (0);
 929
 930  bad:
 931         free(sun, M_SONAME);
 932         solock(so);
 933         unp->unp_flags &= ~UNP_BUSY;
 934         return (error);
 935 }
 936
 937 int
 938 unp_connect(struct socket *so, struct mbuf *nam, struct lwp *l)
 939 {
 940         struct sockaddr_un *sun;
 941         vnode_t *vp;
 942         struct socket *so2, *so3;
 943         struct unpcb *unp, *unp2, *unp3;
 944         size_t addrlen;
 945         int error;
 946         struct nameidata nd;
 947
 948         unp = sotounpcb(so);
 949         if ((unp->unp_flags & UNP_BUSY) != 0) {
 950                 /*
 951                  * EALREADY may not be strictly accurate, but since this
 952                  * is a major application error it's hardly a big deal.
 953                  */
 954                 return (EALREADY);
 955         }
 956         unp->unp_flags |= UNP_BUSY;
 957         sounlock(so);
 958
 959         /*
 960          * Allocate a temporary sockaddr.  We have to allocate one extra
 961          * byte so that we can ensure that the pathname is nul-terminated.
 962          * When we establish the connection, we copy the other PCB's
 963          * sockaddr to our own.
 964          */
 965         addrlen = nam->m_len + 1;
 966         sun = malloc(addrlen, M_SONAME, M_WAITOK);
 967         m_copydata(nam, 0, nam->m_len, (void *)sun);
 968         *(((char *)sun) + nam->m_len) = '\0';
 969
 970         NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, UIO_SYSSPACE,
 971             sun->sun_path);
 972
 973         if ((error = namei(&nd)) != 0)
 974                 goto bad2;
 975         vp = nd.ni_vp;
 976         if (vp->v_type != VSOCK) {
 977                 error = ENOTSOCK;
 978                 goto bad;
 979         }
 980         if ((error = VOP_ACCESS(vp, VWRITE, l->l_cred)) != 0)
 981                 goto bad;
 982         /* Acquire v_interlock to protect against unp_detach(). */
 983         mutex_enter(&vp->v_interlock);
 984         so2 = vp->v_socket;
 985         if (so2 == NULL) {
 986                 mutex_exit(&vp->v_interlock);
 987                 error = ECONNREFUSED;
 988                 goto bad;
 989         }
 990         if (so->so_type != so2->so_type) {
 991                 mutex_exit(&vp->v_interlock);
 992                 error = EPROTOTYPE;
 993                 goto bad;
 994         }
 995         solock(so);
 996         unp_resetlock(so);
 997         mutex_exit(&vp->v_interlock);
 998         if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
 999                 /*
1000                  * This may seem somewhat fragile but is OK: if we can
1001                  * see SO_ACCEPTCONN set on the endpoint, then it must
1002                  * be locked by the domain-wide uipc_lock.
1003                  */
1004                 KASSERT((so->so_options & SO_ACCEPTCONN) == 0 ||
1005                     so2->so_lock == uipc_lock);
1006                 if ((so2->so_options & SO_ACCEPTCONN) == 0 ||
1007                     (so3 = sonewconn(so2, 0)) == NULL) {
1008                         error = ECONNREFUSED;
1009                         sounlock(so);
1010                         goto bad;
1011                 }
1012                 unp2 = sotounpcb(so2);
1013                 unp3 = sotounpcb(so3);
1014                 if (unp2->unp_addr) {
1015                         unp3->unp_addr = malloc(unp2->unp_addrlen,
1016                             M_SONAME, M_WAITOK);
1017                         memcpy(unp3->unp_addr, unp2->unp_addr,
1018                             unp2->unp_addrlen);
1019                         unp3->unp_addrlen = unp2->unp_addrlen;
1020                 }
1021                 unp3->unp_flags = unp2->unp_flags;
1022                 unp3->unp_connid.unp_pid = l->l_proc->p_pid;
1023                 unp3->unp_connid.unp_euid = kauth_cred_geteuid(l->l_cred);
1024                 unp3->unp_connid.unp_egid = kauth_cred_getegid(l->l_cred);
1025                 unp3->unp_flags |= UNP_EIDSVALID;
1026                 if (unp2->unp_flags & UNP_EIDSBIND) {
1027                         unp->unp_connid = unp2->unp_connid;
1028                         unp->unp_flags |= UNP_EIDSVALID;
1029                 }
1030                 so2 = so3;
1031         }
1032         error = unp_connect2(so, so2, PRU_CONNECT);
1033         sounlock(so);
1034  bad:
1035         vput(vp);
1036  bad2:
1037         free(sun, M_SONAME);
1038         solock(so);
1039         unp->unp_flags &= ~UNP_BUSY;
1040         return (error);
1041 }
1042
1043 int
1044 unp_connect2(struct socket *so, struct socket *so2, int req)
1045 {
1046         struct unpcb *unp = sotounpcb(so);
1047         struct unpcb *unp2;
1048
1049         if (so2->so_type != so->so_type)
1050                 return (EPROTOTYPE);
1051
1052         /*
1053          * All three sockets involved must be locked by same lock:
1054          *
1055          * local endpoint (so)
1056          * remote endpoint (so2)
1057          * queue head (so->so_head, only if PR_CONNREQUIRED)
1058          */
1059         KASSERT(solocked2(so, so2));
1060         KASSERT(so->so_head == NULL);
1061         if (so2->so_head != NULL) {
1062                 KASSERT(so2->so_lock == uipc_lock);
1063                 KASSERT(solocked2(so2, so2->so_head));
1064         }
1065
1066         unp2 = sotounpcb(so2);
1067         unp->unp_conn = unp2;
1068         switch (so->so_type) {
1069
1070         case SOCK_DGRAM:
1071                 unp->unp_nextref = unp2->unp_refs;
1072                 unp2->unp_refs = unp;
1073                 soisconnected(so);
1074                 break;
1075
1076         case SOCK_STREAM:
1077                 unp2->unp_conn = unp;
1078                 if (req == PRU_CONNECT &&
1079                     ((unp->unp_flags | unp2->unp_flags) & UNP_CONNWAIT))
1080                         soisconnecting(so);
1081                 else
1082                         soisconnected(so);
1083                 soisconnected(so2);
1084                 /*
1085                  * If the connection is fully established, break the
1086                  * association with uipc_lock and give the connected
1087                  * pair a seperate lock to share.  For CONNECT2, we
1088                  * require that the locks already match (the sockets
1089                  * are created that way).
1090                  */
1091                 if (req == PRU_CONNECT) {
1092                         KASSERT(so2->so_head != NULL);
1093                         unp_setpeerlocks(so, so2);
1094                 }
1095                 break;
1096
1097         default:
1098                 panic("unp_connect2");
1099         }
1100         return (0);
1101 }
1102
1103 void
1104 unp_disconnect(struct unpcb *unp)
1105 {
1106         struct unpcb *unp2 = unp->unp_conn;
1107         struct socket *so;
1108
1109         if (unp2 == 0)
1110                 return;
1111         unp->unp_conn = 0;
1112         so = unp->unp_socket;
1113         switch (so->so_type) {
1114         case SOCK_DGRAM:
1115                 if (unp2->unp_refs == unp)
1116                         unp2->unp_refs = unp->unp_nextref;
1117                 else {
1118                         unp2 = unp2->unp_refs;
1119                         for (;;) {
1120                                 KASSERT(solocked2(so, unp2->unp_socket));
1121                                 if (unp2 == 0)
1122                                         panic("unp_disconnect");
1123                                 if (unp2->unp_nextref == unp)
1124                                         break;
1125                                 unp2 = unp2->unp_nextref;
1126                         }
1127                         unp2->unp_nextref = unp->unp_nextref;
1128                 }
1129                 unp->unp_nextref = 0;
1130                 so->so_state &= ~SS_ISCONNECTED;
1131                 break;
1132
1133         case SOCK_STREAM:
1134                 KASSERT(solocked2(so, unp2->unp_socket));
1135                 soisdisconnected(so);
1136                 unp2->unp_conn = 0;
1137                 soisdisconnected(unp2->unp_socket);
1138                 break;
1139         }
1140 }
1141
#ifdef notdef
/* Compiled out: aborting a PCB would simply detach it. */
unp_abort(struct unpcb *unp)
{

	unp_detach(unp);
}
#endif /* notdef */
1148
1149 void
1150 unp_shutdown(struct unpcb *unp)
1151 {
1152         struct socket *so;
1153
1154         if (unp->unp_socket->so_type == SOCK_STREAM && unp->unp_conn &&
1155             (so = unp->unp_conn->unp_socket))
1156                 socantrcvmore(so);
1157 }
1158
1159 bool
1160 unp_drop(struct unpcb *unp, int errno)
1161 {
1162         struct socket *so = unp->unp_socket;
1163
1164         KASSERT(solocked(so));
1165
1166         so->so_error = errno;
1167         unp_disconnect(unp);
1168         if (so->so_head) {
1169                 so->so_pcb = NULL;
1170                 /* sofree() drops the socket lock */
1171                 sofree(so);
1172                 unp_free(unp);
1173                 return true;
1174         }
1175         return false;
1176 }
1177
#ifdef notdef
/* Compiled out: placeholder with an empty body. */
unp_drain(void)
{

}
#endif /* notdef */
1184
1185 int
1186 unp_externalize(struct mbuf *rights, struct lwp *l)
1187 {
1188         struct cmsghdr *cm = mtod(rights, struct cmsghdr *);
1189         struct proc *p = l->l_proc;
1190         int i, *fdp;
1191         file_t **rp;
1192         file_t *fp;
1193         int nfds, error = 0;
1194
1195         nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) /
1196             sizeof(file_t *);
1197         rp = (file_t **)CMSG_DATA(cm);
1198
1199         fdp = malloc(nfds * sizeof(int), M_TEMP, M_WAITOK);
1200         rw_enter(&p->p_cwdi->cwdi_lock, RW_READER);
1201
1202         /* Make sure the recipient should be able to see the files.. */
1203         if (p->p_cwdi->cwdi_rdir != NULL) {
1204                 rp = (file_t **)CMSG_DATA(cm);
1205                 for (i = 0; i < nfds; i++) {
1206                         fp = *rp++;
1207                         /*
1208                          * If we are in a chroot'ed directory, and
1209                          * someone wants to pass us a directory, make
1210                          * sure it's inside the subtree we're allowed
1211                          * to access.
1212                          */
1213                         if (fp->f_type == DTYPE_VNODE) {
1214                                 vnode_t *vp = (vnode_t *)fp->f_data;
1215                                 if ((vp->v_type == VDIR) &&
1216                                     !vn_isunder(vp, p->p_cwdi->cwdi_rdir, l)) {
1217                                         error = EPERM;
1218                                         break;
1219                                 }
1220                         }
1221                 }
1222         }
1223
1224  restart:
1225         rp = (file_t **)CMSG_DATA(cm);
1226         if (error != 0) {
1227                 for (i = 0; i < nfds; i++) {
1228                         fp = *rp;
1229                         *rp++ = 0;
1230                         unp_discard_now(fp);
1231                 }
1232                 goto out;
1233         }
1234
1235         /*
1236          * First loop -- allocate file descriptor table slots for the
1237          * new files.
1238          */
1239         for (i = 0; i < nfds; i++) {
1240                 fp = *rp++;
1241                 if ((error = fd_alloc(p, 0, &fdp[i])) != 0) {
1242                         /*
1243                          * Back out what we've done so far.
1244                          */
1245                         for (--i; i >= 0; i--) {
1246                                 fd_abort(p, NULL, fdp[i]);
1247                         }
1248                         if (error == ENOSPC) {
1249                                 fd_tryexpand(p);
1250                                 error = 0;
1251                         } else {
1252                                 /*
1253                                  * This is the error that has historically
1254                                  * been returned, and some callers may
1255                                  * expect it.
1256                                  */
1257                                 error = EMSGSIZE;
1258                         }
1259                         goto restart;
1260                 }
1261         }
1262
1263         /*
1264          * Now that adding them has succeeded, update all of the
1265          * file passing state and affix the descriptors.
1266          */
1267         rp = (file_t **)CMSG_DATA(cm);
1268         for (i = 0; i < nfds; i++) {
1269                 fp = *rp++;
1270                 atomic_dec_uint(&unp_rights);
1271                 fd_affix(p, fp, fdp[i]);
1272                 mutex_enter(&fp->f_lock);
1273                 fp->f_msgcount--;
1274                 mutex_exit(&fp->f_lock);
1275                 /*
1276                  * Note that fd_affix() adds a reference to the file.
1277                  * The file may already have been closed by another
1278                  * LWP in the process, so we must drop the reference
1279                  * added by unp_internalize() with closef().
1280                  */
1281                 closef(fp);
1282         }
1283
1284         /*
1285          * Copy temporary array to message and adjust length, in case of
1286          * transition from large file_t pointers to ints.
1287          */
1288         memcpy(CMSG_DATA(cm), fdp, nfds * sizeof(int));
1289         cm->cmsg_len = CMSG_LEN(nfds * sizeof(int));
1290         rights->m_len = CMSG_SPACE(nfds * sizeof(int));
1291  out:
1292         rw_exit(&p->p_cwdi->cwdi_lock);
1293         free(fdp, M_TEMP);
1294         return (error);
1295 }
1296
1297 int
1298 unp_internalize(struct mbuf **controlp)
1299 {
1300         filedesc_t *fdescp = curlwp->l_fd;
1301         struct mbuf *control = *controlp;
1302         struct cmsghdr *newcm, *cm = mtod(control, struct cmsghdr *);
1303         file_t **rp, **files;
1304         file_t *fp;
1305         int i, fd, *fdp;
1306         int nfds, error;
1307         u_int maxmsg;
1308
1309         error = 0;
1310         newcm = NULL;
1311
1312         /* Sanity check the control message header. */
1313         if (cm->cmsg_type != SCM_RIGHTS || cm->cmsg_level != SOL_SOCKET ||
1314             cm->cmsg_len > control->m_len ||
1315             cm->cmsg_len < CMSG_ALIGN(sizeof(*cm)))
1316                 return (EINVAL);
1317
1318         /*
1319          * Verify that the file descriptors are valid, and acquire
1320          * a reference to each.
1321          */
1322         nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) / sizeof(int);
1323         fdp = (int *)CMSG_DATA(cm);
1324         maxmsg = maxfiles / unp_rights_ratio;
1325         for (i = 0; i < nfds; i++) {
1326                 fd = *fdp++;
1327                 if (atomic_inc_uint_nv(&unp_rights) > maxmsg) {
1328                         atomic_dec_uint(&unp_rights);
1329                         nfds = i;
1330                         error = EAGAIN;
1331                         goto out;
1332                 }
1333                 if ((fp = fd_getfile(fd)) == NULL) {
1334                         atomic_dec_uint(&unp_rights);
1335                         nfds = i;
1336                         error = EBADF;
1337                         goto out;
1338                 }
1339         }
1340
1341         /* Allocate new space and copy header into it. */
1342         newcm = malloc(CMSG_SPACE(nfds * sizeof(file_t *)), M_MBUF, M_WAITOK);
1343         if (newcm == NULL) {
1344                 error = E2BIG;
1345                 goto out;
1346         }
1347         memcpy(newcm, cm, sizeof(struct cmsghdr));
1348         files = (file_t **)CMSG_DATA(newcm);
1349
1350         /*
1351          * Transform the file descriptors into file_t pointers, in
1352          * reverse order so that if pointers are bigger than ints, the
1353          * int won't get until we're done.  No need to lock, as we have
1354          * already validated the descriptors with fd_getfile().
1355          */
1356         fdp = (int *)CMSG_DATA(cm) + nfds;
1357         rp = files + nfds;
1358         for (i = 0; i < nfds; i++) {
1359                 fp = fdescp->fd_dt->dt_ff[*--fdp]->ff_file;
1360                 KASSERT(fp != NULL);
1361                 mutex_enter(&fp->f_lock);
1362                 *--rp = fp;
1363                 fp->f_count++;
1364                 fp->f_msgcount++;
1365                 mutex_exit(&fp->f_lock);
1366         }
1367
1368  out:
1369         /* Release descriptor references. */
1370         fdp = (int *)CMSG_DATA(cm);
1371         for (i = 0; i < nfds; i++) {
1372                 fd_putfile(*fdp++);
1373                 if (error != 0) {
1374                         atomic_dec_uint(&unp_rights);
1375                 }
1376         }
1377
1378         if (error == 0) {
1379                 if (control->m_flags & M_EXT) {
1380                         m_freem(control);
1381                         *controlp = control = m_get(M_WAIT, MT_CONTROL);
1382                 }
1383                 MEXTADD(control, newcm, CMSG_SPACE(nfds * sizeof(file_t *)),
1384                     M_MBUF, NULL, NULL);
1385                 cm = newcm;
1386                 /*
1387                  * Adjust message & mbuf to note amount of space
1388                  * actually used.
1389                  */
1390                 cm->cmsg_len = CMSG_LEN(nfds * sizeof(file_t *));
1391                 control->m_len = CMSG_SPACE(nfds * sizeof(file_t *));
1392         }
1393
1394         return error;
1395 }
1396
1397 struct mbuf *
1398 unp_addsockcred(struct lwp *l, struct mbuf *control)
1399 {
1400         struct cmsghdr *cmp;
1401         struct sockcred *sc;
1402         struct mbuf *m, *n;
1403         int len, space, i;
1404
1405         len = CMSG_LEN(SOCKCREDSIZE(kauth_cred_ngroups(l->l_cred)));
1406         space = CMSG_SPACE(SOCKCREDSIZE(kauth_cred_ngroups(l->l_cred)));
1407
1408         m = m_get(M_WAIT, MT_CONTROL);
1409         if (space > MLEN) {
1410                 if (space > MCLBYTES)
1411                         MEXTMALLOC(m, space, M_WAITOK);
1412                 else
1413                         m_clget(m, M_WAIT);
1414                 if ((m->m_flags & M_EXT) == 0) {
1415                         m_free(m);
1416                         return (control);
1417                 }
1418         }
1419
1420         m->m_len = space;
1421         m->m_next = NULL;
1422         cmp = mtod(m, struct cmsghdr *);
1423         sc = (struct sockcred *)CMSG_DATA(cmp);
1424         cmp->cmsg_len = len;
1425         cmp->cmsg_level = SOL_SOCKET;
1426         cmp->cmsg_type = SCM_CREDS;
1427         sc->sc_uid = kauth_cred_getuid(l->l_cred);
1428         sc->sc_euid = kauth_cred_geteuid(l->l_cred);
1429         sc->sc_gid = kauth_cred_getgid(l->l_cred);
1430         sc->sc_egid = kauth_cred_getegid(l->l_cred);
1431         sc->sc_ngroups = kauth_cred_ngroups(l->l_cred);
1432         for (i = 0; i < sc->sc_ngroups; i++)
1433                 sc->sc_groups[i] = kauth_cred_group(l->l_cred, i);
1434
1435         /*
1436          * If a control message already exists, append us to the end.
1437          */
1438         if (control != NULL) {
1439                 for (n = control; n->m_next != NULL; n = n->m_next)
1440                         ;
1441                 n->m_next = m;
1442         } else
1443                 control = m;
1444
1445         return (control);
1446 }
1447
1448 /*
1449  * Do a mark-sweep GC of files in the system, to free up any which are
1450  * caught in flight to an about-to-be-closed socket.  Additionally,
1451  * process deferred file closures.
1452  */
1453 static void
1454 unp_gc(file_t *dp)
1455 {
1456         extern  struct domain unixdomain;
1457         file_t *fp, *np;
1458         struct socket *so, *so1;
1459         u_int i, old, new;
1460         bool didwork;
1461
1462         KASSERT(curlwp == unp_thread_lwp);
1463         KASSERT(mutex_owned(&filelist_lock));
1464
1465         /*
1466          * First, process deferred file closures.
1467          */
1468         while (!SLIST_EMPTY(&unp_thread_discard)) {
1469                 fp = SLIST_FIRST(&unp_thread_discard);
1470                 KASSERT(fp->f_unpcount > 0);
1471                 KASSERT(fp->f_count > 0);
1472                 KASSERT(fp->f_msgcount > 0);
1473                 KASSERT(fp->f_count >= fp->f_unpcount);
1474                 KASSERT(fp->f_count >= fp->f_msgcount);
1475                 KASSERT(fp->f_msgcount >= fp->f_unpcount);
1476                 SLIST_REMOVE_HEAD(&unp_thread_discard, f_unplist);
1477                 i = fp->f_unpcount;
1478                 fp->f_unpcount = 0;
1479                 mutex_exit(&filelist_lock);
1480                 for (; i != 0; i--) {
1481                         unp_discard_now(fp);
1482                 }
1483                 mutex_enter(&filelist_lock);
1484         }
1485
1486         /*
1487          * Clear mark bits.  Ensure that we don't consider new files
1488          * entering the file table during this loop (they will not have
1489          * FSCAN set).
1490          */
1491         unp_defer = 0;
1492         LIST_FOREACH(fp, &filehead, f_list) {
1493                 for (old = fp->f_flag;; old = new) {
1494                         new = atomic_cas_uint(&fp->f_flag, old,
1495                             (old | FSCAN) & ~(FMARK|FDEFER));
1496                         if (__predict_true(old == new)) {
1497                                 break;
1498                         }
1499                 }
1500         }
1501
1502         /*
1503          * Iterate over the set of sockets, marking ones believed (based on
1504          * refcount) to be referenced from a process, and marking for rescan
1505          * sockets which are queued on a socket.  Recan continues descending
1506          * and searching for sockets referenced by sockets (FDEFER), until
1507          * there are no more socket->socket references to be discovered.
1508          */
1509         do {
1510                 didwork = false;
1511                 for (fp = LIST_FIRST(&filehead); fp != NULL; fp = np) {
1512                         KASSERT(mutex_owned(&filelist_lock));
1513                         np = LIST_NEXT(fp, f_list);
1514                         mutex_enter(&fp->f_lock);
1515                         if ((fp->f_flag & FDEFER) != 0) {
1516                                 atomic_and_uint(&fp->f_flag, ~FDEFER);
1517                                 unp_defer--;
1518                                 KASSERT(fp->f_count != 0);
1519                         } else {
1520                                 if (fp->f_count == 0 ||
1521                                     (fp->f_flag & FMARK) != 0 ||
1522                                     fp->f_count == fp->f_msgcount ||
1523                                     fp->f_unpcount != 0) {
1524                                         mutex_exit(&fp->f_lock);
1525                                         continue;
1526                                 }
1527                         }
1528                         atomic_or_uint(&fp->f_flag, FMARK);
1529
1530                         if (fp->f_type != DTYPE_SOCKET ||
1531                             (so = fp->f_data) == NULL ||
1532                             so->so_proto->pr_domain != &unixdomain ||
1533                             (so->so_proto->pr_flags & PR_RIGHTS) == 0) {
1534                                 mutex_exit(&fp->f_lock);
1535                                 continue;
1536                         }
1537
1538                         /* Gain file ref, mark our position, and unlock. */
1539                         didwork = true;
1540                         LIST_INSERT_AFTER(fp, dp, f_list);
1541                         fp->f_count++;
1542                         mutex_exit(&fp->f_lock);
1543                         mutex_exit(&filelist_lock);
1544
1545                         /*
1546                          * Mark files referenced from sockets queued on the
1547                          * accept queue as well.
1548                          */
1549                         solock(so);
1550                         unp_scan(so->so_rcv.sb_mb, unp_mark, 0);
1551                         if ((so->so_options & SO_ACCEPTCONN) != 0) {
1552                                 TAILQ_FOREACH(so1, &so->so_q0, so_qe) {
1553                                         unp_scan(so1->so_rcv.sb_mb, unp_mark, 0);
1554                                 }
1555                                 TAILQ_FOREACH(so1, &so->so_q, so_qe) {
1556                                         unp_scan(so1->so_rcv.sb_mb, unp_mark, 0);
1557                                 }
1558                         }
1559                         sounlock(so);
1560
1561                         /* Re-lock and restart from where we left off. */
1562                         closef(fp);
1563                         mutex_enter(&filelist_lock);
1564                         np = LIST_NEXT(dp, f_list);
1565                         LIST_REMOVE(dp, f_list);
1566                 }
1567                 /*
1568                  * Bail early if we did nothing in the loop above.  Could
1569                  * happen because of concurrent activity causing unp_defer
1570                  * to get out of sync.
1571                  */
1572         } while (unp_defer != 0 && didwork);
1573
1574         /*
1575          * Sweep pass.
1576          *
1577          * We grab an extra reference to each of the files that are
1578          * not otherwise accessible and then free the rights that are
1579          * stored in messages on them.
1580          */
1581         for (fp = LIST_FIRST(&filehead); fp != NULL; fp = np) {
1582                 KASSERT(mutex_owned(&filelist_lock));
1583                 np = LIST_NEXT(fp, f_list);
1584                 mutex_enter(&fp->f_lock);
1585
1586                 /*
1587                  * Ignore non-sockets.
1588                  * Ignore dead sockets, or sockets with pending close.
1589                  * Ignore sockets obviously referenced elsewhere.
1590                  * Ignore sockets marked as referenced by our scan.
1591                  * Ignore new sockets that did not exist during the scan.
1592                  */
1593                 if (fp->f_type != DTYPE_SOCKET ||
1594                     fp->f_count == 0 || fp->f_unpcount != 0 ||
1595                     fp->f_count != fp->f_msgcount ||
1596                     (fp->f_flag & (FMARK | FSCAN)) != FSCAN) {
1597                         mutex_exit(&fp->f_lock);
1598                         continue;
1599                 }
1600
1601                 /* Gain file ref, mark our position, and unlock. */
1602                 LIST_INSERT_AFTER(fp, dp, f_list);
1603                 fp->f_count++;
1604                 mutex_exit(&fp->f_lock);
1605                 mutex_exit(&filelist_lock);
1606
1607                 /*
1608                  * Flush all data from the socket's receive buffer.
1609                  * This will cause files referenced only by the
1610                  * socket to be queued for close.
1611                  */
1612                 so = fp->f_data;
1613                 solock(so);
1614                 sorflush(so);
1615                 sounlock(so);
1616
1617                 /* Re-lock and restart from where we left off. */
1618                 closef(fp);
1619                 mutex_enter(&filelist_lock);
1620                 np = LIST_NEXT(dp, f_list);
1621                 LIST_REMOVE(dp, f_list);
1622         }
1623 }
1624
1625 /*
1626  * Garbage collector thread.  While SCM_RIGHTS messages are in transit,
1627  * wake once per second to garbage collect.  Run continually while we
1628  * have deferred closes to process.
1629  */
1630 static void
1631 unp_thread(void *cookie)
1632 {
1633         file_t *dp;
1634
1635         /* Allocate a dummy file for our scans. */
1636         if ((dp = fgetdummy()) == NULL) {
1637                 panic("unp_thread");
1638         }
1639
1640         mutex_enter(&filelist_lock);
1641         for (;;) {
1642                 KASSERT(mutex_owned(&filelist_lock));
1643                 if (SLIST_EMPTY(&unp_thread_discard)) {
1644                         if (unp_rights != 0) {
1645                                 (void)cv_timedwait(&unp_thread_cv,
1646                                     &filelist_lock, hz);
1647                         } else {
1648                                 cv_wait(&unp_thread_cv, &filelist_lock);
1649                         }
1650                 }
1651                 unp_gc(dp);
1652         }
1653         /* NOTREACHED */
1654 }
1655
1656 /*
1657  * Kick the garbage collector into action if there is something for
1658  * it to process.
1659  */
1660 static void
1661 unp_thread_kick(void)
1662 {
1663
1664         if (!SLIST_EMPTY(&unp_thread_discard) || unp_rights != 0) {
1665                 mutex_enter(&filelist_lock);
1666                 cv_signal(&unp_thread_cv);
1667                 mutex_exit(&filelist_lock);
1668         }
1669 }
1670
1671 void
1672 unp_dispose(struct mbuf *m)
1673 {
1674
1675         if (m)
1676                 unp_scan(m, unp_discard_later, 1);
1677 }
1678
1679 void
1680 unp_scan(struct mbuf *m0, void (*op)(file_t *), int discard)
1681 {
1682         struct mbuf *m;
1683         file_t **rp, *fp;
1684         struct cmsghdr *cm;
1685         int i, qfds;
1686
1687         while (m0) {
1688                 for (m = m0; m; m = m->m_next) {
1689                         if (m->m_type != MT_CONTROL ||
1690                             m->m_len < sizeof(*cm)) {
1691                                 continue;
1692                         }
1693                         cm = mtod(m, struct cmsghdr *);
1694                         if (cm->cmsg_level != SOL_SOCKET ||
1695                             cm->cmsg_type != SCM_RIGHTS)
1696                                 continue;
1697                         qfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm)))
1698                             / sizeof(file_t *);
1699                         rp = (file_t **)CMSG_DATA(cm);
1700                         for (i = 0; i < qfds; i++) {
1701                                 fp = *rp;
1702                                 if (discard) {
1703                                         *rp = 0;
1704                                 }
1705                                 (*op)(fp);
1706                                 rp++;
1707                         }
1708                 }
1709                 m0 = m0->m_nextpkt;
1710         }
1711 }
1712
1713 void
1714 unp_mark(file_t *fp)
1715 {
1716
1717         if (fp == NULL)
1718                 return;
1719
1720         /* If we're already deferred, don't screw up the defer count */
1721         mutex_enter(&fp->f_lock);
1722         if (fp->f_flag & (FMARK | FDEFER)) {
1723                 mutex_exit(&fp->f_lock);
1724                 return;
1725         }
1726
1727         /*
1728          * Minimize the number of deferrals...  Sockets are the only type of
1729          * file which can hold references to another file, so just mark
1730          * other files, and defer unmarked sockets for the next pass.
1731          */
1732         if (fp->f_type == DTYPE_SOCKET) {
1733                 unp_defer++;
1734                 KASSERT(fp->f_count != 0);
1735                 atomic_or_uint(&fp->f_flag, FDEFER);
1736         } else {
1737                 atomic_or_uint(&fp->f_flag, FMARK);
1738         }
1739         mutex_exit(&fp->f_lock);
1740 }
1741
1742 static void
1743 unp_discard_now(file_t *fp)
1744 {
1745
1746         if (fp == NULL)
1747                 return;
1748
1749         KASSERT(fp->f_count > 0);
1750         KASSERT(fp->f_msgcount > 0);
1751
1752         mutex_enter(&fp->f_lock);
1753         fp->f_msgcount--;
1754         mutex_exit(&fp->f_lock);
1755         atomic_dec_uint(&unp_rights);
1756         (void)closef(fp);
1757 }
1758
1759 static void
1760 unp_discard_later(file_t *fp)
1761 {
1762
1763         if (fp == NULL)
1764                 return;
1765
1766         KASSERT(fp->f_count > 0);
1767         KASSERT(fp->f_msgcount > 0);
1768
1769         mutex_enter(&filelist_lock);
1770         if (fp->f_unpcount++ == 0) {
1771                 SLIST_INSERT_HEAD(&unp_thread_discard, fp, f_unplist);
1772         }
1773         mutex_exit(&filelist_lock);
1774 }