1 /* $NetBSD: uipc_socket.c,v 1.199 2009/12/30 06:58:50 elad Exp $ */
4 * Copyright (c) 2002, 2007, 2008, 2009 The NetBSD Foundation, Inc.
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Jason R. Thorpe of Wasabi Systems, Inc, and by Andrew Doran.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
33 * Copyright (c) 2004 The FreeBSD Foundation
34 * Copyright (c) 2004 Robert Watson
35 * Copyright (c) 1982, 1986, 1988, 1990, 1993
36 * The Regents of the University of California. All rights reserved.
38 * Redistribution and use in source and binary forms, with or without
39 * modification, are permitted provided that the following conditions
41 * 1. Redistributions of source code must retain the above copyright
42 * notice, this list of conditions and the following disclaimer.
43 * 2. Redistributions in binary form must reproduce the above copyright
44 * notice, this list of conditions and the following disclaimer in the
45 * documentation and/or other materials provided with the distribution.
46 * 3. Neither the name of the University nor the names of its contributors
47 * may be used to endorse or promote products derived from this software
48 * without specific prior written permission.
50 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
51 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
52 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
53 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
54 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
55 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
56 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
57 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
58 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
59 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
62 * @(#)uipc_socket.c 8.6 (Berkeley) 5/2/95
65 #include <sys/cdefs.h>
66 __KERNEL_RCSID(0, "$NetBSD: uipc_socket.c,v 1.199 2009/12/30 06:58:50 elad Exp $");
68 #include "opt_compat_netbsd.h"
69 #include "opt_sock_counters.h"
70 #include "opt_sosend_loan.h"
71 #include "opt_mbuftrace.h"
72 #include "opt_somaxkva.h"
73 #include "opt_multiprocessor.h" /* XXX */
75 #include <sys/param.h>
76 #include <sys/systm.h>
79 #include <sys/filedesc.h>
82 #include <sys/domain.h>
83 #include <sys/kernel.h>
84 #include <sys/protosw.h>
85 #include <sys/socket.h>
86 #include <sys/socketvar.h>
87 #include <sys/signalvar.h>
88 #include <sys/resourcevar.h>
89 #include <sys/uidinfo.h>
90 #include <sys/event.h>
92 #include <sys/kauth.h>
93 #include <sys/mutex.h>
94 #include <sys/condvar.h>
97 #include <compat/sys/time.h>
98 #include <compat/sys/socket.h>
103 MALLOC_DEFINE(M_SOOPTS
, "soopts", "socket options");
104 MALLOC_DEFINE(M_SONAME
, "soname", "socket name");
106 extern const struct fileops socketops
;
108 extern int somaxconn
; /* patchable (XXX sysctl) */
109 int somaxconn
= SOMAXCONN
;
110 kmutex_t
*softnet_lock
;
112 #ifdef SOSEND_COUNTERS
113 #include <sys/device.h>
115 static struct evcnt sosend_loan_big
= EVCNT_INITIALIZER(EVCNT_TYPE_MISC
,
116 NULL
, "sosend", "loan big");
117 static struct evcnt sosend_copy_big
= EVCNT_INITIALIZER(EVCNT_TYPE_MISC
,
118 NULL
, "sosend", "copy big");
119 static struct evcnt sosend_copy_small
= EVCNT_INITIALIZER(EVCNT_TYPE_MISC
,
120 NULL
, "sosend", "copy small");
121 static struct evcnt sosend_kvalimit
= EVCNT_INITIALIZER(EVCNT_TYPE_MISC
,
122 NULL
, "sosend", "kva limit");
124 #define SOSEND_COUNTER_INCR(ev) (ev)->ev_count++
126 EVCNT_ATTACH_STATIC(sosend_loan_big
);
127 EVCNT_ATTACH_STATIC(sosend_copy_big
);
128 EVCNT_ATTACH_STATIC(sosend_copy_small
);
129 EVCNT_ATTACH_STATIC(sosend_kvalimit
);
132 #define SOSEND_COUNTER_INCR(ev) /* nothing */
134 #endif /* SOSEND_COUNTERS */
136 static struct callback_entry sokva_reclaimerentry
;
138 #if defined(SOSEND_NO_LOAN) || defined(MULTIPROCESSOR)
139 int sock_loan_thresh
= -1;
141 int sock_loan_thresh
= 4096;
144 static kmutex_t so_pendfree_lock
;
145 static struct mbuf
*so_pendfree
;
148 #define SOMAXKVA (16 * 1024 * 1024)
150 int somaxkva
= SOMAXKVA
;
152 static kcondvar_t socurkva_cv
;
154 static kauth_listener_t socket_listener
;
156 #define SOCK_LOAN_CHUNK 65536
158 static size_t sodopendfree(void);
159 static size_t sodopendfreel(void);
161 static void sysctl_kern_somaxkva_setup(void);
162 static struct sysctllog
*socket_sysctllog
;
165 sokvareserve(struct socket
*so
, vsize_t len
)
169 mutex_enter(&so_pendfree_lock
);
170 while (socurkva
+ len
> somaxkva
) {
174 * try to do pendfree.
177 freed
= sodopendfreel();
180 * if some kva was freed, try again.
186 SOSEND_COUNTER_INCR(&sosend_kvalimit
);
187 error
= cv_wait_sig(&socurkva_cv
, &so_pendfree_lock
);
194 mutex_exit(&so_pendfree_lock
);
199 sokvaunreserve(vsize_t len
)
202 mutex_enter(&so_pendfree_lock
);
204 cv_broadcast(&socurkva_cv
);
205 mutex_exit(&so_pendfree_lock
);
209 * sokvaalloc: allocate kva for loan.
213 sokvaalloc(vsize_t len
, struct socket
*so
)
221 if (sokvareserve(so
, len
) == 0)
228 lva
= uvm_km_alloc(kernel_map
, len
, 0, UVM_KMF_VAONLY
| UVM_KMF_WAITVA
);
238 * sokvafree: free kva for loan.
242 sokvafree(vaddr_t sva
, vsize_t len
)
249 uvm_km_free(kernel_map
, sva
, len
, UVM_KMF_VAONLY
);
259 sodoloanfree(struct vm_page
**pgs
, void *buf
, size_t size
)
265 KASSERT(pgs
!= NULL
);
267 eva
= round_page((vaddr_t
) buf
+ size
);
268 sva
= trunc_page((vaddr_t
) buf
);
270 npgs
= len
>> PAGE_SHIFT
;
272 pmap_kremove(sva
, len
);
273 pmap_update(pmap_kernel());
274 uvm_unloan(pgs
, npgs
, UVM_LOAN_TOPAGE
);
283 if (__predict_true(so_pendfree
== NULL
))
286 mutex_enter(&so_pendfree_lock
);
287 rv
= sodopendfreel();
288 mutex_exit(&so_pendfree_lock
);
294 * sodopendfreel: free mbufs on "pendfree" list.
295 * unlock and relock so_pendfree_lock when freeing mbufs.
297 * => called with so_pendfree_lock held.
303 struct mbuf
*m
, *next
;
306 KASSERT(mutex_owned(&so_pendfree_lock
));
308 while (so_pendfree
!= NULL
) {
311 mutex_exit(&so_pendfree_lock
);
313 for (; m
!= NULL
; m
= next
) {
315 KASSERT((~m
->m_flags
& (M_EXT
|M_EXT_PAGES
)) == 0);
316 KASSERT(m
->m_ext
.ext_refcnt
== 0);
318 rv
+= m
->m_ext
.ext_size
;
319 sodoloanfree(m
->m_ext
.ext_pgs
, m
->m_ext
.ext_buf
,
321 pool_cache_put(mb_cache
, m
);
324 mutex_enter(&so_pendfree_lock
);
331 soloanfree(struct mbuf
*m
, void *buf
, size_t size
, void *arg
)
337 * postpone freeing mbuf.
339 * we can't do it in interrupt context
340 * because we need to put kva back to kernel_map.
343 mutex_enter(&so_pendfree_lock
);
344 m
->m_next
= so_pendfree
;
346 cv_broadcast(&socurkva_cv
);
347 mutex_exit(&so_pendfree_lock
);
351 sosend_loan(struct socket
*so
, struct uio
*uio
, struct mbuf
*m
, long space
)
353 struct iovec
*iov
= uio
->uio_iov
;
361 if (VMSPACE_IS_KERNEL_P(uio
->uio_vmspace
))
364 if (iov
->iov_len
< (size_t) space
)
365 space
= iov
->iov_len
;
366 if (space
> SOCK_LOAN_CHUNK
)
367 space
= SOCK_LOAN_CHUNK
;
369 eva
= round_page((vaddr_t
) iov
->iov_base
+ space
);
370 sva
= trunc_page((vaddr_t
) iov
->iov_base
);
372 npgs
= len
>> PAGE_SHIFT
;
374 KASSERT(npgs
<= M_EXT_MAXPAGES
);
376 lva
= sokvaalloc(len
, so
);
380 error
= uvm_loan(&uio
->uio_vmspace
->vm_map
, sva
, len
,
381 m
->m_ext
.ext_pgs
, UVM_LOAN_TOPAGE
);
387 for (i
= 0, va
= lva
; i
< npgs
; i
++, va
+= PAGE_SIZE
)
388 pmap_kenter_pa(va
, VM_PAGE_TO_PHYS(m
->m_ext
.ext_pgs
[i
]),
390 pmap_update(pmap_kernel());
392 lva
+= (vaddr_t
) iov
->iov_base
& PAGE_MASK
;
394 MEXTADD(m
, (void *) lva
, space
, M_MBUF
, soloanfree
, so
);
395 m
->m_flags
|= M_EXT_PAGES
| M_EXT_ROMAP
;
397 uio
->uio_resid
-= space
;
398 /* uio_offset not updated, not set/used for write(2) */
399 uio
->uio_iov
->iov_base
= (char *)uio
->uio_iov
->iov_base
+ space
;
400 uio
->uio_iov
->iov_len
-= space
;
401 if (uio
->uio_iov
->iov_len
== 0) {
410 sokva_reclaim_callback(struct callback_entry
*ce
, void *obj
, void *arg
)
413 KASSERT(ce
== &sokva_reclaimerentry
);
414 KASSERT(obj
== NULL
);
417 if (!vm_map_starved_p(kernel_map
)) {
418 return CALLBACK_CHAIN_ABORT
;
420 return CALLBACK_CHAIN_CONTINUE
;
424 getsombuf(struct socket
*so
, int type
)
428 m
= m_get(M_WAIT
, type
);
429 MCLAIM(m
, so
->so_mowner
);
434 socket_listener_cb(kauth_cred_t cred
, kauth_action_t action
, void *cookie
,
435 void *arg0
, void *arg1
, void *arg2
, void *arg3
)
438 enum kauth_network_req req
;
440 result
= KAUTH_RESULT_DEFER
;
441 req
= (enum kauth_network_req
)arg0
;
443 if ((action
!= KAUTH_NETWORK_SOCKET
) &&
444 (action
!= KAUTH_NETWORK_BIND
))
448 case KAUTH_REQ_NETWORK_BIND_PORT
:
449 result
= KAUTH_RESULT_ALLOW
;
452 case KAUTH_REQ_NETWORK_SOCKET_DROP
: {
453 /* Normal users can only drop their own connections. */
454 struct socket
*so
= (struct socket
*)arg1
;
456 if (proc_uidmatch(cred
, so
->so_cred
))
457 result
= KAUTH_RESULT_ALLOW
;
462 case KAUTH_REQ_NETWORK_SOCKET_OPEN
:
463 /* We allow "raw" routing/bluetooth sockets to anyone. */
464 if ((u_long
)arg1
== PF_ROUTE
|| (u_long
)arg1
== PF_BLUETOOTH
)
465 result
= KAUTH_RESULT_ALLOW
;
467 /* Privileged, let secmodel handle this. */
468 if ((u_long
)arg2
== SOCK_RAW
)
472 result
= KAUTH_RESULT_ALLOW
;
476 case KAUTH_REQ_NETWORK_SOCKET_CANSEE
:
477 result
= KAUTH_RESULT_ALLOW
;
492 sysctl_kern_somaxkva_setup();
494 mutex_init(&so_pendfree_lock
, MUTEX_DEFAULT
, IPL_VM
);
495 softnet_lock
= mutex_obj_alloc(MUTEX_DEFAULT
, IPL_NONE
);
496 cv_init(&socurkva_cv
, "sokva");
499 /* Set the initial adjusted socket buffer size. */
500 if (sb_max_set(sb_max
))
501 panic("bad initial sb_max value: %lu", sb_max
);
503 callback_register(&vm_map_to_kernel(kernel_map
)->vmk_reclaim_callback
,
504 &sokva_reclaimerentry
, NULL
, sokva_reclaim_callback
);
506 socket_listener
= kauth_listen_scope(KAUTH_SCOPE_NETWORK
,
507 socket_listener_cb
, NULL
);
511 * Socket operation routines.
512 * These routines are called by the routines in
513 * sys_socket.c or from a system process, and
514 * implement the semantics of socket operations by
515 * switching out to the protocol specific routines.
519 socreate(int dom
, struct socket
**aso
, int type
, int proto
, struct lwp
*l
,
520 struct socket
*lockso
)
522 const struct protosw
*prp
;
528 error
= kauth_authorize_network(l
->l_cred
, KAUTH_NETWORK_SOCKET
,
529 KAUTH_REQ_NETWORK_SOCKET_OPEN
, KAUTH_ARG(dom
), KAUTH_ARG(type
),
535 prp
= pffindproto(dom
, proto
, type
);
537 prp
= pffindtype(dom
, type
);
539 /* no support for domain */
540 if (pffinddomain(dom
) == 0)
542 /* no support for socket type */
543 if (proto
== 0 && type
!= 0)
545 return EPROTONOSUPPORT
;
547 if (prp
->pr_usrreq
== NULL
)
548 return EPROTONOSUPPORT
;
549 if (prp
->pr_type
!= type
)
555 so
->so_send
= sosend
;
556 so
->so_receive
= soreceive
;
558 so
->so_rcv
.sb_mowner
= &prp
->pr_domain
->dom_mowner
;
559 so
->so_snd
.sb_mowner
= &prp
->pr_domain
->dom_mowner
;
560 so
->so_mowner
= &prp
->pr_domain
->dom_mowner
;
562 uid
= kauth_cred_geteuid(l
->l_cred
);
563 so
->so_uidinfo
= uid_find(uid
);
564 so
->so_cpid
= l
->l_proc
->p_pid
;
565 if (lockso
!= NULL
) {
566 /* Caller wants us to share a lock. */
567 lock
= lockso
->so_lock
;
569 mutex_obj_hold(lock
);
572 /* Lock assigned and taken during PRU_ATTACH. */
574 error
= (*prp
->pr_usrreq
)(so
, PRU_ATTACH
, NULL
,
575 (struct mbuf
*)(long)proto
, NULL
, l
);
576 KASSERT(solocked(so
));
578 so
->so_state
|= SS_NOFDREF
;
582 so
->so_cred
= kauth_cred_dup(l
->l_cred
);
588 /* On success, write file descriptor to fdout and return zero. On
589 * failure, return non-zero; *fdout will be undefined.
592 fsocreate(int domain
, struct socket
**sop
, int type
, int protocol
,
593 struct lwp
*l
, int *fdout
)
599 if ((error
= fd_allocfile(&fp
, &fd
)) != 0)
601 fp
->f_flag
= FREAD
|FWRITE
;
602 fp
->f_type
= DTYPE_SOCKET
;
603 fp
->f_ops
= &socketops
;
604 error
= socreate(domain
, &so
, type
, protocol
, l
, NULL
);
606 fd_abort(curproc
, fp
, fd
);
611 fd_affix(curproc
, fp
, fd
);
618 sofamily(const struct socket
*so
)
620 const struct protosw
*pr
;
621 const struct domain
*dom
;
623 if ((pr
= so
->so_proto
) == NULL
)
625 if ((dom
= pr
->pr_domain
) == NULL
)
627 return dom
->dom_family
;
631 sobind(struct socket
*so
, struct mbuf
*nam
, struct lwp
*l
)
636 error
= (*so
->so_proto
->pr_usrreq
)(so
, PRU_BIND
, NULL
, nam
, NULL
, l
);
642 solisten(struct socket
*so
, int backlog
, struct lwp
*l
)
647 if ((so
->so_state
& (SS_ISCONNECTED
| SS_ISCONNECTING
|
648 SS_ISDISCONNECTING
)) != 0) {
652 error
= (*so
->so_proto
->pr_usrreq
)(so
, PRU_LISTEN
, NULL
,
658 if (TAILQ_EMPTY(&so
->so_q
))
659 so
->so_options
|= SO_ACCEPTCONN
;
662 so
->so_qlimit
= min(backlog
, somaxconn
);
668 sofree(struct socket
*so
)
672 KASSERT(solocked(so
));
674 if (so
->so_pcb
|| (so
->so_state
& SS_NOFDREF
) == 0) {
680 * We must not decommission a socket that's on the accept(2)
681 * queue. If we do, then accept(2) may hang after select(2)
682 * indicated that the listening socket was ready.
684 if (!soqremque(so
, 0)) {
689 if (so
->so_rcv
.sb_hiwat
)
690 (void)chgsbsize(so
->so_uidinfo
, &so
->so_rcv
.sb_hiwat
, 0,
692 if (so
->so_snd
.sb_hiwat
)
693 (void)chgsbsize(so
->so_uidinfo
, &so
->so_snd
.sb_hiwat
, 0,
695 sbrelease(&so
->so_snd
, so
);
696 KASSERT(!cv_has_waiters(&so
->so_cv
));
697 KASSERT(!cv_has_waiters(&so
->so_rcv
.sb_cv
));
698 KASSERT(!cv_has_waiters(&so
->so_snd
.sb_cv
));
700 refs
= so
->so_aborting
; /* XXX */
701 /* Remove acccept filter if one is present. */
702 if (so
->so_accf
!= NULL
)
703 (void)accept_filt_clear(so
);
705 if (refs
== 0) /* XXX */
710 * Close a socket on last file table reference removal.
711 * Initiate disconnect if connected.
712 * Free socket when disconnect complete.
715 soclose(struct socket
*so
)
723 if (so
->so_options
& SO_ACCEPTCONN
) {
725 if ((so2
= TAILQ_FIRST(&so
->so_q0
)) != 0) {
726 KASSERT(solocked2(so
, so2
));
727 (void) soqremque(so2
, 0);
728 /* soabort drops the lock. */
733 if ((so2
= TAILQ_FIRST(&so
->so_q
)) != 0) {
734 KASSERT(solocked2(so
, so2
));
735 (void) soqremque(so2
, 1);
736 /* soabort drops the lock. */
746 if (so
->so_state
& SS_ISCONNECTED
) {
747 if ((so
->so_state
& SS_ISDISCONNECTING
) == 0) {
748 error
= sodisconnect(so
);
752 if (so
->so_options
& SO_LINGER
) {
753 if ((so
->so_state
& SS_ISDISCONNECTING
) && so
->so_nbio
)
755 while (so
->so_state
& SS_ISCONNECTED
) {
756 error
= sowait(so
, true, so
->so_linger
* hz
);
764 error2
= (*so
->so_proto
->pr_usrreq
)(so
, PRU_DETACH
,
765 NULL
, NULL
, NULL
, NULL
);
770 if (so
->so_state
& SS_NOFDREF
)
771 panic("soclose: NOFDREF");
772 kauth_cred_free(so
->so_cred
);
773 so
->so_state
|= SS_NOFDREF
;
779 * Must be called with the socket locked.. Will return with it unlocked.
782 soabort(struct socket
*so
)
787 KASSERT(solocked(so
));
788 KASSERT(so
->so_head
== NULL
);
790 so
->so_aborting
++; /* XXX */
791 error
= (*so
->so_proto
->pr_usrreq
)(so
, PRU_ABORT
, NULL
,
793 refs
= --so
->so_aborting
; /* XXX */
794 if (error
|| (refs
== 0)) {
803 soaccept(struct socket
*so
, struct mbuf
*nam
)
807 KASSERT(solocked(so
));
810 if ((so
->so_state
& SS_NOFDREF
) == 0)
811 panic("soaccept: !NOFDREF");
812 so
->so_state
&= ~SS_NOFDREF
;
813 if ((so
->so_state
& SS_ISDISCONNECTED
) == 0 ||
814 (so
->so_proto
->pr_flags
& PR_ABRTACPTDIS
) == 0)
815 error
= (*so
->so_proto
->pr_usrreq
)(so
, PRU_ACCEPT
,
816 NULL
, nam
, NULL
, NULL
);
818 error
= ECONNABORTED
;
824 soconnect(struct socket
*so
, struct mbuf
*nam
, struct lwp
*l
)
828 KASSERT(solocked(so
));
830 if (so
->so_options
& SO_ACCEPTCONN
)
833 * If protocol is connection-based, can only connect once.
834 * Otherwise, if connected, try to disconnect first.
835 * This allows user to disconnect by connecting to, e.g.,
838 if (so
->so_state
& (SS_ISCONNECTED
|SS_ISCONNECTING
) &&
839 ((so
->so_proto
->pr_flags
& PR_CONNREQUIRED
) ||
840 (error
= sodisconnect(so
))))
843 error
= (*so
->so_proto
->pr_usrreq
)(so
, PRU_CONNECT
,
849 soconnect2(struct socket
*so1
, struct socket
*so2
)
853 KASSERT(solocked2(so1
, so2
));
855 error
= (*so1
->so_proto
->pr_usrreq
)(so1
, PRU_CONNECT2
,
856 NULL
, (struct mbuf
*)so2
, NULL
, NULL
);
861 sodisconnect(struct socket
*so
)
865 KASSERT(solocked(so
));
867 if ((so
->so_state
& SS_ISCONNECTED
) == 0) {
869 } else if (so
->so_state
& SS_ISDISCONNECTING
) {
872 error
= (*so
->so_proto
->pr_usrreq
)(so
, PRU_DISCONNECT
,
873 NULL
, NULL
, NULL
, NULL
);
879 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
882 * If send must go all at once and message is larger than
883 * send buffering, then hard error.
884 * Lock against other senders.
885 * If must go all at once and not enough room now, then
886 * inform user that this would block and do nothing.
887 * Otherwise, if nonblocking, send as much as possible.
888 * The data to be sent is described by "uio" if nonzero,
889 * otherwise by the mbuf chain "top" (which must be null
890 * if uio is not). Data provided in mbuf chain must be small
891 * enough to send all at once.
893 * Returns nonzero on error, timeout or signal; callers
894 * must check for short counts if EINTR/ERESTART are returned.
895 * Data and control buffers are freed on return.
898 sosend(struct socket
*so
, struct mbuf
*addr
, struct uio
*uio
, struct mbuf
*top
,
899 struct mbuf
*control
, int flags
, struct lwp
*l
)
901 struct mbuf
**mp
, *m
;
903 long space
, len
, resid
, clen
, mlen
;
904 int error
, s
, dontroute
, atomic
;
905 short wakeup_state
= 0;
912 * solock() provides atomicity of access. splsoftnet() prevents
913 * protocol processing soft interrupts from interrupting us and
914 * blocking (expensive).
918 atomic
= sosendallatonce(so
) || top
;
920 resid
= uio
->uio_resid
;
922 resid
= top
->m_pkthdr
.len
;
924 * In theory resid should be unsigned.
925 * However, space must be signed, as it might be less than 0
926 * if we over-committed, and we must use a signed comparison
927 * of space and resid. On the other hand, a negative resid
928 * causes us to loop sending 0-length segments to the protocol.
935 (flags
& MSG_DONTROUTE
) && (so
->so_options
& SO_DONTROUTE
) == 0 &&
936 (so
->so_proto
->pr_flags
& PR_ATOMIC
);
939 clen
= control
->m_len
;
941 if ((error
= sblock(&so
->so_snd
, SBLOCKWAIT(flags
))) != 0)
944 if (so
->so_state
& SS_CANTSENDMORE
) {
949 error
= so
->so_error
;
953 if ((so
->so_state
& SS_ISCONNECTED
) == 0) {
954 if (so
->so_proto
->pr_flags
& PR_CONNREQUIRED
) {
955 if ((so
->so_state
& SS_ISCONFIRMING
) == 0 &&
956 !(resid
== 0 && clen
!= 0)) {
960 } else if (addr
== 0) {
961 error
= EDESTADDRREQ
;
965 space
= sbspace(&so
->so_snd
);
968 if ((atomic
&& resid
> so
->so_snd
.sb_hiwat
) ||
969 clen
> so
->so_snd
.sb_hiwat
) {
973 if (space
< resid
+ clen
&&
974 (atomic
|| space
< so
->so_snd
.sb_lowat
|| space
< clen
)) {
979 sbunlock(&so
->so_snd
);
980 if (wakeup_state
& SS_RESTARTSYS
) {
984 error
= sbwait(&so
->so_snd
);
987 wakeup_state
= so
->so_state
;
996 * Data is prepackaged in "top".
1000 top
->m_flags
|= M_EOR
;
1005 m
= m_gethdr(M_WAIT
, MT_DATA
);
1007 m
->m_pkthdr
.len
= 0;
1008 m
->m_pkthdr
.rcvif
= NULL
;
1010 m
= m_get(M_WAIT
, MT_DATA
);
1013 MCLAIM(m
, so
->so_snd
.sb_mowner
);
1014 if (sock_loan_thresh
>= 0 &&
1015 uio
->uio_iov
->iov_len
>= sock_loan_thresh
&&
1016 space
>= sock_loan_thresh
&&
1017 (len
= sosend_loan(so
, uio
, m
,
1019 SOSEND_COUNTER_INCR(&sosend_loan_big
);
1023 if (resid
>= MINCLSIZE
&& space
>= MCLBYTES
) {
1024 SOSEND_COUNTER_INCR(&sosend_copy_big
);
1026 if ((m
->m_flags
& M_EXT
) == 0)
1029 if (atomic
&& top
== 0) {
1030 len
= lmin(MCLBYTES
- max_hdr
,
1032 m
->m_data
+= max_hdr
;
1034 len
= lmin(MCLBYTES
, resid
);
1038 SOSEND_COUNTER_INCR(&sosend_copy_small
);
1039 len
= lmin(lmin(mlen
, resid
), space
);
1042 * For datagram protocols, leave room
1043 * for protocol headers in first mbuf.
1045 if (atomic
&& top
== 0 && len
< mlen
)
1048 error
= uiomove(mtod(m
, void *), (int)len
, uio
);
1050 resid
= uio
->uio_resid
;
1053 top
->m_pkthdr
.len
+= len
;
1060 if (flags
& MSG_EOR
)
1061 top
->m_flags
|= M_EOR
;
1064 } while (space
> 0 && atomic
);
1066 if (so
->so_state
& SS_CANTSENDMORE
) {
1071 so
->so_options
|= SO_DONTROUTE
;
1073 so
->so_state
|= SS_MORETOCOME
;
1074 error
= (*so
->so_proto
->pr_usrreq
)(so
,
1075 (flags
& MSG_OOB
) ? PRU_SENDOOB
: PRU_SEND
,
1076 top
, addr
, control
, curlwp
);
1078 so
->so_options
&= ~SO_DONTROUTE
;
1080 so
->so_state
&= ~SS_MORETOCOME
;
1087 } while (resid
&& space
> 0);
1091 sbunlock(&so
->so_snd
);
1103 * Following replacement or removal of the first mbuf on the first
1104 * mbuf chain of a socket buffer, push necessary state changes back
1105 * into the socket buffer so that other consumers see the values
1106 * consistently. 'nextrecord' is the callers locally stored value of
1107 * the original value of sb->sb_mb->m_nextpkt which must be restored
1108 * when the lead mbuf changes. NOTE: 'nextrecord' may be NULL.
1111 sbsync(struct sockbuf
*sb
, struct mbuf
*nextrecord
)
1114 KASSERT(solocked(sb
->sb_so
));
1117 * First, update for the new value of nextrecord. If necessary,
1118 * make it the first record.
1120 if (sb
->sb_mb
!= NULL
)
1121 sb
->sb_mb
->m_nextpkt
= nextrecord
;
1123 sb
->sb_mb
= nextrecord
;
1126 * Now update any dependent socket buffer fields to reflect
1127 * the new state. This is an inline of SB_EMPTY_FIXUP, with
1128 * the addition of a second clause that takes care of the
1129 * case where sb_mb has been updated, but remains the last
1132 if (sb
->sb_mb
== NULL
) {
1133 sb
->sb_mbtail
= NULL
;
1134 sb
->sb_lastrecord
= NULL
;
1135 } else if (sb
->sb_mb
->m_nextpkt
== NULL
)
1136 sb
->sb_lastrecord
= sb
->sb_mb
;
1140 * Implement receive operations on a socket.
1141 * We depend on the way that records are added to the sockbuf
1142 * by sbappend*. In particular, each record (mbufs linked through m_next)
1143 * must begin with an address if the protocol so specifies,
1144 * followed by an optional mbuf or mbufs containing ancillary data,
1145 * and then zero or more mbufs of data.
1146 * In order to avoid blocking network interrupts for the entire time here,
1147 * we splx() while doing the actual copy to user space.
1148 * Although the sockbuf is locked, new data may still be appended,
1149 * and thus we must maintain consistency of the sockbuf during that time.
1151 * The caller may receive the data as a single mbuf chain by supplying
1152 * an mbuf **mp0 for use in returning the chain. The uio is then used
1153 * only for the count in uio_resid.
1156 soreceive(struct socket
*so
, struct mbuf
**paddr
, struct uio
*uio
,
1157 struct mbuf
**mp0
, struct mbuf
**controlp
, int *flagsp
)
1159 struct lwp
*l
= curlwp
;
1160 struct mbuf
*m
, **mp
, *mt
;
1161 int atomic
, flags
, len
, error
, s
, offset
, moff
, type
, orig_resid
;
1162 const struct protosw
*pr
;
1163 struct mbuf
*nextrecord
;
1164 int mbuf_removed
= 0;
1165 const struct domain
*dom
;
1166 short wakeup_state
= 0;
1169 atomic
= pr
->pr_flags
& PR_ATOMIC
;
1170 dom
= pr
->pr_domain
;
1173 orig_resid
= uio
->uio_resid
;
1177 if (controlp
!= NULL
)
1180 flags
= *flagsp
&~ MSG_EOR
;
1184 if ((flags
& MSG_DONTWAIT
) == 0)
1187 if (flags
& MSG_OOB
) {
1188 m
= m_get(M_WAIT
, MT_DATA
);
1190 error
= (*pr
->pr_usrreq
)(so
, PRU_RCVOOB
, m
,
1191 (struct mbuf
*)(long)(flags
& MSG_PEEK
), NULL
, l
);
1196 error
= uiomove(mtod(m
, void *),
1197 (int) min(uio
->uio_resid
, m
->m_len
), uio
);
1199 } while (uio
->uio_resid
> 0 && error
== 0 && m
);
1209 * solock() provides atomicity of access. splsoftnet() prevents
1210 * protocol processing soft interrupts from interrupting us and
1211 * blocking (expensive).
1215 if (so
->so_state
& SS_ISCONFIRMING
&& uio
->uio_resid
)
1216 (*pr
->pr_usrreq
)(so
, PRU_RCVD
, NULL
, NULL
, NULL
, l
);
1219 if ((error
= sblock(&so
->so_rcv
, SBLOCKWAIT(flags
))) != 0) {
1225 m
= so
->so_rcv
.sb_mb
;
1227 * If we have less data than requested, block awaiting more
1228 * (subject to any timeout) if:
1229 * 1. the current count is less than the low water mark,
1230 * 2. MSG_WAITALL is set, and it is possible to do the entire
1231 * receive operation at once if we block (resid <= hiwat), or
1232 * 3. MSG_DONTWAIT is not set.
1233 * If MSG_WAITALL is set but resid is larger than the receive buffer,
1234 * we have to do the receive in sections, and thus risk returning
1235 * a short count if a timeout or signal occurs after we start.
1238 ((flags
& MSG_DONTWAIT
) == 0 &&
1239 so
->so_rcv
.sb_cc
< uio
->uio_resid
&&
1240 (so
->so_rcv
.sb_cc
< so
->so_rcv
.sb_lowat
||
1241 ((flags
& MSG_WAITALL
) &&
1242 uio
->uio_resid
<= so
->so_rcv
.sb_hiwat
)) &&
1243 m
->m_nextpkt
== NULL
&& !atomic
)) {
1245 if (m
== NULL
&& so
->so_rcv
.sb_cc
)
1251 error
= so
->so_error
;
1252 if ((flags
& MSG_PEEK
) == 0)
1256 if (so
->so_state
& SS_CANTRCVMORE
) {
1262 for (; m
!= NULL
; m
= m
->m_next
)
1263 if (m
->m_type
== MT_OOBDATA
|| (m
->m_flags
& M_EOR
)) {
1264 m
= so
->so_rcv
.sb_mb
;
1267 if ((so
->so_state
& (SS_ISCONNECTED
|SS_ISCONNECTING
)) == 0 &&
1268 (so
->so_proto
->pr_flags
& PR_CONNREQUIRED
)) {
1272 if (uio
->uio_resid
== 0)
1274 if (so
->so_nbio
|| (flags
& MSG_DONTWAIT
)) {
1275 error
= EWOULDBLOCK
;
1278 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive sbwait 1");
1279 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive sbwait 1");
1280 sbunlock(&so
->so_rcv
);
1281 if (wakeup_state
& SS_RESTARTSYS
)
1284 error
= sbwait(&so
->so_rcv
);
1290 wakeup_state
= so
->so_state
;
1295 * On entry here, m points to the first record of the socket buffer.
1296 * From this point onward, we maintain 'nextrecord' as a cache of the
1297 * pointer to the next record in the socket buffer. We must keep the
1298 * various socket buffer pointers and local stack versions of the
1299 * pointers in sync, pushing out modifications before dropping the
1300 * socket lock, and re-reading them when picking it up.
1302 * Otherwise, we will race with the network stack appending new data
1303 * or records onto the socket buffer by using inconsistent/stale
1304 * versions of the field, possibly resulting in socket buffer
1307 * By holding the high-level sblock(), we prevent simultaneous
1308 * readers from pulling off the front of the socket buffer.
1311 l
->l_ru
.ru_msgrcv
++;
1312 KASSERT(m
== so
->so_rcv
.sb_mb
);
1313 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 1");
1314 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 1");
1315 nextrecord
= m
->m_nextpkt
;
1316 if (pr
->pr_flags
& PR_ADDR
) {
1318 if (m
->m_type
!= MT_SONAME
)
1319 panic("receive 1a");
1322 if (flags
& MSG_PEEK
) {
1324 *paddr
= m_copy(m
, 0, m
->m_len
);
1327 sbfree(&so
->so_rcv
, m
);
1329 if (paddr
!= NULL
) {
1331 so
->so_rcv
.sb_mb
= m
->m_next
;
1333 m
= so
->so_rcv
.sb_mb
;
1335 MFREE(m
, so
->so_rcv
.sb_mb
);
1336 m
= so
->so_rcv
.sb_mb
;
1338 sbsync(&so
->so_rcv
, nextrecord
);
1343 * Process one or more MT_CONTROL mbufs present before any data mbufs
1344 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
1345 * just copy the data; if !MSG_PEEK, we call into the protocol to
1346 * perform externalization (or freeing if controlp == NULL).
1348 if (__predict_false(m
!= NULL
&& m
->m_type
== MT_CONTROL
)) {
1349 struct mbuf
*cm
= NULL
, *cmn
;
1350 struct mbuf
**cme
= &cm
;
1353 if (flags
& MSG_PEEK
) {
1354 if (controlp
!= NULL
) {
1355 *controlp
= m_copy(m
, 0, m
->m_len
);
1356 controlp
= &(*controlp
)->m_next
;
1360 sbfree(&so
->so_rcv
, m
);
1361 so
->so_rcv
.sb_mb
= m
->m_next
;
1364 cme
= &(*cme
)->m_next
;
1365 m
= so
->so_rcv
.sb_mb
;
1367 } while (m
!= NULL
&& m
->m_type
== MT_CONTROL
);
1368 if ((flags
& MSG_PEEK
) == 0)
1369 sbsync(&so
->so_rcv
, nextrecord
);
1370 for (; cm
!= NULL
; cm
= cmn
) {
1373 type
= mtod(cm
, struct cmsghdr
*)->cmsg_type
;
1374 if (controlp
!= NULL
) {
1375 if (dom
->dom_externalize
!= NULL
&&
1376 type
== SCM_RIGHTS
) {
1379 error
= (*dom
->dom_externalize
)(cm
, l
);
1384 while (*controlp
!= NULL
)
1385 controlp
= &(*controlp
)->m_next
;
1388 * Dispose of any SCM_RIGHTS message that went
1389 * through the read path rather than recv.
1391 if (dom
->dom_dispose
!= NULL
&&
1392 type
== SCM_RIGHTS
) {
1394 (*dom
->dom_dispose
)(cm
);
1401 nextrecord
= so
->so_rcv
.sb_mb
->m_nextpkt
;
1403 nextrecord
= so
->so_rcv
.sb_mb
;
1407 /* If m is non-NULL, we have some data to read. */
1408 if (__predict_true(m
!= NULL
)) {
1410 if (type
== MT_OOBDATA
)
1413 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 2");
1414 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 2");
1418 while (m
!= NULL
&& uio
->uio_resid
> 0 && error
== 0) {
1419 if (m
->m_type
== MT_OOBDATA
) {
1420 if (type
!= MT_OOBDATA
)
1422 } else if (type
== MT_OOBDATA
)
1425 else if (m
->m_type
!= MT_DATA
&& m
->m_type
!= MT_HEADER
)
1428 so
->so_state
&= ~SS_RCVATMARK
;
1430 len
= uio
->uio_resid
;
1431 if (so
->so_oobmark
&& len
> so
->so_oobmark
- offset
)
1432 len
= so
->so_oobmark
- offset
;
1433 if (len
> m
->m_len
- moff
)
1434 len
= m
->m_len
- moff
;
1436 * If mp is set, just pass back the mbufs.
1437 * Otherwise copy them out via the uio, then free.
1438 * Sockbuf must be consistent here (points to current mbuf,
1439 * it points to next record) when we drop priority;
1440 * we must note any additions to the sockbuf when we
1441 * block interrupts again.
1444 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive uiomove");
1445 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive uiomove");
1448 error
= uiomove(mtod(m
, char *) + moff
, (int)len
, uio
);
1453 * If any part of the record has been removed
1454 * (such as the MT_SONAME mbuf, which will
1455 * happen when PR_ADDR, and thus also
1456 * PR_ATOMIC, is set), then drop the entire
1457 * record to maintain the atomicity of the
1458 * receive operation.
1460 * This avoids a later panic("receive 1a")
1461 * when compiled with DIAGNOSTIC.
1463 if (m
&& mbuf_removed
&& atomic
)
1464 (void) sbdroprecord(&so
->so_rcv
);
1469 uio
->uio_resid
-= len
;
1470 if (len
== m
->m_len
- moff
) {
1471 if (m
->m_flags
& M_EOR
)
1473 if (flags
& MSG_PEEK
) {
1477 nextrecord
= m
->m_nextpkt
;
1478 sbfree(&so
->so_rcv
, m
);
1482 so
->so_rcv
.sb_mb
= m
= m
->m_next
;
1485 MFREE(m
, so
->so_rcv
.sb_mb
);
1486 m
= so
->so_rcv
.sb_mb
;
1489 * If m != NULL, we also know that
1490 * so->so_rcv.sb_mb != NULL.
1492 KASSERT(so
->so_rcv
.sb_mb
== m
);
1494 m
->m_nextpkt
= nextrecord
;
1495 if (nextrecord
== NULL
)
1496 so
->so_rcv
.sb_lastrecord
= m
;
1498 so
->so_rcv
.sb_mb
= nextrecord
;
1499 SB_EMPTY_FIXUP(&so
->so_rcv
);
1501 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 3");
1502 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 3");
1504 } else if (flags
& MSG_PEEK
)
1508 mt
= m_copym(m
, 0, len
, M_NOWAIT
);
1509 if (__predict_false(mt
== NULL
)) {
1511 mt
= m_copym(m
, 0, len
, M_WAIT
);
1518 so
->so_rcv
.sb_cc
-= len
;
1520 if (so
->so_oobmark
) {
1521 if ((flags
& MSG_PEEK
) == 0) {
1522 so
->so_oobmark
-= len
;
1523 if (so
->so_oobmark
== 0) {
1524 so
->so_state
|= SS_RCVATMARK
;
1529 if (offset
== so
->so_oobmark
)
1533 if (flags
& MSG_EOR
)
1536 * If the MSG_WAITALL flag is set (for non-atomic socket),
1537 * we must not quit until "uio->uio_resid == 0" or an error
1538 * termination. If a signal/timeout occurs, return
1539 * with a short count but without error.
1540 * Keep sockbuf locked against other readers.
1542 while (flags
& MSG_WAITALL
&& m
== NULL
&& uio
->uio_resid
> 0 &&
1543 !sosendallatonce(so
) && !nextrecord
) {
1544 if (so
->so_error
|| so
->so_state
& SS_CANTRCVMORE
)
1547 * If we are peeking and the socket receive buffer is
1548 * full, stop since we can't get more data to peek at.
1550 if ((flags
& MSG_PEEK
) && sbspace(&so
->so_rcv
) <= 0)
1553 * If we've drained the socket buffer, tell the
1554 * protocol in case it needs to do something to
1555 * get it filled again.
1557 if ((pr
->pr_flags
& PR_WANTRCVD
) && so
->so_pcb
)
1558 (*pr
->pr_usrreq
)(so
, PRU_RCVD
,
1559 NULL
, (struct mbuf
*)(long)flags
, NULL
, l
);
1560 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive sbwait 2");
1561 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive sbwait 2");
1562 if (wakeup_state
& SS_RESTARTSYS
)
1565 error
= sbwait(&so
->so_rcv
);
1567 sbunlock(&so
->so_rcv
);
1572 if ((m
= so
->so_rcv
.sb_mb
) != NULL
)
1573 nextrecord
= m
->m_nextpkt
;
1574 wakeup_state
= so
->so_state
;
1580 if ((flags
& MSG_PEEK
) == 0)
1581 (void) sbdroprecord(&so
->so_rcv
);
1583 if ((flags
& MSG_PEEK
) == 0) {
1586 * First part is an inline SB_EMPTY_FIXUP(). Second
1587 * part makes sure sb_lastrecord is up-to-date if
1588 * there is still data in the socket buffer.
1590 so
->so_rcv
.sb_mb
= nextrecord
;
1591 if (so
->so_rcv
.sb_mb
== NULL
) {
1592 so
->so_rcv
.sb_mbtail
= NULL
;
1593 so
->so_rcv
.sb_lastrecord
= NULL
;
1594 } else if (nextrecord
->m_nextpkt
== NULL
)
1595 so
->so_rcv
.sb_lastrecord
= nextrecord
;
1597 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 4");
1598 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 4");
1599 if (pr
->pr_flags
& PR_WANTRCVD
&& so
->so_pcb
)
1600 (*pr
->pr_usrreq
)(so
, PRU_RCVD
, NULL
,
1601 (struct mbuf
*)(long)flags
, NULL
, l
);
1603 if (orig_resid
== uio
->uio_resid
&& orig_resid
&&
1604 (flags
& MSG_EOR
) == 0 && (so
->so_state
& SS_CANTRCVMORE
) == 0) {
1605 sbunlock(&so
->so_rcv
);
1612 sbunlock(&so
->so_rcv
);
1619 soshutdown(struct socket
*so
, int how
)
1621 const struct protosw
*pr
;
1624 KASSERT(solocked(so
));
1627 if (!(how
== SHUT_RD
|| how
== SHUT_WR
|| how
== SHUT_RDWR
))
1630 if (how
== SHUT_RD
|| how
== SHUT_RDWR
) {
1634 if (how
== SHUT_WR
|| how
== SHUT_RDWR
)
1635 error
= (*pr
->pr_usrreq
)(so
, PRU_SHUTDOWN
, NULL
,
1642 sorestart(struct socket
*so
)
1645 * An application has called close() on an fd on which another
1646 * of its threads has called a socket system call.
1647 * Mark this and wake everyone up, and code that would block again
1648 * instead returns ERESTART.
1649 * On system call re-entry the fd is validated and EBADF returned.
1650 * Any other fd will block again on the 2nd syscall.
1653 so
->so_state
|= SS_RESTARTSYS
;
1654 cv_broadcast(&so
->so_cv
);
1655 cv_broadcast(&so
->so_snd
.sb_cv
);
1656 cv_broadcast(&so
->so_rcv
.sb_cv
);
1661 sorflush(struct socket
*so
)
1663 struct sockbuf
*sb
, asb
;
1664 const struct protosw
*pr
;
1666 KASSERT(solocked(so
));
1671 sb
->sb_flags
|= SB_NOINTR
;
1672 (void )sblock(sb
, M_WAITOK
);
1676 * Clear most of the sockbuf structure, but leave some of the
1679 memset(&sb
->sb_startzero
, 0,
1680 sizeof(*sb
) - offsetof(struct sockbuf
, sb_startzero
));
1681 if (pr
->pr_flags
& PR_RIGHTS
&& pr
->pr_domain
->dom_dispose
) {
1683 (*pr
->pr_domain
->dom_dispose
)(asb
.sb_mb
);
1686 sbrelease(&asb
, so
);
1690 * internal set SOL_SOCKET options
1693 sosetopt1(struct socket
*so
, const struct sockopt
*sopt
)
1695 int error
= EINVAL
, optval
, opt
;
1699 switch ((opt
= sopt
->sopt_name
)) {
1701 case SO_ACCEPTFILTER
:
1702 error
= accept_filt_setopt(so
, sopt
);
1703 KASSERT(solocked(so
));
1707 error
= sockopt_get(sopt
, &l
, sizeof(l
));
1711 if (l
.l_linger
< 0 || l
.l_linger
> USHRT_MAX
||
1712 l
.l_linger
> (INT_MAX
/ hz
)) {
1716 so
->so_linger
= l
.l_linger
;
1718 so
->so_options
|= SO_LINGER
;
1720 so
->so_options
&= ~SO_LINGER
;
1726 case SO_USELOOPBACK
:
1732 #ifdef SO_OTIMESTAMP
1735 error
= sockopt_getint(sopt
, &optval
);
1740 so
->so_options
|= opt
;
1742 so
->so_options
&= ~opt
;
1749 error
= sockopt_getint(sopt
, &optval
);
1755 * Values < 1 make no sense for any of these
1756 * options, so disallow them.
1765 if (sbreserve(&so
->so_snd
, (u_long
)optval
, so
) == 0) {
1769 so
->so_snd
.sb_flags
&= ~SB_AUTOSIZE
;
1773 if (sbreserve(&so
->so_rcv
, (u_long
)optval
, so
) == 0) {
1777 so
->so_rcv
.sb_flags
&= ~SB_AUTOSIZE
;
1781 * Make sure the low-water is never greater than
1785 if (optval
> so
->so_snd
.sb_hiwat
)
1786 optval
= so
->so_snd
.sb_hiwat
;
1788 so
->so_snd
.sb_lowat
= optval
;
1792 if (optval
> so
->so_rcv
.sb_hiwat
)
1793 optval
= so
->so_rcv
.sb_hiwat
;
1795 so
->so_rcv
.sb_lowat
= optval
;
1802 case SO_ORCVTIMEO
: {
1803 struct timeval50 otv
;
1804 error
= sockopt_get(sopt
, &otv
, sizeof(otv
));
1809 timeval50_to_timeval(&otv
, &tv
);
1810 opt
= opt
== SO_OSNDTIMEO
? SO_SNDTIMEO
: SO_RCVTIMEO
;
1814 #endif /* COMPAT_50 */
1819 error
= sockopt_get(sopt
, &tv
, sizeof(tv
));
1824 if (tv
.tv_sec
> (INT_MAX
- tv
.tv_usec
/ tick
) / hz
) {
1829 optval
= tv
.tv_sec
* hz
+ tv
.tv_usec
/ tick
;
1830 if (optval
== 0 && tv
.tv_usec
!= 0)
1835 so
->so_snd
.sb_timeo
= optval
;
1838 so
->so_rcv
.sb_timeo
= optval
;
1845 error
= ENOPROTOOPT
;
1848 KASSERT(solocked(so
));
1853 sosetopt(struct socket
*so
, struct sockopt
*sopt
)
1857 if (sopt
->sopt_level
== SOL_SOCKET
) {
1858 error
= sosetopt1(so
, sopt
);
1859 KASSERT(solocked(so
));
1861 error
= ENOPROTOOPT
;
1865 if ((error
== 0 || error
== ENOPROTOOPT
) &&
1866 so
->so_proto
!= NULL
&& so
->so_proto
->pr_ctloutput
!= NULL
) {
1867 /* give the protocol stack a shot */
1868 prerr
= (*so
->so_proto
->pr_ctloutput
)(PRCO_SETOPT
, so
, sopt
);
1871 else if (prerr
!= ENOPROTOOPT
)
1879 * so_setsockopt() is a wrapper providing a sockopt structure for sosetopt()
1882 so_setsockopt(struct lwp
*l
, struct socket
*so
, int level
, int name
,
1883 const void *val
, size_t valsize
)
1885 struct sockopt sopt
;
1888 KASSERT(valsize
== 0 || val
!= NULL
);
1890 sockopt_init(&sopt
, level
, name
, valsize
);
1891 sockopt_set(&sopt
, val
, valsize
);
1893 error
= sosetopt(so
, &sopt
);
1895 sockopt_destroy(&sopt
);
1901 * internal get SOL_SOCKET options
1904 sogetopt1(struct socket
*so
, struct sockopt
*sopt
)
1906 int error
, optval
, opt
;
1910 switch ((opt
= sopt
->sopt_name
)) {
1912 case SO_ACCEPTFILTER
:
1913 error
= accept_filt_getopt(so
, sopt
);
1917 l
.l_onoff
= (so
->so_options
& SO_LINGER
) ? 1 : 0;
1918 l
.l_linger
= so
->so_linger
;
1920 error
= sockopt_set(sopt
, &l
, sizeof(l
));
1923 case SO_USELOOPBACK
:
1932 #ifdef SO_OTIMESTAMP
1935 error
= sockopt_setint(sopt
, (so
->so_options
& opt
) ? 1 : 0);
1939 error
= sockopt_setint(sopt
, so
->so_type
);
1943 error
= sockopt_setint(sopt
, so
->so_error
);
1948 error
= sockopt_setint(sopt
, so
->so_snd
.sb_hiwat
);
1952 error
= sockopt_setint(sopt
, so
->so_rcv
.sb_hiwat
);
1956 error
= sockopt_setint(sopt
, so
->so_snd
.sb_lowat
);
1960 error
= sockopt_setint(sopt
, so
->so_rcv
.sb_lowat
);
1965 case SO_ORCVTIMEO
: {
1966 struct timeval50 otv
;
1968 optval
= (opt
== SO_OSNDTIMEO
?
1969 so
->so_snd
.sb_timeo
: so
->so_rcv
.sb_timeo
);
1971 otv
.tv_sec
= optval
/ hz
;
1972 otv
.tv_usec
= (optval
% hz
) * tick
;
1974 error
= sockopt_set(sopt
, &otv
, sizeof(otv
));
1977 #endif /* COMPAT_50 */
1981 optval
= (opt
== SO_SNDTIMEO
?
1982 so
->so_snd
.sb_timeo
: so
->so_rcv
.sb_timeo
);
1984 tv
.tv_sec
= optval
/ hz
;
1985 tv
.tv_usec
= (optval
% hz
) * tick
;
1987 error
= sockopt_set(sopt
, &tv
, sizeof(tv
));
1991 error
= sockopt_setint(sopt
, so
->so_rcv
.sb_overflowed
);
1995 error
= ENOPROTOOPT
;
2003 sogetopt(struct socket
*so
, struct sockopt
*sopt
)
2008 if (sopt
->sopt_level
!= SOL_SOCKET
) {
2009 if (so
->so_proto
&& so
->so_proto
->pr_ctloutput
) {
2010 error
= ((*so
->so_proto
->pr_ctloutput
)
2011 (PRCO_GETOPT
, so
, sopt
));
2013 error
= (ENOPROTOOPT
);
2015 error
= sogetopt1(so
, sopt
);
2022 * alloc sockopt data buffer buffer
2023 * - will be released at destroy
2026 sockopt_alloc(struct sockopt
*sopt
, size_t len
, km_flag_t kmflag
)
2029 KASSERT(sopt
->sopt_size
== 0);
2031 if (len
> sizeof(sopt
->sopt_buf
)) {
2032 sopt
->sopt_data
= kmem_zalloc(len
, kmflag
);
2033 if (sopt
->sopt_data
== NULL
)
2036 sopt
->sopt_data
= sopt
->sopt_buf
;
2038 sopt
->sopt_size
= len
;
2043 * initialise sockopt storage
2044 * - MAY sleep during allocation
2047 sockopt_init(struct sockopt
*sopt
, int level
, int name
, size_t size
)
2050 memset(sopt
, 0, sizeof(*sopt
));
2052 sopt
->sopt_level
= level
;
2053 sopt
->sopt_name
= name
;
2054 (void)sockopt_alloc(sopt
, size
, KM_SLEEP
);
2058 * destroy sockopt storage
2059 * - will release any held memory references
2062 sockopt_destroy(struct sockopt
*sopt
)
2065 if (sopt
->sopt_data
!= sopt
->sopt_buf
)
2066 kmem_free(sopt
->sopt_data
, sopt
->sopt_size
);
2068 memset(sopt
, 0, sizeof(*sopt
));
2073 * - value is copied into sockopt
2074 * - memory is allocated when necessary, will not sleep
2077 sockopt_set(struct sockopt
*sopt
, const void *buf
, size_t len
)
2081 if (sopt
->sopt_size
== 0) {
2082 error
= sockopt_alloc(sopt
, len
, KM_NOSLEEP
);
2087 KASSERT(sopt
->sopt_size
== len
);
2088 memcpy(sopt
->sopt_data
, buf
, len
);
/*
 * common case of set sockopt integer value
 */
int
sockopt_setint(struct sockopt *sopt, int val)
{

	return sockopt_set(sopt, &val, sizeof(int));
}
2104 * - correct size must be given
2107 sockopt_get(const struct sockopt
*sopt
, void *buf
, size_t len
)
2110 if (sopt
->sopt_size
!= len
)
2113 memcpy(buf
, sopt
->sopt_data
, len
);
/*
 * common case of get sockopt integer value
 */
int
sockopt_getint(const struct sockopt *sopt, int *valp)
{

	return sockopt_get(sopt, valp, sizeof(int));
}
2128 * set sockopt value from mbuf
2129 * - ONLY for legacy code
2130 * - mbuf is released by sockopt
2134 sockopt_setmbuf(struct sockopt
*sopt
, struct mbuf
*m
)
2141 if (sopt
->sopt_size
== 0) {
2142 error
= sockopt_alloc(sopt
, len
, KM_NOSLEEP
);
2147 KASSERT(sopt
->sopt_size
== len
);
2148 m_copydata(m
, 0, len
, sopt
->sopt_data
);
2155 * get sockopt value into mbuf
2156 * - ONLY for legacy code
2157 * - mbuf to be released by the caller
2161 sockopt_getmbuf(const struct sockopt
*sopt
)
2165 if (sopt
->sopt_size
> MCLBYTES
)
2168 m
= m_get(M_DONTWAIT
, MT_SOOPTS
);
2172 if (sopt
->sopt_size
> MLEN
) {
2173 MCLGET(m
, M_DONTWAIT
);
2174 if ((m
->m_flags
& M_EXT
) == 0) {
2180 memcpy(mtod(m
, void *), sopt
->sopt_data
, sopt
->sopt_size
);
2181 m
->m_len
= sopt
->sopt_size
;
2187 sohasoutofband(struct socket
*so
)
2190 fownsignal(so
->so_pgid
, SIGURG
, POLL_PRI
, POLLPRI
|POLLRDBAND
, so
);
2191 selnotify(&so
->so_rcv
.sb_sel
, POLLPRI
| POLLRDBAND
, NOTE_SUBMIT
);
2195 filt_sordetach(struct knote
*kn
)
2199 so
= ((file_t
*)kn
->kn_obj
)->f_data
;
2201 SLIST_REMOVE(&so
->so_rcv
.sb_sel
.sel_klist
, kn
, knote
, kn_selnext
);
2202 if (SLIST_EMPTY(&so
->so_rcv
.sb_sel
.sel_klist
))
2203 so
->so_rcv
.sb_flags
&= ~SB_KNOTE
;
2209 filt_soread(struct knote
*kn
, long hint
)
2214 so
= ((file_t
*)kn
->kn_obj
)->f_data
;
2215 if (hint
!= NOTE_SUBMIT
)
2217 kn
->kn_data
= so
->so_rcv
.sb_cc
;
2218 if (so
->so_state
& SS_CANTRCVMORE
) {
2219 kn
->kn_flags
|= EV_EOF
;
2220 kn
->kn_fflags
= so
->so_error
;
2222 } else if (so
->so_error
) /* temporary udp error */
2224 else if (kn
->kn_sfflags
& NOTE_LOWAT
)
2225 rv
= (kn
->kn_data
>= kn
->kn_sdata
);
2227 rv
= (kn
->kn_data
>= so
->so_rcv
.sb_lowat
);
2228 if (hint
!= NOTE_SUBMIT
)
2234 filt_sowdetach(struct knote
*kn
)
2238 so
= ((file_t
*)kn
->kn_obj
)->f_data
;
2240 SLIST_REMOVE(&so
->so_snd
.sb_sel
.sel_klist
, kn
, knote
, kn_selnext
);
2241 if (SLIST_EMPTY(&so
->so_snd
.sb_sel
.sel_klist
))
2242 so
->so_snd
.sb_flags
&= ~SB_KNOTE
;
2248 filt_sowrite(struct knote
*kn
, long hint
)
2253 so
= ((file_t
*)kn
->kn_obj
)->f_data
;
2254 if (hint
!= NOTE_SUBMIT
)
2256 kn
->kn_data
= sbspace(&so
->so_snd
);
2257 if (so
->so_state
& SS_CANTSENDMORE
) {
2258 kn
->kn_flags
|= EV_EOF
;
2259 kn
->kn_fflags
= so
->so_error
;
2261 } else if (so
->so_error
) /* temporary udp error */
2263 else if (((so
->so_state
& SS_ISCONNECTED
) == 0) &&
2264 (so
->so_proto
->pr_flags
& PR_CONNREQUIRED
))
2266 else if (kn
->kn_sfflags
& NOTE_LOWAT
)
2267 rv
= (kn
->kn_data
>= kn
->kn_sdata
);
2269 rv
= (kn
->kn_data
>= so
->so_snd
.sb_lowat
);
2270 if (hint
!= NOTE_SUBMIT
)
2277 filt_solisten(struct knote
*kn
, long hint
)
2282 so
= ((file_t
*)kn
->kn_obj
)->f_data
;
2285 * Set kn_data to number of incoming connections, not
2286 * counting partial (incomplete) connections.
2288 if (hint
!= NOTE_SUBMIT
)
2290 kn
->kn_data
= so
->so_qlen
;
2291 rv
= (kn
->kn_data
> 0);
2292 if (hint
!= NOTE_SUBMIT
)
2297 static const struct filterops solisten_filtops
=
2298 { 1, NULL
, filt_sordetach
, filt_solisten
};
2299 static const struct filterops soread_filtops
=
2300 { 1, NULL
, filt_sordetach
, filt_soread
};
2301 static const struct filterops sowrite_filtops
=
2302 { 1, NULL
, filt_sowdetach
, filt_sowrite
};
2305 soo_kqfilter(struct file
*fp
, struct knote
*kn
)
2310 so
= ((file_t
*)kn
->kn_obj
)->f_data
;
2312 switch (kn
->kn_filter
) {
2314 if (so
->so_options
& SO_ACCEPTCONN
)
2315 kn
->kn_fop
= &solisten_filtops
;
2317 kn
->kn_fop
= &soread_filtops
;
2321 kn
->kn_fop
= &sowrite_filtops
;
2328 SLIST_INSERT_HEAD(&sb
->sb_sel
.sel_klist
, kn
, kn_selnext
);
2329 sb
->sb_flags
|= SB_KNOTE
;
2335 sodopoll(struct socket
*so
, int events
)
2341 if (events
& (POLLIN
| POLLRDNORM
))
2343 revents
|= events
& (POLLIN
| POLLRDNORM
);
2345 if (events
& (POLLOUT
| POLLWRNORM
))
2347 revents
|= events
& (POLLOUT
| POLLWRNORM
);
2349 if (events
& (POLLPRI
| POLLRDBAND
))
2350 if (so
->so_oobmark
|| (so
->so_state
& SS_RCVATMARK
))
2351 revents
|= events
& (POLLPRI
| POLLRDBAND
);
2357 sopoll(struct socket
*so
, int events
)
2363 * Do a quick, unlocked check in expectation that the socket
2364 * will be ready for I/O. Don't do this check if DIAGNOSTIC,
2365 * as the solocked() assertions will fail.
2367 if ((revents
= sodopoll(so
, events
)) != 0)
2372 if ((revents
= sodopoll(so
, events
)) == 0) {
2373 if (events
& (POLLIN
| POLLPRI
| POLLRDNORM
| POLLRDBAND
)) {
2374 selrecord(curlwp
, &so
->so_rcv
.sb_sel
);
2375 so
->so_rcv
.sb_flags
|= SB_NOTIFY
;
2378 if (events
& (POLLOUT
| POLLWRNORM
)) {
2379 selrecord(curlwp
, &so
->so_snd
.sb_sel
);
2380 so
->so_snd
.sb_flags
|= SB_NOTIFY
;
2389 #include <sys/sysctl.h>
2391 static int sysctl_kern_somaxkva(SYSCTLFN_PROTO
);
2394 * sysctl helper routine for kern.somaxkva. ensures that the given
2395 * value is not too small.
2396 * (XXX should we maybe make sure it's not too large as well?)
2399 sysctl_kern_somaxkva(SYSCTLFN_ARGS
)
2401 int error
, new_somaxkva
;
2402 struct sysctlnode node
;
2404 new_somaxkva
= somaxkva
;
2406 node
.sysctl_data
= &new_somaxkva
;
2407 error
= sysctl_lookup(SYSCTLFN_CALL(&node
));
2408 if (error
|| newp
== NULL
)
2411 if (new_somaxkva
< (16 * 1024 * 1024)) /* sanity */
2414 mutex_enter(&so_pendfree_lock
);
2415 somaxkva
= new_somaxkva
;
2416 cv_broadcast(&socurkva_cv
);
2417 mutex_exit(&so_pendfree_lock
);
2423 sysctl_kern_somaxkva_setup(void)
2426 KASSERT(socket_sysctllog
== NULL
);
2427 sysctl_createv(&socket_sysctllog
, 0, NULL
, NULL
,
2429 CTLTYPE_NODE
, "kern", NULL
,
2433 sysctl_createv(&socket_sysctllog
, 0, NULL
, NULL
,
2434 CTLFLAG_PERMANENT
|CTLFLAG_READWRITE
,
2435 CTLTYPE_INT
, "somaxkva",
2436 SYSCTL_DESCR("Maximum amount of kernel memory to be "
2437 "used for socket buffers"),
2438 sysctl_kern_somaxkva
, 0, NULL
, 0,
2439 CTL_KERN
, KERN_SOMAXKVA
, CTL_EOL
);