sys/kern/uipc_socket2.c

   1 /*      $NetBSD: uipc_socket2.c,v 1.105 2009/12/30 18:33:53 elad Exp $  */
   2
   3 /*-
   4  * Copyright (c) 2008 The NetBSD Foundation, Inc.
   5  * All rights reserved.
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  * 1. Redistributions of source code must retain the above copyright
  11  *    notice, this list of conditions and the following disclaimer.
  12  * 2. Redistributions in binary form must reproduce the above copyright
  13  *    notice, this list of conditions and the following disclaimer in the
  14  *    documentation and/or other materials provided with the distribution.
  15  *
  16  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  17  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  18  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  19  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  20  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  26  * POSSIBILITY OF SUCH DAMAGE.
  27  */
  28
  29 /*
  30  * Copyright (c) 1982, 1986, 1988, 1990, 1993
  31  *      The Regents of the University of California.  All rights reserved.
  32  *
  33  * Redistribution and use in source and binary forms, with or without
  34  * modification, are permitted provided that the following conditions
  35  * are met:
  36  * 1. Redistributions of source code must retain the above copyright
  37  *    notice, this list of conditions and the following disclaimer.
  38  * 2. Redistributions in binary form must reproduce the above copyright
  39  *    notice, this list of conditions and the following disclaimer in the
  40  *    documentation and/or other materials provided with the distribution.
  41  * 3. Neither the name of the University nor the names of its contributors
  42  *    may be used to endorse or promote products derived from this software
  43  *    without specific prior written permission.
  44  *
  45  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  46  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  47  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  48  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  49  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  50  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  51  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  52  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  53  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  54  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  55  * SUCH DAMAGE.
  56  *
  57  *      @(#)uipc_socket2.c      8.2 (Berkeley) 2/14/95
  58  */
  59
  60 #include <sys/cdefs.h>
  61 __KERNEL_RCSID(0, "$NetBSD: uipc_socket2.c,v 1.105 2009/12/30 18:33:53 elad Exp $");
  62
  63 #include "opt_mbuftrace.h"
  64 #include "opt_sb_max.h"
  65
  66 #include <sys/param.h>
  67 #include <sys/systm.h>
  68 #include <sys/proc.h>
  69 #include <sys/file.h>
  70 #include <sys/buf.h>
  71 #include <sys/malloc.h>
  72 #include <sys/mbuf.h>
  73 #include <sys/protosw.h>
  74 #include <sys/domain.h>
  75 #include <sys/poll.h>
  76 #include <sys/socket.h>
  77 #include <sys/socketvar.h>
  78 #include <sys/signalvar.h>
  79 #include <sys/kauth.h>
  80 #include <sys/pool.h>
  81 #include <sys/uidinfo.h>
  82
  83 /*
  84  * Primitive routines for operating on sockets and socket buffers.
  85  *
  86  * Locking rules and assumptions:
  87  *
  88  * o socket::so_lock can change on the fly.  The low level routines used
  89  *   to lock sockets are aware of this.  When so_lock is acquired, the
  90  *   routine locking must check to see if so_lock still points to the
  91  *   lock that was acquired.  If so_lock has changed in the meantime, the
   92  *   now irrelevant lock that was acquired must be dropped and the lock
  93  *   operation retried.  Although not proven here, this is completely safe
  94  *   on a multiprocessor system, even with relaxed memory ordering, given
  95  *   the next two rules:
  96  *
  97  * o In order to mutate so_lock, the lock pointed to by the current value
  98  *   of so_lock must be held: i.e., the socket must be held locked by the
  99  *   changing thread.  The thread must issue membar_exit() to prevent
 100  *   memory accesses being reordered, and can set so_lock to the desired
 101  *   value.  If the lock pointed to by the new value of so_lock is not
 102  *   held by the changing thread, the socket must then be considered
 103  *   unlocked.
 104  *
 105  * o If so_lock is mutated, and the previous lock referred to by so_lock
 106  *   could still be visible to other threads in the system (e.g. via file
 107  *   descriptor or protocol-internal reference), then the old lock must
 108  *   remain valid until the socket and/or protocol control block has been
 109  *   torn down.
 110  *
 111  * o If a socket has a non-NULL so_head value (i.e. is in the process of
 112  *   connecting), then locking the socket must also lock the socket pointed
 113  *   to by so_head: their lock pointers must match.
 114  *
 115  * o If a socket has connections in progress (so_q, so_q0 not empty) then
 116  *   locking the socket must also lock the sockets attached to both queues.
 117  *   Again, their lock pointers must match.
 118  *
  119  * o Beyond the initial lock assignment in socreate(), assigning locks to
 120  *   sockets is the responsibility of the individual protocols / protocol
 121  *   domains.
 122  */
 123
/*
 * Pool cache from which every struct socket is allocated (soget()) and
 * to which it is returned (soput()).  Created once by soinit2().
 */
static pool_cache_t socket_cache;

u_long  sb_max = SB_MAX;        /* maximum socket buffer size */
/*
 * sb_max scaled by MCLBYTES / (MSIZE + MCLBYTES).  Recomputed by
 * sb_max_set(); sbreserve() rejects any request larger than this.
 */
static u_long sb_max_adj;       /* adjusted sb_max */
 128
 129 /*
 130  * Procedures to manipulate state flags of socket
 131  * and do appropriate wakeups.  Normal sequence from the
 132  * active (originating) side is that soisconnecting() is
 133  * called during processing of connect() call,
 134  * resulting in an eventual call to soisconnected() if/when the
 135  * connection is established.  When the connection is torn down
 136  * soisdisconnecting() is called during processing of disconnect() call,
 137  * and soisdisconnected() is called when the connection to the peer
 138  * is totally severed.  The semantics of these routines are such that
 139  * connectionless protocols can call soisconnected() and soisdisconnected()
 140  * only, bypassing the in-progress calls when setting up a ``connection''
 141  * takes no time.
 142  *
 143  * From the passive side, a socket is created with
 144  * two queues of sockets: so_q0 for connections in progress
 145  * and so_q for connections already made and awaiting user acceptance.
 146  * As a protocol is preparing incoming connections, it creates a socket
 147  * structure queued on so_q0 by calling sonewconn().  When the connection
 148  * is established, soisconnected() is called, and transfers the
 149  * socket structure to so_q, making it available to accept().
 150  *
 151  * If a socket is closed with sockets on either
 152  * so_q0 or so_q, these sockets are dropped.
 153  *
 154  * If higher level protocols are implemented in
 155  * the kernel, the wakeups done here will sometimes
 156  * cause software-interrupt process scheduling.
 157  */
 158
 159 void
 160 soisconnecting(struct socket *so)
 161 {
 162
 163         KASSERT(solocked(so));
 164
 165         so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
 166         so->so_state |= SS_ISCONNECTING;
 167 }
 168
 169 void
 170 soisconnected(struct socket *so)
 171 {
 172         struct socket   *head;
 173
 174         head = so->so_head;
 175
 176         KASSERT(solocked(so));
 177         KASSERT(head == NULL || solocked2(so, head));
 178
 179         so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
 180         so->so_state |= SS_ISCONNECTED;
 181         if (head && so->so_onq == &head->so_q0) {
 182                 if ((so->so_options & SO_ACCEPTFILTER) == 0) {
 183                         soqremque(so, 0);
 184                         soqinsque(head, so, 1);
 185                         sorwakeup(head);
 186                         cv_broadcast(&head->so_cv);
 187                 } else {
 188                         so->so_upcall =
 189                             head->so_accf->so_accept_filter->accf_callback;
 190                         so->so_upcallarg = head->so_accf->so_accept_filter_arg;
 191                         so->so_rcv.sb_flags |= SB_UPCALL;
 192                         so->so_options &= ~SO_ACCEPTFILTER;
 193                         (*so->so_upcall)(so, so->so_upcallarg,
 194                                          POLLIN|POLLRDNORM, M_DONTWAIT);
 195                 }
 196         } else {
 197                 cv_broadcast(&so->so_cv);
 198                 sorwakeup(so);
 199                 sowwakeup(so);
 200         }
 201 }
 202
 203 void
 204 soisdisconnecting(struct socket *so)
 205 {
 206
 207         KASSERT(solocked(so));
 208
 209         so->so_state &= ~SS_ISCONNECTING;
 210         so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE);
 211         cv_broadcast(&so->so_cv);
 212         sowwakeup(so);
 213         sorwakeup(so);
 214 }
 215
 216 void
 217 soisdisconnected(struct socket *so)
 218 {
 219
 220         KASSERT(solocked(so));
 221
 222         so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
 223         so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED);
 224         cv_broadcast(&so->so_cv);
 225         sowwakeup(so);
 226         sorwakeup(so);
 227 }
 228
 229 void
 230 soinit2(void)
 231 {
 232
 233         socket_cache = pool_cache_init(sizeof(struct socket), 0, 0, 0,
 234             "socket", NULL, IPL_SOFTNET, NULL, NULL, NULL);
 235 }
 236
 237 /*
 238  * When an attempt at a new connection is noted on a socket
 239  * which accepts connections, sonewconn is called.  If the
 240  * connection is possible (subject to space constraints, etc.)
 241  * then we allocate a new structure, propoerly linked into the
 242  * data structure of the original socket, and return this.
 243  * Connstatus may be 0, SS_ISCONFIRMING, or SS_ISCONNECTED.
 244  */
 245 struct socket *
 246 sonewconn(struct socket *head, int connstatus)
 247 {
 248         struct socket   *so;
 249         int             soqueue, error;
 250
 251         KASSERT(connstatus == 0 || connstatus == SS_ISCONFIRMING ||
 252             connstatus == SS_ISCONNECTED);
 253         KASSERT(solocked(head));
 254
 255         if ((head->so_options & SO_ACCEPTFILTER) != 0)
 256                 connstatus = 0;
 257         soqueue = connstatus ? 1 : 0;
 258         if (head->so_qlen + head->so_q0len > 3 * head->so_qlimit / 2)
 259                 return NULL;
 260         so = soget(false);
 261         if (so == NULL)
 262                 return NULL;
 263         mutex_obj_hold(head->so_lock);
 264         so->so_lock = head->so_lock;
 265         so->so_type = head->so_type;
 266         so->so_options = head->so_options &~ SO_ACCEPTCONN;
 267         so->so_linger = head->so_linger;
 268         so->so_state = head->so_state | SS_NOFDREF;
 269         so->so_nbio = head->so_nbio;
 270         so->so_proto = head->so_proto;
 271         so->so_timeo = head->so_timeo;
 272         so->so_pgid = head->so_pgid;
 273         so->so_send = head->so_send;
 274         so->so_receive = head->so_receive;
 275         so->so_uidinfo = head->so_uidinfo;
 276         so->so_cpid = head->so_cpid;
 277 #ifdef MBUFTRACE
 278         so->so_mowner = head->so_mowner;
 279         so->so_rcv.sb_mowner = head->so_rcv.sb_mowner;
 280         so->so_snd.sb_mowner = head->so_snd.sb_mowner;
 281 #endif
 282         if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) != 0)
 283                 goto out;
 284         so->so_snd.sb_lowat = head->so_snd.sb_lowat;
 285         so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
 286         so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
 287         so->so_snd.sb_timeo = head->so_snd.sb_timeo;
 288         so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
 289         so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
 290         soqinsque(head, so, soqueue);
 291         error = (*so->so_proto->pr_usrreq)(so, PRU_ATTACH, NULL, NULL,
 292             NULL, NULL);
 293         KASSERT(solocked(so));
 294         if (error != 0) {
 295                 (void) soqremque(so, soqueue);
 296 out:
 297                 /*
 298                  * Remove acccept filter if one is present.
 299                  * XXX Is this really needed?
 300                  */
 301                 if (so->so_accf != NULL)
 302                         (void)accept_filt_clear(so);
 303                 soput(so);
 304                 return NULL;
 305         }
 306         if (connstatus) {
 307                 sorwakeup(head);
 308                 cv_broadcast(&head->so_cv);
 309                 so->so_state |= connstatus;
 310         }
 311         return so;
 312 }
 313
 314 struct socket *
 315 soget(bool waitok)
 316 {
 317         struct socket *so;
 318
 319         so = pool_cache_get(socket_cache, (waitok ? PR_WAITOK : PR_NOWAIT));
 320         if (__predict_false(so == NULL))
 321                 return (NULL);
 322         memset(so, 0, sizeof(*so));
 323         TAILQ_INIT(&so->so_q0);
 324         TAILQ_INIT(&so->so_q);
 325         cv_init(&so->so_cv, "socket");
 326         cv_init(&so->so_rcv.sb_cv, "netio");
 327         cv_init(&so->so_snd.sb_cv, "netio");
 328         selinit(&so->so_rcv.sb_sel);
 329         selinit(&so->so_snd.sb_sel);
 330         so->so_rcv.sb_so = so;
 331         so->so_snd.sb_so = so;
 332         return so;
 333 }
 334
 335 void
 336 soput(struct socket *so)
 337 {
 338
 339         KASSERT(!cv_has_waiters(&so->so_cv));
 340         KASSERT(!cv_has_waiters(&so->so_rcv.sb_cv));
 341         KASSERT(!cv_has_waiters(&so->so_snd.sb_cv));
 342         seldestroy(&so->so_rcv.sb_sel);
 343         seldestroy(&so->so_snd.sb_sel);
 344         mutex_obj_free(so->so_lock);
 345         cv_destroy(&so->so_cv);
 346         cv_destroy(&so->so_rcv.sb_cv);
 347         cv_destroy(&so->so_snd.sb_cv);
 348         pool_cache_put(socket_cache, so);
 349 }
 350
 351 void
 352 soqinsque(struct socket *head, struct socket *so, int q)
 353 {
 354
 355         KASSERT(solocked2(head, so));
 356
 357 #ifdef DIAGNOSTIC
 358         if (so->so_onq != NULL)
 359                 panic("soqinsque");
 360 #endif
 361
 362         so->so_head = head;
 363         if (q == 0) {
 364                 head->so_q0len++;
 365                 so->so_onq = &head->so_q0;
 366         } else {
 367                 head->so_qlen++;
 368                 so->so_onq = &head->so_q;
 369         }
 370         TAILQ_INSERT_TAIL(so->so_onq, so, so_qe);
 371 }
 372
 373 int
 374 soqremque(struct socket *so, int q)
 375 {
 376         struct socket   *head;
 377
 378         head = so->so_head;
 379
 380         KASSERT(solocked(so));
 381         if (q == 0) {
 382                 if (so->so_onq != &head->so_q0)
 383                         return (0);
 384                 head->so_q0len--;
 385         } else {
 386                 if (so->so_onq != &head->so_q)
 387                         return (0);
 388                 head->so_qlen--;
 389         }
 390         KASSERT(solocked2(so, head));
 391         TAILQ_REMOVE(so->so_onq, so, so_qe);
 392         so->so_onq = NULL;
 393         so->so_head = NULL;
 394         return (1);
 395 }
 396
 397 /*
 398  * Socantsendmore indicates that no more data will be sent on the
 399  * socket; it would normally be applied to a socket when the user
 400  * informs the system that no more data is to be sent, by the protocol
 401  * code (in case PRU_SHUTDOWN).  Socantrcvmore indicates that no more data
 402  * will be received, and will normally be applied to the socket by a
 403  * protocol when it detects that the peer will send no more data.
 404  * Data queued for reading in the socket may yet be read.
 405  */
 406
 407 void
 408 socantsendmore(struct socket *so)
 409 {
 410
 411         KASSERT(solocked(so));
 412
 413         so->so_state |= SS_CANTSENDMORE;
 414         sowwakeup(so);
 415 }
 416
 417 void
 418 socantrcvmore(struct socket *so)
 419 {
 420
 421         KASSERT(solocked(so));
 422
 423         so->so_state |= SS_CANTRCVMORE;
 424         sorwakeup(so);
 425 }
 426
 427 /*
 428  * Wait for data to arrive at/drain from a socket buffer.
 429  */
 430 int
 431 sbwait(struct sockbuf *sb)
 432 {
 433         struct socket *so;
 434         kmutex_t *lock;
 435         int error;
 436
 437         so = sb->sb_so;
 438
 439         KASSERT(solocked(so));
 440
 441         sb->sb_flags |= SB_NOTIFY;
 442         lock = so->so_lock;
 443         if ((sb->sb_flags & SB_NOINTR) != 0)
 444                 error = cv_timedwait(&sb->sb_cv, lock, sb->sb_timeo);
 445         else
 446                 error = cv_timedwait_sig(&sb->sb_cv, lock, sb->sb_timeo);
 447         if (__predict_false(lock != so->so_lock))
 448                 solockretry(so, lock);
 449         return error;
 450 }
 451
 452 /*
 453  * Wakeup processes waiting on a socket buffer.
 454  * Do asynchronous notification via SIGIO
 455  * if the socket buffer has the SB_ASYNC flag set.
 456  */
 457 void
 458 sowakeup(struct socket *so, struct sockbuf *sb, int code)
 459 {
 460         int band;
 461
 462         KASSERT(solocked(so));
 463         KASSERT(sb->sb_so == so);
 464
 465         if (code == POLL_IN)
 466                 band = POLLIN|POLLRDNORM;
 467         else
 468                 band = POLLOUT|POLLWRNORM;
 469         sb->sb_flags &= ~SB_NOTIFY;
 470         selnotify(&sb->sb_sel, band, NOTE_SUBMIT);
 471         cv_broadcast(&sb->sb_cv);
 472         if (sb->sb_flags & SB_ASYNC)
 473                 fownsignal(so->so_pgid, SIGIO, code, band, so);
 474         if (sb->sb_flags & SB_UPCALL)
 475                 (*so->so_upcall)(so, so->so_upcallarg, band, M_DONTWAIT);
 476 }
 477
 478 /*
 479  * Reset a socket's lock pointer.  Wake all threads waiting on the
 480  * socket's condition variables so that they can restart their waits
 481  * using the new lock.  The existing lock must be held.
 482  */
 483 void
 484 solockreset(struct socket *so, kmutex_t *lock)
 485 {
 486
 487         KASSERT(solocked(so));
 488
 489         so->so_lock = lock;
 490         cv_broadcast(&so->so_snd.sb_cv);
 491         cv_broadcast(&so->so_rcv.sb_cv);
 492         cv_broadcast(&so->so_cv);
 493 }
 494
 495 /*
 496  * Socket buffer (struct sockbuf) utility routines.
 497  *
 498  * Each socket contains two socket buffers: one for sending data and
 499  * one for receiving data.  Each buffer contains a queue of mbufs,
 500  * information about the number of mbufs and amount of data in the
 501  * queue, and other fields allowing poll() statements and notification
 502  * on data availability to be implemented.
 503  *
 504  * Data stored in a socket buffer is maintained as a list of records.
 505  * Each record is a list of mbufs chained together with the m_next
 506  * field.  Records are chained together with the m_nextpkt field. The upper
 507  * level routine soreceive() expects the following conventions to be
 508  * observed when placing information in the receive buffer:
 509  *
 510  * 1. If the protocol requires each message be preceded by the sender's
 511  *    name, then a record containing that name must be present before
 512  *    any associated data (mbuf's must be of type MT_SONAME).
 513  * 2. If the protocol supports the exchange of ``access rights'' (really
 514  *    just additional data associated with the message), and there are
 515  *    ``rights'' to be received, then a record containing this data
 516  *    should be present (mbuf's must be of type MT_CONTROL).
 517  * 3. If a name or rights record exists, then it must be followed by
 518  *    a data record, perhaps of zero length.
 519  *
 520  * Before using a new socket structure it is first necessary to reserve
 521  * buffer space to the socket, by calling sbreserve().  This should commit
 522  * some of the available buffer space in the system buffer pool for the
 523  * socket (currently, it does nothing but enforce limits).  The space
 524  * should be released by calling sbrelease() when the socket is destroyed.
 525  */
 526
 527 int
 528 sb_max_set(u_long new_sbmax)
 529 {
 530         int s;
 531
 532         if (new_sbmax < (16 * 1024))
 533                 return (EINVAL);
 534
 535         s = splsoftnet();
 536         sb_max = new_sbmax;
 537         sb_max_adj = (u_quad_t)new_sbmax * MCLBYTES / (MSIZE + MCLBYTES);
 538         splx(s);
 539
 540         return (0);
 541 }
 542
 543 int
 544 soreserve(struct socket *so, u_long sndcc, u_long rcvcc)
 545 {
 546
 547         KASSERT(so->so_lock == NULL || solocked(so));
 548
 549         /*
 550          * there's at least one application (a configure script of screen)
 551          * which expects a fifo is writable even if it has "some" bytes
 552          * in its buffer.
 553          * so we want to make sure (hiwat - lowat) >= (some bytes).
 554          *
 555          * PIPE_BUF here is an arbitrary value chosen as (some bytes) above.
 556          * we expect it's large enough for such applications.
 557          */
 558         u_long  lowat = MAX(sock_loan_thresh, MCLBYTES);
 559         u_long  hiwat = lowat + PIPE_BUF;
 560
 561         if (sndcc < hiwat)
 562                 sndcc = hiwat;
 563         if (sbreserve(&so->so_snd, sndcc, so) == 0)
 564                 goto bad;
 565         if (sbreserve(&so->so_rcv, rcvcc, so) == 0)
 566                 goto bad2;
 567         if (so->so_rcv.sb_lowat == 0)
 568                 so->so_rcv.sb_lowat = 1;
 569         if (so->so_snd.sb_lowat == 0)
 570                 so->so_snd.sb_lowat = lowat;
 571         if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
 572                 so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
 573         return (0);
 574  bad2:
 575         sbrelease(&so->so_snd, so);
 576  bad:
 577         return (ENOBUFS);
 578 }
 579
 580 /*
 581  * Allot mbufs to a sockbuf.
 582  * Attempt to scale mbmax so that mbcnt doesn't become limiting
 583  * if buffering efficiency is near the normal case.
 584  */
 585 int
 586 sbreserve(struct sockbuf *sb, u_long cc, struct socket *so)
 587 {
 588         struct lwp *l = curlwp; /* XXX */
 589         rlim_t maxcc;
 590         struct uidinfo *uidinfo;
 591
 592         KASSERT(so->so_lock == NULL || solocked(so));
 593         KASSERT(sb->sb_so == so);
 594         KASSERT(sb_max_adj != 0);
 595
 596         if (cc == 0 || cc > sb_max_adj)
 597                 return (0);
 598
 599         maxcc = l->l_proc->p_rlimit[RLIMIT_SBSIZE].rlim_cur;
 600
 601         uidinfo = so->so_uidinfo;
 602         if (!chgsbsize(uidinfo, &sb->sb_hiwat, cc, maxcc))
 603                 return 0;
 604         sb->sb_mbmax = min(cc * 2, sb_max);
 605         if (sb->sb_lowat > sb->sb_hiwat)
 606                 sb->sb_lowat = sb->sb_hiwat;
 607         return (1);
 608 }
 609
 610 /*
 611  * Free mbufs held by a socket, and reserved mbuf space.  We do not assert
 612  * that the socket is held locked here: see sorflush().
 613  */
 614 void
 615 sbrelease(struct sockbuf *sb, struct socket *so)
 616 {
 617
 618         KASSERT(sb->sb_so == so);
 619
 620         sbflush(sb);
 621         (void)chgsbsize(so->so_uidinfo, &sb->sb_hiwat, 0, RLIM_INFINITY);
 622         sb->sb_mbmax = 0;
 623 }
 624
 625 /*
 626  * Routines to add and remove
 627  * data from an mbuf queue.
 628  *
 629  * The routines sbappend() or sbappendrecord() are normally called to
 630  * append new mbufs to a socket buffer, after checking that adequate
 631  * space is available, comparing the function sbspace() with the amount
 632  * of data to be added.  sbappendrecord() differs from sbappend() in
 633  * that data supplied is treated as the beginning of a new record.
 634  * To place a sender's address, optional access rights, and data in a
 635  * socket receive buffer, sbappendaddr() should be used.  To place
 636  * access rights and data in a socket receive buffer, sbappendrights()
 637  * should be used.  In either case, the new data begins a new record.
 638  * Note that unlike sbappend() and sbappendrecord(), these routines check
 639  * for the caller that there will be enough space to store the data.
 640  * Each fails if there is not enough space, or if it cannot find mbufs
 641  * to store additional information in.
 642  *
 643  * Reliable protocols may use the socket send buffer to hold data
 644  * awaiting acknowledgement.  Data is normally copied from a socket
 645  * send buffer in a protocol with m_copy for output to a peer,
  646  * and then removed from the socket buffer with sbdrop()
 647  * or sbdroprecord() when the data is acknowledged by the peer.
 648  */
 649
 650 #ifdef SOCKBUF_DEBUG
 651 void
 652 sblastrecordchk(struct sockbuf *sb, const char *where)
 653 {
 654         struct mbuf *m = sb->sb_mb;
 655
 656         KASSERT(solocked(sb->sb_so));
 657
 658         while (m && m->m_nextpkt)
 659                 m = m->m_nextpkt;
 660
 661         if (m != sb->sb_lastrecord) {
 662                 printf("sblastrecordchk: sb_mb %p sb_lastrecord %p last %p\n",
 663                     sb->sb_mb, sb->sb_lastrecord, m);
 664                 printf("packet chain:\n");
 665                 for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt)
 666                         printf("\t%p\n", m);
 667                 panic("sblastrecordchk from %s", where);
 668         }
 669 }
 670
 671 void
 672 sblastmbufchk(struct sockbuf *sb, const char *where)
 673 {
 674         struct mbuf *m = sb->sb_mb;
 675         struct mbuf *n;
 676
 677         KASSERT(solocked(sb->sb_so));
 678
 679         while (m && m->m_nextpkt)
 680                 m = m->m_nextpkt;
 681
 682         while (m && m->m_next)
 683                 m = m->m_next;
 684
 685         if (m != sb->sb_mbtail) {
 686                 printf("sblastmbufchk: sb_mb %p sb_mbtail %p last %p\n",
 687                     sb->sb_mb, sb->sb_mbtail, m);
 688                 printf("packet tree:\n");
 689                 for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) {
 690                         printf("\t");
 691                         for (n = m; n != NULL; n = n->m_next)
 692                                 printf("%p ", n);
 693                         printf("\n");
 694                 }
 695                 panic("sblastmbufchk from %s", where);
 696         }
 697 }
 698 #endif /* SOCKBUF_DEBUG */
 699
 700 /*
 701  * Link a chain of records onto a socket buffer
 702  */
 703 #define SBLINKRECORDCHAIN(sb, m0, mlast)                                \
 704 do {                                                                    \
 705         if ((sb)->sb_lastrecord != NULL)                                \
 706                 (sb)->sb_lastrecord->m_nextpkt = (m0);                  \
 707         else                                                            \
 708                 (sb)->sb_mb = (m0);                                     \
 709         (sb)->sb_lastrecord = (mlast);                                  \
 710 } while (/*CONSTCOND*/0)
 711
 712
 713 #define SBLINKRECORD(sb, m0)                                            \
 714     SBLINKRECORDCHAIN(sb, m0, m0)
 715
 716 /*
 717  * Append mbuf chain m to the last record in the
 718  * socket buffer sb.  The additional space associated
 719  * the mbuf chain is recorded in sb.  Empty mbufs are
 720  * discarded and mbufs are compacted where possible.
 721  */
 722 void
 723 sbappend(struct sockbuf *sb, struct mbuf *m)
 724 {
 725         struct mbuf     *n;
 726
 727         KASSERT(solocked(sb->sb_so));
 728
 729         if (m == 0)
 730                 return;
 731
 732 #ifdef MBUFTRACE
 733         m_claimm(m, sb->sb_mowner);
 734 #endif
 735
 736         SBLASTRECORDCHK(sb, "sbappend 1");
 737
 738         if ((n = sb->sb_lastrecord) != NULL) {
 739                 /*
 740                  * XXX Would like to simply use sb_mbtail here, but
 741                  * XXX I need to verify that I won't miss an EOR that
 742                  * XXX way.
 743                  */
 744                 do {
 745                         if (n->m_flags & M_EOR) {
 746                                 sbappendrecord(sb, m); /* XXXXXX!!!! */
 747                                 return;
 748                         }
 749                 } while (n->m_next && (n = n->m_next));
 750         } else {
 751                 /*
 752                  * If this is the first record in the socket buffer, it's
 753                  * also the last record.
 754                  */
 755                 sb->sb_lastrecord = m;
 756         }
 757         sbcompress(sb, m, n);
 758         SBLASTRECORDCHK(sb, "sbappend 2");
 759 }
 760
 761 /*
 762  * This version of sbappend() should only be used when the caller
 763  * absolutely knows that there will never be more than one record
 764  * in the socket buffer, that is, a stream protocol (such as TCP).
 765  */
 766 void
 767 sbappendstream(struct sockbuf *sb, struct mbuf *m)
 768 {
 769
 770         KASSERT(solocked(sb->sb_so));
 771         KDASSERT(m->m_nextpkt == NULL);
 772         KASSERT(sb->sb_mb == sb->sb_lastrecord);
 773
 774         SBLASTMBUFCHK(sb, __func__);
 775
 776 #ifdef MBUFTRACE
 777         m_claimm(m, sb->sb_mowner);
 778 #endif
 779
 780         sbcompress(sb, m, sb->sb_mbtail);
 781
 782         sb->sb_lastrecord = sb->sb_mb;
 783         SBLASTRECORDCHK(sb, __func__);
 784 }
 785
 786 #ifdef SOCKBUF_DEBUG
 787 void
 788 sbcheck(struct sockbuf *sb)
 789 {
 790         struct mbuf     *m, *m2;
 791         u_long          len, mbcnt;
 792
 793         KASSERT(solocked(sb->sb_so));
 794
 795         len = 0;
 796         mbcnt = 0;
 797         for (m = sb->sb_mb; m; m = m->m_nextpkt) {
 798                 for (m2 = m; m2 != NULL; m2 = m2->m_next) {
 799                         len += m2->m_len;
 800                         mbcnt += MSIZE;
 801                         if (m2->m_flags & M_EXT)
 802                                 mbcnt += m2->m_ext.ext_size;
 803                         if (m2->m_nextpkt != NULL)
 804                                 panic("sbcheck nextpkt");
 805                 }
 806         }
 807         if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
 808                 printf("cc %lu != %lu || mbcnt %lu != %lu\n", len, sb->sb_cc,
 809                     mbcnt, sb->sb_mbcnt);
 810                 panic("sbcheck");
 811         }
 812 }
 813 #endif
 814
 815 /*
 816  * As above, except the mbuf chain
 817  * begins a new record.
 818  */
 819 void
 820 sbappendrecord(struct sockbuf *sb, struct mbuf *m0)
 821 {
 822         struct mbuf     *m;
 823
 824         KASSERT(solocked(sb->sb_so));
 825
 826         if (m0 == 0)
 827                 return;
 828
 829 #ifdef MBUFTRACE
 830         m_claimm(m0, sb->sb_mowner);
 831 #endif
 832         /*
 833          * Put the first mbuf on the queue.
 834          * Note this permits zero length records.
 835          */
 836         sballoc(sb, m0);
 837         SBLASTRECORDCHK(sb, "sbappendrecord 1");
 838         SBLINKRECORD(sb, m0);
 839         m = m0->m_next;
 840         m0->m_next = 0;
 841         if (m && (m0->m_flags & M_EOR)) {
 842                 m0->m_flags &= ~M_EOR;
 843                 m->m_flags |= M_EOR;
 844         }
 845         sbcompress(sb, m, m0);
 846         SBLASTRECORDCHK(sb, "sbappendrecord 2");
 847 }
 848
 849 /*
 850  * As above except that OOB data
 851  * is inserted at the beginning of the sockbuf,
 852  * but after any other OOB data.
 853  */
 854 void
 855 sbinsertoob(struct sockbuf *sb, struct mbuf *m0)
 856 {
 857         struct mbuf     *m, **mp;
 858
 859         KASSERT(solocked(sb->sb_so));
 860
 861         if (m0 == 0)
 862                 return;
 863
 864         SBLASTRECORDCHK(sb, "sbinsertoob 1");
 865
 866         for (mp = &sb->sb_mb; (m = *mp) != NULL; mp = &((*mp)->m_nextpkt)) {
 867             again:
 868                 switch (m->m_type) {
 869
 870                 case MT_OOBDATA:
 871                         continue;               /* WANT next train */
 872
 873                 case MT_CONTROL:
 874                         if ((m = m->m_next) != NULL)
 875                                 goto again;     /* inspect THIS train further */
 876                 }
 877                 break;
 878         }
 879         /*
 880          * Put the first mbuf on the queue.
 881          * Note this permits zero length records.
 882          */
 883         sballoc(sb, m0);
 884         m0->m_nextpkt = *mp;
 885         if (*mp == NULL) {
 886                 /* m0 is actually the new tail */
 887                 sb->sb_lastrecord = m0;
 888         }
 889         *mp = m0;
 890         m = m0->m_next;
 891         m0->m_next = 0;
 892         if (m && (m0->m_flags & M_EOR)) {
 893                 m0->m_flags &= ~M_EOR;
 894                 m->m_flags |= M_EOR;
 895         }
 896         sbcompress(sb, m, m0);
 897         SBLASTRECORDCHK(sb, "sbinsertoob 2");
 898 }
 899
 900 /*
 901  * Append address and data, and optionally, control (ancillary) data
 902  * to the receive queue of a socket.  If present,
 903  * m0 must include a packet header with total length.
 904  * Returns 0 if no space in sockbuf or insufficient mbufs.
 905  */
 906 int
 907 sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0,
 908         struct mbuf *control)
 909 {
 910         struct mbuf     *m, *n, *nlast;
 911         int             space, len;
 912
 913         KASSERT(solocked(sb->sb_so));
 914
 915         space = asa->sa_len;
 916
 917         if (m0 != NULL) {
 918                 if ((m0->m_flags & M_PKTHDR) == 0)
 919                         panic("sbappendaddr");
 920                 space += m0->m_pkthdr.len;
 921 #ifdef MBUFTRACE
 922                 m_claimm(m0, sb->sb_mowner);
 923 #endif
 924         }
 925         for (n = control; n; n = n->m_next) {
 926                 space += n->m_len;
 927                 MCLAIM(n, sb->sb_mowner);
 928                 if (n->m_next == 0)     /* keep pointer to last control buf */
 929                         break;
 930         }
 931         if (space > sbspace(sb))
 932                 return (0);
 933         MGET(m, M_DONTWAIT, MT_SONAME);
 934         if (m == 0)
 935                 return (0);
 936         MCLAIM(m, sb->sb_mowner);
 937         /*
 938          * XXX avoid 'comparison always true' warning which isn't easily
 939          * avoided.
 940          */
 941         len = asa->sa_len;
 942         if (len > MLEN) {
 943                 MEXTMALLOC(m, asa->sa_len, M_NOWAIT);
 944                 if ((m->m_flags & M_EXT) == 0) {
 945                         m_free(m);
 946                         return (0);
 947                 }
 948         }
 949         m->m_len = asa->sa_len;
 950         memcpy(mtod(m, void *), asa, asa->sa_len);
 951         if (n)
 952                 n->m_next = m0;         /* concatenate data to control */
 953         else
 954                 control = m0;
 955         m->m_next = control;
 956
 957         SBLASTRECORDCHK(sb, "sbappendaddr 1");
 958
 959         for (n = m; n->m_next != NULL; n = n->m_next)
 960                 sballoc(sb, n);
 961         sballoc(sb, n);
 962         nlast = n;
 963         SBLINKRECORD(sb, m);
 964
 965         sb->sb_mbtail = nlast;
 966         SBLASTMBUFCHK(sb, "sbappendaddr");
 967         SBLASTRECORDCHK(sb, "sbappendaddr 2");
 968
 969         return (1);
 970 }
 971
 972 /*
 973  * Helper for sbappendchainaddr: prepend a struct sockaddr* to
 974  * an mbuf chain.
 975  */
 976 static inline struct mbuf *
 977 m_prepend_sockaddr(struct sockbuf *sb, struct mbuf *m0,
 978                    const struct sockaddr *asa)
 979 {
 980         struct mbuf *m;
 981         const int salen = asa->sa_len;
 982
 983         KASSERT(solocked(sb->sb_so));
 984
 985         /* only the first in each chain need be a pkthdr */
 986         MGETHDR(m, M_DONTWAIT, MT_SONAME);
 987         if (m == 0)
 988                 return (0);
 989         MCLAIM(m, sb->sb_mowner);
 990 #ifdef notyet
 991         if (salen > MHLEN) {
 992                 MEXTMALLOC(m, salen, M_NOWAIT);
 993                 if ((m->m_flags & M_EXT) == 0) {
 994                         m_free(m);
 995                         return (0);
 996                 }
 997         }
 998 #else
 999         KASSERT(salen <= MHLEN);
1000 #endif
1001         m->m_len = salen;
1002         memcpy(mtod(m, void *), asa, salen);
1003         m->m_next = m0;
1004         m->m_pkthdr.len = salen + m0->m_pkthdr.len;
1005
1006         return m;
1007 }
1008
1009 int
1010 sbappendaddrchain(struct sockbuf *sb, const struct sockaddr *asa,
1011                   struct mbuf *m0, int sbprio)
1012 {
1013         int space;
1014         struct mbuf *m, *n, *n0, *nlast;
1015         int error;
1016
1017         KASSERT(solocked(sb->sb_so));
1018
1019         /*
1020          * XXX sbprio reserved for encoding priority of this* request:
1021          *  SB_PRIO_NONE --> honour normal sb limits
1022          *  SB_PRIO_ONESHOT_OVERFLOW --> if socket has any space,
1023          *      take whole chain. Intended for large requests
1024          *      that should be delivered atomically (all, or none).
1025          * SB_PRIO_OVERDRAFT -- allow a small (2*MLEN) overflow
1026          *       over normal socket limits, for messages indicating
1027          *       buffer overflow in earlier normal/lower-priority messages
1028          * SB_PRIO_BESTEFFORT -->  ignore limits entirely.
1029          *       Intended for  kernel-generated messages only.
1030          *        Up to generator to avoid total mbuf resource exhaustion.
1031          */
1032         (void)sbprio;
1033
1034         if (m0 && (m0->m_flags & M_PKTHDR) == 0)
1035                 panic("sbappendaddrchain");
1036
1037         space = sbspace(sb);
1038
1039 #ifdef notyet
1040         /*
1041          * Enforce SB_PRIO_* limits as described above.
1042          */
1043 #endif
1044
1045         n0 = NULL;
1046         nlast = NULL;
1047         for (m = m0; m; m = m->m_nextpkt) {
1048                 struct mbuf *np;
1049
1050 #ifdef MBUFTRACE
1051                 m_claimm(m, sb->sb_mowner);
1052 #endif
1053
1054                 /* Prepend sockaddr to this record (m) of input chain m0 */
1055                 n = m_prepend_sockaddr(sb, m, asa);
1056                 if (n == NULL) {
1057                         error = ENOBUFS;
1058                         goto bad;
1059                 }
1060
1061                 /* Append record (asa+m) to end of new chain n0 */
1062                 if (n0 == NULL) {
1063                         n0 = n;
1064                 } else {
1065                         nlast->m_nextpkt = n;
1066                 }
1067                 /* Keep track of last record on new chain */
1068                 nlast = n;
1069
1070                 for (np = n; np; np = np->m_next)
1071                         sballoc(sb, np);
1072         }
1073
1074         SBLASTRECORDCHK(sb, "sbappendaddrchain 1");
1075
1076         /* Drop the entire chain of (asa+m) records onto the socket */
1077         SBLINKRECORDCHAIN(sb, n0, nlast);
1078
1079         SBLASTRECORDCHK(sb, "sbappendaddrchain 2");
1080
1081         for (m = nlast; m->m_next; m = m->m_next)
1082                 ;
1083         sb->sb_mbtail = m;
1084         SBLASTMBUFCHK(sb, "sbappendaddrchain");
1085
1086         return (1);
1087
1088 bad:
1089         /*
1090          * On error, free the prepended addreseses. For consistency
1091          * with sbappendaddr(), leave it to our caller to free
1092          * the input record chain passed to us as m0.
1093          */
1094         while ((n = n0) != NULL) {
1095                 struct mbuf *np;
1096
1097                 /* Undo the sballoc() of this record */
1098                 for (np = n; np; np = np->m_next)
1099                         sbfree(sb, np);
1100
1101                 n0 = n->m_nextpkt;      /* iterate at next prepended address */
1102                 MFREE(n, np);           /* free prepended address (not data) */
1103         }
1104         return 0;
1105 }
1106
1107
1108 int
1109 sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control)
1110 {
1111         struct mbuf     *m, *mlast, *n;
1112         int             space;
1113
1114         KASSERT(solocked(sb->sb_so));
1115
1116         space = 0;
1117         if (control == 0)
1118                 panic("sbappendcontrol");
1119         for (m = control; ; m = m->m_next) {
1120                 space += m->m_len;
1121                 MCLAIM(m, sb->sb_mowner);
1122                 if (m->m_next == 0)
1123                         break;
1124         }
1125         n = m;                  /* save pointer to last control buffer */
1126         for (m = m0; m; m = m->m_next) {
1127                 MCLAIM(m, sb->sb_mowner);
1128                 space += m->m_len;
1129         }
1130         if (space > sbspace(sb))
1131                 return (0);
1132         n->m_next = m0;                 /* concatenate data to control */
1133
1134         SBLASTRECORDCHK(sb, "sbappendcontrol 1");
1135
1136         for (m = control; m->m_next != NULL; m = m->m_next)
1137                 sballoc(sb, m);
1138         sballoc(sb, m);
1139         mlast = m;
1140         SBLINKRECORD(sb, control);
1141
1142         sb->sb_mbtail = mlast;
1143         SBLASTMBUFCHK(sb, "sbappendcontrol");
1144         SBLASTRECORDCHK(sb, "sbappendcontrol 2");
1145
1146         return (1);
1147 }
1148
1149 /*
1150  * Compress mbuf chain m into the socket
1151  * buffer sb following mbuf n.  If n
1152  * is null, the buffer is presumed empty.
1153  */
1154 void
1155 sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n)
1156 {
1157         int             eor;
1158         struct mbuf     *o;
1159
1160         KASSERT(solocked(sb->sb_so));
1161
1162         eor = 0;
1163         while (m) {
1164                 eor |= m->m_flags & M_EOR;
1165                 if (m->m_len == 0 &&
1166                     (eor == 0 ||
1167                      (((o = m->m_next) || (o = n)) &&
1168                       o->m_type == m->m_type))) {
1169                         if (sb->sb_lastrecord == m)
1170                                 sb->sb_lastrecord = m->m_next;
1171                         m = m_free(m);
1172                         continue;
1173                 }
1174                 if (n && (n->m_flags & M_EOR) == 0 &&
1175                     /* M_TRAILINGSPACE() checks buffer writeability */
1176                     m->m_len <= MCLBYTES / 4 && /* XXX Don't copy too much */
1177                     m->m_len <= M_TRAILINGSPACE(n) &&
1178                     n->m_type == m->m_type) {
1179                         memcpy(mtod(n, char *) + n->m_len, mtod(m, void *),
1180                             (unsigned)m->m_len);
1181                         n->m_len += m->m_len;
1182                         sb->sb_cc += m->m_len;
1183                         m = m_free(m);
1184                         continue;
1185                 }
1186                 if (n)
1187                         n->m_next = m;
1188                 else
1189                         sb->sb_mb = m;
1190                 sb->sb_mbtail = m;
1191                 sballoc(sb, m);
1192                 n = m;
1193                 m->m_flags &= ~M_EOR;
1194                 m = m->m_next;
1195                 n->m_next = 0;
1196         }
1197         if (eor) {
1198                 if (n)
1199                         n->m_flags |= eor;
1200                 else
1201                         printf("semi-panic: sbcompress\n");
1202         }
1203         SBLASTMBUFCHK(sb, __func__);
1204 }
1205
1206 /*
1207  * Free all mbufs in a sockbuf.
1208  * Check that all resources are reclaimed.
1209  */
1210 void
1211 sbflush(struct sockbuf *sb)
1212 {
1213
1214         KASSERT(solocked(sb->sb_so));
1215         KASSERT((sb->sb_flags & SB_LOCK) == 0);
1216
1217         while (sb->sb_mbcnt)
1218                 sbdrop(sb, (int)sb->sb_cc);
1219
1220         KASSERT(sb->sb_cc == 0);
1221         KASSERT(sb->sb_mb == NULL);
1222         KASSERT(sb->sb_mbtail == NULL);
1223         KASSERT(sb->sb_lastrecord == NULL);
1224 }
1225
1226 /*
1227  * Drop data from (the front of) a sockbuf.
1228  */
1229 void
1230 sbdrop(struct sockbuf *sb, int len)
1231 {
1232         struct mbuf     *m, *mn, *next;
1233
1234         KASSERT(solocked(sb->sb_so));
1235
1236         next = (m = sb->sb_mb) ? m->m_nextpkt : 0;
1237         while (len > 0) {
1238                 if (m == 0) {
1239                         if (next == 0)
1240                                 panic("sbdrop");
1241                         m = next;
1242                         next = m->m_nextpkt;
1243                         continue;
1244                 }
1245                 if (m->m_len > len) {
1246                         m->m_len -= len;
1247                         m->m_data += len;
1248                         sb->sb_cc -= len;
1249                         break;
1250                 }
1251                 len -= m->m_len;
1252                 sbfree(sb, m);
1253                 MFREE(m, mn);
1254                 m = mn;
1255         }
1256         while (m && m->m_len == 0) {
1257                 sbfree(sb, m);
1258                 MFREE(m, mn);
1259                 m = mn;
1260         }
1261         if (m) {
1262                 sb->sb_mb = m;
1263                 m->m_nextpkt = next;
1264         } else
1265                 sb->sb_mb = next;
1266         /*
1267          * First part is an inline SB_EMPTY_FIXUP().  Second part
1268          * makes sure sb_lastrecord is up-to-date if we dropped
1269          * part of the last record.
1270          */
1271         m = sb->sb_mb;
1272         if (m == NULL) {
1273                 sb->sb_mbtail = NULL;
1274                 sb->sb_lastrecord = NULL;
1275         } else if (m->m_nextpkt == NULL)
1276                 sb->sb_lastrecord = m;
1277 }
1278
1279 /*
1280  * Drop a record off the front of a sockbuf
1281  * and move the next record to the front.
1282  */
1283 void
1284 sbdroprecord(struct sockbuf *sb)
1285 {
1286         struct mbuf     *m, *mn;
1287
1288         KASSERT(solocked(sb->sb_so));
1289
1290         m = sb->sb_mb;
1291         if (m) {
1292                 sb->sb_mb = m->m_nextpkt;
1293                 do {
1294                         sbfree(sb, m);
1295                         MFREE(m, mn);
1296                 } while ((m = mn) != NULL);
1297         }
1298         SB_EMPTY_FIXUP(sb);
1299 }
1300
1301 /*
1302  * Create a "control" mbuf containing the specified data
1303  * with the specified type for presentation on a socket buffer.
1304  */
1305 struct mbuf *
1306 sbcreatecontrol(void *p, int size, int type, int level)
1307 {
1308         struct cmsghdr  *cp;
1309         struct mbuf     *m;
1310
1311         if (CMSG_SPACE(size) > MCLBYTES) {
1312                 printf("sbcreatecontrol: message too large %d\n", size);
1313                 return NULL;
1314         }
1315
1316         if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL)
1317                 return ((struct mbuf *) NULL);
1318         if (CMSG_SPACE(size) > MLEN) {
1319                 MCLGET(m, M_DONTWAIT);
1320                 if ((m->m_flags & M_EXT) == 0) {
1321                         m_free(m);
1322                         return NULL;
1323                 }
1324         }
1325         cp = mtod(m, struct cmsghdr *);
1326         memcpy(CMSG_DATA(cp), p, size);
1327         m->m_len = CMSG_SPACE(size);
1328         cp->cmsg_len = CMSG_LEN(size);
1329         cp->cmsg_level = level;
1330         cp->cmsg_type = type;
1331         return (m);
1332 }
1333
1334 void
1335 solockretry(struct socket *so, kmutex_t *lock)
1336 {
1337
1338         while (lock != so->so_lock) {
1339                 mutex_exit(lock);
1340                 lock = so->so_lock;
1341                 mutex_enter(lock);
1342         }
1343 }
1344
1345 bool
1346 solocked(struct socket *so)
1347 {
1348
1349         return mutex_owned(so->so_lock);
1350 }
1351
1352 bool
1353 solocked2(struct socket *so1, struct socket *so2)
1354 {
1355         kmutex_t *lock;
1356
1357         lock = so1->so_lock;
1358         if (lock != so2->so_lock)
1359                 return false;
1360         return mutex_owned(lock);
1361 }
1362
1363 /*
1364  * Assign a default lock to a new socket.  For PRU_ATTACH, and done by
1365  * protocols that do not have special locking requirements.
1366  */
1367 void
1368 sosetlock(struct socket *so)
1369 {
1370         kmutex_t *lock;
1371
1372         if (so->so_lock == NULL) {
1373                 lock = softnet_lock;
1374                 so->so_lock = lock;
1375                 mutex_obj_hold(lock);
1376                 mutex_enter(lock);
1377         }
1378
1379         /* In all cases, lock must be held on return from PRU_ATTACH. */
1380         KASSERT(solocked(so));
1381 }
1382
1383 /*
1384  * Set lock on sockbuf sb; sleep if lock is already held.
1385  * Unless SB_NOINTR is set on sockbuf, sleep is interruptible.
1386  * Returns error without lock if sleep is interrupted.
1387  */
1388 int
1389 sblock(struct sockbuf *sb, int wf)
1390 {
1391         struct socket *so;
1392         kmutex_t *lock;
1393         int error;
1394
1395         KASSERT(solocked(sb->sb_so));
1396
1397         for (;;) {
1398                 if (__predict_true((sb->sb_flags & SB_LOCK) == 0)) {
1399                         sb->sb_flags |= SB_LOCK;
1400                         return 0;
1401                 }
1402                 if (wf != M_WAITOK)
1403                         return EWOULDBLOCK;
1404                 so = sb->sb_so;
1405                 lock = so->so_lock;
1406                 if ((sb->sb_flags & SB_NOINTR) != 0) {
1407                         cv_wait(&so->so_cv, lock);
1408                         error = 0;
1409                 } else
1410                         error = cv_wait_sig(&so->so_cv, lock);
1411                 if (__predict_false(lock != so->so_lock))
1412                         solockretry(so, lock);
1413                 if (error != 0)
1414                         return error;
1415         }
1416 }
1417
1418 void
1419 sbunlock(struct sockbuf *sb)
1420 {
1421         struct socket *so;
1422
1423         so = sb->sb_so;
1424
1425         KASSERT(solocked(so));
1426         KASSERT((sb->sb_flags & SB_LOCK) != 0);
1427
1428         sb->sb_flags &= ~SB_LOCK;
1429         cv_broadcast(&so->so_cv);
1430 }
1431
1432 int
1433 sowait(struct socket *so, bool catch, int timo)
1434 {
1435         kmutex_t *lock;
1436         int error;
1437
1438         KASSERT(solocked(so));
1439         KASSERT(catch || timo != 0);
1440
1441         lock = so->so_lock;
1442         if (catch)
1443                 error = cv_timedwait_sig(&so->so_cv, lock, timo);
1444         else
1445                 error = cv_timedwait(&so->so_cv, lock, timo);
1446         if (__predict_false(lock != so->so_lock))
1447                 solockretry(so, lock);
1448         return error;
1449 }