2 * Copyright (c) 1982, 1986, 1988, 1993
3 * The Regents of the University of California. All rights reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 * must display the following acknowledgement:
15 * This product includes software developed by the University of
16 * California, Berkeley and its contributors.
17 * 4. Neither the name of the University nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * From: @(#)tcp_usrreq.c 8.2 (Berkeley) 1/3/94
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/kernel.h>
40 #include <sys/malloc.h>
42 #include <sys/socket.h>
43 #include <sys/socketvar.h>
44 #include <sys/protosw.h>
45 #include <sys/errno.h>
47 #include <sys/queue.h>
48 #include <sys/synch.h>
50 #include <sys/sysctl.h>
53 #include <net/route.h>
56 #include <netinet/in.h>
57 #include <netinet/in_systm.h>
58 #include <netinet/ip.h>
59 #include <netinet/in_pcb.h>
60 #include <netinet/in_var.h>
61 #include <netinet/ip_var.h>
62 #include <netinet/tcp.h>
63 #include <netinet/tcp_fsm.h>
64 #include <netinet/tcp_seq.h>
65 #include <netinet/tcp_timer.h>
66 #include <netinet/tcp_var.h>
67 #include <netinet/tcpip.h>
69 #include <netinet/tcp_debug.h>
73 * TCP protocol interface to socket abstraction.
75 extern char *tcpstates
[];
78 * Process a TCP user request for TCP tb. If this is a send request
79 * then m is the mbuf chain of send data. If this is a timer expiration
80 * (called from the software clock routine), then timertype tells which timer.
/*
 * tcp_usrreq: handle one socket-layer request (req) for socket so.
 * m, nam and control are declared as mbuf pointers, but several requests
 * reuse them for other data (see the casts below).
 * NOTE(review): this view omits lines of the function (return type, braces,
 * most case labels and break statements), so which fragment belongs to which
 * request code is inferred from the neighbouring comment text - confirm
 * against the complete file.
 */
84 tcp_usrreq(so
, req
, m
, nam
, control
)
87 struct mbuf
*m
, *nam
, *control
;
89 register struct inpcb
*inp
;
90 register struct tcpcb
*tp
= 0;
91 struct sockaddr_in
*sinp
;
/*
 * PRU_CONTROL is handed straight to in_control(); the three mbuf pointers
 * are cast to long, caddr_t and struct ifnet * for that call.
 */
98 if (req
== PRU_CONTROL
)
99 return (in_control(so
, (long)m
, (caddr_t
)nam
,
100 (struct ifnet
*)control
));
/* A non-empty control mbuf gets its own handling; that body is not visible here. */
101 if (control
&& control
->m_len
) {
111 * When a TCP is attached to a socket, then there will be
112 * a (struct inpcb) pointed at by the socket, and this
113 * structure will point at a subsidary (struct tcpcb).
/* With no inpcb, any request other than PRU_ATTACH leads to the EINVAL return below. */
115 if (inp
== 0 && req
!= PRU_ATTACH
) {
117 return (EINVAL
); /* XXX */
121 /* WHAT IF TP IS 0? */
/* Count the request, indexed by the current TCP state and the request code. */
123 tcp_acounts
[tp
->t_state
][req
]++;
/* Save the state on entry; ostate is passed to tcp_trace() at the end. */
126 ostate
= tp
->t_state
;
131 #endif /* TCPDEBUG */
136 * TCP attaches to socket via PRU_ATTACH, reserving space,
137 * and an internet control block.
/*
 * tcp_attach() does the work. Afterwards, a socket with SO_LINGER set and a
 * zero linger time is given TCP_LINGERTIME.
 */
144 error
= tcp_attach(so
);
147 if ((so
->so_options
& SO_LINGER
) && so
->so_linger
.tv_sec
== 0)
148 so
->so_linger
.tv_sec
= TCP_LINGERTIME
;
153 * PRU_DETACH detaches the TCP protocol from the socket.
154 * If the protocol state is non-embryonic, then can't
155 * do this directly: have to initiate a PRU_DISCONNECT,
156 * which may finish later; embryonic TCB's can just
/* Past LISTEN, tp is replaced by whatever tcp_disconnect() returns. */
160 if (tp
->t_state
> TCPS_LISTEN
)
161 tp
= tcp_disconnect(tp
);
167 * Give the socket an address.
171 * Must check for multicast addresses and disallow binding
/*
 * nam is read as a struct sockaddr_in. An AF_INET multicast address sets
 * error to EAFNOSUPPORT; in_pcbbind() is the call that performs the bind.
 */
174 sinp
= mtod(nam
, struct sockaddr_in
*);
175 if (sinp
->sin_family
== AF_INET
&&
176 IN_MULTICAST(ntohl(sinp
->sin_addr
.s_addr
))) {
177 error
= EAFNOSUPPORT
;
180 error
= in_pcbbind(inp
, nam
);
186 * Prepare to accept connections.
/* If no local port is set yet, in_pcbbind() is called with a NULL address before entering LISTEN. */
189 if (inp
->inp_lport
== 0)
190 error
= in_pcbbind(inp
, NULL
);
192 tp
->t_state
= TCPS_LISTEN
;
196 * Initiate connection to peer.
197 * Create a template for use in transmissions on this connection.
198 * Enter SYN_SENT state, and mark socket as connecting.
199 * Start keep-alive timer, and seed output sequence space.
200 * Send initial segment on connection.
204 * Must disallow TCP ``connections'' to multicast addresses.
/*
 * Same multicast test as for bind. tcp_connect() is then called and its
 * result tested; tcp_output() follows (the statement between the two is not
 * visible in this view).
 */
206 sinp
= mtod(nam
, struct sockaddr_in
*);
207 if (sinp
->sin_family
== AF_INET
208 && IN_MULTICAST(ntohl(sinp
->sin_addr
.s_addr
))) {
209 error
= EAFNOSUPPORT
;
213 if ((error
= tcp_connect(tp
, nam
)) != 0)
215 error
= tcp_output(tp
);
219 * Create a TCP connection between two sockets.
/*
 * The fragments that follow call, in order: tcp_disconnect(),
 * in_setpeeraddr() to store the peer address in nam, tcp_usrclosed() plus
 * tcp_output(), a tcp_output() whose result is discarded, and the send path.
 */
226 * Initiate disconnect from peer.
227 * If connection never passed embryonic stage, just drop;
228 * else if don't need to let data drain, then can just drop anyways,
229 * else have to begin TCP shutdown process: mark socket disconnecting,
230 * drain unread data, state switch to reflect user close, and
231 * send segment (e.g. FIN) to peer. Socket will be really disconnected
232 * when peer sends FIN and acks ours.
234 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
237 tp
= tcp_disconnect(tp
);
241 * Accept a connection. Essentially all the work is
242 * done at higher levels; just return the address
243 * of the peer, storing through addr.
246 in_setpeeraddr(inp
, nam
);
250 * Mark the connection as being incapable of further output.
254 tp
= tcp_usrclosed(tp
);
256 error
= tcp_output(tp
);
260 * After a receive, possibly send window update to peer.
263 (void) tcp_output(tp
);
267 * Do a send by putting data in output queue and updating urgent
268 * marker if URG set. Possibly send more data.
272 sbappend(&so
->so_snd
, m
);
273 if (nam
&& tp
->t_state
< TCPS_SYN_SENT
) {
275 * Do implied connect if not yet connected,
276 * initialize window to default value, and
277 * initialize maxseg/maxopd using peer's cached
/*
 * The data has already been appended to so_snd above. On this implied
 * connect path the send window is set to TTCP_CLIENT_SND_WND.
 */
280 error
= tcp_connect(tp
, nam
);
283 tp
->snd_wnd
= TTCP_CLIENT_SND_WND
;
/* PRU_SEND_EOF additionally runs tcp_usrclosed() before the tcp_output() call. */
287 if (req
== PRU_SEND_EOF
) {
289 * Close the send side of the connection after
293 tp
= tcp_usrclosed(tp
);
296 error
= tcp_output(tp
);
/* Drop the connection with ECONNABORTED; tp takes the value tcp_drop() returns. */
303 tp
= tcp_drop(tp
, ECONNABORTED
);
/* Here m is not an mbuf: it is cast to struct stat * and st_blksize is set to the send buffer high-water mark. */
307 ((struct stat
*) m
)->st_blksize
= so
->so_snd
.sb_hiwat
;
/*
 * Out-of-band receive checks. The first test is true when there is no OOB
 * mark (so_oobmark == 0 and SS_RCVATMARK clear), or SO_OOBINLINE is set, or
 * TCPOOB_HADDATA is set; the second when TCPOOB_HAVEDATA is clear. The
 * bodies of both tests are not visible in this view.
 */
312 if ((so
->so_oobmark
== 0 &&
313 (so
->so_state
& SS_RCVATMARK
) == 0) ||
314 so
->so_options
& SO_OOBINLINE
||
315 tp
->t_oobflags
& TCPOOB_HADDATA
) {
319 if ((tp
->t_oobflags
& TCPOOB_HAVEDATA
) == 0) {
/*
 * Store tp->t_iobc in the first byte of m. nam is read as a flags word
 * here: unless MSG_PEEK is set, HAVEDATA and HADDATA are toggled together.
 */
324 *mtod(m
, caddr_t
) = tp
->t_iobc
;
325 if (((long)nam
& MSG_PEEK
) == 0)
326 tp
->t_oobflags
^= (TCPOOB_HAVEDATA
| TCPOOB_HADDATA
);
/* This test is true once send buffer space has fallen below -512; its body is not visible in this view. */
330 if (sbspace(&so
->so_snd
) < -512) {
336 * According to RFC961 (Assigned Protocols),
337 * the urgent pointer points to the last octet
338 * of urgent data. We continue, however,
339 * to consider it to indicate the first octet
340 * of data past the urgent section.
341 * Otherwise, snd_up should be one lower.
/* Queue the data, then set snd_up to snd_una plus the bytes now in so_snd, i.e. just past the queued data. */
343 sbappend(&so
->so_snd
, m
);
344 tp
->snd_up
= tp
->snd_una
+ so
->so_snd
.sb_cc
;
346 error
= tcp_output(tp
);
/* Local address query, then peer address query: both store through nam. */
351 in_setsockaddr(inp
, nam
);
355 in_setpeeraddr(inp
, nam
);
/*
 * Timer request: nam is not an mbuf but an integer that is passed to
 * tcp_timers() and then shifted left by 8 and OR-ed into req, so that the
 * trace call below records it.
 */
359 * TCP slow timer went off; going through this
360 * routine for tracing's sake.
363 tp
= tcp_timers(tp
, (long)nam
);
365 req
|= (int)nam
<< 8; /* for debug's sake */
/* Trace only when tp is non-null and SO_DEBUG is set; ostate is the state saved on entry. */
373 if (tp
&& (so
->so_options
& SO_DEBUG
))
374 tcp_trace(TA_USER
, ostate
, tp
, (struct tcpiphdr
*)0, req
);
381 * Common subroutine to open a TCP connection to remote host specified
382 * by struct sockaddr_in in mbuf *nam. Call in_pcbbind to assign a local
383 * port number if needed. Call in_pcbladdr to do the routing and to choose
384 * a local host address (interface). If there is an existing incarnation
385 * of the same connection in TIME-WAIT state and if the remote host was
386 * sending CC options and if the connection duration was < MSL, then
387 * truncate the previous TIME-WAIT state and proceed.
388 * Initialize connection parameters and enter SYN-SENT state.
/*
 * NOTE(review): the header line of this function is not visible in this
 * view. The body uses tp and nam and matches the tcp_connect(tp, nam) calls
 * made from tcp_usrreq - confirm against the complete file. Several lines
 * (error checks, returns, braces) are also missing here.
 */
392 register struct tcpcb
*tp
;
395 struct inpcb
*inp
= tp
->t_inpcb
, *oinp
;
396 struct socket
*so
= inp
->inp_socket
;
398 struct sockaddr_in
*sin
= mtod(nam
, struct sockaddr_in
*);
399 struct sockaddr_in
*ifaddr
;
/* No local port yet: call in_pcbbind() with a NULL address. */
402 if (inp
->inp_lport
== 0) {
403 error
= in_pcbbind(inp
, NULL
);
409 * Cannot simply call in_pcbconnect, because there might be an
410 * earlier incarnation of this same connection still in
411 * TIME_WAIT state, creating an ADDRINUSE error.
/* in_pcbladdr() fills in ifaddr, which is used below as the local address when none is set. */
413 error
= in_pcbladdr(inp
, nam
, &ifaddr
);
/*
 * Look up an existing pcb using the peer address and port from nam and the
 * local address. The tail of this call is not visible in this view.
 */
416 oinp
= in_pcblookup(inp
->inp_pcbinfo
->listhead
,
417 sin
->sin_addr
, sin
->sin_port
,
418 inp
->inp_laddr
.s_addr
!= INADDR_ANY
? inp
->inp_laddr
/*
 * A different pcb is closed early only when all of these hold: it has a
 * tcpcb, it is in TIME_WAIT, its t_duration is below TCPTV_MSL, and
 * TF_RCVD_CC is set.
 */
422 if (oinp
!= inp
&& (otp
= intotcpcb(oinp
)) != NULL
&&
423 otp
->t_state
== TCPS_TIME_WAIT
&&
424 otp
->t_duration
< TCPTV_MSL
&&
425 (otp
->t_flags
& TF_RCVD_CC
))
426 otp
= tcp_close(otp
);
/* Fill in the local address only if it is still INADDR_ANY, then record the peer address and port. */
430 if (inp
->inp_laddr
.s_addr
== INADDR_ANY
)
431 inp
->inp_laddr
= ifaddr
->sin_addr
;
432 inp
->inp_faddr
= sin
->sin_addr
;
433 inp
->inp_fport
= sin
->sin_port
;
/* If no template could be built, the pcb is disconnected again. */
436 tp
->t_template
= tcp_template(tp
);
437 if (tp
->t_template
== 0) {
438 in_pcbdisconnect(inp
);
442 /* Compute window scaling to request. */
/*
 * Raise request_r_scale until TCP_MAXWIN shifted by it is no longer below
 * the receive buffer high-water mark, stopping at TCP_MAX_WINSHIFT.
 */
443 while (tp
->request_r_scale
< TCP_MAX_WINSHIFT
&&
444 (TCP_MAXWIN
<< tp
->request_r_scale
) < so
->so_rcv
.sb_hiwat
)
445 tp
->request_r_scale
++;
/*
 * Count the attempt, enter SYN_SENT and set the keep timer to
 * TCPTV_KEEP_INIT. iss is taken from the global tcp_iss, which is then
 * advanced by TCP_ISSINCR/2. cc_send comes from CC_INC(tcp_ccgen).
 */
448 tcpstat
.tcps_connattempt
++;
449 tp
->t_state
= TCPS_SYN_SENT
;
450 tp
->t_timer
[TCPT_KEEP
] = TCPTV_KEEP_INIT
;
451 tp
->iss
= tcp_iss
; tcp_iss
+= TCP_ISSINCR
/2;
453 tp
->cc_send
= CC_INC(tcp_ccgen
);
/*
 * tcp_ctloutput: set or get socket options. op is compared with
 * PRCO_SETOPT, and options whose level is not IPPROTO_TCP are handed to
 * ip_ctloutput(). The visible fragments set and report TF_NODELAY,
 * TF_NOOPT, TF_NOPUSH and t_maxseg.
 * NOTE(review): the option-name case labels, else keywords, error
 * assignments and returns are not visible in this view.
 */
459 tcp_ctloutput(op
, so
, level
, optname
, mp
)
467 register struct tcpcb
*tp
;
468 register struct mbuf
*m
;
/* Set request with an mbuf supplied; the body of this test is not visible here. */
475 if (op
== PRCO_SETOPT
&& *mp
)
/* Not a TCP-level option: pass all arguments on to ip_ctloutput(). */
479 if (level
!= IPPROTO_TCP
) {
480 error
= ip_ctloutput(op
, so
, level
, optname
, mp
);
/*
 * Flag option: the mbuf must exist and hold at least an int. A non-zero
 * value sets TF_NODELAY; the other visible statement clears it.
 */
493 if (m
== NULL
|| m
->m_len
< sizeof (int))
495 else if (*mtod(m
, int *))
496 tp
->t_flags
|= TF_NODELAY
;
498 tp
->t_flags
&= ~TF_NODELAY
;
/* The supplied value i must be positive and no larger than the current t_maxseg. */
502 if (m
&& (i
= *mtod(m
, int *)) > 0 && i
<= tp
->t_maxseg
)
/* Same pattern as TF_NODELAY, for TF_NOOPT. */
509 if (m
== NULL
|| m
->m_len
< sizeof (int))
511 else if (*mtod(m
, int *))
512 tp
->t_flags
|= TF_NOOPT
;
514 tp
->t_flags
&= ~TF_NOOPT
;
/* Same pattern as TF_NODELAY, for TF_NOPUSH. */
518 if (m
== NULL
|| m
->m_len
< sizeof (int))
520 else if (*mtod(m
, int *))
521 tp
->t_flags
|= TF_NOPUSH
;
523 tp
->t_flags
&= ~TF_NOPUSH
;
/* Get request: allocate a reply mbuf with M_WAIT, return it through mp and size it for one int. */
535 *mp
= m
= m_get(M_WAIT
, MT_SOOPTS
);
536 m
->m_len
= sizeof(int);
/*
 * The flag options are reported as the masked flag bit itself, so a set
 * option reads back as the flag value rather than as 1.
 */
540 *mtod(m
, int *) = tp
->t_flags
& TF_NODELAY
;
543 *mtod(m
, int *) = tp
->t_maxseg
;
546 *mtod(m
, int *) = tp
->t_flags
& TF_NOOPT
;
549 *mtod(m
, int *) = tp
->t_flags
& TF_NOPUSH
;
562 * tcp_sendspace and tcp_recvspace are the default send and receive window
563 * sizes, respectively. These are obsolescent (this information should
564 * be set by the route).
/*
 * Both default to 16 KB (1024*16). They are passed to soreserve() when a
 * socket is attached with a zero high-water mark, and are exported through
 * sysctl_int() via an (int *) cast even though they are declared u_long
 * (the cast is marked XXX at the point of use).
 */
566 u_long tcp_sendspace
= 1024*16;
567 u_long tcp_recvspace
= 1024*16;
570 * Attach TCP protocol to socket, allocating
571 * internet protocol control block, tcp control block,
572 * buffer space, and entering LISTEN state if to accept connections.
/*
 * NOTE(review): the header line of this function is not visible in this
 * view. The body works on so and matches the tcp_attach(so) call made from
 * tcp_usrreq - confirm against the complete file. Error checks and returns
 * between the fragments are also missing here.
 */
578 register struct tcpcb
*tp
;
/* Reserve the default buffer sizes only when either high-water mark is still zero. */
582 if (so
->so_snd
.sb_hiwat
== 0 || so
->so_rcv
.sb_hiwat
== 0) {
583 error
= soreserve(so
, tcp_sendspace
, tcp_recvspace
);
/* Allocate the inpcb for this socket against tcbinfo. */
587 error
= in_pcballoc(so
, &tcbinfo
);
/*
 * Create the tcpcb. The statements after it save the SS_NOFDREF bit, clear
 * it, and later OR the saved bit back in; the code that runs in between is
 * not visible here (presumably the cleanup when tcp_newtcpcb() fails -
 * confirm).
 */
591 tp
= tcp_newtcpcb(inp
);
593 int nofd
= so
->so_state
& SS_NOFDREF
; /* XXX */
595 so
->so_state
&= ~SS_NOFDREF
; /* don't free the socket yet */
597 so
->so_state
|= nofd
;
/* The new control block starts in CLOSED. */
600 tp
->t_state
= TCPS_CLOSED
;
605 * Initiate (or continue) disconnect.
606 * If embryonic state, just send reset (once).
607 * If in ``let data drain'' option and linger null, just drop.
608 * Otherwise (hard), mark socket disconnecting and drop
609 * current input data; switch states based on user close, and
610 * send segment to peer (with FIN).
/*
 * NOTE(review): the header line of this function is not visible in this
 * view. The body matches the tp = tcp_disconnect(tp) calls made from
 * tcp_usrreq - confirm against the complete file.
 *
 * Three cases are visible:
 *  - state below ESTABLISHED: the action taken is not visible in this view;
 *  - SO_LINGER set with a zero linger time: tcp_drop() with error code 0;
 *  - otherwise: mark the socket disconnecting, flush the receive buffer,
 *    run tcp_usrclosed() and call tcp_output(), ignoring its result.
 */
614 register struct tcpcb
*tp
;
616 struct socket
*so
= tp
->t_inpcb
->inp_socket
;
618 if (tp
->t_state
< TCPS_ESTABLISHED
)
620 else if ((so
->so_options
& SO_LINGER
) && so
->so_linger
.tv_sec
== 0)
621 tp
= tcp_drop(tp
, 0);
623 soisdisconnecting(so
);
624 sbflush(&so
->so_rcv
);
625 tp
= tcp_usrclosed(tp
);
627 (void) tcp_output(tp
);
633 * User issued close, and wish to trail through shutdown states:
634 * if never received SYN, just forget it. If got a SYN from peer,
635 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
636 * If already got a FIN from peer, then almost done; go to LAST_ACK
637 * state. In all other cases, have already sent FIN to peer (e.g.
638 * after PRU_SHUTDOWN), and just have to play tedious game waiting
639 * for peer to send FIN or not respond to keep-alives, etc.
640 * We can let the user exit from the close as soon as the FIN is acked.
/*
 * NOTE(review): the header line of this function is not visible in this
 * view. The body matches the tp = tcp_usrclosed(tp) calls made elsewhere
 * in this file - confirm against the complete file.
 *
 * State changes visible here: one group of states (labels not visible) is
 * set to CLOSED; SYN_RECEIVED sets TF_NEEDFIN; ESTABLISHED moves to
 * FIN_WAIT_1; CLOSE_WAIT moves to LAST_ACK. Break statements and some case
 * labels are missing from this view, so fall-through cannot be judged here.
 */
644 register struct tcpcb
*tp
;
647 switch (tp
->t_state
) {
651 tp
->t_state
= TCPS_CLOSED
;
656 case TCPS_SYN_RECEIVED
:
657 tp
->t_flags
|= TF_NEEDFIN
;
660 case TCPS_ESTABLISHED
:
661 tp
->t_state
= TCPS_FIN_WAIT_1
;
664 case TCPS_CLOSE_WAIT
:
665 tp
->t_state
= TCPS_LAST_ACK
;
/* With a non-null tp whose state is FIN_WAIT_2 or higher, the socket is marked disconnected. */
668 if (tp
&& tp
->t_state
>= TCPS_FIN_WAIT_2
)
669 soisdisconnected(tp
->t_inpcb
->inp_socket
);
675 * Sysctl for tcp variables.
/*
 * tcp_sysctl: read or write TCP variables. Each visible case returns the
 * result of sysctl_int(), except one that uses sysctl_rdstruct() on
 * tcpstat. The last visible statement returns ENOPROTOOPT.
 * NOTE(review): the switch statement, several case labels and the final
 * argument of most sysctl_int() calls are not visible in this view, so the
 * variable behind each name cannot be confirmed from here.
 */
678 tcp_sysctl(name
, namelen
, oldp
, oldlenp
, newp
, newlen
)
686 /* All sysctl names at this level are terminal. */
691 case TCPCTL_DO_RFC1323
:
692 return (sysctl_int(oldp
, oldlenp
, newp
, newlen
,
694 case TCPCTL_DO_RFC1644
:
695 return (sysctl_int(oldp
, oldlenp
, newp
, newlen
,
698 return (sysctl_int(oldp
, oldlenp
, newp
, newlen
,
/* tcpstat goes through sysctl_rdstruct(), which is not passed newlen. */
701 return (sysctl_rdstruct(oldp
, oldlenp
, newp
, &tcpstat
,
704 return (sysctl_int(oldp
, oldlenp
, newp
, newlen
, &tcp_rttdflt
));
705 case TCPCTL_KEEPIDLE
:
706 return (sysctl_int(oldp
, oldlenp
, newp
, newlen
,
708 case TCPCTL_KEEPINTVL
:
709 return (sysctl_int(oldp
, oldlenp
, newp
, newlen
,
/*
 * tcp_sendspace and tcp_recvspace are declared u_long but are passed as
 * int *; the original flags both casts with XXX.
 */
711 case TCPCTL_SENDSPACE
:
712 return (sysctl_int(oldp
, oldlenp
, newp
, newlen
,
713 (int *)&tcp_sendspace
)); /* XXX */
714 case TCPCTL_RECVSPACE
:
715 return (sysctl_int(oldp
, oldlenp
, newp
, newlen
,
716 (int *)&tcp_recvspace
)); /* XXX */
718 return (ENOPROTOOPT
);