2 * Copyright (c) 1982, 1986, 1988, 1990, 1993
3 * The Regents of the University of California. All rights reserved.
4 * Copyright (c) 2006 Pavel Fedin
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * 3. All advertising materials mentioning features or use of this software
15 * must display the following acknowledgement:
16 * This product includes software developed by the University of
17 * California, Berkeley and its contributors.
18 * 4. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * @(#)tcp_subr.c 8.1 (Berkeley) 6/10/93
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/malloc.h>
42 #include <sys/socket.h>
43 #include <sys/socketvar.h>
44 #include <sys/protosw.h>
45 #include <sys/errno.h>
46 #include <sys/queue.h>
48 #include <net/route.h>
51 #include <netinet/in.h>
52 #include <netinet/in_systm.h>
53 #include <netinet/ip.h>
54 #include <netinet/in_pcb.h>
55 #include <netinet/in_var.h>
56 #include <netinet/ip_var.h>
57 #include <netinet/ip_icmp.h>
58 #include <netinet/tcp.h>
59 #include <netinet/tcp_fsm.h>
60 #include <netinet/tcp_seq.h>
61 #include <netinet/tcp_timer.h>
62 #include <netinet/tcp_var.h>
63 #include <netinet/tcpip.h>
65 #include <netinet/tcp_debug.h>
68 #include <kern/kern_subr_protos.h>
70 /* patchable/settable parameters for tcp */
71 int ip_defttl
= 60; /* default time to live for TCP segs */
72 int tcp_mssdflt
= TCP_MSS
;
73 int tcp_rttdflt
= TCPTV_SRTTDFLT
/ PR_SLOWHZ
;
74 int tcp_do_rfc1323
= 1;
75 int tcp_do_rfc1644
= 1;
76 static void tcp_cleartaocache(void);
78 extern u_char inetctlerrmap
[];
79 extern struct in_addr zeroin_addr
;
82 * Target size of TCP PCB hash table. Will be rounded down to a prime
86 #define TCBHASHSIZE 128
96 tcp_iss
= 1; /* wrong */
100 tcbinfo
.listhead
= &tcb
;
101 tcbinfo
.hashbase
= phashinit(TCBHASHSIZE
, M_PCB
, &tcbinfo
.hashsize
);
102 if (max_protohdr
< sizeof(struct tcpiphdr
))
103 max_protohdr
= sizeof(struct tcpiphdr
);
104 if (max_linkhdr
+ sizeof(struct tcpiphdr
) > MHLEN
)
109 * Create template to be used to send tcp packets on a connection.
110 * Call after host entry created, allocates an mbuf and fills
111 * in a skeletal tcp/ip header, minimizing the amount of work
112 * necessary when the connection is used.
118 register struct inpcb
*inp
= tp
->t_inpcb
;
119 register struct mbuf
*m
;
120 register struct tcpiphdr
*n
;
122 if ((n
= tp
->t_template
) == 0) {
123 m
= m_get(M_DONTWAIT
, MT_HEADER
);
126 m
->m_len
= sizeof (struct tcpiphdr
);
127 n
= mtod(m
, struct tcpiphdr
*);
129 bzero(n
->ti_x1
, sizeof(n
->ti_x1
));
130 n
->ti_pr
= IPPROTO_TCP
;
131 n
->ti_len
= htons(sizeof (struct tcpiphdr
) - sizeof (struct ip
));
132 n
->ti_src
= inp
->inp_laddr
;
133 n
->ti_dst
= inp
->inp_faddr
;
134 n
->ti_sport
= inp
->inp_lport
;
135 n
->ti_dport
= inp
->inp_fport
;
148 * Send a single message to the TCP at address specified by
149 * the given TCP/IP header. If m == 0, then we make a copy
150 * of the tcpiphdr at ti and send directly to the addressed host.
151 * This is used to force keep alive messages out using the TCP
152 * template for a connection tp->t_template. If flags are given
153 * then we send a message back to the TCP which originated the
154 * segment ti, and discard the mbuf containing it and any other
157 * In any case the ack and sequence number of the transmitted
158 * segment are as specified by the parameters.
161 tcp_respond(tp
, ti
, m
, ack
, seq
, flags
)
163 register struct tcpiphdr
*ti
;
164 register struct mbuf
*m
;
170 struct route
*ro
= 0;
173 win
= sbspace(&tp
->t_inpcb
->inp_socket
->so_rcv
);
174 ro
= &tp
->t_inpcb
->inp_route
;
177 m
= m_gethdr(M_DONTWAIT
, MT_HEADER
);
185 m
->m_data
+= max_linkhdr
;
186 *mtod(m
, struct tcpiphdr
*) = *ti
;
187 ti
= mtod(m
, struct tcpiphdr
*);
192 m
->m_data
= (caddr_t
)ti
;
193 m
->m_len
= sizeof (struct tcpiphdr
);
/*
 * Swap the values of the lvalues a and b through a temporary of the
 * given type.  The body is wrapped in do { } while (0) so that
 * "xchg(x, y, T);" is exactly one statement and stays valid in an
 * unbraced if/else.  Each argument is evaluated twice, so arguments
 * must be free of side effects.
 */
#define xchg(a, b, type) \
	do { \
		type xchg_tmp_ = (a); \
		(a) = (b); \
		(b) = xchg_tmp_; \
	} while (0)
196 xchg(ti
->ti_dst
.s_addr
, ti
->ti_src
.s_addr
, u_long
);
197 xchg(ti
->ti_dport
, ti
->ti_sport
, u_short
);
200 ti
->ti_len
= htons((u_short
)(sizeof (struct tcphdr
) + tlen
));
201 tlen
+= sizeof (struct tcpiphdr
);
203 m
->m_pkthdr
.len
= tlen
;
204 m
->m_pkthdr
.rcvif
= (struct ifnet
*) 0;
205 bzero(ti
->ti_x1
, sizeof(ti
->ti_x1
));
206 ti
->ti_seq
= htonl(seq
);
207 ti
->ti_ack
= htonl(ack
);
209 ti
->ti_off
= sizeof (struct tcphdr
) >> 2;
210 ti
->ti_flags
= flags
;
212 ti
->ti_win
= htons((u_short
) (win
>> tp
->rcv_scale
));
214 ti
->ti_win
= htons((u_short
)win
);
217 ti
->ti_sum
= in_cksum(m
, tlen
);
218 ((struct ip
*)ti
)->ip_len
= tlen
;
219 ((struct ip
*)ti
)->ip_ttl
= ip_defttl
;
221 if (tp
== NULL
|| (tp
->t_inpcb
->inp_socket
->so_options
& SO_DEBUG
))
222 tcp_trace(TA_OUTPUT
, 0, tp
, ti
, 0);
224 #ifdef ENABLE_MULTICAST
225 (void) ip_output(m
, NULL
, ro
, 0, NULL
);
227 (void) ip_output(m
, NULL
, ro
, 0);
232 * Create a new TCP control block, making an
233 * empty reassembly queue and hooking it to the argument
234 * protocol control block.
240 register struct tcpcb
*tp
;
242 tp
= bsd_malloc(sizeof(*tp
), M_PCB
, M_NOWAIT
);
244 return ((struct tcpcb
*)0);
245 bzero((char *) tp
, sizeof(struct tcpcb
));
247 tp
->t_maxseg
= tp
->t_maxopd
= tcp_mssdflt
;
250 tp
->t_flags
= (TF_REQ_SCALE
|TF_REQ_TSTMP
);
252 tp
->t_flags
|= TF_REQ_CC
;
255 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
256 * rtt estimate. Set rttvar so that srtt + 2 * rttvar gives
257 * reasonable initial retransmit time.
259 tp
->t_srtt
= TCPTV_SRTTBASE
;
260 tp
->t_rttvar
= tcp_rttdflt
* PR_SLOWHZ
<< 2;
261 tp
->t_rttmin
= TCPTV_MIN
;
262 TCPT_RANGESET(tp
->t_rxtcur
,
263 ((TCPTV_SRTTBASE
>> 2) + (TCPTV_SRTTDFLT
<< 2)) >> 1,
264 TCPTV_MIN
, TCPTV_REXMTMAX
);
265 tp
->snd_cwnd
= TCP_MAXWIN
<< TCP_MAX_WINSHIFT
;
266 tp
->snd_ssthresh
= TCP_MAXWIN
<< TCP_MAX_WINSHIFT
;
267 inp
->inp_ip
.ip_ttl
= ip_defttl
;
268 inp
->inp_ppcb
= (caddr_t
)tp
;
273 * Drop a TCP connection, reporting
274 * the specified error. If connection is synchronized,
275 * then send a RST to peer.
279 register struct tcpcb
*tp
;
282 struct socket
*so
= tp
->t_inpcb
->inp_socket
;
284 if (TCPS_HAVERCVDSYN(tp
->t_state
)) {
285 tp
->t_state
= TCPS_CLOSED
;
286 (void) tcp_output(tp
);
287 tcpstat
.tcps_drops
++;
289 tcpstat
.tcps_conndrops
++;
290 if (_errno
== ETIMEDOUT
&& tp
->t_softerror
)
291 _errno
= tp
->t_softerror
;
292 so
->so_error
= _errno
;
293 return (tcp_close(tp
));
297 * Close a TCP control block:
298 * discard all space held by the tcp
299 * discard internet protocol block
300 * wake up any sleepers
304 register struct tcpcb
*tp
;
306 struct inpcb
*inp
= tp
->t_inpcb
;
307 struct socket
*so
= inp
->inp_socket
;
308 register struct mbuf
*q
;
309 register struct mbuf
*nq
;
311 register struct rtentry
*rt
;
314 * If we sent enough data to get some meaningful characteristics,
315 * save them in the routing entry. 'Enough' is arbitrarily
316 * defined as the sendpipesize (default 4K) * 16. This would
317 * give us 16 rtt samples assuming we only get one sample per
318 * window (the usual case on a long haul net). 16 samples is
319 * enough for the srtt filter to converge to within 5% of the correct
320 * value; fewer samples and we could save a very bogus rtt.
322 * Don't update the default route's characteristics and don't
323 * update anything that the user "locked".
325 if (SEQ_LT(tp
->iss
+ so
->so_snd
.sb_hiwat
* 16, tp
->snd_max
) &&
326 (rt
= inp
->inp_route
.ro_rt
) &&
327 ((struct sockaddr_in
*)rt_key(rt
))->sin_addr
.s_addr
!= INADDR_ANY
) {
328 register u_long i
= 0;
330 if ((rt
->rt_rmx
.rmx_locks
& RTV_RTT
) == 0) {
332 (RTM_RTTUNIT
/ (PR_SLOWHZ
* TCP_RTT_SCALE
));
333 if (rt
->rt_rmx
.rmx_rtt
&& i
)
335 * filter this update to half the old & half
336 * the new values, converting scale.
337 * See route.h and tcp_var.h for a
338 * description of the scaling constants.
341 (rt
->rt_rmx
.rmx_rtt
+ i
) / 2;
343 rt
->rt_rmx
.rmx_rtt
= i
;
345 if ((rt
->rt_rmx
.rmx_locks
& RTV_RTTVAR
) == 0) {
347 (RTM_RTTUNIT
/ (PR_SLOWHZ
* TCP_RTTVAR_SCALE
));
348 if (rt
->rt_rmx
.rmx_rttvar
&& i
)
349 rt
->rt_rmx
.rmx_rttvar
=
350 (rt
->rt_rmx
.rmx_rttvar
+ i
) / 2;
352 rt
->rt_rmx
.rmx_rttvar
= i
;
355 * update the pipelimit (ssthresh) if it has been updated
356 * already or if a pipesize was specified & the threshold
357 * got below half the pipesize. I.e., wait for bad news
358 * before we start updating, then update on both good
361 if (((rt
->rt_rmx
.rmx_locks
& RTV_SSTHRESH
) == 0 &&
362 ((i
= tp
->snd_ssthresh
) != 0) && rt
->rt_rmx
.rmx_ssthresh
) ||
363 i
< (rt
->rt_rmx
.rmx_sendpipe
/ 2)) {
365 * convert the limit from user data bytes to
366 * packets then to packet data bytes.
368 i
= (i
+ tp
->t_maxseg
/ 2) / tp
->t_maxseg
;
371 i
*= (u_long
)(tp
->t_maxseg
+ sizeof (struct tcpiphdr
));
372 if (rt
->rt_rmx
.rmx_ssthresh
)
373 rt
->rt_rmx
.rmx_ssthresh
=
374 (rt
->rt_rmx
.rmx_ssthresh
+ i
) / 2;
376 rt
->rt_rmx
.rmx_ssthresh
= i
;
380 /* free the reassembly queue, if any */
381 for (q
= tp
->t_segq
; q
; q
= nq
) {
387 (void) m_free(dtom(tp
->t_template
));
390 soisdisconnected(so
);
392 tcpstat
.tcps_closed
++;
393 return ((struct tcpcb
*)0);
403 * Notify a tcp user of an asynchronous error;
404 * store error as soft error, but wake up user
405 * (for now, won't do anything until can select for soft error).
408 tcp_notify(inp
, error
)
412 register struct tcpcb
*tp
= (struct tcpcb
*)inp
->inp_ppcb
;
413 register struct socket
*so
= inp
->inp_socket
;
416 * Ignore some errors if we are hooked up.
417 * If connection hasn't completed, has retransmitted several times,
418 * and receives a second error, give up now. This is better
419 * than waiting a long time to establish a connection that
420 * can never complete.
422 if (tp
->t_state
== TCPS_ESTABLISHED
&&
423 (error
== EHOSTUNREACH
|| error
== ENETUNREACH
||
424 error
== EHOSTDOWN
)) {
426 } else if (tp
->t_state
< TCPS_ESTABLISHED
&& tp
->t_rxtshift
> 3 &&
428 so
->so_error
= error
;
430 tp
->t_softerror
= error
;
431 wakeup((caddr_t
) &so
->so_timeo
);
437 tcp_ctlinput(cmd
, sa
, arg
)
442 register struct ip
*ip
= arg
;
443 register struct tcphdr
*th
;
444 void (*notify
) __P((struct inpcb
*, int)) = tcp_notify
;
446 if (cmd
== PRC_QUENCH
)
448 else if (!PRC_IS_REDIRECT(cmd
) &&
449 ((unsigned)cmd
> PRC_NCMDS
|| inetctlerrmap
[cmd
] == 0))
452 th
= (struct tcphdr
*)((caddr_t
)ip
+ (ip
->ip_hl
<< 2));
453 in_pcbnotify(&tcb
, sa
, th
->th_dport
, ip
->ip_src
, th
->th_sport
,
456 in_pcbnotify(&tcb
, sa
, 0, zeroin_addr
, 0, cmd
, notify
);
460 * When a source quench is received, close congestion window
461 * to one segment. We will gradually open it again as we proceed.
464 tcp_quench(inp
, _errno
)
468 struct tcpcb
*tp
= intotcpcb(inp
);
471 tp
->snd_cwnd
= tp
->t_maxseg
;
475 * Look-up the routing entry to the peer of this inpcb. If no route
476 * is found and it cannot be allocated then return NULL. This routine
477 * is called by TCP routines that access the rmx structure and by tcp_mss
478 * to get the interface MTU.
487 ro
= &inp
->inp_route
;
489 if (rt
== NULL
|| !(rt
->rt_flags
& RTF_UP
)) {
490 /* No route yet, so try to acquire one */
491 if (inp
->inp_faddr
.s_addr
!= INADDR_ANY
) {
492 struct sockaddr_in
*rodst_saddr
= (struct sockaddr_in
*)&ro
->ro_dst
;
493 ro
->ro_dst
.sa_family
= AF_INET
;
494 ro
->ro_dst
.sa_len
= sizeof(ro
->ro_dst
);
495 rodst_saddr
->sin_addr
= inp
->inp_faddr
;
504 * Return a pointer to the cached information about the remote host.
505 * The cached information is stored in the protocol specific part of
512 struct rtentry
*rt
= tcp_rtlookup(inp
);
514 /* Make sure this is a host route and is up. */
516 (rt
->rt_flags
& (RTF_UP
|RTF_HOST
)) != (RTF_UP
|RTF_HOST
))
519 return rmx_taop(rt
->rt_rmx
);
523 * Clear all the TAO cache entries, called from tcp_init.
526 * This routine is just an empty one, because we assume that the
527 * routing tables are initialized at the same time as TCP, so there is
528 * nothing in the cache left over.
531 tcp_cleartaocache(void)