1 /* $NetBSD: tcp_sack.c,v 1.24 2008/04/28 20:24:09 martin Exp $ */
4 * Copyright (c) 2005 The NetBSD Foundation, Inc.
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Kentaro A. Kurahone.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
33 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
34 * The Regents of the University of California. All rights reserved.
36 * Redistribution and use in source and binary forms, with or without
37 * modification, are permitted provided that the following conditions
39 * 1. Redistributions of source code must retain the above copyright
40 * notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright
42 * notice, this list of conditions and the following disclaimer in the
43 * documentation and/or other materials provided with the distribution.
44 * 4. Neither the name of the University nor the names of its contributors
45 * may be used to endorse or promote products derived from this software
46 * without specific prior written permission.
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
60 * @(#)tcp_sack.c 8.12 (Berkeley) 5/24/95
61 * $FreeBSD: src/sys/netinet/tcp_sack.c,v 1.3.2.2 2004/12/25 23:02:57 rwatson Exp $
65 * @@(#)COPYRIGHT 1.1 (NRL) 17 January 1995
67 * NRL grants permission for redistribution and use in source and binary
68 * forms, with or without modification, of the software and documentation
69 * created at NRL provided that the following conditions are met:
71 * 1. Redistributions of source code must retain the above copyright
72 * notice, this list of conditions and the following disclaimer.
73 * 2. Redistributions in binary form must reproduce the above copyright
74 * notice, this list of conditions and the following disclaimer in the
75 * documentation and/or other materials provided with the distribution.
76 * 3. All advertising materials mentioning features or use of this software
77 * must display the following acknowledgements:
78 * This product includes software developed by the University of
79 * California, Berkeley and its contributors.
80 * This product includes software developed at the Information
81 * Technology Division, US Naval Research Laboratory.
82 * 4. Neither the name of the NRL nor the names of its contributors
83 * may be used to endorse or promote products derived from this software
84 * without specific prior written permission.
86 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
87 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
88 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
89 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
90 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
91 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
92 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
93 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
94 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
95 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
96 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
98 * The views and conclusions contained in the software and documentation
99 * are those of the authors and should not be interpreted as representing
100 * official policies, either expressed or implied, of the US Naval
101 * Research Laboratory (NRL).
104 #include <sys/cdefs.h>
105 __KERNEL_RCSID(0, "$NetBSD: tcp_sack.c,v 1.24 2008/04/28 20:24:09 martin Exp $");
107 #include "opt_inet.h"
108 #include "opt_ipsec.h"
109 #include "opt_inet_csum.h"
110 #include "opt_tcp_debug.h"
113 #include <sys/param.h>
114 #include <sys/systm.h>
115 #include <sys/malloc.h>
116 #include <sys/mbuf.h>
117 #include <sys/protosw.h>
118 #include <sys/socket.h>
119 #include <sys/socketvar.h>
120 #include <sys/errno.h>
121 #include <sys/syslog.h>
122 #include <sys/pool.h>
123 #include <sys/domain.h>
124 #include <sys/kernel.h>
127 #include <net/route.h>
128 #include <net/if_types.h>
130 #include <netinet/in.h>
131 #include <netinet/in_systm.h>
132 #include <netinet/ip.h>
133 #include <netinet/in_pcb.h>
134 #include <netinet/in_var.h>
135 #include <netinet/ip_var.h>
139 #include <netinet/in.h>
141 #include <netinet/ip6.h>
142 #include <netinet6/ip6_var.h>
143 #include <netinet6/in6_pcb.h>
144 #include <netinet6/ip6_var.h>
145 #include <netinet6/in6_var.h>
146 #include <netinet/icmp6.h>
147 #include <netinet6/nd6.h>
151 /* always need ip6.h for IP6_EXTHDR_GET */
152 #include <netinet/ip6.h>
155 #include <netinet/tcp.h>
156 #include <netinet/tcp_fsm.h>
157 #include <netinet/tcp_seq.h>
158 #include <netinet/tcp_timer.h>
159 #include <netinet/tcp_var.h>
160 #include <netinet/tcpip.h>
161 #include <netinet/tcp_debug.h>
163 #include <machine/stdarg.h>
/*
 * File-local pool from which every struct sackhole in this file is
 * obtained (pool_get) and to which it is given back (pool_put).
 */
165 /* SACK block pool. */
166 static struct pool sackhole_pool
;
/*
 * Set up sackhole_pool with elements of sizeof(struct sackhole), the
 * label sackholepl and IPL_SOFTNET.
 * NOTE(review): the header of the function that contains this call is
 * not visible in this listing -- confirm its name and signature against
 * the full source.
 */
172 pool_init(&sackhole_pool
, sizeof(struct sackhole
), 0, 0, 0,
173 "sackholepl", NULL
, IPL_SOFTNET
);
/*
 * sack_allochole: obtain a fresh struct sackhole for connection tp.
 *
 * The visible code first compares the per-connection hole count
 * (tp->snd_numholes) with tcp_sack_tp_maxholes and the global count
 * (tcp_sack_globalholes) with tcp_sack_globalmaxholes, then takes an
 * element from sackhole_pool without sleeping (PR_NOWAIT) and bumps the
 * global counter.
 *
 * NOTE(review): this listing omits several lines of the function (the
 * body of the limit check, any handling of a failed pool_get(), and the
 * return statements). Presumably NULL is returned when a limit is hit
 * or the pool is exhausted, and tp->snd_numholes is incremented next to
 * the global counter -- confirm against the full source.
 */
176 static struct sackhole
*
177 sack_allochole(struct tcpcb
*tp
)
179 struct sackhole
*hole
;
/* Refuse to grow past the per-connection or the global hole limit. */
181 if (tp
->snd_numholes
>= tcp_sack_tp_maxholes
||
182 tcp_sack_globalholes
>= tcp_sack_globalmaxholes
) {
/* Non-blocking allocation from the pool. */
185 hole
= pool_get(&sackhole_pool
, PR_NOWAIT
);
190 tcp_sack_globalholes
++;
/*
 * sack_inserthole: allocate a hole and link it into tp->snd_holes.
 *
 * tp     connection whose hole list is extended
 * start  value stored in both hole->start and hole->rxmit
 * end    upper bound of the hole (NOTE(review): the assignment that
 *        stores it is on a line missing from this listing)
 * prev   list element the new hole is inserted after
 *
 * Two insertion calls are visible: TAILQ_INSERT_AFTER relative to prev
 * and TAILQ_INSERT_TAIL. NOTE(review): the condition that chooses
 * between them is not visible; presumably a NULL prev selects the tail
 * insertion (callers in this file pass NULL when appending). The
 * handling of a failed sack_allochole() and the return value are also
 * not visible; callers assign the result to a struct sackhole pointer.
 */
195 static struct sackhole
*
196 sack_inserthole(struct tcpcb
*tp
, tcp_seq start
, tcp_seq end
,
197 struct sackhole
*prev
)
199 struct sackhole
*hole
;
201 hole
= sack_allochole(tp
);
/* Nothing of a new hole has been retransmitted: rxmit begins at start. */
205 hole
->start
= hole
->rxmit
= start
;
208 TAILQ_INSERT_AFTER(&tp
->snd_holes
, prev
, hole
, sackhole_q
);
210 TAILQ_INSERT_TAIL(&tp
->snd_holes
, hole
, sackhole_q
);
/*
 * sack_removehole: unlink hole from tp->snd_holes and give it back to
 * sackhole_pool, decrementing tcp_sack_globalholes.
 *
 * The successor of hole is fetched before the unlink. NOTE(review): the
 * return statement is not visible in this listing; presumably that
 * successor is returned, since callers in this file assign the result
 * to their list cursor. A matching decrement of tp->snd_numholes is
 * presumably on a missing line as well -- confirm.
 */
215 static struct sackhole
*
216 sack_removehole(struct tcpcb
*tp
, struct sackhole
*hole
)
218 struct sackhole
*next
;
/* Must be read before TAILQ_REMOVE and pool_put touch the element. */
220 next
= TAILQ_NEXT(hole
, sackhole_q
);
222 tcp_sack_globalholes
--;
223 TAILQ_REMOVE(&tp
->snd_holes
, hole
, sackhole_q
);
224 pool_put(&sackhole_pool
, hole
);
/*
 * tcp_new_dsack: when TCP_SACK_ENABLED(tp) holds, store the range
 * [seq, seq + len) in tp->rcv_dsack_block (left = seq, right = seq + len)
 * and set TCPSACK_HAVED in tp->rcv_sack_flags. Does nothing otherwise.
 *
 * NOTE(review): the return type line and the closing braces are not
 * visible in this listing.
 */
230 tcp_new_dsack(struct tcpcb
*tp
, tcp_seq seq
, u_int32_t len
)
232 if (TCP_SACK_ENABLED(tp
)) {
233 tp
->rcv_dsack_block
.left
= seq
;
234 tp
->rcv_dsack_block
.right
= seq
+ len
;
235 tp
->rcv_sack_flags
|= TCPSACK_HAVED
;
/*
 * tcp_sack_option: process a received SACK option and update the
 * scoreboard of holes kept in tp->snd_holes.
 *
 * tp      connection being updated
 * th      TCP header of the segment that carried the option
 * cp      start of the option; block data is read from cp + 2 onwards
 * optlen  option length; used below, but its declaration is on a line
 *         missing from this listing
 *
 * Visible steps:
 *  1. Guard checks: SACK enabled on tp, TH_ACK set, optlen of the form
 *     8 * n + 2 with optlen >= 10, both hole limits non-zero, and
 *     th_ack inside [snd_una, snd_max].
 *  2. Extraction: optlen / 8 blocks are copied out as pairs of 32-bit
 *     values (left, right). A block is discarded when right <= acked,
 *     right > snd_max or left >= right, where acked is the later of
 *     th_ack and snd_una. Surviving blocks are insertion-sorted by left
 *     edge into t_sack_block.
 *  3. Scoreboard update, one pass per sorted block: snd_fack is raised
 *     to the right edge of the block; on an empty list a first hole
 *     [th_ack, left) is created; otherwise the hole list is walked and
 *     holes are skipped, removed, trimmed at the front, trimmed at the
 *     back, or split in two; finally a hole [rcv_lastsack, left) is
 *     appended when the block starts beyond rcv_lastsack, and
 *     rcv_lastsack is advanced to the right edge of the block.
 *
 * NOTE(review): this listing omits many lines of the function (return,
 * continue and break statements, the header of the hole-walking loop,
 * closing braces, the decrement part of the insertion-sort loop, and
 * the checks after sack_inserthole()). The description above covers
 * only what is visible. Only the ntohl() conversion of right is
 * visible; left presumably receives the same conversion on a missing
 * line -- confirm against the full source.
 * NOTE(review): lp is declared const char * while cp is const u_char *,
 * so the initialisation converts between pointer types of different
 * signedness.
 */
240 tcp_sack_option(struct tcpcb
*tp
, const struct tcphdr
*th
, const u_char
*cp
,
244 t_sack_block
[(MAX_TCPOPTLEN
- 2) / (sizeof(u_int32_t
) * 2)];
245 struct sackblk
*sack
= NULL
;
246 struct sackhole
*cur
= NULL
;
247 struct sackhole
*tmp
= NULL
;
248 const char *lp
= cp
+ 2;
249 int i
, j
, num_sack_blks
;
250 tcp_seq left
, right
, acked
;
253 * If we aren't processing SACK responses, this is not an ACK
254 * or the peer sends us a sack option with invalid length, don't
255 * update the scoreboard.
257 if (!TCP_SACK_ENABLED(tp
) || ((th
->th_flags
& TH_ACK
) == 0) ||
258 (optlen
% 8 != 2 || optlen
< 10)) {
263 * If we don't want any SACK holes to be allocated, just return.
265 if (tcp_sack_globalmaxholes
== 0 || tcp_sack_tp_maxholes
== 0) {
269 /* If the ACK is outside [snd_una, snd_max], ignore the SACK options. */
270 if (SEQ_LT(th
->th_ack
, tp
->snd_una
) || SEQ_GT(th
->th_ack
, tp
->snd_max
))
274 * Extract SACK blocks.
276 * Note that t_sack_block is sorted so that we only need to do
277 * one pass over the sequence number space. (SACK "fast-path")
279 num_sack_blks
= optlen
/ 8;
280 acked
= (SEQ_GT(th
->th_ack
, tp
->snd_una
)) ? th
->th_ack
: tp
->snd_una
;
281 for (i
= 0; i
< num_sack_blks
; i
++, lp
+= sizeof(uint32_t) * 2) {
282 memcpy(&left
, lp
, sizeof(uint32_t));
283 memcpy(&right
, lp
+ sizeof(uint32_t), sizeof(uint32_t));
285 right
= ntohl(right
);
287 if (SEQ_LEQ(right
, acked
) || SEQ_GT(right
, tp
->snd_max
) ||
288 SEQ_GEQ(left
, right
)) {
289 /* SACK entry that's old, or invalid. */
295 /* Insertion sort. */
/* Shift entries with a larger left edge up by one slot. */
296 for (j
= i
; (j
> 0) && SEQ_LT(left
, t_sack_block
[j
- 1].left
);
298 t_sack_block
[j
].left
= t_sack_block
[j
- 1].left
;
299 t_sack_block
[j
].right
= t_sack_block
[j
- 1].right
;
301 t_sack_block
[j
].left
= left
;
302 t_sack_block
[j
].right
= right
;
305 /* Update the scoreboard. */
/*
 * cur is initialised once, outside the per-block loop; together with
 * the sorted block array this is what allows a single pass over the
 * hole list.
 */
306 cur
= TAILQ_FIRST(&tp
->snd_holes
);
307 for (i
= 0; i
< num_sack_blks
; i
++) {
308 sack
= &t_sack_block
[i
];
310 * FACK TCP. Update snd_fack so we can enter Fast
313 if (SEQ_GEQ(sack
->right
, tp
->snd_fack
))
314 tp
->snd_fack
= sack
->right
;
/* Empty scoreboard: the first hole runs from th_ack to this left edge. */
316 if (TAILQ_EMPTY(&tp
->snd_holes
)) {
318 cur
= sack_inserthole(tp
, th
->th_ack
, sack
->left
, NULL
);
320 /* ENOBUFS, bail out*/
323 tp
->rcv_lastsack
= sack
->right
;
324 continue; /* With next sack block */
327 /* Go through the list of holes. */
329 if (SEQ_LEQ(sack
->right
, cur
->start
))
330 /* SACKs data before the current hole */
331 break; /* No use going through more holes */
333 if (SEQ_GEQ(sack
->left
, cur
->end
)) {
334 /* SACKs data beyond the current hole */
335 cur
= TAILQ_NEXT(cur
, sackhole_q
);
339 if (SEQ_LEQ(sack
->left
, cur
->start
)) {
340 /* Data acks at least the beginning of hole */
341 if (SEQ_GEQ(sack
->right
, cur
->end
)) {
342 /* Acks entire hole, so delete hole */
343 cur
= sack_removehole(tp
, cur
);
347 /* Otherwise, move start of hole forward */
348 cur
->start
= sack
->right
;
/* Keep the retransmit point inside the shrunken hole. */
349 cur
->rxmit
= SEQ_MAX(cur
->rxmit
, cur
->start
);
353 if (SEQ_GEQ(sack
->right
, cur
->end
)) {
354 /* Move end of hole backward. */
355 cur
->end
= sack
->left
;
356 cur
->rxmit
= SEQ_MIN(cur
->rxmit
, cur
->end
);
357 cur
= TAILQ_NEXT(cur
, sackhole_q
);
361 if (SEQ_LT(cur
->start
, sack
->left
) &&
362 SEQ_GT(cur
->end
, sack
->right
)) {
364 * ACKs some data in middle of a hole; need to
367 tmp
= sack_inserthole(tp
, sack
->right
, cur
->end
,
/*
 * tmp is the new upper hole [sack->right, old cur->end); cur is cut
 * back to end at sack->left. Each half gets a retransmit point bounded
 * by its own range (SEQ_MAX for the upper half, SEQ_MIN for the lower).
 */
372 tmp
->rxmit
= SEQ_MAX(cur
->rxmit
, tmp
->start
);
373 cur
->end
= sack
->left
;
374 cur
->rxmit
= SEQ_MIN(cur
->rxmit
, cur
->end
);
380 /* At this point, we have reached the tail of the list. */
381 if (SEQ_LT(tp
->rcv_lastsack
, sack
->left
)) {
383 * Need to append new hole at end.
385 cur
= sack_inserthole(tp
, tp
->rcv_lastsack
, sack
->left
,
/* rcv_lastsack only ever moves forward here. */
391 if (SEQ_LT(tp
->rcv_lastsack
, sack
->right
)) {
392 tp
->rcv_lastsack
= sack
->right
;
/*
 * tcp_del_sackholes: discard or trim holes that the cumulative ack in
 * th has made obsolete.
 *
 * lastack is the later of th->th_ack and tp->snd_una. Starting from the
 * first hole:
 *  - a hole whose end is at or before lastack is removed;
 *  - a hole that starts before lastack has its start moved up to
 *    lastack, and its rxmit pulled up to the new start if it lagged
 *    behind.
 *
 * NOTE(review): the loop header, the remaining branch and the closing
 * braces are on lines missing from this listing, so how the walk
 * advances and terminates is not visible here.
 */
398 tcp_del_sackholes(struct tcpcb
*tp
, const struct tcphdr
*th
)
400 /* Max because this could be an older ack that just arrived. */
401 tcp_seq lastack
= SEQ_GT(th
->th_ack
, tp
->snd_una
) ?
402 th
->th_ack
: tp
->snd_una
;
403 struct sackhole
*cur
= TAILQ_FIRST(&tp
->snd_holes
);
/* Hole lies entirely at or below the ack point: drop it. */
406 if (SEQ_LEQ(cur
->end
, lastack
)) {
407 cur
= sack_removehole(tp
, cur
);
/* Hole straddles the ack point: keep only the part above it. */
408 } else if (SEQ_LT(cur
->start
, lastack
)) {
409 cur
->start
= lastack
;
410 if (SEQ_LT(cur
->rxmit
, cur
->start
))
411 cur
->rxmit
= cur
->start
;
/*
 * tcp_free_sackholes: release every hole on tp->snd_holes by repeatedly
 * removing the first element until the list is empty, then assert that
 * tp->snd_numholes has dropped to zero.
 *
 * NOTE(review): the return type line and closing braces are not visible
 * in this listing.
 */
419 tcp_free_sackholes(struct tcpcb
*tp
)
421 struct sackhole
*sack
;
423 /* Free up the SACK hole list. */
424 while ((sack
= TAILQ_FIRST(&tp
->snd_holes
)) != NULL
) {
425 sack_removehole(tp
, sack
);
/* The per-connection counter must agree with the now-empty list. */
427 KASSERT(tp
->snd_numholes
== 0);
/*
 * tcp_sack_newack: react to an ack of new data on a SACK connection.
 *
 * Three cases are distinguished:
 *  - tp->t_partialacks < 0: not in fast recovery (the statements of
 *    this branch are on lines missing from this listing);
 *  - th->th_ack < tp->snd_recover: partial ack. The retransmit timer is
 *    disarmed, tcp_sack_output() is asked for the number of bytes
 *    already retransmitted, snd_cwnd is set to
 *        sack_bytes_rxmt + (snd_nxt - sack_newdata) + num_segs * t_segsz
 *    TF_ACKNOW is set and tcp_output() is called;
 *  - otherwise: complete ack. snd_cwnd is derived from the outstanding
 *    data (snd_max - th_ack) when that is below snd_ssthresh, else it
 *    is set to snd_ssthresh, and t_partialacks is reset to -1.
 * Finally snd_fack is raised to th_ack if it was behind.
 *
 * NOTE(review): the declaration and assignments of num_segs, the tail
 * of the snd_cwnd expression in the complete-ack branch (the visible
 * statement ends without a semicolon), the else keywords and the
 * braces are on missing lines -- confirm against the full source.
 */
431 * Implements the SACK response to a new ack, checking for partial acks
435 tcp_sack_newack(struct tcpcb
*tp
, const struct tcphdr
*th
)
437 if (tp
->t_partialacks
< 0) {
439 * Not in fast recovery. Reset the duplicate ack
443 } else if (SEQ_LT(th
->th_ack
, tp
->snd_recover
)) {
445 * Partial ack handling within a sack recovery episode.
446 * Keeping this very simple for now. When a partial ack
447 * is received, force snd_cwnd to a value that will allow
448 * the sender to transmit no more than 2 segments.
449 * If necessary, a fancier scheme can be adopted at a
450 * later point, but for now, the goal is to prevent the
451 * sender from bursting a large amount of data in the midst
455 int sack_bytes_rxmt
= 0;
458 TCP_TIMER_DISARM(tp
, TCPT_REXMT
);
462 * send one or 2 segments based on how much new data was acked
/* More than two full segments newly acked selects the larger burst. */
464 if (((th
->th_ack
- tp
->snd_una
) / tp
->t_segsz
) > 2)
/* Called only for its out-parameter; the returned hole is ignored. */
466 (void)tcp_sack_output(tp
, &sack_bytes_rxmt
);
467 tp
->snd_cwnd
= sack_bytes_rxmt
+
468 (tp
->snd_nxt
- tp
->sack_newdata
) + num_segs
* tp
->t_segsz
;
469 tp
->t_flags
|= TF_ACKNOW
;
470 (void) tcp_output(tp
);
473 * Complete ack, inflate the congestion window to
474 * ssthresh and exit fast recovery.
476 * Window inflation should have left us with approx.
477 * snd_ssthresh outstanding data. But in case we
478 * would be inclined to send a burst, better to do
479 * it via the slow start mechanism.
481 if (SEQ_SUB(tp
->snd_max
, th
->th_ack
) < tp
->snd_ssthresh
)
482 tp
->snd_cwnd
= SEQ_SUB(tp
->snd_max
, th
->th_ack
)
485 tp
->snd_cwnd
= tp
->snd_ssthresh
;
/* A negative t_partialacks is what the first branch above tests for. */
486 tp
->t_partialacks
= -1;
488 if (SEQ_GT(th
->th_ack
, tp
->snd_fack
))
489 tp
->snd_fack
= th
->th_ack
;
/*
 * tcp_sack_output: scan tp->snd_holes for a hole that still has data to
 * retransmit (rxmit < end) and report, through *sack_bytes_rexmt, the
 * sum of (rxmit - start) over the holes visited.
 *
 * tp                connection to scan
 * sack_bytes_rexmt  out-parameter; reset to 0 before the scan
 *
 * NOTE(review): the return type line, all return statements, the body
 * of the rxmit < snd_una branch and the braces are on lines missing
 * from this listing. The fragment of the original comment below says a
 * pointer to a hole is returned when retransmissions are pending; what
 * is returned otherwise (and when SACK is disabled) is not visible.
 * Which of the two accumulation statements runs for a given hole
 * depends on that missing control flow -- confirm against the full
 * source.
 */
494 * Returns pointer to a sackhole if there are any pending retransmissions;
498 tcp_sack_output(struct tcpcb
*tp
, int *sack_bytes_rexmt
)
500 struct sackhole
*cur
= NULL
;
502 if (!TCP_SACK_ENABLED(tp
))
505 *sack_bytes_rexmt
= 0;
506 TAILQ_FOREACH(cur
, &tp
->snd_holes
, sackhole_q
) {
/* Hole not yet fully retransmitted. */
507 if (SEQ_LT(cur
->rxmit
, cur
->end
)) {
/* Retransmit point has fallen behind the cumulative ack. */
508 if (SEQ_LT(cur
->rxmit
, tp
->snd_una
)) {
512 *sack_bytes_rexmt
+= (cur
->rxmit
- cur
->start
);
515 *sack_bytes_rexmt
+= (cur
->rxmit
- cur
->start
);
/*
 * tcp_sack_adjust: move tp->snd_nxt forward past data the peer has
 * already SACKed.
 *
 * Returns immediately when there are no holes or when snd_nxt is
 * already at or beyond rcv_lastsack. Otherwise the hole list is walked
 * pairwise (cur and its successor n): snd_nxt may be set to n->start
 * inside the loop, and to rcv_lastsack after it.
 *
 * NOTE(review): the statements guarded by the three SEQ comparisons
 * inside and after the loop, the step that advances cur, and the braces
 * are on lines missing from this listing, so the exact exit conditions
 * are not visible here.
 */
522 * After a timeout, the SACK list may be rebuilt. This SACK information
523 * should be used to avoid retransmitting SACKed data. This function
524 * traverses the SACK list to see if snd_nxt should be moved forward.
527 tcp_sack_adjust(struct tcpcb
*tp
)
529 struct sackhole
*cur
= TAILQ_FIRST(&tp
->snd_holes
);
530 struct sackhole
*n
= NULL
;
532 if (TAILQ_EMPTY(&tp
->snd_holes
))
533 return; /* No holes */
534 if (SEQ_GEQ(tp
->snd_nxt
, tp
->rcv_lastsack
))
535 return; /* We're already beyond any SACKed blocks */
538 * Two cases for which we want to advance snd_nxt:
539 * i) snd_nxt lies between end of one hole and beginning of another
540 * ii) snd_nxt lies between end of last hole and rcv_lastsack
/* The loop runs while cur has a successor; n is that successor. */
542 while ((n
= TAILQ_NEXT(cur
, sackhole_q
)) != NULL
) {
543 if (SEQ_LT(tp
->snd_nxt
, cur
->end
))
545 if (SEQ_GEQ(tp
->snd_nxt
, n
->start
))
/* Case i: jump to the start of the next hole. */
548 tp
->snd_nxt
= n
->start
;
552 if (SEQ_LT(tp
->snd_nxt
, cur
->end
))
/* Case ii: jump to the highest SACKed sequence number. */
554 tp
->snd_nxt
= tp
->rcv_lastsack
;
/*
 * tcp_sack_numblks: compute a SACK block count for tp, capped at
 * TCP_SACK_MAX.
 *
 * The visible part of the sum contributes 1 when TCPSACK_HAVED is set
 * in tp->rcv_sack_flags and 0 otherwise.
 *
 * NOTE(review): the return type line, the declaration of numblks, the
 * body of the SACK-disabled branch, the remaining terms of the sum and
 * the return statement are on lines missing from this listing --
 * confirm against the full source.
 */
560 tcp_sack_numblks(const struct tcpcb
*tp
)
564 if (!TCP_SACK_ENABLED(tp
)) {
568 numblks
= (((tp
->rcv_sack_flags
& TCPSACK_HAVED
) != 0) ? 1 : 0) +
/* Clamp to the maximum block count. */
575 if (numblks
> TCP_SACK_MAX
) {
576 numblks
= TCP_SACK_MAX
;
/*
 * sack_dump: print the SACK state of tp with printf: snd_una, snd_max,
 * rcv_lastsack, snd_fack, the hole count, and then one line per hole
 * giving its start, end and rxmit values.
 *
 * The prototype precedes the definition. NOTE(review): the closing
 * preprocessor line after this function mentions DDB, so the function
 * is presumably compiled only when DDB is defined; the opening
 * conditional is not visible in this listing.
 */
583 void sack_dump(const struct tcpcb
*);
586 sack_dump(const struct tcpcb
*tp
)
588 const struct sackhole
*cur
;
590 printf("snd_una=%" PRIu32
", snd_max=%" PRIu32
"\n",
591 tp
->snd_una
, tp
->snd_max
);
592 printf("rcv_lastsack=%" PRIu32
", snd_fack=%" PRIu32
"\n",
593 tp
->rcv_lastsack
, tp
->snd_fack
);
594 printf("numholes=%d\n", tp
->snd_numholes
);
/* One tab-indented line per hole, in list order. */
595 TAILQ_FOREACH(cur
, &tp
->snd_holes
, sackhole_q
) {
596 printf("\t%" PRIu32
"-%" PRIu32
", rxmit=%" PRIu32
"\n",
597 cur
->start
, cur
->end
, cur
->rxmit
);
600 #endif /* defined(DDB) */