/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
 *	The Regents of the University of California. All rights reserved.
 * Copyright (c) 2007-2008,2010
 *	Swinburne University of Technology, Melbourne, Australia.
 * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
 * Copyright (c) 2010 The FreeBSD Foundation
 * Copyright (c) 2010-2011 Juniper Networks, Inc.
 * Copyright (c) 2019 Richard Scheffenegger <srichard@netapp.com>
 * All rights reserved.
 *
 * Portions of this software were developed at the Centre for Advanced Internet
 * Architectures, Swinburne University of Technology, by Lawrence Stewart,
 * James Healy and David Hayes, made possible in part by a grant from the Cisco
 * University Research Program Fund at Community Foundation Silicon Valley.
 *
 * Portions of this software were developed at the Centre for Advanced
 * Internet Architectures, Swinburne University of Technology, Melbourne,
 * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
 *
 * Portions of this software were developed by Robert N. M. Watson under
 * contract to Juniper Networks, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * Utility functions to deal with Explicit Congestion Notification in TCP
 * implementing the essential parts of the Accurate ECN extension
 * https://tools.ietf.org/html/draft-ietf-tcpm-accurate-ecn-09
 */
57 #include <sys/cdefs.h>
59 #include "opt_inet6.h"
61 #include <sys/param.h>
62 #include <sys/systm.h>
63 #include <sys/kernel.h>
64 #include <sys/sysctl.h>
65 #include <sys/malloc.h>
67 #include <sys/socket.h>
68 #include <sys/socketvar.h>
70 #include <machine/cpu.h>
75 #include <net/if_var.h>
76 #include <net/route.h>
79 #include <netinet/in.h>
80 #include <netinet/in_systm.h>
81 #include <netinet/ip.h>
82 #include <netinet/in_var.h>
83 #include <netinet/in_pcb.h>
84 #include <netinet/ip_var.h>
85 #include <netinet/ip6.h>
86 #include <netinet/icmp6.h>
87 #include <netinet6/nd6.h>
88 #include <netinet6/ip6_var.h>
89 #include <netinet6/in6_pcb.h>
90 #include <netinet/tcp.h>
91 #include <netinet/tcp_fsm.h>
92 #include <netinet/tcp_seq.h>
93 #include <netinet/tcp_var.h>
94 #include <netinet/tcp_syncache.h>
95 #include <netinet/tcp_timer.h>
96 #include <netinet/tcpip.h>
97 #include <netinet/tcp_ecn.h>
/*
 * File-local helpers that read and write the 3-bit ACE field formed by
 * the AE, CWR and ECE TCP header flags.
 */
static inline int	tcp_ecn_get_ace(uint16_t thflags);
static inline void	tcp_ecn_set_ace(uint16_t *thflags, uint32_t t_rcep);
102 static SYSCTL_NODE(_net_inet_tcp
, OID_AUTO
, ecn
,
103 CTLFLAG_RW
| CTLFLAG_MPSAFE
, 0,
106 VNET_DEFINE(int, tcp_do_ecn
) = 2;
107 SYSCTL_INT(_net_inet_tcp_ecn
, OID_AUTO
, enable
,
108 CTLFLAG_VNET
| CTLFLAG_RW
, &VNET_NAME(tcp_do_ecn
), 0,
111 VNET_DEFINE(int, tcp_ecn_maxretries
) = 1;
112 SYSCTL_INT(_net_inet_tcp_ecn
, OID_AUTO
, maxretries
,
113 CTLFLAG_VNET
| CTLFLAG_RW
, &VNET_NAME(tcp_ecn_maxretries
), 0,
114 "Max retries before giving up on ECN");
117 * Process incoming SYN,ACK packet
120 tcp_ecn_input_syn_sent(struct tcpcb
*tp
, uint16_t thflags
, int iptos
)
122 switch (V_tcp_do_ecn
) {
128 /* RFC3168 ECN handling */
129 if ((thflags
& (TH_CWR
| TH_ECE
)) == (0 | TH_ECE
)) {
130 tp
->t_flags2
|= TF2_ECN_PERMIT
;
131 tp
->t_flags2
&= ~TF2_ACE_PERMIT
;
132 TCPSTAT_INC(tcps_ecn_shs
);
139 * Decoding Accurate ECN according to
140 * table in section 3.1.1
142 * On the SYN,ACK, process the AccECN
143 * flags indicating the state the SYN
145 * Reactions to Path ECN mangling can
148 switch (thflags
& (TH_AE
| TH_CWR
| TH_ECE
)) {
151 tp
->t_flags2
|= TF2_ECN_PERMIT
;
152 tp
->t_flags2
&= ~TF2_ACE_PERMIT
;
153 TCPSTAT_INC(tcps_ecn_shs
);
157 tp
->t_flags2
|= TF2_ACE_PERMIT
;
158 tp
->t_flags2
&= ~TF2_ECN_PERMIT
;
160 TCPSTAT_INC(tcps_ecn_shs
);
161 TCPSTAT_INC(tcps_ace_nect
);
165 tp
->t_flags2
|= TF2_ACE_PERMIT
;
166 tp
->t_flags2
&= ~TF2_ECN_PERMIT
;
168 TCPSTAT_INC(tcps_ecn_shs
);
169 TCPSTAT_INC(tcps_ace_ect0
);
172 case (0|TH_CWR
|TH_ECE
):
173 tp
->t_flags2
|= TF2_ACE_PERMIT
;
174 tp
->t_flags2
&= ~TF2_ECN_PERMIT
;
176 TCPSTAT_INC(tcps_ecn_shs
);
177 TCPSTAT_INC(tcps_ace_ect1
);
180 case (TH_AE
|TH_CWR
|0):
181 tp
->t_flags2
|= TF2_ACE_PERMIT
;
182 tp
->t_flags2
&= ~TF2_ECN_PERMIT
;
185 * reduce the IW to 2 MSS (to
186 * account for delayed acks) if
187 * the SYN,ACK was CE marked
189 tp
->snd_cwnd
= 2 * tcp_maxseg(tp
);
190 TCPSTAT_INC(tcps_ecn_shs
);
191 TCPSTAT_INC(tcps_ace_nect
);
194 tp
->t_flags2
&= ~(TF2_ECN_PERMIT
| TF2_ACE_PERMIT
);
198 * Set the AccECN Codepoints on
199 * the outgoing <ACK> to the ECN
200 * state of the <SYN,ACK>
201 * according to table 3 in the
204 switch (iptos
& IPTOS_ECN_MASK
) {
205 case (IPTOS_ECN_NOTECT
):
208 case (IPTOS_ECN_ECT0
):
211 case (IPTOS_ECN_ECT1
):
223 * Handle parallel SYN for ECN
226 tcp_ecn_input_parallel_syn(struct tcpcb
*tp
, uint16_t thflags
, int iptos
)
228 if (thflags
& TH_ACK
)
230 switch (V_tcp_do_ecn
) {
236 /* RFC3168 ECN handling */
237 if ((thflags
& (TH_CWR
| TH_ECE
)) == (TH_CWR
| TH_ECE
)) {
238 tp
->t_flags2
|= TF2_ECN_PERMIT
;
239 tp
->t_flags2
&= ~TF2_ACE_PERMIT
;
240 tp
->t_flags2
|= TF2_ECN_SND_ECE
;
241 TCPSTAT_INC(tcps_ecn_shs
);
247 /* AccECN handling */
248 switch (thflags
& (TH_AE
| TH_CWR
| TH_ECE
)) {
251 tp
->t_flags2
&= ~(TF2_ECN_PERMIT
| TF2_ACE_PERMIT
);
253 case (0|TH_CWR
|TH_ECE
):
254 tp
->t_flags2
|= TF2_ECN_PERMIT
;
255 tp
->t_flags2
&= ~TF2_ACE_PERMIT
;
256 tp
->t_flags2
|= TF2_ECN_SND_ECE
;
257 TCPSTAT_INC(tcps_ecn_shs
);
259 case (TH_AE
|TH_CWR
|TH_ECE
):
260 tp
->t_flags2
|= TF2_ACE_PERMIT
;
261 tp
->t_flags2
&= ~TF2_ECN_PERMIT
;
262 TCPSTAT_INC(tcps_ecn_shs
);
264 * Set the AccECN Codepoints on
265 * the outgoing <ACK> to the ECN
266 * state of the <SYN,ACK>
267 * according to table 3 in the
270 switch (iptos
& IPTOS_ECN_MASK
) {
271 case (IPTOS_ECN_NOTECT
):
274 case (IPTOS_ECN_ECT0
):
277 case (IPTOS_ECN_ECT1
):
291 * TCP ECN processing.
294 tcp_ecn_input_segment(struct tcpcb
*tp
, uint16_t thflags
, int tlen
, int pkts
, int iptos
)
298 switch (iptos
& IPTOS_ECN_MASK
) {
300 TCPSTAT_INC(tcps_ecn_rcvce
);
303 TCPSTAT_INC(tcps_ecn_rcvect0
);
306 TCPSTAT_INC(tcps_ecn_rcvect1
);
310 if (tp
->t_flags2
& (TF2_ECN_PERMIT
| TF2_ACE_PERMIT
)) {
311 if (tp
->t_flags2
& TF2_ACE_PERMIT
) {
312 if ((iptos
& IPTOS_ECN_MASK
) == IPTOS_ECN_CE
)
314 if (tp
->t_flags2
& TF2_ECN_PERMIT
) {
315 delta_cep
= (tcp_ecn_get_ace(thflags
) + 8 -
316 (tp
->t_scep
& 7)) & 7;
317 if (delta_cep
< pkts
)
319 ((pkts
- delta_cep
) & 7);
320 tp
->t_scep
+= delta_cep
;
323 * process the final ACK of the 3WHS
324 * see table 3 in draft-ietf-tcpm-accurate-ecn
326 switch (tcp_ecn_get_ace(thflags
)) {
328 /* nonECT SYN or SYN,ACK */
331 /* ECT1 SYN or SYN,ACK */
334 /* ECT0 SYN or SYN,ACK */
338 /* CE SYN or SYN,ACK */
340 tp
->snd_cwnd
= 2 * tcp_maxseg(tp
);
343 /* mangled AccECN handshake */
347 tp
->t_flags2
|= TF2_ECN_PERMIT
;
350 /* RFC3168 ECN handling */
351 if ((thflags
& (TH_SYN
| TH_ECE
)) == TH_ECE
) {
355 if (thflags
& TH_CWR
) {
356 tp
->t_flags2
&= ~TF2_ECN_SND_ECE
;
357 tp
->t_flags
|= TF_ACKNOW
;
359 if ((iptos
& IPTOS_ECN_MASK
) == IPTOS_ECN_CE
)
360 tp
->t_flags2
|= TF2_ECN_SND_ECE
;
363 /* Process a packet differently from RFC3168. */
364 cc_ecnpkt_handler_flags(tp
, thflags
, iptos
);
371 * Send ECN setup <SYN> packet header flags
374 tcp_ecn_output_syn_sent(struct tcpcb
*tp
)
376 uint16_t thflags
= 0;
378 if (V_tcp_do_ecn
== 0)
380 if (V_tcp_do_ecn
== 1) {
381 /* Send a RFC3168 ECN setup <SYN> packet */
382 if (tp
->t_rxtshift
>= 1) {
383 if (tp
->t_rxtshift
<= V_tcp_ecn_maxretries
)
384 thflags
= TH_ECE
|TH_CWR
;
386 thflags
= TH_ECE
|TH_CWR
;
387 } else if (V_tcp_do_ecn
== 3) {
388 /* Send an Accurate ECN setup <SYN> packet */
389 if (tp
->t_rxtshift
>= 1) {
390 if (tp
->t_rxtshift
<= V_tcp_ecn_maxretries
)
391 thflags
= TH_ECE
|TH_CWR
|TH_AE
;
393 thflags
= TH_ECE
|TH_CWR
|TH_AE
;
400 * output processing of ECN feature
401 * returning IP ECN header codepoint
404 tcp_ecn_output_established(struct tcpcb
*tp
, uint16_t *thflags
, int len
, bool rxmit
)
406 int ipecn
= IPTOS_ECN_NOTECT
;
410 * If the peer has ECN, mark data packets with
411 * ECN capable transmission (ECT).
412 * Ignore pure control packets, retransmissions
415 newdata
= (len
> 0 && SEQ_GEQ(tp
->snd_nxt
, tp
->snd_max
) &&
417 !((tp
->t_flags
& TF_FORCEDATA
) && len
== 1));
418 /* RFC3168 ECN marking, only new data segments */
420 if (tp
->t_flags2
& TF2_ECN_USE_ECT1
) {
421 ipecn
= IPTOS_ECN_ECT1
;
422 TCPSTAT_INC(tcps_ecn_sndect1
);
424 ipecn
= IPTOS_ECN_ECT0
;
425 TCPSTAT_INC(tcps_ecn_sndect0
);
429 * Reply with proper ECN notifications.
431 if (tp
->t_flags2
& TF2_ACE_PERMIT
) {
432 tcp_ecn_set_ace(thflags
, tp
->t_rcep
);
433 if (!(tp
->t_flags2
& TF2_ECN_PERMIT
)) {
435 * here we process the final
438 if (tp
->t_rcep
== 0b110) {
443 tp
->t_flags2
|= TF2_ECN_PERMIT
;
447 (tp
->t_flags2
& TF2_ECN_SND_CWR
)) {
449 tp
->t_flags2
&= ~TF2_ECN_SND_CWR
;
451 if (tp
->t_flags2
& TF2_ECN_SND_ECE
)
459 * Set up the ECN related tcpcb fields from
463 tcp_ecn_syncache_socket(struct tcpcb
*tp
, struct syncache
*sc
)
465 if (sc
->sc_flags
& SCF_ECN_MASK
) {
466 switch (sc
->sc_flags
& SCF_ECN_MASK
) {
468 tp
->t_flags2
|= TF2_ECN_PERMIT
;
475 tp
->t_flags2
|= TF2_ACE_PERMIT
;
480 tp
->t_flags2
|= TF2_ACE_PERMIT
;
489 * Process a <SYN> packets ECN information, and provide the
490 * syncache with the relevant information.
493 tcp_ecn_syncache_add(uint16_t thflags
, int iptos
)
497 switch (iptos
& IPTOS_ECN_MASK
) {
499 TCPSTAT_INC(tcps_ecn_rcvce
);
502 TCPSTAT_INC(tcps_ecn_rcvect0
);
505 TCPSTAT_INC(tcps_ecn_rcvect1
);
509 switch (thflags
& (TH_AE
|TH_CWR
|TH_ECE
)) {
514 case (0|TH_CWR
|TH_ECE
):
518 case (TH_AE
|TH_CWR
|TH_ECE
):
519 if ((V_tcp_do_ecn
== 3) ||
520 (V_tcp_do_ecn
== 4)) {
521 switch (iptos
& IPTOS_ECN_MASK
) {
523 scflags
= SCF_ACE_CE
;
531 case IPTOS_ECN_NOTECT
:
538 /* Default Case (section 3.1.2) */
540 if ((V_tcp_do_ecn
== 3) ||
541 (V_tcp_do_ecn
== 4)) {
542 switch (iptos
& IPTOS_ECN_MASK
) {
544 scflags
= SCF_ACE_CE
;
552 case IPTOS_ECN_NOTECT
:
563 * Set up the ECN information for the <SYN,ACK> from
564 * syncache information.
567 tcp_ecn_syncache_respond(uint16_t thflags
, struct syncache
*sc
)
569 if ((thflags
& TH_SYN
) &&
570 (sc
->sc_flags
& SCF_ECN_MASK
)) {
571 switch (sc
->sc_flags
& SCF_ECN_MASK
) {
573 thflags
|= (0 | 0 | TH_ECE
);
574 TCPSTAT_INC(tcps_ecn_shs
);
577 thflags
|= (0 | TH_CWR
| 0);
578 TCPSTAT_INC(tcps_ecn_shs
);
579 TCPSTAT_INC(tcps_ace_nect
);
582 thflags
|= (TH_AE
| 0 | 0);
583 TCPSTAT_INC(tcps_ecn_shs
);
584 TCPSTAT_INC(tcps_ace_ect0
);
587 thflags
|= (0 | TH_ECE
| TH_CWR
);
588 TCPSTAT_INC(tcps_ecn_shs
);
589 TCPSTAT_INC(tcps_ace_ect1
);
592 thflags
|= (TH_AE
| TH_CWR
| 0);
593 TCPSTAT_INC(tcps_ecn_shs
);
594 TCPSTAT_INC(tcps_ace_ce
);
602 tcp_ecn_get_ace(uint16_t thflags
)
604 return ((thflags
& (TH_AE
|TH_CWR
|TH_ECE
)) >> TH_ACE_SHIFT
);
608 tcp_ecn_set_ace(uint16_t *thflags
, uint32_t t_rcep
)
610 *thflags
&= ~(TH_AE
|TH_CWR
|TH_ECE
);
611 *thflags
|= ((t_rcep
<< TH_ACE_SHIFT
) & (TH_AE
|TH_CWR
|TH_ECE
));