// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
 *
 * Copyright (c) 2017 - 2019, Intel Corporation.
 */

#define pr_fmt(fmt) "MPTCP: " fmt

#include <linux/kernel.h>
#include <crypto/sha2.h>
#include <net/tcp.h>
#include <net/mptcp.h>
#include "protocol.h"
#include "mib.h"

static bool mptcp_cap_flag_sha256(u8 flags)
{
	return (flags & MPTCP_CAP_FLAG_MASK) == MPTCP_CAP_HMAC_SHA256;
}

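/* Dissect a single MPTCP suboption: @ptr points just past the TCP
 * kind/length bytes, so the subtype sits in the high nibble of the first
 * payload byte, while @opsize is the full option length from the header.
 */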
static void mptcp_parse_option(const struct sk_buff *skb,
			       const unsigned char *ptr, int opsize,
			       struct mptcp_options_received *mp_opt)
{
	u8 subtype = *ptr >> 4;
	int expected_opsize;
	u8 version;
	u8 flags;

	switch (subtype) {
	case MPTCPOPT_MP_CAPABLE:
		/* strict size checking */
		if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
			if (skb->len > tcp_hdr(skb)->doff << 2)
				expected_opsize = TCPOLEN_MPTCP_MPC_ACK_DATA;
			else
				expected_opsize = TCPOLEN_MPTCP_MPC_ACK;
		} else {
			if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)
				expected_opsize = TCPOLEN_MPTCP_MPC_SYNACK;
			else
				expected_opsize = TCPOLEN_MPTCP_MPC_SYN;
		}
		if (opsize != expected_opsize)
			break;

		/* try to be gentle vs future versions on the initial syn */
		version = *ptr++ & MPTCP_VERSION_MASK;
		if (opsize != TCPOLEN_MPTCP_MPC_SYN) {
			if (version != MPTCP_SUPPORTED_VERSION)
				break;
		} else if (version < MPTCP_SUPPORTED_VERSION) {
			break;
		}

		flags = *ptr++;
		if (!mptcp_cap_flag_sha256(flags) ||
		    (flags & MPTCP_CAP_EXTENSIBILITY))
			break;

		/* RFC 6824, Section 3.1:
		 * "For the Checksum Required bit (labeled "A"), if either
		 * host requires the use of checksums, checksums MUST be used.
		 * In other words, the only way for checksums not to be used
		 * is if both hosts in their SYNs set A=0."
		 *
		 * Section 3.3.0:
		 * "If a checksum is not present when its use has been
		 * negotiated, the receiver MUST close the subflow with a RST as
		 * it is considered broken."
		 *
		 * We don't implement DSS checksum - fall back to TCP.
		 */
		if (flags & MPTCP_CAP_CHECKSUM_REQD)
			break;

		mp_opt->mp_capable = 1;
		if (opsize >= TCPOLEN_MPTCP_MPC_SYNACK) {
			mp_opt->sndr_key = get_unaligned_be64(ptr);
			ptr += 8;
		}
		if (opsize >= TCPOLEN_MPTCP_MPC_ACK) {
			mp_opt->rcvr_key = get_unaligned_be64(ptr);
			ptr += 8;
		}
		if (opsize == TCPOLEN_MPTCP_MPC_ACK_DATA) {
			/* Section 3.1.:
			 * "the data parameters in a MP_CAPABLE are semantically
			 * equivalent to those in a DSS option and can be used
			 * interchangeably."
			 */
			mp_opt->mp_capable = 1;
			mp_opt->use_map = 1;
			mp_opt->mpc_map = 1;
			mp_opt->data_len = get_unaligned_be16(ptr);
		}
		pr_debug("MP_CAPABLE version=%x, flags=%x, optlen=%d sndr=%llu, rcvr=%llu len=%d",
			 version, flags, opsize, mp_opt->sndr_key,
			 mp_opt->rcvr_key, mp_opt->data_len);
		break;

	case MPTCPOPT_MP_JOIN:
		mp_opt->mp_join = 1;
		if (opsize == TCPOLEN_MPTCP_MPJ_SYN) {
			mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP;
			mp_opt->join_id = *ptr++;
			mp_opt->token = get_unaligned_be32(ptr);
			ptr += 4;
			mp_opt->nonce = get_unaligned_be32(ptr);
			ptr += 4;
			pr_debug("MP_JOIN bkup=%u, id=%u, token=%u, nonce=%u",
				 mp_opt->backup, mp_opt->join_id,
				 mp_opt->token, mp_opt->nonce);
		} else if (opsize == TCPOLEN_MPTCP_MPJ_SYNACK) {
			mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP;
			mp_opt->join_id = *ptr++;
			mp_opt->thmac = get_unaligned_be64(ptr);
			ptr += 8;
			mp_opt->nonce = get_unaligned_be32(ptr);
			ptr += 4;
			pr_debug("MP_JOIN bkup=%u, id=%u, thmac=%llu, nonce=%u",
				 mp_opt->backup, mp_opt->join_id,
				 mp_opt->thmac, mp_opt->nonce);
		} else if (opsize == TCPOLEN_MPTCP_MPJ_ACK) {
			ptr += 2;
			memcpy(mp_opt->hmac, ptr, MPTCPOPT_HMAC_LEN);
			pr_debug("MP_JOIN hmac");
		} else {
			pr_warn("MP_JOIN bad option size");
			mp_opt->mp_join = 0;
		}
		break;

	case MPTCPOPT_DSS:
		pr_debug("DSS");
		ptr++;

		/* we must clear 'mpc_map' to be able to detect MP_CAPABLE
		 * map vs DSS map in mptcp_incoming_options(), and reconstruct
		 * map info accordingly
		 */
		mp_opt->mpc_map = 0;
		flags = (*ptr++) & MPTCP_DSS_FLAG_MASK;
		mp_opt->data_fin = (flags & MPTCP_DSS_DATA_FIN) != 0;
		mp_opt->dsn64 = (flags & MPTCP_DSS_DSN64) != 0;
		mp_opt->use_map = (flags & MPTCP_DSS_HAS_MAP) != 0;
		mp_opt->ack64 = (flags & MPTCP_DSS_ACK64) != 0;
		mp_opt->use_ack = (flags & MPTCP_DSS_HAS_ACK);

		pr_debug("data_fin=%d dsn64=%d use_map=%d ack64=%d use_ack=%d",
			 mp_opt->data_fin, mp_opt->dsn64,
			 mp_opt->use_map, mp_opt->ack64,
			 mp_opt->use_ack);

		expected_opsize = TCPOLEN_MPTCP_DSS_BASE;

		if (mp_opt->use_ack) {
			if (mp_opt->ack64)
				expected_opsize += TCPOLEN_MPTCP_DSS_ACK64;
			else
				expected_opsize += TCPOLEN_MPTCP_DSS_ACK32;
		}

		if (mp_opt->use_map) {
			if (mp_opt->dsn64)
				expected_opsize += TCPOLEN_MPTCP_DSS_MAP64;
			else
				expected_opsize += TCPOLEN_MPTCP_DSS_MAP32;
		}

		/* RFC 6824, Section 3.3:
		 * If a checksum is present, but its use had
		 * not been negotiated in the MP_CAPABLE handshake,
		 * the checksum field MUST be ignored.
		 */
		if (opsize != expected_opsize &&
		    opsize != expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM)
			break;

		mp_opt->dss = 1;

		if (mp_opt->use_ack) {
			if (mp_opt->ack64) {
				mp_opt->data_ack = get_unaligned_be64(ptr);
				ptr += 8;
			} else {
				mp_opt->data_ack = get_unaligned_be32(ptr);
				ptr += 4;
			}

			pr_debug("data_ack=%llu", mp_opt->data_ack);
		}

		if (mp_opt->use_map) {
			if (mp_opt->dsn64) {
				mp_opt->data_seq = get_unaligned_be64(ptr);
				ptr += 8;
			} else {
				mp_opt->data_seq = get_unaligned_be32(ptr);
				ptr += 4;
			}

			mp_opt->subflow_seq = get_unaligned_be32(ptr);
			ptr += 4;

			mp_opt->data_len = get_unaligned_be16(ptr);
			ptr += 2;

			pr_debug("data_seq=%llu subflow_seq=%u data_len=%u",
				 mp_opt->data_seq, mp_opt->subflow_seq,
				 mp_opt->data_len);
		}

		break;

	case MPTCPOPT_ADD_ADDR:
		mp_opt->echo = (*ptr++) & MPTCP_ADDR_ECHO;
		if (!mp_opt->echo) {
			if (opsize == TCPOLEN_MPTCP_ADD_ADDR ||
			    opsize == TCPOLEN_MPTCP_ADD_ADDR_PORT)
				mp_opt->family = MPTCP_ADDR_IPVERSION_4;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
			else if (opsize == TCPOLEN_MPTCP_ADD_ADDR6 ||
				 opsize == TCPOLEN_MPTCP_ADD_ADDR6_PORT)
				mp_opt->family = MPTCP_ADDR_IPVERSION_6;
#endif
			else
				break;
		} else {
			if (opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE ||
			    opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT)
				mp_opt->family = MPTCP_ADDR_IPVERSION_4;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
			else if (opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE ||
				 opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT)
				mp_opt->family = MPTCP_ADDR_IPVERSION_6;
#endif
			else
				break;
		}

		mp_opt->add_addr = 1;
		mp_opt->addr_id = *ptr++;
		if (mp_opt->family == MPTCP_ADDR_IPVERSION_4) {
			memcpy((u8 *)&mp_opt->addr.s_addr, (u8 *)ptr, 4);
			ptr += 4;
			if (opsize == TCPOLEN_MPTCP_ADD_ADDR_PORT ||
			    opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT) {
				mp_opt->port = get_unaligned_be16(ptr);
				ptr += 2;
			}
		}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
		else {
			memcpy(mp_opt->addr6.s6_addr, (u8 *)ptr, 16);
			ptr += 16;
			if (opsize == TCPOLEN_MPTCP_ADD_ADDR6_PORT ||
			    opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT) {
				mp_opt->port = get_unaligned_be16(ptr);
				ptr += 2;
			}
		}
#endif
		if (!mp_opt->echo) {
			mp_opt->ahmac = get_unaligned_be64(ptr);
			ptr += 8;
		}
		pr_debug("ADD_ADDR%s: id=%d, ahmac=%llu, echo=%d, port=%d",
			 (mp_opt->family == MPTCP_ADDR_IPVERSION_6) ? "6" : "",
			 mp_opt->addr_id, mp_opt->ahmac, mp_opt->echo, mp_opt->port);
		break;

	case MPTCPOPT_RM_ADDR:
		if (opsize != TCPOLEN_MPTCP_RM_ADDR_BASE)
			break;

		ptr++;

		mp_opt->rm_addr = 1;
		mp_opt->rm_id = *ptr++;
		pr_debug("RM_ADDR: id=%d", mp_opt->rm_id);
		break;

	case MPTCPOPT_MP_FASTCLOSE:
		if (opsize != TCPOLEN_MPTCP_FASTCLOSE)
			break;

		ptr += 2;
		mp_opt->rcvr_key = get_unaligned_be64(ptr);
		ptr += 8;
		mp_opt->fastclose = 1;
		break;

	default:
		break;
	}
}

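/* Walk the TCP option space of @skb, resetting @mp_opt first, and feed
 * every option of kind TCPOPT_MPTCP to mptcp_parse_option(); all other
 * option kinds are skipped.
 */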
void mptcp_get_options(const struct sk_buff *skb,
		       struct mptcp_options_received *mp_opt)
{
	const struct tcphdr *th = tcp_hdr(skb);
	const unsigned char *ptr;
	int length;

	/* initialize option status */
	mp_opt->mp_capable = 0;
	mp_opt->mp_join = 0;
	mp_opt->add_addr = 0;
	mp_opt->ahmac = 0;
	mp_opt->fastclose = 0;
	mp_opt->port = 0;
	mp_opt->rm_addr = 0;
	mp_opt->dss = 0;

	length = (th->doff * 4) - sizeof(struct tcphdr);
	ptr = (const unsigned char *)(th + 1);

	while (length > 0) {
		int opcode = *ptr++;
		int opsize;

		switch (opcode) {
		case TCPOPT_EOL:
			return;
		case TCPOPT_NOP:	/* Ref: RFC 793 section 3.1 */
			length--;
			continue;
		default:
			opsize = *ptr++;
			if (opsize < 2) /* "silly options" */
				return;
			if (opsize > length)
				return;	/* don't parse partial options */
			if (opcode == TCPOPT_MPTCP)
				mptcp_parse_option(skb, ptr, opsize, mp_opt);
			ptr += opsize - 2;
			length -= opsize;
		}
	}
}

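/* Pick the MPTCP suboption for an outgoing SYN: MP_CAPABLE for the
 * initial subflow, MP_JOIN for additional ones; returns false when the
 * SYN must carry no MPTCP option at all.
 */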
bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb,
		       unsigned int *size, struct mptcp_out_options *opts)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

	/* we will use snd_isn to detect first pkt [re]transmission
	 * in mptcp_established_options_mp()
	 */
	subflow->snd_isn = TCP_SKB_CB(skb)->end_seq;
	if (subflow->request_mptcp) {
		opts->suboptions = OPTION_MPTCP_MPC_SYN;
		*size = TCPOLEN_MPTCP_MPC_SYN;
		return true;
	} else if (subflow->request_join) {
		pr_debug("remote_token=%u, nonce=%u", subflow->remote_token,
			 subflow->local_nonce);
		opts->suboptions = OPTION_MPTCP_MPJ_SYN;
		opts->join_id = subflow->local_id;
		opts->token = subflow->remote_token;
		opts->nonce = subflow->local_nonce;
		opts->backup = subflow->request_bkup;
		*size = TCPOLEN_MPTCP_MPJ_SYN;
		return true;
	}
	return false;
}

/* MP_JOIN client subflow must wait for 4th ack before sending any data:
 * TCP can't schedule delack timer before the subflow is fully established.
 * MPTCP uses the delack timer to do 3rd ack retransmissions
 */
static void schedule_3rdack_retransmission(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned long timeout;

	/* reschedule with a timeout above RTT, as we must look only for drop */
	if (tp->srtt_us)
		timeout = tp->srtt_us << 1;
	else
		timeout = TCP_TIMEOUT_INIT;

	/* the delack timer expects an absolute expiry time */
	timeout += jiffies;

	WARN_ON_ONCE(icsk->icsk_ack.pending & ICSK_ACK_TIMER);
	icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
	icsk->icsk_ack.timeout = timeout;
	sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
}

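/* Counterpart of schedule_3rdack_retransmission(): stop the delack timer
 * and clear the related pending flags once the 3rd ack has been
 * acknowledged by the peer.
 */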
static void clear_3rdack_retransmission(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	sk_stop_timer(sk, &icsk->icsk_delack_timer);
	icsk->icsk_ack.timeout = 0;
	icsk->icsk_ack.ato = 0;
	icsk->icsk_ack.pending &= ~(ICSK_ACK_SCHED | ICSK_ACK_TIMER);
}

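/* Fill in the MP_CAPABLE/MP_JOIN option for the 3rd ack (and for the
 * first data packet in the MP_CAPABLE case); returns false when no
 * MPC/MPJ option is needed, letting the caller emit a DSS option instead.
 */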
static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb,
					 unsigned int *size,
					 unsigned int remaining,
					 struct mptcp_out_options *opts)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct mptcp_ext *mpext;
	unsigned int data_len;

	/* When skb is not available, we better over-estimate the emitted
	 * options len. A full DSS option (28 bytes) is longer than
	 * TCPOLEN_MPTCP_MPC_ACK_DATA(22) or TCPOLEN_MPTCP_MPJ_ACK(24), so
	 * tell the caller to defer the estimate to
	 * mptcp_established_options_dss(), which will reserve enough space.
	 */
	if (!skb)
		return false;

	/* MPC/MPJ needed only on 3rd ack packet */
	if (subflow->fully_established ||
	    subflow->snd_isn != TCP_SKB_CB(skb)->seq)
		return false;

	if (subflow->mp_capable) {
		mpext = mptcp_get_ext(skb);
		data_len = mpext ? mpext->data_len : 0;

		/* we will check ext_copy.data_len in mptcp_write_options() to
		 * discriminate between TCPOLEN_MPTCP_MPC_ACK_DATA and
		 * TCPOLEN_MPTCP_MPC_ACK
		 */
		opts->ext_copy.data_len = data_len;
		opts->suboptions = OPTION_MPTCP_MPC_ACK;
		opts->sndr_key = subflow->local_key;
		opts->rcvr_key = subflow->remote_key;

		/* Section 3.1.
		 * The MP_CAPABLE option is carried on the SYN, SYN/ACK, and ACK
		 * packets that start the first subflow of an MPTCP connection,
		 * as well as the first packet that carries data
		 */
		if (data_len > 0)
			*size = ALIGN(TCPOLEN_MPTCP_MPC_ACK_DATA, 4);
		else
			*size = TCPOLEN_MPTCP_MPC_ACK;

		pr_debug("subflow=%p, local_key=%llu, remote_key=%llu map_len=%d",
			 subflow, subflow->local_key, subflow->remote_key,
			 data_len);

		return true;
	} else if (subflow->mp_join) {
		opts->suboptions = OPTION_MPTCP_MPJ_ACK;
		memcpy(opts->hmac, subflow->hmac, MPTCPOPT_HMAC_LEN);
		*size = TCPOLEN_MPTCP_MPJ_ACK;
		pr_debug("subflow=%p", subflow);

		schedule_3rdack_retransmission(sk);
		return true;
	}
	return false;
}

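/* Merge the DATA_FIN into the DSS mapping carried by @ext: either build
 * the RFC-mandated dummy mapping when no data payload is mapped, or widen
 * the final mapping by the one byte the DATA_FIN consumes.
 */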
static void mptcp_write_data_fin(struct mptcp_subflow_context *subflow,
				 struct sk_buff *skb, struct mptcp_ext *ext)
{
	/* The write_seq value has already been incremented, so the actual
	 * sequence number for the DATA_FIN is one less.
	 */
	u64 data_fin_tx_seq = READ_ONCE(mptcp_sk(subflow->conn)->write_seq) - 1;

	if (!ext->use_map || !skb->len) {
		/* RFC6824 requires a DSS mapping with specific values
		 * if DATA_FIN is set but no data payload is mapped
		 */
		ext->data_fin = 1;
		ext->use_map = 1;
		ext->dsn64 = 1;
		ext->data_seq = data_fin_tx_seq;
		ext->subflow_seq = 0;
		ext->data_len = 1;
	} else if (ext->data_seq + ext->data_len == data_fin_tx_seq) {
		/* If there's an existing DSS mapping and it is the
		 * final mapping, DATA_FIN consumes 1 additional byte of
		 * mapping space.
		 */
		ext->data_fin = 1;
		ext->data_len++;
	}
}

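/* Build the DSS suboption: reserve room for a 64-bit mapping (and
 * DATA_FIN) first, then append the data-level ack once the peer's key is
 * known, i.e. once 'can_ack' has been set.
 */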
static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
					  unsigned int *size,
					  unsigned int remaining,
					  struct mptcp_out_options *opts)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
	unsigned int dss_size = 0;
	u64 snd_data_fin_enable;
	struct mptcp_ext *mpext;
	unsigned int ack_size;
	bool ret = false;

	mpext = skb ? mptcp_get_ext(skb) : NULL;
	snd_data_fin_enable = mptcp_data_fin_enabled(msk);

	if (!skb || (mpext && mpext->use_map) || snd_data_fin_enable) {
		unsigned int map_size;

		map_size = TCPOLEN_MPTCP_DSS_BASE + TCPOLEN_MPTCP_DSS_MAP64;

		remaining -= map_size;
		dss_size = map_size;
		if (mpext)
			opts->ext_copy = *mpext;

		if (skb && snd_data_fin_enable)
			mptcp_write_data_fin(subflow, skb, &opts->ext_copy);
		ret = true;
	}

	/* passive sockets msk will set the 'can_ack' after accept(), even
	 * if the first subflow may have the already the remote key handy
	 */
	opts->ext_copy.use_ack = 0;
	if (!READ_ONCE(msk->can_ack)) {
		*size = ALIGN(dss_size, 4);
		return ret;
	}

	if (READ_ONCE(msk->use_64bit_ack)) {
		ack_size = TCPOLEN_MPTCP_DSS_ACK64;
		opts->ext_copy.data_ack = READ_ONCE(msk->ack_seq);
		opts->ext_copy.ack64 = 1;
	} else {
		ack_size = TCPOLEN_MPTCP_DSS_ACK32;
		opts->ext_copy.data_ack32 = (uint32_t)READ_ONCE(msk->ack_seq);
		opts->ext_copy.ack64 = 0;
	}
	opts->ext_copy.use_ack = 1;
	WRITE_ONCE(msk->old_wspace, __mptcp_space((struct sock *)msk));

	/* Add kind/length/subtype/flag overhead if mapping is not populated */
	if (dss_size == 0)
		ack_size += TCPOLEN_MPTCP_DSS_BASE;

	dss_size += ack_size;

	*size = ALIGN(dss_size, 4);
	return true;
}

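/* ADD_ADDR HMAC: keyed hash over (address id, IPv4 address, zeroed port),
 * truncated by keeping the trailing 8 bytes of the SHA-256 digest.
 */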
static u64 add_addr_generate_hmac(u64 key1, u64 key2, u8 addr_id,
				  struct in_addr *addr)
{
	u8 hmac[SHA256_DIGEST_SIZE];
	u8 msg[7];

	msg[0] = addr_id;
	memcpy(&msg[1], &addr->s_addr, 4);
	msg[5] = 0;
	msg[6] = 0;

	mptcp_crypto_hmac_sha(key1, key2, msg, 7, hmac);

	return get_unaligned_be64(&hmac[SHA256_DIGEST_SIZE - sizeof(u64)]);
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static u64 add_addr6_generate_hmac(u64 key1, u64 key2, u8 addr_id,
				   struct in6_addr *addr)
{
	u8 hmac[SHA256_DIGEST_SIZE];
	u8 msg[19];

	msg[0] = addr_id;
	memcpy(&msg[1], &addr->s6_addr, 16);
	msg[17] = 0;
	msg[18] = 0;

	mptcp_crypto_hmac_sha(key1, key2, msg, 19, hmac);

	return get_unaligned_be64(&hmac[SHA256_DIGEST_SIZE - sizeof(u64)]);
}
#endif

static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff *skb,
					       unsigned int *size,
					       unsigned int remaining,
					       struct mptcp_out_options *opts)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
	bool drop_other_suboptions = false;
	unsigned int opt_size = *size;
	struct mptcp_addr_info saddr;
	bool echo;
	bool port;
	int len;

	if ((mptcp_pm_should_add_signal_ipv6(msk) ||
	     mptcp_pm_should_add_signal_port(msk)) &&
	    skb && skb_is_tcp_pure_ack(skb)) {
		pr_debug("drop other suboptions");
		opts->suboptions = 0;
		opts->ext_copy.use_ack = 0;
		opts->ext_copy.use_map = 0;
		remaining += opt_size;
		drop_other_suboptions = true;
	}

	if (!mptcp_pm_should_add_signal(msk) ||
	    !(mptcp_pm_add_addr_signal(msk, remaining, &saddr, &echo, &port)))
		return false;

	len = mptcp_add_addr_len(saddr.family, echo, port);
	if (remaining < len)
		return false;

	*size = len;
	if (drop_other_suboptions)
		*size -= opt_size;
	opts->addr_id = saddr.id;
	if (port)
		opts->port = ntohs(saddr.port);
	if (saddr.family == AF_INET) {
		opts->suboptions |= OPTION_MPTCP_ADD_ADDR;
		opts->addr = saddr.addr;
		if (!echo) {
			opts->ahmac = add_addr_generate_hmac(msk->local_key,
							     msk->remote_key,
							     opts->addr_id,
							     &opts->addr);
		}
	}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	else if (saddr.family == AF_INET6) {
		opts->suboptions |= OPTION_MPTCP_ADD_ADDR6;
		opts->addr6 = saddr.addr6;
		if (!echo) {
			opts->ahmac = add_addr6_generate_hmac(msk->local_key,
							      msk->remote_key,
							      opts->addr_id,
							      &opts->addr6);
		}
	}
#endif
	pr_debug("addr_id=%d, ahmac=%llu, echo=%d, port=%d",
		 opts->addr_id, opts->ahmac, echo, opts->port);

	return true;
}

static bool mptcp_established_options_rm_addr(struct sock *sk,
					      unsigned int *size,
					      unsigned int remaining,
					      struct mptcp_out_options *opts)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
	u8 rm_id;

	if (!mptcp_pm_should_rm_signal(msk) ||
	    !(mptcp_pm_rm_addr_signal(msk, remaining, &rm_id)))
		return false;

	if (remaining < TCPOLEN_MPTCP_RM_ADDR_BASE)
		return false;

	*size = TCPOLEN_MPTCP_RM_ADDR_BASE;
	opts->suboptions |= OPTION_MPTCP_RM_ADDR;
	opts->rm_id = rm_id;

	pr_debug("rm_id=%d", opts->rm_id);

	return true;
}

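/* Top-level option selection for established subflows: MPC/MPJ 3rd-ack
 * options and DSS are mutually exclusive, while ADD_ADDR/RM_ADDR are
 * appended afterwards whenever the PM signalled them and they still fit
 * in the remaining option space.
 */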
bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
			       unsigned int *size, unsigned int remaining,
			       struct mptcp_out_options *opts)
{
	unsigned int opt_size = 0;
	bool ret = false;

	opts->suboptions = 0;

	if (unlikely(mptcp_check_fallback(sk)))
		return false;

	/* prevent adding of any MPTCP related options on reset packet
	 * until we support MP_TCPRST/MP_FASTCLOSE
	 */
	if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST))
		return false;

	if (mptcp_established_options_mp(sk, skb, &opt_size, remaining, opts))
		ret = true;
	else if (mptcp_established_options_dss(sk, skb, &opt_size, remaining,
					       opts))
		ret = true;

	/* we reserved enough space for the above options, and exceeding the
	 * TCP option space would be fatal
	 */
	if (WARN_ON_ONCE(opt_size > remaining))
		return false;

	*size += opt_size;
	remaining -= opt_size;
	if (mptcp_established_options_add_addr(sk, skb, &opt_size, remaining, opts)) {
		*size += opt_size;
		remaining -= opt_size;
		ret = true;
	} else if (mptcp_established_options_rm_addr(sk, &opt_size, remaining, opts)) {
		*size += opt_size;
		remaining -= opt_size;
		ret = true;
	}

	return ret;
}

bool mptcp_synack_options(const struct request_sock *req, unsigned int *size,
			  struct mptcp_out_options *opts)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);

	if (subflow_req->mp_capable) {
		opts->suboptions = OPTION_MPTCP_MPC_SYNACK;
		opts->sndr_key = subflow_req->local_key;
		*size = TCPOLEN_MPTCP_MPC_SYNACK;
		pr_debug("subflow_req=%p, local_key=%llu",
			 subflow_req, subflow_req->local_key);
		return true;
	} else if (subflow_req->mp_join) {
		opts->suboptions = OPTION_MPTCP_MPJ_SYNACK;
		opts->backup = subflow_req->backup;
		opts->join_id = subflow_req->local_id;
		opts->thmac = subflow_req->thmac;
		opts->nonce = subflow_req->local_nonce;
		pr_debug("req=%p, bkup=%u, id=%u, thmac=%llu, nonce=%u",
			 subflow_req, opts->backup, opts->join_id,
			 opts->thmac, opts->nonce);
		*size = TCPOLEN_MPTCP_MPJ_SYNACK;
		return true;
	}
	return false;
}

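/* Per-packet state tracking that moves a subflow to the fully-established
 * state, or falls back to plain TCP / resets the subflow on protocol
 * violations; returns false when the packet must not be processed further.
 */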
static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk,
				    struct mptcp_subflow_context *subflow,
				    struct sk_buff *skb,
				    struct mptcp_options_received *mp_opt)
{
	/* here we can process OoO, in-window pkts, only in-sequence 4th ack
	 * will make the subflow fully established
	 */
	if (likely(subflow->fully_established)) {
		/* on passive sockets, check for 3rd ack retransmission
		 * note that msk is always set by subflow_syn_recv_sock()
		 * for mp_join subflows
		 */
		if (TCP_SKB_CB(skb)->seq == subflow->ssn_offset + 1 &&
		    TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq &&
		    subflow->mp_join && mp_opt->mp_join &&
		    READ_ONCE(msk->pm.server_side))
			tcp_send_ack(ssk);
		goto fully_established;
	}

	/* we must process OoO packets before the first subflow is fully
	 * established. OoO packets are instead a protocol violation
	 * for MP_JOIN subflows as the peer must not send any data
	 * before receiving the forth ack - cfr. RFC 8684 section 3.2.
	 */
	if (TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1) {
		if (subflow->mp_join)
			goto reset;
		return subflow->mp_capable;
	}

	if (mp_opt->dss && mp_opt->use_ack) {
		/* subflows are fully established as soon as we get any
		 * additional ack.
		 */
		subflow->fully_established = 1;
		WRITE_ONCE(msk->fully_established, true);
		goto fully_established;
	}

	if (mp_opt->add_addr) {
		WRITE_ONCE(msk->fully_established, true);
		goto fully_established;
	}

	/* If the first established packet does not contain MP_CAPABLE + data
	 * then fallback to TCP. Fallback scenarios requires a reset for
	 * MP_JOIN subflows.
	 */
	if (!mp_opt->mp_capable) {
		if (subflow->mp_join)
			goto reset;
		subflow->mp_capable = 0;
		pr_fallback(msk);
		__mptcp_do_fallback(msk);
		return false;
	}

	if (unlikely(!READ_ONCE(msk->pm.server_side)))
		pr_warn_once("bogus mpc option on established client sk");
	mptcp_subflow_fully_established(subflow, mp_opt);

fully_established:
	/* if the subflow is not already linked into the conn_list, we can't
	 * notify the PM: this subflow is still on the listener queue
	 * and the PM possibly acquiring the subflow lock could race with
	 * the listener close
	 */
	if (likely(subflow->pm_notified) || list_empty(&subflow->node))
		return true;

	subflow->pm_notified = 1;
	if (subflow->mp_join) {
		clear_3rdack_retransmission(ssk);
		mptcp_pm_subflow_established(msk, subflow);
	} else {
		mptcp_pm_fully_established(msk);
	}
	return true;

reset:
	mptcp_subflow_reset(ssk);
	return false;
}

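/* Infer the upper 32 bits of a peer-sent 32-bit ack/sequence value from
 * the last known 64-bit value, compensating for a possible 32-bit
 * wrap-around since that value was recorded.
 */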
static u64 expand_ack(u64 old_ack, u64 cur_ack, bool use_64bit)
{
	u32 old_ack32, cur_ack32;

	if (use_64bit)
		return cur_ack;

	old_ack32 = (u32)old_ack;
	cur_ack32 = (u32)cur_ack;
	cur_ack = (old_ack & GENMASK_ULL(63, 32)) + cur_ack32;
	if (unlikely(before(cur_ack32, old_ack32)))
		return cur_ack + (1LL << 32);
	return cur_ack;
}

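/* Propagate the received data-level ack to the MPTCP socket: advance
 * snd_una and wnd_end under the msk data lock, and kick the transmit path
 * when more in-window data can be pushed.
 */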
static void ack_update_msk(struct mptcp_sock *msk,
			   struct sock *ssk,
			   struct mptcp_options_received *mp_opt)
{
	u64 new_wnd_end, new_snd_una, snd_nxt = READ_ONCE(msk->snd_nxt);
	struct sock *sk = (struct sock *)msk;
	u64 old_snd_una;

	mptcp_data_lock(sk);

	/* avoid ack expansion on update conflict, to reduce the risk of
	 * wrongly expanding to a future ack sequence number, which is way
	 * more dangerous than missing an ack
	 */
	old_snd_una = msk->snd_una;
	new_snd_una = expand_ack(old_snd_una, mp_opt->data_ack, mp_opt->ack64);

	/* ACK for data not even sent yet? Ignore. */
	if (after64(new_snd_una, snd_nxt))
		new_snd_una = old_snd_una;

	new_wnd_end = new_snd_una + tcp_sk(ssk)->snd_wnd;

	if (after64(new_wnd_end, msk->wnd_end))
		msk->wnd_end = new_wnd_end;

	/* this assumes mptcp_incoming_options() is invoked after tcp_ack() */
	if (after64(msk->wnd_end, READ_ONCE(msk->snd_nxt)) &&
	    sk_stream_memory_free(ssk))
		__mptcp_check_push(sk, ssk);

	if (after64(new_snd_una, old_snd_una)) {
		msk->snd_una = new_snd_una;
		__mptcp_data_acked(sk);
	}
	mptcp_data_unlock(sk);
}

bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit)
{
	/* Skip if DATA_FIN was already received.
	 * If updating simultaneously with the recvmsg loop, values
	 * should match. If they mismatch, the peer is misbehaving and
	 * we will prefer the most recent information.
	 */
	if (READ_ONCE(msk->rcv_data_fin) || !READ_ONCE(msk->first))
		return false;

	WRITE_ONCE(msk->rcv_data_fin_seq,
		   expand_ack(READ_ONCE(msk->ack_seq), data_fin_seq, use_64bit));
	WRITE_ONCE(msk->rcv_data_fin, 1);

	return true;
}

static bool add_addr_hmac_valid(struct mptcp_sock *msk,
				struct mptcp_options_received *mp_opt)
{
	u64 hmac = 0;

	if (mp_opt->echo)
		return true;

	if (mp_opt->family == MPTCP_ADDR_IPVERSION_4)
		hmac = add_addr_generate_hmac(msk->remote_key,
					      msk->local_key,
					      mp_opt->addr_id, &mp_opt->addr);
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	else
		hmac = add_addr6_generate_hmac(msk->remote_key,
					       msk->local_key,
					       mp_opt->addr_id, &mp_opt->addr6);
#endif

	pr_debug("msk=%p, ahmac=%llu, mp_opt->ahmac=%llu\n",
		 msk, (unsigned long long)hmac,
		 (unsigned long long)mp_opt->ahmac);

	return hmac == mp_opt->ahmac;
}

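/* Main input hook, invoked for every packet received on a subflow: parses
 * the options, updates MPTCP-level state (data acks, ADD_ADDR/RM_ADDR,
 * DATA_FIN, fastclose) and attaches the DSS mapping to the skb as an
 * MPTCP extension.
 */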
void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
	struct mptcp_options_received mp_opt;
	struct mptcp_ext *mpext;

	if (__mptcp_check_fallback(msk)) {
		/* Keep it simple and unconditionally trigger send data cleanup and
		 * pending queue spooling. We will need to acquire the data lock
		 * for more accurate checks, and once the lock is acquired, such
		 * helpers are cheap.
		 */
		mptcp_data_lock(subflow->conn);
		if (sk_stream_memory_free(sk))
			__mptcp_check_push(subflow->conn, sk);
		__mptcp_data_acked(subflow->conn);
		mptcp_data_unlock(subflow->conn);
		return;
	}

	mptcp_get_options(skb, &mp_opt);
	if (!check_fully_established(msk, sk, subflow, skb, &mp_opt))
		return;

	if (mp_opt.fastclose &&
	    msk->local_key == mp_opt.rcvr_key) {
		WRITE_ONCE(msk->rcv_fastclose, true);
		mptcp_schedule_work((struct sock *)msk);
	}

	if (mp_opt.add_addr && add_addr_hmac_valid(msk, &mp_opt)) {
		struct mptcp_addr_info addr;

		addr.port = htons(mp_opt.port);
		addr.id = mp_opt.addr_id;
		if (mp_opt.family == MPTCP_ADDR_IPVERSION_4) {
			addr.family = AF_INET;
			addr.addr = mp_opt.addr;
		}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
		else if (mp_opt.family == MPTCP_ADDR_IPVERSION_6) {
			addr.family = AF_INET6;
			addr.addr6 = mp_opt.addr6;
		}
#endif
		if (!mp_opt.echo) {
			mptcp_pm_add_addr_received(msk, &addr);
			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ADDADDR);
		} else {
			mptcp_pm_del_add_timer(msk, &addr);
			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ECHOADD);
		}
		mp_opt.add_addr = 0;
	}

	if (mp_opt.rm_addr) {
		mptcp_pm_rm_addr_received(msk, mp_opt.rm_id);
		mp_opt.rm_addr = 0;
	}

	if (!mp_opt.dss)
		return;

	/* we can't wait for recvmsg() to update the ack_seq, otherwise
	 * monodirectional flows would get stuck
	 */
	if (mp_opt.use_ack)
		ack_update_msk(msk, sk, &mp_opt);

	/* Zero-data-length packets are dropped by the caller and not
	 * propagated to the MPTCP layer, so the skb extension does not
	 * need to be allocated or populated. DATA_FIN information, if
	 * present, needs to be updated here before the skb is freed.
	 */
	if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
		if (mp_opt.data_fin && mp_opt.data_len == 1 &&
		    mptcp_update_rcv_data_fin(msk, mp_opt.data_seq, mp_opt.dsn64) &&
		    schedule_work(&msk->work))
			sock_hold(subflow->conn);

		return;
	}

	mpext = skb_ext_add(skb, SKB_EXT_MPTCP);
	if (!mpext)
		return;

	memset(mpext, 0, sizeof(*mpext));

	if (mp_opt.use_map) {
		if (mp_opt.mpc_map) {
			/* this is an MP_CAPABLE carrying MPTCP data
			 * we know this maps the first chunk of data
			 */
			mptcp_crypto_key_sha(subflow->remote_key, NULL,
					     &mpext->data_seq);
			mpext->data_seq++;
			mpext->subflow_seq = 1;
			mpext->dsn64 = 1;
			mpext->mpc_map = 1;
			mpext->data_fin = 0;
		} else {
			mpext->data_seq = mp_opt.data_seq;
			mpext->subflow_seq = mp_opt.subflow_seq;
			mpext->dsn64 = mp_opt.dsn64;
			mpext->data_fin = mp_opt.data_fin;
		}
		mpext->data_len = mp_opt.data_len;
		mpext->use_map = 1;
	}
}

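/* Record the right edge (ack_seq + window) announced on this subflow, so
 * that the MPTCP-level window offered to the peer never moves backwards.
 */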
static void mptcp_set_rwin(const struct tcp_sock *tp)
{
	const struct sock *ssk = (const struct sock *)tp;
	const struct mptcp_subflow_context *subflow;
	struct mptcp_sock *msk;
	u64 ack_seq;

	subflow = mptcp_subflow_ctx(ssk);
	msk = mptcp_sk(subflow->conn);

	ack_seq = READ_ONCE(msk->ack_seq) + tp->rcv_wnd;

	if (after64(ack_seq, READ_ONCE(msk->rcv_wnd_sent)))
		WRITE_ONCE(msk->rcv_wnd_sent, ack_seq);
}

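/* Serialize the suboptions collected in @opts into the TCP option space
 * at @ptr; the layout mirrors what mptcp_parse_option() accepts on the
 * receive side.
 */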
void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
			 struct mptcp_out_options *opts)
{
	if ((OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK |
	     OPTION_MPTCP_MPC_ACK) & opts->suboptions) {
		u8 len;

		if (OPTION_MPTCP_MPC_SYN & opts->suboptions)
			len = TCPOLEN_MPTCP_MPC_SYN;
		else if (OPTION_MPTCP_MPC_SYNACK & opts->suboptions)
			len = TCPOLEN_MPTCP_MPC_SYNACK;
		else if (opts->ext_copy.data_len)
			len = TCPOLEN_MPTCP_MPC_ACK_DATA;
		else
			len = TCPOLEN_MPTCP_MPC_ACK;

		*ptr++ = mptcp_option(MPTCPOPT_MP_CAPABLE, len,
				      MPTCP_SUPPORTED_VERSION,
				      MPTCP_CAP_HMAC_SHA256);

		if (!((OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) &
		    opts->suboptions))
			goto mp_capable_done;

		put_unaligned_be64(opts->sndr_key, ptr);
		ptr += 2;
		if (!((OPTION_MPTCP_MPC_ACK) & opts->suboptions))
			goto mp_capable_done;

		put_unaligned_be64(opts->rcvr_key, ptr);
		ptr += 2;
		if (!opts->ext_copy.data_len)
			goto mp_capable_done;

		put_unaligned_be32(opts->ext_copy.data_len << 16 |
				   TCPOPT_NOP << 8 | TCPOPT_NOP, ptr);
		ptr += 1;
	}

mp_capable_done:
	if ((OPTION_MPTCP_ADD_ADDR
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	     | OPTION_MPTCP_ADD_ADDR6
#endif
	    ) & opts->suboptions) {
		u8 len = TCPOLEN_MPTCP_ADD_ADDR_BASE;
		u8 echo = MPTCP_ADDR_ECHO;

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
		if (OPTION_MPTCP_ADD_ADDR6 & opts->suboptions)
			len = TCPOLEN_MPTCP_ADD_ADDR6_BASE;
#endif

		if (opts->port)
			len += TCPOLEN_MPTCP_PORT_LEN;

		if (opts->ahmac) {
			len += sizeof(opts->ahmac);
			echo = 0;
		}

		*ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR,
				      len, echo, opts->addr_id);
		if (OPTION_MPTCP_ADD_ADDR & opts->suboptions) {
			memcpy((u8 *)ptr, (u8 *)&opts->addr.s_addr, 4);
			ptr += 1;
		}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
		else if (OPTION_MPTCP_ADD_ADDR6 & opts->suboptions) {
			memcpy((u8 *)ptr, opts->addr6.s6_addr, 16);
			ptr += 4;
		}
#endif

		if (!opts->port) {
			if (opts->ahmac) {
				put_unaligned_be64(opts->ahmac, ptr);
				ptr += 2;
			}
		} else {
			if (opts->ahmac) {
				u8 *bptr = (u8 *)ptr;

				put_unaligned_be16(opts->port, bptr);
				bptr += 2;
				put_unaligned_be64(opts->ahmac, bptr);
				bptr += 8;
				put_unaligned_be16(TCPOPT_NOP << 8 |
						   TCPOPT_NOP, bptr);

				ptr += 3;
			} else {
				put_unaligned_be32(opts->port << 16 |
						   TCPOPT_NOP << 8 |
						   TCPOPT_NOP, ptr);
				ptr += 1;
			}
		}
	}

	if (OPTION_MPTCP_RM_ADDR & opts->suboptions) {
		*ptr++ = mptcp_option(MPTCPOPT_RM_ADDR,
				      TCPOLEN_MPTCP_RM_ADDR_BASE,
				      0, opts->rm_id);
	}

	if (OPTION_MPTCP_MPJ_SYN & opts->suboptions) {
		*ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
				      TCPOLEN_MPTCP_MPJ_SYN,
				      opts->backup, opts->join_id);
		put_unaligned_be32(opts->token, ptr);
		ptr += 1;
		put_unaligned_be32(opts->nonce, ptr);
		ptr += 1;
	}

	if (OPTION_MPTCP_MPJ_SYNACK & opts->suboptions) {
		*ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
				      TCPOLEN_MPTCP_MPJ_SYNACK,
				      opts->backup, opts->join_id);
		put_unaligned_be64(opts->thmac, ptr);
		ptr += 2;
		put_unaligned_be32(opts->nonce, ptr);
		ptr += 1;
	}

	if (OPTION_MPTCP_MPJ_ACK & opts->suboptions) {
		*ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
				      TCPOLEN_MPTCP_MPJ_ACK, 0, 0);
		memcpy(ptr, opts->hmac, MPTCPOPT_HMAC_LEN);
		ptr += 5;
	}

	if (opts->ext_copy.use_ack || opts->ext_copy.use_map) {
		struct mptcp_ext *mpext = &opts->ext_copy;
		u8 len = TCPOLEN_MPTCP_DSS_BASE;
		u8 flags = 0;

		if (mpext->use_ack) {
			flags = MPTCP_DSS_HAS_ACK;
			if (mpext->ack64) {
				len += TCPOLEN_MPTCP_DSS_ACK64;
				flags |= MPTCP_DSS_ACK64;
			} else {
				len += TCPOLEN_MPTCP_DSS_ACK32;
			}
		}

		if (mpext->use_map) {
			len += TCPOLEN_MPTCP_DSS_MAP64;

			/* Use only 64-bit mapping flags for now, add
			 * support for optional 32-bit mappings later.
			 */
			flags |= MPTCP_DSS_HAS_MAP | MPTCP_DSS_DSN64;
			if (mpext->data_fin)
				flags |= MPTCP_DSS_DATA_FIN;
		}

		*ptr++ = mptcp_option(MPTCPOPT_DSS, len, 0, flags);

		if (mpext->use_ack) {
			if (mpext->ack64) {
				put_unaligned_be64(mpext->data_ack, ptr);
				ptr += 2;
			} else {
				put_unaligned_be32(mpext->data_ack32, ptr);
				ptr += 1;
			}
		}

		if (mpext->use_map) {
			put_unaligned_be64(mpext->data_seq, ptr);
			ptr += 2;
			put_unaligned_be32(mpext->subflow_seq, ptr);
			ptr += 1;
			put_unaligned_be32(mpext->data_len << 16 |
					   TCPOPT_NOP << 8 | TCPOPT_NOP, ptr);
		}
	}

	if (tp)
		mptcp_set_rwin(tp);
}