// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
 *
 * Copyright (c) 2017 - 2019, Intel Corporation.
 */

#define pr_fmt(fmt) "MPTCP: " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <crypto/sha2.h>
#include <crypto/utils.h>
#include <net/sock.h>
#include <net/inet_common.h>
#include <net/inet_hashtables.h>
#include <net/protocol.h>
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
#include <net/ip6_route.h>
#include <net/transp_v6.h>
#endif
#include <net/mptcp.h>

#include "protocol.h"
#include "mib.h"

#include <trace/events/mptcp.h>
#include <trace/events/sock.h>
static void mptcp_subflow_ops_undo_override(struct sock *ssk);
static void SUBFLOW_REQ_INC_STATS(struct request_sock *req,
				  enum linux_mptcp_mib_field field)
{
	MPTCP_INC_STATS(sock_net(req_to_sk(req)), field);
}
static void subflow_req_destructor(struct request_sock *req)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);

	pr_debug("subflow_req=%p\n", subflow_req);

	if (subflow_req->msk)
		sock_put((struct sock *)subflow_req->msk);

	mptcp_token_destroy_request(req);
}
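
/* HMAC the two nonces with the given key pair; used both to build the
 * truncated SYN-ACK hmac (thmac) and the full third-ACK hmac.
 */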
static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
				  void *hmac)
{
	u8 msg[8];

	put_unaligned_be32(nonce1, &msg[0]);
	put_unaligned_be32(nonce2, &msg[4]);

	mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac);
}
static bool mptcp_can_accept_new_subflow(const struct mptcp_sock *msk)
{
	return mptcp_is_fully_established((void *)msk) &&
	       ((mptcp_pm_is_userspace(msk) &&
		 mptcp_userspace_pm_active(msk)) ||
		READ_ONCE(msk->pm.accept_subflow));
}
/* validate received token and create truncated hmac and nonce for SYN-ACK */
static void subflow_req_create_thmac(struct mptcp_subflow_request_sock *subflow_req)
{
	struct mptcp_sock *msk = subflow_req->msk;
	u8 hmac[SHA256_DIGEST_SIZE];

	get_random_bytes(&subflow_req->local_nonce, sizeof(u32));

	subflow_generate_hmac(READ_ONCE(msk->local_key),
			      READ_ONCE(msk->remote_key),
			      subflow_req->local_nonce,
			      subflow_req->remote_nonce, hmac);

	subflow_req->thmac = get_unaligned_be64(hmac);
}
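
/* look up the msk owning the token carried by an MP_JOIN SYN and reserve a
 * local address id for the new subflow; returns NULL if the token is unknown
 * or no local id can be allocated.
 */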
static struct mptcp_sock *subflow_token_join_request(struct request_sock *req)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct mptcp_sock *msk;
	int local_id;

	msk = mptcp_token_get_sock(sock_net(req_to_sk(req)), subflow_req->token);
	if (!msk) {
		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINNOTOKEN);
		return NULL;
	}

	local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)req);
	if (local_id < 0) {
		sock_put((struct sock *)msk);
		return NULL;
	}
	subflow_req->local_id = local_id;
	subflow_req->request_bkup = mptcp_pm_is_backup(msk, (struct sock_common *)req);

	return msk;
}
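
/* reset the MPTCP-specific fields of a freshly allocated request socket;
 * called for every incoming SYN on an MPTCP listener.
 */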
static void subflow_init_req(struct request_sock *req, const struct sock *sk_listener)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);

	subflow_req->mp_capable = 0;
	subflow_req->mp_join = 0;
	subflow_req->csum_reqd = mptcp_is_checksum_enabled(sock_net(sk_listener));
	subflow_req->allow_join_id0 = mptcp_allow_join_id0(sock_net(sk_listener));
	subflow_req->msk = NULL;
	mptcp_token_init_request(req);
}
static bool subflow_use_different_sport(struct mptcp_sock *msk, const struct sock *sk)
{
	return inet_sk(sk)->inet_sport != inet_sk((struct sock *)msk)->inet_sport;
}
static void subflow_add_reset_reason(struct sk_buff *skb, u8 reason)
{
	struct mptcp_ext *mpext = skb_ext_add(skb, SKB_EXT_MPTCP);

	if (mpext) {
		memset(mpext, 0, sizeof(*mpext));
		mpext->reset_reason = reason;
	}
}
static int subflow_reset_req_endp(struct request_sock *req, struct sk_buff *skb)
{
	SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEENDPATTEMPT);
	subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT);
	return -EPERM;
}
/* Init mptcp request socket.
 *
 * Returns an error code if a JOIN has failed and a TCP reset
 * should be sent.
 */
static int subflow_check_req(struct request_sock *req,
			     const struct sock *sk_listener,
			     struct sk_buff *skb)
{
	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct mptcp_options_received mp_opt;
	bool opt_mp_capable, opt_mp_join;

	pr_debug("subflow_req=%p, listener=%p\n", subflow_req, listener);

#ifdef CONFIG_TCP_MD5SIG
	/* no MPTCP if MD5SIG is enabled on this socket or we may run out of
	 * TCP option space.
	 */
	if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info)) {
		subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP);
		return -EINVAL;
	}
#endif

	mptcp_get_options(skb, &mp_opt);

	opt_mp_capable = !!(mp_opt.suboptions & OPTION_MPTCP_MPC_SYN);
	opt_mp_join = !!(mp_opt.suboptions & OPTION_MPTCP_MPJ_SYN);
	if (opt_mp_capable) {
		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE);

		if (unlikely(listener->pm_listener))
			return subflow_reset_req_endp(req, skb);

		if (opt_mp_join)
			return 0;
	} else if (opt_mp_join) {
		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX);

		if (mp_opt.backup)
			SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNBACKUPRX);
	} else if (unlikely(listener->pm_listener)) {
		return subflow_reset_req_endp(req, skb);
	}

	if (opt_mp_capable && listener->request_mptcp) {
		int err, retries = MPTCP_TOKEN_MAX_RETRIES;

		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
again:
		do {
			get_random_bytes(&subflow_req->local_key, sizeof(subflow_req->local_key));
		} while (subflow_req->local_key == 0);

		if (unlikely(req->syncookie)) {
			mptcp_crypto_key_sha(subflow_req->local_key,
					     &subflow_req->token,
					     &subflow_req->idsn);
			if (mptcp_token_exists(subflow_req->token)) {
				if (retries-- > 0)
					goto again;
				SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_TOKENFALLBACKINIT);
			} else {
				subflow_req->mp_capable = 1;
			}
			return 0;
		}

		err = mptcp_token_new_request(req);
		if (err == 0)
			subflow_req->mp_capable = 1;
		else if (retries-- > 0)
			goto again;
		else
			SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_TOKENFALLBACKINIT);

	} else if (opt_mp_join && listener->request_mptcp) {
		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
		subflow_req->mp_join = 1;
		subflow_req->backup = mp_opt.backup;
		subflow_req->remote_id = mp_opt.join_id;
		subflow_req->token = mp_opt.token;
		subflow_req->remote_nonce = mp_opt.nonce;
		subflow_req->msk = subflow_token_join_request(req);

		/* Can't fall back to TCP in this case. */
		if (!subflow_req->msk) {
			subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP);
			return -EPERM;
		}

		if (subflow_use_different_sport(subflow_req->msk, sk_listener)) {
			pr_debug("syn inet_sport=%d %d\n",
				 ntohs(inet_sk(sk_listener)->inet_sport),
				 ntohs(inet_sk((struct sock *)subflow_req->msk)->inet_sport));
			if (!mptcp_pm_sport_in_anno_list(subflow_req->msk, sk_listener)) {
				SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTSYNRX);
				subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT);
				return -EPERM;
			}
			SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINPORTSYNRX);
		}

		subflow_req_create_thmac(subflow_req);

		if (unlikely(req->syncookie)) {
			if (!mptcp_can_accept_new_subflow(subflow_req->msk)) {
				subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT);
				return -EPERM;
			}

			subflow_init_req_cookie_join_save(subflow_req, skb);
		}

		pr_debug("token=%u, remote_nonce=%u msk=%p\n", subflow_req->token,
			 subflow_req->remote_nonce, subflow_req->msk);
	}

	return 0;
}
int mptcp_subflow_init_cookie_req(struct request_sock *req,
				  const struct sock *sk_listener,
				  struct sk_buff *skb)
{
	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct mptcp_options_received mp_opt;
	bool opt_mp_capable, opt_mp_join;
	int err;

	subflow_init_req(req, sk_listener);
	mptcp_get_options(skb, &mp_opt);

	opt_mp_capable = !!(mp_opt.suboptions & OPTION_MPTCP_MPC_ACK);
	opt_mp_join = !!(mp_opt.suboptions & OPTION_MPTCP_MPJ_ACK);
	if (opt_mp_capable && opt_mp_join)
		return -EINVAL;

	if (opt_mp_capable && listener->request_mptcp) {
		if (mp_opt.sndr_key == 0)
			return -EINVAL;

		subflow_req->local_key = mp_opt.rcvr_key;
		err = mptcp_token_new_request(req);
		if (err)
			return err;

		subflow_req->mp_capable = 1;
		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1;
	} else if (opt_mp_join && listener->request_mptcp) {
		if (!mptcp_token_join_cookie_init_state(subflow_req, skb))
			return -EINVAL;

		subflow_req->mp_join = 1;
		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(mptcp_subflow_init_cookie_req);
static enum sk_rst_reason mptcp_get_rst_reason(const struct sk_buff *skb)
{
	const struct mptcp_ext *mpext = mptcp_get_ext(skb);

	if (!mpext)
		return SK_RST_REASON_NOT_SPECIFIED;

	return sk_rst_convert_mptcp_reason(mpext->reset_reason);
}
static struct dst_entry *subflow_v4_route_req(const struct sock *sk,
					      struct sk_buff *skb,
					      struct flowi *fl,
					      struct request_sock *req,
					      u32 tw_isn)
{
	struct dst_entry *dst;
	int err;

	tcp_rsk(req)->is_mptcp = 1;
	subflow_init_req(req, sk);

	dst = tcp_request_sock_ipv4_ops.route_req(sk, skb, fl, req, tw_isn);
	if (!dst)
		return NULL;

	err = subflow_check_req(req, sk, skb);
	if (err == 0)
		return dst;

	dst_release(dst);
	if (!req->syncookie)
		tcp_request_sock_ops.send_reset(sk, skb,
						mptcp_get_rst_reason(skb));
	return NULL;
}
static void subflow_prep_synack(const struct sock *sk, struct request_sock *req,
				struct tcp_fastopen_cookie *foc,
				enum tcp_synack_type synack_type)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct inet_request_sock *ireq = inet_rsk(req);

	/* clear tstamp_ok, as needed depending on cookie */
	if (foc && foc->len > -1)
		ireq->tstamp_ok = 0;

	if (synack_type == TCP_SYNACK_FASTOPEN)
		mptcp_fastopen_subflow_synack_set_params(subflow, req);
}
static int subflow_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
				  struct flowi *fl,
				  struct request_sock *req,
				  struct tcp_fastopen_cookie *foc,
				  enum tcp_synack_type synack_type,
				  struct sk_buff *syn_skb)
{
	subflow_prep_synack(sk, req, foc, synack_type);

	return tcp_request_sock_ipv4_ops.send_synack(sk, dst, fl, req, foc,
						     synack_type, syn_skb);
}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static int subflow_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
				  struct flowi *fl,
				  struct request_sock *req,
				  struct tcp_fastopen_cookie *foc,
				  enum tcp_synack_type synack_type,
				  struct sk_buff *syn_skb)
{
	subflow_prep_synack(sk, req, foc, synack_type);

	return tcp_request_sock_ipv6_ops.send_synack(sk, dst, fl, req, foc,
						     synack_type, syn_skb);
}
static struct dst_entry *subflow_v6_route_req(const struct sock *sk,
					      struct sk_buff *skb,
					      struct flowi *fl,
					      struct request_sock *req,
					      u32 tw_isn)
{
	struct dst_entry *dst;
	int err;

	tcp_rsk(req)->is_mptcp = 1;
	subflow_init_req(req, sk);

	dst = tcp_request_sock_ipv6_ops.route_req(sk, skb, fl, req, tw_isn);
	if (!dst)
		return NULL;

	err = subflow_check_req(req, sk, skb);
	if (err == 0)
		return dst;

	dst_release(dst);
	if (!req->syncookie)
		tcp6_request_sock_ops.send_reset(sk, skb,
						 mptcp_get_rst_reason(skb));
	return NULL;
}
#endif
/* validate received truncated hmac and create hmac for third ACK */
static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow)
{
	u8 hmac[SHA256_DIGEST_SIZE];
	u64 thmac;

	subflow_generate_hmac(subflow->remote_key, subflow->local_key,
			      subflow->remote_nonce, subflow->local_nonce,
			      hmac);

	thmac = get_unaligned_be64(hmac);
	pr_debug("subflow=%p, token=%u, thmac=%llu, subflow->thmac=%llu\n",
		 subflow, subflow->token, thmac, subflow->thmac);

	return thmac == subflow->thmac;
}
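
/* tear down the given subflow with an active TCP reset and let the msk
 * worker remove it from the connection list.
 */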
void mptcp_subflow_reset(struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	struct sock *sk = subflow->conn;

	/* mptcp_mp_fail_no_response() can reach here on an already closed
	 * socket
	 */
	if (ssk->sk_state == TCP_CLOSE)
		return;

	/* must hold: tcp_done() could drop last reference on parent */
	sock_hold(sk);

	mptcp_send_active_reset_reason(ssk);
	tcp_done(ssk);
	if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags))
		mptcp_schedule_work(sk);

	sock_put(sk);
}
static bool subflow_use_different_dport(struct mptcp_sock *msk, const struct sock *sk)
{
	return inet_sk(sk)->inet_dport != inet_sk((struct sock *)msk)->inet_dport;
}
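
/* sync the msk socket state with the first subflow once its handshake has
 * completed.
 */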
void __mptcp_sync_state(struct sock *sk, int state)
{
	struct mptcp_subflow_context *subflow;
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct sock *ssk = msk->first;

	subflow = mptcp_subflow_ctx(ssk);
	__mptcp_propagate_sndbuf(sk, ssk);
	if (!msk->rcvspace_init)
		mptcp_rcv_space_init(msk, ssk);

	if (sk->sk_state == TCP_SYN_SENT) {
		/* subflow->idsn is always available in TCP_SYN_SENT state,
		 * even for the FASTOPEN scenarios
		 */
		WRITE_ONCE(msk->write_seq, subflow->idsn + 1);
		WRITE_ONCE(msk->snd_nxt, msk->write_seq);
		mptcp_set_state(sk, state);
		sk->sk_state_change(sk);
	}
}
static void subflow_set_remote_key(struct mptcp_sock *msk,
				   struct mptcp_subflow_context *subflow,
				   const struct mptcp_options_received *mp_opt)
{
	/* active MPC subflow will reach here multiple times:
	 * at subflow_finish_connect() time and at 4th ack time
	 */
	if (subflow->remote_key_valid)
		return;

	subflow->remote_key_valid = 1;
	subflow->remote_key = mp_opt->sndr_key;
	mptcp_crypto_key_sha(subflow->remote_key, NULL, &subflow->iasn);
	subflow->iasn++;

	WRITE_ONCE(msk->remote_key, subflow->remote_key);
	WRITE_ONCE(msk->ack_seq, subflow->iasn);
	WRITE_ONCE(msk->can_ack, true);
	atomic64_set(&msk->rcv_wnd_sent, subflow->iasn);
}
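
/* update the msk rx/tx state out of the freshly connected subflow, deferring
 * the state sync to the release callback when the msk socket is owned by
 * user context.
 */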
static void mptcp_propagate_state(struct sock *sk, struct sock *ssk,
				  struct mptcp_subflow_context *subflow,
				  const struct mptcp_options_received *mp_opt)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	mptcp_data_lock(sk);
	if (mp_opt) {
		/* Options are available only in the non fallback cases
		 * avoid updating rx path fields otherwise
		 */
		WRITE_ONCE(msk->snd_una, subflow->idsn + 1);
		WRITE_ONCE(msk->wnd_end, subflow->idsn + 1 + tcp_sk(ssk)->snd_wnd);
		subflow_set_remote_key(msk, subflow, mp_opt);
	}

	if (!sock_owned_by_user(sk)) {
		__mptcp_sync_state(sk, ssk->sk_state);
	} else {
		msk->pending_state = ssk->sk_state;
		__set_bit(MPTCP_SYNC_STATE, &msk->cb_flags);
	}
	mptcp_data_unlock(sk);
}
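
/* called on the SYN-ACK of an actively opened subflow: complete the
 * MP_CAPABLE handshake or validate the MP_JOIN thmac and prepare the
 * third-ACK hmac, falling back or resetting the subflow on failure.
 */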
static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct mptcp_options_received mp_opt;
	struct sock *parent = subflow->conn;
	struct mptcp_sock *msk;

	subflow->icsk_af_ops->sk_rx_dst_set(sk, skb);

	/* be sure no special action on any packet other than syn-ack */
	if (subflow->conn_finished)
		return;

	msk = mptcp_sk(parent);
	subflow->rel_write_seq = 1;
	subflow->conn_finished = 1;
	subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
	pr_debug("subflow=%p synack seq=%x\n", subflow, subflow->ssn_offset);

	mptcp_get_options(skb, &mp_opt);
	if (subflow->request_mptcp) {
		if (!(mp_opt.suboptions & OPTION_MPTCP_MPC_SYNACK)) {
			MPTCP_INC_STATS(sock_net(sk),
					MPTCP_MIB_MPCAPABLEACTIVEFALLBACK);
			mptcp_do_fallback(sk);
			pr_fallback(msk);
			goto fallback;
		}

		if (mp_opt.suboptions & OPTION_MPTCP_CSUMREQD)
			WRITE_ONCE(msk->csum_enabled, true);
		if (mp_opt.deny_join_id0)
			WRITE_ONCE(msk->pm.remote_deny_join_id0, true);
		subflow->mp_capable = 1;
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVEACK);
		mptcp_finish_connect(sk);
		mptcp_active_enable(parent);
		mptcp_propagate_state(parent, sk, subflow, &mp_opt);
	} else if (subflow->request_join) {
		u8 hmac[SHA256_DIGEST_SIZE];

		if (!(mp_opt.suboptions & OPTION_MPTCP_MPJ_SYNACK)) {
			subflow->reset_reason = MPTCP_RST_EMPTCP;
			goto do_reset;
		}

		subflow->backup = mp_opt.backup;
		subflow->thmac = mp_opt.thmac;
		subflow->remote_nonce = mp_opt.nonce;
		WRITE_ONCE(subflow->remote_id, mp_opt.join_id);
		pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u backup=%d\n",
			 subflow, subflow->thmac, subflow->remote_nonce,
			 subflow->backup);

		if (!subflow_thmac_valid(subflow)) {
			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC);
			subflow->reset_reason = MPTCP_RST_EMPTCP;
			goto do_reset;
		}

		if (!mptcp_finish_join(sk))
			goto do_reset;

		subflow_generate_hmac(subflow->local_key, subflow->remote_key,
				      subflow->local_nonce,
				      subflow->remote_nonce,
				      hmac);
		memcpy(subflow->hmac, hmac, MPTCPOPT_HMAC_LEN);

		subflow->mp_join = 1;
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX);

		if (subflow->backup)
			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKBACKUPRX);

		if (subflow_use_different_dport(msk, sk)) {
			pr_debug("synack inet_dport=%d %d\n",
				 ntohs(inet_sk(sk)->inet_dport),
				 ntohs(inet_sk(parent)->inet_dport));
			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINPORTSYNACKRX);
		}
	} else if (mptcp_check_fallback(sk)) {
		/* It looks like MPTCP is blocked, while TCP is not */
		if (subflow->mpc_drop)
			mptcp_active_disable(parent);
fallback:
		mptcp_propagate_state(parent, sk, subflow, NULL);
	}
	return;

do_reset:
	subflow->reset_transient = 0;
	mptcp_subflow_reset(sk);
}
static void subflow_set_local_id(struct mptcp_subflow_context *subflow, int local_id)
{
	WARN_ON_ONCE(local_id < 0 || local_id > 255);
	WRITE_ONCE(subflow->local_id, local_id);
}
static int subflow_chk_local_id(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
	int err;

	if (likely(subflow->local_id >= 0))
		return 0;

	err = mptcp_pm_get_local_id(msk, (struct sock_common *)sk);
	if (err < 0)
		return err;

	subflow_set_local_id(subflow, err);
	subflow->request_bkup = mptcp_pm_is_backup(msk, (struct sock_common *)sk);

	return 0;
}
static int subflow_rebuild_header(struct sock *sk)
{
	int err = subflow_chk_local_id(sk);

	if (unlikely(err < 0))
		return err;

	return inet_sk_rebuild_header(sk);
}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static int subflow_v6_rebuild_header(struct sock *sk)
{
	int err = subflow_chk_local_id(sk);

	if (unlikely(err < 0))
		return err;

	return inet6_sk_rebuild_header(sk);
}
#endif
static struct request_sock_ops mptcp_subflow_v4_request_sock_ops __ro_after_init;
static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops __ro_after_init;
static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

	pr_debug("subflow=%p\n", subflow);

	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&mptcp_subflow_v4_request_sock_ops,
				&subflow_request_sock_ipv4_ops,
				sk, skb);
drop:
	tcp_listendrop(sk);
	return 0;
}
static void subflow_v4_req_destructor(struct request_sock *req)
{
	subflow_req_destructor(req);
	tcp_request_sock_ops.destructor(req);
}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static struct request_sock_ops mptcp_subflow_v6_request_sock_ops __ro_after_init;
static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops __ro_after_init;
static struct inet_connection_sock_af_ops subflow_v6_specific __ro_after_init;
static struct inet_connection_sock_af_ops subflow_v6m_specific __ro_after_init;
static struct proto tcpv6_prot_override __ro_after_init;
static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

	pr_debug("subflow=%p\n", subflow);

	if (skb->protocol == htons(ETH_P_IP))
		return subflow_v4_conn_request(sk, skb);

	if (!ipv6_unicast_destination(skb))
		goto drop;

	if (ipv6_addr_v4mapped(&ipv6_hdr(skb)->saddr)) {
		__IP6_INC_STATS(sock_net(sk), NULL, IPSTATS_MIB_INHDRERRORS);
		return 0;
	}

	return tcp_conn_request(&mptcp_subflow_v6_request_sock_ops,
				&subflow_request_sock_ipv6_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0; /* don't send reset */
}
static void subflow_v6_req_destructor(struct request_sock *req)
{
	subflow_req_destructor(req);
	tcp6_request_sock_ops.destructor(req);
}
#endif
struct request_sock *mptcp_subflow_reqsk_alloc(const struct request_sock_ops *ops,
					       struct sock *sk_listener,
					       bool attach_listener)
{
	if (ops->family == AF_INET)
		ops = &mptcp_subflow_v4_request_sock_ops;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	else if (ops->family == AF_INET6)
		ops = &mptcp_subflow_v6_request_sock_ops;
#endif

	return inet_reqsk_alloc(ops, sk_listener, attach_listener);
}
EXPORT_SYMBOL(mptcp_subflow_reqsk_alloc);
/* validate hmac received in third ACK */
static bool subflow_hmac_valid(const struct request_sock *req,
			       const struct mptcp_options_received *mp_opt)
{
	const struct mptcp_subflow_request_sock *subflow_req;
	u8 hmac[SHA256_DIGEST_SIZE];
	struct mptcp_sock *msk;

	subflow_req = mptcp_subflow_rsk(req);
	msk = subflow_req->msk;
	if (!msk)
		return false;

	subflow_generate_hmac(READ_ONCE(msk->remote_key),
			      READ_ONCE(msk->local_key),
			      subflow_req->remote_nonce,
			      subflow_req->local_nonce, hmac);

	return !crypto_memneq(hmac, mp_opt->hmac, MPTCPOPT_HMAC_LEN);
}
static void subflow_ulp_fallback(struct sock *sk,
				 struct mptcp_subflow_context *old_ctx)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	mptcp_subflow_tcp_fallback(sk, old_ctx);
	icsk->icsk_ulp_ops = NULL;
	rcu_assign_pointer(icsk->icsk_ulp_data, NULL);
	tcp_sk(sk)->is_mptcp = 0;

	mptcp_subflow_ops_undo_override(sk);
}
void mptcp_subflow_drop_ctx(struct sock *ssk)
{
	struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk);

	if (!ctx)
		return;

	list_del(&mptcp_subflow_ctx(ssk)->node);
	if (inet_csk(ssk)->icsk_ulp_ops) {
		subflow_ulp_fallback(ssk, ctx);
		if (ctx->conn)
			sock_put(ctx->conn);
	}

	kfree_rcu(ctx, rcu);
}
void __mptcp_subflow_fully_established(struct mptcp_sock *msk,
				       struct mptcp_subflow_context *subflow,
				       const struct mptcp_options_received *mp_opt)
{
	subflow_set_remote_key(msk, subflow, mp_opt);
	WRITE_ONCE(subflow->fully_established, true);
	WRITE_ONCE(msk->fully_established, true);

	if (subflow->is_mptfo)
		__mptcp_fastopen_gen_msk_ackseq(msk, subflow, mp_opt);
}
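
/* create the child socket for an incoming connection, attaching (or falling
 * back from) the MPTCP context according to the options carried by the
 * third ACK.
 */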
static struct sock *subflow_syn_recv_sock(const struct sock *sk,
					  struct sk_buff *skb,
					  struct request_sock *req,
					  struct dst_entry *dst,
					  struct request_sock *req_unhash,
					  bool *own_req)
{
	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk);
	struct mptcp_subflow_request_sock *subflow_req;
	struct mptcp_options_received mp_opt;
	bool fallback, fallback_is_fatal;
	enum sk_rst_reason reason;
	struct mptcp_sock *owner;
	struct sock *child;

	pr_debug("listener=%p, req=%p, conn=%p\n", listener, req, listener->conn);

	/* After child creation we must look for MPC even when options
	 * are not parsed
	 */
	mp_opt.suboptions = 0;

	/* hopefully temporary handling for MP_JOIN+syncookie */
	subflow_req = mptcp_subflow_rsk(req);
	fallback_is_fatal = tcp_rsk(req)->is_mptcp && subflow_req->mp_join;
	fallback = !tcp_rsk(req)->is_mptcp;
	if (fallback)
		goto create_child;

	/* if the sk is MP_CAPABLE, we try to fetch the client key */
	if (subflow_req->mp_capable) {
		/* we can receive and accept an in-window, out-of-order pkt,
		 * which may not carry the MP_CAPABLE opt even on mptcp enabled
		 * paths: always try to extract the peer key, and fallback
		 * for packets missing it.
		 * Even OoO DSS packets coming legitly after dropped or
		 * reordered MPC will cause fallback, but we don't have other
		 * options.
		 */
		mptcp_get_options(skb, &mp_opt);
		if (!(mp_opt.suboptions &
		      (OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_ACK)))
			fallback = true;

	} else if (subflow_req->mp_join) {
		mptcp_get_options(skb, &mp_opt);
		if (!(mp_opt.suboptions & OPTION_MPTCP_MPJ_ACK) ||
		    !subflow_hmac_valid(req, &mp_opt) ||
		    !mptcp_can_accept_new_subflow(subflow_req->msk)) {
			SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC);
			fallback = true;
		}
	}

create_child:
	child = listener->icsk_af_ops->syn_recv_sock(sk, skb, req, dst,
						     req_unhash, own_req);

	if (child && *own_req) {
		struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child);

		tcp_rsk(req)->drop_req = false;

		/* we need to fallback on ctx allocation failure and on pre-reqs
		 * checking above. In the latter scenario we additionally need
		 * to reset the context to non MPTCP status.
		 */
		if (!ctx || fallback) {
			if (fallback_is_fatal) {
				subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP);
				goto dispose_child;
			}
			goto fallback;
		}

		/* ssk inherits options of listener sk */
		ctx->setsockopt_seq = listener->setsockopt_seq;

		if (ctx->mp_capable) {
			ctx->conn = mptcp_sk_clone_init(listener->conn, &mp_opt, child, req);
			if (!ctx->conn)
				goto fallback;

			owner = mptcp_sk(ctx->conn);
			mptcp_pm_new_connection(owner, child, 1);

			/* with OoO packets we can reach here without ingress
			 * mpc option
			 */
			if (mp_opt.suboptions & OPTION_MPTCP_MPC_ACK) {
				mptcp_pm_fully_established(owner, child);
				ctx->pm_notified = 1;
			}
		} else if (ctx->mp_join) {
			owner = subflow_req->msk;
			if (!owner) {
				subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT);
				goto dispose_child;
			}

			/* move the msk reference ownership to the subflow */
			subflow_req->msk = NULL;
			ctx->conn = (struct sock *)owner;

			if (subflow_use_different_sport(owner, sk)) {
				pr_debug("ack inet_sport=%d %d\n",
					 ntohs(inet_sk(sk)->inet_sport),
					 ntohs(inet_sk((struct sock *)owner)->inet_sport));
				if (!mptcp_pm_sport_in_anno_list(owner, sk)) {
					SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTACKRX);
					subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT);
					goto dispose_child;
				}
				SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINPORTACKRX);
			}

			if (!mptcp_finish_join(child)) {
				struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(child);

				subflow_add_reset_reason(skb, subflow->reset_reason);
				goto dispose_child;
			}

			SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX);
			tcp_rsk(req)->drop_req = true;
		}
	}

	/* check for expected invariant - should never trigger, just help
	 * catching earlier subtle bugs
	 */
	WARN_ON_ONCE(child && *own_req && tcp_sk(child)->is_mptcp &&
		     (!mptcp_subflow_ctx(child) ||
		      !mptcp_subflow_ctx(child)->conn));
	return child;

dispose_child:
	mptcp_subflow_drop_ctx(child);
	tcp_rsk(req)->drop_req = true;
	inet_csk_prepare_for_destroy_sock(child);
	tcp_done(child);
	reason = mptcp_get_rst_reason(skb);
	req->rsk_ops->send_reset(sk, skb, reason);

	/* The last child reference will be released by the caller */
	return child;

fallback:
	if (fallback)
		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK);
	mptcp_subflow_drop_ctx(child);
	return child;
}
static struct inet_connection_sock_af_ops subflow_specific __ro_after_init;
static struct proto tcp_prot_override __ro_after_init;
enum mapping_status {
	MAPPING_OK,
	MAPPING_INVALID,
	MAPPING_EMPTY,
	MAPPING_DATA_FIN,
	MAPPING_DUMMY,
	MAPPING_BAD_CSUM,
	MAPPING_NODSS
};
static void dbg_bad_map(struct mptcp_subflow_context *subflow, u32 ssn)
{
	pr_debug("Bad mapping: ssn=%d map_seq=%d map_data_len=%d\n",
		 ssn, subflow->map_subflow_seq, subflow->map_data_len);
}
static bool skb_is_fully_mapped(struct sock *ssk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	unsigned int skb_consumed;

	skb_consumed = tcp_sk(ssk)->copied_seq - TCP_SKB_CB(skb)->seq;
	if (unlikely(skb_consumed >= skb->len)) {
		DEBUG_NET_WARN_ON_ONCE(1);
		return true;
	}

	return skb->len - skb_consumed <= subflow->map_data_len -
					  mptcp_subflow_get_map_offset(subflow);
}
static bool validate_mapping(struct sock *ssk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	u32 ssn = tcp_sk(ssk)->copied_seq - subflow->ssn_offset;

	if (unlikely(before(ssn, subflow->map_subflow_seq))) {
		/* Mapping covers data later in the subflow stream,
		 * currently unsupported.
		 */
		dbg_bad_map(subflow, ssn);
		return false;
	}
	if (unlikely(!before(ssn, subflow->map_subflow_seq +
				  subflow->map_data_len))) {
		/* Mapping covers past subflow data, invalid */
		dbg_bad_map(subflow, ssn);
		return false;
	}
	return true;
}
static enum mapping_status validate_data_csum(struct sock *ssk, struct sk_buff *skb,
					      bool csum_reqd)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	u32 offset, seq, delta;
	__sum16 csum;
	int len;

	if (!csum_reqd)
		return MAPPING_OK;

	/* mapping already validated on previous traversal */
	if (subflow->map_csum_len == subflow->map_data_len)
		return MAPPING_OK;

	/* traverse the receive queue, ensuring it contains a full
	 * DSS mapping and accumulating the related csum.
	 * Preserve the accumulated csum across multiple calls, to compute
	 * the csum only once.
	 */
	delta = subflow->map_data_len - subflow->map_csum_len;
	for (;;) {
		seq = tcp_sk(ssk)->copied_seq + subflow->map_csum_len;
		offset = seq - TCP_SKB_CB(skb)->seq;

		/* if the current skb has not been accounted yet, csum its contents
		 * up to the amount covered by the current DSS
		 */
		if (offset < skb->len) {
			__wsum csum;

			len = min(skb->len - offset, delta);
			csum = skb_checksum(skb, offset, len, 0);
			subflow->map_data_csum = csum_block_add(subflow->map_data_csum, csum,
								subflow->map_csum_len);

			delta -= len;
			subflow->map_csum_len += len;
		}
		if (delta == 0)
			break;

		if (skb_queue_is_last(&ssk->sk_receive_queue, skb)) {
			/* if this subflow is closed, the partial mapping
			 * will be never completed; flush the pending skbs, so
			 * that subflow_sched_work_if_closed() can kick in
			 */
			if (unlikely(ssk->sk_state == TCP_CLOSE))
				while ((skb = skb_peek(&ssk->sk_receive_queue)))
					sk_eat_skb(ssk, skb);

			/* not enough data to validate the csum */
			return MAPPING_EMPTY;
		}

		/* the DSS mapping for next skbs will be validated later,
		 * when a get_mapping_status call will process such skb
		 */
		skb = skb->next;
	}

	/* note that 'map_data_len' accounts only for the carried data, does
	 * not include the eventual seq increment due to the data fin,
	 * while the pseudo header requires the original DSS data len,
	 * including that
	 */
	csum = __mptcp_make_csum(subflow->map_seq,
				 subflow->map_subflow_seq,
				 subflow->map_data_len + subflow->map_data_fin,
				 subflow->map_data_csum);
	if (unlikely(csum)) {
		MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DATACSUMERR);
		return MAPPING_BAD_CSUM;
	}

	subflow->valid_csum_seen = 1;
	return MAPPING_OK;
}
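
/* parse the DSS option of the first skb in the subflow receive queue and
 * update the current MPTCP-level mapping accordingly.
 */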
static enum mapping_status get_mapping_status(struct sock *ssk,
					      struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	bool csum_reqd = READ_ONCE(msk->csum_enabled);
	struct mptcp_ext *mpext;
	struct sk_buff *skb;
	u16 data_len;
	u64 map_seq;

	skb = skb_peek(&ssk->sk_receive_queue);
	if (!skb)
		return MAPPING_EMPTY;

	if (mptcp_check_fallback(ssk))
		return MAPPING_DUMMY;

	mpext = mptcp_get_ext(skb);
	if (!mpext || !mpext->use_map) {
		if (!subflow->map_valid && !skb->len) {
			/* the TCP stack deliver 0 len FIN pkt to the receive
			 * queue, that is the only 0len pkts ever expected here,
			 * and we can admit no mapping only for 0 len pkts
			 */
			if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
				WARN_ONCE(1, "0len seq %d:%d flags %x",
					  TCP_SKB_CB(skb)->seq,
					  TCP_SKB_CB(skb)->end_seq,
					  TCP_SKB_CB(skb)->tcp_flags);
			sk_eat_skb(ssk, skb);
			return MAPPING_EMPTY;
		}

		/* If the required DSS has likely been dropped by a middlebox */
		if (!subflow->map_valid)
			return MAPPING_NODSS;

		goto validate_seq;
	}

	trace_get_mapping_status(mpext);

	data_len = mpext->data_len;
	if (data_len == 0) {
		pr_debug("infinite mapping received\n");
		MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX);
		subflow->map_data_len = 0;
		return MAPPING_INVALID;
	}

	if (mpext->data_fin == 1) {
		u64 data_fin_seq;

		if (data_len == 1) {
			bool updated = mptcp_update_rcv_data_fin(msk, mpext->data_seq,
								 mpext->dsn64);
			pr_debug("DATA_FIN with no payload seq=%llu\n", mpext->data_seq);
			if (subflow->map_valid) {
				/* A DATA_FIN might arrive in a DSS
				 * option before the previous mapping
				 * has been fully consumed. Continue
				 * handling the existing mapping.
				 */
				skb_ext_del(skb, SKB_EXT_MPTCP);
				return MAPPING_OK;
			}

			if (updated)
				mptcp_schedule_work((struct sock *)msk);

			return MAPPING_DATA_FIN;
		}

		data_fin_seq = mpext->data_seq + data_len - 1;

		/* If mpext->data_seq is a 32-bit value, data_fin_seq must also
		 * be limited to 32 bits.
		 */
		if (!mpext->dsn64)
			data_fin_seq &= GENMASK_ULL(31, 0);

		mptcp_update_rcv_data_fin(msk, data_fin_seq, mpext->dsn64);
		pr_debug("DATA_FIN with mapping seq=%llu dsn64=%d\n",
			 data_fin_seq, mpext->dsn64);

		/* Adjust for DATA_FIN using 1 byte of sequence space */
		data_len--;
	}

	map_seq = mptcp_expand_seq(READ_ONCE(msk->ack_seq), mpext->data_seq, mpext->dsn64);
	WRITE_ONCE(mptcp_sk(subflow->conn)->use_64bit_ack, !!mpext->dsn64);

	if (subflow->map_valid) {
		/* Allow replacing only with an identical map */
		if (subflow->map_seq == map_seq &&
		    subflow->map_subflow_seq == mpext->subflow_seq &&
		    subflow->map_data_len == data_len &&
		    subflow->map_csum_reqd == mpext->csum_reqd) {
			skb_ext_del(skb, SKB_EXT_MPTCP);
			goto validate_csum;
		}

		/* If this skb data are fully covered by the current mapping,
		 * the new map would need caching, which is not supported
		 */
		if (skb_is_fully_mapped(ssk, skb)) {
			MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSNOMATCH);
			return MAPPING_INVALID;
		}

		/* will validate the next map after consuming the current one */
		goto validate_csum;
	}

	subflow->map_seq = map_seq;
	subflow->map_subflow_seq = mpext->subflow_seq;
	subflow->map_data_len = data_len;
	subflow->map_valid = 1;
	subflow->map_data_fin = mpext->data_fin;
	subflow->mpc_map = mpext->mpc_map;
	subflow->map_csum_reqd = mpext->csum_reqd;
	subflow->map_csum_len = 0;
	subflow->map_data_csum = csum_unfold(mpext->csum);

	/* Cfr RFC 8684 Section 3.3.0 */
	if (unlikely(subflow->map_csum_reqd != csum_reqd))
		return MAPPING_INVALID;

	pr_debug("new map seq=%llu subflow_seq=%u data_len=%u csum=%d:%u\n",
		 subflow->map_seq, subflow->map_subflow_seq,
		 subflow->map_data_len, subflow->map_csum_reqd,
		 subflow->map_data_csum);

validate_seq:
	/* we revalidate valid mapping on new skb, because we must ensure
	 * the current skb is completely covered by the available mapping
	 */
	if (!validate_mapping(ssk, skb)) {
		MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSTCPMISMATCH);
		return MAPPING_INVALID;
	}

	skb_ext_del(skb, SKB_EXT_MPTCP);

validate_csum:
	return validate_data_csum(ssk, skb, csum_reqd);
}
static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb,
				       u64 limit)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	bool fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
	struct tcp_sock *tp = tcp_sk(ssk);
	u32 offset, incr, avail_len;

	offset = tp->copied_seq - TCP_SKB_CB(skb)->seq;
	if (WARN_ON_ONCE(offset > skb->len))
		goto out;

	avail_len = skb->len - offset;
	incr = limit >= avail_len ? avail_len + fin : limit;

	pr_debug("discarding=%d len=%d offset=%d seq=%d\n", incr, skb->len,
		 offset, subflow->map_subflow_seq);
	MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DUPDATA);
	tcp_sk(ssk)->copied_seq += incr;

out:
	if (!before(tcp_sk(ssk)->copied_seq, TCP_SKB_CB(skb)->end_seq))
		sk_eat_skb(ssk, skb);
	if (mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len)
		subflow->map_valid = 0;
}
/* sched mptcp worker to remove the subflow if no more data is pending */
static void subflow_sched_work_if_closed(struct mptcp_sock *msk, struct sock *ssk)
{
	struct sock *sk = (struct sock *)msk;

	if (likely(ssk->sk_state != TCP_CLOSE &&
		   (ssk->sk_state != TCP_CLOSE_WAIT ||
		    inet_sk_state_load(sk) != TCP_ESTABLISHED)))
		return;

	if (skb_queue_empty(&ssk->sk_receive_queue) &&
	    !test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags))
		mptcp_schedule_work(sk);
}
static bool subflow_can_fallback(struct mptcp_subflow_context *subflow)
{
	struct mptcp_sock *msk = mptcp_sk(subflow->conn);

	if (subflow->mp_join)
		return false;
	else if (READ_ONCE(msk->csum_enabled))
		return !subflow->valid_csum_seen;
	else
		return READ_ONCE(msk->allow_infinite_fallback);
}
static void mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	unsigned long fail_tout;

	/* graceful failure can happen only on the MPC subflow */
	if (WARN_ON_ONCE(ssk != READ_ONCE(msk->first)))
		return;

	/* since the close timeout takes precedence on the fail one,
	 * no need to start the latter when the first is already set
	 */
	if (sock_flag((struct sock *)msk, SOCK_DEAD))
		return;

	/* we don't need extreme accuracy here, use a zero fail_tout as special
	 * value meaning no fail timeout at all;
	 */
	fail_tout = jiffies + TCP_RTO_MAX;
	if (!fail_tout)
		fail_tout = 1;
	WRITE_ONCE(subflow->fail_tout, fail_tout);
	tcp_send_ack(ssk);

	mptcp_reset_tout_timer(msk, subflow->fail_tout);
}
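
/* check whether in-sequence MPTCP-level data is available on this subflow,
 * handling mapping errors with fallback, MP_FAIL or subflow reset as
 * appropriate.
 */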
static bool subflow_check_data_avail(struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	enum mapping_status status;
	struct mptcp_sock *msk;
	struct sk_buff *skb;

	if (!skb_peek(&ssk->sk_receive_queue))
		WRITE_ONCE(subflow->data_avail, false);
	if (subflow->data_avail)
		return true;

	msk = mptcp_sk(subflow->conn);
	for (;;) {
		u64 ack_seq;
		u64 old_ack;

		status = get_mapping_status(ssk, msk);
		trace_subflow_check_data_avail(status, skb_peek(&ssk->sk_receive_queue));
		if (unlikely(status == MAPPING_INVALID || status == MAPPING_DUMMY ||
			     status == MAPPING_BAD_CSUM || status == MAPPING_NODSS))
			goto fallback;

		if (status != MAPPING_OK)
			goto no_data;

		skb = skb_peek(&ssk->sk_receive_queue);
		if (WARN_ON_ONCE(!skb))
			goto no_data;

		if (unlikely(!READ_ONCE(msk->can_ack)))
			goto fallback;

		old_ack = READ_ONCE(msk->ack_seq);
		ack_seq = mptcp_subflow_get_mapped_dsn(subflow);
		pr_debug("msk ack_seq=%llx subflow ack_seq=%llx\n", old_ack,
			 ack_seq);
		if (unlikely(before64(ack_seq, old_ack))) {
			mptcp_subflow_discard_data(ssk, skb, old_ack - ack_seq);
			continue;
		}

		WRITE_ONCE(subflow->data_avail, true);
		break;
	}
	return true;

no_data:
	subflow_sched_work_if_closed(msk, ssk);
	return false;

fallback:
	if (!__mptcp_check_fallback(msk)) {
		/* RFC 8684 section 3.7. */
		if (status == MAPPING_BAD_CSUM &&
		    (subflow->mp_join || subflow->valid_csum_seen)) {
			subflow->send_mp_fail = 1;

			if (!READ_ONCE(msk->allow_infinite_fallback)) {
				subflow->reset_transient = 0;
				subflow->reset_reason = MPTCP_RST_EMIDDLEBOX;
				goto reset;
			}
			mptcp_subflow_fail(msk, ssk);
			WRITE_ONCE(subflow->data_avail, true);
			return true;
		}

		if (!subflow_can_fallback(subflow) && subflow->map_data_len) {
			/* fatal protocol error, close the socket.
			 * subflow_error_report() will introduce the appropriate barriers
			 */
			subflow->reset_transient = 0;
			subflow->reset_reason = status == MAPPING_NODSS ?
						MPTCP_RST_EMIDDLEBOX :
						MPTCP_RST_EMPTCP;

reset:
			WRITE_ONCE(ssk->sk_err, EBADMSG);
			tcp_set_state(ssk, TCP_CLOSE);
			while ((skb = skb_peek(&ssk->sk_receive_queue)))
				sk_eat_skb(ssk, skb);
			mptcp_send_active_reset_reason(ssk);
			WRITE_ONCE(subflow->data_avail, false);
			return false;
		}

		mptcp_do_fallback(ssk);
	}

	skb = skb_peek(&ssk->sk_receive_queue);
	subflow->map_valid = 1;
	subflow->map_seq = READ_ONCE(msk->ack_seq);
	subflow->map_data_len = skb->len;
	subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq - subflow->ssn_offset;
	WRITE_ONCE(subflow->data_avail, true);
	return true;
}
bool mptcp_subflow_data_available(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

	/* check if current mapping is still valid */
	if (subflow->map_valid &&
	    mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) {
		subflow->map_valid = 0;
		WRITE_ONCE(subflow->data_avail, false);

		pr_debug("Done with mapping: seq=%u data_len=%u\n",
			 subflow->map_subflow_seq,
			 subflow->map_data_len);
	}

	return subflow_check_data_avail(sk);
}
/* If ssk has an mptcp parent socket, use the mptcp rcvbuf occupancy,
 * not the ssk one.
 *
 * In mptcp, rwin is about the mptcp-level connection data.
 *
 * Data that is still on the ssk rx queue can thus be ignored,
 * as far as mptcp peer is concerned that data is still inflight.
 * DSS ACK is updated when skb is moved to the mptcp rx queue.
 */
void mptcp_space(const struct sock *ssk, int *space, int *full_space)
{
	const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	const struct sock *sk = subflow->conn;

	*space = __mptcp_space(sk);
	*full_space = mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf));
}
static void subflow_error_report(struct sock *ssk)
{
	struct sock *sk = mptcp_subflow_ctx(ssk)->conn;

	/* bail early if this is a no-op, so that we avoid introducing a
	 * problematic lockdep dependency between TCP accept queue lock
	 * and msk socket spinlock
	 */
	if (!sk->sk_socket)
		return;

	mptcp_data_lock(sk);
	if (!sock_owned_by_user(sk))
		__mptcp_error_report(sk);
	else
		__set_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->cb_flags);
	mptcp_data_unlock(sk);
}
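
/* ->sk_data_ready() of the TCP subflow: wake the listening parent or feed
 * the msk receive path.
 */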
static void subflow_data_ready(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	u16 state = 1 << inet_sk_state_load(sk);
	struct sock *parent = subflow->conn;
	struct mptcp_sock *msk;

	trace_sk_data_ready(sk);

	msk = mptcp_sk(parent);
	if (state & TCPF_LISTEN) {
		/* MPJ subflow are removed from accept queue before reaching here,
		 * avoid stray wakeups
		 */
		if (reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue))
			return;

		parent->sk_data_ready(parent);
		return;
	}

	WARN_ON_ONCE(!__mptcp_check_fallback(msk) && !subflow->mp_capable &&
		     !subflow->mp_join && !(state & TCPF_CLOSE));

	if (mptcp_subflow_data_available(sk)) {
		mptcp_data_ready(parent, sk);

		/* subflow-level lowat tests are not relevant.
		 * respect the msk-level threshold eventually mandating an immediate ack
		 */
		if (mptcp_data_avail(msk) < parent->sk_rcvlowat &&
		    (tcp_sk(sk)->rcv_nxt - tcp_sk(sk)->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss)
			inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
	} else if (unlikely(sk->sk_err)) {
		subflow_error_report(sk);
	}
}
static void subflow_write_space(struct sock *ssk)
{
	struct sock *sk = mptcp_subflow_ctx(ssk)->conn;

	mptcp_propagate_sndbuf(sk, ssk);
	mptcp_write_space(sk);
}
static const struct inet_connection_sock_af_ops *
subflow_default_af_ops(struct sock *sk)
{
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	if (sk->sk_family == AF_INET6)
		return &subflow_v6_specific;
#endif
	return &subflow_specific;
}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
void mptcpv6_handle_mapped(struct sock *sk, bool mapped)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_connection_sock_af_ops *target;

	target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk);

	pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d\n",
		 subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped);

	if (likely(icsk->icsk_af_ops == target))
		return;

	subflow->icsk_af_ops = icsk->icsk_af_ops;
	icsk->icsk_af_ops = target;
}
#endif
void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
			 struct sockaddr_storage *addr,
			 unsigned short family)
{
	memset(addr, 0, sizeof(*addr));
	addr->ss_family = family;
	if (addr->ss_family == AF_INET) {
		struct sockaddr_in *in_addr = (struct sockaddr_in *)addr;

		if (info->family == AF_INET)
			in_addr->sin_addr = info->addr;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
		else if (ipv6_addr_v4mapped(&info->addr6))
			in_addr->sin_addr.s_addr = info->addr6.s6_addr32[3];
#endif
		in_addr->sin_port = info->port;
	}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	else if (addr->ss_family == AF_INET6) {
		struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)addr;

		if (info->family == AF_INET)
			ipv6_addr_set_v4mapped(info->addr.s_addr,
					       &in6_addr->sin6_addr);
		else
			in6_addr->sin6_addr = info->addr6;
		in6_addr->sin6_port = info->port;
	}
#endif
}
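
/* create, bind and connect a new MP_JOIN subflow towards the given remote
 * address on behalf of the path manager; called with the msk socket lock held.
 */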
int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_pm_local *local,
			    const struct mptcp_addr_info *remote)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct mptcp_subflow_context *subflow;
	int local_id = local->addr.id;
	struct sockaddr_storage addr;
	int remote_id = remote->id;
	int err = -ENOTCONN;
	struct socket *sf;
	struct sock *ssk;
	u32 remote_token;
	int addrlen;

	/* The userspace PM sent the request too early? */
	if (!mptcp_is_fully_established(sk))
		goto err_out;

	err = mptcp_subflow_create_socket(sk, local->addr.family, &sf);
	if (err) {
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNTXCREATSKERR);
		pr_debug("msk=%p local=%d remote=%d create sock error: %d\n",
			 msk, local_id, remote_id, err);
		goto err_out;
	}

	ssk = sf->sk;
	subflow = mptcp_subflow_ctx(ssk);
	do {
		get_random_bytes(&subflow->local_nonce, sizeof(u32));
	} while (!subflow->local_nonce);

	/* if 'IPADDRANY', the ID will be set later, after the routing */
	if (local->addr.family == AF_INET) {
		if (!local->addr.addr.s_addr)
			local_id = -1;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	} else if (sk->sk_family == AF_INET6) {
		if (ipv6_addr_any(&local->addr.addr6))
			local_id = -1;
#endif
	}

	if (local_id >= 0)
		subflow_set_local_id(subflow, local_id);

	subflow->remote_key_valid = 1;
	subflow->remote_key = READ_ONCE(msk->remote_key);
	subflow->local_key = READ_ONCE(msk->local_key);
	subflow->token = msk->token;
	mptcp_info2sockaddr(&local->addr, &addr, ssk->sk_family);

	addrlen = sizeof(struct sockaddr_in);
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	if (addr.ss_family == AF_INET6)
		addrlen = sizeof(struct sockaddr_in6);
#endif
	ssk->sk_bound_dev_if = local->ifindex;
	err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen);
	if (err) {
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNTXBINDERR);
		pr_debug("msk=%p local=%d remote=%d bind error: %d\n",
			 msk, local_id, remote_id, err);
		goto failed;
	}

	mptcp_crypto_key_sha(subflow->remote_key, &remote_token, NULL);
	pr_debug("msk=%p remote_token=%u local_id=%d remote_id=%d\n", msk,
		 remote_token, local_id, remote_id);
	subflow->remote_token = remote_token;
	WRITE_ONCE(subflow->remote_id, remote_id);
	subflow->request_join = 1;
	subflow->request_bkup = !!(local->flags & MPTCP_PM_ADDR_FLAG_BACKUP);
	subflow->subflow_id = msk->subflow_id++;
	mptcp_info2sockaddr(remote, &addr, ssk->sk_family);

	sock_hold(ssk);
	list_add_tail(&subflow->node, &msk->conn_list);
	err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK);
	if (err && err != -EINPROGRESS) {
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNTXCONNECTERR);
		pr_debug("msk=%p local=%d remote=%d connect error: %d\n",
			 msk, local_id, remote_id, err);
		goto failed_unlink;
	}

	MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNTX);

	/* discard the subflow socket */
	mptcp_sock_graft(ssk, sk->sk_socket);
	iput(SOCK_INODE(sf));
	WRITE_ONCE(msk->allow_infinite_fallback, false);
	mptcp_stop_tout_timer(sk);
	return 0;

failed_unlink:
	list_del(&subflow->node);
	sock_put(mptcp_subflow_tcp_sock(subflow));

failed:
	subflow->disposable = 1;
	sock_release(sf);

err_out:
	/* we account subflows before the creation, and these failures will not
	 * be caught by sk_state_change()
	 */
	mptcp_pm_close_subflow(msk);
	return err;
}
static void mptcp_attach_cgroup(struct sock *parent, struct sock *child)
{
#ifdef CONFIG_SOCK_CGROUP_DATA
	struct sock_cgroup_data *parent_skcd = &parent->sk_cgrp_data,
				*child_skcd = &child->sk_cgrp_data;

	/* only the additional subflows created by kworkers have to be modified */
	if (cgroup_id(sock_cgroup_ptr(parent_skcd)) !=
	    cgroup_id(sock_cgroup_ptr(child_skcd))) {
#ifdef CONFIG_MEMCG
		struct mem_cgroup *memcg = parent->sk_memcg;

		mem_cgroup_sk_free(child);
		if (memcg && css_tryget(&memcg->css))
			child->sk_memcg = memcg;
#endif /* CONFIG_MEMCG */

		cgroup_sk_free(child_skcd);
		*child_skcd = *parent_skcd;
		cgroup_sk_clone(child_skcd);
	}
#endif /* CONFIG_SOCK_CGROUP_DATA */
}
static void mptcp_subflow_ops_override(struct sock *ssk)
{
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	if (ssk->sk_prot == &tcpv6_prot)
		ssk->sk_prot = &tcpv6_prot_override;
	else
#endif
		ssk->sk_prot = &tcp_prot_override;
}

static void mptcp_subflow_ops_undo_override(struct sock *ssk)
{
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	if (ssk->sk_prot == &tcpv6_prot_override)
		ssk->sk_prot = &tcpv6_prot;
	else
#endif
		ssk->sk_prot = &tcp_prot;
}
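
/* allocate a kernel TCP socket bound to the mptcp ULP, to be used as an
 * MPTCP subflow of the given socket.
 */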
int mptcp_subflow_create_socket(struct sock *sk, unsigned short family,
				struct socket **new_sock)
{
	struct mptcp_subflow_context *subflow;
	struct net *net = sock_net(sk);
	struct socket *sf;
	int err;

	/* un-accepted server sockets can reach here - on bad configuration
	 * bail early to avoid greater trouble later
	 */
	if (unlikely(!sk->sk_socket))
		return -EINVAL;

	err = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP, &sf);
	if (err)
		return err;

	lock_sock_nested(sf->sk, SINGLE_DEPTH_NESTING);

	err = security_mptcp_add_subflow(sk, sf->sk);
	if (err)
		goto err_free;

	/* the newly created socket has to be in the same cgroup as its parent */
	mptcp_attach_cgroup(sk, sf->sk);

	/* kernel sockets do not by default acquire net ref, but TCP timer
	 * needs it.
	 * Update ns_tracker to current stack trace and refcounted tracker.
	 */
	__netns_tracker_free(net, &sf->sk->ns_tracker, false);
	sf->sk->sk_net_refcnt = 1;
	get_net_track(net, &sf->sk->ns_tracker, GFP_KERNEL);
	sock_inuse_add(net, 1);
	err = tcp_set_ulp(sf->sk, "mptcp");
	if (err)
		goto err_free;

	mptcp_sockopt_sync_locked(mptcp_sk(sk), sf->sk);
	release_sock(sf->sk);

	/* the newly created socket really belongs to the owning MPTCP
	 * socket, even if for additional subflows the allocation is performed
	 * by a kernel workqueue. Adjust inode references, so that the
	 * procfs/diag interfaces really show this one belonging to the correct
	 * owner
	 */
	SOCK_INODE(sf)->i_ino = SOCK_INODE(sk->sk_socket)->i_ino;
	SOCK_INODE(sf)->i_uid = SOCK_INODE(sk->sk_socket)->i_uid;
	SOCK_INODE(sf)->i_gid = SOCK_INODE(sk->sk_socket)->i_gid;

	subflow = mptcp_subflow_ctx(sf->sk);
	pr_debug("subflow=%p\n", subflow);

	*new_sock = sf;
	sock_hold(sk);
	subflow->conn = sk;
	mptcp_subflow_ops_override(sf->sk);

	return 0;

err_free:
	release_sock(sf->sk);
	sock_release(sf);
	return err;
}
static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk,
							 gfp_t priority)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct mptcp_subflow_context *ctx;

	ctx = kzalloc(sizeof(*ctx), priority);
	if (!ctx)
		return NULL;

	rcu_assign_pointer(icsk->icsk_ulp_data, ctx);
	INIT_LIST_HEAD(&ctx->node);
	INIT_LIST_HEAD(&ctx->delegated_node);

	pr_debug("subflow=%p\n", ctx);

	ctx->tcp_sock = sk;
	WRITE_ONCE(ctx->local_id, -1);

	return ctx;
}
static void __subflow_state_change(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_all(&wq->wait);
	rcu_read_unlock();
}
static bool subflow_is_done(const struct sock *sk)
{
	return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE;
}
static void subflow_state_change(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct sock *parent = subflow->conn;
	struct mptcp_sock *msk;

	__subflow_state_change(sk);

	msk = mptcp_sk(parent);
	if (subflow_simultaneous_connect(sk)) {
		mptcp_do_fallback(sk);
		pr_fallback(msk);
		subflow->conn_finished = 1;
		mptcp_propagate_state(parent, sk, subflow, NULL);
	}

	/* as recvmsg() does not acquire the subflow socket for ssk selection
	 * a fin packet carrying a DSS can be unnoticed if we don't trigger
	 * the data available machinery here.
	 */
	if (mptcp_subflow_data_available(sk))
		mptcp_data_ready(parent, sk);
	else if (unlikely(sk->sk_err))
		subflow_error_report(sk);

	subflow_sched_work_if_closed(mptcp_sk(parent), sk);

	/* when the fallback subflow closes the rx side, trigger a 'dummy'
	 * ingress data fin, so that the msk state will follow along
	 */
	if (__mptcp_check_fallback(msk) && subflow_is_done(sk) && msk->first == sk &&
	    mptcp_update_rcv_data_fin(msk, READ_ONCE(msk->ack_seq), true))
		mptcp_schedule_work(parent);
}
void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_ssk)
{
	struct request_sock_queue *queue = &inet_csk(listener_ssk)->icsk_accept_queue;
	struct request_sock *req, *head, *tail;
	struct mptcp_subflow_context *subflow;
	struct sock *sk, *ssk;

	/* Due to lock dependencies no relevant lock can be acquired under rskq_lock.
	 * Splice the req list, so that accept() can not reach the pending ssk after
	 * the listener socket is released below.
	 */
	spin_lock_bh(&queue->rskq_lock);
	head = queue->rskq_accept_head;
	tail = queue->rskq_accept_tail;
	queue->rskq_accept_head = NULL;
	queue->rskq_accept_tail = NULL;
	spin_unlock_bh(&queue->rskq_lock);

	if (!head)
		return;

	/* can't acquire the msk socket lock under the subflow one,
	 * or will cause ABBA deadlock
	 */
	release_sock(listener_ssk);

	for (req = head; req; req = req->dl_next) {
		ssk = req->sk;
		if (!sk_is_mptcp(ssk))
			continue;

		subflow = mptcp_subflow_ctx(ssk);
		if (!subflow || !subflow->conn)
			continue;

		sk = subflow->conn;
		sock_hold(sk);

		lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
		__mptcp_unaccepted_force_close(sk);
		release_sock(sk);

		/* lockdep will report a false positive ABBA deadlock
		 * between cancel_work_sync and the listener socket.
		 * The involved locks belong to different sockets WRT
		 * the existing AB chain.
		 * Using a per socket key is problematic as key
		 * deregistration requires process context and must be
		 * performed at socket disposal time, in atomic
		 * context.
		 * Just tell lockdep to consider the listener socket
		 * released here.
		 */
		mutex_release(&listener_sk->sk_lock.dep_map, _RET_IP_);
		mptcp_cancel_work(sk);
		mutex_acquire(&listener_sk->sk_lock.dep_map, 0, 0, _RET_IP_);

		sock_put(sk);
	}

	/* we are still under the listener msk socket lock */
	lock_sock_nested(listener_ssk, SINGLE_DEPTH_NESTING);

	/* restore the listener queue, to let the TCP code clean it up */
	spin_lock_bh(&queue->rskq_lock);
	WARN_ON_ONCE(queue->rskq_accept_head);
	queue->rskq_accept_head = head;
	queue->rskq_accept_tail = tail;
	spin_unlock_bh(&queue->rskq_lock);
}
static int subflow_ulp_init(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct mptcp_subflow_context *ctx;
	struct tcp_sock *tp = tcp_sk(sk);
	int err = 0;

	/* disallow attaching ULP to a socket unless it has been
	 * created with sock_create_kern()
	 */
	if (!sk->sk_kern_sock) {
		err = -EOPNOTSUPP;
		goto out;
	}

	ctx = subflow_create_ctx(sk, GFP_KERNEL);
	if (!ctx) {
		err = -ENOMEM;
		goto out;
	}

	pr_debug("subflow=%p, family=%d\n", ctx, sk->sk_family);

	tp->is_mptcp = 1;
	ctx->icsk_af_ops = icsk->icsk_af_ops;
	icsk->icsk_af_ops = subflow_default_af_ops(sk);
	ctx->tcp_state_change = sk->sk_state_change;
	ctx->tcp_error_report = sk->sk_error_report;

	WARN_ON_ONCE(sk->sk_data_ready != sock_def_readable);
	WARN_ON_ONCE(sk->sk_write_space != sk_stream_write_space);

	sk->sk_data_ready = subflow_data_ready;
	sk->sk_write_space = subflow_write_space;
	sk->sk_state_change = subflow_state_change;
	sk->sk_error_report = subflow_error_report;
out:
	return err;
}
static void subflow_ulp_release(struct sock *ssk)
{
	struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk);
	bool release = true;
	struct sock *sk;

	if (!ctx)
		return;

	sk = ctx->conn;
	if (sk) {
		/* if the msk has been orphaned, keep the ctx
		 * alive, will be freed by __mptcp_close_ssk(),
		 * when the subflow is still unaccepted
		 */
		release = ctx->disposable || list_empty(&ctx->node);

		/* inet_child_forget() does not call sk_state_change(),
		 * explicitly trigger the socket close machinery
		 */
		if (!release && !test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW,
						  &mptcp_sk(sk)->flags))
			mptcp_schedule_work(sk);
		sock_put(sk);
	}

	mptcp_subflow_ops_undo_override(ssk);
	if (release)
		kfree_rcu(ctx, rcu);
}
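
/* clone the listener ULP context into the child at third-ACK time, copying
 * the relevant MPC/MPJ handshake data from the request socket.
 */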
static void subflow_ulp_clone(const struct request_sock *req,
			      struct sock *newsk,
			      const gfp_t priority)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk);
	struct mptcp_subflow_context *new_ctx;

	if (!tcp_rsk(req)->is_mptcp ||
	    (!subflow_req->mp_capable && !subflow_req->mp_join)) {
		subflow_ulp_fallback(newsk, old_ctx);
		return;
	}

	new_ctx = subflow_create_ctx(newsk, priority);
	if (!new_ctx) {
		subflow_ulp_fallback(newsk, old_ctx);
		return;
	}

	new_ctx->conn_finished = 1;
	new_ctx->icsk_af_ops = old_ctx->icsk_af_ops;
	new_ctx->tcp_state_change = old_ctx->tcp_state_change;
	new_ctx->tcp_error_report = old_ctx->tcp_error_report;
	new_ctx->rel_write_seq = 1;

	if (subflow_req->mp_capable) {
		/* see comments in subflow_syn_recv_sock(), MPTCP connection
		 * is fully established only after we receive the remote key
		 */
		new_ctx->mp_capable = 1;
		new_ctx->local_key = subflow_req->local_key;
		new_ctx->token = subflow_req->token;
		new_ctx->ssn_offset = subflow_req->ssn_offset;
		new_ctx->idsn = subflow_req->idsn;

		/* this is the first subflow, id is always 0 */
		subflow_set_local_id(new_ctx, 0);
	} else if (subflow_req->mp_join) {
		new_ctx->ssn_offset = subflow_req->ssn_offset;
		new_ctx->mp_join = 1;
		WRITE_ONCE(new_ctx->fully_established, true);
		new_ctx->remote_key_valid = 1;
		new_ctx->backup = subflow_req->backup;
		new_ctx->request_bkup = subflow_req->request_bkup;
		WRITE_ONCE(new_ctx->remote_id, subflow_req->remote_id);
		new_ctx->token = subflow_req->token;
		new_ctx->thmac = subflow_req->thmac;

		/* the subflow req id is valid, fetched via subflow_check_req()
		 * and subflow_token_join_request()
		 */
		subflow_set_local_id(new_ctx, subflow_req->local_id);
	}
}
static void tcp_release_cb_override(struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	long status;

	/* process and clear all the pending actions, but leave the subflow into
	 * the napi queue. To respect locking, only the same CPU that originated
	 * the action can touch the list. mptcp_napi_poll will take care of it.
	 */
	status = set_mask_bits(&subflow->delegated_status, MPTCP_DELEGATE_ACTIONS_MASK, 0);
	if (status)
		mptcp_subflow_process_delegated(ssk, status);

	tcp_release_cb(ssk);
}
static int tcp_abort_override(struct sock *ssk, int err)
{
	/* closing a listener subflow requires a great deal of care.
	 * keep it simple and just prevent such operation
	 */
	if (inet_sk_state_load(ssk) == TCP_LISTEN)
		return -EINVAL;

	return tcp_abort(ssk, err);
}
static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = {
	.name		= "mptcp",
	.owner		= THIS_MODULE,
	.init		= subflow_ulp_init,
	.release	= subflow_ulp_release,
	.clone		= subflow_ulp_clone,
};
static int subflow_ops_init(struct request_sock_ops *subflow_ops)
{
	subflow_ops->obj_size = sizeof(struct mptcp_subflow_request_sock);

	subflow_ops->slab = kmem_cache_create(subflow_ops->slab_name,
					      subflow_ops->obj_size, 0,
					      SLAB_ACCOUNT |
					      SLAB_TYPESAFE_BY_RCU,
					      NULL);
	if (!subflow_ops->slab)
		return -ENOMEM;

	return 0;
}
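
/* register the MPTCP-specific request sock ops, af_ops and ULP hooks */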
void __init mptcp_subflow_init(void)
{
	mptcp_subflow_v4_request_sock_ops = tcp_request_sock_ops;
	mptcp_subflow_v4_request_sock_ops.slab_name = "request_sock_subflow_v4";
	mptcp_subflow_v4_request_sock_ops.destructor = subflow_v4_req_destructor;

	if (subflow_ops_init(&mptcp_subflow_v4_request_sock_ops) != 0)
		panic("MPTCP: failed to init subflow v4 request sock ops\n");

	subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
	subflow_request_sock_ipv4_ops.route_req = subflow_v4_route_req;
	subflow_request_sock_ipv4_ops.send_synack = subflow_v4_send_synack;

	subflow_specific = ipv4_specific;
	subflow_specific.conn_request = subflow_v4_conn_request;
	subflow_specific.syn_recv_sock = subflow_syn_recv_sock;
	subflow_specific.sk_rx_dst_set = subflow_finish_connect;
	subflow_specific.rebuild_header = subflow_rebuild_header;

	tcp_prot_override = tcp_prot;
	tcp_prot_override.release_cb = tcp_release_cb_override;
	tcp_prot_override.diag_destroy = tcp_abort_override;

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	/* In struct mptcp_subflow_request_sock, we assume the TCP request sock
	 * structures for v4 and v6 have the same size. It should not change in
	 * the future but better to make sure to be warned if it is no longer
	 * the case.
	 */
	BUILD_BUG_ON(sizeof(struct tcp_request_sock) != sizeof(struct tcp6_request_sock));

	mptcp_subflow_v6_request_sock_ops = tcp6_request_sock_ops;
	mptcp_subflow_v6_request_sock_ops.slab_name = "request_sock_subflow_v6";
	mptcp_subflow_v6_request_sock_ops.destructor = subflow_v6_req_destructor;

	if (subflow_ops_init(&mptcp_subflow_v6_request_sock_ops) != 0)
		panic("MPTCP: failed to init subflow v6 request sock ops\n");

	subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
	subflow_request_sock_ipv6_ops.route_req = subflow_v6_route_req;
	subflow_request_sock_ipv6_ops.send_synack = subflow_v6_send_synack;

	subflow_v6_specific = ipv6_specific;
	subflow_v6_specific.conn_request = subflow_v6_conn_request;
	subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock;
	subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect;
	subflow_v6_specific.rebuild_header = subflow_v6_rebuild_header;

	subflow_v6m_specific = subflow_v6_specific;
	subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit;
	subflow_v6m_specific.send_check = ipv4_specific.send_check;
	subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len;
	subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced;
	subflow_v6m_specific.rebuild_header = subflow_rebuild_header;

	tcpv6_prot_override = tcpv6_prot;
	tcpv6_prot_override.release_cb = tcp_release_cb_override;
	tcpv6_prot_override.diag_destroy = tcp_abort_override;
#endif

	mptcp_diag_subflow_init(&subflow_ulp_ops);

	if (tcp_register_ulp(&subflow_ulp_ops) != 0)
		panic("MPTCP: failed to register subflows to ULP\n");
}