// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
 *
 * Copyright (c) 2017 - 2019, Intel Corporation.
 */

#define pr_fmt(fmt) "MPTCP: " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <crypto/algapi.h>
#include <crypto/sha.h>
#include <net/sock.h>
#include <net/inet_common.h>
#include <net/inet_hashtables.h>
#include <net/protocol.h>
#include <net/tcp.h>
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
#include <net/ip6_route.h>
#endif
#include <net/mptcp.h>
#include "protocol.h"
#include "mib.h"
static void SUBFLOW_REQ_INC_STATS(struct request_sock *req,
				  enum linux_mptcp_mib_field field)
{
	MPTCP_INC_STATS(sock_net(req_to_sk(req)), field);
}
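
/* Allocate the per-subflow security material before (re)building the
 * headers of an outgoing SYN: a fresh token for an MP_CAPABLE connect,
 * or a local nonce and address id for an MP_JOIN request.
 */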
static int subflow_rebuild_header(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	int local_id, err = 0;

	if (subflow->request_mptcp && !subflow->token) {
		pr_debug("subflow=%p", sk);
		err = mptcp_token_new_connect(sk);
	} else if (subflow->request_join && !subflow->local_nonce) {
		struct mptcp_sock *msk = (struct mptcp_sock *)subflow->conn;

		pr_debug("subflow=%p", sk);

		do {
			get_random_bytes(&subflow->local_nonce, sizeof(u32));
		} while (!subflow->local_nonce);

		if (subflow->local_id)
			goto out;

		local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)sk);
		if (local_id < 0)
			return -EINVAL;

		subflow->local_id = local_id;
	}

out:
	if (err)
		return err;

	return subflow->icsk_af_ops->rebuild_header(sk);
}
static void subflow_req_destructor(struct request_sock *req)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);

	pr_debug("subflow_req=%p", subflow_req);

	if (subflow_req->mp_capable)
		mptcp_token_destroy_request(subflow_req->token);
	tcp_request_sock_ops.destructor(req);
}
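
/* HMAC the two 32-bit nonces with the given key pair; callers swap the
 * key and nonce order depending on the direction of the handshake.
 */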
static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
				  void *hmac)
{
	u8 msg[8];

	put_unaligned_be32(nonce1, &msg[0]);
	put_unaligned_be32(nonce2, &msg[4]);

	mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac);
}
/* validate received token and create truncated hmac and nonce for SYN-ACK */
static bool subflow_token_join_request(struct request_sock *req,
				       const struct sk_buff *skb)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	u8 hmac[SHA256_DIGEST_SIZE];
	struct mptcp_sock *msk;
	int local_id;

	msk = mptcp_token_get_sock(subflow_req->token);
	if (!msk) {
		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINNOTOKEN);
		return false;
	}

	local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)req);
	if (local_id < 0) {
		sock_put((struct sock *)msk);
		return false;
	}
	subflow_req->local_id = local_id;

	get_random_bytes(&subflow_req->local_nonce, sizeof(u32));

	subflow_generate_hmac(msk->local_key, msk->remote_key,
			      subflow_req->local_nonce,
			      subflow_req->remote_nonce, hmac);

	subflow_req->thmac = get_unaligned_be64(hmac);

	sock_put((struct sock *)msk);
	return true;
}
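
/* Parse the MPTCP options carried by an incoming SYN and initialize the
 * request socket for either MP_CAPABLE or MP_JOIN handling.
 */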
static void subflow_init_req(struct request_sock *req,
			     const struct sock *sk_listener,
			     struct sk_buff *skb)
{
	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct mptcp_options_received mp_opt;

	pr_debug("subflow_req=%p, listener=%p", subflow_req, listener);

	mptcp_get_options(skb, &mp_opt);

	subflow_req->mp_capable = 0;
	subflow_req->mp_join = 0;

#ifdef CONFIG_TCP_MD5SIG
	/* no MPTCP if MD5SIG is enabled on this socket or we may run out of
	 * TCP option space.
	 */
	if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info))
		return;
#endif

	if (mp_opt.mp_capable) {
		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE);

		if (mp_opt.mp_join)
			return;
	} else if (mp_opt.mp_join) {
		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX);
	}

	if (mp_opt.mp_capable && listener->request_mptcp) {
		int err;

		err = mptcp_token_new_request(req);
		if (err == 0)
			subflow_req->mp_capable = 1;

		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
	} else if (mp_opt.mp_join && listener->request_mptcp) {
		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
		subflow_req->mp_join = 1;
		subflow_req->backup = mp_opt.backup;
		subflow_req->remote_id = mp_opt.join_id;
		subflow_req->token = mp_opt.token;
		subflow_req->remote_nonce = mp_opt.nonce;
		pr_debug("token=%u, remote_nonce=%u", subflow_req->token,
			 subflow_req->remote_nonce);
		if (!subflow_token_join_request(req, skb)) {
			subflow_req->mp_join = 0;
			// @@ need to trigger RST
		}
	}
}
static void subflow_v4_init_req(struct request_sock *req,
				const struct sock *sk_listener,
				struct sk_buff *skb)
{
	tcp_rsk(req)->is_mptcp = 1;

	tcp_request_sock_ipv4_ops.init_req(req, sk_listener, skb);

	subflow_init_req(req, sk_listener, skb);
}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static void subflow_v6_init_req(struct request_sock *req,
				const struct sock *sk_listener,
				struct sk_buff *skb)
{
	tcp_rsk(req)->is_mptcp = 1;

	tcp_request_sock_ipv6_ops.init_req(req, sk_listener, skb);

	subflow_init_req(req, sk_listener, skb);
}
#endif
/* validate received truncated hmac and create hmac for third ACK */
static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow)
{
	u8 hmac[SHA256_DIGEST_SIZE];
	u64 thmac;

	subflow_generate_hmac(subflow->remote_key, subflow->local_key,
			      subflow->remote_nonce, subflow->local_nonce,
			      hmac);

	thmac = get_unaligned_be64(hmac);
	pr_debug("subflow=%p, token=%u, thmac=%llu, subflow->thmac=%llu\n",
		 subflow, subflow->token,
		 (unsigned long long)thmac,
		 (unsigned long long)subflow->thmac);

	return thmac == subflow->thmac;
}
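
/* Called when the SYN-ACK of an outgoing subflow is processed: complete
 * the MP_CAPABLE handshake, or validate the truncated HMAC and compute
 * the full HMAC for the third ACK of an MP_JOIN.
 */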
static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct mptcp_options_received mp_opt;
	struct sock *parent = subflow->conn;
	struct tcp_sock *tp = tcp_sk(sk);

	subflow->icsk_af_ops->sk_rx_dst_set(sk, skb);

	if (inet_sk_state_load(parent) == TCP_SYN_SENT) {
		inet_sk_state_store(parent, TCP_ESTABLISHED);
		parent->sk_state_change(parent);
	}

	/* be sure no special action on any packet other than syn-ack */
	if (subflow->conn_finished)
		return;

	subflow->conn_finished = 1;

	mptcp_get_options(skb, &mp_opt);
	if (subflow->request_mptcp && mp_opt.mp_capable) {
		subflow->mp_capable = 1;
		subflow->can_ack = 1;
		subflow->remote_key = mp_opt.sndr_key;
		pr_debug("subflow=%p, remote_key=%llu", subflow,
			 subflow->remote_key);
	} else if (subflow->request_join && mp_opt.mp_join) {
		subflow->mp_join = 1;
		subflow->thmac = mp_opt.thmac;
		subflow->remote_nonce = mp_opt.nonce;
		pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow,
			 subflow->thmac, subflow->remote_nonce);
	} else if (subflow->request_mptcp) {
		tp->is_mptcp = 0;
	}

	if (!tp->is_mptcp)
		return;

	if (subflow->mp_capable) {
		pr_debug("subflow=%p, remote_key=%llu", mptcp_subflow_ctx(sk),
			 subflow->remote_key);
		mptcp_finish_connect(sk);

		if (skb) {
			pr_debug("synack seq=%u", TCP_SKB_CB(skb)->seq);
			subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
		}
	} else if (subflow->mp_join) {
		u8 hmac[SHA256_DIGEST_SIZE];

		pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u",
			 subflow, subflow->thmac,
			 subflow->remote_nonce);
		if (!subflow_thmac_valid(subflow)) {
			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC);
			subflow->mp_join = 0;
			goto do_reset;
		}

		subflow_generate_hmac(subflow->local_key, subflow->remote_key,
				      subflow->local_nonce,
				      subflow->remote_nonce,
				      hmac);

		memcpy(subflow->hmac, hmac, MPTCPOPT_HMAC_LEN);

		if (skb)
			subflow->ssn_offset = TCP_SKB_CB(skb)->seq;

		if (!mptcp_finish_join(sk))
			goto do_reset;

		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX);
	} else {
do_reset:
		tcp_send_active_reset(sk, GFP_ATOMIC);
		tcp_done(sk);
	}
}
static struct request_sock_ops subflow_request_sock_ops;
static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops;
static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

	pr_debug("subflow=%p", subflow);

	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&subflow_request_sock_ops,
				&subflow_request_sock_ipv4_ops,
				sk, skb);
drop:
	tcp_listendrop(sk);
	return 0;
}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops;
static struct inet_connection_sock_af_ops subflow_v6_specific;
static struct inet_connection_sock_af_ops subflow_v6m_specific;

static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

	pr_debug("subflow=%p", subflow);

	if (skb->protocol == htons(ETH_P_IP))
		return subflow_v4_conn_request(sk, skb);

	if (!ipv6_unicast_destination(skb))
		goto drop;

	return tcp_conn_request(&subflow_request_sock_ops,
				&subflow_request_sock_ipv6_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0; /* don't send reset */
}
#endif
/* validate hmac received in third ACK */
static bool subflow_hmac_valid(const struct request_sock *req,
			       const struct mptcp_options_received *mp_opt)
{
	const struct mptcp_subflow_request_sock *subflow_req;
	u8 hmac[SHA256_DIGEST_SIZE];
	struct mptcp_sock *msk;
	bool ret;

	subflow_req = mptcp_subflow_rsk(req);
	msk = mptcp_token_get_sock(subflow_req->token);
	if (!msk)
		return false;

	subflow_generate_hmac(msk->remote_key, msk->local_key,
			      subflow_req->remote_nonce,
			      subflow_req->local_nonce, hmac);

	ret = true;
	if (crypto_memneq(hmac, mp_opt->hmac, MPTCPOPT_HMAC_LEN))
		ret = false;

	sock_put((struct sock *)msk);
	return ret;
}
static void mptcp_sock_destruct(struct sock *sk)
{
	/* if new mptcp socket isn't accepted, it is free'd
	 * from the tcp listener sockets request queue, linked
	 * from req->sk.  The tcp socket is released.
	 * This calls the ULP release function which will
	 * also remove the mptcp socket, via
	 * sock_put(ctx->conn).
	 *
	 * Problem is that the mptcp socket will not be in
	 * SYN_RECV state and doesn't have SOCK_DEAD flag.
	 * Both result in warnings from inet_sock_destruct.
	 */

	if (sk->sk_state == TCP_SYN_RECV) {
		sk->sk_state = TCP_CLOSE;
		WARN_ON_ONCE(sk->sk_socket);
		sock_orphan(sk);
	}

	mptcp_token_destroy(mptcp_sk(sk)->token);
	inet_sock_destruct(sk);
}
static void mptcp_force_close(struct sock *sk)
{
	inet_sk_state_store(sk, TCP_CLOSE);
	sk_common_release(sk);
}
static void subflow_ulp_fallback(struct sock *sk,
				 struct mptcp_subflow_context *old_ctx)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	mptcp_subflow_tcp_fallback(sk, old_ctx);
	icsk->icsk_ulp_ops = NULL;
	rcu_assign_pointer(icsk->icsk_ulp_data, NULL);
	tcp_sk(sk)->is_mptcp = 0;
}
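
/* Create the child socket for a passively opened subflow: clone a new
 * MPTCP master socket for MP_CAPABLE requests, or validate the third-ACK
 * HMAC and attach the subflow to the existing msk for MP_JOIN requests.
 */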
static struct sock *subflow_syn_recv_sock(const struct sock *sk,
					  struct sk_buff *skb,
					  struct request_sock *req,
					  struct dst_entry *dst,
					  struct request_sock *req_unhash,
					  bool *own_req)
{
	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk);
	struct mptcp_subflow_request_sock *subflow_req;
	struct mptcp_options_received mp_opt;
	bool fallback_is_fatal = false;
	struct sock *new_msk = NULL;
	bool fallback = false;
	struct sock *child;

	pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn);

	/* we need later a valid 'mp_capable' value even when options are not
	 * parsed
	 */
	mp_opt.mp_capable = 0;
	if (tcp_rsk(req)->is_mptcp == 0)
		goto create_child;

	/* if the sk is MP_CAPABLE, we try to fetch the client key */
	subflow_req = mptcp_subflow_rsk(req);
	if (subflow_req->mp_capable) {
		if (TCP_SKB_CB(skb)->seq != subflow_req->ssn_offset + 1) {
			/* here we can receive and accept an in-window,
			 * out-of-order pkt, which will not carry the MP_CAPABLE
			 * opt even on mptcp enabled paths
			 */
			goto create_msk;
		}

		mptcp_get_options(skb, &mp_opt);
		if (!mp_opt.mp_capable) {
			fallback = true;
			goto create_child;
		}

create_msk:
		new_msk = mptcp_sk_clone(listener->conn, &mp_opt, req);
		if (!new_msk)
			fallback = true;
	} else if (subflow_req->mp_join) {
		fallback_is_fatal = true;
		mptcp_get_options(skb, &mp_opt);
		if (!mp_opt.mp_join ||
		    !subflow_hmac_valid(req, &mp_opt)) {
			SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC);
			return NULL;
		}
	}

create_child:
	child = listener->icsk_af_ops->syn_recv_sock(sk, skb, req, dst,
						     req_unhash, own_req);

	if (child && *own_req) {
		struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child);

		/* we need to fallback on ctx allocation failure and on pre-reqs
		 * checking above. In the latter scenario we additionally need
		 * to reset the context to non MPTCP status.
		 */
		if (!ctx || fallback) {
			if (fallback_is_fatal)
				goto close_child;

			if (ctx) {
				subflow_ulp_fallback(child, ctx);
				kfree_rcu(ctx, rcu);
			}
			goto out;
		}

		if (ctx->mp_capable) {
			/* new mpc subflow takes ownership of the newly
			 * created mptcp socket
			 */
			new_msk->sk_destruct = mptcp_sock_destruct;
			mptcp_pm_new_connection(mptcp_sk(new_msk), 1);
			ctx->conn = new_msk;
			new_msk = NULL;

			/* with OoO packets we can reach here without ingress
			 * mpc option
			 */
			ctx->remote_key = mp_opt.sndr_key;
			ctx->fully_established = mp_opt.mp_capable;
			ctx->can_ack = mp_opt.mp_capable;
		} else if (ctx->mp_join) {
			struct mptcp_sock *owner;

			owner = mptcp_token_get_sock(ctx->token);
			if (!owner)
				goto close_child;

			ctx->conn = (struct sock *)owner;
			if (!mptcp_finish_join(child))
				goto close_child;

			SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX);
		}
	}

out:
	/* dispose of the left over mptcp master, if any */
	if (unlikely(new_msk))
		mptcp_force_close(new_msk);

	/* check for expected invariant - should never trigger, just help
	 * catching eariler subtle bugs
	 */
	WARN_ON_ONCE(child && *own_req && tcp_sk(child)->is_mptcp &&
		     (!mptcp_subflow_ctx(child) ||
		      !mptcp_subflow_ctx(child)->conn));
	return child;

close_child:
	tcp_send_active_reset(child, GFP_ATOMIC);
	inet_csk_prepare_forced_close(child);
	tcp_done(child);
	return NULL;
}
static struct inet_connection_sock_af_ops subflow_specific;
enum mapping_status {
	MAPPING_OK,
	MAPPING_INVALID,
	MAPPING_EMPTY,
	MAPPING_DATA_FIN
};
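
/* A DSS mapping may carry only the lower 32 bits of the data sequence
 * number; expand it to 64 bits using the previous mapping as reference.
 */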
static u64 expand_seq(u64 old_seq, u16 old_data_len, u64 seq)
{
	if ((u32)seq == (u32)old_seq)
		return old_seq;

	/* Assume map covers data not mapped yet. */
	return seq | ((old_seq + old_data_len + 1) & GENMASK_ULL(63, 32));
}
static void warn_bad_map(struct mptcp_subflow_context *subflow, u32 ssn)
{
	WARN_ONCE(1, "Bad mapping: ssn=%d map_seq=%d map_data_len=%d",
		  ssn, subflow->map_subflow_seq, subflow->map_data_len);
}
static bool skb_is_fully_mapped(struct sock *ssk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	unsigned int skb_consumed;

	skb_consumed = tcp_sk(ssk)->copied_seq - TCP_SKB_CB(skb)->seq;
	if (WARN_ON_ONCE(skb_consumed >= skb->len))
		return true;

	return skb->len - skb_consumed <= subflow->map_data_len -
					  mptcp_subflow_get_map_offset(subflow);
}
static bool validate_mapping(struct sock *ssk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	u32 ssn = tcp_sk(ssk)->copied_seq - subflow->ssn_offset;

	if (unlikely(before(ssn, subflow->map_subflow_seq))) {
		/* Mapping covers data later in the subflow stream,
		 * currently unsupported.
		 */
		warn_bad_map(subflow, ssn);
		return false;
	}
	if (unlikely(!before(ssn, subflow->map_subflow_seq +
				  subflow->map_data_len))) {
		/* Mapping does covers past subflow data, invalid */
		warn_bad_map(subflow, ssn + skb->len);
		return false;
	}
	return true;
}
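
/* Parse the DSS mapping (if any) attached to the skb at the head of the
 * subflow receive queue and install it as the current mapping.
 */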
static enum mapping_status get_mapping_status(struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	struct mptcp_ext *mpext;
	struct sk_buff *skb;
	u16 data_len;
	u64 map_seq;

	skb = skb_peek(&ssk->sk_receive_queue);
	if (!skb)
		return MAPPING_EMPTY;

	mpext = mptcp_get_ext(skb);
	if (!mpext || !mpext->use_map) {
		if (!subflow->map_valid && !skb->len) {
			/* the TCP stack deliver 0 len FIN pkt to the receive
			 * queue, that is the only 0len pkts ever expected here,
			 * and we can admit no mapping only for 0 len pkts
			 */
			if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
				WARN_ONCE(1, "0len seq %d:%d flags %x",
					  TCP_SKB_CB(skb)->seq,
					  TCP_SKB_CB(skb)->end_seq,
					  TCP_SKB_CB(skb)->tcp_flags);
			sk_eat_skb(ssk, skb);
			return MAPPING_EMPTY;
		}

		if (!subflow->map_valid)
			return MAPPING_INVALID;

		goto validate_seq;
	}

	pr_debug("seq=%llu is64=%d ssn=%u data_len=%u data_fin=%d",
		 mpext->data_seq, mpext->dsn64, mpext->subflow_seq,
		 mpext->data_len, mpext->data_fin);

	data_len = mpext->data_len;
	if (data_len == 0) {
		pr_err("Infinite mapping not handled");
		MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX);
		return MAPPING_INVALID;
	}

	if (mpext->data_fin == 1) {
		if (data_len == 1) {
			pr_debug("DATA_FIN with no payload");
			if (subflow->map_valid) {
				/* A DATA_FIN might arrive in a DSS
				 * option before the previous mapping
				 * has been fully consumed. Continue
				 * handling the existing mapping.
				 */
				skb_ext_del(skb, SKB_EXT_MPTCP);
				return MAPPING_OK;
			}
			return MAPPING_DATA_FIN;
		}

		/* Adjust for DATA_FIN using 1 byte of sequence space */
		data_len--;
	}

	if (!mpext->dsn64) {
		map_seq = expand_seq(subflow->map_seq, subflow->map_data_len,
				     mpext->data_seq);
		pr_debug("expanded seq=%llu", subflow->map_seq);
	} else {
		map_seq = mpext->data_seq;
	}

	if (subflow->map_valid) {
		/* Allow replacing only with an identical map */
		if (subflow->map_seq == map_seq &&
		    subflow->map_subflow_seq == mpext->subflow_seq &&
		    subflow->map_data_len == data_len) {
			skb_ext_del(skb, SKB_EXT_MPTCP);
			return MAPPING_OK;
		}

		/* If this skb data are fully covered by the current mapping,
		 * the new map would need caching, which is not supported
		 */
		if (skb_is_fully_mapped(ssk, skb)) {
			MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSNOMATCH);
			return MAPPING_INVALID;
		}

		/* will validate the next map after consuming the current one */
		return MAPPING_OK;
	}

	subflow->map_seq = map_seq;
	subflow->map_subflow_seq = mpext->subflow_seq;
	subflow->map_data_len = data_len;
	subflow->map_valid = 1;
	subflow->mpc_map = mpext->mpc_map;
	pr_debug("new map seq=%llu subflow_seq=%u data_len=%u",
		 subflow->map_seq, subflow->map_subflow_seq,
		 subflow->map_data_len);

validate_seq:
	/* we revalidate valid mapping on new skb, because we must ensure
	 * the current skb is completely covered by the available mapping
	 */
	if (!validate_mapping(ssk, skb))
		return MAPPING_INVALID;

	skb_ext_del(skb, SKB_EXT_MPTCP);
	return MAPPING_OK;
}
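
/* tcp_read_sock() actor that only accounts for the flushed bytes: used
 * below to discard data already covered by the msk-level ack_seq.
 */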
static int subflow_read_actor(read_descriptor_t *desc,
			      struct sk_buff *skb,
			      unsigned int offset, size_t len)
{
	size_t copy_len = min(desc->count, len);

	desc->count -= copy_len;

	pr_debug("flushed %zu bytes, %zu left", copy_len, desc->count);
	return copy_len;
}
static bool subflow_check_data_avail(struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	enum mapping_status status;
	struct mptcp_sock *msk;
	struct sk_buff *skb;

	pr_debug("msk=%p ssk=%p data_avail=%d skb=%p", subflow->conn, ssk,
		 subflow->data_avail, skb_peek(&ssk->sk_receive_queue));
	if (subflow->data_avail)
		return true;

	msk = mptcp_sk(subflow->conn);
	for (;;) {
		u32 map_remaining;
		size_t delta;
		u64 ack_seq;
		u64 old_ack;

		status = get_mapping_status(ssk);
		pr_debug("msk=%p ssk=%p status=%d", msk, ssk, status);
		if (status == MAPPING_INVALID) {
			ssk->sk_err = EBADMSG;
			goto fatal;
		}

		if (status != MAPPING_OK)
			return false;

		skb = skb_peek(&ssk->sk_receive_queue);
		if (WARN_ON_ONCE(!skb))
			return false;

		/* if msk lacks the remote key, this subflow must provide an
		 * MP_CAPABLE-based mapping
		 */
		if (unlikely(!READ_ONCE(msk->can_ack))) {
			if (!subflow->mpc_map) {
				ssk->sk_err = EBADMSG;
				goto fatal;
			}
			WRITE_ONCE(msk->remote_key, subflow->remote_key);
			WRITE_ONCE(msk->ack_seq, subflow->map_seq);
			WRITE_ONCE(msk->can_ack, true);
		}

		old_ack = READ_ONCE(msk->ack_seq);
		ack_seq = mptcp_subflow_get_mapped_dsn(subflow);
		pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack,
			 ack_seq);
		if (ack_seq == old_ack)
			break;

		/* only accept in-sequence mapping. Old values are spurious
		 * retransmission; we can hit "future" values on active backup
		 * subflow switch, we relay on retransmissions to get
		 * in-sequence data.
		 * Cuncurrent subflows support will require subflow data
		 * buffering
		 */
		map_remaining = subflow->map_data_len -
				mptcp_subflow_get_map_offset(subflow);
		if (before64(ack_seq, old_ack))
			delta = min_t(size_t, old_ack - ack_seq, map_remaining);
		else
			delta = min_t(size_t, ack_seq - old_ack, map_remaining);

		/* discard mapped data */
		pr_debug("discarding %zu bytes, current map len=%d", delta,
			 map_remaining);
		while (delta) {
			read_descriptor_t desc = {
				.count = delta,
			};
			int ret;

			ret = tcp_read_sock(ssk, &desc, subflow_read_actor);
			if (ret < 0)
				break;

			delta -= ret;
		}

		if (delta == map_remaining)
			subflow->map_valid = 0;
	}
	return true;

fatal:
	/* fatal protocol error, close the socket */
	/* This barrier is coupled with smp_rmb() in tcp_poll() */
	smp_wmb();
	ssk->sk_error_report(ssk);
	tcp_set_state(ssk, TCP_CLOSE);
	tcp_send_active_reset(ssk, GFP_ATOMIC);
	return false;
}
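
/* Report whether in-sequence, mapped data is ready to be consumed at the
 * MPTCP level; drops the current mapping once it is fully consumed.
 */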
bool mptcp_subflow_data_available(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct sk_buff *skb;

	/* check if current mapping is still valid */
	if (subflow->map_valid &&
	    mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) {
		subflow->map_valid = 0;
		subflow->data_avail = 0;

		pr_debug("Done with mapping: seq=%u data_len=%u",
			 subflow->map_subflow_seq,
			 subflow->map_data_len);
	}

	if (!subflow_check_data_avail(sk)) {
		subflow->data_avail = 0;
		return false;
	}

	skb = skb_peek(&sk->sk_receive_queue);
	subflow->data_avail = skb &&
		       before(tcp_sk(sk)->copied_seq, TCP_SKB_CB(skb)->end_seq);
	return subflow->data_avail;
}
static void subflow_data_ready(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct sock *parent = subflow->conn;

	if (!subflow->mp_capable && !subflow->mp_join) {
		subflow->tcp_data_ready(sk);

		parent->sk_data_ready(parent);
		return;
	}

	if (mptcp_subflow_data_available(sk))
		mptcp_data_ready(parent, sk);
}
static void subflow_write_space(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct sock *parent = subflow->conn;

	sk_stream_write_space(sk);
	if (sk_stream_is_writeable(sk)) {
		set_bit(MPTCP_SEND_SPACE, &mptcp_sk(parent)->flags);
		smp_mb__after_atomic();
		/* set SEND_SPACE before sk_stream_write_space clears NOSPACE */
		sk_stream_write_space(parent);
	}
}
static struct inet_connection_sock_af_ops *
subflow_default_af_ops(struct sock *sk)
{
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	if (sk->sk_family == AF_INET6)
		return &subflow_v6_specific;
#endif
	return &subflow_specific;
}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
void mptcpv6_handle_mapped(struct sock *sk, bool mapped)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_connection_sock_af_ops *target;

	target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk);

	pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d",
		 subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped);

	if (likely(icsk->icsk_af_ops == target))
		return;

	subflow->icsk_af_ops = icsk->icsk_af_ops;
	icsk->icsk_af_ops = target;
}
#endif
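
/* Convert an mptcp_addr_info into a sockaddr suitable for kernel_bind()
 * and kernel_connect().
 */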
static void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
				struct sockaddr_storage *addr)
{
	memset(addr, 0, sizeof(*addr));
	addr->ss_family = info->family;
	if (addr->ss_family == AF_INET) {
		struct sockaddr_in *in_addr = (struct sockaddr_in *)addr;

		in_addr->sin_addr = info->addr;
		in_addr->sin_port = info->port;
	}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	else if (addr->ss_family == AF_INET6) {
		struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)addr;

		in6_addr->sin6_addr = info->addr6;
		in6_addr->sin6_port = info->port;
	}
#endif
}
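
/* Create, bind and connect an additional subflow towards the given remote
 * address, and queue it on the msk join list.
 */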
int __mptcp_subflow_connect(struct sock *sk, int ifindex,
			    const struct mptcp_addr_info *loc,
			    const struct mptcp_addr_info *remote)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct mptcp_subflow_context *subflow;
	struct sockaddr_storage addr;
	struct socket *sf;
	u32 remote_token;
	int addrlen;
	int err;

	if (sk->sk_state != TCP_ESTABLISHED)
		return -ENOTCONN;

	err = mptcp_subflow_create_socket(sk, &sf);
	if (err)
		return err;

	subflow = mptcp_subflow_ctx(sf->sk);
	subflow->remote_key = msk->remote_key;
	subflow->local_key = msk->local_key;
	subflow->token = msk->token;
	mptcp_info2sockaddr(loc, &addr);

	addrlen = sizeof(struct sockaddr_in);
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	if (loc->family == AF_INET6)
		addrlen = sizeof(struct sockaddr_in6);
#endif
	sf->sk->sk_bound_dev_if = ifindex;
	err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen);
	if (err)
		goto failed;

	mptcp_crypto_key_sha(subflow->remote_key, &remote_token, NULL);
	pr_debug("msk=%p remote_token=%u", msk, remote_token);
	subflow->remote_token = remote_token;
	subflow->local_id = loc->id;
	subflow->request_join = 1;
	subflow->request_bkup = 1;
	mptcp_info2sockaddr(remote, &addr);

	err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK);
	if (err && err != -EINPROGRESS)
		goto failed;

	spin_lock_bh(&msk->join_list_lock);
	list_add_tail(&subflow->node, &msk->join_list);
	spin_unlock_bh(&msk->join_list_lock);

	return err;

failed:
	sock_release(sf);
	return err;
}
int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
{
	struct mptcp_subflow_context *subflow;
	struct net *net = sock_net(sk);
	struct socket *sf;
	int err;

	err = sock_create_kern(net, sk->sk_family, SOCK_STREAM, IPPROTO_TCP,
			       &sf);
	if (err)
		return err;

	lock_sock(sf->sk);

	/* kernel sockets do not by default acquire net ref, but TCP timer
	 * needs it.
	 */
	sf->sk->sk_net_refcnt = 1;
	get_net(net);
#ifdef CONFIG_PROC_FS
	this_cpu_add(*net->core.sock_inuse, 1);
#endif
	err = tcp_set_ulp(sf->sk, "mptcp");
	release_sock(sf->sk);

	if (err)
		return err;

	/* the newly created socket really belongs to the owning MPTCP master
	 * socket, even if for additional subflows the allocation is performed
	 * by a kernel workqueue. Adjust inode references, so that the
	 * procfs/diag interaces really show this one belonging to the correct
	 * owner
	 */
	SOCK_INODE(sf)->i_ino = SOCK_INODE(sk->sk_socket)->i_ino;
	SOCK_INODE(sf)->i_uid = SOCK_INODE(sk->sk_socket)->i_uid;
	SOCK_INODE(sf)->i_gid = SOCK_INODE(sk->sk_socket)->i_gid;

	subflow = mptcp_subflow_ctx(sf->sk);
	pr_debug("subflow=%p", subflow);

	*new_sock = sf;
	sock_hold(sk);
	subflow->conn = sk;

	return 0;
}
static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk,
							gfp_t priority)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct mptcp_subflow_context *ctx;

	ctx = kzalloc(sizeof(*ctx), priority);
	if (!ctx)
		return NULL;

	rcu_assign_pointer(icsk->icsk_ulp_data, ctx);
	INIT_LIST_HEAD(&ctx->node);

	pr_debug("subflow=%p", ctx);

	ctx->tcp_sock = sk;

	return ctx;
}
static void __subflow_state_change(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_all(&wq->wait);
	rcu_read_unlock();
}
static bool subflow_is_done(const struct sock *sk)
{
	return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE;
}
static void subflow_state_change(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct sock *parent = subflow->conn;

	__subflow_state_change(sk);

	/* as recvmsg() does not acquire the subflow socket for ssk selection
	 * a fin packet carrying a DSS can be unnoticed if we don't trigger
	 * the data available machinery here.
	 */
	if (subflow->mp_capable && mptcp_subflow_data_available(sk))
		mptcp_data_ready(parent, sk);

	if (!(parent->sk_shutdown & RCV_SHUTDOWN) &&
	    !subflow->rx_eof && subflow_is_done(sk)) {
		subflow->rx_eof = 1;
		mptcp_subflow_eof(parent);
	}
}
static int subflow_ulp_init(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct mptcp_subflow_context *ctx;
	struct tcp_sock *tp = tcp_sk(sk);
	int err = 0;

	/* disallow attaching ULP to a socket unless it has been
	 * created with sock_create_kern()
	 */
	if (!sk->sk_kern_sock) {
		err = -EOPNOTSUPP;
		goto out;
	}

	ctx = subflow_create_ctx(sk, GFP_KERNEL);
	if (!ctx) {
		err = -ENOMEM;
		goto out;
	}

	pr_debug("subflow=%p, family=%d", ctx, sk->sk_family);

	tp->is_mptcp = 1;
	ctx->icsk_af_ops = icsk->icsk_af_ops;
	icsk->icsk_af_ops = subflow_default_af_ops(sk);
	ctx->tcp_data_ready = sk->sk_data_ready;
	ctx->tcp_state_change = sk->sk_state_change;
	ctx->tcp_write_space = sk->sk_write_space;
	sk->sk_data_ready = subflow_data_ready;
	sk->sk_write_space = subflow_write_space;
	sk->sk_state_change = subflow_state_change;
out:
	return err;
}
static void subflow_ulp_release(struct sock *sk)
{
	struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(sk);

	if (!ctx)
		return;

	if (ctx->conn)
		sock_put(ctx->conn);

	kfree_rcu(ctx, rcu);
}
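
/* Copy the relevant request socket state into the context of the freshly
 * cloned child subflow.
 */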
static void subflow_ulp_clone(const struct request_sock *req,
			      struct sock *newsk,
			      const gfp_t priority)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk);
	struct mptcp_subflow_context *new_ctx;

	if (!tcp_rsk(req)->is_mptcp ||
	    (!subflow_req->mp_capable && !subflow_req->mp_join)) {
		subflow_ulp_fallback(newsk, old_ctx);
		return;
	}

	new_ctx = subflow_create_ctx(newsk, priority);
	if (!new_ctx) {
		subflow_ulp_fallback(newsk, old_ctx);
		return;
	}

	new_ctx->conn_finished = 1;
	new_ctx->icsk_af_ops = old_ctx->icsk_af_ops;
	new_ctx->tcp_data_ready = old_ctx->tcp_data_ready;
	new_ctx->tcp_state_change = old_ctx->tcp_state_change;
	new_ctx->tcp_write_space = old_ctx->tcp_write_space;
	new_ctx->rel_write_seq = 1;
	new_ctx->tcp_sock = newsk;

	if (subflow_req->mp_capable) {
		/* see comments in subflow_syn_recv_sock(), MPTCP connection
		 * is fully established only after we receive the remote key
		 */
		new_ctx->mp_capable = 1;
		new_ctx->local_key = subflow_req->local_key;
		new_ctx->token = subflow_req->token;
		new_ctx->ssn_offset = subflow_req->ssn_offset;
		new_ctx->idsn = subflow_req->idsn;
	} else if (subflow_req->mp_join) {
		new_ctx->ssn_offset = subflow_req->ssn_offset;
		new_ctx->mp_join = 1;
		new_ctx->fully_established = 1;
		new_ctx->backup = subflow_req->backup;
		new_ctx->local_id = subflow_req->local_id;
		new_ctx->token = subflow_req->token;
		new_ctx->thmac = subflow_req->thmac;
	}
}
static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = {
	.name		= "mptcp",
	.owner		= THIS_MODULE,
	.init		= subflow_ulp_init,
	.release	= subflow_ulp_release,
	.clone		= subflow_ulp_clone,
};
static int subflow_ops_init(struct request_sock_ops *subflow_ops)
{
	subflow_ops->obj_size = sizeof(struct mptcp_subflow_request_sock);
	subflow_ops->slab_name = "request_sock_subflow";

	subflow_ops->slab = kmem_cache_create(subflow_ops->slab_name,
					      subflow_ops->obj_size, 0,
					      SLAB_ACCOUNT |
					      SLAB_TYPESAFE_BY_RCU,
					      NULL);
	if (!subflow_ops->slab)
		return -ENOMEM;

	subflow_ops->destructor = subflow_req_destructor;

	return 0;
}
void mptcp_subflow_init(void)
{
	subflow_request_sock_ops = tcp_request_sock_ops;
	if (subflow_ops_init(&subflow_request_sock_ops) != 0)
		panic("MPTCP: failed to init subflow request sock ops\n");

	subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
	subflow_request_sock_ipv4_ops.init_req = subflow_v4_init_req;

	subflow_specific = ipv4_specific;
	subflow_specific.conn_request = subflow_v4_conn_request;
	subflow_specific.syn_recv_sock = subflow_syn_recv_sock;
	subflow_specific.sk_rx_dst_set = subflow_finish_connect;
	subflow_specific.rebuild_header = subflow_rebuild_header;

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
	subflow_request_sock_ipv6_ops.init_req = subflow_v6_init_req;

	subflow_v6_specific = ipv6_specific;
	subflow_v6_specific.conn_request = subflow_v6_conn_request;
	subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock;
	subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect;
	subflow_v6_specific.rebuild_header = subflow_rebuild_header;

	subflow_v6m_specific = subflow_v6_specific;
	subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit;
	subflow_v6m_specific.send_check = ipv4_specific.send_check;
	subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len;
	subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced;
	subflow_v6m_specific.net_frag_header_len = 0;
#endif

	mptcp_diag_subflow_init(&subflow_ulp_ops);

	if (tcp_register_ulp(&subflow_ulp_ops) != 0)
		panic("MPTCP: failed to register subflows to ULP\n");
}