// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
 *
 * Copyright (c) 2017 - 2019, Intel Corporation.
 */

#define pr_fmt(fmt) "MPTCP: " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/sched/signal.h>
#include <linux/atomic.h>
#include <net/inet_common.h>
#include <net/inet_hashtables.h>
#include <net/protocol.h>
#include <net/tcp.h>
#include <net/tcp_states.h>
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
#include <net/transp_v6.h>
#endif
#include <net/mptcp.h>
#include "protocol.h"
#include "mib.h"

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
struct mptcp6_sock {
	struct mptcp_sock msk;
	struct ipv6_pinfo np;
};
#endif

struct mptcp_skb_cb {
	u64 map_seq;
	u64 end_seq;
	u32 offset;
};

#define MPTCP_SKB_CB(__skb)	((struct mptcp_skb_cb *)&((__skb)->cb[0]))

static struct percpu_counter mptcp_sockets_allocated;

static void __mptcp_destroy_sock(struct sock *sk);
static void __mptcp_check_send_data_fin(struct sock *sk);

/* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not
 * completed yet or has failed, return the subflow socket.
 * Otherwise return NULL.
 */
static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk)
{
	if (!msk->subflow || READ_ONCE(msk->can_ack))
		return NULL;

	return msk->subflow;
}

/* Returns end sequence number of the receiver's advertised window */
static u64 mptcp_wnd_end(const struct mptcp_sock *msk)
{
	return READ_ONCE(msk->wnd_end);
}

static bool mptcp_is_tcpsk(struct sock *sk)
{
	struct socket *sock = sk->sk_socket;

	if (unlikely(sk->sk_prot == &tcp_prot)) {
		/* we are being invoked after mptcp_accept() has
		 * accepted a non-mp-capable flow: sk is a tcp_sk,
		 * not an mptcp one.
		 *
		 * Hand the socket over to tcp so all further socket ops
		 * bypass mptcp.
		 */
		sock->ops = &inet_stream_ops;
		return true;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	} else if (unlikely(sk->sk_prot == &tcpv6_prot)) {
		sock->ops = &inet6_stream_ops;
		return true;
#endif
	}

	return false;
}

static struct sock *__mptcp_tcp_fallback(struct mptcp_sock *msk)
{
	sock_owned_by_me((const struct sock *)msk);

	if (likely(!__mptcp_check_fallback(msk)))
		return NULL;

	return msk->first;
}
static int __mptcp_socket_create(struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow;
	struct sock *sk = (struct sock *)msk;
	struct socket *ssock;
	int err;

	err = mptcp_subflow_create_socket(sk, &ssock);
	if (err)
		return err;

	msk->first = ssock->sk;
	msk->subflow = ssock;
	subflow = mptcp_subflow_ctx(ssock->sk);
	list_add(&subflow->node, &msk->conn_list);
	sock_hold(ssock->sk);
	subflow->request_mptcp = 1;

	/* accept() will wait on first subflow sk_wq, and we always wake it up
	 * via msk->sk_socket, closing the related subflow wait queue
	 */
	RCU_INIT_POINTER(msk->first->sk_wq, &sk->sk_socket->wq);

	return 0;
}

static void mptcp_drop(struct sock *sk, struct sk_buff *skb)
{
	sk_drops_add(sk, skb);
	__kfree_skb(skb);
}
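
/* Coalescing helpers: try to append "from" to the tail skb already sitting in
 * the msk receive (or out-of-order) queue, so the MPTCP-level queues keep
 * fewer, larger skbs and charge less receive memory, mirroring what TCP does
 * for its own receive queue.
 */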
static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
			       struct sk_buff *from)
{
	bool fragstolen;
	int delta;

	if (MPTCP_SKB_CB(from)->offset ||
	    !skb_try_coalesce(to, from, &fragstolen, &delta))
		return false;

	pr_debug("coalesced seq %llx into %llx new len %d new end seq %llx",
		 MPTCP_SKB_CB(from)->map_seq, MPTCP_SKB_CB(to)->map_seq,
		 to->len, MPTCP_SKB_CB(from)->end_seq);
	MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq;
	kfree_skb_partial(from, fragstolen);
	atomic_add(delta, &sk->sk_rmem_alloc);
	sk_mem_charge(sk, delta);

	return true;
}

static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to,
				   struct sk_buff *from)
{
	if (MPTCP_SKB_CB(from)->map_seq != MPTCP_SKB_CB(to)->end_seq)
		return false;

	return mptcp_try_coalesce((struct sock *)msk, to, from);
}

/* "inspired" by tcp_data_queue_ofo(), main differences:
 * - use mptcp seqs
 * - don't cope with sacks
 */
static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb)
{
	struct sock *sk = (struct sock *)msk;
	struct rb_node **p, *parent;
	u64 seq, end_seq, max_seq;
	struct sk_buff *skb1;

	seq = MPTCP_SKB_CB(skb)->map_seq;
	end_seq = MPTCP_SKB_CB(skb)->end_seq;
	max_seq = READ_ONCE(msk->rcv_wnd_sent);

	pr_debug("msk=%p seq=%llx limit=%llx empty=%d", msk, seq, max_seq,
		 RB_EMPTY_ROOT(&msk->out_of_order_queue));
	if (after64(end_seq, max_seq)) {
		/* out of window */
		mptcp_drop(sk, skb);
		pr_debug("oow by %lld, rcv_wnd_sent %llu\n",
			 (unsigned long long)end_seq - (unsigned long long)max_seq,
			 (unsigned long long)msk->rcv_wnd_sent);
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_NODSSWINDOW);
		return;
	}

	p = &msk->out_of_order_queue.rb_node;
	MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUE);
	if (RB_EMPTY_ROOT(&msk->out_of_order_queue)) {
		rb_link_node(&skb->rbnode, NULL, p);
		rb_insert_color(&skb->rbnode, &msk->out_of_order_queue);
		msk->ooo_last_skb = skb;
		goto end;
	}

	/* with 2 subflows, adding at end of ooo queue is quite likely
	 * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
	 */
	if (mptcp_ooo_try_coalesce(msk, msk->ooo_last_skb, skb)) {
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOMERGE);
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUETAIL);
		return;
	}

	/* Can avoid an rbtree lookup if we are adding skb after ooo_last_skb */
	if (!before64(seq, MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq)) {
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUETAIL);
		parent = &msk->ooo_last_skb->rbnode;
		p = &parent->rb_right;
		goto insert;
	}

	/* Find place to insert this segment. Handle overlaps on the way. */
	parent = NULL;
	while (*p) {
		parent = *p;
		skb1 = rb_to_skb(parent);
		if (before64(seq, MPTCP_SKB_CB(skb1)->map_seq)) {
			p = &parent->rb_left;
			continue;
		}
		if (before64(seq, MPTCP_SKB_CB(skb1)->end_seq)) {
			if (!after64(end_seq, MPTCP_SKB_CB(skb1)->end_seq)) {
				/* All the bits are present. Drop. */
				mptcp_drop(sk, skb);
				MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
				return;
			}
			if (after64(seq, MPTCP_SKB_CB(skb1)->map_seq)) {
				/* partial overlap at the start of skb1,
				 * continue traversing
				 */
			} else {
				/* skb's seq == skb1's seq and skb covers skb1.
				 * Replace skb1 with skb.
				 */
				rb_replace_node(&skb1->rbnode, &skb->rbnode,
						&msk->out_of_order_queue);
				mptcp_drop(sk, skb1);
				MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
				goto merge_right;
			}
		} else if (mptcp_ooo_try_coalesce(msk, skb1, skb)) {
			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOMERGE);
			return;
		}
		p = &parent->rb_right;
	}

insert:
	/* Insert segment into RB tree. */
	rb_link_node(&skb->rbnode, parent, p);
	rb_insert_color(&skb->rbnode, &msk->out_of_order_queue);

merge_right:
	/* Remove other segments covered by skb. */
	while ((skb1 = skb_rb_next(skb)) != NULL) {
		if (before64(end_seq, MPTCP_SKB_CB(skb1)->end_seq))
			break;
		rb_erase(&skb1->rbnode, &msk->out_of_order_queue);
		mptcp_drop(sk, skb1);
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
	}
	/* If there is no skb after us, we are the last_skb ! */
	if (!skb1)
		msk->ooo_last_skb = skb;

end:
	skb_set_owner_r(skb, sk);
}
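
/* Take one skb off the subflow receive queue and attach it to the msk: charge
 * it to the msk receive memory, stamp the MPTCP-level mapping in its control
 * block and either append it in order, park it in the out-of-order rbtree or
 * drop it as duplicate data.
 */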
static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
			     struct sk_buff *skb, unsigned int offset,
			     size_t copy_len)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	struct sock *sk = (struct sock *)msk;
	struct sk_buff *tail;

	__skb_unlink(skb, &ssk->sk_receive_queue);

	/* try to fetch required memory from subflow */
	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
		if (ssk->sk_forward_alloc < skb->truesize)
			goto drop;
		__sk_mem_reclaim(ssk, skb->truesize);
		if (!sk_rmem_schedule(sk, skb, skb->truesize))
			goto drop;
	}

	/* the skb map_seq accounts for the skb offset:
	 * mptcp_subflow_get_mapped_dsn() is based on the current tp->copied_seq
	 */
	MPTCP_SKB_CB(skb)->map_seq = mptcp_subflow_get_mapped_dsn(subflow);
	MPTCP_SKB_CB(skb)->end_seq = MPTCP_SKB_CB(skb)->map_seq + copy_len;
	MPTCP_SKB_CB(skb)->offset = offset;

	if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) {
		/* in sequence */
		WRITE_ONCE(msk->ack_seq, msk->ack_seq + copy_len);
		tail = skb_peek_tail(&sk->sk_receive_queue);
		if (tail && mptcp_try_coalesce(sk, tail, skb))
			return true;

		skb_set_owner_r(skb, sk);
		__skb_queue_tail(&sk->sk_receive_queue, skb);
		return true;
	} else if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) {
		mptcp_data_queue_ofo(msk, skb);
		return false;
	}

	/* old data, keep it simple and drop the whole pkt, sender
	 * will retransmit as needed, if needed.
	 */
	MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
drop:
	mptcp_drop(sk, skb);
	return false;
}

static void mptcp_stop_timer(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
	mptcp_sk(sk)->timer_ival = 0;
}

static void mptcp_close_wake_up(struct sock *sk)
{
	if (sock_flag(sk, SOCK_DEAD))
		return;

	sk->sk_state_change(sk);
	if (sk->sk_shutdown == SHUTDOWN_MASK ||
	    sk->sk_state == TCP_CLOSE)
		sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
	else
		sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
}

static bool mptcp_pending_data_fin_ack(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	return !__mptcp_check_fallback(msk) &&
	       ((1 << sk->sk_state) &
		(TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK)) &&
	       msk->write_seq == READ_ONCE(msk->snd_una);
}

static void mptcp_check_data_fin_ack(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	/* Look for an acknowledged DATA_FIN */
	if (mptcp_pending_data_fin_ack(sk)) {
		mptcp_stop_timer(sk);

		WRITE_ONCE(msk->snd_data_fin_enable, 0);

		switch (sk->sk_state) {
		case TCP_FIN_WAIT1:
			inet_sk_state_store(sk, TCP_FIN_WAIT2);
			break;
		case TCP_CLOSING:
		case TCP_LAST_ACK:
			inet_sk_state_store(sk, TCP_CLOSE);
			break;
		}

		mptcp_close_wake_up(sk);
	}
}

static bool mptcp_pending_data_fin(struct sock *sk, u64 *seq)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	if (READ_ONCE(msk->rcv_data_fin) &&
	    ((1 << sk->sk_state) &
	     (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2))) {
		u64 rcv_data_fin_seq = READ_ONCE(msk->rcv_data_fin_seq);

		if (msk->ack_seq == rcv_data_fin_seq) {
			if (seq)
				*seq = rcv_data_fin_seq;

			return true;
		}
	}

	return false;
}
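
/* Refresh the MPTCP-level retransmit interval: prefer the time left on the
 * given subflow's pending ICSK timer, fall back to the previous value and
 * never go below TCP_RTO_MIN.
 */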
static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk)
{
	long tout = ssk && inet_csk(ssk)->icsk_pending ?
				      inet_csk(ssk)->icsk_timeout - jiffies : 0;

	if (tout <= 0)
		tout = mptcp_sk(sk)->timer_ival;
	mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN;
}

static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
{
	struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

	/* can't send if JOIN hasn't completed yet (i.e. is usable for mptcp) */
	if (subflow->request_join && !subflow->fully_established)
		return false;

	/* only send if our side has not closed yet */
	return ((1 << ssk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT));
}

static bool tcp_can_send_ack(const struct sock *ssk)
{
	return !((1 << inet_sk_state_load(ssk)) &
	       (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_TIME_WAIT | TCPF_CLOSE));
}

static void mptcp_send_ack(struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow;

	mptcp_for_each_subflow(msk, subflow) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

		lock_sock(ssk);
		if (tcp_can_send_ack(ssk))
			tcp_send_ack(ssk);
		release_sock(ssk);
	}
}

static bool mptcp_subflow_cleanup_rbuf(struct sock *ssk)
{
	bool slow, ret;

	slow = lock_sock_fast(ssk);
	ret = tcp_can_send_ack(ssk);
	if (ret)
		tcp_cleanup_rbuf(ssk, 1);
	unlock_sock_fast(ssk, slow);
	return ret;
}

static void mptcp_cleanup_rbuf(struct mptcp_sock *msk)
{
	struct sock *ack_hint = READ_ONCE(msk->ack_hint);
	struct mptcp_subflow_context *subflow;

	/* if the hinted ssk is still active, try to use it */
	if (likely(ack_hint)) {
		mptcp_for_each_subflow(msk, subflow) {
			struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

			if (ack_hint == ssk && mptcp_subflow_cleanup_rbuf(ssk))
				return;
		}
	}

	/* otherwise pick the first active subflow */
	mptcp_for_each_subflow(msk, subflow)
		if (mptcp_subflow_cleanup_rbuf(mptcp_subflow_tcp_sock(subflow)))
			return;
}

static bool mptcp_check_data_fin(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	u64 rcv_data_fin_seq;
	bool ret = false;

	if (__mptcp_check_fallback(msk) || !msk->first)
		return ret;

	/* Need to ack a DATA_FIN received from a peer while this side
	 * of the connection is in ESTABLISHED, FIN_WAIT1, or FIN_WAIT2.
	 * msk->rcv_data_fin was set when parsing the incoming options
	 * at the subflow level and the msk lock was not held, so this
	 * is the first opportunity to act on the DATA_FIN and change
	 * the msk state.
	 *
	 * If we are caught up to the sequence number of the incoming
	 * DATA_FIN, send the DATA_ACK now and do state transition. If
	 * not caught up, do nothing and let the recv code send DATA_ACK
	 * when catching up.
	 */
	if (mptcp_pending_data_fin(sk, &rcv_data_fin_seq)) {
		WRITE_ONCE(msk->ack_seq, msk->ack_seq + 1);
		WRITE_ONCE(msk->rcv_data_fin, 0);

		sk->sk_shutdown |= RCV_SHUTDOWN;
		smp_mb__before_atomic(); /* SHUTDOWN must be visible first */
		set_bit(MPTCP_DATA_READY, &msk->flags);

		switch (sk->sk_state) {
		case TCP_ESTABLISHED:
			inet_sk_state_store(sk, TCP_CLOSE_WAIT);
			break;
		case TCP_FIN_WAIT1:
			inet_sk_state_store(sk, TCP_CLOSING);
			break;
		case TCP_FIN_WAIT2:
			inet_sk_state_store(sk, TCP_CLOSE);
			break;
		default:
			/* Other states not expected */
			WARN_ON_ONCE(1);
			break;
		}

		ret = true;
		mptcp_set_timeout(sk, NULL);
		mptcp_send_ack(msk);
		mptcp_close_wake_up(sk);
	}
	return ret;
}
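
/* Drain as much in-order data as possible from a single subflow receive queue
 * into the msk, accounting the bytes moved via *bytes; returns true when the
 * caller should stop draining this subflow for now.
 */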
static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
					   struct sock *ssk,
					   unsigned int *bytes)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	struct sock *sk = (struct sock *)msk;
	unsigned int moved = 0;
	bool more_data_avail;
	struct tcp_sock *tp;
	bool done = false;
	int sk_rbuf;

	sk_rbuf = READ_ONCE(sk->sk_rcvbuf);

	if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
		int ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf);

		if (unlikely(ssk_rbuf > sk_rbuf)) {
			WRITE_ONCE(sk->sk_rcvbuf, ssk_rbuf);
			sk_rbuf = ssk_rbuf;
		}
	}

	pr_debug("msk=%p ssk=%p", msk, ssk);
	tp = tcp_sk(ssk);
	do {
		u32 map_remaining, offset;
		u32 seq = tp->copied_seq;
		struct sk_buff *skb;
		bool fin;

		/* try to move as much data as available */
		map_remaining = subflow->map_data_len -
				mptcp_subflow_get_map_offset(subflow);

		skb = skb_peek(&ssk->sk_receive_queue);
		if (!skb) {
			/* if no data is found, a racing workqueue/recvmsg
			 * already processed the new data, stop here or we
			 * can enter an infinite loop
			 */
			if (!moved)
				done = true;
			break;
		}

		if (__mptcp_check_fallback(msk)) {
			/* if we are running under the workqueue, TCP could have
			 * collapsed skbs between dummy map creation and now;
			 * be sure to adjust the size
			 */
			map_remaining = skb->len;
			subflow->map_data_len = skb->len;
		}

		offset = seq - TCP_SKB_CB(skb)->seq;
		fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
		if (fin) {
			done = true;
			seq++;
		}

		if (offset < skb->len) {
			size_t len = skb->len - offset;

			if (__mptcp_move_skb(msk, ssk, skb, offset, len))
				moved += len;
			seq += len;

			if (WARN_ON_ONCE(map_remaining < len))
				break;
		} else {
			WARN_ON_ONCE(!fin);
			sk_eat_skb(ssk, skb);
			done = true;
		}

		WRITE_ONCE(tp->copied_seq, seq);
		more_data_avail = mptcp_subflow_data_available(ssk);

		if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf) {
			done = true;
			break;
		}
	} while (more_data_avail);
	WRITE_ONCE(msk->ack_hint, ssk);

	*bytes += moved;
	return done;
}
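
/* Move contiguous, now in-order skbs out of the out-of-order rbtree and append
 * them to the msk receive queue, advancing msk->ack_seq; returns true if the
 * receive queue grew.
 */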
static bool __mptcp_ofo_queue(struct mptcp_sock *msk)
{
	struct sock *sk = (struct sock *)msk;
	struct sk_buff *skb, *tail;
	bool moved = false;
	struct rb_node *p;
	u64 end_seq;

	p = rb_first(&msk->out_of_order_queue);
	pr_debug("msk=%p empty=%d", msk, RB_EMPTY_ROOT(&msk->out_of_order_queue));
	while (p) {
		skb = rb_to_skb(p);
		if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq))
			break;

		p = rb_next(p);
		rb_erase(&skb->rbnode, &msk->out_of_order_queue);

		if (unlikely(!after64(MPTCP_SKB_CB(skb)->end_seq,
				      msk->ack_seq))) {
			mptcp_drop(sk, skb);
			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
			continue;
		}

		end_seq = MPTCP_SKB_CB(skb)->end_seq;
		tail = skb_peek_tail(&sk->sk_receive_queue);
		if (!tail || !mptcp_ooo_try_coalesce(msk, tail, skb)) {
			int delta = msk->ack_seq - MPTCP_SKB_CB(skb)->map_seq;

			/* skip overlapping data, if any */
			pr_debug("uncoalesced seq=%llx ack seq=%llx delta=%d",
				 MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq,
				 delta);
			MPTCP_SKB_CB(skb)->offset += delta;
			__skb_queue_tail(&sk->sk_receive_queue, skb);
		}
		msk->ack_seq = end_seq;
		moved = true;
	}
	return moved;
}

/* In most cases we will be able to lock the mptcp socket.  If it's already
 * owned, we need to defer to the work queue to avoid ABBA deadlock.
 */
static void move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
{
	struct sock *sk = (struct sock *)msk;
	unsigned int moved = 0;

	if (inet_sk_state_load(sk) == TCP_CLOSE)
		return;

	mptcp_data_lock(sk);

	__mptcp_move_skbs_from_subflow(msk, ssk, &moved);
	__mptcp_ofo_queue(msk);

	/* If the moves have caught up with the DATA_FIN sequence number
	 * it's time to ack the DATA_FIN and change socket state, but
	 * this is not a good place to change state. Let the workqueue
	 * do it.
	 */
	if (mptcp_pending_data_fin(sk, NULL))
		mptcp_schedule_work(sk);
	mptcp_data_unlock(sk);
}

void mptcp_data_ready(struct sock *sk, struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	struct mptcp_sock *msk = mptcp_sk(sk);
	int sk_rbuf, ssk_rbuf;
	bool wake;

	/* The peer can send data while we are shutting down this
	 * subflow at msk destruction time, but we must avoid enqueuing
	 * more data to the msk receive queue
	 */
	if (unlikely(subflow->disposable))
		return;

	/* move_skbs_to_msk below can legitimately clear the data_avail flag,
	 * but we will need later to properly wake the reader, cache its
	 * value
	 */
	wake = subflow->data_avail == MPTCP_SUBFLOW_DATA_AVAIL;
	if (wake)
		set_bit(MPTCP_DATA_READY, &msk->flags);

	ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf);
	sk_rbuf = READ_ONCE(sk->sk_rcvbuf);
	if (unlikely(ssk_rbuf > sk_rbuf))
		sk_rbuf = ssk_rbuf;

	/* over limit? can't append more skbs to msk */
	if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf)
		goto wake;

	move_skbs_to_msk(msk, ssk);

wake:
	if (wake)
		sk->sk_data_ready(sk);
}

void __mptcp_flush_join_list(struct mptcp_sock *msk)
{
	if (likely(list_empty(&msk->join_list)))
		return;

	spin_lock_bh(&msk->join_list_lock);
	list_splice_tail_init(&msk->join_list, &msk->conn_list);
	spin_unlock_bh(&msk->join_list_lock);
}

static bool mptcp_timer_pending(struct sock *sk)
{
	return timer_pending(&inet_csk(sk)->icsk_retransmit_timer);
}

static void mptcp_reset_timer(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	unsigned long tout;

	/* prevent rescheduling on close */
	if (unlikely(inet_sk_state_load(sk) == TCP_CLOSE))
		return;

	/* should never be called with mptcp level timer cleared */
	tout = READ_ONCE(mptcp_sk(sk)->timer_ival);
	if (WARN_ON_ONCE(!tout))
		tout = TCP_RTO_MIN;
	sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + tout);
}

bool mptcp_schedule_work(struct sock *sk)
{
	if (inet_sk_state_load(sk) != TCP_CLOSE &&
	    schedule_work(&mptcp_sk(sk)->work)) {
		/* each subflow already holds a reference to the sk, and the
		 * workqueue is invoked by a subflow, so sk can't go away here.
		 */
		sock_hold(sk);
		return true;
	}
	return false;
}

void mptcp_subflow_eof(struct sock *sk)
{
	if (!test_and_set_bit(MPTCP_WORK_EOF, &mptcp_sk(sk)->flags))
		mptcp_schedule_work(sk);
}

static void mptcp_check_for_eof(struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow;
	struct sock *sk = (struct sock *)msk;
	int receivers = 0;

	mptcp_for_each_subflow(msk, subflow)
		receivers += !subflow->rx_eof;
	if (receivers)
		return;

	if (!(sk->sk_shutdown & RCV_SHUTDOWN)) {
		/* hopefully temporary hack: propagate shutdown status
		 * to msk, when all subflows agree on it
		 */
		sk->sk_shutdown |= RCV_SHUTDOWN;

		smp_mb__before_atomic(); /* SHUTDOWN must be visible first */
		set_bit(MPTCP_DATA_READY, &msk->flags);
		sk->sk_data_ready(sk);
	}

	switch (sk->sk_state) {
	case TCP_ESTABLISHED:
		inet_sk_state_store(sk, TCP_CLOSE_WAIT);
		break;
	case TCP_FIN_WAIT1:
		inet_sk_state_store(sk, TCP_CLOSING);
		break;
	case TCP_FIN_WAIT2:
		inet_sk_state_store(sk, TCP_CLOSE);
		break;
	default:
		return;
	}
	mptcp_close_wake_up(sk);
}

static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow;
	struct sock *sk = (struct sock *)msk;

	sock_owned_by_me(sk);

	mptcp_for_each_subflow(msk, subflow) {
		if (subflow->data_avail)
			return mptcp_subflow_tcp_sock(subflow);
	}

	return NULL;
}

static bool mptcp_skb_can_collapse_to(u64 write_seq,
				      const struct sk_buff *skb,
				      const struct mptcp_ext *mpext)
{
	if (!tcp_skb_can_collapse_to(skb))
		return false;

	/* can collapse only if MPTCP level sequence is in order and this
	 * mapping has not been xmitted yet
	 */
	return mpext && mpext->data_seq + mpext->data_len == write_seq &&
	       !mpext->frozen;
}

static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk,
				       const struct page_frag *pfrag,
				       const struct mptcp_data_frag *df)
{
	return df && pfrag->page == df->page &&
		pfrag->size - pfrag->offset > 0 &&
		df->data_seq + df->data_len == msk->write_seq;
}
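
/* Transmit memory is pre-reserved on the msk via msk->wmem_reserved: a
 * positive value is forward-allocated memory not yet charged to any skb,
 * zero means no reservation and a negative value flags a previous allocation
 * failure, so the next allocation attempt will wait for memory instead.
 */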
static int mptcp_wmem_with_overhead(struct sock *sk, int size)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	int ret, skbs;

	ret = size + ((sizeof(struct mptcp_data_frag) * size) >> PAGE_SHIFT);
	skbs = (msk->tx_pending_data + size) / msk->size_goal_cache;
	if (skbs < msk->skb_tx_cache.qlen)
		return ret;

	return ret + (skbs - msk->skb_tx_cache.qlen) * SKB_TRUESIZE(MAX_TCP_HEADER);
}

static void __mptcp_wmem_reserve(struct sock *sk, int size)
{
	int amount = mptcp_wmem_with_overhead(sk, size);
	struct mptcp_sock *msk = mptcp_sk(sk);

	WARN_ON_ONCE(msk->wmem_reserved);
	if (WARN_ON_ONCE(amount < 0))
		goto nomem;

	if (amount <= sk->sk_forward_alloc)
		goto reserve;

	/* under memory pressure try to reserve at most a single page
	 * otherwise try to reserve the full estimate and fallback
	 * to a single page before entering the error path
	 */
	if ((tcp_under_memory_pressure(sk) && amount > PAGE_SIZE) ||
	    !sk_wmem_schedule(sk, amount)) {
		if (amount <= PAGE_SIZE)
			goto nomem;

		amount = PAGE_SIZE;
		if (!sk_wmem_schedule(sk, amount))
			goto nomem;
	}

reserve:
	msk->wmem_reserved = amount;
	sk->sk_forward_alloc -= amount;
	return;

nomem:
	/* we will wait for memory on next allocation */
	msk->wmem_reserved = -1;
}

static void __mptcp_update_wmem(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	if (!msk->wmem_reserved)
		return;

	if (msk->wmem_reserved < 0)
		msk->wmem_reserved = 0;
	if (msk->wmem_reserved > 0) {
		sk->sk_forward_alloc += msk->wmem_reserved;
		msk->wmem_reserved = 0;
	}
}

static bool mptcp_wmem_alloc(struct sock *sk, int size)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	/* check for pre-existing error condition */
	if (msk->wmem_reserved < 0)
		return false;

	if (msk->wmem_reserved >= size)
		goto account;

	mptcp_data_lock(sk);
	if (!sk_wmem_schedule(sk, size)) {
		mptcp_data_unlock(sk);
		return false;
	}

	sk->sk_forward_alloc -= size;
	msk->wmem_reserved += size;
	mptcp_data_unlock(sk);

account:
	msk->wmem_reserved -= size;
	return true;
}

static void mptcp_wmem_uncharge(struct sock *sk, int size)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	if (msk->wmem_reserved < 0)
		msk->wmem_reserved = 0;
	msk->wmem_reserved += size;
}

static void mptcp_mem_reclaim_partial(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	/* if we are experiencing a transient allocation error,
	 * the forward allocation memory has been already
	 * released
	 */
	if (msk->wmem_reserved < 0)
		return;

	mptcp_data_lock(sk);
	sk->sk_forward_alloc += msk->wmem_reserved;
	sk_mem_reclaim_partial(sk);
	msk->wmem_reserved = sk->sk_forward_alloc;
	sk->sk_forward_alloc = 0;
	mptcp_data_unlock(sk);
}

static void dfrag_uncharge(struct sock *sk, int len)
{
	sk_mem_uncharge(sk, len);
	sk_wmem_queued_add(sk, -len);
}

static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag)
{
	int len = dfrag->data_len + dfrag->overhead;

	list_del(&dfrag->list);
	dfrag_uncharge(sk, len);
	put_page(dfrag->page);
}

static void __mptcp_clean_una(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct mptcp_data_frag *dtmp, *dfrag;
	bool cleaned = false;
	u64 snd_una;

	/* on fallback we just need to ignore snd_una, as this is really
	 * plain TCP
	 */
	if (__mptcp_check_fallback(msk))
		msk->snd_una = READ_ONCE(msk->snd_nxt);

	snd_una = msk->snd_una;
	list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) {
		if (after64(dfrag->data_seq + dfrag->data_len, snd_una))
			break;

		if (WARN_ON_ONCE(dfrag == msk->first_pending))
			break;
		dfrag_clear(sk, dfrag);
		cleaned = true;
	}

	dfrag = mptcp_rtx_head(sk);
	if (dfrag && after64(snd_una, dfrag->data_seq)) {
		u64 delta = snd_una - dfrag->data_seq;

		if (WARN_ON_ONCE(delta > dfrag->already_sent))
			goto out;

		dfrag->data_seq += delta;
		dfrag->offset += delta;
		dfrag->data_len -= delta;
		dfrag->already_sent -= delta;

		dfrag_uncharge(sk, delta);
		cleaned = true;
	}

out:
	if (cleaned) {
		if (tcp_under_memory_pressure(sk)) {
			__mptcp_update_wmem(sk);
			sk_mem_reclaim_partial(sk);
		}

		if (sk_stream_is_writeable(sk)) {
			/* pairs with memory barrier in mptcp_poll */
			smp_mb();
			if (test_and_clear_bit(MPTCP_NOSPACE, &msk->flags))
				sk_stream_write_space(sk);
		}
	}

	if (snd_una == READ_ONCE(msk->snd_nxt)) {
		if (msk->timer_ival)
			mptcp_stop_timer(sk);
	} else {
		mptcp_reset_timer(sk);
	}
}

static void mptcp_enter_memory_pressure(struct sock *sk)
{
	struct mptcp_subflow_context *subflow;
	struct mptcp_sock *msk = mptcp_sk(sk);
	bool first = true;

	sk_stream_moderate_sndbuf(sk);
	mptcp_for_each_subflow(msk, subflow) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

		if (first)
			tcp_enter_memory_pressure(ssk);
		sk_stream_moderate_sndbuf(ssk);
		first = false;
	}
}

/* ensure we get enough memory for the frag hdr, beyond some minimal amount of
 * data
 */
static bool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
{
	if (likely(skb_page_frag_refill(32U + sizeof(struct mptcp_data_frag),
					pfrag, sk->sk_allocation)))
		return true;

	mptcp_enter_memory_pressure(sk);
	return false;
}
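
/* Each pending chunk of user data is described by a struct mptcp_data_frag
 * carved from the head of the page fragment itself, right before the data it
 * describes; dfrag->overhead accounts for that header plus alignment padding.
 */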
static struct mptcp_data_frag *
mptcp_carve_data_frag(const struct mptcp_sock *msk, struct page_frag *pfrag,
		      int orig_offset)
{
	int offset = ALIGN(orig_offset, sizeof(long));
	struct mptcp_data_frag *dfrag;

	dfrag = (struct mptcp_data_frag *)(page_to_virt(pfrag->page) + offset);
	dfrag->data_len = 0;
	dfrag->data_seq = msk->write_seq;
	dfrag->overhead = offset - orig_offset + sizeof(struct mptcp_data_frag);
	dfrag->offset = offset + sizeof(struct mptcp_data_frag);
	dfrag->already_sent = 0;
	dfrag->page = pfrag->page;

	return dfrag;
}

struct mptcp_sendmsg_info {
	int mss_now;
	int size_goal;
	u16 limit;
	u16 sent;
	unsigned int flags;
};

static int mptcp_check_allowed_size(struct mptcp_sock *msk, u64 data_seq,
				    int avail_size)
{
	u64 window_end = mptcp_wnd_end(msk);

	if (__mptcp_check_fallback(msk))
		return avail_size;

	if (!before64(data_seq + avail_size, window_end)) {
		u64 allowed_size = window_end - data_seq;

		return min_t(unsigned int, allowed_size, avail_size);
	}

	return avail_size;
}

static bool __mptcp_add_ext(struct sk_buff *skb, gfp_t gfp)
{
	struct skb_ext *mpext = __skb_ext_alloc(gfp);

	if (!mpext)
		return false;
	__skb_ext_set(skb, SKB_EXT_MPTCP, mpext);
	return true;
}

static struct sk_buff *__mptcp_do_alloc_tx_skb(struct sock *sk, gfp_t gfp)
{
	struct sk_buff *skb;

	skb = alloc_skb_fclone(MAX_TCP_HEADER, gfp);
	if (likely(skb)) {
		if (likely(__mptcp_add_ext(skb, gfp))) {
			skb_reserve(skb, MAX_TCP_HEADER);
			skb->reserved_tailroom = skb->end - skb->tail;
			return skb;
		}
		__kfree_skb(skb);
	} else {
		mptcp_enter_memory_pressure(sk);
	}
	return NULL;
}

static bool mptcp_tx_cache_refill(struct sock *sk, int size,
				  struct sk_buff_head *skbs, int *total_ts)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct sk_buff *skb;
	int space_needed;

	if (unlikely(tcp_under_memory_pressure(sk))) {
		mptcp_mem_reclaim_partial(sk);

		/* under pressure pre-allocate at most a single skb */
		if (msk->skb_tx_cache.qlen)
			return true;
		space_needed = msk->size_goal_cache;
	} else {
		space_needed = msk->tx_pending_data + size -
			       msk->skb_tx_cache.qlen * msk->size_goal_cache;
	}

	while (space_needed > 0) {
		skb = __mptcp_do_alloc_tx_skb(sk, sk->sk_allocation);
		if (unlikely(!skb)) {
			/* under memory pressure, try to pass the caller a
			 * single skb to allow forward progress
			 */
			while (skbs->qlen > 1) {
				skb = __skb_dequeue_tail(skbs);
				__kfree_skb(skb);
			}
			return skbs->qlen > 0;
		}

		*total_ts += skb->truesize;
		__skb_queue_tail(skbs, skb);
		space_needed -= msk->size_goal_cache;
	}
	return true;
}

static bool __mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct sk_buff *skb;

	if (ssk->sk_tx_skb_cache) {
		skb = ssk->sk_tx_skb_cache;
		if (unlikely(!skb_ext_find(skb, SKB_EXT_MPTCP) &&
			     !__mptcp_add_ext(skb, gfp)))
			return false;
		return true;
	}

	skb = skb_peek(&msk->skb_tx_cache);
	if (skb) {
		if (likely(sk_wmem_schedule(ssk, skb->truesize))) {
			skb = __skb_dequeue(&msk->skb_tx_cache);
			if (WARN_ON_ONCE(!skb))
				return false;

			mptcp_wmem_uncharge(sk, skb->truesize);
			ssk->sk_tx_skb_cache = skb;
			return true;
		}

		/* over memory limit, no point to try to allocate a new skb */
		return false;
	}

	skb = __mptcp_do_alloc_tx_skb(sk, gfp);
	if (!skb)
		return false;

	if (likely(sk_wmem_schedule(ssk, skb->truesize))) {
		ssk->sk_tx_skb_cache = skb;
		return true;
	}
	kfree_skb(skb);
	return false;
}

static bool mptcp_must_reclaim_memory(struct sock *sk, struct sock *ssk)
{
	return !ssk->sk_tx_skb_cache &&
	       !skb_peek(&mptcp_sk(sk)->skb_tx_cache) &&
	       tcp_under_memory_pressure(sk);
}

static bool mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk)
{
	if (unlikely(mptcp_must_reclaim_memory(sk, ssk)))
		mptcp_mem_reclaim_partial(sk);
	return __mptcp_alloc_tx_skb(sk, ssk, sk->sk_allocation);
}

static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
			      struct mptcp_data_frag *dfrag,
			      struct mptcp_sendmsg_info *info)
{
	u64 data_seq = dfrag->data_seq + info->sent;
	struct mptcp_sock *msk = mptcp_sk(sk);
	bool zero_window_probe = false;
	struct mptcp_ext *mpext = NULL;
	struct sk_buff *skb, *tail;
	bool can_collapse = false;
	int size_bias = 0;
	int avail_size;
	size_t ret = 0;

	pr_debug("msk=%p ssk=%p sending dfrag at seq=%lld len=%d already sent=%d",
		 msk, ssk, dfrag->data_seq, dfrag->data_len, info->sent);

	/* compute send limit */
	info->mss_now = tcp_send_mss(ssk, &info->size_goal, info->flags);
	avail_size = info->size_goal;
	msk->size_goal_cache = info->size_goal;
	skb = tcp_write_queue_tail(ssk);
	if (skb) {
		/* Limit the write to the size available in the
		 * current skb, if any, so that we create at most a new skb.
		 * Explicitly tells TCP internals to avoid collapsing on later
		 * queue management operation, to avoid breaking the ext <->
		 * SSN association set here
		 */
		mpext = skb_ext_find(skb, SKB_EXT_MPTCP);
		can_collapse = (info->size_goal - skb->len > 0) &&
			       mptcp_skb_can_collapse_to(data_seq, skb, mpext);
		if (!can_collapse) {
			TCP_SKB_CB(skb)->eor = 1;
		} else {
			size_bias = skb->len;
			avail_size = info->size_goal - skb->len;
		}
	}

	/* Zero window and all data acked? Probe. */
	avail_size = mptcp_check_allowed_size(msk, data_seq, avail_size);
	if (avail_size == 0) {
		u64 snd_una = READ_ONCE(msk->snd_una);

		if (skb || snd_una != msk->snd_nxt)
			return 0;
		zero_window_probe = true;
		data_seq = snd_una - 1;
		avail_size = 1;
	}

	if (WARN_ON_ONCE(info->sent > info->limit ||
			 info->limit > dfrag->data_len))
		return 0;

	ret = info->limit - info->sent;
	tail = tcp_build_frag(ssk, avail_size + size_bias, info->flags,
			      dfrag->page, dfrag->offset + info->sent, &ret);
	if (!tail) {
		tcp_remove_empty_skb(sk, tcp_write_queue_tail(ssk));
		return -ENOMEM;
	}

	/* if the tail skb is still the cached one, collapsing really happened.
	 */
	if (skb == tail) {
		TCP_SKB_CB(tail)->tcp_flags &= ~TCPHDR_PSH;
		mpext->data_len += ret;
		WARN_ON_ONCE(!can_collapse);
		WARN_ON_ONCE(zero_window_probe);
		goto out;
	}

	mpext = skb_ext_find(tail, SKB_EXT_MPTCP);
	if (WARN_ON_ONCE(!mpext)) {
		/* should never reach here, stream corrupted */
		return -EINVAL;
	}

	memset(mpext, 0, sizeof(*mpext));
	mpext->data_seq = data_seq;
	mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq;
	mpext->data_len = ret;
	mpext->use_map = 1;
	mpext->dsn64 = 1;

	pr_debug("data_seq=%llu subflow_seq=%u data_len=%u dsn64=%d",
		 mpext->data_seq, mpext->subflow_seq, mpext->data_len,
		 mpext->dsn64);

	if (zero_window_probe) {
		mptcp_subflow_ctx(ssk)->rel_write_seq += ret;
		mpext->frozen = 1;
		ret = 0;
		tcp_push_pending_frames(ssk);
	}
out:
	mptcp_subflow_ctx(ssk)->rel_write_seq += ret;
	return ret;
}

#define MPTCP_SEND_BURST_SIZE		((1 << 16) - \
					 sizeof(struct tcphdr) - \
					 MAX_TCP_OPTION_SPACE - \
					 sizeof(struct ipv6hdr) - \
					 sizeof(struct frag_hdr))

struct subflow_send_info {
	struct sock *ssk;
	u64 ratio;
};
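
/* Packet scheduler: send_info[0] tracks the best regular subflow and
 * send_info[1] the best backup one; "best" means the lowest ratio between
 * queued wmem and pacing rate, i.e. the subflow expected to drain its queue
 * soonest.
 */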
static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk,
					   u32 *sndbuf)
{
	struct subflow_send_info send_info[2];
	struct mptcp_subflow_context *subflow;
	int i, nr_active = 0;
	struct sock *ssk;
	u64 ratio;
	u32 pace;

	sock_owned_by_me((struct sock *)msk);

	*sndbuf = 0;
	if (__mptcp_check_fallback(msk)) {
		if (!msk->first)
			return NULL;
		*sndbuf = msk->first->sk_sndbuf;
		return sk_stream_memory_free(msk->first) ? msk->first : NULL;
	}

	/* re-use last subflow, if the burst allow that */
	if (msk->last_snd && msk->snd_burst > 0 &&
	    sk_stream_memory_free(msk->last_snd) &&
	    mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) {
		mptcp_for_each_subflow(msk, subflow) {
			ssk = mptcp_subflow_tcp_sock(subflow);
			*sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf);
		}
		return msk->last_snd;
	}

	/* pick the subflow with the lower wmem/wspace ratio */
	for (i = 0; i < 2; ++i) {
		send_info[i].ssk = NULL;
		send_info[i].ratio = -1;
	}
	mptcp_for_each_subflow(msk, subflow) {
		ssk = mptcp_subflow_tcp_sock(subflow);
		if (!mptcp_subflow_active(subflow))
			continue;

		nr_active += !subflow->backup;
		*sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf);
		if (!sk_stream_memory_free(subflow->tcp_sock))
			continue;

		pace = READ_ONCE(ssk->sk_pacing_rate);
		if (!pace)
			continue;

		ratio = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32,
				pace);
		if (ratio < send_info[subflow->backup].ratio) {
			send_info[subflow->backup].ssk = ssk;
			send_info[subflow->backup].ratio = ratio;
		}
	}

	pr_debug("msk=%p nr_active=%d ssk=%p:%lld backup=%p:%lld",
		 msk, nr_active, send_info[0].ssk, send_info[0].ratio,
		 send_info[1].ssk, send_info[1].ratio);

	/* pick the best backup if no other subflow is active */
	if (!nr_active)
		send_info[0].ssk = send_info[1].ssk;

	if (send_info[0].ssk) {
		msk->last_snd = send_info[0].ssk;
		msk->snd_burst = min_t(int, MPTCP_SEND_BURST_SIZE,
				       sk_stream_wspace(msk->last_snd));
		return msk->last_snd;
	}
	return NULL;
}

static void mptcp_push_release(struct sock *sk, struct sock *ssk,
			       struct mptcp_sendmsg_info *info)
{
	mptcp_set_timeout(sk, ssk);
	tcp_push(ssk, 0, info->mss_now, tcp_sk(ssk)->nonagle, info->size_goal);
	release_sock(ssk);
}
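
/* Flush the msk pending data to the subflows: for each dfrag, repeatedly pick
 * a subflow via the scheduler and transmit, keeping the same subflow socket
 * locked across consecutive fragments whenever possible.
 */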
static void mptcp_push_pending(struct sock *sk, unsigned int flags)
{
	struct sock *prev_ssk = NULL, *ssk = NULL;
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct mptcp_sendmsg_info info = {
				.flags = flags,
	};
	struct mptcp_data_frag *dfrag;
	int len, copied = 0;
	u32 sndbuf;

	while ((dfrag = mptcp_send_head(sk))) {
		info.sent = dfrag->already_sent;
		info.limit = dfrag->data_len;
		len = dfrag->data_len - dfrag->already_sent;
		while (len > 0) {
			int ret = 0;

			prev_ssk = ssk;
			__mptcp_flush_join_list(msk);
			ssk = mptcp_subflow_get_send(msk, &sndbuf);

			/* do auto tuning */
			if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK) &&
			    sndbuf > READ_ONCE(sk->sk_sndbuf))
				WRITE_ONCE(sk->sk_sndbuf, sndbuf);

			/* try to keep the subflow socket lock across
			 * consecutive xmit on the same socket
			 */
			if (ssk != prev_ssk && prev_ssk)
				mptcp_push_release(sk, prev_ssk, &info);
			if (!ssk)
				goto out;

			if (ssk != prev_ssk || !prev_ssk)
				lock_sock(ssk);

			/* keep it simple and always provide a new skb for the
			 * subflow, even if we will not use it when collapsing
			 * on the pending one
			 */
			if (!mptcp_alloc_tx_skb(sk, ssk)) {
				mptcp_push_release(sk, ssk, &info);
				goto out;
			}

			ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
			if (ret <= 0) {
				mptcp_push_release(sk, ssk, &info);
				goto out;
			}

			info.sent += ret;
			dfrag->already_sent += ret;
			msk->snd_nxt += ret;
			msk->snd_burst -= ret;
			msk->tx_pending_data -= ret;
			copied += ret;
			len -= ret;
		}
		WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
	}

	/* at this point we held the socket lock for the last subflow we used */
	if (ssk)
		mptcp_push_release(sk, ssk, &info);

out:
	if (copied) {
		/* start the timer, if it's not pending */
		if (!mptcp_timer_pending(sk))
			mptcp_reset_timer(sk);
		__mptcp_check_send_data_fin(sk);
	}
}
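
/* Push path used when the caller cannot sleep and already holds the subflow
 * socket lock: skb allocations use GFP_ATOMIC and the reserved wmem is
 * flushed back by hand instead of via release_sock().
 */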
static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct mptcp_sendmsg_info info;
	struct mptcp_data_frag *dfrag;
	int len, copied = 0;

	info.flags = 0;
	while ((dfrag = mptcp_send_head(sk))) {
		info.sent = dfrag->already_sent;
		info.limit = dfrag->data_len;
		len = dfrag->data_len - dfrag->already_sent;
		while (len > 0) {
			int ret = 0;

			/* do auto tuning */
			if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK) &&
			    ssk->sk_sndbuf > READ_ONCE(sk->sk_sndbuf))
				WRITE_ONCE(sk->sk_sndbuf, ssk->sk_sndbuf);

			if (unlikely(mptcp_must_reclaim_memory(sk, ssk))) {
				__mptcp_update_wmem(sk);
				sk_mem_reclaim_partial(sk);
			}
			if (!__mptcp_alloc_tx_skb(sk, ssk, GFP_ATOMIC))
				goto out;

			ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
			if (ret <= 0)
				goto out;

			info.sent += ret;
			dfrag->already_sent += ret;
			msk->snd_nxt += ret;
			msk->snd_burst -= ret;
			msk->tx_pending_data -= ret;
			copied += ret;
			len -= ret;
		}
		WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
	}

out:
	/* __mptcp_alloc_tx_skb could have released some wmem and we are
	 * not going to flush it via release_sock()
	 */
	__mptcp_update_wmem(sk);
	if (copied) {
		mptcp_set_timeout(sk, ssk);
		tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
			 info.size_goal);
		if (msk->snd_data_fin_enable &&
		    msk->snd_nxt + 1 == msk->write_seq)
			mptcp_schedule_work(sk);
	}
}

static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct page_frag *pfrag;
	size_t copied = 0;
	int ret = 0;
	long timeo;

	if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL))
		return -EOPNOTSUPP;

	mptcp_lock_sock(sk, __mptcp_wmem_reserve(sk, min_t(size_t, 1 << 20, len)));

	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
		ret = sk_stream_wait_connect(sk, &timeo);
		if (ret)
			goto out;
	}

	pfrag = sk_page_frag(sk);

	while (msg_data_left(msg)) {
		int total_ts, frag_truesize = 0;
		struct mptcp_data_frag *dfrag;
		struct sk_buff_head skbs;
		bool dfrag_collapsed;
		size_t psize, offset;

		if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) {
			ret = -EPIPE;
			goto out;
		}

		/* reuse tail pfrag, if possible, or carve a new one from the
		 * page allocator
		 */
		dfrag = mptcp_pending_tail(sk);
		dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag);
		if (!dfrag_collapsed) {
			if (!sk_stream_memory_free(sk))
				goto wait_for_memory;

			if (!mptcp_page_frag_refill(sk, pfrag))
				goto wait_for_memory;

			dfrag = mptcp_carve_data_frag(msk, pfrag, pfrag->offset);
			frag_truesize = dfrag->overhead;
		}

		/* we do not bound vs wspace, to allow a single packet.
		 * memory accounting will prevent excessive memory usage
		 * anyway
		 */
		offset = dfrag->offset + dfrag->data_len;
		psize = pfrag->size - offset;
		psize = min_t(size_t, psize, msg_data_left(msg));
		total_ts = psize + frag_truesize;
		__skb_queue_head_init(&skbs);
		if (!mptcp_tx_cache_refill(sk, psize, &skbs, &total_ts))
			goto wait_for_memory;

		if (!mptcp_wmem_alloc(sk, total_ts)) {
			__skb_queue_purge(&skbs);
			goto wait_for_memory;
		}

		skb_queue_splice_tail(&skbs, &msk->skb_tx_cache);
		if (copy_page_from_iter(dfrag->page, offset, psize,
					&msg->msg_iter) != psize) {
			mptcp_wmem_uncharge(sk, psize + frag_truesize);
			ret = -EFAULT;
			goto out;
		}

		/* data successfully copied into the write queue */
		copied += psize;
		dfrag->data_len += psize;
		frag_truesize += psize;
		pfrag->offset += frag_truesize;
		WRITE_ONCE(msk->write_seq, msk->write_seq + psize);
		msk->tx_pending_data += psize;

		/* charge data on mptcp pending queue to the msk socket
		 * Note: we charge such data both to sk and ssk
		 */
		sk_wmem_queued_add(sk, frag_truesize);
		if (!dfrag_collapsed) {
			get_page(dfrag->page);
			list_add_tail(&dfrag->list, &msk->rtx_queue);
			if (!msk->first_pending)
				WRITE_ONCE(msk->first_pending, dfrag);
		}
		pr_debug("msk=%p dfrag at seq=%lld len=%d sent=%d new=%d", msk,
			 dfrag->data_seq, dfrag->data_len, dfrag->already_sent,
			 !dfrag_collapsed);

		continue;

wait_for_memory:
		set_bit(MPTCP_NOSPACE, &msk->flags);
		mptcp_push_pending(sk, msg->msg_flags);
		ret = sk_stream_wait_memory(sk, &timeo);
		if (ret)
			goto out;
	}

	if (copied)
		mptcp_push_pending(sk, msg->msg_flags);

out:
	release_sock(sk);
	return copied ? : ret;
}

static void mptcp_wait_data(struct sock *sk, long *timeo)
{
	DEFINE_WAIT_FUNC(wait, woken_wake_function);
	struct mptcp_sock *msk = mptcp_sk(sk);

	add_wait_queue(sk_sleep(sk), &wait);
	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);

	sk_wait_event(sk, timeo,
		      test_and_clear_bit(MPTCP_DATA_READY, &msk->flags), &wait);

	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	remove_wait_queue(sk_sleep(sk), &wait);
}
static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
				struct msghdr *msg,
				size_t len)
{
	struct sk_buff *skb;
	int copied = 0;

	while ((skb = skb_peek(&msk->receive_queue)) != NULL) {
		u32 offset = MPTCP_SKB_CB(skb)->offset;
		u32 data_len = skb->len - offset;
		u32 count = min_t(size_t, len - copied, data_len);
		int err;

		err = skb_copy_datagram_msg(skb, offset, msg, count);
		if (unlikely(err < 0)) {
			if (!copied)
				return err;
			break;
		}

		copied += count;

		if (count < data_len) {
			MPTCP_SKB_CB(skb)->offset += count;
			break;
		}

		/* we will bulk release the skb memory later */
		skb->destructor = NULL;
		msk->rmem_released += skb->truesize;
		__skb_unlink(skb, &msk->receive_queue);
		__kfree_skb(skb);

		if (copied >= len)
			break;
	}

	return copied;
}

/* receive buffer autotuning.  See tcp_rcv_space_adjust for more information.
 *
 * Only difference: Use highest rtt estimate of the subflows in use.
 */
static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
{
	struct mptcp_subflow_context *subflow;
	struct sock *sk = (struct sock *)msk;
	u32 time, advmss = 1;
	u64 rtt_us, mstamp;

	sock_owned_by_me(sk);

	if (copied <= 0)
		return;

	msk->rcvq_space.copied += copied;

	mstamp = div_u64(tcp_clock_ns(), NSEC_PER_USEC);
	time = tcp_stamp_us_delta(mstamp, msk->rcvq_space.time);

	rtt_us = msk->rcvq_space.rtt_us;
	if (rtt_us && time < (rtt_us >> 3))
		return;

	rtt_us = 0;
	mptcp_for_each_subflow(msk, subflow) {
		const struct tcp_sock *tp;
		u64 sf_rtt_us;
		u32 sf_advmss;

		tp = tcp_sk(mptcp_subflow_tcp_sock(subflow));

		sf_rtt_us = READ_ONCE(tp->rcv_rtt_est.rtt_us);
		sf_advmss = READ_ONCE(tp->advmss);

		rtt_us = max(sf_rtt_us, rtt_us);
		advmss = max(sf_advmss, advmss);
	}

	msk->rcvq_space.rtt_us = rtt_us;
	if (time < (rtt_us >> 3) || rtt_us == 0)
		return;

	if (msk->rcvq_space.copied <= msk->rcvq_space.space)
		goto new_measure;

	if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf &&
	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
		int rcvmem, rcvbuf;
		u64 rcvwin, grow;

		rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss;

		grow = rcvwin * (msk->rcvq_space.copied - msk->rcvq_space.space);

		do_div(grow, msk->rcvq_space.space);
		rcvwin += (grow << 1);

		rcvmem = SKB_TRUESIZE(advmss + MAX_TCP_HEADER);
		while (tcp_win_from_space(sk, rcvmem) < advmss)
			rcvmem += 128;

		do_div(rcvwin, advmss);
		rcvbuf = min_t(u64, rcvwin * rcvmem,
			       sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);

		if (rcvbuf > sk->sk_rcvbuf) {
			u32 window_clamp;

			window_clamp = tcp_win_from_space(sk, rcvbuf);
			WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);

			/* Make subflows follow along.  If we do not do this, we
			 * get drops at subflow level if skbs can't be moved to
			 * the mptcp rx queue fast enough (announced rcv_win can
			 * exceed ssk->sk_rcvbuf).
			 */
			mptcp_for_each_subflow(msk, subflow) {
				struct sock *ssk;
				bool slow;

				ssk = mptcp_subflow_tcp_sock(subflow);
				slow = lock_sock_fast(ssk);
				WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf);
				tcp_sk(ssk)->window_clamp = window_clamp;
				tcp_cleanup_rbuf(ssk, 1);
				unlock_sock_fast(ssk, slow);
			}
		}
	}

	msk->rcvq_space.space = msk->rcvq_space.copied;
new_measure:
	msk->rcvq_space.copied = 0;
	msk->rcvq_space.time = mstamp;
}

static void __mptcp_update_rmem(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	if (!msk->rmem_released)
		return;

	atomic_sub(msk->rmem_released, &sk->sk_rmem_alloc);
	sk_mem_uncharge(sk, msk->rmem_released);
	msk->rmem_released = 0;
}

static void __mptcp_splice_receive_queue(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	skb_queue_splice_tail_init(&sk->sk_receive_queue, &msk->receive_queue);
}
static bool __mptcp_move_skbs(struct mptcp_sock *msk, unsigned int rcv)
{
	struct sock *sk = (struct sock *)msk;
	unsigned int moved = 0;
	bool ret, done;

	__mptcp_flush_join_list(msk);
	do {
		struct sock *ssk = mptcp_subflow_recv_lookup(msk);
		bool slowpath;

		/* we can have data pending in the subflows only if the msk
		 * receive buffer was full at subflow_data_ready() time,
		 * that is an unlikely slow path.
		 */
		if (likely(!ssk))
			break;

		slowpath = lock_sock_fast(ssk);
		mptcp_data_lock(sk);
		done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved);
		mptcp_data_unlock(sk);
		if (moved && rcv) {
			WRITE_ONCE(msk->rmem_pending, min(rcv, moved));
			tcp_cleanup_rbuf(ssk, 1);
			WRITE_ONCE(msk->rmem_pending, 0);
		}
		unlock_sock_fast(ssk, slowpath);
	} while (!done);

	/* acquire the data lock only if some input data is pending */
	ret = moved > 0;
	if (!RB_EMPTY_ROOT(&msk->out_of_order_queue) ||
	    !skb_queue_empty_lockless(&sk->sk_receive_queue)) {
		mptcp_data_lock(sk);
		__mptcp_update_rmem(sk);
		ret |= __mptcp_ofo_queue(msk);
		__mptcp_splice_receive_queue(sk);
		mptcp_data_unlock(sk);
	}
	if (ret)
		mptcp_check_data_fin((struct sock *)msk);
	return !skb_queue_empty(&msk->receive_queue);
}

static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
			 int nonblock, int flags, int *addr_len)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	int copied = 0;
	int target;
	long timeo;

	if (msg->msg_flags & ~(MSG_WAITALL | MSG_DONTWAIT))
		return -EOPNOTSUPP;

	mptcp_lock_sock(sk, __mptcp_splice_receive_queue(sk));
	if (unlikely(sk->sk_state == TCP_LISTEN)) {
		copied = -ENOTCONN;
		goto out_err;
	}

	timeo = sock_rcvtimeo(sk, nonblock);

	len = min_t(size_t, len, INT_MAX);
	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);

	while (copied < len) {
		int bytes_read, old_space;

		bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied);
		if (unlikely(bytes_read < 0)) {
			if (!copied)
				copied = bytes_read;
			goto out_err;
		}

		copied += bytes_read;

		if (skb_queue_empty(&msk->receive_queue) &&
		    __mptcp_move_skbs(msk, len - copied))
			continue;

		/* be sure to advertise window change */
		old_space = READ_ONCE(msk->old_wspace);
		if ((tcp_space(sk) - old_space) >= old_space)
			mptcp_cleanup_rbuf(msk);

		/* only the master socket status is relevant here. The exit
		 * conditions mirror closely tcp_recvmsg()
		 */
		if (copied >= target)
			break;

		if (copied) {
			if (sk->sk_err ||
			    sk->sk_state == TCP_CLOSE ||
			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
			    !timeo ||
			    signal_pending(current))
				break;
		} else {
			if (sk->sk_err) {
				copied = sock_error(sk);
				break;
			}

			if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags))
				mptcp_check_for_eof(msk);

			if (sk->sk_shutdown & RCV_SHUTDOWN) {
				/* race breaker: the shutdown could be after the
				 * previous receive queue check
				 */
				if (__mptcp_move_skbs(msk, len - copied))
					continue;
				break;
			}

			if (sk->sk_state == TCP_CLOSE) {
				copied = -ENOTCONN;
				break;
			}

			if (!timeo) {
				copied = -EAGAIN;
				break;
			}

			if (signal_pending(current)) {
				copied = sock_intr_errno(timeo);
				break;
			}
		}

		pr_debug("block timeout %ld", timeo);
		mptcp_wait_data(sk, &timeo);
	}

	if (skb_queue_empty_lockless(&sk->sk_receive_queue) &&
	    skb_queue_empty(&msk->receive_queue)) {
		/* entire backlog drained, clear DATA_READY. */
		clear_bit(MPTCP_DATA_READY, &msk->flags);

		/* .. race-breaker: ssk might have gotten new data
		 * after last __mptcp_move_skbs() returned false.
		 */
		if (unlikely(__mptcp_move_skbs(msk, 0)))
			set_bit(MPTCP_DATA_READY, &msk->flags);
	} else if (unlikely(!test_bit(MPTCP_DATA_READY, &msk->flags))) {
		/* data to read but mptcp_wait_data() cleared DATA_READY */
		set_bit(MPTCP_DATA_READY, &msk->flags);
	}

out_err:
	pr_debug("msk=%p data_ready=%d rx queue empty=%d copied=%d",
		 msk, test_bit(MPTCP_DATA_READY, &msk->flags),
		 skb_queue_empty_lockless(&sk->sk_receive_queue), copied);
	mptcp_rcv_space_adjust(msk, copied);

	release_sock(sk);
	return copied;
}

static void mptcp_retransmit_handler(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	set_bit(MPTCP_WORK_RTX, &msk->flags);
	mptcp_schedule_work(sk);
}

static void mptcp_retransmit_timer(struct timer_list *t)
{
	struct inet_connection_sock *icsk = from_timer(icsk, t,
						       icsk_retransmit_timer);
	struct sock *sk = &icsk->icsk_inet.sk;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		mptcp_retransmit_handler(sk);
	} else {
		/* delegate our work to tcp_release_cb() */
		if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED,
				      &sk->sk_tsq_flags))
			sock_hold(sk);
	}
	bh_unlock_sock(sk);
	sock_put(sk);
}

static void mptcp_timeout_timer(struct timer_list *t)
{
	struct sock *sk = from_timer(sk, t, sk_timer);

	mptcp_schedule_work(sk);
	sock_put(sk);
}

/* Find an idle subflow.  Return NULL if there is unacked data at tcp
 * level.
 *
 * A backup subflow is returned only if that is the only kind available.
 */
static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow;
	struct sock *backup = NULL;

	sock_owned_by_me((const struct sock *)msk);

	if (__mptcp_check_fallback(msk))
		return NULL;

	mptcp_for_each_subflow(msk, subflow) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

		if (!mptcp_subflow_active(subflow))
			continue;

		/* still data outstanding at TCP level? Don't retransmit. */
		if (!tcp_write_queue_empty(ssk)) {
			if (inet_csk(ssk)->icsk_ca_state >= TCP_CA_Loss)
				continue;
			return NULL;
		}

		if (subflow->backup) {
			if (!backup)
				backup = ssk;
			continue;
		}

		return ssk;
	}

	return backup;
}

/* subflow sockets can be either outgoing (connect) or incoming
 * (accept).
 *
 * Outgoing subflows use in-kernel sockets.
 * Incoming subflows do not have their own 'struct socket' allocated,
 * so we need to use tcp_close() after detaching them from the mptcp
 * parent socket.
 */
void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
		       struct mptcp_subflow_context *subflow)
{
	bool dispose_socket = false;
	struct socket *sock;

	list_del(&subflow->node);

	lock_sock_nested(ssk, SINGLE_DEPTH_NESTING);

	/* if we are invoked by the msk cleanup code, the subflow is
	 * already orphaned
	 */
	sock = ssk->sk_socket;
	if (sock) {
		dispose_socket = sock != sk->sk_socket;
		sock_orphan(ssk);
	}

	subflow->disposable = 1;

	/* if ssk hit tcp_done(), tcp_cleanup_ulp() cleared the related ops
	 * the ssk has been already destroyed, we just need to release the
	 * reference owned by msk;
	 */
	if (!inet_csk(ssk)->icsk_ulp_ops) {
		kfree_rcu(subflow, rcu);
	} else {
		/* otherwise tcp will dispose of the ssk and subflow ctx */
		__tcp_close(ssk, 0);

		/* close acquired an extra ref */
		__sock_put(ssk);
	}
	release_sock(ssk);
	if (dispose_socket)
		iput(SOCK_INODE(sock));

	sock_put(ssk);
}

static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu)
{
	return 0;
}

static void pm_work(struct mptcp_sock *msk)
{
	struct mptcp_pm_data *pm = &msk->pm;

	spin_lock_bh(&msk->pm.lock);

	pr_debug("msk=%p status=%x", msk, pm->status);
	if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) {
		pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED);
		mptcp_pm_nl_add_addr_received(msk);
	}
	if (pm->status & BIT(MPTCP_PM_ADD_ADDR_SEND_ACK)) {
		pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_SEND_ACK);
		mptcp_pm_nl_add_addr_send_ack(msk);
	}
	if (pm->status & BIT(MPTCP_PM_RM_ADDR_RECEIVED)) {
		pm->status &= ~BIT(MPTCP_PM_RM_ADDR_RECEIVED);
		mptcp_pm_nl_rm_addr_received(msk);
	}
	if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) {
		pm->status &= ~BIT(MPTCP_PM_ESTABLISHED);
		mptcp_pm_nl_fully_established(msk);
	}
	if (pm->status & BIT(MPTCP_PM_SUBFLOW_ESTABLISHED)) {
		pm->status &= ~BIT(MPTCP_PM_SUBFLOW_ESTABLISHED);
		mptcp_pm_nl_subflow_established(msk);
	}

	spin_unlock_bh(&msk->pm.lock);
}

static void __mptcp_close_subflow(struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow, *tmp;

	list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

		if (inet_sk_state_load(ssk) != TCP_CLOSE)
			continue;

		__mptcp_close_ssk((struct sock *)msk, ssk, subflow);
	}
}

static bool mptcp_check_close_timeout(const struct sock *sk)
{
	s32 delta = tcp_jiffies32 - inet_csk(sk)->icsk_mtup.probe_timestamp;
	struct mptcp_subflow_context *subflow;

	if (delta >= TCP_TIMEWAIT_LEN)
		return true;

	/* if all subflows are in closed status don't bother with additional
	 * timeout
	 */
	mptcp_for_each_subflow(mptcp_sk(sk), subflow) {
		if (inet_sk_state_load(mptcp_subflow_tcp_sock(subflow)) !=
		    TCP_CLOSE)
			return false;
	}
	return true;
}

static void mptcp_check_fastclose(struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow, *tmp;
	struct sock *sk = &msk->sk.icsk_inet.sk;

	if (likely(!READ_ONCE(msk->rcv_fastclose)))
		return;

	mptcp_token_destroy(msk);

	list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) {
		struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);

		lock_sock(tcp_sk);
		if (tcp_sk->sk_state != TCP_CLOSE) {
			tcp_send_active_reset(tcp_sk, GFP_ATOMIC);
			tcp_set_state(tcp_sk, TCP_CLOSE);
		}
		release_sock(tcp_sk);
	}

	inet_sk_state_store(sk, TCP_CLOSE);
	sk->sk_shutdown = SHUTDOWN_MASK;
	smp_mb__before_atomic(); /* SHUTDOWN must be visible first */
	set_bit(MPTCP_DATA_READY, &msk->flags);
	set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags);

	mptcp_close_wake_up(sk);
}
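
/* MPTCP-level workqueue handler: runs the deferred per-connection work that
 * cannot be done in atomic context - DATA_FIN accounting, fastclose handling,
 * closing dead subflows, path manager events and MPTCP-level retransmissions.
 */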
2256 static void mptcp_worker(struct work_struct
*work
)
2258 struct mptcp_sock
*msk
= container_of(work
, struct mptcp_sock
, work
);
2259 struct sock
*ssk
, *sk
= &msk
->sk
.icsk_inet
.sk
;
2260 struct mptcp_sendmsg_info info
= {};
2261 struct mptcp_data_frag
*dfrag
;
2266 state
= sk
->sk_state
;
2267 if (unlikely(state
== TCP_CLOSE
))
2270 mptcp_check_data_fin_ack(sk
);
2271 __mptcp_flush_join_list(msk
);
2273 mptcp_check_fastclose(msk
);
2275 if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW
, &msk
->flags
))
2276 __mptcp_close_subflow(msk
);
2281 if (test_and_clear_bit(MPTCP_WORK_EOF
, &msk
->flags
))
2282 mptcp_check_for_eof(msk
);
2284 __mptcp_check_send_data_fin(sk
);
2285 mptcp_check_data_fin(sk
);
2287 /* if the msk data is completely acked, or the socket timedout,
2288 * there is no point in keeping around an orphaned sk
2290 if (sock_flag(sk
, SOCK_DEAD
) &&
2291 (mptcp_check_close_timeout(sk
) ||
2292 (state
!= sk
->sk_state
&&
2293 ((1 << inet_sk_state_load(sk
)) & (TCPF_CLOSE
| TCPF_FIN_WAIT2
))))) {
2294 inet_sk_state_store(sk
, TCP_CLOSE
);
2295 __mptcp_destroy_sock(sk
);
2299 if (!test_and_clear_bit(MPTCP_WORK_RTX
, &msk
->flags
))
2302 dfrag
= mptcp_rtx_head(sk
);
2306 ssk
= mptcp_subflow_get_retrans(msk
);
2312 /* limit retransmission to the bytes already sent on some subflows */
2314 info
.limit
= dfrag
->already_sent
;
2315 while (info
.sent
< dfrag
->already_sent
) {
2316 if (!mptcp_alloc_tx_skb(sk
, ssk
))
2319 ret
= mptcp_sendmsg_frag(sk
, ssk
, dfrag
, &info
);
2323 MPTCP_INC_STATS(sock_net(sk
), MPTCP_MIB_RETRANSSEGS
);
2328 tcp_push(ssk
, 0, info
.mss_now
, tcp_sk(ssk
)->nonagle
,
2331 mptcp_set_timeout(sk
, ssk
);
2335 if (!mptcp_timer_pending(sk
))
2336 mptcp_reset_timer(sk
);
2343 static int __mptcp_init_sock(struct sock
*sk
)
2345 struct mptcp_sock
*msk
= mptcp_sk(sk
);
2347 spin_lock_init(&msk
->join_list_lock
);
2349 INIT_LIST_HEAD(&msk
->conn_list
);
2350 INIT_LIST_HEAD(&msk
->join_list
);
2351 INIT_LIST_HEAD(&msk
->rtx_queue
);
2352 INIT_WORK(&msk
->work
, mptcp_worker
);
2353 __skb_queue_head_init(&msk
->receive_queue
);
2354 __skb_queue_head_init(&msk
->skb_tx_cache
);
2355 msk
->out_of_order_queue
= RB_ROOT
;
2356 msk
->first_pending
= NULL
;
2357 msk
->wmem_reserved
= 0;
2358 msk
->rmem_released
= 0;
2359 msk
->tx_pending_data
= 0;
2360 msk
->size_goal_cache
= TCP_BASE_MSS
;
2362 msk
->ack_hint
= NULL
;
2364 inet_csk(sk
)->icsk_sync_mss
= mptcp_sync_mss
;
2366 mptcp_pm_data_init(msk
);
2368 /* re-use the csk retrans timer for MPTCP-level retrans */
2369 timer_setup(&msk
->sk
.icsk_retransmit_timer
, mptcp_retransmit_timer
, 0);
2370 timer_setup(&sk
->sk_timer
, mptcp_timeout_timer
, 0);
2374 static int mptcp_init_sock(struct sock
*sk
)
2376 struct net
*net
= sock_net(sk
);
2379 ret
= __mptcp_init_sock(sk
);
2383 if (!mptcp_is_enabled(net
))
2384 return -ENOPROTOOPT
;
2386 if (unlikely(!net
->mib
.mptcp_statistics
) && !mptcp_mib_alloc(net
))
2389 ret
= __mptcp_socket_create(mptcp_sk(sk
));
2393 sk_sockets_allocated_inc(sk
);
2394 sk
->sk_rcvbuf
= sock_net(sk
)->ipv4
.sysctl_tcp_rmem
[1];
2395 sk
->sk_sndbuf
= sock_net(sk
)->ipv4
.sysctl_tcp_wmem
[1];
2400 static void __mptcp_clear_xmit(struct sock
*sk
)
2402 struct mptcp_sock
*msk
= mptcp_sk(sk
);
2403 struct mptcp_data_frag
*dtmp
, *dfrag
;
2404 struct sk_buff
*skb
;
2406 WRITE_ONCE(msk
->first_pending
, NULL
);
2407 list_for_each_entry_safe(dfrag
, dtmp
, &msk
->rtx_queue
, list
)
2408 dfrag_clear(sk
, dfrag
);
2409 while ((skb
= __skb_dequeue(&msk
->skb_tx_cache
)) != NULL
) {
2410 sk
->sk_forward_alloc
+= skb
->truesize
;
2415 static void mptcp_cancel_work(struct sock
*sk
)
2417 struct mptcp_sock
*msk
= mptcp_sk(sk
);
2419 if (cancel_work_sync(&msk
->work
))
void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how)
{
	lock_sock(ssk);

	switch (ssk->sk_state) {
	case TCP_LISTEN:
		if (!(how & RCV_SHUTDOWN))
			break;
		fallthrough;
	case TCP_SYN_SENT:
		tcp_disconnect(ssk, O_NONBLOCK);
		break;
	default:
		if (__mptcp_check_fallback(mptcp_sk(sk))) {
			pr_debug("Fallback");
			ssk->sk_shutdown |= how;
			tcp_shutdown(ssk, how);
		} else {
			pr_debug("Sending DATA_FIN on subflow %p", ssk);
			mptcp_set_timeout(sk, ssk);
			tcp_send_ack(ssk);
		}
		break;
	}

	release_sock(ssk);
}

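/* MPTCP-level copy of the TCP state transition table; the TCP_ACTION_FIN bit
 * tells mptcp_close_state() callers whether a DATA_FIN must be sent.
 */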
static const unsigned char new_state[16] = {
	/* current state:     new state:      action:	*/
	[0 /* (Invalid) */] = TCP_CLOSE,
	[TCP_ESTABLISHED]   = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
	[TCP_SYN_SENT]      = TCP_CLOSE,
	[TCP_SYN_RECV]      = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
	[TCP_FIN_WAIT1]     = TCP_FIN_WAIT1,
	[TCP_FIN_WAIT2]     = TCP_FIN_WAIT2,
	[TCP_TIME_WAIT]     = TCP_CLOSE,	/* should not happen ! */
	[TCP_CLOSE]         = TCP_CLOSE,
	[TCP_CLOSE_WAIT]    = TCP_LAST_ACK  | TCP_ACTION_FIN,
	[TCP_LAST_ACK]      = TCP_LAST_ACK,
	[TCP_LISTEN]        = TCP_CLOSE,
	[TCP_CLOSING]       = TCP_CLOSING,
	[TCP_NEW_SYN_RECV]  = TCP_CLOSE,	/* should not happen ! */
};

static int mptcp_close_state(struct sock *sk)
{
	int next = (int)new_state[sk->sk_state];
	int ns = next & TCP_STATE_MASK;

	inet_sk_state_store(sk, ns);

	return next & TCP_ACTION_FIN;
}

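/* Try to emit the MPTCP-level DATA_FIN: nothing to do while data is still
 * queued; fallback sockets only progress the state machine, regular msks
 * shut down the send direction of every subflow.
 */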
static void __mptcp_check_send_data_fin(struct sock *sk)
{
	struct mptcp_subflow_context *subflow;
	struct mptcp_sock *msk = mptcp_sk(sk);

	pr_debug("msk=%p snd_data_fin_enable=%d pending=%d snd_nxt=%llu write_seq=%llu",
		 msk, msk->snd_data_fin_enable, !!mptcp_send_head(sk),
		 msk->snd_nxt, msk->write_seq);

	/* we still need to enqueue subflows or not really shutting down,
	 * skip this
	 */
	if (!msk->snd_data_fin_enable || msk->snd_nxt + 1 != msk->write_seq ||
	    mptcp_send_head(sk))
		return;

	WRITE_ONCE(msk->snd_nxt, msk->write_seq);

	/* fallback socket will not get data_fin/ack, can move to the next
	 * state now
	 */
	if (__mptcp_check_fallback(msk)) {
		if ((1 << sk->sk_state) & (TCPF_CLOSING | TCPF_LAST_ACK)) {
			inet_sk_state_store(sk, TCP_CLOSE);
			mptcp_close_wake_up(sk);
		} else if (sk->sk_state == TCP_FIN_WAIT1) {
			inet_sk_state_store(sk, TCP_FIN_WAIT2);
		}
		return;
	}

	__mptcp_flush_join_list(msk);
	mptcp_for_each_subflow(msk, subflow) {
		struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);

		mptcp_subflow_shutdown(sk, tcp_sk, SEND_SHUTDOWN);
	}
}

static void __mptcp_wr_shutdown(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	pr_debug("msk=%p snd_data_fin_enable=%d shutdown=%x state=%d pending=%d",
		 msk, msk->snd_data_fin_enable, sk->sk_shutdown, sk->sk_state,
		 !!mptcp_send_head(sk));

	/* will be ignored by fallback sockets */
	WRITE_ONCE(msk->write_seq, msk->write_seq + 1);
	WRITE_ONCE(msk->snd_data_fin_enable, 1);

	__mptcp_check_send_data_fin(sk);
}

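/* Final msk teardown: stop the MPTCP-level timers, close every subflow still
 * attached to the connection and release the protocol resources.
 */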
static void __mptcp_destroy_sock(struct sock *sk)
{
	struct mptcp_subflow_context *subflow, *tmp;
	struct mptcp_sock *msk = mptcp_sk(sk);
	LIST_HEAD(conn_list);

	pr_debug("msk=%p", msk);

	/* be sure to always acquire the join list lock, to sync vs
	 * mptcp_finish_join().
	 */
	spin_lock_bh(&msk->join_list_lock);
	list_splice_tail_init(&msk->join_list, &msk->conn_list);
	spin_unlock_bh(&msk->join_list_lock);
	list_splice_init(&msk->conn_list, &conn_list);

	sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer);
	sk_stop_timer(sk, &sk->sk_timer);
	msk->pm.status = 0;

	list_for_each_entry_safe(subflow, tmp, &conn_list, node) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

		__mptcp_close_ssk(sk, ssk, subflow);
	}

	sk->sk_prot->destroy(sk);

	WARN_ON_ONCE(msk->wmem_reserved);
	WARN_ON_ONCE(msk->rmem_released);
	sk_stream_kill_queues(sk);
	xfrm_sk_free_policy(sk);
	sk_refcnt_debug_release(sk);

	sock_put(sk);
}

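/* ->close hook: move the msk towards TCP_CLOSE, orphan all the subflows and
 * either destroy the socket immediately or arm sk_timer so that the orphaned
 * socket is reaped after the close timeout.
 */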
static void mptcp_close(struct sock *sk, long timeout)
{
	struct mptcp_subflow_context *subflow;
	bool do_cancel_work = false;

	lock_sock(sk);
	sk->sk_shutdown = SHUTDOWN_MASK;

	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) {
		inet_sk_state_store(sk, TCP_CLOSE);
		goto cleanup;
	}

	if (mptcp_close_state(sk))
		__mptcp_wr_shutdown(sk);

	sk_stream_wait_close(sk, timeout);

cleanup:
	/* orphan all the subflows */
	inet_csk(sk)->icsk_mtup.probe_timestamp = tcp_jiffies32;
	list_for_each_entry(subflow, &mptcp_sk(sk)->conn_list, node) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
		bool slow, dispose_socket;
		struct socket *sock;

		slow = lock_sock_fast(ssk);
		sock = ssk->sk_socket;
		dispose_socket = sock && sock != sk->sk_socket;
		sock_orphan(ssk);
		unlock_sock_fast(ssk, slow);

		/* for the outgoing subflows we additionally need to free
		 * the associated socket
		 */
		if (dispose_socket)
			iput(SOCK_INODE(sock));
	}
	sock_orphan(sk);

	sock_hold(sk);
	pr_debug("msk=%p state=%d", sk, sk->sk_state);
	if (sk->sk_state == TCP_CLOSE) {
		__mptcp_destroy_sock(sk);
		do_cancel_work = true;
	} else {
		sk_reset_timer(sk, &sk->sk_timer, jiffies + TCP_TIMEWAIT_LEN);
	}
	release_sock(sk);
	if (do_cancel_work)
		mptcp_cancel_work(sk);
	sock_put(sk);
}

static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk)
{
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	const struct ipv6_pinfo *ssk6 = inet6_sk(ssk);
	struct ipv6_pinfo *msk6 = inet6_sk(msk);

	msk->sk_v6_daddr = ssk->sk_v6_daddr;
	msk->sk_v6_rcv_saddr = ssk->sk_v6_rcv_saddr;

	if (msk6 && ssk6) {
		msk6->saddr = ssk6->saddr;
		msk6->flow_label = ssk6->flow_label;
	}
#endif

	inet_sk(msk)->inet_num = inet_sk(ssk)->inet_num;
	inet_sk(msk)->inet_dport = inet_sk(ssk)->inet_dport;
	inet_sk(msk)->inet_sport = inet_sk(ssk)->inet_sport;
	inet_sk(msk)->inet_daddr = inet_sk(ssk)->inet_daddr;
	inet_sk(msk)->inet_saddr = inet_sk(ssk)->inet_saddr;
	inet_sk(msk)->inet_rcv_saddr = inet_sk(ssk)->inet_rcv_saddr;
}

static int mptcp_disconnect(struct sock *sk, int flags)
{
	/* Should never be called.
	 * inet_stream_connect() calls ->disconnect, but that
	 * refers to the subflow socket, not the mptcp one.
	 */
	WARN_ON_ONCE(1);
	return 0;
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk)
{
	unsigned int offset = sizeof(struct mptcp6_sock) - sizeof(struct ipv6_pinfo);

	return (struct ipv6_pinfo *)(((u8 *)sk) + offset);
}
#endif

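/* Clone the listener msk for a passively established connection: keys, token
 * and initial sequence numbers come from the request socket and the received
 * MP_CAPABLE option.
 */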
struct sock *mptcp_sk_clone(const struct sock *sk,
			    const struct mptcp_options_received *mp_opt,
			    struct request_sock *req)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct sock *nsk = sk_clone_lock(sk, GFP_ATOMIC);
	struct mptcp_sock *msk;
	u64 ack_seq;

	if (!nsk)
		return NULL;

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	if (nsk->sk_family == AF_INET6)
		inet_sk(nsk)->pinet6 = mptcp_inet6_sk(nsk);
#endif

	__mptcp_init_sock(nsk);

	msk = mptcp_sk(nsk);
	msk->local_key = subflow_req->local_key;
	msk->token = subflow_req->token;
	msk->subflow = NULL;
	WRITE_ONCE(msk->fully_established, false);

	msk->write_seq = subflow_req->idsn + 1;
	msk->snd_nxt = msk->write_seq;
	msk->snd_una = msk->write_seq;
	msk->wnd_end = msk->snd_nxt + req->rsk_rcv_wnd;

	if (mp_opt->mp_capable) {
		msk->can_ack = true;
		msk->remote_key = mp_opt->sndr_key;
		mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq);
		ack_seq++;
		WRITE_ONCE(msk->ack_seq, ack_seq);
		WRITE_ONCE(msk->rcv_wnd_sent, ack_seq);
	}

	sock_reset_flag(nsk, SOCK_RCU_FREE);
	/* will be fully established after successful MPC subflow creation */
	inet_sk_state_store(nsk, TCP_SYN_RECV);

	security_inet_csk_clone(nsk, req);
	bh_unlock_sock(nsk);

	/* keep a single reference */
	__sock_put(nsk);
	return nsk;
}

void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk)
{
	const struct tcp_sock *tp = tcp_sk(ssk);

	msk->rcvq_space.copied = 0;
	msk->rcvq_space.rtt_us = 0;

	msk->rcvq_space.time = tp->tcp_mstamp;

	/* initial rcv_space offering made to peer */
	msk->rcvq_space.space = min_t(u32, tp->rcv_wnd,
				      TCP_INIT_CWND * tp->advmss);
	if (msk->rcvq_space.space == 0)
		msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT;

	WRITE_ONCE(msk->wnd_end, msk->snd_nxt + tcp_sk(ssk)->snd_wnd);
}

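/* ->accept hook: pull an established connection from the first (listener)
 * subflow and, if the handshake was MP_CAPABLE, return the owning MPTCP
 * socket instead of the plain TCP subflow.
 */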
static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
				 bool kern)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct socket *listener;
	struct sock *newsk;

	listener = __mptcp_nmpc_socket(msk);
	if (WARN_ON_ONCE(!listener)) {
		*err = -EINVAL;
		return NULL;
	}

	pr_debug("msk=%p, listener=%p", msk, mptcp_subflow_ctx(listener->sk));
	newsk = inet_csk_accept(listener->sk, flags, err, kern);
	if (!newsk)
		return NULL;

	pr_debug("msk=%p, subflow is mptcp=%d", msk, sk_is_mptcp(newsk));
	if (sk_is_mptcp(newsk)) {
		struct mptcp_subflow_context *subflow;
		struct sock *new_mptcp_sock;

		subflow = mptcp_subflow_ctx(newsk);
		new_mptcp_sock = subflow->conn;

		/* is_mptcp should be false if subflow->conn is missing, see
		 * subflow_syn_recv_sock()
		 */
		if (WARN_ON_ONCE(!new_mptcp_sock)) {
			tcp_sk(newsk)->is_mptcp = 0;
			return newsk;
		}

		/* acquire the 2nd reference for the owning socket */
		sock_hold(new_mptcp_sock);
		newsk = new_mptcp_sock;
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEACK);
	} else {
		MPTCP_INC_STATS(sock_net(sk),
				MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK);
	}

	return newsk;
}

void mptcp_destroy_common(struct mptcp_sock *msk)
{
	struct sock *sk = (struct sock *)msk;

	__mptcp_clear_xmit(sk);

	/* move to sk_receive_queue, sk_stream_kill_queues will purge it */
	skb_queue_splice_tail_init(&msk->receive_queue, &sk->sk_receive_queue);

	skb_rbtree_purge(&msk->out_of_order_queue);
	mptcp_token_destroy(msk);
	mptcp_pm_free_anno_list(msk);
}

static void mptcp_destroy(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	mptcp_destroy_common(msk);
	sk_sockets_allocated_dec(sk);
}

static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname,
				       sockptr_t optval, unsigned int optlen)
{
	struct sock *sk = (struct sock *)msk;
	struct socket *ssock;
	int ret;

	switch (optname) {
	case SO_REUSEPORT:
	case SO_REUSEADDR:
		lock_sock(sk);
		ssock = __mptcp_nmpc_socket(msk);
		if (!ssock) {
			release_sock(sk);
			return -EINVAL;
		}

		ret = sock_setsockopt(ssock, SOL_SOCKET, optname, optval, optlen);
		if (ret == 0) {
			if (optname == SO_REUSEPORT)
				sk->sk_reuseport = ssock->sk->sk_reuseport;
			else if (optname == SO_REUSEADDR)
				sk->sk_reuse = ssock->sk->sk_reuse;
		}
		release_sock(sk);
		return ret;
	}

	return sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, optval, optlen);
}

static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname,
			       sockptr_t optval, unsigned int optlen)
{
	struct sock *sk = (struct sock *)msk;
	int ret = -EOPNOTSUPP;
	struct socket *ssock;

	switch (optname) {
	case IPV6_V6ONLY:
		lock_sock(sk);
		ssock = __mptcp_nmpc_socket(msk);
		if (!ssock) {
			release_sock(sk);
			return -EINVAL;
		}

		ret = tcp_setsockopt(ssock->sk, SOL_IPV6, optname, optval, optlen);
		if (ret == 0)
			sk->sk_ipv6only = ssock->sk->sk_ipv6only;

		release_sock(sk);
		break;
	}

	return ret;
}

static int mptcp_setsockopt(struct sock *sk, int level, int optname,
			    sockptr_t optval, unsigned int optlen)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct sock *ssk;

	pr_debug("msk=%p", msk);

	if (level == SOL_SOCKET)
		return mptcp_setsockopt_sol_socket(msk, optname, optval, optlen);

	/* @@ the meaning of setsockopt() when the socket is connected and
	 * there are multiple subflows is not yet defined. It is up to the
	 * MPTCP-level socket to configure the subflows until the subflow
	 * is in TCP fallback, when TCP socket options are passed through
	 * to the one remaining subflow.
	 */
	lock_sock(sk);
	ssk = __mptcp_tcp_fallback(msk);
	release_sock(sk);
	if (ssk)
		return tcp_setsockopt(ssk, level, optname, optval, optlen);

	if (level == SOL_IPV6)
		return mptcp_setsockopt_v6(msk, optname, optval, optlen);

	return -EOPNOTSUPP;
}

static int mptcp_getsockopt(struct sock *sk, int level, int optname,
			    char __user *optval, int __user *option)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct sock *ssk;

	pr_debug("msk=%p", msk);

	/* @@ the meaning of getsockopt() when the socket is connected and
	 * there are multiple subflows is not yet defined. It is up to the
	 * MPTCP-level socket to configure the subflows until the subflow
	 * is in TCP fallback, when socket options are passed through
	 * to the one remaining subflow.
	 */
	lock_sock(sk);
	ssk = __mptcp_tcp_fallback(msk);
	release_sock(sk);
	if (ssk)
		return tcp_getsockopt(ssk, level, optname, optval, option);

	return -EOPNOTSUPP;
}

void __mptcp_data_acked(struct sock *sk)
{
	if (!sock_owned_by_user(sk))
		__mptcp_clean_una(sk);
	else
		set_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->flags);

	if (mptcp_pending_data_fin_ack(sk))
		mptcp_schedule_work(sk);
}

void __mptcp_check_push(struct sock *sk, struct sock *ssk)
{
	if (!mptcp_send_head(sk))
		return;

	if (!sock_owned_by_user(sk))
		__mptcp_subflow_push_pending(sk, ssk);
	else
		set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags);
}

#define MPTCP_DEFERRED_ALL (TCPF_WRITE_TIMER_DEFERRED)

/* processes deferred events and flush wmem */
static void mptcp_release_cb(struct sock *sk)
{
	unsigned long flags, nflags;

	/* push_pending may touch wmem_reserved, do it before the later
	 * cleanup
	 */
	if (test_and_clear_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->flags))
		__mptcp_clean_una(sk);
	if (test_and_clear_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags)) {
		/* mptcp_push_pending() acquires the subflow socket lock
		 *
		 * 1) can't be invoked in atomic scope
		 * 2) must avoid ABBA deadlock with msk socket spinlock: the RX
		 *    datapath acquires the msk socket spinlock while holding
		 *    the subflow socket lock
		 */

		spin_unlock_bh(&sk->sk_lock.slock);
		mptcp_push_pending(sk, 0);
		spin_lock_bh(&sk->sk_lock.slock);
	}

	/* clear any wmem reservation and errors */
	__mptcp_update_wmem(sk);
	__mptcp_update_rmem(sk);

	do {
		flags = sk->sk_tsq_flags;
		if (!(flags & MPTCP_DEFERRED_ALL))
			return;
		nflags = flags & ~MPTCP_DEFERRED_ALL;
	} while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);

	sock_release_ownership(sk);

	if (flags & TCPF_WRITE_TIMER_DEFERRED) {
		mptcp_retransmit_handler(sk);
		__sock_put(sk);
	}
}

static int mptcp_hash(struct sock *sk)
{
	/* should never be called,
	 * we hash the TCP subflows not the master socket
	 */
	WARN_ON_ONCE(1);
	return 0;
}

static void mptcp_unhash(struct sock *sk)
{
	/* called from sk_common_release(), but nothing to do here */
}

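/* Port selection is delegated to the initial subflow socket, which is the
 * one actually hashed into the TCP bind tables.
 */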
static int mptcp_get_port(struct sock *sk, unsigned short snum)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct socket *ssock;

	ssock = __mptcp_nmpc_socket(msk);
	pr_debug("msk=%p, subflow=%p", msk, ssock);
	if (WARN_ON_ONCE(!ssock))
		return -EINVAL;

	return inet_csk_get_port(ssock->sk, snum);
}

void mptcp_finish_connect(struct sock *ssk)
{
	struct mptcp_subflow_context *subflow;
	struct mptcp_sock *msk;
	struct sock *sk;
	u64 ack_seq;

	subflow = mptcp_subflow_ctx(ssk);
	sk = subflow->conn;
	msk = mptcp_sk(sk);

	pr_debug("msk=%p, token=%u", sk, subflow->token);

	mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq);
	ack_seq++;
	subflow->map_seq = ack_seq;
	subflow->map_subflow_seq = 1;

	/* the socket is not connected yet, no msk/subflow ops can access/race
	 * accessing the field below
	 */
	WRITE_ONCE(msk->remote_key, subflow->remote_key);
	WRITE_ONCE(msk->local_key, subflow->local_key);
	WRITE_ONCE(msk->write_seq, subflow->idsn + 1);
	WRITE_ONCE(msk->snd_nxt, msk->write_seq);
	WRITE_ONCE(msk->ack_seq, ack_seq);
	WRITE_ONCE(msk->rcv_wnd_sent, ack_seq);
	WRITE_ONCE(msk->can_ack, 1);
	WRITE_ONCE(msk->snd_una, msk->write_seq);

	mptcp_pm_new_connection(msk, 0);

	mptcp_rcv_space_init(msk, ssk);
}

static void mptcp_sock_graft(struct sock *sk, struct socket *parent)
{
	write_lock_bh(&sk->sk_callback_lock);
	rcu_assign_pointer(sk->sk_wq, &parent->wq);
	sk_set_socket(sk, parent);
	sk->sk_uid = SOCK_INODE(parent)->i_uid;
	write_unlock_bh(&sk->sk_callback_lock);
}

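/* Called when an MP_JOIN subflow completes the handshake: add it to the join
 * list and graft it to the msk socket, unless the msk is already closing.
 */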
bool mptcp_finish_join(struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
	struct sock *parent = (void *)msk;
	struct socket *parent_sock;
	bool ret;

	pr_debug("msk=%p, subflow=%p", msk, subflow);

	/* mptcp socket already closing? */
	if (!mptcp_is_fully_established(parent))
		return false;

	if (!msk->pm.server_side)
		return true;

	if (!mptcp_pm_allow_new_subflow(msk))
		return false;

	/* active connections are already on conn_list, and we can't acquire
	 * msk lock here.
	 * use the join list lock as synchronization point and double-check
	 * msk status to avoid racing with __mptcp_destroy_sock()
	 */
	spin_lock_bh(&msk->join_list_lock);
	ret = inet_sk_state_load(parent) == TCP_ESTABLISHED;
	if (ret && !WARN_ON_ONCE(!list_empty(&subflow->node))) {
		list_add_tail(&subflow->node, &msk->join_list);
		sock_hold(ssk);
	}
	spin_unlock_bh(&msk->join_list_lock);
	if (!ret)
		return false;

	/* attach to msk socket only after we are sure it will deal with us
	 * at close time
	 */
	parent_sock = READ_ONCE(parent->sk_socket);
	if (parent_sock && !ssk->sk_socket)
		mptcp_sock_graft(ssk, parent_sock);
	subflow->map_seq = READ_ONCE(msk->ack_seq);
	return true;
}

static struct proto mptcp_prot = {
	.name		= "MPTCP",
	.owner		= THIS_MODULE,
	.init		= mptcp_init_sock,
	.disconnect	= mptcp_disconnect,
	.close		= mptcp_close,
	.accept		= mptcp_accept,
	.setsockopt	= mptcp_setsockopt,
	.getsockopt	= mptcp_getsockopt,
	.shutdown	= tcp_shutdown,
	.destroy	= mptcp_destroy,
	.sendmsg	= mptcp_sendmsg,
	.recvmsg	= mptcp_recvmsg,
	.release_cb	= mptcp_release_cb,
	.hash		= mptcp_hash,
	.unhash		= mptcp_unhash,
	.get_port	= mptcp_get_port,
	.sockets_allocated	= &mptcp_sockets_allocated,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
	.sysctl_mem	= sysctl_tcp_mem,
	.obj_size	= sizeof(struct mptcp_sock),
	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
	.no_autobind	= true,
};

static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct mptcp_sock *msk = mptcp_sk(sock->sk);
	struct socket *ssock;
	int err;

	lock_sock(sock->sk);
	ssock = __mptcp_nmpc_socket(msk);
	if (!ssock) {
		err = -EINVAL;
		goto unlock;
	}

	err = ssock->ops->bind(ssock, uaddr, addr_len);
	if (!err)
		mptcp_copy_inaddrs(sock->sk, ssock->sk);

unlock:
	release_sock(sock->sk);
	return err;
}

static void mptcp_subflow_early_fallback(struct mptcp_sock *msk,
					 struct mptcp_subflow_context *subflow)
{
	subflow->request_mptcp = 0;
	__mptcp_do_fallback(msk);
}

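/* connect() is forwarded to the initial subflow socket; fall back to plain
 * TCP early when MPTCP cannot be used (TCP_MD5SIG in place or token
 * allocation failure).
 */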
static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr,
				int addr_len, int flags)
{
	struct mptcp_sock *msk = mptcp_sk(sock->sk);
	struct mptcp_subflow_context *subflow;
	struct socket *ssock;
	int err;

	lock_sock(sock->sk);
	if (sock->state != SS_UNCONNECTED && msk->subflow) {
		/* pending connection or invalid state, let existing subflow
		 * cope with that
		 */
		ssock = msk->subflow;
		goto do_connect;
	}

	ssock = __mptcp_nmpc_socket(msk);
	if (!ssock) {
		err = -EINVAL;
		goto unlock;
	}

	mptcp_token_destroy(msk);
	inet_sk_state_store(sock->sk, TCP_SYN_SENT);
	subflow = mptcp_subflow_ctx(ssock->sk);
#ifdef CONFIG_TCP_MD5SIG
	/* no MPTCP if MD5SIG is enabled on this socket or we may run out of
	 * TCP option space.
	 */
	if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info))
		mptcp_subflow_early_fallback(msk, subflow);
#endif
	if (subflow->request_mptcp && mptcp_token_new_connect(ssock->sk))
		mptcp_subflow_early_fallback(msk, subflow);

do_connect:
	err = ssock->ops->connect(ssock, uaddr, addr_len, flags);
	sock->state = ssock->state;

	/* on successful connect, the msk state will be moved to established by
	 * subflow_finish_connect()
	 */
	if (!err || err == -EINPROGRESS)
		mptcp_copy_inaddrs(sock->sk, ssock->sk);
	else
		inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk));

unlock:
	release_sock(sock->sk);
	return err;
}

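/* listen() is served by the initial subflow socket; the msk state mirrors
 * the subflow state.
 */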
static int mptcp_listen(struct socket *sock, int backlog)
{
	struct mptcp_sock *msk = mptcp_sk(sock->sk);
	struct socket *ssock;
	int err;

	pr_debug("msk=%p", msk);

	lock_sock(sock->sk);
	ssock = __mptcp_nmpc_socket(msk);
	if (!ssock) {
		err = -EINVAL;
		goto unlock;
	}

	mptcp_token_destroy(msk);
	inet_sk_state_store(sock->sk, TCP_LISTEN);
	sock_set_flag(sock->sk, SOCK_RCU_FREE);

	err = ssock->ops->listen(ssock, backlog);
	inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk));
	if (!err)
		mptcp_copy_inaddrs(sock->sk, ssock->sk);

unlock:
	release_sock(sock->sk);
	return err;
}

static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
			       int flags, bool kern)
{
	struct mptcp_sock *msk = mptcp_sk(sock->sk);
	struct socket *ssock;
	int err;

	pr_debug("msk=%p", msk);

	lock_sock(sock->sk);
	if (sock->sk->sk_state != TCP_LISTEN)
		goto unlock_fail;

	ssock = __mptcp_nmpc_socket(msk);
	if (!ssock)
		goto unlock_fail;

	clear_bit(MPTCP_DATA_READY, &msk->flags);
	sock_hold(ssock->sk);
	release_sock(sock->sk);

	err = ssock->ops->accept(sock, newsock, flags, kern);
	if (err == 0 && !mptcp_is_tcpsk(newsock->sk)) {
		struct mptcp_sock *msk = mptcp_sk(newsock->sk);
		struct mptcp_subflow_context *subflow;
		struct sock *newsk = newsock->sk;
		bool slowpath;

		slowpath = lock_sock_fast(newsk);

		/* PM/worker can now acquire the first subflow socket
		 * lock without racing with listener queue cleanup,
		 * we can notify it, if needed.
		 */
		subflow = mptcp_subflow_ctx(msk->first);
		list_add(&subflow->node, &msk->conn_list);
		sock_hold(msk->first);
		if (mptcp_is_fully_established(newsk))
			mptcp_pm_fully_established(msk);

		mptcp_copy_inaddrs(newsk, msk->first);
		mptcp_rcv_space_init(msk, msk->first);

		/* set ssk->sk_socket of accept()ed flows to mptcp socket.
		 * This is needed so NOSPACE flag can be set from tcp stack.
		 */
		__mptcp_flush_join_list(msk);
		mptcp_for_each_subflow(msk, subflow) {
			struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

			if (!ssk->sk_socket)
				mptcp_sock_graft(ssk, newsock);
		}
		unlock_sock_fast(newsk, slowpath);
	}

	if (inet_csk_listen_poll(ssock->sk))
		set_bit(MPTCP_DATA_READY, &msk->flags);
	sock_put(ssock->sk);
	return err;

unlock_fail:
	release_sock(sock->sk);
	return -EINVAL;
}

static __poll_t mptcp_check_readable(struct mptcp_sock *msk)
{
	return test_bit(MPTCP_DATA_READY, &msk->flags) ? EPOLLIN | EPOLLRDNORM :
	       0;
}

static __poll_t mptcp_check_writeable(struct mptcp_sock *msk)
{
	struct sock *sk = (struct sock *)msk;

	if (unlikely(sk->sk_shutdown & SEND_SHUTDOWN))
		return EPOLLOUT | EPOLLWRNORM;

	if (sk_stream_is_writeable(sk))
		return EPOLLOUT | EPOLLWRNORM;

	set_bit(MPTCP_NOSPACE, &msk->flags);
	smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */
	if (sk_stream_is_writeable(sk))
		return EPOLLOUT | EPOLLWRNORM;

	return 0;
}

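/* poll() reports msk-level state: readability via the MPTCP_DATA_READY flag,
 * writability via the msk send buffer and the MPTCP_NOSPACE flag.
 */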
static __poll_t mptcp_poll(struct file *file, struct socket *sock,
			   struct poll_table_struct *wait)
{
	struct sock *sk = sock->sk;
	struct mptcp_sock *msk;
	__poll_t mask = 0;
	int state;

	msk = mptcp_sk(sk);
	sock_poll_wait(file, sock, wait);

	state = inet_sk_state_load(sk);
	pr_debug("msk=%p state=%d flags=%lx", msk, state, msk->flags);
	if (state == TCP_LISTEN)
		return mptcp_check_readable(msk);

	if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) {
		mask |= mptcp_check_readable(msk);
		mask |= mptcp_check_writeable(msk);
	}
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;

	return mask;
}

static int mptcp_shutdown(struct socket *sock, int how)
{
	struct mptcp_sock *msk = mptcp_sk(sock->sk);
	struct sock *sk = sock->sk;
	int ret = 0;

	pr_debug("sk=%p, how=%d", msk, how);

	lock_sock(sk);

	how++;
	if ((how & ~SHUTDOWN_MASK) || !how) {
		ret = -EINVAL;
		goto out_unlock;
	}

	if (sock->state == SS_CONNECTING) {
		if ((1 << sk->sk_state) &
		    (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE))
			sock->state = SS_DISCONNECTING;
		else
			sock->state = SS_CONNECTED;
	}

	sk->sk_shutdown |= how;
	if ((how & SEND_SHUTDOWN) && mptcp_close_state(sk))
		__mptcp_wr_shutdown(sk);

	/* Wake up anyone sleeping in poll. */
	sk->sk_state_change(sk);

out_unlock:
	release_sock(sk);

	return ret;
}

static const struct proto_ops mptcp_stream_ops = {
	.family		   = PF_INET,
	.owner		   = THIS_MODULE,
	.release	   = inet_release,
	.bind		   = mptcp_bind,
	.connect	   = mptcp_stream_connect,
	.socketpair	   = sock_no_socketpair,
	.accept		   = mptcp_stream_accept,
	.getname	   = inet_getname,
	.poll		   = mptcp_poll,
	.ioctl		   = inet_ioctl,
	.gettstamp	   = sock_gettstamp,
	.listen		   = mptcp_listen,
	.shutdown	   = mptcp_shutdown,
	.setsockopt	   = sock_common_setsockopt,
	.getsockopt	   = sock_common_getsockopt,
	.sendmsg	   = inet_sendmsg,
	.recvmsg	   = inet_recvmsg,
	.mmap		   = sock_no_mmap,
	.sendpage	   = inet_sendpage,
};

static struct inet_protosw mptcp_protosw = {
	.type		= SOCK_STREAM,
	.protocol	= IPPROTO_MPTCP,
	.prot		= &mptcp_prot,
	.ops		= &mptcp_stream_ops,
	.flags		= INET_PROTOSW_ICSK,
};

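/* Hook MPTCP into the IPv4 stack: share the TCP hash tables, register the
 * protocol and add the IPPROTO_MPTCP protosw entry.
 */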
void __init mptcp_proto_init(void)
{
	mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo;

	if (percpu_counter_init(&mptcp_sockets_allocated, 0, GFP_KERNEL))
		panic("Failed to allocate MPTCP pcpu counter\n");

	mptcp_subflow_init();
	mptcp_pm_init();
	mptcp_token_init();

	if (proto_register(&mptcp_prot, 1) != 0)
		panic("Failed to register MPTCP proto.\n");

	inet_register_protosw(&mptcp_protosw);

	BUILD_BUG_ON(sizeof(struct mptcp_skb_cb) > sizeof_field(struct sk_buff, cb));
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static const struct proto_ops mptcp_v6_stream_ops = {
	.family		   = PF_INET6,
	.owner		   = THIS_MODULE,
	.release	   = inet6_release,
	.bind		   = mptcp_bind,
	.connect	   = mptcp_stream_connect,
	.socketpair	   = sock_no_socketpair,
	.accept		   = mptcp_stream_accept,
	.getname	   = inet6_getname,
	.poll		   = mptcp_poll,
	.ioctl		   = inet6_ioctl,
	.gettstamp	   = sock_gettstamp,
	.listen		   = mptcp_listen,
	.shutdown	   = mptcp_shutdown,
	.setsockopt	   = sock_common_setsockopt,
	.getsockopt	   = sock_common_getsockopt,
	.sendmsg	   = inet6_sendmsg,
	.recvmsg	   = inet6_recvmsg,
	.mmap		   = sock_no_mmap,
	.sendpage	   = inet_sendpage,
#ifdef CONFIG_COMPAT
	.compat_ioctl	   = inet6_compat_ioctl,
#endif
};

static struct proto mptcp_v6_prot;

static void mptcp_v6_destroy(struct sock *sk)
{
	mptcp_destroy(sk);
	inet6_destroy_sock(sk);
}

static struct inet_protosw mptcp_v6_protosw = {
	.type		= SOCK_STREAM,
	.protocol	= IPPROTO_MPTCP,
	.prot		= &mptcp_v6_prot,
	.ops		= &mptcp_v6_stream_ops,
	.flags		= INET_PROTOSW_ICSK,
};

int __init mptcp_proto_v6_init(void)
{
	int err;

	mptcp_v6_prot = mptcp_prot;
	strcpy(mptcp_v6_prot.name, "MPTCPv6");
	mptcp_v6_prot.slab = NULL;
	mptcp_v6_prot.destroy = mptcp_v6_destroy;
	mptcp_v6_prot.obj_size = sizeof(struct mptcp6_sock);

	err = proto_register(&mptcp_v6_prot, 1);
	if (err)
		return err;

	err = inet6_register_protosw(&mptcp_v6_protosw);
	if (err)
		proto_unregister(&mptcp_v6_prot);

	return err;
}
#endif