// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
 *
 * Copyright (c) 2017 - 2019, Intel Corporation.
 */

#define pr_fmt(fmt) "MPTCP: " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/sched/signal.h>
#include <linux/atomic.h>
#include <net/sock.h>
#include <net/inet_common.h>
#include <net/inet_hashtables.h>
#include <net/protocol.h>
#include <net/tcp.h>
#include <net/tcp_states.h>
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
#include <net/transp_v6.h>
#endif
#include <net/mptcp.h>
#include <net/xfrm.h>
#include "protocol.h"
#include "mib.h"
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
struct mptcp6_sock {
	struct mptcp_sock msk;
	struct ipv6_pinfo np;
};
#endif
struct mptcp_skb_cb {
	u64 map_seq;
	u64 end_seq;
	u32 offset;
};

#define MPTCP_SKB_CB(__skb)	((struct mptcp_skb_cb *)&((__skb)->cb[0]))

static struct percpu_counter mptcp_sockets_allocated;

static void __mptcp_destroy_sock(struct sock *sk);
static void __mptcp_check_send_data_fin(struct sock *sk);
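/* Each skb queued at the MPTCP level carries its 64-bit MPTCP sequence
 * metadata in the skb control block, e.g.:
 *
 *	MPTCP_SKB_CB(skb)->map_seq = mptcp_subflow_get_mapped_dsn(subflow);
 *	MPTCP_SKB_CB(skb)->end_seq = MPTCP_SKB_CB(skb)->map_seq + copy_len;
 *
 * The receive path below relies on map_seq/end_seq/offset staying consistent
 * while skbs move between the subflow and the msk queues.
 */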
/* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not
 * completed yet or has failed, return the subflow socket.
 * Otherwise return NULL.
 */
static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk)
{
	if (!msk->subflow || READ_ONCE(msk->can_ack))
		return NULL;

	return msk->subflow;
}
/* Returns end sequence number of the receiver's advertised window */
static u64 mptcp_wnd_end(const struct mptcp_sock *msk)
{
	return READ_ONCE(msk->wnd_end);
}
static bool mptcp_is_tcpsk(struct sock *sk)
{
	struct socket *sock = sk->sk_socket;

	if (unlikely(sk->sk_prot == &tcp_prot)) {
		/* we are being invoked after mptcp_accept() has
		 * accepted a non-mp-capable flow: sk is a tcp_sk,
		 * not an mptcp one.
		 *
		 * Hand the socket over to tcp so all further socket ops
		 * bypass mptcp.
		 */
		sock->ops = &inet_stream_ops;
		return true;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	} else if (unlikely(sk->sk_prot == &tcpv6_prot)) {
		sock->ops = &inet6_stream_ops;
		return true;
#endif
	}

	return false;
}
static struct sock *__mptcp_tcp_fallback(struct mptcp_sock *msk)
{
	sock_owned_by_me((const struct sock *)msk);

	if (likely(!__mptcp_check_fallback(msk)))
		return NULL;

	return msk->first;
}
static int __mptcp_socket_create(struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow;
	struct sock *sk = (struct sock *)msk;
	struct socket *ssock;
	int err;

	err = mptcp_subflow_create_socket(sk, &ssock);
	if (err)
		return err;

	msk->first = ssock->sk;
	msk->subflow = ssock;
	subflow = mptcp_subflow_ctx(ssock->sk);
	list_add(&subflow->node, &msk->conn_list);
	sock_hold(ssock->sk);
	subflow->request_mptcp = 1;

	/* accept() will wait on first subflow sk_wq, and we always wake up
	 * via msk->sk_socket
	 */
	RCU_INIT_POINTER(msk->first->sk_wq, &sk->sk_socket->wq);

	return 0;
}
static void mptcp_drop(struct sock *sk, struct sk_buff *skb)
{
	sk_drops_add(sk, skb);
	__kfree_skb(skb);
}
static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
			       struct sk_buff *from)
{
	bool fragstolen;
	int delta;

	if (MPTCP_SKB_CB(from)->offset ||
	    !skb_try_coalesce(to, from, &fragstolen, &delta))
		return false;

	pr_debug("coalesced seq %llx into %llx new len %d new end seq %llx",
		 MPTCP_SKB_CB(from)->map_seq, MPTCP_SKB_CB(to)->map_seq,
		 to->len, MPTCP_SKB_CB(from)->end_seq);
	MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq;
	kfree_skb_partial(from, fragstolen);
	atomic_add(delta, &sk->sk_rmem_alloc);
	sk_mem_charge(sk, delta);

	return true;
}
static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to,
				   struct sk_buff *from)
{
	if (MPTCP_SKB_CB(from)->map_seq != MPTCP_SKB_CB(to)->end_seq)
		return false;

	return mptcp_try_coalesce((struct sock *)msk, to, from);
}
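/* Note on the two helpers above: coalescing is attempted only when the
 * MPTCP-level mapping is contiguous (from->map_seq == to->end_seq) and the
 * candidate skb carries no leading offset, so the merged skb keeps a single,
 * gap-free [map_seq, end_seq) range.
 */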
/* "inspired" by tcp_data_queue_ofo(), main differences:
 * - use mptcp seqs
 * - don't cope with sacks
 */
static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb)
{
	struct sock *sk = (struct sock *)msk;
	struct rb_node **p, *parent;
	u64 seq, end_seq, max_seq;
	struct sk_buff *skb1;

	seq = MPTCP_SKB_CB(skb)->map_seq;
	end_seq = MPTCP_SKB_CB(skb)->end_seq;
	max_seq = READ_ONCE(msk->rcv_wnd_sent);

	pr_debug("msk=%p seq=%llx limit=%llx empty=%d", msk, seq, max_seq,
		 RB_EMPTY_ROOT(&msk->out_of_order_queue));
	if (after64(end_seq, max_seq)) {
		/* out of window */
		mptcp_drop(sk, skb);
		pr_debug("oow by %lld, rcv_wnd_sent %llu\n",
			 (unsigned long long)end_seq - (unsigned long long)max_seq,
			 (unsigned long long)msk->rcv_wnd_sent);
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_NODSSWINDOW);
		return;
	}

	p = &msk->out_of_order_queue.rb_node;
	MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUE);
	if (RB_EMPTY_ROOT(&msk->out_of_order_queue)) {
		rb_link_node(&skb->rbnode, NULL, p);
		rb_insert_color(&skb->rbnode, &msk->out_of_order_queue);
		msk->ooo_last_skb = skb;
		goto end;
	}

	/* with 2 subflows, adding at end of ooo queue is quite likely
	 * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
	 */
	if (mptcp_ooo_try_coalesce(msk, msk->ooo_last_skb, skb)) {
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOMERGE);
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUETAIL);
		return;
	}

	/* Can avoid an rbtree lookup if we are adding skb after ooo_last_skb */
	if (!before64(seq, MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq)) {
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUETAIL);
		parent = &msk->ooo_last_skb->rbnode;
		p = &parent->rb_right;
		goto insert;
	}

	/* Find place to insert this segment. Handle overlaps on the way. */
	parent = NULL;
	while (*p) {
		parent = *p;
		skb1 = rb_to_skb(parent);
		if (before64(seq, MPTCP_SKB_CB(skb1)->map_seq)) {
			p = &parent->rb_left;
			continue;
		}
		if (before64(seq, MPTCP_SKB_CB(skb1)->end_seq)) {
			if (!after64(end_seq, MPTCP_SKB_CB(skb1)->end_seq)) {
				/* All the bits are present. Drop. */
				mptcp_drop(sk, skb);
				MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
				return;
			}
			if (after64(seq, MPTCP_SKB_CB(skb1)->map_seq)) {
				/* partial overlap, skb starts inside skb1:
				 * continue traversing
				 */
			} else {
				/* skb's seq == skb1's seq and skb covers skb1.
				 * Replace skb1 with skb.
				 */
				rb_replace_node(&skb1->rbnode, &skb->rbnode,
						&msk->out_of_order_queue);
				mptcp_drop(sk, skb1);
				MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
				goto merge_right;
			}
		} else if (mptcp_ooo_try_coalesce(msk, skb1, skb)) {
			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOMERGE);
			return;
		}
		p = &parent->rb_right;
	}

insert:
	/* Insert segment into RB tree. */
	rb_link_node(&skb->rbnode, parent, p);
	rb_insert_color(&skb->rbnode, &msk->out_of_order_queue);

merge_right:
	/* Remove other segments covered by skb. */
	while ((skb1 = skb_rb_next(skb)) != NULL) {
		if (before64(end_seq, MPTCP_SKB_CB(skb1)->end_seq))
			break;
		rb_erase(&skb1->rbnode, &msk->out_of_order_queue);
		mptcp_drop(sk, skb1);
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
	}
	/* If there is no skb after us, we are the last_skb ! */
	if (!skb1)
		msk->ooo_last_skb = skb;

end:
	skb_condense(skb);
	skb_set_owner_r(skb, sk);
}
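/* The out-of-order queue above mirrors tcp_data_queue_ofo(): skbs are keyed
 * by their 64-bit MPTCP data sequence, the common "append at the tail" case
 * is served via msk->ooo_last_skb without walking the rbtree, and fully
 * overlapped segments are dropped and accounted as MPTCP_MIB_DUPDATA.
 */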
static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
			     struct sk_buff *skb, unsigned int offset,
			     size_t copy_len)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	struct sock *sk = (struct sock *)msk;
	struct sk_buff *tail;

	__skb_unlink(skb, &ssk->sk_receive_queue);

	skb_ext_reset(skb);
	skb_orphan(skb);

	/* try to fetch required memory from subflow */
	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
		if (ssk->sk_forward_alloc < skb->truesize)
			goto drop;
		__sk_mem_reclaim(ssk, skb->truesize);
		if (!sk_rmem_schedule(sk, skb, skb->truesize))
			goto drop;
	}

	/* the skb map_seq accounts for the skb offset:
	 * mptcp_subflow_get_mapped_dsn() is based on the current tp->copied_seq
	 * value
	 */
	MPTCP_SKB_CB(skb)->map_seq = mptcp_subflow_get_mapped_dsn(subflow);
	MPTCP_SKB_CB(skb)->end_seq = MPTCP_SKB_CB(skb)->map_seq + copy_len;
	MPTCP_SKB_CB(skb)->offset = offset;

	if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) {
		/* in sequence */
		WRITE_ONCE(msk->ack_seq, msk->ack_seq + copy_len);
		tail = skb_peek_tail(&sk->sk_receive_queue);
		if (tail && mptcp_try_coalesce(sk, tail, skb))
			return true;

		skb_set_owner_r(skb, sk);
		__skb_queue_tail(&sk->sk_receive_queue, skb);
		return true;
	} else if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) {
		mptcp_data_queue_ofo(msk, skb);
		return false;
	}

	/* old data, keep it simple and drop the whole pkt, sender
	 * will retransmit as needed, if needed.
	 */
	MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
drop:
	mptcp_drop(sk, skb);
	return false;
}
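/* __mptcp_move_skb() is the hand-off point between a subflow receive queue
 * and the MPTCP-level socket: in-sequence data (map_seq == msk->ack_seq) is
 * appended or coalesced into sk_receive_queue and advances ack_seq, data
 * beyond ack_seq goes to the out-of-order rbtree, and stale data is dropped
 * and left to the peer's retransmission.
 */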
static void mptcp_stop_timer(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
	mptcp_sk(sk)->timer_ival = 0;
}
static void mptcp_close_wake_up(struct sock *sk)
{
	if (sock_flag(sk, SOCK_DEAD))
		return;

	sk->sk_state_change(sk);
	if (sk->sk_shutdown == SHUTDOWN_MASK ||
	    sk->sk_state == TCP_CLOSE)
		sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
	else
		sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
}
static bool mptcp_pending_data_fin_ack(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	return !__mptcp_check_fallback(msk) &&
	       ((1 << sk->sk_state) &
		(TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK)) &&
	       msk->write_seq == READ_ONCE(msk->snd_una);
}
static void mptcp_check_data_fin_ack(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	/* Look for an acknowledged DATA_FIN */
	if (mptcp_pending_data_fin_ack(sk)) {
		mptcp_stop_timer(sk);

		WRITE_ONCE(msk->snd_data_fin_enable, 0);

		switch (sk->sk_state) {
		case TCP_FIN_WAIT1:
			inet_sk_state_store(sk, TCP_FIN_WAIT2);
			break;
		case TCP_CLOSING:
		case TCP_LAST_ACK:
			inet_sk_state_store(sk, TCP_CLOSE);
			break;
		}

		mptcp_close_wake_up(sk);
	}
}
static bool mptcp_pending_data_fin(struct sock *sk, u64 *seq)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	if (READ_ONCE(msk->rcv_data_fin) &&
	    ((1 << sk->sk_state) &
	     (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2))) {
		u64 rcv_data_fin_seq = READ_ONCE(msk->rcv_data_fin_seq);

		if (msk->ack_seq == rcv_data_fin_seq) {
			if (seq)
				*seq = rcv_data_fin_seq;

			return true;
		}
	}

	return false;
}
static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk)
{
	long tout = ssk && inet_csk(ssk)->icsk_pending ?
				      inet_csk(ssk)->icsk_timeout - jiffies : 0;

	if (tout <= 0)
		tout = mptcp_sk(sk)->timer_ival;
	mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN;
}
static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
{
	struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

	/* can't send if JOIN hasn't completed yet (i.e. is usable for mptcp) */
	if (subflow->request_join && !subflow->fully_established)
		return false;

	/* only send if our side has not closed yet */
	return ((1 << ssk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT));
}
static bool tcp_can_send_ack(const struct sock *ssk)
{
	return !((1 << inet_sk_state_load(ssk)) &
	       (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_TIME_WAIT | TCPF_CLOSE));
}
static void mptcp_send_ack(struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow;

	mptcp_for_each_subflow(msk, subflow) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

		lock_sock(ssk);
		if (tcp_can_send_ack(ssk))
			tcp_send_ack(ssk);
		release_sock(ssk);
	}
}
static bool mptcp_subflow_cleanup_rbuf(struct sock *ssk)
{
	int ret;

	lock_sock(ssk);
	ret = tcp_can_send_ack(ssk);
	if (ret)
		tcp_cleanup_rbuf(ssk, 1);
	release_sock(ssk);
	return ret;
}
static void mptcp_cleanup_rbuf(struct mptcp_sock *msk)
{
	struct sock *ack_hint = READ_ONCE(msk->ack_hint);
	struct mptcp_subflow_context *subflow;

	/* if the hinted ssk is still active, try to use it */
	if (likely(ack_hint)) {
		mptcp_for_each_subflow(msk, subflow) {
			struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

			if (ack_hint == ssk && mptcp_subflow_cleanup_rbuf(ssk))
				return;
		}
	}

	/* otherwise pick the first active subflow */
	mptcp_for_each_subflow(msk, subflow)
		if (mptcp_subflow_cleanup_rbuf(mptcp_subflow_tcp_sock(subflow)))
			return;
}
static bool mptcp_check_data_fin(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	u64 rcv_data_fin_seq;
	bool ret = false;

	if (__mptcp_check_fallback(msk) || !msk->first)
		return ret;

	/* Need to ack a DATA_FIN received from a peer while this side
	 * of the connection is in ESTABLISHED, FIN_WAIT1, or FIN_WAIT2.
	 * msk->rcv_data_fin was set when parsing the incoming options
	 * at the subflow level and the msk lock was not held, so this
	 * is the first opportunity to act on the DATA_FIN and change
	 * the msk state.
	 *
	 * If we are caught up to the sequence number of the incoming
	 * DATA_FIN, send the DATA_ACK now and do state transition. If
	 * not caught up, do nothing and let the recv code send DATA_ACK
	 * when catching up.
	 */

	if (mptcp_pending_data_fin(sk, &rcv_data_fin_seq)) {
		WRITE_ONCE(msk->ack_seq, msk->ack_seq + 1);
		WRITE_ONCE(msk->rcv_data_fin, 0);

		sk->sk_shutdown |= RCV_SHUTDOWN;
		smp_mb__before_atomic(); /* SHUTDOWN must be visible first */
		set_bit(MPTCP_DATA_READY, &msk->flags);

		switch (sk->sk_state) {
		case TCP_ESTABLISHED:
			inet_sk_state_store(sk, TCP_CLOSE_WAIT);
			break;
		case TCP_FIN_WAIT1:
			inet_sk_state_store(sk, TCP_CLOSING);
			break;
		case TCP_FIN_WAIT2:
			inet_sk_state_store(sk, TCP_CLOSE);
			break;
		default:
			/* Other states not expected */
			WARN_ON_ONCE(1);
			break;
		}

		ret = true;
		mptcp_set_timeout(sk, NULL);
		mptcp_send_ack(msk);
		mptcp_close_wake_up(sk);
	}
	return ret;
}
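/* DATA_FIN handling in a nutshell: the subflow code only records the peer's
 * DATA_FIN (msk->rcv_data_fin / rcv_data_fin_seq); the msk-level helpers
 * above ack it and move the connection state machine only once the receiver
 * has caught up with the announced sequence number.
 */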
static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
					   struct sock *ssk,
					   unsigned int *bytes)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	struct sock *sk = (struct sock *)msk;
	unsigned int moved = 0;
	bool more_data_avail;
	struct tcp_sock *tp;
	int sk_rbuf;
	bool done = false;

	sk_rbuf = READ_ONCE(sk->sk_rcvbuf);

	if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
		int ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf);

		if (unlikely(ssk_rbuf > sk_rbuf)) {
			WRITE_ONCE(sk->sk_rcvbuf, ssk_rbuf);
			sk_rbuf = ssk_rbuf;
		}
	}

	pr_debug("msk=%p ssk=%p", msk, ssk);
	tp = tcp_sk(ssk);
	do {
		u32 map_remaining, offset;
		u32 seq = tp->copied_seq;
		struct sk_buff *skb;
		bool fin;

		/* try to move as much data as available */
		map_remaining = subflow->map_data_len -
				mptcp_subflow_get_map_offset(subflow);

		skb = skb_peek(&ssk->sk_receive_queue);
		if (!skb) {
			/* if no data is found, a racing workqueue/recvmsg
			 * already processed the new data, stop here or we
			 * can enter an infinite loop
			 */
			if (!moved)
				done = true;
			break;
		}

		if (__mptcp_check_fallback(msk)) {
			/* if we are running under the workqueue, TCP could have
			 * collapsed skbs between dummy map creation and now
			 * be sure to adjust the size
			 */
			map_remaining = skb->len;
			subflow->map_data_len = skb->len;
		}

		offset = seq - TCP_SKB_CB(skb)->seq;
		fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
		if (fin) {
			done = true;
			seq++;
		}

		if (offset < skb->len) {
			size_t len = skb->len - offset;

			if (tp->urg_data)
				done = true;

			if (__mptcp_move_skb(msk, ssk, skb, offset, len))
				moved += len;
			seq += len;

			if (WARN_ON_ONCE(map_remaining < len))
				break;
		} else {
			WARN_ON_ONCE(!fin);
			sk_eat_skb(ssk, skb);
			done = true;
		}

		WRITE_ONCE(tp->copied_seq, seq);
		more_data_avail = mptcp_subflow_data_available(ssk);

		if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf) {
			done = true;
			break;
		}
	} while (more_data_avail);
	WRITE_ONCE(msk->ack_hint, ssk);

	*bytes += moved;
	return done;
}
static bool __mptcp_ofo_queue(struct mptcp_sock *msk)
{
	struct sock *sk = (struct sock *)msk;
	struct sk_buff *skb, *tail;
	bool moved = false;
	struct rb_node *p;
	u64 end_seq;

	p = rb_first(&msk->out_of_order_queue);
	pr_debug("msk=%p empty=%d", msk, RB_EMPTY_ROOT(&msk->out_of_order_queue));
	while (p) {
		skb = rb_to_skb(p);
		if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq))
			break;

		p = rb_next(p);
		rb_erase(&skb->rbnode, &msk->out_of_order_queue);

		if (unlikely(!after64(MPTCP_SKB_CB(skb)->end_seq,
				      msk->ack_seq))) {
			mptcp_drop(sk, skb);
			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
			continue;
		}

		end_seq = MPTCP_SKB_CB(skb)->end_seq;
		tail = skb_peek_tail(&sk->sk_receive_queue);
		if (!tail || !mptcp_ooo_try_coalesce(msk, tail, skb)) {
			int delta = msk->ack_seq - MPTCP_SKB_CB(skb)->map_seq;

			/* skip overlapping data, if any */
			pr_debug("uncoalesced seq=%llx ack seq=%llx delta=%d",
				 MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq,
				 delta);
			MPTCP_SKB_CB(skb)->offset += delta;
			__skb_queue_tail(&sk->sk_receive_queue, skb);
		}
		msk->ack_seq = end_seq;
		moved = true;
	}
	return moved;
}
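/* __mptcp_ofo_queue() drains the out-of-order rbtree into sk_receive_queue
 * once ack_seq catches up; partially overlapping skbs are trimmed by bumping
 * MPTCP_SKB_CB(skb)->offset rather than by copying data around.
 */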
/* In most cases we will be able to lock the mptcp socket.  If its already
 * owned, we need to defer to the work queue to avoid ABBA deadlock.
 */
static void move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
{
	struct sock *sk = (struct sock *)msk;
	unsigned int moved = 0;

	if (inet_sk_state_load(sk) == TCP_CLOSE)
		return;

	mptcp_data_lock(sk);

	__mptcp_move_skbs_from_subflow(msk, ssk, &moved);
	__mptcp_ofo_queue(msk);

	/* If the moves have caught up with the DATA_FIN sequence number
	 * it's time to ack the DATA_FIN and change socket state, but
	 * this is not a good place to change state. Let the workqueue
	 * do it.
	 */
	if (mptcp_pending_data_fin(sk, NULL))
		mptcp_schedule_work(sk);
	mptcp_data_unlock(sk);
}
void mptcp_data_ready(struct sock *sk, struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	struct mptcp_sock *msk = mptcp_sk(sk);
	int sk_rbuf, ssk_rbuf;
	bool wake;

	/* The peer can send data while we are shutting down this
	 * subflow at msk destruction time, but we must avoid enqueuing
	 * more data to the msk receive queue
	 */
	if (unlikely(subflow->disposable))
		return;

	/* move_skbs_to_msk below can legitimately clear the data_avail flag,
	 * but we will need it later to properly wake the reader, so cache
	 * its value
	 */
	wake = subflow->data_avail == MPTCP_SUBFLOW_DATA_AVAIL;
	if (wake)
		set_bit(MPTCP_DATA_READY, &msk->flags);

	ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf);
	sk_rbuf = READ_ONCE(sk->sk_rcvbuf);
	if (unlikely(ssk_rbuf > sk_rbuf))
		sk_rbuf = ssk_rbuf;

	/* over limit? can't append more skbs to msk */
	if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf)
		goto wake;

	move_skbs_to_msk(msk, ssk);

wake:
	if (wake)
		sk->sk_data_ready(sk);
}
void __mptcp_flush_join_list(struct mptcp_sock *msk)
{
	if (likely(list_empty(&msk->join_list)))
		return;

	spin_lock_bh(&msk->join_list_lock);
	list_splice_tail_init(&msk->join_list, &msk->conn_list);
	spin_unlock_bh(&msk->join_list_lock);
}
static bool mptcp_timer_pending(struct sock *sk)
{
	return timer_pending(&inet_csk(sk)->icsk_retransmit_timer);
}
static void mptcp_reset_timer(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	unsigned long tout;

	/* prevent rescheduling on close */
	if (unlikely(inet_sk_state_load(sk) == TCP_CLOSE))
		return;

	/* should never be called with mptcp level timer cleared */
	tout = READ_ONCE(mptcp_sk(sk)->timer_ival);
	if (WARN_ON_ONCE(!tout))
		tout = TCP_RTO_MIN;
	sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + tout);
}
bool mptcp_schedule_work(struct sock *sk)
{
	if (inet_sk_state_load(sk) != TCP_CLOSE &&
	    schedule_work(&mptcp_sk(sk)->work)) {
		/* each subflow already holds a reference to the sk, and the
		 * workqueue is invoked by a subflow, so sk can't go away here.
		 */
		sock_hold(sk);
		return true;
	}
	return false;
}
void mptcp_subflow_eof(struct sock *sk)
{
	if (!test_and_set_bit(MPTCP_WORK_EOF, &mptcp_sk(sk)->flags))
		mptcp_schedule_work(sk);
}
static void mptcp_check_for_eof(struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow;
	struct sock *sk = (struct sock *)msk;
	int receivers = 0;

	mptcp_for_each_subflow(msk, subflow)
		receivers += !subflow->rx_eof;
	if (receivers)
		return;

	if (!(sk->sk_shutdown & RCV_SHUTDOWN)) {
		/* hopefully temporary hack: propagate shutdown status
		 * to msk, when all subflows agree on it
		 */
		sk->sk_shutdown |= RCV_SHUTDOWN;

		smp_mb__before_atomic(); /* SHUTDOWN must be visible first */
		set_bit(MPTCP_DATA_READY, &msk->flags);
		sk->sk_data_ready(sk);
	}

	switch (sk->sk_state) {
	case TCP_ESTABLISHED:
		inet_sk_state_store(sk, TCP_CLOSE_WAIT);
		break;
	case TCP_FIN_WAIT1:
		inet_sk_state_store(sk, TCP_CLOSING);
		break;
	case TCP_FIN_WAIT2:
		inet_sk_state_store(sk, TCP_CLOSE);
		break;
	default:
		return;
	}
	mptcp_close_wake_up(sk);
}
static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow;
	struct sock *sk = (struct sock *)msk;

	sock_owned_by_me(sk);

	mptcp_for_each_subflow(msk, subflow) {
		if (subflow->data_avail)
			return mptcp_subflow_tcp_sock(subflow);
	}

	return NULL;
}
static bool mptcp_skb_can_collapse_to(u64 write_seq,
				      const struct sk_buff *skb,
				      const struct mptcp_ext *mpext)
{
	if (!tcp_skb_can_collapse_to(skb))
		return false;

	/* can collapse only if MPTCP level sequence is in order and this
	 * mapping has not been xmitted yet
	 */
	return mpext && mpext->data_seq + mpext->data_len == write_seq &&
	       !mpext->frozen;
}
static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk,
				       const struct page_frag *pfrag,
				       const struct mptcp_data_frag *df)
{
	return df && pfrag->page == df->page &&
		pfrag->size - pfrag->offset > 0 &&
		df->data_seq + df->data_len == msk->write_seq;
}
static int mptcp_wmem_with_overhead(struct sock *sk, int size)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	int ret, skbs;

	ret = size + ((sizeof(struct mptcp_data_frag) * size) >> PAGE_SHIFT);
	skbs = (msk->tx_pending_data + size) / msk->size_goal_cache;
	if (skbs < msk->skb_tx_cache.qlen)
		return ret;

	return ret + (skbs - msk->skb_tx_cache.qlen) * SKB_TRUESIZE(MAX_TCP_HEADER);
}
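/* Write-memory bookkeeping overview: mptcp_wmem_with_overhead() estimates,
 * for a requested payload size, the extra space needed for mptcp_data_frag
 * descriptors plus any tx skbs not already sitting in msk->skb_tx_cache;
 * __mptcp_wmem_reserve()/mptcp_wmem_alloc()/mptcp_wmem_uncharge() below then
 * move that forward-allocated memory between sk_forward_alloc and
 * msk->wmem_reserved under the msk data lock.
 */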
static void __mptcp_wmem_reserve(struct sock *sk, int size)
{
	int amount = mptcp_wmem_with_overhead(sk, size);
	struct mptcp_sock *msk = mptcp_sk(sk);

	WARN_ON_ONCE(msk->wmem_reserved);
	if (amount <= sk->sk_forward_alloc)
		goto reserve;

	/* under memory pressure try to reserve at most a single page
	 * otherwise try to reserve the full estimate and fallback
	 * to a single page before entering the error path
	 */
	if ((tcp_under_memory_pressure(sk) && amount > PAGE_SIZE) ||
	    !sk_wmem_schedule(sk, amount)) {
		if (amount <= PAGE_SIZE)
			goto nomem;

		amount = PAGE_SIZE;
		if (!sk_wmem_schedule(sk, amount))
			goto nomem;
	}

reserve:
	msk->wmem_reserved = amount;
	sk->sk_forward_alloc -= amount;
	return;

nomem:
	/* we will wait for memory on next allocation */
	msk->wmem_reserved = -1;
}
static void __mptcp_update_wmem(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	if (!msk->wmem_reserved)
		return;

	if (msk->wmem_reserved < 0)
		msk->wmem_reserved = 0;
	if (msk->wmem_reserved > 0) {
		sk->sk_forward_alloc += msk->wmem_reserved;
		msk->wmem_reserved = 0;
	}
}
static bool mptcp_wmem_alloc(struct sock *sk, int size)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	/* check for pre-existing error condition */
	if (msk->wmem_reserved < 0)
		return false;

	if (msk->wmem_reserved >= size)
		goto account;

	mptcp_data_lock(sk);
	if (!sk_wmem_schedule(sk, size)) {
		mptcp_data_unlock(sk);
		return false;
	}

	sk->sk_forward_alloc -= size;
	msk->wmem_reserved += size;
	mptcp_data_unlock(sk);

account:
	msk->wmem_reserved -= size;
	return true;
}
static void mptcp_wmem_uncharge(struct sock *sk, int size)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	if (msk->wmem_reserved < 0)
		msk->wmem_reserved = 0;
	msk->wmem_reserved += size;
}
static void mptcp_mem_reclaim_partial(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	/* if we are experiencing a transient allocation error,
	 * the forward allocation memory has been already
	 * released
	 */
	if (msk->wmem_reserved < 0)
		return;

	mptcp_data_lock(sk);
	sk->sk_forward_alloc += msk->wmem_reserved;
	sk_mem_reclaim_partial(sk);
	msk->wmem_reserved = sk->sk_forward_alloc;
	sk->sk_forward_alloc = 0;
	mptcp_data_unlock(sk);
}
static void dfrag_uncharge(struct sock *sk, int len)
{
	sk_mem_uncharge(sk, len);
	sk_wmem_queued_add(sk, -len);
}
static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag)
{
	int len = dfrag->data_len + dfrag->overhead;

	list_del(&dfrag->list);
	dfrag_uncharge(sk, len);
	put_page(dfrag->page);
}
static void __mptcp_clean_una(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct mptcp_data_frag *dtmp, *dfrag;
	bool cleaned = false;
	u64 snd_una;

	/* on fallback we just need to ignore snd_una, as this is really
	 * plain TCP
	 */
	if (__mptcp_check_fallback(msk))
		msk->snd_una = READ_ONCE(msk->snd_nxt);

	snd_una = msk->snd_una;
	list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) {
		if (after64(dfrag->data_seq + dfrag->data_len, snd_una))
			break;

		if (WARN_ON_ONCE(dfrag == msk->first_pending))
			break;
		dfrag_clear(sk, dfrag);
		cleaned = true;
	}

	dfrag = mptcp_rtx_head(sk);
	if (dfrag && after64(snd_una, dfrag->data_seq)) {
		u64 delta = snd_una - dfrag->data_seq;

		if (WARN_ON_ONCE(delta > dfrag->already_sent))
			goto out;

		dfrag->data_seq += delta;
		dfrag->offset += delta;
		dfrag->data_len -= delta;
		dfrag->already_sent -= delta;

		dfrag_uncharge(sk, delta);
		cleaned = true;
	}

out:
	if (cleaned) {
		if (tcp_under_memory_pressure(sk)) {
			__mptcp_update_wmem(sk);
			sk_mem_reclaim_partial(sk);
		}

		if (sk_stream_is_writeable(sk)) {
			/* pairs with memory barrier in mptcp_poll */
			smp_mb();
			if (test_and_clear_bit(MPTCP_NOSPACE, &msk->flags))
				sk_stream_write_space(sk);
		}
	}

	if (snd_una == READ_ONCE(msk->snd_nxt)) {
		if (msk->timer_ival)
			mptcp_stop_timer(sk);
	} else {
		mptcp_reset_timer(sk);
	}
}
static void mptcp_enter_memory_pressure(struct sock *sk)
{
	struct mptcp_subflow_context *subflow;
	struct mptcp_sock *msk = mptcp_sk(sk);
	bool first = true;

	sk_stream_moderate_sndbuf(sk);
	mptcp_for_each_subflow(msk, subflow) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

		if (first)
			tcp_enter_memory_pressure(ssk);
		sk_stream_moderate_sndbuf(ssk);
		first = false;
	}
}
/* ensure we get enough memory for the frag hdr, beyond some minimal amount of
 * data
 */
static bool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
{
	if (likely(skb_page_frag_refill(32U + sizeof(struct mptcp_data_frag),
					pfrag, sk->sk_allocation)))
		return true;

	mptcp_enter_memory_pressure(sk);
	return false;
}
static struct mptcp_data_frag *
mptcp_carve_data_frag(const struct mptcp_sock *msk, struct page_frag *pfrag,
		      int orig_offset)
{
	int offset = ALIGN(orig_offset, sizeof(long));
	struct mptcp_data_frag *dfrag;

	dfrag = (struct mptcp_data_frag *)(page_to_virt(pfrag->page) + offset);
	dfrag->data_len = 0;
	dfrag->data_seq = msk->write_seq;
	dfrag->overhead = offset - orig_offset + sizeof(struct mptcp_data_frag);
	dfrag->offset = offset + sizeof(struct mptcp_data_frag);
	dfrag->already_sent = 0;
	dfrag->page = pfrag->page;

	return dfrag;
}
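/* Layout note: each mptcp_data_frag descriptor is carved from the head of
 * the sender's page_frag, aligned to sizeof(long), with the user payload
 * copied right after it; dfrag->overhead accounts for the descriptor plus
 * the alignment padding so that memory accounting stays exact.
 */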
struct mptcp_sendmsg_info {
	int mss_now;
	int size_goal;
	u16 limit;
	u16 sent;
	unsigned int flags;
};
static int mptcp_check_allowed_size(struct mptcp_sock *msk, u64 data_seq,
				    int avail_size)
{
	u64 window_end = mptcp_wnd_end(msk);

	if (__mptcp_check_fallback(msk))
		return avail_size;

	if (!before64(data_seq + avail_size, window_end)) {
		u64 allowed_size = window_end - data_seq;

		return min_t(unsigned int, allowed_size, avail_size);
	}

	return avail_size;
}
static bool __mptcp_add_ext(struct sk_buff *skb, gfp_t gfp)
{
	struct skb_ext *mpext = __skb_ext_alloc(gfp);

	if (!mpext)
		return false;
	__skb_ext_set(skb, SKB_EXT_MPTCP, mpext);
	return true;
}
static struct sk_buff *__mptcp_do_alloc_tx_skb(struct sock *sk, gfp_t gfp)
{
	struct sk_buff *skb;

	skb = alloc_skb_fclone(MAX_TCP_HEADER, gfp);
	if (likely(skb)) {
		if (likely(__mptcp_add_ext(skb, gfp))) {
			skb_reserve(skb, MAX_TCP_HEADER);
			skb->reserved_tailroom = skb->end - skb->tail;
			return skb;
		}
		__kfree_skb(skb);
	} else {
		mptcp_enter_memory_pressure(sk);
	}
	return NULL;
}
static bool mptcp_tx_cache_refill(struct sock *sk, int size,
				  struct sk_buff_head *skbs, int *total_ts)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct sk_buff *skb;
	int space_needed;

	if (unlikely(tcp_under_memory_pressure(sk))) {
		mptcp_mem_reclaim_partial(sk);

		/* under pressure pre-allocate at most a single skb */
		if (msk->skb_tx_cache.qlen)
			return true;
		space_needed = msk->size_goal_cache;
	} else {
		space_needed = msk->tx_pending_data + size -
			       msk->skb_tx_cache.qlen * msk->size_goal_cache;
	}

	while (space_needed > 0) {
		skb = __mptcp_do_alloc_tx_skb(sk, sk->sk_allocation);
		if (unlikely(!skb)) {
			/* under memory pressure, try to pass the caller a
			 * single skb to allow forward progress
			 */
			while (skbs->qlen > 1) {
				skb = __skb_dequeue_tail(skbs);
				__kfree_skb(skb);
			}
			return skbs->qlen > 0;
		}

		*total_ts += skb->truesize;
		__skb_queue_tail(skbs, skb);
		space_needed -= msk->size_goal_cache;
	}
	return true;
}
static bool __mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct sk_buff *skb;

	if (ssk->sk_tx_skb_cache) {
		skb = ssk->sk_tx_skb_cache;
		if (unlikely(!skb_ext_find(skb, SKB_EXT_MPTCP) &&
			     !__mptcp_add_ext(skb, gfp)))
			return false;
		return true;
	}

	skb = skb_peek(&msk->skb_tx_cache);
	if (skb) {
		if (likely(sk_wmem_schedule(ssk, skb->truesize))) {
			skb = __skb_dequeue(&msk->skb_tx_cache);
			if (WARN_ON_ONCE(!skb))
				return false;

			mptcp_wmem_uncharge(sk, skb->truesize);
			ssk->sk_tx_skb_cache = skb;
			return true;
		}

		/* over memory limit, no point to try to allocate a new skb */
		return false;
	}

	skb = __mptcp_do_alloc_tx_skb(sk, gfp);
	if (!skb)
		return false;

	if (likely(sk_wmem_schedule(ssk, skb->truesize))) {
		ssk->sk_tx_skb_cache = skb;
		return true;
	}
	kfree_skb(skb);
	return false;
}
static bool mptcp_must_reclaim_memory(struct sock *sk, struct sock *ssk)
{
	return !ssk->sk_tx_skb_cache &&
	       !skb_peek(&mptcp_sk(sk)->skb_tx_cache) &&
	       tcp_under_memory_pressure(sk);
}
static bool mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk)
{
	if (unlikely(mptcp_must_reclaim_memory(sk, ssk)))
		mptcp_mem_reclaim_partial(sk);
	return __mptcp_alloc_tx_skb(sk, ssk, sk->sk_allocation);
}
static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
			      struct mptcp_data_frag *dfrag,
			      struct mptcp_sendmsg_info *info)
{
	u64 data_seq = dfrag->data_seq + info->sent;
	struct mptcp_sock *msk = mptcp_sk(sk);
	bool zero_window_probe = false;
	struct mptcp_ext *mpext = NULL;
	struct sk_buff *skb, *tail;
	bool can_collapse = false;
	int size_bias = 0;
	int avail_size;
	size_t ret = 0;

	pr_debug("msk=%p ssk=%p sending dfrag at seq=%lld len=%d already sent=%d",
		 msk, ssk, dfrag->data_seq, dfrag->data_len, info->sent);

	/* compute send limit */
	info->mss_now = tcp_send_mss(ssk, &info->size_goal, info->flags);
	avail_size = info->size_goal;
	msk->size_goal_cache = info->size_goal;
	skb = tcp_write_queue_tail(ssk);
	if (skb) {
		/* Limit the write to the size available in the
		 * current skb, if any, so that we create at most a new skb.
		 * Explicitly tells TCP internals to avoid collapsing on later
		 * queue management operation, to avoid breaking the ext <->
		 * SSN association set here
		 */
		mpext = skb_ext_find(skb, SKB_EXT_MPTCP);
		can_collapse = (info->size_goal - skb->len > 0) &&
			       mptcp_skb_can_collapse_to(data_seq, skb, mpext);
		if (!can_collapse) {
			TCP_SKB_CB(skb)->eor = 1;
		} else {
			size_bias = skb->len;
			avail_size = info->size_goal - skb->len;
		}
	}

	/* Zero window and all data acked? Probe. */
	avail_size = mptcp_check_allowed_size(msk, data_seq, avail_size);
	if (avail_size == 0) {
		u64 snd_una = READ_ONCE(msk->snd_una);

		if (skb || snd_una != msk->snd_nxt)
			return 0;
		zero_window_probe = true;
		data_seq = snd_una - 1;
		avail_size = 1;
	}

	if (WARN_ON_ONCE(info->sent > info->limit ||
			 info->limit > dfrag->data_len))
		return 0;

	ret = info->limit - info->sent;
	tail = tcp_build_frag(ssk, avail_size + size_bias, info->flags,
			      dfrag->page, dfrag->offset + info->sent, &ret);
	if (!tail) {
		tcp_remove_empty_skb(sk, tcp_write_queue_tail(ssk));
		return -ENOMEM;
	}

	/* if the tail skb is still the cached one, collapsing really happened.
	 */
	if (skb == tail) {
		TCP_SKB_CB(tail)->tcp_flags &= ~TCPHDR_PSH;
		mpext->data_len += ret;
		WARN_ON_ONCE(!can_collapse);
		WARN_ON_ONCE(zero_window_probe);
		goto out;
	}

	mpext = skb_ext_find(tail, SKB_EXT_MPTCP);
	if (WARN_ON_ONCE(!mpext)) {
		/* should never reach here, stream corrupted */
		return -EINVAL;
	}

	memset(mpext, 0, sizeof(*mpext));
	mpext->data_seq = data_seq;
	mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq;
	mpext->data_len = ret;
	mpext->use_map = 1;
	mpext->dsn64 = 1;

	pr_debug("data_seq=%llu subflow_seq=%u data_len=%u dsn64=%d",
		 mpext->data_seq, mpext->subflow_seq, mpext->data_len,
		 mpext->dsn64);

	if (zero_window_probe) {
		mptcp_subflow_ctx(ssk)->rel_write_seq += ret;
		mpext->frozen = 1;
		ret = 0;
		tcp_push_pending_frames(ssk);
	}
out:
	mptcp_subflow_ctx(ssk)->rel_write_seq += ret;
	return ret;
}
#define MPTCP_SEND_BURST_SIZE		((1 << 16) - \
					 sizeof(struct tcphdr) - \
					 MAX_TCP_OPTION_SPACE - \
					 sizeof(struct ipv6hdr) - \
					 sizeof(struct frag_hdr))

struct subflow_send_info {
	struct sock *ssk;
	u64 ratio;
};
static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk,
					   u32 *sndbuf)
{
	struct subflow_send_info send_info[2];
	struct mptcp_subflow_context *subflow;
	int i, nr_active = 0;
	struct sock *ssk;
	u64 ratio;
	u32 pace;

	sock_owned_by_me((struct sock *)msk);

	*sndbuf = 0;
	if (__mptcp_check_fallback(msk)) {
		if (!msk->first)
			return NULL;
		*sndbuf = msk->first->sk_sndbuf;
		return sk_stream_memory_free(msk->first) ? msk->first : NULL;
	}

	/* re-use last subflow, if the burst allow that */
	if (msk->last_snd && msk->snd_burst > 0 &&
	    sk_stream_memory_free(msk->last_snd) &&
	    mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) {
		mptcp_for_each_subflow(msk, subflow) {
			ssk = mptcp_subflow_tcp_sock(subflow);
			*sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf);
		}
		return msk->last_snd;
	}

	/* pick the subflow with the lower wmem/wspace ratio */
	for (i = 0; i < 2; ++i) {
		send_info[i].ssk = NULL;
		send_info[i].ratio = -1;
	}
	mptcp_for_each_subflow(msk, subflow) {
		ssk = mptcp_subflow_tcp_sock(subflow);
		if (!mptcp_subflow_active(subflow))
			continue;

		nr_active += !subflow->backup;
		*sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf);
		if (!sk_stream_memory_free(subflow->tcp_sock))
			continue;

		pace = READ_ONCE(ssk->sk_pacing_rate);
		if (!pace)
			continue;

		ratio = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32,
				pace);
		if (ratio < send_info[subflow->backup].ratio) {
			send_info[subflow->backup].ssk = ssk;
			send_info[subflow->backup].ratio = ratio;
		}
	}

	pr_debug("msk=%p nr_active=%d ssk=%p:%lld backup=%p:%lld",
		 msk, nr_active, send_info[0].ssk, send_info[0].ratio,
		 send_info[1].ssk, send_info[1].ratio);

	/* pick the best backup if no other subflow is active */
	if (!nr_active)
		send_info[0].ssk = send_info[1].ssk;

	if (send_info[0].ssk) {
		msk->last_snd = send_info[0].ssk;
		msk->snd_burst = min_t(int, MPTCP_SEND_BURST_SIZE,
				       sk_stream_wspace(msk->last_snd));
		return msk->last_snd;
	}
	return NULL;
}
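/* Packet-scheduler note: among non-backup (and, failing that, backup)
 * subflows the sender picks the one with the lowest wmem/pacing-rate ratio,
 * i.e. the subflow expected to drain its queued data soonest, and then
 * sticks to it for at most MPTCP_SEND_BURST_SIZE bytes via msk->snd_burst.
 */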
static void mptcp_push_release(struct sock *sk, struct sock *ssk,
			       struct mptcp_sendmsg_info *info)
{
	mptcp_set_timeout(sk, ssk);
	tcp_push(ssk, 0, info->mss_now, tcp_sk(ssk)->nonagle, info->size_goal);
	release_sock(ssk);
}
static void mptcp_push_pending(struct sock *sk, unsigned int flags)
{
	struct sock *prev_ssk = NULL, *ssk = NULL;
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct mptcp_sendmsg_info info = {
				.flags = flags,
	};
	struct mptcp_data_frag *dfrag;
	int len, copied = 0;
	u32 sndbuf;

	while ((dfrag = mptcp_send_head(sk))) {
		info.sent = dfrag->already_sent;
		info.limit = dfrag->data_len;
		len = dfrag->data_len - dfrag->already_sent;
		while (len > 0) {
			int ret = 0;

			prev_ssk = ssk;
			__mptcp_flush_join_list(msk);
			ssk = mptcp_subflow_get_send(msk, &sndbuf);

			/* do auto tuning */
			if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK) &&
			    sndbuf > READ_ONCE(sk->sk_sndbuf))
				WRITE_ONCE(sk->sk_sndbuf, sndbuf);

			/* try to keep the subflow socket lock across
			 * consecutive xmit on the same socket
			 */
			if (ssk != prev_ssk && prev_ssk)
				mptcp_push_release(sk, prev_ssk, &info);
			if (!ssk)
				goto out;

			if (ssk != prev_ssk || !prev_ssk)
				lock_sock(ssk);

			/* keep it simple and always provide a new skb for the
			 * subflow, even if we will not use it when collapsing
			 * on the pending one
			 */
			if (!mptcp_alloc_tx_skb(sk, ssk)) {
				mptcp_push_release(sk, ssk, &info);
				goto out;
			}

			ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
			if (ret <= 0) {
				mptcp_push_release(sk, ssk, &info);
				goto out;
			}

			info.sent += ret;
			dfrag->already_sent += ret;
			msk->snd_nxt += ret;
			msk->snd_burst -= ret;
			msk->tx_pending_data -= ret;
			copied += ret;
			len -= ret;
		}
		WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
	}

	/* at this point we held the socket lock for the last subflow we used */
	if (ssk)
		mptcp_push_release(sk, ssk, &info);

out:
	if (copied) {
		/* start the timer, if it's not pending */
		if (!mptcp_timer_pending(sk))
			mptcp_reset_timer(sk);
		__mptcp_check_send_data_fin(sk);
	}
}
static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct mptcp_sendmsg_info info;
	struct mptcp_data_frag *dfrag;
	int len, copied = 0;

	info.flags = 0;
	while ((dfrag = mptcp_send_head(sk))) {
		info.sent = dfrag->already_sent;
		info.limit = dfrag->data_len;
		len = dfrag->data_len - dfrag->already_sent;
		while (len > 0) {
			int ret = 0;

			/* do auto tuning */
			if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK) &&
			    ssk->sk_sndbuf > READ_ONCE(sk->sk_sndbuf))
				WRITE_ONCE(sk->sk_sndbuf, ssk->sk_sndbuf);

			if (unlikely(mptcp_must_reclaim_memory(sk, ssk))) {
				__mptcp_update_wmem(sk);
				sk_mem_reclaim_partial(sk);
			}
			if (!__mptcp_alloc_tx_skb(sk, ssk, GFP_ATOMIC))
				goto out;

			ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
			if (ret <= 0)
				goto out;

			info.sent += ret;
			dfrag->already_sent += ret;
			msk->snd_nxt += ret;
			msk->snd_burst -= ret;
			msk->tx_pending_data -= ret;
			copied += ret;
			len -= ret;
		}
		WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
	}

out:
	/* __mptcp_alloc_tx_skb could have released some wmem and we are
	 * not going to flush it via release_sock()
	 */
	__mptcp_update_wmem(sk);
	if (copied) {
		mptcp_set_timeout(sk, ssk);
		tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
			 info.size_goal);
		if (msk->snd_data_fin_enable &&
		    msk->snd_nxt + 1 == msk->write_seq)
			mptcp_schedule_work(sk);
	}
}
static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct page_frag *pfrag;
	size_t copied = 0;
	int ret = 0;
	long timeo;

	if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL))
		return -EOPNOTSUPP;

	mptcp_lock_sock(sk, __mptcp_wmem_reserve(sk, len));

	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
		ret = sk_stream_wait_connect(sk, &timeo);
		if (ret)
			goto out;
	}

	pfrag = sk_page_frag(sk);

	while (msg_data_left(msg)) {
		int total_ts, frag_truesize = 0;
		struct mptcp_data_frag *dfrag;
		struct sk_buff_head skbs;
		bool dfrag_collapsed;
		size_t psize, offset;

		if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) {
			ret = -EPIPE;
			goto out;
		}

		/* reuse tail pfrag, if possible, or carve a new one from the
		 * page allocator
		 */
		dfrag = mptcp_pending_tail(sk);
		dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag);
		if (!dfrag_collapsed) {
			if (!sk_stream_memory_free(sk))
				goto wait_for_memory;

			if (!mptcp_page_frag_refill(sk, pfrag))
				goto wait_for_memory;

			dfrag = mptcp_carve_data_frag(msk, pfrag, pfrag->offset);
			frag_truesize = dfrag->overhead;
		}

		/* we do not bound vs wspace, to allow a single packet.
		 * memory accounting will prevent excessive memory usage
		 * anyway
		 */
		offset = dfrag->offset + dfrag->data_len;
		psize = pfrag->size - offset;
		psize = min_t(size_t, psize, msg_data_left(msg));
		total_ts = psize + frag_truesize;
		__skb_queue_head_init(&skbs);
		if (!mptcp_tx_cache_refill(sk, psize, &skbs, &total_ts))
			goto wait_for_memory;

		if (!mptcp_wmem_alloc(sk, total_ts)) {
			__skb_queue_purge(&skbs);
			goto wait_for_memory;
		}

		skb_queue_splice_tail(&skbs, &msk->skb_tx_cache);
		if (copy_page_from_iter(dfrag->page, offset, psize,
					&msg->msg_iter) != psize) {
			mptcp_wmem_uncharge(sk, psize + frag_truesize);
			ret = -EFAULT;
			goto out;
		}

		/* data successfully copied into the write queue */
		copied += psize;
		dfrag->data_len += psize;
		frag_truesize += psize;
		pfrag->offset += frag_truesize;
		WRITE_ONCE(msk->write_seq, msk->write_seq + psize);
		msk->tx_pending_data += psize;

		/* charge data on mptcp pending queue to the msk socket
		 * Note: we charge such data both to sk and ssk
		 */
		sk_wmem_queued_add(sk, frag_truesize);
		if (!dfrag_collapsed) {
			get_page(dfrag->page);
			list_add_tail(&dfrag->list, &msk->rtx_queue);
			if (!msk->first_pending)
				WRITE_ONCE(msk->first_pending, dfrag);
		}
		pr_debug("msk=%p dfrag at seq=%lld len=%d sent=%d new=%d", msk,
			 dfrag->data_seq, dfrag->data_len, dfrag->already_sent,
			 !dfrag_collapsed);

		continue;

wait_for_memory:
		set_bit(MPTCP_NOSPACE, &msk->flags);
		mptcp_push_pending(sk, msg->msg_flags);
		ret = sk_stream_wait_memory(sk, &timeo);
		if (ret)
			goto out;
	}

	if (copied)
		mptcp_push_pending(sk, msg->msg_flags);

out:
	release_sock(sk);
	return copied ? : ret;
}
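/* mptcp_sendmsg() only copies user data into msk-level dfrags and charges it
 * to the msk; the actual transmission happens in mptcp_push_pending(), which
 * picks a subflow, moves dfrag payload into subflow skbs via
 * mptcp_sendmsg_frag() and lets plain TCP push them out.
 */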
static void mptcp_wait_data(struct sock *sk, long *timeo)
{
	DEFINE_WAIT_FUNC(wait, woken_wake_function);
	struct mptcp_sock *msk = mptcp_sk(sk);

	add_wait_queue(sk_sleep(sk), &wait);
	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);

	sk_wait_event(sk, timeo,
		      test_and_clear_bit(MPTCP_DATA_READY, &msk->flags), &wait);

	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	remove_wait_queue(sk_sleep(sk), &wait);
}
static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
				struct msghdr *msg,
				size_t len)
{
	struct sk_buff *skb;
	int copied = 0;

	while ((skb = skb_peek(&msk->receive_queue)) != NULL) {
		u32 offset = MPTCP_SKB_CB(skb)->offset;
		u32 data_len = skb->len - offset;
		u32 count = min_t(size_t, len - copied, data_len);
		int err;

		err = skb_copy_datagram_msg(skb, offset, msg, count);
		if (unlikely(err < 0)) {
			if (!copied)
				return err;
			break;
		}

		copied += count;

		if (count < data_len) {
			MPTCP_SKB_CB(skb)->offset += count;
			break;
		}

		/* we will bulk release the skb memory later */
		skb->destructor = NULL;
		msk->rmem_released += skb->truesize;
		__skb_unlink(skb, &msk->receive_queue);
		__kfree_skb(skb);

		if (copied >= len)
			break;
	}

	return copied;
}
/* receive buffer autotuning.  See tcp_rcv_space_adjust for more information.
 *
 * Only difference: Use highest rtt estimate of the subflows in use.
 */
static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
{
	struct mptcp_subflow_context *subflow;
	struct sock *sk = (struct sock *)msk;
	u32 time, advmss = 1;
	u64 rtt_us, mstamp;

	sock_owned_by_me(sk);

	if (copied <= 0)
		return;

	msk->rcvq_space.copied += copied;

	mstamp = div_u64(tcp_clock_ns(), NSEC_PER_USEC);
	time = tcp_stamp_us_delta(mstamp, msk->rcvq_space.time);

	rtt_us = msk->rcvq_space.rtt_us;
	if (rtt_us && time < (rtt_us >> 3))
		return;

	rtt_us = 0;
	mptcp_for_each_subflow(msk, subflow) {
		const struct tcp_sock *tp;
		u64 sf_rtt_us;
		u32 sf_advmss;

		tp = tcp_sk(mptcp_subflow_tcp_sock(subflow));

		sf_rtt_us = READ_ONCE(tp->rcv_rtt_est.rtt_us);
		sf_advmss = READ_ONCE(tp->advmss);

		rtt_us = max(sf_rtt_us, rtt_us);
		advmss = max(sf_advmss, advmss);
	}

	msk->rcvq_space.rtt_us = rtt_us;
	if (time < (rtt_us >> 3) || rtt_us == 0)
		return;

	if (msk->rcvq_space.copied <= msk->rcvq_space.space)
		goto new_measure;

	if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf &&
	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
		int rcvmem, rcvbuf;
		u64 rcvwin, grow;

		rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss;

		grow = rcvwin * (msk->rcvq_space.copied - msk->rcvq_space.space);

		do_div(grow, msk->rcvq_space.space);
		rcvwin += (grow << 1);

		rcvmem = SKB_TRUESIZE(advmss + MAX_TCP_HEADER);
		while (tcp_win_from_space(sk, rcvmem) < advmss)
			rcvmem += 128;

		do_div(rcvwin, advmss);
		rcvbuf = min_t(u64, rcvwin * rcvmem,
			       sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);

		if (rcvbuf > sk->sk_rcvbuf) {
			u32 window_clamp;

			window_clamp = tcp_win_from_space(sk, rcvbuf);
			WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);

			/* Make subflows follow along.  If we do not do this, we
			 * get drops at subflow level if skbs can't be moved to
			 * the mptcp rx queue fast enough (announced rcv_win can
			 * exceed ssk->sk_rcvbuf).
			 */
			mptcp_for_each_subflow(msk, subflow) {
				struct sock *ssk;
				bool slow;

				ssk = mptcp_subflow_tcp_sock(subflow);
				slow = lock_sock_fast(ssk);
				WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf);
				tcp_sk(ssk)->window_clamp = window_clamp;
				tcp_cleanup_rbuf(ssk, 1);
				unlock_sock_fast(ssk, slow);
			}
		}
	}

	msk->rcvq_space.space = msk->rcvq_space.copied;
new_measure:
	msk->rcvq_space.copied = 0;
	msk->rcvq_space.time = mstamp;
}
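/* As the comment above mptcp_rcv_space_adjust() notes, this follows
 * tcp_rcv_space_adjust() but sizes the msk receive buffer from the slowest
 * in-use subflow (largest rtt estimate) and then propagates the grown
 * rcvbuf/window_clamp to every subflow so their advertised windows keep up.
 */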
static void __mptcp_update_rmem(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	if (!msk->rmem_released)
		return;

	atomic_sub(msk->rmem_released, &sk->sk_rmem_alloc);
	sk_mem_uncharge(sk, msk->rmem_released);
	msk->rmem_released = 0;
}
static void __mptcp_splice_receive_queue(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	skb_queue_splice_tail_init(&sk->sk_receive_queue, &msk->receive_queue);
}
static bool __mptcp_move_skbs(struct mptcp_sock *msk, unsigned int rcv)
{
	struct sock *sk = (struct sock *)msk;
	unsigned int moved = 0;
	bool ret, done;

	__mptcp_flush_join_list(msk);
	do {
		struct sock *ssk = mptcp_subflow_recv_lookup(msk);
		bool slowpath;

		/* we can have data pending in the subflows only if the msk
		 * receive buffer was full at subflow_data_ready() time,
		 * that is an unlikely slow path.
		 */
		if (likely(!ssk))
			break;

		slowpath = lock_sock_fast(ssk);
		mptcp_data_lock(sk);
		done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved);
		mptcp_data_unlock(sk);
		if (moved && rcv) {
			WRITE_ONCE(msk->rmem_pending, min(rcv, moved));
			tcp_cleanup_rbuf(ssk, 1);
			WRITE_ONCE(msk->rmem_pending, 0);
		}
		unlock_sock_fast(ssk, slowpath);
	} while (!done);

	/* acquire the data lock only if some input data is pending */
	ret = moved > 0;
	if (!RB_EMPTY_ROOT(&msk->out_of_order_queue) ||
	    !skb_queue_empty_lockless(&sk->sk_receive_queue)) {
		mptcp_data_lock(sk);
		__mptcp_update_rmem(sk);
		ret |= __mptcp_ofo_queue(msk);
		__mptcp_splice_receive_queue(sk);
		mptcp_data_unlock(sk);
	}
	if (ret)
		mptcp_check_data_fin((struct sock *)msk);
	return !skb_queue_empty(&msk->receive_queue);
}
static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
			 int nonblock, int flags, int *addr_len)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	int copied = 0;
	int target;
	long timeo;

	if (msg->msg_flags & ~(MSG_WAITALL | MSG_DONTWAIT))
		return -EOPNOTSUPP;

	mptcp_lock_sock(sk, __mptcp_splice_receive_queue(sk));
	if (unlikely(sk->sk_state == TCP_LISTEN)) {
		copied = -ENOTCONN;
		goto out_err;
	}

	timeo = sock_rcvtimeo(sk, nonblock);

	len = min_t(size_t, len, INT_MAX);
	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);

	while (copied < len) {
		int bytes_read, old_space;

		bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied);
		if (unlikely(bytes_read < 0)) {
			if (!copied)
				copied = bytes_read;
			goto out_err;
		}

		copied += bytes_read;

		if (skb_queue_empty(&msk->receive_queue) &&
		    __mptcp_move_skbs(msk, len - copied))
			continue;

		/* be sure to advertise window change */
		old_space = READ_ONCE(msk->old_wspace);
		if ((tcp_space(sk) - old_space) >= old_space)
			mptcp_cleanup_rbuf(msk);

		/* only the master socket status is relevant here. The exit
		 * conditions mirror closely tcp_recvmsg()
		 */
		if (copied >= target)
			break;

		if (copied) {
			if (sk->sk_err ||
			    sk->sk_state == TCP_CLOSE ||
			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
			    !timeo ||
			    signal_pending(current))
				break;
		} else {
			if (sk->sk_err) {
				copied = sock_error(sk);
				break;
			}

			if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags))
				mptcp_check_for_eof(msk);

			if (sk->sk_shutdown & RCV_SHUTDOWN) {
				/* race breaker: the shutdown could be after the
				 * previous receive queue check
				 */
				if (__mptcp_move_skbs(msk, len - copied))
					continue;
				break;
			}

			if (sk->sk_state == TCP_CLOSE) {
				copied = -ENOTCONN;
				break;
			}

			if (!timeo) {
				copied = -EAGAIN;
				break;
			}

			if (signal_pending(current)) {
				copied = sock_intr_errno(timeo);
				break;
			}
		}

		pr_debug("block timeout %ld", timeo);
		mptcp_wait_data(sk, &timeo);
	}

	if (skb_queue_empty_lockless(&sk->sk_receive_queue) &&
	    skb_queue_empty(&msk->receive_queue)) {
		/* entire backlog drained, clear DATA_READY. */
		clear_bit(MPTCP_DATA_READY, &msk->flags);

		/* .. race-breaker: ssk might have gotten new data
		 * after last __mptcp_move_skbs() returned false.
		 */
		if (unlikely(__mptcp_move_skbs(msk, 0)))
			set_bit(MPTCP_DATA_READY, &msk->flags);
	} else if (unlikely(!test_bit(MPTCP_DATA_READY, &msk->flags))) {
		/* data to read but mptcp_wait_data() cleared DATA_READY */
		set_bit(MPTCP_DATA_READY, &msk->flags);
	}

out_err:
	pr_debug("msk=%p data_ready=%d rx queue empty=%d copied=%d",
		 msk, test_bit(MPTCP_DATA_READY, &msk->flags),
		 skb_queue_empty_lockless(&sk->sk_receive_queue), copied);
	mptcp_rcv_space_adjust(msk, copied);

	release_sock(sk);
	return copied;
}
static void mptcp_retransmit_handler(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	set_bit(MPTCP_WORK_RTX, &msk->flags);
	mptcp_schedule_work(sk);
}
static void mptcp_retransmit_timer(struct timer_list *t)
{
	struct inet_connection_sock *icsk = from_timer(icsk, t,
						       icsk_retransmit_timer);
	struct sock *sk = &icsk->icsk_inet.sk;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		mptcp_retransmit_handler(sk);
	} else {
		/* delegate our work to tcp_release_cb() */
		if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED,
				      &sk->sk_tsq_flags))
			sock_hold(sk);
	}
	bh_unlock_sock(sk);
	sock_put(sk);
}
*t
)
2059 struct sock
*sk
= from_timer(sk
, t
, sk_timer
);
2061 mptcp_schedule_work(sk
);
/* Find an idle subflow.  Return NULL if there is unacked data at tcp
 * level.
 *
 * A backup subflow is returned only if that is the only kind available.
 */
static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow;
	struct sock *backup = NULL;

	sock_owned_by_me((const struct sock *)msk);

	if (__mptcp_check_fallback(msk))
		return NULL;

	mptcp_for_each_subflow(msk, subflow) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

		if (!mptcp_subflow_active(subflow))
			continue;

		/* still data outstanding at TCP level? Don't retransmit. */
		if (!tcp_write_queue_empty(ssk)) {
			if (inet_csk(ssk)->icsk_ca_state >= TCP_CA_Loss)
				continue;
			return NULL;
		}

		if (subflow->backup) {
			if (!backup)
				backup = ssk;
			continue;
		}

		return ssk;
	}

	return backup;
}
/* subflow sockets can be either outgoing (connect) or incoming
 * (accept).
 *
 * Outgoing subflows use in-kernel sockets.
 * Incoming subflows do not have their own 'struct socket' allocated,
 * so we need to use tcp_close() after detaching them from the mptcp
 * parent socket.
 */
void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
		       struct mptcp_subflow_context *subflow)
{
	bool dispose_socket = false;
	struct socket *sock;

	list_del(&subflow->node);

	lock_sock_nested(ssk, SINGLE_DEPTH_NESTING);

	/* if we are invoked by the msk cleanup code, the subflow is
	 * already orphaned
	 */
	sock = ssk->sk_socket;
	if (sock) {
		dispose_socket = sock != sk->sk_socket;
		sock_orphan(ssk);
	}

	subflow->disposable = 1;

	/* if ssk hit tcp_done(), tcp_cleanup_ulp() cleared the related ops
	 * the ssk has been already destroyed, we just need to release the
	 * reference owned by msk;
	 */
	if (!inet_csk(ssk)->icsk_ulp_ops) {
		kfree_rcu(subflow, rcu);
	} else {
		/* otherwise tcp will dispose of the ssk and subflow ctx */
		__tcp_close(ssk, 0);

		/* close acquired an extra ref */
		__sock_put(ssk);
	}
	release_sock(ssk);
	if (dispose_socket)
		iput(SOCK_INODE(sock));

	sock_put(ssk);
}
static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu)
{
	return 0;
}
static void pm_work(struct mptcp_sock *msk)
{
	struct mptcp_pm_data *pm = &msk->pm;

	spin_lock_bh(&msk->pm.lock);

	pr_debug("msk=%p status=%x", msk, pm->status);
	if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) {
		pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED);
		mptcp_pm_nl_add_addr_received(msk);
	}
	if (pm->status & BIT(MPTCP_PM_ADD_ADDR_SEND_ACK)) {
		pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_SEND_ACK);
		mptcp_pm_nl_add_addr_send_ack(msk);
	}
	if (pm->status & BIT(MPTCP_PM_RM_ADDR_RECEIVED)) {
		pm->status &= ~BIT(MPTCP_PM_RM_ADDR_RECEIVED);
		mptcp_pm_nl_rm_addr_received(msk);
	}
	if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) {
		pm->status &= ~BIT(MPTCP_PM_ESTABLISHED);
		mptcp_pm_nl_fully_established(msk);
	}
	if (pm->status & BIT(MPTCP_PM_SUBFLOW_ESTABLISHED)) {
		pm->status &= ~BIT(MPTCP_PM_SUBFLOW_ESTABLISHED);
		mptcp_pm_nl_subflow_established(msk);
	}

	spin_unlock_bh(&msk->pm.lock);
}
static void __mptcp_close_subflow(struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow, *tmp;

	list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

		if (inet_sk_state_load(ssk) != TCP_CLOSE)
			continue;

		__mptcp_close_ssk((struct sock *)msk, ssk, subflow);
	}
}
static bool mptcp_check_close_timeout(const struct sock *sk)
{
	s32 delta = tcp_jiffies32 - inet_csk(sk)->icsk_mtup.probe_timestamp;
	struct mptcp_subflow_context *subflow;

	if (delta >= TCP_TIMEWAIT_LEN)
		return true;

	/* if all subflows are in closed status don't bother with additional
	 * timeout
	 */
	mptcp_for_each_subflow(mptcp_sk(sk), subflow) {
		if (inet_sk_state_load(mptcp_subflow_tcp_sock(subflow)) !=
		    TCP_CLOSE)
			return false;
	}
	return true;
}
static void mptcp_check_fastclose(struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow, *tmp;
	struct sock *sk = &msk->sk.icsk_inet.sk;

	if (likely(!READ_ONCE(msk->rcv_fastclose)))
		return;

	mptcp_token_destroy(msk);

	list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) {
		struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);

		lock_sock(tcp_sk);
		if (tcp_sk->sk_state != TCP_CLOSE) {
			tcp_send_active_reset(tcp_sk, GFP_ATOMIC);
			tcp_set_state(tcp_sk, TCP_CLOSE);
		}
		release_sock(tcp_sk);
	}

	inet_sk_state_store(sk, TCP_CLOSE);
	sk->sk_shutdown = SHUTDOWN_MASK;
	smp_mb__before_atomic(); /* SHUTDOWN must be visible first */
	set_bit(MPTCP_DATA_READY, &msk->flags);
	set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags);

	mptcp_close_wake_up(sk);
}
static void mptcp_worker(struct work_struct *work)
{
	struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work);
	struct sock *ssk, *sk = &msk->sk.icsk_inet.sk;
	struct mptcp_sendmsg_info info = {};
	struct mptcp_data_frag *dfrag;
	size_t copied = 0;
	int state, ret;

	lock_sock(sk);
	state = sk->sk_state;
	if (unlikely(state == TCP_CLOSE))
		goto unlock;

	mptcp_check_data_fin_ack(sk);
	__mptcp_flush_join_list(msk);

	mptcp_check_fastclose(msk);

	if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags))
		__mptcp_close_subflow(msk);

	if (msk->pm.status)
		pm_work(msk);

	if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags))
		mptcp_check_for_eof(msk);

	__mptcp_check_send_data_fin(sk);
	mptcp_check_data_fin(sk);

	/* if the msk data is completely acked, or the socket timedout,
	 * there is no point in keeping around an orphaned sk
	 */
	if (sock_flag(sk, SOCK_DEAD) &&
	    (mptcp_check_close_timeout(sk) ||
	    (state != sk->sk_state &&
	    ((1 << inet_sk_state_load(sk)) & (TCPF_CLOSE | TCPF_FIN_WAIT2))))) {
		inet_sk_state_store(sk, TCP_CLOSE);
		__mptcp_destroy_sock(sk);
		goto unlock;
	}

	if (!test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags))
		goto unlock;

	dfrag = mptcp_rtx_head(sk);
	if (!dfrag)
		goto unlock;

	ssk = mptcp_subflow_get_retrans(msk);
	if (!ssk)
		goto reset_unlock;

	lock_sock(ssk);

	/* limit retransmission to the bytes already sent on some subflows */
	info.sent = 0;
	info.limit = dfrag->already_sent;
	while (info.sent < dfrag->already_sent) {
		if (!mptcp_alloc_tx_skb(sk, ssk))
			break;

		ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
		if (ret <= 0)
			break;

		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS);
		copied += ret;
		info.sent += ret;
	}
	if (copied)
		tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
			 info.size_goal);

	mptcp_set_timeout(sk, ssk);
	release_sock(ssk);

reset_unlock:
	if (!mptcp_timer_pending(sk))
		mptcp_reset_timer(sk);

unlock:
	release_sock(sk);
	sock_put(sk);
}
static int __mptcp_init_sock(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	spin_lock_init(&msk->join_list_lock);

	INIT_LIST_HEAD(&msk->conn_list);
	INIT_LIST_HEAD(&msk->join_list);
	INIT_LIST_HEAD(&msk->rtx_queue);
	INIT_WORK(&msk->work, mptcp_worker);
	__skb_queue_head_init(&msk->receive_queue);
	__skb_queue_head_init(&msk->skb_tx_cache);
	msk->out_of_order_queue = RB_ROOT;
	msk->first_pending = NULL;
	msk->wmem_reserved = 0;
	msk->rmem_released = 0;
	msk->tx_pending_data = 0;
	msk->size_goal_cache = TCP_BASE_MSS;

	msk->ack_hint = NULL;
	msk->first = NULL;
	inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;

	mptcp_pm_data_init(msk);

	/* re-use the csk retrans timer for MPTCP-level retrans */
	timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0);
	timer_setup(&sk->sk_timer, mptcp_timeout_timer, 0);

	return 0;
}
static int mptcp_init_sock(struct sock *sk)
{
	struct net *net = sock_net(sk);
	int ret;

	ret = __mptcp_init_sock(sk);
	if (ret)
		return ret;

	if (!mptcp_is_enabled(net))
		return -ENOPROTOOPT;

	if (unlikely(!net->mib.mptcp_statistics) && !mptcp_mib_alloc(net))
		return -ENOMEM;

	ret = __mptcp_socket_create(mptcp_sk(sk));
	if (ret)
		return ret;

	sk_sockets_allocated_inc(sk);
	sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1];
	sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[1];

	return 0;
}
static void __mptcp_clear_xmit(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct mptcp_data_frag *dtmp, *dfrag;
	struct sk_buff *skb;

	WRITE_ONCE(msk->first_pending, NULL);
	list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list)
		dfrag_clear(sk, dfrag);
	while ((skb = __skb_dequeue(&msk->skb_tx_cache)) != NULL) {
		sk->sk_forward_alloc += skb->truesize;
		kfree_skb(skb);
	}
}
static void mptcp_cancel_work(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	if (cancel_work_sync(&msk->work))
		__sock_put(sk);
}
void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how)
{
	lock_sock(ssk);

	switch (ssk->sk_state) {
	case TCP_LISTEN:
		if (!(how & RCV_SHUTDOWN))
			break;
		fallthrough;
	case TCP_SYN_SENT:
		tcp_disconnect(ssk, O_NONBLOCK);
		break;
	default:
		if (__mptcp_check_fallback(mptcp_sk(sk))) {
			pr_debug("Fallback");
			ssk->sk_shutdown |= how;
			tcp_shutdown(ssk, how);
		} else {
			pr_debug("Sending DATA_FIN on subflow %p", ssk);
			mptcp_set_timeout(sk, ssk);
			tcp_send_ack(ssk);
		}
		break;
	}

	release_sock(ssk);
}
static const unsigned char new_state[16] = {
        /* current state:     new state:      action:	*/
        [0 /* (Invalid) */]	= TCP_CLOSE,
        [TCP_ESTABLISHED]	= TCP_FIN_WAIT1 | TCP_ACTION_FIN,
        [TCP_SYN_SENT]		= TCP_CLOSE,
        [TCP_SYN_RECV]		= TCP_FIN_WAIT1 | TCP_ACTION_FIN,
        [TCP_FIN_WAIT1]		= TCP_FIN_WAIT1,
        [TCP_FIN_WAIT2]		= TCP_FIN_WAIT2,
        [TCP_TIME_WAIT]		= TCP_CLOSE,	/* should not happen ! */
        [TCP_CLOSE]		= TCP_CLOSE,
        [TCP_CLOSE_WAIT]	= TCP_LAST_ACK | TCP_ACTION_FIN,
        [TCP_LAST_ACK]		= TCP_LAST_ACK,
        [TCP_LISTEN]		= TCP_CLOSE,
        [TCP_CLOSING]		= TCP_CLOSING,
        [TCP_NEW_SYN_RECV]	= TCP_CLOSE,	/* should not happen ! */
};
static int mptcp_close_state(struct sock *sk)
{
        int next = (int)new_state[sk->sk_state];
        int ns = next & TCP_STATE_MASK;

        inet_sk_state_store(sk, ns);

        return next & TCP_ACTION_FIN;
}
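
/* Example: closing an ESTABLISHED msk looks up
 * new_state[TCP_ESTABLISHED] == (TCP_FIN_WAIT1 | TCP_ACTION_FIN), so the
 * socket is moved to FIN_WAIT1 and the non-zero TCP_ACTION_FIN return value
 * tells the caller that a DATA_FIN still has to be sent.
 */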
static void __mptcp_check_send_data_fin(struct sock *sk)
{
        struct mptcp_subflow_context *subflow;
        struct mptcp_sock *msk = mptcp_sk(sk);

        pr_debug("msk=%p snd_data_fin_enable=%d pending=%d snd_nxt=%llu write_seq=%llu",
                 msk, msk->snd_data_fin_enable, !!mptcp_send_head(sk),
                 msk->snd_nxt, msk->write_seq);

        /* we still need to enqueue subflows or not really shutting down,
         * skip this
         */
        if (!msk->snd_data_fin_enable || msk->snd_nxt + 1 != msk->write_seq ||
            mptcp_send_head(sk))
                return;

        WRITE_ONCE(msk->snd_nxt, msk->write_seq);

        /* fallback socket will not get data_fin/ack, can move to the next
         * state now
         */
        if (__mptcp_check_fallback(msk)) {
                if ((1 << sk->sk_state) & (TCPF_CLOSING | TCPF_LAST_ACK)) {
                        inet_sk_state_store(sk, TCP_CLOSE);
                        mptcp_close_wake_up(sk);
                } else if (sk->sk_state == TCP_FIN_WAIT1) {
                        inet_sk_state_store(sk, TCP_FIN_WAIT2);
                }
                return;
        }

        __mptcp_flush_join_list(msk);
        mptcp_for_each_subflow(msk, subflow) {
                struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);

                mptcp_subflow_shutdown(sk, tcp_sk, SEND_SHUTDOWN);
        }
}
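
/* The DATA_FIN consumes one unit of MPTCP-level sequence space: that is why
 * __mptcp_wr_shutdown() below bumps write_seq by one, and why the check above
 * only proceeds once "snd_nxt + 1 == write_seq" with no data pending, i.e.
 * when the DATA_FIN itself is the only thing left to send.
 */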
static void __mptcp_wr_shutdown(struct sock *sk)
{
        struct mptcp_sock *msk = mptcp_sk(sk);

        pr_debug("msk=%p snd_data_fin_enable=%d shutdown=%x state=%d pending=%d",
                 msk, msk->snd_data_fin_enable, sk->sk_shutdown, sk->sk_state,
                 !!mptcp_send_head(sk));

        /* will be ignored by fallback sockets */
        WRITE_ONCE(msk->write_seq, msk->write_seq + 1);
        WRITE_ONCE(msk->snd_data_fin_enable, 1);

        __mptcp_check_send_data_fin(sk);
}
static void __mptcp_destroy_sock(struct sock *sk)
{
        struct mptcp_subflow_context *subflow, *tmp;
        struct mptcp_sock *msk = mptcp_sk(sk);
        LIST_HEAD(conn_list);

        pr_debug("msk=%p", msk);

        /* be sure to always acquire the join list lock, to sync vs
         * mptcp_finish_join().
         */
        spin_lock_bh(&msk->join_list_lock);
        list_splice_tail_init(&msk->join_list, &msk->conn_list);
        spin_unlock_bh(&msk->join_list_lock);
        list_splice_init(&msk->conn_list, &conn_list);

        sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer);
        sk_stop_timer(sk, &sk->sk_timer);

        list_for_each_entry_safe(subflow, tmp, &conn_list, node) {
                struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

                __mptcp_close_ssk(sk, ssk, subflow);
        }

        sk->sk_prot->destroy(sk);

        WARN_ON_ONCE(msk->wmem_reserved);
        WARN_ON_ONCE(msk->rmem_released);
        sk_stream_kill_queues(sk);
        xfrm_sk_free_policy(sk);
        sk_refcnt_debug_release(sk);
        sock_put(sk);
}
static void mptcp_close(struct sock *sk, long timeout)
{
        struct mptcp_subflow_context *subflow;
        bool do_cancel_work = false;

        lock_sock(sk);
        sk->sk_shutdown = SHUTDOWN_MASK;

        if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) {
                inet_sk_state_store(sk, TCP_CLOSE);
                goto cleanup;
        }

        if (mptcp_close_state(sk))
                __mptcp_wr_shutdown(sk);

        sk_stream_wait_close(sk, timeout);

cleanup:
        /* orphan all the subflows */
        inet_csk(sk)->icsk_mtup.probe_timestamp = tcp_jiffies32;
        list_for_each_entry(subflow, &mptcp_sk(sk)->conn_list, node) {
                struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
                bool slow, dispose_socket;
                struct socket *sock;

                slow = lock_sock_fast(ssk);
                sock = ssk->sk_socket;
                dispose_socket = sock && sock != sk->sk_socket;
                sock_orphan(ssk);
                unlock_sock_fast(ssk, slow);

                /* for the outgoing subflows we additionally need to free
                 * the associated socket
                 */
                if (dispose_socket)
                        iput(SOCK_INODE(sock));
        }
        sock_orphan(sk);

        sock_hold(sk);
        pr_debug("msk=%p state=%d", sk, sk->sk_state);
        if (sk->sk_state == TCP_CLOSE) {
                __mptcp_destroy_sock(sk);
                do_cancel_work = true;
        } else {
                sk_reset_timer(sk, &sk->sk_timer, jiffies + TCP_TIMEWAIT_LEN);
        }
        release_sock(sk);
        if (do_cancel_work)
                mptcp_cancel_work(sk);
        sock_put(sk);
}
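
/* When the msk is not yet in TCP_CLOSE, the sk_timer armed above gives the
 * MPTCP-level state machine a bounded grace period to finish the shutdown:
 * once the close timeout expires on the orphaned socket, mptcp_worker() moves
 * it to TCP_CLOSE and disposes of it via __mptcp_destroy_sock().
 */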
static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk)
{
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
        const struct ipv6_pinfo *ssk6 = inet6_sk(ssk);
        struct ipv6_pinfo *msk6 = inet6_sk(msk);

        msk->sk_v6_daddr = ssk->sk_v6_daddr;
        msk->sk_v6_rcv_saddr = ssk->sk_v6_rcv_saddr;

        if (msk6 && ssk6) {
                msk6->saddr = ssk6->saddr;
                msk6->flow_label = ssk6->flow_label;
        }
#endif

        inet_sk(msk)->inet_num = inet_sk(ssk)->inet_num;
        inet_sk(msk)->inet_dport = inet_sk(ssk)->inet_dport;
        inet_sk(msk)->inet_sport = inet_sk(ssk)->inet_sport;
        inet_sk(msk)->inet_daddr = inet_sk(ssk)->inet_daddr;
        inet_sk(msk)->inet_saddr = inet_sk(ssk)->inet_saddr;
        inet_sk(msk)->inet_rcv_saddr = inet_sk(ssk)->inet_rcv_saddr;
}
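
/* Mirroring the subflow addresses and ports into the msk keeps the MPTCP
 * socket consistent with the TCP subflow actually carrying the traffic, so
 * that e.g. getsockname()/getpeername() on the MPTCP socket report
 * meaningful values.
 */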
static int mptcp_disconnect(struct sock *sk, int flags)
{
        /* Should never be called.
         * inet_stream_connect() calls ->disconnect, but that
         * refers to the subflow socket, not the mptcp one.
         */
        WARN_ON_ONCE(1);
        return 0;
}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk)
{
        unsigned int offset = sizeof(struct mptcp6_sock) - sizeof(struct ipv6_pinfo);

        return (struct ipv6_pinfo *)(((u8 *)sk) + offset);
}
#endif
struct sock *mptcp_sk_clone(const struct sock *sk,
                            const struct mptcp_options_received *mp_opt,
                            struct request_sock *req)
{
        struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
        struct sock *nsk = sk_clone_lock(sk, GFP_ATOMIC);
        struct mptcp_sock *msk;
        u64 ack_seq;

        if (!nsk)
                return NULL;

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
        if (nsk->sk_family == AF_INET6)
                inet_sk(nsk)->pinet6 = mptcp_inet6_sk(nsk);
#endif

        __mptcp_init_sock(nsk);

        msk = mptcp_sk(nsk);
        msk->local_key = subflow_req->local_key;
        msk->token = subflow_req->token;
        msk->subflow = NULL;
        WRITE_ONCE(msk->fully_established, false);

        msk->write_seq = subflow_req->idsn + 1;
        msk->snd_nxt = msk->write_seq;
        msk->snd_una = msk->write_seq;
        msk->wnd_end = msk->snd_nxt + req->rsk_rcv_wnd;

        if (mp_opt->mp_capable) {
                msk->can_ack = true;
                msk->remote_key = mp_opt->sndr_key;
                mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq);
                ack_seq++;
                WRITE_ONCE(msk->ack_seq, ack_seq);
                WRITE_ONCE(msk->rcv_wnd_sent, ack_seq);
        }

        sock_reset_flag(nsk, SOCK_RCU_FREE);
        /* will be fully established after successful MPC subflow creation */
        inet_sk_state_store(nsk, TCP_SYN_RECV);

        security_inet_csk_clone(nsk, req);
        bh_unlock_sock(nsk);

        /* keep a single reference */
        __sock_put(nsk);
        return nsk;
}
void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk)
{
        const struct tcp_sock *tp = tcp_sk(ssk);

        msk->rcvq_space.copied = 0;
        msk->rcvq_space.rtt_us = 0;

        msk->rcvq_space.time = tp->tcp_mstamp;

        /* initial rcv_space offering made to peer */
        msk->rcvq_space.space = min_t(u32, tp->rcv_wnd,
                                      TCP_INIT_CWND * tp->advmss);
        if (msk->rcvq_space.space == 0)
                msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT;

        WRITE_ONCE(msk->wnd_end, msk->snd_nxt + tcp_sk(ssk)->snd_wnd);
}
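
/* With the usual defines (TCP_INIT_CWND == 10, TCP_MSS_DEFAULT == 536) the
 * fallback above works out to an initial rcvq_space of 10 * 536 = 5360 bytes
 * when the subflow has not learned a receive window or advmss yet.
 */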
static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
                                 bool kern)
{
        struct mptcp_sock *msk = mptcp_sk(sk);
        struct socket *listener;
        struct sock *newsk;

        listener = __mptcp_nmpc_socket(msk);
        if (WARN_ON_ONCE(!listener)) {
                *err = -EINVAL;
                return NULL;
        }

        pr_debug("msk=%p, listener=%p", msk, mptcp_subflow_ctx(listener->sk));
        newsk = inet_csk_accept(listener->sk, flags, err, kern);
        if (!newsk)
                return NULL;

        pr_debug("msk=%p, subflow is mptcp=%d", msk, sk_is_mptcp(newsk));
        if (sk_is_mptcp(newsk)) {
                struct mptcp_subflow_context *subflow;
                struct sock *new_mptcp_sock;

                subflow = mptcp_subflow_ctx(newsk);
                new_mptcp_sock = subflow->conn;

                /* is_mptcp should be false if subflow->conn is missing, see
                 * subflow_syn_recv_sock()
                 */
                if (WARN_ON_ONCE(!new_mptcp_sock)) {
                        tcp_sk(newsk)->is_mptcp = 0;
                        return newsk;
                }

                /* acquire the 2nd reference for the owning socket */
                sock_hold(new_mptcp_sock);
                newsk = new_mptcp_sock;
                MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEACK);
        } else {
                MPTCP_INC_STATS(sock_net(sk),
                                MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK);
        }

        return newsk;
}
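
/* Two outcomes are possible above: if the MP_CAPABLE handshake completed, the
 * owning MPTCP socket (subflow->conn) is returned with an extra reference;
 * otherwise the plain TCP child is handed back and the connection proceeds as
 * a regular TCP fallback.
 */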
void mptcp_destroy_common(struct mptcp_sock *msk)
{
        struct sock *sk = (struct sock *)msk;

        __mptcp_clear_xmit(sk);

        /* move to sk_receive_queue, sk_stream_kill_queues will purge it */
        skb_queue_splice_tail_init(&msk->receive_queue, &sk->sk_receive_queue);

        skb_rbtree_purge(&msk->out_of_order_queue);
        mptcp_token_destroy(msk);
        mptcp_pm_free_anno_list(msk);
}
static void mptcp_destroy(struct sock *sk)
{
        struct mptcp_sock *msk = mptcp_sk(sk);

        mptcp_destroy_common(msk);
        sk_sockets_allocated_dec(sk);
}
static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname,
                                       sockptr_t optval, unsigned int optlen)
{
        struct sock *sk = (struct sock *)msk;
        struct socket *ssock;
        int ret;

        switch (optname) {
        case SO_REUSEPORT:
        case SO_REUSEADDR:
                lock_sock(sk);
                ssock = __mptcp_nmpc_socket(msk);
                if (!ssock) {
                        release_sock(sk);
                        return -EINVAL;
                }

                ret = sock_setsockopt(ssock, SOL_SOCKET, optname, optval, optlen);
                if (ret == 0) {
                        if (optname == SO_REUSEPORT)
                                sk->sk_reuseport = ssock->sk->sk_reuseport;
                        else if (optname == SO_REUSEADDR)
                                sk->sk_reuse = ssock->sk->sk_reuse;
                }
                release_sock(sk);
                return ret;
        }

        return sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, optval, optlen);
}
static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname,
                               sockptr_t optval, unsigned int optlen)
{
        struct sock *sk = (struct sock *)msk;
        int ret = -EOPNOTSUPP;
        struct socket *ssock;

        switch (optname) {
        case IPV6_V6ONLY:
                lock_sock(sk);
                ssock = __mptcp_nmpc_socket(msk);
                if (!ssock) {
                        release_sock(sk);
                        return -EINVAL;
                }

                ret = tcp_setsockopt(ssock->sk, SOL_IPV6, optname, optval, optlen);
                if (ret == 0)
                        sk->sk_ipv6only = ssock->sk->sk_ipv6only;

                release_sock(sk);
                break;
        }

        return ret;
}
static int mptcp_setsockopt(struct sock *sk, int level, int optname,
                            sockptr_t optval, unsigned int optlen)
{
        struct mptcp_sock *msk = mptcp_sk(sk);
        struct sock *ssk;

        pr_debug("msk=%p", msk);

        if (level == SOL_SOCKET)
                return mptcp_setsockopt_sol_socket(msk, optname, optval, optlen);

        /* @@ the meaning of setsockopt() when the socket is connected and
         * there are multiple subflows is not yet defined. It is up to the
         * MPTCP-level socket to configure the subflows until the subflow
         * is in TCP fallback, when TCP socket options are passed through
         * to the one remaining subflow.
         */
        lock_sock(sk);
        ssk = __mptcp_tcp_fallback(msk);
        release_sock(sk);
        if (ssk)
                return tcp_setsockopt(ssk, level, optname, optval, optlen);

        if (level == SOL_IPV6)
                return mptcp_setsockopt_v6(msk, optname, optval, optlen);

        return -EOPNOTSUPP;
}
static int mptcp_getsockopt(struct sock *sk, int level, int optname,
                            char __user *optval, int __user *option)
{
        struct mptcp_sock *msk = mptcp_sk(sk);
        struct sock *ssk;

        pr_debug("msk=%p", msk);

        /* @@ the meaning of getsockopt() when the socket is connected and
         * there are multiple subflows is not yet defined. It is up to the
         * MPTCP-level socket to configure the subflows until the subflow
         * is in TCP fallback, when socket options are passed through
         * to the one remaining subflow.
         */
        lock_sock(sk);
        ssk = __mptcp_tcp_fallback(msk);
        release_sock(sk);
        if (ssk)
                return tcp_getsockopt(ssk, level, optname, optval, option);

        return -EOPNOTSUPP;
}
void __mptcp_data_acked(struct sock *sk)
{
        if (!sock_owned_by_user(sk))
                __mptcp_clean_una(sk);
        else
                set_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->flags);

        if (mptcp_pending_data_fin_ack(sk))
                mptcp_schedule_work(sk);
}
void __mptcp_check_push(struct sock *sk, struct sock *ssk)
{
        if (!mptcp_send_head(sk))
                return;

        if (!sock_owned_by_user(sk))
                __mptcp_subflow_push_pending(sk, ssk);
        else
                set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags);
}
#define MPTCP_DEFERRED_ALL (TCPF_WRITE_TIMER_DEFERRED)

/* processes deferred events and flush wmem */
static void mptcp_release_cb(struct sock *sk)
{
        unsigned long flags, nflags;

        /* push_pending may touch wmem_reserved, do it before the later
         * cleanup
         */
        if (test_and_clear_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->flags))
                __mptcp_clean_una(sk);
        if (test_and_clear_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags)) {
                /* mptcp_push_pending() acquires the subflow socket lock
                 *
                 * 1) can't be invoked in atomic scope
                 * 2) must avoid ABBA deadlock with msk socket spinlock: the RX
                 *    datapath acquires the msk socket spinlock while helding
                 *    the subflow socket lock
                 */

                spin_unlock_bh(&sk->sk_lock.slock);
                mptcp_push_pending(sk, 0);
                spin_lock_bh(&sk->sk_lock.slock);
        }

        /* clear any wmem reservation and errors */
        __mptcp_update_wmem(sk);
        __mptcp_update_rmem(sk);

        do {
                flags = sk->sk_tsq_flags;
                if (!(flags & MPTCP_DEFERRED_ALL))
                        return;
                nflags = flags & ~MPTCP_DEFERRED_ALL;
        } while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);

        sock_release_ownership(sk);

        if (flags & TCPF_WRITE_TIMER_DEFERRED) {
                mptcp_retransmit_handler(sk);
                __sock_put(sk);
        }
}
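
/* The pattern above mirrors what __mptcp_data_acked() and __mptcp_check_push()
 * do on the producer side: when the msk is owned by a user context, softirq
 * handlers only set MPTCP_* flag bits, and the actual work is deferred to this
 * release callback, which runs right before the owner drops the socket lock.
 */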
static int mptcp_hash(struct sock *sk)
{
        /* should never be called,
         * we hash the TCP subflows not the master socket
         */
        WARN_ON_ONCE(1);
        return 0;
}

static void mptcp_unhash(struct sock *sk)
{
        /* called from sk_common_release(), but nothing to do here */
}
static int mptcp_get_port(struct sock *sk, unsigned short snum)
{
        struct mptcp_sock *msk = mptcp_sk(sk);
        struct socket *ssock;

        ssock = __mptcp_nmpc_socket(msk);
        pr_debug("msk=%p, subflow=%p", msk, ssock);
        if (WARN_ON_ONCE(!ssock))
                return -EINVAL;

        return inet_csk_get_port(ssock->sk, snum);
}
void mptcp_finish_connect(struct sock *ssk)
{
        struct mptcp_subflow_context *subflow;
        struct mptcp_sock *msk;
        struct sock *sk;
        u64 ack_seq;

        subflow = mptcp_subflow_ctx(ssk);
        sk = subflow->conn;
        msk = mptcp_sk(sk);

        pr_debug("msk=%p, token=%u", sk, subflow->token);

        mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq);
        ack_seq++;
        subflow->map_seq = ack_seq;
        subflow->map_subflow_seq = 1;

        /* the socket is not connected yet, no msk/subflow ops can access/race
         * accessing the field below
         */
        WRITE_ONCE(msk->remote_key, subflow->remote_key);
        WRITE_ONCE(msk->local_key, subflow->local_key);
        WRITE_ONCE(msk->write_seq, subflow->idsn + 1);
        WRITE_ONCE(msk->snd_nxt, msk->write_seq);
        WRITE_ONCE(msk->ack_seq, ack_seq);
        WRITE_ONCE(msk->rcv_wnd_sent, ack_seq);
        WRITE_ONCE(msk->can_ack, 1);
        WRITE_ONCE(msk->snd_una, msk->write_seq);

        mptcp_pm_new_connection(msk, 0);

        mptcp_rcv_space_init(msk, ssk);
}
static void mptcp_sock_graft(struct sock *sk, struct socket *parent)
{
        write_lock_bh(&sk->sk_callback_lock);
        rcu_assign_pointer(sk->sk_wq, &parent->wq);
        sk_set_socket(sk, parent);
        sk->sk_uid = SOCK_INODE(parent)->i_uid;
        write_unlock_bh(&sk->sk_callback_lock);
}
bool mptcp_finish_join(struct sock *ssk)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
        struct mptcp_sock *msk = mptcp_sk(subflow->conn);
        struct sock *parent = (void *)msk;
        struct socket *parent_sock;
        bool ret;

        pr_debug("msk=%p, subflow=%p", msk, subflow);

        /* mptcp socket already closing? */
        if (!mptcp_is_fully_established(parent))
                return false;

        if (!msk->pm.server_side)
                return true;

        if (!mptcp_pm_allow_new_subflow(msk))
                return false;

        /* active connections are already on conn_list, and we can't acquire
         * msk lock here.
         * use the join list lock as synchronization point and double-check
         * msk status to avoid racing with __mptcp_destroy_sock()
         */
        spin_lock_bh(&msk->join_list_lock);
        ret = inet_sk_state_load(parent) == TCP_ESTABLISHED;
        if (ret && !WARN_ON_ONCE(!list_empty(&subflow->node))) {
                list_add_tail(&subflow->node, &msk->join_list);
                sock_hold(ssk);
        }
        spin_unlock_bh(&msk->join_list_lock);
        if (!ret)
                return false;

        /* attach to msk socket only after we are sure he will deal with us
         * at close time
         */
        parent_sock = READ_ONCE(parent->sk_socket);
        if (parent_sock && !ssk->sk_socket)
                mptcp_sock_graft(ssk, parent_sock);
        subflow->map_seq = READ_ONCE(msk->ack_seq);
        return true;
}
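
/* Subflows added to msk->join_list here are later spliced into msk->conn_list
 * by __mptcp_flush_join_list() (or by __mptcp_destroy_sock() at teardown),
 * which runs under the msk socket lock that cannot be taken in this context.
 */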
static struct proto mptcp_prot = {
        .name		= "MPTCP",
        .owner		= THIS_MODULE,
        .init		= mptcp_init_sock,
        .disconnect	= mptcp_disconnect,
        .close		= mptcp_close,
        .accept		= mptcp_accept,
        .setsockopt	= mptcp_setsockopt,
        .getsockopt	= mptcp_getsockopt,
        .shutdown	= tcp_shutdown,
        .destroy	= mptcp_destroy,
        .sendmsg	= mptcp_sendmsg,
        .recvmsg	= mptcp_recvmsg,
        .release_cb	= mptcp_release_cb,
        .hash		= mptcp_hash,
        .unhash		= mptcp_unhash,
        .get_port	= mptcp_get_port,
        .sockets_allocated	= &mptcp_sockets_allocated,
        .memory_allocated	= &tcp_memory_allocated,
        .memory_pressure	= &tcp_memory_pressure,
        .sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
        .sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
        .sysctl_mem	= sysctl_tcp_mem,
        .obj_size	= sizeof(struct mptcp_sock),
        .slab_flags	= SLAB_TYPESAFE_BY_RCU,
        .no_autobind	= true,
};
static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
        struct mptcp_sock *msk = mptcp_sk(sock->sk);
        struct socket *ssock;
        int err;

        lock_sock(sock->sk);
        ssock = __mptcp_nmpc_socket(msk);
        if (!ssock) {
                err = -EINVAL;
                goto unlock;
        }

        err = ssock->ops->bind(ssock, uaddr, addr_len);
        if (err == 0)
                mptcp_copy_inaddrs(sock->sk, ssock->sk);

unlock:
        release_sock(sock->sk);
        return err;
}
static void mptcp_subflow_early_fallback(struct mptcp_sock *msk,
                                         struct mptcp_subflow_context *subflow)
{
        subflow->request_mptcp = 0;
        __mptcp_do_fallback(msk);
}
static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr,
                                int addr_len, int flags)
{
        struct mptcp_sock *msk = mptcp_sk(sock->sk);
        struct mptcp_subflow_context *subflow;
        struct socket *ssock;
        int err;

        lock_sock(sock->sk);
        if (sock->state != SS_UNCONNECTED && msk->subflow) {
                /* pending connection or invalid state, let existing subflow
                 * cope with that
                 */
                ssock = msk->subflow;
                goto do_connect;
        }

        ssock = __mptcp_nmpc_socket(msk);
        if (!ssock) {
                err = -EINVAL;
                goto unlock;
        }

        mptcp_token_destroy(msk);
        inet_sk_state_store(sock->sk, TCP_SYN_SENT);
        subflow = mptcp_subflow_ctx(ssock->sk);
#ifdef CONFIG_TCP_MD5SIG
        /* no MPTCP if MD5SIG is enabled on this socket or we may run out of
         * TCP option space.
         */
        if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info))
                mptcp_subflow_early_fallback(msk, subflow);
#endif
        if (subflow->request_mptcp && mptcp_token_new_connect(ssock->sk))
                mptcp_subflow_early_fallback(msk, subflow);

do_connect:
        err = ssock->ops->connect(ssock, uaddr, addr_len, flags);
        sock->state = ssock->state;

        /* on successful connect, the msk state will be moved to established by
         * subflow_finish_connect()
         */
        if (!err || err == -EINPROGRESS)
                mptcp_copy_inaddrs(sock->sk, ssock->sk);
        else
                inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk));

unlock:
        release_sock(sock->sk);
        return err;
}
static int mptcp_listen(struct socket *sock, int backlog)
{
        struct mptcp_sock *msk = mptcp_sk(sock->sk);
        struct socket *ssock;
        int err;

        pr_debug("msk=%p", msk);

        lock_sock(sock->sk);
        ssock = __mptcp_nmpc_socket(msk);
        if (!ssock) {
                err = -EINVAL;
                goto unlock;
        }

        mptcp_token_destroy(msk);
        inet_sk_state_store(sock->sk, TCP_LISTEN);
        sock_set_flag(sock->sk, SOCK_RCU_FREE);

        err = ssock->ops->listen(ssock, backlog);
        inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk));
        if (!err)
                mptcp_copy_inaddrs(sock->sk, ssock->sk);

unlock:
        release_sock(sock->sk);
        return err;
}
static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
                               int flags, bool kern)
{
        struct mptcp_sock *msk = mptcp_sk(sock->sk);
        struct socket *ssock;
        int err;

        pr_debug("msk=%p", msk);

        lock_sock(sock->sk);
        if (sock->sk->sk_state != TCP_LISTEN)
                goto unlock_fail;

        ssock = __mptcp_nmpc_socket(msk);
        if (!ssock)
                goto unlock_fail;

        clear_bit(MPTCP_DATA_READY, &msk->flags);
        sock_hold(ssock->sk);
        release_sock(sock->sk);

        err = ssock->ops->accept(sock, newsock, flags, kern);
        if (err == 0 && !mptcp_is_tcpsk(newsock->sk)) {
                struct mptcp_sock *msk = mptcp_sk(newsock->sk);
                struct mptcp_subflow_context *subflow;
                struct sock *newsk = newsock->sk;
                bool slowpath;

                slowpath = lock_sock_fast(newsk);

                /* PM/worker can now acquire the first subflow socket
                 * lock without racing with listener queue cleanup,
                 * we can notify it, if needed.
                 */
                subflow = mptcp_subflow_ctx(msk->first);
                list_add(&subflow->node, &msk->conn_list);
                sock_hold(msk->first);
                if (mptcp_is_fully_established(newsk))
                        mptcp_pm_fully_established(msk);

                mptcp_copy_inaddrs(newsk, msk->first);
                mptcp_rcv_space_init(msk, msk->first);

                /* set ssk->sk_socket of accept()ed flows to mptcp socket.
                 * This is needed so NOSPACE flag can be set from tcp stack.
                 */
                __mptcp_flush_join_list(msk);
                mptcp_for_each_subflow(msk, subflow) {
                        struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

                        if (!ssk->sk_socket)
                                mptcp_sock_graft(ssk, newsock);
                }
                unlock_sock_fast(newsk, slowpath);
        }

        if (inet_csk_listen_poll(ssock->sk))
                set_bit(MPTCP_DATA_READY, &msk->flags);
        sock_put(ssock->sk);
        return err;

unlock_fail:
        release_sock(sock->sk);
        return -EINVAL;
}
static __poll_t mptcp_check_readable(struct mptcp_sock *msk)
{
        return test_bit(MPTCP_DATA_READY, &msk->flags) ? EPOLLIN | EPOLLRDNORM :
               0;
}
static __poll_t mptcp_check_writeable(struct mptcp_sock *msk)
{
        struct sock *sk = (struct sock *)msk;

        if (unlikely(sk->sk_shutdown & SEND_SHUTDOWN))
                return EPOLLOUT | EPOLLWRNORM;

        if (sk_stream_is_writeable(sk))
                return EPOLLOUT | EPOLLWRNORM;

        set_bit(MPTCP_NOSPACE, &msk->flags);
        smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */
        if (sk_stream_is_writeable(sk))
                return EPOLLOUT | EPOLLWRNORM;

        return 0;
}
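
/* The set_bit()/smp_mb__after_atomic()/re-check sequence above closes the race
 * with the write-space callback clearing MPTCP_NOSPACE: either this CPU sees
 * the socket writeable on the second check, or the callback observes the flag
 * and generates the wakeup.
 */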
static __poll_t mptcp_poll(struct file *file, struct socket *sock,
                           struct poll_table_struct *wait)
{
        struct sock *sk = sock->sk;
        struct mptcp_sock *msk;
        __poll_t mask = 0;
        int state;

        msk = mptcp_sk(sk);
        sock_poll_wait(file, sock, wait);

        state = inet_sk_state_load(sk);
        pr_debug("msk=%p state=%d flags=%lx", msk, state, msk->flags);
        if (state == TCP_LISTEN)
                return mptcp_check_readable(msk);

        if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) {
                mask |= mptcp_check_readable(msk);
                mask |= mptcp_check_writeable(msk);
        }
        if (sk->sk_shutdown & RCV_SHUTDOWN)
                mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;

        return mask;
}
static int mptcp_shutdown(struct socket *sock, int how)
{
        struct mptcp_sock *msk = mptcp_sk(sock->sk);
        struct sock *sk = sock->sk;
        int ret = 0;

        pr_debug("sk=%p, how=%d", msk, how);

        lock_sock(sk);

        how++;
        if ((how & ~SHUTDOWN_MASK) || !how) {
                ret = -EINVAL;
                goto out_unlock;
        }

        if (sock->state == SS_CONNECTING) {
                if ((1 << sk->sk_state) &
                    (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE))
                        sock->state = SS_DISCONNECTING;
                else
                        sock->state = SS_CONNECTED;
        }

        sk->sk_shutdown |= how;
        if ((how & SEND_SHUTDOWN) && mptcp_close_state(sk))
                __mptcp_wr_shutdown(sk);

        /* Wake up anyone sleeping in poll. */
        sk->sk_state_change(sk);

out_unlock:
        release_sock(sk);

        return ret;
}
static const struct proto_ops mptcp_stream_ops = {
        .family		   = PF_INET,
        .owner		   = THIS_MODULE,
        .release	   = inet_release,
        .bind		   = mptcp_bind,
        .connect	   = mptcp_stream_connect,
        .socketpair	   = sock_no_socketpair,
        .accept		   = mptcp_stream_accept,
        .getname	   = inet_getname,
        .poll		   = mptcp_poll,
        .ioctl		   = inet_ioctl,
        .gettstamp	   = sock_gettstamp,
        .listen		   = mptcp_listen,
        .shutdown	   = mptcp_shutdown,
        .setsockopt	   = sock_common_setsockopt,
        .getsockopt	   = sock_common_getsockopt,
        .sendmsg	   = inet_sendmsg,
        .recvmsg	   = inet_recvmsg,
        .mmap		   = sock_no_mmap,
        .sendpage	   = inet_sendpage,
};
static struct inet_protosw mptcp_protosw = {
        .type		= SOCK_STREAM,
        .protocol	= IPPROTO_MPTCP,
        .prot		= &mptcp_prot,
        .ops		= &mptcp_stream_ops,
        .flags		= INET_PROTOSW_ICSK,
};
void __init mptcp_proto_init(void)
{
        mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo;

        if (percpu_counter_init(&mptcp_sockets_allocated, 0, GFP_KERNEL))
                panic("Failed to allocate MPTCP pcpu counter\n");

        mptcp_subflow_init();
        mptcp_pm_init();
        mptcp_token_init();

        if (proto_register(&mptcp_prot, 1) != 0)
                panic("Failed to register MPTCP proto.\n");

        inet_register_protosw(&mptcp_protosw);

        BUILD_BUG_ON(sizeof(struct mptcp_skb_cb) > sizeof_field(struct sk_buff, cb));
}
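
/* Once the protosw above is registered, user space can request MPTCP directly
 * at socket creation time, roughly:
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_MPTCP);
 *
 * connect()/bind()/listen()/accept() on that fd then end up in the
 * mptcp_stream_ops / mptcp_prot callbacks defined in this file.
 */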
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static const struct proto_ops mptcp_v6_stream_ops = {
        .family		   = PF_INET6,
        .owner		   = THIS_MODULE,
        .release	   = inet6_release,
        .bind		   = mptcp_bind,
        .connect	   = mptcp_stream_connect,
        .socketpair	   = sock_no_socketpair,
        .accept		   = mptcp_stream_accept,
        .getname	   = inet6_getname,
        .poll		   = mptcp_poll,
        .ioctl		   = inet6_ioctl,
        .gettstamp	   = sock_gettstamp,
        .listen		   = mptcp_listen,
        .shutdown	   = mptcp_shutdown,
        .setsockopt	   = sock_common_setsockopt,
        .getsockopt	   = sock_common_getsockopt,
        .sendmsg	   = inet6_sendmsg,
        .recvmsg	   = inet6_recvmsg,
        .mmap		   = sock_no_mmap,
        .sendpage	   = inet_sendpage,
#ifdef CONFIG_COMPAT
        .compat_ioctl	   = inet6_compat_ioctl,
#endif
};
static struct proto mptcp_v6_prot;

static void mptcp_v6_destroy(struct sock *sk)
{
        mptcp_destroy(sk);
        inet6_destroy_sock(sk);
}

static struct inet_protosw mptcp_v6_protosw = {
        .type		= SOCK_STREAM,
        .protocol	= IPPROTO_MPTCP,
        .prot		= &mptcp_v6_prot,
        .ops		= &mptcp_v6_stream_ops,
        .flags		= INET_PROTOSW_ICSK,
};
int __init mptcp_proto_v6_init(void)
{
        int err;

        mptcp_v6_prot = mptcp_prot;
        strcpy(mptcp_v6_prot.name, "MPTCPv6");
        mptcp_v6_prot.slab = NULL;
        mptcp_v6_prot.destroy = mptcp_v6_destroy;
        mptcp_v6_prot.obj_size = sizeof(struct mptcp6_sock);

        err = proto_register(&mptcp_v6_prot, 1);
        if (err)
                return err;

        err = inet6_register_protosw(&mptcp_v6_protosw);
        if (err)
                proto_unregister(&mptcp_v6_prot);

        return err;
}
#endif