// SPDX-License-Identifier: GPL-2.0-only
/*
 * VMware vSockets Driver
 *
 * Copyright (C) 2009-2013 VMware, Inc. All rights reserved.
 */

#include <linux/types.h>
#include <linux/socket.h>
#include <linux/stddef.h>
#include <net/sock.h>

#include "vmci_transport_notify.h"

#define PKT_FIELD(vsk, field_name) (vmci_trans(vsk)->notify.pkt.field_name)
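
/* All of the bookkeeping below lives in the transport's per-socket notify
 * state reached through PKT_FIELD(): the write notification windows, the
 * peer_waiting_read/write and sent_waiting_read/write flags, the saved peer
 * wait records and the produce/consume queue generation counters used by the
 * waiting-notify optimization.
 */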

static bool vmci_transport_notify_waiting_write(struct vsock_sock *vsk)
{
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	bool retval;
	u64 notify_limit;

	if (!PKT_FIELD(vsk, peer_waiting_write))
		return false;

#ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL
	/* When the sender blocks, we take that as a sign that the sender is
	 * faster than the receiver. To reduce the transmit rate of the sender,
	 * we delay the sending of the read notification by decreasing the
	 * write_notify_window. The notification is delayed until the number of
	 * bytes used in the queue drops below the write_notify_window.
	 */
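	/* Illustrative numbers (not from the original code): with PAGE_SIZE of
	 * 4 KiB, a 64 KiB window and a 4 KiB minimum, the first detected wait
	 * shrinks the window to 60 KiB, the next to 56 KiB, and so on, never
	 * below the minimum; a window already smaller than PAGE_SIZE snaps
	 * straight to the minimum instead.
	 */
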
	if (!PKT_FIELD(vsk, peer_waiting_write_detected)) {
		PKT_FIELD(vsk, peer_waiting_write_detected) = true;
		if (PKT_FIELD(vsk, write_notify_window) < PAGE_SIZE) {
			PKT_FIELD(vsk, write_notify_window) =
			    PKT_FIELD(vsk, write_notify_min_window);
		} else {
			PKT_FIELD(vsk, write_notify_window) -= PAGE_SIZE;
			if (PKT_FIELD(vsk, write_notify_window) <
			    PKT_FIELD(vsk, write_notify_min_window))
				PKT_FIELD(vsk, write_notify_window) =
				    PKT_FIELD(vsk, write_notify_min_window);
		}
	}
	notify_limit = vmci_trans(vsk)->consume_size -
		PKT_FIELD(vsk, write_notify_window);
#else
	notify_limit = 0;
#endif

	/* For now we ignore the wait information and just see if the free
	 * space exceeds the notify limit.  Note that improving this function
	 * to be more intelligent will not require a protocol change and will
	 * retain compatibility between endpoints with mixed versions of this
	 * function.
	 *
	 * The notify_limit is used to delay notifications in the case where
	 * flow control is enabled. Below the test is expressed in terms of
	 * free space in the queue: if free_space > ConsumeSize -
	 * write_notify_window then notify. An alternate way of expressing this
	 * is to rewrite the expression to use the data ready in the receive
	 * queue: if write_notify_window > bufferReady then notify as
	 * free_space == ConsumeSize - bufferReady.
	 */
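	/* A worked example with assumed sizes: for a 64 KiB consume queue and
	 * a 16 KiB write_notify_window, notify_limit is 48 KiB, so the peer
	 * is notified only once free space exceeds 48 KiB, i.e. once fewer
	 * than 16 KiB remain unread in the queue.
	 */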
	retval = vmci_qpair_consume_free_space(vmci_trans(vsk)->qpair) >
		notify_limit;
#ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL
	if (retval) {
		/*
		 * Once we notify the peer, we reset the detected flag so the
		 * next wait will again cause a decrease in the window size.
		 */

		PKT_FIELD(vsk, peer_waiting_write_detected) = false;
	}
#endif
	return retval;
#else
	return true;
#endif
}

static bool vmci_transport_notify_waiting_read(struct vsock_sock *vsk)
{
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	if (!PKT_FIELD(vsk, peer_waiting_read))
		return false;

	/* For now we ignore the wait information and just see if there is any
	 * data for our peer to read.  Note that improving this function to be
	 * more intelligent will not require a protocol change and will retain
	 * compatibility between endpoints with mixed versions of this
	 * function.
	 */
	return vmci_qpair_produce_buf_ready(vmci_trans(vsk)->qpair) > 0;
#else
	return true;
#endif
}

static void
vmci_transport_handle_waiting_read(struct sock *sk,
				   struct vmci_transport_packet *pkt,
				   bool bottom_half,
				   struct sockaddr_vm *dst,
				   struct sockaddr_vm *src)
{
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	struct vsock_sock *vsk;

	vsk = vsock_sk(sk);

	PKT_FIELD(vsk, peer_waiting_read) = true;
	memcpy(&PKT_FIELD(vsk, peer_waiting_read_info), &pkt->u.wait,
	       sizeof(PKT_FIELD(vsk, peer_waiting_read_info)));

	if (vmci_transport_notify_waiting_read(vsk)) {
		bool sent;

		if (bottom_half)
			sent = vmci_transport_send_wrote_bh(dst, src) > 0;
		else
			sent = vmci_transport_send_wrote(sk) > 0;

		if (sent)
			PKT_FIELD(vsk, peer_waiting_read) = false;
	}
#endif
}

static void
vmci_transport_handle_waiting_write(struct sock *sk,
				    struct vmci_transport_packet *pkt,
				    bool bottom_half,
				    struct sockaddr_vm *dst,
				    struct sockaddr_vm *src)
{
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	struct vsock_sock *vsk;

	vsk = vsock_sk(sk);

	PKT_FIELD(vsk, peer_waiting_write) = true;
	memcpy(&PKT_FIELD(vsk, peer_waiting_write_info), &pkt->u.wait,
	       sizeof(PKT_FIELD(vsk, peer_waiting_write_info)));

	if (vmci_transport_notify_waiting_write(vsk)) {
		bool sent;

		if (bottom_half)
			sent = vmci_transport_send_read_bh(dst, src) > 0;
		else
			sent = vmci_transport_send_read(sk) > 0;

		if (sent)
			PKT_FIELD(vsk, peer_waiting_write) = false;
	}
#endif
}

static void
vmci_transport_handle_read(struct sock *sk,
			   struct vmci_transport_packet *pkt,
			   bool bottom_half,
			   struct sockaddr_vm *dst, struct sockaddr_vm *src)
{
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	struct vsock_sock *vsk;

	vsk = vsock_sk(sk);
	PKT_FIELD(vsk, sent_waiting_write) = false;
#endif

	sk->sk_write_space(sk);
}

static bool send_waiting_read(struct sock *sk, u64 room_needed)
{
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	struct vsock_sock *vsk;
	struct vmci_transport_waiting_info waiting_info;
	u64 tail;
	u64 head;
	u64 room_left;
	bool ret;

	vsk = vsock_sk(sk);

	if (PKT_FIELD(vsk, sent_waiting_read))
		return true;

	if (PKT_FIELD(vsk, write_notify_window) <
			vmci_trans(vsk)->consume_size)
		PKT_FIELD(vsk, write_notify_window) =
		    min(PKT_FIELD(vsk, write_notify_window) + PAGE_SIZE,
			vmci_trans(vsk)->consume_size);

	vmci_qpair_get_consume_indexes(vmci_trans(vsk)->qpair, &tail, &head);
	room_left = vmci_trans(vsk)->consume_size - head;
	if (room_needed >= room_left) {
		waiting_info.offset = room_needed - room_left;
		waiting_info.generation =
		    PKT_FIELD(vsk, consume_q_generation) + 1;
	} else {
		waiting_info.offset = head + room_needed;
		waiting_info.generation = PKT_FIELD(vsk, consume_q_generation);
	}

	ret = vmci_transport_send_waiting_read(sk, &waiting_info) > 0;
	if (ret)
		PKT_FIELD(vsk, sent_waiting_read) = true;

	return ret;
#else
	return true;
#endif
}

static bool send_waiting_write(struct sock *sk, u64 room_needed)
{
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	struct vsock_sock *vsk;
	struct vmci_transport_waiting_info waiting_info;
	u64 tail;
	u64 head;
	u64 room_left;
	bool ret;

	vsk = vsock_sk(sk);

	if (PKT_FIELD(vsk, sent_waiting_write))
		return true;

	vmci_qpair_get_produce_indexes(vmci_trans(vsk)->qpair, &tail, &head);
	room_left = vmci_trans(vsk)->produce_size - tail;
	if (room_needed + 1 >= room_left) {
		/* Wraps around to current generation. */
		waiting_info.offset = room_needed + 1 - room_left;
		waiting_info.generation = PKT_FIELD(vsk, produce_q_generation);
	} else {
		waiting_info.offset = tail + room_needed + 1;
		waiting_info.generation =
		    PKT_FIELD(vsk, produce_q_generation) - 1;
	}

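	/* Worked example (illustrative numbers only): with a 64 KiB produce
	 * queue, tail at 62 KiB and room_needed of 4 KiB, room_left is 2 KiB,
	 * so the request wraps and the offset sent to the peer is
	 * room_needed + 1 - room_left = 2 KiB + 1 within the current
	 * generation; without a wrap the offset is simply
	 * tail + room_needed + 1, expressed against the previous generation.
	 */
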
	ret = vmci_transport_send_waiting_write(sk, &waiting_info) > 0;
	if (ret)
		PKT_FIELD(vsk, sent_waiting_write) = true;

	return ret;
#else
	return true;
#endif
}

static int vmci_transport_send_read_notification(struct sock *sk)
{
	struct vsock_sock *vsk;
	bool sent_read;
	unsigned int retries;
	int err;

	vsk = vsock_sk(sk);
	sent_read = false;
	retries = 0;
	err = 0;

	if (vmci_transport_notify_waiting_write(vsk)) {
		/* Notify the peer that we have read, retrying the send on
		 * failure up to our maximum value.  XXX For now we just log
		 * the failure, but later we should schedule a work item to
		 * handle the resend until it succeeds.  That would require
		 * keeping track of work items in the vsk and cleaning them up
		 * upon socket close.
		 */
		while (!(vsk->peer_shutdown & RCV_SHUTDOWN) &&
		       !sent_read &&
		       retries < VMCI_TRANSPORT_MAX_DGRAM_RESENDS) {
			err = vmci_transport_send_read(sk);
			if (err >= 0)
				sent_read = true;

			retries++;
		}

		if (retries >= VMCI_TRANSPORT_MAX_DGRAM_RESENDS)
			pr_err("%p unable to send read notify to peer\n", sk);
		else
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
			PKT_FIELD(vsk, peer_waiting_write) = false;
#endif

	}
	return err;
}

static void
vmci_transport_handle_wrote(struct sock *sk,
			    struct vmci_transport_packet *pkt,
			    bool bottom_half,
			    struct sockaddr_vm *dst, struct sockaddr_vm *src)
{
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	struct vsock_sock *vsk = vsock_sk(sk);

	PKT_FIELD(vsk, sent_waiting_read) = false;
#endif
	sk->sk_data_ready(sk);
}

static void vmci_transport_notify_pkt_socket_init(struct sock *sk)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	PKT_FIELD(vsk, write_notify_window) = PAGE_SIZE;
	PKT_FIELD(vsk, write_notify_min_window) = PAGE_SIZE;
	PKT_FIELD(vsk, peer_waiting_read) = false;
	PKT_FIELD(vsk, peer_waiting_write) = false;
	PKT_FIELD(vsk, peer_waiting_write_detected) = false;
	PKT_FIELD(vsk, sent_waiting_read) = false;
	PKT_FIELD(vsk, sent_waiting_write) = false;
	PKT_FIELD(vsk, produce_q_generation) = 0;
	PKT_FIELD(vsk, consume_q_generation) = 0;

	memset(&PKT_FIELD(vsk, peer_waiting_read_info), 0,
	       sizeof(PKT_FIELD(vsk, peer_waiting_read_info)));
	memset(&PKT_FIELD(vsk, peer_waiting_write_info), 0,
	       sizeof(PKT_FIELD(vsk, peer_waiting_write_info)));
}

static void vmci_transport_notify_pkt_socket_destruct(struct vsock_sock *vsk)
{
}

static int
vmci_transport_notify_pkt_poll_in(struct sock *sk,
				  size_t target, bool *data_ready_now)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	if (vsock_stream_has_data(vsk)) {
		*data_ready_now = true;
	} else {
		/* We can't read right now because there is nothing in the
		 * queue. Ask for notifications when there is something to
		 * read.
		 */
		if (sk->sk_state == TCP_ESTABLISHED) {
			if (!send_waiting_read(sk, 1))
				return -1;

		}
		*data_ready_now = false;
	}

	return 0;
}

static int
vmci_transport_notify_pkt_poll_out(struct sock *sk,
				   size_t target, bool *space_avail_now)
{
	s64 produce_q_free_space;
	struct vsock_sock *vsk = vsock_sk(sk);

	produce_q_free_space = vsock_stream_has_space(vsk);
	if (produce_q_free_space > 0) {
		*space_avail_now = true;
		return 0;
	} else if (produce_q_free_space == 0) {
		/* This is a connected socket but we can't currently send data.
		 * Notify the peer that we are waiting if the queue is full. We
		 * only send a waiting write if the queue is full because
		 * otherwise we end up in an infinite WAITING_WRITE, READ,
		 * WAITING_WRITE, READ, etc. loop. Treat failing to send the
		 * notification as a socket error, passing that back through
		 * the mask.
		 */
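		/* Sketch of the loop being avoided: the peer answers a
		 * WAITING_WRITE with a READ as soon as it has free space
		 * (vmci_transport_handle_waiting_write), and that READ wakes
		 * our writer (vmci_transport_handle_read); if we re-sent
		 * WAITING_WRITE while space was merely low rather than zero,
		 * every READ could immediately provoke another WAITING_WRITE.
		 */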
		if (!send_waiting_write(sk, 1))
			return -1;

		*space_avail_now = false;
	}

	return 0;
}

static int
vmci_transport_notify_pkt_recv_init(
			struct sock *sk,
			size_t target,
			struct vmci_transport_recv_notify_data *data)
{
	struct vsock_sock *vsk = vsock_sk(sk);

#ifdef VSOCK_OPTIMIZATION_WAITING_NOTIFY
	data->consume_head = 0;
	data->produce_tail = 0;
#ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL
	data->notify_on_block = false;

	if (PKT_FIELD(vsk, write_notify_min_window) < target + 1) {
		PKT_FIELD(vsk, write_notify_min_window) = target + 1;
		if (PKT_FIELD(vsk, write_notify_window) <
		    PKT_FIELD(vsk, write_notify_min_window)) {
			/* If the current window is smaller than the new
			 * minimal window size, we need to reevaluate whether
			 * we need to notify the sender. If the number of ready
			 * bytes are smaller than the new window, we need to
			 * send a notification to the sender before we block.
			 */

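			/* Example with assumed numbers: a blocking recv with
			 * a target of 8 KiB on a socket whose window is still
			 * the initial PAGE_SIZE raises
			 * write_notify_min_window to 8 KiB + 1, above the
			 * current window, so the window is bumped to match
			 * and notify_on_block is set; recv_pre_block then
			 * sends a read notification before sleeping.
			 */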
			PKT_FIELD(vsk, write_notify_window) =
			    PKT_FIELD(vsk, write_notify_min_window);
			data->notify_on_block = true;
		}
	}
#endif
#endif

	return 0;
}

static int
vmci_transport_notify_pkt_recv_pre_block(
				struct sock *sk,
				size_t target,
				struct vmci_transport_recv_notify_data *data)
{
	int err = 0;

	/* Notify our peer that we are waiting for data to read. */
	if (!send_waiting_read(sk, target)) {
		err = -EHOSTUNREACH;
		return err;
	}
#ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL
	if (data->notify_on_block) {
		err = vmci_transport_send_read_notification(sk);
		if (err < 0)
			return err;

		data->notify_on_block = false;
	}
#endif

	return err;
}

static int
vmci_transport_notify_pkt_recv_pre_dequeue(
				struct sock *sk,
				size_t target,
				struct vmci_transport_recv_notify_data *data)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	/* Now consume up to len bytes from the queue.  Note that since we have
	 * the socket locked we should copy at least ready bytes.
	 */
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	vmci_qpair_get_consume_indexes(vmci_trans(vsk)->qpair,
				       &data->produce_tail,
				       &data->consume_head);
#endif

	return 0;
}

static int
vmci_transport_notify_pkt_recv_post_dequeue(
				struct sock *sk,
				size_t target,
				ssize_t copied,
				bool data_read,
				struct vmci_transport_recv_notify_data *data)
{
	struct vsock_sock *vsk;
	int err;

	vsk = vsock_sk(sk);
	err = 0;

	if (data_read) {
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
		/* Detect a wrap-around to maintain queue generation.  Note
		 * that this is safe since we hold the socket lock across the
		 * two queue pair operations.
		 */
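		/* Example with assumed sizes: copying 8 KiB when consume_head
		 * was 60 KiB into a 64 KiB queue crosses the end of the ring
		 * (8 KiB >= 64 KiB - 60 KiB), so the consume generation is
		 * bumped and the generation carried by a later WAITING_READ
		 * (see send_waiting_read) stays in step with the ring.
		 */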
		if (copied >=
		    vmci_trans(vsk)->consume_size - data->consume_head)
			PKT_FIELD(vsk, consume_q_generation)++;
#endif

		err = vmci_transport_send_read_notification(sk);
		if (err < 0)
			return err;

	}

	return err;
}

static int
vmci_transport_notify_pkt_send_init(
			struct sock *sk,
			struct vmci_transport_send_notify_data *data)
{
#ifdef VSOCK_OPTIMIZATION_WAITING_NOTIFY
	data->consume_head = 0;
	data->produce_tail = 0;
#endif

	return 0;
}

static int
vmci_transport_notify_pkt_send_pre_block(
				struct sock *sk,
				struct vmci_transport_send_notify_data *data)
{
	/* Notify our peer that we are waiting for room to write. */
	if (!send_waiting_write(sk, 1))
		return -EHOSTUNREACH;

	return 0;
}

static int
vmci_transport_notify_pkt_send_pre_enqueue(
				struct sock *sk,
				struct vmci_transport_send_notify_data *data)
{
	struct vsock_sock *vsk = vsock_sk(sk);

#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	vmci_qpair_get_produce_indexes(vmci_trans(vsk)->qpair,
				       &data->produce_tail,
				       &data->consume_head);
#endif

	return 0;
}

static int
vmci_transport_notify_pkt_send_post_enqueue(
				struct sock *sk,
				ssize_t written,
				struct vmci_transport_send_notify_data *data)
{
	int err = 0;
	struct vsock_sock *vsk;
	bool sent_wrote = false;
	int retries = 0;

	vsk = vsock_sk(sk);

#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	/* Detect a wrap-around to maintain queue generation.  Note that this
	 * is safe since we hold the socket lock across the two queue pair
	 * operations.
	 */
	if (written >= vmci_trans(vsk)->produce_size - data->produce_tail)
		PKT_FIELD(vsk, produce_q_generation)++;
#endif

	if (vmci_transport_notify_waiting_read(vsk)) {
		/* Notify the peer that we have written, retrying the send on
		 * failure up to our maximum value. See the XXX comment for the
		 * corresponding piece of code in StreamRecvmsg() for potential
		 * improvements.
		 */
		while (!(vsk->peer_shutdown & RCV_SHUTDOWN) &&
		       !sent_wrote &&
		       retries < VMCI_TRANSPORT_MAX_DGRAM_RESENDS) {
			err = vmci_transport_send_wrote(sk);
			if (err >= 0)
				sent_wrote = true;

			retries++;
		}

		if (retries >= VMCI_TRANSPORT_MAX_DGRAM_RESENDS) {
			pr_err("%p unable to send wrote notify to peer\n", sk);
			return err;
		} else {
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
			PKT_FIELD(vsk, peer_waiting_read) = false;
#endif
		}
	}

	return err;
}

static void
vmci_transport_notify_pkt_handle_pkt(
			struct sock *sk,
			struct vmci_transport_packet *pkt,
			bool bottom_half,
			struct sockaddr_vm *dst,
			struct sockaddr_vm *src, bool *pkt_processed)
{
	bool processed = false;

	switch (pkt->type) {
	case VMCI_TRANSPORT_PACKET_TYPE_WROTE:
		vmci_transport_handle_wrote(sk, pkt, bottom_half, dst, src);
		processed = true;
		break;
	case VMCI_TRANSPORT_PACKET_TYPE_READ:
		vmci_transport_handle_read(sk, pkt, bottom_half, dst, src);
		processed = true;
		break;
	case VMCI_TRANSPORT_PACKET_TYPE_WAITING_WRITE:
		vmci_transport_handle_waiting_write(sk, pkt, bottom_half,
						    dst, src);
		processed = true;
		break;

	case VMCI_TRANSPORT_PACKET_TYPE_WAITING_READ:
		vmci_transport_handle_waiting_read(sk, pkt, bottom_half,
						   dst, src);
		processed = true;
		break;
	}

	if (pkt_processed)
		*pkt_processed = processed;
}

static void vmci_transport_notify_pkt_process_request(struct sock *sk)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	PKT_FIELD(vsk, write_notify_window) = vmci_trans(vsk)->consume_size;
	if (vmci_trans(vsk)->consume_size <
		PKT_FIELD(vsk, write_notify_min_window))
		PKT_FIELD(vsk, write_notify_min_window) =
			vmci_trans(vsk)->consume_size;
}

static void vmci_transport_notify_pkt_process_negotiate(struct sock *sk)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	PKT_FIELD(vsk, write_notify_window) = vmci_trans(vsk)->consume_size;
	if (vmci_trans(vsk)->consume_size <
		PKT_FIELD(vsk, write_notify_min_window))
		PKT_FIELD(vsk, write_notify_min_window) =
			vmci_trans(vsk)->consume_size;
}

/* Socket control packet based operations. */
const struct vmci_transport_notify_ops vmci_transport_notify_pkt_ops = {
	.socket_init = vmci_transport_notify_pkt_socket_init,
	.socket_destruct = vmci_transport_notify_pkt_socket_destruct,
	.poll_in = vmci_transport_notify_pkt_poll_in,
	.poll_out = vmci_transport_notify_pkt_poll_out,
	.handle_notify_pkt = vmci_transport_notify_pkt_handle_pkt,
	.recv_init = vmci_transport_notify_pkt_recv_init,
	.recv_pre_block = vmci_transport_notify_pkt_recv_pre_block,
	.recv_pre_dequeue = vmci_transport_notify_pkt_recv_pre_dequeue,
	.recv_post_dequeue = vmci_transport_notify_pkt_recv_post_dequeue,
	.send_init = vmci_transport_notify_pkt_send_init,
	.send_pre_block = vmci_transport_notify_pkt_send_pre_block,
	.send_pre_enqueue = vmci_transport_notify_pkt_send_pre_enqueue,
	.send_post_enqueue = vmci_transport_notify_pkt_send_post_enqueue,
	.process_request = vmci_transport_notify_pkt_process_request,
	.process_negotiate = vmci_transport_notify_pkt_process_negotiate,
};