/*
 * VMware vSockets Driver
 *
 * Copyright (C) 2009-2013 VMware, Inc. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation version 2 and no later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <linux/types.h>
#include <linux/socket.h>
#include <linux/stddef.h>
#include <net/sock.h>

#include "vmci_transport_notify.h"

#define PKT_FIELD(vsk, field_name) (vmci_trans(vsk)->notify.pkt.field_name)

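/* Decide whether a peer that has signalled it is waiting to write should be
 * sent a READ notification now. With flow control enabled, the notification
 * is held back until the free space in the consume queue exceeds
 * consume_size - write_notify_window.
 */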
static bool vmci_transport_notify_waiting_write(struct vsock_sock *vsk)
{
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	bool retval;
	u64 notify_limit;

	if (!PKT_FIELD(vsk, peer_waiting_write))
		return false;

#ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL
	/* When the sender blocks, we take that as a sign that the sender is
	 * faster than the receiver. To reduce the transmit rate of the sender,
	 * we delay the sending of the read notification by decreasing the
	 * write_notify_window. The notification is delayed until the number of
	 * bytes used in the queue drops below the write_notify_window.
	 */

	if (!PKT_FIELD(vsk, peer_waiting_write_detected)) {
		PKT_FIELD(vsk, peer_waiting_write_detected) = true;
		if (PKT_FIELD(vsk, write_notify_window) < PAGE_SIZE) {
			PKT_FIELD(vsk, write_notify_window) =
			    PKT_FIELD(vsk, write_notify_min_window);
		} else {
			PKT_FIELD(vsk, write_notify_window) -= PAGE_SIZE;
			if (PKT_FIELD(vsk, write_notify_window) <
			    PKT_FIELD(vsk, write_notify_min_window))
				PKT_FIELD(vsk, write_notify_window) =
				    PKT_FIELD(vsk, write_notify_min_window);
		}
	}
	notify_limit = vmci_trans(vsk)->consume_size -
		PKT_FIELD(vsk, write_notify_window);
#else
	notify_limit = 0;
#endif

	/* For now we ignore the wait information and just see if the free
	 * space exceeds the notify limit.  Note that improving this function
	 * to be more intelligent will not require a protocol change and will
	 * retain compatibility between endpoints with mixed versions of this
	 * function.
	 *
	 * The notify_limit is used to delay notifications in the case where
	 * flow control is enabled. Below the test is expressed in terms of
	 * free space in the queue: if free_space > ConsumeSize -
	 * write_notify_window then notify. An alternate way of expressing this
	 * is to rewrite the expression to use the data ready in the receive
	 * queue: if write_notify_window > bufferReady then notify, as
	 * free_space == ConsumeSize - bufferReady.
	 */
	retval = vmci_qpair_consume_free_space(vmci_trans(vsk)->qpair) >
		notify_limit;
#ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL
	if (retval) {
		/* Once we notify the peer, we reset the detected flag so the
		 * next wait will again cause a decrease in the window size.
		 */
		PKT_FIELD(vsk, peer_waiting_write_detected) = false;
	}
#endif
	return retval;
#else
	return true;
#endif
}

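/* Decide whether a peer that is waiting for data to read should be sent a
 * WROTE notification: only when there is produced data ready in the queue.
 */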
static bool vmci_transport_notify_waiting_read(struct vsock_sock *vsk)
{
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	if (!PKT_FIELD(vsk, peer_waiting_read))
		return false;

	/* For now we ignore the wait information and just see if there is any
	 * data for our peer to read.  Note that improving this function to be
	 * more intelligent will not require a protocol change and will retain
	 * compatibility between endpoints with mixed versions of this
	 * function.
	 */
	return vmci_qpair_produce_buf_ready(vmci_trans(vsk)->qpair) > 0;
#else
	return true;
#endif
}

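/* Handle an incoming WAITING_READ packet: record the peer's wait info and,
 * if data is already available, send the WROTE notification immediately.
 */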
static void
vmci_transport_handle_waiting_read(struct sock *sk,
				   struct vmci_transport_packet *pkt,
				   bool bottom_half,
				   struct sockaddr_vm *dst,
				   struct sockaddr_vm *src)
{
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	struct vsock_sock *vsk;

	vsk = vsock_sk(sk);

	PKT_FIELD(vsk, peer_waiting_read) = true;
	memcpy(&PKT_FIELD(vsk, peer_waiting_read_info), &pkt->u.wait,
	       sizeof(PKT_FIELD(vsk, peer_waiting_read_info)));

	if (vmci_transport_notify_waiting_read(vsk)) {
		bool sent;

		if (bottom_half)
			sent = vmci_transport_send_wrote_bh(dst, src) > 0;
		else
			sent = vmci_transport_send_wrote(sk) > 0;

		if (sent)
			PKT_FIELD(vsk, peer_waiting_read) = false;
	}
#endif
}

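/* Handle an incoming WAITING_WRITE packet: record the peer's wait info and,
 * if enough space has already been freed, send the READ notification
 * immediately.
 */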
static void
vmci_transport_handle_waiting_write(struct sock *sk,
				    struct vmci_transport_packet *pkt,
				    bool bottom_half,
				    struct sockaddr_vm *dst,
				    struct sockaddr_vm *src)
{
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	struct vsock_sock *vsk;

	vsk = vsock_sk(sk);

	PKT_FIELD(vsk, peer_waiting_write) = true;
	memcpy(&PKT_FIELD(vsk, peer_waiting_write_info), &pkt->u.wait,
	       sizeof(PKT_FIELD(vsk, peer_waiting_write_info)));

	if (vmci_transport_notify_waiting_write(vsk)) {
		bool sent;

		if (bottom_half)
			sent = vmci_transport_send_read_bh(dst, src) > 0;
		else
			sent = vmci_transport_send_read(sk) > 0;

		if (sent)
			PKT_FIELD(vsk, peer_waiting_write) = false;
	}
#endif
}

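/* Handle an incoming READ packet: the peer has consumed data, so writers
 * blocked on this socket may make progress again.
 */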
static void
vmci_transport_handle_read(struct sock *sk,
			   struct vmci_transport_packet *pkt,
			   bool bottom_half,
			   struct sockaddr_vm *dst, struct sockaddr_vm *src)
{
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	struct vsock_sock *vsk;

	vsk = vsock_sk(sk);
	PKT_FIELD(vsk, sent_waiting_write) = false;
#endif

	sk->sk_write_space(sk);
}

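/* Tell the peer that we are waiting for room_needed bytes of data to read,
 * recording the consume-queue offset and generation at which the wait will
 * be satisfied. Sent at most once until a WROTE packet clears the flag.
 */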
static bool send_waiting_read(struct sock *sk, u64 room_needed)
{
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	struct vsock_sock *vsk;
	struct vmci_transport_waiting_info waiting_info;
	u64 tail;
	u64 head;
	u64 room_left;
	bool ret;

	vsk = vsock_sk(sk);

	if (PKT_FIELD(vsk, sent_waiting_read))
		return true;

	if (PKT_FIELD(vsk, write_notify_window) <
			vmci_trans(vsk)->consume_size)
		PKT_FIELD(vsk, write_notify_window) =
		    min(PKT_FIELD(vsk, write_notify_window) + PAGE_SIZE,
			vmci_trans(vsk)->consume_size);

	vmci_qpair_get_consume_indexes(vmci_trans(vsk)->qpair, &tail, &head);
	room_left = vmci_trans(vsk)->consume_size - head;
	if (room_needed >= room_left) {
		waiting_info.offset = room_needed - room_left;
		waiting_info.generation =
		    PKT_FIELD(vsk, consume_q_generation) + 1;
	} else {
		waiting_info.offset = head + room_needed;
		waiting_info.generation = PKT_FIELD(vsk, consume_q_generation);
	}

	ret = vmci_transport_send_waiting_read(sk, &waiting_info) > 0;
	if (ret)
		PKT_FIELD(vsk, sent_waiting_read) = true;

	return ret;
#else
	return true;
#endif
}

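/* Tell the peer that we are waiting for room_needed bytes of space to write,
 * recording the produce-queue offset and generation at which the wait will be
 * satisfied. Sent at most once until a READ packet clears the flag.
 */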
static bool send_waiting_write(struct sock *sk, u64 room_needed)
{
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	struct vsock_sock *vsk;
	struct vmci_transport_waiting_info waiting_info;
	u64 tail;
	u64 head;
	u64 room_left;
	bool ret;

	vsk = vsock_sk(sk);

	if (PKT_FIELD(vsk, sent_waiting_write))
		return true;

	vmci_qpair_get_produce_indexes(vmci_trans(vsk)->qpair, &tail, &head);
	room_left = vmci_trans(vsk)->produce_size - tail;
	if (room_needed + 1 >= room_left) {
		/* Wraps around to current generation. */
		waiting_info.offset = room_needed + 1 - room_left;
		waiting_info.generation = PKT_FIELD(vsk, produce_q_generation);
	} else {
		waiting_info.offset = tail + room_needed + 1;
		waiting_info.generation =
		    PKT_FIELD(vsk, produce_q_generation) - 1;
	}

	ret = vmci_transport_send_waiting_write(sk, &waiting_info) > 0;
	if (ret)
		PKT_FIELD(vsk, sent_waiting_write) = true;

	return ret;
#else
	return true;
#endif
}

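/* Send a READ notification to a peer that is waiting to write, retrying up
 * to VMCI_TRANSPORT_MAX_DGRAM_RESENDS times before giving up and logging an
 * error.
 */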
static int vmci_transport_send_read_notification(struct sock *sk)
{
	struct vsock_sock *vsk;
	bool sent_read;
	unsigned int retries;
	int err;

	vsk = vsock_sk(sk);
	sent_read = false;
	retries = 0;
	err = 0;

	if (vmci_transport_notify_waiting_write(vsk)) {
		/* Notify the peer that we have read, retrying the send on
		 * failure up to our maximum value.  XXX For now we just log
		 * the failure, but later we should schedule a work item to
		 * handle the resend until it succeeds.  That would require
		 * keeping track of work items in the vsk and cleaning them up
		 * upon socket close.
		 */
		while (!(vsk->peer_shutdown & RCV_SHUTDOWN) &&
		       !sent_read &&
		       retries < VMCI_TRANSPORT_MAX_DGRAM_RESENDS) {
			err = vmci_transport_send_read(sk);
			if (err >= 0)
				sent_read = true;

			retries++;
		}

		if (retries >= VMCI_TRANSPORT_MAX_DGRAM_RESENDS)
			pr_err("%p unable to send read notify to peer\n", sk);
		else
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
			PKT_FIELD(vsk, peer_waiting_write) = false;
#endif

	}
	return err;
}

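/* Handle an incoming WROTE packet: the peer has produced data, so wake up
 * readers waiting on this socket.
 */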
static void
vmci_transport_handle_wrote(struct sock *sk,
			    struct vmci_transport_packet *pkt,
			    bool bottom_half,
			    struct sockaddr_vm *dst, struct sockaddr_vm *src)
{
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	struct vsock_sock *vsk = vsock_sk(sk);

	PKT_FIELD(vsk, sent_waiting_read) = false;
#endif
	sk->sk_data_ready(sk);
}

static void vmci_transport_notify_pkt_socket_init(struct sock *sk)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	PKT_FIELD(vsk, write_notify_window) = PAGE_SIZE;
	PKT_FIELD(vsk, write_notify_min_window) = PAGE_SIZE;
	PKT_FIELD(vsk, peer_waiting_read) = false;
	PKT_FIELD(vsk, peer_waiting_write) = false;
	PKT_FIELD(vsk, peer_waiting_write_detected) = false;
	PKT_FIELD(vsk, sent_waiting_read) = false;
	PKT_FIELD(vsk, sent_waiting_write) = false;
	PKT_FIELD(vsk, produce_q_generation) = 0;
	PKT_FIELD(vsk, consume_q_generation) = 0;

	memset(&PKT_FIELD(vsk, peer_waiting_read_info), 0,
	       sizeof(PKT_FIELD(vsk, peer_waiting_read_info)));
	memset(&PKT_FIELD(vsk, peer_waiting_write_info), 0,
	       sizeof(PKT_FIELD(vsk, peer_waiting_write_info)));
}

static void vmci_transport_notify_pkt_socket_destruct(struct vsock_sock *vsk)
{
}

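/* Poll helpers: report readability/writability and, when the queue is empty
 * or full on an established connection, arm a waiting notification so the
 * peer wakes us up.
 */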
static int
vmci_transport_notify_pkt_poll_in(struct sock *sk,
				  size_t target, bool *data_ready_now)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	if (vsock_stream_has_data(vsk)) {
		*data_ready_now = true;
	} else {
		/* We can't read right now because there is nothing in the
		 * queue. Ask for notifications when there is something to
		 * read.
		 */
		if (sk->sk_state == TCP_ESTABLISHED) {
			if (!send_waiting_read(sk, 1))
				return -1;

		}
		*data_ready_now = false;
	}

	return 0;
}

static int
vmci_transport_notify_pkt_poll_out(struct sock *sk,
				   size_t target, bool *space_avail_now)
{
	s64 produce_q_free_space;
	struct vsock_sock *vsk = vsock_sk(sk);

	produce_q_free_space = vsock_stream_has_space(vsk);
	if (produce_q_free_space > 0) {
		*space_avail_now = true;
		return 0;
	} else if (produce_q_free_space == 0) {
		/* This is a connected socket but we can't currently send data.
		 * Notify the peer that we are waiting if the queue is full. We
		 * only send a waiting write if the queue is full because
		 * otherwise we end up in an infinite WAITING_WRITE, READ,
		 * WAITING_WRITE, READ, etc. loop. Treat failing to send the
		 * notification as a socket error, passing that back through
		 * the mask.
		 */
		if (!send_waiting_write(sk, 1))
			return -1;

		*space_avail_now = false;
	}

	return 0;
}

static int
vmci_transport_notify_pkt_recv_init(
			struct sock *sk,
			size_t target,
			struct vmci_transport_recv_notify_data *data)
{
	struct vsock_sock *vsk = vsock_sk(sk);

#ifdef VSOCK_OPTIMIZATION_WAITING_NOTIFY
	data->consume_head = 0;
	data->produce_tail = 0;
#ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL
	data->notify_on_block = false;

	if (PKT_FIELD(vsk, write_notify_min_window) < target + 1) {
		PKT_FIELD(vsk, write_notify_min_window) = target + 1;
		if (PKT_FIELD(vsk, write_notify_window) <
		    PKT_FIELD(vsk, write_notify_min_window)) {
			/* If the current window is smaller than the new
			 * minimal window size, we need to reevaluate whether
			 * we need to notify the sender. If the number of ready
			 * bytes is smaller than the new window, we need to
			 * send a notification to the sender before we block.
			 */

			PKT_FIELD(vsk, write_notify_window) =
			    PKT_FIELD(vsk, write_notify_min_window);
			data->notify_on_block = true;
		}
	}
#endif
#endif

	return 0;
}

static int
vmci_transport_notify_pkt_recv_pre_block(
				struct sock *sk,
				size_t target,
				struct vmci_transport_recv_notify_data *data)
{
	int err = 0;

	/* Notify our peer that we are waiting for data to read. */
	if (!send_waiting_read(sk, target)) {
		err = -EHOSTUNREACH;
		return err;
	}
#ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL
	if (data->notify_on_block) {
		err = vmci_transport_send_read_notification(sk);
		if (err < 0)
			return err;

		data->notify_on_block = false;
	}
#endif

	return err;
}

static int
vmci_transport_notify_pkt_recv_pre_dequeue(
				struct sock *sk,
				size_t target,
				struct vmci_transport_recv_notify_data *data)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	/* Now consume up to len bytes from the queue.  Note that since we have
	 * the socket locked we should copy at least ready bytes.
	 */
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	vmci_qpair_get_consume_indexes(vmci_trans(vsk)->qpair,
				       &data->produce_tail,
				       &data->consume_head);
#endif

	return 0;
}

static int
vmci_transport_notify_pkt_recv_post_dequeue(
				struct sock *sk,
				size_t target,
				ssize_t copied,
				bool data_read,
				struct vmci_transport_recv_notify_data *data)
{
	struct vsock_sock *vsk;
	int err;

	vsk = vsock_sk(sk);
	err = 0;

	if (data_read) {
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
		/* Detect a wrap-around to maintain queue generation.  Note
		 * that this is safe since we hold the socket lock across the
		 * two queue pair operations.
		 */
		if (copied >=
			vmci_trans(vsk)->consume_size - data->consume_head)
			PKT_FIELD(vsk, consume_q_generation)++;
#endif

		err = vmci_transport_send_read_notification(sk);
		if (err < 0)
			return err;

	}
	return err;
}

static int
vmci_transport_notify_pkt_send_init(
			struct sock *sk,
			struct vmci_transport_send_notify_data *data)
{
#ifdef VSOCK_OPTIMIZATION_WAITING_NOTIFY
	data->consume_head = 0;
	data->produce_tail = 0;
#endif

	return 0;
}

static int
vmci_transport_notify_pkt_send_pre_block(
				struct sock *sk,
				struct vmci_transport_send_notify_data *data)
{
	/* Notify our peer that we are waiting for room to write. */
	if (!send_waiting_write(sk, 1))
		return -EHOSTUNREACH;

	return 0;
}

static int
vmci_transport_notify_pkt_send_pre_enqueue(
				struct sock *sk,
				struct vmci_transport_send_notify_data *data)
{
	struct vsock_sock *vsk = vsock_sk(sk);

#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	vmci_qpair_get_produce_indexes(vmci_trans(vsk)->qpair,
				       &data->produce_tail,
				       &data->consume_head);
#endif

	return 0;
}

static int
vmci_transport_notify_pkt_send_post_enqueue(
				struct sock *sk,
				ssize_t written,
				struct vmci_transport_send_notify_data *data)
{
	int err = 0;
	struct vsock_sock *vsk;
	bool sent_wrote = false;
	int retries = 0;

	vsk = vsock_sk(sk);

#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	/* Detect a wrap-around to maintain queue generation.  Note that this
	 * is safe since we hold the socket lock across the two queue pair
	 * operations.
	 */
	if (written >= vmci_trans(vsk)->produce_size - data->produce_tail)
		PKT_FIELD(vsk, produce_q_generation)++;

#endif

	if (vmci_transport_notify_waiting_read(vsk)) {
		/* Notify the peer that we have written, retrying the send on
		 * failure up to our maximum value. See the XXX comment for the
		 * corresponding piece of code in StreamRecvmsg() for potential
		 * improvements.
		 */
		while (!(vsk->peer_shutdown & RCV_SHUTDOWN) &&
		       !sent_wrote &&
		       retries < VMCI_TRANSPORT_MAX_DGRAM_RESENDS) {
			err = vmci_transport_send_wrote(sk);
			if (err >= 0)
				sent_wrote = true;

			retries++;
		}

		if (retries >= VMCI_TRANSPORT_MAX_DGRAM_RESENDS) {
			pr_err("%p unable to send wrote notify to peer\n", sk);
			return err;
		} else {
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
			PKT_FIELD(vsk, peer_waiting_read) = false;
#endif
		}
	}
	return err;
}

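/* Dispatch an incoming notify control packet to the handler for its type and
 * report whether it was consumed.
 */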
static void
vmci_transport_notify_pkt_handle_pkt(
			struct sock *sk,
			struct vmci_transport_packet *pkt,
			bool bottom_half,
			struct sockaddr_vm *dst,
			struct sockaddr_vm *src, bool *pkt_processed)
{
	bool processed = false;

	switch (pkt->type) {
	case VMCI_TRANSPORT_PACKET_TYPE_WROTE:
		vmci_transport_handle_wrote(sk, pkt, bottom_half, dst, src);
		processed = true;
		break;
	case VMCI_TRANSPORT_PACKET_TYPE_READ:
		vmci_transport_handle_read(sk, pkt, bottom_half, dst, src);
		processed = true;
		break;
	case VMCI_TRANSPORT_PACKET_TYPE_WAITING_WRITE:
		vmci_transport_handle_waiting_write(sk, pkt, bottom_half,
						    dst, src);
		processed = true;
		break;
	case VMCI_TRANSPORT_PACKET_TYPE_WAITING_READ:
		vmci_transport_handle_waiting_read(sk, pkt, bottom_half,
						   dst, src);
		processed = true;
		break;
	}

	if (pkt_processed)
		*pkt_processed = processed;
}

static void vmci_transport_notify_pkt_process_request(struct sock *sk)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	PKT_FIELD(vsk, write_notify_window) = vmci_trans(vsk)->consume_size;
	if (vmci_trans(vsk)->consume_size <
		PKT_FIELD(vsk, write_notify_min_window))
		PKT_FIELD(vsk, write_notify_min_window) =
			vmci_trans(vsk)->consume_size;
}

static void vmci_transport_notify_pkt_process_negotiate(struct sock *sk)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	PKT_FIELD(vsk, write_notify_window) = vmci_trans(vsk)->consume_size;
	if (vmci_trans(vsk)->consume_size <
		PKT_FIELD(vsk, write_notify_min_window))
		PKT_FIELD(vsk, write_notify_min_window) =
			vmci_trans(vsk)->consume_size;
}

/* Socket control packet based operations. */
const struct vmci_transport_notify_ops vmci_transport_notify_pkt_ops = {
	.socket_init = vmci_transport_notify_pkt_socket_init,
	.socket_destruct = vmci_transport_notify_pkt_socket_destruct,
	.poll_in = vmci_transport_notify_pkt_poll_in,
	.poll_out = vmci_transport_notify_pkt_poll_out,
	.handle_notify_pkt = vmci_transport_notify_pkt_handle_pkt,
	.recv_init = vmci_transport_notify_pkt_recv_init,
	.recv_pre_block = vmci_transport_notify_pkt_recv_pre_block,
	.recv_pre_dequeue = vmci_transport_notify_pkt_recv_pre_dequeue,
	.recv_post_dequeue = vmci_transport_notify_pkt_recv_post_dequeue,
	.send_init = vmci_transport_notify_pkt_send_init,
	.send_pre_block = vmci_transport_notify_pkt_send_pre_block,
	.send_pre_enqueue = vmci_transport_notify_pkt_send_pre_enqueue,
	.send_post_enqueue = vmci_transport_notify_pkt_send_post_enqueue,
	.process_request = vmci_transport_notify_pkt_process_request,
	.process_negotiate = vmci_transport_notify_pkt_process_negotiate,
};