mm: close PageTail race
[linux/fpc-iii.git] / net / vmw_vsock / vmci_transport_notify.c
blob9a730744e7bcf7fd1a0d79963666e32494be773c
1 /*
2 * VMware vSockets Driver
4 * Copyright (C) 2009-2013 VMware, Inc. All rights reserved.
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License as published by the Free
8 * Software Foundation version 2 and no later version.
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
16 #include <linux/types.h>
17 #include <linux/socket.h>
18 #include <linux/stddef.h>
19 #include <net/sock.h>
21 #include "vmci_transport_notify.h"
23 #define PKT_FIELD(vsk, field_name) (vmci_trans(vsk)->notify.pkt.field_name)
25 static bool vmci_transport_notify_waiting_write(struct vsock_sock *vsk)
27 #if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
28 bool retval;
29 u64 notify_limit;
31 if (!PKT_FIELD(vsk, peer_waiting_write))
32 return false;
34 #ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL
35 /* When the sender blocks, we take that as a sign that the sender is
36 * faster than the receiver. To reduce the transmit rate of the sender,
37 * we delay the sending of the read notification by decreasing the
38 * write_notify_window. The notification is delayed until the number of
39 * bytes used in the queue drops below the write_notify_window.
42 if (!PKT_FIELD(vsk, peer_waiting_write_detected)) {
43 PKT_FIELD(vsk, peer_waiting_write_detected) = true;
44 if (PKT_FIELD(vsk, write_notify_window) < PAGE_SIZE) {
45 PKT_FIELD(vsk, write_notify_window) =
46 PKT_FIELD(vsk, write_notify_min_window);
47 } else {
48 PKT_FIELD(vsk, write_notify_window) -= PAGE_SIZE;
49 if (PKT_FIELD(vsk, write_notify_window) <
50 PKT_FIELD(vsk, write_notify_min_window))
51 PKT_FIELD(vsk, write_notify_window) =
52 PKT_FIELD(vsk, write_notify_min_window);
56 notify_limit = vmci_trans(vsk)->consume_size -
57 PKT_FIELD(vsk, write_notify_window);
58 #else
59 notify_limit = 0;
60 #endif
62 /* For now we ignore the wait information and just see if the free
63 * space exceeds the notify limit. Note that improving this function
64 * to be more intelligent will not require a protocol change and will
65 * retain compatibility between endpoints with mixed versions of this
66 * function.
68 * The notify_limit is used to delay notifications in the case where
69 * flow control is enabled. Below the test is expressed in terms of
70 * free space in the queue: if free_space > ConsumeSize -
71 * write_notify_window then notify An alternate way of expressing this
72 * is to rewrite the expression to use the data ready in the receive
73 * queue: if write_notify_window > bufferReady then notify as
74 * free_space == ConsumeSize - bufferReady.
76 retval = vmci_qpair_consume_free_space(vmci_trans(vsk)->qpair) >
77 notify_limit;
78 #ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL
79 if (retval) {
81 * Once we notify the peer, we reset the detected flag so the
82 * next wait will again cause a decrease in the window size.
85 PKT_FIELD(vsk, peer_waiting_write_detected) = false;
87 #endif
88 return retval;
89 #else
90 return true;
91 #endif
94 static bool vmci_transport_notify_waiting_read(struct vsock_sock *vsk)
96 #if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
97 if (!PKT_FIELD(vsk, peer_waiting_read))
98 return false;
100 /* For now we ignore the wait information and just see if there is any
101 * data for our peer to read. Note that improving this function to be
102 * more intelligent will not require a protocol change and will retain
103 * compatibility between endpoints with mixed versions of this
104 * function.
106 return vmci_qpair_produce_buf_ready(vmci_trans(vsk)->qpair) > 0;
107 #else
108 return true;
109 #endif
112 static void
113 vmci_transport_handle_waiting_read(struct sock *sk,
114 struct vmci_transport_packet *pkt,
115 bool bottom_half,
116 struct sockaddr_vm *dst,
117 struct sockaddr_vm *src)
119 #if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
120 struct vsock_sock *vsk;
122 vsk = vsock_sk(sk);
124 PKT_FIELD(vsk, peer_waiting_read) = true;
125 memcpy(&PKT_FIELD(vsk, peer_waiting_read_info), &pkt->u.wait,
126 sizeof(PKT_FIELD(vsk, peer_waiting_read_info)));
128 if (vmci_transport_notify_waiting_read(vsk)) {
129 bool sent;
131 if (bottom_half)
132 sent = vmci_transport_send_wrote_bh(dst, src) > 0;
133 else
134 sent = vmci_transport_send_wrote(sk) > 0;
136 if (sent)
137 PKT_FIELD(vsk, peer_waiting_read) = false;
139 #endif
142 static void
143 vmci_transport_handle_waiting_write(struct sock *sk,
144 struct vmci_transport_packet *pkt,
145 bool bottom_half,
146 struct sockaddr_vm *dst,
147 struct sockaddr_vm *src)
149 #if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
150 struct vsock_sock *vsk;
152 vsk = vsock_sk(sk);
154 PKT_FIELD(vsk, peer_waiting_write) = true;
155 memcpy(&PKT_FIELD(vsk, peer_waiting_write_info), &pkt->u.wait,
156 sizeof(PKT_FIELD(vsk, peer_waiting_write_info)));
158 if (vmci_transport_notify_waiting_write(vsk)) {
159 bool sent;
161 if (bottom_half)
162 sent = vmci_transport_send_read_bh(dst, src) > 0;
163 else
164 sent = vmci_transport_send_read(sk) > 0;
166 if (sent)
167 PKT_FIELD(vsk, peer_waiting_write) = false;
169 #endif
172 static void
173 vmci_transport_handle_read(struct sock *sk,
174 struct vmci_transport_packet *pkt,
175 bool bottom_half,
176 struct sockaddr_vm *dst, struct sockaddr_vm *src)
178 #if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
179 struct vsock_sock *vsk;
181 vsk = vsock_sk(sk);
182 PKT_FIELD(vsk, sent_waiting_write) = false;
183 #endif
185 sk->sk_write_space(sk);
188 static bool send_waiting_read(struct sock *sk, u64 room_needed)
190 #if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
191 struct vsock_sock *vsk;
192 struct vmci_transport_waiting_info waiting_info;
193 u64 tail;
194 u64 head;
195 u64 room_left;
196 bool ret;
198 vsk = vsock_sk(sk);
200 if (PKT_FIELD(vsk, sent_waiting_read))
201 return true;
203 if (PKT_FIELD(vsk, write_notify_window) <
204 vmci_trans(vsk)->consume_size)
205 PKT_FIELD(vsk, write_notify_window) =
206 min(PKT_FIELD(vsk, write_notify_window) + PAGE_SIZE,
207 vmci_trans(vsk)->consume_size);
209 vmci_qpair_get_consume_indexes(vmci_trans(vsk)->qpair, &tail, &head);
210 room_left = vmci_trans(vsk)->consume_size - head;
211 if (room_needed >= room_left) {
212 waiting_info.offset = room_needed - room_left;
213 waiting_info.generation =
214 PKT_FIELD(vsk, consume_q_generation) + 1;
215 } else {
216 waiting_info.offset = head + room_needed;
217 waiting_info.generation = PKT_FIELD(vsk, consume_q_generation);
220 ret = vmci_transport_send_waiting_read(sk, &waiting_info) > 0;
221 if (ret)
222 PKT_FIELD(vsk, sent_waiting_read) = true;
224 return ret;
225 #else
226 return true;
227 #endif
230 static bool send_waiting_write(struct sock *sk, u64 room_needed)
232 #if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
233 struct vsock_sock *vsk;
234 struct vmci_transport_waiting_info waiting_info;
235 u64 tail;
236 u64 head;
237 u64 room_left;
238 bool ret;
240 vsk = vsock_sk(sk);
242 if (PKT_FIELD(vsk, sent_waiting_write))
243 return true;
245 vmci_qpair_get_produce_indexes(vmci_trans(vsk)->qpair, &tail, &head);
246 room_left = vmci_trans(vsk)->produce_size - tail;
247 if (room_needed + 1 >= room_left) {
248 /* Wraps around to current generation. */
249 waiting_info.offset = room_needed + 1 - room_left;
250 waiting_info.generation = PKT_FIELD(vsk, produce_q_generation);
251 } else {
252 waiting_info.offset = tail + room_needed + 1;
253 waiting_info.generation =
254 PKT_FIELD(vsk, produce_q_generation) - 1;
257 ret = vmci_transport_send_waiting_write(sk, &waiting_info) > 0;
258 if (ret)
259 PKT_FIELD(vsk, sent_waiting_write) = true;
261 return ret;
262 #else
263 return true;
264 #endif
267 static int vmci_transport_send_read_notification(struct sock *sk)
269 struct vsock_sock *vsk;
270 bool sent_read;
271 unsigned int retries;
272 int err;
274 vsk = vsock_sk(sk);
275 sent_read = false;
276 retries = 0;
277 err = 0;
279 if (vmci_transport_notify_waiting_write(vsk)) {
280 /* Notify the peer that we have read, retrying the send on
281 * failure up to our maximum value. XXX For now we just log
282 * the failure, but later we should schedule a work item to
283 * handle the resend until it succeeds. That would require
284 * keeping track of work items in the vsk and cleaning them up
285 * upon socket close.
287 while (!(vsk->peer_shutdown & RCV_SHUTDOWN) &&
288 !sent_read &&
289 retries < VMCI_TRANSPORT_MAX_DGRAM_RESENDS) {
290 err = vmci_transport_send_read(sk);
291 if (err >= 0)
292 sent_read = true;
294 retries++;
297 if (retries >= VMCI_TRANSPORT_MAX_DGRAM_RESENDS)
298 pr_err("%p unable to send read notify to peer\n", sk);
299 else
300 #if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
301 PKT_FIELD(vsk, peer_waiting_write) = false;
302 #endif
305 return err;
308 static void
309 vmci_transport_handle_wrote(struct sock *sk,
310 struct vmci_transport_packet *pkt,
311 bool bottom_half,
312 struct sockaddr_vm *dst, struct sockaddr_vm *src)
314 #if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
315 struct vsock_sock *vsk = vsock_sk(sk);
316 PKT_FIELD(vsk, sent_waiting_read) = false;
317 #endif
318 sk->sk_data_ready(sk, 0);
321 static void vmci_transport_notify_pkt_socket_init(struct sock *sk)
323 struct vsock_sock *vsk = vsock_sk(sk);
325 PKT_FIELD(vsk, write_notify_window) = PAGE_SIZE;
326 PKT_FIELD(vsk, write_notify_min_window) = PAGE_SIZE;
327 PKT_FIELD(vsk, peer_waiting_read) = false;
328 PKT_FIELD(vsk, peer_waiting_write) = false;
329 PKT_FIELD(vsk, peer_waiting_write_detected) = false;
330 PKT_FIELD(vsk, sent_waiting_read) = false;
331 PKT_FIELD(vsk, sent_waiting_write) = false;
332 PKT_FIELD(vsk, produce_q_generation) = 0;
333 PKT_FIELD(vsk, consume_q_generation) = 0;
335 memset(&PKT_FIELD(vsk, peer_waiting_read_info), 0,
336 sizeof(PKT_FIELD(vsk, peer_waiting_read_info)));
337 memset(&PKT_FIELD(vsk, peer_waiting_write_info), 0,
338 sizeof(PKT_FIELD(vsk, peer_waiting_write_info)));
341 static void vmci_transport_notify_pkt_socket_destruct(struct vsock_sock *vsk)
345 static int
346 vmci_transport_notify_pkt_poll_in(struct sock *sk,
347 size_t target, bool *data_ready_now)
349 struct vsock_sock *vsk = vsock_sk(sk);
351 if (vsock_stream_has_data(vsk)) {
352 *data_ready_now = true;
353 } else {
354 /* We can't read right now because there is nothing in the
355 * queue. Ask for notifications when there is something to
356 * read.
358 if (sk->sk_state == SS_CONNECTED) {
359 if (!send_waiting_read(sk, 1))
360 return -1;
363 *data_ready_now = false;
366 return 0;
369 static int
370 vmci_transport_notify_pkt_poll_out(struct sock *sk,
371 size_t target, bool *space_avail_now)
373 s64 produce_q_free_space;
374 struct vsock_sock *vsk = vsock_sk(sk);
376 produce_q_free_space = vsock_stream_has_space(vsk);
377 if (produce_q_free_space > 0) {
378 *space_avail_now = true;
379 return 0;
380 } else if (produce_q_free_space == 0) {
381 /* This is a connected socket but we can't currently send data.
382 * Notify the peer that we are waiting if the queue is full. We
383 * only send a waiting write if the queue is full because
384 * otherwise we end up in an infinite WAITING_WRITE, READ,
385 * WAITING_WRITE, READ, etc. loop. Treat failing to send the
386 * notification as a socket error, passing that back through
387 * the mask.
389 if (!send_waiting_write(sk, 1))
390 return -1;
392 *space_avail_now = false;
395 return 0;
398 static int
399 vmci_transport_notify_pkt_recv_init(
400 struct sock *sk,
401 size_t target,
402 struct vmci_transport_recv_notify_data *data)
404 struct vsock_sock *vsk = vsock_sk(sk);
406 #ifdef VSOCK_OPTIMIZATION_WAITING_NOTIFY
407 data->consume_head = 0;
408 data->produce_tail = 0;
409 #ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL
410 data->notify_on_block = false;
412 if (PKT_FIELD(vsk, write_notify_min_window) < target + 1) {
413 PKT_FIELD(vsk, write_notify_min_window) = target + 1;
414 if (PKT_FIELD(vsk, write_notify_window) <
415 PKT_FIELD(vsk, write_notify_min_window)) {
416 /* If the current window is smaller than the new
417 * minimal window size, we need to reevaluate whether
418 * we need to notify the sender. If the number of ready
419 * bytes are smaller than the new window, we need to
420 * send a notification to the sender before we block.
423 PKT_FIELD(vsk, write_notify_window) =
424 PKT_FIELD(vsk, write_notify_min_window);
425 data->notify_on_block = true;
428 #endif
429 #endif
431 return 0;
434 static int
435 vmci_transport_notify_pkt_recv_pre_block(
436 struct sock *sk,
437 size_t target,
438 struct vmci_transport_recv_notify_data *data)
440 int err = 0;
442 /* Notify our peer that we are waiting for data to read. */
443 if (!send_waiting_read(sk, target)) {
444 err = -EHOSTUNREACH;
445 return err;
447 #ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL
448 if (data->notify_on_block) {
449 err = vmci_transport_send_read_notification(sk);
450 if (err < 0)
451 return err;
453 data->notify_on_block = false;
455 #endif
457 return err;
460 static int
461 vmci_transport_notify_pkt_recv_pre_dequeue(
462 struct sock *sk,
463 size_t target,
464 struct vmci_transport_recv_notify_data *data)
466 struct vsock_sock *vsk = vsock_sk(sk);
468 /* Now consume up to len bytes from the queue. Note that since we have
469 * the socket locked we should copy at least ready bytes.
471 #if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
472 vmci_qpair_get_consume_indexes(vmci_trans(vsk)->qpair,
473 &data->produce_tail,
474 &data->consume_head);
475 #endif
477 return 0;
480 static int
481 vmci_transport_notify_pkt_recv_post_dequeue(
482 struct sock *sk,
483 size_t target,
484 ssize_t copied,
485 bool data_read,
486 struct vmci_transport_recv_notify_data *data)
488 struct vsock_sock *vsk;
489 int err;
491 vsk = vsock_sk(sk);
492 err = 0;
494 if (data_read) {
495 #if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
496 /* Detect a wrap-around to maintain queue generation. Note
497 * that this is safe since we hold the socket lock across the
498 * two queue pair operations.
500 if (copied >=
501 vmci_trans(vsk)->consume_size - data->consume_head)
502 PKT_FIELD(vsk, consume_q_generation)++;
503 #endif
505 err = vmci_transport_send_read_notification(sk);
506 if (err < 0)
507 return err;
510 return err;
513 static int
514 vmci_transport_notify_pkt_send_init(
515 struct sock *sk,
516 struct vmci_transport_send_notify_data *data)
518 #ifdef VSOCK_OPTIMIZATION_WAITING_NOTIFY
519 data->consume_head = 0;
520 data->produce_tail = 0;
521 #endif
523 return 0;
526 static int
527 vmci_transport_notify_pkt_send_pre_block(
528 struct sock *sk,
529 struct vmci_transport_send_notify_data *data)
531 /* Notify our peer that we are waiting for room to write. */
532 if (!send_waiting_write(sk, 1))
533 return -EHOSTUNREACH;
535 return 0;
538 static int
539 vmci_transport_notify_pkt_send_pre_enqueue(
540 struct sock *sk,
541 struct vmci_transport_send_notify_data *data)
543 struct vsock_sock *vsk = vsock_sk(sk);
545 #if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
546 vmci_qpair_get_produce_indexes(vmci_trans(vsk)->qpair,
547 &data->produce_tail,
548 &data->consume_head);
549 #endif
551 return 0;
554 static int
555 vmci_transport_notify_pkt_send_post_enqueue(
556 struct sock *sk,
557 ssize_t written,
558 struct vmci_transport_send_notify_data *data)
560 int err = 0;
561 struct vsock_sock *vsk;
562 bool sent_wrote = false;
563 int retries = 0;
565 vsk = vsock_sk(sk);
567 #if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
568 /* Detect a wrap-around to maintain queue generation. Note that this
569 * is safe since we hold the socket lock across the two queue pair
570 * operations.
572 if (written >= vmci_trans(vsk)->produce_size - data->produce_tail)
573 PKT_FIELD(vsk, produce_q_generation)++;
575 #endif
577 if (vmci_transport_notify_waiting_read(vsk)) {
578 /* Notify the peer that we have written, retrying the send on
579 * failure up to our maximum value. See the XXX comment for the
580 * corresponding piece of code in StreamRecvmsg() for potential
581 * improvements.
583 while (!(vsk->peer_shutdown & RCV_SHUTDOWN) &&
584 !sent_wrote &&
585 retries < VMCI_TRANSPORT_MAX_DGRAM_RESENDS) {
586 err = vmci_transport_send_wrote(sk);
587 if (err >= 0)
588 sent_wrote = true;
590 retries++;
593 if (retries >= VMCI_TRANSPORT_MAX_DGRAM_RESENDS) {
594 pr_err("%p unable to send wrote notify to peer\n", sk);
595 return err;
596 } else {
597 #if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
598 PKT_FIELD(vsk, peer_waiting_read) = false;
599 #endif
602 return err;
605 static void
606 vmci_transport_notify_pkt_handle_pkt(
607 struct sock *sk,
608 struct vmci_transport_packet *pkt,
609 bool bottom_half,
610 struct sockaddr_vm *dst,
611 struct sockaddr_vm *src, bool *pkt_processed)
613 bool processed = false;
615 switch (pkt->type) {
616 case VMCI_TRANSPORT_PACKET_TYPE_WROTE:
617 vmci_transport_handle_wrote(sk, pkt, bottom_half, dst, src);
618 processed = true;
619 break;
620 case VMCI_TRANSPORT_PACKET_TYPE_READ:
621 vmci_transport_handle_read(sk, pkt, bottom_half, dst, src);
622 processed = true;
623 break;
624 case VMCI_TRANSPORT_PACKET_TYPE_WAITING_WRITE:
625 vmci_transport_handle_waiting_write(sk, pkt, bottom_half,
626 dst, src);
627 processed = true;
628 break;
630 case VMCI_TRANSPORT_PACKET_TYPE_WAITING_READ:
631 vmci_transport_handle_waiting_read(sk, pkt, bottom_half,
632 dst, src);
633 processed = true;
634 break;
637 if (pkt_processed)
638 *pkt_processed = processed;
641 static void vmci_transport_notify_pkt_process_request(struct sock *sk)
643 struct vsock_sock *vsk = vsock_sk(sk);
645 PKT_FIELD(vsk, write_notify_window) = vmci_trans(vsk)->consume_size;
646 if (vmci_trans(vsk)->consume_size <
647 PKT_FIELD(vsk, write_notify_min_window))
648 PKT_FIELD(vsk, write_notify_min_window) =
649 vmci_trans(vsk)->consume_size;
652 static void vmci_transport_notify_pkt_process_negotiate(struct sock *sk)
654 struct vsock_sock *vsk = vsock_sk(sk);
656 PKT_FIELD(vsk, write_notify_window) = vmci_trans(vsk)->consume_size;
657 if (vmci_trans(vsk)->consume_size <
658 PKT_FIELD(vsk, write_notify_min_window))
659 PKT_FIELD(vsk, write_notify_min_window) =
660 vmci_trans(vsk)->consume_size;
663 /* Socket control packet based operations. */
664 struct vmci_transport_notify_ops vmci_transport_notify_pkt_ops = {
665 vmci_transport_notify_pkt_socket_init,
666 vmci_transport_notify_pkt_socket_destruct,
667 vmci_transport_notify_pkt_poll_in,
668 vmci_transport_notify_pkt_poll_out,
669 vmci_transport_notify_pkt_handle_pkt,
670 vmci_transport_notify_pkt_recv_init,
671 vmci_transport_notify_pkt_recv_pre_block,
672 vmci_transport_notify_pkt_recv_pre_dequeue,
673 vmci_transport_notify_pkt_recv_post_dequeue,
674 vmci_transport_notify_pkt_send_init,
675 vmci_transport_notify_pkt_send_pre_block,
676 vmci_transport_notify_pkt_send_pre_enqueue,
677 vmci_transport_notify_pkt_send_post_enqueue,
678 vmci_transport_notify_pkt_process_request,
679 vmci_transport_notify_pkt_process_negotiate,