allow changing the speed of a running migration
[qemu-kvm/fedora.git] / posix-aio-compat.c
blobe9fc9fa60a1d26aabac3f9222ff5940e13a3ade0
/*
 * QEMU posix-aio emulation
 *
 * Copyright IBM, Corp. 2008
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */
14 #include <sys/ioctl.h>
15 #include <pthread.h>
16 #include <unistd.h>
17 #include <errno.h>
18 #include <time.h>
19 #include <string.h>
20 #include <stdlib.h>
21 #include <stdio.h>
22 #include "osdep.h"
23 #include "qemu-common.h"
25 #include "posix-aio-compat.h"
/*
 * Shared state of the worker pool.
 *
 * 'lock' is held around every access to the request queue, the thread
 * counters and the ret/active fields of queued requests; 'cond' is
 * signalled by qemu_paio_submit() after a request has been queued.
 */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;

/* Handle of the most recently created worker and the attributes used
 * for every worker (set up in qemu_paio_init()). */
static pthread_t thread_id;
static pthread_attr_t attr;

/* Pool accounting: upper bound, current size, and workers not busy. */
static int max_threads = 64;
static int cur_threads;
static int idle_threads;

/* FIFO of requests waiting for a worker thread. */
static TAILQ_HEAD(, qemu_paiocb) request_list;

/* Non-zero while preadv()/pwritev() are believed to be usable. */
#if defined(HAVE_PREADV)
static int preadv_present = 1;
#else
static int preadv_present = 0;
#endif
/*
 * Report a fatal failure and abort the process.
 *
 * err:  error number describing the failure (passed to strerror()).
 * what: name of the operation that failed.
 *
 * Does not return.
 */
static void die2(int err, const char *what)
{
    const char *reason = strerror(err);

    fprintf(stderr, "%s failed: %s\n", what, reason);
    abort();
}
/*
 * Like die2(), but takes the error number from the current errno.
 *
 * what: name of the operation that failed.
 */
static void die(const char *what)
{
    int saved_errno = errno;

    die2(saved_errno, what);
}
/* Acquire 'mutex'; any failure reported by pthreads is fatal. */
static void mutex_lock(pthread_mutex_t *mutex)
{
    int err;

    err = pthread_mutex_lock(mutex);
    if (err != 0) {
        die2(err, "pthread_mutex_lock");
    }
}
/* Release 'mutex'; any failure reported by pthreads is fatal. */
static void mutex_unlock(pthread_mutex_t *mutex)
{
    int err;

    err = pthread_mutex_unlock(mutex);
    if (err != 0) {
        die2(err, "pthread_mutex_unlock");
    }
}
/*
 * Wait on 'cond' (with 'mutex' held) until it is signalled or the
 * absolute deadline 'ts' passes.
 *
 * Returns 0 or ETIMEDOUT; every other pthreads error is fatal.
 */
static int cond_timedwait(pthread_cond_t *cond, pthread_mutex_t *mutex,
                          struct timespec *ts)
{
    int rc = pthread_cond_timedwait(cond, mutex, ts);

    if (rc != 0 && rc != ETIMEDOUT) {
        die2(rc, "pthread_cond_timedwait");
    }

    return rc;
}
/* Wake one waiter on 'cond'; any failure reported by pthreads is fatal. */
static void cond_signal(pthread_cond_t *cond)
{
    int err;

    err = pthread_cond_signal(cond);
    if (err != 0) {
        die2(err, "pthread_cond_signal");
    }
}
/*
 * Start a new thread running start_routine(arg) with attributes 'attr',
 * storing its handle in '*thread'.  Failure to create the thread is fatal.
 */
static void thread_create(pthread_t *thread, pthread_attr_t *attr,
                          void *(*start_routine)(void*), void *arg)
{
    int err;

    err = pthread_create(thread, attr, start_routine, arg);
    if (err != 0) {
        die2(err, "pthread_create");
    }
}
86 static size_t handle_aiocb_ioctl(struct qemu_paiocb *aiocb)
88 int ret;
90 ret = ioctl(aiocb->aio_fildes, aiocb->aio_ioctl_cmd, aiocb->aio_ioctl_buf);
91 if (ret == -1)
92 return -errno;
95 * This looks weird, but the aio code only consideres a request
96 * successfull if it has written the number full number of bytes.
98 * Now we overload aio_nbytes as aio_ioctl_cmd for the ioctl command,
99 * so in fact we return the ioctl command here to make posix_aio_read()
100 * happy..
102 return aiocb->aio_nbytes;
/*
 * Thin wrappers around preadv()/pwritev().
 *
 * When the build has no HAVE_PREADV the wrappers do no I/O and simply
 * return -ENOSYS so that callers can fall back to another strategy.
 */
static ssize_t
qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
{
#ifdef HAVE_PREADV
    return preadv(fd, iov, nr_iov, offset);
#else
    return -ENOSYS;
#endif
}

static ssize_t
qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
{
#ifdef HAVE_PREADV
    return pwritev(fd, iov, nr_iov, offset);
#else
    return -ENOSYS;
#endif
}
136 * Check if we need to copy the data in the aiocb into a new
137 * properly aligned buffer.
139 static int aiocb_needs_copy(struct qemu_paiocb *aiocb)
141 if (aiocb->aio_flags & QEMU_AIO_SECTOR_ALIGNED) {
142 int i;
144 for (i = 0; i < aiocb->aio_niov; i++)
145 if ((uintptr_t) aiocb->aio_iov[i].iov_base % 512)
146 return 1;
149 return 0;
152 static size_t handle_aiocb_rw_vector(struct qemu_paiocb *aiocb)
154 size_t offset = 0;
155 ssize_t len;
157 do {
158 if (aiocb->aio_type == QEMU_PAIO_WRITE)
159 len = qemu_pwritev(aiocb->aio_fildes,
160 aiocb->aio_iov,
161 aiocb->aio_niov,
162 aiocb->aio_offset + offset);
163 else
164 len = qemu_preadv(aiocb->aio_fildes,
165 aiocb->aio_iov,
166 aiocb->aio_niov,
167 aiocb->aio_offset + offset);
168 } while (len == -1 && errno == EINTR);
170 if (len == -1)
171 return -errno;
172 return len;
175 static size_t handle_aiocb_rw_linear(struct qemu_paiocb *aiocb, char *buf)
177 size_t offset = 0;
178 size_t len;
180 while (offset < aiocb->aio_nbytes) {
181 if (aiocb->aio_type == QEMU_PAIO_WRITE)
182 len = pwrite(aiocb->aio_fildes,
183 (const char *)buf + offset,
184 aiocb->aio_nbytes - offset,
185 aiocb->aio_offset + offset);
186 else
187 len = pread(aiocb->aio_fildes,
188 buf + offset,
189 aiocb->aio_nbytes - offset,
190 aiocb->aio_offset + offset);
192 if (len == -1 && errno == EINTR)
193 continue;
194 else if (len == -1) {
195 offset = -errno;
196 break;
197 } else if (len == 0)
198 break;
200 offset += len;
203 return offset;
206 static size_t handle_aiocb_rw(struct qemu_paiocb *aiocb)
208 size_t nbytes;
209 char *buf;
211 if (!aiocb_needs_copy(aiocb)) {
213 * If there is just a single buffer, and it is properly aligned
214 * we can just use plain pread/pwrite without any problems.
216 if (aiocb->aio_niov == 1)
217 return handle_aiocb_rw_linear(aiocb, aiocb->aio_iov->iov_base);
220 * We have more than one iovec, and all are properly aligned.
222 * Try preadv/pwritev first and fall back to linearizing the
223 * buffer if it's not supported.
225 if (preadv_present) {
226 nbytes = handle_aiocb_rw_vector(aiocb);
227 if (nbytes == aiocb->aio_nbytes)
228 return nbytes;
229 if (nbytes < 0 && nbytes != -ENOSYS)
230 return nbytes;
231 preadv_present = 0;
235 * XXX(hch): short read/write. no easy way to handle the reminder
236 * using these interfaces. For now retry using plain
237 * pread/pwrite?
242 * Ok, we have to do it the hard way, copy all segments into
243 * a single aligned buffer.
245 buf = qemu_memalign(512, aiocb->aio_nbytes);
246 if (aiocb->aio_type == QEMU_PAIO_WRITE) {
247 char *p = buf;
248 int i;
250 for (i = 0; i < aiocb->aio_niov; ++i) {
251 memcpy(p, aiocb->aio_iov[i].iov_base, aiocb->aio_iov[i].iov_len);
252 p += aiocb->aio_iov[i].iov_len;
256 nbytes = handle_aiocb_rw_linear(aiocb, buf);
257 if (aiocb->aio_type != QEMU_PAIO_WRITE) {
258 char *p = buf;
259 size_t count = aiocb->aio_nbytes, copy;
260 int i;
262 for (i = 0; i < aiocb->aio_niov && count; ++i) {
263 copy = count;
264 if (copy > aiocb->aio_iov[i].iov_len)
265 copy = aiocb->aio_iov[i].iov_len;
266 memcpy(aiocb->aio_iov[i].iov_base, p, copy);
267 p += copy;
268 count -= copy;
271 qemu_vfree(buf);
273 return nbytes;
276 static void *aio_thread(void *unused)
278 pid_t pid;
279 sigset_t set;
281 pid = getpid();
283 /* block all signals */
284 if (sigfillset(&set)) die("sigfillset");
285 if (sigprocmask(SIG_BLOCK, &set, NULL)) die("sigprocmask");
287 while (1) {
288 struct qemu_paiocb *aiocb;
289 size_t ret = 0;
290 qemu_timeval tv;
291 struct timespec ts;
293 qemu_gettimeofday(&tv);
294 ts.tv_sec = tv.tv_sec + 10;
295 ts.tv_nsec = 0;
297 mutex_lock(&lock);
299 while (TAILQ_EMPTY(&request_list) &&
300 !(ret == ETIMEDOUT)) {
301 ret = cond_timedwait(&cond, &lock, &ts);
304 if (TAILQ_EMPTY(&request_list))
305 break;
307 aiocb = TAILQ_FIRST(&request_list);
308 TAILQ_REMOVE(&request_list, aiocb, node);
309 aiocb->active = 1;
310 idle_threads--;
311 mutex_unlock(&lock);
313 switch (aiocb->aio_type) {
314 case QEMU_PAIO_READ:
315 case QEMU_PAIO_WRITE:
316 ret = handle_aiocb_rw(aiocb);
317 break;
318 case QEMU_PAIO_IOCTL:
319 ret = handle_aiocb_ioctl(aiocb);
320 break;
321 default:
322 fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type);
323 ret = -EINVAL;
324 break;
327 mutex_lock(&lock);
328 aiocb->ret = ret;
329 idle_threads++;
330 mutex_unlock(&lock);
332 if (kill(pid, aiocb->ev_signo)) die("kill failed");
335 idle_threads--;
336 cur_threads--;
337 mutex_unlock(&lock);
339 return NULL;
342 static void spawn_thread(void)
344 cur_threads++;
345 idle_threads++;
346 thread_create(&thread_id, &attr, aio_thread, NULL);
349 int qemu_paio_init(struct qemu_paioinit *aioinit)
351 int ret;
353 ret = pthread_attr_init(&attr);
354 if (ret) die2(ret, "pthread_attr_init");
356 ret = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
357 if (ret) die2(ret, "pthread_attr_setdetachstate");
359 TAILQ_INIT(&request_list);
361 return 0;
364 static int qemu_paio_submit(struct qemu_paiocb *aiocb, int type)
366 aiocb->aio_type = type;
367 aiocb->ret = -EINPROGRESS;
368 aiocb->active = 0;
369 mutex_lock(&lock);
370 if (idle_threads == 0 && cur_threads < max_threads)
371 spawn_thread();
372 TAILQ_INSERT_TAIL(&request_list, aiocb, node);
373 mutex_unlock(&lock);
374 cond_signal(&cond);
376 return 0;
379 int qemu_paio_read(struct qemu_paiocb *aiocb)
381 return qemu_paio_submit(aiocb, QEMU_PAIO_READ);
384 int qemu_paio_write(struct qemu_paiocb *aiocb)
386 return qemu_paio_submit(aiocb, QEMU_PAIO_WRITE);
389 int qemu_paio_ioctl(struct qemu_paiocb *aiocb)
391 return qemu_paio_submit(aiocb, QEMU_PAIO_IOCTL);
394 ssize_t qemu_paio_return(struct qemu_paiocb *aiocb)
396 ssize_t ret;
398 mutex_lock(&lock);
399 ret = aiocb->ret;
400 mutex_unlock(&lock);
402 return ret;
/*
 * Translate a request's result into an error number.
 *
 * Returns the positive error number when the stored result is negative,
 * and 0 otherwise.
 */
int qemu_paio_error(struct qemu_paiocb *aiocb)
{
    ssize_t result = qemu_paio_return(aiocb);

    return (result < 0) ? -result : 0;
}
417 int qemu_paio_cancel(int fd, struct qemu_paiocb *aiocb)
419 int ret;
421 mutex_lock(&lock);
422 if (!aiocb->active) {
423 TAILQ_REMOVE(&request_list, aiocb, node);
424 aiocb->ret = -ECANCELED;
425 ret = QEMU_PAIO_CANCELED;
426 } else if (aiocb->ret == -EINPROGRESS)
427 ret = QEMU_PAIO_NOTCANCELED;
428 else
429 ret = QEMU_PAIO_ALLDONE;
430 mutex_unlock(&lock);
432 return ret;