blob: d3c1174ebf4482974124dc925631629d52d401f3 [file] [log] [blame]
aliguori3c529d92008-12-12 16:41:40 +00001/*
2 * QEMU posix-aio emulation
3 *
4 * Copyright IBM, Corp. 2008
5 *
6 * Authors:
7 * Anthony Liguori <aliguori@us.ibm.com>
8 *
9 * This work is licensed under the terms of the GNU GPL, version 2. See
10 * the COPYING file in the top-level directory.
11 *
12 */
13
aliguori221f7152009-03-28 17:28:41 +000014#include <sys/ioctl.h>
Christoph Hellwig9ef91a62009-08-20 16:58:19 +020015#include <sys/types.h>
aliguori3c529d92008-12-12 16:41:40 +000016#include <pthread.h>
17#include <unistd.h>
18#include <errno.h>
malc30525af2009-02-21 05:48:13 +000019#include <time.h>
malc8653c012009-02-21 05:48:11 +000020#include <string.h>
21#include <stdlib.h>
22#include <stdio.h>
Christoph Hellwig9ef91a62009-08-20 16:58:19 +020023
Blue Swirl72cf2d42009-09-12 07:36:22 +000024#include "qemu-queue.h"
aliguori3c529d92008-12-12 16:41:40 +000025#include "osdep.h"
Jes Sorensendc786bc2010-10-26 10:39:23 +020026#include "sysemu.h"
aliguorif141eaf2009-04-07 18:43:24 +000027#include "qemu-common.h"
Stefan Hajnoczi6d519a52010-05-22 18:15:08 +010028#include "trace.h"
Christoph Hellwig9ef91a62009-08-20 16:58:19 +020029#include "block_int.h"
aliguori3c529d92008-12-12 16:41:40 +000030
Christoph Hellwig9ef91a62009-08-20 16:58:19 +020031#include "block/raw-posix-aio.h"
32
/* Forward declaration: aio_thread() calls this before its definition. */
static void do_spawn_thread(void);
Christoph Hellwig9ef91a62009-08-20 16:58:19 +020034
35struct qemu_paiocb {
36 BlockDriverAIOCB common;
37 int aio_fildes;
38 union {
39 struct iovec *aio_iov;
Stefan Hajnoczib587a522010-05-27 12:52:08 +010040 void *aio_ioctl_buf;
Christoph Hellwig9ef91a62009-08-20 16:58:19 +020041 };
42 int aio_niov;
43 size_t aio_nbytes;
44#define aio_ioctl_cmd aio_nbytes /* for QEMU_AIO_IOCTL */
Christoph Hellwig9ef91a62009-08-20 16:58:19 +020045 off_t aio_offset;
46
Blue Swirl72cf2d42009-09-12 07:36:22 +000047 QTAILQ_ENTRY(qemu_paiocb) node;
Christoph Hellwig9ef91a62009-08-20 16:58:19 +020048 int aio_type;
49 ssize_t ret;
50 int active;
51 struct qemu_paiocb *next;
52};
53
/*
 * Completion-side state: the notification pipe and the singly linked list
 * of requests that have been submitted and not yet reaped.
 */
typedef struct PosixAioState {
    int rfd;                        /* read end of the notification pipe */
    int wfd;                        /* write end of the notification pipe */
    struct qemu_paiocb *first_aio;  /* head of the outstanding-request list */
} PosixAioState;
58
aliguori3c529d92008-12-12 16:41:40 +000059
60static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
61static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
62static pthread_t thread_id;
malca8227a52009-02-21 05:48:17 +000063static pthread_attr_t attr;
aliguori3c529d92008-12-12 16:41:40 +000064static int max_threads = 64;
65static int cur_threads = 0;
66static int idle_threads = 0;
Avi Kivitye4ea78e2011-08-14 07:04:49 +030067static int new_threads = 0; /* backlog of threads we need to create */
68static int pending_threads = 0; /* threads created but not running yet */
69static QEMUBH *new_thread_bh;
Blue Swirl72cf2d42009-09-12 07:36:22 +000070static QTAILQ_HEAD(, qemu_paiocb) request_list;
aliguori3c529d92008-12-12 16:41:40 +000071
Juan Quintela2341f9a2009-07-27 16:12:58 +020072#ifdef CONFIG_PREADV
aliguoriceb42de2009-04-07 18:43:28 +000073static int preadv_present = 1;
74#else
75static int preadv_present = 0;
76#endif
77
/*
 * Print "<what> failed: <strerror(err)>" to stderr and abort the process.
 * Does not return.
 */
static void die2(int err, const char *what)
{
    const char *reason = strerror(err);

    fprintf(stderr, "%s failed: %s\n", what, reason);
    abort();
}
83
/* Like die2(), taking the error code from the current value of errno. */
static void die(const char *what)
{
    int saved_errno = errno;

    die2(saved_errno, what);
}
88
/* Lock @mutex; any pthread error is fatal. */
static void mutex_lock(pthread_mutex_t *mutex)
{
    int err = pthread_mutex_lock(mutex);

    if (err != 0) {
        die2(err, "pthread_mutex_lock");
    }
}
94
/* Unlock @mutex; any pthread error is fatal. */
static void mutex_unlock(pthread_mutex_t *mutex)
{
    int err = pthread_mutex_unlock(mutex);

    if (err != 0) {
        die2(err, "pthread_mutex_unlock");
    }
}
100
/*
 * Wait on @cond until signalled or until the absolute time @ts.
 *
 * Returns 0 or ETIMEDOUT; every other pthread error is fatal.
 */
static int cond_timedwait(pthread_cond_t *cond, pthread_mutex_t *mutex,
                          struct timespec *ts)
{
    int err = pthread_cond_timedwait(cond, mutex, ts);

    if (err != 0 && err != ETIMEDOUT) {
        die2(err, "pthread_cond_timedwait");
    }
    return err;
}
108
/* Signal @cond; any pthread error is fatal. */
static void cond_signal(pthread_cond_t *cond)
{
    int err = pthread_cond_signal(cond);

    if (err != 0) {
        die2(err, "pthread_cond_signal");
    }
}
114
/* Thin wrapper around pthread_create() that aborts on failure. */
static void thread_create(pthread_t *thread, pthread_attr_t *attr,
                          void *(*start_routine)(void*), void *arg)
{
    int err = pthread_create(thread, attr, start_routine, arg);

    if (err != 0) {
        die2(err, "pthread_create");
    }
}
121
Kevin Wolf6769da22009-11-18 12:15:10 +0100122static ssize_t handle_aiocb_ioctl(struct qemu_paiocb *aiocb)
aliguorif141eaf2009-04-07 18:43:24 +0000123{
Stefan Hajnoczib587a522010-05-27 12:52:08 +0100124 int ret;
aliguorif141eaf2009-04-07 18:43:24 +0000125
Stefan Hajnoczib587a522010-05-27 12:52:08 +0100126 ret = ioctl(aiocb->aio_fildes, aiocb->aio_ioctl_cmd, aiocb->aio_ioctl_buf);
127 if (ret == -1)
128 return -errno;
Christoph Hellwige7d54ae2009-04-28 11:57:02 +0200129
Stefan Hajnoczib587a522010-05-27 12:52:08 +0100130 /*
131 * This looks weird, but the aio code only consideres a request
Stefan Weilb0cd7122010-08-06 21:53:45 +0200132 * successful if it has written the number full number of bytes.
Stefan Hajnoczib587a522010-05-27 12:52:08 +0100133 *
134 * Now we overload aio_nbytes as aio_ioctl_cmd for the ioctl command,
135 * so in fact we return the ioctl command here to make posix_aio_read()
136 * happy..
137 */
138 return aiocb->aio_nbytes;
aliguorif141eaf2009-04-07 18:43:24 +0000139}
140
Kevin Wolf6769da22009-11-18 12:15:10 +0100141static ssize_t handle_aiocb_flush(struct qemu_paiocb *aiocb)
Christoph Hellwigb2e12bc2009-09-04 19:01:49 +0200142{
143 int ret;
144
Blue Swirl47faadc2009-09-12 06:19:14 +0000145 ret = qemu_fdatasync(aiocb->aio_fildes);
Christoph Hellwigb2e12bc2009-09-04 19:01:49 +0200146 if (ret == -1)
147 return -errno;
148 return 0;
149}
150
/*
 * Vectored positional I/O.  With CONFIG_PREADV these forward to the host's
 * preadv()/pwritev(); without it they are stubs that return -ENOSYS.
 */
#ifndef CONFIG_PREADV

static ssize_t
qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
{
    return -ENOSYS;
}

static ssize_t
qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
{
    return -ENOSYS;
}

#else

static ssize_t
qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
{
    return preadv(fd, iov, nr_iov, offset);
}

static ssize_t
qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
{
    return pwritev(fd, iov, nr_iov, offset);
}

#endif
180
Kevin Wolf6769da22009-11-18 12:15:10 +0100181static ssize_t handle_aiocb_rw_vector(struct qemu_paiocb *aiocb)
aliguoriceb42de2009-04-07 18:43:28 +0000182{
aliguoriceb42de2009-04-07 18:43:28 +0000183 ssize_t len;
184
185 do {
Christoph Hellwig9ef91a62009-08-20 16:58:19 +0200186 if (aiocb->aio_type & QEMU_AIO_WRITE)
aliguoriceb42de2009-04-07 18:43:28 +0000187 len = qemu_pwritev(aiocb->aio_fildes,
188 aiocb->aio_iov,
189 aiocb->aio_niov,
Frediano Ziglio21cfa412011-09-16 13:34:46 +0200190 aiocb->aio_offset);
aliguoriceb42de2009-04-07 18:43:28 +0000191 else
192 len = qemu_preadv(aiocb->aio_fildes,
193 aiocb->aio_iov,
194 aiocb->aio_niov,
Frediano Ziglio21cfa412011-09-16 13:34:46 +0200195 aiocb->aio_offset);
aliguoriceb42de2009-04-07 18:43:28 +0000196 } while (len == -1 && errno == EINTR);
197
198 if (len == -1)
199 return -errno;
200 return len;
201}
202
Kevin Wolfba1d1af2011-07-25 19:42:37 +0200203/*
204 * Read/writes the data to/from a given linear buffer.
205 *
206 * Returns the number of bytes handles or -errno in case of an error. Short
207 * reads are only returned if the end of the file is reached.
208 */
Kevin Wolf6769da22009-11-18 12:15:10 +0100209static ssize_t handle_aiocb_rw_linear(struct qemu_paiocb *aiocb, char *buf)
aliguori221f7152009-03-28 17:28:41 +0000210{
Kevin Wolf6769da22009-11-18 12:15:10 +0100211 ssize_t offset = 0;
212 ssize_t len;
aliguori221f7152009-03-28 17:28:41 +0000213
214 while (offset < aiocb->aio_nbytes) {
Christoph Hellwig9ef91a62009-08-20 16:58:19 +0200215 if (aiocb->aio_type & QEMU_AIO_WRITE)
aliguorif141eaf2009-04-07 18:43:24 +0000216 len = pwrite(aiocb->aio_fildes,
217 (const char *)buf + offset,
218 aiocb->aio_nbytes - offset,
219 aiocb->aio_offset + offset);
220 else
221 len = pread(aiocb->aio_fildes,
222 buf + offset,
aliguori221f7152009-03-28 17:28:41 +0000223 aiocb->aio_nbytes - offset,
224 aiocb->aio_offset + offset);
aliguori221f7152009-03-28 17:28:41 +0000225
aliguorif141eaf2009-04-07 18:43:24 +0000226 if (len == -1 && errno == EINTR)
227 continue;
228 else if (len == -1) {
229 offset = -errno;
230 break;
231 } else if (len == 0)
232 break;
aliguori221f7152009-03-28 17:28:41 +0000233
aliguorif141eaf2009-04-07 18:43:24 +0000234 offset += len;
aliguori221f7152009-03-28 17:28:41 +0000235 }
236
237 return offset;
238}
239
Kevin Wolf6769da22009-11-18 12:15:10 +0100240static ssize_t handle_aiocb_rw(struct qemu_paiocb *aiocb)
aliguori221f7152009-03-28 17:28:41 +0000241{
Kevin Wolf6769da22009-11-18 12:15:10 +0100242 ssize_t nbytes;
aliguorif141eaf2009-04-07 18:43:24 +0000243 char *buf;
aliguori221f7152009-03-28 17:28:41 +0000244
Christoph Hellwig9ef91a62009-08-20 16:58:19 +0200245 if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) {
aliguorif141eaf2009-04-07 18:43:24 +0000246 /*
247 * If there is just a single buffer, and it is properly aligned
248 * we can just use plain pread/pwrite without any problems.
249 */
aliguoriceb42de2009-04-07 18:43:28 +0000250 if (aiocb->aio_niov == 1)
251 return handle_aiocb_rw_linear(aiocb, aiocb->aio_iov->iov_base);
252
253 /*
254 * We have more than one iovec, and all are properly aligned.
255 *
256 * Try preadv/pwritev first and fall back to linearizing the
257 * buffer if it's not supported.
258 */
Stefan Hajnoczib587a522010-05-27 12:52:08 +0100259 if (preadv_present) {
aliguoriceb42de2009-04-07 18:43:28 +0000260 nbytes = handle_aiocb_rw_vector(aiocb);
261 if (nbytes == aiocb->aio_nbytes)
Stefan Hajnoczib587a522010-05-27 12:52:08 +0100262 return nbytes;
aliguoriceb42de2009-04-07 18:43:28 +0000263 if (nbytes < 0 && nbytes != -ENOSYS)
264 return nbytes;
265 preadv_present = 0;
266 }
267
268 /*
269 * XXX(hch): short read/write. no easy way to handle the reminder
270 * using these interfaces. For now retry using plain
271 * pread/pwrite?
272 */
aliguorif141eaf2009-04-07 18:43:24 +0000273 }
274
275 /*
276 * Ok, we have to do it the hard way, copy all segments into
277 * a single aligned buffer.
278 */
Christoph Hellwig72aef732010-09-12 23:42:56 +0200279 buf = qemu_blockalign(aiocb->common.bs, aiocb->aio_nbytes);
Christoph Hellwig9ef91a62009-08-20 16:58:19 +0200280 if (aiocb->aio_type & QEMU_AIO_WRITE) {
aliguorif141eaf2009-04-07 18:43:24 +0000281 char *p = buf;
282 int i;
283
284 for (i = 0; i < aiocb->aio_niov; ++i) {
285 memcpy(p, aiocb->aio_iov[i].iov_base, aiocb->aio_iov[i].iov_len);
286 p += aiocb->aio_iov[i].iov_len;
287 }
288 }
289
290 nbytes = handle_aiocb_rw_linear(aiocb, buf);
Christoph Hellwig9ef91a62009-08-20 16:58:19 +0200291 if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
aliguorif141eaf2009-04-07 18:43:24 +0000292 char *p = buf;
293 size_t count = aiocb->aio_nbytes, copy;
294 int i;
295
296 for (i = 0; i < aiocb->aio_niov && count; ++i) {
297 copy = count;
298 if (copy > aiocb->aio_iov[i].iov_len)
299 copy = aiocb->aio_iov[i].iov_len;
300 memcpy(aiocb->aio_iov[i].iov_base, p, copy);
301 p += copy;
302 count -= copy;
303 }
304 }
305 qemu_vfree(buf);
306
307 return nbytes;
aliguori221f7152009-03-28 17:28:41 +0000308}
309
Frediano Ziglioe1d3b252011-09-19 16:37:13 +0200310static void posix_aio_notify_event(void);
311
aliguori3c529d92008-12-12 16:41:40 +0000312static void *aio_thread(void *unused)
313{
Avi Kivitye4ea78e2011-08-14 07:04:49 +0300314 mutex_lock(&lock);
315 pending_threads--;
316 mutex_unlock(&lock);
317 do_spawn_thread();
318
aliguori3c529d92008-12-12 16:41:40 +0000319 while (1) {
320 struct qemu_paiocb *aiocb;
Kevin Wolf6769da22009-11-18 12:15:10 +0100321 ssize_t ret = 0;
malc30525af2009-02-21 05:48:13 +0000322 qemu_timeval tv;
323 struct timespec ts;
324
325 qemu_gettimeofday(&tv);
326 ts.tv_sec = tv.tv_sec + 10;
327 ts.tv_nsec = 0;
aliguori3c529d92008-12-12 16:41:40 +0000328
malc8653c012009-02-21 05:48:11 +0000329 mutex_lock(&lock);
aliguori3c529d92008-12-12 16:41:40 +0000330
Blue Swirl72cf2d42009-09-12 07:36:22 +0000331 while (QTAILQ_EMPTY(&request_list) &&
aliguori3c529d92008-12-12 16:41:40 +0000332 !(ret == ETIMEDOUT)) {
Kevin Wolf5be4aab2011-05-02 17:32:54 +0200333 idle_threads++;
malc8653c012009-02-21 05:48:11 +0000334 ret = cond_timedwait(&cond, &lock, &ts);
Kevin Wolf5be4aab2011-05-02 17:32:54 +0200335 idle_threads--;
aliguori3c529d92008-12-12 16:41:40 +0000336 }
337
Blue Swirl72cf2d42009-09-12 07:36:22 +0000338 if (QTAILQ_EMPTY(&request_list))
aliguori3c529d92008-12-12 16:41:40 +0000339 break;
340
Blue Swirl72cf2d42009-09-12 07:36:22 +0000341 aiocb = QTAILQ_FIRST(&request_list);
342 QTAILQ_REMOVE(&request_list, aiocb, node);
aliguori3c529d92008-12-12 16:41:40 +0000343 aiocb->active = 1;
malc8653c012009-02-21 05:48:11 +0000344 mutex_unlock(&lock);
aliguori3c529d92008-12-12 16:41:40 +0000345
Christoph Hellwig9ef91a62009-08-20 16:58:19 +0200346 switch (aiocb->aio_type & QEMU_AIO_TYPE_MASK) {
347 case QEMU_AIO_READ:
Kevin Wolfba1d1af2011-07-25 19:42:37 +0200348 ret = handle_aiocb_rw(aiocb);
349 if (ret >= 0 && ret < aiocb->aio_nbytes && aiocb->common.bs->growable) {
350 /* A short read means that we have reached EOF. Pad the buffer
351 * with zeros for bytes after EOF. */
352 QEMUIOVector qiov;
353
354 qemu_iovec_init_external(&qiov, aiocb->aio_iov,
355 aiocb->aio_niov);
356 qemu_iovec_memset_skip(&qiov, 0, aiocb->aio_nbytes - ret, ret);
357
358 ret = aiocb->aio_nbytes;
359 }
360 break;
Christoph Hellwig9ef91a62009-08-20 16:58:19 +0200361 case QEMU_AIO_WRITE:
Stefan Hajnoczib587a522010-05-27 12:52:08 +0100362 ret = handle_aiocb_rw(aiocb);
363 break;
Christoph Hellwigb2e12bc2009-09-04 19:01:49 +0200364 case QEMU_AIO_FLUSH:
Stefan Hajnoczib587a522010-05-27 12:52:08 +0100365 ret = handle_aiocb_flush(aiocb);
366 break;
Christoph Hellwig9ef91a62009-08-20 16:58:19 +0200367 case QEMU_AIO_IOCTL:
Stefan Hajnoczib587a522010-05-27 12:52:08 +0100368 ret = handle_aiocb_ioctl(aiocb);
369 break;
370 default:
371 fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type);
372 ret = -EINVAL;
373 break;
374 }
aliguori3c529d92008-12-12 16:41:40 +0000375
malc8653c012009-02-21 05:48:11 +0000376 mutex_lock(&lock);
aliguori221f7152009-03-28 17:28:41 +0000377 aiocb->ret = ret;
malc8653c012009-02-21 05:48:11 +0000378 mutex_unlock(&lock);
aliguori3c529d92008-12-12 16:41:40 +0000379
Frediano Ziglioe1d3b252011-09-19 16:37:13 +0200380 posix_aio_notify_event();
aliguori3c529d92008-12-12 16:41:40 +0000381 }
382
aliguori3c529d92008-12-12 16:41:40 +0000383 cur_threads--;
malc8653c012009-02-21 05:48:11 +0000384 mutex_unlock(&lock);
aliguori3c529d92008-12-12 16:41:40 +0000385
386 return NULL;
387}
388
Avi Kivitye4ea78e2011-08-14 07:04:49 +0300389static void do_spawn_thread(void)
aliguori3c529d92008-12-12 16:41:40 +0000390{
malcee399302009-09-25 00:20:44 +0400391 sigset_t set, oldset;
392
Avi Kivitye4ea78e2011-08-14 07:04:49 +0300393 mutex_lock(&lock);
394 if (!new_threads) {
395 mutex_unlock(&lock);
396 return;
397 }
398
399 new_threads--;
400 pending_threads++;
401
402 mutex_unlock(&lock);
malcee399302009-09-25 00:20:44 +0400403
404 /* block all signals */
405 if (sigfillset(&set)) die("sigfillset");
406 if (sigprocmask(SIG_SETMASK, &set, &oldset)) die("sigprocmask");
407
malc8653c012009-02-21 05:48:11 +0000408 thread_create(&thread_id, &attr, aio_thread, NULL);
malcee399302009-09-25 00:20:44 +0400409
410 if (sigprocmask(SIG_SETMASK, &oldset, NULL)) die("sigprocmask restore");
aliguori3c529d92008-12-12 16:41:40 +0000411}
412
/* Bottom-half callback: create a worker thread; @opaque is unused. */
static void spawn_thread_bh_fn(void *opaque)
{
    do_spawn_thread();
}
417
418static void spawn_thread(void)
419{
420 cur_threads++;
421 new_threads++;
422 /* If there are threads being created, they will spawn new workers, so
423 * we don't spend time creating many threads in a loop holding a mutex or
424 * starving the current vcpu.
425 *
426 * If there are no idle threads, ask the main thread to create one, so we
427 * inherit the correct affinity instead of the vcpu affinity.
428 */
429 if (!pending_threads) {
430 qemu_bh_schedule(new_thread_bh);
431 }
432}
433
Christoph Hellwig9ef91a62009-08-20 16:58:19 +0200434static void qemu_paio_submit(struct qemu_paiocb *aiocb)
aliguori3c529d92008-12-12 16:41:40 +0000435{
aliguori3c529d92008-12-12 16:41:40 +0000436 aiocb->ret = -EINPROGRESS;
437 aiocb->active = 0;
malc8653c012009-02-21 05:48:11 +0000438 mutex_lock(&lock);
aliguori3c529d92008-12-12 16:41:40 +0000439 if (idle_threads == 0 && cur_threads < max_threads)
440 spawn_thread();
Blue Swirl72cf2d42009-09-12 07:36:22 +0000441 QTAILQ_INSERT_TAIL(&request_list, aiocb, node);
malc8653c012009-02-21 05:48:11 +0000442 mutex_unlock(&lock);
malc5d47e372009-02-21 05:48:15 +0000443 cond_signal(&cond);
aliguori3c529d92008-12-12 16:41:40 +0000444}
445
Christoph Hellwig9ef91a62009-08-20 16:58:19 +0200446static ssize_t qemu_paio_return(struct qemu_paiocb *aiocb)
aliguori3c529d92008-12-12 16:41:40 +0000447{
448 ssize_t ret;
449
malc8653c012009-02-21 05:48:11 +0000450 mutex_lock(&lock);
aliguori3c529d92008-12-12 16:41:40 +0000451 ret = aiocb->ret;
malc8653c012009-02-21 05:48:11 +0000452 mutex_unlock(&lock);
aliguori3c529d92008-12-12 16:41:40 +0000453
454 return ret;
455}
456
/*
 * Return the request's status as a positive errno value: the negated result
 * when the result is negative (e.g. EINPROGRESS, ECANCELED), 0 otherwise.
 */
static int qemu_paio_error(struct qemu_paiocb *aiocb)
{
    ssize_t status = qemu_paio_return(aiocb);

    return status < 0 ? -status : 0;
}
468
Kevin Wolf59c7b152009-10-22 17:54:35 +0200469static int posix_aio_process_queue(void *opaque)
aliguori3c529d92008-12-12 16:41:40 +0000470{
Christoph Hellwig9ef91a62009-08-20 16:58:19 +0200471 PosixAioState *s = opaque;
472 struct qemu_paiocb *acb, **pacb;
aliguori3c529d92008-12-12 16:41:40 +0000473 int ret;
Kevin Wolf59c7b152009-10-22 17:54:35 +0200474 int result = 0;
Christoph Hellwig9ef91a62009-08-20 16:58:19 +0200475
476 for(;;) {
477 pacb = &s->first_aio;
478 for(;;) {
479 acb = *pacb;
480 if (!acb)
Kevin Wolf59c7b152009-10-22 17:54:35 +0200481 return result;
Kevin Wolfe5f37642009-10-22 17:54:40 +0200482
Christoph Hellwig9ef91a62009-08-20 16:58:19 +0200483 ret = qemu_paio_error(acb);
484 if (ret == ECANCELED) {
485 /* remove the request */
486 *pacb = acb->next;
487 qemu_aio_release(acb);
Kevin Wolf59c7b152009-10-22 17:54:35 +0200488 result = 1;
Christoph Hellwig9ef91a62009-08-20 16:58:19 +0200489 } else if (ret != EINPROGRESS) {
490 /* end of aio */
491 if (ret == 0) {
492 ret = qemu_paio_return(acb);
493 if (ret == acb->aio_nbytes)
494 ret = 0;
495 else
496 ret = -EINVAL;
497 } else {
498 ret = -ret;
499 }
Stefan Hajnocziddca9fb2011-03-07 08:06:10 +0000500
501 trace_paio_complete(acb, acb->common.opaque, ret);
502
Christoph Hellwig9ef91a62009-08-20 16:58:19 +0200503 /* remove the request */
504 *pacb = acb->next;
505 /* call the callback */
506 acb->common.cb(acb->common.opaque, ret);
507 qemu_aio_release(acb);
Kevin Wolf59c7b152009-10-22 17:54:35 +0200508 result = 1;
Christoph Hellwig9ef91a62009-08-20 16:58:19 +0200509 break;
510 } else {
511 pacb = &acb->next;
512 }
513 }
514 }
Kevin Wolf59c7b152009-10-22 17:54:35 +0200515
516 return result;
517}
518
519static void posix_aio_read(void *opaque)
520{
521 PosixAioState *s = opaque;
522 ssize_t len;
523
524 /* read all bytes from signal pipe */
525 for (;;) {
526 char bytes[16];
527
528 len = read(s->rfd, bytes, sizeof(bytes));
529 if (len == -1 && errno == EINTR)
530 continue; /* try again */
531 if (len == sizeof(bytes))
532 continue; /* more to read */
533 break;
534 }
535
536 posix_aio_process_queue(s);
Christoph Hellwig9ef91a62009-08-20 16:58:19 +0200537}
538
539static int posix_aio_flush(void *opaque)
540{
541 PosixAioState *s = opaque;
542 return !!s->first_aio;
543}
544
545static PosixAioState *posix_aio_state;
546
Frediano Ziglioe1d3b252011-09-19 16:37:13 +0200547static void posix_aio_notify_event(void)
Christoph Hellwig9ef91a62009-08-20 16:58:19 +0200548{
Frediano Ziglioe1d3b252011-09-19 16:37:13 +0200549 char byte = 0;
550 ssize_t ret;
Christoph Hellwig9ef91a62009-08-20 16:58:19 +0200551
Frediano Ziglioe1d3b252011-09-19 16:37:13 +0200552 ret = write(posix_aio_state->wfd, &byte, sizeof(byte));
553 if (ret < 0 && errno != EAGAIN)
554 die("write()");
Christoph Hellwig9ef91a62009-08-20 16:58:19 +0200555}
556
557static void paio_remove(struct qemu_paiocb *acb)
558{
559 struct qemu_paiocb **pacb;
560
561 /* remove the callback from the queue */
562 pacb = &posix_aio_state->first_aio;
563 for(;;) {
564 if (*pacb == NULL) {
565 fprintf(stderr, "paio_remove: aio request not found!\n");
566 break;
567 } else if (*pacb == acb) {
568 *pacb = acb->next;
569 qemu_aio_release(acb);
570 break;
571 }
572 pacb = &(*pacb)->next;
573 }
574}
575
576static void paio_cancel(BlockDriverAIOCB *blockacb)
577{
578 struct qemu_paiocb *acb = (struct qemu_paiocb *)blockacb;
579 int active = 0;
aliguori3c529d92008-12-12 16:41:40 +0000580
Stefan Hajnocziddca9fb2011-03-07 08:06:10 +0000581 trace_paio_cancel(acb, acb->common.opaque);
582
malc8653c012009-02-21 05:48:11 +0000583 mutex_lock(&lock);
Christoph Hellwig9ef91a62009-08-20 16:58:19 +0200584 if (!acb->active) {
Blue Swirl72cf2d42009-09-12 07:36:22 +0000585 QTAILQ_REMOVE(&request_list, acb, node);
Christoph Hellwig9ef91a62009-08-20 16:58:19 +0200586 acb->ret = -ECANCELED;
587 } else if (acb->ret == -EINPROGRESS) {
588 active = 1;
589 }
malc8653c012009-02-21 05:48:11 +0000590 mutex_unlock(&lock);
aliguori3c529d92008-12-12 16:41:40 +0000591
Christoph Hellwig9ef91a62009-08-20 16:58:19 +0200592 if (active) {
593 /* fail safe: if the aio could not be canceled, we wait for
594 it */
595 while (qemu_paio_error(acb) == EINPROGRESS)
596 ;
597 }
598
599 paio_remove(acb);
600}
601
602static AIOPool raw_aio_pool = {
603 .aiocb_size = sizeof(struct qemu_paiocb),
604 .cancel = paio_cancel,
605};
606
Kevin Wolf1e5b9d22009-10-26 13:03:08 +0100607BlockDriverAIOCB *paio_submit(BlockDriverState *bs, int fd,
Christoph Hellwig9ef91a62009-08-20 16:58:19 +0200608 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
609 BlockDriverCompletionFunc *cb, void *opaque, int type)
610{
611 struct qemu_paiocb *acb;
612
613 acb = qemu_aio_get(&raw_aio_pool, bs, cb, opaque);
614 if (!acb)
615 return NULL;
616 acb->aio_type = type;
617 acb->aio_fildes = fd;
Kevin Wolfe5f37642009-10-22 17:54:40 +0200618
Christoph Hellwigb2e12bc2009-09-04 19:01:49 +0200619 if (qiov) {
620 acb->aio_iov = qiov->iov;
621 acb->aio_niov = qiov->niov;
622 }
Christoph Hellwig9ef91a62009-08-20 16:58:19 +0200623 acb->aio_nbytes = nb_sectors * 512;
624 acb->aio_offset = sector_num * 512;
625
626 acb->next = posix_aio_state->first_aio;
627 posix_aio_state->first_aio = acb;
628
Stefan Hajnoczi6d519a52010-05-22 18:15:08 +0100629 trace_paio_submit(acb, opaque, sector_num, nb_sectors, type);
Christoph Hellwig9ef91a62009-08-20 16:58:19 +0200630 qemu_paio_submit(acb);
631 return &acb->common;
632}
633
634BlockDriverAIOCB *paio_ioctl(BlockDriverState *bs, int fd,
635 unsigned long int req, void *buf,
636 BlockDriverCompletionFunc *cb, void *opaque)
637{
638 struct qemu_paiocb *acb;
639
640 acb = qemu_aio_get(&raw_aio_pool, bs, cb, opaque);
641 if (!acb)
642 return NULL;
643 acb->aio_type = QEMU_AIO_IOCTL;
644 acb->aio_fildes = fd;
Christoph Hellwig9ef91a62009-08-20 16:58:19 +0200645 acb->aio_offset = 0;
646 acb->aio_ioctl_buf = buf;
647 acb->aio_ioctl_cmd = req;
648
649 acb->next = posix_aio_state->first_aio;
650 posix_aio_state->first_aio = acb;
651
652 qemu_paio_submit(acb);
653 return &acb->common;
654}
655
Kevin Wolf1e5b9d22009-10-26 13:03:08 +0100656int paio_init(void)
Christoph Hellwig9ef91a62009-08-20 16:58:19 +0200657{
Christoph Hellwig9ef91a62009-08-20 16:58:19 +0200658 PosixAioState *s;
659 int fds[2];
660 int ret;
661
662 if (posix_aio_state)
Kevin Wolf1e5b9d22009-10-26 13:03:08 +0100663 return 0;
Christoph Hellwig9ef91a62009-08-20 16:58:19 +0200664
Anthony Liguori7267c092011-08-20 22:09:37 -0500665 s = g_malloc(sizeof(PosixAioState));
Christoph Hellwig9ef91a62009-08-20 16:58:19 +0200666
Christoph Hellwig9ef91a62009-08-20 16:58:19 +0200667 s->first_aio = NULL;
Kevin Wolf40ff6d72009-12-02 12:24:42 +0100668 if (qemu_pipe(fds) == -1) {
Christoph Hellwig9ef91a62009-08-20 16:58:19 +0200669 fprintf(stderr, "failed to create pipe\n");
Kevin Wolf1e5b9d22009-10-26 13:03:08 +0100670 return -1;
Christoph Hellwig9ef91a62009-08-20 16:58:19 +0200671 }
672
673 s->rfd = fds[0];
674 s->wfd = fds[1];
675
676 fcntl(s->rfd, F_SETFL, O_NONBLOCK);
677 fcntl(s->wfd, F_SETFL, O_NONBLOCK);
678
Kevin Wolf8febfa22009-10-22 17:54:36 +0200679 qemu_aio_set_fd_handler(s->rfd, posix_aio_read, NULL, posix_aio_flush,
680 posix_aio_process_queue, s);
Christoph Hellwig9ef91a62009-08-20 16:58:19 +0200681
682 ret = pthread_attr_init(&attr);
683 if (ret)
684 die2(ret, "pthread_attr_init");
685
686 ret = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
687 if (ret)
688 die2(ret, "pthread_attr_setdetachstate");
689
Blue Swirl72cf2d42009-09-12 07:36:22 +0000690 QTAILQ_INIT(&request_list);
Avi Kivitye4ea78e2011-08-14 07:04:49 +0300691 new_thread_bh = qemu_bh_new(spawn_thread_bh_fn, NULL);
Christoph Hellwig9ef91a62009-08-20 16:58:19 +0200692
693 posix_aio_state = s;
Kevin Wolf1e5b9d22009-10-26 13:03:08 +0100694 return 0;
aliguori3c529d92008-12-12 16:41:40 +0000695}