/*
 * Linux native AIO support.
 *
 * Copyright (C) 2009 IBM, Corp.
 * Copyright (C) 2009 Red Hat, Inc.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */
#include "qemu/osdep.h"
#include "block/aio.h"
#include "qemu/queue.h"
#include "block/block.h"
#include "block/raw-aio.h"
#include "qemu/event_notifier.h"
#include "qemu/coroutine.h"
#include "qapi/error.h"

#include <libaio.h>

/*
 * Queue size (per-device).
 *
 * XXX: eventually we need to communicate this to the guest and/or make it
 * tunable by the guest.  If we get more outstanding requests at a time
 * than this, we will get EAGAIN from io_submit(), which is communicated
 * to the guest as an I/O error.
 */
#define MAX_EVENTS 1024

/* Maximum number of requests in a batch (default value) */
#define DEFAULT_MAX_BATCH 32

struct qemu_laiocb {
    Coroutine *co;
    LinuxAioState *ctx;
    struct iocb iocb;
    ssize_t ret;
    size_t nbytes;
    QEMUIOVector *qiov;
    bool is_read;
    QSIMPLEQ_ENTRY(qemu_laiocb) next;
};

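/*
 * State of the deferred-submission queue.
 *
 * @plugged: nesting count of laio_io_plug() calls; while non-zero,
 *     submission is deferred so requests can be batched.
 * @in_queue: requests sitting on @pending, not yet passed to io_submit().
 * @in_flight: requests the kernel has accepted but not yet completed.
 * @blocked: set when ioq_submit() could not drain the whole queue
 *     (EAGAIN or the MAX_EVENTS cap); recomputed on every submission.
 */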
typedef struct {
    int plugged;
    unsigned int in_queue;
    unsigned int in_flight;
    bool blocked;
    QSIMPLEQ_HEAD(, qemu_laiocb) pending;
} LaioQueue;

struct LinuxAioState {
    AioContext *aio_context;

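    /* Kernel AIO context; request completions signal the eventfd wrapped
     * by @e (wired up via io_set_eventfd() in laio_do_submit()) */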
    io_context_t ctx;
    EventNotifier e;

    /* I/O queue for batched submission.  Protected by AioContext lock. */
    LaioQueue io_q;

    /* I/O completion processing.  Only runs in the I/O thread. */
    QEMUBH *completion_bh;
    int event_idx;
    int event_max;
};

static void ioq_submit(LinuxAioState *s);

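/*
 * Fold the two result words of a completed io_event into a single
 * ssize_t: a byte count on success or a negative errno on failure.
 */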
static inline ssize_t io_event_ret(struct io_event *ev)
{
    return (ssize_t)(((uint64_t)ev->res2 << 32) | ev->res);
}

/*
 * Completes an AIO request.
 */
static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
{
    int ret;

    ret = laiocb->ret;
    if (ret != -ECANCELED) {
        if (ret == laiocb->nbytes) {
            ret = 0;
        } else if (ret >= 0) {
            /* Short reads mean EOF, pad with zeros. */
            if (laiocb->is_read) {
                qemu_iovec_memset(laiocb->qiov, ret, 0,
                                  laiocb->qiov->size - ret);
            } else {
                ret = -ENOSPC;
            }
        }
    }

    laiocb->ret = ret;

    /*
     * If the coroutine is already entered it must be in ioq_submit() and
     * will notice laiocb->ret has been filled in when it eventually runs
     * later.  Coroutines cannot be entered recursively so avoid doing
     * that!
     */
    if (!qemu_coroutine_entered(laiocb->co)) {
        aio_co_wake(laiocb->co);
    }
}

/**
 * aio_ring buffer, which is shared between userspace and kernel.
 *
 * This is copied from linux/fs/aio.c; there is no common header, but
 * AIO has existed for ages so we assume the ABI is stable.
 */
struct aio_ring {
    unsigned id;    /* kernel internal index number */
    unsigned nr;    /* number of io_events */
    unsigned head;  /* Written to by userland or by kernel. */
    unsigned tail;

    unsigned magic;
    unsigned compat_features;
    unsigned incompat_features;
    unsigned header_length; /* size of aio_ring */

    struct io_event io_events[];
};

/**
 * io_getevents_peek:
 * @ctx: AIO context
 * @events: pointer to the events array (output value)
 *
 * Returns the number of completed events and sets a pointer
 * to the events array.  This function does not update the internal
 * ring buffer; it only reads head and tail.  When @events has been
 * processed, io_getevents_commit() must be called.
 */
static inline unsigned int io_getevents_peek(io_context_t ctx,
                                             struct io_event **events)
{
    struct aio_ring *ring = (struct aio_ring *)ctx;
    unsigned int head = ring->head, tail = ring->tail;
    unsigned int nr;

    nr = tail >= head ? tail - head : ring->nr - head;
    *events = ring->io_events + head;
    /*
     * To avoid speculative loads of s->events[i] before observing tail.
     * Paired with smp_wmb() inside linux/fs/aio.c: aio_complete().
     */
    smp_rmb();

    return nr;
}

/**
 * io_getevents_commit:
 * @ctx: AIO context
 * @nr: the number of events by which head should be advanced
 *
 * Advances the head of the ring buffer.
 */
static inline void io_getevents_commit(io_context_t ctx, unsigned int nr)
{
    struct aio_ring *ring = (struct aio_ring *)ctx;

    if (nr) {
        ring->head = (ring->head + nr) % ring->nr;
    }
}

/**
 * io_getevents_advance_and_peek:
 * @ctx: AIO context
 * @events: pointer to the events array (output value)
 * @nr: the number of events by which head should be advanced
 *
 * Advances the head of the ring buffer and returns the number of
 * elements left to process.
 */
static inline unsigned int
io_getevents_advance_and_peek(io_context_t ctx,
                              struct io_event **events,
                              unsigned int nr)
{
    io_getevents_commit(ctx, nr);
    return io_getevents_peek(ctx, events);
}

/**
 * qemu_laio_process_completions:
 * @s: AIO state
 *
 * Fetches completed I/O requests and invokes their callbacks.
 *
 * The function is somewhat tricky because it supports nested event loops,
 * for example when a request callback invokes aio_poll().  In order to do
 * this, indices are kept in LinuxAioState.  The function schedules the
 * completion BH so it can be called again in a nested event loop.  When
 * there are no events left to complete, the BH is canceled.
 */
static void qemu_laio_process_completions(LinuxAioState *s)
{
    struct io_event *events;

    /* Reschedule so nested event loops see currently pending completions */
    qemu_bh_schedule(s->completion_bh);

    while ((s->event_max = io_getevents_advance_and_peek(s->ctx, &events,
                                                         s->event_idx))) {
        for (s->event_idx = 0; s->event_idx < s->event_max; ) {
            struct iocb *iocb = events[s->event_idx].obj;
            struct qemu_laiocb *laiocb =
                container_of(iocb, struct qemu_laiocb, iocb);

            laiocb->ret = io_event_ret(&events[s->event_idx]);

            /* Change counters one-by-one because we can be nested. */
            s->io_q.in_flight--;
            s->event_idx++;
            qemu_laio_process_completion(laiocb);
        }
    }

    qemu_bh_cancel(s->completion_bh);

    /*
     * If we are nested, we have to notify the level above that we are done
     * by setting event_max to zero; the upper level will then jump out of
     * its own `for` loop.  If we are the last level, all counters have
     * dropped to zero.
     */
    s->event_max = 0;
    s->event_idx = 0;
}

static void qemu_laio_process_completions_and_submit(LinuxAioState *s)
{
    aio_context_acquire(s->aio_context);
    qemu_laio_process_completions(s);

    if (!s->io_q.plugged && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
        ioq_submit(s);
    }
    aio_context_release(s->aio_context);
}

static void qemu_laio_completion_bh(void *opaque)
{
    LinuxAioState *s = opaque;

    qemu_laio_process_completions_and_submit(s);
}

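/* Called by the event loop when the completion event notifier is signalled */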
static void qemu_laio_completion_cb(EventNotifier *e)
{
    LinuxAioState *s = container_of(e, LinuxAioState, e);

    if (event_notifier_test_and_clear(&s->e)) {
        qemu_laio_process_completions_and_submit(s);
    }
}

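/*
 * Polling-mode callback: report whether completed events are sitting in
 * the completion ring, without consuming them; qemu_laio_poll_ready()
 * then does the actual processing.
 */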
static bool qemu_laio_poll_cb(void *opaque)
{
    EventNotifier *e = opaque;
    LinuxAioState *s = container_of(e, LinuxAioState, e);
    struct io_event *events;

    return io_getevents_peek(s->ctx, &events);
}

static void qemu_laio_poll_ready(EventNotifier *opaque)
{
    EventNotifier *e = opaque;
    LinuxAioState *s = container_of(e, LinuxAioState, e);

    qemu_laio_process_completions_and_submit(s);
}

static void ioq_init(LaioQueue *io_q)
{
    QSIMPLEQ_INIT(&io_q->pending);
    io_q->plugged = 0;
    io_q->in_queue = 0;
    io_q->in_flight = 0;
    io_q->blocked = false;
}

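/*
 * Push pending requests to io_submit() in batches, keeping in_flight at
 * or below MAX_EVENTS.  On EAGAIN the remainder stays queued and the
 * queue is marked blocked; on any other error the first request is
 * completed with that error and the rest are retried.
 */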
static void ioq_submit(LinuxAioState *s)
{
    int ret, len;
    struct qemu_laiocb *aiocb;
    struct iocb *iocbs[MAX_EVENTS];
    QSIMPLEQ_HEAD(, qemu_laiocb) completed;

    do {
        if (s->io_q.in_flight >= MAX_EVENTS) {
            break;
        }
        len = 0;
        QSIMPLEQ_FOREACH(aiocb, &s->io_q.pending, next) {
            iocbs[len++] = &aiocb->iocb;
            if (s->io_q.in_flight + len >= MAX_EVENTS) {
                break;
            }
        }

        ret = io_submit(s->ctx, len, iocbs);
        if (ret == -EAGAIN) {
            break;
        }
        if (ret < 0) {
            /* Fail the first request, retry the rest */
            aiocb = QSIMPLEQ_FIRST(&s->io_q.pending);
            QSIMPLEQ_REMOVE_HEAD(&s->io_q.pending, next);
            s->io_q.in_queue--;
            aiocb->ret = ret;
            qemu_laio_process_completion(aiocb);
            continue;
        }

        s->io_q.in_flight += ret;
        s->io_q.in_queue -= ret;
        aiocb = container_of(iocbs[ret - 1], struct qemu_laiocb, iocb);
        QSIMPLEQ_SPLIT_AFTER(&s->io_q.pending, aiocb, next, &completed);
    } while (ret == len && !QSIMPLEQ_EMPTY(&s->io_q.pending));
    s->io_q.blocked = (s->io_q.in_queue > 0);

    if (s->io_q.in_flight) {
        /*
         * If there are requests still in flight, we can try to complete
         * some right away.
         */
        qemu_laio_process_completions(s);
        /*
         * Even if we have completed everything (in_flight == 0), the queue
         * can still have pending requests (in_queue > 0).  We do not
         * attempt to repeat submission to avoid an I/O hang.  The reason
         * is simple: s->e is still set and the completion callback will be
         * called shortly, and all pending requests will be submitted from
         * there.
         */
    }
}

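/*
 * Compute the current batch limit: the AioContext's aio_max_batch (or
 * DEFAULT_MAX_BATCH when unset), capped by @dev_max_batch (0 means no
 * device-specific limit) and by the number of free event slots.
 */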
static uint64_t laio_max_batch(LinuxAioState *s, uint64_t dev_max_batch)
{
    uint64_t max_batch = s->aio_context->aio_max_batch ?: DEFAULT_MAX_BATCH;

    /*
     * AIO context can be shared between multiple block devices, so
     * `dev_max_batch` allows reducing the batch size for latency-sensitive
     * devices.
     */
    max_batch = MIN_NON_ZERO(dev_max_batch, max_batch);

    /* limit the batch to the number of available events */
    max_batch = MIN_NON_ZERO(MAX_EVENTS - s->io_q.in_flight, max_batch);

    return max_batch;
}

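/*
 * Plug/unplug act as a nesting counter: while plugged, new requests
 * accumulate in the pending queue and are only submitted when the batch
 * limit is reached or the outermost unplug arrives.
 */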
void laio_io_plug(BlockDriverState *bs, LinuxAioState *s)
{
    s->io_q.plugged++;
}

void laio_io_unplug(BlockDriverState *bs, LinuxAioState *s,
                    uint64_t dev_max_batch)
{
    assert(s->io_q.plugged);
    s->io_q.plugged--;

    /*
     * Why max batch checking is performed here:
     * Another BDS may have queued requests with a higher dev_max_batch and
     * therefore in_queue could now exceed our dev_max_batch.  Re-check the
     * max batch so we can honor our device's dev_max_batch.
     */
    if (s->io_q.in_queue >= laio_max_batch(s, dev_max_batch) ||
        (!s->io_q.plugged &&
         !s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending))) {
        ioq_submit(s);
    }
}

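/*
 * Prepare the iocb for @laiocb, queue it, and submit the queue right away
 * unless the queue is blocked or batching is still collecting requests.
 * Returns 0 on success or -EIO for an unsupported request type.
 */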
static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
                          int type, uint64_t dev_max_batch)
{
    LinuxAioState *s = laiocb->ctx;
    struct iocb *iocbs = &laiocb->iocb;
    QEMUIOVector *qiov = laiocb->qiov;

    switch (type) {
    case QEMU_AIO_WRITE:
        io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
        break;
    case QEMU_AIO_READ:
        io_prep_preadv(iocbs, fd, qiov->iov, qiov->niov, offset);
        break;
    /* Currently the Linux kernel does not support other operations */
    default:
        fprintf(stderr, "%s: invalid AIO request type 0x%x.\n",
                __func__, type);
        return -EIO;
    }
    io_set_eventfd(&laiocb->iocb, event_notifier_get_fd(&s->e));

    QSIMPLEQ_INSERT_TAIL(&s->io_q.pending, laiocb, next);
    s->io_q.in_queue++;
    if (!s->io_q.blocked &&
        (!s->io_q.plugged ||
         s->io_q.in_queue >= laio_max_batch(s, dev_max_batch))) {
        ioq_submit(s);
    }

    return 0;
}

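/*
 * Submit a request from coroutine context and yield until it completes;
 * returns 0 on success or a negative errno.
 *
 * A minimal usage sketch from a driver coroutine (the names `aio`, `fd`,
 * `offset` and `qiov` are illustrative, not taken from any particular
 * caller; passing 0 for dev_max_batch means no device-specific limit):
 *
 *     ret = laio_co_submit(bs, aio, fd, offset, &qiov, QEMU_AIO_READ, 0);
 *     if (ret < 0) {
 *         return ret;
 *     }
 */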
int coroutine_fn laio_co_submit(BlockDriverState *bs, LinuxAioState *s, int fd,
                                uint64_t offset, QEMUIOVector *qiov, int type,
                                uint64_t dev_max_batch)
{
    int ret;
    struct qemu_laiocb laiocb = {
        .co = qemu_coroutine_self(),
        .nbytes = qiov->size,
        .ctx = s,
        .ret = -EINPROGRESS,
        .is_read = (type == QEMU_AIO_READ),
        .qiov = qiov,
    };

    ret = laio_do_submit(fd, &laiocb, offset, type, dev_max_batch);
    if (ret < 0) {
        return ret;
    }

    if (laiocb.ret == -EINPROGRESS) {
        qemu_coroutine_yield();
    }
    return laiocb.ret;
}

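/* Unregister the completion event notifier and BH from @old_context */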
void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context)
{
    aio_set_event_notifier(old_context, &s->e, false, NULL, NULL, NULL);
    qemu_bh_delete(s->completion_bh);
    s->aio_context = NULL;
}

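/*
 * Register the completion BH and the event notifier, including its
 * polling callbacks, with @new_context.
 */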
void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context)
{
    s->aio_context = new_context;
    s->completion_bh = aio_bh_new(new_context, qemu_laio_completion_bh, s);
    aio_set_event_notifier(new_context, &s->e, false,
                           qemu_laio_completion_cb,
                           qemu_laio_poll_cb,
                           qemu_laio_poll_ready);
}

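/*
 * Allocate a LinuxAioState with its completion event notifier and a
 * kernel AIO context sized for MAX_EVENTS in-flight requests.  Returns
 * NULL and sets @errp on failure.
 */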
LinuxAioState *laio_init(Error **errp)
{
    int rc;
    LinuxAioState *s;

    s = g_malloc0(sizeof(*s));
    rc = event_notifier_init(&s->e, false);
    if (rc < 0) {
        error_setg_errno(errp, -rc, "failed to initialize event notifier");
        goto out_free_state;
    }

    rc = io_setup(MAX_EVENTS, &s->ctx);
    if (rc < 0) {
        error_setg_errno(errp, -rc, "failed to create linux AIO context");
        goto out_close_efd;
    }

    ioq_init(&s->io_q);

    return s;

out_close_efd:
    event_notifier_cleanup(&s->e);
out_free_state:
    g_free(s);
    return NULL;
}

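/* Tear down the event notifier and kernel AIO context, then free @s */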
void laio_cleanup(LinuxAioState *s)
{
    event_notifier_cleanup(&s->e);

    if (io_destroy(s->ctx) != 0) {
        fprintf(stderr, "%s: destroy AIO context %p failed\n",
                __func__, &s->ctx);
    }
    g_free(s);
}