block/io_uring.c - qemu - Git at Google

 /*
  * Linux io_uring support.
  *
  * Copyright (C) 2009 IBM, Corp.
  * Copyright (C) 2009 Red Hat, Inc.
  * Copyright (C) 2019 Aarushi Mehta
  *
  * This work is licensed under the terms of the GNU GPL, version 2 or later.
  * See the COPYING file in the top-level directory.
  */
 #include "qemu/osdep.h"
 #include <liburing.h>
 #include "block/aio.h"
 #include "qemu/queue.h"
 #include "block/block.h"
 #include "block/raw-aio.h"
 #include "qemu/coroutine.h"
 #include "qemu/defer-call.h"
 #include "qapi/error.h"
 #include "sysemu/block-backend.h"
 #include "trace.h"

 /* Only used for assertions.  */
 #include "qemu/coroutine_int.h"

 /* io_uring ring size */
 #define MAX_ENTRIES 128

 typedef struct LuringAIOCB {
     Coroutine *co;
     struct io_uring_sqe sqeq;
     ssize_t ret;
     QEMUIOVector *qiov;
     bool is_read;
     QSIMPLEQ_ENTRY(LuringAIOCB) next;

     /*
      * Buffered reads may require resubmission, see
      * luring_resubmit_short_read().
      */
     int total_read;
     QEMUIOVector resubmit_qiov;
 } LuringAIOCB;

 typedef struct LuringQueue {
     unsigned int in_queue;
     unsigned int in_flight;
     bool blocked;
     QSIMPLEQ_HEAD(, LuringAIOCB) submit_queue;
 } LuringQueue;

 struct LuringState {
     AioContext *aio_context;

     struct io_uring ring;

     /* No locking required, only accessed from AioContext home thread */
     LuringQueue io_q;

     QEMUBH *completion_bh;
 };

 /**
  * luring_resubmit:
  *
  * Resubmit a request by appending it to submit_queue.  The caller must ensure
  * that ioq_submit() is called later so that submit_queue requests are started.
  */
 static void luring_resubmit(LuringState *s, LuringAIOCB *luringcb)
 {
     QSIMPLEQ_INSERT_TAIL(&s->io_q.submit_queue, luringcb, next);
     s->io_q.in_queue++;
 }

 /**
  * luring_resubmit_short_read:
  *
  * Short reads are rare but may occur. The remaining read request needs to be
  * resubmitted.
  */
 static void luring_resubmit_short_read(LuringState *s, LuringAIOCB *luringcb,
                                        int nread)
 {
     QEMUIOVector *resubmit_qiov;
     size_t remaining;

     trace_luring_resubmit_short_read(s, luringcb, nread);

     /* Update read position */
     luringcb->total_read += nread;
     remaining = luringcb->qiov->size - luringcb->total_read;

     /* Shorten qiov */
     resubmit_qiov = &luringcb->resubmit_qiov;
     if (resubmit_qiov->iov == NULL) {
         qemu_iovec_init(resubmit_qiov, luringcb->qiov->niov);
     } else {
         qemu_iovec_reset(resubmit_qiov);
     }
     qemu_iovec_concat(resubmit_qiov, luringcb->qiov, luringcb->total_read,
                       remaining);

     /* Update sqe */
     luringcb->sqeq.off += nread;
     luringcb->sqeq.addr = (uintptr_t)luringcb->resubmit_qiov.iov;
     luringcb->sqeq.len = luringcb->resubmit_qiov.niov;

     luring_resubmit(s, luringcb);
 }

 /**
  * luring_process_completions:
  * @s: AIO state
  *
  * Fetches completed I/O requests, consumes cqes and invokes their callbacks
  * The function is somewhat tricky because it supports nested event loops, for
  * example when a request callback invokes aio_poll().
  *
  * Function schedules BH completion so it  can be called again in a nested
  * event loop.  When there are no events left  to complete the BH is being
  * canceled.
  *
  */
 static void luring_process_completions(LuringState *s)
 {
     struct io_uring_cqe *cqes;
     int total_bytes;

     defer_call_begin();

     /*
      * Request completion callbacks can run the nested event loop.
      * Schedule ourselves so the nested event loop will "see" remaining
      * completed requests and process them.  Without this, completion
      * callbacks that wait for other requests using a nested event loop
      * would hang forever.
      *
      * This workaround is needed because io_uring uses poll_wait, which
      * is woken up when new events are added to the uring, thus polling on
      * the same uring fd will block unless more events are received.
      *
      * Other leaf block drivers (drivers that access the data themselves)
      * are networking based, so they poll sockets for data and run the
      * correct coroutine.
      */
     qemu_bh_schedule(s->completion_bh);

     while (io_uring_peek_cqe(&s->ring, &cqes) == 0) {
         LuringAIOCB *luringcb;
         int ret;

         if (!cqes) {
             break;
         }

         luringcb = io_uring_cqe_get_data(cqes);
         ret = cqes->res;
         io_uring_cqe_seen(&s->ring, cqes);
         cqes = NULL;

         /* Change counters one-by-one because we can be nested. */
         s->io_q.in_flight--;
         trace_luring_process_completion(s, luringcb, ret);

         /* total_read is non-zero only for resubmitted read requests */
         total_bytes = ret + luringcb->total_read;

         if (ret < 0) {
             /*
              * Only writev/readv/fsync requests on regular files or host block
              * devices are submitted. Therefore -EAGAIN is not expected but it's
              * known to happen sometimes with Linux SCSI. Submit again and hope
              * the request completes successfully.
              *
              * For more information, see:
              * https://lore.kernel.org/io-uring/20210727165811.284510-3-axboe@kernel.dk/T/#u
              *
              * If the code is changed to submit other types of requests in the
              * future, then this workaround may need to be extended to deal with
              * genuine -EAGAIN results that should not be resubmitted
              * immediately.
              */
             if (ret == -EINTR || ret == -EAGAIN) {
                 luring_resubmit(s, luringcb);
                 continue;
             }
         } else if (!luringcb->qiov) {
             goto end;
         } else if (total_bytes == luringcb->qiov->size) {
             ret = 0;
         /* Only read/write */
         } else {
             /* Short Read/Write */
             if (luringcb->is_read) {
                 if (ret > 0) {
                     luring_resubmit_short_read(s, luringcb, ret);
                     continue;
                 } else {
                     /* Pad with zeroes */
                     qemu_iovec_memset(luringcb->qiov, total_bytes, 0,
                                       luringcb->qiov->size - total_bytes);
                     ret = 0;
                 }
             } else {
                 ret = -ENOSPC;
             }
         }
 end:
         luringcb->ret = ret;
         qemu_iovec_destroy(&luringcb->resubmit_qiov);

         /*
          * If the coroutine is already entered it must be in ioq_submit()
          * and will notice luringcb->ret has been filled in when it
          * eventually runs later. Coroutines cannot be entered recursively
          * so avoid doing that!
          */
         assert(luringcb->co->ctx == s->aio_context);
         if (!qemu_coroutine_entered(luringcb->co)) {
             aio_co_wake(luringcb->co);
         }
     }

     qemu_bh_cancel(s->completion_bh);

     defer_call_end();
 }

 static int ioq_submit(LuringState *s)
 {
     int ret = 0;
     LuringAIOCB *luringcb, *luringcb_next;

     while (s->io_q.in_queue > 0) {
         /*
          * Try to fetch sqes from the ring for requests waiting in
          * the overflow queue
          */
         QSIMPLEQ_FOREACH_SAFE(luringcb, &s->io_q.submit_queue, next,
                               luringcb_next) {
             struct io_uring_sqe *sqes = io_uring_get_sqe(&s->ring);
             if (!sqes) {
                 break;
             }
             /* Prep sqe for submission */
             *sqes = luringcb->sqeq;
             QSIMPLEQ_REMOVE_HEAD(&s->io_q.submit_queue, next);
         }
         ret = io_uring_submit(&s->ring);
         trace_luring_io_uring_submit(s, ret);
         /* Prevent infinite loop if submission is refused */
         if (ret <= 0) {
             if (ret == -EAGAIN || ret == -EINTR) {
                 continue;
             }
             break;
         }
         s->io_q.in_flight += ret;
         s->io_q.in_queue  -= ret;
     }
     s->io_q.blocked = (s->io_q.in_queue > 0);

     if (s->io_q.in_flight) {
         /*
          * We can try to complete something just right away if there are
          * still requests in-flight.
          */
         luring_process_completions(s);
     }
     return ret;
 }

 static void luring_process_completions_and_submit(LuringState *s)
 {
     luring_process_completions(s);

     if (s->io_q.in_queue > 0) {
         ioq_submit(s);
     }
 }

 static void qemu_luring_completion_bh(void *opaque)
 {
     LuringState *s = opaque;
     luring_process_completions_and_submit(s);
 }

 static void qemu_luring_completion_cb(void *opaque)
 {
     LuringState *s = opaque;
     luring_process_completions_and_submit(s);
 }

 static bool qemu_luring_poll_cb(void *opaque)
 {
     LuringState *s = opaque;

     return io_uring_cq_ready(&s->ring);
 }

 static void qemu_luring_poll_ready(void *opaque)
 {
     LuringState *s = opaque;

     luring_process_completions_and_submit(s);
 }

 static void ioq_init(LuringQueue *io_q)
 {
     QSIMPLEQ_INIT(&io_q->submit_queue);
     io_q->in_queue = 0;
     io_q->in_flight = 0;
     io_q->blocked = false;
 }

 static void luring_deferred_fn(void *opaque)
 {
     LuringState *s = opaque;
     trace_luring_unplug_fn(s, s->io_q.blocked, s->io_q.in_queue,
                            s->io_q.in_flight);
     if (!s->io_q.blocked && s->io_q.in_queue > 0) {
         ioq_submit(s);
     }
 }

 /**
  * luring_do_submit:
  * @fd: file descriptor for I/O
  * @luringcb: AIO control block
  * @s: AIO state
  * @offset: offset for request
  * @type: type of request
  *
  * Fetches sqes from ring, adds to pending queue and preps them
  *
  */
 static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s,
                             uint64_t offset, int type)
 {
     int ret;
     struct io_uring_sqe *sqes = &luringcb->sqeq;

     switch (type) {
     case QEMU_AIO_WRITE:
         io_uring_prep_writev(sqes, fd, luringcb->qiov->iov,
                              luringcb->qiov->niov, offset);
         break;
     case QEMU_AIO_ZONE_APPEND:
         io_uring_prep_writev(sqes, fd, luringcb->qiov->iov,
                              luringcb->qiov->niov, offset);
         break;
     case QEMU_AIO_READ:
         io_uring_prep_readv(sqes, fd, luringcb->qiov->iov,
                             luringcb->qiov->niov, offset);
         break;
     case QEMU_AIO_FLUSH:
         io_uring_prep_fsync(sqes, fd, IORING_FSYNC_DATASYNC);
         break;
     default:
         fprintf(stderr, "%s: invalid AIO request type, aborting 0x%x.\n",
                         __func__, type);
         abort();
     }
     io_uring_sqe_set_data(sqes, luringcb);

     QSIMPLEQ_INSERT_TAIL(&s->io_q.submit_queue, luringcb, next);
     s->io_q.in_queue++;
     trace_luring_do_submit(s, s->io_q.blocked, s->io_q.in_queue,
                            s->io_q.in_flight);
     if (!s->io_q.blocked) {
         if (s->io_q.in_flight + s->io_q.in_queue >= MAX_ENTRIES) {
             ret = ioq_submit(s);
             trace_luring_do_submit_done(s, ret);
             return ret;
         }

         defer_call(luring_deferred_fn, s);
     }
     return 0;
 }

 int coroutine_fn luring_co_submit(BlockDriverState *bs, int fd, uint64_t offset,
                                   QEMUIOVector *qiov, int type)
 {
     int ret;
     AioContext *ctx = qemu_get_current_aio_context();
     LuringState *s = aio_get_linux_io_uring(ctx);
     LuringAIOCB luringcb = {
         .co         = qemu_coroutine_self(),
         .ret        = -EINPROGRESS,
         .qiov       = qiov,
         .is_read    = (type == QEMU_AIO_READ),
     };
     trace_luring_co_submit(bs, s, &luringcb, fd, offset, qiov ? qiov->size : 0,
                            type);
     ret = luring_do_submit(fd, &luringcb, s, offset, type);

     if (ret < 0) {
         return ret;
     }

     if (luringcb.ret == -EINPROGRESS) {
         qemu_coroutine_yield();
     }
     return luringcb.ret;
 }

 void luring_detach_aio_context(LuringState *s, AioContext *old_context)
 {
     aio_set_fd_handler(old_context, s->ring.ring_fd,
                        NULL, NULL, NULL, NULL, s);
     qemu_bh_delete(s->completion_bh);
     s->aio_context = NULL;
 }

 void luring_attach_aio_context(LuringState *s, AioContext *new_context)
 {
     s->aio_context = new_context;
     s->completion_bh = aio_bh_new(new_context, qemu_luring_completion_bh, s);
     aio_set_fd_handler(s->aio_context, s->ring.ring_fd,
                        qemu_luring_completion_cb, NULL,
                        qemu_luring_poll_cb, qemu_luring_poll_ready, s);
 }

 LuringState *luring_init(Error **errp)
 {
     int rc;
     LuringState *s = g_new0(LuringState, 1);
     struct io_uring *ring = &s->ring;

     trace_luring_init_state(s, sizeof(*s));

     rc = io_uring_queue_init(MAX_ENTRIES, ring, 0);
     if (rc < 0) {
         error_setg_errno(errp, errno, "failed to init linux io_uring ring");
         g_free(s);
         return NULL;
     }

     ioq_init(&s->io_q);
     return s;

 }

 void luring_cleanup(LuringState *s)
 {
     io_uring_queue_exit(&s->ring);
     trace_luring_cleanup_state(s);
     g_free(s);
 }
	/*
	* Linux io_uring support.
	*
	* Copyright (C) 2009 IBM, Corp.
	* Copyright (C) 2009 Red Hat, Inc.
	* Copyright (C) 2019 Aarushi Mehta
	*
	* This work is licensed under the terms of the GNU GPL, version 2 or later.
	* See the COPYING file in the top-level directory.
	*/
	#include "qemu/osdep.h"
	#include <liburing.h>
	#include "block/aio.h"
	#include "qemu/queue.h"
	#include "block/block.h"
	#include "block/raw-aio.h"
	#include "qemu/coroutine.h"
	#include "qemu/defer-call.h"
	#include "qapi/error.h"
	#include "sysemu/block-backend.h"
	#include "trace.h"

	/* Only used for assertions. */
	#include "qemu/coroutine_int.h"

	/* io_uring ring size */
	#define MAX_ENTRIES 128

	typedef struct LuringAIOCB {
	Coroutine *co;
	struct io_uring_sqe sqeq;
	ssize_t ret;
	QEMUIOVector *qiov;
	bool is_read;
	QSIMPLEQ_ENTRY(LuringAIOCB) next;

	/*
	* Buffered reads may require resubmission, see
	* luring_resubmit_short_read().
	*/
	int total_read;
	QEMUIOVector resubmit_qiov;
	} LuringAIOCB;

	typedef struct LuringQueue {
	unsigned int in_queue;
	unsigned int in_flight;
	bool blocked;
	QSIMPLEQ_HEAD(, LuringAIOCB) submit_queue;
	} LuringQueue;

	struct LuringState {
	AioContext *aio_context;

	struct io_uring ring;

	/* No locking required, only accessed from AioContext home thread */
	LuringQueue io_q;

	QEMUBH *completion_bh;
	};

	/**
	* luring_resubmit:
	*
	* Resubmit a request by appending it to submit_queue. The caller must ensure
	* that ioq_submit() is called later so that submit_queue requests are started.
	*/
	static void luring_resubmit(LuringState s, LuringAIOCB luringcb)
	{
	QSIMPLEQ_INSERT_TAIL(&s->io_q.submit_queue, luringcb, next);
	s->io_q.in_queue++;
	}

	/**
	* luring_resubmit_short_read:
	*
	* Short reads are rare but may occur. The remaining read request needs to be
	* resubmitted.
	*/
	static void luring_resubmit_short_read(LuringState s, LuringAIOCB luringcb,
	int nread)
	{
	QEMUIOVector *resubmit_qiov;
	size_t remaining;

	trace_luring_resubmit_short_read(s, luringcb, nread);

	/* Update read position */
	luringcb->total_read += nread;
	remaining = luringcb->qiov->size - luringcb->total_read;

	/* Shorten qiov */
	resubmit_qiov = &luringcb->resubmit_qiov;
	if (resubmit_qiov->iov == NULL) {
	qemu_iovec_init(resubmit_qiov, luringcb->qiov->niov);
	} else {
	qemu_iovec_reset(resubmit_qiov);
	}
	qemu_iovec_concat(resubmit_qiov, luringcb->qiov, luringcb->total_read,
	remaining);

	/* Update sqe */
	luringcb->sqeq.off += nread;
	luringcb->sqeq.addr = (uintptr_t)luringcb->resubmit_qiov.iov;
	luringcb->sqeq.len = luringcb->resubmit_qiov.niov;

	luring_resubmit(s, luringcb);
	}

	/**
	* luring_process_completions:
	* @s: AIO state
	*
	* Fetches completed I/O requests, consumes cqes and invokes their callbacks
	* The function is somewhat tricky because it supports nested event loops, for
	* example when a request callback invokes aio_poll().
	*
	* Function schedules BH completion so it can be called again in a nested
	* event loop. When there are no events left to complete the BH is being
	* canceled.
	*
	*/
	static void luring_process_completions(LuringState *s)
	{
	struct io_uring_cqe *cqes;
	int total_bytes;

	defer_call_begin();

	/*
	* Request completion callbacks can run the nested event loop.
	* Schedule ourselves so the nested event loop will "see" remaining
	* completed requests and process them. Without this, completion
	* callbacks that wait for other requests using a nested event loop
	* would hang forever.
	*
	* This workaround is needed because io_uring uses poll_wait, which
	* is woken up when new events are added to the uring, thus polling on
	* the same uring fd will block unless more events are received.
	*
	* Other leaf block drivers (drivers that access the data themselves)
	* are networking based, so they poll sockets for data and run the
	* correct coroutine.
	*/
	qemu_bh_schedule(s->completion_bh);

	while (io_uring_peek_cqe(&s->ring, &cqes) == 0) {
	LuringAIOCB *luringcb;
	int ret;

	if (!cqes) {
	break;
	}

	luringcb = io_uring_cqe_get_data(cqes);
	ret = cqes->res;
	io_uring_cqe_seen(&s->ring, cqes);
	cqes = NULL;

	/* Change counters one-by-one because we can be nested. */
	s->io_q.in_flight--;
	trace_luring_process_completion(s, luringcb, ret);

	/* total_read is non-zero only for resubmitted read requests */
	total_bytes = ret + luringcb->total_read;

	if (ret < 0) {
	/*
	* Only writev/readv/fsync requests on regular files or host block
	* devices are submitted. Therefore -EAGAIN is not expected but it's
	* known to happen sometimes with Linux SCSI. Submit again and hope
	* the request completes successfully.
	*
	* For more information, see:
	* https://lore.kernel.org/io-uring/20210727165811.284510-3-axboe@kernel.dk/T/#u
	*
	* If the code is changed to submit other types of requests in the
	* future, then this workaround may need to be extended to deal with
	* genuine -EAGAIN results that should not be resubmitted
	* immediately.
	*/
	if (ret == -EINTR \|\| ret == -EAGAIN) {
	luring_resubmit(s, luringcb);
	continue;
	}
	} else if (!luringcb->qiov) {
	goto end;
	} else if (total_bytes == luringcb->qiov->size) {
	ret = 0;
	/* Only read/write */
	} else {
	/* Short Read/Write */
	if (luringcb->is_read) {
	if (ret > 0) {
	luring_resubmit_short_read(s, luringcb, ret);
	continue;
	} else {
	/* Pad with zeroes */
	qemu_iovec_memset(luringcb->qiov, total_bytes, 0,
	luringcb->qiov->size - total_bytes);
	ret = 0;
	}
	} else {
	ret = -ENOSPC;
	}
	}
	end:
	luringcb->ret = ret;
	qemu_iovec_destroy(&luringcb->resubmit_qiov);

	/*
	* If the coroutine is already entered it must be in ioq_submit()
	* and will notice luringcb->ret has been filled in when it
	* eventually runs later. Coroutines cannot be entered recursively
	* so avoid doing that!
	*/
	assert(luringcb->co->ctx == s->aio_context);
	if (!qemu_coroutine_entered(luringcb->co)) {
	aio_co_wake(luringcb->co);
	}
	}

	qemu_bh_cancel(s->completion_bh);

	defer_call_end();
	}

	static int ioq_submit(LuringState *s)
	{
	int ret = 0;
	LuringAIOCB luringcb, luringcb_next;

	while (s->io_q.in_queue > 0) {
	/*
	* Try to fetch sqes from the ring for requests waiting in
	* the overflow queue
	*/
	QSIMPLEQ_FOREACH_SAFE(luringcb, &s->io_q.submit_queue, next,
	luringcb_next) {
	struct io_uring_sqe *sqes = io_uring_get_sqe(&s->ring);
	if (!sqes) {
	break;
	}
	/* Prep sqe for submission */
	*sqes = luringcb->sqeq;
	QSIMPLEQ_REMOVE_HEAD(&s->io_q.submit_queue, next);
	}
	ret = io_uring_submit(&s->ring);
	trace_luring_io_uring_submit(s, ret);
	/* Prevent infinite loop if submission is refused */
	if (ret <= 0) {
	if (ret == -EAGAIN \|\| ret == -EINTR) {
	continue;
	}
	break;
	}
	s->io_q.in_flight += ret;
	s->io_q.in_queue -= ret;
	}
	s->io_q.blocked = (s->io_q.in_queue > 0);

	if (s->io_q.in_flight) {
	/*
	* We can try to complete something just right away if there are
	* still requests in-flight.
	*/
	luring_process_completions(s);
	}
	return ret;
	}

	static void luring_process_completions_and_submit(LuringState *s)
	{
	luring_process_completions(s);

	if (s->io_q.in_queue > 0) {
	ioq_submit(s);
	}
	}

	static void qemu_luring_completion_bh(void *opaque)
	{
	LuringState *s = opaque;
	luring_process_completions_and_submit(s);
	}

	static void qemu_luring_completion_cb(void *opaque)
	{
	LuringState *s = opaque;
	luring_process_completions_and_submit(s);
	}

	static bool qemu_luring_poll_cb(void *opaque)
	{
	LuringState *s = opaque;

	return io_uring_cq_ready(&s->ring);
	}

	static void qemu_luring_poll_ready(void *opaque)
	{
	LuringState *s = opaque;

	luring_process_completions_and_submit(s);
	}

	static void ioq_init(LuringQueue *io_q)
	{
	QSIMPLEQ_INIT(&io_q->submit_queue);
	io_q->in_queue = 0;
	io_q->in_flight = 0;
	io_q->blocked = false;
	}

	static void luring_deferred_fn(void *opaque)
	{
	LuringState *s = opaque;
	trace_luring_unplug_fn(s, s->io_q.blocked, s->io_q.in_queue,
	s->io_q.in_flight);
	if (!s->io_q.blocked && s->io_q.in_queue > 0) {
	ioq_submit(s);
	}
	}

	/**
	* luring_do_submit:
	* @fd: file descriptor for I/O
	* @luringcb: AIO control block
	* @s: AIO state
	* @offset: offset for request
	* @type: type of request
	*
	* Fetches sqes from ring, adds to pending queue and preps them
	*
	*/
	static int luring_do_submit(int fd, LuringAIOCB luringcb, LuringState s,
	uint64_t offset, int type)
	{
	int ret;
	struct io_uring_sqe *sqes = &luringcb->sqeq;

	switch (type) {
	case QEMU_AIO_WRITE:
	io_uring_prep_writev(sqes, fd, luringcb->qiov->iov,
	luringcb->qiov->niov, offset);
	break;
	case QEMU_AIO_ZONE_APPEND:
	io_uring_prep_writev(sqes, fd, luringcb->qiov->iov,
	luringcb->qiov->niov, offset);
	break;
	case QEMU_AIO_READ:
	io_uring_prep_readv(sqes, fd, luringcb->qiov->iov,
	luringcb->qiov->niov, offset);
	break;
	case QEMU_AIO_FLUSH:
	io_uring_prep_fsync(sqes, fd, IORING_FSYNC_DATASYNC);
	break;
	default:
	fprintf(stderr, "%s: invalid AIO request type, aborting 0x%x.\n",
	__func__, type);
	abort();
	}
	io_uring_sqe_set_data(sqes, luringcb);

	QSIMPLEQ_INSERT_TAIL(&s->io_q.submit_queue, luringcb, next);
	s->io_q.in_queue++;
	trace_luring_do_submit(s, s->io_q.blocked, s->io_q.in_queue,
	s->io_q.in_flight);
	if (!s->io_q.blocked) {
	if (s->io_q.in_flight + s->io_q.in_queue >= MAX_ENTRIES) {
	ret = ioq_submit(s);
	trace_luring_do_submit_done(s, ret);
	return ret;
	}

	defer_call(luring_deferred_fn, s);
	}
	return 0;
	}

	int coroutine_fn luring_co_submit(BlockDriverState *bs, int fd, uint64_t offset,
	QEMUIOVector *qiov, int type)
	{
	int ret;
	AioContext *ctx = qemu_get_current_aio_context();
	LuringState *s = aio_get_linux_io_uring(ctx);
	LuringAIOCB luringcb = {
	.co = qemu_coroutine_self(),
	.ret = -EINPROGRESS,
	.qiov = qiov,
	.is_read = (type == QEMU_AIO_READ),
	};
	trace_luring_co_submit(bs, s, &luringcb, fd, offset, qiov ? qiov->size : 0,
	type);
	ret = luring_do_submit(fd, &luringcb, s, offset, type);

	if (ret < 0) {
	return ret;
	}

	if (luringcb.ret == -EINPROGRESS) {
	qemu_coroutine_yield();
	}
	return luringcb.ret;
	}

	void luring_detach_aio_context(LuringState s, AioContext old_context)
	{
	aio_set_fd_handler(old_context, s->ring.ring_fd,
	NULL, NULL, NULL, NULL, s);
	qemu_bh_delete(s->completion_bh);
	s->aio_context = NULL;
	}

	void luring_attach_aio_context(LuringState s, AioContext new_context)
	{
	s->aio_context = new_context;
	s->completion_bh = aio_bh_new(new_context, qemu_luring_completion_bh, s);
	aio_set_fd_handler(s->aio_context, s->ring.ring_fd,
	qemu_luring_completion_cb, NULL,
	qemu_luring_poll_cb, qemu_luring_poll_ready, s);
	}

	LuringState luring_init(Error *errp)
	{
	int rc;
	LuringState *s = g_new0(LuringState, 1);
	struct io_uring *ring = &s->ring;

	trace_luring_init_state(s, sizeof(*s));

	rc = io_uring_queue_init(MAX_ENTRIES, ring, 0);
	if (rc < 0) {
	error_setg_errno(errp, errno, "failed to init linux io_uring ring");
	g_free(s);
	return NULL;
	}

	ioq_init(&s->io_q);
	return s;

	}

	void luring_cleanup(LuringState *s)
	{
	io_uring_queue_exit(&s->ring);
	trace_luring_cleanup_state(s);
	g_free(s);
	}