| /* |
| * Copyright (c) 2019 Nutanix Inc. All rights reserved. |
| * |
| * Authors: Thanos Makatos <thanos@nutanix.com> |
| * Swapnil Ingle <swapnil.ingle@nutanix.com> |
| * Felipe Franciosi <felipe@nutanix.com> |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are met: |
| * * Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * * Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in the |
| * documentation and/or other materials provided with the distribution. |
| * * Neither the name of Nutanix nor the names of its contributors may be |
| * used to endorse or promote products derived from this software without |
| * specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| * ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY |
| * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
| * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR |
| * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
| * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
| * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
| * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH |
| * DAMAGE. |
| * |
| */ |
| |
| /* |
| * Defines the libvfio-user server-side API. The protocol definitions can be |
| * found in vfio-user.h. |
| * |
| * This is not currently a stable API or ABI, and may change at any time. |
| * Library calls are not guaranteed thread-safe: multi-threaded consumers need |
| * to protect calls with their own exclusion methods. |
| */ |
| |
| #ifndef LIB_VFIO_USER_H |
| #define LIB_VFIO_USER_H |
| |
| #include <stdint.h> |
| #include <sys/uio.h> |
| #include <unistd.h> |
| #include <syslog.h> |
| #include <sys/queue.h> |
| |
| #include "pci_caps/dsn.h" |
| #include "pci_caps/msi.h" |
| #include "pci_caps/msix.h" |
| #include "pci_caps/pm.h" |
| #include "pci_caps/px.h" |
| #include "pci_defs.h" |
| #include "vfio-user.h" |
| |
| #ifdef __cplusplus |
| extern "C" { |
| #endif |
| |
| #define LIB_VFIO_USER_MAJOR 0 |
| #define LIB_VFIO_USER_MINOR 1 |
| |
| /* DMA addresses cannot be directly de-referenced. */ |
| typedef void *vfu_dma_addr_t; |
| |
| struct dma_sg; |
| typedef struct dma_sg dma_sg_t; |
| |
| typedef struct vfu_ctx vfu_ctx_t; |
| |
| /* |
| * Returns the size, in bytes, of dma_sg_t. |
| */ |
| size_t |
| dma_sg_size(void); |
| |
| /* |
| * Attaching to the transport is non-blocking. |
| * The caller must then manually call vfu_attach_ctx(), |
| * which is non-blocking, as many times as necessary. |
| * |
| * This also applies to vfu_run_ctx(). However, it's presumed that any actual |
| * reads or writes of the socket connection will not need to block, since both |
| * APIS are synchronous. |
| */ |
| #define LIBVFIO_USER_FLAG_ATTACH_NB (1 << 0) |
| |
| typedef enum { |
| VFU_TRANS_SOCK, |
    /* For internal testing only */
| VFU_TRANS_PIPE, |
| VFU_TRANS_MAX |
| } vfu_trans_t; |
| |
| typedef enum { |
| VFU_DEV_TYPE_PCI |
| } vfu_dev_type_t; |
| |
| /** |
 * Creates a libvfio-user context. By default, one ERR and one REQ IRQ are
 * initialized; this can be overridden with vfu_setup_device_nr_irqs().
| * |
| * @trans: transport type |
| * @path: path to socket file. |
| * @flags: context flags (LIBVFIO_USER_FLAG_*) |
| * @pvt: private data |
| * @dev_type: device type |
| * |
| * @returns the vfu_ctx to be used or NULL on error. Sets errno. |
| */ |
| vfu_ctx_t * |
| vfu_create_ctx(vfu_trans_t trans, const char *path, |
| int flags, void *pvt, vfu_dev_type_t dev_type); |
| |
| /* |
 * Finalizes the device, making it ready for vfu_attach_ctx(). This function
 * must be called before vfu_attach_ctx().
| * @vfu_ctx: the libvfio-user context |
| * |
| * @returns: 0 on success, -1 on error. Sets errno. |
| */ |
| int |
| vfu_realize_ctx(vfu_ctx_t *vfu_ctx); |
| |
| /* |
 * Attempts to attach to the transport. Attaching is mandatory before
 * vfu_run_ctx() and is non-blocking if the context was created with the
 * LIBVFIO_USER_FLAG_ATTACH_NB flag.
 *
 * @vfu_ctx: the libvfio-user context
 *
 * @returns: 0 on success, -1 on error. Sets errno. If errno is set to EAGAIN
 * or EWOULDBLOCK then the transport is not ready to attach to and the
 * operation must be retried.
| */ |
| int |
| vfu_attach_ctx(vfu_ctx_t *vfu_ctx); |
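
/*
 * For example, with LIBVFIO_USER_FLAG_ATTACH_NB the attach can be retried
 * until a client connects (a minimal sketch; a real server would wait on the
 * listening socket rather than sleep):
 *
 *     while (vfu_attach_ctx(vfu_ctx) < 0) {
 *         if (errno != EAGAIN && errno != EWOULDBLOCK) {
 *             err(EXIT_FAILURE, "failed to attach");
 *         }
 *         usleep(1000);
 *     }
 */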
| |
| /** |
 * Return a file descriptor suitable for waiting on via epoll() or similar. The
 * file descriptor may change after a successful vfu_attach_ctx(), or after
 * vfu_run_ctx() fails with ENOTCONN; in those cases, vfu_get_poll_fd() should
 * be called again to get the current file descriptor.
| */ |
| int |
| vfu_get_poll_fd(vfu_ctx_t *vfu_ctx); |
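
/*
 * For example, a non-blocking server can drive the context from an epoll loop
 * (a sketch; assumes the context is already attached, and omits re-attach
 * handling):
 *
 *     int epfd = epoll_create1(0);
 *     struct epoll_event ev = { .events = EPOLLIN };
 *
 *     epoll_ctl(epfd, EPOLL_CTL_ADD, vfu_get_poll_fd(vfu_ctx), &ev);
 *     while (epoll_wait(epfd, &ev, 1, -1) == 1) {
 *         if (vfu_run_ctx(vfu_ctx) < 0 && errno == ENOTCONN) {
 *             break; // re-attach, then fetch and re-add the new poll fd
 *         }
 *     }
 */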
| |
| /** |
 * Polls the vfu_ctx and processes the command received from the client.
 * - Blocking vfu_ctx:
 *   Blocks until a new request is received from the client and continues
 *   processing requests. Returns only on error or if the client disconnects.
 * - Non-blocking vfu_ctx (LIBVFIO_USER_FLAG_ATTACH_NB):
 *   Processes one request from the client if one is available; otherwise it
 *   returns immediately and the caller is responsible for calling it again
 *   periodically.
| * |
| * @vfu_ctx: The libvfio-user context to poll |
| * |
| * @returns the number of requests processed (0 or more); or -1 on error, |
| * with errno set as follows: |
| * |
| * ENOTCONN: client closed connection, vfu_attach_ctx() should be called again |
| * EBUSY: the device was asked to quiesce and is still quiescing |
| * Other errno values are also possible. |
| */ |
| int |
| vfu_run_ctx(vfu_ctx_t *vfu_ctx); |
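
/*
 * For example, a blocking server's main loop might look like this (a minimal
 * sketch; device setup and error reporting are omitted, and "/tmp/vfu.sock"
 * is illustrative):
 *
 *     vfu_ctx_t *vfu_ctx = vfu_create_ctx(VFU_TRANS_SOCK, "/tmp/vfu.sock", 0,
 *                                         NULL, VFU_DEV_TYPE_PCI);
 *     // ... vfu_pci_init(), vfu_setup_region(), etc. ...
 *     vfu_realize_ctx(vfu_ctx);
 *     vfu_attach_ctx(vfu_ctx);
 *     for (;;) {
 *         if (vfu_run_ctx(vfu_ctx) < 0) {
 *             if (errno == EINTR) {
 *                 continue;
 *             }
 *             if (errno == ENOTCONN && vfu_attach_ctx(vfu_ctx) == 0) {
 *                 continue; // client reconnected
 *             }
 *             break;
 *         }
 *     }
 *     vfu_destroy_ctx(vfu_ctx);
 */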
| |
| /** |
 * Destroys the libvfio-user context. The device must already be quiesced when
 * this is called; the quiesce callback is not invoked, but any other device
 * callback may be.
| * |
| * @vfu_ctx: the libvfio-user context to destroy |
| */ |
| void |
| vfu_destroy_ctx(vfu_ctx_t *vfu_ctx); |
| |
| /** |
| * Return the private pointer given to vfu_create_ctx(). |
| */ |
| void * |
| vfu_get_private(vfu_ctx_t *vfu_ctx); |
| |
| /** |
| * Callback function signature for log function |
| * @vfu_ctx: the libvfio-user context |
| * @level: log level as defined in syslog(3) |
| * @vfu_log_fn_t: typedef for log function. |
| * @msg: message |
| */ |
| typedef void (vfu_log_fn_t)(vfu_ctx_t *vfu_ctx, int level, const char *msg); |
| |
| /** |
| * Log to the logging function configured for this context. The format should |
| * not include a new line. |
| */ |
| void |
vfu_log(vfu_ctx_t *vfu_ctx, int level, const char *fmt, ...)
| __attribute__((format(printf, 3, 4))); |
| |
| /** |
 * Set up logging information.
 *
 * @vfu_ctx: the libvfio-user context
 * @log: logging function
 * @level: logging level as defined in syslog(3)
 *
 * The log handler is expected to add a newline (that is, log messages do not
 * include a newline).
 *
 * @returns 0 on success, -1 on error. Sets errno.
| int |
| vfu_setup_log(vfu_ctx_t *vfu_ctx, vfu_log_fn_t *log, int level); |
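
/*
 * For example, logging everything to stderr (a sketch):
 *
 *     static void log_cb(vfu_ctx_t *vfu_ctx, int level, const char *msg) {
 *         fprintf(stderr, "vfu[%d]: %s\n", level, msg);
 *     }
 *
 *     vfu_setup_log(vfu_ctx, log_cb, LOG_DEBUG);
 */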
| |
| /** |
| * Prototype for region access callback. When a region is accessed, libvfio-user |
| * calls the previously registered callback with the following arguments: |
| * |
| * @vfu_ctx: the libvfio-user context |
| * @buf: buffer containing the data to be written or data to be read into |
| * @count: number of bytes being read or written |
| * @offset: byte offset within the region |
| * @is_write: whether or not this is a write |
| * |
| * @returns the number of bytes read or written, or -1 on error, setting errno. |
| */ |
| typedef ssize_t (vfu_region_access_cb_t)(vfu_ctx_t *vfu_ctx, char *buf, |
| size_t count, loff_t offset, |
| bool is_write); |
| |
| #define VFU_REGION_FLAG_READ (1 << 0) |
| #define VFU_REGION_FLAG_WRITE (1 << 1) |
| #define VFU_REGION_FLAG_RW (VFU_REGION_FLAG_READ | VFU_REGION_FLAG_WRITE) |
| /* If unset, this is an IO region. */ |
| #define VFU_REGION_FLAG_MEM (1 << 2) |
| #define VFU_REGION_FLAG_ALWAYS_CB (1 << 3) |
| #define VFU_REGION_FLAG_MASK (VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM | \ |
| VFU_REGION_FLAG_ALWAYS_CB) |
| |
| /** |
| * Set up a device region. |
| * |
| * A region is an area of device memory that can be accessed by the client, |
| * either via VFIO_USER_REGION_READ/WRITE, or directly by mapping the region |
| * into the client's address space if an fd is given. |
| * |
| * A mappable region can be split into mappable sub-areas according to the |
| * @mmap_areas array. Note that the client can memory map any part of the |
| * file descriptor, even if not supposed to do so according to @mmap_areas. |
| * There is no way in Linux to avoid this. |
| * |
 * TODO maybe we should introduce per-sparse-area file descriptors so that
 * the client cannot possibly memory map areas it's not supposed to. Even if
 * the client needs to have the region under the same backing file, it is
 * possible to create linear device-mapper targets, one for each area, and
 * provide file descriptors of these DM targets. This is something we can
 * document and demonstrate in a sample.
| * |
 * Areas accessed via such a mapping do not, by definition, invoke any given
 * callback. However, the callback can still be invoked, even on a mappable
 * area, if the client chooses to access it via VFIO_USER_REGION_READ/WRITE.
| * |
| * The following regions are special and are explained below: |
| * - VFU_PCI_DEV_CFG_REGION_IDX, |
| * - VFU_PCI_DEV_MIGR_REGION_IDX, and |
 * - VFU_GENERIC_DEV_MIGR_REGION_IDX.
| * |
| * Region VFU_PCI_DEV_CFG_REGION_IDX, corresponding to PCI config space, has |
| * special handling: |
| * |
| * - the @size argument is ignored: the region size is always the size defined |
| * by the relevant PCI specification |
| * - all accesses to the standard PCI header (i.e. the first 64 bytes of the |
| * region) are handled by the library |
| * - all accesses to known PCI capabilities (see vfu_pci_add_capability()) |
| * are handled by the library |
| * - if no callback is provided, reads to other areas are a simple memcpy(), |
| * and writes are an error |
| * - otherwise, the callback is expected to handle the access |
 * - if the VFU_REGION_FLAG_ALWAYS_CB flag is set, all accesses to the config
 *   space are forwarded to the callback
| * |
 * Regions VFU_PCI_DEV_MIGR_REGION_IDX and VFU_GENERIC_DEV_MIGR_REGION_IDX,
 * corresponding to the migration region, enable live migration support for
 * the device. The migration region must begin with the migration registers
 * (struct vfio_user_migration_info) and the remaining part of the region can
 * be used arbitrarily by the device implementation. The region provided must
 * have at least vfu_get_migr_register_area_size() bytes available at its
 * start (this size is guaranteed to be page-aligned). If @mmap_areas is
 * given, it must _not_ include this part of the region.
| * |
| * libvfio-user offers two ways for the migration region to be used: |
| * 1. natively: the device implementation must handle accesses to the |
| * migration registers and migration data via the region callbacks. The |
| * semantics of these registers are explained in <linux/vfio.h>. |
 * 2. via the vfu_migration_callbacks_t callbacks: the device implementation
 *    registers a set of callbacks by calling
 *    vfu_setup_device_migration_callbacks(). The region's read/write
 *    callbacks are never called.
| * |
| * @vfu_ctx: the libvfio-user context |
| * @region_idx: region index |
| * @size: size of the region |
| * @region_access: callback function to access region |
| * @flags: region flags (VFU_REGION_FLAG_*) |
| * @mmap_areas: array of memory mappable areas; if an fd is provided, but this |
| * is NULL, then the entire region is mappable. |
 * @nr_mmap_areas: number of sparse areas in @mmap_areas; must be provided if
 *    @mmap_areas is non-NULL, and 0 otherwise.
| * @fd: file descriptor of the file backing the region if the region is |
| * mappable; it is the server's responsibility to create a file suitable for |
| * memory mapping by the client. |
| * @offset: offset of the region within the fd, or zero. |
| * |
 * @returns 0 on success, -1 on error. Sets errno.
| */ |
| int |
| vfu_setup_region(vfu_ctx_t *vfu_ctx, int region_idx, size_t size, |
| vfu_region_access_cb_t *region_access, int flags, |
| struct iovec *mmap_areas, uint32_t nr_mmap_areas, |
| int fd, uint64_t offset); |
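
/*
 * For example, setting up BAR0 as a 4KB read/write region handled entirely by
 * a callback (a sketch; bar0_cb, the region size, and the device state access
 * are illustrative):
 *
 *     static ssize_t bar0_cb(vfu_ctx_t *vfu_ctx, char *buf, size_t count,
 *                            loff_t offset, bool is_write) {
 *         if (is_write) {
 *             // apply the write at @offset to device state
 *         } else {
 *             memset(buf, 0, count); // serve the read from device state
 *         }
 *         return count;
 *     }
 *
 *     vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, 0x1000,
 *                      bar0_cb, VFU_REGION_FLAG_RW, NULL, 0, -1, 0);
 */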
| |
| typedef enum vfu_reset_type { |
| /* |
| * Client requested a device reset (for example, as part of a guest VM |
| * reboot). The vfio-user context remains valid, but it's expected that all |
| * ongoing operations are completed or cancelled, and any device state is |
| * reset to a known-good initial state (including any PCI register state). |
| */ |
| VFU_RESET_DEVICE, |
| |
| /* |
| * The vfio-user socket client connection was closed or reset. The attached |
| * context is cleaned up after returning from the reset callback, and |
| * vfu_attach_ctx() must be called to establish a new client. |
| */ |
| VFU_RESET_LOST_CONN, |
| |
| /* |
| * Client requested to initiate PCI function level reset. |
| */ |
| VFU_RESET_PCI_FLR |
| } vfu_reset_type_t; |
| |
| /* |
| * Device callback for quiescing the device. |
| * |
 * vfu_run_ctx() uses this callback to request that the device quiesce its
 * operation. A quiesced device must not call vfu_addr_to_sgl() or vfu_sgl_*(),
 * unless it does so from a device callback.
| * |
 * The callback can return two values:
 * 1) 0: this indicates that the device was quiesced. vfu_run_ctx() then
 *    continues to execute, and when vfu_run_ctx() returns to the caller the
 *    device is unquiesced.
 * 2) -1 with errno set to EBUSY: this indicates that the device cannot
 *    immediately quiesce. In this case, vfu_run_ctx() returns -1 with errno
 *    set to EBUSY and future calls to vfu_run_ctx() return the same. Until
 *    the device quiesces it can continue to operate as normal. The device
 *    indicates that it has quiesced by calling vfu_device_quiesced(). When
 *    vfu_device_quiesced() returns, the device is no longer quiesced.
 *
 * A quiesced device should expect any of the following callbacks to be
 * executed: vfu_dma_register_cb_t, vfu_dma_unregister_cb_t, vfu_reset_cb_t,
 * and the migration transition callback. These callbacks are only called
 * after the device has been quiesced.
| * |
| * The following example demonstrates how a device can use the SG routines and |
| * friends while quiesced: |
| * |
| * A DMA region is mapped, libvfio-user calls the quiesce callback but the |
| * device cannot immediately quiesce: |
| * |
| * int quiesce_cb(vfu_ctx_t *vfu_ctx) { |
| * errno = EBUSY; |
| * return -1; |
| * } |
| * |
 * While quiescing, the device can continue to operate as normal, including
 * calling functions such as vfu_sgl_get(). Then, the device finishes
 * quiescing:
 *
 *     vfu_device_quiesced(vfu_ctx, 0);
 *
 * At this point, the device must have stopped using functions like
 * vfu_sgl_get(), for example by pausing any I/O threads. libvfio-user
 * eventually calls the dma_register device callback before
 * vfu_device_quiesced() returns. In this callback the device is allowed to
 * call functions such as vfu_sgl_get():
 *
 *     void dma_register_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) {
 *         vfu_sgl_get(vfu_ctx, ...);
 *     }
| * |
 * Once vfu_device_quiesced() returns, the device is unquiesced.
| * |
| * |
| * @vfu_ctx: the libvfio-user context |
| * |
| * @returns: 0 on success, -1 on failure with errno set. |
| */ |
| typedef int (vfu_device_quiesce_cb_t)(vfu_ctx_t *vfu_ctx); |
| |
| /** |
| * Sets up the device quiesce callback. |
| * |
| * @vfu_ctx: the libvfio-user context |
| * @quiesce_cb: device quiesce callback |
| */ |
| void |
| vfu_setup_device_quiesce_cb(vfu_ctx_t *vfu_ctx, |
| vfu_device_quiesce_cb_t *quiesce_cb); |
| |
| /* |
| * Called by the device to complete a pending quiesce operation. After the |
| * function returns the device is unquiesced. |
| * |
| * @vfu_ctx: the libvfio-user context |
| * @quiesce_errno: 0 for success or errno in case the device fails to quiesce, |
| * in which case the operation requiring the quiesce is failed |
| * and the device is reset. |
| * |
| * @returns 0 on success, or -1 on failure. Sets errno. |
| */ |
| int |
| vfu_device_quiesced(vfu_ctx_t *vfu_ctx, int quiesce_errno); |
| |
| /* |
| * Callback function that is called when the device must be reset. |
| */ |
| typedef int (vfu_reset_cb_t)(vfu_ctx_t *vfu_ctx, vfu_reset_type_t type); |
| |
| /** |
| * Set up device reset callback. |
| * |
| * A reset should ensure that all on-going use of device IRQs or guest memory is |
| * completed or cancelled before returning from the callback. |
| * |
| * @vfu_ctx: the libvfio-user context |
| * @reset: device reset callback |
| */ |
| int |
| vfu_setup_device_reset_cb(vfu_ctx_t *vfu_ctx, vfu_reset_cb_t *reset); |
| |
| /* |
| * Info for a guest DMA region. @iova is always valid; the other parameters |
| * will only be set if the guest DMA region is mappable. |
| * |
| * @iova: guest DMA range. This is the guest physical range (as we don't |
| * support vIOMMU) that the guest registers for DMA, via a VFIO_USER_DMA_MAP |
| * message, and is the address space used as input to vfu_addr_to_sgl(). |
| * @vaddr: if the range is mapped into this process, this is the virtual address |
| * of the start of the region. |
 * @mapping: if @vaddr is non-NULL, this range represents the actual range
 *   mmap()ed into the process. It might be (large-)page aligned, and
 *   therefore larger than [@vaddr, @vaddr + @iova.iov_len).
| * @page_size: if @vaddr is non-NULL, page size of the mapping (e.g. 2MB) |
| * @prot: if @vaddr is non-NULL, protection settings of the mapping as per |
| * mmap(2) |
| * |
| * For a real example, using the gpio sample server, and a qemu configured to |
| * use huge pages and share its memory: |
| * |
| * gpio: mapped DMA region iova=[0xf0000-0x10000000) vaddr=0x2aaaab0f0000 |
| * page_size=0x200000 mapping=[0x2aaaab000000-0x2aaabb000000) |
| * |
| * 0xf0000 0x10000000 |
| * | | |
| * v v |
| * +-----------------------------------+ |
| * | Guest IOVA (DMA) space | |
| * +--+-----------------------------------+--+ |
| * | | | | |
| * | +-----------------------------------+ | |
| * | ^ libvfio-user server address space | |
| * +--|--------------------------------------+ |
| * ^ vaddr=0x2aaaab0f0000 ^ |
| * | | |
| * 0x2aaaab000000 0x2aaabb000000 |
| * |
| * This region can be directly accessed at 0x2aaaab0f0000, but the underlying |
| * large page mapping is in the range [0x2aaaab000000-0x2aaabb000000). |
| */ |
| typedef struct vfu_dma_info { |
| struct iovec iova; |
| void *vaddr; |
| struct iovec mapping; |
| size_t page_size; |
| uint32_t prot; |
| } vfu_dma_info_t; |
| |
| /* |
| * Called when a guest registers one of its DMA regions via a VFIO_USER_DMA_MAP |
| * message. |
| * |
| * @vfu_ctx: the libvfio-user context |
| * @info: the DMA info |
| */ |
| typedef void (vfu_dma_register_cb_t)(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info); |
| |
| /* |
| * Function that is called when the guest unregisters a DMA region. This |
| * callback is required if you want to be able to access guest memory directly |
| * via a mapping. The device must release all references to that region before |
| * the callback returns. |
| * |
| * @vfu_ctx: the libvfio-user context |
| * @info: the DMA info |
| */ |
| typedef void (vfu_dma_unregister_cb_t)(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info); |
| |
| /** |
| * Set up device DMA registration callbacks. When libvfio-user is notified of a |
| * DMA range addition or removal, these callbacks will be invoked. |
| * |
| * If this function is not called, guest DMA regions are not accessible via |
| * vfu_addr_to_sgl(). |
| * |
| * To directly access this DMA memory via a local mapping with vfu_sgl_get(), at |
| * least @dma_unregister must be provided. |
| * |
| * @vfu_ctx: the libvfio-user context |
| * @dma_register: DMA region registration callback (optional) |
| * @dma_unregister: DMA region unregistration callback (optional) |
| */ |
| |
| int |
| vfu_setup_device_dma(vfu_ctx_t *vfu_ctx, vfu_dma_register_cb_t *dma_register, |
| vfu_dma_unregister_cb_t *dma_unregister); |
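
/*
 * For example (a sketch; the register callback just logs the new range, and
 * the unregister callback is where a real device would drop its SGLs):
 *
 *     static void dma_register(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) {
 *         vfu_log(vfu_ctx, LOG_DEBUG, "DMA mapped: iova=%p len=%zu",
 *                 info->iova.iov_base, info->iova.iov_len);
 *     }
 *
 *     static void dma_unregister(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) {
 *         // release all SGLs referring to this region before returning
 *     }
 *
 *     vfu_setup_device_dma(vfu_ctx, dma_register, dma_unregister);
 */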
| |
| enum vfu_dev_irq_type { |
| VFU_DEV_INTX_IRQ, |
| VFU_DEV_MSI_IRQ, |
| VFU_DEV_MSIX_IRQ, |
| VFU_DEV_ERR_IRQ, |
| VFU_DEV_REQ_IRQ, |
| VFU_DEV_NUM_IRQS |
| }; |
| |
| /** |
| * Set up device IRQ counts. |
| * @vfu_ctx: the libvfio-user context |
| * @type: IRQ type (VFU_DEV_INTX_IRQ ... VFU_DEV_REQ_IRQ) |
 * @count: number of IRQs
| */ |
| int |
| vfu_setup_device_nr_irqs(vfu_ctx_t *vfu_ctx, enum vfu_dev_irq_type type, |
| uint32_t count); |
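
/*
 * For example, to expose 32 MSI-X vectors (a sketch):
 *
 *     vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSIX_IRQ, 32);
 */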
| |
| /* |
| * Function that is called when the guest masks or unmasks an IRQ vector. |
| * |
| * @vfu_ctx: the libvfio-user context |
| * @start: starting IRQ vector |
| * @count: number of vectors |
 * @mask: whether the vectors are being masked (true) or unmasked (false)
| */ |
| typedef void (vfu_dev_irq_state_cb_t)(vfu_ctx_t *vfu_ctx, uint32_t start, |
| uint32_t count, bool mask); |
| |
| /** |
| * Set up IRQ state change callback. When libvfio-user is notified of a |
| * change to IRQ state, whether masked or unmasked, it invokes |
| * this callback. |
| * |
| * @vfu_ctx: the libvfio-user context |
 * @type: IRQ type (see enum vfu_dev_irq_type), e.g. VFU_DEV_MSIX_IRQ
| * @cb: IRQ state change callback |
| * |
| * @returns 0 on success, -1 on error, sets errno. |
| */ |
| int |
| vfu_setup_irq_state_callback(vfu_ctx_t *vfu_ctx, enum vfu_dev_irq_type type, |
| vfu_dev_irq_state_cb_t *cb); |
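
/*
 * For example, tracking MSI-X mask state (a sketch; the callback merely logs
 * the change):
 *
 *     static void irq_state_cb(vfu_ctx_t *vfu_ctx, uint32_t start,
 *                              uint32_t count, bool mask) {
 *         vfu_log(vfu_ctx, LOG_DEBUG, "vectors [%u, %u) %smasked",
 *                 start, start + count, mask ? "" : "un");
 *     }
 *
 *     vfu_setup_irq_state_callback(vfu_ctx, VFU_DEV_MSIX_IRQ, irq_state_cb);
 */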
| |
| typedef enum { |
| VFU_MIGR_STATE_STOP, |
| VFU_MIGR_STATE_RUNNING, |
| VFU_MIGR_STATE_STOP_AND_COPY, |
| VFU_MIGR_STATE_PRE_COPY, |
| VFU_MIGR_STATE_RESUME |
| } vfu_migr_state_t; |
| |
| #define VFU_MIGR_CALLBACKS_VERS 1 |
| |
| /* |
| * Callbacks during the pre-copy and stop-and-copy phases. |
| * |
 * The client executes the following steps to copy migration data:
 *
 * 1. get_pending_bytes: the device must return the amount of pending
 *    migration data
 * 2. prepare_data: the device must prepare migration data
 * 3. read_data: the device must provide migration data
 *
 * The client repeats the above steps until there is no more migration data to
 * return (the device must return 0 from get_pending_bytes to indicate that
 * there are no more migration data to be consumed in this iteration).
| */ |
| typedef struct { |
| |
| /* |
| * Set it to VFU_MIGR_CALLBACKS_VERS. |
| */ |
| int version; |
| |
| /* |
| * Migration state transition callback. |
| * |
| * The callback should return -1 on error, setting errno. |
| * |
| * |
| * TODO rename to vfu_migration_state_transition_callback |
| * FIXME maybe we should create a single callback and pass the state? |
| */ |
| int (*transition)(vfu_ctx_t *vfu_ctx, vfu_migr_state_t state); |
| |
| /* Callbacks for saving device state */ |
| |
| /* |
| * Function that is called to retrieve the amount of pending migration |
| * data. If migration data were previously made available (function |
| * prepare_data has been called) then calling this function signifies that |
| * they have been read (e.g. migration data can be discarded). If the |
| * function returns 0 then migration has finished and this function won't |
| * be called again. |
| * |
| * The amount of pending migration data returned by the device does not |
| * necessarily have to monotonically decrease over time and does not need |
| * to match the amount of migration data returned via the @size argument in |
| * prepare_data. It can completely fluctuate according to the needs of the |
| * device. These semantics are derived from the pending_bytes register in |
| * VFIO. Therefore the value returned by get_pending_bytes must be |
| * primarily regarded as boolean, either 0 or non-zero, as far as migration |
| * completion is concerned. More advanced vfio-user clients can make |
| * assumptions on how migration is progressing on devices that guarantee |
| * that the amount of pending migration data decreases over time. |
| */ |
| uint64_t (*get_pending_bytes)(vfu_ctx_t *vfu_ctx); |
| |
| /* |
| * Function that is called to instruct the device to prepare migration data |
| * to be read when in pre-copy or stop-and-copy state, and to prepare for |
| * receiving migration data when in resuming state. |
| * |
 * When in pre-copy and stop-and-copy state, the function must return only
 * after migration data are available at the specified offset. This
 * callback is called once per iteration. The amount of data available
 * pointed to by @size can be different from the amount of data returned
 * by get_pending_bytes at the beginning of the iteration.
| * |
 * In VFIO, the data_offset and data_size registers can be read multiple
 * times during an iteration and are invariant; libvfio-user simplifies
 * this by caching the values and returning them when read, guaranteeing
 * that prepare_data() is called only once per migration iteration.
 *
 * When in resuming state, @offset must be set to where migration data
 * must be written; @size is NULL.
| * |
| * The callback should return -1 on error, setting errno. |
| */ |
| int (*prepare_data)(vfu_ctx_t *vfu_ctx, uint64_t *offset, uint64_t *size); |
| |
| /* |
 * Function that is called to read migration data. @offset and @size can
 * be any subrange of the offset and size previously returned by
 * prepare_data. The function must return the amount of data read, or -1
 * on error, setting errno.
| * |
| * This function can be called even if the migration data can be memory |
| * mapped. |
| */ |
| ssize_t (*read_data)(vfu_ctx_t *vfu_ctx, void *buf, |
| uint64_t count, uint64_t offset); |
| |
| /* Callbacks for restoring device state */ |
| |
| /* |
 * Function that is called for writing previously stored device state.
 * The function must return the amount of data written, or -1 on error,
 * setting errno.
| */ |
| ssize_t (*write_data)(vfu_ctx_t *vfu_ctx, void *buf, uint64_t count, |
| uint64_t offset); |
| |
| /* |
 * Function that is called when the client has written some previously
 * stored device state.
| * |
| * The callback should return -1 on error, setting errno. |
| */ |
| int (*data_written)(vfu_ctx_t *vfu_ctx, uint64_t count); |
| |
| } vfu_migration_callbacks_t; |
| |
| /** |
 * The definition of VFIO_DEVICE_STATE_XXX differs with the version of the
 * vfio header file used: old headers lack these definitions entirely; newer
 * headers that use region-based migration define VFIO_DEVICE_STATE_XXX
 * directly; and the latest ones define VFIO_DEVICE_STATE_V1_XXXX instead.
 * The following addresses all these scenarios.
| */ |
| #if defined(VFIO_DEVICE_STATE_STOP) |
| |
| _Static_assert(VFIO_DEVICE_STATE_STOP == 0, |
| "incompatible VFIO_DEVICE_STATE_STOP definition"); |
| |
| #define VFIO_DEVICE_STATE_V1_STOP VFIO_DEVICE_STATE_STOP |
| #define VFIO_DEVICE_STATE_V1_RUNNING VFIO_DEVICE_STATE_RUNNING |
| #define VFIO_DEVICE_STATE_V1_SAVING VFIO_DEVICE_STATE_SAVING |
| #define VFIO_DEVICE_STATE_V1_RESUMING VFIO_DEVICE_STATE_RESUMING |
| |
| #elif !defined(VFIO_REGION_TYPE_MIGRATION_DEPRECATED) /* VFIO_DEVICE_STATE_STOP */ |
| |
| #define VFIO_DEVICE_STATE_V1_STOP (0) |
| #define VFIO_DEVICE_STATE_V1_RUNNING (1 << 0) |
| #define VFIO_DEVICE_STATE_V1_SAVING (1 << 1) |
| #define VFIO_DEVICE_STATE_V1_RESUMING (1 << 2) |
| #define VFIO_DEVICE_STATE_MASK ((1 << 3) - 1) |
| |
| #endif /* VFIO_REGION_TYPE_MIGRATION_DEPRECATED */ |
| |
| /* |
| * The currently defined migration registers; if using migration callbacks, |
| * these are handled internally by the library. |
| * |
| * This is analogous to struct vfio_device_migration_info. |
| */ |
| struct vfio_user_migration_info { |
| /* VFIO_DEVICE_STATE_* */ |
| uint32_t device_state; |
| uint32_t reserved; |
| uint64_t pending_bytes; |
| uint64_t data_offset; |
| uint64_t data_size; |
| }; |
| |
| /* |
| * Returns the size of the area needed to hold the migration registers at the |
| * beginning of the migration region; guaranteed to be page aligned. |
| */ |
| size_t |
| vfu_get_migr_register_area_size(void); |
| |
| /** |
 * vfu_setup_device_migration_callbacks() provides an abstraction over the
 * migration protocol: the user specifies a set of callbacks which are called
 * in response to client accesses of the migration region; the migration
 * region's read/write callbacks are not called after this function call.
 * Offsets in callbacks are relative to @data_offset.
| * |
| * @vfu_ctx: the libvfio-user context |
| * @callbacks: migration callbacks |
| * @data_offset: offset in the migration region where data begins. |
| * |
| * @returns 0 on success, -1 on error, sets errno. |
| */ |
| int |
| vfu_setup_device_migration_callbacks(vfu_ctx_t *vfu_ctx, |
| const vfu_migration_callbacks_t *callbacks, |
| uint64_t data_offset); |
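
/*
 * For example (a sketch; the migr_* callback implementations are
 * device-specific and omitted, and the migration data is placed immediately
 * after the register area):
 *
 *     static const vfu_migration_callbacks_t migr_cbs = {
 *         .version = VFU_MIGR_CALLBACKS_VERS,
 *         .transition = migr_transition,
 *         .get_pending_bytes = migr_get_pending_bytes,
 *         .prepare_data = migr_prepare_data,
 *         .read_data = migr_read_data,
 *         .write_data = migr_write_data,
 *         .data_written = migr_data_written,
 *     };
 *
 *     vfu_setup_device_migration_callbacks(vfu_ctx, &migr_cbs,
 *         vfu_get_migr_register_area_size());
 */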
| |
| /** |
| * Triggers an interrupt. |
| * |
| * libvfio-user takes care of using the correct IRQ type (IRQ index: INTx or |
| * MSI/X), the caller only needs to specify the sub-index. |
| * |
| * @vfu_ctx: the libvfio-user context to trigger interrupt |
| * @subindex: vector subindex to trigger interrupt on |
| * |
| * @returns 0 on success, or -1 on failure. Sets errno. |
| */ |
| int |
| vfu_irq_trigger(vfu_ctx_t *vfu_ctx, uint32_t subindex); |
| |
| /** |
 * Takes a guest physical address range and populates an array of
 * scatter/gather entries that can be individually mapped into the program's
 * virtual memory. A single linear guest physical address span may need to be
 * split into multiple scatter/gather regions due to limitations of how memory
 * can be mapped.
| * |
| * vfu_setup_device_dma() must have been called prior to using this function. |
| * |
| * @vfu_ctx: the libvfio-user context |
| * @dma_addr: the guest physical address |
| * @len: size of memory to be mapped |
| * @sgl: array that receives the scatter/gather entries to be mapped |
| * @max_nr_sgs: maximum number of elements in above array |
| * @prot: protection as defined in <sys/mman.h> |
| * |
| * @returns the number of scatter/gather entries created on success, and on |
| * failure: |
| * -1: if the GPA address span is invalid (errno=ENOENT) or |
| * protection violation (errno=EACCES) |
| * (-x - 1): if @max_nr_sgs is too small, where x is the number of SG |
| * entries necessary to complete this request (errno=0). |
| */ |
| int |
| vfu_addr_to_sgl(vfu_ctx_t *vfu_ctx, vfu_dma_addr_t dma_addr, size_t len, |
| dma_sg_t *sgl, size_t max_nr_sgs, int prot); |
| |
| /** |
 * Populate the given iovec array (accessible in the process's virtual
 * memory), based upon the SGL previously built via vfu_addr_to_sgl(). It is
 * the caller's responsibility to release the iovecs via vfu_sgl_put().
| * |
| * This is only supported when a @dma_unregister callback is provided to |
| * vfu_setup_device_dma(). |
| * |
| * @vfu_ctx: the libvfio-user context |
 * @sgl: array of scatter/gather entries returned by vfu_addr_to_sgl(). These
 *    entries must not be modified and the array must not be deallocated
 *    until vfu_sgl_put() has been called.
| * @iov: array of iovec structures (defined in <sys/uio.h>) to receive each |
| * mapping |
| * @cnt: number of scatter/gather entries to map |
| * @flags: must be 0 |
| * |
| * @returns 0 on success, -1 on failure. Sets errno. |
| */ |
| int |
| vfu_sgl_get(vfu_ctx_t *vfu_ctx, dma_sg_t *sgl, struct iovec *iov, size_t cnt, |
| int flags); |
| |
| /** |
| * Mark scatter/gather entries (previously acquired via vfu_sgl_get()) |
| * as dirty (written to). This is only necessary if vfu_sgl_put() is not called. |
| * |
| * @vfu_ctx: the libvfio-user context |
 * @sgl: array of scatter/gather entries to mark as dirty
| * @cnt: number of scatter/gather entries to mark as dirty |
| */ |
| void |
| vfu_sgl_mark_dirty(vfu_ctx_t *vfu_ctx, dma_sg_t *sgl, size_t cnt); |
| |
| /** |
| * Release the iovec array previously acquired by vfu_sgl_get(). |
| * |
| * This will automatically mark the sgl as dirty if needed. |
| * |
| * @vfu_ctx: the libvfio-user context |
| * @sgl: array of scatter/gather entries to unmap |
| * @iov: array of iovec structures for each scatter/gather entry |
| * @cnt: number of scatter/gather entries to unmap |
| */ |
| void |
| vfu_sgl_put(vfu_ctx_t *vfu_ctx, dma_sg_t *sgl, struct iovec *iov, size_t cnt); |
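
/*
 * For example, writing a buffer into guest memory via a direct mapping (a
 * sketch; error handling is omitted, dma_addr, buf and len are assumed, and a
 * single-entry SGL is assumed to suffice):
 *
 *     dma_sg_t *sg = malloc(dma_sg_size());
 *     struct iovec iov;
 *
 *     if (vfu_addr_to_sgl(vfu_ctx, dma_addr, len, sg, 1, PROT_WRITE) == 1 &&
 *         vfu_sgl_get(vfu_ctx, sg, &iov, 1, 0) == 0) {
 *         memcpy(iov.iov_base, buf, len);    // direct access to guest memory
 *         vfu_sgl_put(vfu_ctx, sg, &iov, 1); // releases and marks dirty
 *     }
 *     free(sg);
 */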
| |
| /** |
 * Read from the DMA region exposed by the client. This can be used as an
 * alternative to reading from a vfu_sgl_get() mapping, if the region is not
 * directly mappable, or if DMA notification callbacks have not been provided.
 *
 * @vfu_ctx: the libvfio-user context
 * @sg: array of scatter/gather entries obtained from vfu_addr_to_sgl()
 * @cnt: number of scatter/gather entries
 * @data: data buffer to read into
| * |
| * @returns 0 on success, -1 on failure. Sets errno. |
| */ |
| int |
| vfu_sgl_read(vfu_ctx_t *vfu_ctx, dma_sg_t *sg, size_t cnt, void *data); |
| |
| /** |
 * Write to the DMA region exposed by the client. This can be used as an
 * alternative to writing to a vfu_sgl_get() mapping, if the region is not
 * directly mappable, or if DMA notification callbacks have not been provided.
 *
 * During live migration, this call does not mark any of the written pages as
 * dirty; the client is expected to track this.
 *
 * @vfu_ctx: the libvfio-user context
 * @sg: array of scatter/gather entries obtained from vfu_addr_to_sgl()
 * @cnt: number of scatter/gather entries
 * @data: data buffer to write
| * |
| * @returns 0 on success, -1 on failure. Sets errno. |
| */ |
| int |
| vfu_sgl_write(vfu_ctx_t *vfu_ctx, dma_sg_t *sg, size_t cnt, void *data); |
| |
| /* |
| * Supported PCI regions. |
| * |
| * Note: in VFIO, each region starts at a terabyte offset |
| * (VFIO_PCI_INDEX_TO_OFFSET) and because Linux supports up to 128 TB of user |
| * space virtual memory, there can be up to 128 device regions. PCI regions are |
| * fixed and in retrospect this choice has proven to be problematic because |
| * devices might contain potentially unused regions. New regions can now be |
| * positioned anywhere by using the VFIO_REGION_INFO_CAP_TYPE capability. In |
| * vfio-user we don't have this problem because the region index is just an |
| * identifier: the VMM memory maps a file descriptor that is passed to it and |
| * the mapping offset is derived from the mmap_areas offset value, rather than a |
| * static mapping from region index to offset. Thus, additional regions can |
| * have static indexes in vfio-user. |
| */ |
| enum { |
| VFU_PCI_DEV_BAR0_REGION_IDX, |
| VFU_PCI_DEV_BAR1_REGION_IDX, |
| VFU_PCI_DEV_BAR2_REGION_IDX, |
| VFU_PCI_DEV_BAR3_REGION_IDX, |
| VFU_PCI_DEV_BAR4_REGION_IDX, |
| VFU_PCI_DEV_BAR5_REGION_IDX, |
| VFU_PCI_DEV_ROM_REGION_IDX, |
| VFU_PCI_DEV_CFG_REGION_IDX, |
| VFU_PCI_DEV_VGA_REGION_IDX, |
| VFU_PCI_DEV_MIGR_REGION_IDX, |
| VFU_PCI_DEV_NUM_REGIONS, |
| }; |
| |
| typedef enum { |
| VFU_PCI_TYPE_CONVENTIONAL, |
| VFU_PCI_TYPE_PCI_X_1, |
| VFU_PCI_TYPE_PCI_X_2, |
| VFU_PCI_TYPE_EXPRESS |
| } vfu_pci_type_t; |
| |
| enum { |
| VFU_GENERIC_DEV_MIGR_REGION_IDX, |
| VFU_GENERIC_DEV_NUM_REGIONS |
| }; |
| |
| /** |
| * Initialize the context for a PCI device. This function must be called only |
| * once per libvfio-user context. |
| * |
| * This function initializes a buffer for the PCI config space, accessible via |
| * vfu_pci_get_config_space(). |
| * |
| * Returns 0 on success, or -1 on error, setting errno. |
| * |
| * @vfu_ctx: the libvfio-user context |
 * @pci_type: PCI type (conventional PCI, PCI-X mode 1, PCI-X mode 2,
 *    PCI Express)
| * @hdr_type: PCI header type. Only PCI_HEADER_TYPE_NORMAL is supported. |
| * @revision: PCI/PCI-X/PCIe revision |
| */ |
| int |
| vfu_pci_init(vfu_ctx_t *vfu_ctx, vfu_pci_type_t pci_type, |
| int hdr_type, int revision); |
| |
| /* |
| * Set the Vendor ID, Device ID, Subsystem Vendor ID, and Subsystem ID fields of |
| * the PCI config header (PCI3 6.2.1, 6.2.4). |
| * |
| * This must always be called for PCI devices, after vfu_pci_init(). |
| */ |
| void |
| vfu_pci_set_id(vfu_ctx_t *vfu_ctx, uint16_t vid, uint16_t did, |
| uint16_t ssvid, uint16_t ssid); |
| |
| /* |
| * Set the class code fields (base, sub-class, and programming interface) of the |
| * PCI config header (PCI3 6.2.1). |
| * |
| * If this function is not called, the fields are initialized to zero. |
| */ |
| void |
| vfu_pci_set_class(vfu_ctx_t *vfu_ctx, uint8_t base, uint8_t sub, uint8_t pi); |
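
/*
 * For example, initializing an NVMe-like PCIe function (a sketch; the vendor,
 * device and class values are illustrative):
 *
 *     vfu_pci_init(vfu_ctx, VFU_PCI_TYPE_EXPRESS, PCI_HEADER_TYPE_NORMAL, 0);
 *     vfu_pci_set_id(vfu_ctx, 0x4e58, 0x0001, 0x4e58, 0x0001);
 *     vfu_pci_set_class(vfu_ctx, 0x01, 0x08, 0x02); // mass storage, NVM, NVMe
 */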
| |
| |
| /* |
| * Returns a pointer to the PCI configuration space. |
| * |
| * PCI config space consists of an initial 64-byte vfu_pci_hdr_t, plus |
| * additional space, containing capabilities and/or device-specific |
| * configuration. Standard config space is 256 bytes (PCI_CFG_SPACE_SIZE); |
| * extended config space is 4096 bytes (PCI_CFG_SPACE_EXP_SIZE). |
| */ |
| vfu_pci_config_space_t * |
| vfu_pci_get_config_space(vfu_ctx_t *vfu_ctx); |
| |
| #define VFU_CAP_FLAG_EXTENDED (1 << 0) |
| #define VFU_CAP_FLAG_CALLBACK (1 << 1) |
| #define VFU_CAP_FLAG_READONLY (1 << 2) |
| |
| /** |
| * Add a PCI capability to PCI config space. |
| * |
| * Certain standard capabilities are handled entirely within the library: |
| * |
| * PCI_CAP_ID_EXP (pxcap) |
| * PCI_CAP_ID_MSIX (msixcap) |
| * PCI_CAP_ID_PM (pmcap) |
| * |
| * However, they must still be explicitly initialized and added here. |
| * |
| * The contents of @data are copied in. It must start with either a struct |
| * cap_hdr or a struct ext_cap_hdr, with the ID field set; the 'next' field is |
| * ignored. For PCI_CAP_ID_VNDR or PCI_EXT_CAP_ID_VNDR, the embedded size field |
| * must also be set; in general, any non-fixed-size capability must be |
| * initialized such that the size can be derived at this point. |
| * |
| * If @pos is non-zero, the capability will be placed at the given offset within |
| * configuration space. It must not overlap the PCI standard header, or any |
| * existing capability. Note that if a capability is added "out of order" in |
| * terms of the offset, there is no re-ordering of the capability list written |
| * in configuration space. |
| * |
| * If @pos is zero, the capability will be placed at a suitable offset |
| * automatically. |
| * |
| * The @flags field can be set as follows: |
| * |
| * VFU_CAP_FLAG_EXTENDED: this is an extended capability; supported if device is |
| * of PCI type VFU_PCI_TYPE_{PCI_X_2,EXPRESS}. |
| * |
 * VFU_CAP_FLAG_CALLBACK: all accesses to the capability are delegated to the
 * callback for the region VFU_PCI_DEV_CFG_REGION_IDX. The callback should copy
 * data into and out of the capability as needed (this could be directly on the
 * config space area from vfu_pci_get_config_space()). Writes to the initial
 * capability header (ID/next fields) are not supported.
| * |
| * VFU_CAP_FLAG_READONLY: this prevents clients from writing to the capability. |
| * By default, clients are allowed to write to any part of the capability, |
| * excluding the initial header. |
| * |
| * Returns the offset of the capability in config space, or -1 on error, with |
| * errno set. |
| * |
| * @vfu_ctx: the libvfio-user context |
| * @pos: specific offset for the capability, or 0. |
| * @flags: VFU_CAP_FLAG_* |
| * @data: capability data, including the header |
| */ |
| ssize_t |
| vfu_pci_add_capability(vfu_ctx_t *vfu_ctx, size_t pos, int flags, void *data); |
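
/*
 * For example, adding a power management capability at an automatically
 * chosen offset (a sketch; struct pmcap is declared in pci_caps/pm.h and is
 * zero-initialized here apart from its ID):
 *
 *     struct pmcap pm = { .hdr.id = PCI_CAP_ID_PM };
 *
 *     if (vfu_pci_add_capability(vfu_ctx, 0, 0, &pm) < 0) {
 *         err(EXIT_FAILURE, "failed to add PM capability");
 *     }
 */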
| |
| /** |
| * Find the offset within config space of a given capability (if there are |
| * multiple possible matches, use vfu_pci_find_next_capability()). |
| * |
| * Returns 0 if no such capability was found, with errno set. |
| * |
| * @vfu_ctx: the libvfio-user context |
 * @extended: whether the capability is an extended one or not
 * @cap_id: capability ID (PCI_CAP_ID_* or PCI_EXT_CAP_ID_*)
| */ |
| size_t |
| vfu_pci_find_capability(vfu_ctx_t *vfu_ctx, bool extended, int cap_id); |
| |
| /** |
| * Find the offset within config space of the given capability, starting from |
| * @pos, which must be the valid offset of an existing capability. This can be |
| * used to iterate through multiple capabilities with the same ID. |
| * |
| * Returns 0 if no more matching capabilities were found, with errno set. |
| * |
| * @vfu_ctx: the libvfio-user context |
 * @extended: whether the capability is an extended one or not
 * @pos: offset within config space to start looking
 * @cap_id: capability ID (PCI_CAP_ID_* or PCI_EXT_CAP_ID_*)
| */ |
| size_t |
| vfu_pci_find_next_capability(vfu_ctx_t *vfu_ctx, bool extended, |
| size_t pos, int cap_id); |
| |
/*
 * Returns whether the given scatter/gather entry refers to directly mappable
 * memory (i.e. whether vfu_sgl_get() can be used on it).
 */
bool
vfu_sg_is_mappable(vfu_ctx_t *vfu_ctx, dma_sg_t *sg);
| |
| /* |
 * Creates an ioeventfd for the given region at @gpa_offset, with the given
 * @size, @fd, @flags and @datamatch.
 *
 * Returns 0 on success and -1 on failure with errno set.
 *
 * @vfu_ctx: the libvfio-user context
 * @region_idx: the index of the memory region in which to set up the
 *    ioeventfd
 * @fd: the file descriptor of the ioeventfd
 * @gpa_offset: the offset into the memory region
 * @size: size of the ioeventfd
 * @flags: any flags to set up the ioeventfd
 * @datamatch: sets the datamatch value
 * @shadow_fd: file descriptor that can be mmap()ed; KVM writes the otherwise
 *    discarded value there when the ioeventfd is written to. If set to -1, a
 *    normal ioeventfd is set up instead of a shadow one. The vfio-user client
 *    is free to ignore this, even if it supports shadow ioeventfds. Requires
 *    a kernel with shadow ioeventfd support. Experimental: must be compiled
 *    with SHADOW_IOEVENTFD defined, otherwise this must be -1.
 * @shadow_offset: offset in shadow memory where the value is written
| */ |
| int |
| vfu_create_ioeventfd(vfu_ctx_t *vfu_ctx, uint32_t region_idx, int fd, |
| size_t gpa_offset, uint32_t size, uint32_t flags, |
| uint64_t datamatch, int shadow_fd, size_t shadow_offset); |
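
/*
 * For example, wiring a 4-byte doorbell at offset 0x1000 of BAR0 to an
 * eventfd (a sketch; no datamatch and no shadow fd; eventfd(2) comes from
 * <sys/eventfd.h>):
 *
 *     int efd = eventfd(0, 0);
 *
 *     vfu_create_ioeventfd(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, efd,
 *                          0x1000, 4, 0, 0, -1, 0);
 */
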
| #ifdef __cplusplus |
| } |
| #endif |
| |
| #endif /* LIB_VFIO_USER_H */ |
| |
| /* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */ |