| /* |
| * VDUSE (vDPA Device in Userspace) library |
| * |
| * Copyright (C) 2022 Bytedance Inc. and/or its affiliates. All rights reserved. |
| * Portions of codes and concepts borrowed from libvhost-user.c, so: |
| * Copyright IBM, Corp. 2007 |
| * Copyright (c) 2016 Red Hat, Inc. |
| * |
| * Author: |
| * Xie Yongji <xieyongji@bytedance.com> |
| * Anthony Liguori <aliguori@us.ibm.com> |
| * Marc-André Lureau <mlureau@redhat.com> |
| * Victor Kaplansky <victork@redhat.com> |
| * |
| * This work is licensed under the terms of the GNU GPL, version 2 or |
| * later. See the COPYING file in the top-level directory. |
| */ |
| |
| #ifndef _GNU_SOURCE |
| #define _GNU_SOURCE |
| #endif |
| |
| #include <stdlib.h> |
| #include <stdio.h> |
| #include <stdbool.h> |
| #include <stddef.h> |
| #include <errno.h> |
| #include <string.h> |
| #include <assert.h> |
| #include <endian.h> |
| #include <unistd.h> |
| #include <limits.h> |
| #include <fcntl.h> |
| #include <inttypes.h> |
| |
| #include <sys/ioctl.h> |
| #include <sys/eventfd.h> |
| #include <sys/mman.h> |
| |
| #include "include/atomic.h" |
| #include "linux-headers/linux/virtio_ring.h" |
| #include "linux-headers/linux/virtio_config.h" |
| #include "linux-headers/linux/vduse.h" |
| #include "libvduse.h" |
| |
/* Virtqueue alignment value handed to the kernel as vq_align. */
#define VDUSE_VQ_ALIGN 4096
/* Number of slots in the per-device IOVA region table. */
#define MAX_IOVA_REGIONS 256

/* Per-queue inflight log sizes are rounded up to a multiple of this. */
#define LOG_ALIGNMENT 64

/* Largest multiple of m that does not exceed n (integer division). */
#define ALIGN_DOWN(n, m) ((n) / (m) * (m))

/* Smallest multiple of m that is not below n. */
#define ALIGN_UP(n, m) (((n) + (m) - 1) / (m) * (m))

/* Branch-prediction hint: the condition is expected to be false. */
#ifndef unlikely
#define unlikely(x) __builtin_expect(!!(x), 0)
#endif
| |
/*
 * Inflight-log record for one split-ring descriptor.
 *
 * Instances live inside the mmap'ed log file, so the field order and the
 * explicit padding are presumably part of the on-disk format - do not
 * reorder.
 */
typedef struct VduseDescStateSplit {
    /* 1 while the descriptor has been popped and not yet completed. */
    uint8_t inflight;
    uint8_t padding[5];
    /* Not referenced by the code visible in this file. */
    uint16_t next;
    /* Sequence number assigned when the descriptor was popped. */
    uint64_t counter;
} VduseDescStateSplit;
| |
| typedef struct VduseVirtqLogInflight { |
| uint64_t features; |
| uint16_t version; |
| uint16_t desc_num; |
| uint16_t last_batch_head; |
| uint16_t used_idx; |
| VduseDescStateSplit desc[]; |
| } VduseVirtqLogInflight; |
| |
| typedef struct VduseVirtqLog { |
| VduseVirtqLogInflight inflight; |
| } VduseVirtqLog; |
| |
/* One entry of a queue's resubmit list: an inflight descriptor head and its sequence number. */
typedef struct VduseVirtqInflightDesc {
    /* Descriptor head index. */
    uint16_t index;
    /* Sequence number copied from the inflight log. */
    uint64_t counter;
} VduseVirtqInflightDesc;
| |
/* Geometry of a split virtqueue: its size, the IOVAs of its three parts and their local mappings. */
typedef struct VduseRing {
    /* Queue size reported by the kernel. */
    unsigned int num;
    /* IOVA of the descriptor table. */
    uint64_t desc_addr;
    /* IOVA of the available (driver) ring. */
    uint64_t avail_addr;
    /* IOVA of the used (device) ring. */
    uint64_t used_addr;
    /* Local mappings of the three areas; NULL while the queue is disabled. */
    struct vring_desc *desc;
    struct vring_avail *avail;
    struct vring_used *used;
} VduseRing;
| |
| struct VduseVirtq { |
| VduseRing vring; |
| uint16_t last_avail_idx; |
| uint16_t shadow_avail_idx; |
| uint16_t used_idx; |
| uint16_t signalled_used; |
| bool signalled_used_valid; |
| int index; |
| unsigned int inuse; |
| bool ready; |
| int fd; |
| VduseDev *dev; |
| VduseVirtqInflightDesc *resubmit_list; |
| uint16_t resubmit_num; |
| uint64_t counter; |
| VduseVirtqLog *log; |
| }; |
| |
/*
 * One mmap'ed IOVA range. A slot is considered free when mmap_addr is 0.
 */
typedef struct VduseIovaRegion {
    /* First IOVA covered by the region. */
    uint64_t iova;
    /* Length of the region in bytes. */
    uint64_t size;
    /* Offset of the region's data inside the mapping. */
    uint64_t mmap_offset;
    /* Address returned by mmap(), stored as an integer. */
    uint64_t mmap_addr;
} VduseIovaRegion;
| |
| struct VduseDev { |
| VduseVirtq *vqs; |
| VduseIovaRegion regions[MAX_IOVA_REGIONS]; |
| int num_regions; |
| char *name; |
| uint32_t device_id; |
| uint32_t vendor_id; |
| uint16_t num_queues; |
| uint16_t queue_size; |
| uint64_t features; |
| const VduseOps *ops; |
| int fd; |
| int ctrl_fd; |
| void *priv; |
| void *log; |
| }; |
| |
| static inline size_t vduse_vq_log_size(uint16_t queue_size) |
| { |
| return ALIGN_UP(sizeof(VduseDescStateSplit) * queue_size + |
| sizeof(VduseVirtqLogInflight), LOG_ALIGNMENT); |
| } |
| |
/*
 * Open (creating it with mode 0600 if needed) the file @filename, resize it
 * to @size bytes and map it shared and read/write.
 *
 * Returns the mapping, or MAP_FAILED if open, ftruncate or mmap fails. The
 * file descriptor is always closed before returning.
 */
static void *vduse_log_get(const char *filename, size_t size)
{
    void *mapping;
    int log_fd = open(filename, O_RDWR | O_CREAT, 0600);

    if (log_fd == -1) {
        return MAP_FAILED;
    }

    if (ftruncate(log_fd, size) == -1) {
        mapping = MAP_FAILED;
    } else {
        mapping = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED,
                       log_fd, 0);
    }

    close(log_fd);
    return mapping;
}
| |
/* Return true if bit @fbit (0..63) is set in @features. */
static inline bool has_feature(uint64_t features, unsigned int fbit)
{
    uint64_t mask;

    assert(fbit < 64);
    mask = 1ULL << fbit;

    return (features & mask) != 0;
}
| |
| static inline bool vduse_dev_has_feature(VduseDev *dev, unsigned int fbit) |
| { |
| return has_feature(dev->features, fbit); |
| } |
| |
| uint64_t vduse_get_virtio_features(void) |
| { |
| return (1ULL << VIRTIO_F_IOMMU_PLATFORM) | |
| (1ULL << VIRTIO_F_VERSION_1) | |
| (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) | |
| (1ULL << VIRTIO_RING_F_EVENT_IDX) | |
| (1ULL << VIRTIO_RING_F_INDIRECT_DESC); |
| } |
| |
| VduseDev *vduse_queue_get_dev(VduseVirtq *vq) |
| { |
| return vq->dev; |
| } |
| |
| int vduse_queue_get_fd(VduseVirtq *vq) |
| { |
| return vq->fd; |
| } |
| |
| void *vduse_dev_get_priv(VduseDev *dev) |
| { |
| return dev->priv; |
| } |
| |
| VduseVirtq *vduse_dev_get_queue(VduseDev *dev, int index) |
| { |
| return &dev->vqs[index]; |
| } |
| |
| int vduse_dev_get_fd(VduseDev *dev) |
| { |
| return dev->fd; |
| } |
| |
| static int vduse_inject_irq(VduseDev *dev, int index) |
| { |
| return ioctl(dev->fd, VDUSE_VQ_INJECT_IRQ, &index); |
| } |
| |
| static int inflight_desc_compare(const void *a, const void *b) |
| { |
| VduseVirtqInflightDesc *desc0 = (VduseVirtqInflightDesc *)a, |
| *desc1 = (VduseVirtqInflightDesc *)b; |
| |
| if (desc1->counter > desc0->counter && |
| (desc1->counter - desc0->counter) < VIRTQUEUE_MAX_SIZE * 2) { |
| return 1; |
| } |
| |
| return -1; |
| } |
| |
| static int vduse_queue_check_inflights(VduseVirtq *vq) |
| { |
| int i = 0; |
| VduseDev *dev = vq->dev; |
| |
| vq->used_idx = le16toh(vq->vring.used->idx); |
| vq->resubmit_num = 0; |
| vq->resubmit_list = NULL; |
| vq->counter = 0; |
| |
| if (unlikely(vq->log->inflight.used_idx != vq->used_idx)) { |
| if (vq->log->inflight.last_batch_head > VIRTQUEUE_MAX_SIZE) { |
| return -1; |
| } |
| |
| vq->log->inflight.desc[vq->log->inflight.last_batch_head].inflight = 0; |
| |
| barrier(); |
| |
| vq->log->inflight.used_idx = vq->used_idx; |
| } |
| |
| for (i = 0; i < vq->log->inflight.desc_num; i++) { |
| if (vq->log->inflight.desc[i].inflight == 1) { |
| vq->inuse++; |
| } |
| } |
| |
| vq->shadow_avail_idx = vq->last_avail_idx = vq->inuse + vq->used_idx; |
| |
| if (vq->inuse) { |
| vq->resubmit_list = calloc(vq->inuse, sizeof(VduseVirtqInflightDesc)); |
| if (!vq->resubmit_list) { |
| return -1; |
| } |
| |
| for (i = 0; i < vq->log->inflight.desc_num; i++) { |
| if (vq->log->inflight.desc[i].inflight) { |
| vq->resubmit_list[vq->resubmit_num].index = i; |
| vq->resubmit_list[vq->resubmit_num].counter = |
| vq->log->inflight.desc[i].counter; |
| vq->resubmit_num++; |
| } |
| } |
| |
| if (vq->resubmit_num > 1) { |
| qsort(vq->resubmit_list, vq->resubmit_num, |
| sizeof(VduseVirtqInflightDesc), inflight_desc_compare); |
| } |
| vq->counter = vq->resubmit_list[0].counter + 1; |
| } |
| |
| vduse_inject_irq(dev, vq->index); |
| |
| return 0; |
| } |
| |
| static int vduse_queue_inflight_get(VduseVirtq *vq, int desc_idx) |
| { |
| vq->log->inflight.desc[desc_idx].counter = vq->counter++; |
| |
| barrier(); |
| |
| vq->log->inflight.desc[desc_idx].inflight = 1; |
| |
| return 0; |
| } |
| |
| static int vduse_queue_inflight_pre_put(VduseVirtq *vq, int desc_idx) |
| { |
| vq->log->inflight.last_batch_head = desc_idx; |
| |
| return 0; |
| } |
| |
| static int vduse_queue_inflight_post_put(VduseVirtq *vq, int desc_idx) |
| { |
| vq->log->inflight.desc[desc_idx].inflight = 0; |
| |
| barrier(); |
| |
| vq->log->inflight.used_idx = vq->used_idx; |
| |
| return 0; |
| } |
| |
| static void vduse_iova_remove_region(VduseDev *dev, uint64_t start, |
| uint64_t last) |
| { |
| int i; |
| |
| if (last == start) { |
| return; |
| } |
| |
| for (i = 0; i < MAX_IOVA_REGIONS; i++) { |
| if (!dev->regions[i].mmap_addr) { |
| continue; |
| } |
| |
| if (start <= dev->regions[i].iova && |
| last >= (dev->regions[i].iova + dev->regions[i].size - 1)) { |
| munmap((void *)(uintptr_t)dev->regions[i].mmap_addr, |
| dev->regions[i].mmap_offset + dev->regions[i].size); |
| dev->regions[i].mmap_addr = 0; |
| dev->num_regions--; |
| } |
| } |
| } |
| |
| static int vduse_iova_add_region(VduseDev *dev, int fd, |
| uint64_t offset, uint64_t start, |
| uint64_t last, int prot) |
| { |
| int i; |
| uint64_t size = last - start + 1; |
| void *mmap_addr = mmap(0, size + offset, prot, MAP_SHARED, fd, 0); |
| |
| if (mmap_addr == MAP_FAILED) { |
| close(fd); |
| return -EINVAL; |
| } |
| |
| for (i = 0; i < MAX_IOVA_REGIONS; i++) { |
| if (!dev->regions[i].mmap_addr) { |
| dev->regions[i].mmap_addr = (uint64_t)(uintptr_t)mmap_addr; |
| dev->regions[i].mmap_offset = offset; |
| dev->regions[i].iova = start; |
| dev->regions[i].size = size; |
| dev->num_regions++; |
| break; |
| } |
| } |
| assert(i < MAX_IOVA_REGIONS); |
| close(fd); |
| |
| return 0; |
| } |
| |
| static int perm_to_prot(uint8_t perm) |
| { |
| int prot = 0; |
| |
| switch (perm) { |
| case VDUSE_ACCESS_WO: |
| prot |= PROT_WRITE; |
| break; |
| case VDUSE_ACCESS_RO: |
| prot |= PROT_READ; |
| break; |
| case VDUSE_ACCESS_RW: |
| prot |= PROT_READ | PROT_WRITE; |
| break; |
| default: |
| break; |
| } |
| |
| return prot; |
| } |
| |
| static inline void *iova_to_va(VduseDev *dev, uint64_t *plen, uint64_t iova) |
| { |
| int i, ret; |
| struct vduse_iotlb_entry entry; |
| |
| for (i = 0; i < MAX_IOVA_REGIONS; i++) { |
| VduseIovaRegion *r = &dev->regions[i]; |
| |
| if (!r->mmap_addr) { |
| continue; |
| } |
| |
| if ((iova >= r->iova) && (iova < (r->iova + r->size))) { |
| if ((iova + *plen) > (r->iova + r->size)) { |
| *plen = r->iova + r->size - iova; |
| } |
| return (void *)(uintptr_t)(iova - r->iova + |
| r->mmap_addr + r->mmap_offset); |
| } |
| } |
| |
| entry.start = iova; |
| entry.last = iova + 1; |
| ret = ioctl(dev->fd, VDUSE_IOTLB_GET_FD, &entry); |
| if (ret < 0) { |
| return NULL; |
| } |
| |
| if (!vduse_iova_add_region(dev, ret, entry.offset, entry.start, |
| entry.last, perm_to_prot(entry.perm))) { |
| return iova_to_va(dev, plen, iova); |
| } |
| |
| return NULL; |
| } |
| |
| static inline uint16_t vring_avail_flags(VduseVirtq *vq) |
| { |
| return le16toh(vq->vring.avail->flags); |
| } |
| |
| static inline uint16_t vring_avail_idx(VduseVirtq *vq) |
| { |
| vq->shadow_avail_idx = le16toh(vq->vring.avail->idx); |
| |
| return vq->shadow_avail_idx; |
| } |
| |
| static inline uint16_t vring_avail_ring(VduseVirtq *vq, int i) |
| { |
| return le16toh(vq->vring.avail->ring[i]); |
| } |
| |
| static inline uint16_t vring_get_used_event(VduseVirtq *vq) |
| { |
| return vring_avail_ring(vq, vq->vring.num); |
| } |
| |
| static bool vduse_queue_get_head(VduseVirtq *vq, unsigned int idx, |
| unsigned int *head) |
| { |
| /* |
| * Grab the next descriptor number they're advertising, and increment |
| * the index we've seen. |
| */ |
| *head = vring_avail_ring(vq, idx % vq->vring.num); |
| |
| /* If their number is silly, that's a fatal mistake. */ |
| if (*head >= vq->vring.num) { |
| fprintf(stderr, "Guest says index %u is available\n", *head); |
| return false; |
| } |
| |
| return true; |
| } |
| |
| static int |
| vduse_queue_read_indirect_desc(VduseDev *dev, struct vring_desc *desc, |
| uint64_t addr, size_t len) |
| { |
| struct vring_desc *ori_desc; |
| uint64_t read_len; |
| |
| if (len > (VIRTQUEUE_MAX_SIZE * sizeof(struct vring_desc))) { |
| return -1; |
| } |
| |
| if (len == 0) { |
| return -1; |
| } |
| |
| while (len) { |
| read_len = len; |
| ori_desc = iova_to_va(dev, &read_len, addr); |
| if (!ori_desc) { |
| return -1; |
| } |
| |
| memcpy(desc, ori_desc, read_len); |
| len -= read_len; |
| addr += read_len; |
| desc += read_len; |
| } |
| |
| return 0; |
| } |
| |
/* Result codes of vduse_queue_read_next_desc(). */
enum {
    /* The chain points outside the descriptor table. */
    VIRTQUEUE_READ_DESC_ERROR = -1,
    /* End of chain. */
    VIRTQUEUE_READ_DESC_DONE = 0,
    /* More buffers in chain. */
    VIRTQUEUE_READ_DESC_MORE = 1,
};
| |
| static int vduse_queue_read_next_desc(struct vring_desc *desc, int i, |
| unsigned int max, unsigned int *next) |
| { |
| /* If this descriptor says it doesn't chain, we're done. */ |
| if (!(le16toh(desc[i].flags) & VRING_DESC_F_NEXT)) { |
| return VIRTQUEUE_READ_DESC_DONE; |
| } |
| |
| /* Check they're not leading us off end of descriptors. */ |
| *next = desc[i].next; |
| /* Make sure compiler knows to grab that: we don't want it changing! */ |
| smp_wmb(); |
| |
| if (*next >= max) { |
| fprintf(stderr, "Desc next is %u\n", *next); |
| return VIRTQUEUE_READ_DESC_ERROR; |
| } |
| |
| return VIRTQUEUE_READ_DESC_MORE; |
| } |
| |
| /* |
| * Fetch avail_idx from VQ memory only when we really need to know if |
| * guest has added some buffers. |
| */ |
| static bool vduse_queue_empty(VduseVirtq *vq) |
| { |
| if (unlikely(!vq->vring.avail)) { |
| return true; |
| } |
| |
| if (vq->shadow_avail_idx != vq->last_avail_idx) { |
| return false; |
| } |
| |
| return vring_avail_idx(vq) == vq->last_avail_idx; |
| } |
| |
| static bool vduse_queue_should_notify(VduseVirtq *vq) |
| { |
| VduseDev *dev = vq->dev; |
| uint16_t old, new; |
| bool v; |
| |
| /* We need to expose used array entries before checking used event. */ |
| smp_mb(); |
| |
| /* Always notify when queue is empty (when feature acknowledge) */ |
| if (vduse_dev_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) && |
| !vq->inuse && vduse_queue_empty(vq)) { |
| return true; |
| } |
| |
| if (!vduse_dev_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) { |
| return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT); |
| } |
| |
| v = vq->signalled_used_valid; |
| vq->signalled_used_valid = true; |
| old = vq->signalled_used; |
| new = vq->signalled_used = vq->used_idx; |
| return !v || vring_need_event(vring_get_used_event(vq), new, old); |
| } |
| |
| void vduse_queue_notify(VduseVirtq *vq) |
| { |
| VduseDev *dev = vq->dev; |
| |
| if (unlikely(!vq->vring.avail)) { |
| return; |
| } |
| |
| if (!vduse_queue_should_notify(vq)) { |
| return; |
| } |
| |
| if (vduse_inject_irq(dev, vq->index) < 0) { |
| fprintf(stderr, "Error inject irq for vq %d: %s\n", |
| vq->index, strerror(errno)); |
| } |
| } |
| |
| static inline void vring_set_avail_event(VduseVirtq *vq, uint16_t val) |
| { |
| uint16_t val_le = htole16(val); |
| memcpy(&vq->vring.used->ring[vq->vring.num], &val_le, sizeof(uint16_t)); |
| } |
| |
| static bool vduse_queue_map_single_desc(VduseVirtq *vq, unsigned int *p_num_sg, |
| struct iovec *iov, unsigned int max_num_sg, |
| bool is_write, uint64_t pa, size_t sz) |
| { |
| unsigned num_sg = *p_num_sg; |
| VduseDev *dev = vq->dev; |
| |
| assert(num_sg <= max_num_sg); |
| |
| if (!sz) { |
| fprintf(stderr, "virtio: zero sized buffers are not allowed\n"); |
| return false; |
| } |
| |
| while (sz) { |
| uint64_t len = sz; |
| |
| if (num_sg == max_num_sg) { |
| fprintf(stderr, |
| "virtio: too many descriptors in indirect table\n"); |
| return false; |
| } |
| |
| iov[num_sg].iov_base = iova_to_va(dev, &len, pa); |
| if (iov[num_sg].iov_base == NULL) { |
| fprintf(stderr, "virtio: invalid address for buffers\n"); |
| return false; |
| } |
| iov[num_sg++].iov_len = len; |
| sz -= len; |
| pa += len; |
| } |
| |
| *p_num_sg = num_sg; |
| return true; |
| } |
| |
| static void *vduse_queue_alloc_element(size_t sz, unsigned out_num, |
| unsigned in_num) |
| { |
| VduseVirtqElement *elem; |
| size_t in_sg_ofs = ALIGN_UP(sz, __alignof__(elem->in_sg[0])); |
| size_t out_sg_ofs = in_sg_ofs + in_num * sizeof(elem->in_sg[0]); |
| size_t out_sg_end = out_sg_ofs + out_num * sizeof(elem->out_sg[0]); |
| |
| assert(sz >= sizeof(VduseVirtqElement)); |
| elem = malloc(out_sg_end); |
| if (!elem) { |
| return NULL; |
| } |
| elem->out_num = out_num; |
| elem->in_num = in_num; |
| elem->in_sg = (void *)elem + in_sg_ofs; |
| elem->out_sg = (void *)elem + out_sg_ofs; |
| return elem; |
| } |
| |
| static void *vduse_queue_map_desc(VduseVirtq *vq, unsigned int idx, size_t sz) |
| { |
| struct vring_desc *desc = vq->vring.desc; |
| VduseDev *dev = vq->dev; |
| uint64_t desc_addr, read_len; |
| unsigned int desc_len; |
| unsigned int max = vq->vring.num; |
| unsigned int i = idx; |
| VduseVirtqElement *elem; |
| struct iovec iov[VIRTQUEUE_MAX_SIZE]; |
| struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE]; |
| unsigned int out_num = 0, in_num = 0; |
| int rc; |
| |
| if (le16toh(desc[i].flags) & VRING_DESC_F_INDIRECT) { |
| if (le32toh(desc[i].len) % sizeof(struct vring_desc)) { |
| fprintf(stderr, "Invalid size for indirect buffer table\n"); |
| return NULL; |
| } |
| |
| /* loop over the indirect descriptor table */ |
| desc_addr = le64toh(desc[i].addr); |
| desc_len = le32toh(desc[i].len); |
| max = desc_len / sizeof(struct vring_desc); |
| read_len = desc_len; |
| desc = iova_to_va(dev, &read_len, desc_addr); |
| if (unlikely(desc && read_len != desc_len)) { |
| /* Failed to use zero copy */ |
| desc = NULL; |
| if (!vduse_queue_read_indirect_desc(dev, desc_buf, |
| desc_addr, |
| desc_len)) { |
| desc = desc_buf; |
| } |
| } |
| if (!desc) { |
| fprintf(stderr, "Invalid indirect buffer table\n"); |
| return NULL; |
| } |
| i = 0; |
| } |
| |
| /* Collect all the descriptors */ |
| do { |
| if (le16toh(desc[i].flags) & VRING_DESC_F_WRITE) { |
| if (!vduse_queue_map_single_desc(vq, &in_num, iov + out_num, |
| VIRTQUEUE_MAX_SIZE - out_num, |
| true, le64toh(desc[i].addr), |
| le32toh(desc[i].len))) { |
| return NULL; |
| } |
| } else { |
| if (in_num) { |
| fprintf(stderr, "Incorrect order for descriptors\n"); |
| return NULL; |
| } |
| if (!vduse_queue_map_single_desc(vq, &out_num, iov, |
| VIRTQUEUE_MAX_SIZE, false, |
| le64toh(desc[i].addr), |
| le32toh(desc[i].len))) { |
| return NULL; |
| } |
| } |
| |
| /* If we've got too many, that implies a descriptor loop. */ |
| if ((in_num + out_num) > max) { |
| fprintf(stderr, "Looped descriptor\n"); |
| return NULL; |
| } |
| rc = vduse_queue_read_next_desc(desc, i, max, &i); |
| } while (rc == VIRTQUEUE_READ_DESC_MORE); |
| |
| if (rc == VIRTQUEUE_READ_DESC_ERROR) { |
| fprintf(stderr, "read descriptor error\n"); |
| return NULL; |
| } |
| |
| /* Now copy what we have collected and mapped */ |
| elem = vduse_queue_alloc_element(sz, out_num, in_num); |
| if (!elem) { |
| fprintf(stderr, "read descriptor error\n"); |
| return NULL; |
| } |
| elem->index = idx; |
| for (i = 0; i < out_num; i++) { |
| elem->out_sg[i] = iov[i]; |
| } |
| for (i = 0; i < in_num; i++) { |
| elem->in_sg[i] = iov[out_num + i]; |
| } |
| |
| return elem; |
| } |
| |
| void *vduse_queue_pop(VduseVirtq *vq, size_t sz) |
| { |
| unsigned int head; |
| VduseVirtqElement *elem; |
| VduseDev *dev = vq->dev; |
| int i; |
| |
| if (unlikely(!vq->vring.avail)) { |
| return NULL; |
| } |
| |
| if (unlikely(vq->resubmit_list && vq->resubmit_num > 0)) { |
| i = (--vq->resubmit_num); |
| elem = vduse_queue_map_desc(vq, vq->resubmit_list[i].index, sz); |
| |
| if (!vq->resubmit_num) { |
| free(vq->resubmit_list); |
| vq->resubmit_list = NULL; |
| } |
| |
| return elem; |
| } |
| |
| if (vduse_queue_empty(vq)) { |
| return NULL; |
| } |
| /* Needed after virtio_queue_empty() */ |
| smp_rmb(); |
| |
| if (vq->inuse >= vq->vring.num) { |
| fprintf(stderr, "Virtqueue size exceeded: %d\n", vq->inuse); |
| return NULL; |
| } |
| |
| if (!vduse_queue_get_head(vq, vq->last_avail_idx++, &head)) { |
| return NULL; |
| } |
| |
| if (vduse_dev_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) { |
| vring_set_avail_event(vq, vq->last_avail_idx); |
| } |
| |
| elem = vduse_queue_map_desc(vq, head, sz); |
| |
| if (!elem) { |
| return NULL; |
| } |
| |
| vq->inuse++; |
| |
| vduse_queue_inflight_get(vq, head); |
| |
| return elem; |
| } |
| |
| static inline void vring_used_write(VduseVirtq *vq, |
| struct vring_used_elem *uelem, int i) |
| { |
| struct vring_used *used = vq->vring.used; |
| |
| used->ring[i] = *uelem; |
| } |
| |
| static void vduse_queue_fill(VduseVirtq *vq, const VduseVirtqElement *elem, |
| unsigned int len, unsigned int idx) |
| { |
| struct vring_used_elem uelem; |
| |
| if (unlikely(!vq->vring.used)) { |
| return; |
| } |
| |
| idx = (idx + vq->used_idx) % vq->vring.num; |
| |
| uelem.id = htole32(elem->index); |
| uelem.len = htole32(len); |
| vring_used_write(vq, &uelem, idx); |
| } |
| |
| static inline void vring_used_idx_set(VduseVirtq *vq, uint16_t val) |
| { |
| vq->vring.used->idx = htole16(val); |
| vq->used_idx = val; |
| } |
| |
| static void vduse_queue_flush(VduseVirtq *vq, unsigned int count) |
| { |
| uint16_t old, new; |
| |
| if (unlikely(!vq->vring.used)) { |
| return; |
| } |
| |
| /* Make sure buffer is written before we update index. */ |
| smp_wmb(); |
| |
| old = vq->used_idx; |
| new = old + count; |
| vring_used_idx_set(vq, new); |
| vq->inuse -= count; |
| if (unlikely((int16_t)(new - vq->signalled_used) < (uint16_t)(new - old))) { |
| vq->signalled_used_valid = false; |
| } |
| } |
| |
| void vduse_queue_push(VduseVirtq *vq, const VduseVirtqElement *elem, |
| unsigned int len) |
| { |
| vduse_queue_fill(vq, elem, len, 0); |
| vduse_queue_inflight_pre_put(vq, elem->index); |
| vduse_queue_flush(vq, 1); |
| vduse_queue_inflight_post_put(vq, elem->index); |
| } |
| |
| static int vduse_queue_update_vring(VduseVirtq *vq, uint64_t desc_addr, |
| uint64_t avail_addr, uint64_t used_addr) |
| { |
| struct VduseDev *dev = vq->dev; |
| uint64_t len; |
| |
| len = sizeof(struct vring_desc); |
| vq->vring.desc = iova_to_va(dev, &len, desc_addr); |
| if (len != sizeof(struct vring_desc)) { |
| return -EINVAL; |
| } |
| |
| len = sizeof(struct vring_avail); |
| vq->vring.avail = iova_to_va(dev, &len, avail_addr); |
| if (len != sizeof(struct vring_avail)) { |
| return -EINVAL; |
| } |
| |
| len = sizeof(struct vring_used); |
| vq->vring.used = iova_to_va(dev, &len, used_addr); |
| if (len != sizeof(struct vring_used)) { |
| return -EINVAL; |
| } |
| |
| if (!vq->vring.desc || !vq->vring.avail || !vq->vring.used) { |
| fprintf(stderr, "Failed to get vq[%d] iova mapping\n", vq->index); |
| return -EINVAL; |
| } |
| |
| return 0; |
| } |
| |
| static void vduse_queue_enable(VduseVirtq *vq) |
| { |
| struct VduseDev *dev = vq->dev; |
| struct vduse_vq_info vq_info; |
| struct vduse_vq_eventfd vq_eventfd; |
| int fd; |
| |
| vq_info.index = vq->index; |
| if (ioctl(dev->fd, VDUSE_VQ_GET_INFO, &vq_info)) { |
| fprintf(stderr, "Failed to get vq[%d] info: %s\n", |
| vq->index, strerror(errno)); |
| return; |
| } |
| |
| if (!vq_info.ready) { |
| return; |
| } |
| |
| vq->vring.num = vq_info.num; |
| vq->vring.desc_addr = vq_info.desc_addr; |
| vq->vring.avail_addr = vq_info.driver_addr; |
| vq->vring.used_addr = vq_info.device_addr; |
| |
| if (vduse_queue_update_vring(vq, vq_info.desc_addr, |
| vq_info.driver_addr, vq_info.device_addr)) { |
| fprintf(stderr, "Failed to update vring for vq[%d]\n", vq->index); |
| return; |
| } |
| |
| fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); |
| if (fd < 0) { |
| fprintf(stderr, "Failed to init eventfd for vq[%d]\n", vq->index); |
| return; |
| } |
| |
| vq_eventfd.index = vq->index; |
| vq_eventfd.fd = fd; |
| if (ioctl(dev->fd, VDUSE_VQ_SETUP_KICKFD, &vq_eventfd)) { |
| fprintf(stderr, "Failed to setup kick fd for vq[%d]\n", vq->index); |
| close(fd); |
| return; |
| } |
| |
| vq->fd = fd; |
| vq->signalled_used_valid = false; |
| vq->ready = true; |
| |
| if (vduse_queue_check_inflights(vq)) { |
| fprintf(stderr, "Failed to check inflights for vq[%d]\n", vq->index); |
| close(fd); |
| return; |
| } |
| |
| dev->ops->enable_queue(dev, vq); |
| } |
| |
| static void vduse_queue_disable(VduseVirtq *vq) |
| { |
| struct VduseDev *dev = vq->dev; |
| struct vduse_vq_eventfd eventfd; |
| |
| if (!vq->ready) { |
| return; |
| } |
| |
| dev->ops->disable_queue(dev, vq); |
| |
| eventfd.index = vq->index; |
| eventfd.fd = VDUSE_EVENTFD_DEASSIGN; |
| ioctl(dev->fd, VDUSE_VQ_SETUP_KICKFD, &eventfd); |
| close(vq->fd); |
| |
| assert(vq->inuse == 0); |
| |
| vq->vring.num = 0; |
| vq->vring.desc_addr = 0; |
| vq->vring.avail_addr = 0; |
| vq->vring.used_addr = 0; |
| vq->vring.desc = 0; |
| vq->vring.avail = 0; |
| vq->vring.used = 0; |
| vq->ready = false; |
| vq->fd = -1; |
| } |
| |
| static void vduse_dev_start_dataplane(VduseDev *dev) |
| { |
| int i; |
| |
| if (ioctl(dev->fd, VDUSE_DEV_GET_FEATURES, &dev->features)) { |
| fprintf(stderr, "Failed to get features: %s\n", strerror(errno)); |
| return; |
| } |
| assert(vduse_dev_has_feature(dev, VIRTIO_F_VERSION_1)); |
| |
| for (i = 0; i < dev->num_queues; i++) { |
| vduse_queue_enable(&dev->vqs[i]); |
| } |
| } |
| |
| static void vduse_dev_stop_dataplane(VduseDev *dev) |
| { |
| size_t log_size = dev->num_queues * vduse_vq_log_size(VIRTQUEUE_MAX_SIZE); |
| int i; |
| |
| for (i = 0; i < dev->num_queues; i++) { |
| vduse_queue_disable(&dev->vqs[i]); |
| } |
| if (dev->log) { |
| memset(dev->log, 0, log_size); |
| } |
| dev->features = 0; |
| vduse_iova_remove_region(dev, 0, ULONG_MAX); |
| } |
| |
| int vduse_dev_handler(VduseDev *dev) |
| { |
| struct vduse_dev_request req; |
| struct vduse_dev_response resp = { 0 }; |
| VduseVirtq *vq; |
| int i, ret; |
| |
| ret = read(dev->fd, &req, sizeof(req)); |
| if (ret != sizeof(req)) { |
| fprintf(stderr, "Read request error [%d]: %s\n", |
| ret, strerror(errno)); |
| return -errno; |
| } |
| resp.request_id = req.request_id; |
| |
| switch (req.type) { |
| case VDUSE_GET_VQ_STATE: |
| vq = &dev->vqs[req.vq_state.index]; |
| resp.vq_state.split.avail_index = vq->last_avail_idx; |
| resp.result = VDUSE_REQ_RESULT_OK; |
| break; |
| case VDUSE_SET_STATUS: |
| if (req.s.status & VIRTIO_CONFIG_S_DRIVER_OK) { |
| vduse_dev_start_dataplane(dev); |
| } else if (req.s.status == 0) { |
| vduse_dev_stop_dataplane(dev); |
| } |
| resp.result = VDUSE_REQ_RESULT_OK; |
| break; |
| case VDUSE_UPDATE_IOTLB: |
| /* The iova will be updated by iova_to_va() later, so just remove it */ |
| vduse_iova_remove_region(dev, req.iova.start, req.iova.last); |
| for (i = 0; i < dev->num_queues; i++) { |
| vq = &dev->vqs[i]; |
| if (vq->ready) { |
| if (vduse_queue_update_vring(vq, vq->vring.desc_addr, |
| vq->vring.avail_addr, |
| vq->vring.used_addr)) { |
| fprintf(stderr, "Failed to update vring for vq[%d]\n", |
| vq->index); |
| } |
| } |
| } |
| resp.result = VDUSE_REQ_RESULT_OK; |
| break; |
| default: |
| resp.result = VDUSE_REQ_RESULT_FAILED; |
| break; |
| } |
| |
| ret = write(dev->fd, &resp, sizeof(resp)); |
| if (ret != sizeof(resp)) { |
| fprintf(stderr, "Write request %d error [%d]: %s\n", |
| req.type, ret, strerror(errno)); |
| return -errno; |
| } |
| return 0; |
| } |
| |
| int vduse_dev_update_config(VduseDev *dev, uint32_t size, |
| uint32_t offset, char *buffer) |
| { |
| int ret; |
| struct vduse_config_data *data; |
| |
| data = malloc(offsetof(struct vduse_config_data, buffer) + size); |
| if (!data) { |
| return -ENOMEM; |
| } |
| |
| data->offset = offset; |
| data->length = size; |
| memcpy(data->buffer, buffer, size); |
| |
| ret = ioctl(dev->fd, VDUSE_DEV_SET_CONFIG, data); |
| free(data); |
| |
| if (ret) { |
| return -errno; |
| } |
| |
| if (ioctl(dev->fd, VDUSE_DEV_INJECT_CONFIG_IRQ)) { |
| return -errno; |
| } |
| |
| return 0; |
| } |
| |
| int vduse_dev_setup_queue(VduseDev *dev, int index, int max_size) |
| { |
| VduseVirtq *vq = &dev->vqs[index]; |
| struct vduse_vq_config vq_config = { 0 }; |
| |
| if (max_size > VIRTQUEUE_MAX_SIZE) { |
| return -EINVAL; |
| } |
| |
| vq_config.index = vq->index; |
| vq_config.max_size = max_size; |
| |
| if (ioctl(dev->fd, VDUSE_VQ_SETUP, &vq_config)) { |
| return -errno; |
| } |
| |
| vduse_queue_enable(vq); |
| |
| return 0; |
| } |
| |
| int vduse_set_reconnect_log_file(VduseDev *dev, const char *filename) |
| { |
| |
| size_t log_size = dev->num_queues * vduse_vq_log_size(VIRTQUEUE_MAX_SIZE); |
| void *log; |
| int i; |
| |
| dev->log = log = vduse_log_get(filename, log_size); |
| if (log == MAP_FAILED) { |
| fprintf(stderr, "Failed to get vduse log\n"); |
| return -EINVAL; |
| } |
| |
| for (i = 0; i < dev->num_queues; i++) { |
| dev->vqs[i].log = log; |
| dev->vqs[i].log->inflight.desc_num = VIRTQUEUE_MAX_SIZE; |
| log = (void *)((char *)log + vduse_vq_log_size(VIRTQUEUE_MAX_SIZE)); |
| } |
| |
| return 0; |
| } |
| |
| static int vduse_dev_init_vqs(VduseDev *dev, uint16_t num_queues) |
| { |
| VduseVirtq *vqs; |
| int i; |
| |
| vqs = calloc(sizeof(VduseVirtq), num_queues); |
| if (!vqs) { |
| return -ENOMEM; |
| } |
| |
| for (i = 0; i < num_queues; i++) { |
| vqs[i].index = i; |
| vqs[i].dev = dev; |
| vqs[i].fd = -1; |
| } |
| dev->vqs = vqs; |
| |
| return 0; |
| } |
| |
| static int vduse_dev_init(VduseDev *dev, const char *name, |
| uint16_t num_queues, const VduseOps *ops, |
| void *priv) |
| { |
| char *dev_path, *dev_name; |
| int ret, fd; |
| |
| dev_path = malloc(strlen(name) + strlen("/dev/vduse/") + 1); |
| if (!dev_path) { |
| return -ENOMEM; |
| } |
| sprintf(dev_path, "/dev/vduse/%s", name); |
| |
| fd = open(dev_path, O_RDWR); |
| free(dev_path); |
| if (fd < 0) { |
| fprintf(stderr, "Failed to open vduse dev %s: %s\n", |
| name, strerror(errno)); |
| return -errno; |
| } |
| |
| if (ioctl(fd, VDUSE_DEV_GET_FEATURES, &dev->features)) { |
| fprintf(stderr, "Failed to get features: %s\n", strerror(errno)); |
| close(fd); |
| return -errno; |
| } |
| |
| dev_name = strdup(name); |
| if (!dev_name) { |
| close(fd); |
| return -ENOMEM; |
| } |
| |
| ret = vduse_dev_init_vqs(dev, num_queues); |
| if (ret) { |
| free(dev_name); |
| close(fd); |
| return ret; |
| } |
| |
| dev->name = dev_name; |
| dev->num_queues = num_queues; |
| dev->fd = fd; |
| dev->ops = ops; |
| dev->priv = priv; |
| |
| return 0; |
| } |
| |
| static inline bool vduse_name_is_invalid(const char *name) |
| { |
| return strlen(name) >= VDUSE_NAME_MAX || strstr(name, ".."); |
| } |
| |
| VduseDev *vduse_dev_create_by_fd(int fd, uint16_t num_queues, |
| const VduseOps *ops, void *priv) |
| { |
| VduseDev *dev; |
| int ret; |
| |
| if (!ops || !ops->enable_queue || !ops->disable_queue) { |
| fprintf(stderr, "Invalid parameter for vduse\n"); |
| return NULL; |
| } |
| |
| dev = calloc(sizeof(VduseDev), 1); |
| if (!dev) { |
| fprintf(stderr, "Failed to allocate vduse device\n"); |
| return NULL; |
| } |
| |
| if (ioctl(fd, VDUSE_DEV_GET_FEATURES, &dev->features)) { |
| fprintf(stderr, "Failed to get features: %s\n", strerror(errno)); |
| free(dev); |
| return NULL; |
| } |
| |
| ret = vduse_dev_init_vqs(dev, num_queues); |
| if (ret) { |
| fprintf(stderr, "Failed to init vqs\n"); |
| free(dev); |
| return NULL; |
| } |
| |
| dev->num_queues = num_queues; |
| dev->fd = fd; |
| dev->ops = ops; |
| dev->priv = priv; |
| |
| return dev; |
| } |
| |
| VduseDev *vduse_dev_create_by_name(const char *name, uint16_t num_queues, |
| const VduseOps *ops, void *priv) |
| { |
| VduseDev *dev; |
| int ret; |
| |
| if (!name || vduse_name_is_invalid(name) || !ops || |
| !ops->enable_queue || !ops->disable_queue) { |
| fprintf(stderr, "Invalid parameter for vduse\n"); |
| return NULL; |
| } |
| |
| dev = calloc(sizeof(VduseDev), 1); |
| if (!dev) { |
| fprintf(stderr, "Failed to allocate vduse device\n"); |
| return NULL; |
| } |
| |
| ret = vduse_dev_init(dev, name, num_queues, ops, priv); |
| if (ret < 0) { |
| fprintf(stderr, "Failed to init vduse device %s: %s\n", |
| name, strerror(-ret)); |
| free(dev); |
| return NULL; |
| } |
| |
| return dev; |
| } |
| |
| VduseDev *vduse_dev_create(const char *name, uint32_t device_id, |
| uint32_t vendor_id, uint64_t features, |
| uint16_t num_queues, uint32_t config_size, |
| char *config, const VduseOps *ops, void *priv) |
| { |
| VduseDev *dev; |
| int ret, ctrl_fd; |
| uint64_t version; |
| struct vduse_dev_config *dev_config; |
| size_t size = offsetof(struct vduse_dev_config, config); |
| |
| if (!name || vduse_name_is_invalid(name) || |
| !has_feature(features, VIRTIO_F_VERSION_1) || !config || |
| !config_size || !ops || !ops->enable_queue || !ops->disable_queue) { |
| fprintf(stderr, "Invalid parameter for vduse\n"); |
| return NULL; |
| } |
| |
| dev = calloc(sizeof(VduseDev), 1); |
| if (!dev) { |
| fprintf(stderr, "Failed to allocate vduse device\n"); |
| return NULL; |
| } |
| |
| ctrl_fd = open("/dev/vduse/control", O_RDWR); |
| if (ctrl_fd < 0) { |
| fprintf(stderr, "Failed to open /dev/vduse/control: %s\n", |
| strerror(errno)); |
| goto err_ctrl; |
| } |
| |
| version = VDUSE_API_VERSION; |
| if (ioctl(ctrl_fd, VDUSE_SET_API_VERSION, &version)) { |
| fprintf(stderr, "Failed to set api version %" PRIu64 ": %s\n", |
| version, strerror(errno)); |
| goto err_dev; |
| } |
| |
| dev_config = calloc(size + config_size, 1); |
| if (!dev_config) { |
| fprintf(stderr, "Failed to allocate config space\n"); |
| goto err_dev; |
| } |
| |
| assert(!vduse_name_is_invalid(name)); |
| strcpy(dev_config->name, name); |
| dev_config->device_id = device_id; |
| dev_config->vendor_id = vendor_id; |
| dev_config->features = features; |
| dev_config->vq_num = num_queues; |
| dev_config->vq_align = VDUSE_VQ_ALIGN; |
| dev_config->config_size = config_size; |
| memcpy(dev_config->config, config, config_size); |
| |
| ret = ioctl(ctrl_fd, VDUSE_CREATE_DEV, dev_config); |
| free(dev_config); |
| if (ret && errno != EEXIST) { |
| fprintf(stderr, "Failed to create vduse device %s: %s\n", |
| name, strerror(errno)); |
| goto err_dev; |
| } |
| dev->ctrl_fd = ctrl_fd; |
| |
| ret = vduse_dev_init(dev, name, num_queues, ops, priv); |
| if (ret < 0) { |
| fprintf(stderr, "Failed to init vduse device %s: %s\n", |
| name, strerror(-ret)); |
| goto err; |
| } |
| |
| return dev; |
| err: |
| ioctl(ctrl_fd, VDUSE_DESTROY_DEV, name); |
| err_dev: |
| close(ctrl_fd); |
| err_ctrl: |
| free(dev); |
| |
| return NULL; |
| } |
| |
| int vduse_dev_destroy(VduseDev *dev) |
| { |
| size_t log_size = dev->num_queues * vduse_vq_log_size(VIRTQUEUE_MAX_SIZE); |
| int i, ret = 0; |
| |
| if (dev->log) { |
| munmap(dev->log, log_size); |
| } |
| for (i = 0; i < dev->num_queues; i++) { |
| free(dev->vqs[i].resubmit_list); |
| } |
| free(dev->vqs); |
| if (dev->fd >= 0) { |
| close(dev->fd); |
| dev->fd = -1; |
| } |
| if (dev->ctrl_fd >= 0) { |
| if (ioctl(dev->ctrl_fd, VDUSE_DESTROY_DEV, dev->name)) { |
| ret = -errno; |
| } |
| close(dev->ctrl_fd); |
| dev->ctrl_fd = -1; |
| } |
| free(dev->name); |
| free(dev); |
| |
| return ret; |
| } |