| /* | 
 |  * vhost-vdpa | 
 |  * | 
 |  *  Copyright(c) 2017-2018 Intel Corporation. | 
 |  *  Copyright(c) 2020 Red Hat, Inc. | 
 |  * | 
 |  * This work is licensed under the terms of the GNU GPL, version 2 or later. | 
 |  * See the COPYING file in the top-level directory. | 
 |  * | 
 |  */ | 
 |  | 
 | #include "qemu/osdep.h" | 
 | #include <linux/vhost.h> | 
 | #include <linux/vfio.h> | 
 | #include <sys/eventfd.h> | 
 | #include <sys/ioctl.h> | 
 | #include "exec/target_page.h" | 
 | #include "hw/virtio/vhost.h" | 
 | #include "hw/virtio/vhost-backend.h" | 
 | #include "hw/virtio/virtio-net.h" | 
 | #include "hw/virtio/vhost-shadow-virtqueue.h" | 
 | #include "hw/virtio/vhost-vdpa.h" | 
 | #include "exec/address-spaces.h" | 
 | #include "migration/blocker.h" | 
 | #include "qemu/cutils.h" | 
 | #include "qemu/main-loop.h" | 
 | #include "trace.h" | 
 | #include "qapi/error.h" | 
 |  | 
 | /* | 
 * Return one past the end of the section. Be careful with uint64_t
 |  * conversions! | 
 |  */ | 
 | static Int128 vhost_vdpa_section_end(const MemoryRegionSection *section, | 
 |                                      int page_mask) | 
 | { | 
 |     Int128 llend = int128_make64(section->offset_within_address_space); | 
 |     llend = int128_add(llend, section->size); | 
 |     llend = int128_and(llend, int128_exts64(page_mask)); | 
 |  | 
 |     return llend; | 
 | } | 
 |  | 
 | static bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section, | 
 |                                                 uint64_t iova_min, | 
 |                                                 uint64_t iova_max, | 
 |                                                 int page_mask) | 
 | { | 
 |     Int128 llend; | 
 |     bool is_ram = memory_region_is_ram(section->mr); | 
 |     bool is_iommu = memory_region_is_iommu(section->mr); | 
 |     bool is_protected = memory_region_is_protected(section->mr); | 
 |  | 
    /* vhost-vDPA doesn't allow MMIO to be mapped */
 |     bool is_ram_device = memory_region_is_ram_device(section->mr); | 
 |  | 
 |     if ((!is_ram && !is_iommu) || is_protected || is_ram_device) { | 
 |         trace_vhost_vdpa_skipped_memory_section(is_ram, is_iommu, is_protected, | 
 |                                                 is_ram_device, iova_min, | 
 |                                                 iova_max, page_mask); | 
 |         return true; | 
 |     } | 
 |  | 
 |     if (section->offset_within_address_space < iova_min) { | 
 |         error_report("RAM section out of device range (min=0x%" PRIx64 | 
 |                      ", addr=0x%" HWADDR_PRIx ")", | 
 |                      iova_min, section->offset_within_address_space); | 
 |         return true; | 
 |     } | 
 |     /* | 
     * While using a vIOMMU, the section can be larger than iova_max, but the
     * memory that actually gets mapped is smaller, so defer the check to
     * vhost_vdpa_iommu_map_notify().  That function uses the actual size that
     * is mapped to the kernel.
 |      */ | 
 |  | 
 |     if (!is_iommu) { | 
 |         llend = vhost_vdpa_section_end(section, page_mask); | 
 |         if (int128_gt(llend, int128_make64(iova_max))) { | 
 |             error_report("RAM section out of device range (max=0x%" PRIx64 | 
 |                          ", end addr=0x%" PRIx64 ")", | 
 |                          iova_max, int128_get64(llend)); | 
 |             return true; | 
 |         } | 
 |     } | 
 |  | 
 |     return false; | 
 | } | 
 |  | 
 | /* | 
 |  * The caller must set asid = 0 if the device does not support asid. | 
 |  * This is not an ABI break since it is set to 0 by the initializer anyway. | 
 |  */ | 
 | int vhost_vdpa_dma_map(VhostVDPAShared *s, uint32_t asid, hwaddr iova, | 
 |                        hwaddr size, void *vaddr, bool readonly) | 
 | { | 
 |     struct vhost_msg_v2 msg = {}; | 
 |     int fd = s->device_fd; | 
 |     int ret = 0; | 
 |  | 
 |     msg.type = VHOST_IOTLB_MSG_V2; | 
 |     msg.asid = asid; | 
 |     msg.iotlb.iova = iova; | 
 |     msg.iotlb.size = size; | 
 |     msg.iotlb.uaddr = (uint64_t)(uintptr_t)vaddr; | 
 |     msg.iotlb.perm = readonly ? VHOST_ACCESS_RO : VHOST_ACCESS_RW; | 
 |     msg.iotlb.type = VHOST_IOTLB_UPDATE; | 
 |  | 
 |     trace_vhost_vdpa_dma_map(s, fd, msg.type, msg.asid, msg.iotlb.iova, | 
 |                              msg.iotlb.size, msg.iotlb.uaddr, msg.iotlb.perm, | 
 |                              msg.iotlb.type); | 
 |  | 
 |     if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) { | 
 |         error_report("failed to write, fd=%d, errno=%d (%s)", | 
 |             fd, errno, strerror(errno)); | 
        return -EIO;
 |     } | 
 |  | 
 |     return ret; | 
 | } | 
 |  | 
 | /* | 
 |  * The caller must set asid = 0 if the device does not support asid. | 
 |  * This is not an ABI break since it is set to 0 by the initializer anyway. | 
 |  */ | 
 | int vhost_vdpa_dma_unmap(VhostVDPAShared *s, uint32_t asid, hwaddr iova, | 
 |                          hwaddr size) | 
 | { | 
 |     struct vhost_msg_v2 msg = {}; | 
 |     int fd = s->device_fd; | 
 |     int ret = 0; | 
 |  | 
 |     msg.type = VHOST_IOTLB_MSG_V2; | 
 |     msg.asid = asid; | 
 |     msg.iotlb.iova = iova; | 
 |     msg.iotlb.size = size; | 
 |     msg.iotlb.type = VHOST_IOTLB_INVALIDATE; | 
 |  | 
 |     trace_vhost_vdpa_dma_unmap(s, fd, msg.type, msg.asid, msg.iotlb.iova, | 
 |                                msg.iotlb.size, msg.iotlb.type); | 
 |  | 
 |     if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) { | 
 |         error_report("failed to write, fd=%d, errno=%d (%s)", | 
 |             fd, errno, strerror(errno)); | 
        return -EIO;
 |     } | 
 |  | 
 |     return ret; | 
 | } | 
 |  | 
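/*
 * Send a VHOST_IOTLB_BATCH_BEGIN message so that the IOTLB updates that
 * follow are grouped into a single batch.
 */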
 | static void vhost_vdpa_listener_begin_batch(VhostVDPAShared *s) | 
 | { | 
 |     int fd = s->device_fd; | 
 |     struct vhost_msg_v2 msg = { | 
 |         .type = VHOST_IOTLB_MSG_V2, | 
 |         .iotlb.type = VHOST_IOTLB_BATCH_BEGIN, | 
 |     }; | 
 |  | 
 |     trace_vhost_vdpa_listener_begin_batch(s, fd, msg.type, msg.iotlb.type); | 
 |     if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) { | 
 |         error_report("failed to write, fd=%d, errno=%d (%s)", | 
 |                      fd, errno, strerror(errno)); | 
 |     } | 
 | } | 
 |  | 
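/*
 * Open an IOTLB batch the first time a mapping update is sent, provided the
 * backend advertises VHOST_BACKEND_F_IOTLB_BATCH.  The batch is closed in
 * vhost_vdpa_listener_commit().
 */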
 | static void vhost_vdpa_iotlb_batch_begin_once(VhostVDPAShared *s) | 
 | { | 
 |     if (s->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH) && | 
 |         !s->iotlb_batch_begin_sent) { | 
 |         vhost_vdpa_listener_begin_batch(s); | 
 |     } | 
 |  | 
 |     s->iotlb_batch_begin_sent = true; | 
 | } | 
 |  | 
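/*
 * Memory listener commit callback: close the IOTLB batch, if one was opened,
 * by sending VHOST_IOTLB_BATCH_END.
 */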
 | static void vhost_vdpa_listener_commit(MemoryListener *listener) | 
 | { | 
 |     VhostVDPAShared *s = container_of(listener, VhostVDPAShared, listener); | 
 |     struct vhost_msg_v2 msg = {}; | 
 |     int fd = s->device_fd; | 
 |  | 
 |     if (!(s->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH))) { | 
 |         return; | 
 |     } | 
 |  | 
 |     if (!s->iotlb_batch_begin_sent) { | 
 |         return; | 
 |     } | 
 |  | 
 |     msg.type = VHOST_IOTLB_MSG_V2; | 
 |     msg.iotlb.type = VHOST_IOTLB_BATCH_END; | 
 |  | 
 |     trace_vhost_vdpa_listener_commit(s, fd, msg.type, msg.iotlb.type); | 
 |     if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) { | 
 |         error_report("failed to write, fd=%d, errno=%d (%s)", | 
 |                      fd, errno, strerror(errno)); | 
 |     } | 
 |  | 
 |     s->iotlb_batch_begin_sent = false; | 
 | } | 
 |  | 
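/*
 * vIOMMU notifier: forward IOTLB events from the guest IOMMU to the device
 * as vhost-vdpa DMA maps or unmaps.
 */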
 | static void vhost_vdpa_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) | 
 | { | 
 |     struct vdpa_iommu *iommu = container_of(n, struct vdpa_iommu, n); | 
 |  | 
 |     hwaddr iova = iotlb->iova + iommu->iommu_offset; | 
 |     VhostVDPAShared *s = iommu->dev_shared; | 
 |     void *vaddr; | 
 |     int ret; | 
 |     Int128 llend; | 
 |     Error *local_err = NULL; | 
 |  | 
 |     if (iotlb->target_as != &address_space_memory) { | 
 |         error_report("Wrong target AS \"%s\", only system memory is allowed", | 
 |                      iotlb->target_as->name ? iotlb->target_as->name : "none"); | 
 |         return; | 
 |     } | 
 |     RCU_READ_LOCK_GUARD(); | 
    /* Check if the RAM section is out of the device range */
 |     llend = int128_add(int128_makes64(iotlb->addr_mask), int128_makes64(iova)); | 
 |     if (int128_gt(llend, int128_make64(s->iova_range.last))) { | 
 |         error_report("RAM section out of device range (max=0x%" PRIx64 | 
 |                      ", end addr=0x%" PRIx64 ")", | 
 |                      s->iova_range.last, int128_get64(llend)); | 
 |         return; | 
 |     } | 
 |  | 
 |     if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) { | 
 |         bool read_only; | 
 |  | 
 |         if (!memory_get_xlat_addr(iotlb, &vaddr, NULL, &read_only, NULL, | 
 |                                   &local_err)) { | 
 |             error_report_err(local_err); | 
 |             return; | 
 |         } | 
 |         ret = vhost_vdpa_dma_map(s, VHOST_VDPA_GUEST_PA_ASID, iova, | 
 |                                  iotlb->addr_mask + 1, vaddr, read_only); | 
 |         if (ret) { | 
 |             error_report("vhost_vdpa_dma_map(%p, 0x%" HWADDR_PRIx ", " | 
 |                          "0x%" HWADDR_PRIx ", %p) = %d (%m)", | 
 |                          s, iova, iotlb->addr_mask + 1, vaddr, ret); | 
 |         } | 
 |     } else { | 
 |         ret = vhost_vdpa_dma_unmap(s, VHOST_VDPA_GUEST_PA_ASID, iova, | 
 |                                    iotlb->addr_mask + 1); | 
 |         if (ret) { | 
 |             error_report("vhost_vdpa_dma_unmap(%p, 0x%" HWADDR_PRIx ", " | 
 |                          "0x%" HWADDR_PRIx ") = %d (%m)", | 
 |                          s, iova, iotlb->addr_mask + 1, ret); | 
 |         } | 
 |     } | 
 | } | 
 |  | 
 | static void vhost_vdpa_iommu_region_add(MemoryListener *listener, | 
 |                                         MemoryRegionSection *section) | 
 | { | 
 |     VhostVDPAShared *s = container_of(listener, VhostVDPAShared, listener); | 
 |  | 
 |     struct vdpa_iommu *iommu; | 
 |     Int128 end; | 
 |     int iommu_idx; | 
 |     IOMMUMemoryRegion *iommu_mr; | 
 |     int ret; | 
 |  | 
 |     iommu_mr = IOMMU_MEMORY_REGION(section->mr); | 
 |  | 
 |     iommu = g_malloc0(sizeof(*iommu)); | 
 |     end = int128_add(int128_make64(section->offset_within_region), | 
 |                      section->size); | 
 |     end = int128_sub(end, int128_one()); | 
 |     iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr, | 
 |                                                    MEMTXATTRS_UNSPECIFIED); | 
 |     iommu->iommu_mr = iommu_mr; | 
 |     iommu_notifier_init(&iommu->n, vhost_vdpa_iommu_map_notify, | 
 |                         IOMMU_NOTIFIER_IOTLB_EVENTS, | 
 |                         section->offset_within_region, | 
 |                         int128_get64(end), | 
 |                         iommu_idx); | 
 |     iommu->iommu_offset = section->offset_within_address_space - | 
 |                           section->offset_within_region; | 
 |     iommu->dev_shared = s; | 
 |  | 
 |     ret = memory_region_register_iommu_notifier(section->mr, &iommu->n, NULL); | 
 |     if (ret) { | 
 |         g_free(iommu); | 
 |         return; | 
 |     } | 
 |  | 
 |     QLIST_INSERT_HEAD(&s->iommu_list, iommu, iommu_next); | 
 |     memory_region_iommu_replay(iommu->iommu_mr, &iommu->n); | 
 |  | 
 |     return; | 
 | } | 
 |  | 
 | static void vhost_vdpa_iommu_region_del(MemoryListener *listener, | 
 |                                         MemoryRegionSection *section) | 
 | { | 
 |     VhostVDPAShared *s = container_of(listener, VhostVDPAShared, listener); | 
 |  | 
 |     struct vdpa_iommu *iommu; | 
 |  | 
 |     QLIST_FOREACH(iommu, &s->iommu_list, iommu_next) | 
 |     { | 
 |         if (MEMORY_REGION(iommu->iommu_mr) == section->mr && | 
 |             iommu->n.start == section->offset_within_region) { | 
 |             memory_region_unregister_iommu_notifier(section->mr, &iommu->n); | 
 |             QLIST_REMOVE(iommu, iommu_next); | 
 |             g_free(iommu); | 
 |             break; | 
 |         } | 
 |     } | 
 | } | 
 |  | 
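/*
 * Memory listener region_add callback: map a RAM section into the device, or
 * register a vIOMMU notifier for IOMMU sections.  With shadow data enabled,
 * the IOVA is allocated from the IOVA tree instead of reusing the guest
 * physical address.
 */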
 | static void vhost_vdpa_listener_region_add(MemoryListener *listener, | 
 |                                            MemoryRegionSection *section) | 
 | { | 
 |     DMAMap mem_region = {}; | 
 |     VhostVDPAShared *s = container_of(listener, VhostVDPAShared, listener); | 
 |     hwaddr iova; | 
 |     Int128 llend, llsize; | 
 |     void *vaddr; | 
 |     int ret; | 
 |     int page_size = qemu_target_page_size(); | 
 |     int page_mask = -page_size; | 
 |  | 
 |     if (vhost_vdpa_listener_skipped_section(section, s->iova_range.first, | 
 |                                             s->iova_range.last, page_mask)) { | 
 |         return; | 
 |     } | 
 |     if (memory_region_is_iommu(section->mr)) { | 
 |         vhost_vdpa_iommu_region_add(listener, section); | 
 |         return; | 
 |     } | 
 |  | 
 |     if (unlikely((section->offset_within_address_space & ~page_mask) != | 
 |                  (section->offset_within_region & ~page_mask))) { | 
 |         trace_vhost_vdpa_listener_region_add_unaligned(s, section->mr->name, | 
 |                        section->offset_within_address_space & ~page_mask, | 
 |                        section->offset_within_region & ~page_mask); | 
 |         return; | 
 |     } | 
 |  | 
 |     iova = ROUND_UP(section->offset_within_address_space, page_size); | 
 |     llend = vhost_vdpa_section_end(section, page_mask); | 
 |     if (int128_ge(int128_make64(iova), llend)) { | 
 |         return; | 
 |     } | 
 |  | 
 |     memory_region_ref(section->mr); | 
 |  | 
 |     /* Here we assume that memory_region_is_ram(section->mr)==true */ | 
 |  | 
 |     vaddr = memory_region_get_ram_ptr(section->mr) + | 
 |             section->offset_within_region + | 
 |             (iova - section->offset_within_address_space); | 
 |  | 
 |     trace_vhost_vdpa_listener_region_add(s, iova, int128_get64(llend), | 
 |                                          vaddr, section->readonly); | 
 |  | 
 |     llsize = int128_sub(llend, int128_make64(iova)); | 
 |     if (s->shadow_data) { | 
 |         int r; | 
 |  | 
        mem_region.translated_addr = (hwaddr)(uintptr_t)vaddr;
        mem_region.size = int128_get64(llsize) - 1;
        mem_region.perm = IOMMU_ACCESS_FLAG(true, section->readonly);
 |  | 
 |         r = vhost_iova_tree_map_alloc(s->iova_tree, &mem_region); | 
 |         if (unlikely(r != IOVA_OK)) { | 
 |             error_report("Can't allocate a mapping (%d)", r); | 
 |             goto fail; | 
 |         } | 
 |  | 
 |         iova = mem_region.iova; | 
 |     } | 
 |  | 
 |     vhost_vdpa_iotlb_batch_begin_once(s); | 
 |     ret = vhost_vdpa_dma_map(s, VHOST_VDPA_GUEST_PA_ASID, iova, | 
 |                              int128_get64(llsize), vaddr, section->readonly); | 
 |     if (ret) { | 
        error_report("vhost-vdpa: DMA map failed");
 |         goto fail_map; | 
 |     } | 
 |  | 
 |     return; | 
 |  | 
 | fail_map: | 
 |     if (s->shadow_data) { | 
 |         vhost_iova_tree_remove(s->iova_tree, mem_region); | 
 |     } | 
 |  | 
 | fail: | 
 |     /* | 
     * At runtime there is not much we can do when a DMA mapping fails other
     * than reporting the error.
 |      */ | 
 |     error_report("vhost-vdpa: DMA mapping failed, unable to continue"); | 
 |     return; | 
 |  | 
 | } | 
 |  | 
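/*
 * Memory listener region_del callback: tear down the mapping installed by
 * vhost_vdpa_listener_region_add(), looking up the allocated IOVA in the
 * IOVA tree when shadow data is enabled.
 */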
 | static void vhost_vdpa_listener_region_del(MemoryListener *listener, | 
 |                                            MemoryRegionSection *section) | 
 | { | 
 |     VhostVDPAShared *s = container_of(listener, VhostVDPAShared, listener); | 
 |     hwaddr iova; | 
 |     Int128 llend, llsize; | 
 |     int ret; | 
 |     int page_size = qemu_target_page_size(); | 
 |     int page_mask = -page_size; | 
 |  | 
 |     if (vhost_vdpa_listener_skipped_section(section, s->iova_range.first, | 
 |                                             s->iova_range.last, page_mask)) { | 
 |         return; | 
 |     } | 
 |     if (memory_region_is_iommu(section->mr)) { | 
 |         vhost_vdpa_iommu_region_del(listener, section); | 
 |     } | 
 |  | 
 |     if (unlikely((section->offset_within_address_space & ~page_mask) != | 
 |                  (section->offset_within_region & ~page_mask))) { | 
 |         trace_vhost_vdpa_listener_region_del_unaligned(s, section->mr->name, | 
 |                        section->offset_within_address_space & ~page_mask, | 
 |                        section->offset_within_region & ~page_mask); | 
 |         return; | 
 |     } | 
 |  | 
 |     iova = ROUND_UP(section->offset_within_address_space, page_size); | 
 |     llend = vhost_vdpa_section_end(section, page_mask); | 
 |  | 
 |     trace_vhost_vdpa_listener_region_del(s, iova, | 
 |         int128_get64(int128_sub(llend, int128_one()))); | 
 |  | 
 |     if (int128_ge(int128_make64(iova), llend)) { | 
 |         return; | 
 |     } | 
 |  | 
 |     llsize = int128_sub(llend, int128_make64(iova)); | 
 |  | 
 |     if (s->shadow_data) { | 
 |         const DMAMap *result; | 
 |         const void *vaddr = memory_region_get_ram_ptr(section->mr) + | 
 |             section->offset_within_region + | 
 |             (iova - section->offset_within_address_space); | 
 |         DMAMap mem_region = { | 
 |             .translated_addr = (hwaddr)(uintptr_t)vaddr, | 
 |             .size = int128_get64(llsize) - 1, | 
 |         }; | 
 |  | 
 |         result = vhost_iova_tree_find_iova(s->iova_tree, &mem_region); | 
 |         if (!result) { | 
            /* This range was never mapped by the memory listener */
 |             return; | 
 |         } | 
 |         iova = result->iova; | 
 |         vhost_iova_tree_remove(s->iova_tree, *result); | 
 |     } | 
 |     vhost_vdpa_iotlb_batch_begin_once(s); | 
 |     /* | 
     * The unmap ioctl doesn't accept a full 64-bit span, so split it in two.
 |      */ | 
 |     if (int128_eq(llsize, int128_2_64())) { | 
 |         llsize = int128_rshift(llsize, 1); | 
 |         ret = vhost_vdpa_dma_unmap(s, VHOST_VDPA_GUEST_PA_ASID, iova, | 
 |                                    int128_get64(llsize)); | 
 |  | 
 |         if (ret) { | 
 |             error_report("vhost_vdpa_dma_unmap(%p, 0x%" HWADDR_PRIx ", " | 
 |                          "0x%" HWADDR_PRIx ") = %d (%m)", | 
 |                          s, iova, int128_get64(llsize), ret); | 
 |         } | 
 |         iova += int128_get64(llsize); | 
 |     } | 
 |     ret = vhost_vdpa_dma_unmap(s, VHOST_VDPA_GUEST_PA_ASID, iova, | 
 |                                int128_get64(llsize)); | 
 |  | 
 |     if (ret) { | 
 |         error_report("vhost_vdpa_dma_unmap(%p, 0x%" HWADDR_PRIx ", " | 
 |                      "0x%" HWADDR_PRIx ") = %d (%m)", | 
 |                      s, iova, int128_get64(llsize), ret); | 
 |     } | 
 |  | 
 |     memory_region_unref(section->mr); | 
 | } | 
 | /* | 
 * vhost-vdpa uses the IOTLB API, which requires incremental updates of the
 * mapping, so we cannot use the generic vhost memory listener, which depends
 * on addnop().
 |  */ | 
 | static const MemoryListener vhost_vdpa_memory_listener = { | 
 |     .name = "vhost-vdpa", | 
 |     .commit = vhost_vdpa_listener_commit, | 
 |     .region_add = vhost_vdpa_listener_region_add, | 
 |     .region_del = vhost_vdpa_listener_region_del, | 
 | }; | 
 |  | 
 | static int vhost_vdpa_call(struct vhost_dev *dev, unsigned long int request, | 
 |                              void *arg) | 
 | { | 
 |     struct vhost_vdpa *v = dev->opaque; | 
 |     int fd = v->shared->device_fd; | 
 |     int ret; | 
 |  | 
 |     assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA); | 
 |  | 
 |     ret = ioctl(fd, request, arg); | 
 |     return ret < 0 ? -errno : ret; | 
 | } | 
 |  | 
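/*
 * OR the given bits into the device status and read it back to verify that
 * the device accepted them.
 */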
 | static int vhost_vdpa_add_status(struct vhost_dev *dev, uint8_t status) | 
 | { | 
 |     uint8_t s; | 
 |     int ret; | 
 |  | 
 |     trace_vhost_vdpa_add_status(dev, status); | 
 |     ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &s); | 
 |     if (ret < 0) { | 
 |         return ret; | 
 |     } | 
 |     if ((s & status) == status) { | 
 |         /* Don't set bits already set */ | 
 |         return 0; | 
 |     } | 
 |  | 
 |     s |= status; | 
 |  | 
 |     ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &s); | 
 |     if (ret < 0) { | 
 |         return ret; | 
 |     } | 
 |  | 
 |     ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &s); | 
 |     if (ret < 0) { | 
 |         return ret; | 
 |     } | 
 |  | 
 |     if (!(s & status)) { | 
 |         return -EIO; | 
 |     } | 
 |  | 
 |     return 0; | 
 | } | 
 |  | 
 | int vhost_vdpa_get_iova_range(int fd, struct vhost_vdpa_iova_range *iova_range) | 
 | { | 
 |     int ret = ioctl(fd, VHOST_VDPA_GET_IOVA_RANGE, iova_range); | 
 |  | 
 |     return ret < 0 ? -errno : 0; | 
 | } | 
 |  | 
 | /* | 
 * This function is for requests that only need to be applied once.  Such
 * requests typically occur at the beginning of operation, before the queues
 * are set up.  It should not be used for requests that must wait until all
 * queues are set, which would need to check dev->vq_index_end instead.
 |  */ | 
 | static bool vhost_vdpa_first_dev(struct vhost_dev *dev) | 
 | { | 
 |     struct vhost_vdpa *v = dev->opaque; | 
 |  | 
 |     return v->index == 0; | 
 | } | 
 |  | 
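/* True for the vhost_dev that covers the last virtqueues of the vDPA device. */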
 | static bool vhost_vdpa_last_dev(struct vhost_dev *dev) | 
 | { | 
 |     return dev->vq_index + dev->nvqs == dev->vq_index_end; | 
 | } | 
 |  | 
 | static int vhost_vdpa_get_dev_features(struct vhost_dev *dev, | 
 |                                        uint64_t *features) | 
 | { | 
 |     int ret; | 
 |  | 
 |     ret = vhost_vdpa_call(dev, VHOST_GET_FEATURES, features); | 
 |     trace_vhost_vdpa_get_features(dev, *features); | 
 |     return ret; | 
 | } | 
 |  | 
 | static void vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v) | 
 | { | 
 |     g_autoptr(GPtrArray) shadow_vqs = NULL; | 
 |  | 
 |     shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free); | 
 |     for (unsigned n = 0; n < hdev->nvqs; ++n) { | 
 |         VhostShadowVirtqueue *svq; | 
 |  | 
 |         svq = vhost_svq_new(v->shadow_vq_ops, v->shadow_vq_ops_opaque); | 
 |         g_ptr_array_add(shadow_vqs, svq); | 
 |     } | 
 |  | 
 |     v->shadow_vqs = g_steal_pointer(&shadow_vqs); | 
 | } | 
 |  | 
 | static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp) | 
 | { | 
 |     struct vhost_vdpa *v = opaque; | 
 |     assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA); | 
 |     trace_vhost_vdpa_init(dev, v->shared, opaque); | 
 |     int ret; | 
 |  | 
 |     v->dev = dev; | 
    dev->opaque = opaque;
 |     v->shared->listener = vhost_vdpa_memory_listener; | 
 |     vhost_vdpa_init_svq(dev, v); | 
 |  | 
 |     error_propagate(&dev->migration_blocker, v->migration_blocker); | 
 |     if (!vhost_vdpa_first_dev(dev)) { | 
 |         return 0; | 
 |     } | 
 |  | 
 |     /* | 
     * If v->shadow_vqs_enabled is set at initialization, the device has been
     * started with x-svq=on, so don't block migration.
 |      */ | 
 |     if (dev->migration_blocker == NULL && !v->shadow_vqs_enabled) { | 
 |         /* We don't have dev->features yet */ | 
 |         uint64_t features; | 
 |         ret = vhost_vdpa_get_dev_features(dev, &features); | 
 |         if (unlikely(ret)) { | 
 |             error_setg_errno(errp, -ret, "Could not get device features"); | 
 |             return ret; | 
 |         } | 
 |         vhost_svq_valid_features(features, &dev->migration_blocker); | 
 |     } | 
 |  | 
 |     /* | 
 |      * Similar to VFIO, we end up pinning all guest memory and have to | 
 |      * disable discarding of RAM. | 
 |      */ | 
 |     ret = ram_block_discard_disable(true); | 
 |     if (ret) { | 
        error_report("Cannot disable discarding of RAM");
 |         return ret; | 
 |     } | 
 |  | 
 |     vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE | | 
 |                                VIRTIO_CONFIG_S_DRIVER); | 
 |  | 
 |     return 0; | 
 | } | 
 |  | 
 | static void vhost_vdpa_host_notifier_uninit(struct vhost_dev *dev, | 
 |                                             int queue_index) | 
 | { | 
 |     size_t page_size = qemu_real_host_page_size(); | 
 |     struct vhost_vdpa *v = dev->opaque; | 
 |     VirtIODevice *vdev = dev->vdev; | 
 |     VhostVDPAHostNotifier *n; | 
 |  | 
 |     n = &v->notifier[queue_index]; | 
 |  | 
 |     if (n->addr) { | 
 |         virtio_queue_set_host_notifier_mr(vdev, queue_index, &n->mr, false); | 
 |         object_unparent(OBJECT(&n->mr)); | 
 |         munmap(n->addr, page_size); | 
 |         n->addr = NULL; | 
 |     } | 
 | } | 
 |  | 
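/*
 * Map the per-queue notification area exported by the vhost-vdpa device and
 * expose it to the guest as a host notifier memory region, so that queue
 * kicks can reach the device directly.
 */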
 | static int vhost_vdpa_host_notifier_init(struct vhost_dev *dev, int queue_index) | 
 | { | 
 |     size_t page_size = qemu_real_host_page_size(); | 
 |     struct vhost_vdpa *v = dev->opaque; | 
 |     VirtIODevice *vdev = dev->vdev; | 
 |     VhostVDPAHostNotifier *n; | 
 |     int fd = v->shared->device_fd; | 
 |     void *addr; | 
 |     char *name; | 
 |  | 
 |     vhost_vdpa_host_notifier_uninit(dev, queue_index); | 
 |  | 
 |     n = &v->notifier[queue_index]; | 
 |  | 
 |     addr = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED, fd, | 
 |                 queue_index * page_size); | 
 |     if (addr == MAP_FAILED) { | 
 |         goto err; | 
 |     } | 
 |  | 
 |     name = g_strdup_printf("vhost-vdpa/host-notifier@%p mmaps[%d]", | 
 |                            v, queue_index); | 
 |     memory_region_init_ram_device_ptr(&n->mr, OBJECT(vdev), name, | 
 |                                       page_size, addr); | 
 |     g_free(name); | 
 |  | 
 |     if (virtio_queue_set_host_notifier_mr(vdev, queue_index, &n->mr, true)) { | 
 |         object_unparent(OBJECT(&n->mr)); | 
 |         munmap(addr, page_size); | 
 |         goto err; | 
 |     } | 
 |     n->addr = addr; | 
 |  | 
 |     return 0; | 
 |  | 
 | err: | 
 |     return -1; | 
 | } | 
 |  | 
 | static void vhost_vdpa_host_notifiers_uninit(struct vhost_dev *dev, int n) | 
 | { | 
 |     int i; | 
 |  | 
 |     /* | 
 |      * Pack all the changes to the memory regions in a single | 
     * transaction to avoid multiple updates of the address space
 |      * topology. | 
 |      */ | 
 |     memory_region_transaction_begin(); | 
 |  | 
 |     for (i = dev->vq_index; i < dev->vq_index + n; i++) { | 
 |         vhost_vdpa_host_notifier_uninit(dev, i); | 
 |     } | 
 |  | 
 |     memory_region_transaction_commit(); | 
 | } | 
 |  | 
 | static void vhost_vdpa_host_notifiers_init(struct vhost_dev *dev) | 
 | { | 
 |     struct vhost_vdpa *v = dev->opaque; | 
 |     int i; | 
 |  | 
 |     if (v->shadow_vqs_enabled) { | 
 |         /* FIXME SVQ is not compatible with host notifiers mr */ | 
 |         return; | 
 |     } | 
 |  | 
 |     /* | 
 |      * Pack all the changes to the memory regions in a single | 
     * transaction to avoid multiple updates of the address space
 |      * topology. | 
 |      */ | 
 |     memory_region_transaction_begin(); | 
 |  | 
 |     for (i = dev->vq_index; i < dev->vq_index + dev->nvqs; i++) { | 
 |         if (vhost_vdpa_host_notifier_init(dev, i)) { | 
 |             vhost_vdpa_host_notifiers_uninit(dev, i - dev->vq_index); | 
 |             break; | 
 |         } | 
 |     } | 
 |  | 
 |     memory_region_transaction_commit(); | 
 | } | 
 |  | 
 | static void vhost_vdpa_svq_cleanup(struct vhost_dev *dev) | 
 | { | 
 |     struct vhost_vdpa *v = dev->opaque; | 
 |     size_t idx; | 
 |  | 
 |     for (idx = 0; idx < v->shadow_vqs->len; ++idx) { | 
 |         vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, idx)); | 
 |     } | 
 |     g_ptr_array_free(v->shadow_vqs, true); | 
 | } | 
 |  | 
 | static int vhost_vdpa_cleanup(struct vhost_dev *dev) | 
 | { | 
 |     struct vhost_vdpa *v; | 
 |     assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA); | 
 |     v = dev->opaque; | 
 |     trace_vhost_vdpa_cleanup(dev, v); | 
 |     if (vhost_vdpa_first_dev(dev)) { | 
 |         ram_block_discard_disable(false); | 
 |         memory_listener_unregister(&v->shared->listener); | 
 |     } | 
 |  | 
 |     vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs); | 
 |     vhost_vdpa_svq_cleanup(dev); | 
 |  | 
 |     dev->opaque = NULL; | 
 |  | 
 |     return 0; | 
 | } | 
 |  | 
 | static int vhost_vdpa_memslots_limit(struct vhost_dev *dev) | 
 | { | 
 |     trace_vhost_vdpa_memslots_limit(dev, INT_MAX); | 
 |     return INT_MAX; | 
 | } | 
 |  | 
 | static int vhost_vdpa_set_mem_table(struct vhost_dev *dev, | 
 |                                     struct vhost_memory *mem) | 
 | { | 
 |     if (!vhost_vdpa_first_dev(dev)) { | 
 |         return 0; | 
 |     } | 
 |  | 
 |     trace_vhost_vdpa_set_mem_table(dev, mem->nregions, mem->padding); | 
 |     if (trace_event_get_state_backends(TRACE_VHOST_VDPA_SET_MEM_TABLE) && | 
 |         trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_REGIONS)) { | 
 |         int i; | 
 |         for (i = 0; i < mem->nregions; i++) { | 
 |             trace_vhost_vdpa_dump_regions(dev, i, | 
 |                                           mem->regions[i].guest_phys_addr, | 
 |                                           mem->regions[i].memory_size, | 
 |                                           mem->regions[i].userspace_addr, | 
 |                                           mem->regions[i].flags_padding); | 
 |         } | 
 |     } | 
 |     if (mem->padding) { | 
 |         return -EINVAL; | 
 |     } | 
 |  | 
 |     return 0; | 
 | } | 
 |  | 
 | static int vhost_vdpa_set_features(struct vhost_dev *dev, | 
 |                                    uint64_t features) | 
 | { | 
 |     struct vhost_vdpa *v = dev->opaque; | 
 |     int ret; | 
 |  | 
 |     if (!vhost_vdpa_first_dev(dev)) { | 
 |         return 0; | 
 |     } | 
 |  | 
 |     if (v->shadow_vqs_enabled) { | 
 |         if ((v->acked_features ^ features) == BIT_ULL(VHOST_F_LOG_ALL)) { | 
 |             /* | 
 |              * QEMU is just trying to enable or disable logging. SVQ handles | 
             * this separately, so there is no need to forward it.
 |              */ | 
 |             v->acked_features = features; | 
 |             return 0; | 
 |         } | 
 |  | 
 |         v->acked_features = features; | 
 |  | 
 |         /* We must not ack _F_LOG if SVQ is enabled */ | 
 |         features &= ~BIT_ULL(VHOST_F_LOG_ALL); | 
 |     } | 
 |  | 
 |     trace_vhost_vdpa_set_features(dev, features); | 
 |     ret = vhost_vdpa_call(dev, VHOST_SET_FEATURES, &features); | 
 |     if (ret) { | 
 |         return ret; | 
 |     } | 
 |  | 
 |     return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_FEATURES_OK); | 
 | } | 
 |  | 
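/*
 * Negotiate backend features: only IOTLB_MSG_V2, IOTLB_BATCH, IOTLB_ASID and
 * SUSPEND are acked, and the negotiated set is cached in both the vhost
 * device and the shared vhost-vdpa state.
 */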
 | static int vhost_vdpa_set_backend_cap(struct vhost_dev *dev) | 
 | { | 
 |     struct vhost_vdpa *v = dev->opaque; | 
 |  | 
 |     uint64_t features; | 
 |     uint64_t f = 0x1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2 | | 
 |         0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH | | 
 |         0x1ULL << VHOST_BACKEND_F_IOTLB_ASID | | 
 |         0x1ULL << VHOST_BACKEND_F_SUSPEND; | 
 |     int r; | 
 |  | 
 |     if (vhost_vdpa_call(dev, VHOST_GET_BACKEND_FEATURES, &features)) { | 
 |         return -EFAULT; | 
 |     } | 
 |  | 
 |     features &= f; | 
 |  | 
 |     if (vhost_vdpa_first_dev(dev)) { | 
 |         r = vhost_vdpa_call(dev, VHOST_SET_BACKEND_FEATURES, &features); | 
 |         if (r) { | 
 |             return -EFAULT; | 
 |         } | 
 |     } | 
 |  | 
 |     dev->backend_cap = features; | 
 |     v->shared->backend_cap = features; | 
 |  | 
 |     return 0; | 
 | } | 
 |  | 
 | static int vhost_vdpa_get_device_id(struct vhost_dev *dev, | 
 |                                     uint32_t *device_id) | 
 | { | 
 |     int ret; | 
 |     ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_DEVICE_ID, device_id); | 
 |     trace_vhost_vdpa_get_device_id(dev, *device_id); | 
 |     return ret; | 
 | } | 
 |  | 
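/* Reset the device by writing a zero status, and clear the suspended flag. */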
 | static int vhost_vdpa_reset_device(struct vhost_dev *dev) | 
 | { | 
 |     struct vhost_vdpa *v = dev->opaque; | 
 |     int ret; | 
 |     uint8_t status = 0; | 
 |  | 
 |     ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status); | 
 |     trace_vhost_vdpa_reset_device(dev); | 
 |     v->suspended = false; | 
 |     return ret; | 
 | } | 
 |  | 
 | static int vhost_vdpa_get_vq_index(struct vhost_dev *dev, int idx) | 
 | { | 
 |     assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs); | 
 |  | 
 |     trace_vhost_vdpa_get_vq_index(dev, idx, idx); | 
 |     return idx; | 
 | } | 
 |  | 
 | static int vhost_vdpa_set_vring_enable_one(struct vhost_vdpa *v, unsigned idx, | 
 |                                            int enable) | 
 | { | 
 |     struct vhost_dev *dev = v->dev; | 
 |     struct vhost_vring_state state = { | 
 |         .index = idx, | 
 |         .num = enable, | 
 |     }; | 
 |     int r = vhost_vdpa_call(dev, VHOST_VDPA_SET_VRING_ENABLE, &state); | 
 |  | 
 |     trace_vhost_vdpa_set_vring_enable_one(dev, idx, enable, r); | 
 |     return r; | 
 | } | 
 |  | 
 | static int vhost_vdpa_set_vring_enable(struct vhost_dev *dev, int enable) | 
 | { | 
 |     struct vhost_vdpa *v = dev->opaque; | 
 |     unsigned int i; | 
 |     int ret; | 
 |  | 
 |     for (i = 0; i < dev->nvqs; ++i) { | 
 |         ret = vhost_vdpa_set_vring_enable_one(v, i, enable); | 
 |         if (ret < 0) { | 
 |             return ret; | 
 |         } | 
 |     } | 
 |  | 
 |     return 0; | 
 | } | 
 |  | 
 | int vhost_vdpa_set_vring_ready(struct vhost_vdpa *v, unsigned idx) | 
 | { | 
 |     return vhost_vdpa_set_vring_enable_one(v, idx, 1); | 
 | } | 
 |  | 
 | static int vhost_vdpa_set_config_call(struct vhost_dev *dev, | 
 |                                        int fd) | 
 | { | 
 |     trace_vhost_vdpa_set_config_call(dev, fd); | 
 |     return vhost_vdpa_call(dev, VHOST_VDPA_SET_CONFIG_CALL, &fd); | 
 | } | 
 |  | 
 | static void vhost_vdpa_dump_config(struct vhost_dev *dev, const uint8_t *config, | 
 |                                    uint32_t config_len) | 
 | { | 
 |     g_autoptr(GString) str = g_string_sized_new(4 * 16); | 
 |     size_t b, len; | 
 |  | 
 |     for (b = 0; b < config_len; b += len) { | 
 |         len = MIN(config_len - b, 16); | 
 |  | 
 |         g_string_truncate(str, 0); | 
 |         qemu_hexdump_line(str, config + b, len, 1, 4); | 
 |         trace_vhost_vdpa_dump_config(dev, b, str->str); | 
 |     } | 
 | } | 
 |  | 
 | static int vhost_vdpa_set_config(struct vhost_dev *dev, const uint8_t *data, | 
 |                                    uint32_t offset, uint32_t size, | 
 |                                    uint32_t flags) | 
 | { | 
 |     struct vhost_vdpa_config *config; | 
 |     int ret; | 
 |     unsigned long config_size = offsetof(struct vhost_vdpa_config, buf); | 
 |  | 
 |     trace_vhost_vdpa_set_config(dev, offset, size, flags); | 
 |     config = g_malloc(size + config_size); | 
 |     config->off = offset; | 
 |     config->len = size; | 
 |     memcpy(config->buf, data, size); | 
 |     if (trace_event_get_state_backends(TRACE_VHOST_VDPA_SET_CONFIG) && | 
 |         trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_CONFIG)) { | 
 |         vhost_vdpa_dump_config(dev, data, size); | 
 |     } | 
 |     ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_CONFIG, config); | 
 |     g_free(config); | 
 |     return ret; | 
 | } | 
 |  | 
 | static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config, | 
 |                                    uint32_t config_len, Error **errp) | 
 | { | 
 |     struct vhost_vdpa_config *v_config; | 
 |     unsigned long config_size = offsetof(struct vhost_vdpa_config, buf); | 
 |     int ret; | 
 |  | 
 |     trace_vhost_vdpa_get_config(dev, config, config_len); | 
 |     v_config = g_malloc(config_len + config_size); | 
 |     v_config->len = config_len; | 
 |     v_config->off = 0; | 
 |     ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_CONFIG, v_config); | 
 |     memcpy(config, v_config->buf, config_len); | 
 |     g_free(v_config); | 
 |     if (trace_event_get_state_backends(TRACE_VHOST_VDPA_GET_CONFIG) && | 
 |         trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_CONFIG)) { | 
 |         vhost_vdpa_dump_config(dev, config, config_len); | 
 |     } | 
 |     return ret; | 
 |  } | 
 |  | 
 | static int vhost_vdpa_set_dev_vring_base(struct vhost_dev *dev, | 
 |                                          struct vhost_vring_state *ring) | 
 | { | 
 |     struct vhost_vdpa *v = dev->opaque; | 
 |  | 
 |     trace_vhost_vdpa_set_dev_vring_base(dev, ring->index, ring->num, | 
 |                                         v->shadow_vqs_enabled); | 
 |     return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring); | 
 | } | 
 |  | 
 | static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev, | 
 |                                          struct vhost_vring_file *file) | 
 | { | 
 |     trace_vhost_vdpa_set_vring_kick(dev, file->index, file->fd); | 
 |     return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file); | 
 | } | 
 |  | 
 | static int vhost_vdpa_set_vring_dev_call(struct vhost_dev *dev, | 
 |                                          struct vhost_vring_file *file) | 
 | { | 
 |     trace_vhost_vdpa_set_vring_call(dev, file->index, file->fd); | 
 |     return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file); | 
 | } | 
 |  | 
 | static int vhost_vdpa_set_vring_dev_addr(struct vhost_dev *dev, | 
 |                                          struct vhost_vring_addr *addr) | 
 | { | 
 |     trace_vhost_vdpa_set_vring_addr(dev, addr->index, addr->flags, | 
 |                                 addr->desc_user_addr, addr->used_user_addr, | 
 |                                 addr->avail_user_addr, | 
 |                                 addr->log_guest_addr); | 
 |  | 
 |     return vhost_vdpa_call(dev, VHOST_SET_VRING_ADDR, addr); | 
 |  | 
 | } | 
 |  | 
 | /** | 
 |  * Set the shadow virtqueue descriptors to the device | 
 |  * | 
 |  * @dev: The vhost device model | 
 |  * @svq: The shadow virtqueue | 
 |  * @idx: The index of the virtqueue in the vhost device | 
 |  * @errp: Error | 
 |  * | 
 * Note that this function does not rewind the kick file descriptor if it
 * fails to set the call one.
 |  */ | 
 | static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev, | 
 |                                   VhostShadowVirtqueue *svq, unsigned idx, | 
 |                                   Error **errp) | 
 | { | 
 |     struct vhost_vring_file file = { | 
 |         .index = dev->vq_index + idx, | 
 |     }; | 
 |     const EventNotifier *event_notifier = &svq->hdev_kick; | 
 |     int r; | 
 |  | 
 |     r = event_notifier_init(&svq->hdev_kick, 0); | 
 |     if (r != 0) { | 
 |         error_setg_errno(errp, -r, "Couldn't create kick event notifier"); | 
 |         goto err_init_hdev_kick; | 
 |     } | 
 |  | 
 |     r = event_notifier_init(&svq->hdev_call, 0); | 
 |     if (r != 0) { | 
 |         error_setg_errno(errp, -r, "Couldn't create call event notifier"); | 
 |         goto err_init_hdev_call; | 
 |     } | 
 |  | 
 |     file.fd = event_notifier_get_fd(event_notifier); | 
 |     r = vhost_vdpa_set_vring_dev_kick(dev, &file); | 
 |     if (unlikely(r != 0)) { | 
 |         error_setg_errno(errp, -r, "Can't set device kick fd"); | 
 |         goto err_init_set_dev_fd; | 
 |     } | 
 |  | 
 |     event_notifier = &svq->hdev_call; | 
 |     file.fd = event_notifier_get_fd(event_notifier); | 
 |     r = vhost_vdpa_set_vring_dev_call(dev, &file); | 
 |     if (unlikely(r != 0)) { | 
 |         error_setg_errno(errp, -r, "Can't set device call fd"); | 
 |         goto err_init_set_dev_fd; | 
 |     } | 
 |  | 
 |     return 0; | 
 |  | 
 | err_init_set_dev_fd: | 
 |     event_notifier_set_handler(&svq->hdev_call, NULL); | 
 |  | 
 | err_init_hdev_call: | 
 |     event_notifier_cleanup(&svq->hdev_kick); | 
 |  | 
 | err_init_hdev_kick: | 
 |     return r; | 
 | } | 
 |  | 
 | /** | 
 * Unmap an SVQ area in the device
 |  */ | 
 | static void vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr addr) | 
 | { | 
 |     const DMAMap needle = { | 
 |         .translated_addr = addr, | 
 |     }; | 
 |     const DMAMap *result = vhost_iova_tree_find_iova(v->shared->iova_tree, | 
 |                                                      &needle); | 
 |     hwaddr size; | 
 |     int r; | 
 |  | 
 |     if (unlikely(!result)) { | 
 |         error_report("Unable to find SVQ address to unmap"); | 
 |         return; | 
 |     } | 
 |  | 
 |     size = ROUND_UP(result->size, qemu_real_host_page_size()); | 
 |     r = vhost_vdpa_dma_unmap(v->shared, v->address_space_id, result->iova, | 
 |                              size); | 
 |     if (unlikely(r < 0)) { | 
 |         error_report("Unable to unmap SVQ vring: %s (%d)", g_strerror(-r), -r); | 
 |         return; | 
 |     } | 
 |  | 
 |     vhost_iova_tree_remove(v->shared->iova_tree, *result); | 
 | } | 
 |  | 
 | static void vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev, | 
 |                                        const VhostShadowVirtqueue *svq) | 
 | { | 
 |     struct vhost_vdpa *v = dev->opaque; | 
 |     struct vhost_vring_addr svq_addr; | 
 |  | 
 |     vhost_svq_get_vring_addr(svq, &svq_addr); | 
 |  | 
 |     vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr); | 
 |  | 
 |     vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr); | 
 | } | 
 |  | 
 | /** | 
 |  * Map the SVQ area in the device | 
 |  * | 
 |  * @v: Vhost-vdpa device | 
 * @needle: The area to allocate an IOVA for and map into the device
 * @errp: Error pointer
 |  */ | 
 | static bool vhost_vdpa_svq_map_ring(struct vhost_vdpa *v, DMAMap *needle, | 
 |                                     Error **errp) | 
 | { | 
 |     int r; | 
 |  | 
 |     r = vhost_iova_tree_map_alloc(v->shared->iova_tree, needle); | 
 |     if (unlikely(r != IOVA_OK)) { | 
 |         error_setg(errp, "Cannot allocate iova (%d)", r); | 
 |         return false; | 
 |     } | 
 |  | 
 |     r = vhost_vdpa_dma_map(v->shared, v->address_space_id, needle->iova, | 
 |                            needle->size + 1, | 
 |                            (void *)(uintptr_t)needle->translated_addr, | 
 |                            needle->perm == IOMMU_RO); | 
 |     if (unlikely(r != 0)) { | 
 |         error_setg_errno(errp, -r, "Cannot map region to device"); | 
 |         vhost_iova_tree_remove(v->shared->iova_tree, *needle); | 
 |     } | 
 |  | 
 |     return r == 0; | 
 | } | 
 |  | 
 | /** | 
 |  * Map the shadow virtqueue rings in the device | 
 |  * | 
 |  * @dev: The vhost device | 
 |  * @svq: The shadow virtqueue | 
 |  * @addr: Assigned IOVA addresses | 
 |  * @errp: Error pointer | 
 |  */ | 
 | static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev, | 
 |                                      const VhostShadowVirtqueue *svq, | 
 |                                      struct vhost_vring_addr *addr, | 
 |                                      Error **errp) | 
 | { | 
 |     ERRP_GUARD(); | 
 |     DMAMap device_region, driver_region; | 
 |     struct vhost_vring_addr svq_addr; | 
 |     struct vhost_vdpa *v = dev->opaque; | 
 |     size_t device_size = vhost_svq_device_area_size(svq); | 
 |     size_t driver_size = vhost_svq_driver_area_size(svq); | 
 |     size_t avail_offset; | 
 |     bool ok; | 
 |  | 
 |     vhost_svq_get_vring_addr(svq, &svq_addr); | 
 |  | 
 |     driver_region = (DMAMap) { | 
 |         .translated_addr = svq_addr.desc_user_addr, | 
 |         .size = driver_size - 1, | 
 |         .perm = IOMMU_RO, | 
 |     }; | 
 |     ok = vhost_vdpa_svq_map_ring(v, &driver_region, errp); | 
 |     if (unlikely(!ok)) { | 
 |         error_prepend(errp, "Cannot create vq driver region: "); | 
 |         return false; | 
 |     } | 
 |     addr->desc_user_addr = driver_region.iova; | 
 |     avail_offset = svq_addr.avail_user_addr - svq_addr.desc_user_addr; | 
 |     addr->avail_user_addr = driver_region.iova + avail_offset; | 
 |  | 
 |     device_region = (DMAMap) { | 
 |         .translated_addr = svq_addr.used_user_addr, | 
 |         .size = device_size - 1, | 
 |         .perm = IOMMU_RW, | 
 |     }; | 
 |     ok = vhost_vdpa_svq_map_ring(v, &device_region, errp); | 
 |     if (unlikely(!ok)) { | 
 |         error_prepend(errp, "Cannot create vq device region: "); | 
 |         vhost_vdpa_svq_unmap_ring(v, driver_region.translated_addr); | 
 |     } | 
 |     addr->used_user_addr = device_region.iova; | 
 |  | 
 |     return ok; | 
 | } | 
 |  | 
 | static bool vhost_vdpa_svq_setup(struct vhost_dev *dev, | 
 |                                  VhostShadowVirtqueue *svq, unsigned idx, | 
 |                                  Error **errp) | 
 | { | 
 |     uint16_t vq_index = dev->vq_index + idx; | 
 |     struct vhost_vring_state s = { | 
 |         .index = vq_index, | 
 |     }; | 
 |     int r; | 
 |  | 
 |     r = vhost_vdpa_set_dev_vring_base(dev, &s); | 
 |     if (unlikely(r)) { | 
 |         error_setg_errno(errp, -r, "Cannot set vring base"); | 
 |         return false; | 
 |     } | 
 |  | 
 |     r = vhost_vdpa_svq_set_fds(dev, svq, idx, errp); | 
 |     return r == 0; | 
 | } | 
 |  | 
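/*
 * Start all shadow virtqueues: set the device vring bases, wire up the SVQ
 * kick/call eventfds, map the SVQ rings into the device IOVA space and point
 * the device vring addresses at them.
 */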
 | static bool vhost_vdpa_svqs_start(struct vhost_dev *dev) | 
 | { | 
 |     struct vhost_vdpa *v = dev->opaque; | 
 |     Error *err = NULL; | 
 |     unsigned i; | 
 |  | 
 |     if (!v->shadow_vqs_enabled) { | 
 |         return true; | 
 |     } | 
 |  | 
 |     for (i = 0; i < v->shadow_vqs->len; ++i) { | 
 |         VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i); | 
 |         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i); | 
 |         struct vhost_vring_addr addr = { | 
 |             .index = dev->vq_index + i, | 
 |         }; | 
 |         int r; | 
 |         bool ok = vhost_vdpa_svq_setup(dev, svq, i, &err); | 
 |         if (unlikely(!ok)) { | 
 |             goto err; | 
 |         } | 
 |  | 
 |         vhost_svq_start(svq, dev->vdev, vq, v->shared->iova_tree); | 
 |         ok = vhost_vdpa_svq_map_rings(dev, svq, &addr, &err); | 
 |         if (unlikely(!ok)) { | 
 |             goto err_map; | 
 |         } | 
 |  | 
 |         /* Override vring GPA set by vhost subsystem */ | 
 |         r = vhost_vdpa_set_vring_dev_addr(dev, &addr); | 
 |         if (unlikely(r != 0)) { | 
 |             error_setg_errno(&err, -r, "Cannot set device address"); | 
 |             goto err_set_addr; | 
 |         } | 
 |     } | 
 |  | 
 |     return true; | 
 |  | 
 | err_set_addr: | 
 |     vhost_vdpa_svq_unmap_rings(dev, g_ptr_array_index(v->shadow_vqs, i)); | 
 |  | 
 | err_map: | 
 |     vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, i)); | 
 |  | 
 | err: | 
 |     error_reportf_err(err, "Cannot setup SVQ %u: ", i); | 
 |     for (unsigned j = 0; j < i; ++j) { | 
 |         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, j); | 
 |         vhost_vdpa_svq_unmap_rings(dev, svq); | 
 |         vhost_svq_stop(svq); | 
 |     } | 
 |  | 
 |     return false; | 
 | } | 
 |  | 
 | static void vhost_vdpa_svqs_stop(struct vhost_dev *dev) | 
 | { | 
 |     struct vhost_vdpa *v = dev->opaque; | 
 |  | 
 |     if (!v->shadow_vqs_enabled) { | 
 |         return; | 
 |     } | 
 |  | 
 |     for (unsigned i = 0; i < v->shadow_vqs->len; ++i) { | 
 |         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i); | 
 |  | 
 |         vhost_svq_stop(svq); | 
 |         vhost_vdpa_svq_unmap_rings(dev, svq); | 
 |  | 
 |         event_notifier_cleanup(&svq->hdev_kick); | 
 |         event_notifier_cleanup(&svq->hdev_call); | 
 |     } | 
 | } | 
 |  | 
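/*
 * Stop device operation: use VHOST_VDPA_SUSPEND when the backend supports
 * it, otherwise fall back to resetting the device.
 */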
 | static void vhost_vdpa_suspend(struct vhost_dev *dev) | 
 | { | 
 |     struct vhost_vdpa *v = dev->opaque; | 
 |     int r; | 
 |  | 
 |     if (!vhost_vdpa_first_dev(dev)) { | 
 |         return; | 
 |     } | 
 |  | 
 |     if (dev->backend_cap & BIT_ULL(VHOST_BACKEND_F_SUSPEND)) { | 
 |         trace_vhost_vdpa_suspend(dev); | 
 |         r = ioctl(v->shared->device_fd, VHOST_VDPA_SUSPEND); | 
 |         if (unlikely(r)) { | 
 |             error_report("Cannot suspend: %s(%d)", g_strerror(errno), errno); | 
 |         } else { | 
 |             v->suspended = true; | 
 |             return; | 
 |         } | 
 |     } | 
 |  | 
 |     vhost_vdpa_reset_device(dev); | 
 | } | 
 |  | 
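/*
 * vhost_dev_start callback.  On start, install host notifiers and shadow
 * virtqueues; only when the last vhost_dev of the device starts is the
 * memory listener registered and DRIVER_OK set.  On stop, suspend the device
 * and tear down the SVQs and host notifiers.
 */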
 | static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started) | 
 | { | 
 |     struct vhost_vdpa *v = dev->opaque; | 
 |     bool ok; | 
 |     trace_vhost_vdpa_dev_start(dev, started); | 
 |  | 
 |     if (started) { | 
 |         vhost_vdpa_host_notifiers_init(dev); | 
 |         ok = vhost_vdpa_svqs_start(dev); | 
 |         if (unlikely(!ok)) { | 
 |             return -1; | 
 |         } | 
 |     } else { | 
 |         vhost_vdpa_suspend(dev); | 
 |         vhost_vdpa_svqs_stop(dev); | 
 |         vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs); | 
 |     } | 
 |  | 
 |     if (!vhost_vdpa_last_dev(dev)) { | 
 |         return 0; | 
 |     } | 
 |  | 
 |     if (started) { | 
 |         if (vhost_dev_has_iommu(dev) && (v->shadow_vqs_enabled)) { | 
            error_report("SVQ cannot work with vIOMMU enabled, please disable "
                         "the IOMMU and try again");
 |             return -1; | 
 |         } | 
 |         memory_listener_register(&v->shared->listener, dev->vdev->dma_as); | 
 |  | 
 |         return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK); | 
 |     } | 
 |  | 
 |     return 0; | 
 | } | 
 |  | 
 | static void vhost_vdpa_reset_status(struct vhost_dev *dev) | 
 | { | 
 |     struct vhost_vdpa *v = dev->opaque; | 
 |  | 
 |     if (!vhost_vdpa_last_dev(dev)) { | 
 |         return; | 
 |     } | 
 |  | 
 |     vhost_vdpa_reset_device(dev); | 
 |     vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE | | 
 |                                VIRTIO_CONFIG_S_DRIVER); | 
 |     memory_listener_unregister(&v->shared->listener); | 
 | } | 
 |  | 
 | static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base, | 
 |                                      struct vhost_log *log) | 
 | { | 
 |     struct vhost_vdpa *v = dev->opaque; | 
 |     if (v->shadow_vqs_enabled || !vhost_vdpa_first_dev(dev)) { | 
 |         return 0; | 
 |     } | 
 |  | 
 |     trace_vhost_vdpa_set_log_base(dev, base, log->size, log->refcnt, log->fd, | 
 |                                   log->log); | 
 |     return vhost_vdpa_call(dev, VHOST_SET_LOG_BASE, &base); | 
 | } | 
 |  | 
 | static int vhost_vdpa_set_vring_addr(struct vhost_dev *dev, | 
 |                                        struct vhost_vring_addr *addr) | 
 | { | 
 |     struct vhost_vdpa *v = dev->opaque; | 
 |  | 
 |     if (v->shadow_vqs_enabled) { | 
 |         /* | 
 |          * Device vring addr was set at device start. SVQ base is handled by | 
 |          * VirtQueue code. | 
 |          */ | 
 |         return 0; | 
 |     } | 
 |  | 
 |     return vhost_vdpa_set_vring_dev_addr(dev, addr); | 
 | } | 
 |  | 
 | static int vhost_vdpa_set_vring_num(struct vhost_dev *dev, | 
 |                                       struct vhost_vring_state *ring) | 
 | { | 
 |     trace_vhost_vdpa_set_vring_num(dev, ring->index, ring->num); | 
 |     return vhost_vdpa_call(dev, VHOST_SET_VRING_NUM, ring); | 
 | } | 
 |  | 
 | static int vhost_vdpa_set_vring_base(struct vhost_dev *dev, | 
 |                                        struct vhost_vring_state *ring) | 
 | { | 
 |     struct vhost_vdpa *v = dev->opaque; | 
 |  | 
 |     if (v->shadow_vqs_enabled) { | 
 |         /* | 
 |          * Device vring base was set at device start. SVQ base is handled by | 
 |          * VirtQueue code. | 
 |          */ | 
 |         return 0; | 
 |     } | 
 |  | 
 |     return vhost_vdpa_set_dev_vring_base(dev, ring); | 
 | } | 
 |  | 
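/*
 * With SVQ enabled the vring base is tracked by QEMU, so report the last
 * avail index from the VirtQueue.  Otherwise the value reported by the
 * device is only trusted after a successful suspend.
 */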
 | static int vhost_vdpa_get_vring_base(struct vhost_dev *dev, | 
 |                                        struct vhost_vring_state *ring) | 
 | { | 
 |     struct vhost_vdpa *v = dev->opaque; | 
 |     int ret; | 
 |  | 
 |     if (v->shadow_vqs_enabled) { | 
 |         ring->num = virtio_queue_get_last_avail_idx(dev->vdev, ring->index); | 
 |         trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num, true); | 
 |         return 0; | 
 |     } | 
 |  | 
 |     if (!v->suspended) { | 
 |         /* | 
         * Cannot trust the value returned by the device; let vhost recover
         * the used idx from the guest.
 |          */ | 
 |         return -1; | 
 |     } | 
 |  | 
 |     ret = vhost_vdpa_call(dev, VHOST_GET_VRING_BASE, ring); | 
 |     trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num, false); | 
 |     return ret; | 
 | } | 
 |  | 
 | static int vhost_vdpa_set_vring_kick(struct vhost_dev *dev, | 
 |                                        struct vhost_vring_file *file) | 
 | { | 
 |     struct vhost_vdpa *v = dev->opaque; | 
 |     int vdpa_idx = file->index - dev->vq_index; | 
 |  | 
 |     if (v->shadow_vqs_enabled) { | 
 |         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx); | 
 |         vhost_svq_set_svq_kick_fd(svq, file->fd); | 
 |         return 0; | 
 |     } else { | 
 |         return vhost_vdpa_set_vring_dev_kick(dev, file); | 
 |     } | 
 | } | 
 |  | 
 | static int vhost_vdpa_set_vring_call(struct vhost_dev *dev, | 
 |                                        struct vhost_vring_file *file) | 
 | { | 
 |     struct vhost_vdpa *v = dev->opaque; | 
 |     int vdpa_idx = file->index - dev->vq_index; | 
 |     VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx); | 
 |  | 
 |     /* Remember last call fd because we can switch to SVQ anytime. */ | 
 |     vhost_svq_set_svq_call_fd(svq, file->fd); | 
 |     /* | 
 |      * When SVQ is transitioning to off, shadow_vqs_enabled has | 
 |      * not been set back to false yet, but the underlying call fd | 
 |      * will have to switch back to the guest notifier to signal the | 
 |      * passthrough virtqueues. In other situations, SVQ's own call | 
 |      * fd shall be used to signal the device model. | 
 |      */ | 
 |     if (v->shadow_vqs_enabled && | 
 |         v->shared->svq_switching != SVQ_TSTATE_DISABLING) { | 
 |         return 0; | 
 |     } | 
 |  | 
 |     return vhost_vdpa_set_vring_dev_call(dev, file); | 
 | } | 
 |  | 
 | static int vhost_vdpa_get_features(struct vhost_dev *dev, | 
 |                                      uint64_t *features) | 
 | { | 
 |     int ret = vhost_vdpa_get_dev_features(dev, features); | 
 |  | 
 |     if (ret == 0) { | 
 |         /* Add SVQ logging capabilities */ | 
 |         *features |= BIT_ULL(VHOST_F_LOG_ALL); | 
 |     } | 
 |  | 
 |     return ret; | 
 | } | 
 |  | 
 | static int vhost_vdpa_set_owner(struct vhost_dev *dev) | 
 | { | 
 |     if (!vhost_vdpa_first_dev(dev)) { | 
 |         return 0; | 
 |     } | 
 |  | 
 |     trace_vhost_vdpa_set_owner(dev); | 
 |     return vhost_vdpa_call(dev, VHOST_SET_OWNER, NULL); | 
 | } | 
 |  | 
 | static int vhost_vdpa_vq_get_addr(struct vhost_dev *dev, | 
 |                     struct vhost_vring_addr *addr, struct vhost_virtqueue *vq) | 
 | { | 
 |     assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA); | 
 |     addr->desc_user_addr = (uint64_t)(unsigned long)vq->desc_phys; | 
 |     addr->avail_user_addr = (uint64_t)(unsigned long)vq->avail_phys; | 
 |     addr->used_user_addr = (uint64_t)(unsigned long)vq->used_phys; | 
 |     trace_vhost_vdpa_vq_get_addr(dev, vq, addr->desc_user_addr, | 
 |                                  addr->avail_user_addr, addr->used_user_addr); | 
 |     return 0; | 
 | } | 
 |  | 
static bool vhost_vdpa_force_iommu(struct vhost_dev *dev)
 | { | 
 |     return true; | 
 | } | 
 |  | 
 | const VhostOps vdpa_ops = { | 
 |         .backend_type = VHOST_BACKEND_TYPE_VDPA, | 
 |         .vhost_backend_init = vhost_vdpa_init, | 
 |         .vhost_backend_cleanup = vhost_vdpa_cleanup, | 
 |         .vhost_set_log_base = vhost_vdpa_set_log_base, | 
 |         .vhost_set_vring_addr = vhost_vdpa_set_vring_addr, | 
 |         .vhost_set_vring_num = vhost_vdpa_set_vring_num, | 
 |         .vhost_set_vring_base = vhost_vdpa_set_vring_base, | 
 |         .vhost_get_vring_base = vhost_vdpa_get_vring_base, | 
 |         .vhost_set_vring_kick = vhost_vdpa_set_vring_kick, | 
 |         .vhost_set_vring_call = vhost_vdpa_set_vring_call, | 
 |         .vhost_get_features = vhost_vdpa_get_features, | 
 |         .vhost_set_backend_cap = vhost_vdpa_set_backend_cap, | 
 |         .vhost_set_owner = vhost_vdpa_set_owner, | 
 |         .vhost_set_vring_endian = NULL, | 
 |         .vhost_backend_memslots_limit = vhost_vdpa_memslots_limit, | 
 |         .vhost_set_mem_table = vhost_vdpa_set_mem_table, | 
 |         .vhost_set_features = vhost_vdpa_set_features, | 
 |         .vhost_reset_device = vhost_vdpa_reset_device, | 
 |         .vhost_get_vq_index = vhost_vdpa_get_vq_index, | 
 |         .vhost_set_vring_enable = vhost_vdpa_set_vring_enable, | 
 |         .vhost_get_config  = vhost_vdpa_get_config, | 
 |         .vhost_set_config = vhost_vdpa_set_config, | 
 |         .vhost_requires_shm_log = NULL, | 
 |         .vhost_migration_done = NULL, | 
 |         .vhost_net_set_mtu = NULL, | 
 |         .vhost_set_iotlb_callback = NULL, | 
 |         .vhost_send_device_iotlb_msg = NULL, | 
 |         .vhost_dev_start = vhost_vdpa_dev_start, | 
 |         .vhost_get_device_id = vhost_vdpa_get_device_id, | 
 |         .vhost_vq_get_addr = vhost_vdpa_vq_get_addr, | 
 |         .vhost_force_iommu = vhost_vdpa_force_iommu, | 
 |         .vhost_set_config_call = vhost_vdpa_set_config_call, | 
 |         .vhost_reset_status = vhost_vdpa_reset_status, | 
 | }; |