/*
 * VFIO utility
 *
 * Copyright 2016 - 2018 Red Hat, Inc.
 *
 * Authors:
 *   Fam Zheng <famz@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"
#include <sys/ioctl.h>
#include <linux/vfio.h>
#include "qapi/error.h"
#include "exec/ramlist.h"
#include "exec/cpu-common.h"
#include "qemu/queue.h"
#include "qemu/error-report.h"
#include "standard-headers/linux/pci_regs.h"
#include "qemu/event_notifier.h"
#include "qemu/vfio-helpers.h"
#include "trace.h"

#define QEMU_VFIO_DEBUG 0

#define QEMU_VFIO_IOVA_MIN 0x10000ULL
/* XXX: Once VFIO exposes the iova bit width in the IOMMU capability interface,
 * we can use a runtime limit; alternatively it's also possible to do platform
 * specific detection by reading sysfs entries. Until then, 39 is a safe bet.
 */
#define QEMU_VFIO_IOVA_MAX (1ULL << 39)

typedef struct {
    /* Page aligned addr. */
    void *host;
    size_t size;
    uint64_t iova;
} IOVAMapping;

struct QEMUVFIOState {
    QemuMutex lock;

    /* These fields are protected by BQL */
    int container;
    int group;
    int device;
    RAMBlockNotifier ram_notifier;
    struct vfio_region_info config_region_info, bar_region_info[6];

    /* These fields are protected by @lock */
    /* VFIO's IO virtual address space is managed by splitting into a few
     * sections:
     *
     * ---------------       <= 0
     * |xxxxxxxxxxxxx|
     * |-------------|       <= QEMU_VFIO_IOVA_MIN
     * |             |
     * |    Fixed    |
     * |             |
     * |-------------|       <= low_water_mark
     * |             |
     * |    Free     |
     * |             |
     * |-------------|       <= high_water_mark
     * |             |
     * |    Temp     |
     * |             |
     * |-------------|       <= QEMU_VFIO_IOVA_MAX
     * |xxxxxxxxxxxxx|
     * |xxxxxxxxxxxxx|
     * ---------------
     *
     * - Addresses lower than QEMU_VFIO_IOVA_MIN are reserved as invalid;
     *
     * - Fixed mappings of HVAs are assigned "low" IOVAs in the range of
     *   [QEMU_VFIO_IOVA_MIN, low_water_mark). Once allocated they will not be
     *   reclaimed - low_water_mark never shrinks;
     *
     * - IOVAs in range [low_water_mark, high_water_mark) are free;
     *
     * - IOVAs in range [high_water_mark, QEMU_VFIO_IOVA_MAX) are volatile
     *   mappings. At each qemu_vfio_dma_reset_temporary() call, the whole area
     *   is recycled. The caller should make sure I/Os depending on these
     *   mappings are completed before calling.
     */
    uint64_t low_water_mark;
    uint64_t high_water_mark;
    IOVAMapping *mappings;
    int nr_mappings;
};

/**
 * Find the VFIO group file for the PCI device at address @device, and return
 * its path. The returned string is owned by the caller and should be
 * g_free'd later.
 */
static char *sysfs_find_group_file(const char *device, Error **errp)
{
    char *sysfs_link;
    char *sysfs_group;
    char *p;
    char *path = NULL;

    sysfs_link = g_strdup_printf("/sys/bus/pci/devices/%s/iommu_group", device);
    sysfs_group = g_malloc0(PATH_MAX);
    if (readlink(sysfs_link, sysfs_group, PATH_MAX - 1) == -1) {
        error_setg_errno(errp, errno, "Failed to find iommu group sysfs path");
        goto out;
    }
    p = strrchr(sysfs_group, '/');
    if (!p) {
        error_setg(errp, "Failed to find iommu group number");
        goto out;
    }

    path = g_strdup_printf("/dev/vfio/%s", p + 1);
out:
    g_free(sysfs_link);
    g_free(sysfs_group);
    return path;
}

static inline void assert_bar_index_valid(QEMUVFIOState *s, int index)
{
    assert(index >= 0 && index < ARRAY_SIZE(s->bar_region_info));
}

static int qemu_vfio_pci_init_bar(QEMUVFIOState *s, int index, Error **errp)
{
    assert_bar_index_valid(s, index);
    s->bar_region_info[index] = (struct vfio_region_info) {
        .index = VFIO_PCI_BAR0_REGION_INDEX + index,
        .argsz = sizeof(struct vfio_region_info),
    };
    if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, &s->bar_region_info[index])) {
        error_setg_errno(errp, errno, "Failed to get BAR region info");
        return -errno;
    }

    return 0;
}

/**
 * Map a PCI bar area.
 */
void *qemu_vfio_pci_map_bar(QEMUVFIOState *s, int index,
                            uint64_t offset, uint64_t size,
                            Error **errp)
{
    void *p;
    assert_bar_index_valid(s, index);
    p = mmap(NULL, MIN(size, s->bar_region_info[index].size - offset),
             PROT_READ | PROT_WRITE, MAP_SHARED,
             s->device, s->bar_region_info[index].offset + offset);
    if (p == MAP_FAILED) {
        error_setg_errno(errp, errno, "Failed to map BAR region");
        p = NULL;
    }
    return p;
}

/**
 * Unmap a PCI bar area.
 */
void qemu_vfio_pci_unmap_bar(QEMUVFIOState *s, int index, void *bar,
                             uint64_t offset, uint64_t size)
{
    if (bar) {
        munmap(bar, MIN(size, s->bar_region_info[index].size - offset));
    }
}

/**
 * Initialize device IRQ with @irq_type and register an event notifier.
 */
int qemu_vfio_pci_init_irq(QEMUVFIOState *s, EventNotifier *e,
                           int irq_type, Error **errp)
{
    int r;
    struct vfio_irq_set *irq_set;
    size_t irq_set_size;
    struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };

    irq_info.index = irq_type;
    if (ioctl(s->device, VFIO_DEVICE_GET_IRQ_INFO, &irq_info)) {
        error_setg_errno(errp, errno, "Failed to get device interrupt info");
        return -errno;
    }
    if (!(irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) {
        error_setg(errp, "Device interrupt doesn't support eventfd");
        return -EINVAL;
    }

    irq_set_size = sizeof(*irq_set) + sizeof(int);
    irq_set = g_malloc0(irq_set_size);

    /* Get to a known IRQ state */
    *irq_set = (struct vfio_irq_set) {
        .argsz = irq_set_size,
        .flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
        .index = irq_info.index,
        .start = 0,
        .count = 1,
    };

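    /* With VFIO_IRQ_SET_DATA_EVENTFD, the eventfd to be signalled is carried
     * in the variable-length data[] area that follows the vfio_irq_set
     * header (hence the extra sizeof(int) above). */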
    *(int *)&irq_set->data = event_notifier_get_fd(e);
    r = ioctl(s->device, VFIO_DEVICE_SET_IRQS, irq_set);
    g_free(irq_set);
    if (r) {
        error_setg_errno(errp, errno, "Failed to setup device interrupt");
        return -errno;
    }
    return 0;
}

static int qemu_vfio_pci_read_config(QEMUVFIOState *s, void *buf,
                                     int size, int ofs)
{
    int ret;

    do {
        ret = pread(s->device, buf, size, s->config_region_info.offset + ofs);
    } while (ret == -1 && errno == EINTR);
    return ret == size ? 0 : -errno;
}

static int qemu_vfio_pci_write_config(QEMUVFIOState *s, void *buf, int size, int ofs)
{
    int ret;

    do {
        ret = pwrite(s->device, buf, size, s->config_region_info.offset + ofs);
    } while (ret == -1 && errno == EINTR);
    return ret == size ? 0 : -errno;
}

static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
                              Error **errp)
{
    int ret;
    int i;
    uint16_t pci_cmd;
    struct vfio_group_status group_status = { .argsz = sizeof(group_status) };
    struct vfio_iommu_type1_info iommu_info = { .argsz = sizeof(iommu_info) };
    struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
    char *group_file = NULL;

    /* Create a new container */
    s->container = open("/dev/vfio/vfio", O_RDWR);

    if (s->container == -1) {
        error_setg_errno(errp, errno, "Failed to open /dev/vfio/vfio");
        return -errno;
    }
    if (ioctl(s->container, VFIO_GET_API_VERSION) != VFIO_API_VERSION) {
        error_setg(errp, "Invalid VFIO version");
        ret = -EINVAL;
        goto fail_container;
    }

    if (!ioctl(s->container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
        error_setg_errno(errp, errno, "VFIO IOMMU check failed");
        ret = -EINVAL;
        goto fail_container;
    }

    /* Open the group */
    group_file = sysfs_find_group_file(device, errp);
    if (!group_file) {
        ret = -EINVAL;
        goto fail_container;
    }

    s->group = open(group_file, O_RDWR);
    if (s->group == -1) {
        error_setg_errno(errp, errno, "Failed to open VFIO group file: %s",
                         group_file);
        g_free(group_file);
        ret = -errno;
        goto fail_container;
    }
    g_free(group_file);

    /* Test the group is viable and available */
    if (ioctl(s->group, VFIO_GROUP_GET_STATUS, &group_status)) {
        error_setg_errno(errp, errno, "Failed to get VFIO group status");
        ret = -errno;
        goto fail;
    }

    if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
        error_setg(errp, "VFIO group is not viable");
        ret = -EINVAL;
        goto fail;
    }

    /* Add the group to the container */
    if (ioctl(s->group, VFIO_GROUP_SET_CONTAINER, &s->container)) {
        error_setg_errno(errp, errno, "Failed to add group to VFIO container");
        ret = -errno;
        goto fail;
    }

    /* Enable the IOMMU model we want */
    if (ioctl(s->container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU)) {
        error_setg_errno(errp, errno, "Failed to set VFIO IOMMU type");
        ret = -errno;
        goto fail;
    }

    /* Get additional IOMMU info */
    if (ioctl(s->container, VFIO_IOMMU_GET_INFO, &iommu_info)) {
        error_setg_errno(errp, errno, "Failed to get IOMMU info");
        ret = -errno;
        goto fail;
    }

    s->device = ioctl(s->group, VFIO_GROUP_GET_DEVICE_FD, device);

    if (s->device < 0) {
        error_setg_errno(errp, errno, "Failed to get device fd");
        ret = -errno;
        goto fail;
    }

    /* Test and setup the device */
    if (ioctl(s->device, VFIO_DEVICE_GET_INFO, &device_info)) {
        error_setg_errno(errp, errno, "Failed to get device info");
        ret = -errno;
        goto fail;
    }

    if (device_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX) {
        error_setg(errp, "Invalid device regions");
        ret = -EINVAL;
        goto fail;
    }

    s->config_region_info = (struct vfio_region_info) {
        .index = VFIO_PCI_CONFIG_REGION_INDEX,
        .argsz = sizeof(struct vfio_region_info),
    };
    if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, &s->config_region_info)) {
        error_setg_errno(errp, errno, "Failed to get config region info");
        ret = -errno;
        goto fail;
    }

    for (i = 0; i < ARRAY_SIZE(s->bar_region_info); i++) {
        ret = qemu_vfio_pci_init_bar(s, i, errp);
        if (ret) {
            goto fail;
        }
    }

    /* Enable bus master */
    ret = qemu_vfio_pci_read_config(s, &pci_cmd, sizeof(pci_cmd), PCI_COMMAND);
    if (ret) {
        goto fail;
    }
    pci_cmd |= PCI_COMMAND_MASTER;
    ret = qemu_vfio_pci_write_config(s, &pci_cmd, sizeof(pci_cmd), PCI_COMMAND);
    if (ret) {
        goto fail;
    }
    return 0;
fail:
    close(s->group);
fail_container:
    close(s->container);
    return ret;
}

static void qemu_vfio_ram_block_added(RAMBlockNotifier *n,
                                      void *host, size_t size)
{
    QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier);
    trace_qemu_vfio_ram_block_added(s, host, size);
    qemu_vfio_dma_map(s, host, size, false, NULL);
}

static void qemu_vfio_ram_block_removed(RAMBlockNotifier *n,
                                        void *host, size_t size)
{
    QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier);
    if (host) {
        trace_qemu_vfio_ram_block_removed(s, host, size);
        qemu_vfio_dma_unmap(s, host);
    }
}

static int qemu_vfio_init_ramblock(const char *block_name, void *host_addr,
                                   ram_addr_t offset, ram_addr_t length,
                                   void *opaque)
{
    int ret;
    QEMUVFIOState *s = opaque;

    if (!host_addr) {
        return 0;
    }
    ret = qemu_vfio_dma_map(s, host_addr, length, false, NULL);
    if (ret) {
        fprintf(stderr, "qemu_vfio_init_ramblock: failed %p %" PRId64 "\n",
                host_addr, (uint64_t)length);
    }
    return 0;
}

static void qemu_vfio_open_common(QEMUVFIOState *s)
{
    qemu_mutex_init(&s->lock);
    s->ram_notifier.ram_block_added = qemu_vfio_ram_block_added;
    s->ram_notifier.ram_block_removed = qemu_vfio_ram_block_removed;
    ram_block_notifier_add(&s->ram_notifier);
    s->low_water_mark = QEMU_VFIO_IOVA_MIN;
    s->high_water_mark = QEMU_VFIO_IOVA_MAX;
    qemu_ram_foreach_block(qemu_vfio_init_ramblock, s);
}

/**
 * Open a PCI device, e.g. "0000:00:01.0".
 */
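/*
 * Illustrative call sequence (a sketch, not code from this file): a caller
 * such as a userspace driver would open the device, map a BAR, and tear
 * everything down on shutdown. The BAR index and size below are made-up
 * examples.
 *
 *     Error *local_err = NULL;
 *     QEMUVFIOState *vfio = qemu_vfio_open_pci("0000:00:01.0", &local_err);
 *     if (!vfio) {
 *         error_report_err(local_err);
 *         return;
 *     }
 *     void *regs = qemu_vfio_pci_map_bar(vfio, 0, 0, 4096, &local_err);
 *     ...
 *     qemu_vfio_pci_unmap_bar(vfio, 0, regs, 0, 4096);
 *     qemu_vfio_close(vfio);
 */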
QEMUVFIOState *qemu_vfio_open_pci(const char *device, Error **errp)
{
    int r;
    QEMUVFIOState *s = g_new0(QEMUVFIOState, 1);

    r = qemu_vfio_init_pci(s, device, errp);
    if (r) {
        g_free(s);
        return NULL;
    }
    qemu_vfio_open_common(s);
    return s;
}

static void qemu_vfio_dump_mapping(IOVAMapping *m)
{
    if (QEMU_VFIO_DEBUG) {
        printf("  vfio mapping %p %" PRIx64 " to %" PRIx64 "\n", m->host,
               (uint64_t)m->size, (uint64_t)m->iova);
    }
}

static void qemu_vfio_dump_mappings(QEMUVFIOState *s)
{
    int i;

    if (QEMU_VFIO_DEBUG) {
        printf("vfio mappings\n");
        for (i = 0; i < s->nr_mappings; ++i) {
            qemu_vfio_dump_mapping(&s->mappings[i]);
        }
    }
}

/**
 * Find the mapping entry that contains @host and set @index to its position.
 * If no entry contains it, @index is the position _after_ which to insert the
 * new mapping: the index of the largest element whose host address is smaller
 * than @host, or -1 if there is no such element.
 */
static IOVAMapping *qemu_vfio_find_mapping(QEMUVFIOState *s, void *host,
                                           int *index)
{
    IOVAMapping *p = s->mappings;
    IOVAMapping *q = p ? p + s->nr_mappings - 1 : NULL;
    IOVAMapping *mid;
    trace_qemu_vfio_find_mapping(s, host);
    if (!p) {
        *index = -1;
        return NULL;
    }
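    /* Binary search over the sorted array: [p, q] is the candidate range and
     * the loop stops when it cannot shrink any further or on an exact host
     * address match. */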
    while (true) {
        mid = p + (q - p) / 2;
        if (mid == p) {
            break;
        }
        if (mid->host > host) {
            q = mid;
        } else if (mid->host < host) {
            p = mid;
        } else {
            break;
        }
    }
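    /* Nudge mid to the last entry whose host address is <= @host; it may end
     * up one before the first element (index -1) if every entry is above
     * @host. */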
    if (mid->host > host) {
        mid--;
    } else if (mid < &s->mappings[s->nr_mappings - 1]
               && (mid + 1)->host <= host) {
        mid++;
    }
    *index = mid - &s->mappings[0];
    if (mid >= &s->mappings[0] &&
        mid->host <= host && mid->host + mid->size > host) {
        assert(mid < &s->mappings[s->nr_mappings]);
        return mid;
    }
    /* At this point *index + 1 is the right position to insert the new
     * mapping. */
    return NULL;
}

/**
 * Allocate IOVA and create a new mapping record, then insert it into @s.
 */
static IOVAMapping *qemu_vfio_add_mapping(QEMUVFIOState *s,
                                          void *host, size_t size,
                                          int index, uint64_t iova)
{
    int shift;
    IOVAMapping m = {.host = host, .size = size, .iova = iova};
    IOVAMapping *insert;

    assert(QEMU_IS_ALIGNED(size, getpagesize()));
    assert(QEMU_IS_ALIGNED(s->low_water_mark, getpagesize()));
    assert(QEMU_IS_ALIGNED(s->high_water_mark, getpagesize()));
    trace_qemu_vfio_new_mapping(s, host, size, index, iova);

    assert(index >= 0);
    s->nr_mappings++;
    s->mappings = g_renew(IOVAMapping, s->mappings, s->nr_mappings);
    insert = &s->mappings[index];
    shift = s->nr_mappings - index - 1;
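    /* Shift the tail of the array one slot up to make room at @index, so the
     * list stays sorted by host address. */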
    if (shift) {
        memmove(insert + 1, insert, shift * sizeof(s->mappings[0]));
    }
    *insert = m;
    return insert;
}

/* Do the DMA mapping with VFIO. */
static int qemu_vfio_do_mapping(QEMUVFIOState *s, void *host, size_t size,
                                uint64_t iova)
{
    struct vfio_iommu_type1_dma_map dma_map = {
        .argsz = sizeof(dma_map),
        .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
        .iova = iova,
        .vaddr = (uintptr_t)host,
        .size = size,
    };
    trace_qemu_vfio_do_mapping(s, host, size, iova);

    if (ioctl(s->container, VFIO_IOMMU_MAP_DMA, &dma_map)) {
        error_report("VFIO_MAP_DMA: %d", -errno);
        return -errno;
    }
    return 0;
}

/**
 * Undo the DMA mapping from @s with VFIO, and remove from mapping list.
 */
static void qemu_vfio_undo_mapping(QEMUVFIOState *s, IOVAMapping *mapping,
                                   Error **errp)
{
    int index;
    struct vfio_iommu_type1_dma_unmap unmap = {
        .argsz = sizeof(unmap),
        .flags = 0,
        .iova = mapping->iova,
        .size = mapping->size,
    };

    index = mapping - s->mappings;
    assert(mapping->size > 0);
    assert(QEMU_IS_ALIGNED(mapping->size, getpagesize()));
    assert(index >= 0 && index < s->nr_mappings);
    if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
        error_setg(errp, "VFIO_UNMAP_DMA failed: %d", -errno);
    }
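    /* Drop the entry from the sorted mapping list by shifting the tail down
     * over it, then shrink the array. */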
    memmove(mapping, &s->mappings[index + 1],
            sizeof(s->mappings[0]) * (s->nr_mappings - index - 1));
    s->nr_mappings--;
    s->mappings = g_renew(IOVAMapping, s->mappings, s->nr_mappings);
}

/* Check if the mapping list is (ascending) ordered. */
static bool qemu_vfio_verify_mappings(QEMUVFIOState *s)
{
    int i;
    if (QEMU_VFIO_DEBUG) {
        for (i = 0; i < s->nr_mappings - 1; ++i) {
            if (!(s->mappings[i].host < s->mappings[i + 1].host)) {
                fprintf(stderr, "item %d not sorted!\n", i);
                qemu_vfio_dump_mappings(s);
                return false;
            }
            if (!(s->mappings[i].host + s->mappings[i].size <=
                  s->mappings[i + 1].host)) {
                fprintf(stderr, "item %d overlap with next!\n", i);
                qemu_vfio_dump_mappings(s);
                return false;
            }
        }
    }
    return true;
}

/* Map the [host, host + size) area into a contiguous IOVA address space, and
 * store the result in @iova if not NULL. The caller needs to make sure the
 * area is aligned to page size and must not partially overlap existing
 * mappings (the area has to be either entirely unmapped or entirely covered
 * by one existing mapping).
 */
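/*
 * Illustrative use (a sketch; the buffer below is a made-up example and must
 * be page aligned, e.g. allocated with qemu_memalign()):
 *
 *     uint64_t iova;
 *     void *buf = qemu_memalign(getpagesize(), getpagesize());
 *     if (qemu_vfio_dma_map(s, buf, getpagesize(), false, &iova) == 0) {
 *         ... hand iova to the device for DMA into/out of buf ...
 *     }
 */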
int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size,
                      bool temporary, uint64_t *iova)
{
    int ret = 0;
    int index;
    IOVAMapping *mapping;
    uint64_t iova0;

    assert(QEMU_PTR_IS_ALIGNED(host, getpagesize()));
    assert(QEMU_IS_ALIGNED(size, getpagesize()));
    trace_qemu_vfio_dma_map(s, host, size, temporary, iova);
    qemu_mutex_lock(&s->lock);
    mapping = qemu_vfio_find_mapping(s, host, &index);
    if (mapping) {
        iova0 = mapping->iova + ((uint8_t *)host - (uint8_t *)mapping->host);
    } else {
        if (s->high_water_mark - s->low_water_mark + 1 < size) {
            ret = -ENOMEM;
            goto out;
        }
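        /* Persistent mappings are carved from the bottom of the free area
         * (low_water_mark grows up); temporary ones from the top
         * (high_water_mark grows down). See the layout comment in
         * QEMUVFIOState. */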
        if (!temporary) {
            iova0 = s->low_water_mark;
            mapping = qemu_vfio_add_mapping(s, host, size, index + 1, iova0);
            if (!mapping) {
                ret = -ENOMEM;
                goto out;
            }
            assert(qemu_vfio_verify_mappings(s));
            ret = qemu_vfio_do_mapping(s, host, size, iova0);
            if (ret) {
                qemu_vfio_undo_mapping(s, mapping, NULL);
                goto out;
            }
            s->low_water_mark += size;
            qemu_vfio_dump_mappings(s);
        } else {
            iova0 = s->high_water_mark - size;
            ret = qemu_vfio_do_mapping(s, host, size, iova0);
            if (ret) {
                goto out;
            }
            s->high_water_mark -= size;
        }
    }
    if (iova) {
        *iova = iova0;
    }
out:
    qemu_mutex_unlock(&s->lock);
    return ret;
}

/* Reset the high watermark and free all "temporary" mappings. */
int qemu_vfio_dma_reset_temporary(QEMUVFIOState *s)
{
    struct vfio_iommu_type1_dma_unmap unmap = {
        .argsz = sizeof(unmap),
        .flags = 0,
        .iova = s->high_water_mark,
        .size = QEMU_VFIO_IOVA_MAX - s->high_water_mark,
    };
    trace_qemu_vfio_dma_reset_temporary(s);
    qemu_mutex_lock(&s->lock);
    if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
        error_report("VFIO_UNMAP_DMA: %d", -errno);
        qemu_mutex_unlock(&s->lock);
        return -errno;
    }
    s->high_water_mark = QEMU_VFIO_IOVA_MAX;
    qemu_mutex_unlock(&s->lock);
    return 0;
}

/* Unmap the whole area that was previously mapped with
 * qemu_vfio_dma_map(). */
void qemu_vfio_dma_unmap(QEMUVFIOState *s, void *host)
{
    int index = 0;
    IOVAMapping *m;

    if (!host) {
        return;
    }

    trace_qemu_vfio_dma_unmap(s, host);
    qemu_mutex_lock(&s->lock);
    m = qemu_vfio_find_mapping(s, host, &index);
    if (!m) {
        goto out;
    }
    qemu_vfio_undo_mapping(s, m, NULL);
out:
    qemu_mutex_unlock(&s->lock);
}

static void qemu_vfio_reset(QEMUVFIOState *s)
{
    ioctl(s->device, VFIO_DEVICE_RESET);
}

/* Close and free the VFIO resources. */
void qemu_vfio_close(QEMUVFIOState *s)
{
    if (!s) {
        return;
    }
    /* qemu_vfio_undo_mapping() compacts and shrinks the array, so keep
     * removing the first entry until none is left rather than indexing
     * through a list that shifts underneath us. */
    while (s->nr_mappings) {
        qemu_vfio_undo_mapping(s, &s->mappings[0], NULL);
    }
    ram_block_notifier_remove(&s->ram_notifier);
    qemu_vfio_reset(s);
    close(s->device);
    close(s->group);
    close(s->container);
}