Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 1 | /* |
| 2 | * generic functions used by VFIO devices |
| 3 | * |
| 4 | * Copyright Red Hat, Inc. 2012 |
| 5 | * |
| 6 | * Authors: |
| 7 | * Alex Williamson <alex.williamson@redhat.com> |
| 8 | * |
| 9 | * This work is licensed under the terms of the GNU GPL, version 2. See |
| 10 | * the COPYING file in the top-level directory. |
| 11 | * |
| 12 | * Based on qemu-kvm device-assignment: |
| 13 | * Adapted for KVM by Qumranet. |
| 14 | * Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com) |
| 15 | * Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com) |
| 16 | * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com) |
| 17 | * Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com) |
| 18 | * Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com) |
| 19 | */ |
| 20 | |
Peter Maydell | c6eacb1 | 2016-01-26 18:17:14 +0000 | [diff] [blame] | 21 | #include "qemu/osdep.h" |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 22 | #include <sys/ioctl.h> |
Markus Armbruster | a9c9427 | 2016-06-22 19:11:19 +0200 | [diff] [blame] | 23 | #ifdef CONFIG_KVM |
| 24 | #include <linux/kvm.h> |
| 25 | #endif |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 26 | #include <linux/vfio.h> |
| 27 | |
| 28 | #include "hw/vfio/vfio-common.h" |
| 29 | #include "hw/vfio/vfio.h" |
| 30 | #include "exec/address-spaces.h" |
| 31 | #include "exec/memory.h" |
Kirti Wankhede | b6dd650 | 2020-10-26 15:06:23 +0530 | [diff] [blame] | 32 | #include "exec/ram_addr.h" |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 33 | #include "hw/hw.h" |
| 34 | #include "qemu/error-report.h" |
Markus Armbruster | db72581 | 2019-08-12 07:23:50 +0200 | [diff] [blame] | 35 | #include "qemu/main-loop.h" |
Alexey Kardashevskiy | f4ec5e2 | 2016-07-04 13:33:05 +1000 | [diff] [blame] | 36 | #include "qemu/range.h" |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 37 | #include "sysemu/kvm.h" |
Markus Armbruster | 71e8a91 | 2019-08-12 07:23:38 +0200 | [diff] [blame] | 38 | #include "sysemu/reset.h" |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 39 | #include "trace.h" |
Eric Auger | 01905f5 | 2016-10-17 10:57:59 -0600 | [diff] [blame] | 40 | #include "qapi/error.h" |
Kirti Wankhede | b6dd650 | 2020-10-26 15:06:23 +0530 | [diff] [blame] | 41 | #include "migration/migration.h" |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 42 | |
Paolo Bonzini | f481ee2 | 2018-12-06 11:56:15 +0100 | [diff] [blame] | 43 | VFIOGroupList vfio_group_list = |
Chen Fan | 39cb514 | 2015-02-04 11:45:32 -0700 | [diff] [blame] | 44 | QLIST_HEAD_INITIALIZER(vfio_group_list); |
Paolo Bonzini | 10ca76b | 2018-12-10 17:58:54 +0100 | [diff] [blame] | 45 | static QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces = |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 46 | QLIST_HEAD_INITIALIZER(vfio_address_spaces); |
| 47 | |
| 48 | #ifdef CONFIG_KVM |
| 49 | /* |
| 50 | * We have a single VFIO pseudo device per KVM VM. Once created it lives |
| 51 | * for the life of the VM. Closing the file descriptor only drops our |
| 52 | * reference to it and the device's reference to kvm. Therefore once |
| 53 | * initialized, this file descriptor is only released on QEMU exit and |
| 54 | * we'll re-use it should another vfio device be attached before then. |
| 55 | */ |
| 56 | static int vfio_kvm_device_fd = -1; |
| 57 | #endif |
| 58 | |
| 59 | /* |
| 60 | * Common VFIO interrupt disable |
| 61 | */ |
| 62 | void vfio_disable_irqindex(VFIODevice *vbasedev, int index) |
| 63 | { |
| 64 | struct vfio_irq_set irq_set = { |
| 65 | .argsz = sizeof(irq_set), |
| 66 | .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER, |
| 67 | .index = index, |
| 68 | .start = 0, |
| 69 | .count = 0, |
| 70 | }; |
| 71 | |
| 72 | ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set); |
| 73 | } |
| 74 | |
| 75 | void vfio_unmask_single_irqindex(VFIODevice *vbasedev, int index) |
| 76 | { |
| 77 | struct vfio_irq_set irq_set = { |
| 78 | .argsz = sizeof(irq_set), |
| 79 | .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK, |
| 80 | .index = index, |
| 81 | .start = 0, |
| 82 | .count = 1, |
| 83 | }; |
| 84 | |
| 85 | ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set); |
| 86 | } |
| 87 | |
| 88 | void vfio_mask_single_irqindex(VFIODevice *vbasedev, int index) |
| 89 | { |
| 90 | struct vfio_irq_set irq_set = { |
| 91 | .argsz = sizeof(irq_set), |
| 92 | .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK, |
| 93 | .index = index, |
| 94 | .start = 0, |
| 95 | .count = 1, |
| 96 | }; |
| 97 | |
| 98 | ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set); |
| 99 | } |
| 100 | |
/*
 * Map a VFIO_IRQ_SET_ACTION_* value to a short name for error messages.
 * Any value that is not exactly one of MASK, UNMASK or TRIGGER yields
 * "UNKNOWN ACTION".  The returned string is a literal; do not free it.
 */
static inline const char *action_to_str(int action)
{
    if (action == VFIO_IRQ_SET_ACTION_MASK) {
        return "MASK";
    }
    if (action == VFIO_IRQ_SET_ACTION_UNMASK) {
        return "UNMASK";
    }
    if (action == VFIO_IRQ_SET_ACTION_TRIGGER) {
        return "TRIGGER";
    }
    return "UNKNOWN ACTION";
}
| 114 | |
| 115 | static const char *index_to_str(VFIODevice *vbasedev, int index) |
| 116 | { |
| 117 | if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) { |
| 118 | return NULL; |
| 119 | } |
| 120 | |
| 121 | switch (index) { |
| 122 | case VFIO_PCI_INTX_IRQ_INDEX: |
| 123 | return "INTX"; |
| 124 | case VFIO_PCI_MSI_IRQ_INDEX: |
| 125 | return "MSI"; |
| 126 | case VFIO_PCI_MSIX_IRQ_INDEX: |
| 127 | return "MSIX"; |
| 128 | case VFIO_PCI_ERR_IRQ_INDEX: |
| 129 | return "ERR"; |
| 130 | case VFIO_PCI_REQ_IRQ_INDEX: |
| 131 | return "REQ"; |
| 132 | default: |
| 133 | return NULL; |
| 134 | } |
| 135 | } |
| 136 | |
| 137 | int vfio_set_irq_signaling(VFIODevice *vbasedev, int index, int subindex, |
| 138 | int action, int fd, Error **errp) |
| 139 | { |
| 140 | struct vfio_irq_set *irq_set; |
| 141 | int argsz, ret = 0; |
| 142 | const char *name; |
| 143 | int32_t *pfd; |
| 144 | |
| 145 | argsz = sizeof(*irq_set) + sizeof(*pfd); |
| 146 | |
| 147 | irq_set = g_malloc0(argsz); |
| 148 | irq_set->argsz = argsz; |
| 149 | irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | action; |
| 150 | irq_set->index = index; |
| 151 | irq_set->start = subindex; |
| 152 | irq_set->count = 1; |
| 153 | pfd = (int32_t *)&irq_set->data; |
| 154 | *pfd = fd; |
| 155 | |
| 156 | if (ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irq_set)) { |
| 157 | ret = -errno; |
| 158 | } |
| 159 | g_free(irq_set); |
| 160 | |
| 161 | if (!ret) { |
| 162 | return 0; |
| 163 | } |
| 164 | |
| 165 | error_setg_errno(errp, -ret, "VFIO_DEVICE_SET_IRQS failure"); |
| 166 | |
| 167 | name = index_to_str(vbasedev, index); |
| 168 | if (name) { |
| 169 | error_prepend(errp, "%s-%d: ", name, subindex); |
| 170 | } else { |
| 171 | error_prepend(errp, "index %d-%d: ", index, subindex); |
| 172 | } |
| 173 | error_prepend(errp, |
| 174 | "Failed to %s %s eventfd signaling for interrupt ", |
| 175 | fd < 0 ? "tear down" : "set up", action_to_str(action)); |
| 176 | return ret; |
| 177 | } |
| 178 | |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 179 | /* |
| 180 | * IO Port/MMIO - Beware of the endians, VFIO is always little endian |
| 181 | */ |
| 182 | void vfio_region_write(void *opaque, hwaddr addr, |
| 183 | uint64_t data, unsigned size) |
| 184 | { |
| 185 | VFIORegion *region = opaque; |
| 186 | VFIODevice *vbasedev = region->vbasedev; |
| 187 | union { |
| 188 | uint8_t byte; |
| 189 | uint16_t word; |
| 190 | uint32_t dword; |
| 191 | uint64_t qword; |
| 192 | } buf; |
| 193 | |
| 194 | switch (size) { |
| 195 | case 1: |
| 196 | buf.byte = data; |
| 197 | break; |
| 198 | case 2: |
| 199 | buf.word = cpu_to_le16(data); |
| 200 | break; |
| 201 | case 4: |
| 202 | buf.dword = cpu_to_le32(data); |
| 203 | break; |
Jose Ricardo Ziviani | 38d49e8 | 2017-05-03 14:52:34 -0600 | [diff] [blame] | 204 | case 8: |
| 205 | buf.qword = cpu_to_le64(data); |
| 206 | break; |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 207 | default: |
Zhengui li | c624b6b | 2020-10-19 14:23:46 +0000 | [diff] [blame] | 208 | hw_error("vfio: unsupported write size, %u bytes", size); |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 209 | break; |
| 210 | } |
| 211 | |
| 212 | if (pwrite(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) { |
| 213 | error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64 |
| 214 | ",%d) failed: %m", |
| 215 | __func__, vbasedev->name, region->nr, |
| 216 | addr, data, size); |
| 217 | } |
| 218 | |
| 219 | trace_vfio_region_write(vbasedev->name, region->nr, addr, data, size); |
| 220 | |
| 221 | /* |
| 222 | * A read or write to a BAR always signals an INTx EOI. This will |
| 223 | * do nothing if not pending (including not in INTx mode). We assume |
| 224 | * that a BAR access is in response to an interrupt and that BAR |
| 225 | * accesses will service the interrupt. Unfortunately, we don't know |
| 226 | * which access will service the interrupt, so we're potentially |
| 227 | * getting quite a few host interrupts per guest interrupt. |
| 228 | */ |
| 229 | vbasedev->ops->vfio_eoi(vbasedev); |
| 230 | } |
| 231 | |
| 232 | uint64_t vfio_region_read(void *opaque, |
| 233 | hwaddr addr, unsigned size) |
| 234 | { |
| 235 | VFIORegion *region = opaque; |
| 236 | VFIODevice *vbasedev = region->vbasedev; |
| 237 | union { |
| 238 | uint8_t byte; |
| 239 | uint16_t word; |
| 240 | uint32_t dword; |
| 241 | uint64_t qword; |
| 242 | } buf; |
| 243 | uint64_t data = 0; |
| 244 | |
| 245 | if (pread(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) { |
| 246 | error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %m", |
| 247 | __func__, vbasedev->name, region->nr, |
| 248 | addr, size); |
| 249 | return (uint64_t)-1; |
| 250 | } |
| 251 | switch (size) { |
| 252 | case 1: |
| 253 | data = buf.byte; |
| 254 | break; |
| 255 | case 2: |
| 256 | data = le16_to_cpu(buf.word); |
| 257 | break; |
| 258 | case 4: |
| 259 | data = le32_to_cpu(buf.dword); |
| 260 | break; |
Jose Ricardo Ziviani | 38d49e8 | 2017-05-03 14:52:34 -0600 | [diff] [blame] | 261 | case 8: |
| 262 | data = le64_to_cpu(buf.qword); |
| 263 | break; |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 264 | default: |
Zhengui li | c624b6b | 2020-10-19 14:23:46 +0000 | [diff] [blame] | 265 | hw_error("vfio: unsupported read size, %u bytes", size); |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 266 | break; |
| 267 | } |
| 268 | |
| 269 | trace_vfio_region_read(vbasedev->name, region->nr, addr, size, data); |
| 270 | |
| 271 | /* Same as write above */ |
| 272 | vbasedev->ops->vfio_eoi(vbasedev); |
| 273 | |
| 274 | return data; |
| 275 | } |
| 276 | |
| 277 | const MemoryRegionOps vfio_region_ops = { |
| 278 | .read = vfio_region_read, |
| 279 | .write = vfio_region_write, |
| 280 | .endianness = DEVICE_LITTLE_ENDIAN, |
Jose Ricardo Ziviani | 15126cb | 2017-05-03 14:52:34 -0600 | [diff] [blame] | 281 | .valid = { |
| 282 | .min_access_size = 1, |
| 283 | .max_access_size = 8, |
| 284 | }, |
Jose Ricardo Ziviani | 38d49e8 | 2017-05-03 14:52:34 -0600 | [diff] [blame] | 285 | .impl = { |
| 286 | .min_access_size = 1, |
| 287 | .max_access_size = 8, |
| 288 | }, |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 289 | }; |
| 290 | |
| 291 | /* |
Kirti Wankhede | b6dd650 | 2020-10-26 15:06:23 +0530 | [diff] [blame] | 292 | * Device state interfaces |
| 293 | */ |
| 294 | |
Kirti Wankhede | 3710586 | 2020-10-26 15:06:27 +0530 | [diff] [blame] | 295 | bool vfio_mig_active(void) |
| 296 | { |
| 297 | VFIOGroup *group; |
| 298 | VFIODevice *vbasedev; |
| 299 | |
| 300 | if (QLIST_EMPTY(&vfio_group_list)) { |
| 301 | return false; |
| 302 | } |
| 303 | |
| 304 | QLIST_FOREACH(group, &vfio_group_list, next) { |
| 305 | QLIST_FOREACH(vbasedev, &group->device_list, next) { |
| 306 | if (vbasedev->migration_blocker) { |
| 307 | return false; |
| 308 | } |
| 309 | } |
| 310 | } |
| 311 | return true; |
| 312 | } |
| 313 | |
Kirti Wankhede | bb0990d | 2020-11-23 19:53:19 +0530 | [diff] [blame] | 314 | static bool vfio_devices_all_saving(VFIOContainer *container) |
Kirti Wankhede | b6dd650 | 2020-10-26 15:06:23 +0530 | [diff] [blame] | 315 | { |
| 316 | VFIOGroup *group; |
| 317 | VFIODevice *vbasedev; |
| 318 | MigrationState *ms = migrate_get_current(); |
| 319 | |
| 320 | if (!migration_is_setup_or_active(ms->state)) { |
| 321 | return false; |
| 322 | } |
| 323 | |
| 324 | QLIST_FOREACH(group, &container->group_list, container_next) { |
| 325 | QLIST_FOREACH(vbasedev, &group->device_list, next) { |
| 326 | VFIOMigration *migration = vbasedev->migration; |
| 327 | |
| 328 | if (!migration) { |
| 329 | return false; |
| 330 | } |
| 331 | |
Kirti Wankhede | bb0990d | 2020-11-23 19:53:19 +0530 | [diff] [blame] | 332 | if (migration->device_state & VFIO_DEVICE_STATE_SAVING) { |
| 333 | if ((vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF) |
| 334 | && (migration->device_state & VFIO_DEVICE_STATE_RUNNING)) { |
| 335 | return false; |
| 336 | } |
Kirti Wankhede | b6dd650 | 2020-10-26 15:06:23 +0530 | [diff] [blame] | 337 | continue; |
| 338 | } else { |
| 339 | return false; |
| 340 | } |
| 341 | } |
| 342 | } |
| 343 | return true; |
| 344 | } |
| 345 | |
Kirti Wankhede | 9e7b044 | 2020-10-26 15:06:25 +0530 | [diff] [blame] | 346 | static bool vfio_devices_all_running_and_saving(VFIOContainer *container) |
| 347 | { |
| 348 | VFIOGroup *group; |
| 349 | VFIODevice *vbasedev; |
| 350 | MigrationState *ms = migrate_get_current(); |
| 351 | |
| 352 | if (!migration_is_setup_or_active(ms->state)) { |
| 353 | return false; |
| 354 | } |
| 355 | |
| 356 | QLIST_FOREACH(group, &container->group_list, container_next) { |
| 357 | QLIST_FOREACH(vbasedev, &group->device_list, next) { |
| 358 | VFIOMigration *migration = vbasedev->migration; |
| 359 | |
| 360 | if (!migration) { |
| 361 | return false; |
| 362 | } |
| 363 | |
| 364 | if ((migration->device_state & VFIO_DEVICE_STATE_SAVING) && |
| 365 | (migration->device_state & VFIO_DEVICE_STATE_RUNNING)) { |
| 366 | continue; |
| 367 | } else { |
| 368 | return false; |
| 369 | } |
| 370 | } |
| 371 | } |
| 372 | return true; |
| 373 | } |
| 374 | |
| 375 | static int vfio_dma_unmap_bitmap(VFIOContainer *container, |
| 376 | hwaddr iova, ram_addr_t size, |
| 377 | IOMMUTLBEntry *iotlb) |
| 378 | { |
| 379 | struct vfio_iommu_type1_dma_unmap *unmap; |
| 380 | struct vfio_bitmap *bitmap; |
| 381 | uint64_t pages = TARGET_PAGE_ALIGN(size) >> TARGET_PAGE_BITS; |
| 382 | int ret; |
| 383 | |
| 384 | unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap)); |
| 385 | |
| 386 | unmap->argsz = sizeof(*unmap) + sizeof(*bitmap); |
| 387 | unmap->iova = iova; |
| 388 | unmap->size = size; |
| 389 | unmap->flags |= VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP; |
| 390 | bitmap = (struct vfio_bitmap *)&unmap->data; |
| 391 | |
| 392 | /* |
| 393 | * cpu_physical_memory_set_dirty_lebitmap() expects pages in bitmap of |
| 394 | * TARGET_PAGE_SIZE to mark those dirty. Hence set bitmap_pgsize to |
| 395 | * TARGET_PAGE_SIZE. |
| 396 | */ |
| 397 | |
| 398 | bitmap->pgsize = TARGET_PAGE_SIZE; |
| 399 | bitmap->size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) / |
| 400 | BITS_PER_BYTE; |
| 401 | |
| 402 | if (bitmap->size > container->max_dirty_bitmap_size) { |
| 403 | error_report("UNMAP: Size of bitmap too big 0x%"PRIx64, |
| 404 | (uint64_t)bitmap->size); |
| 405 | ret = -E2BIG; |
| 406 | goto unmap_exit; |
| 407 | } |
| 408 | |
| 409 | bitmap->data = g_try_malloc0(bitmap->size); |
| 410 | if (!bitmap->data) { |
| 411 | ret = -ENOMEM; |
| 412 | goto unmap_exit; |
| 413 | } |
| 414 | |
| 415 | ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap); |
| 416 | if (!ret) { |
| 417 | cpu_physical_memory_set_dirty_lebitmap((unsigned long *)bitmap->data, |
| 418 | iotlb->translated_addr, pages); |
| 419 | } else { |
| 420 | error_report("VFIO_UNMAP_DMA with DIRTY_BITMAP : %m"); |
| 421 | } |
| 422 | |
| 423 | g_free(bitmap->data); |
| 424 | unmap_exit: |
| 425 | g_free(unmap); |
| 426 | return ret; |
| 427 | } |
| 428 | |
Kirti Wankhede | b6dd650 | 2020-10-26 15:06:23 +0530 | [diff] [blame] | 429 | /* |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 430 | * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86 |
| 431 | */ |
| 432 | static int vfio_dma_unmap(VFIOContainer *container, |
Kirti Wankhede | 9e7b044 | 2020-10-26 15:06:25 +0530 | [diff] [blame] | 433 | hwaddr iova, ram_addr_t size, |
| 434 | IOMMUTLBEntry *iotlb) |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 435 | { |
| 436 | struct vfio_iommu_type1_dma_unmap unmap = { |
| 437 | .argsz = sizeof(unmap), |
| 438 | .flags = 0, |
| 439 | .iova = iova, |
| 440 | .size = size, |
| 441 | }; |
| 442 | |
Kirti Wankhede | 9e7b044 | 2020-10-26 15:06:25 +0530 | [diff] [blame] | 443 | if (iotlb && container->dirty_pages_supported && |
| 444 | vfio_devices_all_running_and_saving(container)) { |
| 445 | return vfio_dma_unmap_bitmap(container, iova, size, iotlb); |
| 446 | } |
| 447 | |
Alex Williamson | 567d7d3 | 2019-02-21 21:07:03 -0700 | [diff] [blame] | 448 | while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) { |
| 449 | /* |
| 450 | * The type1 backend has an off-by-one bug in the kernel (71a7d3d78e3c |
| 451 | * v4.15) where an overflow in its wrap-around check prevents us from |
| 452 | * unmapping the last page of the address space. Test for the error |
| 453 | * condition and re-try the unmap excluding the last page. The |
| 454 | * expectation is that we've never mapped the last page anyway and this |
| 455 | * unmap request comes via vIOMMU support which also makes it unlikely |
| 456 | * that this page is used. This bug was introduced well after type1 v2 |
| 457 | * support was introduced, so we shouldn't need to test for v1. A fix |
| 458 | * is queued for kernel v5.0 so this workaround can be removed once |
| 459 | * affected kernels are sufficiently deprecated. |
| 460 | */ |
| 461 | if (errno == EINVAL && unmap.size && !(unmap.iova + unmap.size) && |
| 462 | container->iommu_type == VFIO_TYPE1v2_IOMMU) { |
| 463 | trace_vfio_dma_unmap_overflow_workaround(); |
| 464 | unmap.size -= 1ULL << ctz64(container->pgsizes); |
| 465 | continue; |
| 466 | } |
Michal Privoznik | b09d51c | 2020-02-14 10:55:19 +0100 | [diff] [blame] | 467 | error_report("VFIO_UNMAP_DMA failed: %s", strerror(errno)); |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 468 | return -errno; |
| 469 | } |
| 470 | |
| 471 | return 0; |
| 472 | } |
| 473 | |
| 474 | static int vfio_dma_map(VFIOContainer *container, hwaddr iova, |
| 475 | ram_addr_t size, void *vaddr, bool readonly) |
| 476 | { |
| 477 | struct vfio_iommu_type1_dma_map map = { |
| 478 | .argsz = sizeof(map), |
| 479 | .flags = VFIO_DMA_MAP_FLAG_READ, |
| 480 | .vaddr = (__u64)(uintptr_t)vaddr, |
| 481 | .iova = iova, |
| 482 | .size = size, |
| 483 | }; |
| 484 | |
| 485 | if (!readonly) { |
| 486 | map.flags |= VFIO_DMA_MAP_FLAG_WRITE; |
| 487 | } |
| 488 | |
| 489 | /* |
| 490 | * Try the mapping, if it fails with EBUSY, unmap the region and try |
| 491 | * again. This shouldn't be necessary, but we sometimes see it in |
Daniel P. Berrange | b6af097 | 2015-08-26 12:17:13 +0100 | [diff] [blame] | 492 | * the VGA ROM space. |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 493 | */ |
| 494 | if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 || |
Kirti Wankhede | 9e7b044 | 2020-10-26 15:06:25 +0530 | [diff] [blame] | 495 | (errno == EBUSY && vfio_dma_unmap(container, iova, size, NULL) == 0 && |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 496 | ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) { |
| 497 | return 0; |
| 498 | } |
| 499 | |
Michal Privoznik | b09d51c | 2020-02-14 10:55:19 +0100 | [diff] [blame] | 500 | error_report("VFIO_MAP_DMA failed: %s", strerror(errno)); |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 501 | return -errno; |
| 502 | } |
| 503 | |
Alexey Kardashevskiy | f4ec5e2 | 2016-07-04 13:33:05 +1000 | [diff] [blame] | 504 | static void vfio_host_win_add(VFIOContainer *container, |
| 505 | hwaddr min_iova, hwaddr max_iova, |
| 506 | uint64_t iova_pgsizes) |
| 507 | { |
| 508 | VFIOHostDMAWindow *hostwin; |
| 509 | |
| 510 | QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { |
| 511 | if (ranges_overlap(hostwin->min_iova, |
| 512 | hostwin->max_iova - hostwin->min_iova + 1, |
| 513 | min_iova, |
| 514 | max_iova - min_iova + 1)) { |
| 515 | hw_error("%s: Overlapped IOMMU are not enabled", __func__); |
| 516 | } |
| 517 | } |
| 518 | |
| 519 | hostwin = g_malloc0(sizeof(*hostwin)); |
| 520 | |
| 521 | hostwin->min_iova = min_iova; |
| 522 | hostwin->max_iova = max_iova; |
| 523 | hostwin->iova_pgsizes = iova_pgsizes; |
| 524 | QLIST_INSERT_HEAD(&container->hostwin_list, hostwin, hostwin_next); |
| 525 | } |
| 526 | |
Alexey Kardashevskiy | 2e4109d | 2016-07-04 13:33:06 +1000 | [diff] [blame] | 527 | static int vfio_host_win_del(VFIOContainer *container, hwaddr min_iova, |
| 528 | hwaddr max_iova) |
| 529 | { |
| 530 | VFIOHostDMAWindow *hostwin; |
| 531 | |
| 532 | QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { |
| 533 | if (hostwin->min_iova == min_iova && hostwin->max_iova == max_iova) { |
| 534 | QLIST_REMOVE(hostwin, hostwin_next); |
| 535 | return 0; |
| 536 | } |
| 537 | } |
| 538 | |
| 539 | return -1; |
| 540 | } |
| 541 | |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 542 | static bool vfio_listener_skipped_section(MemoryRegionSection *section) |
| 543 | { |
| 544 | return (!memory_region_is_ram(section->mr) && |
| 545 | !memory_region_is_iommu(section->mr)) || |
| 546 | /* |
| 547 | * Sizing an enabled 64-bit BAR can cause spurious mappings to |
| 548 | * addresses in the upper part of the 64-bit address space. These |
| 549 | * are never accessed by the CPU and beyond the address width of |
| 550 | * some IOMMU hardware. TODO: VFIO should tell us the IOMMU width. |
| 551 | */ |
| 552 | section->offset_within_address_space & (1ULL << 63); |
| 553 | } |
| 554 | |
Peter Xu | 4a4b88f | 2017-02-07 16:28:04 +0800 | [diff] [blame] | 555 | /* Called with rcu_read_lock held. */ |
Kirti Wankhede | 9a04fe0 | 2020-10-26 15:06:24 +0530 | [diff] [blame] | 556 | static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, |
| 557 | ram_addr_t *ram_addr, bool *read_only) |
Peter Xu | 4a4b88f | 2017-02-07 16:28:04 +0800 | [diff] [blame] | 558 | { |
| 559 | MemoryRegion *mr; |
| 560 | hwaddr xlat; |
| 561 | hwaddr len = iotlb->addr_mask + 1; |
| 562 | bool writable = iotlb->perm & IOMMU_WO; |
| 563 | |
| 564 | /* |
| 565 | * The IOMMU TLB entry we have just covers translation through |
| 566 | * this IOMMU to its immediate target. We need to translate |
| 567 | * it the rest of the way through to memory. |
| 568 | */ |
| 569 | mr = address_space_translate(&address_space_memory, |
| 570 | iotlb->translated_addr, |
Peter Maydell | bc6b1ce | 2018-05-31 14:50:52 +0100 | [diff] [blame] | 571 | &xlat, &len, writable, |
| 572 | MEMTXATTRS_UNSPECIFIED); |
Peter Xu | 4a4b88f | 2017-02-07 16:28:04 +0800 | [diff] [blame] | 573 | if (!memory_region_is_ram(mr)) { |
| 574 | error_report("iommu map to non memory area %"HWADDR_PRIx"", |
| 575 | xlat); |
| 576 | return false; |
| 577 | } |
| 578 | |
| 579 | /* |
| 580 | * Translation truncates length to the IOMMU page size, |
| 581 | * check that it did not truncate too much. |
| 582 | */ |
| 583 | if (len & iotlb->addr_mask) { |
| 584 | error_report("iommu has granularity incompatible with target AS"); |
| 585 | return false; |
| 586 | } |
| 587 | |
Kirti Wankhede | 9a04fe0 | 2020-10-26 15:06:24 +0530 | [diff] [blame] | 588 | if (vaddr) { |
| 589 | *vaddr = memory_region_get_ram_ptr(mr) + xlat; |
| 590 | } |
| 591 | |
| 592 | if (ram_addr) { |
| 593 | *ram_addr = memory_region_get_ram_addr(mr) + xlat; |
| 594 | } |
| 595 | |
| 596 | if (read_only) { |
| 597 | *read_only = !writable || mr->readonly; |
| 598 | } |
Peter Xu | 4a4b88f | 2017-02-07 16:28:04 +0800 | [diff] [blame] | 599 | |
| 600 | return true; |
| 601 | } |
| 602 | |
Peter Xu | cdb3081 | 2016-09-23 13:02:26 +0800 | [diff] [blame] | 603 | static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 604 | { |
| 605 | VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n); |
| 606 | VFIOContainer *container = giommu->container; |
Alexey Kardashevskiy | d78c19b | 2016-05-26 09:43:23 -0600 | [diff] [blame] | 607 | hwaddr iova = iotlb->iova + giommu->iommu_offset; |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 608 | void *vaddr; |
| 609 | int ret; |
| 610 | |
Peter Xu | 3213835 | 2017-02-07 16:28:03 +0800 | [diff] [blame] | 611 | trace_vfio_iommu_map_notify(iotlb->perm == IOMMU_NONE ? "UNMAP" : "MAP", |
| 612 | iova, iova + iotlb->addr_mask); |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 613 | |
Alexey Kardashevskiy | f1f9365 | 2016-05-26 09:43:23 -0600 | [diff] [blame] | 614 | if (iotlb->target_as != &address_space_memory) { |
| 615 | error_report("Wrong target AS \"%s\", only system memory is allowed", |
| 616 | iotlb->target_as->name ? iotlb->target_as->name : "none"); |
| 617 | return; |
| 618 | } |
| 619 | |
Paolo Bonzini | 41063e1 | 2015-03-18 14:21:43 +0100 | [diff] [blame] | 620 | rcu_read_lock(); |
Peter Xu | 4a4b88f | 2017-02-07 16:28:04 +0800 | [diff] [blame] | 621 | |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 622 | if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) { |
Kirti Wankhede | 9a04fe0 | 2020-10-26 15:06:24 +0530 | [diff] [blame] | 623 | bool read_only; |
| 624 | |
| 625 | if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only)) { |
Peter Xu | dfbd90e | 2017-02-07 16:28:05 +0800 | [diff] [blame] | 626 | goto out; |
| 627 | } |
Peter Xu | 4a4b88f | 2017-02-07 16:28:04 +0800 | [diff] [blame] | 628 | /* |
| 629 | * vaddr is only valid until rcu_read_unlock(). But after |
| 630 | * vfio_dma_map has set up the mapping the pages will be |
| 631 | * pinned by the kernel. This makes sure that the RAM backend |
| 632 | * of vaddr will always be there, even if the memory object is |
| 633 | * destroyed and its backing memory munmap-ed. |
| 634 | */ |
Alexey Kardashevskiy | d78c19b | 2016-05-26 09:43:23 -0600 | [diff] [blame] | 635 | ret = vfio_dma_map(container, iova, |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 636 | iotlb->addr_mask + 1, vaddr, |
Peter Xu | 4a4b88f | 2017-02-07 16:28:04 +0800 | [diff] [blame] | 637 | read_only); |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 638 | if (ret) { |
| 639 | error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", " |
| 640 | "0x%"HWADDR_PRIx", %p) = %d (%m)", |
Alexey Kardashevskiy | d78c19b | 2016-05-26 09:43:23 -0600 | [diff] [blame] | 641 | container, iova, |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 642 | iotlb->addr_mask + 1, vaddr, ret); |
| 643 | } |
| 644 | } else { |
Kirti Wankhede | 9e7b044 | 2020-10-26 15:06:25 +0530 | [diff] [blame] | 645 | ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1, iotlb); |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 646 | if (ret) { |
| 647 | error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " |
| 648 | "0x%"HWADDR_PRIx") = %d (%m)", |
Alexey Kardashevskiy | d78c19b | 2016-05-26 09:43:23 -0600 | [diff] [blame] | 649 | container, iova, |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 650 | iotlb->addr_mask + 1, ret); |
| 651 | } |
| 652 | } |
Paolo Bonzini | 41063e1 | 2015-03-18 14:21:43 +0100 | [diff] [blame] | 653 | out: |
| 654 | rcu_read_unlock(); |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 655 | } |
| 656 | |
| 657 | static void vfio_listener_region_add(MemoryListener *listener, |
| 658 | MemoryRegionSection *section) |
| 659 | { |
David Gibson | ee0bf0e | 2015-09-30 12:13:51 +1000 | [diff] [blame] | 660 | VFIOContainer *container = container_of(listener, VFIOContainer, listener); |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 661 | hwaddr iova, end; |
Bandan Das | 55efcc5 | 2016-03-23 20:37:25 -0400 | [diff] [blame] | 662 | Int128 llend, llsize; |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 663 | void *vaddr; |
| 664 | int ret; |
Alexey Kardashevskiy | f4ec5e2 | 2016-07-04 13:33:05 +1000 | [diff] [blame] | 665 | VFIOHostDMAWindow *hostwin; |
| 666 | bool hostwin_found; |
Eric Auger | d7d8783 | 2019-09-24 10:25:16 +0200 | [diff] [blame] | 667 | Error *err = NULL; |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 668 | |
| 669 | if (vfio_listener_skipped_section(section)) { |
| 670 | trace_vfio_listener_region_add_skip( |
| 671 | section->offset_within_address_space, |
| 672 | section->offset_within_address_space + |
| 673 | int128_get64(int128_sub(section->size, int128_one()))); |
| 674 | return; |
| 675 | } |
| 676 | |
| 677 | if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) != |
| 678 | (section->offset_within_region & ~TARGET_PAGE_MASK))) { |
| 679 | error_report("%s received unaligned region", __func__); |
| 680 | return; |
| 681 | } |
| 682 | |
| 683 | iova = TARGET_PAGE_ALIGN(section->offset_within_address_space); |
| 684 | llend = int128_make64(section->offset_within_address_space); |
| 685 | llend = int128_add(llend, section->size); |
| 686 | llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK)); |
| 687 | |
| 688 | if (int128_ge(int128_make64(iova), llend)) { |
| 689 | return; |
| 690 | } |
Bandan Das | 55efcc5 | 2016-03-23 20:37:25 -0400 | [diff] [blame] | 691 | end = int128_get64(int128_sub(llend, int128_one())); |
David Gibson | 3898aad | 2015-09-30 12:13:53 +1000 | [diff] [blame] | 692 | |
Alexey Kardashevskiy | 2e4109d | 2016-07-04 13:33:06 +1000 | [diff] [blame] | 693 | if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { |
Alexey Kardashevskiy | 2e4109d | 2016-07-04 13:33:06 +1000 | [diff] [blame] | 694 | hwaddr pgsize = 0; |
| 695 | |
| 696 | /* For now intersections are not allowed, we may relax this later */ |
| 697 | QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { |
| 698 | if (ranges_overlap(hostwin->min_iova, |
| 699 | hostwin->max_iova - hostwin->min_iova + 1, |
| 700 | section->offset_within_address_space, |
| 701 | int128_get64(section->size))) { |
Eric Auger | d7d8783 | 2019-09-24 10:25:16 +0200 | [diff] [blame] | 702 | error_setg(&err, |
| 703 | "region [0x%"PRIx64",0x%"PRIx64"] overlaps with existing" |
| 704 | "host DMA window [0x%"PRIx64",0x%"PRIx64"]", |
| 705 | section->offset_within_address_space, |
| 706 | section->offset_within_address_space + |
| 707 | int128_get64(section->size) - 1, |
| 708 | hostwin->min_iova, hostwin->max_iova); |
Alexey Kardashevskiy | 2e4109d | 2016-07-04 13:33:06 +1000 | [diff] [blame] | 709 | goto fail; |
| 710 | } |
| 711 | } |
| 712 | |
| 713 | ret = vfio_spapr_create_window(container, section, &pgsize); |
| 714 | if (ret) { |
Eric Auger | d7d8783 | 2019-09-24 10:25:16 +0200 | [diff] [blame] | 715 | error_setg_errno(&err, -ret, "Failed to create SPAPR window"); |
Alexey Kardashevskiy | 2e4109d | 2016-07-04 13:33:06 +1000 | [diff] [blame] | 716 | goto fail; |
| 717 | } |
| 718 | |
| 719 | vfio_host_win_add(container, section->offset_within_address_space, |
| 720 | section->offset_within_address_space + |
| 721 | int128_get64(section->size) - 1, pgsize); |
Alexey Kardashevskiy | 07bc681 | 2018-02-06 11:08:24 -0700 | [diff] [blame] | 722 | #ifdef CONFIG_KVM |
| 723 | if (kvm_enabled()) { |
| 724 | VFIOGroup *group; |
| 725 | IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr); |
| 726 | struct kvm_vfio_spapr_tce param; |
| 727 | struct kvm_device_attr attr = { |
| 728 | .group = KVM_DEV_VFIO_GROUP, |
| 729 | .attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE, |
| 730 | .addr = (uint64_t)(unsigned long)¶m, |
| 731 | }; |
| 732 | |
| 733 | if (!memory_region_iommu_get_attr(iommu_mr, IOMMU_ATTR_SPAPR_TCE_FD, |
| 734 | ¶m.tablefd)) { |
| 735 | QLIST_FOREACH(group, &container->group_list, container_next) { |
| 736 | param.groupfd = group->fd; |
| 737 | if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) { |
| 738 | error_report("vfio: failed to setup fd %d " |
| 739 | "for a group with fd %d: %s", |
| 740 | param.tablefd, param.groupfd, |
| 741 | strerror(errno)); |
| 742 | return; |
| 743 | } |
| 744 | trace_vfio_spapr_group_attach(param.groupfd, param.tablefd); |
| 745 | } |
| 746 | } |
| 747 | } |
| 748 | #endif |
Alexey Kardashevskiy | 2e4109d | 2016-07-04 13:33:06 +1000 | [diff] [blame] | 749 | } |
| 750 | |
Alexey Kardashevskiy | f4ec5e2 | 2016-07-04 13:33:05 +1000 | [diff] [blame] | 751 | hostwin_found = false; |
| 752 | QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { |
| 753 | if (hostwin->min_iova <= iova && end <= hostwin->max_iova) { |
| 754 | hostwin_found = true; |
| 755 | break; |
| 756 | } |
| 757 | } |
| 758 | |
| 759 | if (!hostwin_found) { |
Eric Auger | d7d8783 | 2019-09-24 10:25:16 +0200 | [diff] [blame] | 760 | error_setg(&err, "Container %p can't map guest IOVA region" |
| 761 | " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container, iova, end); |
David Gibson | 3898aad | 2015-09-30 12:13:53 +1000 | [diff] [blame] | 762 | goto fail; |
| 763 | } |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 764 | |
| 765 | memory_region_ref(section->mr); |
| 766 | |
| 767 | if (memory_region_is_iommu(section->mr)) { |
| 768 | VFIOGuestIOMMU *giommu; |
Alexey Kardashevskiy | 3df9d74 | 2017-07-11 13:56:19 +1000 | [diff] [blame] | 769 | IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr); |
Peter Maydell | cb1efcf | 2018-06-15 14:57:16 +0100 | [diff] [blame] | 770 | int iommu_idx; |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 771 | |
Bandan Das | 55efcc5 | 2016-03-23 20:37:25 -0400 | [diff] [blame] | 772 | trace_vfio_listener_region_add_iommu(iova, end); |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 773 | /* |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 774 | * FIXME: For VFIO iommu types which have KVM acceleration to |
| 775 | * avoid bouncing all map/unmaps through qemu this way, this |
| 776 | * would be the right place to wire that up (tell the KVM |
| 777 | * device emulation the VFIO iommu handles to use). |
| 778 | */ |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 779 | giommu = g_malloc0(sizeof(*giommu)); |
Alexey Kardashevskiy | 3df9d74 | 2017-07-11 13:56:19 +1000 | [diff] [blame] | 780 | giommu->iommu = iommu_mr; |
Alexey Kardashevskiy | d78c19b | 2016-05-26 09:43:23 -0600 | [diff] [blame] | 781 | giommu->iommu_offset = section->offset_within_address_space - |
| 782 | section->offset_within_region; |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 783 | giommu->container = container; |
Peter Xu | 698feb5 | 2017-04-07 18:59:07 +0800 | [diff] [blame] | 784 | llend = int128_add(int128_make64(section->offset_within_region), |
| 785 | section->size); |
| 786 | llend = int128_sub(llend, int128_one()); |
Peter Maydell | cb1efcf | 2018-06-15 14:57:16 +0100 | [diff] [blame] | 787 | iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr, |
| 788 | MEMTXATTRS_UNSPECIFIED); |
Peter Xu | 698feb5 | 2017-04-07 18:59:07 +0800 | [diff] [blame] | 789 | iommu_notifier_init(&giommu->n, vfio_iommu_map_notify, |
| 790 | IOMMU_NOTIFIER_ALL, |
| 791 | section->offset_within_region, |
Peter Maydell | cb1efcf | 2018-06-15 14:57:16 +0100 | [diff] [blame] | 792 | int128_get64(llend), |
| 793 | iommu_idx); |
David Gibson | 508ce5e | 2015-09-30 12:13:56 +1000 | [diff] [blame] | 794 | |
Bharat Bhushan | b917749 | 2020-10-30 19:05:08 +0100 | [diff] [blame] | 795 | ret = memory_region_iommu_set_page_size_mask(giommu->iommu, |
| 796 | container->pgsizes, |
| 797 | &err); |
| 798 | if (ret) { |
| 799 | g_free(giommu); |
| 800 | goto fail; |
| 801 | } |
| 802 | |
Eric Auger | 549d4005 | 2019-09-24 10:25:17 +0200 | [diff] [blame] | 803 | ret = memory_region_register_iommu_notifier(section->mr, &giommu->n, |
| 804 | &err); |
| 805 | if (ret) { |
| 806 | g_free(giommu); |
| 807 | goto fail; |
| 808 | } |
| 809 | QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next); |
Peter Xu | ad52359 | 2017-05-19 11:19:41 +0800 | [diff] [blame] | 810 | memory_region_iommu_replay(giommu->iommu, &giommu->n); |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 811 | |
| 812 | return; |
| 813 | } |
| 814 | |
| 815 | /* Here we assume that memory_region_is_ram(section->mr)==true */ |
| 816 | |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 817 | vaddr = memory_region_get_ram_ptr(section->mr) + |
| 818 | section->offset_within_region + |
| 819 | (iova - section->offset_within_address_space); |
| 820 | |
Bandan Das | 55efcc5 | 2016-03-23 20:37:25 -0400 | [diff] [blame] | 821 | trace_vfio_listener_region_add_ram(iova, end, vaddr); |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 822 | |
Bandan Das | 55efcc5 | 2016-03-23 20:37:25 -0400 | [diff] [blame] | 823 | llsize = int128_sub(llend, int128_make64(iova)); |
| 824 | |
Alexey Kardashevskiy | 567b5b3 | 2018-03-13 11:17:30 -0600 | [diff] [blame] | 825 | if (memory_region_is_ram_device(section->mr)) { |
| 826 | hwaddr pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1; |
| 827 | |
| 828 | if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) { |
Eric Auger | 5c08600 | 2018-04-04 22:30:50 +0200 | [diff] [blame] | 829 | trace_vfio_listener_region_add_no_dma_map( |
| 830 | memory_region_name(section->mr), |
| 831 | section->offset_within_address_space, |
| 832 | int128_getlo(section->size), |
| 833 | pgmask + 1); |
Alexey Kardashevskiy | 567b5b3 | 2018-03-13 11:17:30 -0600 | [diff] [blame] | 834 | return; |
| 835 | } |
| 836 | } |
| 837 | |
Bandan Das | 55efcc5 | 2016-03-23 20:37:25 -0400 | [diff] [blame] | 838 | ret = vfio_dma_map(container, iova, int128_get64(llsize), |
| 839 | vaddr, section->readonly); |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 840 | if (ret) { |
Eric Auger | d7d8783 | 2019-09-24 10:25:16 +0200 | [diff] [blame] | 841 | error_setg(&err, "vfio_dma_map(%p, 0x%"HWADDR_PRIx", " |
| 842 | "0x%"HWADDR_PRIx", %p) = %d (%m)", |
| 843 | container, iova, int128_get64(llsize), vaddr, ret); |
Alexey Kardashevskiy | 567b5b3 | 2018-03-13 11:17:30 -0600 | [diff] [blame] | 844 | if (memory_region_is_ram_device(section->mr)) { |
| 845 | /* Allow unexpected mappings not to be fatal for RAM devices */ |
Eric Auger | d7d8783 | 2019-09-24 10:25:16 +0200 | [diff] [blame] | 846 | error_report_err(err); |
Alexey Kardashevskiy | 567b5b3 | 2018-03-13 11:17:30 -0600 | [diff] [blame] | 847 | return; |
| 848 | } |
David Gibson | ac6dc38 | 2015-09-30 12:13:52 +1000 | [diff] [blame] | 849 | goto fail; |
| 850 | } |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 851 | |
David Gibson | ac6dc38 | 2015-09-30 12:13:52 +1000 | [diff] [blame] | 852 | return; |
| 853 | |
| 854 | fail: |
Alexey Kardashevskiy | 567b5b3 | 2018-03-13 11:17:30 -0600 | [diff] [blame] | 855 | if (memory_region_is_ram_device(section->mr)) { |
| 856 | error_report("failed to vfio_dma_map. pci p2p may not work"); |
| 857 | return; |
| 858 | } |
David Gibson | ac6dc38 | 2015-09-30 12:13:52 +1000 | [diff] [blame] | 859 | /* |
| 860 | * On the initfn path, store the first error in the container so we |
| 861 | * can gracefully fail. Runtime, there's not much we can do other |
| 862 | * than throw a hardware error. |
| 863 | */ |
| 864 | if (!container->initialized) { |
| 865 | if (!container->error) { |
Eric Auger | d7d8783 | 2019-09-24 10:25:16 +0200 | [diff] [blame] | 866 | error_propagate_prepend(&container->error, err, |
| 867 | "Region %s: ", |
| 868 | memory_region_name(section->mr)); |
| 869 | } else { |
| 870 | error_free(err); |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 871 | } |
David Gibson | ac6dc38 | 2015-09-30 12:13:52 +1000 | [diff] [blame] | 872 | } else { |
Eric Auger | d7d8783 | 2019-09-24 10:25:16 +0200 | [diff] [blame] | 873 | error_report_err(err); |
David Gibson | ac6dc38 | 2015-09-30 12:13:52 +1000 | [diff] [blame] | 874 | hw_error("vfio: DMA mapping failed, unable to continue"); |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 875 | } |
| 876 | } |
| 877 | |
| 878 | static void vfio_listener_region_del(MemoryListener *listener, |
| 879 | MemoryRegionSection *section) |
| 880 | { |
David Gibson | ee0bf0e | 2015-09-30 12:13:51 +1000 | [diff] [blame] | 881 | VFIOContainer *container = container_of(listener, VFIOContainer, listener); |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 882 | hwaddr iova, end; |
Alexey Kardashevskiy | 7a057b4 | 2016-05-26 09:43:22 -0600 | [diff] [blame] | 883 | Int128 llend, llsize; |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 884 | int ret; |
Alexey Kardashevskiy | 567b5b3 | 2018-03-13 11:17:30 -0600 | [diff] [blame] | 885 | bool try_unmap = true; |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 886 | |
| 887 | if (vfio_listener_skipped_section(section)) { |
| 888 | trace_vfio_listener_region_del_skip( |
| 889 | section->offset_within_address_space, |
| 890 | section->offset_within_address_space + |
| 891 | int128_get64(int128_sub(section->size, int128_one()))); |
| 892 | return; |
| 893 | } |
| 894 | |
| 895 | if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) != |
| 896 | (section->offset_within_region & ~TARGET_PAGE_MASK))) { |
| 897 | error_report("%s received unaligned region", __func__); |
| 898 | return; |
| 899 | } |
| 900 | |
| 901 | if (memory_region_is_iommu(section->mr)) { |
| 902 | VFIOGuestIOMMU *giommu; |
| 903 | |
| 904 | QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) { |
Alexey Kardashevskiy | 3df9d74 | 2017-07-11 13:56:19 +1000 | [diff] [blame] | 905 | if (MEMORY_REGION(giommu->iommu) == section->mr && |
Peter Xu | 698feb5 | 2017-04-07 18:59:07 +0800 | [diff] [blame] | 906 | giommu->n.start == section->offset_within_region) { |
Alexey Kardashevskiy | 3df9d74 | 2017-07-11 13:56:19 +1000 | [diff] [blame] | 907 | memory_region_unregister_iommu_notifier(section->mr, |
Alexey Kardashevskiy | d22d895 | 2016-06-30 13:00:23 -0600 | [diff] [blame] | 908 | &giommu->n); |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 909 | QLIST_REMOVE(giommu, giommu_next); |
| 910 | g_free(giommu); |
| 911 | break; |
| 912 | } |
| 913 | } |
| 914 | |
| 915 | /* |
| 916 | * FIXME: We assume the one big unmap below is adequate to |
| 917 | * remove any individual page mappings in the IOMMU which |
| 918 | * might have been copied into VFIO. This works for a page table |
| 919 | * based IOMMU where a big unmap flattens a large range of IO-PTEs. |
| 920 | * That may not be true for all IOMMU types. |
| 921 | */ |
| 922 | } |
| 923 | |
| 924 | iova = TARGET_PAGE_ALIGN(section->offset_within_address_space); |
Alexey Kardashevskiy | 7a057b4 | 2016-05-26 09:43:22 -0600 | [diff] [blame] | 925 | llend = int128_make64(section->offset_within_address_space); |
| 926 | llend = int128_add(llend, section->size); |
| 927 | llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK)); |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 928 | |
Alexey Kardashevskiy | 7a057b4 | 2016-05-26 09:43:22 -0600 | [diff] [blame] | 929 | if (int128_ge(int128_make64(iova), llend)) { |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 930 | return; |
| 931 | } |
Alexey Kardashevskiy | 7a057b4 | 2016-05-26 09:43:22 -0600 | [diff] [blame] | 932 | end = int128_get64(int128_sub(llend, int128_one())); |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 933 | |
Alexey Kardashevskiy | 7a057b4 | 2016-05-26 09:43:22 -0600 | [diff] [blame] | 934 | llsize = int128_sub(llend, int128_make64(iova)); |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 935 | |
Alexey Kardashevskiy | 7a057b4 | 2016-05-26 09:43:22 -0600 | [diff] [blame] | 936 | trace_vfio_listener_region_del(iova, end); |
| 937 | |
Alexey Kardashevskiy | 567b5b3 | 2018-03-13 11:17:30 -0600 | [diff] [blame] | 938 | if (memory_region_is_ram_device(section->mr)) { |
| 939 | hwaddr pgmask; |
| 940 | VFIOHostDMAWindow *hostwin; |
| 941 | bool hostwin_found = false; |
| 942 | |
| 943 | QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { |
| 944 | if (hostwin->min_iova <= iova && end <= hostwin->max_iova) { |
| 945 | hostwin_found = true; |
| 946 | break; |
| 947 | } |
| 948 | } |
| 949 | assert(hostwin_found); /* or region_add() would have failed */ |
| 950 | |
| 951 | pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1; |
| 952 | try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask)); |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 953 | } |
Alexey Kardashevskiy | 2e4109d | 2016-07-04 13:33:06 +1000 | [diff] [blame] | 954 | |
Alexey Kardashevskiy | 567b5b3 | 2018-03-13 11:17:30 -0600 | [diff] [blame] | 955 | if (try_unmap) { |
Jean-Philippe Brucker | 1b296c3 | 2020-10-30 19:05:10 +0100 | [diff] [blame] | 956 | if (int128_eq(llsize, int128_2_64())) { |
| 957 | /* The unmap ioctl doesn't accept a full 64-bit span. */ |
| 958 | llsize = int128_rshift(llsize, 1); |
| 959 | ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL); |
| 960 | if (ret) { |
| 961 | error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " |
| 962 | "0x%"HWADDR_PRIx") = %d (%m)", |
| 963 | container, iova, int128_get64(llsize), ret); |
| 964 | } |
| 965 | iova += int128_get64(llsize); |
| 966 | } |
Kirti Wankhede | 9e7b044 | 2020-10-26 15:06:25 +0530 | [diff] [blame] | 967 | ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL); |
Alexey Kardashevskiy | 567b5b3 | 2018-03-13 11:17:30 -0600 | [diff] [blame] | 968 | if (ret) { |
| 969 | error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " |
| 970 | "0x%"HWADDR_PRIx") = %d (%m)", |
| 971 | container, iova, int128_get64(llsize), ret); |
| 972 | } |
| 973 | } |
| 974 | |
| 975 | memory_region_unref(section->mr); |
| 976 | |
Alexey Kardashevskiy | 2e4109d | 2016-07-04 13:33:06 +1000 | [diff] [blame] | 977 | if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { |
| 978 | vfio_spapr_remove_window(container, |
| 979 | section->offset_within_address_space); |
| 980 | if (vfio_host_win_del(container, |
| 981 | section->offset_within_address_space, |
| 982 | section->offset_within_address_space + |
| 983 | int128_get64(section->size) - 1) < 0) { |
| 984 | hw_error("%s: Cannot delete missing window at %"HWADDR_PRIx, |
| 985 | __func__, section->offset_within_address_space); |
| 986 | } |
| 987 | } |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 988 | } |
| 989 | |
Kirti Wankhede | b6dd650 | 2020-10-26 15:06:23 +0530 | [diff] [blame] | 990 | static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, |
| 991 | uint64_t size, ram_addr_t ram_addr) |
| 992 | { |
| 993 | struct vfio_iommu_type1_dirty_bitmap *dbitmap; |
| 994 | struct vfio_iommu_type1_dirty_bitmap_get *range; |
| 995 | uint64_t pages; |
| 996 | int ret; |
| 997 | |
| 998 | dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range)); |
| 999 | |
| 1000 | dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range); |
| 1001 | dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP; |
| 1002 | range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data; |
| 1003 | range->iova = iova; |
| 1004 | range->size = size; |
| 1005 | |
| 1006 | /* |
| 1007 | * cpu_physical_memory_set_dirty_lebitmap() expects pages in bitmap of |
| 1008 | * TARGET_PAGE_SIZE to mark those dirty. Hence set bitmap's pgsize to |
| 1009 | * TARGET_PAGE_SIZE. |
| 1010 | */ |
| 1011 | range->bitmap.pgsize = TARGET_PAGE_SIZE; |
| 1012 | |
| 1013 | pages = TARGET_PAGE_ALIGN(range->size) >> TARGET_PAGE_BITS; |
| 1014 | range->bitmap.size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) / |
| 1015 | BITS_PER_BYTE; |
| 1016 | range->bitmap.data = g_try_malloc0(range->bitmap.size); |
| 1017 | if (!range->bitmap.data) { |
| 1018 | ret = -ENOMEM; |
| 1019 | goto err_out; |
| 1020 | } |
| 1021 | |
| 1022 | ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap); |
| 1023 | if (ret) { |
| 1024 | error_report("Failed to get dirty bitmap for iova: 0x%"PRIx64 |
| 1025 | " size: 0x%"PRIx64" err: %d", (uint64_t)range->iova, |
| 1026 | (uint64_t)range->size, errno); |
| 1027 | goto err_out; |
| 1028 | } |
| 1029 | |
| 1030 | cpu_physical_memory_set_dirty_lebitmap((unsigned long *)range->bitmap.data, |
| 1031 | ram_addr, pages); |
| 1032 | |
| 1033 | trace_vfio_get_dirty_bitmap(container->fd, range->iova, range->size, |
| 1034 | range->bitmap.size, ram_addr); |
| 1035 | err_out: |
| 1036 | g_free(range->bitmap.data); |
| 1037 | g_free(dbitmap); |
| 1038 | |
| 1039 | return ret; |
| 1040 | } |
| 1041 | |
Kirti Wankhede | 9a04fe0 | 2020-10-26 15:06:24 +0530 | [diff] [blame] | 1042 | typedef struct { |
| 1043 | IOMMUNotifier n; |
| 1044 | VFIOGuestIOMMU *giommu; |
| 1045 | } vfio_giommu_dirty_notifier; |
| 1046 | |
| 1047 | static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) |
| 1048 | { |
| 1049 | vfio_giommu_dirty_notifier *gdn = container_of(n, |
| 1050 | vfio_giommu_dirty_notifier, n); |
| 1051 | VFIOGuestIOMMU *giommu = gdn->giommu; |
| 1052 | VFIOContainer *container = giommu->container; |
| 1053 | hwaddr iova = iotlb->iova + giommu->iommu_offset; |
| 1054 | ram_addr_t translated_addr; |
| 1055 | |
| 1056 | trace_vfio_iommu_map_dirty_notify(iova, iova + iotlb->addr_mask); |
| 1057 | |
| 1058 | if (iotlb->target_as != &address_space_memory) { |
| 1059 | error_report("Wrong target AS \"%s\", only system memory is allowed", |
| 1060 | iotlb->target_as->name ? iotlb->target_as->name : "none"); |
| 1061 | return; |
| 1062 | } |
| 1063 | |
| 1064 | rcu_read_lock(); |
| 1065 | if (vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL)) { |
| 1066 | int ret; |
| 1067 | |
| 1068 | ret = vfio_get_dirty_bitmap(container, iova, iotlb->addr_mask + 1, |
| 1069 | translated_addr); |
| 1070 | if (ret) { |
| 1071 | error_report("vfio_iommu_map_dirty_notify(%p, 0x%"HWADDR_PRIx", " |
| 1072 | "0x%"HWADDR_PRIx") = %d (%m)", |
| 1073 | container, iova, |
| 1074 | iotlb->addr_mask + 1, ret); |
| 1075 | } |
| 1076 | } |
| 1077 | rcu_read_unlock(); |
| 1078 | } |
| 1079 | |
Kirti Wankhede | b6dd650 | 2020-10-26 15:06:23 +0530 | [diff] [blame] | 1080 | static int vfio_sync_dirty_bitmap(VFIOContainer *container, |
| 1081 | MemoryRegionSection *section) |
| 1082 | { |
| 1083 | ram_addr_t ram_addr; |
| 1084 | |
Kirti Wankhede | 9a04fe0 | 2020-10-26 15:06:24 +0530 | [diff] [blame] | 1085 | if (memory_region_is_iommu(section->mr)) { |
| 1086 | VFIOGuestIOMMU *giommu; |
| 1087 | |
| 1088 | QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) { |
| 1089 | if (MEMORY_REGION(giommu->iommu) == section->mr && |
| 1090 | giommu->n.start == section->offset_within_region) { |
| 1091 | Int128 llend; |
| 1092 | vfio_giommu_dirty_notifier gdn = { .giommu = giommu }; |
| 1093 | int idx = memory_region_iommu_attrs_to_index(giommu->iommu, |
| 1094 | MEMTXATTRS_UNSPECIFIED); |
| 1095 | |
| 1096 | llend = int128_add(int128_make64(section->offset_within_region), |
| 1097 | section->size); |
| 1098 | llend = int128_sub(llend, int128_one()); |
| 1099 | |
| 1100 | iommu_notifier_init(&gdn.n, |
| 1101 | vfio_iommu_map_dirty_notify, |
| 1102 | IOMMU_NOTIFIER_MAP, |
| 1103 | section->offset_within_region, |
| 1104 | int128_get64(llend), |
| 1105 | idx); |
| 1106 | memory_region_iommu_replay(giommu->iommu, &gdn.n); |
| 1107 | break; |
| 1108 | } |
| 1109 | } |
| 1110 | return 0; |
| 1111 | } |
| 1112 | |
Kirti Wankhede | b6dd650 | 2020-10-26 15:06:23 +0530 | [diff] [blame] | 1113 | ram_addr = memory_region_get_ram_addr(section->mr) + |
| 1114 | section->offset_within_region; |
| 1115 | |
| 1116 | return vfio_get_dirty_bitmap(container, |
| 1117 | TARGET_PAGE_ALIGN(section->offset_within_address_space), |
| 1118 | int128_get64(section->size), ram_addr); |
| 1119 | } |
| 1120 | |
| 1121 | static void vfio_listerner_log_sync(MemoryListener *listener, |
| 1122 | MemoryRegionSection *section) |
| 1123 | { |
| 1124 | VFIOContainer *container = container_of(listener, VFIOContainer, listener); |
| 1125 | |
| 1126 | if (vfio_listener_skipped_section(section) || |
| 1127 | !container->dirty_pages_supported) { |
| 1128 | return; |
| 1129 | } |
| 1130 | |
Kirti Wankhede | bb0990d | 2020-11-23 19:53:19 +0530 | [diff] [blame] | 1131 | if (vfio_devices_all_saving(container)) { |
Kirti Wankhede | b6dd650 | 2020-10-26 15:06:23 +0530 | [diff] [blame] | 1132 | vfio_sync_dirty_bitmap(container, section); |
| 1133 | } |
| 1134 | } |
| 1135 | |
Alexey Kardashevskiy | 51b833f | 2015-03-02 11:38:55 -0700 | [diff] [blame] | 1136 | static const MemoryListener vfio_memory_listener = { |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 1137 | .region_add = vfio_listener_region_add, |
| 1138 | .region_del = vfio_listener_region_del, |
Kirti Wankhede | b6dd650 | 2020-10-26 15:06:23 +0530 | [diff] [blame] | 1139 | .log_sync = vfio_listerner_log_sync, |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 1140 | }; |
| 1141 | |
Alexey Kardashevskiy | 51b833f | 2015-03-02 11:38:55 -0700 | [diff] [blame] | 1142 | static void vfio_listener_release(VFIOContainer *container) |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 1143 | { |
David Gibson | ee0bf0e | 2015-09-30 12:13:51 +1000 | [diff] [blame] | 1144 | memory_listener_unregister(&container->listener); |
Alexey Kardashevskiy | 318f67c | 2016-07-04 13:33:04 +1000 | [diff] [blame] | 1145 | if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { |
| 1146 | memory_listener_unregister(&container->prereg_listener); |
| 1147 | } |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 1148 | } |
| 1149 | |
Matthew Rosato | 3ab7a0b | 2020-10-26 11:34:32 -0400 | [diff] [blame] | 1150 | static struct vfio_info_cap_header * |
| 1151 | vfio_get_cap(void *ptr, uint32_t cap_offset, uint16_t id) |
Alex Williamson | b53b0f6 | 2016-05-26 09:43:20 -0600 | [diff] [blame] | 1152 | { |
| 1153 | struct vfio_info_cap_header *hdr; |
Alex Williamson | b53b0f6 | 2016-05-26 09:43:20 -0600 | [diff] [blame] | 1154 | |
Matthew Rosato | 3ab7a0b | 2020-10-26 11:34:32 -0400 | [diff] [blame] | 1155 | for (hdr = ptr + cap_offset; hdr != ptr; hdr = ptr + hdr->next) { |
Alex Williamson | b53b0f6 | 2016-05-26 09:43:20 -0600 | [diff] [blame] | 1156 | if (hdr->id == id) { |
| 1157 | return hdr; |
| 1158 | } |
| 1159 | } |
| 1160 | |
| 1161 | return NULL; |
| 1162 | } |
| 1163 | |
Matthew Rosato | 3ab7a0b | 2020-10-26 11:34:32 -0400 | [diff] [blame] | 1164 | struct vfio_info_cap_header * |
| 1165 | vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id) |
| 1166 | { |
| 1167 | if (!(info->flags & VFIO_REGION_INFO_FLAG_CAPS)) { |
| 1168 | return NULL; |
| 1169 | } |
| 1170 | |
| 1171 | return vfio_get_cap((void *)info, info->cap_offset, id); |
| 1172 | } |
| 1173 | |
Matthew Rosato | 7486a62 | 2020-10-26 11:34:33 -0400 | [diff] [blame] | 1174 | static struct vfio_info_cap_header * |
| 1175 | vfio_get_iommu_type1_info_cap(struct vfio_iommu_type1_info *info, uint16_t id) |
| 1176 | { |
| 1177 | if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) { |
| 1178 | return NULL; |
| 1179 | } |
| 1180 | |
| 1181 | return vfio_get_cap((void *)info, info->cap_offset, id); |
| 1182 | } |
| 1183 | |
Matthew Rosato | 92fe289 | 2020-10-26 11:34:40 -0400 | [diff] [blame] | 1184 | struct vfio_info_cap_header * |
| 1185 | vfio_get_device_info_cap(struct vfio_device_info *info, uint16_t id) |
| 1186 | { |
| 1187 | if (!(info->flags & VFIO_DEVICE_FLAGS_CAPS)) { |
| 1188 | return NULL; |
| 1189 | } |
| 1190 | |
| 1191 | return vfio_get_cap((void *)info, info->cap_offset, id); |
| 1192 | } |
| 1193 | |
Matthew Rosato | 7486a62 | 2020-10-26 11:34:33 -0400 | [diff] [blame] | 1194 | bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info, |
| 1195 | unsigned int *avail) |
| 1196 | { |
| 1197 | struct vfio_info_cap_header *hdr; |
| 1198 | struct vfio_iommu_type1_info_dma_avail *cap; |
| 1199 | |
| 1200 | /* If the capability cannot be found, assume no DMA limiting */ |
| 1201 | hdr = vfio_get_iommu_type1_info_cap(info, |
| 1202 | VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL); |
| 1203 | if (hdr == NULL) { |
| 1204 | return false; |
| 1205 | } |
| 1206 | |
| 1207 | if (avail != NULL) { |
| 1208 | cap = (void *) hdr; |
| 1209 | *avail = cap->avail; |
| 1210 | } |
| 1211 | |
| 1212 | return true; |
| 1213 | } |
| 1214 | |
Alex Williamson | 24acf72 | 2016-10-31 09:53:03 -0600 | [diff] [blame] | 1215 | static int vfio_setup_region_sparse_mmaps(VFIORegion *region, |
| 1216 | struct vfio_region_info *info) |
Alex Williamson | b53b0f6 | 2016-05-26 09:43:20 -0600 | [diff] [blame] | 1217 | { |
| 1218 | struct vfio_info_cap_header *hdr; |
| 1219 | struct vfio_region_info_cap_sparse_mmap *sparse; |
Alex Williamson | 24acf72 | 2016-10-31 09:53:03 -0600 | [diff] [blame] | 1220 | int i, j; |
Alex Williamson | b53b0f6 | 2016-05-26 09:43:20 -0600 | [diff] [blame] | 1221 | |
| 1222 | hdr = vfio_get_region_info_cap(info, VFIO_REGION_INFO_CAP_SPARSE_MMAP); |
| 1223 | if (!hdr) { |
Alex Williamson | 24acf72 | 2016-10-31 09:53:03 -0600 | [diff] [blame] | 1224 | return -ENODEV; |
Alex Williamson | b53b0f6 | 2016-05-26 09:43:20 -0600 | [diff] [blame] | 1225 | } |
| 1226 | |
| 1227 | sparse = container_of(hdr, struct vfio_region_info_cap_sparse_mmap, header); |
| 1228 | |
| 1229 | trace_vfio_region_sparse_mmap_header(region->vbasedev->name, |
| 1230 | region->nr, sparse->nr_areas); |
| 1231 | |
Alex Williamson | 24acf72 | 2016-10-31 09:53:03 -0600 | [diff] [blame] | 1232 | region->mmaps = g_new0(VFIOMmap, sparse->nr_areas); |
Alex Williamson | b53b0f6 | 2016-05-26 09:43:20 -0600 | [diff] [blame] | 1233 | |
Alex Williamson | 24acf72 | 2016-10-31 09:53:03 -0600 | [diff] [blame] | 1234 | for (i = 0, j = 0; i < sparse->nr_areas; i++) { |
| 1235 | trace_vfio_region_sparse_mmap_entry(i, sparse->areas[i].offset, |
| 1236 | sparse->areas[i].offset + |
| 1237 | sparse->areas[i].size); |
| 1238 | |
| 1239 | if (sparse->areas[i].size) { |
| 1240 | region->mmaps[j].offset = sparse->areas[i].offset; |
| 1241 | region->mmaps[j].size = sparse->areas[i].size; |
| 1242 | j++; |
| 1243 | } |
Alex Williamson | b53b0f6 | 2016-05-26 09:43:20 -0600 | [diff] [blame] | 1244 | } |
Alex Williamson | 24acf72 | 2016-10-31 09:53:03 -0600 | [diff] [blame] | 1245 | |
| 1246 | region->nr_mmaps = j; |
| 1247 | region->mmaps = g_realloc(region->mmaps, j * sizeof(VFIOMmap)); |
| 1248 | |
| 1249 | return 0; |
Alex Williamson | b53b0f6 | 2016-05-26 09:43:20 -0600 | [diff] [blame] | 1250 | } |
| 1251 | |
Alex Williamson | db0da02 | 2016-03-10 09:39:07 -0700 | [diff] [blame] | 1252 | int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region, |
| 1253 | int index, const char *name) |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 1254 | { |
Alex Williamson | db0da02 | 2016-03-10 09:39:07 -0700 | [diff] [blame] | 1255 | struct vfio_region_info *info; |
| 1256 | int ret; |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 1257 | |
Alex Williamson | db0da02 | 2016-03-10 09:39:07 -0700 | [diff] [blame] | 1258 | ret = vfio_get_region_info(vbasedev, index, &info); |
| 1259 | if (ret) { |
| 1260 | return ret; |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 1261 | } |
| 1262 | |
Alex Williamson | db0da02 | 2016-03-10 09:39:07 -0700 | [diff] [blame] | 1263 | region->vbasedev = vbasedev; |
| 1264 | region->flags = info->flags; |
| 1265 | region->size = info->size; |
| 1266 | region->fd_offset = info->offset; |
| 1267 | region->nr = index; |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 1268 | |
Alex Williamson | db0da02 | 2016-03-10 09:39:07 -0700 | [diff] [blame] | 1269 | if (region->size) { |
| 1270 | region->mem = g_new0(MemoryRegion, 1); |
| 1271 | memory_region_init_io(region->mem, obj, &vfio_region_ops, |
| 1272 | region, name, region->size); |
| 1273 | |
| 1274 | if (!vbasedev->no_mmap && |
Yongji Xie | 9525172 | 2016-10-31 09:53:04 -0600 | [diff] [blame] | 1275 | region->flags & VFIO_REGION_INFO_FLAG_MMAP) { |
Alex Williamson | db0da02 | 2016-03-10 09:39:07 -0700 | [diff] [blame] | 1276 | |
Alex Williamson | 24acf72 | 2016-10-31 09:53:03 -0600 | [diff] [blame] | 1277 | ret = vfio_setup_region_sparse_mmaps(region, info); |
Alex Williamson | db0da02 | 2016-03-10 09:39:07 -0700 | [diff] [blame] | 1278 | |
Alex Williamson | 24acf72 | 2016-10-31 09:53:03 -0600 | [diff] [blame] | 1279 | if (ret) { |
Alex Williamson | b53b0f6 | 2016-05-26 09:43:20 -0600 | [diff] [blame] | 1280 | region->nr_mmaps = 1; |
| 1281 | region->mmaps = g_new0(VFIOMmap, region->nr_mmaps); |
| 1282 | region->mmaps[0].offset = 0; |
| 1283 | region->mmaps[0].size = region->size; |
| 1284 | } |
Alex Williamson | db0da02 | 2016-03-10 09:39:07 -0700 | [diff] [blame] | 1285 | } |
| 1286 | } |
| 1287 | |
| 1288 | g_free(info); |
| 1289 | |
| 1290 | trace_vfio_region_setup(vbasedev->name, index, name, |
| 1291 | region->flags, region->fd_offset, region->size); |
| 1292 | return 0; |
| 1293 | } |
| 1294 | |
Kirti Wankhede | 0f7a903 | 2020-10-26 15:06:11 +0530 | [diff] [blame] | 1295 | static void vfio_subregion_unmap(VFIORegion *region, int index) |
| 1296 | { |
| 1297 | trace_vfio_region_unmap(memory_region_name(®ion->mmaps[index].mem), |
| 1298 | region->mmaps[index].offset, |
| 1299 | region->mmaps[index].offset + |
| 1300 | region->mmaps[index].size - 1); |
| 1301 | memory_region_del_subregion(region->mem, ®ion->mmaps[index].mem); |
| 1302 | munmap(region->mmaps[index].mmap, region->mmaps[index].size); |
| 1303 | object_unparent(OBJECT(®ion->mmaps[index].mem)); |
| 1304 | region->mmaps[index].mmap = NULL; |
| 1305 | } |
| 1306 | |
Alex Williamson | db0da02 | 2016-03-10 09:39:07 -0700 | [diff] [blame] | 1307 | int vfio_region_mmap(VFIORegion *region) |
| 1308 | { |
| 1309 | int i, prot = 0; |
| 1310 | char *name; |
| 1311 | |
| 1312 | if (!region->mem) { |
| 1313 | return 0; |
| 1314 | } |
| 1315 | |
| 1316 | prot |= region->flags & VFIO_REGION_INFO_FLAG_READ ? PROT_READ : 0; |
| 1317 | prot |= region->flags & VFIO_REGION_INFO_FLAG_WRITE ? PROT_WRITE : 0; |
| 1318 | |
| 1319 | for (i = 0; i < region->nr_mmaps; i++) { |
| 1320 | region->mmaps[i].mmap = mmap(NULL, region->mmaps[i].size, prot, |
| 1321 | MAP_SHARED, region->vbasedev->fd, |
| 1322 | region->fd_offset + |
| 1323 | region->mmaps[i].offset); |
| 1324 | if (region->mmaps[i].mmap == MAP_FAILED) { |
| 1325 | int ret = -errno; |
| 1326 | |
| 1327 | trace_vfio_region_mmap_fault(memory_region_name(region->mem), i, |
| 1328 | region->fd_offset + |
| 1329 | region->mmaps[i].offset, |
| 1330 | region->fd_offset + |
| 1331 | region->mmaps[i].offset + |
| 1332 | region->mmaps[i].size - 1, ret); |
| 1333 | |
| 1334 | region->mmaps[i].mmap = NULL; |
| 1335 | |
| 1336 | for (i--; i >= 0; i--) { |
Kirti Wankhede | 0f7a903 | 2020-10-26 15:06:11 +0530 | [diff] [blame] | 1337 | vfio_subregion_unmap(region, i); |
Alex Williamson | db0da02 | 2016-03-10 09:39:07 -0700 | [diff] [blame] | 1338 | } |
| 1339 | |
| 1340 | return ret; |
| 1341 | } |
| 1342 | |
| 1343 | name = g_strdup_printf("%s mmaps[%d]", |
| 1344 | memory_region_name(region->mem), i); |
Alex Williamson | 21e00fa | 2016-10-31 09:53:03 -0600 | [diff] [blame] | 1345 | memory_region_init_ram_device_ptr(®ion->mmaps[i].mem, |
| 1346 | memory_region_owner(region->mem), |
| 1347 | name, region->mmaps[i].size, |
| 1348 | region->mmaps[i].mmap); |
Alex Williamson | db0da02 | 2016-03-10 09:39:07 -0700 | [diff] [blame] | 1349 | g_free(name); |
Alex Williamson | db0da02 | 2016-03-10 09:39:07 -0700 | [diff] [blame] | 1350 | memory_region_add_subregion(region->mem, region->mmaps[i].offset, |
| 1351 | ®ion->mmaps[i].mem); |
| 1352 | |
| 1353 | trace_vfio_region_mmap(memory_region_name(®ion->mmaps[i].mem), |
| 1354 | region->mmaps[i].offset, |
| 1355 | region->mmaps[i].offset + |
| 1356 | region->mmaps[i].size - 1); |
| 1357 | } |
| 1358 | |
| 1359 | return 0; |
| 1360 | } |
| 1361 | |
Kirti Wankhede | 0f7a903 | 2020-10-26 15:06:11 +0530 | [diff] [blame] | 1362 | void vfio_region_unmap(VFIORegion *region) |
| 1363 | { |
| 1364 | int i; |
| 1365 | |
| 1366 | if (!region->mem) { |
| 1367 | return; |
| 1368 | } |
| 1369 | |
| 1370 | for (i = 0; i < region->nr_mmaps; i++) { |
| 1371 | if (region->mmaps[i].mmap) { |
| 1372 | vfio_subregion_unmap(region, i); |
| 1373 | } |
| 1374 | } |
| 1375 | } |
| 1376 | |
Alex Williamson | db0da02 | 2016-03-10 09:39:07 -0700 | [diff] [blame] | 1377 | void vfio_region_exit(VFIORegion *region) |
| 1378 | { |
| 1379 | int i; |
| 1380 | |
| 1381 | if (!region->mem) { |
| 1382 | return; |
| 1383 | } |
| 1384 | |
| 1385 | for (i = 0; i < region->nr_mmaps; i++) { |
| 1386 | if (region->mmaps[i].mmap) { |
| 1387 | memory_region_del_subregion(region->mem, ®ion->mmaps[i].mem); |
| 1388 | } |
| 1389 | } |
| 1390 | |
| 1391 | trace_vfio_region_exit(region->vbasedev->name, region->nr); |
| 1392 | } |
| 1393 | |
| 1394 | void vfio_region_finalize(VFIORegion *region) |
| 1395 | { |
| 1396 | int i; |
| 1397 | |
| 1398 | if (!region->mem) { |
| 1399 | return; |
| 1400 | } |
| 1401 | |
| 1402 | for (i = 0; i < region->nr_mmaps; i++) { |
| 1403 | if (region->mmaps[i].mmap) { |
| 1404 | munmap(region->mmaps[i].mmap, region->mmaps[i].size); |
| 1405 | object_unparent(OBJECT(®ion->mmaps[i].mem)); |
| 1406 | } |
| 1407 | } |
| 1408 | |
| 1409 | object_unparent(OBJECT(region->mem)); |
| 1410 | |
| 1411 | g_free(region->mem); |
| 1412 | g_free(region->mmaps); |
| 1413 | |
| 1414 | trace_vfio_region_finalize(region->vbasedev->name, region->nr); |
Gerd Hoffmann | 92f86bf | 2018-03-13 11:17:29 -0600 | [diff] [blame] | 1415 | |
| 1416 | region->mem = NULL; |
| 1417 | region->mmaps = NULL; |
| 1418 | region->nr_mmaps = 0; |
| 1419 | region->size = 0; |
| 1420 | region->flags = 0; |
| 1421 | region->nr = 0; |
Alex Williamson | db0da02 | 2016-03-10 09:39:07 -0700 | [diff] [blame] | 1422 | } |
| 1423 | |
| 1424 | void vfio_region_mmaps_set_enabled(VFIORegion *region, bool enabled) |
| 1425 | { |
| 1426 | int i; |
| 1427 | |
| 1428 | if (!region->mem) { |
| 1429 | return; |
| 1430 | } |
| 1431 | |
| 1432 | for (i = 0; i < region->nr_mmaps; i++) { |
| 1433 | if (region->mmaps[i].mmap) { |
| 1434 | memory_region_set_enabled(®ion->mmaps[i].mem, enabled); |
| 1435 | } |
| 1436 | } |
| 1437 | |
| 1438 | trace_vfio_region_mmaps_set_enabled(memory_region_name(region->mem), |
| 1439 | enabled); |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 1440 | } |
| 1441 | |
| 1442 | void vfio_reset_handler(void *opaque) |
| 1443 | { |
| 1444 | VFIOGroup *group; |
| 1445 | VFIODevice *vbasedev; |
| 1446 | |
| 1447 | QLIST_FOREACH(group, &vfio_group_list, next) { |
| 1448 | QLIST_FOREACH(vbasedev, &group->device_list, next) { |
Alex Williamson | 7da624e | 2017-07-10 10:39:43 -0600 | [diff] [blame] | 1449 | if (vbasedev->dev->realized) { |
| 1450 | vbasedev->ops->vfio_compute_needs_reset(vbasedev); |
| 1451 | } |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 1452 | } |
| 1453 | } |
| 1454 | |
| 1455 | QLIST_FOREACH(group, &vfio_group_list, next) { |
| 1456 | QLIST_FOREACH(vbasedev, &group->device_list, next) { |
Alex Williamson | 7da624e | 2017-07-10 10:39:43 -0600 | [diff] [blame] | 1457 | if (vbasedev->dev->realized && vbasedev->needs_reset) { |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 1458 | vbasedev->ops->vfio_hot_reset_multi(vbasedev); |
| 1459 | } |
| 1460 | } |
| 1461 | } |
| 1462 | } |
| 1463 | |
| 1464 | static void vfio_kvm_device_add_group(VFIOGroup *group) |
| 1465 | { |
| 1466 | #ifdef CONFIG_KVM |
| 1467 | struct kvm_device_attr attr = { |
| 1468 | .group = KVM_DEV_VFIO_GROUP, |
| 1469 | .attr = KVM_DEV_VFIO_GROUP_ADD, |
| 1470 | .addr = (uint64_t)(unsigned long)&group->fd, |
| 1471 | }; |
| 1472 | |
| 1473 | if (!kvm_enabled()) { |
| 1474 | return; |
| 1475 | } |
| 1476 | |
| 1477 | if (vfio_kvm_device_fd < 0) { |
| 1478 | struct kvm_create_device cd = { |
| 1479 | .type = KVM_DEV_TYPE_VFIO, |
| 1480 | }; |
| 1481 | |
| 1482 | if (kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd)) { |
Gonglei | 78e5b17 | 2015-02-25 12:22:33 +0800 | [diff] [blame] | 1483 | error_report("Failed to create KVM VFIO device: %m"); |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 1484 | return; |
| 1485 | } |
| 1486 | |
| 1487 | vfio_kvm_device_fd = cd.fd; |
| 1488 | } |
| 1489 | |
| 1490 | if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) { |
| 1491 | error_report("Failed to add group %d to KVM VFIO device: %m", |
| 1492 | group->groupid); |
| 1493 | } |
| 1494 | #endif |
| 1495 | } |
| 1496 | |
| 1497 | static void vfio_kvm_device_del_group(VFIOGroup *group) |
| 1498 | { |
| 1499 | #ifdef CONFIG_KVM |
| 1500 | struct kvm_device_attr attr = { |
| 1501 | .group = KVM_DEV_VFIO_GROUP, |
| 1502 | .attr = KVM_DEV_VFIO_GROUP_DEL, |
| 1503 | .addr = (uint64_t)(unsigned long)&group->fd, |
| 1504 | }; |
| 1505 | |
| 1506 | if (vfio_kvm_device_fd < 0) { |
| 1507 | return; |
| 1508 | } |
| 1509 | |
| 1510 | if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) { |
| 1511 | error_report("Failed to remove group %d from KVM VFIO device: %m", |
| 1512 | group->groupid); |
| 1513 | } |
| 1514 | #endif |
| 1515 | } |
| 1516 | |
| 1517 | static VFIOAddressSpace *vfio_get_address_space(AddressSpace *as) |
| 1518 | { |
| 1519 | VFIOAddressSpace *space; |
| 1520 | |
| 1521 | QLIST_FOREACH(space, &vfio_address_spaces, list) { |
| 1522 | if (space->as == as) { |
| 1523 | return space; |
| 1524 | } |
| 1525 | } |
| 1526 | |
| 1527 | /* No suitable VFIOAddressSpace, create a new one */ |
| 1528 | space = g_malloc0(sizeof(*space)); |
| 1529 | space->as = as; |
| 1530 | QLIST_INIT(&space->containers); |
| 1531 | |
| 1532 | QLIST_INSERT_HEAD(&vfio_address_spaces, space, list); |
| 1533 | |
| 1534 | return space; |
| 1535 | } |
| 1536 | |
| 1537 | static void vfio_put_address_space(VFIOAddressSpace *space) |
| 1538 | { |
| 1539 | if (QLIST_EMPTY(&space->containers)) { |
| 1540 | QLIST_REMOVE(space, list); |
| 1541 | g_free(space); |
| 1542 | } |
| 1543 | } |
| 1544 | |
Eric Auger | 2b6326c | 2019-02-21 21:07:03 -0700 | [diff] [blame] | 1545 | /* |
| 1546 | * vfio_get_iommu_type - selects the richest iommu_type (v2 first) |
| 1547 | */ |
| 1548 | static int vfio_get_iommu_type(VFIOContainer *container, |
| 1549 | Error **errp) |
| 1550 | { |
| 1551 | int iommu_types[] = { VFIO_TYPE1v2_IOMMU, VFIO_TYPE1_IOMMU, |
| 1552 | VFIO_SPAPR_TCE_v2_IOMMU, VFIO_SPAPR_TCE_IOMMU }; |
| 1553 | int i; |
| 1554 | |
| 1555 | for (i = 0; i < ARRAY_SIZE(iommu_types); i++) { |
| 1556 | if (ioctl(container->fd, VFIO_CHECK_EXTENSION, iommu_types[i])) { |
| 1557 | return iommu_types[i]; |
| 1558 | } |
| 1559 | } |
| 1560 | error_setg(errp, "No available IOMMU models"); |
| 1561 | return -EINVAL; |
| 1562 | } |
| 1563 | |
| 1564 | static int vfio_init_container(VFIOContainer *container, int group_fd, |
| 1565 | Error **errp) |
| 1566 | { |
| 1567 | int iommu_type, ret; |
| 1568 | |
| 1569 | iommu_type = vfio_get_iommu_type(container, errp); |
| 1570 | if (iommu_type < 0) { |
| 1571 | return iommu_type; |
| 1572 | } |
| 1573 | |
| 1574 | ret = ioctl(group_fd, VFIO_GROUP_SET_CONTAINER, &container->fd); |
| 1575 | if (ret) { |
| 1576 | error_setg_errno(errp, errno, "Failed to set group container"); |
| 1577 | return -errno; |
| 1578 | } |
| 1579 | |
| 1580 | while (ioctl(container->fd, VFIO_SET_IOMMU, iommu_type)) { |
| 1581 | if (iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { |
| 1582 | /* |
| 1583 | * On sPAPR, despite the IOMMU subdriver always advertises v1 and |
| 1584 | * v2, the running platform may not support v2 and there is no |
| 1585 | * way to guess it until an IOMMU group gets added to the container. |
| 1586 | * So in case it fails with v2, try v1 as a fallback. |
| 1587 | */ |
| 1588 | iommu_type = VFIO_SPAPR_TCE_IOMMU; |
| 1589 | continue; |
| 1590 | } |
| 1591 | error_setg_errno(errp, errno, "Failed to set iommu for container"); |
| 1592 | return -errno; |
| 1593 | } |
| 1594 | |
| 1595 | container->iommu_type = iommu_type; |
| 1596 | return 0; |
| 1597 | } |
| 1598 | |
Kirti Wankhede | 87ea529 | 2020-10-26 15:06:21 +0530 | [diff] [blame] | 1599 | static int vfio_get_iommu_info(VFIOContainer *container, |
| 1600 | struct vfio_iommu_type1_info **info) |
| 1601 | { |
| 1602 | |
| 1603 | size_t argsz = sizeof(struct vfio_iommu_type1_info); |
| 1604 | |
| 1605 | *info = g_new0(struct vfio_iommu_type1_info, 1); |
| 1606 | again: |
| 1607 | (*info)->argsz = argsz; |
| 1608 | |
| 1609 | if (ioctl(container->fd, VFIO_IOMMU_GET_INFO, *info)) { |
| 1610 | g_free(*info); |
| 1611 | *info = NULL; |
| 1612 | return -errno; |
| 1613 | } |
| 1614 | |
| 1615 | if (((*info)->argsz > argsz)) { |
| 1616 | argsz = (*info)->argsz; |
| 1617 | *info = g_realloc(*info, argsz); |
| 1618 | goto again; |
| 1619 | } |
| 1620 | |
| 1621 | return 0; |
| 1622 | } |
| 1623 | |
| 1624 | static struct vfio_info_cap_header * |
| 1625 | vfio_get_iommu_info_cap(struct vfio_iommu_type1_info *info, uint16_t id) |
| 1626 | { |
| 1627 | struct vfio_info_cap_header *hdr; |
| 1628 | void *ptr = info; |
| 1629 | |
| 1630 | if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) { |
| 1631 | return NULL; |
| 1632 | } |
| 1633 | |
| 1634 | for (hdr = ptr + info->cap_offset; hdr != ptr; hdr = ptr + hdr->next) { |
| 1635 | if (hdr->id == id) { |
| 1636 | return hdr; |
| 1637 | } |
| 1638 | } |
| 1639 | |
| 1640 | return NULL; |
| 1641 | } |
| 1642 | |
| 1643 | static void vfio_get_iommu_info_migration(VFIOContainer *container, |
| 1644 | struct vfio_iommu_type1_info *info) |
| 1645 | { |
| 1646 | struct vfio_info_cap_header *hdr; |
| 1647 | struct vfio_iommu_type1_info_cap_migration *cap_mig; |
| 1648 | |
| 1649 | hdr = vfio_get_iommu_info_cap(info, VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION); |
| 1650 | if (!hdr) { |
| 1651 | return; |
| 1652 | } |
| 1653 | |
| 1654 | cap_mig = container_of(hdr, struct vfio_iommu_type1_info_cap_migration, |
| 1655 | header); |
| 1656 | |
| 1657 | /* |
| 1658 | * cpu_physical_memory_set_dirty_lebitmap() expects pages in bitmap of |
| 1659 | * TARGET_PAGE_SIZE to mark those dirty. |
| 1660 | */ |
| 1661 | if (cap_mig->pgsize_bitmap & TARGET_PAGE_SIZE) { |
| 1662 | container->dirty_pages_supported = true; |
| 1663 | container->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size; |
| 1664 | container->dirty_pgsizes = cap_mig->pgsize_bitmap; |
| 1665 | } |
| 1666 | } |
| 1667 | |
Eric Auger | 01905f5 | 2016-10-17 10:57:59 -0600 | [diff] [blame] | 1668 | static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, |
| 1669 | Error **errp) |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 1670 | { |
| 1671 | VFIOContainer *container; |
| 1672 | int ret, fd; |
| 1673 | VFIOAddressSpace *space; |
| 1674 | |
| 1675 | space = vfio_get_address_space(as); |
| 1676 | |
Alex Williamson | c65ee43 | 2018-08-17 09:27:16 -0600 | [diff] [blame] | 1677 | /* |
David Hildenbrand | aff92b8 | 2020-06-26 09:22:30 +0200 | [diff] [blame] | 1678 | * VFIO is currently incompatible with discarding of RAM insofar as the |
Alex Williamson | c65ee43 | 2018-08-17 09:27:16 -0600 | [diff] [blame] | 1679 | * madvise to purge (zap) the page from QEMU's address space does not |
| 1680 | * interact with the memory API and therefore leaves stale virtual to |
| 1681 | * physical mappings in the IOMMU if the page was previously pinned. We |
David Hildenbrand | aff92b8 | 2020-06-26 09:22:30 +0200 | [diff] [blame] | 1682 | * therefore set discarding broken for each group added to a container, |
Alex Williamson | c65ee43 | 2018-08-17 09:27:16 -0600 | [diff] [blame] | 1683 | * whether the container is used individually or shared. This provides |
| 1684 | * us with options to allow devices within a group to opt-in and allow |
David Hildenbrand | aff92b8 | 2020-06-26 09:22:30 +0200 | [diff] [blame] | 1685 | * discarding, so long as it is done consistently for a group (for instance |
Alex Williamson | c65ee43 | 2018-08-17 09:27:16 -0600 | [diff] [blame] | 1686 | * if the device is an mdev device where it is known that the host vendor |
| 1687 | * driver will never pin pages outside of the working set of the guest |
David Hildenbrand | aff92b8 | 2020-06-26 09:22:30 +0200 | [diff] [blame] | 1688 | * driver, which would thus not be discarding candidates). |
Alex Williamson | c65ee43 | 2018-08-17 09:27:16 -0600 | [diff] [blame] | 1689 | * |
| 1690 | * The first opportunity to induce pinning occurs here where we attempt to |
| 1691 | * attach the group to existing containers within the AddressSpace. If any |
David Hildenbrand | aff92b8 | 2020-06-26 09:22:30 +0200 | [diff] [blame] | 1692 | * pages are already zapped from the virtual address space, such as from |
| 1693 | * previous discards, new pinning will cause valid mappings to be |
Alex Williamson | c65ee43 | 2018-08-17 09:27:16 -0600 | [diff] [blame] | 1694 | * re-established. Likewise, when the overall MemoryListener for a new |
| 1695 | * container is registered, a replay of mappings within the AddressSpace |
| 1696 | * will occur, re-establishing any previously zapped pages as well. |
| 1697 | * |
David Hildenbrand | aff92b8 | 2020-06-26 09:22:30 +0200 | [diff] [blame] | 1698 | * Especially virtio-balloon is currently only prevented from discarding |
| 1699 | * new memory, it will not yet set ram_block_discard_set_required() and |
| 1700 | * therefore, neither stops us here or deals with the sudden memory |
| 1701 | * consumption of inflated memory. |
Alex Williamson | c65ee43 | 2018-08-17 09:27:16 -0600 | [diff] [blame] | 1702 | */ |
David Hildenbrand | aff92b8 | 2020-06-26 09:22:30 +0200 | [diff] [blame] | 1703 | ret = ram_block_discard_disable(true); |
| 1704 | if (ret) { |
| 1705 | error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken"); |
| 1706 | return ret; |
| 1707 | } |
Alex Williamson | c65ee43 | 2018-08-17 09:27:16 -0600 | [diff] [blame] | 1708 | |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 1709 | QLIST_FOREACH(container, &space->containers, next) { |
| 1710 | if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) { |
| 1711 | group->container = container; |
| 1712 | QLIST_INSERT_HEAD(&container->group_list, group, container_next); |
Alex Williamson | 2016986 | 2017-12-13 10:19:32 -0700 | [diff] [blame] | 1713 | vfio_kvm_device_add_group(group); |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 1714 | return 0; |
| 1715 | } |
| 1716 | } |
| 1717 | |
Daniel P. Berrangé | 448058a | 2020-07-21 13:25:21 +0100 | [diff] [blame] | 1718 | fd = qemu_open_old("/dev/vfio/vfio", O_RDWR); |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 1719 | if (fd < 0) { |
Eric Auger | 01905f5 | 2016-10-17 10:57:59 -0600 | [diff] [blame] | 1720 | error_setg_errno(errp, errno, "failed to open /dev/vfio/vfio"); |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 1721 | ret = -errno; |
| 1722 | goto put_space_exit; |
| 1723 | } |
| 1724 | |
| 1725 | ret = ioctl(fd, VFIO_GET_API_VERSION); |
| 1726 | if (ret != VFIO_API_VERSION) { |
Eric Auger | 01905f5 | 2016-10-17 10:57:59 -0600 | [diff] [blame] | 1727 | error_setg(errp, "supported vfio version: %d, " |
| 1728 | "reported version: %d", VFIO_API_VERSION, ret); |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 1729 | ret = -EINVAL; |
| 1730 | goto close_fd_exit; |
| 1731 | } |
| 1732 | |
| 1733 | container = g_malloc0(sizeof(*container)); |
| 1734 | container->space = space; |
| 1735 | container->fd = fd; |
Eric Auger | d7d8783 | 2019-09-24 10:25:16 +0200 | [diff] [blame] | 1736 | container->error = NULL; |
Kirti Wankhede | 87ea529 | 2020-10-26 15:06:21 +0530 | [diff] [blame] | 1737 | container->dirty_pages_supported = false; |
Liu, Yi L | f7f9c7b | 2017-12-13 10:19:33 -0700 | [diff] [blame] | 1738 | QLIST_INIT(&container->giommu_list); |
| 1739 | QLIST_INIT(&container->hostwin_list); |
Eric Auger | 2b6326c | 2019-02-21 21:07:03 -0700 | [diff] [blame] | 1740 | |
| 1741 | ret = vfio_init_container(container, group->fd, errp); |
| 1742 | if (ret) { |
| 1743 | goto free_container_exit; |
| 1744 | } |
| 1745 | |
| 1746 | switch (container->iommu_type) { |
| 1747 | case VFIO_TYPE1v2_IOMMU: |
| 1748 | case VFIO_TYPE1_IOMMU: |
| 1749 | { |
Kirti Wankhede | 87ea529 | 2020-10-26 15:06:21 +0530 | [diff] [blame] | 1750 | struct vfio_iommu_type1_info *info; |
Alex Williamson | 2e6e697 | 2015-02-10 10:25:44 -0700 | [diff] [blame] | 1751 | |
David Gibson | 3898aad | 2015-09-30 12:13:53 +1000 | [diff] [blame] | 1752 | /* |
| 1753 | * FIXME: This assumes that a Type1 IOMMU can map any 64-bit |
| 1754 | * IOVA whatsoever. That's not actually true, but the current |
| 1755 | * kernel interface doesn't tell us what it can map, and the |
| 1756 | * existing Type1 IOMMUs generally support any IOVA we're |
| 1757 | * going to actually try in practice. |
| 1758 | */ |
Kirti Wankhede | 87ea529 | 2020-10-26 15:06:21 +0530 | [diff] [blame] | 1759 | ret = vfio_get_iommu_info(container, &info); |
| 1760 | |
| 1761 | if (ret || !(info->flags & VFIO_IOMMU_INFO_PGSIZES)) { |
Alexey Kardashevskiy | f4ec5e2 | 2016-07-04 13:33:05 +1000 | [diff] [blame] | 1762 | /* Assume 4k IOVA page size */ |
Kirti Wankhede | 87ea529 | 2020-10-26 15:06:21 +0530 | [diff] [blame] | 1763 | info->iova_pgsizes = 4096; |
David Gibson | 7a140a5 | 2015-09-30 12:13:54 +1000 | [diff] [blame] | 1764 | } |
Kirti Wankhede | 87ea529 | 2020-10-26 15:06:21 +0530 | [diff] [blame] | 1765 | vfio_host_win_add(container, 0, (hwaddr)-1, info->iova_pgsizes); |
| 1766 | container->pgsizes = info->iova_pgsizes; |
| 1767 | |
| 1768 | if (!ret) { |
| 1769 | vfio_get_iommu_info_migration(container, info); |
| 1770 | } |
| 1771 | g_free(info); |
Eric Auger | 2b6326c | 2019-02-21 21:07:03 -0700 | [diff] [blame] | 1772 | break; |
| 1773 | } |
| 1774 | case VFIO_SPAPR_TCE_v2_IOMMU: |
| 1775 | case VFIO_SPAPR_TCE_IOMMU: |
| 1776 | { |
David Gibson | 3898aad | 2015-09-30 12:13:53 +1000 | [diff] [blame] | 1777 | struct vfio_iommu_spapr_tce_info info; |
Eric Auger | 2b6326c | 2019-02-21 21:07:03 -0700 | [diff] [blame] | 1778 | bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU; |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 1779 | |
| 1780 | /* |
| 1781 | * The host kernel code implementing VFIO_IOMMU_DISABLE is called |
| 1782 | * when container fd is closed so we do not call it explicitly |
| 1783 | * in this file. |
| 1784 | */ |
Alexey Kardashevskiy | 318f67c | 2016-07-04 13:33:04 +1000 | [diff] [blame] | 1785 | if (!v2) { |
| 1786 | ret = ioctl(fd, VFIO_IOMMU_ENABLE); |
| 1787 | if (ret) { |
Eric Auger | 01905f5 | 2016-10-17 10:57:59 -0600 | [diff] [blame] | 1788 | error_setg_errno(errp, errno, "failed to enable container"); |
Alexey Kardashevskiy | 318f67c | 2016-07-04 13:33:04 +1000 | [diff] [blame] | 1789 | ret = -errno; |
| 1790 | goto free_container_exit; |
| 1791 | } |
| 1792 | } else { |
| 1793 | container->prereg_listener = vfio_prereg_listener; |
| 1794 | |
| 1795 | memory_listener_register(&container->prereg_listener, |
| 1796 | &address_space_memory); |
| 1797 | if (container->error) { |
| 1798 | memory_listener_unregister(&container->prereg_listener); |
Eric Auger | d7d8783 | 2019-09-24 10:25:16 +0200 | [diff] [blame] | 1799 | ret = -1; |
| 1800 | error_propagate_prepend(errp, container->error, |
| 1801 | "RAM memory listener initialization failed: "); |
Alexey Kardashevskiy | 318f67c | 2016-07-04 13:33:04 +1000 | [diff] [blame] | 1802 | goto free_container_exit; |
| 1803 | } |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 1804 | } |
David Gibson | 3898aad | 2015-09-30 12:13:53 +1000 | [diff] [blame] | 1805 | |
David Gibson | 3898aad | 2015-09-30 12:13:53 +1000 | [diff] [blame] | 1806 | info.argsz = sizeof(info); |
| 1807 | ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info); |
| 1808 | if (ret) { |
Eric Auger | 01905f5 | 2016-10-17 10:57:59 -0600 | [diff] [blame] | 1809 | error_setg_errno(errp, errno, |
| 1810 | "VFIO_IOMMU_SPAPR_TCE_GET_INFO failed"); |
David Gibson | 3898aad | 2015-09-30 12:13:53 +1000 | [diff] [blame] | 1811 | ret = -errno; |
Alexey Kardashevskiy | 318f67c | 2016-07-04 13:33:04 +1000 | [diff] [blame] | 1812 | if (v2) { |
| 1813 | memory_listener_unregister(&container->prereg_listener); |
| 1814 | } |
David Gibson | 3898aad | 2015-09-30 12:13:53 +1000 | [diff] [blame] | 1815 | goto free_container_exit; |
| 1816 | } |
David Gibson | 7a140a5 | 2015-09-30 12:13:54 +1000 | [diff] [blame] | 1817 | |
Alexey Kardashevskiy | 2e4109d | 2016-07-04 13:33:06 +1000 | [diff] [blame] | 1818 | if (v2) { |
Alexey Kardashevskiy | c26bc18 | 2018-06-20 19:10:12 +1000 | [diff] [blame] | 1819 | container->pgsizes = info.ddw.pgsizes; |
Alexey Kardashevskiy | 2e4109d | 2016-07-04 13:33:06 +1000 | [diff] [blame] | 1820 | /* |
| 1821 | * There is a default window in just created container. |
| 1822 | * To make region_add/del simpler, we better remove this |
| 1823 | * window now and let those iommu_listener callbacks |
| 1824 | * create/remove them when needed. |
| 1825 | */ |
| 1826 | ret = vfio_spapr_remove_window(container, info.dma32_window_start); |
| 1827 | if (ret) { |
Eric Auger | 01905f5 | 2016-10-17 10:57:59 -0600 | [diff] [blame] | 1828 | error_setg_errno(errp, -ret, |
| 1829 | "failed to remove existing window"); |
Alexey Kardashevskiy | 2e4109d | 2016-07-04 13:33:06 +1000 | [diff] [blame] | 1830 | goto free_container_exit; |
| 1831 | } |
| 1832 | } else { |
| 1833 | /* The default table uses 4K pages */ |
Alexey Kardashevskiy | c26bc18 | 2018-06-20 19:10:12 +1000 | [diff] [blame] | 1834 | container->pgsizes = 0x1000; |
Alexey Kardashevskiy | 2e4109d | 2016-07-04 13:33:06 +1000 | [diff] [blame] | 1835 | vfio_host_win_add(container, info.dma32_window_start, |
| 1836 | info.dma32_window_start + |
| 1837 | info.dma32_window_size - 1, |
| 1838 | 0x1000); |
| 1839 | } |
Eric Auger | 2b6326c | 2019-02-21 21:07:03 -0700 | [diff] [blame] | 1840 | } |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 1841 | } |
| 1842 | |
Alexey Kardashevskiy | 8c37faa | 2017-07-17 12:39:09 -0600 | [diff] [blame] | 1843 | vfio_kvm_device_add_group(group); |
| 1844 | |
| 1845 | QLIST_INIT(&container->group_list); |
| 1846 | QLIST_INSERT_HEAD(&space->containers, container, next); |
| 1847 | |
| 1848 | group->container = container; |
| 1849 | QLIST_INSERT_HEAD(&container->group_list, group, container_next); |
| 1850 | |
David Gibson | ee0bf0e | 2015-09-30 12:13:51 +1000 | [diff] [blame] | 1851 | container->listener = vfio_memory_listener; |
| 1852 | |
| 1853 | memory_listener_register(&container->listener, container->space->as); |
| 1854 | |
| 1855 | if (container->error) { |
Eric Auger | d7d8783 | 2019-09-24 10:25:16 +0200 | [diff] [blame] | 1856 | ret = -1; |
| 1857 | error_propagate_prepend(errp, container->error, |
| 1858 | "memory listener initialization failed: "); |
David Gibson | ee0bf0e | 2015-09-30 12:13:51 +1000 | [diff] [blame] | 1859 | goto listener_release_exit; |
| 1860 | } |
| 1861 | |
| 1862 | container->initialized = true; |
| 1863 | |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 1864 | return 0; |
| 1865 | listener_release_exit: |
Alexey Kardashevskiy | 8c37faa | 2017-07-17 12:39:09 -0600 | [diff] [blame] | 1866 | QLIST_REMOVE(group, container_next); |
| 1867 | QLIST_REMOVE(container, next); |
| 1868 | vfio_kvm_device_del_group(group); |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 1869 | vfio_listener_release(container); |
| 1870 | |
| 1871 | free_container_exit: |
| 1872 | g_free(container); |
| 1873 | |
| 1874 | close_fd_exit: |
| 1875 | close(fd); |
| 1876 | |
| 1877 | put_space_exit: |
David Hildenbrand | aff92b8 | 2020-06-26 09:22:30 +0200 | [diff] [blame] | 1878 | ram_block_discard_disable(false); |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 1879 | vfio_put_address_space(space); |
| 1880 | |
| 1881 | return ret; |
| 1882 | } |
| 1883 | |
| 1884 | static void vfio_disconnect_container(VFIOGroup *group) |
| 1885 | { |
| 1886 | VFIOContainer *container = group->container; |
| 1887 | |
Peter Xu | 3696862 | 2018-01-22 14:02:43 +0800 | [diff] [blame] | 1888 | QLIST_REMOVE(group, container_next); |
| 1889 | group->container = NULL; |
| 1890 | |
| 1891 | /* |
| 1892 | * Explicitly release the listener first before unset container, |
| 1893 | * since unset may destroy the backend container if it's the last |
| 1894 | * group. |
| 1895 | */ |
| 1896 | if (QLIST_EMPTY(&container->group_list)) { |
| 1897 | vfio_listener_release(container); |
| 1898 | } |
| 1899 | |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 1900 | if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) { |
| 1901 | error_report("vfio: error disconnecting group %d from container", |
| 1902 | group->groupid); |
| 1903 | } |
| 1904 | |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 1905 | if (QLIST_EMPTY(&container->group_list)) { |
| 1906 | VFIOAddressSpace *space = container->space; |
Alexey Kardashevskiy | f8d8a94 | 2015-07-06 12:15:15 -0600 | [diff] [blame] | 1907 | VFIOGuestIOMMU *giommu, *tmp; |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 1908 | |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 1909 | QLIST_REMOVE(container, next); |
Alexey Kardashevskiy | f8d8a94 | 2015-07-06 12:15:15 -0600 | [diff] [blame] | 1910 | |
| 1911 | QLIST_FOREACH_SAFE(giommu, &container->giommu_list, giommu_next, tmp) { |
Alexey Kardashevskiy | 3df9d74 | 2017-07-11 13:56:19 +1000 | [diff] [blame] | 1912 | memory_region_unregister_iommu_notifier( |
| 1913 | MEMORY_REGION(giommu->iommu), &giommu->n); |
Alexey Kardashevskiy | f8d8a94 | 2015-07-06 12:15:15 -0600 | [diff] [blame] | 1914 | QLIST_REMOVE(giommu, giommu_next); |
| 1915 | g_free(giommu); |
| 1916 | } |
| 1917 | |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 1918 | trace_vfio_disconnect_container(container->fd); |
| 1919 | close(container->fd); |
| 1920 | g_free(container); |
| 1921 | |
| 1922 | vfio_put_address_space(space); |
| 1923 | } |
| 1924 | } |
| 1925 | |
Eric Auger | 1b808d5 | 2016-10-17 10:57:59 -0600 | [diff] [blame] | 1926 | VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp) |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 1927 | { |
| 1928 | VFIOGroup *group; |
| 1929 | char path[32]; |
| 1930 | struct vfio_group_status status = { .argsz = sizeof(status) }; |
| 1931 | |
| 1932 | QLIST_FOREACH(group, &vfio_group_list, next) { |
| 1933 | if (group->groupid == groupid) { |
| 1934 | /* Found it. Now is it already in the right context? */ |
| 1935 | if (group->container->space->as == as) { |
| 1936 | return group; |
| 1937 | } else { |
Eric Auger | 1b808d5 | 2016-10-17 10:57:59 -0600 | [diff] [blame] | 1938 | error_setg(errp, "group %d used in multiple address spaces", |
| 1939 | group->groupid); |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 1940 | return NULL; |
| 1941 | } |
| 1942 | } |
| 1943 | } |
| 1944 | |
| 1945 | group = g_malloc0(sizeof(*group)); |
| 1946 | |
| 1947 | snprintf(path, sizeof(path), "/dev/vfio/%d", groupid); |
Daniel P. Berrangé | 448058a | 2020-07-21 13:25:21 +0100 | [diff] [blame] | 1948 | group->fd = qemu_open_old(path, O_RDWR); |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 1949 | if (group->fd < 0) { |
Eric Auger | 1b808d5 | 2016-10-17 10:57:59 -0600 | [diff] [blame] | 1950 | error_setg_errno(errp, errno, "failed to open %s", path); |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 1951 | goto free_group_exit; |
| 1952 | } |
| 1953 | |
| 1954 | if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) { |
Eric Auger | 1b808d5 | 2016-10-17 10:57:59 -0600 | [diff] [blame] | 1955 | error_setg_errno(errp, errno, "failed to get group %d status", groupid); |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 1956 | goto close_fd_exit; |
| 1957 | } |
| 1958 | |
| 1959 | if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) { |
Eric Auger | 1b808d5 | 2016-10-17 10:57:59 -0600 | [diff] [blame] | 1960 | error_setg(errp, "group %d is not viable", groupid); |
| 1961 | error_append_hint(errp, |
| 1962 | "Please ensure all devices within the iommu_group " |
| 1963 | "are bound to their vfio bus driver.\n"); |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 1964 | goto close_fd_exit; |
| 1965 | } |
| 1966 | |
| 1967 | group->groupid = groupid; |
| 1968 | QLIST_INIT(&group->device_list); |
| 1969 | |
Eric Auger | 1b808d5 | 2016-10-17 10:57:59 -0600 | [diff] [blame] | 1970 | if (vfio_connect_container(group, as, errp)) { |
| 1971 | error_prepend(errp, "failed to setup container for group %d: ", |
| 1972 | groupid); |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 1973 | goto close_fd_exit; |
| 1974 | } |
| 1975 | |
| 1976 | if (QLIST_EMPTY(&vfio_group_list)) { |
| 1977 | qemu_register_reset(vfio_reset_handler, NULL); |
| 1978 | } |
| 1979 | |
| 1980 | QLIST_INSERT_HEAD(&vfio_group_list, group, next); |
| 1981 | |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 1982 | return group; |
| 1983 | |
| 1984 | close_fd_exit: |
| 1985 | close(group->fd); |
| 1986 | |
| 1987 | free_group_exit: |
| 1988 | g_free(group); |
| 1989 | |
| 1990 | return NULL; |
| 1991 | } |
| 1992 | |
| 1993 | void vfio_put_group(VFIOGroup *group) |
| 1994 | { |
Paolo Bonzini | 77a10d0 | 2015-02-10 10:25:44 -0700 | [diff] [blame] | 1995 | if (!group || !QLIST_EMPTY(&group->device_list)) { |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 1996 | return; |
| 1997 | } |
| 1998 | |
David Hildenbrand | aff92b8 | 2020-06-26 09:22:30 +0200 | [diff] [blame] | 1999 | if (!group->ram_block_discard_allowed) { |
| 2000 | ram_block_discard_disable(false); |
Alex Williamson | 238e917 | 2018-08-17 09:27:16 -0600 | [diff] [blame] | 2001 | } |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 2002 | vfio_kvm_device_del_group(group); |
| 2003 | vfio_disconnect_container(group); |
| 2004 | QLIST_REMOVE(group, next); |
| 2005 | trace_vfio_put_group(group->fd); |
| 2006 | close(group->fd); |
| 2007 | g_free(group); |
| 2008 | |
| 2009 | if (QLIST_EMPTY(&vfio_group_list)) { |
| 2010 | qemu_unregister_reset(vfio_reset_handler, NULL); |
| 2011 | } |
| 2012 | } |
| 2013 | |
| 2014 | int vfio_get_device(VFIOGroup *group, const char *name, |
Eric Auger | 59f7d67 | 2016-10-17 10:58:00 -0600 | [diff] [blame] | 2015 | VFIODevice *vbasedev, Error **errp) |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 2016 | { |
| 2017 | struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) }; |
Paolo Bonzini | 217e9fd | 2015-02-10 10:25:44 -0700 | [diff] [blame] | 2018 | int ret, fd; |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 2019 | |
Paolo Bonzini | 217e9fd | 2015-02-10 10:25:44 -0700 | [diff] [blame] | 2020 | fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name); |
| 2021 | if (fd < 0) { |
Eric Auger | 59f7d67 | 2016-10-17 10:58:00 -0600 | [diff] [blame] | 2022 | error_setg_errno(errp, errno, "error getting device from group %d", |
| 2023 | group->groupid); |
| 2024 | error_append_hint(errp, |
| 2025 | "Verify all devices in group %d are bound to vfio-<bus> " |
| 2026 | "or pci-stub and not already in use\n", group->groupid); |
Paolo Bonzini | 217e9fd | 2015-02-10 10:25:44 -0700 | [diff] [blame] | 2027 | return fd; |
| 2028 | } |
| 2029 | |
| 2030 | ret = ioctl(fd, VFIO_DEVICE_GET_INFO, &dev_info); |
| 2031 | if (ret) { |
Eric Auger | 59f7d67 | 2016-10-17 10:58:00 -0600 | [diff] [blame] | 2032 | error_setg_errno(errp, errno, "error getting device info"); |
Paolo Bonzini | 217e9fd | 2015-02-10 10:25:44 -0700 | [diff] [blame] | 2033 | close(fd); |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 2034 | return ret; |
| 2035 | } |
| 2036 | |
Alex Williamson | 238e917 | 2018-08-17 09:27:16 -0600 | [diff] [blame] | 2037 | /* |
David Hildenbrand | aff92b8 | 2020-06-26 09:22:30 +0200 | [diff] [blame] | 2038 | * Set discarding of RAM as not broken for this group if the driver knows |
| 2039 | * the device operates compatibly with discarding. Setting must be |
| 2040 | * consistent per group, but since compatibility is really only possible |
| 2041 | * with mdev currently, we expect singleton groups. |
Alex Williamson | 238e917 | 2018-08-17 09:27:16 -0600 | [diff] [blame] | 2042 | */ |
David Hildenbrand | aff92b8 | 2020-06-26 09:22:30 +0200 | [diff] [blame] | 2043 | if (vbasedev->ram_block_discard_allowed != |
| 2044 | group->ram_block_discard_allowed) { |
Alex Williamson | 238e917 | 2018-08-17 09:27:16 -0600 | [diff] [blame] | 2045 | if (!QLIST_EMPTY(&group->device_list)) { |
David Hildenbrand | aff92b8 | 2020-06-26 09:22:30 +0200 | [diff] [blame] | 2046 | error_setg(errp, "Inconsistent setting of support for discarding " |
| 2047 | "RAM (e.g., balloon) within group"); |
Alex Williamson | 8709b39 | 2018-08-23 10:45:58 -0600 | [diff] [blame] | 2048 | close(fd); |
Alex Williamson | 238e917 | 2018-08-17 09:27:16 -0600 | [diff] [blame] | 2049 | return -1; |
| 2050 | } |
| 2051 | |
David Hildenbrand | aff92b8 | 2020-06-26 09:22:30 +0200 | [diff] [blame] | 2052 | if (!group->ram_block_discard_allowed) { |
| 2053 | group->ram_block_discard_allowed = true; |
| 2054 | ram_block_discard_disable(false); |
Alex Williamson | 238e917 | 2018-08-17 09:27:16 -0600 | [diff] [blame] | 2055 | } |
| 2056 | } |
| 2057 | |
Paolo Bonzini | 217e9fd | 2015-02-10 10:25:44 -0700 | [diff] [blame] | 2058 | vbasedev->fd = fd; |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 2059 | vbasedev->group = group; |
| 2060 | QLIST_INSERT_HEAD(&group->device_list, vbasedev, next); |
| 2061 | |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 2062 | vbasedev->num_irqs = dev_info.num_irqs; |
| 2063 | vbasedev->num_regions = dev_info.num_regions; |
| 2064 | vbasedev->flags = dev_info.flags; |
| 2065 | |
| 2066 | trace_vfio_get_device(name, dev_info.flags, dev_info.num_regions, |
| 2067 | dev_info.num_irqs); |
| 2068 | |
| 2069 | vbasedev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET); |
Paolo Bonzini | 217e9fd | 2015-02-10 10:25:44 -0700 | [diff] [blame] | 2070 | return 0; |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 2071 | } |
| 2072 | |
| 2073 | void vfio_put_base_device(VFIODevice *vbasedev) |
| 2074 | { |
Paolo Bonzini | 77a10d0 | 2015-02-10 10:25:44 -0700 | [diff] [blame] | 2075 | if (!vbasedev->group) { |
| 2076 | return; |
| 2077 | } |
Eric Auger | e2c7d02 | 2014-12-22 09:54:51 -0700 | [diff] [blame] | 2078 | QLIST_REMOVE(vbasedev, next); |
| 2079 | vbasedev->group = NULL; |
| 2080 | trace_vfio_put_base_device(vbasedev->fd); |
| 2081 | close(vbasedev->fd); |
| 2082 | } |
| 2083 | |
Alex Williamson | 4690022 | 2016-03-10 09:39:07 -0700 | [diff] [blame] | 2084 | int vfio_get_region_info(VFIODevice *vbasedev, int index, |
| 2085 | struct vfio_region_info **info) |
| 2086 | { |
| 2087 | size_t argsz = sizeof(struct vfio_region_info); |
| 2088 | |
| 2089 | *info = g_malloc0(argsz); |
| 2090 | |
| 2091 | (*info)->index = index; |
Alex Williamson | b53b0f6 | 2016-05-26 09:43:20 -0600 | [diff] [blame] | 2092 | retry: |
Alex Williamson | 4690022 | 2016-03-10 09:39:07 -0700 | [diff] [blame] | 2093 | (*info)->argsz = argsz; |
| 2094 | |
| 2095 | if (ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, *info)) { |
| 2096 | g_free(*info); |
Alex Williamson | e61a424 | 2016-05-26 09:43:20 -0600 | [diff] [blame] | 2097 | *info = NULL; |
Alex Williamson | 4690022 | 2016-03-10 09:39:07 -0700 | [diff] [blame] | 2098 | return -errno; |
| 2099 | } |
| 2100 | |
Alex Williamson | b53b0f6 | 2016-05-26 09:43:20 -0600 | [diff] [blame] | 2101 | if ((*info)->argsz > argsz) { |
| 2102 | argsz = (*info)->argsz; |
| 2103 | *info = g_realloc(*info, argsz); |
| 2104 | |
| 2105 | goto retry; |
| 2106 | } |
| 2107 | |
Alex Williamson | 4690022 | 2016-03-10 09:39:07 -0700 | [diff] [blame] | 2108 | return 0; |
| 2109 | } |
| 2110 | |
Alex Williamson | e61a424 | 2016-05-26 09:43:20 -0600 | [diff] [blame] | 2111 | int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type, |
| 2112 | uint32_t subtype, struct vfio_region_info **info) |
| 2113 | { |
| 2114 | int i; |
| 2115 | |
| 2116 | for (i = 0; i < vbasedev->num_regions; i++) { |
| 2117 | struct vfio_info_cap_header *hdr; |
| 2118 | struct vfio_region_info_cap_type *cap_type; |
| 2119 | |
| 2120 | if (vfio_get_region_info(vbasedev, i, info)) { |
| 2121 | continue; |
| 2122 | } |
| 2123 | |
| 2124 | hdr = vfio_get_region_info_cap(*info, VFIO_REGION_INFO_CAP_TYPE); |
| 2125 | if (!hdr) { |
| 2126 | g_free(*info); |
| 2127 | continue; |
| 2128 | } |
| 2129 | |
| 2130 | cap_type = container_of(hdr, struct vfio_region_info_cap_type, header); |
| 2131 | |
| 2132 | trace_vfio_get_dev_region(vbasedev->name, i, |
| 2133 | cap_type->type, cap_type->subtype); |
| 2134 | |
| 2135 | if (cap_type->type == type && cap_type->subtype == subtype) { |
| 2136 | return 0; |
| 2137 | } |
| 2138 | |
| 2139 | g_free(*info); |
| 2140 | } |
| 2141 | |
| 2142 | *info = NULL; |
| 2143 | return -ENODEV; |
| 2144 | } |
| 2145 | |
Alexey Kardashevskiy | ae0215b | 2018-03-13 11:17:31 -0600 | [diff] [blame] | 2146 | bool vfio_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type) |
| 2147 | { |
| 2148 | struct vfio_region_info *info = NULL; |
| 2149 | bool ret = false; |
| 2150 | |
| 2151 | if (!vfio_get_region_info(vbasedev, region, &info)) { |
| 2152 | if (vfio_get_region_info_cap(info, cap_type)) { |
| 2153 | ret = true; |
| 2154 | } |
| 2155 | g_free(info); |
| 2156 | } |
| 2157 | |
| 2158 | return ret; |
| 2159 | } |
| 2160 | |
David Gibson | 3153119 | 2016-03-09 11:56:06 +1100 | [diff] [blame] | 2161 | /* |
| 2162 | * Interfaces for IBM EEH (Enhanced Error Handling) |
| 2163 | */ |
| 2164 | static bool vfio_eeh_container_ok(VFIOContainer *container) |
| 2165 | { |
| 2166 | /* |
| 2167 | * As of 2016-03-04 (linux-4.5) the host kernel EEH/VFIO |
| 2168 | * implementation is broken if there are multiple groups in a |
| 2169 | * container. The hardware works in units of Partitionable |
| 2170 | * Endpoints (== IOMMU groups) and the EEH operations naively |
| 2171 | * iterate across all groups in the container, without any logic |
| 2172 | * to make sure the groups have their state synchronized. For |
| 2173 | * certain operations (ENABLE) that might be ok, until an error |
| 2174 | * occurs, but for others (GET_STATE) it's clearly broken. |
| 2175 | */ |
| 2176 | |
| 2177 | /* |
| 2178 | * XXX Once fixed kernels exist, test for them here |
| 2179 | */ |
| 2180 | |
| 2181 | if (QLIST_EMPTY(&container->group_list)) { |
| 2182 | return false; |
| 2183 | } |
| 2184 | |
| 2185 | if (QLIST_NEXT(QLIST_FIRST(&container->group_list), container_next)) { |
| 2186 | return false; |
| 2187 | } |
| 2188 | |
| 2189 | return true; |
| 2190 | } |
| 2191 | |
| 2192 | static int vfio_eeh_container_op(VFIOContainer *container, uint32_t op) |
| 2193 | { |
| 2194 | struct vfio_eeh_pe_op pe_op = { |
| 2195 | .argsz = sizeof(pe_op), |
| 2196 | .op = op, |
| 2197 | }; |
| 2198 | int ret; |
| 2199 | |
| 2200 | if (!vfio_eeh_container_ok(container)) { |
| 2201 | error_report("vfio/eeh: EEH_PE_OP 0x%x: " |
| 2202 | "kernel requires a container with exactly one group", op); |
| 2203 | return -EPERM; |
| 2204 | } |
| 2205 | |
| 2206 | ret = ioctl(container->fd, VFIO_EEH_PE_OP, &pe_op); |
| 2207 | if (ret < 0) { |
| 2208 | error_report("vfio/eeh: EEH_PE_OP 0x%x failed: %m", op); |
| 2209 | return -errno; |
| 2210 | } |
| 2211 | |
Gavin Shan | d917e88 | 2016-06-15 14:28:27 +1000 | [diff] [blame] | 2212 | return ret; |
David Gibson | 3153119 | 2016-03-09 11:56:06 +1100 | [diff] [blame] | 2213 | } |
| 2214 | |
| 2215 | static VFIOContainer *vfio_eeh_as_container(AddressSpace *as) |
| 2216 | { |
| 2217 | VFIOAddressSpace *space = vfio_get_address_space(as); |
| 2218 | VFIOContainer *container = NULL; |
| 2219 | |
| 2220 | if (QLIST_EMPTY(&space->containers)) { |
| 2221 | /* No containers to act on */ |
| 2222 | goto out; |
| 2223 | } |
| 2224 | |
| 2225 | container = QLIST_FIRST(&space->containers); |
| 2226 | |
| 2227 | if (QLIST_NEXT(container, next)) { |
| 2228 | /* We don't yet have logic to synchronize EEH state across |
| 2229 | * multiple containers */ |
| 2230 | container = NULL; |
| 2231 | goto out; |
| 2232 | } |
| 2233 | |
| 2234 | out: |
| 2235 | vfio_put_address_space(space); |
| 2236 | return container; |
| 2237 | } |
| 2238 | |
| 2239 | bool vfio_eeh_as_ok(AddressSpace *as) |
| 2240 | { |
| 2241 | VFIOContainer *container = vfio_eeh_as_container(as); |
| 2242 | |
| 2243 | return (container != NULL) && vfio_eeh_container_ok(container); |
| 2244 | } |
| 2245 | |
| 2246 | int vfio_eeh_as_op(AddressSpace *as, uint32_t op) |
| 2247 | { |
| 2248 | VFIOContainer *container = vfio_eeh_as_container(as); |
| 2249 | |
| 2250 | if (!container) { |
| 2251 | return -ENODEV; |
| 2252 | } |
| 2253 | return vfio_eeh_container_op(container, op); |
| 2254 | } |