Merge remote-tracking branch 'remotes/awilliam/tags/vfio-pci-for-qemu-20140630.0' into staging

VFIO patches: MSI-X masking performance fix, Endian fixes, fix runstate on device error

# gpg: Signature made Mon 30 Jun 2014 18:13:40 BST using RSA key ID 3BB08B22
# gpg: Can't check signature: public key not found

* remotes/awilliam/tags/vfio-pci-for-qemu-20140630.0:
  vfio: use correct runstate
  vfio: Make BARs native endian
  vfio-pci: Fix MSI-X masking performance
  vfio-pci: Fix MSI/X debug code

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
index 7b279c4..aef4c9c 100644
--- a/hw/misc/vfio.c
+++ b/hw/misc/vfio.c
@@ -121,6 +121,7 @@
 
 typedef struct VFIOMSIVector {
     EventNotifier interrupt; /* eventfd triggered on interrupt */
+    EventNotifier kvm_interrupt; /* eventfd triggered for KVM irqfd bypass */
     struct VFIODevice *vdev; /* back pointer to device */
     MSIMessage msg; /* cache the MSI message so we know when it changes */
     int virq; /* KVM irqchip route for QEMU bypass */
@@ -642,9 +643,9 @@
     MSIMessage msg;
 
     if (vdev->interrupt == VFIO_INT_MSIX) {
-        msg = msi_get_message(&vdev->pdev, nr);
-    } else if (vdev->interrupt == VFIO_INT_MSI) {
         msg = msix_get_message(&vdev->pdev, nr);
+    } else if (vdev->interrupt == VFIO_INT_MSI) {
+        msg = msi_get_message(&vdev->pdev, nr);
     } else {
         abort();
     }
@@ -682,10 +683,11 @@
     for (i = 0; i < vdev->nr_vectors; i++) {
         if (!vdev->msi_vectors[i].use) {
             fds[i] = -1;
-            continue;
+        } else if (vdev->msi_vectors[i].virq >= 0) {
+            fds[i] = event_notifier_get_fd(&vdev->msi_vectors[i].kvm_interrupt);
+        } else {
+            fds[i] = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
         }
-
-        fds[i] = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
     }
 
     ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
@@ -695,6 +697,52 @@
     return ret;
 }
 
+static void vfio_add_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage *msg,
+                                  bool msix)
+{
+    int virq;
+
+    if ((msix && !VFIO_ALLOW_KVM_MSIX) ||
+        (!msix && !VFIO_ALLOW_KVM_MSI) || !msg) {
+        return;
+    }
+
+    if (event_notifier_init(&vector->kvm_interrupt, 0)) {
+        return;
+    }
+
+    virq = kvm_irqchip_add_msi_route(kvm_state, *msg);
+    if (virq < 0) {
+        event_notifier_cleanup(&vector->kvm_interrupt);
+        return;
+    }
+
+    if (kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->kvm_interrupt,
+                                       NULL, virq) < 0) {
+        kvm_irqchip_release_virq(kvm_state, virq);
+        event_notifier_cleanup(&vector->kvm_interrupt);
+        return;
+    }
+
+    vector->msg = *msg;
+    vector->virq = virq;
+}
+
+static void vfio_remove_kvm_msi_virq(VFIOMSIVector *vector)
+{
+    kvm_irqchip_remove_irqfd_notifier(kvm_state, &vector->kvm_interrupt,
+                                      vector->virq);
+    kvm_irqchip_release_virq(kvm_state, vector->virq);
+    vector->virq = -1;
+    event_notifier_cleanup(&vector->kvm_interrupt);
+}
+
+static void vfio_update_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage msg)
+{
+    kvm_irqchip_update_msi_route(kvm_state, vector->virq, msg);
+    vector->msg = msg;
+}
+
 static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
                                    MSIMessage *msg, IOHandler *handler)
 {
@@ -707,30 +755,32 @@
             vdev->host.function, nr);
 
     vector = &vdev->msi_vectors[nr];
-    vector->vdev = vdev;
-    vector->use = true;
 
-    msix_vector_use(pdev, nr);
-
-    if (event_notifier_init(&vector->interrupt, 0)) {
-        error_report("vfio: Error: event_notifier_init failed");
+    if (!vector->use) {
+        vector->vdev = vdev;
+        vector->virq = -1;
+        if (event_notifier_init(&vector->interrupt, 0)) {
+            error_report("vfio: Error: event_notifier_init failed");
+        }
+        vector->use = true;
+        msix_vector_use(pdev, nr);
     }
 
+    qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
+                        handler, NULL, vector);
+
     /*
      * Attempt to enable route through KVM irqchip,
      * default to userspace handling if unavailable.
      */
-    vector->virq = msg && VFIO_ALLOW_KVM_MSIX ?
-                   kvm_irqchip_add_msi_route(kvm_state, *msg) : -1;
-    if (vector->virq < 0 ||
-        kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt,
-                                       NULL, vector->virq) < 0) {
-        if (vector->virq >= 0) {
-            kvm_irqchip_release_virq(kvm_state, vector->virq);
-            vector->virq = -1;
+    if (vector->virq >= 0) {
+        if (!msg) {
+            vfio_remove_kvm_msi_virq(vector);
+        } else {
+            vfio_update_kvm_msi_virq(vector, *msg);
         }
-        qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
-                            handler, NULL, vector);
+    } else {
+        vfio_add_kvm_msi_virq(vector, msg, true);
     }
 
     /*
@@ -761,7 +811,11 @@
         irq_set->count = 1;
         pfd = (int32_t *)&irq_set->data;
 
-        *pfd = event_notifier_get_fd(&vector->interrupt);
+        if (vector->virq >= 0) {
+            *pfd = event_notifier_get_fd(&vector->kvm_interrupt);
+        } else {
+            *pfd = event_notifier_get_fd(&vector->interrupt);
+        }
 
         ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
         g_free(irq_set);
@@ -783,50 +837,41 @@
 {
     VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
     VFIOMSIVector *vector = &vdev->msi_vectors[nr];
-    int argsz;
-    struct vfio_irq_set *irq_set;
-    int32_t *pfd;
 
     DPRINTF("%s(%04x:%02x:%02x.%x) vector %d released\n", __func__,
             vdev->host.domain, vdev->host.bus, vdev->host.slot,
             vdev->host.function, nr);
 
     /*
-     * XXX What's the right thing to do here?  This turns off the interrupt
-     * completely, but do we really just want to switch the interrupt to
-     * bouncing through userspace and let msix.c drop it?  Not sure.
+     * There are still old guests that mask and unmask vectors on every
+     * interrupt.  If we're using QEMU bypass with a KVM irqfd, leave all of
+     * the KVM setup in place, simply switch VFIO to use the non-bypass
+     * eventfd.  We'll then fire the interrupt through QEMU and the MSI-X
+     * core will mask the interrupt and set pending bits, allowing it to
+     * be re-asserted on unmask.  Nothing to do if already using QEMU mode.
      */
-    msix_vector_unuse(pdev, nr);
+    if (vector->virq >= 0) {
+        int argsz;
+        struct vfio_irq_set *irq_set;
+        int32_t *pfd;
 
-    argsz = sizeof(*irq_set) + sizeof(*pfd);
+        argsz = sizeof(*irq_set) + sizeof(*pfd);
 
-    irq_set = g_malloc0(argsz);
-    irq_set->argsz = argsz;
-    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
-                     VFIO_IRQ_SET_ACTION_TRIGGER;
-    irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
-    irq_set->start = nr;
-    irq_set->count = 1;
-    pfd = (int32_t *)&irq_set->data;
+        irq_set = g_malloc0(argsz);
+        irq_set->argsz = argsz;
+        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
+                         VFIO_IRQ_SET_ACTION_TRIGGER;
+        irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
+        irq_set->start = nr;
+        irq_set->count = 1;
+        pfd = (int32_t *)&irq_set->data;
 
-    *pfd = -1;
+        *pfd = event_notifier_get_fd(&vector->interrupt);
 
-    ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
+        ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
 
-    g_free(irq_set);
-
-    if (vector->virq < 0) {
-        qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
-                            NULL, NULL, NULL);
-    } else {
-        kvm_irqchip_remove_irqfd_notifier(kvm_state, &vector->interrupt,
-                                          vector->virq);
-        kvm_irqchip_release_virq(kvm_state, vector->virq);
-        vector->virq = -1;
+        g_free(irq_set);
     }
-
-    event_notifier_cleanup(&vector->interrupt);
-    vector->use = false;
 }
 
 static void vfio_enable_msix(VFIODevice *vdev)
@@ -876,28 +921,28 @@
         VFIOMSIVector *vector = &vdev->msi_vectors[i];
 
         vector->vdev = vdev;
+        vector->virq = -1;
         vector->use = true;
 
         if (event_notifier_init(&vector->interrupt, 0)) {
             error_report("vfio: Error: event_notifier_init failed");
         }
 
+        qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
+                            vfio_msi_interrupt, NULL, vector);
+
         vector->msg = msi_get_message(&vdev->pdev, i);
 
         /*
          * Attempt to enable route through KVM irqchip,
          * default to userspace handling if unavailable.
          */
-        vector->virq = VFIO_ALLOW_KVM_MSI ?
-                       kvm_irqchip_add_msi_route(kvm_state, vector->msg) : -1;
-        if (vector->virq < 0 ||
-            kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt,
-                                           NULL, vector->virq) < 0) {
-            qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
-                                vfio_msi_interrupt, NULL, vector);
-        }
+        vfio_add_kvm_msi_virq(vector, &vector->msg, false);
     }
 
+    /* Set interrupt type prior to possible interrupts */
+    vdev->interrupt = VFIO_INT_MSI;
+
     ret = vfio_enable_vectors(vdev, false);
     if (ret) {
         if (ret < 0) {
@@ -910,14 +955,10 @@
         for (i = 0; i < vdev->nr_vectors; i++) {
             VFIOMSIVector *vector = &vdev->msi_vectors[i];
             if (vector->virq >= 0) {
-                kvm_irqchip_remove_irqfd_notifier(kvm_state, &vector->interrupt,
-                                                  vector->virq);
-                kvm_irqchip_release_virq(kvm_state, vector->virq);
-                vector->virq = -1;
-            } else {
-                qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
-                                    NULL, NULL, NULL);
+                vfio_remove_kvm_msi_virq(vector);
             }
+            qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
+                                NULL, NULL, NULL);
             event_notifier_cleanup(&vector->interrupt);
         }
 
@@ -929,11 +970,17 @@
         }
         vdev->nr_vectors = 0;
 
+        /*
+         * Failing to setup MSI doesn't really fall within any specification.
+         * Let's try leaving interrupts disabled and hope the guest figures
+         * out to fall back to INTx for this device.
+         */
+        error_report("vfio: Error: Failed to enable MSI");
+        vdev->interrupt = VFIO_INT_NONE;
+
         return;
     }
 
-    vdev->interrupt = VFIO_INT_MSI;
-
     DPRINTF("%s(%04x:%02x:%02x.%x) Enabled %d MSI vectors\n", __func__,
             vdev->host.domain, vdev->host.bus, vdev->host.slot,
             vdev->host.function, vdev->nr_vectors);
@@ -941,6 +988,20 @@
 
 static void vfio_disable_msi_common(VFIODevice *vdev)
 {
+    int i;
+
+    for (i = 0; i < vdev->nr_vectors; i++) {
+        VFIOMSIVector *vector = &vdev->msi_vectors[i];
+        if (vdev->msi_vectors[i].use) {
+            if (vector->virq >= 0) {
+                vfio_remove_kvm_msi_virq(vector);
+            }
+            qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
+                                NULL, NULL, NULL);
+            event_notifier_cleanup(&vector->interrupt);
+        }
+    }
+
     g_free(vdev->msi_vectors);
     vdev->msi_vectors = NULL;
     vdev->nr_vectors = 0;
@@ -962,6 +1023,7 @@
     for (i = 0; i < vdev->nr_vectors; i++) {
         if (vdev->msi_vectors[i].use) {
             vfio_msix_vector_release(&vdev->pdev, i);
+            msix_vector_unuse(&vdev->pdev, i);
         }
     }
 
@@ -977,30 +1039,7 @@
 
 static void vfio_disable_msi(VFIODevice *vdev)
 {
-    int i;
-
     vfio_disable_irqindex(vdev, VFIO_PCI_MSI_IRQ_INDEX);
-
-    for (i = 0; i < vdev->nr_vectors; i++) {
-        VFIOMSIVector *vector = &vdev->msi_vectors[i];
-
-        if (!vector->use) {
-            continue;
-        }
-
-        if (vector->virq >= 0) {
-            kvm_irqchip_remove_irqfd_notifier(kvm_state,
-                                              &vector->interrupt, vector->virq);
-            kvm_irqchip_release_virq(kvm_state, vector->virq);
-            vector->virq = -1;
-        } else {
-            qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
-                                NULL, NULL, NULL);
-        }
-
-        event_notifier_cleanup(&vector->interrupt);
-    }
-
     vfio_disable_msi_common(vdev);
 
     DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
@@ -1020,17 +1059,7 @@
         }
 
         msg = msi_get_message(&vdev->pdev, i);
-
-        if (msg.address != vector->msg.address ||
-            msg.data != vector->msg.data) {
-
-            DPRINTF("%s(%04x:%02x:%02x.%x) MSI vector %d changed\n",
-                    __func__, vdev->host.domain, vdev->host.bus,
-                    vdev->host.slot, vdev->host.function, i);
-
-            kvm_irqchip_update_msi_route(kvm_state, vector->virq, msg);
-            vector->msg = msg;
-        }
+        vfio_update_kvm_msi_virq(vector, msg);
     }
 }
 
@@ -1053,10 +1082,10 @@
         buf.byte = data;
         break;
     case 2:
-        buf.word = cpu_to_le16(data);
+        buf.word = data;
         break;
     case 4:
-        buf.dword = cpu_to_le32(data);
+        buf.dword = data;
         break;
     default:
         hw_error("vfio: unsupported write size, %d bytes", size);
@@ -1113,10 +1142,10 @@
         data = buf.byte;
         break;
     case 2:
-        data = le16_to_cpu(buf.word);
+        data = buf.word;
         break;
     case 4:
-        data = le32_to_cpu(buf.dword);
+        data = buf.dword;
         break;
     default:
         hw_error("vfio: unsupported read size, %d bytes", size);
@@ -1143,7 +1172,7 @@
 static const MemoryRegionOps vfio_bar_ops = {
     .read = vfio_bar_read,
     .write = vfio_bar_write,
-    .endianness = DEVICE_LITTLE_ENDIAN,
+    .endianness = DEVICE_NATIVE_ENDIAN,
 };
 
 static void vfio_pci_load_rom(VFIODevice *vdev)
@@ -1205,21 +1234,42 @@
 static uint64_t vfio_rom_read(void *opaque, hwaddr addr, unsigned size)
 {
     VFIODevice *vdev = opaque;
-    uint64_t val = ((uint64_t)1 << (size * 8)) - 1;
+    union {
+        uint8_t byte;
+        uint16_t word;
+        uint32_t dword;
+        uint64_t qword;
+    } buf;
+    uint64_t data = 0;
 
     /* Load the ROM lazily when the guest tries to read it */
     if (unlikely(!vdev->rom && !vdev->rom_read_failed)) {
         vfio_pci_load_rom(vdev);
     }
 
-    memcpy(&val, vdev->rom + addr,
+    memcpy(&buf, vdev->rom + addr,
            (addr < vdev->rom_size) ? MIN(size, vdev->rom_size - addr) : 0);
 
+    switch (size) {
+    case 1:
+        data = buf.byte;
+        break;
+    case 2:
+        data = buf.word;
+        break;
+    case 4:
+        data = buf.dword;
+        break;
+    default:
+        hw_error("vfio: unsupported read size, %d bytes", size);
+        break;
+    }
+
     DPRINTF("%s(%04x:%02x:%02x.%x, 0x%"HWADDR_PRIx", 0x%x) = 0x%"PRIx64"\n",
             __func__, vdev->host.domain, vdev->host.bus, vdev->host.slot,
-            vdev->host.function, addr, size, val);
+            vdev->host.function, addr, size, data);
 
-    return val;
+    return data;
 }
 
 static void vfio_rom_write(void *opaque, hwaddr addr,
@@ -1230,7 +1280,7 @@
 static const MemoryRegionOps vfio_rom_ops = {
     .read = vfio_rom_read,
     .write = vfio_rom_write,
-    .endianness = DEVICE_LITTLE_ENDIAN,
+    .endianness = DEVICE_NATIVE_ENDIAN,
 };
 
 static bool vfio_blacklist_opt_rom(VFIODevice *vdev)
@@ -4012,7 +4062,7 @@
                  __func__, vdev->host.domain, vdev->host.bus,
                  vdev->host.slot, vdev->host.function);
 
-    vm_stop(RUN_STATE_IO_ERROR);
+    vm_stop(RUN_STATE_INTERNAL_ERROR);
 }
 
 /*